Qwen3 ASR and Forced Aligner #43838

ebezzam · 2026-05-11T11:15:07Z

I added this because otherwise an existing self.num_labels (=5000) was getting overwritten by 2 during conversion.

-Original file line number
+Diff line change
@@ Expand Up / @@ -1125,6 +1125,8 @@ @@
             title: PE Audio
           - local: model_doc/pop2piano
             title: Pop2Piano
+          - local: model_doc/qwen3_asr
+            title: Qwen3 ASR
           - local: model_doc/seamless_m4t
             title: Seamless-M4T
           - local: model_doc/seamless_m4t_v2
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -402,6 +402,30 @@ def make_list_of_audio( @@
         raise ValueError("Invalid input type. Must be a single audio or a list of audio")
+    def make_list_of_audio_chat_template(
+        audio: list[AudioInput] | AudioInput | str | list[str],
+    ) -> AudioInput:
+        """
+        Ensure that the output is a list of audio. Unlike `make_list_of_audio`, this function also accepts a URL string or
+        local path, as accepted by chat templates.
+        Args:
+            audio (`Union[list[AudioInput], AudioInput, str, list[str]]`):
+                The input audio. Can be a URL string, local path, numpy/torch array, or a list of these.
+        Returns:
+            list: A list of audio.
+        """
+        # Handle string inputs
+        if isinstance(audio, str):
+            return [audio]
+        if isinstance(audio, (list, tuple)) and audio and all(isinstance(a, str) for a in audio):
+            return list(audio)
+        # Handle numpy/torch array inputs
+        return make_list_of_audio(audio)
     def hertz_to_mel(freq: float | np.ndarray, mel_scale: str = "htk") -> float | np.ndarray:
         """
         Convert frequency from hertz to mels.
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -259,7 +259,7 @@ def __post_init__(self, **kwargs): @@
             # Our configs prev wouldn't save `id2label` for 2 labels because it is the default. In all other
             # cases we expect the config dict to have an `id2label` field if it's a clf model, or not otherwise
             if self.id2label is None:
-                self.num_labels = kwargs.get("num_labels", 2)
+                self.num_labels = kwargs.get("num_labels", self.num_labels if self.num_labels is not None else 2)
             else:
                 if kwargs.get("num_labels") is not None and len(self.id2label) != kwargs.get("num_labels"):
                     logger.warning(
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -245,7 +245,11 @@ def __init__(self, config): @@
             else:
                 classifier_dropout = 0.1
             self.dropout = nn.Dropout(classifier_dropout)
-            self.score = nn.Linear(config.get_text_config().hidden_size, config.num_labels)
+            self.score = nn.Linear(
+                config.get_text_config().hidden_size,
+                config.num_labels,
+                bias=getattr(config, "token_classification_bias", True),
+            )
             # Initialize weights and apply final processing
             self.post_init()
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -16,7 +16,7 @@ @@
     import numpy as np
-    from ...audio_utils import AudioInput, make_list_of_audio
+    from ...audio_utils import AudioInput, make_list_of_audio_chat_template
     from ...feature_extraction_utils import BatchFeature
     from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
     from ...tokenization_utils_base import TextInput
@@ Expand Down Expand Up / @@ -200,14 +200,9 @@ def apply_transcription_request( @@
             """
-            if isinstance(audio, str):
-                audio_items: list[str | np.ndarray] = [audio]
-            elif isinstance(audio, (list, tuple)) and audio and all(isinstance(el, str) for el in audio):
-                audio_items = list(audio)
-            else:
-                audio_items = list(make_list_of_audio(audio))
-                if is_torch_available():
-                    audio_items = [el.detach().cpu().numpy() if isinstance(el, torch.Tensor) else el for el in audio_items]
+            audio_items: list[str | np.ndarray] = list(make_list_of_audio_chat_template(audio))
+            if is_torch_available():
+                audio_items = [el.detach().cpu().numpy() if isinstance(el, torch.Tensor) else el for el in audio_items]
             batch_size = len(audio_items)
             if batch_size == 0:
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Qwen3 ASR and Forced Aligner #43838

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

ebezzam May 11, 2026

Uh oh!

Uh oh!

Uh oh!

-Original file line number
+Diff line change
@@ Expand Up / @@ -507,6 +507,8 @@ @@
             ("qwen3_5_moe_vision", "Qwen3_5MoeVisionConfig"),
             ("qwen3_5_text", "Qwen3_5TextConfig"),
             ("qwen3_5_vision", "Qwen3_5VisionConfig"),
+            ("qwen3_asr", "Qwen3ASRConfig"),
+            ("qwen3_asr_encoder", "Qwen3ASREncoderConfig"),
             ("qwen3_moe", "Qwen3MoeConfig"),
             ("qwen3_next", "Qwen3NextConfig"),
             ("qwen3_omni_moe", "Qwen3OmniMoeConfig"),
@@ Expand Down Expand Up / @@ -848,6 +850,7 @@ @@
             ("qwen3_5_moe_vision", "qwen3_5_moe"),
             ("qwen3_5_text", "qwen3_5"),
             ("qwen3_5_vision", "qwen3_5"),
+            ("qwen3_asr_encoder", "qwen3_asr"),
             ("qwen3_omni_moe_audio_encoder", "qwen3_omni_moe"),
             ("qwen3_omni_moe_talker_code_predictor", "qwen3_omni_moe"),
             ("qwen3_omni_moe_talker_text", "qwen3_omni_moe"),
@@ Expand Down Expand Up / @@ -959,6 +962,7 @@ @@
             ("pe_audio", "PeAudioFeatureExtractor"),
             ("phi4_multimodal", "Phi4MultimodalFeatureExtractor"),
             ("pop2piano", "Pop2PianoFeatureExtractor"),
+            ("qwen3_asr", "Qwen3ASRFeatureExtractor"),
             ("seamless_m4t", "SeamlessM4TFeatureExtractor"),
             ("speech_to_text", "Speech2TextFeatureExtractor"),
             ("speecht5", "SpeechT5FeatureExtractor"),
@@ Expand Down Expand Up / @@ -1071,6 +1075,7 @@ @@
             ("qwen2_5_vl", "Qwen2_5_VLProcessor"),
             ("qwen2_audio", "Qwen2AudioProcessor"),
             ("qwen2_vl", "Qwen2VLProcessor"),
+            ("qwen3_asr", "Qwen3ASRProcessor"),
             ("qwen3_omni_moe", "Qwen3OmniMoeProcessor"),
             ("qwen3_vl", "Qwen3VLProcessor"),
             ("sam", "SamProcessor"),
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up @@
             ("qwen3_5_moe_vision", "Qwen3_5MoeVisionModel"),
             ("qwen3_5_text", "Qwen3_5TextModel"),
             ("qwen3_5_vision", "Qwen3_5VisionModel"),
+            ("qwen3_asr", "Qwen3ASRModel"),
+            ("qwen3_asr_encoder", "Qwen3ASREncoder"),
             ("qwen3_moe", "Qwen3MoeModel"),
             ("qwen3_next", "Qwen3NextModel"),
             ("qwen3_vl", "Qwen3VLModel"),
@@ Expand Down Expand Up @@
             ("openai-gpt", "OpenAIGPTLMHeadModel"),
             ("paligemma", "PaliGemmaForConditionalGeneration"),
             ("qwen2_audio", "Qwen2AudioForConditionalGeneration"),
+            ("qwen3_asr", "Qwen3ASRForConditionalGeneration"),
             ("roberta", "RobertaForMaskedLM"),
             ("roberta-prelayernorm", "RobertaPreLayerNormForMaskedLM"),
             ("roc_bert", "RoCBertForPreTraining"),
@@ Expand Down Expand Up @@
             ("phi4_multimodal", "Phi4MultimodalForCausalLM"),
             ("qwen2_5_omni", "Qwen2_5OmniForConditionalGeneration"),
             ("qwen2_audio", "Qwen2AudioForConditionalGeneration"),
+            ("qwen3_asr", "Qwen3ASRForConditionalGeneration"),
             ("qwen3_omni_moe", "Qwen3OmniMoeForConditionalGeneration"),
             ("vibevoice_asr", "VibeVoiceAsrForConditionalGeneration"),
             ("voxtral", "VoxtralForConditionalGeneration"),
@@ Expand Down Expand Up @@
             ("plbart", "PLBartForConditionalGeneration"),
             ("prophetnet", "ProphetNetForConditionalGeneration"),
             ("qwen2_audio", "Qwen2AudioForConditionalGeneration"),
+            ("qwen3_asr", "Qwen3ASRForConditionalGeneration"),
             ("seamless_m4t", "SeamlessM4TForTextToText"),
             ("seamless_m4t_v2", "SeamlessM4Tv2ForTextToText"),
             ("switch_transformers", "SwitchTransformersForConditionalGeneration"),
@@ Expand All @@
             ("moonshine", "MoonshineForConditionalGeneration"),
             ("moonshine_streaming", "MoonshineStreamingForConditionalGeneration"),
             ("pop2piano", "Pop2PianoForConditionalGeneration"),
+            ("qwen3_asr", "Qwen3ASRForConditionalGeneration"),
             ("seamless_m4t", "SeamlessM4TForSpeechToText"),
             ("seamless_m4t_v2", "SeamlessM4Tv2ForSpeechToText"),
             ("speech-encoder-decoder", "SpeechEncoderDecoderModel"),
@@ Expand Down Expand Up @@
             ("qwen2_moe", "Qwen2MoeForTokenClassification"),
             ("qwen3", "Qwen3ForTokenClassification"),
             ("qwen3_5", "Qwen3_5ForTokenClassification"),
+            ("qwen3_asr", "Qwen3ASRForTokenClassification"),
             ("qwen3_moe", "Qwen3MoeForTokenClassification"),
             ("qwen3_next", "Qwen3NextForTokenClassification"),
             ("rembert", "RemBertForTokenClassification"),
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -282,6 +282,7 @@ @@
             ("qwen3", "Qwen2Tokenizer" if is_tokenizers_available() else None),
             ("qwen3_5", "Qwen3_5Tokenizer" if is_tokenizers_available() else None),
             ("qwen3_5_moe", "Qwen3_5Tokenizer" if is_tokenizers_available() else None),
+            ("qwen3_asr", "Qwen2Tokenizer" if is_tokenizers_available() else None),
             ("qwen3_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
             ("qwen3_next", "Qwen2Tokenizer" if is_tokenizers_available() else None),
             ("qwen3_omni_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -144,10 +144,8 @@ class Qwen2_5OmniPreTrainedModel(PreTrainedModel): @@
         def _init_weights(self, module):
             super()._init_weights(module)
             if isinstance(module, SinusoidsPositionEmbedding):
-                log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
-                inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
-                scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
-                init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+                position_embeddings = module.compute_default_singular_positional_embedding()
+                init.copy_(module.positional_embedding, position_embeddings)
             elif isinstance(module, UpSample1d):
                 filter_tensor = kaiser_sinc_filter1d(0.5 / module.ratio, 0.6 / module.ratio, module.kernel_size)
                 init.copy_(module.filter, filter_tensor)
@@ Expand Down Expand Up @@
             self.max_timescale = max_timescale
             if channels % 2 != 0:
                 raise ValueError("SinusoidsPositionEmbedding needs even channels input")
-            log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
-            inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
-            scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
-            self.register_buffer(
-                "positional_embedding",
-                torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1),
-                persistent=False,
-            )
+            position_embedding = self.compute_default_singular_positional_embedding()
+            self.register_buffer("positional_embedding", position_embedding, persistent=False)
+        def compute_default_singular_positional_embedding(self):
+            log_timescale_increment = np.log(self.max_timescale) / (self.channels // 2 - 1)
+            inv_timescales = torch.exp(-log_timescale_increment * torch.arange(self.channels // 2).float())
+            scaled_time = torch.arange(self.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+            return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
         def forward(self, seqlen: int):
             return self.positional_embedding[:seqlen, :]
@@ Expand Down @@

-Original file line number
+Diff line change
@@ -0,0 +1,29 @@
+    # Copyright 2026 The HuggingFace Team. All rights reserved.
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    from typing import TYPE_CHECKING
+    from ...utils import _LazyModule
+    from ...utils.import_utils import define_import_structure
+    if TYPE_CHECKING:
+        from .configuration_qwen3_asr import *
+        from .feature_extraction_qwen3_asr import *
+        from .modeling_qwen3_asr import *
+        from .processing_qwen3_asr import *
+    else:
+        import sys
+        _file = globals()["__file__"]
+        sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

Uh oh!

Qwen3 ASR and Forced Aligner #43838

Are you sure you want to change the base?

Uh oh!

Qwen3 ASR and Forced Aligner #43838

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

ebezzam May 11, 2026

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!