From 000ac61ad38fde9d4f4e3ef2399282f037abe228 Mon Sep 17 00:00:00 2001
From: anonymous626 <131758638+anonymous626@users.noreply.github.com>
Date: Mon, 3 Nov 2025 15:40:40 +0800
Subject: [PATCH] Add audio transcription scrip for reproduce

This script transcribes audio files from a specified directory using the WhisperX model, aligns the output, and saves the results in JSON format.
---
 reproduce/transcribe.py | 53 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 reproduce/transcribe.py

diff --git a/reproduce/transcribe.py b/reproduce/transcribe.py
new file mode 100644
index 0000000..790f72d
--- /dev/null
+++ b/reproduce/transcribe.py
@@ -0,0 +1,53 @@
+import json
+import os
+import whisperx
+import gc 
+from whisperx.alignment import DEFAULT_ALIGN_MODELS_TORCH, DEFAULT_ALIGN_MODELS_HF 
+
+device = "cuda" 
+batch_size = 64 # reduce if low on GPU mem
+compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
+
+# 1. Transcribe with original whisper (batched)
+model = whisperx.load_model("large-v3", device, compute_type=compute_type)
+# 3. Assign speaker labels
+HF_TOKEN = "YOUR_HF_TOKEN"
+# diarize_model = whisperx.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)
+
+root = "./lvbench_vdb"
+for file in os.listdir(root):
+    # save model to local path (optional)
+    # model_dir = "/path/"
+    # model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)
+    
+    if not file.endswith(".mp3"):
+        continue
+    
+    audio_file = os.path.join(root, file)
+
+    if os.path.exists(audio_file.replace(".mp3", ".json")):
+        print(f"File {audio_file.replace('.mp3', '.json')} already exists, skipping...")
+        with open(audio_file.replace(".mp3", ".json"), "r") as f:
+            legacy_result = json.load(f)
+    else:
+        legacy_result = None
+
+    audio = whisperx.load_audio(audio_file)
+    result = model.transcribe(audio, batch_size=batch_size)
+
+    if result["language"] in DEFAULT_ALIGN_MODELS_TORCH or \
+        result["language"] in DEFAULT_ALIGN_MODELS_HF:
+        lang = result["language"]
+    else:
+        lang = 'en'
+        print(f"Language {result['language']} not supported, using English instead for {audio_file}.")
+
+    # 2. Align whisper output
+    model_a, metadata = whisperx.load_align_model(language_code=lang, device=device)
+    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+
+    with open(audio_file.replace(".mp3", ".json"), "w") as f:
+        json.dump(result, f, indent=4)
+    print(f"saved as {audio_file.replace('.mp3', '.json')}")
+
+