import io
import pathlib

import modal

# ---------- Modal App and Image ----------

app = modal.App("dia-tts-app")

# Base CUDA-ready Debian image + Python 3.10.
# We install torch with GPU wheels, Dia itself, and a couple of audio helpers.
image = (
    modal.Image.debian_slim(python_version="3.10")
    # SoundFile requires libsndfile at the OS level.
    .apt_install("git", "ffmpeg", "libsndfile1")
    .pip_install(
        # CUDA-enabled torch/torchaudio wheels
        "torch==2.6.0",  # Modal already ships the CUDA runtime, so the PyPI wheel is fine
        "torchaudio==2.6.0",
        "soundfile",
        # Install Dia straight from GitHub, which keeps you on the latest commit
        "git+https://github.com/nari-labs/dia.git",
    )
)
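
# Optional sketch: persist the Hugging Face cache in a modal.Volume so the
# ~4 GB of Dia weights are downloaded once rather than once per fresh
# container. The volume name is an arbitrary choice of mine; to use it,
# uncomment the line below and pass
# `volumes={"/root/.cache/huggingface": weights}` to @app.function.
# weights = modal.Volume.from_name("dia-weights", create_if_missing=True)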

# ---------- Remote function ----------


@app.function(
    image=image,
    gpu="A10G",  # 24 GB VRAM, plenty for the 1.6 B-parameter Dia model
    timeout=60 * 15,  # 15-minute cap (the first run downloads ~4 GB of weights)
)
def tts(text: str) -> bytes:
    """Generate speech from *text* and return WAV bytes."""
    import soundfile as sf
    from dia.model import Dia

    # Load the pretrained model (the download is cached inside the container
    # after the first call).
    model = Dia.from_pretrained("nari-labs/Dia-1.6B")

    # Dia returns a NumPy array of audio samples at 44.1 kHz.
    wav = model.generate(text)

    # Serialise the array to an in-memory 16-bit PCM WAV file.
    buf = io.BytesIO()
    sf.write(buf, wav, 44100, format="WAV", subtype="PCM_16")
    buf.seek(0)
    return buf.read()

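
# ---------- Optional: load-once variant (sketch) ----------

# `tts` above constructs the model on every call (from the local cache after
# the first). A minimal sketch of the same logic as a Modal class, where
# @modal.enter() loads the model once per container; the class and method
# names here are illustrative, not part of Dia's API.
@app.cls(image=image, gpu="A10G", timeout=60 * 15)
class DiaTTS:
    @modal.enter()
    def load(self):
        # Runs once when the container starts, so warm calls skip the load.
        from dia.model import Dia

        self.model = Dia.from_pretrained("nari-labs/Dia-1.6B")

    @modal.method()
    def speak(self, text: str) -> bytes:
        import soundfile as sf

        wav = self.model.generate(text)
        buf = io.BytesIO()
        sf.write(buf, wav, 44100, format="WAV", subtype="PCM_16")
        return buf.getvalue()

# From the entrypoint this would be called as: DiaTTS().speak.remote(script)
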
# ---------- Local entry-point ----------

@app.local_entrypoint()
def main(
    script: str = "[S1] Dia running on Modal! [S2] Sounds good, doesn’t it?",
    out: str = "output.wav",
):
    """CLI entry-point, executed on your local machine via `modal run`.

    Examples
    --------
    $ modal run dia_modal_stub.py --script "[S1] Hello world" --out hello.wav
    """
    audio = tts.remote(script)  # Remote GPU call ➜ audio bytes back to your laptop
    pathlib.Path(out).write_bytes(audio)
    print(f"\N{MUSICAL NOTE} Saved {out} ({len(audio) // 1024} KB)")
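
# ---------- Deploying (sketch) ----------

# `modal run` spins the app up ad hoc. To keep it callable from other code,
# a sketch assuming the app/function names defined above:
#   $ modal deploy dia_modal_stub.py
#   >>> import modal
#   >>> fn = modal.Function.from_name("dia-tts-app", "tts")
#   >>> wav_bytes = fn.remote("[S1] Hello from a deployed app.")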