From 6b48a206875771d37ec4581b534cf2613dd15c89 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 27 Apr 2023 12:47:10 +0000 Subject: [PATCH 1/2] fix asr infer.py --- paddlespeech/cli/asr/infer.py | 8 ++++---- paddlespeech/cli/ssl/infer.py | 2 +- paddlespeech/cli/whisper/infer.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 7a7aef8b0e5..231a00f4d80 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -25,9 +25,6 @@ import numpy as np import paddle import soundfile -from paddlespeech.audio.transform.transformation import Transformation -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.utils.utility import UpdateConfig from yacs.config import CfgNode from ...utils.env import MODEL_HOME @@ -37,6 +34,9 @@ from ..utils import CLI_TIMER from ..utils import stats_wrapper from ..utils import timer_register +from paddlespeech.audio.transform.transformation import Transformation +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ['ASRExecutor'] @@ -274,7 +274,7 @@ def preprocess(self, model_type: str, input: Union[str, os.PathLike]): # fbank audio = preprocessing(audio, **preprocess_args) - audio_len = paddle.to_tensor(audio.shape[0]) + audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0) audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) self._inputs["audio"] = audio diff --git a/paddlespeech/cli/ssl/infer.py b/paddlespeech/cli/ssl/infer.py index dce7c77815d..44fbb425791 100644 --- a/paddlespeech/cli/ssl/infer.py +++ b/paddlespeech/cli/ssl/infer.py @@ -245,7 +245,7 @@ def preprocess(self, model_type: str, input: Union[str, os.PathLike]): # fbank audio = preprocessing(audio, **preprocess_args) - audio_len = paddle.to_tensor(audio.shape[0]) + audio_len = 
paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0) audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) self._inputs["audio"] = audio diff --git a/paddlespeech/cli/whisper/infer.py b/paddlespeech/cli/whisper/infer.py index ebcca890b59..17e8c0b8c9c 100644 --- a/paddlespeech/cli/whisper/infer.py +++ b/paddlespeech/cli/whisper/infer.py @@ -253,7 +253,7 @@ def preprocess(self, model_type: str, input: Union[str, os.PathLike]): # fbank audio = log_mel_spectrogram(audio, resource_path=self.resource_path) - audio_len = paddle.to_tensor(audio.shape[0]) + audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0) self._inputs["audio"] = audio self._inputs["audio_len"] = audio_len From c6a83b07754914c5f7af5192f9a5e3e4632a0402 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Fri, 28 Apr 2023 03:21:13 +0000 Subject: [PATCH 2/2] add readme. --- README.md | 1 + README_cn.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 4f03f57459e..f9ae1ead7e7 100644 --- a/README.md +++ b/README.md @@ -178,6 +178,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update +- ⚡ 2023.04.28: Fix [0-d tensor](https://github.com/PaddlePaddle/PaddleSpeech/pull/3214): the 0-d tensor problems that came with the upgrade to paddlepaddle==2.5 have been solved. - 👑 2023.04.25: Add [AMP for U2 conformer](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167). - 🔥 2023.04.06: Add [subtitle file (.srt format) generation example](./demos/streaming_asr_server). 
- 🔥 2023.03.14: Add SVS(Singing Voice Synthesis) examples with Opencpop dataset, including [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) and [HiFiGAN](./examples/opencpop/voc5), the effect is continuously optimized. diff --git a/README_cn.md b/README_cn.md index e5e18f0c59e..25a716d2961 100644 --- a/README_cn.md +++ b/README_cn.md @@ -183,6 +183,7 @@ - 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。 ### 近期更新 +- ⚡ 2023.04.28: 修正 [0-d tensor](https://github.com/PaddlePaddle/PaddleSpeech/pull/3214), 配合PaddlePaddle2.5升级修改了0-d tensor的问题。 - 👑 2023.04.25: 新增 [U2 conformer 的 AMP 训练](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167). - 👑 2023.04.06: 新增 [srt格式字幕生成功能](./demos/streaming_asr_server)。 - 🔥 2023.03.14: 新增基于 Opencpop 数据集的 SVS (歌唱合成) 示例,包含 [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) 和 [HiFiGAN](./examples/opencpop/voc5),效果持续优化中。