diff --git a/README.md b/README.md index c9b9e8916..7500dd472 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ from funasr import AutoModel # use vad, punc, spk or not as you need model = AutoModel(model="paraformer-zh", model_revision="v2.0.2", vad_model="fsmn-vad", vad_model_revision="v2.0.2", - punc_model="ct-punc-c", punc_model_revision="v2.0.2", + punc_model="ct-punc-c", punc_model_revision="v2.0.3", # spk_model="cam++", spk_model_revision="v2.0.2", ) res = model.generate(input=f"{model.model_path}/example/asr_example.wav", diff --git a/README_zh.md b/README_zh.md index 9cd18977d..b19e7c29a 100644 --- a/README_zh.md +++ b/README_zh.md @@ -89,7 +89,7 @@ from funasr import AutoModel # use vad, punc, spk or not as you need model = AutoModel(model="paraformer-zh", model_revision="v2.0.2", vad_model="fsmn-vad", vad_model_revision="v2.0.2", - punc_model="ct-punc-c", punc_model_revision="v2.0.2", + punc_model="ct-punc-c", punc_model_revision="v2.0.3", # spk_model="cam++", spk_model_revision="v2.0.2", ) res = model.generate(input=f"{model.model_path}/example/asr_example.wav", diff --git a/examples/industrial_data_pretraining/bicif_paraformer/demo.py b/examples/industrial_data_pretraining/bicif_paraformer/demo.py index a06b308d1..f1b1496e5 100644 --- a/examples/industrial_data_pretraining/bicif_paraformer/demo.py +++ b/examples/industrial_data_pretraining/bicif_paraformer/demo.py @@ -10,7 +10,7 @@ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", vad_model_revision="v2.0.2", punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", - punc_model_revision="v2.0.2", + punc_model_revision="v2.0.3", spk_model="damo/speech_campplus_sv_zh-cn_16k-common", spk_model_revision="v2.0.2", ) diff --git a/examples/industrial_data_pretraining/bicif_paraformer/infer.sh b/examples/industrial_data_pretraining/bicif_paraformer/infer.sh index 09e1c8393..55efdf2ce 100644 --- a/examples/industrial_data_pretraining/bicif_paraformer/infer.sh +++ b/examples/industrial_data_pretraining/bicif_paraformer/infer.sh @@ -4,7 +4,7 @@ model_revision="v2.0.2" vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" vad_model_revision="v2.0.2" punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" -punc_model_revision="v2.0.2" +punc_model_revision="v2.0.3" spk_model="damo/speech_campplus_sv_zh-cn_16k-common" spk_model_revision="v2.0.2" diff --git a/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py b/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py index b4453e927..e17a83168 100644 --- a/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py +++ b/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py @@ -10,7 +10,7 @@ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", vad_model_revision="v2.0.2", punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", - punc_model_revision="v2.0.2", + punc_model_revision="v2.0.3", spk_model="damo/speech_campplus_sv_zh-cn_16k-common", spk_model_revision="v2.0.2" ) diff --git a/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh b/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh index 98a325dd4..b8610cb70 100644 --- a/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh +++ b/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh @@ -4,7 +4,7 @@ model_revision="v2.0.2" vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" vad_model_revision="v2.0.2" punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" -punc_model_revision="v2.0.2" +punc_model_revision="v2.0.3" spk_model="damo/speech_campplus_sv_zh-cn_16k-common" spk_model_revision="v2.0.2" diff --git a/examples/industrial_data_pretraining/paraformer/demo.py b/examples/industrial_data_pretraining/paraformer/demo.py index 78af3aa1d..724191d9e 100644 --- a/examples/industrial_data_pretraining/paraformer/demo.py +++ b/examples/industrial_data_pretraining/paraformer/demo.py @@ -5,7 +5,12 @@ from funasr import AutoModel -model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.2") +model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.3", + # vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", + # vad_model_revision="v2.0.2", + # punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", + # punc_model_revision="v2.0.3", + ) res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav") print(res) diff --git a/examples/industrial_data_pretraining/seaco_paraformer/demo.py b/examples/industrial_data_pretraining/seaco_paraformer/demo.py index 19ad1c9c5..a2029564d 100644 --- a/examples/industrial_data_pretraining/seaco_paraformer/demo.py +++ b/examples/industrial_data_pretraining/seaco_paraformer/demo.py @@ -10,7 +10,7 @@ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", vad_model_revision="v2.0.2", punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", - punc_model_revision="v2.0.2", + punc_model_revision="v2.0.3", spk_model="damo/speech_campplus_sv_zh-cn_16k-common", spk_model_revision="v2.0.2", ) diff --git a/examples/industrial_data_pretraining/seaco_paraformer/infer.sh b/examples/industrial_data_pretraining/seaco_paraformer/infer.sh index 61029e114..f33568437 100644 --- a/examples/industrial_data_pretraining/seaco_paraformer/infer.sh +++ b/examples/industrial_data_pretraining/seaco_paraformer/infer.sh @@ -4,7 +4,7 @@ model_revision="v2.0.2" vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" vad_model_revision="v2.0.2" punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" -punc_model_revision="v2.0.2" +punc_model_revision="v2.0.3" python funasr/bin/inference.py \ +model=${model} \ diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 0538f6623..ca6189dd2 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -391,7 +391,7 @@ def inference_with_vad(self, input, input_len=None, **cfg): if self.punc_model is not None: self.punc_kwargs.update(cfg) punc_res = self.inference(result["text"], model=self.punc_model, kwargs=self.punc_kwargs, **cfg) - result["text_with_punc"] = punc_res[0]["text"] + result["text"] = punc_res[0]["text"] # speaker embedding cluster after resorted if self.spk_model is not None: diff --git a/funasr/models/paraformer/model.py b/funasr/models/paraformer/model.py index 468d23f39..0c4f14aae 100644 --- a/funasr/models/paraformer/model.py +++ b/funasr/models/paraformer/model.py @@ -451,7 +451,7 @@ def inference(self, self.nbest = kwargs.get("nbest", 1) meta_data = {} - if isinstance(data_in, torch.Tensor): # fbank + if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank": # fbank speech, speech_lengths = data_in, data_lengths if len(speech.shape) < 3: speech = speech[None, :, :]