Merge pull request #1247 from alibaba-damo-academy/funasr1.0

Funasr1.0
modelscope · Jan 15, 2024 · ddbc8b5 · ddbc8b5
2 parents 84465d6 + 1233c0d
commit ddbc8b5
Show file tree

Hide file tree

Showing 29 changed files with 1,471 additions and 1,484 deletions.
diff --git a/README.md b/README.md
@@ -90,12 +90,15 @@ Notes: Support recognition of single audio file, as well as file list in Kaldi-s
 ### Speech Recognition (Non-streaming)
 ```python
 from funasr import AutoModel
-
-model = AutoModel(model="paraformer-zh")
-# for the long duration wav, you could add vad model
-# model = AutoModel(model="paraformer-zh", vad_model="fsmn-vad", punc_model="ct-punc")
-
-res = model(input="asr_example_zh.wav", batch_size=64)
+# paraformer-zh is a multi-functional ASR model;
+# enable the VAD, punctuation, and speaker models only as needed
+model = AutoModel(model="paraformer-zh", model_revision="v2.0.2", \
+                  vad_model="fsmn-vad", vad_model_revision="v2.0.2", \
+                  punc_model="ct-punc-c", punc_model_revision="v2.0.2", \
+                  spk_model="cam++", spk_model_revision="v2.0.2")
+res = model(input=f"{model.model_path}/example/asr_example.wav", 
+            batch_size=16, 
+            hotword='魔搭')
 print(res)
 ```
 Note: `model_hub` selects the model repository: `ms` downloads from ModelScope, and `hf` downloads from Hugging Face.
@@ -108,7 +111,7 @@ chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
 encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
 decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
 
-model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.0")
+model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.2")
 
 import soundfile
 import os
@@ -163,7 +166,7 @@ for i in range(total_chunk_num):
 ```python
 from funasr import AutoModel
 
-model = AutoModel(model="ct-punc", model_revision="v2.0.1")
+model = AutoModel(model="ct-punc", model_revision="v2.0.2")
 
 res = model(input="那今天的会就到这里吧 happy new year 明年见")
 print(res)
@@ -172,7 +175,7 @@ print(res)
 ```python
 from funasr import AutoModel
 
-model = AutoModel(model="fa-zh", model_revision="v2.0.0")
+model = AutoModel(model="fa-zh", model_revision="v2.0.2")
 
 wav_file = f"{model.model_path}/example/asr_example.wav"
 text_file = f"{model.model_path}/example/asr_example.wav"

diff --git a/examples/industrial_data_pretraining/paraformer/demo.py b/examples/industrial_data_pretraining/paraformer/demo.py
@@ -18,5 +18,5 @@
 fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2)
 
 for batch_idx, fbank_dict in enumerate(fbanks):
-	res = model(**fbank_dict)
-	print(res)
+    res = model(**fbank_dict)
+    print(res)
diff --git a/examples/industrial_data_pretraining/seaco_paraformer/demo.py b/examples/industrial_data_pretraining/seaco_paraformer/demo.py
@@ -11,8 +11,10 @@
                   vad_model_revision="v2.0.2",
                   punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                   punc_model_revision="v2.0.2",
+                  spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
+                  spk_model_revision="v2.0.2",
                   )
 
-res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
-            hotword='达摩院 磨搭')
+res = model(input=f"{model.model_path}/example/asr_example.wav",
+            hotword='达摩院 魔搭')
 print(res)
diff --git a/funasr/bin/inference.py b/funasr/bin/inference.py
@@ -274,12 +274,9 @@ def generate(self, input, input_len=None, model=None, kwargs=None, key=None, **c
     def generate_with_vad(self, input, input_len=None, **cfg):
 
         # step 1: run the VAD model
-        model = self.vad_model
-        kwargs = self.vad_kwargs
-        kwargs.update(cfg)
+        self.vad_kwargs.update(cfg)
         beg_vad = time.time()
-        res = self.generate(input, input_len=input_len, model=model, kwargs=kwargs, **cfg)
-        vad_res = res
+        res = self.generate(input, input_len=input_len, model=self.vad_model, kwargs=self.vad_kwargs, **cfg)
         end_vad = time.time()
         print(f"time cost vad: {end_vad - beg_vad:0.3f}")
 
@@ -312,10 +309,7 @@ def generate_with_vad(self, input, input_len=None, **cfg):
             if not len(sorted_data):
                 logging.info("decoding, utt: {}, empty speech".format(key))
                 continue
-
 
-            # if kwargs["device"] == "cpu":
-            #     batch_size = 0
             if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
                 batch_size = max(batch_size, sorted_data[0][0][1] - sorted_data[0][0][0])