Skip to content

Commit

Permalink
Merge pull request #1247 from alibaba-damo-academy/funasr1.0
Browse files Browse the repository at this point in the history
Funasr1.0
  • Loading branch information
R1ckShi authored Jan 15, 2024
2 parents 84465d6 + 1233c0d commit ddbc8b5
Show file tree
Hide file tree
Showing 29 changed files with 1,471 additions and 1,484 deletions.
21 changes: 12 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,15 @@ Notes: Support recognition of single audio file, as well as file list in Kaldi-s
### Speech Recognition (Non-streaming)
```python
from funasr import AutoModel

model = AutoModel(model="paraformer-zh")
# for the long duration wav, you could add vad model
# model = AutoModel(model="paraformer-zh", vad_model="fsmn-vad", punc_model="ct-punc")

res = model(input="asr_example_zh.wav", batch_size=64)
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
model = AutoModel(model="paraformer-zh", model_revision="v2.0.2", \
vad_model="fsmn-vad", vad_model_revision="v2.0.2", \
punc_model="ct-punc-c", punc_model_revision="v2.0.2", \
spk_model="cam++", spk_model_revision="v2.0.2")
res = model(input=f"{model.model_path}/example/asr_example.wav",
batch_size=16,
hotword='魔搭')
print(res)
```
Note: `model_hub` specifies the model repository to download from: `ms` selects ModelScope, and `hf` selects Hugging Face.
Expand All @@ -108,7 +111,7 @@ chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention

model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.0")
model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.2")

import soundfile
import os
Expand Down Expand Up @@ -163,7 +166,7 @@ for i in range(total_chunk_num):
```python
from funasr import AutoModel

model = AutoModel(model="ct-punc", model_revision="v2.0.1")
model = AutoModel(model="ct-punc", model_revision="v2.0.2")

res = model(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
Expand All @@ -172,7 +175,7 @@ print(res)
```python
from funasr import AutoModel

model = AutoModel(model="fa-zh", model_revision="v2.0.0")
model = AutoModel(model="fa-zh", model_revision="v2.0.2")

wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/asr_example.wav"
Expand Down
4 changes: 2 additions & 2 deletions examples/industrial_data_pretraining/paraformer/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@
fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2)

for batch_idx, fbank_dict in enumerate(fbanks):
res = model(**fbank_dict)
print(res)
res = model(**fbank_dict)
print(res)
6 changes: 4 additions & 2 deletions examples/industrial_data_pretraining/seaco_paraformer/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
vad_model_revision="v2.0.2",
punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
punc_model_revision="v2.0.2",
spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
spk_model_revision="v2.0.2",
)

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
hotword='达摩院 磨搭')
res = model(input=f"{model.model_path}/example/asr_example.wav",
hotword='达摩院 魔搭')
print(res)
10 changes: 2 additions & 8 deletions funasr/bin/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,12 +274,9 @@ def generate(self, input, input_len=None, model=None, kwargs=None, key=None, **c
def generate_with_vad(self, input, input_len=None, **cfg):

# step.1: compute the vad model
model = self.vad_model
kwargs = self.vad_kwargs
kwargs.update(cfg)
self.vad_kwargs.update(cfg)
beg_vad = time.time()
res = self.generate(input, input_len=input_len, model=model, kwargs=kwargs, **cfg)
vad_res = res
res = self.generate(input, input_len=input_len, model=self.vad_model, kwargs=self.vad_kwargs, **cfg)
end_vad = time.time()
print(f"time cost vad: {end_vad - beg_vad:0.3f}")

Expand Down Expand Up @@ -312,10 +309,7 @@ def generate_with_vad(self, input, input_len=None, **cfg):
if not len(sorted_data):
logging.info("decoding, utt: {}, empty speech".format(key))
continue


# if kwargs["device"] == "cpu":
# batch_size = 0
if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
batch_size = max(batch_size, sorted_data[0][0][1] - sorted_data[0][0][0])

Expand Down
Loading

0 comments on commit ddbc8b5

Please sign in to comment.