Skip to content

Commit

Permalink
Merge pull request #1250 from alibaba-damo-academy/funasr1.0
Browse files Browse the repository at this point in the history
Funasr1.0
  • Loading branch information
R1ckShi authored Jan 16, 2024
2 parents 296d73f + b7cb19b commit eba1fcc
Show file tree
Hide file tree
Showing 20 changed files with 581 additions and 519 deletions.
24 changes: 15 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ model = AutoModel(model="paraformer-zh", model_revision="v2.0.2", \
vad_model="fsmn-vad", vad_model_revision="v2.0.2", \
punc_model="ct-punc-c", punc_model_revision="v2.0.2", \
spk_model="cam++", spk_model_revision="v2.0.2")
res = model(input=f"{model.model_path}/example/asr_example.wav",
batch_size=64,
hotword='魔搭')
res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
batch_size=64,
hotword='魔搭')
print(res)
```
Note: `model_hub`: represents the model repository, `ms` stands for selecting ModelScope download, `hf` stands for selecting Huggingface download.
Expand All @@ -124,7 +124,7 @@ total_chunk_num = int(len((speech)-1)/chunk_stride+1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
print(res)
```
Note: `chunk_size` is the configuration for streaming latency. `[0,10,5]` indicates that the real-time display granularity is `10*60=600ms`, and the lookahead information is `5*60=300ms`. Each inference input is `600ms` (sample points are `16000*0.6=960`), and the output is the corresponding text. For the last speech segment input, `is_final=True` needs to be set to force the output of the last word.
Expand All @@ -135,7 +135,7 @@ from funasr import AutoModel

model = AutoModel(model="fsmn-vad", model_revision="v2.0.2")
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model(input=wav_file)
res = model.generate(input=wav_file)
print(res)
```
### Voice Activity Detection (Non-streaming)
Expand All @@ -156,7 +156,7 @@ total_chunk_num = int(len((speech)-1)/chunk_stride+1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
if len(res[0]["value"]):
print(res)
```
Expand All @@ -165,7 +165,7 @@ for i in range(total_chunk_num):
from funasr import AutoModel

model = AutoModel(model="ct-punc", model_revision="v2.0.2")
res = model(input="那今天的会就到这里吧 happy new year 明年见")
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
```
### Timestamp Prediction
Expand All @@ -175,7 +175,7 @@ from funasr import AutoModel
model = AutoModel(model="fa-zh", model_revision="v2.0.2")
wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"
res = model(input=(wav_file, text_file), data_type=("sound", "text"))
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
print(res)
```
[//]: # (FunASR supports inference and fine-tuning of models trained on industrial datasets of tens of thousands of hours. For more details, please refer to ([modelscope_egs](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/quick_start.html)). It also supports training and fine-tuning of models on academic standard datasets. For more details, please refer to([egs](https://alibaba-damo-academy.github.io/FunASR/en/academic_recipe/asr_recipe.html)). The models include speech recognition (ASR), speech activity detection (VAD), punctuation recovery, language model, speaker verification, speaker separation, and multi-party conversation speech recognition. For a detailed list of models, please refer to the [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md):)
Expand Down Expand Up @@ -229,10 +229,16 @@ The use of pretraining model is subject to [model license](./MODEL_LICENSE)
}
@inproceedings{gao22b_interspeech,
author={Zhifu Gao and ShiLiang Zhang and Ian McLoughlin and Zhijie Yan},
title={{Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition}},
title={Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition},
year=2022,
booktitle={Proc. Interspeech 2022},
pages={2063--2067},
doi={10.21437/Interspeech.2022-9996}
}
@inproceedings{shi2023seaco,
author={Xian Shi and Yexin Yang and Zerui Li and Yanni Chen and Zhifu Gao and Shiliang Zhang},
title={SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and Effective Hotword Customization Ability},
year={2023},
booktitle={ICASSP2024}
}
```
18 changes: 12 additions & 6 deletions README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ model = AutoModel(model="paraformer-zh", model_revision="v2.0.2", \
vad_model="fsmn-vad", vad_model_revision="v2.0.2", \
punc_model="ct-punc-c", punc_model_revision="v2.0.2", \
spk_model="cam++", spk_model_revision="v2.0.2")
res = model(input=f"{model.model_path}/example/asr_example.wav",
res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
batch_size=64,
hotword='魔搭')
print(res)
Expand Down Expand Up @@ -121,7 +121,7 @@ total_chunk_num = int(len((speech)-1)/chunk_stride+1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
print(res)
```

Expand All @@ -134,7 +134,7 @@ from funasr import AutoModel
model = AutoModel(model="fsmn-vad", model_revision="v2.0.2")

wav_file = f"{model.model_path}/example/asr_example.wav"
res = model(input=wav_file)
res = model.generate(input=wav_file)
print(res)
```

Expand All @@ -156,7 +156,7 @@ total_chunk_num = int(len((speech)-1)/chunk_stride+1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
if len(res[0]["value"]):
print(res)
```
Expand All @@ -167,7 +167,7 @@ from funasr import AutoModel

model = AutoModel(model="ct-punc", model_revision="v2.0.2")

res = model(input="那今天的会就到这里吧 happy new year 明年见")
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
```

Expand All @@ -179,7 +179,7 @@ model = AutoModel(model="fa-zh", model_revision="v2.0.0")

wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"
res = model(input=(wav_file, text_file), data_type=("sound", "text"))
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
print(res)
```
更多详细用法（[示例](examples/industrial_data_pretraining)）
Expand Down Expand Up @@ -242,4 +242,10 @@ FunASR支持预训练或者进一步微调的模型进行服务部署。目前
pages={2063--2067},
doi={10.21437/Interspeech.2022-9996}
}
@article{shi2023seaco,
author={Xian Shi and Yexin Yang and Zerui Li and Yanni Chen and Zhifu Gao and Shiliang Zhang},
title={{SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and Effective Hotword Customization Ability}},
year=2023,
  journal={arXiv preprint arXiv:2308.03266 (accepted by ICASSP 2024)},
}
```
16 changes: 8 additions & 8 deletions examples/industrial_data_pretraining/bicif_paraformer/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
from funasr import AutoModel

model = AutoModel(model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
model_revision="v2.0.2",
vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
vad_model_revision="v2.0.2",
punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
punc_model_revision="v2.0.2",
spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
spk_model_revision="v2.0.2",
model_revision="v2.0.2",
vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
vad_model_revision="v2.0.2",
punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
punc_model_revision="v2.0.2",
spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
spk_model_revision="v2.0.2",
)

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav", batch_size_s=300, batch_size_threshold_s=60)
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav", batch_size_s=300, batch_size_threshold_s=60)
print(res)
2 changes: 1 addition & 1 deletion examples/industrial_data_pretraining/campplus_sv/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@
model_revision="v2.0.2",
)

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
print(res)
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@

model = AutoModel(model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404", model_revision="v2.0.2")

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
hotword='达摩院 魔搭')
print(res)
4 changes: 2 additions & 2 deletions examples/industrial_data_pretraining/ct_transformer/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@

model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", model_revision="v2.0.2")

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
print(res)


from funasr import AutoModel

model = AutoModel(model="damo/punc_ct-transformer_cn-en-common-vocab471067-large", model_revision="v2.0.2")

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
print(res)
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
rec_result_all = "outputs: "
cache = {}
for vad in vads:
rec_result = model(input=vad, cache=cache)
rec_result = model.generate(input=vad, cache=cache)
print(rec_result)
rec_result_all += rec_result[0]['text']

Expand Down
2 changes: 1 addition & 1 deletion examples/industrial_data_pretraining/emotion2vec/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@

model = AutoModel(model="damo/emotion2vec_base", model_revision="v2.0.1")

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", output_dir="./outputs")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", output_dir="./outputs")
print(res)
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
chunk_size = 60000 # ms
model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.2")

res = model(input=wav_file, chunk_size=chunk_size, )
res = model.generate(input=wav_file, chunk_size=chunk_size, )
print(res)


Expand All @@ -28,7 +28,7 @@
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model(input=speech_chunk,
res = model.generate(input=speech_chunk,
cache=cache,
is_final=is_final,
chunk_size=chunk_size,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

model = AutoModel(model="damo/speech_timestamp_prediction-v1-16k-offline", model_revision="v2.0.2")

res = model(input=("https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
res = model.generate(input=("https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
"欢迎大家来到魔搭社区进行体验"),
data_type=("sound", "text"),
batch_size=2,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
spk_model_revision="v2.0.2"
)

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
hotword='达摩院 磨搭')
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
hotword='达摩院 磨搭')
print(res)
4 changes: 2 additions & 2 deletions examples/industrial_data_pretraining/paraformer/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.2")

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
print(res)


Expand All @@ -18,5 +18,5 @@
fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2)

for batch_idx, fbank_dict in enumerate(fbanks):
res = model(**fbank_dict)
res = model.generate(**fbank_dict)
print(res)
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online", model_revision="v2.0.2")
cache = {}
res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
chunk_size=chunk_size,
encoder_chunk_look_back=encoder_chunk_look_back,
decoder_chunk_look_back=decoder_chunk_look_back,
Expand All @@ -32,11 +32,11 @@
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model(input=speech_chunk,
cache=cache,
is_final=is_final,
chunk_size=chunk_size,
encoder_chunk_look_back=encoder_chunk_look_back,
decoder_chunk_look_back=decoder_chunk_look_back,
)
res = model.generate(input=speech_chunk,
cache=cache,
is_final=is_final,
chunk_size=chunk_size,
encoder_chunk_look_back=encoder_chunk_look_back,
decoder_chunk_look_back=decoder_chunk_look_back,
)
print(res)
4 changes: 2 additions & 2 deletions examples/industrial_data_pretraining/seaco_paraformer/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
spk_model_revision="v2.0.2",
)

res = model(input=f"{model.model_path}/example/asr_example.wav",
hotword='达摩院 魔搭')
res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
hotword='达摩院 魔搭')
print(res)
3 changes: 2 additions & 1 deletion funasr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@ def import_submodules(package, recursive=True):

import_submodules(__name__)

from funasr.bin.inference import AutoModel, AutoFrontend
from funasr.auto.auto_model import AutoModel
from funasr.auto.auto_frontend import AutoFrontend
Empty file added funasr/auto/__init__.py
Empty file.
Loading

0 comments on commit eba1fcc

Please sign in to comment.