Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[datapipe] Support wenet datapipe #182

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions examples/hi_xiaowen/s0/conf/ds_tcn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ dataset_conf:
resample_conf:
resample_rate: 16000
speed_perturb: false
feature_extraction_conf:
feature_type: 'fbank'
feats_type: 'fbank'
fbank_conf:
num_mel_bins: 40
frame_shift: 10
frame_length: 25
Expand All @@ -20,6 +20,7 @@ dataset_conf:
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: false
batch_conf:
batch_size: 256

Expand Down
11 changes: 8 additions & 3 deletions examples/hi_xiaowen/s0/conf/ds_tcn_ctc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@ dataset_conf:
filter_conf:
max_length: 2048
min_length: 0
token_max_length: 200
token_min_length: 1
max_output_input_ratio: 1
min_output_input_ratio: 0.0005
resample_conf:
resample_rate: 16000
speed_perturb: false
feature_extraction_conf:
feature_type: 'fbank'
feats_type: 'fbank'
fbank_conf:
num_mel_bins: 40
frame_shift: 10
frame_length: 25
Expand All @@ -20,8 +24,9 @@ dataset_conf:
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: false
batch_conf:
batch_size: 256
batch_size: 64

model:
hidden_dim: 256
Expand Down
13 changes: 10 additions & 3 deletions examples/hi_xiaowen/s0/local/prepare_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,29 @@ def main():
type=str,
help='dir containing all the wav files')
parser.add_argument('path', type=str, help='path to the json file')
parser.add_argument('dict', type=str, help='path to the dict file')
parser.add_argument('out_dir', type=str, help='out dir')
args = parser.parse_args()

id2token = {}
with open(args.dict, 'r', encoding='utf-8') as f:
for line in f:
token, idx = line.strip().split()
id2token[int(idx)] = token

with open(args.path, 'r', encoding='utf-8') as f:
data = json.load(f)
utt_id, label = [], []
utt_id, text = [], []
for entry in data:
utt_id.append(entry['utt_id'])
label.append(int(entry['keyword_id']))
text.append(id2token[entry['keyword_id']])

abs_dir = os.path.abspath(args.wav_dir)
wav_path = os.path.join(args.out_dir, 'wav.scp')
text_path = os.path.join(args.out_dir, 'text')
with open(wav_path, 'w', encoding='utf-8') as f_wav, \
open(text_path, 'w', encoding='utf-8') as f_text:
for utt, l in zip(utt_id, label):
for utt, l in zip(utt_id, text):
f_wav.write('{} {}\n'.format(utt,
os.path.join(abs_dir, utt + ".wav")))
f_text.write('{} {}\n'.format(utt, l))
Expand Down
21 changes: 14 additions & 7 deletions examples/hi_xiaowen/s0/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,18 @@ fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Preparing datasets..."
mkdir -p dict
echo "<filler> -1" > dict/words.txt
echo "Hi_Xiaowen 0" >> dict/words.txt
echo "Nihao_Wenwen 1" >> dict/words.txt
echo "<FILLER> -1" > dict/dict.txt
echo "<HI_XIAOWEN> 0" >> dict/dict.txt
echo "<NIHAO_WENWEN> 1" >> dict/dict.txt
awk '{print $1}' dict/dict.txt > dict/words.txt

for folder in train dev test; do
mkdir -p data/$folder
for prefix in p n; do
mkdir -p data/${prefix}_$folder
json_path=$download_dir/mobvoi_hotword_dataset_resources/${prefix}_$folder.json
local/prepare_data.py $download_dir/mobvoi_hotword_dataset $json_path \
data/${prefix}_$folder
dict/dict.txt data/${prefix}_$folder
done
cat data/p_$folder/wav.scp data/n_$folder/wav.scp > data/$folder/wav.scp
cat data/p_$folder/text data/n_$folder/text > data/$folder/text
Expand Down Expand Up @@ -82,6 +83,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--num_keywords $num_keywords \
--min_duration 50 \
--seed 666 \
--dict ./dict \
$cmvn_opts \
${checkpoint:+--checkpoint $checkpoint}
fi
Expand All @@ -101,10 +103,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--gpu 0 \
--batch_size 256 \
--checkpoint $score_checkpoint \
--score_file $result_dir/score.txt \
--score_file $result_dir/score.txt \
--dict ./dict \
--num_workers 8

for keyword in 0 1; do
for keyword in `tail -n +2 dict/words.txt`; do
python wekws/bin/compute_det.py \
--keyword $keyword \
--test_data data/test/data.list \
Expand All @@ -115,8 +118,12 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

# plot det curve
python wekws/bin/plot_det_curve.py \
--keywords_dict dict/words.txt \
--keywords_dict dict/dict.txt \
--stats_dir $result_dir \
--xlim 2 \
--x_step 1 \
--ylim 5 \
--y_step 1 \
--figure_file $result_dir/det.png
fi

Expand Down
17 changes: 8 additions & 9 deletions examples/hi_xiaowen/s0/run_ctc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,18 @@ fi
if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then
echo "Preparing datasets..."
mkdir -p dict
echo "<filler> -1" > dict/words.txt
echo "Hi_Xiaowen 0" >> dict/words.txt
echo "Nihao_Wenwen 1" >> dict/words.txt
echo "<FILLER> -1" > dict/dict.txt
echo "<HI_XIAOWEN> 0" >> dict/dict.txt
echo "<NIHAO_WENWEN> 1" >> dict/dict.txt
awk '{print $1}' dict/dict.txt > dict/words.txt

for folder in train dev test; do
mkdir -p data/$folder
for prefix in p n; do
mkdir -p data/${prefix}_$folder
json_path=$download_dir/mobvoi_hotword_dataset_resources/${prefix}_$folder.json
local/prepare_data.py $download_dir/mobvoi_hotword_dataset $json_path \
data/${prefix}_$folder
dict/dict.txt data/${prefix}_$folder
done
cat data/p_$folder/wav.scp data/n_$folder/wav.scp > data/$folder/wav.scp
cat data/p_$folder/text data/n_$folder/text > data/$folder/text
Expand All @@ -74,8 +75,8 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then

# and we also copy the tokens and lexicon that are used in
# https://modelscope.cn/models/damo/speech_charctc_kws_phone-xiaoyun/summary
cp mobvoi_kws_transcription/tokens.txt data/tokens.txt
cp mobvoi_kws_transcription/lexicon.txt data/lexicon.txt
awk '{print $1, $2-1}' mobvoi_kws_transcription/tokens.txt > dict/dict.txt
echo '<SILENCE>' > dict/words.txt

fi

Expand All @@ -90,9 +91,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then

# Here we use tokens.txt and lexicon.txt to convert the text into indices
tools/make_list.py data/$x/wav.scp data/$x/text \
data/$x/wav.dur data/$x/data.list \
--token_file data/tokens.txt \
--lexicon_file data/lexicon.txt
data/$x/wav.dur data/$x/data.list
done
fi

Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ pyflakes==2.2.0
lmdb
scipy
tqdm
langid
pypinyin
5 changes: 2 additions & 3 deletions tools/compute_cmvn_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,9 @@ def __getitem__(self, idx):

with open(args.train_config, 'r') as fin:
configs = yaml.load(fin, Loader=yaml.FullLoader)
feat_dim = configs['dataset_conf']['feature_extraction_conf'][
feat_type = configs['dataset_conf']['feats_type']
feat_dim = configs['dataset_conf'][f'{feat_type}_conf'][
'num_mel_bins']
feat_type = configs['dataset_conf']['feature_extraction_conf'][
'feature_type']
resample_rate = 0
if 'resample_conf' in configs['dataset_conf']:
resample_rate = configs['dataset_conf']['resample_conf'][
Expand Down
Loading