-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathsplit.py
51 lines (42 loc) · 1.73 KB
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import subprocess
import random
import os
from pathlib import Path
import librosa
from scipy.io import wavfile
import numpy as np
import torch
import csv
import whisper
import argparse
a="character"
def split_long_audio(model, filepaths, save_dir="data_dir", out_sr=22050):
if isinstance(filepaths, str):
filepaths = [filepaths]
for file_idx, filepath in enumerate(filepaths):
save_path = Path(save_dir)
save_path.mkdir(exist_ok=True, parents=True)
print(f"Transcribing file {file_idx}: '{filepath}' to segments...")
result = model.transcribe(filepath, word_timestamps=True, task="transcribe", beam_size=5, best_of=5)
segments = result['segments']
wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None, mono=True)
wav, _ = librosa.effects.trim(wav, top_db=20)
peak = np.abs(wav).max()
if peak > 1.0:
wav = 0.98 * wav / peak
wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)
wav2 /= max(wav2.max(), -wav2.min())
for i, seg in enumerate(segments):
start_time = seg['start']
end_time = seg['end']
wav_seg = wav2[int(start_time * out_sr):int(end_time * out_sr)]
wav_seg_name = f"{a}_{i}.wav"
out_fpath = save_path / wav_seg_name
wavfile.write(out_fpath, rate=out_sr, data=(wav_seg * np.iinfo(np.int16).max).astype(np.int16))
whisper_size = "large"
whisper_model = whisper.load_model(whisper_size)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--filename", default="linghua.wav")
args = parser.parse_args()
split_long_audio(whisper_model, "./" + args.filename, "./custom_character_voice/character/")