-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdicti.py
109 lines (92 loc) · 4.04 KB
/
dicti.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os, sys
import pandas
from gtts import gTTS
import requests
import urllib
import json
import base64
import sox
import re
import subprocess
################################################################################
## file paths and config
################################################################################
# Directory prefix under which the merged mp3 files are written.
audio = "glagoli/pravilni/"
# Directory prefix of the per-language source clips that get merged.
audio_temp = "glagoli/pravilni/_source/"
# wordslist = "3001.csv"
wordslist = "glagoli/help.csv"
# Name of the progress file holding the index of the last processed word.
last_word_order = "last"
## read the stored index from the progress file
# `with` closes the handle even when int() rejects the file contents.
with open(last_word_order, "r") as last:
    latest = int(last.read())
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("===")
print("starting at: " + str(latest))
print("===")
################################################################################
## get text to speech croatian website cookies
################################################################################
session = requests.Session()
# response = session.get('https://alfanum.co.rs/index.php/sr/demonstracija/demonstracija-tts')
# # print(session.cookies.get_dict())
# NOTE: the priming GET above is commented out, so no request has been made on
# this session when its cookie jar is copied into a plain dict here.
cookies = session.cookies.get_dict()
################################################################################
## get english, serbian and/or croatian audio function
################################################################################
# Getting the mp3 is pretty straightforward, but those mp3 files seem to lack some characteristics that are important for merging.
def getEnglishAudio(word, tempname):
    """Return the path of the English clip belonging to *tempname*.

    The gTTS synthesis step is commented out below, so nothing is generated
    here and *word* is not used; the function only builds
    ``audio_temp + tempname + "_en.mp3"``.
    """
    # tts = gTTS(word)
    # tts.save(audio_temp+tempname+"_en.mp3")
    clip_path = "".join((audio_temp, tempname, "_en.mp3"))
    return clip_path
# This website doesn't monitor requests, so I was able to hit the script constantly.
def getCroatianAudio(word, tempname):
    """Return the path of the "_ba" clip belonging to *tempname*.

    The request to the hsm360.com text-to-speech endpoint is commented out
    below, so nothing is downloaded here and *word* is not used; the function
    only builds ``audio_temp + tempname + "_ba.mp3"``.
    """
    # response = requests.post("https://www.hsm360.com/wp-content/plugins/hsm-screen-reader/lib/tts_req.php",data={
    # 'input_text': word,
    # 'rate':'0.995',
    # 'pitch':'0.65',
    # })
    # data = response.json()
    # fileurl = data["file_url"]
    # urllib.urlretrieve (fileurl, audio_temp+tempname+"_ba.mp3")
    clip_path = "".join((audio_temp, tempname, "_ba.mp3"))
    return clip_path
# This website has request controls.
def getSerbianAudio(word, tempname, timeout=30):
    """Synthesize *word* through the AlfaNum TTS endpoint and save the mp3.

    Args:
        word: text sent as ``input_text`` to the TTS request.
        tempname: base name of the clip; the file is stored as
            ``audio_temp + tempname + "_ba.mp3"``.
        timeout: seconds to wait on each HTTP request before giving up.

    Returns:
        The path of the saved mp3 file.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-success HTTP status from either request.
        KeyError: if the JSON reply carries no ``"file"`` entry.
    """
    target = audio_temp + tempname + "_ba.mp3"
    form = {
        'input_text': word,
        'outlang': 'sr',
        'speaker': 'AlfaNum Ivana',
        'rate': '0.9995',
        'pitch': '0.875',
        'port': '5040',
        'enc': '1',
        'address': 'tts4.alfanum.co.rs',
        'server_id': '0',
    }
    response = requests.post(
        "https://www.alfanum.co.rs/tts_req.php",
        data=form,
        cookies=cookies,
        timeout=timeout,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    data = response.json()
    fileurl = "https://tts4.alfanum.co.rs:5050/ttsnovi/" + data["file"]
    # Download with requests rather than urllib.urlretrieve: the Python 2
    # urlretrieve stores an HTTP error page as the "mp3" without raising.
    download = requests.get(fileurl, timeout=timeout)
    download.raise_for_status()
    with open(target, "wb") as handle:
        handle.write(download.content)
    return target
################################################################################
## generate combined audio with pause
################################################################################
# Module-level sox Combiner instance; the merge function below shells out to
# ffmpeg and nothing in the visible code refers to `cbn`.
cbn = sox.Combiner()
def generateCombinedAudio(file1, file2, name):
    """Concatenate two audio files into ``audio + name + '.mp3'`` using ffmpeg.

    Args:
        file1: path of the clip placed first in the merged file.
        file2: path of the clip placed second in the merged file.
        name: base name (without extension) of the merged file, written under
            the module-level ``audio`` directory prefix.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        OSError: if the ffmpeg executable cannot be started.
    """
    # Build the target from the `name` argument; the previous code read the
    # caller's global `finalname`, which only worked by coincidence of naming.
    target = audio + name + '.mp3'
    print("------------------------")
    print(file1 + " & " + file2 + " => " + name + '.mp3')
    print("------------------------")
    # Pass an argument list (no shell) so paths containing spaces, quotes or
    # shell metacharacters reach ffmpeg verbatim.
    command = [
        'ffmpeg',
        '-i', file1,
        '-i', file2,
        '-filter_complex', '[0:a:0][1:a:0]concat=n=2:v=0:a=1[outa]',
        '-map', '[outa]',
        target,
    ]
    subprocess.check_output(command)
################################################################################
## loop the list of words in csv
################################################################################
# Load the word list; the loop below reads its 'english' and 'bosnian' columns.
wordslistfile = pandas.read_csv(wordslist, skipinitialspace=True, quotechar='"')
for index, row in wordslistfile.iterrows():
    # if index > latest:
    if index < 199:
        file1 = getEnglishAudio(row['english'].strip(), str(index))
        file2 = getCroatianAudio(row['bosnian'], str(index))
        # File name: first comma-separated variant of each word, spaces turned
        # into underscores (apostrophes dropped from the English part only).
        english_part = row['english'].split(',')[0].strip().replace(" ", "_").replace("'", "")
        bosnian_part = row['bosnian'].split(',')[0].strip().replace(" ", "_")
        finalname = english_part + '-' + bosnian_part
        generateCombinedAudio(file1, file2, finalname)
        # Persist the index of the row just processed; `with` guarantees the
        # progress file is flushed and closed even if the write fails.
        with open(last_word_order, "w") as last:
            last.write(str(index))
print(" < vedran alajbegovic - [email protected] > 2019 SARAJEVO, BOSNA I HERCEGOVINA")