Showing
13 changed files
with
582 additions
and
0 deletions
PKG-INFO
0 → 100644
1 | +Metadata-Version: 1.1 | ||
2 | +Name: etaloncorpuscreator | ||
3 | +Version: 0.1 | ||
4 | +Summary: command-line package for automatical creation of russian language audio corpus from YouTube audiotracks and subtitles with using forced alignment by sphinx3 | ||
5 | +Home-page: https://github.com/dangrebenkin/audiocorpusbuilder | ||
6 | +Author: Daniel Grebenkin | ||
7 | +Author-email: d.grebenkin@g.nsu.ru | ||
8 | +License: Apache License Version 2.0 | ||
9 | +Description: UNKNOWN | ||
10 | +Keywords: dataset,librosa,youtube-dl,youtube,forced alignment,sphinx,sphinx3 | ||
11 | +Platform: Linux | ||
12 | +Classifier: Development Status :: 3 - Alpha | ||
13 | +Classifier: Intended Audience :: Science/Research | ||
14 | +Classifier: Intended Audience :: Developers | ||
15 | +Classifier: Topic :: Software Development | ||
16 | +Classifier: Topic :: Scientific/Engineering | ||
17 | +Classifier: License :: OSI Approved :: Apache Software License | ||
18 | +Classifier: Programming Language :: Python :: 3.6 | ||
19 | +Classifier: Programming Language :: Python :: 3.7 | ||
20 | +Classifier: Programming Language :: Python :: 3.8 |
README.md
0 → 100644
1 | +# About | ||
2 | + | ||
3 | +The etaloncorpuscreator package automatically creates a Russian-language audio corpus from YouTube playlists: it downloads each video's audio track and subtitles, builds "sound-text" pairs, performs forced alignment, and saves the new corpus together with the alignment scores. | ||
4 | + | ||
5 | +# Installing | ||
6 | + | ||
7 | +For installation you need Python 3.6 or later, a Linux OS and sphinx3 on your local machine. | ||
8 | + | ||
9 | +# Start | ||
10 | + | ||
11 | +To run etaloncorpuscreator you should prepare directories for audiotracks, subtitles and results. You also need to create playlists.txt with the playlists' links, each link on a separate line. | ||
12 | + | ||
13 | +# Arguments | ||
14 | + | ||
15 | +All arguments are required for program use. | ||
16 | + | ||
17 | +1. -p URL_list | ||
18 | + | ||
19 | +Playlists txt-file path. | ||
20 | + | ||
21 | +2. -a directory_audio | ||
22 | + | ||
23 | +Path to download audiotracks. | ||
24 | + | ||
25 | +3. -s directory_subtitles | ||
26 | + | ||
27 | +Path to download subtitles. | ||
28 | + | ||
29 | +4. -r directory_results | ||
30 | + | ||
31 | +Path for audio results. | ||
32 | + | ||
33 | +5. -am sphinx_model_path | ||
34 | + | ||
35 | +Your acoustic model path. | ||
36 | + | ||
37 | +6. -dict dictionary_path | ||
38 | + | ||
39 | +Your dictionary path. | ||
40 | + | ||
41 | +7. -dict_f dictionary_filler_path | ||
42 | + | ||
43 | +Your dictionary filler path. | ||
44 | + | ||
45 | +8. -ar directory_alignment_results | ||
46 | + | ||
47 | +Path for alignment results. | ||
48 | + | ||
49 | +# Usage | ||
50 | + | ||
51 | +eccr [-p URL_list] [-a directory_audio] [-s directory_subtitles] [-r directory_results] [-am sphinx_model_path] [-dict dictionary_path] [-dict_f dictionary_filler_path] [-ar directory_alignment_results] | ||
52 | + | ||
53 | +# Example | ||
54 | + | ||
55 | +eccr -p playlists.txt -a Audio -s Subs -r Results -am ./voxforge_ru_sphinx/model_parameters/voxforge_ru.cd_cont_200 -dict ./voxforge_ru_sphinx/voxforge_ru.dic -dict_f ./voxforge_ru_sphinx/voxforge_ru.filler -ar Alignment |
etaloncorpuscreator.egg-info/PKG-INFO
0 → 100644
1 | +Metadata-Version: 1.1 | ||
2 | +Name: etaloncorpuscreator | ||
3 | +Version: 0.1 | ||
4 | +Summary: command-line package for automatical creation of russian language audio corpus from YouTube audiotracks and subtitles with using forced alignment by sphinx3 | ||
5 | +Home-page: https://github.com/dangrebenkin/audiocorpusbuilder | ||
6 | +Author: Daniel Grebenkin | ||
7 | +Author-email: d.grebenkin@g.nsu.ru | ||
8 | +License: Apache License Version 2.0 | ||
9 | +Description: UNKNOWN | ||
10 | +Keywords: dataset,librosa,youtube-dl,youtube,forced alignment,sphinx,sphinx3 | ||
11 | +Platform: Linux | ||
12 | +Classifier: Development Status :: 3 - Alpha | ||
13 | +Classifier: Intended Audience :: Science/Research | ||
14 | +Classifier: Intended Audience :: Developers | ||
15 | +Classifier: Topic :: Software Development | ||
16 | +Classifier: Topic :: Scientific/Engineering | ||
17 | +Classifier: License :: OSI Approved :: Apache Software License | ||
18 | +Classifier: Programming Language :: Python :: 3.6 | ||
19 | +Classifier: Programming Language :: Python :: 3.7 | ||
20 | +Classifier: Programming Language :: Python :: 3.8 |
etaloncorpuscreator.egg-info/SOURCES.txt
0 → 100644
1 | +README.md | ||
2 | +setup.cfg | ||
3 | +setup.py | ||
4 | +etaloncorpuscreator/__init__.py | ||
5 | +etaloncorpuscreator/__main__.py | ||
6 | +etaloncorpuscreator/corpus_creator.py | ||
7 | +etaloncorpuscreator.egg-info/PKG-INFO | ||
8 | +etaloncorpuscreator.egg-info/SOURCES.txt | ||
9 | +etaloncorpuscreator.egg-info/dependency_links.txt | ||
10 | +etaloncorpuscreator.egg-info/entry_points.txt | ||
11 | +etaloncorpuscreator.egg-info/requires.txt | ||
12 | +etaloncorpuscreator.egg-info/top_level.txt |
1 | + |
etaloncorpuscreator.egg-info/requires.txt
0 → 100644
etaloncorpuscreator.egg-info/top_level.txt
0 → 100644
1 | +etaloncorpuscreator |
etaloncorpuscreator/__init__.py
0 → 100644
etaloncorpuscreator/__main__.py
0 → 100644
etaloncorpuscreator/corpus_creator.py
0 → 100644
1 | +import os | ||
2 | +import re | ||
3 | +import shutil | ||
4 | +import codecs | ||
5 | +import pandas | ||
6 | +import librosa | ||
7 | +import argparse | ||
8 | +import subprocess | ||
9 | +from datetime import datetime | ||
10 | + | ||
# audiocorpusbuilder

# Module-level state shared by the pipeline functions below.
results = []          # raw sphinx3_align output lines (contents of f_ali.out)
subtitles = []        # subtitle file names still waiting to be processed
wavs = []             # downloaded wav file names still waiting to be processed
subtitles_file = []   # normalized subtitle texts for the wav currently being sliced
startpoints =[]       # cue start offsets in seconds, parallel to subtitles_file
finishpoints =[]      # cue end offsets in seconds, parallel to subtitles_file
filenamecounter = 1   # running index used to name the sliced wav/txt pairs
counter = 1           # per-wav progress counter printed by wavdivision
total_number = 0      # NOTE(review): appears unused in this file

wav_names = []        # absolute paths of successfully aligned wavs
variaties = []        # alignment total scores (sign re-attached), parallel to wav_names
txt_names = []        # absolute paths of the matching txt files, parallel to wav_names
def getting_sound_and_subtitles(link, directory_audio, directory_subtitles, directory_results,
                                sphinx_model_path, dictionary_path, dictionary_filler_path,
                                directory_results2):
    """Process one YouTube playlist end to end.

    Downloads Russian subtitles (manual first, auto-generated as a fallback)
    and the audio track of every video in the playlist *link*, slices each
    audio file into per-cue "sound-text" pairs under *directory_results*
    (see wavdivision), then hands the pairs to sphinx3 forced alignment
    (see preparations), whose output goes to *directory_results2*.

    Directory arguments are expected to end with '/'; the sphinx arguments
    are paths to the acoustic model, dictionary and filler dictionary.
    """
    # FIX: the original declared the non-existent global 'subtitle_file';
    # the module-level list is named 'subtitles_file'.
    global subtitles, wavs, subtitles_file, startpoints, finishpoints, counter

    # SECURITY NOTE(review): 'link' comes from a user-supplied text file and
    # is interpolated unquoted into a shell pipeline — a crafted line in
    # playlists.txt could inject shell commands.
    list_of_videos = ("youtube-dl -j --flat-playlist " + link +
                      " | jq -r '.id' | sed 's_^_https://youtu.be/_' >" +
                      directory_results + "videos.txt")
    os.popen(list_of_videos).read()  # .read() blocks until the pipeline finishes

    with open(directory_results + 'videos.txt', 'r') as videos_in_playlist:
        lots_of_videos_demo = videos_in_playlist.readlines()

    for video in lots_of_videos_demo:
        # A subtitle download "succeeded" when exactly one new file appeared;
        # audio is fetched only for videos with Russian subtitles.
        amount_subs = len(os.listdir(directory_subtitles))
        pre_sub = ("youtube-dl -i --skip-download --write-sub --sub-lang ru -o '" +
                   directory_subtitles + "%(title)s.%(ext)s'" + " " + video)
        os.popen(pre_sub).read()
        if amount_subs + 1 == len(os.listdir(directory_subtitles)):
            pre_audio = ("youtube-dl -i --extract-audio --audio-format wav -o '" +
                         directory_audio + "%(title)s.%(ext)s'" + " " + video)
            os.popen(pre_audio).read()
        else:
            # No manual subtitles: fall back to auto-generated ones.
            pre_sub = ("youtube-dl -i --skip-download --write-auto-sub --sub-lang ru -o '" +
                       directory_subtitles + "%(title)s.%(ext)s'" + " " + video)
            os.popen(pre_sub).read()
            if amount_subs + 1 == len(os.listdir(directory_subtitles)):
                pre_audio = ("youtube-dl -i --extract-audio --audio-format wav -o '" +
                             directory_audio + "%(title)s.%(ext)s'" + " " + video)
                os.popen(pre_audio).read()

    # Pair audio and subtitle files positionally via sorted order.
    subtitles.extend(os.listdir(directory_subtitles))
    wavs.extend(os.listdir(directory_audio))
    subtitles.sort()
    wavs.sort()
    counter_limit = len(wavs)

    for wav in wavs:
        wavdivision(wav, directory_audio, directory_results, directory_subtitles, counter_limit)
        # Reset the per-file cue state for the next wav.
        subtitles_file.clear()
        startpoints.clear()
        finishpoints.clear()

    # Drop shared state and delete the raw downloads — only the sliced
    # pairs under directory_results are kept.
    wavs.clear()
    subtitles.clear()
    counter = 1
    for w in os.listdir(directory_audio):
        os.remove(os.path.join(directory_audio, w))
    for s in os.listdir(directory_subtitles):
        os.remove(os.path.join(directory_subtitles, s))
    os.remove(directory_results + 'videos.txt')

    # Collect absolute paths of all sliced wav/txt pairs (one sub-folder
    # per source video) for the alignment step.
    os.chdir(directory_results)
    wavs = []  # rebinds the module-level list to result-file paths, as before
    txts = []
    for folder in os.listdir(directory_results):
        path = os.path.abspath(folder)
        os.chdir(path)
        for entry in sorted(os.listdir(path)):
            if 'wav' in entry:
                wavs.append(os.path.abspath(entry))
            if 'txt' in entry:
                txts.append(os.path.abspath(entry))
        os.chdir(directory_results)

    preparations(sphinx_model_path, wavs, txts, dictionary_path,
                 dictionary_filler_path, directory_results2)
def subtitlesdivision(file, directory_subtitles):
    """Parse one WebVTT subtitle file into the shared module-level lists.

    For every cue, appends its normalized text (lowercase Cyrillic words and
    numbers, space-joined) to ``subtitles_file`` and its start/end offsets in
    seconds to ``startpoints``/``finishpoints``.
    """
    global subtitles_file, startpoints, finishpoints

    # FIX: '.' was an unescaped metacharacter in the original pattern.
    timestamp = re.compile(r'(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})')
    word = re.compile(r'([А-я]\w+|[а-я]|[0-9]\d+)')

    def to_seconds(stamp):
        # 'HH:MM:SS.mmm' -> offset in seconds as a float.
        h, m, s = stamp.split(':')
        return int(h) * 3600 + int(m) * 60 + float(s)

    with open(directory_subtitles + file, 'r') as subtitles2:
        lines = subtitles2.readlines()

    time_moments = []
    # FIX: the original located each timestamp line with list.index(), which
    # returns the FIRST occurrence and mis-associates duplicated lines, and
    # indexed two lines past the timestamp without a bounds check (IndexError
    # when a cue sits at the end of the file).
    for idx, line in enumerate(lines):
        piece_of_time = timestamp.findall(line)
        if not piece_of_time or idx + 1 >= len(lines):
            continue
        first = lines[idx + 1]
        second = lines[idx + 2] if idx + 2 < len(lines) else ''
        # FIX: the original compared str lines against [] (always unequal),
        # so its one-line branch was unreachable; a blank line now ends a cue.
        if second.strip():
            text = (first + second).lower()
        else:
            text = first.lower()
        subtitles_file.append(' '.join(word.findall(text)))
        time_moments.append(piece_of_time)

    for moment in time_moments:
        for start, finish in moment:
            startpoints.append(to_seconds(start))
            finishpoints.append(to_seconds(finish))
def wavdivision(sound, directory_audio, directory_results, directory_subtitles, counter_limit):
    """Slice one downloaded wav into per-cue clips with matching txt files.

    Consumes the first pending subtitle file (filling the shared cue lists
    via subtitlesdivision), loads the audio, and writes one '<n>.wav' plus
    '<n>.txt' pair per cue into a new sub-folder of *directory_results*
    named after the source file; <n> is the global filenamecounter.
    """
    global subtitles, filenamecounter, subtitles_file, startpoints, finishpoints, counter

    # Consume the next subtitle file; 'subtitles' and 'wavs' are kept in
    # lockstep by the caller's sorting.
    if subtitles:
        subtitlesdivision(subtitles.pop(0), directory_subtitles)

    y, sr = librosa.load(directory_audio + sound, mono=True)

    def finalmoment(start, finish, filenamecounter):
        # FIX: the original sliced y[int(start)*sr:int(finish)*sr], which
        # truncates cue times to whole seconds BEFORE scaling by the sample
        # rate; scale first, then truncate, to keep sub-second precision.
        clip = y[int(start * sr):int(finish * sr)]
        os.chdir(directory_results + new_folder)
        librosa.output.write_wav(str(filenamecounter) + '.wav', clip, sr)
        # Pop the cue text that belongs to this clip.
        if subtitles_file:
            with open(str(filenamecounter) + '.txt', 'w') as gh:
                gh.write(subtitles_file.pop(0))

    os.chdir(directory_results)
    new_folder = str(sound)
    os.mkdir(new_folder)
    os.chdir(directory_results + new_folder)

    for moment1, moment2 in zip(startpoints, finishpoints):
        finalmoment(moment1, moment2, filenamecounter)
        filenamecounter += 1
    print(counter, ' from ', counter_limit)
    counter += 1
202 | +# sphinxforcealigner | ||
203 | + | ||
def preparations(sphinx_model_path,wavs,txts,d_path,d_f_path,directory_results2):
    """Force-align wav/txt pairs with sphinx3 in batches of up to 100 files.

    For each batch this builds the sphinx '.transcription' and '.fileids'
    control files, resamples the wavs to 16 kHz/16-bit/mono with sox,
    extracts MFCC features via sphinx_fe and aligns with sphinx3_align.
    Successfully aligned pairs are copied next to the results and recorded
    in the module-level lists wav_names/txt_names/variaties; raw aligner
    output is appended to results.txt under *directory_results2*.
    """
    global results,wav_names,variaties,txt_names

    # Working directory for all alignment artifacts.
    # NOTE(review): 'dir' shadows the builtin of the same name.
    dir = os.path.join(directory_results2,"f_ali")
    if not os.path.exists(dir):
        os.mkdir(dir)
        os.chdir(dir)
    else:
        os.chdir(dir)

    phlabdir = os.path.join(dir,"phsegdir")

    # slicing files to 100
    while len(wavs) != 0 and len(txts) != 0:

        slice_audios = wavs[0:100]
        slice_annotations = txts[0:100]

        # Fresh per-batch working sub-directories.
        os.mkdir('txt')
        os.mkdir('wav')
        os.mkdir('phsegdir')

        #preparing .transcription

        for t in slice_annotations:
            shutil.copy(t,'txt')

        for t in slice_annotations:
            with codecs.open (t,encoding="utf8",errors='ignore') as annotation:
                text_string = annotation.read()
                real_string = text_string.replace('\n','')
            # One utterance per line: "<s> words </s> (utterance_id)".
            with codecs.open('f_ali.transcription','a',encoding="utf8",errors='ignore') as text_file:
                t = os.path.basename(t)
                # NOTE(review): '.' is an unescaped regex metacharacter here.
                t = re.sub('.txt','',t)
                text_file.write('<s>'+' '+str(real_string)+' '+'</s>'+' '+'('+t+')'+'\n')

        #preparing .fileids

        for t in slice_audios:
            shutil.copy(t,'wav')

        list_w = os.listdir('wav')

        for path in list_w:
            a = 'wav/'+path

            #convertion
            # Resample in place to 16 kHz / 16-bit / mono, as sphinx expects.
            audio_dir = os.path.dirname(a)
            command1 = "sox '"+a+"' -r 16000 -b 16 -c 1 '"+audio_dir+"/temporary_audio_wav.wav'"
            execute = os.popen(command1).read()
            os.rename (audio_dir+'/temporary_audio_wav.wav',a)

        for s in slice_audios:
            with open ('f_ali.fileids','a') as wav_scp_file:
                s = os.path.basename(s)
                # NOTE(review): replaces '.wav' with a SPACE, so each id ends
                # with a trailing blank — confirm sphinx tolerates this.
                s = re.sub('.wav',' ',s)
                s = 'wav/'+s
                wav_scp_file.write(str(s)+'\n')

        #preparing features and doing alignment

        # NOTE(review): each os.popen spawns its own shell, so command1
        # (export) and command2 (cd) have no effect on command3/command4;
        # those work only because this process already chdir'ed into 'dir'.
        command1 = 'export LD_LIBRARY_PATH=/usr/local/lib'
        command2 = 'cd '+dir
        command3 = 'sphinx_fe -argfile '+sphinx_model_path+'/feat.params -samprate 16000 -c f_ali.fileids -di . -do . -ei wav -eo mfc -mswav yes'
        command4 = 'sphinx3_align -hmm '+sphinx_model_path+' -dict '+d_path+' -fdict '+d_f_path+' -ctl '+dir+'/f_ali.fileids -cepdir . -cepext .mfc -insent '+dir+'/f_ali.transcription -outsent '+dir+'/f_ali.out -phsegdir '+dir+'/phsegdir'

        execute1 = os.popen(command1).read()
        execute2 = os.popen(command2).read()
        execute3 = os.popen(command3).read()
        execute4 = os.popen(command4).read()

        # getting results

        result_path = os.path.join(dir,"f_ali.out")
        with codecs.open(result_path,'r',encoding="utf8",errors='ignore') as result_file:
            res = result_file.readlines()
            for text_res in res:
                results.append(text_res)
        os.remove(result_path)

        # Drop the processed batch and continue with the remainder.
        wavs = list(set(wavs) - set(slice_audios))
        txts = list(set(txts) - set(slice_annotations))

        wavs.sort()
        txts.sort()

        #______

        all_wavs_path = os.path.join(dir,'wav')
        all_txts_path = os.path.join(dir,'txt')

        list_results_files = os.listdir(phlabdir)

        os.chdir(phlabdir)

        # Each .phseg file marks a successfully aligned utterance: copy its
        # wav/txt pair next to the results and record its total score.
        for result_file in list_results_files:

            wav_name = re.sub('.phseg','.wav',result_file)
            txt_name = re.sub('.phseg','.txt',result_file)

            wav_name = os.path.join(all_wavs_path,wav_name)
            txt_name = os.path.join(all_txts_path,txt_name)

            shutil.copy(wav_name,dir)
            shutil.copy(txt_name,dir)

            # NOTE(review): wav_name is already absolute, so os.path.join
            # returns it unchanged; re.sub then strips the 'wav/' component
            # to point at the copy made just above.
            new_wav_name = os.path.join(dir,wav_name)
            new_wav_name = re.sub('wav/','',new_wav_name)
            new_txt_name = os.path.join(dir,txt_name)
            new_txt_name = re.sub('txt/','',new_txt_name)

            wav_names.append(new_wav_name)
            txt_names.append(new_txt_name)

            # NOTE(review): this file handle is never closed.
            u = codecs.open (result_file, 'r', encoding = 'utf-8',errors='ignore')
            u = u.readlines()
            for line in u:
                if 'Total score:' in line:
                    # Scores are reported negative; re-attach the sign.
                    variaty = re.findall('\d+',line)
                    variaty = '-'+variaty[0]
                    variaties.append(variaty)

        #removing files and directories for new files

        os.chdir(dir)
        shutil.rmtree('txt/')
        shutil.rmtree('wav/')
        shutil.rmtree('phsegdir/')
        os.remove('f_ali.transcription')
        os.remove('f_ali.fileids')

    # Flush the accumulated raw aligner output for this call.
    total_result = os.path.join(directory_results2,'results.txt')
    with codecs.open(total_result,'a',encoding="utf8",errors='ignore') as result_total:
        for el in results:
            result_total.write(el)
    results.clear()
343 | +# arguments parser | ||
344 | + | ||
def main():
    """Command-line entry point for the 'eccr' console script.

    Parses the required arguments, processes every playlist listed in the
    playlists file, and writes Total_results.csv (wav path, txt path,
    alignment score per aligned pair) into the alignment-results directory.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-p', '--playlist_file', dest='URL_list', type=str,
                        help='playlists txt-file path', required=True)
    parser.add_argument('-a', '--audio_path', dest='directory_audio', type=str,
                        help='path to download audiotracks', required=True)
    parser.add_argument('-s', '--subs_path', dest='directory_subtitles', type=str,
                        help='path to download subtitles', required=True)
    parser.add_argument('-r', '--results_path', dest='directory_results', type=str,
                        help='path for results', required=True)

    parser.add_argument('-am', '--sphinx_model_path', dest='sphinx_model_path', type=str,
                        help='your acoustic model path', required=True)
    parser.add_argument('-dict', '--dictionary_path', dest='dictionary_path', type=str,
                        help='your dictionary path', required=True)
    parser.add_argument('-dict_f', '--dictionary_filler_path', dest='dictionary_filler_path', type=str,
                        help='your dictionary filler path', required=True)
    parser.add_argument('-ar', '--ali_results_path', dest='directory_alignment_results', type=str,
                        help='path for alignment results', required=True)

    args = parser.parse_args()

    # Downstream code concatenates these with file names, hence the '/'.
    directory_audio = os.path.abspath(args.directory_audio) + '/'
    directory_subtitles = os.path.abspath(args.directory_subtitles) + '/'
    directory_results = os.path.abspath(args.directory_results) + '/'
    URL_list = os.path.abspath(args.URL_list)
    sphinx_model_path = os.path.abspath(args.sphinx_model_path)
    dictionary_path = os.path.abspath(args.dictionary_path)
    dictionary_filler_path = os.path.abspath(args.dictionary_filler_path)
    directory_results2 = os.path.abspath(args.directory_alignment_results)

    with open(URL_list, 'r') as playlists_links:
        lots_of_playlists = playlists_links.readlines()
    for link in lots_of_playlists:
        # FIX: strip() also removes '\r' and stray spaces, not just '\n'.
        link = link.strip()
        if link:
            getting_sound_and_subtitles(link, directory_audio, directory_subtitles,
                                        directory_results, sphinx_model_path,
                                        dictionary_path, dictionary_filler_path,
                                        directory_results2)
            wavs.clear()

    # creating total csv

    os.chdir(directory_results2)

    # FIX: renamed 'dict' -> 'summary' to avoid shadowing the builtin.
    summary = {'wav_dir': wav_names, 'txt_dir': txt_names, 'variaty': variaties}
    df = pandas.DataFrame(summary)
    df.to_csv('Total_results.csv', index=False, header=True)
setup.cfg
0 → 100644
setup.py
0 → 100644
from setuptools import setup, find_packages, Extension
from os.path import join, dirname
# NOTE(review): Extension, join and dirname are imported but never used.

# Packaging metadata for the 'etaloncorpuscreator' command-line tool.
# Installing this package provides a single console script, 'eccr',
# which dispatches to etaloncorpuscreator.corpus_creator:main.
setup(
    name='etaloncorpuscreator',
    version='0.1',
    description='command-line package for automatical creation of russian language audio corpus from YouTube audiotracks and subtitles with using forced alignment by sphinx3',
    url='https://github.com/dangrebenkin/audiocorpusbuilder',
    author='Daniel Grebenkin',
    author_email = 'd.grebenkin@g.nsu.ru',
    license='Apache License Version 2.0',
    keywords=['dataset', 'librosa', 'youtube-dl', 'youtube', 'forced alignment', 'sphinx','sphinx3'],
    packages = find_packages(),
    platforms = 'Linux',
    entry_points ={
        'console_scripts': [
            'eccr = etaloncorpuscreator.corpus_creator:main'
        ]
    },
    # librosa is pinned to 0.7.0 because the code uses
    # librosa.output.write_wav, which was removed in librosa 0.8.
    install_requires=[
        'pandas >= 1.1.1',
        'audioread >= 2.0.0',
        'numpy >= 1.15.0',
        'packaging >= 18',
        'scipy >= 1.0.0',
        'scikit-learn >= 0.14.0, != 0.19.0',
        'joblib >= 0.14',
        'decorator >= 3.0.0',
        'resampy >= 0.2.2',
        'numba == 0.48',
        'soundfile >= 0.9.0',
        'pooch >= 1.0',
        'librosa==0.7.0',
        'youtube-dl>=2020.1.1'
    ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'Topic :: Software Development',
        'Topic :: Scientific/Engineering',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8']
)
Please
register
or
login
to post a comment