Showing
13 changed files
with
582 additions
and
0 deletions
PKG-INFO
0 → 100644
1 | +Metadata-Version: 1.1 | |
2 | +Name: etaloncorpuscreator | |
3 | +Version: 0.1 | |
4 | +Summary: command-line package for automatical creation of russian language audio corpus from YouTube audiotracks and subtitles with using forced alignment by sphinx3 | |
5 | +Home-page: https://github.com/dangrebenkin/audiocorpusbuilder | |
6 | +Author: Daniel Grebenkin | |
7 | +Author-email: d.grebenkin@g.nsu.ru | |
8 | +License: Apache License Version 2.0 | |
9 | +Description: UNKNOWN | |
10 | +Keywords: dataset,librosa,youtube-dl,youtube,forced alignment,sphinx,sphinx3 | |
11 | +Platform: Linux | |
12 | +Classifier: Development Status :: 3 - Alpha | |
13 | +Classifier: Intended Audience :: Science/Research | |
14 | +Classifier: Intended Audience :: Developers | |
15 | +Classifier: Topic :: Software Development | |
16 | +Classifier: Topic :: Scientific/Engineering | |
17 | +Classifier: License :: OSI Approved :: Apache Software License | |
18 | +Classifier: Programming Language :: Python :: 3.6 | |
19 | +Classifier: Programming Language :: Python :: 3.7 | |
20 | +Classifier: Programming Language :: Python :: 3.8 | ... | ... |
README.md
0 → 100644
1 | +# About | |
2 | + | |
3 | +Etaloncorpuscreator-package was made to automatically create a Russian-language audio corpus from YouTube video playlists: it downloads each video's audio and subtitles, makes "sound-text" pairs, performs forced alignment, and saves the new corpus together with the alignment scores. | |
4 | + | |
5 | +# Installing | |
6 | + | |
7 | +For installation you need Python 3.6 or later, Linux OS, and sphinx3 installed on your local machine. | |
8 | + | |
9 | +# Start | |
10 | + | |
11 | +To run etaloncorpuscreator you should prepare directories for audiotracks, subtitles and results. You also need to create playlists.txt with the playlists' links, each link on a separate line. | |
12 | + | |
13 | +# Arguments | |
14 | + | |
15 | +All arguments are required for program use. | |
16 | + | |
17 | +1. -p URL_list | |
18 | + | |
19 | +Playlists txt-file path. | |
20 | + | |
21 | +2. -a directory_audio | |
22 | + | |
23 | +Path to download audiotracks. | |
24 | + | |
25 | +3. -s directory_subtitles | |
26 | + | |
27 | +Path to download subtitles. | |
28 | + | |
29 | +4. -r directory_results | |
30 | + | |
31 | +Path for audio results. | |
32 | + | |
33 | +5. -am sphinx_model_path | |
34 | + | |
35 | +Your acoustic model path. | |
36 | + | |
37 | +6. -dict dictionary_path | |
38 | + | |
39 | +Your dictionary path. | |
40 | + | |
41 | +7. -dict_f dictionary_filler_path | |
42 | + | |
43 | +Your dictionary filler path. | |
44 | + | |
45 | +8. -ar directory_alignment_results | |
46 | + | |
47 | +Path for alignment results. | |
48 | + | |
49 | +# Usage | |
50 | + | |
51 | +eccr [-p URL_list] [-a directory_audio] [-s directory_subtitles] [-r directory_results] [-am sphinx_model_path] [-dict dictionary_path] [-dict_f dictionary_filler_path] [-ar directory_alignment_results] | |
52 | + | |
53 | +# Example | |
54 | + | |
55 | +eccr -p playlists.txt -a Audio -s Subs -r Results -am ./voxforge_ru_sphinx/model_parameters/voxforge_ru.cd_cont_200 -dict ./voxforge_ru_sphinx/voxforge_ru.dic -dict_f ./voxforge_ru_sphinx/voxforge_ru.filler -ar Alignment | ... | ... |
etaloncorpuscreator.egg-info/PKG-INFO
0 → 100644
1 | +Metadata-Version: 1.1 | |
2 | +Name: etaloncorpuscreator | |
3 | +Version: 0.1 | |
4 | +Summary: command-line package for automatical creation of russian language audio corpus from YouTube audiotracks and subtitles with using forced alignment by sphinx3 | |
5 | +Home-page: https://github.com/dangrebenkin/audiocorpusbuilder | |
6 | +Author: Daniel Grebenkin | |
7 | +Author-email: d.grebenkin@g.nsu.ru | |
8 | +License: Apache License Version 2.0 | |
9 | +Description: UNKNOWN | |
10 | +Keywords: dataset,librosa,youtube-dl,youtube,forced alignment,sphinx,sphinx3 | |
11 | +Platform: Linux | |
12 | +Classifier: Development Status :: 3 - Alpha | |
13 | +Classifier: Intended Audience :: Science/Research | |
14 | +Classifier: Intended Audience :: Developers | |
15 | +Classifier: Topic :: Software Development | |
16 | +Classifier: Topic :: Scientific/Engineering | |
17 | +Classifier: License :: OSI Approved :: Apache Software License | |
18 | +Classifier: Programming Language :: Python :: 3.6 | |
19 | +Classifier: Programming Language :: Python :: 3.7 | |
20 | +Classifier: Programming Language :: Python :: 3.8 | ... | ... |
etaloncorpuscreator.egg-info/SOURCES.txt
0 → 100644
1 | +README.md | |
2 | +setup.cfg | |
3 | +setup.py | |
4 | +etaloncorpuscreator/__init__.py | |
5 | +etaloncorpuscreator/__main__.py | |
6 | +etaloncorpuscreator/corpus_creator.py | |
7 | +etaloncorpuscreator.egg-info/PKG-INFO | |
8 | +etaloncorpuscreator.egg-info/SOURCES.txt | |
9 | +etaloncorpuscreator.egg-info/dependency_links.txt | |
10 | +etaloncorpuscreator.egg-info/entry_points.txt | |
11 | +etaloncorpuscreator.egg-info/requires.txt | |
12 | +etaloncorpuscreator.egg-info/top_level.txt | |
\ No newline at end of file | ... | ... |
1 | + | ... | ... |
etaloncorpuscreator.egg-info/requires.txt
0 → 100644
etaloncorpuscreator.egg-info/top_level.txt
0 → 100644
1 | +etaloncorpuscreator | ... | ... |
etaloncorpuscreator/__init__.py
0 → 100644
etaloncorpuscreator/__main__.py
0 → 100644
etaloncorpuscreator/corpus_creator.py
0 → 100644
1 | +import os | |
2 | +import re | |
3 | +import shutil | |
4 | +import codecs | |
5 | +import pandas | |
6 | +import librosa | |
7 | +import argparse | |
8 | +import subprocess | |
9 | +from datetime import datetime | |
10 | + | |
11 | +# audiocorpusbuilder | |
12 | + | |
# Shared mutable state for one run of the pipeline (reset between playlists
# by the functions below rather than passed around explicitly).
results = []           # raw text lines collected from sphinx3's f_ali.out
subtitles = []         # subtitle filenames downloaded for the current playlist
wavs = []              # audio filenames (later: absolute fragment paths)
subtitles_file = []    # cue texts parsed from the current subtitle file
startpoints =[]        # cue start times, in seconds, parallel to subtitles_file
finishpoints =[]       # cue end times, in seconds, parallel to subtitles_file
filenamecounter = 1    # global running index used to name fragment files
counter = 1            # per-playlist progress counter printed to stdout
total_number = 0       # NOTE(review): never used anywhere in this file

wav_names = []         # aligned fragment wav paths for the final CSV
variaties = []         # alignment scores ("Total score") for the final CSV
txt_names = []         # aligned fragment txt paths for the final CSV
26 | + | |
def getting_sound_and_subtitles(link, directory_audio, directory_subtitles, directory_results,
                                sphinx_model_path, dictionary_path, dictionary_filler_path,
                                directory_results2):
    """Process one YouTube playlist end to end.

    Downloads the audio track and Russian subtitles for every video in the
    playlist ``link`` (manual subtitles preferred, auto-generated ones as a
    fallback), slices each track into cue-sized fragments via
    :func:`wavdivision`, and finally hands all produced wav/txt pairs to
    :func:`preparations` for forced alignment.

    Side effects: creates and deletes files under all four directories, calls
    ``os.chdir``, and mutates the module-level lists/counters.
    """
    # NOTE(review): ``link`` is interpolated into a shell pipeline below, so a
    # malicious playlists.txt entry could inject shell commands. The file is
    # operator-supplied, but sanitizing the URL would still be safer.
    #
    # Bug fix: the original statement declared the misspelled name
    # ``subtitle_file``; the module-level list is ``subtitles_file``.
    global subtitles, wavs, subtitles_file, startpoints, finishpoints, counter

    # Resolve the playlist into one "https://youtu.be/<id>" URL per line.
    list_of_videos = ("youtube-dl -j --flat-playlist " + link +
                      " | jq -r '.id' | sed 's_^_https://youtu.be/_' >" +
                      directory_results + "videos.txt")
    os.popen(list_of_videos).read()

    with open(directory_results + 'videos.txt', 'r') as videos_in_playlist:
        for video in videos_in_playlist.readlines():
            # youtube-dl -i ignores errors, so success is detected by counting
            # files in the subtitles directory before and after the call; the
            # audio is downloaded only when a subtitle file actually appeared.
            amount_subs = len(os.listdir(directory_subtitles))
            pre_sub = ("youtube-dl -i --skip-download --write-sub --sub-lang ru -o '" +
                       directory_subtitles + "%(title)s.%(ext)s'" + " " + video)
            os.popen(pre_sub).read()
            new_amount_subs = len(os.listdir(directory_subtitles))
            if amount_subs + 1 == new_amount_subs:
                pre_audio = ("youtube-dl -i --extract-audio --audio-format wav -o '" +
                             directory_audio + "%(title)s.%(ext)s'" + " " + video)
                os.popen(pre_audio).read()
            else:
                # No manual subtitles: retry with auto-generated captions.
                pre_sub = ("youtube-dl -i --skip-download --write-auto-sub --sub-lang ru -o '" +
                           directory_subtitles + "%(title)s.%(ext)s'" + " " + video)
                os.popen(pre_sub).read()
                another_new_amount_subs = len(os.listdir(directory_subtitles))
                if amount_subs + 1 == another_new_amount_subs:
                    pre_audio = ("youtube-dl -i --extract-audio --audio-format wav -o '" +
                                 directory_audio + "%(title)s.%(ext)s'" + " " + video)
                    os.popen(pre_audio).read()
                # else: the video has no Russian subtitles at all -- skip it.

    # Pair subtitle and audio files by sorted name order.
    subtitles.extend(os.listdir(directory_subtitles))
    wavs.extend(os.listdir(directory_audio))
    subtitles.sort()
    wavs.sort()
    counter_limit = len(wavs)

    for wav in wavs:
        wavdivision(wav, directory_audio, directory_results, directory_subtitles, counter_limit)
        # Reset the per-track state shared with subtitlesdivision/wavdivision.
        subtitles_file.clear()
        startpoints.clear()
        finishpoints.clear()

    # Remove the raw downloads; the fragments now live under directory_results.
    wavs.clear()
    subtitles.clear()
    counter = 1
    for used_wav in os.listdir(directory_audio):
        os.remove(os.path.join(directory_audio, used_wav))
    for used_sub in os.listdir(directory_subtitles):
        os.remove(os.path.join(directory_subtitles, used_sub))
    os.remove(directory_results + 'videos.txt')

    # Collect absolute paths of every produced fragment (one sub-folder per
    # source track) for the alignment stage.
    os.chdir(directory_results)
    result_folders = os.listdir(directory_results)

    wavs = []  # NOTE: rebinds the module-level list (declared global above)
    txts = []

    for folder in result_folders:
        path = os.path.abspath(folder)
        os.chdir(path)
        files_in_folder = os.listdir(path)
        files_in_folder.sort()
        for entry in files_in_folder:
            if 'wav' in entry:
                wavs.append(os.path.abspath(entry))
            if 'txt' in entry:
                txts.append(os.path.abspath(entry))

    os.chdir(directory_results)

    preparations(sphinx_model_path, wavs, txts, dictionary_path, dictionary_filler_path,
                 directory_results2)
108 | + | |
109 | + | |
def subtitlesdivision(file, directory_subtitles):
    """Parse one subtitle (.vtt) file into cue texts and cue boundaries.

    Appends the lower-cased, word-only text of every cue to the module-level
    ``subtitles_file`` list, and the cue start/end times (in seconds) to the
    parallel module-level lists ``startpoints``/``finishpoints``.

    Bug fixes vs. the original:
      * iterate with enumerate() instead of ``list.index()``, which returned
        the FIRST occurrence and mis-indexed files containing duplicate lines;
      * bounds-check the look-ahead so a timestamp line near EOF cannot raise
        IndexError;
      * the original compared a *string* line to ``[]`` (always unequal, so the
        single-line branch was dead); test for a genuinely blank second text
        line instead -- the extracted words are identical either way because a
        blank line contributes no words;
      * escape the '.' between seconds and milliseconds in the timestamp
        pattern (accepting ',' too, for SRT-style stamps).
    """
    global subtitles_file, startpoints, finishpoints

    # "HH:MM:SS.mmm --> HH:MM:SS.mmm" cue header.
    timestamp_re = re.compile(r'(\d{2}:\d{2}:\d{2}[.,]\d{3}) --> (\d{2}:\d{2}:\d{2}[.,]\d{3})')
    component_re = re.compile(r'(\d{2}):(\d{2}):(\d{2})[.,](\d{3})')
    # Russian words, single letters, or numbers (same pattern as the original).
    word_re = re.compile(r'([А-я]\w+|[а-я]|[0-9]\d+)')

    def _to_seconds(parts):
        # Arithmetically identical to the original datetime-difference code:
        # timedelta.total_seconds() divides whole microseconds by 10**6.
        h, m, s, ms = (int(p) for p in parts)
        return ((h * 3600 + m * 60 + s) * 10**6 + ms * 1000) / 10**6

    with open(directory_subtitles + file, 'r') as subtitle_stream:
        lines = subtitle_stream.readlines()

    time_moments = []
    for idx, line in enumerate(lines):
        piece_of_time = timestamp_re.findall(line)
        if not piece_of_time:
            continue
        if idx + 1 >= len(lines):
            continue  # timestamp with no cue text at end of file -- ignore
        first_text = lines[idx + 1]
        second_text = lines[idx + 2] if idx + 2 < len(lines) else ''
        # A cue is one or two text lines; join both when the second is present.
        cue_text = first_text + second_text if second_text.strip() else first_text
        words = word_re.findall(cue_text.lower())
        subtitles_file.append(' '.join(words))
        time_moments.append(piece_of_time)

    # Convert every recorded "start --> finish" pair to seconds.
    for moment in time_moments:
        for start_str, finish_str in moment:
            for parts in component_re.findall(start_str):
                startpoints.append(_to_seconds(parts))
            for parts in component_re.findall(finish_str):
                finishpoints.append(_to_seconds(parts))
168 | + | |
def wavdivision(sound,directory_audio,directory_results,directory_subtitles,counter_limit):
    """Cut one audio track into per-cue wav/txt fragment pairs.

    Consumes the next subtitle file from the module-level ``subtitles`` queue,
    parses it via :func:`subtitlesdivision`, then writes one numbered .wav and
    .txt per cue into a new folder named after ``sound`` under
    ``directory_results``. Mutates the module-level fragment counters.
    """

    global subtitles,filenamecounter,subtitles_file,startpoints,finishpoints,counter

    # Pop the first pending subtitle file (the remove+break idiom takes
    # exactly one element); it is assumed to pair with ``sound`` because both
    # lists were sorted by the caller.
    for textfile in subtitles:
        subtitlesdivision(textfile,directory_subtitles)
        subtitles.remove(textfile)
        break
    # Decode the whole track as mono at librosa's default sample rate.
    y, sr = librosa.load(directory_audio+sound,mono=True)

    def finalmoment(start,finish,filenamecounter):
        # NOTE(review): int(start)*sr truncates the cue boundary to whole
        # seconds before scaling -- int(start*sr) would be sample-accurate;
        # confirm whether the coarser cut is intentional.
        j = y[int(start)*sr:int(finish)*sr]
        # ``new_folder`` is a closure over the variable assigned below, before
        # this helper is first called.
        os.chdir(directory_results+new_folder)
        librosa.output.write_wav(str(filenamecounter)+'.wav', j, sr)
        # Pop the first pending cue text and save it next to the wav fragment.
        for subtitletext in subtitles_file:
            new_file_name_for_text = str(filenamecounter)+'.txt'
            with open(new_file_name_for_text, 'w') as gh:
                gh.write(subtitletext)
            subtitles_file.remove(subtitletext)
            break

    # One output folder per source track, named after the wav file.
    os.chdir(directory_results)
    new_folder = str(sound)
    os.mkdir(new_folder)
    os.chdir(directory_results+new_folder)

    # Emit one fragment per (start, finish) cue pair.
    for moment1,moment2 in zip(startpoints,finishpoints):
        finalmoment(moment1, moment2,filenamecounter)
        filenamecounter += 1
    # Progress report: tracks processed out of the playlist total.
    print (counter,' from ',counter_limit)
    counter+=1
200 | + | |
201 | + | |
202 | +# sphinxforcealigner | |
203 | + | |
def preparations(sphinx_model_path, wavs, txts, d_path, d_f_path, directory_results2):
    """Force-align wav/txt fragment pairs with sphinx3, in batches of 100.

    For each batch this builds the ``.fileids``/``.transcription`` control
    files, resamples the audio to 16 kHz / 16 bit mono with sox, extracts MFCC
    features with ``sphinx_fe`` and runs ``sphinx3_align``. Successfully
    aligned pairs (those with a ``.phseg`` output) are copied into the working
    directory, their scores appended to the module-level ``variaties`` list
    and the textual alignment output appended to ``results.txt``.
    """
    global results, wav_names, variaties, txt_names

    # Working directory for all alignment artefacts. (The original bound this
    # to the name ``dir``, shadowing the builtin.)
    work_dir = os.path.join(directory_results2, "f_ali")
    if not os.path.exists(work_dir):
        os.mkdir(work_dir)
    os.chdir(work_dir)

    phlabdir = os.path.join(work_dir, "phsegdir")

    # sphinx3_align is fed at most 100 utterances at a time.
    while len(wavs) != 0 and len(txts) != 0:

        slice_audios = wavs[0:100]
        slice_annotations = txts[0:100]

        os.mkdir('txt')
        os.mkdir('wav')
        os.mkdir('phsegdir')

        # --- build f_ali.transcription: "<s> words </s> (utt_id)" per line
        for t in slice_annotations:
            shutil.copy(t, 'txt')

        for t in slice_annotations:
            with codecs.open(t, encoding="utf8", errors='ignore') as annotation:
                text_string = annotation.read()
                real_string = text_string.replace('\n', '')
                with codecs.open('f_ali.transcription', 'a', encoding="utf8",
                                 errors='ignore') as text_file:
                    t = os.path.basename(t)
                    t = re.sub(r'\.txt', '', t)  # bug fix: '.' was unescaped
                    text_file.write('<s>' + ' ' + str(real_string) + ' ' + '</s>' +
                                    ' ' + '(' + t + ')' + '\n')

        # --- build f_ali.fileids and convert audio to 16 kHz / 16 bit mono
        for t in slice_audios:
            shutil.copy(t, 'wav')

        for path in os.listdir('wav'):
            a = 'wav/' + path
            audio_dir = os.path.dirname(a)
            sox_cmd = ("sox '" + a + "' -r 16000 -b 16 -c 1 '" +
                       audio_dir + "/temporary_audio_wav.wav'")
            os.popen(sox_cmd).read()
            os.rename(audio_dir + '/temporary_audio_wav.wav', a)

        for s in slice_audios:
            with open('f_ali.fileids', 'a') as wav_scp_file:
                s = os.path.basename(s)
                # NOTE(review): the extension is replaced by a SPACE, so every
                # fileids entry carries a trailing blank. Kept as-is because
                # the sphinx tools evidently tolerated it -- verify before
                # "fixing" to an empty replacement.
                s = re.sub(r'\.wav', ' ', s)
                s = 'wav/' + s
                wav_scp_file.write(str(s) + '\n')

        # --- features + alignment.
        # Bug fix: the original ran "export LD_LIBRARY_PATH=..." and "cd ..."
        # in SEPARATE os.popen() subshells, so neither affected the shell that
        # later executed sphinx_fe / sphinx3_align. Chaining everything into a
        # single shell invocation makes the environment variable actually
        # visible to the binaries. ';' (not '&&') preserves the original
        # unconditional sequencing.
        pipeline = (
            'export LD_LIBRARY_PATH=/usr/local/lib ; '
            'cd ' + work_dir + ' ; '
            'sphinx_fe -argfile ' + sphinx_model_path + '/feat.params -samprate 16000'
            ' -c f_ali.fileids -di . -do . -ei wav -eo mfc -mswav yes ; '
            'sphinx3_align -hmm ' + sphinx_model_path + ' -dict ' + d_path +
            ' -fdict ' + d_f_path + ' -ctl ' + work_dir + '/f_ali.fileids'
            ' -cepdir . -cepext .mfc -insent ' + work_dir + '/f_ali.transcription'
            ' -outsent ' + work_dir + '/f_ali.out -phsegdir ' + work_dir + '/phsegdir'
        )
        os.popen(pipeline).read()

        # --- collect the textual alignment output
        result_path = os.path.join(work_dir, "f_ali.out")
        with codecs.open(result_path, 'r', encoding="utf8", errors='ignore') as result_file:
            for text_res in result_file.readlines():
                results.append(text_res)
        os.remove(result_path)

        # Drop the processed slice; re-sort to keep deterministic pairing.
        wavs = list(set(wavs) - set(slice_audios))
        txts = list(set(txts) - set(slice_annotations))
        wavs.sort()
        txts.sort()

        # --- keep only successfully aligned pairs (a .phseg file exists)
        all_wavs_path = os.path.join(work_dir, 'wav')
        all_txts_path = os.path.join(work_dir, 'txt')

        list_results_files = os.listdir(phlabdir)
        os.chdir(phlabdir)

        for result_file in list_results_files:
            wav_name = re.sub(r'\.phseg', '.wav', result_file)
            txt_name = re.sub(r'\.phseg', '.txt', result_file)

            wav_name = os.path.join(all_wavs_path, wav_name)
            txt_name = os.path.join(all_txts_path, txt_name)

            shutil.copy(wav_name, work_dir)
            shutil.copy(txt_name, work_dir)

            # Record the path of the copy inside work_dir (stripping the
            # intermediate wav/ or txt/ path component).
            new_wav_name = re.sub('wav/', '', os.path.join(work_dir, wav_name))
            new_txt_name = re.sub('txt/', '', os.path.join(work_dir, txt_name))

            wav_names.append(new_wav_name)
            txt_names.append(new_txt_name)

            # The phseg "Total score" line carries the (negative) alignment
            # score; store it as the fragment's quality measure.
            # Bug fix: the original codecs.open() handle was never closed.
            with codecs.open(result_file, 'r', encoding='utf-8', errors='ignore') as phseg:
                for line in phseg.readlines():
                    if 'Total score:' in line:
                        variaty = re.findall(r'\d+', line)
                        variaties.append('-' + variaty[0])

        # --- reset the working directory for the next slice of 100
        os.chdir(work_dir)
        shutil.rmtree('txt/')
        shutil.rmtree('wav/')
        shutil.rmtree('phsegdir/')
        os.remove('f_ali.transcription')
        os.remove('f_ali.fileids')

    # Append everything collected so far to the run-wide results file.
    total_result = os.path.join(directory_results2, 'results.txt')
    with codecs.open(total_result, 'a', encoding="utf8", errors='ignore') as result_total:
        for el in results:
            result_total.write(el)
        results.clear()
342 | + | |
343 | +# arguments parser | |
344 | + | |
def main():
    """Command-line entry point (installed as the ``eccr`` console script).

    Parses the eight required path arguments, runs the full
    download / slice / force-align pipeline for every playlist listed in the
    playlists file, and finally writes ``Total_results.csv`` with one row per
    aligned fragment (wav path, txt path, alignment score).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-p', '--playlist_file', dest='URL_list', type=str,
                        help='playlists txt-file path', required=True)
    parser.add_argument('-a', '--audio_path', dest='directory_audio', type=str,
                        help='path to download audiotracks', required=True)
    parser.add_argument('-s', '--subs_path', dest='directory_subtitles', type=str,
                        help='path to download subtitles', required=True)
    parser.add_argument('-r', '--results_path', dest='directory_results', type=str,
                        help='path for results', required=True)

    parser.add_argument('-am', '--sphinx_model_path', dest='sphinx_model_path', type=str,
                        help='your acoustic model path', required=True)
    parser.add_argument('-dict', '--dictionary_path', dest='dictionary_path', type=str,
                        help='your dictionary path', required=True)
    parser.add_argument('-dict_f', '--dictionary_filler_path', dest='dictionary_filler_path', type=str,
                        help='your dictionary filler path', required=True)
    parser.add_argument('-ar', '--ali_results_path', dest='directory_alignment_results', type=str,
                        help='path for alignment results', required=True)

    args = parser.parse_args()

    # Normalise all user-supplied paths; the first three keep a trailing '/'
    # because the pipeline concatenates filenames onto them directly.
    directory_audio = os.path.abspath(args.directory_audio) + '/'
    directory_subtitles = os.path.abspath(args.directory_subtitles) + '/'
    directory_results = os.path.abspath(args.directory_results) + '/'
    URL_list = os.path.abspath(args.URL_list)
    sphinx_model_path = os.path.abspath(args.sphinx_model_path)
    dictionary_path = os.path.abspath(args.dictionary_path)
    dictionary_filler_path = os.path.abspath(args.dictionary_filler_path)
    directory_results2 = os.path.abspath(args.directory_alignment_results)

    # One playlist URL per line; blank lines are ignored.
    with open(URL_list, 'r') as playlists_links:
        for link in playlists_links.readlines():
            link = re.sub("\n", '', link)
            if link == '':
                continue
            getting_sound_and_subtitles(link, directory_audio, directory_subtitles,
                                        directory_results, sphinx_model_path,
                                        dictionary_path, dictionary_filler_path,
                                        directory_results2)
            wavs.clear()

    # Summarise every aligned fragment in one CSV.
    # Bug fix: the original bound this mapping to the name ``dict``,
    # shadowing the builtin.
    os.chdir(directory_results2)
    summary = {'wav_dir': wav_names, 'txt_dir': txt_names, 'variaty': variaties}
    df = pandas.DataFrame(summary)
    df.to_csv('Total_results.csv', index=False, header=True)
397 | + | |
398 | + | ... | ... |
setup.cfg
0 → 100644
setup.py
0 → 100644
from setuptools import setup, find_packages

# Packaging metadata for the "etaloncorpuscreator" command-line tool.
# Installs a single console script, ``eccr``, which drives the whole
# download / slice / force-align pipeline in etaloncorpuscreator/corpus_creator.py.
# Bug fix: the original also imported Extension, join and dirname, none of
# which were used anywhere in this file -- removed.
setup(
    name='etaloncorpuscreator',
    version='0.1',
    description='command-line package for automatical creation of russian language audio corpus from YouTube audiotracks and subtitles with using forced alignment by sphinx3',
    url='https://github.com/dangrebenkin/audiocorpusbuilder',
    author='Daniel Grebenkin',
    author_email='d.grebenkin@g.nsu.ru',
    license='Apache License Version 2.0',
    keywords=['dataset', 'librosa', 'youtube-dl', 'youtube', 'forced alignment', 'sphinx', 'sphinx3'],
    packages=find_packages(),
    platforms='Linux',
    entry_points={
        'console_scripts': [
            'eccr = etaloncorpuscreator.corpus_creator:main'
        ]
    },
    install_requires=[
        'pandas >= 1.1.1',
        'audioread >= 2.0.0',
        'numpy >= 1.15.0',
        'packaging >= 18',
        'scipy >= 1.0.0',
        'scikit-learn >= 0.14.0, != 0.19.0',
        'joblib >= 0.14',
        'decorator >= 3.0.0',
        'resampy >= 0.2.2',
        # librosa 0.7.0 still ships librosa.output.write_wav (removed in 0.8),
        # which corpus_creator.py relies on -- hence the exact pins.
        'numba == 0.48',
        'soundfile >= 0.9.0',
        'pooch >= 1.0',
        'librosa==0.7.0',
        'youtube-dl>=2020.1.1'
    ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'Topic :: Software Development',
        'Topic :: Scientific/Engineering',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8']
)
47 | + | ... | ... |
Please
register
or
login
to post a comment