uploading files

Daniil Grebenkin
Showing 13 changed files with 582 additions and 0 deletions
PKG-INFO
README.md
etaloncorpuscreator.egg-info/PKG-INFO
etaloncorpuscreator.egg-info/SOURCES.txt
etaloncorpuscreator.egg-info/dependency_links.txt
etaloncorpuscreator.egg-info/entry_points.txt
etaloncorpuscreator.egg-info/requires.txt
etaloncorpuscreator.egg-info/top_level.txt
etaloncorpuscreator/__init__.py
etaloncorpuscreator/__main__.py
etaloncorpuscreator/corpus_creator.py
setup.cfg
setup.py
--- a/PKG-INFO 0 → 100644
View file @305ee56
+++ b/PKG-INFO 0 → 100644
View file @305ee56
+Metadata-Version: 1.1
+Name: etaloncorpuscreator
+Version: 0.1
+Summary: command-line package for automatical creation of russian language audio corpus from YouTube audiotracks and subtitles with using forced alignment by sphinx3
+Home-page: https://github.com/dangrebenkin/audiocorpusbuilder
+Author: Daniel Grebenkin
+Author-email: d.grebenkin@g.nsu.ru
+License: Apache License Version 2.0
+Description: UNKNOWN
+Keywords: dataset,librosa,youtube-dl,youtube,forced alignment,sphinx,sphinx3
+Platform: Linux
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development
+Classifier: Topic :: Scientific/Engineering
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
--- a/README.md 0 → 100644
View file @305ee56
+++ b/README.md 0 → 100644
View file @305ee56
+# About
+
+The etaloncorpuscreator package automatically builds a Russian-language audio corpus from YouTube playlists: it downloads each video's audio and subtitles, cuts them into "sound-text" pairs, runs forced alignment on the pairs, and saves the resulting corpus together with the alignment scores ("variaty" values).
+
+# Installing
+
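+Installation requires Python 3.6 or later, a Linux OS, and sphinx3 installed on your local machine. The program also calls the jq and sox command-line tools, so they must be available as well.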
+
+# Start
+
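+To run etaloncorpuscreator you should prepare directories for audiotracks, subtitles, and results. You also need to create a playlists.txt file containing the playlist links, one link per line. Note that every file in the audiotracks and subtitles directories is deleted after each playlist is processed, so use dedicated directories for them.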
+
+# Arguments
+
+All arguments are required for program use.
+
+1. -p URL_list
+
+Playlists txt-file path.
+
+2. -a directory_audio
+
+Path to download audiotracks.
+
+3. -s directory_subtitles
+
+Path to download subtitles.
+
+4. -r directory_results
+
+Path for audio results.
+
+5. -am sphinx_model_path
+
+Your acoustic model path.
+
+6. -dict dictionary_path
+
+Your dictionary path.
+
+7. -dict_f dictionary_filler_path
+
+Your dictionary filler path.
+
+8. -ar directory_alignment_results
+
+Path for alignment results.
+
+# Usage
+
+eccr [-p URL_list] [-a directory_audio] [-s directory_subtitles] [-r directory_results] [-am sphinx_model_path] [-dict dictionary_path] [-dict_f dictionary_filler_path] [-ar directory_alignment_results]
+
+# Example
+
+eccr -p playlists.txt -a Audio -s Subs -r Results -am ./voxforge_ru_sphinx/model_parameters/voxforge_ru.cd_cont_200 -dict ./voxforge_ru_sphinx/voxforge_ru.dic -dict_f ./voxforge_ru_sphinx/voxforge_ru.filler -ar Alignment
--- a/etaloncorpuscreator.egg-info/PKG-INFO 0 → 100644
View file @305ee56
+++ b/etaloncorpuscreator.egg-info/PKG-INFO 0 → 100644
View file @305ee56
+Metadata-Version: 1.1
+Name: etaloncorpuscreator
+Version: 0.1
+Summary: command-line package for automatical creation of russian language audio corpus from YouTube audiotracks and subtitles with using forced alignment by sphinx3
+Home-page: https://github.com/dangrebenkin/audiocorpusbuilder
+Author: Daniel Grebenkin
+Author-email: d.grebenkin@g.nsu.ru
+License: Apache License Version 2.0
+Description: UNKNOWN
+Keywords: dataset,librosa,youtube-dl,youtube,forced alignment,sphinx,sphinx3
+Platform: Linux
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development
+Classifier: Topic :: Scientific/Engineering
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
--- a/etaloncorpuscreator.egg-info/SOURCES.txt 0 → 100644
View file @305ee56
+++ b/etaloncorpuscreator.egg-info/SOURCES.txt 0 → 100644
View file @305ee56
+README.md
+setup.cfg
+setup.py
+etaloncorpuscreator/__init__.py
+etaloncorpuscreator/__main__.py
+etaloncorpuscreator/corpus_creator.py
+etaloncorpuscreator.egg-info/PKG-INFO
+etaloncorpuscreator.egg-info/SOURCES.txt
+etaloncorpuscreator.egg-info/dependency_links.txt
+etaloncorpuscreator.egg-info/entry_points.txt
+etaloncorpuscreator.egg-info/requires.txt
+etaloncorpuscreator.egg-info/top_level.txt
\ No newline at end of file
--- a/etaloncorpuscreator.egg-info/dependency_links.txt 0 → 100644
View file @305ee56
+++ b/etaloncorpuscreator.egg-info/dependency_links.txt 0 → 100644
View file @305ee56
+
--- a/etaloncorpuscreator.egg-info/entry_points.txt 0 → 100644
View file @305ee56
+++ b/etaloncorpuscreator.egg-info/entry_points.txt 0 → 100644
View file @305ee56
+[console_scripts]
+eccr = etaloncorpuscreator.corpus_creator:main
+
--- a/etaloncorpuscreator.egg-info/requires.txt 0 → 100644
View file @305ee56
+++ b/etaloncorpuscreator.egg-info/requires.txt 0 → 100644
View file @305ee56
+pandas>=1.1.1
+audioread>=2.0.0
+numpy>=1.15.0
+packaging>=18
+scipy>=1.0.0
+scikit-learn!=0.19.0,>=0.14.0
+joblib>=0.14
+decorator>=3.0.0
+resampy>=0.2.2
+numba==0.48
+soundfile>=0.9.0
+pooch>=1.0
+librosa==0.7.0
+youtube-dl>=2020.1.1
--- a/etaloncorpuscreator.egg-info/top_level.txt 0 → 100644
View file @305ee56
+++ b/etaloncorpuscreator.egg-info/top_level.txt 0 → 100644
View file @305ee56
+etaloncorpuscreator
--- a/etaloncorpuscreator/__init__.py 0 → 100644
View file @305ee56
+++ b/etaloncorpuscreator/__init__.py 0 → 100644
View file @305ee56
--- a/etaloncorpuscreator/__main__.py 0 → 100644
View file @305ee56
+++ b/etaloncorpuscreator/__main__.py 0 → 100644
View file @305ee56
+# -*- coding: utf-8 -*-
+# Package entry module: it imports the command-line driver from
+# corpus_creator and calls it immediately.
+
+from .corpus_creator import main
+main()
--- a/etaloncorpuscreator/corpus_creator.py 0 → 100644
View file @305ee56
+++ b/etaloncorpuscreator/corpus_creator.py 0 → 100644
View file @305ee56
+import os
+import re
+import shutil
+import codecs
+import pandas
+import librosa
+import argparse
+import subprocess
+from datetime import datetime
+
+# audiocorpusbuilder
+#
+# Module-level state shared by the functions below through `global`.
+
+# Lines of sphinx3_align's f_ali.out, collected by preparations() and
+# flushed to results.txt.
+results = []
+# Subtitle file names of the current playlist; wavdivision() consumes one
+# per audio file.
+subtitles = []
+# Audio file names of the current playlist.
+wavs = []
+# Cleaned cue texts of the subtitle file being processed.
+subtitles_file = []
+# Cue start / end offsets in seconds, filled by subtitlesdivision().
+startpoints =[]
+finishpoints =[]
+# Number used for the next clip's "<n>.wav" / "<n>.txt"; it is only ever
+# incremented in this module.
+filenamecounter = 1
+# Progress counter printed by wavdivision(); reset to 1 after each playlist.
+counter = 1
+# NOTE(review): not referenced anywhere else in this module.
+total_number = 0
+
+# Columns of Total_results.csv, appended to by preparations().
+wav_names = []
+variaties = []
+txt_names = []
+
+def getting_sound_and_subtitles(link, directory_audio, directory_subtitles,directory_results,sphinx_model_path,dictionary_path,dictionary_filler_path,directory_results2):   
+	"""Process one playlist: download, cut into clips, then run alignment.
+
+	Args:
+		link: Playlist URL; inserted verbatim into a youtube-dl shell command.
+		directory_audio: Download directory for audio. It is concatenated with
+			file names as-is, so it must end with '/'. Every file in it is
+			deleted once the clips have been cut.
+		directory_subtitles: Download directory for subtitles; same trailing
+			'/' requirement, and it is emptied in the same way.
+		directory_results: Directory that receives one sub-folder of clips per
+			audio file; same trailing '/' requirement.
+		sphinx_model_path: Passed through to preparations().
+		dictionary_path: Passed through to preparations().
+		dictionary_filler_path: Passed through to preparations().
+		directory_results2: Passed through to preparations().
+	"""
+	
+	# NOTE(review): 'subtitle_file' looks like a typo for 'subtitles_file'.
+	# The list is only mutated with .clear() below, never rebound, so the
+	# missing declaration does not change behaviour.
+	global subtitles,wavs,subtitle_file,startpoints,finishpoints,counter
+
+	# Shell pipeline: list the playlist entries as JSON, take each id with jq,
+	# prefix it with https://youtu.be/ and redirect the result to videos.txt.
+	# NOTE(review): link and the directory paths are interpolated without shell quoting.
+	list_of_videos = "youtube-dl -j --flat-playlist "+link+" | jq -r '.id' | sed 's_^_https://youtu.be/_' >"+directory_results+"videos.txt"
+	list_of_videos_str = os.popen(list_of_videos).read()
+	with open(directory_results+'videos.txt','r') as videos_in_playlist:
+		lots_of_videos_demo = videos_in_playlist.readlines()
+	lots_of_videos_demo = list(lots_of_videos_demo)
+	for video in lots_of_videos_demo:
+		# Success of a subtitle download is detected by the file count of the
+		# subtitles directory growing by exactly one.
+		amount_subs = len(os.listdir(directory_subtitles))
+		pre_sub = "youtube-dl -i --skip-download --write-sub --sub-lang ru -o '"+directory_subtitles+"%(title)s.%(ext)s'"+" "+video
+		sub = os.popen(pre_sub).read()
+		new_amount_subs = len(os.listdir(directory_subtitles))
+		if amount_subs+1 == new_amount_subs:
+			pre_audio = "youtube-dl -i --extract-audio --audio-format wav -o '"+directory_audio+"%(title)s.%(ext)s'"+" "+video
+			audio = os.popen(pre_audio).read() 
+		else:
+			# No regular 'ru' subtitles appeared: retry with --write-auto-sub.
+			pre_sub = "youtube-dl -i --skip-download --write-auto-sub --sub-lang ru -o '"+directory_subtitles+"%(title)s.%(ext)s'"+" "+video
+			sub = os.popen(pre_sub).read()
+			another_new_amount_subs = len(os.listdir(directory_subtitles))
+			if amount_subs+1 == another_new_amount_subs:
+				pre_audio = "youtube-dl -i --extract-audio --audio-format wav -o '"+directory_audio+"%(title)s.%(ext)s'"+" "+video
+				audio = os.popen(pre_audio).read()
+			else:
+				# Neither kind of subtitles was written: the audio is not downloaded.
+				pass  
+
+	
+	subtitlesfiles = os.listdir(directory_subtitles)
+	for file2 in subtitlesfiles:
+		subtitles.append(file2) 
+	audiofiles = os.listdir(directory_audio)
+	for file1 in audiofiles:
+		wavs.append(file1)	
+	
+	# Both lists are sorted because wavdivision() takes the first remaining
+	# subtitle file for each audio file, i.e. they are matched by sort position.
+	subtitles.sort()
+	wavs.sort()
+	counter_limit = len(wavs)
+	
+	for wav in wavs:
+		wavdivision(wav,directory_audio,directory_results,directory_subtitles,counter_limit)
+		# Reset the per-file cue buffers before the next audio file.
+		subtitles_file.clear()
+		startpoints.clear()
+		finishpoints.clear()
+	
+	wavs.clear()
+	subtitles.clear()
+	counter_limit=0
+	counter=1
+	# Delete every downloaded audio and subtitle file and the temporary video list.
+	used_wavs = [os.path.join(directory_audio,w) for w in os.listdir(directory_audio)]
+	for w in used_wavs:
+		os.remove(w) 
+	used_subs = [os.path.join(directory_subtitles,s) for s in os.listdir(directory_subtitles)]
+	for s in used_subs:
+		os.remove(s)
+	os.remove(directory_results+'videos.txt')
+	
+	os.chdir(directory_results)
+	f = os.listdir(directory_results)
+
+	# 'wavs' is declared global above, so this rebinds the module-level list.
+	wavs = []
+	txts = []
+
+	# Every entry of directory_results is entered with os.chdir, so each one
+	# is expected to be a directory.
+	for folder in f:
+		path = os.path.abspath(folder)
+		os.chdir(path)
+		files_in_folder = os.listdir(path)
+		files_in_folder.sort()
+
+		for e in files_in_folder:
+			u = os.path.abspath(e)
+			# NOTE(review): these are substring tests, not extension tests, and
+			# once a 'wav' entry has been replaced by its absolute path the
+			# 'txt' test runs against that whole path.
+			if 'wav' in e:
+				e = os.path.abspath(e)
+				wavs.append(e)
+			if 'txt' in e:
+				e = os.path.abspath(e)
+				txts.append(e)
+				
+		os.chdir(directory_results)
+		
+	preparations(sphinx_model_path,wavs,txts,dictionary_path,dictionary_filler_path,directory_results2)
+
+
+def subtitlesdivision(file,directory_subtitles):
+	
+	global subtitles_file,startpoints,finishpoints
+	
+	with open(directory_subtitles+file, 'r') as subtitles2:
+		k = subtitles2.readlines()
+	k = list(k) 
+	time_moments = []
+	for string in k:
+		piece_of_time = re.findall('(\d{2}:\d{2}:\d{2}.\d{3}) --> (\d{2}:\d{2}:\d{2}.\d{3})', string) 
+		if piece_of_time != []:
+			string_index = k.index(string)
+			string_index_plus_one = string_index+1
+			if k[string_index_plus_one] != []:
+				j = k[string_index_plus_one]
+				ko = k[string_index_plus_one-1]
+				piece_of_time2 = re.findall('(\d{2}:\d{2}:\d{2}.\d{3}) --> (\d{2}:\d{2}:\d{2}.\d{3})', ko)
+				j2 = k[string_index_plus_one+1]
+				if j2 != []:
+					subtitle = j+j2
+					subtitle = subtitle.lower()
+					subtitle = re.findall(r'([А-я]\w+|[а-я]|[0-9]\d+)', subtitle)
+					subtitle = ' '.join(subtitle)
+					subtitles_file.append(subtitle)
+					time_moments.append(piece_of_time2)
+				else:
+					j = j.lower()
+					j = re.findall(r'([А-я]\w+|[а-я]|[0-9]\d+)', j)
+					j = ' '.join(j)
+					subtitles_file.append(j)
+					time_moments.append(piece_of_time2)
+				
+	for moment in time_moments:
+		for time_seconds in moment:
+			o1 = re.findall('(\d{2}):(\d{2}):(\d{2}).(\d{3})',time_seconds[0])
+			o2 = re.findall('(\d{2}):(\d{2}):(\d{2}).(\d{3})',time_seconds[1])
+
+			for element1 in o1:
+				h2 = int(element1[0])
+				m2 = int(element1[1])
+				s2 = int(element1[2])
+				ms2 = (int(element1[3])) * 1000
+				g1 = datetime(2019, 5, 6, 0, 0, 0, 0)
+				g2 = datetime(2019, 5, 6, h2, m2, s2, ms2)
+				g3 = (g2 - g1)
+				g51 = g3.total_seconds()
+				startpoints.append(g51)
+
+			for element2 in o2:
+				g1 = datetime(2019, 5, 6, 0, 0, 0, 0)
+				h22 = int(element2[0])
+				m22 = int(element2[1])
+				s22 = int(element2[2])
+				ms22 = (int(element2[3])) * 1000
+				g22 = datetime(2019, 5, 6, h22, m22, s22, ms22)
+				g32 = (g22 - g1)
+				g52 = g32.total_seconds()
+				finishpoints.append(g52)
+
+def wavdivision(sound,directory_audio,directory_results,directory_subtitles,counter_limit):
+	
+	global subtitles,filenamecounter,subtitles_file,startpoints,finishpoints,counter
+	
+	for textfile in subtitles:
+		subtitlesdivision(textfile,directory_subtitles)
+		subtitles.remove(textfile)
+		break
+	y, sr = librosa.load(directory_audio+sound,mono=True)
+
+	def finalmoment(start,finish,filenamecounter):
+		j = y[int(start)*sr:int(finish)*sr]                                  
+		os.chdir(directory_results+new_folder)
+		librosa.output.write_wav(str(filenamecounter)+'.wav', j, sr)
+		for subtitletext in subtitles_file:
+			new_file_name_for_text = str(filenamecounter)+'.txt'
+			with open(new_file_name_for_text, 'w') as gh:
+				gh.write(subtitletext)
+			subtitles_file.remove(subtitletext)
+			break
+		
+	os.chdir(directory_results)
+	new_folder = str(sound)
+	os.mkdir(new_folder)
+	os.chdir(directory_results+new_folder)
+
+	for moment1,moment2 in zip(startpoints,finishpoints):
+		finalmoment(moment1, moment2,filenamecounter)
+		filenamecounter += 1
+	print (counter,' from ',counter_limit)
+	counter+=1
+
+
+# sphinxforcealigner
+
+def preparations(sphinx_model_path,wavs,txts,d_path,d_f_path,directory_results2):
+	"""Run sphinx_fe and sphinx3_align over the clips, 100 files at a time.
+
+	Works inside ``<directory_results2>/f_ali``. For every batch it builds
+	``f_ali.transcription`` and ``f_ali.fileids``, converts the audio with sox,
+	runs the two sphinx commands, and records for each produced .phseg file the
+	clip path, the transcript path and the 'Total score:' value in the
+	module-level ``wav_names``, ``txt_names`` and ``variaties`` lists. The lines
+	of every ``f_ali.out`` are appended to ``<directory_results2>/results.txt``.
+
+	Args:
+		sphinx_model_path: Acoustic model directory; ``feat.params`` is read
+			from it and it is passed to sphinx3_align as -hmm.
+		wavs: List of clip paths.
+		txts: List of transcript paths.
+		d_path: Dictionary path, passed as -dict.
+		d_f_path: Filler dictionary path, passed as -fdict.
+		directory_results2: Directory that receives f_ali/ and results.txt.
+	"""
+	
+	global results,wav_names,variaties,txt_names
+	
+	dir = os.path.join(directory_results2,"f_ali")
+	if not os.path.exists(dir):
+		os.mkdir(dir)
+		os.chdir(dir)
+	else:
+		os.chdir(dir)
+		
+	phlabdir = os.path.join(dir,"phsegdir")
+		
+	# slicing files to 100
+	
+	# The loop stops as soon as either list is exhausted.
+	while len(wavs) != 0 and len(txts) != 0:
+		
+		slice_audios = wavs[0:100]
+		slice_annotations = txts[0:100]  		
+		
+		# Scratch directories, created relative to the current directory (dir).
+		os.mkdir('txt')
+		os.mkdir('wav')
+		os.mkdir('phsegdir')
+		
+		#preparing .transcription 
+		
+		for t in slice_annotations:
+			shutil.copy(t,'txt')
+		
+		# One line per transcript: "<s> text </s> (basename without .txt)".
+		for t in slice_annotations:
+			with codecs.open (t,encoding="utf8",errors='ignore') as annotation:
+				text_string = annotation.read()
+				real_string = text_string.replace('\n','')
+				with codecs.open('f_ali.transcription','a',encoding="utf8",errors='ignore') as text_file:
+					t = os.path.basename(t)
+					# NOTE(review): the '.' in '.txt' is an unescaped regex wildcard.
+					t = re.sub('.txt','',t)
+					text_file.write('<s>'+' '+str(real_string)+' '+'</s>'+' '+'('+t+')'+'\n')
+	   
+		#preparing .fileids
+
+		for t in slice_audios:
+			shutil.copy(t,'wav')
+		
+		list_w = os.listdir('wav')
+
+		for path in list_w:
+			a = 'wav/'+path 
+			
+			#convertion
+			# sox writes a 16 kHz, 16-bit, single-channel copy to a temporary
+			# file, which then replaces the copied clip.
+			audio_dir = os.path.dirname(a)
+			command1 = "sox '"+a+"' -r 16000 -b 16 -c 1 '"+audio_dir+"/temporary_audio_wav.wav'" 
+			execute = os.popen(command1).read()
+			os.rename (audio_dir+'/temporary_audio_wav.wav',a)
+		
+		for s in slice_audios:
+			with open ('f_ali.fileids','a') as wav_scp_file:
+				s = os.path.basename(s)
+				# NOTE(review): '.wav' is replaced by a space, not an empty
+				# string, so every id is written with a trailing space.
+				s = re.sub('.wav',' ',s)
+				s = 'wav/'+s
+				wav_scp_file.write(str(s)+'\n')
+
+		#preparing features and doing alignment
+	   
+		# NOTE(review): each os.popen call starts its own shell, so the
+		# 'export' and 'cd' commands presumably do not affect command3 and
+		# command4; those run in this process's current directory, which is
+		# already dir.
+		command1 = 'export LD_LIBRARY_PATH=/usr/local/lib'
+		command2 = 'cd '+dir
+		command3 = 'sphinx_fe -argfile '+sphinx_model_path+'/feat.params -samprate 16000 -c f_ali.fileids -di . -do . -ei wav -eo mfc -mswav yes'
+		command4 = 'sphinx3_align -hmm '+sphinx_model_path+' -dict '+d_path+' -fdict '+d_f_path+' -ctl '+dir+'/f_ali.fileids -cepdir . -cepext .mfc -insent '+dir+'/f_ali.transcription -outsent '+dir+'/f_ali.out -phsegdir '+dir+'/phsegdir'
+		
+		execute1 = os.popen(command1).read()
+		execute2 = os.popen(command2).read()
+		execute3 = os.popen(command3).read()
+		execute4 = os.popen(command4).read()
+		
+		# getting results
+		
+		# Opening f_ali.out raises if sphinx3_align did not produce it.
+		result_path = os.path.join(dir,"f_ali.out")
+		with codecs.open(result_path,'r',encoding="utf8",errors='ignore') as result_file:
+			res = result_file.readlines()
+			for text_res in res:
+				results.append(text_res)
+		os.remove(result_path)
+	
+		# Drop the processed batch; the set difference loses the order, so both
+		# lists are sorted again.
+		wavs = list(set(wavs) - set(slice_audios))
+		txts = list(set(txts) - set(slice_annotations))
+		
+		wavs.sort()
+		txts.sort()
+		
+		#______
+		
+		all_wavs_path = os.path.join(dir,'wav')
+		all_txts_path = os.path.join(dir,'txt')
+		
+		list_results_files = os.listdir(phlabdir)
+		
+		os.chdir(phlabdir)
+		
+		# Only clips for which a .phseg file exists are copied into dir and recorded.
+		for result_file in list_results_files:
+			
+			wav_name = re.sub('.phseg','.wav',result_file)
+			txt_name = re.sub('.phseg','.txt',result_file)
+			
+			wav_name = os.path.join(all_wavs_path,wav_name)
+			txt_name = os.path.join(all_txts_path,txt_name)
+
+			shutil.copy(wav_name,dir)
+			shutil.copy(txt_name,dir)
+			
+			# Removing 'wav/' / 'txt/' from the scratch path yields the path
+			# of the copy that was just placed directly in dir.
+			new_wav_name = os.path.join(dir,wav_name)
+			new_wav_name = re.sub('wav/','',new_wav_name)
+			new_txt_name = os.path.join(dir,txt_name)
+			new_txt_name = re.sub('txt/','',new_txt_name)
+			
+			wav_names.append(new_wav_name)
+			txt_names.append(new_txt_name)
+
+			u = codecs.open (result_file, 'r', encoding = 'utf-8',errors='ignore')
+			u = u.readlines()
+			for line in u:
+				if 'Total score:' in line:
+					# The first run of digits on the line is stored with a
+					# '-' prefix.
+					variaty = re.findall('\d+',line)
+					variaty = '-'+variaty[0]
+					variaties.append(variaty)
+		
+		#removing files and directories for new files
+		
+		os.chdir(dir)
+		shutil.rmtree('txt/')
+		shutil.rmtree('wav/')
+		shutil.rmtree('phsegdir/')
+		os.remove('f_ali.transcription')
+		os.remove('f_ali.fileids')
+	
+	# Append this call's f_ali.out lines to the cumulative results.txt.
+	total_result = os.path.join(directory_results2,'results.txt')
+	with codecs.open(total_result,'a',encoding="utf8",errors='ignore') as result_total:
+		for el in results:
+			result_total.write(el)
+	results.clear()
+
+# arguments parser
+
+def main():	
+
+	parser = argparse.ArgumentParser()
+	
+	parser.add_argument('-p', '--playlist_file', dest='URL_list', type=str,
+						help='playlists txt-file path', required=True)
+	parser.add_argument('-a', '--audio_path', dest='directory_audio', type=str,
+						help='path to download audiotracks', required=True)
+	parser.add_argument('-s','--subs_path', dest='directory_subtitles', type=str,
+						help='path to download subtitles', required=True)
+	parser.add_argument('-r', '--results_path', dest='directory_results', type=str,
+						help='path for results', required=True)
+						
+	parser.add_argument('-am', '--sphinx_model_path', dest='sphinx_model_path', type=str,
+						help='your acoustic model path', required=True)
+	parser.add_argument('-dict', '--dictionary_path', dest='dictionary_path', type=str,
+						help='your dictionary path', required=True)
+	parser.add_argument('-dict_f', '--dictionary_filler_path', dest='dictionary_filler_path', type=str,
+						help='your dictionary filler path', required=True)
+	parser.add_argument('-ar', '--ali_results_path', dest='directory_alignment_results', type=str,
+						help='path for alignment results', required=True)
+
+	args = parser.parse_args()
+
+	directory_audio = os.path.abspath(args.directory_audio)+'/'
+	directory_subtitles = os.path.abspath(args.directory_subtitles)+'/'
+	directory_results = os.path.abspath(args.directory_results)+'/'
+	URL_list = os.path.abspath(args.URL_list)
+	sphinx_model_path = os.path.abspath(args.sphinx_model_path)
+	dictionary_path = os.path.abspath(args.dictionary_path)
+	dictionary_filler_path = os.path.abspath(args.dictionary_filler_path)
+	directory_results2 = os.path.abspath(args.directory_alignment_results)
+
+	with open(URL_list, 'r') as playlists_links:
+		lots_of_playlists = playlists_links.readlines()
+	lots_of_playlists = list(lots_of_playlists)
+	for i in lots_of_playlists:
+		i = re.sub("\n", '', i)
+		if i=='':
+			pass
+		else:
+			getting_sound_and_subtitles(i,directory_audio, directory_subtitles,directory_results,sphinx_model_path,dictionary_path,dictionary_filler_path,directory_results2)
+			wavs.clear()
+			
+	#creating total csv
+
+	os.chdir (directory_results2)
+
+	dict = {'wav_dir': wav_names , 'txt_dir': txt_names, 'variaty': variaties}     
+	df = pandas.DataFrame(dict) 
+
+	df.to_csv ('Total_results.csv', index = False, header=True)
+
+    
--- a/setup.cfg 0 → 100644
View file @305ee56
+++ b/setup.cfg 0 → 100644
View file @305ee56
+[metadata]
+description-file = README.md
+
+[egg_info]
+tag_build = 
+tag_date = 0
+
--- a/setup.py 0 → 100644
View file @305ee56
+++ b/setup.py 0 → 100644
View file @305ee56
+from setuptools import setup, find_packages, Extension
+from os.path import join, dirname
+
+setup(
+    name='etaloncorpuscreator',              
+    version='0.1',                         
+    description='command-line package for automatical creation of russian language audio corpus from YouTube audiotracks and subtitles with using forced alignment by sphinx3',
+    url='https://github.com/dangrebenkin/audiocorpusbuilder',
+    author='Daniel Grebenkin',
+    author_email = 'd.grebenkin@g.nsu.ru',
+    license='Apache License Version 2.0',
+    keywords=['dataset', 'librosa', 'youtube-dl', 'youtube', 'forced alignment', 'sphinx','sphinx3'],     
+    packages = find_packages(), 
+    platforms = 'Linux',
+    entry_points ={ 
+        'console_scripts': [ 
+            'eccr = etaloncorpuscreator.corpus_creator:main'
+        ]
+    },
+    install_requires=[
+	'pandas >= 1.1.1',
+        'audioread >= 2.0.0',
+        'numpy >= 1.15.0',
+        'packaging >= 18',
+        'scipy >= 1.0.0',
+        'scikit-learn >= 0.14.0, != 0.19.0',
+        'joblib >= 0.14',
+        'decorator >= 3.0.0',
+        'resampy >= 0.2.2',
+        'numba == 0.48',
+        'soundfile >= 0.9.0',
+        'pooch >= 1.0',
+        'librosa==0.7.0',
+        'youtube-dl>=2020.1.1'
+    ],
+    classifiers=[
+        'Development Status :: 3 - Alpha',
+        'Intended Audience :: Science/Research',
+        'Intended Audience :: Developers',
+        'Topic :: Software Development',
+        'Topic :: Scientific/Engineering',
+        'License :: OSI Approved :: Apache Software License',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',  
+        'Programming Language :: Python :: 3.8']            
+) 
+