Commit 188c828
New VoxForge Importer and generation of the final MITADS-Speech Dataset (#130)
eziolotta authored May 22, 2021
1 parent 0cd7e74 commit 188c828
Showing 6 changed files with 193 additions and 6 deletions.
43 changes: 43 additions & 0 deletions MITADS-Speech/assets/corpora_collector/mitads-speech-full.yaml
@@ -0,0 +1,43 @@
##
name: 'mitads-speech-full'
version: '0.1'
description: 'MITADS-Speech Dataset; filters out audio longer than 20 seconds'

corpus2collect:
voxforge:
filter:
max_duration: 20
evalita2009:
filter:
max_duration: 20
mspka:
filter:
max_duration: 20
siwis:
filter:
max_duration: 20
#common_voice:
# filter:
# max_duration: 20

m-ailabs:
filter:
max_duration: 20

mls:
filter:
max_duration: 20
comments_contains:
## filter out works in archaic Italian by these authors
- Dante Alighieri
- Giovanni Francesco Straparola
- Niccolò Machiavelli
## filter out book titles already present in m-ailabs
- Novelle per un anno
- Galatea
- Il fu Mattia Pascal
- Ritratto del Diavolo
- Contessa di Karolystria
- meraviglie del Duemila
- Malavoglia
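
For context, a minimal sketch of how corpora_collector.py might apply these filters; the exact semantics (drop rows above max_duration, drop rows whose comments contain a listed string) are an assumption read off the keys above, and load_collector_config and keep_row are hypothetical helpers, not the collector's actual code:

import yaml

def load_collector_config(path):
    ## hypothetical helper: parse the YAML config above into a dict
    with open(path, encoding='utf-8') as f:
        return yaml.safe_load(f)

def keep_row(duration, comments, corpus_filter):
    ## assumed semantics: drop rows longer than max_duration seconds or
    ## whose comments contain any of the listed strings
    if duration > corpus_filter.get('max_duration', float('inf')):
        return False
    if any(s in (comments or '') for s in corpus_filter.get('comments_contains', [])):
        return False
    return True

config = load_collector_config('mitads-speech-full.yaml')
mls_filter = config['corpus2collect']['mls']['filter']
print(keep_row(12.5, 'Dante Alighieri - Inferno', mls_filter))  ## False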

9 changes: 8 additions & 1 deletion MITADS-Speech/corpora_collector.py
@@ -39,6 +39,10 @@
help='root folder of the CSV datasets to collect; also the root of the output CSV. '
'Default is root_project/MITADS-Speech-output')

collector_parser.add_argument('-d', '--dataset_output', type=str, default='',
help='root folder of the output dataset. '
'Default is csv_folder')

collector_parser.add_argument('-z', '--zip_output', type=str, default='true',
help='if true, collect files into a .zip; if false, files are copied to a folder in csv_folder')

@@ -164,6 +168,9 @@ def collect_datasets(config,args):

zip_output = args.zip_output.lower() == 'true'
csv_corpus_rootdir = args.csv_folder

final_dataset_root = csv_corpus_rootdir if args.dataset_output=='' else args.dataset_output

corpus2collect = config['corpus2collect']


@@ -183,7 +190,7 @@
final_corpora_name = config['name']
final_corpora_version = config['version']
output_corpora_foldername = final_corpora_name + '_' + 'v' + final_corpora_version
corpora_output_dir = os.path.join(csv_corpus_rootdir, output_corpora_foldername)
corpora_output_dir = os.path.join(final_dataset_root, output_corpora_foldername)

if not path.exists(corpora_output_dir):
print('No path "%s" - creating ...' % corpora_output_dir)
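
A minimal sketch of the resulting output path, using the name and version from the YAML above and a hypothetical --dataset_output of /data/mitads-out:

import os

final_dataset_root = '/data/mitads-out'  ## from the new -d/--dataset_output flag
output_corpora_foldername = 'mitads-speech-full' + '_' + 'v' + '0.1'
corpora_output_dir = os.path.join(final_dataset_root, output_corpora_foldername)
print(corpora_output_dir)  ## /data/mitads-out/mitads-speech-full_v0.1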
4 changes: 3 additions & 1 deletion MITADS-Speech/corpora_importer.py
@@ -294,7 +294,7 @@ def _maybe_convert_sets(self,corpus:Corpus):

## all examples are processed: even when resampling is unnecessary, duration and other filters must still be evaluated
samples = [ [a,corpus.make_wav_resample, corpus.utterences[a]] for a in corpus.audios ]
##self.one_sample(samples[0])
#self.one_sample(samples[23])
# Mutable counters for the concurrent embedded routine
counter = get_counter()
print(f"Converting audio files to wav {SAMPLE_RATE}hz Mono")
@@ -331,6 +331,8 @@ def row_validation(self,filename,duration,comments):
def one_sample(self,sample):

delete_original_if_resampled = True
## set to False to allow running the importer multiple times (e.g. for local tests)
#delete_original_if_resampled = False

orig_filename = sample[0]
make_wav_resample = sample[1]
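
one_sample goes on to convert each file to mono wav at SAMPLE_RATE (the function is truncated here); as a hedged sketch of how such a conversion can be done with the sox package from requirements.txt, the function name and transformer settings below are assumptions, not the importer's actual code:

import sox

def resample_to_wav(src_path, dst_path, sample_rate=16000):
    ## convert any input audio to 16-bit mono wav at the target rate
    tfm = sox.Transformer()
    tfm.convert(samplerate=sample_rate, n_channels=1, bitdepth=16)
    tfm.build(src_path, dst_path)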
3 changes: 2 additions & 1 deletion MITADS-Speech/requirements.txt
@@ -4,4 +4,5 @@ sox
progressbar2==3.47.0
## pycopy-shutil has install problems on Colab
charset_normalizer
ds-ctcdecoder==0.9.3
PyYAML
23 changes: 20 additions & 3 deletions MITADS-Speech/siwis_importer.py
@@ -20,8 +20,8 @@ def get_corpus(self):
text_dir = os.path.join(self.origin_data_path, self.extract_dir, "txt","IT")
## read the transcripts from the prompts file
transcripts = {}
##cp1252 if windows os
encoding = 'cp1252' if os.name == 'nt' else 'utf-8'
## the prompts files are encoded in cp1252
encoding = 'cp1252'
### parse the prompts file line by line
with open(os.path.join(self.origin_data_path,self.extract_dir, "prompts","ALL_IT_prompts_iso.txt"), "r",encoding=encoding) as f:
line = f.readline()
@@ -103,6 +103,13 @@ def get_speaker_id(self,audio_file_path):
# Validate and normalize transcriptions. Returns a cleaned version of the label
# or None if it's invalid.
def validate_label(self,label):
##import unicodedata
## NFKD normalization would strip the accented chars (è ò à), so it stays disabled
#label = (
#    unicodedata.normalize("NFKD", label.strip())
#    .encode("ascii", "ignore")
#    .decode("ascii", "ignore")
#    )

label = label.replace("-", " ")
label = label.replace("_", " ")
@@ -154,20 +161,30 @@ def validate_label(self,label):
label = label.replace("741", "settecentoquarantuno")
label = label.replace("103", "settecentoquarantuno")
########################
## other characters to clean (strip the BOM)
label = label.replace("\ufeff", "")
##

if re.search(r"[0-9]|[\[\]&*{]", label) is not None:
return None


label = label.strip()
label = label.lower()

## DEBUG - uncomment to check normalization char by char
#DEBUG_ALPHABET = ' ,\',a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,à,è,é,ì,í,ò,ó,ô,ù,ú'.split(',')
#for c in label:
# if(c not in DEBUG_ALPHABET):
# print('CHECK char:'+ c)

return label if label else None
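
For illustration, hypothetical inputs and the labels the method above would return; imp stands for an importer instance and the strings are made up:

print(imp.validate_label("Il Trentino-Alto Adige"))  ## "il trentino alto adige"
print(imp.validate_label("\ufeffBuongiorno"))        ## "buongiorno"
print(imp.validate_label("Pagina 12"))               ## None: unhandled digits remain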

if __name__ == "__main__":

from corpora_importer import importer_parser
args = importer_parser.parse_args()
#args.download_directory = "F:\\DATASET-MODELS\\speech_dataset\\CORPORA-IT-AUDIO\\SIWIS"
#args.csv_output_folder = "F:\\DATASET-MODELS\\speech_dataset\\new-speech-corpora-it"

corpus_name=CORPUS_NAME
archive_url = 'https://phonogenres.unige.ch/downloads/siwis_latest.zip'
117 changes: 117 additions & 0 deletions MITADS-Speech/voxforge_importer.py
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
import os
import re
import time
import urllib.request

from bs4 import BeautifulSoup

from corpora_importer import ArchiveImporter, Corpus, string_escape

CORPUS_NAME = 'voxforge'

class VoxforgeImporter(ArchiveImporter):


def get_corpus(self):
## extract the audio files and transcripts; ArchiveImporter merges the data and makes the final train/test/dev datasets
utterances = {}
audios = []
wav_dir = os.path.join(self.origin_data_path, self.archive_name, "wav")
text_file = os.path.join(self.origin_data_path, self.archive_name, "etc","PROMPTS")

wav_files = [f for f in os.listdir(wav_dir) if os.path.isfile(os.path.join(wav_dir, f))]
count = 0

with open(text_file, encoding='utf-8') as f:
    for line in f:
        ref_url, transcript = line.split(" ", 1)
        transcript = transcript.lower().replace('\n', '')

        parts = ref_url.split('/')
        speaker_id = parts[0]
        file_n = parts[-1]
        for wav_file in wav_files:
            if file_n in wav_file:
                ## match found: link the transcript to this wav file
                wav_file_path = os.path.join(wav_dir, wav_file)
                utterances[wav_file_path] = transcript
                audios.append(wav_file_path)
                count += 1
                break


##collect corpus
corpus = Corpus(utterances,audios)
#################
## VoxForge needs wav resampling
##
corpus.make_wav_resample = True
return corpus
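
The inner loop above rescans every wav file for each PROMPTS line; as a hedged optimization sketch, assuming file_n always matches the wav file's stem (which the VoxForge layout suggests, but is not verified here):

## index wavs by stem once, then look each transcript up directly
wav_by_stem = {os.path.splitext(f)[0]: f for f in wav_files}
wav_file = wav_by_stem.get(file_n)
if wav_file is not None:
    wav_file_path = os.path.join(wav_dir, wav_file)
    utterances[wav_file_path] = transcript
    audios.append(wav_file_path)
    count += 1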

def get_speaker_id(self,audio_file_path):

return self.archive_name


def get_voxforge_bad_speaker():

    ## speakers whose archives are excluded from the corpus; extend as needed
    l = []
    l.append("anonymous-20080504-qvg")
    l.append("anonymous-20080723-ouv")
    l.append("anonymous-20080725-dey")
    l.append("Vistaus-20080718-mrm")

    return l



if __name__ == "__main__":

from corpora_importer import importer_parser
args = importer_parser.parse_args()

corpus_name = CORPUS_NAME

#voxforge_url = "http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit"
voxforge_url = "http://www.repository.voxforge1.org/downloads/it/Trunk/Audio/Main/16kHz_16bit/"


html_page = urllib.request.urlopen(voxforge_url)
soup = BeautifulSoup(html_page, "html.parser")

# list all archive links on the index page
archives = [l["href"] for l in soup.find_all("a") if ".tgz" in l["href"]]

bad_speakers = get_voxforge_bad_speaker()
for i in range(len(archives)):
    archive_url = voxforge_url + archives[i]

    speaker_id = archives[i].split('.')[0]

    if speaker_id in bad_speakers:
        ## skip known bad speakers
        print("filtering out speaker {}".format(speaker_id))
        continue

    csv_append_mode = i != 0

    _importer = VoxforgeImporter(corpus_name, archive_url, data_dir=args.download_directory, output_path=args.csv_output_folder, csv_append_mode=csv_append_mode)

    try:
        _importer.run()
    except Exception as e:
        print(str(e))
        print('ARCHIVE CORRUPTED {}'.format(_importer.archive_name))
        ## some archives are corrupted; skip them
        continue

    ## throttle requests so the host does not drop the connection
    time.sleep(2)

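If the host still drops connections despite the sleep, a hedged retry wrapper could replace the bare try/except above; max_retries and the linear backoff schedule are assumptions:

import time

def run_with_retries(importer, max_retries=3, base_delay=2):
    ## retry a flaky archive download/import with a growing delay
    for attempt in range(1, max_retries + 1):
        try:
            importer.run()
            return True
        except Exception as e:
            print('attempt {} failed for {}: {}'.format(attempt, importer.archive_name, e))
            time.sleep(base_delay * attempt)
    return False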