# SPDX-License-Identifier: CECILL-2.1

"""
    MultilingualLibriSpeechCorpus

Singleton type used to dispatch the corpus functions (`download`, `recordings`,
`supervisions`, `prepare`) to the Multilingual LibriSpeech (MLS) implementation.
Use the instance `MultilingualLibriSpeech`.
"""
struct MultilingualLibriSpeechCorpus end

const MultilingualLibriSpeech = MultilingualLibriSpeechCorpus()

# Language symbol => language name as it appears in the MLS archive and directory names.
const MLS_LANG_CODE = Dict(
    :deu => "german",
    :eng => "english",
    :esp => "spanish",
    :fra => "french",
    :ita => "italian",
    :nld => "dutch",
    :pol => "polish",
    :prt => "portuguese"
)

# Language symbol => URL of the audio + transcripts archive.
const MLS_AUDIO_URLS = Dict(
    :deu => "https://dl.fbaipublicfiles.com/mls/mls_german.tar.gz",
    :eng => "https://dl.fbaipublicfiles.com/mls/mls_english.tar.gz",
    :esp => "https://dl.fbaipublicfiles.com/mls/mls_spanish.tar.gz",
    :fra => "https://dl.fbaipublicfiles.com/mls/mls_french.tar.gz",
    :ita => "https://dl.fbaipublicfiles.com/mls/mls_italian.tar.gz",
    :nld => "https://dl.fbaipublicfiles.com/mls/mls_dutch.tar.gz",
    :pol => "https://dl.fbaipublicfiles.com/mls/mls_polish.tar.gz",
    :prt => "https://dl.fbaipublicfiles.com/mls/mls_portuguese.tar.gz"
)

# Language symbol => URL of the language-model data archive.
const MLS_LM_URLS = Dict(
    :deu => "https://dl.fbaipublicfiles.com/mls/mls_lm_german.tar.gz",
    :eng => "https://dl.fbaipublicfiles.com/mls/mls_lm_english.tar.gz",
    :esp => "https://dl.fbaipublicfiles.com/mls/mls_lm_spanish.tar.gz",
    :fra => "https://dl.fbaipublicfiles.com/mls/mls_lm_french.tar.gz",
    :ita => "https://dl.fbaipublicfiles.com/mls/mls_lm_italian.tar.gz",
    :nld => "https://dl.fbaipublicfiles.com/mls/mls_lm_dutch.tar.gz",
    :pol => "https://dl.fbaipublicfiles.com/mls/mls_lm_polish.tar.gz",
    :prt => "https://dl.fbaipublicfiles.com/mls/mls_lm_portuguese.tar.gz"
)

# Download the archive at `url` into `outdir` with `wget`, unpack it there with `tar`,
# then delete the archive. `wget -P` stores the file under the URL's basename.
function _mls_fetch_archive(url, outdir)
    run(`wget -P $outdir $url`)
    tarpath = joinpath(outdir, basename(url))
    @info "extracting"
    run(`tar -xf $tarpath -C $outdir`)
    rm(tarpath)
    return nothing
end

"""
    download(::MultilingualLibriSpeechCorpus, lang, outdir)

Download and extract the MLS audio archive and LM archive for `lang` into `outdir`.

`lang` must be a key of `MLS_AUDIO_URLS` / `MLS_LM_URLS`; otherwise a `KeyError` is
thrown before anything is created on disk. On success a `.download.done` marker file is
written in `outdir`; when that marker already exists the download is skipped.

Requires the external programs `wget`, `tar` and `date`.
"""
function Base.download(::MultilingualLibriSpeechCorpus, lang, outdir)
    donefile = joinpath(outdir, ".download.done")
    if !isfile(donefile)
        # Resolve both URLs first so an unsupported `lang` fails without side effects.
        audiourl = MLS_AUDIO_URLS[lang]
        lmurl = MLS_LM_URLS[lang]

        mkpath(outdir)

        @info "downloading the corpus"
        _mls_fetch_archive(audiourl, outdir)

        @info "downloading LM data"
        _mls_fetch_archive(lmurl, outdir)

        # The marker is written last: an interrupted run is retried on the next call.
        run(pipeline(`date`, stdout = donefile))
    end
    @info "dataset in $outdir"
end

# Return the `.flac` files located exactly two directory levels below `audiodir`
# (`audiodir/<d1>/<d2>/<file>.flac`), in `readdir` order. Entries that are not
# directories at the first two levels, and files without the `.flac` extension,
# are ignored.
function _mls_flac_files(audiodir)
    paths = String[]
    for d1 in readdir(audiodir; join = true)
        isdir(d1) || continue
        for d2 in readdir(d1; join = true)
            isdir(d2) || continue
            for path in readdir(d2; join = true)
                if isfile(path) && endswith(path, ".flac")
                    push!(paths, path)
                end
            end
        end
    end
    return paths
end

"""
    recordings(::MultilingualLibriSpeechCorpus, lang, dir, subset)

Build the recordings of `subset` for language `lang` from the corpus extracted in `dir`.

Every `.flac` file under `dir/mls_<language>/<subset>/audio/*/*/` yields one `Recording`
whose id is the file name without `.flac` and whose audio is read through `sox`.
Returns a `Dict` mapping recording id to `Recording`.
"""
function recordings(::MultilingualLibriSpeechCorpus, lang, dir, subset)
    subsetdir = joinpath(dir, "mls_$(MLS_LANG_CODE[lang])", subset, "audio")

    recs = Dict()
    for path in _mls_flac_files(subsetdir)
        id = replace(basename(path), ".flac" => "")
        r = Recording(
            id,
            CmdAudioSource(`sox $path -t wav -`);
            channels = [1],
            samplerate = 16000
        )
        recs[r.id] = r
    end
    return recs
end

# Split one transcript line into `(id, text)`: the first whitespace-separated token is
# the id, the remaining tokens joined by single spaces are the text. Returns `nothing`
# for a blank line.
function _mls_parse_transcript_line(line)
    tokens = split(line)
    isempty(tokens) && return nothing
    return (tokens[1], join(tokens[2:end], " "))
end

"""
    supervisions(::MultilingualLibriSpeechCorpus, lang, dir, subset)

Build the supervisions of `subset` for language `lang` from
`dir/mls_<language>/<subset>/transcripts.txt`.

Each non-blank line yields one `Supervision` whose id and recording id are both the first
token of the line, with the remaining tokens stored under `data["text"]`. Blank lines are
skipped. Returns a `Dict` mapping supervision id to `Supervision`.
"""
function supervisions(::MultilingualLibriSpeechCorpus, lang, dir, subset)
    trans = joinpath(dir, "mls_$(MLS_LANG_CODE[lang])", subset, "transcripts.txt")

    sups = Dict()
    open(trans, "r") do f
        for line in eachline(f)
            parsed = _mls_parse_transcript_line(line)
            parsed === nothing && continue
            id, text = parsed
            s = Supervision(id, id; channel = 1, data = Dict("text" => text))
            sups[s.id] = s
        end
    end
    return sups
end

# Write `items` to the manifest file `out` through a temporary sibling file, so that
# `out` only ever exists once it has been completely written.
function _mls_write_manifest(out, items)
    tmp = out * ".partial"
    try
        open(tmp, "w") do f
            writemanifest(f, items)
        end
        mv(tmp, out; force = true)
    catch
        rm(tmp; force = true)
        rethrow()
    end
    return nothing
end

"""
    prepare(multils::MultilingualLibriSpeechCorpus, lang, dir)

Write the recording and supervision manifests of the `train`, `dev` and `test` subsets
into `dir` as `recording-manifest-<subset>.jsonl` and `supervision-manifest-<subset>.jsonl`.

A manifest that already exists is left untouched.
"""
function prepare(multils::MultilingualLibriSpeechCorpus, lang, dir)
    subsets = ["train", "dev", "test"]

    # 1. Recording manifests.
    for subset in subsets
        out = joinpath(dir, "recording-manifest-$subset.jsonl")
        @info "preparing recording manifest ($subset) $out"
        if !isfile(out)
            _mls_write_manifest(out, recordings(multils, lang, dir, subset))
        end
    end

    # 2. Supervision manifests.
    for subset in subsets
        out = joinpath(dir, "supervision-manifest-$subset.jsonl")
        @info "preparing supervision manifest ($subset) $out"
        if !isfile(out)
            _mls_write_manifest(out, supervisions(multils, lang, dir, subset))
        end
    end

    return nothing
end