# SPDX-License-Identifier: CECILL-2.1

"""
    MultilingualLibriSpeech(lang)

Multilingual LibriSpeech (MLS) corpus for the language `lang`.

`lang` must be one of the keys of `MLS_LANG_CODE` (for instance `"deu"` or `"eng"`).
The `name` field is always set to `"multilingual_librispeech"`.

# Throws
- `ArgumentError` if `lang` is not a key of `MLS_LANG_CODE`.
"""
struct MultilingualLibriSpeech <: SpeechCorpus
    lang::String
    name::String

    function MultilingualLibriSpeech(lang)
        # Fail at construction time rather than with a `KeyError` deep inside
        # `download` / `recordings` / `annotations`.
        if !haskey(MLS_LANG_CODE, lang)
            supported = join(sort!(collect(keys(MLS_LANG_CODE))), ", ")
            throw(ArgumentError(
                "unsupported MLS language $(repr(lang)); expected one of: $supported"
            ))
        end
        return new(lang, "multilingual_librispeech")
    end
end

# Language code => language name as it appears in the MLS archive and directory names.
const MLS_LANG_CODE = Dict(
    "deu" => "german",
    "eng" => "english",
    "esp" => "spanish",
    "fra" => "french",
    "ita" => "italian",
    "nld" => "dutch",
    "pol" => "polish",
    "prt" => "portuguese"
)

# Language code => URL of the audio (and transcripts) archive.
const MLS_AUDIO_URLS = Dict(
    "deu" => "https://dl.fbaipublicfiles.com/mls/mls_german.tar.gz",
    "eng" => "https://dl.fbaipublicfiles.com/mls/mls_english.tar.gz",
    "esp" => "https://dl.fbaipublicfiles.com/mls/mls_spanish.tar.gz",
    "fra" => "https://dl.fbaipublicfiles.com/mls/mls_french.tar.gz",
    "ita" => "https://dl.fbaipublicfiles.com/mls/mls_italian.tar.gz",
    "nld" => "https://dl.fbaipublicfiles.com/mls/mls_dutch.tar.gz",
    "pol" => "https://dl.fbaipublicfiles.com/mls/mls_polish.tar.gz",
    "prt" => "https://dl.fbaipublicfiles.com/mls/mls_portuguese.tar.gz"
)

# Language code => URL of the language-model data archive.
const MLS_LM_URLS = Dict(
    "deu" => "https://dl.fbaipublicfiles.com/mls/mls_lm_german.tar.gz",
    "eng" => "https://dl.fbaipublicfiles.com/mls/mls_lm_english.tar.gz",
    "esp" => "https://dl.fbaipublicfiles.com/mls/mls_lm_spanish.tar.gz",
    "fra" => "https://dl.fbaipublicfiles.com/mls/mls_lm_french.tar.gz",
    "ita" => "https://dl.fbaipublicfiles.com/mls/mls_lm_italian.tar.gz",
    "nld" => "https://dl.fbaipublicfiles.com/mls/mls_lm_dutch.tar.gz",
    "pol" => "https://dl.fbaipublicfiles.com/mls/mls_lm_polish.tar.gz",
    "prt" => "https://dl.fbaipublicfiles.com/mls/mls_lm_portuguese.tar.gz"
)

# Download the archive at `url` into `dir`, unpack it there and delete the archive.
function _mls_fetch(url, dir)
    # The archive file name is the last component of the URL
    # (e.g. `mls_german.tar.gz`, `mls_lm_german.tar.gz`).
    tarpath = joinpath(dir, basename(url))
    # `-O` overwrites a partial archive left behind by an interrupted run. With `-P`
    # alone, wget would store the new download under a `.1` suffix and the stale
    # file would be the one handed to `tar`.
    run(`wget -O $tarpath $url`)
    @info "extracting"
    run(`tar -xf $tarpath -C $dir`)
    rm(tarpath)
    return nothing
end

"""
    download(corpus::MultilingualLibriSpeech, outdir)

Download and unpack the audio archive and the LM archive of `corpus` into the
directory given by `path(corpus, outdir)`, then return `corpus`.

The work is skipped when the marker file `.download.done` already exists in that
directory. The external programs `wget`, `tar` and `date` are invoked and must
therefore be available.
"""
function Base.download(corpus::MultilingualLibriSpeech, outdir)
    dir = path(corpus, outdir)
    donefile = joinpath(dir, ".download.done")

    if !isfile(donefile)
        mkpath(dir)

        @info "downloading the corpus"
        _mls_fetch(MLS_AUDIO_URLS[corpus.lang], dir)

        @info "downloading LM data"
        _mls_fetch(MLS_LM_URLS[corpus.lang], dir)

        # Written last: if anything above fails, the marker is absent and the next
        # call starts the download again.
        run(pipeline(`date`, stdout = donefile))
    end

    @info "dataset in $dir"
    return corpus
end

"""
    recordings(corpus::MultilingualLibriSpeech, dir, subset)

Return a `Dict` mapping recording id to `Recording` for every `.flac` file found
two directory levels below `<dir>/mls_<language>/<subset>/audio`.

The id is the file name without its `.flac` extension. Each recording is declared
with channel `1` and a sample rate of `16000`, and is read through
`sox <file> -t wav -`. Entries that are not directories (first two levels) or not
`.flac` files (last level) are ignored.
"""
function recordings(corpus::MultilingualLibriSpeech, dir, subset)
    subsetdir = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "audio")

    recs = Dict()
    for outer in readdir(subsetdir; join = true)
        isdir(outer) || continue
        for inner in readdir(outer; join = true)
            isdir(inner) || continue
            for flacpath in readdir(inner; join = true)
                endswith(flacpath, ".flac") || continue
                id = first(splitext(basename(flacpath)))
                r = Recording(
                    id,
                    CmdAudioSource(`sox $flacpath -t wav -`);
                    channels = [1],
                    samplerate = 16000
                )
                recs[r.id] = r
            end
        end
    end
    return recs
end

"""
    annotations(corpus::MultilingualLibriSpeech, dir, subset)

Return a `Dict` mapping annotation id to `Annotation`, read from
`<dir>/mls_<language>/<subset>/transcripts.txt`.

Each non-blank line is split on whitespace: the first token is used both as the
annotation id and as the second positional argument of `Annotation`, and the
remaining tokens, joined with single spaces, are stored under the `"text"` key of
the annotation data. Blank lines are skipped.
"""
function annotations(corpus::MultilingualLibriSpeech, dir, subset)
    trans = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "transcripts.txt")

    sups = Dict()
    open(trans, "r") do f
        for line in eachline(f)
            tokens = split(line)
            # A blank (or whitespace-only) line has no id token; indexing it would
            # raise a `BoundsError`.
            isempty(tokens) && continue

            id = tokens[1]
            text = join(tokens[2:end], " ")
            s = Annotation(id, id; channel = 1, data = Dict("text" => text))
            sups[s.id] = s
        end
    end
    return sups
end

# Write `items` to `out` with `writemanifest`, going through a temporary file so
# that `out` only ever exists once it has been written completely.
function _mls_savemanifest(out, items)
    tmp = out * ".tmp"
    open(tmp, "w") do f
        writemanifest(f, items)
    end
    mv(tmp, out; force = true)
    return nothing
end

"""
    prepare(corpus::MultilingualLibriSpeech, outdir)

Write the recording and annotation manifests of the `train`, `dev` and `test`
subsets into the directory given by `path(corpus, outdir)`, then return `corpus`.

The files are named `recording-manifest-<subset>.jsonl` and
`annotation-manifest-<subset>.jsonl`. A manifest that already exists is left
untouched.
"""
function prepare(corpus::MultilingualLibriSpeech, outdir)
    dir = path(corpus, outdir)
    subsets = ("train", "dev", "test")

    # 1. Recording manifests.
    for subset in subsets
        out = joinpath(dir, "recording-manifest-$subset.jsonl")
        @info "preparing recording manifest ($subset) $out"
        isfile(out) || _mls_savemanifest(out, recordings(corpus, dir, subset))
    end

    # 2. Annotation manifests.
    for subset in subsets
        out = joinpath(dir, "annotation-manifest-$subset.jsonl")
        @info "preparing annotation manifest ($subset) $out"
        isfile(out) || _mls_savemanifest(out, annotations(corpus, dir, subset))
    end

    return corpus
end