# SPDX-License-Identifier: CECILL-2.1
"""
    MultilingualLibriSpeech(lang)

Handle on the Multilingual LibriSpeech (MLS) corpus for a single language.

`lang` must be one of the three-letter codes used as keys of `MLS_LANG_CODE`
(for instance `"eng"` or `"fra"`). The `name` field is always set to
`"multilingual_librispeech"`.

# Throws
- `ArgumentError` if `lang` is not a key of `MLS_LANG_CODE`.
"""
struct MultilingualLibriSpeech <: SpeechCorpus
    lang::String
    name::String

    function MultilingualLibriSpeech(lang)
        # Fail here rather than with a bare `KeyError` later on: `download`,
        # `recordings` and `annotations` all index the language tables with
        # `corpus.lang`.
        if !haskey(MLS_LANG_CODE, lang)
            supported = join(sort!(collect(keys(MLS_LANG_CODE))), ", ")
            throw(ArgumentError(
                "unsupported language $(repr(lang)), expected one of: $supported"
            ))
        end
        return new(lang, "multilingual_librispeech")
    end
end
# Language code accepted by this recipe => language name as it appears in the
# MLS archive and directory names (`mls_<name>`).
# NOTE: the keys are this package's own convention; "esp" and "prt" are not
# the ISO 639-3 codes for Spanish and Portuguese ("spa" / "por").
const MLS_LANG_CODE = Dict{String,String}(
    "deu" => "german",
    "eng" => "english",
    "esp" => "spanish",
    "fra" => "french",
    "ita" => "italian",
    "nld" => "dutch",
    "pol" => "polish",
    "prt" => "portuguese",
)
# Language code => URL of the gzipped tarball holding the audio and the
# transcripts of that language.
const MLS_AUDIO_URLS = let base = "https://dl.fbaipublicfiles.com/mls"
    Dict{String,String}(
        "deu" => "$base/mls_german.tar.gz",
        "eng" => "$base/mls_english.tar.gz",
        "esp" => "$base/mls_spanish.tar.gz",
        "fra" => "$base/mls_french.tar.gz",
        "ita" => "$base/mls_italian.tar.gz",
        "nld" => "$base/mls_dutch.tar.gz",
        "pol" => "$base/mls_polish.tar.gz",
        "prt" => "$base/mls_portuguese.tar.gz",
    )
end
# Language code => URL of the gzipped tarball holding the language-model data
# of that language.
const MLS_LM_URLS = Dict{String,String}(
    "deu" => "https://dl.fbaipublicfiles.com/mls/mls_lm_german.tar.gz",
    "eng" => "https://dl.fbaipublicfiles.com/mls/mls_lm_english.tar.gz",
    "esp" => "https://dl.fbaipublicfiles.com/mls/mls_lm_spanish.tar.gz",
    "fra" => "https://dl.fbaipublicfiles.com/mls/mls_lm_french.tar.gz",
    "ita" => "https://dl.fbaipublicfiles.com/mls/mls_lm_italian.tar.gz",
    "nld" => "https://dl.fbaipublicfiles.com/mls/mls_lm_dutch.tar.gz",
    "pol" => "https://dl.fbaipublicfiles.com/mls/mls_lm_polish.tar.gz",
    "prt" => "https://dl.fbaipublicfiles.com/mls/mls_lm_portuguese.tar.gz",
)
"""
    Base.download(corpus::MultilingualLibriSpeech, outdir)

Fetch and unpack the audio and language-model archives of `corpus`.

The target directory is `path(corpus, outdir)`. Each archive is downloaded
with `wget`, extracted in place with `tar` and then deleted. Once both
archives are unpacked, a `.download.done` marker holding the output of `date`
is written to the target directory; if that marker already exists the call
does nothing.

Requires the external programs `wget`, `tar` and `date`.

# Returns
`corpus`.

# Throws
- `KeyError` if `corpus.lang` has no entry in the URL tables.
- `ProcessFailedException` if one of the external commands fails.
"""
function Base.download(corpus::MultilingualLibriSpeech, outdir)
    dir = path(corpus, outdir)
    donefile = joinpath(dir, ".download.done")

    # The marker is only written after everything succeeded, so its presence
    # means there is nothing left to do.
    isfile(donefile) && return corpus

    mkpath(dir)
    archives = (
        "the corpus" => MLS_AUDIO_URLS[corpus.lang],
        "LM data" => MLS_LM_URLS[corpus.lang],
    )
    for (what, url) in archives
        @info "downloading $what"
        run(`wget -P $dir $url`)

        # `wget -P` stores the file under the last component of the URL.
        tarpath = joinpath(dir, basename(url))

        # Unpack before deleting: `recordings` and `annotations` read the
        # extracted `mls_<language>` tree, not the tarball.
        @info "extracting $tarpath"
        run(`tar -C $dir -xzf $tarpath`)
        rm(tarpath)
    end

    run(pipeline(`date`, stdout = donefile))
    return corpus
end
"""
    recordings(corpus::MultilingualLibriSpeech, dir, subset)

Collect the recordings of one `subset` of `corpus`.

Walks `<dir>/mls_<language>/<subset>/audio` two directory levels deep and
turns every entry found below that into a `Recording`. The recording id is
the entry's base name with `".flac"` removed, and the audio source is a `sox`
command converting the file to WAV on standard output. Every recording is
declared with channel `1` and a sample rate of 16000.

# Returns
A `Dict` mapping each recording id to its `Recording`.
"""
function recordings(corpus::MultilingualLibriSpeech, dir, subset)
    audioroot = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "audio")
    children(p) = readdir(p; join = true)

    table = Dict()
    for outer in children(audioroot), inner in children(outer), file in children(inner)
        uttid = replace(basename(file), ".flac" => "")
        rec = Recording(
            uttid,
            CmdAudioSource(`sox $file -t wav -`);
            channels = [1],
            samplerate = 16000,
        )
        table[rec.id] = rec
    end
    return table
end
"""
    annotations(corpus::MultilingualLibriSpeech, dir, subset)

Read the transcripts of one `subset` of `corpus`.

Parses `<dir>/mls_<language>/<subset>/transcripts.txt` line by line. The
first whitespace-separated token of a line is the utterance id; the remaining
tokens, joined with single spaces, are stored under the `"text"` key of the
annotation data. Blank lines are skipped.

# Returns
A `Dict` mapping each annotation id to its `Annotation`.
"""
function annotations(corpus::MultilingualLibriSpeech, dir, subset)
    trans = joinpath(
        dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "transcripts.txt"
    )
    sups = Dict()
    for line in eachline(trans)
        tokens = split(line)
        # A blank or whitespace-only line has no utterance id; indexing
        # `tokens[1]` on it would throw a `BoundsError`.
        isempty(tokens) && continue

        # The utterance id is passed twice — presumably as the annotation id
        # and as the id of the recording it belongs to (confirm against the
        # `Annotation` constructor).
        s = Annotation(
            tokens[1],
            tokens[1];
            channel = 1,
            data = Dict("text" => join(tokens[2:end], " ")),
        )
        sups[s.id] = s
    end
    return sups
end
function prepare(corpus::MultilingualLibriSpeech, outdir)
dir = path(corpus, outdir)
# 1. Recording manifests.
for subset in ["train", "dev", "test"]
out = joinpath(dir, "recording-manifest-$subset.jsonl")
@info "preparing recording manifest ($subset) $out"
open(out, "w") do f
writemanifest(f, recs)
end
for subset in ["train", "dev", "test"]
out = joinpath(dir, "annotation-manifest-$subset.jsonl")
@info "preparing annotation manifest ($subset) $out"
sups = annotations(corpus, dir, subset)
open(out, "w") do f
writemanifest(f, sups)
end