Forked from
FAST / SpeechDatasets.jl
17 commits ahead of the upstream repository.
-
Nicolas Denier authored
multilingual_librispeech.jl 4.26 KiB
# SPDX-License-Identifier: CECILL-C
"""
    MLS_LANG_CODE

Map from the three-letter language key accepted by the `lang` option to the
language name used in the corpus archive and directory names
(`mls_<name>` / `mls_lm_<name>`).
"""
const MLS_LANG_CODE = Dict{String,String}(
    "deu" => "german",
    "eng" => "english",
    "esp" => "spanish",
    "fra" => "french",
    "ita" => "italian",
    "nld" => "dutch",
    "pol" => "polish",
    "prt" => "portuguese",
)

"""
    MLS_AUDIO_URLS

Map from language key to the URL of the audio archive `mls_<name>.tar.gz`.
Derived from [`MLS_LANG_CODE`](@ref) so both tables always share the same keys.
"""
const MLS_AUDIO_URLS = Dict{String,String}(
    code => "https://dl.fbaipublicfiles.com/mls/mls_$(name).tar.gz"
    for (code, name) in MLS_LANG_CODE
)

"""
    MLS_LM_URLS

Map from language key to the URL of the language-model archive
`mls_lm_<name>.tar.gz`. Derived from [`MLS_LANG_CODE`](@ref).
"""
const MLS_LM_URLS = Dict{String,String}(
    code => "https://dl.fbaipublicfiles.com/mls/mls_lm_$(name).tar.gz"
    for (code, name) in MLS_LANG_CODE
)
# Identifier for this corpus as returned by `get_nametype`; it is used as the
# type parameter in `DatasetBuilder{mlls_id}`, which the `download` and
# `prepare` methods of this file dispatch on.
const mlls_id = get_nametype("Multilingual LibriSpeech")
# Declare the builder for this identifier with `lang = "eng"` as its keyword
# options; `Base.download` reads the language back from `builder.kwargs.lang`.
# NOTE(review): presumably callers can override `lang` — confirm against
# the definition of `declareBuilder`.
declareBuilder(mlls_id, kwargs=(;lang="eng"))
"""
    Base.download(builder::DatasetBuilder{mlls_id}, dir::AbstractString)

Download and unpack the Multilingual LibriSpeech audio and language-model
archives for the language `builder.kwargs.lang` into `dir`.

The work is skipped when the marker file `dir/.download.done` already exists;
the marker is written only after both archives were fetched and extracted.
Requires the external programs `wget` and `tar` to be available.

# Throws
- `ArgumentError` if the language is not a key of `MLS_LANG_CODE`.
- A process error if `wget` or `tar` exits with a non-zero status.
"""
function Base.download(builder::DatasetBuilder{mlls_id}, dir::AbstractString)
    lang = builder.kwargs.lang
    # Fail early with a readable message instead of a bare KeyError raised
    # after the target directory has already been created.
    if !haskey(MLS_LANG_CODE, lang)
        supported = join(sort!(collect(keys(MLS_LANG_CODE))), ", ")
        throw(ArgumentError("unsupported language \"$lang\", expected one of: $supported"))
    end

    donefile = joinpath(dir, ".download.done")
    if !isfile(donefile)
        mkpath(dir)
        @info "downloading the corpus for language $lang"
        _mls_fetch_and_extract(MLS_AUDIO_URLS[lang], dir)
        @info "downloading LM data"
        _mls_fetch_and_extract(MLS_LM_URLS[lang], dir)
        # Timestamped marker: its presence is what makes later calls a no-op.
        write(donefile, Libc.strftime(time()), "\n")
    end
    @info "dataset in $dir"
    return nothing
end

# Fetch the archive at `url` into `dir`, unpack it there and delete the archive.
function _mls_fetch_and_extract(url, dir)
    tarpath = joinpath(dir, basename(url))
    # `-O` pins the output name: with `-P`, a leftover partial archive from an
    # interrupted run makes wget save to "<name>.1" and tar would then unpack
    # the stale, truncated file.
    run(`wget -O $tarpath $url`)
    @info "extracting"
    run(`tar -xf $tarpath -C $dir`)
    rm(tarpath)
    return nothing
end
"""
    mlls_recordings(inputdir, subset, lang)

Collect one `Recording` per file found two directory levels below
`inputdir/mls_<language>/<subset>/audio` and return them in a `Dict` keyed by
recording id.

The id is the file's base name with `".flac"` removed. Each recording reads its
audio through the command `sox <path> -t wav -` and is declared with channel
`1` and a sample rate of `16000`.
"""
function mlls_recordings(inputdir, subset, lang)
    audioroot = joinpath(inputdir, "mls_$(MLS_LANG_CODE[lang])", subset, "audio")
    recordings = Dict()
    # Walk the two nested directory levels and visit every file inside them.
    for outerdir in readdir(audioroot; join = true),
        innerdir in readdir(outerdir; join = true),
        audiofile in readdir(innerdir; join = true)

        recid = replace(basename(audiofile), ".flac" => "")
        source = AudioSources.CmdAudioSource(`sox $audiofile -t wav -`)
        rec = Recording(recid, source; channels = [1], samplerate = 16000)
        recordings[rec.id] = rec
    end
    return recordings
end
"""
    mlls_annotations(inputdir, subset, lang)

Read `inputdir/mls_<language>/<subset>/transcripts.txt` and return a `Dict` of
`Annotation`s keyed by annotation id.

Each non-blank line is split on whitespace: the first token is used both as the
annotation id and as the recording id, and the remaining tokens, joined by a
single space, are stored under the `"text"` key of the annotation data.
Blank lines are skipped.
"""
function mlls_annotations(inputdir, subset, lang)
    trans = joinpath(inputdir, "mls_$(MLS_LANG_CODE[lang])", subset, "transcripts.txt")
    sups = Dict()
    open(trans, "r") do f
        for line in eachline(f)
            tokens = split(line)
            # A blank (e.g. trailing) line has no id token; indexing it would
            # raise a BoundsError and abort the whole manifest.
            isempty(tokens) && continue
            uttid = tokens[1]
            text = join(tokens[2:end], " ")
            s = Annotation(uttid, uttid; channels = [1], data = Dict("text" => text))
            sups[s.id] = s
        end
    end
    return sups
end
"""
    prepare(::DatasetBuilder{mlls_id}, inputdir, outputdir; lang="eng")

Write the manifests of the Multilingual LibriSpeech corpus located in
`inputdir` to `outputdir` (created if needed):

- `recordings.jsonl`: recordings of the `train`, `dev` and `test` subsets,
  written in that order into a single file;
- `annotations-<subset>.jsonl`: one annotation manifest per subset.

A manifest that already exists is left untouched. New manifests are written to
a temporary `<name>.tmp` file and renamed once complete, so an interrupted run
never leaves a truncated manifest that a later run would mistake for a
finished one.
"""
function prepare(::DatasetBuilder{mlls_id}, inputdir, outputdir; lang="eng")
    outputdir = mkpath(outputdir)
    subsets = ("train", "dev", "test")

    # 1. Recording manifests.
    out = joinpath(outputdir, "recordings.jsonl")
    @info "preparing recording manifest $out"
    _mlls_write_once(out) do f
        for subset in subsets
            writemanifest(f, mlls_recordings(inputdir, subset, lang))
        end
    end

    # 2. Annotation manifests.
    for subset in subsets
        out = joinpath(outputdir, "annotations-$subset.jsonl")
        @info "preparing annotation manifest ($subset) $out"
        _mlls_write_once(out) do f
            writemanifest(f, mlls_annotations(inputdir, subset, lang))
        end
    end
    return nothing
end

# Call `fill(io)` to produce `path` unless it already exists; the content goes to
# `path * ".tmp"` first and is renamed only after `fill` returned normally.
function _mlls_write_once(fill, path)
    isfile(path) && return nothing
    tmp = path * ".tmp"
    open(fill, tmp, "w")
    mv(tmp, path; force = true)
    return nothing
end