# SPDX-License-Identifier: CECILL-2.1

#######################################################################

# Download URL of each Mini LibriSpeech archive, keyed by subset name.
const MINILS_URL = Dict(
    "dev" => "https://www.openslr.org/resources/31/dev-clean-2.tar.gz",
    "train" => "https://www.openslr.org/resources/31/train-clean-5.tar.gz"
)

# Directory name of each subset under `<dir>/LibriSpeech`. The same name,
# with `.tar.gz` appended, is the file name of the downloaded archive.
const MINILS_SUBSETS = Dict(
    "train" => "train-clean-5",
    "dev" => "dev-clean-2"
)

#######################################################################

"""
    MINILIBRISPEECH(dir, subset)

Mini LibriSpeech corpus.

Calling `MINILIBRISPEECH(dir, subset)` downloads the corpus into `dir` if
needed, writes the manifests if they are missing, and returns the result of
`dataset(dir, subset)` (`dataset` is defined outside this file).
"""
struct MINILIBRISPEECH <: SpeechCorpus
    recordings
    train
    dev
    test
end

"""
    minils_recordings(dir, subset)

Scan `<dir>/LibriSpeech/<subset directory>` two directory levels deep and
build one `Recording` per `.flac` file found.

`subset` must be a key of `MINILS_SUBSETS`. Each recording is created with
the file name (without its `.flac` extension) as identifier, a
`CmdAudioSource` wrapping the command `sox <path> -t wav -`,
`channels = [1]` and `samplerate = 16000`.

Returns a `Dict` mapping recording id to `Recording`.
"""
function minils_recordings(dir, subset)
    subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset])

    recs = Dict()
    for d1 in readdir(subsetdir; join = true)
        for d2 in readdir(d1; join = true)
            for path in readdir(d2; join = true)
                endswith(path, ".flac") || continue

                # Strip only the trailing extension; a plain `replace` would
                # also remove any ".flac" occurring inside the name.
                id = first(splitext(basename(path)))

                r = Recording(
                    id,
                    CmdAudioSource(`sox $path -t wav -`);
                    channels = [1],
                    samplerate = 16000
                )
                recs[r.id] = r
            end
        end
    end
    recs
end

"""
    minils_supervisions(dir, subset)

Read the transcript files of `subset` and build one `Supervision` per
non-blank transcript line.

For every second-level directory `<d1>/<d2>` below
`<dir>/LibriSpeech/<subset directory>`, the file `<d1>-<d2>.trans.txt`
inside it is read. On each line the first whitespace-separated token is
used both as supervision id and as recording id; the remaining tokens,
joined by single spaces, are stored under the `"text"` key of `data`.

Returns a `Dict` mapping supervision id to `Supervision`. Throws if a
transcript file is missing.
"""
function minils_supervisions(dir, subset)
    subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset])

    sups = Dict()
    for d1 in readdir(subsetdir; join = true)
        k1 = basename(d1)
        for d2 in readdir(d1; join = true)
            k2 = basename(d2)
            open(joinpath(d2, "$(k1)-$(k2).trans.txt"), "r") do f
                for line in eachline(f)
                    tokens = split(line)

                    # A blank (or whitespace-only) line has no id token;
                    # skip it instead of failing on `tokens[1]`.
                    isempty(tokens) && continue

                    s = Supervision(
                        tokens[1], # supervision id
                        tokens[1]; # recording id
                        channels = [1],
                        data = Dict("text" => join(tokens[2:end], " "))
                    )
                    sups[s.id] = s
                end
            end
        end
    end
    sups
end

"""
    minils_download(dir)

Download and extract the "train" and "dev" archives into `dir`, unless the
marker file `<dir>/.download.done` already exists.

Requires the external programs `wget`, `tar` and `date`. The marker file is
written only after both archives have been extracted, so an interrupted run
is retried from scratch on the next call.
"""
function minils_download(dir)
    donefile = joinpath(dir, ".download.done")
    if ! isfile(donefile)
        mkpath(dir)
        @debug "downloading the corpus"
        for subset in ["train", "dev"]
            tarpath = joinpath(dir, "$(MINILS_SUBSETS[subset]).tar.gz")

            # `-O` forces the download to land exactly at `tarpath`,
            # overwriting a stale archive left by an interrupted run
            # (otherwise wget would save to "<name>.1" and the stale file
            # would be the one extracted below).
            # NOTE(review): `--no-check-certificate` disables TLS
            # verification; kept as in the original, consider removing it.
            run(`wget --no-check-certificate -O $tarpath $(MINILS_URL[subset])`)

            @debug "extracting"
            run(`tar -xf $tarpath -C $dir`)
            rm(tarpath)
        end
        run(pipeline(`date`, stdout = donefile))
    end
    @debug "dataset in $dir"
end

"""
    minils_prepare(dir)

Write the manifest files of the corpus into `dir`, skipping every file that
already exists:

- `recordings.jsonl`: recordings of the "train" and "dev" subsets;
- `supervisions-train.jsonl`: supervisions of the "train" subset;
- `supervisions-dev.jsonl` and `supervisions-test.jsonl`: both built from
  the "dev" subset.
"""
function minils_prepare(dir)
    # 1. Recording manifest.
    recpath = joinpath(dir, "recordings.jsonl")
    if ! isfile(recpath)
        open(recpath, "w") do f
            for subset in ["train", "dev"]
                @debug "preparing recording manifest ($subset) $recpath"
                recs = minils_recordings(dir, subset)
                writemanifest(f, recs)
            end
        end
    end

    # 2. Supervision manifests.
    for (subset, name) in [("train", "train"), ("dev", "dev"), ("dev", "test")]
        suppath = joinpath(dir, "supervisions-$name.jsonl")
        if ! isfile(suppath)
            @debug "preparing supervision manifest ($subset) $suppath"
            sups = minils_supervisions(dir, subset)
            open(suppath, "w") do f
                writemanifest(f, sups)
            end
        end
    end
end

function MINILIBRISPEECH(dir, subset)
    minils_download(dir)
    minils_prepare(dir)
    dataset(dir, subset)
end