# SPDX-License-Identifier: .1
#######################################################################
# Download URL of each Mini LibriSpeech archive, keyed by subset name.
const MINILS_URL = Dict{String,String}(
    "dev" => "https://www.openslr.org/resources/31/dev-clean-2.tar.gz",
    "train" => "https://www.openslr.org/resources/31/train-clean-5.tar.gz",
)

# On-disk name of each subset, keyed by subset name. It is used both as the directory
# name under `LibriSpeech/` and as the stem of the downloaded `.tar.gz` archive.
const MINILS_SUBSETS = Dict{String,String}(
    "train" => "train-clean-5",
    "dev" => "dev-clean-2",
)
#######################################################################
# Identifier of the Mini LibriSpeech corpus, obtained from `get_nametype` (defined
# outside this section). It is the type parameter of `DatasetBuilder` in the `download`
# and `prepare` methods of this file, i.e. it selects the methods for this dataset.
const minils_id = get_nametype("Mini LibriSpeech")
"""
    minils_recordings(dir, subset)

Collect the audio recordings of one Mini LibriSpeech subset.

The directory `dir/LibriSpeech/<subset directory>` is walked two levels of
sub-directories deep and one `Recording` is created for every `.flac` file found at the
innermost level. The recording id is the file name without its `.flac` extension and
the audio source is the command `sox <path> -t wav -`, declared with channel `1` and a
sample rate of 16000.

# Arguments
- `dir`: directory that contains the extracted `LibriSpeech` folder.
- `subset`: a key of `MINILS_SUBSETS` (`"train"` or `"dev"`).

# Returns
A `Dict` mapping each recording id to its `Recording`.

# Throws
- `KeyError` if `subset` is not a key of `MINILS_SUBSETS`.
"""
function minils_recordings(dir, subset)
    subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset])
    recs = Dict()
    for d1 in readdir(subsetdir; join = true)
        # Skip stray files so that `readdir` is only called on directories.
        isdir(d1) || continue
        for d2 in readdir(d1; join = true)
            isdir(d2) || continue
            for path in readdir(d2; join = true)
                # The innermost directories also hold the `*.trans.txt` transcript read
                # by `minils_annotations`; only the audio files are recordings.
                endswith(path, ".flac") || continue
                id = first(splitext(basename(path)))
                r = Recording(
                    id,
                    AudioSources.CmdAudioSource(`sox $path -t wav -`);
                    channels = [1],
                    samplerate = 16000
                )
                recs[r.id] = r
            end
        end
    end
    return recs
end
"""
    minils_annotations(dir, subset)

Collect the transcriptions of one Mini LibriSpeech subset.

For every second-level directory `<k1>/<k2>` under `dir/LibriSpeech/<subset directory>`
the file `<k1>-<k2>.trans.txt` is read line by line. The first whitespace-separated
token of a line is used as both the annotation id and the recording id; the remaining
tokens, joined by single spaces, are stored under the `"text"` key of the annotation
data. Blank lines are ignored.

# Arguments
- `dir`: directory that contains the extracted `LibriSpeech` folder.
- `subset`: a key of `MINILS_SUBSETS` (`"train"` or `"dev"`).

# Returns
A `Dict` mapping each annotation id to its `Annotation`.

# Throws
- `KeyError` if `subset` is not a key of `MINILS_SUBSETS`.
- An error from `open` if an expected `.trans.txt` file is missing.
"""
function minils_annotations(dir, subset)
    subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset])
    sups = Dict()
    for d1 in readdir(subsetdir; join = true)
        # Skip stray files so that `readdir` is only called on directories.
        isdir(d1) || continue
        k1 = basename(d1)
        for d2 in readdir(d1; join = true)
            isdir(d2) || continue
            k2 = basename(d2)
            open(joinpath(d2, "$(k1)-$(k2).trans.txt"), "r") do f
                for line in eachline(f)
                    tokens = split(line)
                    # A blank (e.g. trailing) line has no id token to index.
                    isempty(tokens) && continue
                    s = Annotation(
                        tokens[1], # annotation id
                        tokens[1]; # recording id
                        channels = [1],
                        data = Dict("text" => join(tokens[2:end], " "))
                    )
                    sups[s.id] = s
                end
            end
        end
    end
    return sups
end
"""
    Base.download(::DatasetBuilder{minils_id}, dir::AbstractString)

Download the Mini LibriSpeech archives into `dir` and unpack them there.

For each of the `"train"` and `"dev"` subsets the archive at `MINILS_URL[subset]` is
fetched with `wget`, extracted into `dir` with `tar`, and then deleted. Once both
subsets are in place the output of `date` is written to a marker file inside `dir`;
when that marker already exists the whole download is skipped.

# Arguments
- `dir`: destination directory; created if it does not exist.

# Returns
`nothing`.

# Throws
- `ProcessFailedException` if `wget`, `tar` or `date` exits with a non-zero status.
"""
function Base.download(::DatasetBuilder{minils_id}, dir::AbstractString)
    # NOTE(review): the marker file name is a guess -- confirm against project history.
    donefile = joinpath(dir, ".download.done")
    if !isfile(donefile)
        mkpath(dir)
        for subset in ("train", "dev")
            # NOTE(review): certificate verification is disabled for this download.
            run(`wget --no-check-certificate -P $dir $(MINILS_URL[subset])`)
            tarpath = joinpath(dir, "$(MINILS_SUBSETS[subset]).tar.gz")
            # Unpack before deleting the archive. `minils_recordings` reads
            # `dir/LibriSpeech/<subset directory>`, so the archive is presumably
            # rooted at `LibriSpeech/` -- TODO confirm.
            run(`tar -xzf $tarpath -C $dir`)
            rm(tarpath; force = true)
        end
        # Written last, so an interrupted download is retried on the next call.
        run(pipeline(`date`, stdout = donefile))
    end
    return nothing
end
"""
    prepare(::DatasetBuilder{minils_id}, inputdir, outputdir)

Write the Mini LibriSpeech manifests into `outputdir`.

Produces:
- `recordings.jsonl`: the recordings of the `"train"` and `"dev"` subsets, in that order.
- `annotations-train.jsonl`, `annotations-dev.jsonl` and `annotations-test.jsonl`: the
  annotations of the `"train"`, `"dev"` and (again) `"dev"` subsets respectively.

Existing manifest files are overwritten.

# Arguments
- `inputdir`: directory that contains the extracted `LibriSpeech` folder.
- `outputdir`: directory receiving the manifests; created if it does not exist.

# Returns
`nothing`.
"""
function prepare(::DatasetBuilder{minils_id}, inputdir, outputdir)
    outputdir = mkpath(outputdir)

    # Both subsets share one recordings manifest. The file is opened once in write
    # mode so that calling `prepare` again does not append duplicate records.
    recpath = joinpath(outputdir, "recordings.jsonl")
    open(recpath, "w") do f
        for subset in ["train", "dev"]
            @debug "preparing recording manifest ($subset) $recpath"
            recs = minils_recordings(inputdir, subset)
            writemanifest(f, recs)
        end
    end

    # The "dev" subset is written twice: as the dev and as the test annotations.
    for (subset, name) in [("train", "train"), ("dev", "dev"), ("dev", "test")]
        annpath = joinpath(outputdir, "annotations-$name.jsonl")
        @debug "preparing annotation manifest ($subset) $annpath"
        sups = minils_annotations(inputdir, subset)
        open(annpath, "w") do f
            writemanifest(f, sups)
        end
    end
    return nothing
end