Skip to content
Snippets Groups Projects
Verified Commit 386df4df authored by Lucas Ondel Yang's avatar Lucas Ondel Yang
Browse files

added mini librispeech

parent f708d245
No related branches found
No related tags found
No related merge requests found
# SPDX-License-Identifier: CECILL-2.1
"""
    MiniLibriSpeechCorpus

Field-less type used to dispatch the corpus functions (`recordings`,
`supervisions`, `download`, `prepare`) on the Mini LibriSpeech corpus.
"""
struct MiniLibriSpeechCorpus end

# Ready-made instance to pass to the corpus functions.
const MiniLibriSpeech = MiniLibriSpeechCorpus()

# Short subset name => name of the subset directory / archive stem.
const MINILS_SUBSETS = Dict(
    "dev" => "dev-clean-2",
    "train" => "train-clean-5",
)

# Short subset name => URL of the `.tar.gz` archive of that subset.
const MINILS_URL = Dict(
    subset => "https://www.openslr.org/resources/31/$(name).tar.gz"
    for (subset, name) in MINILS_SUBSETS
)
"""
    recordings(::MiniLibriSpeechCorpus, dir, subset)

Collect the recordings of `subset` (a key of `MINILS_SUBSETS`) found under
`dir/LibriSpeech/<subset directory>/<d1>/<d2>/*.flac`.

Returns a `Dict` mapping each recording id (the file name without its `.flac`
extension) to a `Recording` whose audio source is a `sox` command converting
the file to WAV on standard output.

Throws a `KeyError` if `subset` is not a key of `MINILS_SUBSETS`.
"""
function recordings(::MiniLibriSpeechCorpus, dir, subset)
    subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset])
    recs = Dict()
    # Only descend into directories: a stray regular file at either level
    # would otherwise make `readdir` throw.
    for d1 in filter(isdir, readdir(subsetdir; join = true))
        for d2 in filter(isdir, readdir(d1; join = true))
            for path in readdir(d2; join = true)
                # The innermost directory also holds the `.trans.txt`
                # transcript; only the `.flac` files are recordings.
                id, ext = splitext(basename(path))
                ext == ".flac" || continue
                r = Recording(
                    id,
                    CmdAudioSource(`sox $path -t wav -`);
                    channels = [1],
                    samplerate = 16000
                )
                recs[r.id] = r
            end
        end
    end
    return recs
end
"""
    supervisions(::MiniLibriSpeechCorpus, dir, subset)

Collect the transcriptions of `subset` (a key of `MINILS_SUBSETS`).

For every directory pair `<d1>/<d2>` under `dir/LibriSpeech/<subset directory>`
the file `<d1>-<d2>.trans.txt` is read line by line. The first
whitespace-separated token of a line is used as both the first and the second
positional argument of `Supervision`; the remaining tokens, joined by single
spaces, are stored under the `"text"` key of its `data`.

Returns a `Dict` mapping each supervision id to its `Supervision`.

Throws a `KeyError` if `subset` is not a key of `MINILS_SUBSETS`, and a
`SystemError` if an expected transcript file is missing.
"""
function supervisions(::MiniLibriSpeechCorpus, dir, subset)
    subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset])
    sups = Dict()
    # Only descend into directories: a stray regular file at either level
    # would otherwise make `readdir` throw.
    for d1 in filter(isdir, readdir(subsetdir; join = true))
        k1 = basename(d1)
        for d2 in filter(isdir, readdir(d1; join = true))
            k2 = basename(d2)
            open(joinpath(d2, "$(k1)-$(k2).trans.txt"), "r") do f
                for line in eachline(f)
                    tokens = split(line)
                    # A blank line (e.g. a trailing newline) carries no
                    # utterance; skip it instead of failing on `tokens[1]`.
                    isempty(tokens) && continue
                    s = Supervision(tokens[1], tokens[1]; channel = 1,
                                    data = Dict("text" => join(tokens[2:end], " ")))
                    sups[s.id] = s
                end
            end
        end
    end
    return sups
end
"""
    Base.download(::MiniLibriSpeechCorpus, outdir)

Download and extract the "train" and "dev" archives listed in `MINILS_URL`
into `outdir`, creating the directory if needed.

The work is skipped when `outdir/.download.done` already exists; that marker
file is written (with the output of `date`) once both subsets have been
extracted. Requires the external programs `wget`, `tar` and `date`.
"""
function Base.download(::MiniLibriSpeechCorpus, outdir)
    donefile = joinpath(outdir, ".download.done")
    if !isfile(donefile)
        mkpath(outdir)
        @info "downloading the corpus"
        for subset in ["train", "dev"]
            tarpath = joinpath(outdir, "$(MINILS_SUBSETS[subset]).tar.gz")
            # `-O` writes to exactly `tarpath`: a partial archive left by an
            # interrupted run is overwritten instead of wget saving the new
            # download as `<name>.1` and the stale file being extracted.
            # NOTE(review): `--no-check-certificate` disables TLS certificate
            # verification; consider dropping it if the server's certificate
            # validates in the target environment.
            run(`wget --no-check-certificate -O $tarpath $(MINILS_URL[subset])`)
            @info "extracting"
            run(`tar -xf $tarpath -C $outdir`)
            rm(tarpath)
        end
        # The marker is written only after every subset has been extracted.
        run(pipeline(`date`, stdout = donefile))
    end
    @info "dataset in $outdir"
end
"""
    prepare(minils::MiniLibriSpeechCorpus, dir)

Write the manifest files of the corpus stored in `dir`.

For the "train" and "dev" subsets, first the recording manifests
(`recording-manifest-<subset>.jsonl`) and then the supervision manifests
(`supervision-manifest-<subset>.jsonl`) are written into `dir` with
`writemanifest`. A manifest that already exists is left untouched.
"""
function prepare(minils::MiniLibriSpeechCorpus, dir)
    # Manifest kind => function collecting the items of that kind; recording
    # manifests for all subsets are produced before the supervision ones.
    builders = (("recording", recordings), ("supervision", supervisions))
    for (kind, collect_items) in builders, subset in ("train", "dev")
        out = joinpath(dir, "$(kind)-manifest-$(subset).jsonl")
        @info "preparing $(kind) manifest ($subset) $out"
        isfile(out) && continue
        items = collect_items(minils, dir, subset)
        open(out, "w") do io
            writemanifest(io, items)
        end
    end
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment