Skip to content
Snippets Groups Projects
Select Git revision
  • 644037699a77780bbf8fc56b06e90a9194f5405a
  • main default protected
  • epiv
  • docs
  • cicd
  • split
  • v0.23.0 protected
  • v0.22.0 protected
  • v0.21.1 protected
  • v0.21.0 protected
  • v0.20.2 protected
  • v0.20.1 protected
  • v0.20.0 protected
  • v0.19.0 protected
  • v0.18.0 protected
  • v0.17.2 protected
  • v0.17.1 protected
  • v0.17.0 protected
  • v0.16.0 protected
19 results

mini_librispeech.jl

Blame
  • mini_librispeech.jl 3.42 KiB
    # SPDX-License-Identifier: .1
    
    #######################################################################
    
    # Archive URL of each Mini LibriSpeech subset, keyed by subset name.
    const MINILS_URL = Dict{String,String}(
        "dev" => "https://www.openslr.org/resources/31/dev-clean-2.tar.gz",
        "train" => "https://www.openslr.org/resources/31/train-clean-5.tar.gz",
    )
    # Name shared by each subset's archive (minus `.tar.gz`) and by its directory
    # under `LibriSpeech/`, keyed by subset name.
    const MINILS_SUBSETS = Dict{String,String}(
        "train" => "train-clean-5",
        "dev" => "dev-clean-2",
    )
    
    #######################################################################
    
    """
        MINILIBRISPEECH <: SpeechCorpus

    Corpus type for the Mini LibriSpeech dataset.

    NOTE(review): nothing in this file fills the fields directly; presumably they
    hold the recording manifest and the per-split annotation manifests. Confirm
    against the code that builds the dataset.
    """
    struct MINILIBRISPEECH <: SpeechCorpus
        recordings::Any  # presumably the recordings manifest (TODO confirm)
        train::Any       # presumably the "train" annotations (TODO confirm)
        dev::Any         # presumably the "dev" annotations (TODO confirm)
        test::Any        # presumably the "test" annotations (TODO confirm)
    end
    
    """
        minils_recordings(dir, subset)

    Collect the recordings of one Mini LibriSpeech subset.

    Walks `dir/LibriSpeech/<subset directory>` two directory levels deep and creates
    one `Recording` per `.flac` file found at the third level. The recording id is
    the file name without its `.flac` extension, and the audio source is a `sox`
    command that converts the file to WAV on standard output.

    Entries at the first two levels that are not directories are skipped.

    # Arguments
    - `dir`: root directory into which the corpus was extracted.
    - `subset`: a key of `MINILS_SUBSETS` (`"train"` or `"dev"`).

    # Returns
    A `Dict` mapping each recording id to its `Recording`.
    """
    function minils_recordings(dir, subset)
        subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset])
        recs = Dict()

        for d1 in readdir(subsetdir; join = true)
            # A stray file (e.g. `.DS_Store`) would make the nested `readdir` throw.
            isdir(d1) || continue
            for d2 in readdir(d1; join = true)
                isdir(d2) || continue
                for path in readdir(d2; join = true)
                    endswith(path, ".flac") || continue
                    # Strip only the trailing extension; `replace` would also remove
                    # any other occurrence of ".flac" inside the file name.
                    id = first(splitext(basename(path)))
                    r = Recording(
                        id,
                        CmdAudioSource(`sox $path -t wav -`);
                        channels = [1],
                        samplerate = 16000
                    )
                    recs[r.id] = r
                end
            end
        end
        return recs
    end
    
    """
        minils_annotations(dir, subset)

    Collect the transcript annotations of one Mini LibriSpeech subset.

    For every second-level directory `d1/d2` under `dir/LibriSpeech/<subset directory>`,
    reads the transcript file `<d1>-<d2>.trans.txt` inside it. Each non-blank line is
    split on whitespace: the first token is used both as annotation id and as
    recording id, and the remaining tokens, joined by single spaces, become the
    `"text"` entry of the annotation data.

    Entries that are not directories, and blank transcript lines, are skipped.

    # Arguments
    - `dir`: root directory into which the corpus was extracted.
    - `subset`: a key of `MINILS_SUBSETS` (`"train"` or `"dev"`).

    # Returns
    A `Dict` mapping each annotation id to its `Annotation`.
    """
    function minils_annotations(dir, subset)
        subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset])
        sups = Dict()
        for d1 in readdir(subsetdir; join = true)
            # A stray file (e.g. `.DS_Store`) would make the nested `readdir` throw.
            isdir(d1) || continue
            k1 = basename(d1)
            for d2 in readdir(d1; join = true)
                isdir(d2) || continue
                k2 = basename(d2)
                open(joinpath(d2, "$(k1)-$(k2).trans.txt"), "r") do f
                    for line in eachline(f)
                        tokens = split(line)
                        # A blank line yields no tokens; `tokens[1]` would throw.
                        isempty(tokens) && continue
                        s = Annotation(
                            tokens[1], # annotation id
                            tokens[1]; # recording id
                            channels = [1],
                            data = Dict("text" => join(tokens[2:end], " "))
                        )
                        sups[s.id] = s
                    end
                end
            end
        end

        return sups
    end
    
    """
        minils_download(dir)

    Download and extract the Mini LibriSpeech archives into `dir`.

    Does nothing if the marker file `dir/.download.done` already exists. Otherwise
    each archive listed in `MINILS_URL` is fetched with `wget`, unpacked with `tar`
    and deleted; finally the marker file is written with the output of `date`.

    Requires the `wget`, `tar` and `date` executables. A failing command throws.
    """
    function minils_download(dir)
        donefile = joinpath(dir, ".download.done")
        if !isfile(donefile)
            mkpath(dir)
            @debug "downloading the corpus"
            for subset in ["train", "dev"]
                tarpath = joinpath(dir, "$(MINILS_SUBSETS[subset]).tar.gz")
                # `-O` writes to exactly `tarpath` and overwrites any partial archive
                # left by an interrupted run. With `-P`, wget would store the new
                # download as `*.tar.gz.1` and `tar` would unpack the stale file.
                # NOTE(review): `--no-check-certificate` disables TLS verification;
                # kept to preserve existing behaviour, but consider removing it.
                run(`wget --no-check-certificate -O $tarpath $(MINILS_URL[subset])`)
                @debug "extracting"
                run(`tar -xf $tarpath -C $dir`)
                rm(tarpath)
            end

            # Written last, so the marker only exists after a complete download.
            run(pipeline(`date`, stdout = donefile))
        end
        @debug "dataset in $dir"
        return nothing
    end
    
    """
        minils_prepare(dir)

    Write the manifest files of the corpus into `dir`.

    Produces `recordings.jsonl` (the "train" and "dev" recordings, in that order)
    and `annotations-train.jsonl`, `annotations-dev.jsonl` and
    `annotations-test.jsonl`. The "test" annotations are built from the "dev"
    subset. A manifest that already exists is left untouched.
    """
    function minils_prepare(dir)
        # Recordings: a single manifest shared by every subset.
        recpath = joinpath(dir, "recordings.jsonl")
        if !isfile(recpath)
            open(recpath, "w") do io
                for subset in ("train", "dev")
                    @debug "preparing recording manifest ($subset) $recpath"
                    writemanifest(io, minils_recordings(dir, subset))
                end
            end
        end

        # Annotations: one manifest per (source subset, manifest name) pair.
        for (subset, name) in (("train", "train"), ("dev", "dev"), ("dev", "test"))
            annpath = joinpath(dir, "annotations-$name.jsonl")
            isfile(annpath) && continue
            @debug "preparing annotation manifest ($subset) $annpath"
            # Collect before opening, so a failure here creates no empty file.
            annotations = minils_annotations(dir, subset)
            open(io -> writemanifest(io, annotations), annpath, "w")
        end
    end
    
    
    """
        MINILIBRISPEECH(dir, subset)

    Make sure the corpus is downloaded and its manifests are prepared under `dir`,
    then return the result of `dataset(dir, subset)`.
    """
    function MINILIBRISPEECH(dir, subset)
        # Both steps skip work that is already done on disk.
        minils_download(dir)
        minils_prepare(dir)
        return dataset(dir, subset)
    end