Skip to content
Snippets Groups Projects
multilingual_librispeech.jl 4.35 KiB
Newer Older
  • Learn to ignore specific revisions
    # SPDX-License-Identifier: CECILL-2.1
    
    
    """
        MultilingualLibriSpeech(lang)

    Corpus descriptor for one language of the Multilingual LibriSpeech data.

    # Fields
    - `lang`: language code; the functions of this file use it as a key into the
      `MLS_*` lookup tables (e.g. `"fra"`).
    - `name`: always set to `"multilingual_librispeech"` by the constructor.
    """
    struct MultilingualLibriSpeech <: SpeechCorpus
        lang
        name

        # Only `lang` is chosen by the caller; the corpus name is fixed.
        MultilingualLibriSpeech(lang) = new(lang, "multilingual_librispeech")
    end
    
    
    # Maps the three-letter language code stored in a corpus (`lang`) to the
    # language name used in the archive and directory names (`mls_<name>`).
    const MLS_LANG_CODE = Dict{String,String}(
        "deu" => "german",
        "eng" => "english",
        "esp" => "spanish",
        "fra" => "french",
        "ita" => "italian",
        "nld" => "dutch",
        "pol" => "polish",
        "prt" => "portuguese",
    )
    
    # Download location of the audio archive, keyed by three-letter language code.
    const MLS_AUDIO_URLS = Dict{String,String}(
        "deu" => "https://dl.fbaipublicfiles.com/mls/mls_german.tar.gz",
        "eng" => "https://dl.fbaipublicfiles.com/mls/mls_english.tar.gz",
        "esp" => "https://dl.fbaipublicfiles.com/mls/mls_spanish.tar.gz",
        "fra" => "https://dl.fbaipublicfiles.com/mls/mls_french.tar.gz",
        "ita" => "https://dl.fbaipublicfiles.com/mls/mls_italian.tar.gz",
        "nld" => "https://dl.fbaipublicfiles.com/mls/mls_dutch.tar.gz",
        "pol" => "https://dl.fbaipublicfiles.com/mls/mls_polish.tar.gz",
        "prt" => "https://dl.fbaipublicfiles.com/mls/mls_portuguese.tar.gz",
    )
    
    # Download location of the language-model archive, keyed by three-letter
    # language code.
    const MLS_LM_URLS = Dict(

        "deu" => "https://dl.fbaipublicfiles.com/mls/mls_lm_german.tar.gz",
        "eng" => "https://dl.fbaipublicfiles.com/mls/mls_lm_english.tar.gz",
        "esp" => "https://dl.fbaipublicfiles.com/mls/mls_lm_spanish.tar.gz",
        "fra" => "https://dl.fbaipublicfiles.com/mls/mls_lm_french.tar.gz",
        "ita" => "https://dl.fbaipublicfiles.com/mls/mls_lm_italian.tar.gz",
        "nld" => "https://dl.fbaipublicfiles.com/mls/mls_lm_dutch.tar.gz",
        "pol" => "https://dl.fbaipublicfiles.com/mls/mls_lm_polish.tar.gz",
        "prt" => "https://dl.fbaipublicfiles.com/mls/mls_lm_portuguese.tar.gz"

    )

    """
        download(corpus::MultilingualLibriSpeech, outdir)

    Fetch and unpack the audio archive and the language-model archive of
    `corpus.lang` into `path(corpus, outdir)`, then return `corpus`.

    Nothing is downloaded when the marker file `.download.done` already exists in
    the target directory. The marker is written (with the output of `date`) only
    after both archives were downloaded, extracted and removed. The external
    programs `wget`, `tar` and `date` must be available.

    When a download is needed, a `KeyError` is thrown if `corpus.lang` is not a key
    of `MLS_AUDIO_URLS` / `MLS_LM_URLS`; a failing external command raises the
    error thrown by `run`.
    """
    function Base.download(corpus::MultilingualLibriSpeech, outdir)
        dir = path(corpus, outdir)
        donefile = joinpath(dir, ".download.done")

        if !isfile(donefile)
            # Resolve both URLs first so an unknown language fails before any
            # directory is created or data is transferred.
            archives = (
                ("downloading the corpus", MLS_AUDIO_URLS[corpus.lang]),
                ("downloading LM data", MLS_LM_URLS[corpus.lang]),
            )

            mkpath(dir)

            for (message, url) in archives
                @info message
                run(`wget -P $dir $url`)

                # `wget -P` stores the file under the last component of the URL.
                tarpath = joinpath(dir, basename(url))

                @info "extracting"
                run(`tar -xf $tarpath -C $dir`)

                # The archive is no longer needed once its content is unpacked.
                rm(tarpath)
            end

            # Record completion; the marker holds the date of the download.
            run(pipeline(`date`, stdout = donefile))
        end

        @info "dataset in $dir"

        return corpus
    end

    """
        recordings(corpus::MultilingualLibriSpeech, dir, subset)

    Collect one `Recording` per file found two directory levels below
    `<dir>/mls_<language>/<subset>/audio` and return them in a `Dict` keyed by
    recording id.

    The id is the file name with `".flac"` removed. Each recording reads its audio
    through `sox <file> -t wav -` and is declared with channel `1` and a sample
    rate of 16000.
    """
    function recordings(corpus::MultilingualLibriSpeech, dir, subset)
        language = MLS_LANG_CODE[corpus.lang]
        audioroot = joinpath(dir, "mls_$(language)", subset, "audio")

        manifest = Dict()

        # Walk the two directory levels below `audioroot`, then the files in them.
        for outer in readdir(audioroot; join = true),
            inner in readdir(outer; join = true),
            audiofile in readdir(inner; join = true)

            recid = replace(basename(audiofile), ".flac" =>  "")
            source = CmdAudioSource(`sox $audiofile -t wav -`)
            rec = Recording(recid, source; channels = [1], samplerate = 16000)
            manifest[rec.id] = rec
        end

        manifest
    end
    
    
    """
        annotations(corpus::MultilingualLibriSpeech, dir, subset)

    Read `<dir>/mls_<language>/<subset>/transcripts.txt` and return a `Dict` of
    `Annotation`s keyed by annotation id.

    Every non-blank line is split on whitespace: the first token is passed as both
    the first and second positional argument of `Annotation`, and the remaining
    tokens, joined by single spaces, are stored under the `"text"` key of its
    `data`. Blank lines are skipped.
    """
    function annotations(corpus::MultilingualLibriSpeech, dir, subset)

        trans = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "transcripts.txt")

        sups = Dict()
        open(trans, "r") do f
            for line in eachline(f)
                tokens = split(line)

                # A blank (or whitespace-only) line has no id token; indexing
                # `tokens[1]` would throw a BoundsError, so ignore such lines.
                isempty(tokens) && continue

                s = Annotation(tokens[1], tokens[1]; channel = 1,
                                data = Dict("text" => join(tokens[2:end], " ")))
                sups[s.id] = s
            end
        end
        return sups
    end
    
    
    function prepare(corpus::MultilingualLibriSpeech, outdir)
        dir = path(corpus, outdir)
    
    
        # 1. Recording manifests.
    
        for subset in ["train", "dev", "test"]
            out = joinpath(dir, "recording-manifest-$subset.jsonl")
            @info "preparing recording manifest ($subset) $out"
    
                recs = recordings(corpus, dir, subset)
    
                open(out, "w") do f
                    writemanifest(f, recs)
                end
    
        # 2. Annotation manifests.
    
        for subset in ["train", "dev", "test"]
    
            out = joinpath(dir, "annotation-manifest-$subset.jsonl")
            @info "preparing annotation manifest ($subset) $out"
    
                sups = annotations(corpus, dir, subset)
    
                open(out, "w") do f
                    writemanifest(f, sups)
                end