Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • fast/speechdatasets.jl
  • PTAL/Datasets/SpeechDatasets.jl
2 results
Show changes
"""
struct DatasetBuilder{name}
Allow to dispatch main dataset functions (`download()`, `prepare()`).
# Parameter
- `name` Dataset identifier
# Fields
- `kwargs::NamedTuple` Keyword arguments supported by the dataset associated to `name`
"""
struct DatasetBuilder{name}
kwargs::NamedTuple
end
"""
DatasetBuilder(name::Symbol)
Construct a DatasetBuilder for a given name.
Implementations for each name are done by calling [`declareBuilder(name)`](@ref) (automatically done for each supported name).
"""
DatasetBuilder(name::Symbol) = DatasetBuilder{name}()
"""
get_kwargs(func_name::Function, args_types::Tuple)
Return a `NamedTuple` containing each supported kwarg and its default value for a given method.
# Arguments
- `func_name` is the name of the function
- `args_types` is a tuple of argument types for the desired method
"""
function get_kwargs(func_name::Function, args_types::Tuple)
kwargs_names = Base.kwarg_decl(methods(func_name, args_types)[1])
l = length(kwargs_names)
if l==0
# no kwargs
return (;)
else
# lowered form of the method contains kwargs default values, but some are hidden
code = code_lowered(func_name, args_types)[1].code
str_code = ["$c" for c in code]
# get index corresponding to the function
index = findall(x -> occursin("$func_name", x), str_code)[1]
# get lowered value of each kwarg
values = code[index].args[2:2+l-1]
# get back the original value according to the lowered value type
kwargs_values = map(v ->
if v isa Core.SSAValue
eval(code[v.id])
elseif v isa GlobalRef
eval(v)
else
v
end
, values)
# reconstruct kwargs
NamedTuple(zip(kwargs_names, kwargs_values))
end
end
"""
get_nametype(name::String)
Return a symbol corresponding to the name. This symbol is used to identify the dataset.
"""
get_nametype(name::String) = Symbol(replace(name, " "=>"")) # simply remove space
"""
get_dataset_kwargs(name::String)
Return a `NamedTuple` containing each supported kwarg and its default value for a dataset identified by name.
"""
get_dataset_kwargs(name::String) = get_dataset_kwargs(get_nametype(name))
"""
get_dataset_kwargs(name::Symbol)
Return a `NamedTuple` containing each supported kwarg and its default value for a dataset identified by symbol (nametype).
"""
get_dataset_kwargs(name::Symbol) = get_kwargs(prepare, (DatasetBuilder{name}, AbstractString, AbstractString))
"""
declareBuilder(name::Symbol)
Declare a functor for a DatasetBuilder of type `name`.\n
A `DatasetBuilder{name}` object can now be created, and will hold the supported kwargs for the corresponding dataset.
"""
function declareBuilder(name::Symbol)
kwargs = get_dataset_kwargs(name)
quotedname = Meta.quot(name)
eval(Meta.parse("(::Type{DatasetBuilder{$quotedname}})() = DatasetBuilder{$quotedname}($kwargs)"))
end
# Each dataset should implement prepare() (and optionally download()) to be usable in dataset()
"""
    Base.download(builder::DatasetBuilder{name}, dir::AbstractString)

Download the dataset identified by `name` into `dir`.\n
Each dataset provides its own implementation when downloading is supported (for
example, a proprietary dataset might not implement `download`).
"""
Base.download
"""
prepare(::DatasetBuilder{name}, inputdir, outputdir; <keyword arguments>)
Create manifest files into `outputdir` from dataset in `inputdir`. \n
Each dataset has its own implementation, and can have optional keyword arguments, they can be accessed with [`get_dataset_kwargs(name::String)`](@ref).\n
Implementing this function is mandatory for a dataset to be compatible with `dataset()`
"""
function prepare end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
const avid_id = get_nametype("AVID")
function avid_recordings(dir::AbstractString) function avid_recordings(dir::AbstractString)
checkdir(dir) checkdir(dir)
...@@ -12,7 +14,7 @@ function avid_recordings(dir::AbstractString) ...@@ -12,7 +14,7 @@ function avid_recordings(dir::AbstractString)
id = filename id = filename
path = joinpath(root, file) path = joinpath(root, file)
audio_src = FileAudioSource(path) audio_src = AudioSources.FileAudioSource(path)
recordings[id] = Recording( recordings[id] = Recording(
id, id,
...@@ -43,7 +45,7 @@ function get_metadata(filename, metadatadict) ...@@ -43,7 +45,7 @@ function get_metadata(filename, metadatadict)
file_metadata = split(file_metadata, ",") file_metadata = split(file_metadata, ",")
metadata = Dict( metadata = Dict(
headers[i] => file_metadata[i] headers[i] => file_metadata[i]
for i = 1:length(headers) for i in eachindex(headers)
) )
return metadata return metadata
end end
...@@ -79,7 +81,7 @@ function avid_annotations(dir) ...@@ -79,7 +81,7 @@ function avid_annotations(dir)
end end
function download_avid(dir) function Base.download(::DatasetBuilder{avid_id}, dir::AbstractString)
@info "Directory $dir not found.\nDownloading AVID dataset (9.9 GB)" @info "Directory $dir not found.\nDownloading AVID dataset (9.9 GB)"
url = "https://zenodo.org/records/10524873/files/AVID.zip?download=1" url = "https://zenodo.org/records/10524873/files/AVID.zip?download=1"
filename = "AVID.zip" filename = "AVID.zip"
...@@ -93,10 +95,7 @@ function download_avid(dir) ...@@ -93,10 +95,7 @@ function download_avid(dir)
end end
function avid_prepare(datadir, outputdir) function prepare(::DatasetBuilder{avid_id}, datadir, outputdir)
# Validate the data directory
isdir(datadir) || (datadir = download_avid(datadir))
# Create the output directory. # Create the output directory.
outputdir = mkpath(outputdir) outputdir = mkpath(outputdir)
rm(joinpath(outputdir, "recordings.jsonl"), force=true) rm(joinpath(outputdir, "recordings.jsonl"), force=true)
...@@ -128,13 +127,3 @@ function avid_prepare(datadir, outputdir) ...@@ -128,13 +127,3 @@ function avid_prepare(datadir, outputdir)
writemanifest(f, annotations) writemanifest(f, annotations)
end end
end end
function AVID(datadir, outputdir)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "calibration_tones.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
avid_prepare(datadir, outputdir)
end
dataset(outputdir, "")
end
[
{
"name": "AVID",
"lang": "eng",
"license": "CC BY 4.0",
"source": "https://zenodo.org/records/10524873",
"authors": ["Manila Kodali", "Paavo Alku", "Sudarsana Reddy Kadiri"],
"description": "Aalto Vocal Intensity Database includes speech and EGG produced by 50 speakers (25 males, 25 females) who varied their vocal intensity in four categories (soft, normal, loud, and very loud)."
},
{
"name": "INA Diachrony",
"lang": "fra",
"license": "proprietary",
"description": "Voice recordings and transcriptions sorted by time period, sex and speaker."
},
{
"name": "Mini LibriSpeech",
"lang": "eng",
"license": "CC BY 4.0",
"source": "https://www.openslr.org/31/",
"authors": ["Vassil Panayotov", "Daniel Povey"],
"description": "Subset of LibriSpeech corpus for purpose of regression testing.",
"subsets": ["train", "dev"]
},
{
"name": "Multilingual LibriSpeech",
"lang": ["eng", "fra", "prt", "esp", "deu", "nld", "ita", "pol"],
"license": "CC BY 4.0",
"source": "http://www.openslr.org/94",
"authors": ["Vineel Pratap", "Qiantong Xu", "Anuroop Sriram", "Gabriel Synnaeve", "Ronan Collobert"],
"description": "Multilingual LibriSpeech (MLS) dataset is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish",
"subsets": ["train", "dev", "test"]
},
{
"name": "TIMIT",
"lang": "eng",
"license": "LDC User Agreement for Non-Members",
"source": "https://catalog.ldc.upenn.edu/LDC93S1",
"authors": ["John S. Garofolo", "Lori F. Lamel", "William M. Fisher", "Jonathan G. Fiscus", "David S. Pallett", "Nancy L. Dahlgren", "Victor Zue"],
"description": "The TIMIT corpus of read speech has been designed to provide speech data for the acquisition of acoustic-phonetic knowledge and for the development and evaluation of automatic speech recognition systems.",
"subsets": ["train", "dev", "test"]
},
{
"name": "Speech2Tex",
"lang": "fra",
"license": "proprietary",
"authors": ["Lorenzo Brucato"],
"description": "Recordings of read equations, literal transcriptions and latex transcriptions."
}
]
\ No newline at end of file
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
const ina_diachrony_id = get_nametype("INA Diachrony")
function ina_diachrony_recordings(dir::AbstractString) function ina_diachrony_recordings(dir::AbstractString)
checkdir(dir) checkdir(dir)
...@@ -12,7 +13,7 @@ function ina_diachrony_recordings(dir::AbstractString) ...@@ -12,7 +13,7 @@ function ina_diachrony_recordings(dir::AbstractString)
id = "ina_diachrony§$filename" id = "ina_diachrony§$filename"
path = joinpath(root, file) path = joinpath(root, file)
audio_src = FileAudioSource(path) audio_src = AudioSource.FileAudioSource(path)
recordings[id] = Recording( recordings[id] = Recording(
id, id,
...@@ -116,7 +117,7 @@ function ina_diachrony_annotations_csv(dir) ...@@ -116,7 +117,7 @@ function ina_diachrony_annotations_csv(dir)
end end
function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir) function prepare(::DatasetBuilder{ina_diachrony_id}, ina_wav_dir, outputdir; ina_csv_dir=nothing)
# Validate the data directory # Validate the data directory
for d in [ina_wav_dir, ina_csv_dir] for d in [ina_wav_dir, ina_csv_dir]
isnothing(d) || checkdir(d) isnothing(d) || checkdir(d)
...@@ -150,11 +151,3 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir) ...@@ -150,11 +151,3 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
writemanifest(f, annotations) writemanifest(f, annotations)
end end
end end
function INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
end
dataset(outputdir, "")
end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: .1
####################################################################### #######################################################################
...@@ -13,12 +13,8 @@ const MINILS_SUBSETS = Dict( ...@@ -13,12 +13,8 @@ const MINILS_SUBSETS = Dict(
####################################################################### #######################################################################
struct MINILIBRISPEECH <: SpeechCorpus const minils_id = get_nametype("Mini LibriSpeech")
recordings
train
dev
test
end
function minils_recordings(dir, subset) function minils_recordings(dir, subset)
subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset]) subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset])
...@@ -31,7 +27,7 @@ function minils_recordings(dir, subset) ...@@ -31,7 +27,7 @@ function minils_recordings(dir, subset)
id = replace(basename(path), ".flac" => "") id = replace(basename(path), ".flac" => "")
r = Recording( r = Recording(
id, id,
CmdAudioSource(`sox $path -t wav -`); AudioSources.CmdAudioSource(`sox $path -t wav -`);
channels = [1], channels = [1],
samplerate = 16000 samplerate = 16000
) )
...@@ -67,7 +63,7 @@ function minils_annotations(dir, subset) ...@@ -67,7 +63,7 @@ function minils_annotations(dir, subset)
sups sups
end end
function minils_download(dir) function Base.download(::DatasetBuilder{minils_id}, dir::AbstractString)
donefile = joinpath(dir, ".download.done") donefile = joinpath(dir, ".download.done")
if ! isfile(donefile) if ! isfile(donefile)
run(`mkdir -p $dir`) run(`mkdir -p $dir`)
...@@ -85,14 +81,16 @@ function minils_download(dir) ...@@ -85,14 +81,16 @@ function minils_download(dir)
@debug "dataset in $dir" @debug "dataset in $dir"
end end
function minils_prepare(dir) function prepare(::DatasetBuilder{minils_id}, inputdir, outputdir)
outputdir = mkpath(outputdir)
# 1. Recording manifest. # 1. Recording manifest.
out = joinpath(dir, "recordings.jsonl") out = joinpath(outputdir, "recordings.jsonl")
if ! isfile(out) if ! isfile(out)
open(out, "w") do f open(out, "a") do f
for subset in ["train", "dev"] for subset in ["train", "dev"]
@debug "preparing recording manifest ($subset) $out" @debug "preparing recording manifest ($subset) $out"
recs = minils_recordings(dir, subset) recs = minils_recordings(inputdir, subset)
writemanifest(f, recs) writemanifest(f, recs)
end end
end end
...@@ -100,21 +98,13 @@ function minils_prepare(dir) ...@@ -100,21 +98,13 @@ function minils_prepare(dir)
# 2. Annotation manifests. # 2. Annotation manifests.
for (subset, name) in [("train", "train"), ("dev", "dev"), ("dev", "test")] for (subset, name) in [("train", "train"), ("dev", "dev"), ("dev", "test")]
out = joinpath(dir, "annotations-$name.jsonl") out = joinpath(outputdir, "annotations-$name.jsonl")
if ! isfile(out) if ! isfile(out)
@debug "preparing annotation manifest ($subset) $out" @debug "preparing annotation manifest ($subset) $out"
sups = minils_annotations(dir, subset) sups = minils_annotations(inputdir, subset)
open(out, "w") do f open(out, "w") do f
writemanifest(f, sups) writemanifest(f, sups)
end end
end end
end end
end end
function MINILIBRISPEECH(dir, subset)
minils_download(dir)
minils_prepare(dir)
dataset(dir, subset)
end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
struct MultilingualLibriSpeech <: SpeechCorpus
lang
name
function MultilingualLibriSpeech(lang)
new(lang, "multilingual_librispeech")
end
end
const MLS_LANG_CODE = Dict( const MLS_LANG_CODE = Dict(
"deu" => "german", "deu" => "german",
...@@ -42,21 +33,24 @@ const MLS_LM_URLS = Dict( ...@@ -42,21 +33,24 @@ const MLS_LM_URLS = Dict(
"prt" => "https://dl.fbaipublicfiles.com/mls/mls_lm_portuguese.tar.gz" "prt" => "https://dl.fbaipublicfiles.com/mls/mls_lm_portuguese.tar.gz"
) )
function Base.download(corpus::MultilingualLibriSpeech, outdir) const mlls_id = get_nametype("Multilingual LibriSpeech")
dir = path(corpus, outdir)
function Base.download(builder::DatasetBuilder{mlls_id}, dir::AbstractString)
lang = builder.kwargs.lang
donefile = joinpath(dir, ".download.done") donefile = joinpath(dir, ".download.done")
if ! isfile(donefile) if ! isfile(donefile)
run(`mkdir -p $dir`) run(`mkdir -p $dir`)
@info "downloading the corpus" @info "downloading the corpus for language $lang"
run(`wget -P $dir $(MLS_AUDIO_URLS[corpus.lang])`) run(`wget -P $dir $(MLS_AUDIO_URLS[lang])`)
tarpath = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang]).tar.gz") tarpath = joinpath(dir, "mls_$(MLS_LANG_CODE[lang]).tar.gz")
@info "extracting" @info "extracting"
run(`tar -xf $tarpath -C $dir`) run(`tar -xf $tarpath -C $dir`)
run(`rm $tarpath`) run(`rm $tarpath`)
@info "downloading LM data" @info "downloading LM data"
run(`wget -P $dir $(MLS_LM_URLS[corpus.lang])`) run(`wget -P $dir $(MLS_LM_URLS[lang])`)
tarpath = joinpath(dir, "mls_lm_$(MLS_LANG_CODE[corpus.lang]).tar.gz") tarpath = joinpath(dir, "mls_lm_$(MLS_LANG_CODE[lang]).tar.gz")
@info "extracting" @info "extracting"
run(`tar -xf $tarpath -C $dir`) run(`tar -xf $tarpath -C $dir`)
run(`rm $tarpath`) run(`rm $tarpath`)
...@@ -64,12 +58,10 @@ function Base.download(corpus::MultilingualLibriSpeech, outdir) ...@@ -64,12 +58,10 @@ function Base.download(corpus::MultilingualLibriSpeech, outdir)
run(pipeline(`date`, stdout = donefile)) run(pipeline(`date`, stdout = donefile))
end end
@info "dataset in $dir" @info "dataset in $dir"
corpus
end end
function recordings(corpus::MultilingualLibriSpeech, dir, subset) function mlls_recordings(inputdir, subset, lang)
subsetdir = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "audio") subsetdir = joinpath(inputdir, "mls_$(MLS_LANG_CODE[lang])", subset, "audio")
recs = Dict() recs = Dict()
for d1 in readdir(subsetdir; join = true) for d1 in readdir(subsetdir; join = true)
...@@ -78,7 +70,7 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset) ...@@ -78,7 +70,7 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset)
id = replace(basename(path), ".flac" => "") id = replace(basename(path), ".flac" => "")
r = Recording( r = Recording(
id, id,
CmdAudioSource(`sox $path -t wav -`); AudioSources.CmdAudioSource(`sox $path -t wav -`);
channels = [1], channels = [1],
samplerate = 16000 samplerate = 16000
) )
...@@ -89,13 +81,13 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset) ...@@ -89,13 +81,13 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset)
recs recs
end end
function annotations(corpus::MultilingualLibriSpeech, dir, subset) function mlls_annotations(inputdir, subset, lang)
trans = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "transcripts.txt") trans = joinpath(inputdir, "mls_$(MLS_LANG_CODE[lang])", subset, "transcripts.txt")
sups = Dict() sups = Dict()
open(trans, "r") do f open(trans, "r") do f
for line in eachline(f) for line in eachline(f)
tokens = split(line) tokens = split(line)
s = Annotation(tokens[1], tokens[1]; channel = 1, s = Annotation(tokens[1], tokens[1]; channels = [1],
data = Dict("text" => join(tokens[2:end], " "))) data = Dict("text" => join(tokens[2:end], " ")))
sups[s.id] = s sups[s.id] = s
end end
...@@ -103,16 +95,16 @@ function annotations(corpus::MultilingualLibriSpeech, dir, subset) ...@@ -103,16 +95,16 @@ function annotations(corpus::MultilingualLibriSpeech, dir, subset)
sups sups
end end
function prepare(corpus::MultilingualLibriSpeech, outdir) function prepare(::DatasetBuilder{mlls_id}, inputdir, outputdir; lang="eng")
dir = path(corpus, outdir) outputdir = mkpath(outputdir)
# 1. Recording manifests. # 1. Recording manifests.
for subset in ["train", "dev", "test"] out = joinpath(outputdir, "recordings.jsonl")
out = joinpath(dir, "recording-manifest-$subset.jsonl") @info "preparing recording manifest $out"
@info "preparing recording manifest ($subset) $out" if ! isfile(out)
if ! isfile(out) open(out, "a") do f
recs = recordings(corpus, dir, subset) for subset in ["train", "dev", "test"]
open(out, "w") do f recs = mlls_recordings(inputdir, subset, lang)
writemanifest(f, recs) writemanifest(f, recs)
end end
end end
...@@ -120,16 +112,13 @@ function prepare(corpus::MultilingualLibriSpeech, outdir) ...@@ -120,16 +112,13 @@ function prepare(corpus::MultilingualLibriSpeech, outdir)
# 2. Annotation manifests. # 2. Annotation manifests.
for subset in ["train", "dev", "test"] for subset in ["train", "dev", "test"]
out = joinpath(dir, "annotation-manifest-$subset.jsonl") out = joinpath(outputdir, "annotations-$subset.jsonl")
@info "preparing annotation manifest ($subset) $out" @info "preparing annotation manifest ($subset) $out"
if ! isfile(out) if ! isfile(out)
sups = annotations(corpus, dir, subset) sups = mlls_annotations(inputdir, subset, lang)
open(out, "w") do f open(out, "w") do f
writemanifest(f, sups) writemanifest(f, sups)
end end
end end
end end
corpus
end end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
const speech2tex_id = get_nametype("Speech2Tex")
function speech2tex_recordings(dir::AbstractString) function speech2tex_recordings(dir::AbstractString)
checkdir(dir) checkdir(dir)
...@@ -12,7 +14,7 @@ function speech2tex_recordings(dir::AbstractString) ...@@ -12,7 +14,7 @@ function speech2tex_recordings(dir::AbstractString)
id = filename id = filename
path = joinpath(root, file) path = joinpath(root, file)
audio_src = FileAudioSource(path) audio_src = AudioSources.FileAudioSource(path)
recordings[id] = Recording( recordings[id] = Recording(
id, id,
...@@ -80,7 +82,7 @@ function speech2tex_annotations(audiodir, transcriptiondir, texdir) ...@@ -80,7 +82,7 @@ function speech2tex_annotations(audiodir, transcriptiondir, texdir)
annotations annotations
end end
function speech2tex_prepare(datadir, outputdir) function prepare(::DatasetBuilder{speech2tex_id}, datadir, outputdir)
# Validate the data directory # Validate the data directory
checkdir(datadir) checkdir(datadir)
...@@ -95,6 +97,7 @@ function speech2tex_prepare(datadir, outputdir) ...@@ -95,6 +97,7 @@ function speech2tex_prepare(datadir, outputdir)
recordings = speech2tex_recordings(recordings_path) recordings = speech2tex_recordings(recordings_path)
manifestpath = joinpath(outputdir, "recordings.jsonl") manifestpath = joinpath(outputdir, "recordings.jsonl")
@info "Creating $manifestpath"
open(manifestpath, "w") do f open(manifestpath, "w") do f
writemanifest(f, recordings) writemanifest(f, recordings)
end end
...@@ -111,12 +114,3 @@ function speech2tex_prepare(datadir, outputdir) ...@@ -111,12 +114,3 @@ function speech2tex_prepare(datadir, outputdir)
writemanifest(f, annotations) writemanifest(f, annotations)
end end
end end
function SPEECH2TEX(datadir, outputdir)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
speech2tex_prepare(datadir, outputdir)
end
dataset(outputdir, "")
end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
####################################################################### #######################################################################
...@@ -11,7 +11,7 @@ const TIMIT_SUBSETS = Dict( ...@@ -11,7 +11,7 @@ const TIMIT_SUBSETS = Dict(
const TIMIT_DEV_SPK_LIST = Set([ const TIMIT_DEV_SPK_LIST = Set([
"faks0", "faks0",
"fdac1", "fdac1",
"fjem0", "fjem0",
"mgwt0", "mgwt0",
...@@ -223,10 +223,14 @@ TIMIT_PHONE_MAP39 = Dict( ...@@ -223,10 +223,14 @@ TIMIT_PHONE_MAP39 = Dict(
####################################################################### #######################################################################
const timit_id = get_nametype("TIMIT")
function timit_prepare(timitdir, dir; audio_fmt="SPHERE") function prepare(::DatasetBuilder{timit_id}, timitdir, dir; formantsdir=nothing, audio_fmt="SPHERE")
# Validate the data directory # Validate the data directory
! isdir(timitdir) && throw(ArgumentError("invalid path $(timitdir)")) ! isdir(timitdir) && throw(ArgumentError("invalid path $(timitdir)"))
if formantsdir !== nothing
! isdir(formantsdir) && throw(ArgumentError("invalid path $(formantsdir)"))
end
# Create the output directory. # Create the output directory.
dir = mkpath(dir) dir = mkpath(dir)
...@@ -256,9 +260,9 @@ function timit_prepare(timitdir, dir; audio_fmt="SPHERE") ...@@ -256,9 +260,9 @@ function timit_prepare(timitdir, dir; audio_fmt="SPHERE")
# Annotations # Annotations
@info "Extracting annotations from $timitdir/train" @info "Extracting annotations from $timitdir/train"
train_annotations = timit_annotations(joinpath(timitdir, "train")) train_annotations = timit_annotations(joinpath(timitdir, "train"), formantsdir)
@info "Extracting annotations from $timitdir/test" @info "Extracting annotations from $timitdir/test"
test_annotations = timit_annotations(joinpath(timitdir, "test")) test_annotations = timit_annotations(joinpath(timitdir, "test"), formantsdir)
annotations = merge(train_annotations, test_annotations) annotations = merge(train_annotations, test_annotations)
...@@ -315,9 +319,9 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE") ...@@ -315,9 +319,9 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE")
id = "timit_$(spk)_$(name)" id = "timit_$(spk)_$(name)"
audio_src = if fmt == "SPHERE" audio_src = if fmt == "SPHERE"
CmdAudioSource(`sph2pipe -f wav $path`) AudioSources.CmdAudioSource(`sph2pipe -f wav $path`)
else else
FileAudioSource(path) AudioSources.FileAudioSource(path)
end end
recordings[id] = Recording( recordings[id] = Recording(
...@@ -332,8 +336,15 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE") ...@@ -332,8 +336,15 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE")
end end
function timit_annotations(dir) function timit_annotations(dir, formantsdir=nothing)
! isdir(dir) && throw(ArgumentError("expected directory $dir")) ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
if formantsdir !== nothing
ddir = last(splitdir(dir))
formantsdir = joinpath(formantsdir, ddir)
! isdir(formantsdir) && throw(ArgumentError("expected directory $formantsdir"))
end
splitline(line) = rsplit(line, limit=3) splitline(line) = rsplit(line, limit=3)
annotations = Dict() annotations = Dict()
...@@ -341,7 +352,7 @@ function timit_annotations(dir) ...@@ -341,7 +352,7 @@ function timit_annotations(dir)
for (root, subdirs, files) in walkdir(dir) for (root, subdirs, files) in walkdir(dir)
for file in files for file in files
name, ext = splitext(file) name, _ = splitext(file)
_, dialect, spk = rsplit(root, "/", limit=3) _, dialect, spk = rsplit(root, "/", limit=3)
# Annotation files already processed (".wrd" and ".phn") # Annotation files already processed (".wrd" and ".phn")
...@@ -355,11 +366,34 @@ function timit_annotations(dir) ...@@ -355,11 +366,34 @@ function timit_annotations(dir)
# Phones # Phones
ppath = joinpath(root, name * ".phn") ppath = joinpath(root, name * ".phn")
palign = Tuple{Int,Int,String}[] if formantsdir !== nothing
for line in eachline(ppath) forpath = joinpath(formantsdir, dialect, spk, name * ".ft")
t0, t1, p = split(line) else
push!(palign, (parse(Int, t0), parse(Int, t1), String(p))) forpath = ""
end
if isfile(forpath)
# Read availabled formants values
palign = Tuple{Int,Int,String,NTuple{2, Float32}, NTuple{2, Float32}, NTuple{2, Float32}, NTuple{2, Float32}}[]
for line in eachline(forpath)
t0, t1, p, f1, f2, f3, f4, b1, b2, b3, b4 = split(line)
push!(palign,
(
parse(Int, t0), parse(Int, t1), String(p),
(parse(Float32, f1), parse(Float32, b1)),
(parse(Float32, f2), parse(Float32, b2)),
(parse(Float32, f3), parse(Float32, b3)),
(parse(Float32, f4), parse(Float32, b4))
))
end
else
palign = Tuple{Int,Int,String}[]
for line in eachline(ppath)
t0, t1, p = split(line)
push!(palign, (parse(Int, t0), parse(Int, t1), String(p)))
end
end end
sentence_type = if startswith(name, "sa") sentence_type = if startswith(name, "sa")
"dialect" "dialect"
...@@ -389,15 +423,3 @@ function timit_annotations(dir) ...@@ -389,15 +423,3 @@ function timit_annotations(dir)
end end
annotations annotations
end end
function TIMIT(timitdir, dir, subset)
if ! (isfile(joinpath(dir, "recordings.jsonl")) &&
isfile(joinpath(dir, "annotations-train.jsonl")) &&
isfile(joinpath(dir, "annotations-dev.jsonl")) &&
isfile(joinpath(dir, "annotations-test.jsonl")))
timit_prepare(timitdir, dir)
end
dataset(dir, subset)
end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
"""
struct SpeechDatasetInfos
Store metadata about a dataset.
# Fields
- `name` Dataset official name
- `lang` Language or list of languages (ISO 639-3 code)
- `license` License name
- `source` URL to the dataset publication or content
- `authors` list of authors
- `description` A few sentences describing the content or main purpose
- `subsets` List of available subsets (for example ["train", "test"])
"""
@kwdef struct SpeechDatasetInfos
name::AbstractString = ""
lang::Union{AbstractString, Vector{AbstractString}} = ""
license::AbstractString = ""
source::AbstractString = ""
authors::Vector{AbstractString} = []
description::AbstractString = ""
subsets::Vector{AbstractString} = []
end
"""
    SpeechDatasetInfos(infos::AbstractDict)

Build a `SpeechDatasetInfos` from a dictionary, keeping only the entries whose
keys match a field name; absent keys fall back to the struct defaults.
"""
function SpeechDatasetInfos(infos::AbstractDict)
    found = Pair{Symbol, Any}[]
    for field in fieldnames(SpeechDatasetInfos)
        value = get(infos, String(field), nothing)
        # keep (field => value) only when the key was present
        isnothing(value) || push!(found, field => value)
    end
    SpeechDatasetInfos(; found...)
end
"""
SpeechDatasetInfos(name::AbstractString)
Construct a SpeechDatasetInfos from the Dataset name.
"""
function SpeechDatasetInfos(name::AbstractString)
corpora_infos = JSON.parsefile(corpora_file)
infos = filter(x -> x["name"]==name, corpora_infos)[1]
SpeechDatasetInfos(infos)
end
"""
struct SpeechDataset <: MLUtils.AbstractDataContainer
Store all dataset recordings and annotations. \n
It can be iterated, and will give a `Tuple{Recording, Annotation}` for each entry. Indexation can be done with integer or id.
# Fields
- `infos::SpeechDatasetInfos`
- `idxs::Vector{AbstractString}` id indexes to access elements
- `annotations::Dict{AbstractString, Annotation}` Annotation for each index
- `recordings::Dict{AbstractString, Recording}` Recording for each index
"""
struct SpeechDataset <: MLUtils.AbstractDataContainer struct SpeechDataset <: MLUtils.AbstractDataContainer
infos::SpeechDatasetInfos
idxs::Vector{AbstractString} idxs::Vector{AbstractString}
annotations::Dict{AbstractString, Annotation} annotations::Dict{AbstractString, Annotation}
recordings::Dict{AbstractString, Recording} recordings::Dict{AbstractString, Recording}
end end
""" function SpeechDataset(infos::SpeechDatasetInfos, annotations::Dict{AbstractString, Annotation}, recordings::Dict{AbstractString, Recording})
dataset(manifestroot) idxs = collect(keys(annotations))
SpeechDataset(infos, idxs, annotations, recordings)
Load `SpeechDataset` from manifest files stored in `manifestroot`. end
Each item of the dataset is a nested tuple `((samples, sampling_rate), Annotation.data)`.
See also [`Annotation`](@ref).
# Examples
```julia-repl
julia> ds = dataset("./manifests", :train)
SpeechDataset(
...
)
julia> ds[1]
(
(samples=[...], sampling_rate=16_000),
Dict(
"text" => "Annotation text here"
)
)
```
""" """
function dataset(manifestroot::AbstractString, partition) SpeechDataset(infos::SpeechDatasetInfos, manifestroot::AbstractString, subset::AbstractString)
partition_name = partition == "" ? "" : "-$(partition)" Create a SpeechDataset from manifest files and subset.
annot_path = joinpath(manifestroot, "annotations$(partition_name).jsonl") """
function SpeechDataset(infos::SpeechDatasetInfos, manifestroot::AbstractString, subset::AbstractString)
subset_name = subset == "" ? "" : "-$(subset)"
annot_path = joinpath(manifestroot, "annotations$(subset_name).jsonl")
rec_path = joinpath(manifestroot, "recordings.jsonl") rec_path = joinpath(manifestroot, "recordings.jsonl")
annotations = load(Annotation, annot_path) annotations = load_manifest(Annotation, annot_path)
recordings = load(Recording, rec_path) recordings = load_manifest(Recording, rec_path)
dataset(annotations, recordings) SpeechDataset(infos, annotations, recordings)
end
function dataset(annotations::AbstractDict, recordings::AbstractDict)
idxs = collect(keys(annotations))
SpeechDataset(idxs, annotations, recordings)
end end
Base.getindex(d::SpeechDataset, key::AbstractString) = d.recordings[key], d.annotations[key] Base.getindex(d::SpeechDataset, key::AbstractString) = d.recordings[key], d.annotations[key]
Base.getindex(d::SpeechDataset, idx::Integer) = getindex(d, d.idxs[idx]) Base.getindex(d::SpeechDataset, idx::Integer) = getindex(d, d.idxs[idx])
# Fix1 -> partial funcion with fixed 1st argument # Fix1 -> partial function with fixed 1st argument
Base.getindex(d::SpeechDataset, idxs::AbstractVector) = map(Base.Fix1(getindex, d), idxs) Base.getindex(d::SpeechDataset, idxs::AbstractVector) = map(Base.Fix1(getindex, d), idxs)
Base.length(d::SpeechDataset) = length(d.idxs) Base.length(d::SpeechDataset) = length(d.idxs)
...@@ -65,6 +98,117 @@ function Base.filter(fn, d::SpeechDataset) ...@@ -65,6 +98,117 @@ function Base.filter(fn, d::SpeechDataset)
k idset k idset
end end
SpeechDataset(fidxs, fannotations, frecs) SpeechDataset(d.infos, fidxs, fannotations, frecs)
end end
"""
struct_summary(object; additional=[])
Display fields and values of given object.
Can display additional informations if provided as (n,2) matrix
"""
function struct_summary(object; additional=[])
properties = collect(propertynames(object))
values = [getproperty(object, p) for p in properties]
description = Array{String}(undef, length(properties), 2)
description[:,1] = string.(properties)
description[:,2] = string.(values)
if ! isempty(additional)
description = vcat(description, additional)
push!(values, additional[:,1]...)
end
gap = maximum(length.(description[:,1]))+4
for (i, line) in enumerate(eachrow(description))
# remove vector type from string representation
if typeof(values[i]) <: Vector
line[2] = replace(line[2], r".*\["=>"[")
end
# print <field> <gap> <value>
println(line[1], repeat(' ', gap-length(line[1])), line[2])
end
end
"""
Base.summary(infos::SpeechDatasetInfos)
Display fields and values of given SpeechDatasetInfos
"""
function Base.summary(infos::SpeechDatasetInfos)
struct_summary(infos)
end
"""
Base.summary(dataset::SpeechDataset)
Display informations about given SpeechDataset
"""
function Base.summary(dataset::SpeechDataset)
additional = ["elements" string(length(dataset))]
struct_summary(dataset.infos, additional=additional)
end
"""
    get_outfiles(subsets)

Return the list of manifest file names expected for a dataset: the recordings
manifest plus one annotations manifest per subset, or a single unqualified
annotations manifest when the dataset has no subsets.
"""
function get_outfiles(subsets)
    files = ["recordings.jsonl"]
    if isempty(subsets)
        push!(files, "annotations.jsonl")
    else
        append!(files, ("annotations-$s.jsonl" for s in subsets))
    end
    return files
end
"""
dataset(name::AbstractString, inputdir::AbstractString, outputdir::AbstractString; <keyword arguments>)
Extract recordings and annotations for desired dataset.\n
Return a SpeechDataset object.\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `annotations-<subset>.jsonl` containing each annotation and associated metadata
# Arguments
- `name` Name of the dataset. Supported names are $corpora_names.
- `inputdir` Name of dataset directory. If the directory does not exists, it is created and the data is downloaded if possible. Not all datasets can be downloaded, for example proprietary datasets does not implements a download function.
- `outputdir` is the output directory for manifest files.
# Keyword Arguments
Common kwargs are
- `subset` Part of the dataset to load (for example "train" or "test").
- `lang` ISO 639-3 code of the language.
Other kwargs can be available depending on the dataset, they can be accessed with [`get_dataset_kwargs(name::String)`](@ref).
"""
function dataset(name::AbstractString, inputdir::AbstractString, outputdir::AbstractString; kwargs...)
# check name
name corpora_names || throw(ArgumentError("Name $name is not supported, try one of $corpora_names."))
nametype = get_nametype(name)
dataset(DatasetBuilder(nametype), name, inputdir, outputdir; kwargs...)
end
"""
    dataset(builder::DatasetBuilder, name, inputdir, outputdir; subset="", kwargs...)

Builder-specific implementation of [`dataset`](@ref): validate the keyword
arguments against `builder`, download the raw data when possible, prepare the
manifest files when missing, and return the resulting `SpeechDataset`.
"""
function dataset(builder::DatasetBuilder, name::AbstractString, inputdir::AbstractString, outputdir::AbstractString; subset="", kwargs...)
    # Materialize the keyword arguments as a NamedTuple.
    kwargs = values(kwargs)
    # Reject keyword arguments the builder does not support.
    all(k -> k ∈ keys(builder.kwargs), keys(kwargs)) || throw(ArgumentError("Unsupported keyword argument. Available are $(builder.kwargs)"))
    # Download if the builder implements it and inputdir doesn't exist yet.
    downloadable = hasmethod(download, Tuple{typeof(builder), AbstractString})
    if downloadable && !isdir(inputdir)
        download(builder, inputdir)
    end
    # Load static dataset information from the dataset name.
    infos = SpeechDatasetInfos(name)
    # Check the subset value: required when the dataset defines subsets.
    if !isempty(infos.subsets) && isempty(subset)
        throw(ArgumentError("The subset argument is required for this dataset, try one of $(infos.subsets)."))
    end
    subset ∈ [infos.subsets; ""] || throw(ArgumentError("Subset $subset is not supported, try one of $(infos.subsets)."))
    # Check the lang value if provided.
    if :lang ∈ keys(kwargs)
        # Interpolate kwargs.lang (a bare $lang here would raise UndefVarError).
        kwargs.lang ∈ infos.lang || throw(ArgumentError("Language $(kwargs.lang) is not supported, try one of $(infos.lang)."))
    end
    # Prepare the manifests only when some output files are missing.
    outfiles = get_outfiles(infos.subsets)
    if !all(isfile.(joinpath.(outputdir, outfiles)))
        prepare(builder, inputdir, outputdir; kwargs...)
    end
    SpeechDataset(infos, outputdir, subset)
end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40" const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40"
...@@ -17,7 +17,7 @@ end ...@@ -17,7 +17,7 @@ end
CMUDICT(path) CMUDICT(path)
Return the dictionary of pronunciation loaded from the CMU sphinx dictionary. Return the dictionary of pronunciation loaded from the CMU sphinx dictionary.
The CMU dicionaty will be donwloaded and stored into to `path`. Subsequent The CMU dictionary will be donwloaded and stored into to `path`. Subsequent
calls will only read the file `path` without downloading again the data. calls will only read the file `path` without downloading again the data.
""" """
function CMUDICT(path) function CMUDICT(path)
...@@ -76,9 +76,8 @@ end ...@@ -76,9 +76,8 @@ end
""" """
MFAFRDICT(path) MFAFRDICT(path)
Return the french dictionary of pronunciation as provided by MFA (french_mfa v2.0.0a) Return the french dictionary of pronunciation as provided by MFA (french_mfa v2.0.0a).
""" """
function MFAFRDICT(path) function MFAFRDICT(path)
if ! isfile(path) if ! isfile(path)
mkpath(dirname(path)) mkpath(dirname(path))
......
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
#=====================================================================# #=====================================================================#
# JSON serialization of a manifest item # JSON serialization of a manifest item
function Base.show(io::IO, m::MIME"application/json", s::FileAudioSource) function Base.show(io::IO, m::MIME"application/json", s::AudioSources.FileAudioSource)
compact = get(io, :compact, false) compact = get(io, :compact, false)
indent = get(io, :indent, 0) indent = get(io, :indent, 0)
printfn = compact ? print : println printfn = compact ? print : println
...@@ -13,7 +13,7 @@ function Base.show(io::IO, m::MIME"application/json", s::FileAudioSource) ...@@ -13,7 +13,7 @@ function Base.show(io::IO, m::MIME"application/json", s::FileAudioSource)
print(io, repeat(" ", indent), "}") print(io, repeat(" ", indent), "}")
end end
function Base.show(io::IO, m::MIME"application/json", s::URLAudioSource) function Base.show(io::IO, m::MIME"application/json", s::AudioSources.URLAudioSource)
compact = get(io, :compact, false) compact = get(io, :compact, false)
indent = get(io, :indent, 0) indent = get(io, :indent, 0)
printfn = compact ? print : println printfn = compact ? print : println
...@@ -23,7 +23,7 @@ function Base.show(io::IO, m::MIME"application/json", s::URLAudioSource) ...@@ -23,7 +23,7 @@ function Base.show(io::IO, m::MIME"application/json", s::URLAudioSource)
print(io, repeat(" ", indent), "}") print(io, repeat(" ", indent), "}")
end end
function Base.show(io::IO, m::MIME"application/json", s::CmdAudioSource) function Base.show(io::IO, m::MIME"application/json", s::AudioSources.CmdAudioSource)
compact = get(io, :compact, false) compact = get(io, :compact, false)
indent = get(io, :indent, 0) indent = get(io, :indent, 0)
printfn = compact ? print : println printfn = compact ? print : println
...@@ -78,11 +78,11 @@ end ...@@ -78,11 +78,11 @@ end
function AudioSource(d::Dict) function AudioSource(d::Dict)
if d["type"] == "path" if d["type"] == "path"
T = FileAudioSource T = AudioSources.FileAudioSource
elseif d["type"] == "url" elseif d["type"] == "url"
T = URLAudioSource T = AudioSources.URLAudioSource
elseif d["type"] == "cmd" elseif d["type"] == "cmd"
T = CmdAudioSource T = AudioSources.CmdAudioSource
else else
throw(ArgumentError("invalid type: $(d["type"])")) throw(ArgumentError("invalid type: $(d["type"])"))
end end
...@@ -116,7 +116,7 @@ function writemanifest(io::IO, manifest::Dict) ...@@ -116,7 +116,7 @@ function writemanifest(io::IO, manifest::Dict)
end end
function readmanifest(io::IO, T) function readmanifest(io::IO, T)
manifest = Dict() manifest = Dict{AbstractString, T}()
for line in eachline(io) for line in eachline(io)
item = JSON.parse(line) |> T item = JSON.parse(line) |> T
manifest[item.id] = item manifest[item.id] = item
...@@ -129,12 +129,12 @@ manifestname(::Type{<:Recording}, name) = "recordings.jsonl" ...@@ -129,12 +129,12 @@ manifestname(::Type{<:Recording}, name) = "recordings.jsonl"
manifestname(::Type{<:Annotation}, name) = "annotations-$name.jsonl" manifestname(::Type{<:Annotation}, name) = "annotations-$name.jsonl"
""" """
load(Annotation, path) load_manifest(Annotation, path)
load(Recording, path) load_manifest(Recording, path)
Load Recording/Annotation manifest from `path`. Load Recording/Annotation manifest from `path`.
""" """
load(T::Type{<:Union{Recording, Annotation}}, path) = open(f -> readmanifest(f, T), path, "r") load_manifest(T::Type{<:Union{Recording, Annotation}}, path) = open(f -> readmanifest(f, T), path, "r")
function checkdir(dir::AbstractString) function checkdir(dir::AbstractString)
isdir(dir) || throw(ArgumentError("$dir is not an existing directory")) isdir(dir) || throw(ArgumentError("$dir is not an existing directory"))
......
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
""" """
abstract type ManifestItem end abstract type ManifestItem end
...@@ -26,18 +26,18 @@ If the channels or the sample rate are not provided then they will be ...@@ -26,18 +26,18 @@ If the channels or the sample rate are not provided then they will be
read from `source`. read from `source`.
!!! warning !!! warning
When preparing large corpus, not providing the channes and/or the When preparing large corpus, not providing the channels and/or the
sample rate can drastically reduce the speed as it forces to read sample rate can drastically reduce the speed as it forces to read
source. source.
""" """
struct Recording{Ts<:AbstractAudioSource} <: ManifestItem struct Recording{Ts<:AudioSources.AbstractAudioSource} <: ManifestItem
id::AbstractString id::AbstractString
source::Ts source::Ts
channels::Vector{Int} channels::Vector{Int}
samplerate::Int samplerate::Int
end end
function Recording(uttid, s::AbstractAudioSource; channels = missing, samplerate = missing) function Recording(uttid, s::AudioSources.AbstractAudioSource; channels = missing, samplerate = missing)
if ismissing(channels) || ismissing(samplerate) if ismissing(channels) || ismissing(samplerate)
x, sr = loadaudio(s) x, sr = loadaudio(s)
samplerate = ismissing(samplerate) ? Int(sr) : samplerate samplerate = ismissing(samplerate) ? Int(sr) : samplerate
...@@ -83,19 +83,16 @@ Annotation(id, recid; channels = missing, start = -1, duration = -1, data = miss ...@@ -83,19 +83,16 @@ Annotation(id, recid; channels = missing, start = -1, duration = -1, data = miss
""" """
load(recording[; start = -1, duration = -1, channels = recording.channels]) load(recording::Recording [; start = -1, duration = -1, channels = recording.channels])
load(recording, annotation) load(recording, annotation)
Load the signal from a recording. `start`, `duration` (in seconds) can Load the signal from a recording. `start`, `duration` (in seconds)
be used to load only a segment. If an `annotation` is given, function
will return on the portion of the signal corresponding to the
annotation segment.
The function returns a tuple `(x, sr)` where `x` is a ``NxC`` array The function returns a tuple `(x, sr)` where `x` is a ``N×C`` array
- ``N`` is the length of the signal and ``C`` is the number of channels - ``N`` is the length of the signal and ``C`` is the number of channels
- and `sr` is the sampling rate of the signal. - and `sr` is the sampling rate of the signal.
""" """
function load(r::Recording; start = -1, duration = -1, channels = r.channels) function AudioSources.load(r::Recording; start = -1, duration = -1, channels = r.channels)
if start >= 0 && duration >= 0 if start >= 0 && duration >= 0
s = Int(floor(start * r.samplerate + 1)) s = Int(floor(start * r.samplerate + 1))
e = Int(ceil(duration * r.samplerate)) e = Int(ceil(duration * r.samplerate))
...@@ -104,9 +101,14 @@ function load(r::Recording; start = -1, duration = -1, channels = r.channels) ...@@ -104,9 +101,14 @@ function load(r::Recording; start = -1, duration = -1, channels = r.channels)
subrange = (:) subrange = (:)
end end
x, sr = loadaudio(r.source, subrange) AudioSources.load(r.source, true, subrange=subrange, ch=channels)
x[:,channels], sr
end end
load(r::Recording, a::Annotation) = load(r; start = a.start, duration = a.duration, channels = a.channels) """
load(r::Recording, a::Annotation)
load(t::Tuple{Recording, Annotation})
Load only a segment of the recording referenced in the annotation.
"""
AudioSources.load(r::Recording, a::Annotation) = AudioSources.load(r; start = a.start, duration = a.duration, channels = a.channels)
AudioSources.load(t::Tuple{Recording, Annotation}) = AudioSources.load(t[1], t[2])
# SPDX-License-Identifier: CECILL-2.1
"""
abstract type SpeechCorpus end
Abstract type for all speech corpora.
"""
abstract type SpeechCorpus end
"""
lang(corpus)
Return the ISO 639-3 code of the language of the corpus.
"""
lang
"""
name(corpus)
Return the name identifier of the corpus.
"""
name
"""
download(corpus, rootdir)
Download the data of the corpus to `dir`.
"""
Base.download
"""
prepare(corpus, rootdir)
Prepare the manifests of corpus.
"""
prepare