Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • fast/speechdatasets.jl
  • PTAL/Datasets/SpeechDatasets.jl
2 results
Show changes
"""
struct DatasetBuilder{name}
Allow to dispatch main dataset functions (`download()`, `prepare()`).
# Parameter
- `name` Dataset identifier
# Fields
- `kwargs::NamedTuple` Keyword arguments supported by the dataset associated to `name`
"""
struct DatasetBuilder{name}
kwargs::NamedTuple
end
"""
DatasetBuilder(name::Symbol)
Construct a DatasetBuilder for a given name.
Implementations for each name are done by calling [`declareBuilder(name)`](@ref) (automatically done for each supported name).
"""
DatasetBuilder(name::Symbol) = DatasetBuilder{name}()
"""
get_kwargs(func_name::Function, args_types::Tuple)
Return a `NamedTuple` containing each supported kwarg and its default value for a given method.
# Arguments
- `func_name` is the name of the function
- `args_types` is a tuple of argument types for the desired method
"""
function get_kwargs(func_name::Function, args_types::Tuple)
kwargs_names = Base.kwarg_decl(methods(func_name, args_types)[1])
l = length(kwargs_names)
if l==0
# no kwargs
return (;)
else
# lowered form of the method contains kwargs default values, but some are hidden
code = code_lowered(func_name, args_types)[1].code
str_code = ["$c" for c in code]
# get index corresponding to the function
index = findall(x -> occursin("$func_name", x), str_code)[1]
# get lowered value of each kwarg
values = code[index].args[2:2+l-1]
# get back the original value according to the lowered value type
kwargs_values = map(v ->
if v isa Core.SSAValue
eval(code[v.id])
elseif v isa GlobalRef
eval(v)
else
v
end
, values)
# reconstruct kwargs
NamedTuple(zip(kwargs_names, kwargs_values))
end
end
"""
get_nametype(name::String)
Return a symbol corresponding to the name. This symbol is used to identify the dataset.
"""
get_nametype(name::String) = Symbol(replace(name, " "=>"")) # simply remove space
"""
get_dataset_kwargs(name::String)
Return a `NamedTuple` containing each supported kwarg and its default value for a dataset identified by name.
"""
get_dataset_kwargs(name::String) = get_dataset_kwargs(get_nametype(name))
"""
get_dataset_kwargs(name::Symbol)
Return a `NamedTuple` containing each supported kwarg and its default value for a dataset identified by symbol (nametype).
"""
get_dataset_kwargs(name::Symbol) = get_kwargs(prepare, (DatasetBuilder{name}, AbstractString, AbstractString))
"""
declareBuilder(name::Symbol)
Declare a functor for a DatasetBuilder of type `name`.\n
A `DatasetBuilder{name}` object can now be created, and will hold the supported kwargs for the corresponding dataset.
"""
function declareBuilder(name::Symbol)
kwargs = get_dataset_kwargs(name)
quotedname = Meta.quot(name)
eval(Meta.parse("(::Type{DatasetBuilder{$quotedname}})() = DatasetBuilder{$quotedname}($kwargs)"))
end
# Each dataset should implement prepare() (and optionally download()) to be usable in dataset()
"""
    Base.download(builder::DatasetBuilder{name}, dir::AbstractString)

Download the dataset identified by `name` into `dir`.\n
Each dataset provides its own implementation when downloading is supported (for
example, a proprietary dataset might not implement `download`).
"""
Base.download
"""
prepare(::DatasetBuilder{name}, inputdir, outputdir; <keyword arguments>)
Create manifest files into `outputdir` from dataset in `inputdir`. \n
Each dataset has its own implementation, and can have optional keyword arguments, they can be accessed with [`get_dataset_kwargs(name::String)`](@ref).\n
Implementing this function is mandatory for a dataset to be compatible with `dataset()`
"""
function prepare end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
const avid_id = get_nametype("AVID")
function avid_recordings(dir::AbstractString) function avid_recordings(dir::AbstractString)
checkdir(dir) checkdir(dir)
...@@ -12,7 +14,7 @@ function avid_recordings(dir::AbstractString) ...@@ -12,7 +14,7 @@ function avid_recordings(dir::AbstractString)
id = filename id = filename
path = joinpath(root, file) path = joinpath(root, file)
audio_src = FileAudioSource(path) audio_src = AudioSources.FileAudioSource(path)
recordings[id] = Recording( recordings[id] = Recording(
id, id,
...@@ -43,7 +45,7 @@ function get_metadata(filename, metadatadict) ...@@ -43,7 +45,7 @@ function get_metadata(filename, metadatadict)
file_metadata = split(file_metadata, ",") file_metadata = split(file_metadata, ",")
metadata = Dict( metadata = Dict(
headers[i] => file_metadata[i] headers[i] => file_metadata[i]
for i = 1:length(headers) for i in eachindex(headers)
) )
return metadata return metadata
end end
...@@ -79,7 +81,7 @@ function avid_annotations(dir) ...@@ -79,7 +81,7 @@ function avid_annotations(dir)
end end
function download_avid(dir) function Base.download(::DatasetBuilder{avid_id}, dir::AbstractString)
@info "Directory $dir not found.\nDownloading AVID dataset (9.9 GB)" @info "Directory $dir not found.\nDownloading AVID dataset (9.9 GB)"
url = "https://zenodo.org/records/10524873/files/AVID.zip?download=1" url = "https://zenodo.org/records/10524873/files/AVID.zip?download=1"
filename = "AVID.zip" filename = "AVID.zip"
...@@ -93,10 +95,7 @@ function download_avid(dir) ...@@ -93,10 +95,7 @@ function download_avid(dir)
end end
function avid_prepare(datadir, outputdir) function prepare(::DatasetBuilder{avid_id}, datadir, outputdir)
# Validate the data directory
isdir(datadir) || (datadir = download_avid(datadir))
# Create the output directory. # Create the output directory.
outputdir = mkpath(outputdir) outputdir = mkpath(outputdir)
rm(joinpath(outputdir, "recordings.jsonl"), force=true) rm(joinpath(outputdir, "recordings.jsonl"), force=true)
...@@ -128,13 +127,3 @@ function avid_prepare(datadir, outputdir) ...@@ -128,13 +127,3 @@ function avid_prepare(datadir, outputdir)
writemanifest(f, annotations) writemanifest(f, annotations)
end end
end end
function AVID(datadir, outputdir)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "calibration_tones.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
avid_prepare(datadir, outputdir)
end
dataset(outputdir, "")
end
[
{
"name": "AVID",
"lang": "eng",
"license": "CC BY 4.0",
"source": "https://zenodo.org/records/10524873",
"authors": ["Manila Kodali", "Paavo Alku", "Sudarsana Reddy Kadiri"],
"description": "Aalto Vocal Intensity Database includes speech and EGG produced by 50 speakers (25 males, 25 females) who varied their vocal intensity in four categories (soft, normal, loud, and very loud)."
},
{
"name": "INA Diachrony",
"lang": "fra",
"license": "proprietary",
"description": "Voice recordings and transcriptions sorted by time period, sex and speaker."
},
{
"name": "Mini LibriSpeech",
"lang": "eng",
"license": "CC BY 4.0",
"source": "https://www.openslr.org/31/",
"authors": ["Vassil Panayotov", "Daniel Povey"],
"description": "Subset of LibriSpeech corpus for purpose of regression testing.",
"subsets": ["train", "dev"]
},
{
"name": "Multilingual LibriSpeech",
"lang": ["eng", "fra", "prt", "esp", "deu", "nld", "ita", "pol"],
"license": "CC BY 4.0",
"source": "http://www.openslr.org/94",
"authors": ["Vineel Pratap", "Qiantong Xu", "Anuroop Sriram", "Gabriel Synnaeve", "Ronan Collobert"],
"description": "Multilingual LibriSpeech (MLS) dataset is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish",
"subsets": ["train", "dev", "test"]
},
{
"name": "TIMIT",
"lang": "eng",
"license": "LDC User Agreement for Non-Members",
"source": "https://catalog.ldc.upenn.edu/LDC93S1",
"authors": ["John S. Garofolo", "Lori F. Lamel", "William M. Fisher", "Jonathan G. Fiscus", "David S. Pallett", "Nancy L. Dahlgren", "Victor Zue"],
"description": "The TIMIT corpus of read speech has been designed to provide speech data for the acquisition of acoustic-phonetic knowledge and for the development and evaluation of automatic speech recognition systems.",
"subsets": ["train", "dev", "test"]
},
{
"name": "Speech2Tex",
"lang": "fra",
"license": "proprietary",
"authors": ["Lorenzo Brucato"],
"description": "Recordings of read equations, literal transcriptions and latex transcriptions."
}
]
\ No newline at end of file
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
const ina_diachrony_id = get_nametype("INA Diachrony")
function ina_diachrony_recordings(dir::AbstractString) function ina_diachrony_recordings(dir::AbstractString)
checkdir(dir) checkdir(dir)
...@@ -12,7 +13,7 @@ function ina_diachrony_recordings(dir::AbstractString) ...@@ -12,7 +13,7 @@ function ina_diachrony_recordings(dir::AbstractString)
id = "ina_diachrony§$filename" id = "ina_diachrony§$filename"
path = joinpath(root, file) path = joinpath(root, file)
audio_src = FileAudioSource(path) audio_src = AudioSource.FileAudioSource(path)
recordings[id] = Recording( recordings[id] = Recording(
id, id,
...@@ -116,7 +117,7 @@ function ina_diachrony_annotations_csv(dir) ...@@ -116,7 +117,7 @@ function ina_diachrony_annotations_csv(dir)
end end
function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir) function prepare(::DatasetBuilder{ina_diachrony_id}, ina_wav_dir, outputdir; ina_csv_dir=nothing)
# Validate the data directory # Validate the data directory
for d in [ina_wav_dir, ina_csv_dir] for d in [ina_wav_dir, ina_csv_dir]
isnothing(d) || checkdir(d) isnothing(d) || checkdir(d)
...@@ -150,11 +151,3 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir) ...@@ -150,11 +151,3 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
writemanifest(f, annotations) writemanifest(f, annotations)
end end
end end
function INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
end
dataset(outputdir, "")
end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: .1
####################################################################### #######################################################################
...@@ -13,12 +13,8 @@ const MINILS_SUBSETS = Dict( ...@@ -13,12 +13,8 @@ const MINILS_SUBSETS = Dict(
####################################################################### #######################################################################
struct MINILIBRISPEECH <: SpeechCorpus const minils_id = get_nametype("Mini LibriSpeech")
recordings
train
dev
test
end
function minils_recordings(dir, subset) function minils_recordings(dir, subset)
subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset]) subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset])
...@@ -31,7 +27,7 @@ function minils_recordings(dir, subset) ...@@ -31,7 +27,7 @@ function minils_recordings(dir, subset)
id = replace(basename(path), ".flac" => "") id = replace(basename(path), ".flac" => "")
r = Recording( r = Recording(
id, id,
CmdAudioSource(`sox $path -t wav -`); AudioSources.CmdAudioSource(`sox $path -t wav -`);
channels = [1], channels = [1],
samplerate = 16000 samplerate = 16000
) )
...@@ -67,7 +63,7 @@ function minils_annotations(dir, subset) ...@@ -67,7 +63,7 @@ function minils_annotations(dir, subset)
sups sups
end end
function minils_download(dir) function Base.download(::DatasetBuilder{minils_id}, dir::AbstractString)
donefile = joinpath(dir, ".download.done") donefile = joinpath(dir, ".download.done")
if ! isfile(donefile) if ! isfile(donefile)
run(`mkdir -p $dir`) run(`mkdir -p $dir`)
...@@ -85,14 +81,16 @@ function minils_download(dir) ...@@ -85,14 +81,16 @@ function minils_download(dir)
@debug "dataset in $dir" @debug "dataset in $dir"
end end
function minils_prepare(dir) function prepare(::DatasetBuilder{minils_id}, inputdir, outputdir)
outputdir = mkpath(outputdir)
# 1. Recording manifest. # 1. Recording manifest.
out = joinpath(dir, "recordings.jsonl") out = joinpath(outputdir, "recordings.jsonl")
if ! isfile(out) if ! isfile(out)
open(out, "w") do f open(out, "a") do f
for subset in ["train", "dev"] for subset in ["train", "dev"]
@debug "preparing recording manifest ($subset) $out" @debug "preparing recording manifest ($subset) $out"
recs = minils_recordings(dir, subset) recs = minils_recordings(inputdir, subset)
writemanifest(f, recs) writemanifest(f, recs)
end end
end end
...@@ -100,21 +98,13 @@ function minils_prepare(dir) ...@@ -100,21 +98,13 @@ function minils_prepare(dir)
# 2. Annotation manifests. # 2. Annotation manifests.
for (subset, name) in [("train", "train"), ("dev", "dev"), ("dev", "test")] for (subset, name) in [("train", "train"), ("dev", "dev"), ("dev", "test")]
out = joinpath(dir, "annotations-$name.jsonl") out = joinpath(outputdir, "annotations-$name.jsonl")
if ! isfile(out) if ! isfile(out)
@debug "preparing annotation manifest ($subset) $out" @debug "preparing annotation manifest ($subset) $out"
sups = minils_annotations(dir, subset) sups = minils_annotations(inputdir, subset)
open(out, "w") do f open(out, "w") do f
writemanifest(f, sups) writemanifest(f, sups)
end end
end end
end end
end end
function MINILIBRISPEECH(dir, subset)
minils_download(dir)
minils_prepare(dir)
dataset(dir, subset)
end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
struct MultilingualLibriSpeech <: SpeechCorpus
lang
name
function MultilingualLibriSpeech(lang)
new(lang, "multilingual_librispeech")
end
end
const MLS_LANG_CODE = Dict( const MLS_LANG_CODE = Dict(
"deu" => "german", "deu" => "german",
...@@ -42,21 +33,24 @@ const MLS_LM_URLS = Dict( ...@@ -42,21 +33,24 @@ const MLS_LM_URLS = Dict(
"prt" => "https://dl.fbaipublicfiles.com/mls/mls_lm_portuguese.tar.gz" "prt" => "https://dl.fbaipublicfiles.com/mls/mls_lm_portuguese.tar.gz"
) )
function Base.download(corpus::MultilingualLibriSpeech, outdir) const mlls_id = get_nametype("Multilingual LibriSpeech")
dir = path(corpus, outdir)
function Base.download(builder::DatasetBuilder{mlls_id}, dir::AbstractString)
lang = builder.kwargs.lang
donefile = joinpath(dir, ".download.done") donefile = joinpath(dir, ".download.done")
if ! isfile(donefile) if ! isfile(donefile)
run(`mkdir -p $dir`) run(`mkdir -p $dir`)
@info "downloading the corpus" @info "downloading the corpus for language $lang"
run(`wget -P $dir $(MLS_AUDIO_URLS[corpus.lang])`) run(`wget -P $dir $(MLS_AUDIO_URLS[lang])`)
tarpath = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang]).tar.gz") tarpath = joinpath(dir, "mls_$(MLS_LANG_CODE[lang]).tar.gz")
@info "extracting" @info "extracting"
run(`tar -xf $tarpath -C $dir`) run(`tar -xf $tarpath -C $dir`)
run(`rm $tarpath`) run(`rm $tarpath`)
@info "downloading LM data" @info "downloading LM data"
run(`wget -P $dir $(MLS_LM_URLS[corpus.lang])`) run(`wget -P $dir $(MLS_LM_URLS[lang])`)
tarpath = joinpath(dir, "mls_lm_$(MLS_LANG_CODE[corpus.lang]).tar.gz") tarpath = joinpath(dir, "mls_lm_$(MLS_LANG_CODE[lang]).tar.gz")
@info "extracting" @info "extracting"
run(`tar -xf $tarpath -C $dir`) run(`tar -xf $tarpath -C $dir`)
run(`rm $tarpath`) run(`rm $tarpath`)
...@@ -64,12 +58,10 @@ function Base.download(corpus::MultilingualLibriSpeech, outdir) ...@@ -64,12 +58,10 @@ function Base.download(corpus::MultilingualLibriSpeech, outdir)
run(pipeline(`date`, stdout = donefile)) run(pipeline(`date`, stdout = donefile))
end end
@info "dataset in $dir" @info "dataset in $dir"
corpus
end end
function recordings(corpus::MultilingualLibriSpeech, dir, subset) function mlls_recordings(inputdir, subset, lang)
subsetdir = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "audio") subsetdir = joinpath(inputdir, "mls_$(MLS_LANG_CODE[lang])", subset, "audio")
recs = Dict() recs = Dict()
for d1 in readdir(subsetdir; join = true) for d1 in readdir(subsetdir; join = true)
...@@ -78,7 +70,7 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset) ...@@ -78,7 +70,7 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset)
id = replace(basename(path), ".flac" => "") id = replace(basename(path), ".flac" => "")
r = Recording( r = Recording(
id, id,
CmdAudioSource(`sox $path -t wav -`); AudioSources.CmdAudioSource(`sox $path -t wav -`);
channels = [1], channels = [1],
samplerate = 16000 samplerate = 16000
) )
...@@ -89,13 +81,13 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset) ...@@ -89,13 +81,13 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset)
recs recs
end end
function annotations(corpus::MultilingualLibriSpeech, dir, subset) function mlls_annotations(inputdir, subset, lang)
trans = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "transcripts.txt") trans = joinpath(inputdir, "mls_$(MLS_LANG_CODE[lang])", subset, "transcripts.txt")
sups = Dict() sups = Dict()
open(trans, "r") do f open(trans, "r") do f
for line in eachline(f) for line in eachline(f)
tokens = split(line) tokens = split(line)
s = Annotation(tokens[1], tokens[1]; channel = 1, s = Annotation(tokens[1], tokens[1]; channels = [1],
data = Dict("text" => join(tokens[2:end], " "))) data = Dict("text" => join(tokens[2:end], " ")))
sups[s.id] = s sups[s.id] = s
end end
...@@ -103,16 +95,16 @@ function annotations(corpus::MultilingualLibriSpeech, dir, subset) ...@@ -103,16 +95,16 @@ function annotations(corpus::MultilingualLibriSpeech, dir, subset)
sups sups
end end
function prepare(corpus::MultilingualLibriSpeech, outdir) function prepare(::DatasetBuilder{mlls_id}, inputdir, outputdir; lang="eng")
dir = path(corpus, outdir) outputdir = mkpath(outputdir)
# 1. Recording manifests. # 1. Recording manifests.
for subset in ["train", "dev", "test"] out = joinpath(outputdir, "recordings.jsonl")
out = joinpath(dir, "recording-manifest-$subset.jsonl") @info "preparing recording manifest $out"
@info "preparing recording manifest ($subset) $out" if ! isfile(out)
if ! isfile(out) open(out, "a") do f
recs = recordings(corpus, dir, subset) for subset in ["train", "dev", "test"]
open(out, "w") do f recs = mlls_recordings(inputdir, subset, lang)
writemanifest(f, recs) writemanifest(f, recs)
end end
end end
...@@ -120,16 +112,13 @@ function prepare(corpus::MultilingualLibriSpeech, outdir) ...@@ -120,16 +112,13 @@ function prepare(corpus::MultilingualLibriSpeech, outdir)
# 2. Annotation manifests. # 2. Annotation manifests.
for subset in ["train", "dev", "test"] for subset in ["train", "dev", "test"]
out = joinpath(dir, "annotation-manifest-$subset.jsonl") out = joinpath(outputdir, "annotations-$subset.jsonl")
@info "preparing annotation manifest ($subset) $out" @info "preparing annotation manifest ($subset) $out"
if ! isfile(out) if ! isfile(out)
sups = annotations(corpus, dir, subset) sups = mlls_annotations(inputdir, subset, lang)
open(out, "w") do f open(out, "w") do f
writemanifest(f, sups) writemanifest(f, sups)
end end
end end
end end
corpus
end end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
const speech2tex_id = get_nametype("Speech2Tex")
function speech2tex_recordings(dir::AbstractString) function speech2tex_recordings(dir::AbstractString)
checkdir(dir) checkdir(dir)
...@@ -12,7 +14,7 @@ function speech2tex_recordings(dir::AbstractString) ...@@ -12,7 +14,7 @@ function speech2tex_recordings(dir::AbstractString)
id = filename id = filename
path = joinpath(root, file) path = joinpath(root, file)
audio_src = FileAudioSource(path) audio_src = AudioSources.FileAudioSource(path)
recordings[id] = Recording( recordings[id] = Recording(
id, id,
...@@ -80,7 +82,7 @@ function speech2tex_annotations(audiodir, transcriptiondir, texdir) ...@@ -80,7 +82,7 @@ function speech2tex_annotations(audiodir, transcriptiondir, texdir)
annotations annotations
end end
function speech2tex_prepare(datadir, outputdir) function prepare(::DatasetBuilder{speech2tex_id}, datadir, outputdir)
# Validate the data directory # Validate the data directory
checkdir(datadir) checkdir(datadir)
...@@ -95,6 +97,7 @@ function speech2tex_prepare(datadir, outputdir) ...@@ -95,6 +97,7 @@ function speech2tex_prepare(datadir, outputdir)
recordings = speech2tex_recordings(recordings_path) recordings = speech2tex_recordings(recordings_path)
manifestpath = joinpath(outputdir, "recordings.jsonl") manifestpath = joinpath(outputdir, "recordings.jsonl")
@info "Creating $manifestpath"
open(manifestpath, "w") do f open(manifestpath, "w") do f
writemanifest(f, recordings) writemanifest(f, recordings)
end end
...@@ -111,12 +114,3 @@ function speech2tex_prepare(datadir, outputdir) ...@@ -111,12 +114,3 @@ function speech2tex_prepare(datadir, outputdir)
writemanifest(f, annotations) writemanifest(f, annotations)
end end
end end
function SPEECH2TEX(datadir, outputdir)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
speech2tex_prepare(datadir, outputdir)
end
dataset(outputdir, "")
end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
####################################################################### #######################################################################
...@@ -11,7 +11,7 @@ const TIMIT_SUBSETS = Dict( ...@@ -11,7 +11,7 @@ const TIMIT_SUBSETS = Dict(
const TIMIT_DEV_SPK_LIST = Set([ const TIMIT_DEV_SPK_LIST = Set([
"faks0", "faks0",
"fdac1", "fdac1",
"fjem0", "fjem0",
"mgwt0", "mgwt0",
...@@ -223,10 +223,14 @@ TIMIT_PHONE_MAP39 = Dict( ...@@ -223,10 +223,14 @@ TIMIT_PHONE_MAP39 = Dict(
####################################################################### #######################################################################
const timit_id = get_nametype("TIMIT")
function timit_prepare(timitdir, dir; audio_fmt="SPHERE") function prepare(::DatasetBuilder{timit_id}, timitdir, dir; formantsdir=nothing, audio_fmt="SPHERE")
# Validate the data directory # Validate the data directory
! isdir(timitdir) && throw(ArgumentError("invalid path $(timitdir)")) ! isdir(timitdir) && throw(ArgumentError("invalid path $(timitdir)"))
if formantsdir !== nothing
! isdir(formantsdir) && throw(ArgumentError("invalid path $(formantsdir)"))
end
# Create the output directory. # Create the output directory.
dir = mkpath(dir) dir = mkpath(dir)
...@@ -256,9 +260,9 @@ function timit_prepare(timitdir, dir; audio_fmt="SPHERE") ...@@ -256,9 +260,9 @@ function timit_prepare(timitdir, dir; audio_fmt="SPHERE")
# Annotations # Annotations
@info "Extracting annotations from $timitdir/train" @info "Extracting annotations from $timitdir/train"
train_annotations = timit_annotations(joinpath(timitdir, "train")) train_annotations = timit_annotations(joinpath(timitdir, "train"), formantsdir)
@info "Extracting annotations from $timitdir/test" @info "Extracting annotations from $timitdir/test"
test_annotations = timit_annotations(joinpath(timitdir, "test")) test_annotations = timit_annotations(joinpath(timitdir, "test"), formantsdir)
annotations = merge(train_annotations, test_annotations) annotations = merge(train_annotations, test_annotations)
...@@ -315,9 +319,9 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE") ...@@ -315,9 +319,9 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE")
id = "timit_$(spk)_$(name)" id = "timit_$(spk)_$(name)"
audio_src = if fmt == "SPHERE" audio_src = if fmt == "SPHERE"
CmdAudioSource(`sph2pipe -f wav $path`) AudioSources.CmdAudioSource(`sph2pipe -f wav $path`)
else else
FileAudioSource(path) AudioSources.FileAudioSource(path)
end end
recordings[id] = Recording( recordings[id] = Recording(
...@@ -332,8 +336,15 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE") ...@@ -332,8 +336,15 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE")
end end
function timit_annotations(dir) function timit_annotations(dir, formantsdir=nothing)
! isdir(dir) && throw(ArgumentError("expected directory $dir")) ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
if formantsdir !== nothing
ddir = last(splitdir(dir))
formantsdir = joinpath(formantsdir, ddir)
! isdir(formantsdir) && throw(ArgumentError("expected directory $formantsdir"))
end
splitline(line) = rsplit(line, limit=3) splitline(line) = rsplit(line, limit=3)
annotations = Dict() annotations = Dict()
...@@ -341,7 +352,7 @@ function timit_annotations(dir) ...@@ -341,7 +352,7 @@ function timit_annotations(dir)
for (root, subdirs, files) in walkdir(dir) for (root, subdirs, files) in walkdir(dir)
for file in files for file in files
name, ext = splitext(file) name, _ = splitext(file)
_, dialect, spk = rsplit(root, "/", limit=3) _, dialect, spk = rsplit(root, "/", limit=3)
# Annotation files already processed (".wrd" and ".phn") # Annotation files already processed (".wrd" and ".phn")
...@@ -355,11 +366,34 @@ function timit_annotations(dir) ...@@ -355,11 +366,34 @@ function timit_annotations(dir)
# Phones # Phones
ppath = joinpath(root, name * ".phn") ppath = joinpath(root, name * ".phn")
palign = Tuple{Int,Int,String}[] if formantsdir !== nothing
for line in eachline(ppath) forpath = joinpath(formantsdir, dialect, spk, name * ".ft")
t0, t1, p = split(line) else
push!(palign, (parse(Int, t0), parse(Int, t1), String(p))) forpath = ""
end
if isfile(forpath)
# Read availabled formants values
palign = Tuple{Int,Int,String,NTuple{2, Float32}, NTuple{2, Float32}, NTuple{2, Float32}, NTuple{2, Float32}}[]
for line in eachline(forpath)
t0, t1, p, f1, f2, f3, f4, b1, b2, b3, b4 = split(line)
push!(palign,
(
parse(Int, t0), parse(Int, t1), String(p),
(parse(Float32, f1), parse(Float32, b1)),
(parse(Float32, f2), parse(Float32, b2)),
(parse(Float32, f3), parse(Float32, b3)),
(parse(Float32, f4), parse(Float32, b4))
))
end
else
palign = Tuple{Int,Int,String}[]
for line in eachline(ppath)
t0, t1, p = split(line)
push!(palign, (parse(Int, t0), parse(Int, t1), String(p)))
end
end end
sentence_type = if startswith(name, "sa") sentence_type = if startswith(name, "sa")
"dialect" "dialect"
...@@ -389,15 +423,3 @@ function timit_annotations(dir) ...@@ -389,15 +423,3 @@ function timit_annotations(dir)
end end
annotations annotations
end end
function TIMIT(timitdir, dir, subset)
if ! (isfile(joinpath(dir, "recordings.jsonl")) &&
isfile(joinpath(dir, "annotations-train.jsonl")) &&
isfile(joinpath(dir, "annotations-dev.jsonl")) &&
isfile(joinpath(dir, "annotations-test.jsonl")))
timit_prepare(timitdir, dir)
end
dataset(dir, subset)
end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
"""
struct SpeechDatasetInfos
Store metadata about a dataset.
# Fields
- `name` Dataset official name
- `lang` Language or list of languages (ISO 639-3 code)
- `license` License name
- `source` URL to the dataset publication or content
- `authors` list of authors
- `description` A few sentences describing the content or main purpose
- `subsets` List of available subsets (for example ["train", "test"])
"""
@kwdef struct SpeechDatasetInfos
name::AbstractString = ""
lang::Union{AbstractString, Vector{AbstractString}} = ""
license::AbstractString = ""
source::AbstractString = ""
authors::Vector{AbstractString} = []
description::AbstractString = ""
subsets::Vector{AbstractString} = []
end
"""
    SpeechDatasetInfos(infos::AbstractDict)

Build a `SpeechDatasetInfos` from a dictionary, keeping only the entries whose
keys match a field name; absent keys fall back to the struct defaults.
"""
function SpeechDatasetInfos(infos::AbstractDict)
    found = Pair{Symbol, Any}[]
    for field in fieldnames(SpeechDatasetInfos)
        value = get(infos, String(field), nothing)
        # keep (field => value) only when the key was present
        isnothing(value) || push!(found, field => value)
    end
    SpeechDatasetInfos(; found...)
end
"""
SpeechDatasetInfos(name::AbstractString)
Construct a SpeechDatasetInfos from the Dataset name.
"""
function SpeechDatasetInfos(name::AbstractString)
corpora_infos = JSON.parsefile(corpora_file)
infos = filter(x -> x["name"]==name, corpora_infos)[1]
SpeechDatasetInfos(infos)
end
"""
struct SpeechDataset <: MLUtils.AbstractDataContainer
Store all dataset recordings and annotations. \n
It can be iterated, and will give a `Tuple{Recording, Annotation}` for each entry. Indexation can be done with integer or id.
# Fields
- `infos::SpeechDatasetInfos`
- `idxs::Vector{AbstractString}` id indexes to access elements
- `annotations::Dict{AbstractString, Annotation}` Annotation for each index
- `recordings::Dict{AbstractString, Recording}` Recording for each index
"""
struct SpeechDataset <: MLUtils.AbstractDataContainer struct SpeechDataset <: MLUtils.AbstractDataContainer
infos::SpeechDatasetInfos
idxs::Vector{AbstractString} idxs::Vector{AbstractString}
annotations::Dict{AbstractString, Annotation} annotations::Dict{AbstractString, Annotation}
recordings::Dict{AbstractString, Recording} recordings::Dict{AbstractString, Recording}
end end
""" function SpeechDataset(infos::SpeechDatasetInfos, annotations::Dict{AbstractString, Annotation}, recordings::Dict{AbstractString, Recording})
dataset(manifestroot) idxs = collect(keys(annotations))
SpeechDataset(infos, idxs, annotations, recordings)
Load `SpeechDataset` from manifest files stored in `manifestroot`. end
Each item of the dataset is a nested tuple `((samples, sampling_rate), Annotation.data)`.
See also [`Annotation`](@ref).
# Examples
```julia-repl
julia> ds = dataset("./manifests", :train)
SpeechDataset(
...
)
julia> ds[1]
(
(samples=[...], sampling_rate=16_000),
Dict(
"text" => "Annotation text here"
)
)
```
""" """
function dataset(manifestroot::AbstractString, partition) SpeechDataset(infos::SpeechDatasetInfos, manifestroot::AbstractString, subset::AbstractString)
partition_name = partition == "" ? "" : "-$(partition)" Create a SpeechDataset from manifest files and subset.
annot_path = joinpath(manifestroot, "annotations$(partition_name).jsonl") """
function SpeechDataset(infos::SpeechDatasetInfos, manifestroot::AbstractString, subset::AbstractString)
subset_name = subset == "" ? "" : "-$(subset)"
annot_path = joinpath(manifestroot, "annotations$(subset_name).jsonl")
rec_path = joinpath(manifestroot, "recordings.jsonl") rec_path = joinpath(manifestroot, "recordings.jsonl")
annotations = load(Annotation, annot_path) annotations = load_manifest(Annotation, annot_path)
recordings = load(Recording, rec_path) recordings = load_manifest(Recording, rec_path)
dataset(annotations, recordings) SpeechDataset(infos, annotations, recordings)
end
function dataset(annotations::AbstractDict, recordings::AbstractDict)
idxs = collect(keys(annotations))
SpeechDataset(idxs, annotations, recordings)
end end
Base.getindex(d::SpeechDataset, key::AbstractString) = d.recordings[key], d.annotations[key] Base.getindex(d::SpeechDataset, key::AbstractString) = d.recordings[key], d.annotations[key]
Base.getindex(d::SpeechDataset, idx::Integer) = getindex(d, d.idxs[idx]) Base.getindex(d::SpeechDataset, idx::Integer) = getindex(d, d.idxs[idx])
# Fix1 -> partial funcion with fixed 1st argument # Fix1 -> partial function with fixed 1st argument
Base.getindex(d::SpeechDataset, idxs::AbstractVector) = map(Base.Fix1(getindex, d), idxs) Base.getindex(d::SpeechDataset, idxs::AbstractVector) = map(Base.Fix1(getindex, d), idxs)
Base.length(d::SpeechDataset) = length(d.idxs) Base.length(d::SpeechDataset) = length(d.idxs)
...@@ -65,6 +98,117 @@ function Base.filter(fn, d::SpeechDataset) ...@@ -65,6 +98,117 @@ function Base.filter(fn, d::SpeechDataset)
k idset k idset
end end
SpeechDataset(fidxs, fannotations, frecs) SpeechDataset(d.infos, fidxs, fannotations, frecs)
end end
"""
struct_summary(object; additional=[])
Display fields and values of given object.
Can display additional informations if provided as (n,2) matrix
"""
function struct_summary(object; additional=[])
properties = collect(propertynames(object))
values = [getproperty(object, p) for p in properties]
description = Array{String}(undef, length(properties), 2)
description[:,1] = string.(properties)
description[:,2] = string.(values)
if ! isempty(additional)
description = vcat(description, additional)
push!(values, additional[:,1]...)
end
gap = maximum(length.(description[:,1]))+4
for (i, line) in enumerate(eachrow(description))
# remove vector type from string representation
if typeof(values[i]) <: Vector
line[2] = replace(line[2], r".*\["=>"[")
end
# print <field> <gap> <value>
println(line[1], repeat(' ', gap-length(line[1])), line[2])
end
end
"""
Base.summary(infos::SpeechDatasetInfos)
Display fields and values of given SpeechDatasetInfos
"""
function Base.summary(infos::SpeechDatasetInfos)
struct_summary(infos)
end
"""
Base.summary(dataset::SpeechDataset)
Display informations about given SpeechDataset
"""
function Base.summary(dataset::SpeechDataset)
additional = ["elements" string(length(dataset))]
struct_summary(dataset.infos, additional=additional)
end
"""
    get_outfiles(subsets)

Return the list of manifest file names expected for a dataset: the recordings
manifest plus one annotations manifest per subset, or a single unqualified
annotations manifest when the dataset has no subsets.
"""
function get_outfiles(subsets)
    files = ["recordings.jsonl"]
    if isempty(subsets)
        push!(files, "annotations.jsonl")
    else
        append!(files, ("annotations-$s.jsonl" for s in subsets))
    end
    return files
end
"""
dataset(name::AbstractString, inputdir::AbstractString, outputdir::AbstractString; <keyword arguments>)
Extract recordings and annotations for desired dataset.\n
Return a SpeechDataset object.\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `annotations-<subset>.jsonl` containing each annotation and associated metadata
# Arguments
- `name` Name of the dataset. Supported names are $corpora_names.
- `inputdir` Name of dataset directory. If the directory does not exists, it is created and the data is downloaded if possible. Not all datasets can be downloaded, for example proprietary datasets does not implements a download function.
- `outputdir` is the output directory for manifest files.
# Keyword Arguments
Common kwargs are
- `subset` Part of the dataset to load (for example "train" or "test").
- `lang` ISO 639-3 code of the language.
Other kwargs can be available depending on the dataset, they can be accessed with [`get_dataset_kwargs(name::String)`](@ref).
"""
function dataset(name::AbstractString, inputdir::AbstractString, outputdir::AbstractString; kwargs...)
# check name
name corpora_names || throw(ArgumentError("Name $name is not supported, try one of $corpora_names."))
nametype = get_nametype(name)
dataset(DatasetBuilder(nametype), name, inputdir, outputdir; kwargs...)
end
"""
    dataset(builder::DatasetBuilder, name, inputdir, outputdir; subset="", kwargs...)

Builder-specific implementation of [`dataset`](@ref): validate the keyword
arguments against `builder`, download the raw data when possible, prepare the
manifest files when missing, and return the resulting `SpeechDataset`.
"""
function dataset(builder::DatasetBuilder, name::AbstractString, inputdir::AbstractString, outputdir::AbstractString; subset="", kwargs...)
    # Materialize the keyword arguments as a NamedTuple.
    kwargs = values(kwargs)
    # Reject keyword arguments the builder does not support.
    all(k -> k ∈ keys(builder.kwargs), keys(kwargs)) || throw(ArgumentError("Unsupported keyword argument. Available are $(builder.kwargs)"))
    # Download if the builder implements it and inputdir doesn't exist yet.
    downloadable = hasmethod(download, Tuple{typeof(builder), AbstractString})
    if downloadable && !isdir(inputdir)
        download(builder, inputdir)
    end
    # Load static dataset information from the dataset name.
    infos = SpeechDatasetInfos(name)
    # Check the subset value: required when the dataset defines subsets.
    if !isempty(infos.subsets) && isempty(subset)
        throw(ArgumentError("The subset argument is required for this dataset, try one of $(infos.subsets)."))
    end
    subset ∈ [infos.subsets; ""] || throw(ArgumentError("Subset $subset is not supported, try one of $(infos.subsets)."))
    # Check the lang value if provided.
    if :lang ∈ keys(kwargs)
        # Interpolate kwargs.lang (a bare $lang here would raise UndefVarError).
        kwargs.lang ∈ infos.lang || throw(ArgumentError("Language $(kwargs.lang) is not supported, try one of $(infos.lang)."))
    end
    # Prepare the manifests only when some output files are missing.
    outfiles = get_outfiles(infos.subsets)
    if !all(isfile.(joinpath.(outputdir, outfiles)))
        prepare(builder, inputdir, outputdir; kwargs...)
    end
    SpeechDataset(infos, outputdir, subset)
end
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40" const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40"
...@@ -17,7 +17,7 @@ end ...@@ -17,7 +17,7 @@ end
CMUDICT(path) CMUDICT(path)
Return the dictionary of pronunciation loaded from the CMU sphinx dictionary. Return the dictionary of pronunciation loaded from the CMU sphinx dictionary.
The CMU dicionaty will be donwloaded and stored into to `path`. Subsequent The CMU dictionary will be donwloaded and stored into to `path`. Subsequent
calls will only read the file `path` without downloading again the data. calls will only read the file `path` without downloading again the data.
""" """
function CMUDICT(path) function CMUDICT(path)
...@@ -76,9 +76,8 @@ end ...@@ -76,9 +76,8 @@ end
""" """
MFAFRDICT(path) MFAFRDICT(path)
Return the french dictionary of pronunciation as provided by MFA (french_mfa v2.0.0a) Return the french dictionary of pronunciation as provided by MFA (french_mfa v2.0.0a).
""" """
function MFAFRDICT(path) function MFAFRDICT(path)
if ! isfile(path) if ! isfile(path)
mkpath(dirname(path)) mkpath(dirname(path))
......
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
#=====================================================================# #=====================================================================#
# JSON serialization of a manifest item # JSON serialization of a manifest item
function Base.show(io::IO, m::MIME"application/json", s::FileAudioSource) function Base.show(io::IO, m::MIME"application/json", s::AudioSources.FileAudioSource)
compact = get(io, :compact, false) compact = get(io, :compact, false)
indent = get(io, :indent, 0) indent = get(io, :indent, 0)
printfn = compact ? print : println printfn = compact ? print : println
...@@ -13,7 +13,7 @@ function Base.show(io::IO, m::MIME"application/json", s::FileAudioSource) ...@@ -13,7 +13,7 @@ function Base.show(io::IO, m::MIME"application/json", s::FileAudioSource)
print(io, repeat(" ", indent), "}") print(io, repeat(" ", indent), "}")
end end
function Base.show(io::IO, m::MIME"application/json", s::URLAudioSource) function Base.show(io::IO, m::MIME"application/json", s::AudioSources.URLAudioSource)
compact = get(io, :compact, false) compact = get(io, :compact, false)
indent = get(io, :indent, 0) indent = get(io, :indent, 0)
printfn = compact ? print : println printfn = compact ? print : println
...@@ -23,7 +23,7 @@ function Base.show(io::IO, m::MIME"application/json", s::URLAudioSource) ...@@ -23,7 +23,7 @@ function Base.show(io::IO, m::MIME"application/json", s::URLAudioSource)
print(io, repeat(" ", indent), "}") print(io, repeat(" ", indent), "}")
end end
function Base.show(io::IO, m::MIME"application/json", s::CmdAudioSource) function Base.show(io::IO, m::MIME"application/json", s::AudioSources.CmdAudioSource)
compact = get(io, :compact, false) compact = get(io, :compact, false)
indent = get(io, :indent, 0) indent = get(io, :indent, 0)
printfn = compact ? print : println printfn = compact ? print : println
...@@ -78,11 +78,11 @@ end ...@@ -78,11 +78,11 @@ end
function AudioSource(d::Dict) function AudioSource(d::Dict)
if d["type"] == "path" if d["type"] == "path"
T = FileAudioSource T = AudioSources.FileAudioSource
elseif d["type"] == "url" elseif d["type"] == "url"
T = URLAudioSource T = AudioSources.URLAudioSource
elseif d["type"] == "cmd" elseif d["type"] == "cmd"
T = CmdAudioSource T = AudioSources.CmdAudioSource
else else
throw(ArgumentError("invalid type: $(d["type"])")) throw(ArgumentError("invalid type: $(d["type"])"))
end end
...@@ -116,7 +116,7 @@ function writemanifest(io::IO, manifest::Dict) ...@@ -116,7 +116,7 @@ function writemanifest(io::IO, manifest::Dict)
end end
function readmanifest(io::IO, T) function readmanifest(io::IO, T)
manifest = Dict() manifest = Dict{AbstractString, T}()
for line in eachline(io) for line in eachline(io)
item = JSON.parse(line) |> T item = JSON.parse(line) |> T
manifest[item.id] = item manifest[item.id] = item
...@@ -129,12 +129,12 @@ manifestname(::Type{<:Recording}, name) = "recordings.jsonl" ...@@ -129,12 +129,12 @@ manifestname(::Type{<:Recording}, name) = "recordings.jsonl"
manifestname(::Type{<:Annotation}, name) = "annotations-$name.jsonl" manifestname(::Type{<:Annotation}, name) = "annotations-$name.jsonl"
""" """
load(Annotation, path) load_manifest(Annotation, path)
load(Recording, path) load_manifest(Recording, path)
Load Recording/Annotation manifest from `path`. Load Recording/Annotation manifest from `path`.
""" """
load(T::Type{<:Union{Recording, Annotation}}, path) = open(f -> readmanifest(f, T), path, "r") load_manifest(T::Type{<:Union{Recording, Annotation}}, path) = open(f -> readmanifest(f, T), path, "r")
function checkdir(dir::AbstractString) function checkdir(dir::AbstractString)
isdir(dir) || throw(ArgumentError("$dir is not an existing directory")) isdir(dir) || throw(ArgumentError("$dir is not an existing directory"))
......
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
""" """
abstract type ManifestItem end abstract type ManifestItem end
...@@ -26,18 +26,18 @@ If the channels or the sample rate are not provided then they will be ...@@ -26,18 +26,18 @@ If the channels or the sample rate are not provided then they will be
read from `source`. read from `source`.
!!! warning !!! warning
When preparing large corpus, not providing the channes and/or the When preparing large corpus, not providing the channels and/or the
sample rate can drastically reduce the speed as it forces to read sample rate can drastically reduce the speed as it forces to read
source. source.
""" """
struct Recording{Ts<:AbstractAudioSource} <: ManifestItem struct Recording{Ts<:AudioSources.AbstractAudioSource} <: ManifestItem
id::AbstractString id::AbstractString
source::Ts source::Ts
channels::Vector{Int} channels::Vector{Int}
samplerate::Int samplerate::Int
end end
function Recording(uttid, s::AbstractAudioSource; channels = missing, samplerate = missing) function Recording(uttid, s::AudioSources.AbstractAudioSource; channels = missing, samplerate = missing)
if ismissing(channels) || ismissing(samplerate) if ismissing(channels) || ismissing(samplerate)
x, sr = loadaudio(s) x, sr = loadaudio(s)
samplerate = ismissing(samplerate) ? Int(sr) : samplerate samplerate = ismissing(samplerate) ? Int(sr) : samplerate
...@@ -83,19 +83,16 @@ Annotation(id, recid; channels = missing, start = -1, duration = -1, data = miss ...@@ -83,19 +83,16 @@ Annotation(id, recid; channels = missing, start = -1, duration = -1, data = miss
""" """
load(recording[; start = -1, duration = -1, channels = recording.channels]) load(recording::Recording [; start = -1, duration = -1, channels = recording.channels])
load(recording, annotation) load(recording, annotation)
Load the signal from a recording. `start`, `duration` (in seconds) can Load the signal from a recording. `start`, `duration` (in seconds)
be used to load only a segment. If an `annotation` is given, function
will return on the portion of the signal corresponding to the
annotation segment.
The function returns a tuple `(x, sr)` where `x` is a ``NxC`` array The function returns a tuple `(x, sr)` where `x` is a ``N×C`` array
- ``N`` is the length of the signal and ``C`` is the number of channels - ``N`` is the length of the signal and ``C`` is the number of channels
- and `sr` is the sampling rate of the signal. - and `sr` is the sampling rate of the signal.
""" """
function load(r::Recording; start = -1, duration = -1, channels = r.channels) function AudioSources.load(r::Recording; start = -1, duration = -1, channels = r.channels)
if start >= 0 && duration >= 0 if start >= 0 && duration >= 0
s = Int(floor(start * r.samplerate + 1)) s = Int(floor(start * r.samplerate + 1))
e = Int(ceil(duration * r.samplerate)) e = Int(ceil(duration * r.samplerate))
...@@ -104,9 +101,14 @@ function load(r::Recording; start = -1, duration = -1, channels = r.channels) ...@@ -104,9 +101,14 @@ function load(r::Recording; start = -1, duration = -1, channels = r.channels)
subrange = (:) subrange = (:)
end end
x, sr = loadaudio(r.source, subrange) AudioSources.load(r.source, true, subrange=subrange, ch=channels)
x[:,channels], sr
end end
load(r::Recording, a::Annotation) = load(r; start = a.start, duration = a.duration, channels = a.channels) """
load(r::Recording, a::Annotation)
load(t::Tuple{Recording, Annotation})
Load only a segment of the recording referenced in the annotation.
"""
AudioSources.load(r::Recording, a::Annotation) = AudioSources.load(r; start = a.start, duration = a.duration, channels = a.channels)
AudioSources.load(t::Tuple{Recording, Annotation}) = AudioSources.load(t[1], t[2])
# SPDX-License-Identifier: CECILL-2.1
"""
abstract type SpeechCorpus end
Abstract type for all speech corpora.
"""
abstract type SpeechCorpus end
"""
lang(corpus)
Return the ISO 639-3 code of the language of the corpus.
"""
lang
"""
name(corpus)
Return the name identifier of the corpus.
"""
name
"""
download(corpus, rootdir)
Download the data of the corpus to `dir`.
"""
Base.download
"""
prepare(corpus, rootdir)
Prepare the manifests of corpus.
"""
prepare