# SPDX-License-Identifier: CECILL-C

#######################################################################

# Names of the subsets handled by this file (each name maps to itself).
const TIMIT_SUBSETS = Dict(
    "train" => "train",
    "dev" => "dev",
    "test" => "test"
)

# Speakers whose utterances `timit_prepare` writes to the "dev" manifest.
const TIMIT_DEV_SPK_LIST = Set([
    "faks0", "fdac1", "fjem0", "mgwt0", "mjar0", "mmdb1", "mmdm2", "mpdf0",
    "fcmh0", "fkms0", "mbdg0", "mbwm0", "mcsh0", "fadg0", "fdms0", "fedw0",
    "mgjf0", "mglb0", "mrtk0", "mtaa0", "mtdt0", "mthc0", "mwjg0", "fnmr0",
    "frew0", "fsem0", "mbns0", "mmjr0", "mdls0", "mdlf0", "mdvc0", "mers0",
    "fmah0", "fdrw0", "mrcs0", "mrjm4", "fcal1", "mmwh0", "fjsj0", "majc0",
    "mjsw0", "mreb0", "fgjd0", "fjmg0", "mroa0", "mteb0", "mjfc0", "mrjr0",
    "fmml0", "mrws1"
])

# Speakers whose utterances `timit_prepare` writes to the "test" manifest.
const TIMIT_TEST_SPK_LIST = Set([
    "mdab0", "mwbt0", "felc0", "mtas1", "mwew0", "fpas0", "mjmp0", "mlnt0",
    "fpkt0", "mlll0", "mtls0", "fjlm0", "mbpm0", "mklt0", "fnlp0", "mcmj0",
    "mjdh0", "fmgd0", "mgrt0", "mnjm0", "fdhc0", "mjln0", "mpam0", "fmld0"
])

# Phone folding table: keys are the phone labels found in the alignment
# files, values are the labels they are folded into. "q" is mapped to the
# empty string. (Presumably the 48-phone set, going by the name.)
const TIMIT_PHONE_MAP48 = Dict(
    "aa" => "aa",
    "ae" => "ae",
    "ah" => "ah",
    "ao" => "ao",
    "aw" => "aw",
    "ax" => "ax",
    "ax-h" => "ax",
    "axr" => "er",
    "ay" => "ay",
    "b" => "b",
    "bcl" => "vcl",
    "ch" => "ch",
    "d" => "d",
    "dcl" => "vcl",
    "dh" => "dh",
    "dx" => "dx",
    "eh" => "eh",
    "el" => "el",
    "em" => "m",
    "en" => "en",
    "eng" => "ng",
    "epi" => "epi",
    "er" => "er",
    "ey" => "ey",
    "f" => "f",
    "g" => "g",
    "gcl" => "vcl",
    "h#" => "sil",
    "hh" => "hh",
    "hv" => "hh",
    "ih" => "ih",
    "ix" => "ix",
    "iy" => "iy",
    "jh" => "jh",
    "k" => "k",
    "kcl" => "cl",
    "l" => "l",
    "m" => "m",
    "n" => "n",
    "ng" => "ng",
    "nx" => "n",
    "ow" => "ow",
    "oy" => "oy",
    "p" => "p",
    "pau" => "sil",
    "pcl" => "cl",
    "q" => "",
    "r" => "r",
    "s" => "s",
    "sh" => "sh",
    "t" => "t",
    "tcl" => "cl",
    "th" => "th",
    "uh" => "uh",
    "uw" => "uw",
    "ux" => "uw",
    "v" => "v",
    "w" => "w",
    "y" => "y",
    "z" => "z",
    "zh" => "zh"
)

# Phone folding table with the same keys as `TIMIT_PHONE_MAP48` but a coarser
# target set. "q" is mapped to the empty string. (Presumably the 39-phone
# set, going by the name.)
const TIMIT_PHONE_MAP39 = Dict(
    "aa" => "aa",
    "ae" => "ae",
    "ah" => "ah",
    "ao" => "aa",
    "aw" => "aw",
    "ax" => "ah",
    "ax-h" => "ah",
    "axr" => "er",
    "ay" => "ay",
    "b" => "b",
    "bcl" => "sil",
    "ch" => "ch",
    "d" => "d",
    "dcl" => "sil",
    "dh" => "dh",
    "dx" => "dx",
    "eh" => "eh",
    "el" => "l",
    "em" => "m",
    "en" => "n",
    "eng" => "ng",
    "epi" => "sil",
    "er" => "er",
    "ey" => "ey",
    "f" => "f",
    "g" => "g",
    "gcl" => "sil",
    "h#" => "sil",
    "hh" => "hh",
    "hv" => "hh",
    "ih" => "ih",
    "ix" => "ih",
    "iy" => "iy",
    "jh" => "jh",
    "k" => "k",
    "kcl" => "sil",
    "l" => "l",
    "m" => "m",
    "n" => "n",
    "ng" => "ng",
    "nx" => "n",
    "ow" => "ow",
    "oy" => "oy",
    "p" => "p",
    "pau" => "sil",
    "pcl" => "sil",
    "q" => "",
    "r" => "r",
    "s" => "s",
    "sh" => "sh",
    "t" => "t",
    "tcl" => "sil",
    "th" => "th",
    "uh" => "uh",
    "uw" => "uw",
    "ux" => "uw",
    "v" => "v",
    "w" => "w",
    "y" => "y",
    "z" => "z",
    "zh" => "sh"
)

#######################################################################

# Keep the "compact" and "diverse" utterances of `annotations` whose speaker
# belongs to `speakers` ("dialect" utterances are always dropped).
function _timit_select(annotations, speakers)
    selected = filter(annotations) do (id, annotation)
        stype = annotation.data["sentence type"]
        spk = annotation.data["speaker"]
        (stype == "compact" || stype == "diverse") && spk ∈ speakers
    end
    return selected
end

"""
    timit_prepare(timitdir, dir, formantsdir=nothing; audio_fmt="SPHERE")

Build the manifests of the corpus stored in `timitdir` and write them to `dir`.

The recordings and annotations found under `timitdir/train` and
`timitdir/test` are merged. All the recordings are written to
`recordings.jsonl`. The annotations are written to `annotations-train.jsonl`,
`annotations-dev.jsonl` and `annotations-test.jsonl`: only the "compact" and
"diverse" sentences are kept, and they are dispatched by speaker. The dev and
test speakers are given by `TIMIT_DEV_SPK_LIST` and `TIMIT_TEST_SPK_LIST`;
the train speakers are the speakers found under `timitdir/train` that are not
in the dev list.

# Arguments
- `timitdir`: root directory of the corpus.
- `dir`: output directory (created if needed).
- `formantsdir`: optional directory forwarded to `timit_annotations`.

# Keywords
- `audio_fmt`: forwarded to `timit_recordings` as `fmt`.

# Throws
- `ArgumentError` if `timitdir` or a non-`nothing` `formantsdir` is not a
  directory.
"""
function timit_prepare(timitdir, dir, formantsdir=nothing; audio_fmt="SPHERE")
    # Validate the data directory
    isdir(timitdir) || throw(ArgumentError("invalid path $(timitdir)"))
    if formantsdir !== nothing
        isdir(formantsdir) || throw(ArgumentError("invalid path $(formantsdir)"))
    end

    # Create the output directory and drop any previous recordings manifest.
    dir = mkpath(dir)
    rm(joinpath(dir, "recordings.jsonl"), force=true)

    ## Recordings
    @info "Extracting recordings from $timitdir/train"
    train_recordings = timit_recordings(joinpath(timitdir, "train"); fmt=audio_fmt)

    # Training speakers: every speaker of the train directory that is not
    # part of the dev set. Recording ids have the form "timit_<spk>_<name>".
    train_speakers = Set{String}()
    for id in keys(train_recordings)
        _, spk, _ = split(id, "_")
        if spk ∉ TIMIT_DEV_SPK_LIST
            push!(train_speakers, spk)
        end
    end

    @info "Extracting recordings from $timitdir/test"
    test_recordings = timit_recordings(joinpath(timitdir, "test"); fmt=audio_fmt)

    recordings = merge(train_recordings, test_recordings)

    manifestpath = joinpath(dir, "recordings.jsonl")
    open(manifestpath, "a") do f
        writemanifest(f, recordings)
    end

    # Annotations
    @info "Extracting annotations from $timitdir/train"
    train_annotations = timit_annotations(joinpath(timitdir, "train"), formantsdir)

    @info "Extracting annotations from $timitdir/test"
    test_annotations = timit_annotations(joinpath(timitdir, "test"), formantsdir)

    annotations = merge(train_annotations, test_annotations)

    subsets = (
        "train" => _timit_select(annotations, train_speakers),
        "dev" => _timit_select(annotations, TIMIT_DEV_SPK_LIST),
        "test" => _timit_select(annotations, TIMIT_TEST_SPK_LIST),
    )

    for (x, y) in subsets
        manifestpath = joinpath(dir, "annotations-$(x).jsonl")
        @info "Creating $manifestpath"
        open(manifestpath, "w") do f
            writemanifest(f, y)
        end
    end

    return nothing
end

"""
    timit_recordings(dir; fmt="SPHERE")

Walk `dir` recursively and return a dictionary mapping a recording id to a
`Recording` for every file whose extension is `.wav`.

The id is `"timit_<spk>_<name>"` where `<spk>` is the name of the directory
containing the file and `<name>` is the file name without its extension. When
`fmt == "SPHERE"` the audio is read through the `sph2pipe` command, otherwise
the file is used directly. Each recording is declared with channel `1` and a
sample rate of `16000`.

# Throws
- `ArgumentError` if `dir` is not a directory.
"""
function timit_recordings(dir::AbstractString; fmt="SPHERE")
    isdir(dir) || throw(ArgumentError("expected directory $dir"))

    recordings = Dict()

    for (root, subdirs, files) in walkdir(dir)
        # The speaker is the directory holding the audio files.
        spk = basename(root)

        for file in files
            name, ext = splitext(file)
            ext != ".wav" && continue

            path = joinpath(root, file)
            id = "timit_$(spk)_$(name)"

            audio_src = if fmt == "SPHERE"
                CmdAudioSource(`sph2pipe -f wav $path`)
            else
                FileAudioSource(path)
            end

            recordings[id] = Recording(
                id,
                audio_src;
                channels = [1],
                samplerate = 16000
            )
        end
    end

    return recordings
end

# Return `(dialect, speaker)`: the names of the two innermost directories of
# `root`. Uses the path API rather than splitting on "/" so that it does not
# depend on the platform separator.
_timit_dialect_speaker(root) = (basename(dirname(root)), basename(root))

# Return the last whitespace-separated field of every non-blank line of `path`.
function _timit_read_words(path)
    return [last(split(line)) for line in eachline(path) if !isempty(strip(line))]
end

# Read "<t0> <t1> <phone>" lines from `path`; blank lines are skipped.
function _timit_read_phones(path)
    palign = Tuple{Int,Int,String}[]
    for line in eachline(path)
        isempty(strip(line)) && continue
        t0, t1, p = split(line)
        push!(palign, (parse(Int, t0), parse(Int, t1), String(p)))
    end
    return palign
end

# Read "<t0> <t1> <phone> <f1> <f2> <f3> <f4> <b1> <b2> <b3> <b4>" lines from
# `path`; blank lines are skipped. Each entry stores the phone boundaries,
# the phone label and the four (f_i, b_i) pairs.
function _timit_read_formants(path)
    palign = Tuple{
        Int,
        Int,
        String,
        NTuple{2,Float32},
        NTuple{2,Float32},
        NTuple{2,Float32},
        NTuple{2,Float32}
    }[]
    for line in eachline(path)
        isempty(strip(line)) && continue
        t0, t1, p, f1, f2, f3, f4, b1, b2, b3, b4 = split(line)
        push!(palign, (
            parse(Int, t0),
            parse(Int, t1),
            String(p),
            (parse(Float32, f1), parse(Float32, b1)),
            (parse(Float32, f2), parse(Float32, b2)),
            (parse(Float32, f3), parse(Float32, b3)),
            (parse(Float32, f4), parse(Float32, b4))
        ))
    end
    return palign
end

# Sentence type derived from the utterance name prefix: "sa" -> "dialect",
# "sx" -> "compact", anything else -> "diverse".
function _timit_sentence_type(name)
    startswith(name, "sa") && return "dialect"
    startswith(name, "sx") && return "compact"
    return "diverse" # startswith(name, "si")
end

"""
    timit_annotations(dir, formantsdir=nothing)

Walk `dir` recursively and return a dictionary mapping an utterance id to its
`Annotation`.

Every distinct `(dialect, speaker, name)` triple is processed once, where
`speaker` and `dialect` are the names of the directory containing the file and
of its parent, and `name` is the file name without extension. For each triple
the words are read from `<name>.wrd` and the phone alignment from `<name>.phn`.
If `formantsdir` is given and the file
`<formantsdir>/<basename of dir>/<dialect>/<speaker>/<name>.ft` exists, the
alignment is read from that file instead and carries four additional pairs of
values per phone. Blank lines in these files are ignored.

The annotation data has the keys `"text"`, `"sentence type"`, `"alignment"`,
`"dialect"`, `"speaker"` and `"sex"` (first character of the speaker name).

# Throws
- `ArgumentError` if `dir`, or the sub-directory of `formantsdir` named after
  the last component of `dir`, is not a directory.
"""
function timit_annotations(dir, formantsdir=nothing)
    isdir(dir) || throw(ArgumentError("expected directory $dir"))

    if formantsdir !== nothing
        ddir = last(splitdir(dir))
        formantsdir = joinpath(formantsdir, ddir)
        isdir(formantsdir) || throw(ArgumentError("expected directory $formantsdir"))
    end

    annotations = Dict()
    processed = Set{NTuple{3,String}}()

    for (root, subdirs, files) in walkdir(dir)
        # Same for every file of the directory: compute it once.
        dialect, spk = _timit_dialect_speaker(root)

        for file in files
            name, _ = splitext(file)

            # Annotation files already processed (".wrd" and ".phn")
            idtuple = (dialect, spk, name)
            (idtuple in processed) && continue
            push!(processed, idtuple)

            # Words
            wpath = joinpath(root, name * ".wrd")
            words = _timit_read_words(wpath)

            # Phones
            ppath = joinpath(root, name * ".phn")
            forpath = if formantsdir !== nothing
                joinpath(formantsdir, dialect, spk, name * ".ft")
            else
                ""
            end

            # Use the formant values when they are available, otherwise fall
            # back to the plain phone alignment.
            palign = if isfile(forpath)
                _timit_read_formants(forpath)
            else
                _timit_read_phones(ppath)
            end

            sentence_type = _timit_sentence_type(name)

            id = "timit_$(spk)_$(name)"
            annotations[id] = Annotation(
                id,  # recording id and annotation id are the same since we have
                id,  # a one-to-one mapping
                -1,  # start and duration is -1 means that we take the whole
                -1,  # recording
                [1], # only 1 channel (mono recording)
                Dict(
                    "text" => join(words, " "),
                    "sentence type" => sentence_type,
                    "alignment" => palign,
                    "dialect" => dialect,
                    "speaker" => spk,
                    "sex" => string(first(spk)),
                )
            )
        end
    end

    return annotations
end

"""
    TIMIT(timitdir, dir, subset, formantsdir=nothing)

Return `dataset(dir, subset)`, calling `timit_prepare(timitdir, dir,
formantsdir)` first unless `recordings.jsonl`, `annotations-train.jsonl`,
`annotations-dev.jsonl` and `annotations-test.jsonl` all already exist in
`dir`.
"""
function TIMIT(timitdir, dir, subset, formantsdir=nothing)
    manifests = (
        "recordings.jsonl",
        "annotations-train.jsonl",
        "annotations-dev.jsonl",
        "annotations-test.jsonl",
    )

    if !all(m -> isfile(joinpath(dir, m)), manifests)
        timit_prepare(timitdir, dir, formantsdir)
    end

    return dataset(dir, subset)
end