Skip to content
Snippets Groups Projects
Verified Commit 76869121 authored by Lucas Ondel Yang's avatar Lucas Ondel Yang
Browse files

version 0.11

parent 13bf3ced
No related branches found
No related tags found
No related merge requests found
# Tags
## [0.11.0](https://gitlab.lisn.upsaclay.fr/fast/speechcorpora.jl/-/tags/v0.11.0) - 21/05/2024
### Added
- filtering speech dataset based on recording id.
### Improved
- Faster TIMIT preparation
## [0.10.0](https://gitlab.lisn.upsaclay.fr/fast/speechcorpora.jl/-/tags/v0.10.0) - 22/02/2024
### Added
- extract alignments from TIMIT
......
name = "SpeechDatasets"
uuid = "ae813453-fab8-46d9-ab8f-a64c05464021"
authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>"]
version = "0.10.0"
version = "0.11.0"
[deps]
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
......
......@@ -232,23 +232,71 @@ function timit_prepare(timitdir, dir; audio_fmt="SPHERE")
dir = mkpath(dir)
rm(joinpath(dir, "recordings.jsonl"), force=true)
for (subset, subdir) in [("train", "train"), ("dev", "train"), ("test", "test")]
sdir = joinpath(timitdir, subdir)
# Recordings
manifestpath = joinpath(dir, "recordings.jsonl")
@debug "preparing $manifestpath"
recordings = timit_recordings(sdir; fmt=audio_fmt)
open(manifestpath, "a") do f
writemanifest(f, recordings)
## Recordings
@info "Extracting recordings from $timitdir/train"
train_recordings = timit_recordings(joinpath(timitdir, "train"); fmt=audio_fmt)
# We extract the name of speakers that are not in the dev set
TIMIT_TRAIN_SPK_LIST = Set()
for id in keys(train_recordings)
_, spk, _ = split(id, "_")
if spk ∉ TIMIT_DEV_SPK_LIST
push!(TIMIT_TRAIN_SPK_LIST, spk)
end
end
@info "Extracting recordings from $timitdir/test"
test_recordings = timit_recordings(joinpath(timitdir, "test"); fmt=audio_fmt)
recordings = merge(train_recordings, test_recordings)
manifestpath = joinpath(dir, "recordings.jsonl")
open(manifestpath, "a") do f
writemanifest(f, recordings)
end
# Annotations
@info "Extracting annotations from $timitdir/train"
train_annotations = timit_annotations(joinpath(timitdir, "train"))
@info "Extracting annotations from $timitdir/test"
test_annotations = timit_annotations(joinpath(timitdir, "test"))
annotations = merge(train_annotations, test_annotations)
train_annotations = filter(annotations) do (k, v)
stype = v.data["sentence type"]
spk = v.data["speaker"]
(
(stype == "compact" || stype == "diverse") &&
spk ∈ TIMIT_TRAIN_SPK_LIST
)
end
dev_annotations = filter(annotations) do (k, v)
stype = v.data["sentence type"]
spk = v.data["speaker"]
(
(stype == "compact" || stype == "diverse") &&
spk ∈ TIMIT_DEV_SPK_LIST
)
end
test_annotations = filter(annotations) do (k, v)
stype = v.data["sentence type"]
spk = v.data["speaker"]
(
(stype == "compact" || stype == "diverse") &&
spk ∈ TIMIT_TEST_SPK_LIST
)
end
for (x, y) in ("train" => train_annotations,
"dev" => dev_annotations,
"test" => test_annotations)
manifestpath = joinpath(dir, "annotations-$(x).jsonl")
@info "Creating $manifestpath"
# Annotations
manifestpath = joinpath(dir, "annotations-$(subset).jsonl")
@debug "Preparing $manifestpath"
annotations = timit_annotations(sdir)
open(manifestpath, "w") do f
writemanifest(f, annotations)
writemanifest(f, y)
end
end
end
......@@ -289,41 +337,48 @@ function timit_annotations(dir)
splitline(line) = rsplit(line, limit=3)
annotations = Dict()
tuple_ids = String[]
processed = Set()
for (root, subdirs, files) in walkdir(dir)
for file in files
name, ext = splitext(file)
_, dialect, spk = rsplit(root, "/", limit=3)
idtuple = join((dialect, spk, name), "/")
# Annotation files already processed (".wrd" and ".phn")
if (idtuple in tuple_ids) && continue end
push!(tuple_ids, idtuple)
words = name * ".wrd"
phones = name * ".phn"
wpath = joinpath(root, words)
ppath = joinpath(root, phones)
id = "timit_$(spk)_$(name)"
# Words
wlines = map(splitline, eachline(wpath))
starts, ends, words = zip(wlines...)
start_ts = parse(Int, first(starts)) / 16000
end_ts = parse(Int, last(ends)) / 16000
dur = end_ts - start_ts
# Annotation files already processed (".wrd" and ".phn")
idtuple = (dialect, spk, name)
(idtuple in processed) && continue
push!(processed, (dialect, spk, name))
# Words
wpath = joinpath(root, name * ".wrd")
words = [last(split(line)) for line in eachline(wpath)]
# Phones
plines = map(splitline, eachline(ppath))
pstarts, pends, phones = zip(plines...)
palign = [(parse(Int, s), parse(Int, e), TIMIT_PHONE_MAP48[p]) for (s, e, p) in zip(pstarts, pends, phones)]
ppath = joinpath(root, name * ".phn")
palign = Tuple{Int,Int,String}[]
for line in eachline(ppath)
t0, t1, p = split(line)
push!(palign, (parse(Int, t0), parse(Int, t1), String(p)))
end
sentence_type = if startswith(name, "sa")
"dialect"
elseif startswith(name, "sx")
"compact"
else # startswith(name, "si")
"diverse"
end
id = "timit_$(spk)_$(name)"
annotations[id] = Annotation(
id, id, start_ts, dur, [1],
id, # recording id and annotation id are the same since we have
id, # a one-to-one mapping
-1, # start and duration is -1 means that we take the whole
-1, # recording
[1], # only 1 channel (mono recording)
Dict(
"text" => join(words, " "),
"sentence type" => sentence_type,
"alignment" => palign,
"dialect" => dialect,
"speaker" => spk,
......
......@@ -57,3 +57,10 @@ Base.getindex(d::SpeechDataset, idxs::AbstractVector) = map(Base.Fix1(getindex,
# The size of a dataset is the number of utterance ids it indexes.
function Base.length(d::SpeechDataset)
    return length(d.idxs)
end
"""
    Base.filter(fn, d::SpeechDataset)

Return a new `SpeechDataset` restricted to the ids for which `fn(id)` is
`true`. The annotation and recording tables are filtered by their keys with
the same predicate, and the resulting dataset is tagged `:custom`.
"""
function Base.filter(fn, d::SpeechDataset)
    kept_ids = filter(fn, d.idxs)
    # Keep only the (key => value) entries whose key satisfies the predicate.
    kept_annotations = filter(pair -> fn(pair.first), d.annotations)
    kept_recordings = filter(pair -> fn(pair.first), d.recordings)
    return SpeechDataset(kept_ids, kept_annotations, kept_recordings, :custom)
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment