Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • fast/speechdatasets.jl
  • PTAL/Datasets/SpeechDatasets.jl
2 results
Show changes
Commits on Source (30)
*outputdir/
Manifest.toml
notebook-test.jl
# Tags
## [0.15.0](https://gitlab.lisn.upsaclay.fr/fast/speechdatasets.jl/-/tags/v0.15.0) - 19/06/2024
### Changed
- Added support for Speech2Tex dataset
## [0.14.0](https://gitlab.lisn.upsaclay.fr/fast/speechdatasets.jl/-/tags/v0.14.0) - 11/06/2024
### Changed
- Added support for AVID dataset
## [0.13.0](https://gitlab.lisn.upsaclay.fr/fast/speechdatasets.jl/-/tags/v0.13.0) - 10/06/2024
### Changed
- Added support for INA Diachrony dataset
### Fixed
- Fixed Minilibrispeech data prep
## [0.12.0](https://gitlab.lisn.upsaclay.fr/fast/speechcorpora.jl/-/tags/v0.12.0) - 21/05/2024
### Changed
- `SpeechDataset` is a collection of tuple of `Recording` and `Annotation`.
## [0.11.0](https://gitlab.lisn.upsaclay.fr/fast/speechcorpora.jl/-/tags/v0.11.0) - 21/05/2024
### Added
- filtering speech dataset based on recording id.
### Improved
- Faster TIMIT preparation
## [0.10.0](https://gitlab.lisn.upsaclay.fr/fast/speechcorpora.jl/-/tags/v0.10.0) - 22/02/2024
### Added
- extract alignments from TIMIT
### Changed
- `Supervision` is now `Annotation`
## [0.9.4](https://gitlab.lisn.upsaclay.fr/fast/speechcorpora.jl/-/tags/v0.9.4) - 22/02/2024
### Fixed
- TIMIT data preparation
## [0.9.3](https://gitlab.lisn.upsaclay.fr/fast/speechcorpora.jl/-/tags/v0.9.3) - 12/02/2024
### Fixed
......
name = "SpeechDatasets"
uuid = "ae813453-fab8-46d9-ab8f-a64c05464021"
authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>"]
version = "0.9.3"
authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>",
"Simon DEVAUCHELLE <simon.devauchelle@universite-paris-saclay.fr>",
"Nicolas DENIER <nicolas.denier@lisn.fr>"]
version = "0.15.0"
[deps]
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
......@@ -9,5 +11,7 @@ MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
[compat]
julia = "1.10"
JSON = "0.21"
julia = "1.8"
SpeechFeatures = "0.8"
# SpeechCorpora.jl
# SpeechDatasets.jl
A Julia package to download and prepare speech corpora.
......@@ -7,7 +7,7 @@ A Julia package to download and prepare speech corpus.
Make sure to add the [FAST registry](https://gitlab.lisn.upsaclay.fr/fast/registry)
to your julia installation. Then, install the package as usual:
```
pkg> add SpeechCorpora
pkg> add SpeechDatasets
```
## Example
......@@ -18,9 +18,19 @@ julia> using SpeechDatasets
julia> dataset = MINILIBRISPEECH("outputdir", :train) # :dev | :test
...
julia> dataset = MINILIBRISPEECH("/path/to/timit/dir", "outputdir", :train) # :dev | :test
julia> dataset = TIMIT("/path/to/timit/dir", "outputdir", :train) # :dev | :test
...
julia> dataset = INADIACHRONY("/path/to/ina_wav/dir", "outputdir", "/path/to/ina_csv/dir") # ina_csv dir optional
...
julia> dataset = AVID("/path/to/avid/dir", "outputdir")
...
julia> dataset = SPEECH2TEX("/path/to/speech2tex/dir", "outputdir")
...
julia> for ((signal, fs), supervision) in dataset
# do something
end
......@@ -36,5 +46,5 @@ julia> TIMITDICT("/path/to/timit/dir")
## License
This software is provided under the CeCILL 2.1 license (see the [`/LICENSE`](/LICENSE)
This software is provided under the CeCILL 2.1 license (see the [`/LICENSE`](/LICENSE))
......@@ -9,8 +9,7 @@ import MLUtils
export
# ManifestItem
Recording,
Supervision,
Alignment,
Annotation,
load,
# Manifest interface
......@@ -27,10 +26,14 @@ export
MultilingualLibriSpeech,
MINILIBRISPEECH,
TIMIT,
INADIACHRONY,
AVID,
SPEECH2TEX,
# Lexicon
CMUDICT,
TIMITDICT,
MFAFRDICT,
# Dataset
dataset
......@@ -41,9 +44,7 @@ include("manifest_io.jl")
include("dataset.jl")
# Supported corpora
include("corpora/multilingual_librispeech.jl")
include("corpora/mini_librispeech.jl")
include("corpora/timit.jl")
include.("corpora/".*filter(contains(r".jl$"), readdir("src/corpora/")))
include("lexicons.jl")
......
# SPDX-License-Identifier: CECILL-2.1
# Scan `dir` recursively and build a `Recording` manifest entry for every
# ".wav" file found. The manifest is keyed by the file name without its
# extension, which is also used as the recording id.
# NOTE(review): channels/sample rate are hard-coded to mono @ 16 kHz rather
# than read from the audio headers — confirm all AVID files match.
function avid_recordings(dir::AbstractString)
    checkdir(dir)
    recs = Dict()
    for (root, _, files) in walkdir(dir), file in files
        base, ext = splitext(file)
        ext == ".wav" || continue
        src = FileAudioSource(joinpath(root, file))
        recs[base] = Recording(base, src; channels = [1], samplerate = 16000)
    end
    recs
end
# Read the AVID metadata CSV files of `dir` into memory.
# Returns a Dict mapping the task code ('s' for sentences, 'p' for
# paragraphs) to the raw lines of the corresponding
# "Metadata_with_labels_{SENT,PARA}.csv" file.
function load_metadata_files(dir::AbstractString)
    tasks = ('s' => "SENT", 'p' => "PARA")
    return Dict(
        code => readlines(joinpath(dir, "Metadata_with_labels_$(name).csv"))
        for (code, name) in tasks
    )
end
# Look up the metadata row matching `filename` in the in-memory CSV lines
# (as returned by `load_metadata_files`) and return it as a
# header => value Dict.
# The task code is the first character of the third "_"-separated field of
# the file name (e.g. "xx_yy_s1" -> 's').
function get_metadata(filename, metadatadict)
    taskcode = first(split(filename, "_")[3])
    lines = metadatadict[taskcode]
    headers = split(first(lines), ",")
    # First CSV line whose text mentions the file name.
    rowidx = findfirst(l -> contains(l, filename), lines)
    row = split(lines[rowidx], ",")
    return Dict(headers[i] => row[i] for i in eachindex(headers))
end
# Build an `Annotation` manifest for every ".wav" under `dir`, attaching
# the matching metadata row from the AVID CSV files as the annotation data.
function avid_annotations(dir)
    checkdir(dir)
    anns = Dict()
    csvlines = load_metadata_files(dir)
    for (root, _, files) in walkdir(dir), file in files
        base, ext = splitext(file)
        ext == ".wav" || continue
        # Metadata row for this audio file, taken from the CSV files.
        meta = get_metadata(base, csvlines)
        # Recording id and annotation id coincide (one-to-one mapping);
        # start/duration of -1 mean the whole recording; files are mono.
        anns[base] = Annotation(base, base, -1, -1, [1], meta)
    end
    anns
end
# Download and extract the AVID dataset (~9.9 GB) into `dir`.
# Returns the path of the extracted "AVID" directory.
# NOTE: relies on the external `wget` and `unzip` commands being available.
function download_avid(dir)
    @info "Directory $dir not found.\nDownloading AVID dataset (9.9 GB)"
    url = "https://zenodo.org/records/10524873/files/AVID.zip?download=1"
    filepath = joinpath(dir, "AVID.zip")
    # Portable replacement for `run(`mkdir -p ...`)`.
    mkpath(dir)
    run(`wget $url -O $filepath`)
    @info "Download complete, extracting files"
    run(`unzip $filepath -d $dir`)
    rm(filepath)
    # Bug fix: the original returned `joinpath(datadir, "/AVID")` where
    # `datadir` is undefined (the parameter is `dir`), and the leading "/"
    # made joinpath discard the first component entirely.
    return joinpath(dir, "AVID")
end
# Prepare the AVID corpus: download it if `datadir` is missing, then write
# the recordings, calibration-tone and annotation manifests to `outputdir`.
function avid_prepare(datadir, outputdir)
    # Fetch the corpus when the data directory does not exist yet.
    isdir(datadir) || (datadir = download_avid(datadir))
    # Create the output directory and start from a clean recordings manifest.
    outputdir = mkpath(outputdir)
    rm(joinpath(outputdir, "recordings.jsonl"), force = true)
    # Main recordings.
    repo_path = joinpath(datadir, "Repository 2")
    @info "Extracting recordings from $repo_path"
    repo_recs = avid_recordings(repo_path)
    # Calibration tones.
    tones_path = joinpath(datadir, "Calibration_tones")
    @info "Extracting recordings from $tones_path"
    tone_recs = avid_recordings(tones_path)
    manifests = [
        joinpath(outputdir, "recordings.jsonl") => repo_recs,
        joinpath(outputdir, "calibration_tones.jsonl") => tone_recs,
    ]
    for (manifestpath, recs) in manifests
        open(manifestpath, "w") do f
            writemanifest(f, recs)
        end
    end
    # Annotations live alongside the main recordings.
    @info "Extracting annotations from $repo_path"
    anns = avid_annotations(repo_path)
    annpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $annpath"
    open(annpath, "w") do f
        writemanifest(f, anns)
    end
end
# Prepare (only when the manifests are missing) and load the AVID dataset
# from `outputdir`.
function AVID(datadir, outputdir)
    manifests = ("recordings.jsonl", "calibration_tones.jsonl", "annotations.jsonl")
    ready = all(m -> isfile(joinpath(outputdir, m)), manifests)
    ready || avid_prepare(datadir, outputdir)
    dataset(outputdir, "")
end
# SPDX-License-Identifier: CECILL-2.1
# Scan `dir` recursively and build a `Recording` manifest entry for every
# ".wav" file. Recording ids are the base file names prefixed with
# "ina_diachrony§".
# NOTE(review): channels/sample rate are hard-coded to mono @ 16 kHz — to
# be confirmed against the actual corpus files.
function ina_diachrony_recordings(dir::AbstractString)
    checkdir(dir)
    recs = Dict()
    for (root, _, files) in walkdir(dir), file in files
        base, ext = splitext(file)
        ext == ".wav" || continue
        recid = "ina_diachrony§$base"
        src = FileAudioSource(joinpath(root, file))
        recs[recid] = Recording(recid, src; channels = [1], samplerate = 16000)
    end
    recs
end
# Parse an INA Diachrony file name of the form
# "timeperiod§age_sex§speaker" into a metadata Dict with keys
# "timeperiod", "age", "sex" and "speaker".
function ina_diachrony_get_metadata(filename)
    timeperiod, age_sex, speaker = split(filename, "§")
    age, sex = split(age_sex, "_")
    return Dict(
        "speaker" => speaker,
        "timeperiod" => timeperiod,
        "age" => age,
        "sex" => sex,
    )
end
# Build one whole-recording `Annotation` per ".wav" file under `dir`.
# The transcription text is read from a ".txt" file with the same base
# name (empty string when absent); speaker/age/sex/timeperiod come from
# the file name itself.
function ina_diachrony_annotations_whole(dir)
    checkdir(dir)
    anns = Dict()
    for (root, _, files) in walkdir(dir), file in files
        base, ext = splitext(file)
        ext == ".wav" || continue
        meta = ina_diachrony_get_metadata(base)
        # Transcription lives next to the audio, same name with ".txt".
        txtpath = joinpath(root, "$base.txt")
        meta["text"] = isfile(txtpath) ? join(readlines(txtpath), "\n") : ""
        recid = "ina_diachrony§$base"
        annid = recid * "§0"
        # -1 start/duration means the annotation spans the whole (mono)
        # recording.
        anns[annid] = Annotation(recid, annid, -1, -1, [1], meta)
    end
    anns
end
# Build segment-level `Annotation`s from INA Diachrony CSV transcription
# files. Each CSV data row "start,end,text" yields one annotation whose id
# is "<recording id>§<line number>" (matching the "…§0" convention used by
# `ina_diachrony_annotations_whole`).
function ina_diachrony_annotations_csv(dir)
    checkdir(dir)
    annotations = Dict()
    for (root, subdirs, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".csv" && continue
            # extract metadata from filename
            metadata = ina_diachrony_get_metadata(filename)
            id = "ina_diachrony§$filename"
            # generate annotation for each line in csv
            open(joinpath(root, file)) do f
                header = readline(f)
                line = 1
                # read till end of file
                while ! eof(f)
                    current_line = readline(f)
                    start_time, end_time, text = split(current_line, ",", limit=3)
                    start_time = parse(Float64, start_time)
                    duration = parse(Float64, end_time) - start_time
                    # Bug fix: each annotation needs its own copy of the
                    # metadata; mutating the shared Dict would retroactively
                    # change the "text" of every annotation of this file.
                    linemeta = copy(metadata)
                    linemeta["text"] = text
                    # Bug fix: the original read `id*$line"` (a syntax error)
                    # and stored every segment under the same `id` key, so
                    # only the last CSV row survived.
                    annotation_id = id * "§$line"
                    annotations[annotation_id] = Annotation(
                        id,            # audio id
                        annotation_id, # annotation id
                        start_time,    # start (seconds)
                        duration,      # duration (seconds)
                        [1],           # only 1 channel (mono recording)
                        linemeta       # additional information
                    )
                    line += 1
                end
            end
        end
    end
    annotations
end
# Prepare the INA Diachrony corpus: write the recordings and annotations
# manifests to `outputdir`. `ina_csv_dir` may be `nothing`, in which case
# only whole-file annotations are produced.
function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    # Validate the provided data directories (the CSV one is optional).
    for d in (ina_wav_dir, ina_csv_dir)
        isnothing(d) || checkdir(d)
    end
    # Create the output directory and start from a clean recordings manifest.
    outputdir = mkpath(outputdir)
    rm(joinpath(outputdir, "recordings.jsonl"), force = true)
    # Recordings manifest.
    @info "Extracting recordings from $ina_wav_dir"
    recs = ina_diachrony_recordings(ina_wav_dir)
    open(joinpath(outputdir, "recordings.jsonl"), "w") do f
        writemanifest(f, recs)
    end
    # Whole-file annotations, optionally merged with CSV segment annotations.
    @info "Extracting annotations from $ina_wav_dir"
    anns = ina_diachrony_annotations_whole(ina_wav_dir)
    if !isnothing(ina_csv_dir)
        @info "Extracting annotations from $ina_csv_dir"
        anns = merge(anns, ina_diachrony_annotations_csv(ina_csv_dir))
    end
    annpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $annpath"
    open(annpath, "w") do f
        writemanifest(f, anns)
    end
end
# Prepare (only when the manifests are missing) and load the INA Diachrony
# dataset from `outputdir`.
function INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir = nothing)
    needed = ("recordings.jsonl", "annotations.jsonl")
    ready = all(m -> isfile(joinpath(outputdir, m)), needed)
    ready || ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    dataset(outputdir, "")
end
......@@ -42,7 +42,7 @@ function minils_recordings(dir, subset)
recs
end
function minils_supervisions(dir, subset)
function minils_annotations(dir, subset)
subsetdir = joinpath(dir, "LibriSpeech", MINILS_SUBSETS[subset])
sups = Dict()
for d1 in readdir(subsetdir; join = true)
......@@ -52,8 +52,8 @@ function minils_supervisions(dir, subset)
open(joinpath(d2, "$(k1)-$(k2).trans.txt"), "r") do f
for line in eachline(f)
tokens = split(line)
s = Supervision(
tokens[1], # supervision id
s = Annotation(
tokens[1], # annotation id
tokens[1]; # recording id
channels = [1],
data = Dict("text" => join(tokens[2:end], " "))
......@@ -98,12 +98,12 @@ function minils_prepare(dir)
end
end
# 2. Supervision manifests.
# 2. Annotation manifests.
for (subset, name) in [("train", "train"), ("dev", "dev"), ("dev", "test")]
out = joinpath(dir, "supervisions-$name.jsonl")
out = joinpath(dir, "annotations-$name.jsonl")
if ! isfile(out)
@debug "preparing supervision manifest ($subset) $out"
sups = minils_supervisions(dir, subset)
@debug "preparing annotation manifest ($subset) $out"
sups = minils_annotations(dir, subset)
open(out, "w") do f
writemanifest(f, sups)
end
......
......@@ -89,13 +89,13 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset)
recs
end
function supervisions(corpus::MultilingualLibriSpeech, dir, subset)
function annotations(corpus::MultilingualLibriSpeech, dir, subset)
trans = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "transcripts.txt")
sups = Dict()
open(trans, "r") do f
for line in eachline(f)
tokens = split(line)
s = Supervision(tokens[1], tokens[1]; channel = 1,
s = Annotation(tokens[1], tokens[1]; channel = 1,
data = Dict("text" => join(tokens[2:end], " ")))
sups[s.id] = s
end
......@@ -118,12 +118,12 @@ function prepare(corpus::MultilingualLibriSpeech, outdir)
end
end
# 2. Supervision manifests.
# 2. Annotation manifests.
for subset in ["train", "dev", "test"]
out = joinpath(dir, "supervision-manifest-$subset.jsonl")
@info "preparing supervision manifest ($subset) $out"
out = joinpath(dir, "annotation-manifest-$subset.jsonl")
@info "preparing annotation manifest ($subset) $out"
if ! isfile(out)
sups = supervisions(corpus, dir, subset)
sups = annotations(corpus, dir, subset)
open(out, "w") do f
writemanifest(f, sups)
end
......
# SPDX-License-Identifier: CECILL-2.1
# Scan `dir` recursively and build a `Recording` manifest entry for every
# ".wav" file, keyed by the file name without its extension.
# NOTE(review): channels/sample rate are hard-coded to mono @ 48 kHz — to
# be confirmed against the actual Speech2Tex audio files.
function speech2tex_recordings(dir::AbstractString)
    checkdir(dir)
    recs = Dict()
    for (root, _, files) in walkdir(dir), file in files
        base, ext = splitext(file)
        ext == ".wav" || continue
        src = FileAudioSource(joinpath(root, file))
        recs[base] = Recording(base, src; channels = [1], samplerate = 48000)
    end
    recs
end
# Keep only the decimal-digit characters of `str`.
extract_digits(str::AbstractString) = filter(isdigit, str)
# True when `str` consists exclusively of digits (vacuously true for "").
isnumber(str::AbstractString) = all(isdigit, str)
# Parse a Speech2Tex file name into line / speaker (/ part) metadata.
# Supported shapes: "line123_p1", "line123_124_p1", "line123_p1_part2"
# and (not observed but handled) "line123_124_p1_part2".
function speech2tex_get_metadata(filename)
    fields = split(filename, "_")
    metadata = Dict()
    if isnumber(fields[2])
        # Two-field line number, e.g. "line123_124".
        metadata["line"] = string(extract_digits(fields[1]), "_", fields[2])
        metadata["speaker"] = fields[3]
    else
        metadata["line"] = extract_digits(fields[1])
        metadata["speaker"] = fields[2]
    end
    # Optional trailing "partN" marker.
    occursin("part", fields[end]) && (metadata["part"] = extract_digits(fields[end]))
    metadata
end
# Build an `Annotation` for every ".wav" in `audiodir`, attaching the
# transcription and LaTeX text read from same-named ".txt" files in
# `transcriptiondir` and `texdir` (empty string when a file is missing).
function speech2tex_annotations(audiodir, transcriptiondir, texdir)
    checkdir.([audiodir, transcriptiondir, texdir])
    anns = Dict()
    # Where to look for each kind of companion text.
    textdirs = Dict(transcriptiondir => "transcription", texdir => "latex")
    for (root, _, files) in walkdir(audiodir), file in files
        base, ext = splitext(file)
        ext == ".wav" || continue
        meta = speech2tex_get_metadata(base)
        for (d, label) in textdirs
            txtpath = joinpath(d, "$base.txt")
            meta[label] = isfile(txtpath) ? join(readlines(txtpath), "\n") : ""
        end
        # Recording id and annotation id coincide; -1 start/duration means
        # the whole (mono) recording.
        anns[base] = Annotation(base, base, -1, -1, [1], meta)
    end
    anns
end
# Prepare the Speech2Tex corpus: write the recordings and annotations
# manifests to `outputdir`. Audio is read from `datadir/audio`,
# transcriptions from `datadir/sequences` and LaTeX from `datadir/latex`.
function speech2tex_prepare(datadir, outputdir)
    # Validate the data directory
    checkdir(datadir)
    # Create the output directory.
    outputdir = mkpath(outputdir)
    rm(joinpath(outputdir, "recordings.jsonl"), force=true)
    # Recordings
    # (Removed the dead `recordings = Array{Dict}(undef, 2)` left over from
    # the AVID preparation code — it was immediately overwritten below.)
    recordings_path = joinpath(datadir, "audio")
    @info "Extracting recordings from $recordings_path"
    recordings = speech2tex_recordings(recordings_path)
    manifestpath = joinpath(outputdir, "recordings.jsonl")
    open(manifestpath, "w") do f
        writemanifest(f, recordings)
    end
    # Annotations
    transcriptiondir = joinpath(datadir, "sequences")
    texdir = joinpath(datadir, "latex")
    @info "Extracting annotations from $transcriptiondir and $texdir"
    annotations = speech2tex_annotations(recordings_path, transcriptiondir, texdir)
    manifestpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, annotations)
    end
end
# Prepare (only when the manifests are missing) and load the Speech2Tex
# dataset from `outputdir`.
function SPEECH2TEX(datadir, outputdir)
    manifests = ("recordings.jsonl", "annotations.jsonl")
    ready = all(m -> isfile(joinpath(outputdir, m)), manifests)
    ready || speech2tex_prepare(datadir, outputdir)
    dataset(outputdir, "")
end
......@@ -232,31 +232,71 @@ function timit_prepare(timitdir, dir; audio_fmt="SPHERE")
dir = mkpath(dir)
rm(joinpath(dir, "recordings.jsonl"), force=true)
for (subset, subdir) in [("train", "train"), ("dev", "train"), ("test", "test")]
sdir = joinpath(timitdir, subdir)
# Recordings
manifestpath = joinpath(dir, "recordings.jsonl")
@debug "preparing $manifestpath"
recordings = timit_recordings(sdir; fmt=audio_fmt)
open(manifestpath, "a") do f
writemanifest(f, recordings)
## Recordings
@info "Extracting recordings from $timitdir/train"
train_recordings = timit_recordings(joinpath(timitdir, "train"); fmt=audio_fmt)
# We extract the name of speakers that are not in the dev set
TIMIT_TRAIN_SPK_LIST = Set()
for id in keys(train_recordings)
_, spk, _ = split(id, "_")
if spk TIMIT_DEV_SPK_LIST
push!(TIMIT_TRAIN_SPK_LIST, spk)
end
end
# Supervision
manifestpath = joinpath(dir, "supervisions-$(subset).jsonl")
@debug "Preparing $manifestpath"
supervisions = timit_supervisions(sdir)
open(manifestpath, "w") do f
writemanifest(f, supervisions)
end
@info "Extracting recordings from $timitdir/test"
test_recordings = timit_recordings(joinpath(timitdir, "test"); fmt=audio_fmt)
recordings = merge(train_recordings, test_recordings)
manifestpath = joinpath(dir, "recordings.jsonl")
open(manifestpath, "a") do f
writemanifest(f, recordings)
end
# Annotations
@info "Extracting annotations from $timitdir/train"
train_annotations = timit_annotations(joinpath(timitdir, "train"))
@info "Extracting annotations from $timitdir/test"
test_annotations = timit_annotations(joinpath(timitdir, "test"))
annotations = merge(train_annotations, test_annotations)
train_annotations = filter(annotations) do (k, v)
stype = v.data["sentence type"]
spk = v.data["speaker"]
(
(stype == "compact" || stype == "diverse") &&
spk TIMIT_TRAIN_SPK_LIST
)
end
dev_annotations = filter(annotations) do (k, v)
stype = v.data["sentence type"]
spk = v.data["speaker"]
(
(stype == "compact" || stype == "diverse") &&
spk TIMIT_DEV_SPK_LIST
)
end
test_annotations = filter(annotations) do (k, v)
stype = v.data["sentence type"]
spk = v.data["speaker"]
(
(stype == "compact" || stype == "diverse") &&
spk TIMIT_TEST_SPK_LIST
)
end
for (x, y) in ("train" => train_annotations,
"dev" => dev_annotations,
"test" => test_annotations)
manifestpath = joinpath(dir, "annotations-$(x).jsonl")
@info "Creating $manifestpath"
# Alignments
manifestpath = joinpath(dir, "alignments-$(subset).jsonl")
@debug "Preparing $manifestpath"
alignments = timit_alignments(sdir)
open(manifestpath, "w") do f
writemanifest(f, alignments)
writemanifest(f, y)
end
end
end
......@@ -292,30 +332,54 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE")
end
function timit_supervisions(dir)
function timit_annotations(dir)
! isdir(dir) && throw(ArgumentError("expected directory $dir"))
splitline(line) = rsplit(line, limit=3)
supervisions = Dict()
annotations = Dict()
processed = Set()
for (root, subdirs, files) in walkdir(dir)
for file in files
name, ext = splitext(file)
ext != ".wrd" && continue
_, dialect, spk = rsplit(root, "/", limit=3)
path = joinpath(root, file)
id = "timit_$(spk)_$(name)"
slines = map(splitline, eachline(path))
starts, ends, words = zip(slines...)
start_ts = parse(Int, first(starts)) / 16000
end_ts = parse(Int, last(ends)) / 16000
dur = end_ts - start_ts
# Annotation files already processed (".wrd" and ".phn")
idtuple = (dialect, spk, name)
(idtuple in processed) && continue
push!(processed, (dialect, spk, name))
# Words
wpath = joinpath(root, name * ".wrd")
words = [last(split(line)) for line in eachline(wpath)]
# Phones
ppath = joinpath(root, name * ".phn")
palign = Tuple{Int,Int,String}[]
for line in eachline(ppath)
t0, t1, p = split(line)
push!(palign, (parse(Int, t0), parse(Int, t1), String(p)))
end
supervisions[id] = Supervision(
id, id, start_ts, dur, [1],
sentence_type = if startswith(name, "sa")
"dialect"
elseif startswith(name, "sx")
"compact"
else # startswith(name, "si")
"diverse"
end
id = "timit_$(spk)_$(name)"
annotations[id] = Annotation(
id, # recording id and annotation id are the same since we have
id, # a one-to-one mapping
-1, # start and duration is -1 means that we take the whole
-1, # recording
[1], # only 1 channel (mono recording)
Dict(
"text" => join(words, " "),
"sentence type" => sentence_type,
"alignment" => palign,
"dialect" => dialect,
"speaker" => spk,
"sex" => string(first(spk)),
......@@ -323,64 +387,15 @@ function timit_supervisions(dir)
)
end
end
supervisions
end
function timit_alignments(dir)
! isdir(dir) && throw(ArgumentError("expected directory $dir"))
splitline(line) = rsplit(line, limit=3)
alignments = Dict()
for (root, subdirs, files) in walkdir(dir)
for file in files
name, ext = splitext(file)
ext != ".phn" && continue
_, dialect, spk = rsplit(root, "/", limit=3)
path = joinpath(root, file)
id = "timit_$(spk)_$(name)"
slines = map(splitline, eachline(path))
starts, ends, phones = zip(slines...)
start_sample = parse(Int, first(starts))
end_sample = parse(Int, last(ends))
dur = end_sample - start_sample
palign = [(parse(Int, s), parse(Int, e), TIMIT_PHONE_MAP48[p]) for (s, e, p) in zip(starts, ends, phones)]
@show palign
alignments[id] = Alignment(
id,
id,
start_sample,
dur,
Dict(
"phones" => palign,
"dialect" => dialect,
"speaker" => spk,
"sex" => string(first(spk))
)
)
end
end
alignments
annotations
end
function TIMIT(timitdir, dir, subset)
if ! (isfile(joinpath(dir, "recordings.jsonl")) &&
isfile(joinpath(dir, "supervisions-train.jsonl")) &&
isfile(joinpath(dir, "supervisions-dev.jsonl")) &&
isfile(joinpath(dir, "supervisions-test.jsonl")) &&
isfile(joinpath(dir, "alignments-train.jsonl")) &&
isfile(joinpath(dir, "alignments-dev.jsonl")) &&
isfile(joinpath(dir, "alignments-test.jsonl")))
isfile(joinpath(dir, "annotations-train.jsonl")) &&
isfile(joinpath(dir, "annotations-dev.jsonl")) &&
isfile(joinpath(dir, "annotations-test.jsonl")))
timit_prepare(timitdir, dir)
end
dataset(dir, subset)
......
......@@ -2,21 +2,18 @@
struct SpeechDataset <: MLUtils.AbstractDataContainer
idxs::Vector{AbstractString}
supervisions::Dict{AbstractString, Supervision}
alignments::Dict{AbstractString, Alignment}
annotations::Dict{AbstractString, Annotation}
recordings::Dict{AbstractString, Recording}
partition::Symbol
end
"""
dataset(manifestroot, partition)
dataset(manifestroot)
Load `SpeechDataset` from manifest files stored in `manifestroot`.
Partition is specified by `partition`, e.g. `:train`, `:test`.
Each item of the dataset is a nested tuple `((samples, sampling_rate), Supervision.data)`.
Each item of the dataset is a nested tuple `((samples, sampling_rate), Annotation.data)`.
See also [`Supervision`](@ref).
See also [`Annotation`](@ref).
# Examples
```julia-repl
......@@ -28,35 +25,46 @@ julia> ds[1]
(
(samples=[...], sampling_rate=16_000),
Dict(
"text" => "Supervision text here"
"text" => "Annotation text here"
)
)
```
"""
function dataset(manifestroot, partition)
sup_path = joinpath(manifestroot, "supervisions-$(partition).jsonl")
alg_path = joinpath(manifestroot, "alignments-$(partition).jsonl")
function dataset(manifestroot::AbstractString, partition)
partition_name = partition == "" ? "" : "-$(partition)"
annot_path = joinpath(manifestroot, "annotations$(partition_name).jsonl")
rec_path = joinpath(manifestroot, "recordings.jsonl")
supervisions = load(Supervision, sup_path)
alignments = load(Alignment, alg_path)
annotations = load(Annotation, annot_path)
recordings = load(Recording, rec_path)
dataset(supervisions, alignments, recordings, partition)
dataset(annotations, recordings)
end
function dataset(supervisions, alignments, recordings, partition)
idxs = collect(keys(supervisions))
SpeechDataset(idxs, supervisions, alignments, recordings, Symbol(partition))
# Construct a `SpeechDataset` directly from annotation and recording
# manifests, indexing the items by the annotation ids.
function dataset(annotations::AbstractDict, recordings::AbstractDict)
    ids = collect(keys(annotations))
    return SpeechDataset(ids, annotations, recordings)
end
function Base.getindex(d::SpeechDataset, key::AbstractString)
sup = d.supervisions[key]
rec = d.recordings[sup.recording_id]
samples, sr = load(rec, sup)
(samples=samples, sampling_rate=sr), sup.data
end
# Indexing: by annotation id, by integer position, or by a vector of either.
Base.getindex(d::SpeechDataset, key::AbstractString) = (d.recordings[key], d.annotations[key])
Base.getindex(d::SpeechDataset, idx::Integer) = d[d.idxs[idx]]
# Base.Fix1 partially applies `getindex` with `d` fixed as first argument.
Base.getindex(d::SpeechDataset, idxs::AbstractVector) = map(Base.Fix1(getindex, d), idxs)
Base.length(d::SpeechDataset) = length(d.idxs)
# Filter a `SpeechDataset`, keeping the items for which
# `fn((recording, annotation))` returns `true`. Returns a new dataset
# restricted to the matching ids, annotations and recordings.
function Base.filter(fn, d::SpeechDataset)
    fidxs = filter(d.idxs) do i
        fn((d.recordings[i], d.annotations[i]))
    end
    idset = Set(fidxs)
    # Fix: restored the membership operator (`∈`) that was lost from these
    # two predicates, which were syntactically invalid as written.
    fannotations = filter(d.annotations) do (k, v)
        k ∈ idset
    end
    frecs = filter(d.recordings) do (k, v)
        k ∈ idset
    end
    SpeechDataset(fidxs, fannotations, frecs)
end
......@@ -2,7 +2,7 @@
const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40"
const FRMFA_DICT_URL = "https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/dictionary/french/mfa/french_mfa.dict"
function normalizeword(word)
String(uppercase(word))
......@@ -71,3 +71,29 @@ function TIMITDICT(timitdir)
end
lexicon
end
"""
MFAFRDICT(path)
Return the french dictionary of pronunciation as provided by MFA (french_mfa v2.0.0a)
"""
function MFAFRDICT(path)
if ! isfile(path)
mkpath(dirname(path))
dir = mktempdir()
run(`wget -P $dir $FRMFA_DICT_URL`)
mv(joinpath(dir, "french_mfa.dict"), path)
end
lexicon = Dict()
open(path, "r") do f
for line in eachline(f)
word, pron... = split(line)
prononciations = get(lexicon, word, [])
push!(prononciations, pron)
lexicon[word] = prononciations
end
end
lexicon
end
\ No newline at end of file
......@@ -53,21 +53,21 @@ function Base.show(io::IO, m::MIME"application/json", r::Recording)
print(io, "}")
end
function Base.show(io::IO, m::MIME"application/json", s::Supervision)
function Base.show(io::IO, m::MIME"application/json", a::Annotation)
compact = get(io, :compact, false)
indent = compact ? 0 : 2
printfn = compact ? print : println
printfn(io, "{")
printfn(io, repeat(" ", indent), "\"id\": \"", s.id, "\", ")
printfn(io, repeat(" ", indent), "\"recording_id\": \"", s.recording_id, "\", ")
printfn(io, repeat(" ", indent), "\"start\": ", s.start, ", ")
printfn(io, repeat(" ", indent), "\"duration\": ", s.duration, ", ")
printfn(io, repeat(" ", indent), "\"channels\": ", s.channels |> json, ", ")
printfn(io, repeat(" ", indent), "\"data\": ", s.data |> json)
printfn(io, repeat(" ", indent), "\"id\": \"", a.id, "\", ")
printfn(io, repeat(" ", indent), "\"recording_id\": \"", a.recording_id, "\", ")
printfn(io, repeat(" ", indent), "\"start\": ", a.start, ", ")
printfn(io, repeat(" ", indent), "\"duration\": ", a.duration, ", ")
printfn(io, repeat(" ", indent), "\"channels\": ", a.channels |> json, ", ")
printfn(io, repeat(" ", indent), "\"data\": ", a.data |> json)
print(io, "}")
end
function JSON.json(r::Union{Recording, Supervision}; compact = true)
function JSON.json(r::Union{Recording, Annotation}; compact = true)
out = IOBuffer()
show(IOContext(out, :compact => compact), MIME("application/json"), r)
String(take!(out))
......@@ -96,7 +96,7 @@ Recording(d::Dict) = Recording(
d["samplerate"]
)
Supervision(d::Dict) = Supervision(
Annotation(d::Dict) = Annotation(
d["id"],
d["recording_id"],
d["start"],
......@@ -105,14 +105,6 @@ Supervision(d::Dict) = Supervision(
d["data"]
)
Alignment(d::Dict) = Alignment(
d["id"],
d["recording_id"],
d["start"],
d["duration"],
d["data"]
)
#=====================================================================#
# Writing / reading manifest from file.
......@@ -134,15 +126,16 @@ end
# Some utilities
manifestname(::Type{<:Recording}, name) = "recordings.jsonl"
manifestname(::Type{<:Supervision}, name) = "supervisions-$name.jsonl"
manifestname(::Type{<:Alignment}, name) = "alignments-$name.jsonl"
manifestname(::Type{<:Annotation}, name) = "annotations-$name.jsonl"
"""
load(Supervision, path)
load(Alignments, path)
load(Annotation, path)
load(Recording, path)
Load Recording/Supervisions/Alignments manifest from `path`.
Load Recording/Annotation manifest from `path`.
"""
load(T::Type{<:Union{Recording, Supervision, Alignment}}, path) = open(f -> readmanifest(f, T), path, "r")
load(T::Type{<:Union{Recording, Annotation}}, path) = open(f -> readmanifest(f, T), path, "r")
# Validate that `dir` is an existing directory; throws an `ArgumentError`
# otherwise and returns `true` on success.
function checkdir(dir::AbstractString)
    if !isdir(dir)
        throw(ArgumentError("$dir is not an existing directory"))
    end
    return true
end
......@@ -47,7 +47,7 @@ function Recording(uttid, s::AbstractAudioSource; channels = missing, samplerate
end
"""
struct Supervision <: ManifestItem
struct Annotation <: ManifestItem
id::AbstractString
recording_id::AbstractString
start::Float64
......@@ -56,20 +56,20 @@ end
data::Dict
end
A "supervision" defines a segment of a recording on a single channel.
An "annotation" defines a segment of a recording on a single channel.
The `data` field is an arbitrary dictionary holding the nature of the
supervision. `start` and `duration` (in seconds) defines,
annotation. `start` and `duration` (in seconds) defines,
where the segment is located within the recording `recording_id`.
# Constructor
Supervision(id, recording_id, start, duration, channel, data)
Supervision(id, recording_id[; channel = missing, start = -1, duration = -1, data = missing)
Annotation(id, recording_id, start, duration, channel, data)
Annotation(id, recording_id[; channel = missing, start = -1, duration = -1, data = missing)
If `start` and/or `duration` are negative, the segment is considered to
be the whole sequence length of the recording.
"""
struct Supervision <: ManifestItem
struct Annotation <: ManifestItem
id::AbstractString
recording_id::AbstractString
start::Float64
......@@ -78,51 +78,18 @@ struct Supervision <: ManifestItem
data::Dict
end
Supervision(id, recid; channels = missing, start = -1, duration = -1, data = missing) =
Supervision(id, recid, start, duration, channels, data)
"""
struct Alignments <: ManifestItem
id::AbstractString
recording_id::AbstractString
start::Float64
duration::Float64
data::Dict
end
An "alignment" defines a segment of a recording on a single channel.
The `data` field is an arbitrary dictionary holdin the nature of the
alignments. `start` and `duration` (in seconds) defines,
where the segment is locatated within the recoding `recording_id`.
# Constructor
Alignment(id, recording_id, start, duration, data)
Alignment(id, recording_id[;start = -1, duration = -1, data = missing)
If `start` and/or `duration` are negative, the segment is considered to
be the whole sequence length of the recording.
"""
struct Alignment <: ManifestItem
id::AbstractString
recording_id::AbstractString
start::Float64
duration::Float64
data::Dict
end
Alignment(id, recid; start = -1, duration = -1, data = missing) =
Alignment(id, recid, start, duration, data)
# Keyword-argument convenience constructor; forwards to the positional
# `Annotation(id, recording_id, start, duration, channels, data)` form.
# Negative `start`/`duration` mean the segment covers the whole recording.
Annotation(id, recid; channels = missing, start = -1, duration = -1, data = missing) =
Annotation(id, recid, start, duration, channels, data)
"""
load(recording[; start = -1, duration = -1, channels = recording.channels])
load(recording, supervision)
load(recording, annotation)
Load the signal from a recording. `start`, `duration` (in seconds) can
be used to load only a segment. If a `supervision` is given, function
be used to load only a segment. If an `annotation` is given, function
will return on the portion of the signal corresponding to the
supervision segment.
annotation segment.
The function returns a tuple `(x, sr)` where `x` is a ``NxC`` array
- ``N`` is the length of the signal and ``C`` is the number of channels
......@@ -141,5 +108,5 @@ function load(r::Recording; start = -1, duration = -1, channels = r.channels)
x[:,channels], sr
end
load(r::Recording, s::Supervision) = load(r; start = s.start, duration = s.duration, channels = s.channels)
load(r::Recording, a::Annotation) = load(r; start = a.start, duration = a.duration, channels = a.channels)