Skip to content
Snippets Groups Projects
Commit f2442632 authored by Lucas Ondel Yang's avatar Lucas Ondel Yang
Browse files

Merge branch 'avid' into 'main'

Add support for AVID dataset

See merge request fast/speechdatasets.jl!16
parents 4335df3a 54f0ee8b
No related branches found
No related tags found
No related merge requests found
......@@ -24,6 +24,9 @@ julia> dataset = TIMIT("/path/to/timit/dir", "outputdir", :train) # :dev | :test
julia> dataset = INADIACHRONY("/path/to/ina_wav/dir", "outputdir", "/path/to/ina_csv/dir") # ina_csv dir optional
...
julia> dataset = AVID("/path/to/avid/dir", "outputdir")
...
julia> for ((signal, fs), supervision) in dataset
# do something
......
......@@ -27,6 +27,7 @@ export
MINILIBRISPEECH,
TIMIT,
INADIACHRONY,
AVID,
# Lexicon
CMUDICT,
......@@ -46,6 +47,7 @@ include("corpora/multilingual_librispeech.jl")
include("corpora/mini_librispeech.jl")
include("corpora/timit.jl")
include("corpora/ina_diachrony.jl")
include("corpora/avid.jl")
include("lexicons.jl")
......
# SPDX-License-Identifier: CECILL-2.1
"""
    avid_recordings(dir::AbstractString)

Walk `dir` recursively and build a dictionary mapping each `.wav` file's base
name (file name without extension) to a `Recording` backed by that file.

Every recording is declared with channel `[1]` and a sample rate of 16000.
If two files in different sub-directories share a base name, the one visited
last replaces the earlier entry.

Throws whatever `checkdir` throws when `dir` is not an existing directory.
"""
function avid_recordings(dir::AbstractString)
    checkdir(dir)
    recordings = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            id, ext = splitext(file)
            # Only audio files are of interest; anything else is ignored.
            if ext == ".wav"
                source = FileAudioSource(joinpath(root, file))
                recordings[id] = Recording(id, source; channels = [1], samplerate = 16000)
            end
        end
    end
    return recordings
end
"""
    load_metadata_files(dir::AbstractString)

Read the two metadata CSV files located in `dir` and return them as raw lines.

The result maps a task character to the lines of the matching file:
`'s'` to `Metadata_with_labels_SENT.csv` and `'p'` to
`Metadata_with_labels_PARA.csv`. Lines are not parsed here.
"""
function load_metadata_files(dir::AbstractString)
    suffixes = Dict('s' => "SENT", 'p' => "PARA")
    metadatadict = Dict{Char, Vector{String}}()
    for (task, suffix) in suffixes
        csvpath = joinpath(dir, "Metadata_with_labels_$(suffix).csv")
        metadatadict[task] = readlines(csvpath)
    end
    return metadatadict
end
"""
    get_metadata(filename, metadatadict)

Return the metadata of the recording `filename` as a dictionary mapping each
CSV column header to the corresponding value of the recording's row.

The task character is the first character of the third `_`-separated part of
`filename`; it selects which entry of `metadatadict` (a collection of raw CSV
lines whose first line is the header) is searched.

A row is selected when one of its comma-separated fields equals `filename`,
either verbatim or once any directory part and file extension are removed.
This avoids picking the row of another recording whose name merely starts with
`filename` (e.g. `x_y_s10` when looking for `x_y_s1`). If no such row exists,
the first data row containing `filename` as a substring is used instead. The
header line is never considered a candidate.

Throws an `ArgumentError` when no row refers to `filename`.
"""
function get_metadata(filename, metadatadict)
    task = split(filename, "_")[3][1]
    lines = metadatadict[task]
    headers = split(lines[1], ",")
    rows = @view lines[2:end]

    # Fields are split on bare commas, so quoted fields containing commas are
    # not supported (same limitation as the header parsing above).
    names_file(field) = field == filename || first(splitext(basename(field))) == filename
    idx = findfirst(row -> any(names_file, split(row, ",")), rows)
    if idx === nothing
        # Fall back to the looser substring search for files whose name is
        # embedded in a larger field.
        idx = findfirst(row -> contains(row, filename), rows)
    end
    if idx === nothing
        throw(ArgumentError("no metadata entry found for \"$filename\""))
    end

    fields = split(rows[idx], ",")
    return Dict(headers[i] => fields[i] for i in eachindex(headers))
end
"""
    avid_annotations(dir)

Walk `dir` recursively and build a dictionary mapping each `.wav` file's base
name to an `Annotation` covering the whole recording.

The metadata CSV files are loaded from `dir` itself (see
`load_metadata_files`), and the row matching each file (see `get_metadata`) is
attached to its annotation. The base name is used both as the recording id and
as the annotation id.
"""
function avid_annotations(dir)
    checkdir(dir)
    metadatadict = load_metadata_files(dir)
    annotations = Dict()
    for (_, _, files) in walkdir(dir)
        for file in files
            id, ext = splitext(file)
            if ext == ".wav"
                # Per-file metadata pulled from the CSV files.
                metadata = get_metadata(id, metadatadict)
                annotations[id] = Annotation(
                    id,       # audio id
                    id,       # annotation id
                    -1,       # start: -1 together with duration -1 means
                    -1,       # "take the whole recording"
                    [1],      # only 1 channel (mono recording)
                    metadata  # additional information
                )
            end
        end
    end
    return annotations
end
"""
    download_avid(dir)

Download the AVID archive from Zenodo into `dir`, extract it there, delete the
archive and return the path of the extracted data, `joinpath(dir, "AVID")`.

`dir` is created if it does not exist. The external programs `wget` and
`unzip` must be available on the `PATH`; a failure of either one raises the
error thrown by `run`.
"""
function download_avid(dir)
    @info "Directory $dir not found.\nDownloading AVID dataset (9.9 GB)"
    url = "https://zenodo.org/records/10524873/files/AVID.zip?download=1"
    filepath = joinpath(dir, "AVID.zip")
    mkpath(dir)
    run(`wget $url -O $filepath`)
    @info "Download complete, extracting files"
    run(`unzip $filepath -d $dir`)
    rm(filepath)
    # The path must be built from `dir` with a *relative* component: a leading
    # "/" would make `joinpath` discard `dir` entirely.
    # NOTE(review): assumes the archive unpacks into a top-level "AVID"
    # folder -- confirm against the actual zip layout.
    return joinpath(dir, "AVID")
end
"""
    avid_prepare(datadir, outputdir)

Build the AVID manifests in `outputdir` from the corpus stored in `datadir`.

If `datadir` is not an existing directory the corpus is first fetched with
`download_avid` and the directory it returns is used instead. Three files are
written:

- `recordings.jsonl`: recordings found under `datadir/Repository 2`;
- `calibration_tones.jsonl`: recordings found under `datadir/Calibration_tones`;
- `annotations.jsonl`: annotations built from `datadir/Repository 2`.
"""
function avid_prepare(datadir, outputdir)
    # Validate the data directory, downloading the corpus when it is absent.
    if !isdir(datadir)
        datadir = download_avid(datadir)
    end

    # Create the output directory and drop any stale recordings manifest.
    outputdir = mkpath(outputdir)
    rm(joinpath(outputdir, "recordings.jsonl"), force=true)

    # Speech recordings.
    recordings_path = joinpath(datadir, "Repository 2")
    @info "Extracting recordings from $recordings_path"
    speech = avid_recordings(recordings_path)

    # Calibration tones.
    calibtones_path = joinpath(datadir, "Calibration_tones")
    @info "Extracting recordings from $calibtones_path"
    tones = avid_recordings(calibtones_path)

    manifests = (("recordings.jsonl", speech), ("calibration_tones.jsonl", tones))
    for (name, content) in manifests
        open(joinpath(outputdir, name), "w") do f
            writemanifest(f, content)
        end
    end

    # Annotations live alongside the speech recordings.
    annotations_path = recordings_path
    @info "Extracting annotations from $annotations_path"
    annotations = avid_annotations(annotations_path)

    manifestpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, annotations)
    end
end
"""
    AVID(datadir, outputdir)

Return the AVID dataset whose manifests are stored in `outputdir`.

When any of `recordings.jsonl`, `calibration_tones.jsonl` or
`annotations.jsonl` is missing from `outputdir`, the manifests are (re)built
from `datadir` with `avid_prepare` before the dataset is loaded.
"""
function AVID(datadir, outputdir)
    manifests = ("recordings.jsonl", "calibration_tones.jsonl", "annotations.jsonl")
    prepared = all(name -> isfile(joinpath(outputdir, name)), manifests)
    if !prepared
        avid_prepare(datadir, outputdir)
    end
    return dataset(outputdir, "")
end
# SPDX-License-Identifier: CECILL-2.1
"""
    checkdir(dir::AbstractString)

Return `true` when `dir` is an existing directory, otherwise throw an
`ArgumentError`.
"""
function checkdir(dir::AbstractString)
    if !isdir(dir)
        throw(ArgumentError("$dir is not an existing directory"))
    end
    return true
end
function ina_diachrony_recordings(dir::AbstractString)
checkdir(dir)
......
......@@ -136,3 +136,6 @@ Load Recording/Annotation manifest from `path`.
"""
function load(T::Type{<:Union{Recording, Annotation}}, path)
    # Open the manifest read-only and parse it as a collection of `T`; the
    # do-block form guarantees the file is closed once parsing is done.
    return open(path, "r") do f
        readmanifest(f, T)
    end
end
"""
    checkdir(dir::AbstractString)

Check that `dir` is an existing directory.

Returns `true` on success and throws an `ArgumentError` otherwise.
"""
function checkdir(dir::AbstractString)
    isdir(dir) && return true
    throw(ArgumentError("$dir is not an existing directory"))
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment