# SPDX-License-Identifier: CECILL-2.1

"""
    avid_recordings(dir::AbstractString)

Build a dictionary of `Recording`s, one per `.wav` file found under `dir`.

`dir` is first passed to `checkdir`, then walked recursively. Each recording is keyed by
the file name without its extension and is created with `channels = [1]` and
`samplerate = 16000`.
"""
function avid_recordings(dir::AbstractString)
    checkdir(dir)

    recordings = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            id, ext = splitext(file)
            # Only ".wav" files (exact, case-sensitive match) are recordings.
            ext != ".wav" && continue

            audio_src = FileAudioSource(joinpath(root, file))
            recordings[id] = Recording(id, audio_src; channels=[1], samplerate=16000)
        end
    end
    return recordings
end

"""
    load_metadata_files(dir::AbstractString)

Read the two metadata CSV files located in `dir` and return them as raw lines.

The result maps the task letter `'s'` to the lines of `Metadata_with_labels_SENT.csv`
and `'p'` to the lines of `Metadata_with_labels_PARA.csv`.
"""
function load_metadata_files(dir::AbstractString)
    tasksdict = Dict('s' => "SENT", 'p' => "PARA")
    return Dict(
        key => readlines(joinpath(dir, "Metadata_with_labels_$(task).csv")) for
        (key, task) in tasksdict
    )
end

"""
    get_metadata(filename, metadatadict)

Return the metadata of the recording `filename` as a `header => value` dictionary.

# Arguments
- `filename`: recording name without extension. The first character of its third
  `_`-separated field selects the metadata table (a key of `metadatadict`).
- `metadatadict`: task letter => CSV lines, as returned by [`load_metadata_files`](@ref).
  The first line of each table is the comma-separated header.

# Throws
- `ArgumentError` if no line of the selected table contains `filename`, or if the
  matching line has fewer fields than the header.
"""
function get_metadata(filename, metadatadict)
    task = split(filename, "_")[3][1]
    lines = metadatadict[task]
    headers = split(lines[1], ",")

    # The row is located by substring match; the first matching line wins.
    # NOTE(review): a name that is a prefix of another one could match the wrong row;
    # confirm against the actual CSV contents.
    row = findfirst(line -> contains(line, filename), lines)
    if isnothing(row)
        throw(ArgumentError("no metadata entry found for \"$filename\""))
    end

    fields = split(lines[row], ",")
    if length(fields) < length(headers)
        throw(
            ArgumentError(
                "metadata entry for \"$filename\" has $(length(fields)) fields, " *
                "expected at least $(length(headers))",
            ),
        )
    end

    return Dict(headers[i] => fields[i] for i in eachindex(headers))
end

"""
    avid_annotations(dir)

Build a dictionary of `Annotation`s, one per `.wav` file found under `dir`.

The metadata tables are loaded from `dir` with [`load_metadata_files`](@ref) and the
entry matching each file is attached to its annotation. Annotations are keyed by the
file name without its extension, which is used both as audio id and annotation id.
"""
function avid_annotations(dir)
    checkdir(dir)

    annotations = Dict()
    metadatadict = load_metadata_files(dir)

    for (root, _, files) in walkdir(dir)
        for file in files
            id, ext = splitext(file)
            ext != ".wav" && continue

            # Extract the metadata of this file from the CSV tables.
            metadata = get_metadata(id, metadatadict)

            annotations[id] = Annotation(
                id,       # audio id
                id,       # annotation id
                -1,       # start and duration set to -1 mean that we take the whole
                -1,       # recording
                [1],      # only 1 channel (mono recording)
                metadata, # additional information
            )
        end
    end
    return annotations
end

"""
    download_avid(dir)

Download the AVID archive from Zenodo into `dir`, extract it there, delete the archive
and return the path `joinpath(dir, "AVID")`.

Requires the external programs `wget` and `unzip` to be available.
"""
function download_avid(dir)
    @info "Directory $dir not found.\nDownloading AVID dataset (9.9 GB)"
    url = "https://zenodo.org/records/10524873/files/AVID.zip?download=1"
    filepath = joinpath(dir, "AVID.zip")

    mkpath(dir)
    run(`wget $url -O $filepath`)
    @info "Download complete, extracting files"
    run(`unzip $filepath -d $dir`)
    rm(filepath)

    # The component must be relative: `joinpath` discards everything before an absolute
    # component, so "/AVID" would yield the filesystem root.
    # Assumes the archive unpacks into an "AVID" folder -- TODO confirm.
    return joinpath(dir, "AVID")
end

"""
    avid_prepare(datadir, outputdir)

Create the manifest files of the AVID dataset in `outputdir`.

If `datadir` is not an existing directory the dataset is first downloaded with
[`download_avid`](@ref). Three manifests are written: `recordings.jsonl` (from
`Repository 2`), `calibration_tones.jsonl` (from `Calibration_tones`) and
`annotations.jsonl` (from `Repository 2`).
"""
function avid_prepare(datadir, outputdir)
    # Validate the data directory.
    # NOTE(review): after a download the data lives in the directory returned by
    # `download_avid`, but on a later call `datadir` itself exists and is used as is;
    # confirm this is the intended layout.
    isdir(datadir) || (datadir = download_avid(datadir))

    # Create the output directory.
    outputdir = mkpath(outputdir)
    rm(joinpath(outputdir, "recordings.jsonl"), force=true)

    # Recordings
    recordings_path = joinpath(datadir, "Repository 2")
    @info "Extracting recordings from $recordings_path"
    recordings = avid_recordings(recordings_path)

    # Calibration tones
    calibtones_path = joinpath(datadir, "Calibration_tones")
    @info "Extracting recordings from $calibtones_path"
    calibtones = avid_recordings(calibtones_path)

    # Both sets are extracted before anything is written.
    manifests = (
        ("recordings.jsonl", recordings), ("calibration_tones.jsonl", calibtones)
    )
    for (manifestname, manifest) in manifests
        open(joinpath(outputdir, manifestname), "w") do f
            writemanifest(f, manifest)
        end
    end

    # Annotations
    annotations_path = recordings_path
    @info "Extracting annotations from $annotations_path"
    annotations = avid_annotations(annotations_path)

    manifestpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, annotations)
    end
end

"""
    AVID(datadir, outputdir)

Return `dataset(outputdir, "")`, first running [`avid_prepare`](@ref) when any of
`recordings.jsonl`, `calibration_tones.jsonl` or `annotations.jsonl` is missing from
`outputdir`.
"""
function AVID(datadir, outputdir)
    manifestnames = ("recordings.jsonl", "calibration_tones.jsonl", "annotations.jsonl")
    if !all(name -> isfile(joinpath(outputdir, name)), manifestnames)
        avid_prepare(datadir, outputdir)
    end
    return dataset(outputdir, "")
end