Skip to content
Snippets Groups Projects
avid.jl 4.07 KiB
Newer Older
# SPDX-License-Identifier: CECILL-2.1

"""
    avid_recordings(dir::AbstractString)

Walk `dir` recursively and build one `Recording` per file whose extension is
exactly `.wav` (the comparison is case-sensitive).

The file name without its extension serves as the recording id. Every recording
wraps a `FileAudioSource` pointing at the file and is created with
`channels = [1]` and `samplerate = 16000`.

Returns a `Dict` mapping each id to its `Recording`. When two files located in
different subdirectories share the same name, the one visited last replaces the
earlier entry.
"""
function avid_recordings(dir::AbstractString)
    checkdir(dir)

    manifest = Dict()
    for (folder, _, entries) in walkdir(dir), entry in entries
        stem, extension = splitext(entry)
        if extension == ".wav"
            wavpath = joinpath(folder, entry)
            manifest[stem] = Recording(
                stem,
                FileAudioSource(wavpath);
                channels = [1],
                samplerate = 16000
            )
        end
    end
    return manifest
end


"""
    load_metadata_files(dir::AbstractString)

Read the two metadata CSV files stored directly in `dir` and return their raw
lines.

The result is a `Dict` keyed by a one-character task code: `'s'` holds the lines
of `Metadata_with_labels_SENT.csv` and `'p'` the lines of
`Metadata_with_labels_PARA.csv`. Lines are returned unparsed, header included.
"""
function load_metadata_files(dir::AbstractString)
    suffixes = Dict('s' => "SENT", 'p' => "PARA")
    tables = Dict(
        code => readlines(joinpath(dir, "Metadata_with_labels_$(label).csv"))
        for (code, label) in suffixes
    )
    return tables
end


"""
    get_metadata(filename, metadatadict)

Return the metadata of `filename` as a `Dict` mapping each CSV header to the
matching field of the row describing that file.

# Arguments
- `filename`: recording name without extension. The first character of its third
  `_`-separated field is used as the task code.
- `metadatadict`: mapping from task code (a `Char`) to the lines of the
  corresponding CSV file, header line first.

# Throws
- `ArgumentError` if the task code cannot be extracted from `filename`, if no
  metadata is available for that task, if no data row mentions `filename`, or if
  the matching row has fewer fields than the header.

Rows are split on every comma; quoted fields containing commas are not
supported.
"""
function get_metadata(filename, metadatadict)
    parts = split(filename, "_")
    if length(parts) < 3 || isempty(parts[3])
        throw(ArgumentError(
            "cannot extract the task code from \"$filename\": expected at least " *
            "three non-empty '_'-separated fields",
        ))
    end
    task = parts[3][1]

    haskey(metadatadict, task) || throw(ArgumentError(
        "no metadata available for task '$task' (file \"$filename\")",
    ))
    lines = metadatadict[task]
    isempty(lines) && throw(ArgumentError(
        "metadata for task '$task' is empty (file \"$filename\")",
    ))

    headers = split(first(lines), ",")

    # Start at the second line so the header row can never be taken for a data row.
    rowindex = findnext(line -> contains(line, filename), lines, 2)
    rowindex === nothing && throw(ArgumentError(
        "no metadata row found for \"$filename\" (task '$task')",
    ))

    fields = split(lines[rowindex], ",")
    length(fields) >= length(headers) || throw(ArgumentError(
        "metadata row for \"$filename\" has $(length(fields)) fields but the " *
        "header declares $(length(headers))",
    ))

    # `zip` stops at the end of `headers`, so surplus trailing fields are ignored.
    return Dict(header => field for (header, field) in zip(headers, fields))
end


"""
    avid_annotations(dir)

Walk `dir` recursively and build one `Annotation` per file whose extension is
exactly `.wav`.

The metadata CSV files are loaded once from `dir` with `load_metadata_files`,
and the row describing each recording is looked up with `get_metadata`. The file
name without extension is used both as the audio id and as the annotation id.

Returns a `Dict` mapping each id to its `Annotation`.
"""
function avid_annotations(dir)
    checkdir(dir)

    result = Dict()
    csvtables = load_metadata_files(dir)

    for (_, _, entries) in walkdir(dir), entry in entries
        stem, extension = splitext(entry)
        if extension == ".wav"
            # Fields of the CSV row describing this recording, keyed by header.
            info = get_metadata(stem, csvtables)

            result[stem] = Annotation(
                stem,  # audio id
                stem,  # annotation id
                -1,    # start: -1 (with duration -1) selects the whole recording
                -1,    # duration
                [1],   # single channel (mono recording)
                info   # additional information taken from the metadata files
            )
        end
    end
    return result
end


"""
    download_avid(dir)

Download the AVID archive from Zenodo into `dir`, extract it there, delete the
archive and return the path `joinpath(dir, "AVID")`.

`dir` is created if it does not exist. The external programs `wget` and `unzip`
are invoked and must therefore be available on the `PATH`; a failure of either
command raises the error thrown by `run`.

NOTE(review): the returned path assumes the archive unpacks into an `AVID`
folder - confirm against the actual archive layout.
"""
function download_avid(dir)
    @info "Directory $dir not found.\nDownloading AVID dataset (9.9 GB)"
    url = "https://zenodo.org/records/10524873/files/AVID.zip?download=1"
    filepath = joinpath(dir, "AVID.zip")

    mkpath(dir)
    run(`wget $url -O $filepath`)
    @info "Download complete, extracting files"
    run(`unzip $filepath -d $dir`)
    rm(filepath)

    # The component must be relative: an absolute component such as "/AVID"
    # makes `joinpath` drop everything that precedes it.
    return joinpath(dir, "AVID")
end


"""
    avid_prepare(datadir, outputdir)

Build the manifest files of the AVID dataset in `outputdir`.

If `datadir` is not an existing directory, the dataset is first fetched with
`download_avid` and the path it returns is used instead. Three files are then
written, each truncated if it already exists:

- `recordings.jsonl`: recordings found under `datadir/Repository 2`;
- `calibration_tones.jsonl`: recordings found under `datadir/Calibration_tones`;
- `annotations.jsonl`: annotations built from `datadir/Repository 2`.

`outputdir` is created when missing.
"""
function avid_prepare(datadir, outputdir)
    # Fall back to downloading when the data directory is missing.
    if !isdir(datadir)
        datadir = download_avid(datadir)
    end

    # Make sure the output directory exists and drop any stale recordings manifest.
    outputdir = mkpath(outputdir)
    recordings_manifest = joinpath(outputdir, "recordings.jsonl")
    rm(recordings_manifest, force=true)

    # Speech recordings
    speechdir = joinpath(datadir, "Repository 2")
    @info "Extracting recordings from $speechdir"
    speech = avid_recordings(speechdir)

    # Calibration tones
    tonesdir = joinpath(datadir, "Calibration_tones")
    @info "Extracting recordings from $tonesdir"
    tones = avid_recordings(tonesdir)

    outputs = (
        recordings_manifest => speech,
        joinpath(outputdir, "calibration_tones.jsonl") => tones,
    )
    for (target, manifest) in outputs
        open(target, "w") do io
            writemanifest(io, manifest)
        end
    end

    # Annotations live alongside the speech recordings.
    @info "Extracting annotations from $speechdir"
    annotations = avid_annotations(speechdir)

    annotations_manifest = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $annotations_manifest"
    return open(annotations_manifest, "w") do io
        writemanifest(io, annotations)
    end
end


"""
    AVID(datadir, outputdir)

Load the AVID dataset from the manifests stored in `outputdir`.

When any of `recordings.jsonl`, `calibration_tones.jsonl` or `annotations.jsonl`
is missing from `outputdir`, the manifests are first (re)generated from `datadir`
with `avid_prepare`. Returns the result of `dataset(outputdir, "")`.
"""
function AVID(datadir, outputdir)
    manifests = ("recordings.jsonl", "calibration_tones.jsonl", "annotations.jsonl")
    prepared = all(name -> isfile(joinpath(outputdir, name)), manifests)
    prepared || avid_prepare(datadir, outputdir)
    return dataset(outputdir, "")
end