Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# SPDX-License-Identifier: CECILL-2.1
"""
    avid_recordings(dir::AbstractString)

Collect every `.wav` file found under `dir` (searched recursively) into a dictionary
of `Recording` objects.

`dir` is first passed to `checkdir` (defined elsewhere). Each recording is keyed by,
and identified with, its file name without extension; it wraps a `FileAudioSource`
pointing at the file and is declared with channel `1` and a sample rate of 16000.
Files whose extension is not exactly `.wav` are skipped.
"""
function avid_recordings(dir::AbstractString)
    checkdir(dir)

    recordings = Dict()
    for (root, _, files) in walkdir(dir), file in files
        id, ext = splitext(file)
        # Only audio files are turned into recordings.
        ext == ".wav" || continue

        source = FileAudioSource(joinpath(root, file))
        recordings[id] = Recording(
            id,
            source;
            channels = [1],
            samplerate = 16000,
        )
    end
    return recordings
end
"""
    load_metadata_files(dir::AbstractString)

Read the two metadata CSV files located directly in `dir` and return their lines.

The result maps a task character to the lines of the matching file:
`'s'` to `Metadata_with_labels_SENT.csv` and `'p'` to `Metadata_with_labels_PARA.csv`.
Lines are returned unparsed, header line included.
"""
function load_metadata_files(dir::AbstractString)
    tasksdict = Dict('s' => "SENT", 'p' => "PARA")
    return Dict(
        task => readlines(joinpath(dir, "Metadata_with_labels_$(label).csv"))
        for (task, label) in tasksdict
    )
end
"""
    get_metadata(filename, metadatadict)

Return the metadata of `filename` as a dictionary mapping column name to value.

The task character is the first character of the third `_`-separated field of
`filename`; it selects the list of CSV lines in `metadatadict`. The first line of that
list supplies the column names, and the first line containing `filename` as a
substring supplies the values. Both lines are split on `,` with no quoting support.
"""
function get_metadata(filename, metadatadict)
    task = split(filename, "_")[3][1]
    rows = metadatadict[task]

    columns = split(rows[1], ",")

    # Keep every line mentioning the file name and use the first one.
    matching = filter(rows) do row
        occursin(filename, row)
    end
    fields = split(matching[1], ",")

    return Dict(columns[i] => fields[i] for i in eachindex(columns))
end
"""
    avid_annotations(dir)

Build one `Annotation` per `.wav` file found under `dir` (searched recursively).

`dir` is first passed to `checkdir` (defined elsewhere), then the metadata CSV files
are loaded from `dir` with `load_metadata_files`. Each annotation is keyed by the file
name without extension and carries the metadata returned by `get_metadata` for that
name. Returns the dictionary of annotations.
"""
function avid_annotations(dir)
    checkdir(dir)

    annotations = Dict()
    metadatadict = load_metadata_files(dir)

    for (_, _, files) in walkdir(dir), file in files
        id, ext = splitext(file)
        ext == ".wav" || continue

        # Metadata comes from the CSV lines loaded above.
        metadata = get_metadata(id, metadatadict)

        annotations[id] = Annotation(
            id,        # audio id
            id,        # annotation id
            -1,        # start: -1 together with the duration below means the
            -1,        # whole recording is taken
            [1],       # only 1 channel (mono recording)
            metadata,  # additional information
        )
    end
    return annotations
end
"""
    download_avid(dir)

Download the AVID archive from Zenodo into `dir`, extract it there and return the path
of the extracted corpus, `joinpath(dir, "AVID")`.

`dir` is created if needed. The download and extraction rely on the external `wget`
and `unzip` programs; `run` throws if either of them fails. The archive file is removed
afterwards, whether or not the extraction succeeded.
"""
function download_avid(dir)
    @info "Directory $dir not found.\nDownloading AVID dataset (9.9 GB)"
    url = "https://zenodo.org/records/10524873/files/AVID.zip?download=1"
    filepath = joinpath(dir, "AVID.zip")

    mkpath(dir)
    try
        run(`wget $url -O $filepath`)
        @info "Download complete, extracting files"
        run(`unzip $filepath -d $dir`)
    finally
        # Never leave a (possibly partial) multi-gigabyte archive behind.
        rm(filepath; force=true)
    end

    # The path must be built from `dir` with a relative component: an absolute
    # component such as "/AVID" makes `joinpath` discard everything before it.
    # NOTE(review): assumes the archive unpacks into an `AVID` subdirectory — confirm.
    return joinpath(dir, "AVID")
end
"""
    avid_prepare(datadir, outputdir)

Create the manifest files of the AVID corpus in `outputdir`.

If `datadir` is not an existing directory, the corpus is fetched with `download_avid`
and the directory it returns is used instead. Three files are then written to
`outputdir` (created if needed):

- `recordings.jsonl`: recordings found under `Repository 2` in `datadir`;
- `calibration_tones.jsonl`: recordings found under `Calibration_tones` in `datadir`;
- `annotations.jsonl`: annotations built from `Repository 2` in `datadir`.

Returns the value of the last `writemanifest` call.
"""
function avid_prepare(datadir, outputdir)
    # Validate the data directory, downloading the corpus when it is absent.
    isdir(datadir) || (datadir = download_avid(datadir))

    # Create the output directory.
    outputdir = mkpath(outputdir)
    # Drop any previous recordings manifest before extraction starts; presumably so
    # that a failed run cannot leave behind a complete-looking set of manifests.
    rm(joinpath(outputdir, "recordings.jsonl"), force=true)

    # Recordings
    recordings_path = joinpath(datadir, "Repository 2")
    @info "Extracting recordings from $recordings_path"
    recordings = avid_recordings(recordings_path)

    # Calibration tones
    calibtones_path = joinpath(datadir, "Calibration_tones")
    @info "Extracting recordings from $calibtones_path"
    calibtones = avid_recordings(calibtones_path)

    # Both sets are extracted before anything is written.
    manifests = (
        "recordings.jsonl" => recordings,
        "calibration_tones.jsonl" => calibtones,
    )
    for (name, content) in manifests
        open(joinpath(outputdir, name), "w") do f
            writemanifest(f, content)
        end
    end

    # Annotations (the metadata files are read from the recordings directory).
    annotations_path = recordings_path
    @info "Extracting annotations from $annotations_path"
    annotations = avid_annotations(annotations_path)

    manifestpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $manifestpath"
    return open(manifestpath, "w") do f
        writemanifest(f, annotations)
    end
end
"""
    AVID(datadir, outputdir)

Load the AVID dataset from the manifests stored in `outputdir`.

When any of `recordings.jsonl`, `calibration_tones.jsonl` or `annotations.jsonl` is
missing from `outputdir`, the manifests are first generated with
`avid_prepare(datadir, outputdir)`. The result of `dataset(outputdir, "")` is returned.
"""
function AVID(datadir, outputdir)
    manifests = ("recordings.jsonl", "calibration_tones.jsonl", "annotations.jsonl")
    prepared = all(name -> isfile(joinpath(outputdir, name)), manifests)

    # Generate the manifests only when at least one of them is missing.
    prepared || avid_prepare(datadir, outputdir)

    return dataset(outputdir, "")
end