Newer
Older
# SPDX-License-Identifier: CECILL-2.1
# Hard-coded, site-specific filesystem locations. Neither constant is referenced
# by the functions visible in this part of the file — presumably they are
# defaults for the audio and transcription directories; verify against callers.
const AUDIO_PATH = "/vol/work1/rilliard/diachronie/normal"
const TRANSCRIPTION_PATH = "/vol/work1/rilliard/diachronie/whisper_diachronik/fr/tc_trs__modified"
"""
    checkdir(dir::AbstractString)

Return `true` when `dir` is an existing directory; otherwise throw an
`ArgumentError` whose message names the offending path.
"""
function checkdir(dir::AbstractString)
    if !isdir(dir)
        throw(ArgumentError("$dir is not an existing directory"))
    end
    return true
end
"""
    ina_diachrony_recordings(dir::AbstractString)

Walk `dir` recursively and build one `Recording` for every `.wav` file found.

Return a `Dict` mapping each recording id to its `Recording`. The id is the file
name without its extension, prefixed with `ina_diachrony§`; this is the same
audio id the annotation builders of this file compute, so annotations and
recordings can be matched on it. Files with any other extension are ignored.
"""
function ina_diachrony_recordings(dir::AbstractString)
    recordings = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".wav" && continue

            # The id was previously used without being defined; build it with
            # the same scheme as the annotation functions.
            id = "ina_diachrony§$filename"
            path = joinpath(root, file)
            audio_src = FileAudioSource(path)

            # Every recording is declared with channel 1 and a 16000 sample rate.
            recordings[id] = Recording(
                id,
                audio_src;
                channels = [1],
                samplerate = 16000
            )
        end
    end
    return recordings
end
"""
    get_metadata(filename)

Split `filename` on `§` and return the tuple `(timeperiod, age, sex, speaker)`.

The first field is the time period, the second field holds age and sex joined
by `_`, and the third field is the speaker.
"""
function get_metadata(filename)
    fields = split(filename, "§")
    period = fields[1]
    # Second field has the form age_sex
    age, sex = split(fields[2], "_")
    who = fields[3]
    return (period, age, sex, who)
end
"""
    ina_diachrony_annotations_whole(dir)

Walk `dir` recursively and build one whole-recording `Annotation` per `.wav`
file.

For each `.wav` file the metadata is parsed from the file name with
`get_metadata`, and the transcription is read from the file with the same name
and a `.txt` extension in the same directory (empty string when that file does
not exist). The audio id is the file name without extension prefixed with
`ina_diachrony§`; the annotation id is the audio id followed by `§0`.

Return a `Dict` mapping annotation ids to `Annotation`s.
"""
function ina_diachrony_annotations_whole(dir)
    annotations = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".wav" && continue

            # extract metadata from filename
            timeperiod, age, sex, speaker = get_metadata(filename)

            # extract transcription text (same filename but .txt)
            textfilepath = joinpath(root, "$filename.txt")
            text = isfile(textfilepath) ? join(readlines(textfilepath), "\n") : ""

            id = "ina_diachrony§$filename"
            annotation_id = id*"§0"
            annotations[annotation_id] = Annotation(
                id, # audio id
                annotation_id, # annotation id
                -1, # start and duration is -1 means that we take the whole
                -1, # recording
                [1], # only 1 channel (mono recording)
                # The metadata pairs are grouped in a single Dict; the original
                # call had an unmatched closing parenthesis here.
                Dict(
                    "text" => text,
                    "speaker" => speaker,
                    "timeperiod" => timeperiod,
                    "age" => age,
                    "sex" => sex,
                )
            )
        end
    end
    return annotations
end
"""
    ina_diachrony_annotations_csv(dir)

Walk `dir` recursively and build one `Annotation` per data line of every `.csv`
file.

The first line of each file is read as a header and discarded. Every following
non-blank line is split on the first two commas into start time, end time and
text (so the text itself may contain commas); the duration is the end time
minus the start time. Metadata is parsed from the file name with
`get_metadata`.

The audio id is the file name without extension prefixed with `ina_diachrony§`.
Each annotation id is the audio id followed by `§` and the 1-based index of the
data line, so every line gets its own entry in the returned `Dict`.
"""
function ina_diachrony_annotations_csv(dir)
    annotations = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".csv" && continue

            # extract metadata from filename
            timeperiod, age, sex, speaker = get_metadata(filename)
            id = "ina_diachrony§$filename"

            # generate annotation for each line in csv
            open(joinpath(root, file)) do f
                readline(f) # skip the header
                line = 1
                for current_line in eachline(f)
                    # Tolerate blank lines (e.g. a trailing newline at EOF)
                    isempty(strip(current_line)) && continue

                    start_str, end_str, text = split(current_line, ",", limit=3)
                    start_time = parse(Float64, start_str)
                    duration = parse(Float64, end_str) - start_time

                    # One id per line; keying on the audio id alone would keep
                    # only the last line of each file.
                    annotation_id = id*"§$line"
                    annotations[annotation_id] = Annotation(
                        id, # audio id
                        annotation_id, # annotation id
                        start_time, # start
                        duration, # duration
                        [1], # only 1 channel (mono recording)
                        Dict(
                            "text" => text,
                            "speaker" => speaker,
                            "timeperiod" => timeperiod,
                            "age" => age,
                            "sex" => sex,
                        )
                    )
                    line += 1
                end
            end
        end
    end
    return annotations
end
"""
    ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)

Build the `recordings.jsonl` and `annotations.jsonl` manifests in `outputdir`.

Recordings and whole-recording annotations are extracted from `ina_wav_dir`.
When `ina_csv_dir` is not `nothing`, the per-line annotations extracted from it
are merged into the annotation set. `outputdir` is created if needed.

Throws an `ArgumentError` (via `checkdir`) if `ina_wav_dir`, or a non-`nothing`
`ina_csv_dir`, is not an existing directory. The check runs before anything is
created or removed on disk.
"""
function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    # Validate the data directories. The csv directory is optional, so it is
    # only checked when one was supplied.
    checkdir(ina_wav_dir)
    isnothing(ina_csv_dir) || checkdir(ina_csv_dir)

    # Create the output directory.
    outputdir = mkpath(outputdir)
    rm(joinpath(outputdir, "recordings.jsonl"), force=true)

    # Recordings
    @info "Extracting recordings from $ina_wav_dir"
    recordings = ina_diachrony_recordings(ina_wav_dir)
    manifestpath = joinpath(outputdir, "recordings.jsonl")
    open(manifestpath, "w") do f
        writemanifest(f, recordings)
    end

    # Annotations
    @info "Extracting annotations from $ina_wav_dir"
    annotations = ina_diachrony_annotations_whole(ina_wav_dir)
    if !isnothing(ina_csv_dir)
        @info "Extracting annotations from $ina_csv_dir"
        csv_annotations = ina_diachrony_annotations_csv(ina_csv_dir)
        annotations = merge(annotations, csv_annotations)
    end

    manifestpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, annotations)
    end
end
"""
    INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)

Return `dataset(outputdir, "")`, first running `ina_diachrony_prepare` unless
both `recordings.jsonl` and `annotations.jsonl` already exist in `outputdir`.
"""
function INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)
    manifests = ("recordings.jsonl", "annotations.jsonl")
    already_prepared = all(name -> isfile(joinpath(outputdir, name)), manifests)
    if !already_prepared
        ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    end
    dataset(outputdir, "")
end