# SPDX-License-Identifier: CECILL-2.1

const AUDIO_PATH = "/vol/work1/rilliard/diachronie/normal"
const TRANSCRIPTION_PATH = "/vol/work1/rilliard/diachronie/whisper_diachronik/fr/tc_trs__modified"

"""
    checkdir(dir::AbstractString)

Return `true` if `dir` is an existing directory, otherwise throw an `ArgumentError`.
"""
function checkdir(dir::AbstractString)
    return isdir(dir) || throw(ArgumentError("$dir is not an existing directory"))
end

"""
    ina_diachrony_recordings(dir::AbstractString)

Walk `dir` recursively and build one `Recording` per `.wav` file found.

The returned `Dict` maps the recording id (`"ina_diachrony§"` followed by the file name
without its extension) to a `Recording` built on a `FileAudioSource` of the file path,
with `channels = [1]` and `samplerate = 16000`.

Throws an `ArgumentError` if `dir` is not an existing directory.
"""
function ina_diachrony_recordings(dir::AbstractString)
    checkdir(dir)

    recordings = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".wav" && continue

            id = "ina_diachrony§$filename"
            path = joinpath(root, file)
            audio_src = FileAudioSource(path)
            recordings[id] = Recording(
                id,
                audio_src;
                channels = [1],
                samplerate = 16000
            )
        end
    end
    return recordings
end

"""
    get_metadata(filename)

Split `filename` on `"§"` and return `(timeperiod, age, sex, speaker)`.

The first field is the time period, the second field is split on `"_"` into age and sex,
and the third field is the speaker. A `filename` with fewer fields raises the indexing
error of the underlying `split` result.
"""
function get_metadata(filename)
    metadata = split(filename, "§")
    timeperiod = metadata[1]
    age, sex = split(metadata[2], "_")
    speaker = metadata[3]
    return timeperiod, age, sex, speaker
end

"""
    ina_diachrony_annotations_whole(dir)

Walk `dir` recursively and build one whole-recording `Annotation` per `.wav` file.

The transcription is read from the `.txt` file with the same base name in the same
directory; when that file does not exist the text is the empty string. Each annotation is
stored under the key made of the recording id followed by `"§0"`.

Throws an `ArgumentError` if `dir` is not an existing directory.
"""
function ina_diachrony_annotations_whole(dir)
    checkdir(dir)

    annotations = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".wav" && continue

            # extract metadata from filename
            timeperiod, age, sex, speaker = get_metadata(filename)

            # extract transcription text (same filename but .txt)
            textfilepath = joinpath(root, "$filename.txt")
            text = isfile(textfilepath) ? join(readlines(textfilepath), "\n") : ""

            id = "ina_diachrony§$filename"
            annotation_id = id * "§0"

            # generate annotation
            annotations[annotation_id] = Annotation(
                id,            # audio id
                annotation_id, # annotation id
                -1,            # start and duration is -1 means that we take the whole
                -1,            # recording
                [1],           # only 1 channel (mono recording)
                Dict(          # additional informations
                    "text" => text,
                    "speaker" => speaker,
                    "timeperiod" => timeperiod,
                    "age" => age,
                    "sex" => sex,
                )
            )
        end
    end
    return annotations
end

"""
    _ina_csv_segments(io::IO, id::AbstractString)

Parse the segment rows of a CSV stream into `(annotation_id, start, duration, text)` tuples.

The first line of `io` is discarded as a header. Every following non-blank line is split
on its first two commas into a start time, an end time and a text (so the text itself may
contain commas). `duration` is the end time minus the start time, and `annotation_id` is
`id` followed by `"§"` and the 1-based index of the row among the parsed rows.
"""
function _ina_csv_segments(io::IO, id::AbstractString)
    segments = Tuple{String,Float64,Float64,SubString{String}}[]

    readline(io)  # skip the header line
    line = 1
    for current_line in eachline(io)
        # A blank (e.g. trailing) line carries no segment and would break the split below.
        isempty(strip(current_line)) && continue

        start_field, end_field, text = split(current_line, ","; limit = 3)
        start_time = parse(Float64, start_field)
        duration = parse(Float64, end_field) - start_time
        push!(segments, (id * "§$line", start_time, duration, text))
        line += 1
    end
    return segments
end

"""
    ina_diachrony_annotations_csv(dir)

Walk `dir` recursively and build one `Annotation` per data row of every `.csv` file.

Each annotation is stored under its own annotation id (recording id, `"§"`, row index), so
all the segments of a file are kept in the returned `Dict`.

Throws an `ArgumentError` if `dir` is not an existing directory.
"""
function ina_diachrony_annotations_csv(dir)
    checkdir(dir)

    annotations = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".csv" && continue

            # extract metadata from filename
            timeperiod, age, sex, speaker = get_metadata(filename)
            id = "ina_diachrony§$filename"

            # generate annotation for each line in csv
            segments = open(joinpath(root, file)) do f
                _ina_csv_segments(f, id)
            end
            for (annotation_id, start_time, duration, text) in segments
                # Key by annotation id: keying by the recording id would make every row
                # of a file overwrite the previous one.
                annotations[annotation_id] = Annotation(
                    id,            # audio id
                    annotation_id, # annotation id
                    start_time,    # start
                    duration,      # duration
                    [1],           # only 1 channel (mono recording)
                    Dict(          # additional informations
                        "text" => text,
                        "speaker" => speaker,
                        "timeperiod" => timeperiod,
                        "age" => age,
                        "sex" => sex,
                    )
                )
            end
        end
    end
    return annotations
end

"""
    ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)

Write the `recordings.jsonl` and `annotations.jsonl` manifests into `outputdir`.

Recordings and whole-recording annotations are extracted from `ina_wav_dir`. When
`ina_csv_dir` is not `nothing`, the per-segment annotations extracted from it are merged
into the annotation manifest. `outputdir` is created if needed.
"""
function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    # Validate the data directories before touching the output directory. The wav
    # directory is always required; only the csv directory is optional.
    checkdir(ina_wav_dir)
    isnothing(ina_csv_dir) || checkdir(ina_csv_dir)

    # Create the output directory.
    outputdir = mkpath(outputdir)
    rm(joinpath(outputdir, "recordings.jsonl"), force=true)

    # Recordings
    @info "Extracting recordings from $ina_wav_dir"
    recordings = ina_diachrony_recordings(ina_wav_dir)
    manifestpath = joinpath(outputdir, "recordings.jsonl")
    open(manifestpath, "w") do f
        writemanifest(f, recordings)
    end

    # Annotations
    @info "Extracting annotations from $ina_wav_dir"
    annotations = ina_diachrony_annotations_whole(ina_wav_dir)
    if !isnothing(ina_csv_dir)
        @info "Extracting annotations from $ina_csv_dir"
        csv_annotations = ina_diachrony_annotations_csv(ina_csv_dir)
        annotations = merge(annotations, csv_annotations)
    end

    manifestpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, annotations)
    end
end

"""
    INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)

Return `dataset(outputdir, "")`, preparing the manifests first when needed.

The manifests are (re)generated with `ina_diachrony_prepare` unless both
`recordings.jsonl` and `annotations.jsonl` already exist in `outputdir`.
"""
function INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)
    has_manifests = isfile(joinpath(outputdir, "recordings.jsonl")) &&
                    isfile(joinpath(outputdir, "annotations.jsonl"))
    if !has_manifests
        ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    end
    return dataset(outputdir, "")
end