# SPDX-License-Identifier: CECILL-2.1

# Site-specific corpus locations. They are not referenced by the functions in
# this part of the file; callers pass the directories explicitly.
const AUDIO_PATH = "/vol/work1/rilliard/diachronie/normal"
const TRANSCRIPTION_PATH = "/vol/work1/rilliard/diachronie/whisper_diachronik/fr/tc_trs__modified"

"""
    _ina_diachrony_metadata(filename)

Split a corpus file name (without extension) of the form
`<timeperiod>§<age>_<sex>§<speaker>` and return the tuple
`(timeperiod, age, sex, speaker)`.

Throws an `ArgumentError` when the name does not contain the expected fields.
"""
function _ina_diachrony_metadata(filename::AbstractString)
    fields = split(filename, "§")
    length(fields) >= 3 || throw(ArgumentError(
        "cannot parse metadata from file name \"$filename\": " *
        "expected <timeperiod>§<age>_<sex>§<speaker>"
    ))
    age_sex = split(fields[2], "_")
    length(age_sex) >= 2 || throw(ArgumentError(
        "cannot parse age and sex from \"$(fields[2])\" in file name \"$filename\""
    ))
    return fields[1], age_sex[1], age_sex[2], fields[3]
end

"""
    _ina_diachrony_transcript(root, filename)

Return the content of the transcript `<filename>.txt` as a single string, or
`""` when no transcript is found.
"""
function _ina_diachrony_transcript(root::AbstractString, filename::AbstractString)
    name = "$(filename).txt"
    # Look next to the audio file first. The bare relative path (resolved
    # against the working directory) is kept as a fallback because that is the
    # only place the previous implementation looked.
    for candidate in (joinpath(root, name), name)
        isfile(candidate) && return join(readlines(candidate), "\n")
    end
    return ""
end

"""
    ina_diachrony_recordings(dir)

Recursively scan `dir` for `.wav` files and return a dictionary mapping the
recording id `ina_diachrony§<file name without extension>` to a `Recording`
declared with channel 1 and a 16000 Hz sample rate.

Throws an `ArgumentError` if `dir` is not a directory.
"""
function ina_diachrony_recordings(dir::AbstractString)
    isdir(dir) || throw(ArgumentError("expected directory $dir"))

    recordings = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext == ".wav" || continue

            id = "ina_diachrony§$(filename)"
            audio_src = FileAudioSource(joinpath(root, file))
            recordings[id] = Recording(
                id,
                audio_src;
                channels = [1],
                samplerate = 16000
            )
        end
    end
    return recordings
end

"""
    ina_diachrony_annotations_whole(dir)

Recursively scan `dir` for `.wav` files and return a dictionary with one
`Annotation` per recording, covering the whole recording (start and duration
are both `-1`). The annotation id is the recording id followed by `§0`.

Speaker, time period, age and sex are parsed from the file name; the `"text"`
entry is the content of the matching `.txt` transcript, or `""` if there is
none.

Throws an `ArgumentError` if `dir` is not a directory or if a file name does
not follow the `<timeperiod>§<age>_<sex>§<speaker>` layout.
"""
function ina_diachrony_annotations_whole(dir)
    isdir(dir) || throw(ArgumentError("expected directory $dir"))

    annotations = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext == ".wav" || continue

            timeperiod, age, sex, speaker = _ina_diachrony_metadata(filename)
            id = "ina_diachrony§$(filename)"
            text = _ina_diachrony_transcript(root, filename)

            annotation_id = id * "§0"
            annotations[annotation_id] = Annotation(
                id,            # audio id
                annotation_id, # annotation id
                -1,            # start and duration of -1 mean that we take
                -1,            # the whole recording
                [1],           # only 1 channel (mono recording)
                Dict(
                    "text" => text,
                    "speaker" => speaker,
                    "timeperiod" => timeperiod,
                    "age" => age,
                    "sex" => sex,
                )
            )
        end
    end
    return annotations
end

"""
    ina_diachrony_annotations_csv(dir)

Recursively scan `dir` for `.csv` files and return a dictionary with one
`Annotation` per data row. The first line of each file is skipped as a header;
every following non-blank line must read `start,end,text` (the text may itself
contain commas). Annotation ids are the recording id followed by `§<n>`, where
`n` is the 1-based index of the row within its file.

Throws an `ArgumentError` if `dir` is not a directory, if a file name does not
follow the `<timeperiod>§<age>_<sex>§<speaker>` layout, or if a row is
malformed.
"""
function ina_diachrony_annotations_csv(dir)
    isdir(dir) || throw(ArgumentError("expected directory $dir"))

    annotations = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext == ".csv" || continue

            timeperiod, age, sex, speaker = _ina_diachrony_metadata(filename)
            id = "ina_diachrony§$(filename)"
            csvpath = joinpath(root, file)

            line = 1
            # The first line is the header; `eachline` closes the file once
            # the iteration is over.
            for current_line in Iterators.drop(eachline(csvpath), 1)
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                isempty(strip(current_line)) && continue

                fields = split(current_line, ","; limit = 3)
                length(fields) == 3 || throw(ArgumentError(
                    "malformed row in $csvpath, expected \"start,end,text\": $current_line"
                ))
                start_time = parse(Float64, fields[1])
                duration = parse(Float64, fields[2]) - start_time
                text = fields[3]

                annotation_id = id * "§$(line)"
                # Key by the annotation id: keying by the recording id would
                # keep only the last row of every file.
                annotations[annotation_id] = Annotation(
                    id,            # audio id
                    annotation_id, # annotation id
                    start_time,    # start
                    duration,      # duration
                    [1],           # only 1 channel (mono recording)
                    Dict(
                        "text" => text,
                        "speaker" => speaker,
                        "timeperiod" => timeperiod,
                        "age" => age,
                        "sex" => sex,
                    )
                )
                line += 1
            end
        end
    end
    return annotations
end

"""
    ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)

Write the `recordings.jsonl` and `annotations.jsonl` manifests for the audio
files found in `ina_wav_dir` into `outputdir`, creating that directory if
needed.

`ina_csv_dir` must be an existing directory, but its segment-level annotations
are currently not included in the annotation manifest.

Throws an `ArgumentError` if either input directory does not exist.
"""
function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    # Validate the data directories.
    for d in [ina_wav_dir, ina_csv_dir]
        isdir(d) || throw(ArgumentError("invalid path $(d)"))
    end

    # Create the output directory.
    outputdir = mkpath(outputdir)
    rm(joinpath(outputdir, "recordings.jsonl"), force=true)

    # Recordings
    @info "Extracting recordings from $ina_wav_dir"
    recordings = ina_diachrony_recordings(ina_wav_dir)
    manifestpath = joinpath(outputdir, "recordings.jsonl")
    open(manifestpath, "w") do f
        writemanifest(f, recordings)
    end

    # Annotations
    @info "Extracting annotations from $ina_wav_dir"
    whole_annotations = ina_diachrony_annotations_whole(ina_wav_dir)
    # Segment-level CSV annotations are disabled for now:
    #@info "Extracting annotations from $ina_csv_dir"
    #csv_annotations = ina_diachrony_annotations_csv(ina_csv_dir)
    #annotations = merge(whole_annotations, csv_annotations)
    annotations = whole_annotations

    manifestpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, annotations)
    end
end

"""
    INADIACHRONY(ina_wav_dir, ina_csv_dir, outputdir)

Return the dataset stored in `outputdir`. The manifests are generated first
with `ina_diachrony_prepare` unless both `recordings.jsonl` and
`annotations.jsonl` already exist there.
"""
function INADIACHRONY(ina_wav_dir, ina_csv_dir, outputdir)
    manifests_ready = isfile(joinpath(outputdir, "recordings.jsonl")) &&
                      isfile(joinpath(outputdir, "annotations.jsonl"))
    if !manifests_ready
        ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    end
    return dataset(outputdir, "")
end