# SPDX-License-Identifier: CECILL-C

"""
    ina_diachrony_recordings(dir::AbstractString)

Walk `dir` recursively and build one `Recording` per `.wav` file found.

Return a `Dict` mapping the recording id (`"ina_diachrony§<filename>"`, where
`<filename>` is the file name without its extension) to the `Recording`. Every
recording is declared with `channels = [1]` and `samplerate = 16000`.
"""
function ina_diachrony_recordings(dir::AbstractString)
    checkdir(dir)

    recordings = Dict()
    for (root, subdirs, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".wav" && continue

            id = "ina_diachrony§$filename"
            path = joinpath(root, file)

            audio_src = FileAudioSource(path)

            recordings[id] = Recording(
                id,
                audio_src;
                channels = [1],
                samplerate = 16000
            )
        end
    end
    recordings
end

"""
    ina_diachrony_get_metadata(filename)

Parse the metadata encoded in `filename` (given without extension).

The name is split on `"§"` into three fields, `<timeperiod>§<age>_<sex>§<speaker>`,
and the second field is split again on `"_"`. Return a `Dict` with the keys
`"speaker"`, `"timeperiod"`, `"age"` and `"sex"`.

A name with fewer fields than expected raises a `BoundsError`.
"""
function ina_diachrony_get_metadata(filename)
    metadata = split(filename, "§")
    age, sex = split(metadata[2], "_")
    Dict(
        "speaker" => metadata[3],
        "timeperiod" => metadata[1],
        "age" => age,
        "sex" => sex,
    )
end

"""
    ina_diachrony_annotations_whole(dir)

Walk `dir` recursively and build one `Annotation` per `.wav` file, covering the
whole recording.

The annotation data is the metadata parsed from the file name plus a `"text"`
entry holding the content of the sibling `<filename>.txt` file (empty string when
that file does not exist). Return a `Dict` keyed by the annotation id
`"ina_diachrony§<filename>§0"`.
"""
function ina_diachrony_annotations_whole(dir)
    checkdir(dir)

    annotations = Dict()

    for (root, subdirs, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".wav" && continue

            # extract metadata from filename
            metadata = ina_diachrony_get_metadata(filename)

            # extract transcription text (same filename but .txt)
            textfilepath = joinpath(root, "$filename.txt")
            metadata["text"] =
                isfile(textfilepath) ? join(readlines(textfilepath), "\n") : ""

            id = "ina_diachrony§$filename"
            annotation_id = id * "§0"

            # generate annotation
            annotations[annotation_id] = Annotation(
                id,             # audio id
                annotation_id,  # annotation id
                -1,             # start and duration is -1 means that we take the whole
                -1,             # recording
                [1],            # only 1 channel (mono recording)
                metadata        # additional informations
            )
        end
    end
    annotations
end

"""
    ina_diachrony_annotations_csv(dir)

Walk `dir` recursively and build one `Annotation` per data row of every `.csv`
file found.

The first row of each file is skipped as a header. Every following non-blank row
is read as `start,end,text`: the row is split on the first two commas only, so
`text` may itself contain commas. The duration is `end - start`.

Return a `Dict` keyed by the annotation id `"ina_diachrony§<filename>§<n>"`, where
`<n>` is the 1-based index of the data row. Each annotation owns its own copy of
the metadata parsed from the file name, extended with its `"text"`.
"""
function ina_diachrony_annotations_csv(dir)
    checkdir(dir)

    annotations = Dict()

    for (root, subdirs, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".csv" && continue

            # extract metadata from filename
            metadata = ina_diachrony_get_metadata(filename)

            id = "ina_diachrony§$filename"

            # generate annotation for each line in csv
            open(joinpath(root, file)) do f
                readline(f)  # skip the header row

                line = 1
                for current_line in eachline(f)
                    # tolerate blank rows (e.g. trailing empty lines)
                    isempty(strip(current_line)) && continue

                    start_field, end_field, text = split(current_line, ","; limit=3)
                    start_time = parse(Float64, start_field)
                    duration = parse(Float64, end_field) - start_time

                    # Each annotation needs its own Dict: mutating the shared
                    # `metadata` would leave every segment with the last text.
                    segment_data = copy(metadata)
                    segment_data["text"] = text

                    annotation_id = id * "§$line"

                    # Key by annotation id so that the segments of one file do
                    # not overwrite each other.
                    annotations[annotation_id] = Annotation(
                        id,             # audio id
                        annotation_id,  # annotation id
                        start_time,     # start
                        duration,       # duration
                        [1],            # only 1 channel (mono recording)
                        segment_data    # additional informations
                    )
                    line += 1
                end
            end
        end
    end
    annotations
end

"""
    ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)

Write the `recordings.jsonl` and `annotations.jsonl` manifests into `outputdir`.

Recordings and whole-file annotations are extracted from `ina_wav_dir`. When
`ina_csv_dir` is not `nothing`, the annotations extracted from its CSV files are
merged into the whole-file ones. `outputdir` is created if needed.
"""
function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    # Validate the data directory
    for d in [ina_wav_dir, ina_csv_dir]
        isnothing(d) || checkdir(d)
    end

    # Create the output directory.
    outputdir = mkpath(outputdir)
    rm(joinpath(outputdir, "recordings.jsonl"), force=true)

    # Recordings
    @info "Extracting recordings from $ina_wav_dir"
    recordings = ina_diachrony_recordings(ina_wav_dir)
    manifestpath = joinpath(outputdir, "recordings.jsonl")
    open(manifestpath, "w") do f
        writemanifest(f, recordings)
    end

    # Annotations
    @info "Extracting annotations from $ina_wav_dir"
    annotations = ina_diachrony_annotations_whole(ina_wav_dir)
    if !isnothing(ina_csv_dir)
        @info "Extracting annotations from $ina_csv_dir"
        csv_annotations = ina_diachrony_annotations_csv(ina_csv_dir)
        annotations = merge(annotations, csv_annotations)
    end

    manifestpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, annotations)
    end
end

"""
    INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)

Return `dataset(outputdir, "")`, first generating the manifests with
[`ina_diachrony_prepare`](@ref) when `recordings.jsonl` or `annotations.jsonl` is
missing from `outputdir`.
"""
function INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)
    has_recordings = isfile(joinpath(outputdir, "recordings.jsonl"))
    has_annotations = isfile(joinpath(outputdir, "annotations.jsonl"))
    if !(has_recordings && has_annotations)
        ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    end
    dataset(outputdir, "")
end