# ina_diachrony.jl
# SPDX-License-Identifier: CECILL-2.1

# Hard-coded absolute paths, presumably the default locations of the corpus audio
# and of its transcriptions on the original author's storage volume.
# NOTE(review): neither constant is referenced by the code visible in this file —
# confirm whether anything elsewhere relies on them.
const AUDIO_PATH = "/vol/work1/rilliard/diachronie/normal"
const TRANSCRIPTION_PATH = "/vol/work1/rilliard/diachronie/whisper_diachronik/fr/tc_trs__modified"

"""
    checkdir(dir::AbstractString)

Return `true` when `dir` is an existing directory; otherwise throw an
`ArgumentError` naming the offending path.
"""
function checkdir(dir::AbstractString)
    if !isdir(dir)
        throw(ArgumentError("$dir is not an existing directory"))
    end
    return true
end

"""
    ina_diachrony_recordings(dir::AbstractString)

Walk `dir` recursively and create one `Recording` per file whose extension is
exactly `.wav`.

The result is a `Dict` keyed by recording id, where the id is `"ina_diachrony§"`
followed by the file name without its extension. Every recording is declared with
channel `1` and a sample rate of 16000.

Throws an `ArgumentError` if `dir` is not an existing directory.
"""
function ina_diachrony_recordings(dir::AbstractString)
    checkdir(dir)

    recordings = Dict()
    for (root, _, files) in walkdir(dir)
        # Keep only the wav files of the current directory, in walk order.
        wavfiles = filter(f -> last(splitext(f)) == ".wav", files)
        for wav in wavfiles
            stem = first(splitext(wav))
            id = "ina_diachrony§$stem"
            source = FileAudioSource(joinpath(root, wav))
            recordings[id] = Recording(id, source; channels = [1], samplerate = 16000)
        end
    end
    return recordings
end


"""
    get_metadata(filename)

Split a file name of the form `"timeperiod§age_sex§speaker"` into its parts and
return them as the tuple `(timeperiod, age, sex, speaker)`.

The name is cut on `"§"`; the second field is cut again on `"_"` to obtain the age
and the sex. A name with too few fields raises a `BoundsError`.
"""
function get_metadata(filename)
    fields = split(filename, "§")
    age_sex = split(fields[2], "_")
    return fields[1], age_sex[1], age_sex[2], fields[3]
end


"""
    ina_diachrony_annotations_whole(dir)

Build one whole-recording `Annotation` per `.wav` file found under `dir`
(searched recursively).

For each wav file, the speaker metadata is decoded from the file name with
`get_metadata`, and the transcription is read from the `.txt` file that has the
same name in the same directory (empty text when that file does not exist).

The result is a `Dict` keyed by annotation id. The annotation id is the recording
id (`"ina_diachrony§"` followed by the file name without extension) with the
suffix `"§0"` appended, which keeps it distinct from the recording id itself.

Throws an `ArgumentError` if `dir` is not an existing directory.
"""
function ina_diachrony_annotations_whole(dir)
    checkdir(dir)

    annotations = Dict()

    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".wav" && continue

            # extract metadata from filename
            timeperiod, age, sex, speaker = get_metadata(filename)

            # extract transcription text (same filename but .txt)
            textfilepath = joinpath(root, filename * ".txt")
            text = isfile(textfilepath) ? join(readlines(textfilepath), "\n") : ""

            id = "ina_diachrony§$filename"
            # The annotation id was previously never defined, which raised an
            # UndefVarError on the first wav file. Segment 0 denotes the whole file.
            annotation_id = id * "§0"

            annotations[annotation_id] = Annotation(
                id, # audio id
                annotation_id, # annotation id
                -1,  # start and duration of -1 mean that we take the whole
                -1,  # recording
                [1], # only 1 channel (mono recording)
                Dict( # additional information
                    "text" => text,
                    "speaker" => speaker,
                    "timeperiod" => timeperiod,
                    "age" => age,
                    "sex" => sex,
                )
            )
        end
    end
    return annotations
end

"""
    ina_diachrony_annotations_csv(dir)

Build one `Annotation` per data row of every `.csv` file found under `dir`
(searched recursively).

Each csv file is expected to start with a header line, which is skipped, followed
by rows of the form `start,end,text`. Only the first two commas separate fields,
so the text itself may contain commas. The annotation starts at `start` and lasts
`end - start`. Blank rows are ignored.

The speaker metadata is decoded from the file name with `get_metadata`. The result
is a `Dict` keyed by annotation id: the recording id (`"ina_diachrony§"` followed
by the file name without extension), then `"§"`, then the 1-based index of the row
among the data rows of its file.

Throws an `ArgumentError` if `dir` is not an existing directory.
"""
function ina_diachrony_annotations_csv(dir)
    checkdir(dir)

    annotations = Dict()

    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".csv" && continue

            # extract metadata from filename
            timeperiod, age, sex, speaker = get_metadata(filename)
            id = "ina_diachrony§$filename"

            # generate one annotation for each data row of the csv
            open(joinpath(root, file)) do f
                readline(f) # skip the header row
                line = 1
                for current_line in eachline(f)
                    # A blank row has no fields to destructure; ignore it.
                    isempty(strip(current_line)) && continue

                    start_field, end_field, text = split(current_line, ",", limit=3)
                    start_time = parse(Float64, start_field)
                    duration = parse(Float64, end_field) - start_time

                    annotation_id = id * "§" * string(line)
                    # Store under the annotation id: keying by the recording id
                    # made every row of a file overwrite the previous one.
                    annotations[annotation_id] = Annotation(
                        id, # audio id
                        annotation_id, # annotation id
                        start_time,  # start
                        duration,  # duration
                        [1], # only 1 channel (mono recording)
                        Dict( # additional information
                            "text" => text,
                            "speaker" => speaker,
                            "timeperiod" => timeperiod,
                            "age" => age,
                            "sex" => sex,
                        )
                    )
                    line += 1
                end
            end
        end
    end
    return annotations
end


"""
    ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)

Write the `recordings.jsonl` and `annotations.jsonl` manifests of the corpus into
`outputdir`, creating that directory if needed.

Recordings and whole-file annotations are extracted from `ina_wav_dir`. When
`ina_csv_dir` is not `nothing`, the annotations extracted from it are merged on
top of the whole-file ones. Any directory argument that is not `nothing` must
exist, otherwise an `ArgumentError` is thrown.
"""
function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    # Validate the data directories that were actually provided.
    for d in (ina_wav_dir, ina_csv_dir)
        d === nothing && continue
        checkdir(d)
    end

    # Create the output directory and drop any stale recordings manifest.
    outputdir = mkpath(outputdir)
    recordings_manifest = joinpath(outputdir, "recordings.jsonl")
    annotations_manifest = joinpath(outputdir, "annotations.jsonl")
    rm(recordings_manifest, force=true)

    # Recordings
    @info "Extracting recordings from $ina_wav_dir"
    recordings = ina_diachrony_recordings(ina_wav_dir)
    open(io -> writemanifest(io, recordings), recordings_manifest, "w")

    # Annotations
    @info "Extracting annotations from $ina_wav_dir"
    annotations = ina_diachrony_annotations_whole(ina_wav_dir)
    if ina_csv_dir !== nothing
        @info "Extracting annotations from $ina_csv_dir"
        annotations = merge(annotations, ina_diachrony_annotations_csv(ina_csv_dir))
    end

    @info "Creating $annotations_manifest"
    return open(io -> writemanifest(io, annotations), annotations_manifest, "w")
end

"""
    INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)

Return the dataset stored in `outputdir`, preparing it first when needed.

The manifests are generated with `ina_diachrony_prepare` unless both
`recordings.jsonl` and `annotations.jsonl` already exist in `outputdir`.
"""
function INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)
    manifests = ("recordings.jsonl", "annotations.jsonl")
    prepared = all(name -> isfile(joinpath(outputdir, name)), manifests)
    prepared || ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    return dataset(outputdir, "")
end