# ina_diachrony.jl
# SPDX-License-Identifier: CECILL-2.1

# Absolute filesystem locations of the corpus. Neither constant is referenced
# in the visible part of this file: the functions below all receive their
# directories as arguments.
# NOTE(review): presumably the audio directory (AUDIO_PATH) and the directory
# of Whisper transcriptions (TRANSCRIPTION_PATH) meant to be passed by callers
# — confirm against the call sites.
const AUDIO_PATH = "/vol/work1/rilliard/diachronie/normal"
const TRANSCRIPTION_PATH = "/vol/work1/rilliard/diachronie/whisper_diachronik/fr/tc_trs__modified"


"""
    ina_diachrony_recordings(dir::AbstractString)

Walk `dir` recursively and create one `Recording` for every file whose
extension is exactly `.wav`.

Each recording is keyed by `"ina_diachrony§<file name without extension>"`, is
backed by a `FileAudioSource` pointing at the file, and is created with
`channels = [1]` and `samplerate = 16000`. Two files sharing the same base name
in different sub-directories map to the same key; the one visited last wins.

# Returns
A `Dict` mapping recording ids to `Recording` objects (empty when no `.wav`
file is found).

# Throws
- `ArgumentError` if `dir` is not an existing directory.
"""
function ina_diachrony_recordings(dir::AbstractString)
    isdir(dir) || throw(ArgumentError("expected directory $dir"))

    recordings = Dict()
    for (parent, _, names) in walkdir(dir)
        for name in names
            stem, extension = splitext(name)
            if extension == ".wav"
                recording_id = "ina_diachrony§$(stem)"
                source = FileAudioSource(joinpath(parent, name))
                recordings[recording_id] = Recording(
                    recording_id,
                    source;
                    channels = [1],
                    samplerate = 16000,
                )
            end
        end
    end
    return recordings
end


"""
    ina_diachrony_annotations_whole(dir)

Walk `dir` recursively and create one `Annotation` covering the whole recording
for every file whose extension is exactly `.wav`.

The file name without extension is split on `§` and must provide at least
three fields: `<timeperiod>§<age>_<sex>§<speaker>`. These values are stored in
the annotation data together with the transcription text.

The transcription is read from `<file name>.txt`, looked up first in the
directory that contains the `.wav` file and then, for backward compatibility,
relative to the current working directory. When found, `"text"` holds the
vector of lines of that file; otherwise it is the empty string.

# Returns
A `Dict` mapping annotation ids (`"ina_diachrony§<file name>§0"`) to
`Annotation` objects.

# Throws
- `ArgumentError` if `dir` is not an existing directory.
- `BoundsError` if a `.wav` file name does not provide the expected fields.
"""
function ina_diachrony_annotations_whole(dir)
    isdir(dir) || throw(ArgumentError("expected directory $dir"))

    annotations = Dict()

    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".wav" && continue

            metadata = split(filename, "§")
            timeperiod = metadata[1]
            age, sex = split(metadata[2], "_")
            speaker = metadata[3]

            id = "ina_diachrony§$(filename)"

            # Extract the text. The transcription is expected next to the audio
            # file, so the path must be built from `root`: a bare file name
            # would be resolved against the current working directory and
            # would practically never match while walking a directory tree.
            textfilename = joinpath(root, "$(filename).txt")
            if !isfile(textfilename)
                # Legacy lookup (relative to the working directory), kept so
                # that setups relying on the previous behaviour still work.
                textfilename = "$(filename).txt"
            end
            text = isfile(textfilename) ? readlines(textfilename) : ""

            annotation_id = id * "§0"
            annotations[annotation_id] = Annotation(
                id, # audio id
                annotation_id, # annotation id
                -1,  # start and duration set to -1 mean that we take the whole
                -1,  # recording
                [1], # only 1 channel (mono recording)
                Dict(
                    "text" => text,
                    "speaker" => speaker,
                    "timeperiod" => timeperiod,
                    "age" => age,
                    "sex" => sex,
                ),
            )
        end
    end
    return annotations
end

"""
    ina_diachrony_annotations_csv(dir)

Walk `dir` recursively and create one `Annotation` per data row of every file
whose extension is exactly `.csv`.

The file name without extension is split on `§` and must provide at least
three fields: `<timeperiod>§<age>_<sex>§<speaker>`. The first line of each file
is treated as a header and skipped. Every following non-blank line is split on
the first two commas into `start,end,text`; `start` and `end` are parsed as
`Float64` and the annotation duration is `end - start`. The text may itself
contain commas.

# Returns
A `Dict` mapping annotation ids (`"ina_diachrony§<file name>§<n>"`, where `n`
counts the data rows of the file from 1) to `Annotation` objects.

# Throws
- `ArgumentError` if `dir` is not an existing directory, or if a start/end
  field is not a valid number (raised by `parse`).
- `BoundsError` if a file name or a data row does not provide the expected
  fields.
"""
function ina_diachrony_annotations_csv(dir)
    isdir(dir) || throw(ArgumentError("expected directory $dir"))

    annotations = Dict()

    for (root, _, files) in walkdir(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".csv" && continue

            metadata = split(filename, "§")
            timeperiod = metadata[1]
            age, sex = split(metadata[2], "_")
            speaker = metadata[3]

            id = "ina_diachrony§$(filename)"

            open(joinpath(root, file)) do f
                readline(f) # skip the header row
                line = 1
                for current_line in eachline(f)
                    # Tolerate blank lines (typically a trailing newline at the
                    # end of the file) instead of failing on the split below.
                    isempty(strip(current_line)) && continue

                    start_field, end_field, text = split(current_line, ","; limit=3)
                    start_time = parse(Float64, start_field)
                    duration = parse(Float64, end_field) - start_time

                    # Same "§<index>" suffix scheme as the whole-recording
                    # annotations, which use index 0.
                    annotation_id = "$(id)§$(line)"

                    # Key by the annotation id: keying by the recording id
                    # would keep only the last row of each file.
                    annotations[annotation_id] = Annotation(
                        id, # audio id
                        annotation_id, # annotation id
                        start_time,  # start
                        duration,  # duration
                        [1], # only 1 channel (mono recording)
                        Dict(
                            "text" => text,
                            "speaker" => speaker,
                            "timeperiod" => timeperiod,
                            "age" => age,
                            "sex" => sex,
                        ),
                    )
                    line += 1
                end
            end
        end
    end
    return annotations
end


"""
    ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)

Generate the `recordings.jsonl` and `annotations.jsonl` manifests in
`outputdir` (created if needed) from the audio files found under `ina_wav_dir`.

Both `ina_wav_dir` and `ina_csv_dir` must be existing directories, but only
whole-recording annotations built from `ina_wav_dir` are written: merging the
per-row CSV annotations is currently disabled.

# Returns
Whatever the final `writemanifest` call on the annotations manifest returns.

# Throws
- `ArgumentError` if `ina_wav_dir` or `ina_csv_dir` is not an existing
  directory; in that case nothing is created on disk.
"""
function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    # Reject invalid inputs before touching the filesystem.
    for d in (ina_wav_dir, ina_csv_dir)
        isdir(d) || throw(ArgumentError("invalid path $(d)"))
    end

    # Make sure the destination exists and drop any stale recordings manifest.
    target = mkpath(outputdir)
    recordings_manifest = joinpath(target, "recordings.jsonl")
    rm(recordings_manifest, force=true)

    # Recordings manifest.
    @info "Extracting recordings from $ina_wav_dir"
    recordings = ina_diachrony_recordings(ina_wav_dir)
    open(recordings_manifest, "w") do io
        writemanifest(io, recordings)
    end

    # Annotations manifest (whole-recording annotations only for now).
    @info "Extracting annotations from $ina_wav_dir"
    annotations = ina_diachrony_annotations_whole(ina_wav_dir)
    #@info "Extracting annotations from $ina_csv_dir"
    #csv_annotations = ina_diachrony_annotations_csv(ina_csv_dir)
    #annotations = merge(annotations, csv_annotations)

    manifestpath = joinpath(target, "annotations.jsonl")
    @info "Creating $manifestpath"
    return open(manifestpath, "w") do io
        writemanifest(io, annotations)
    end
end

"""
    INADIACHRONY(ina_wav_dir, ina_csv_dir, outputdir)

Load the dataset stored in `outputdir`, generating its manifests first when
needed.

If either `recordings.jsonl` or `annotations.jsonl` is missing from
`outputdir`, `ina_diachrony_prepare` is called to (re)create both. The result
of `dataset`, called with `outputdir` and an empty string, is returned.
"""
function INADIACHRONY(ina_wav_dir, ina_csv_dir, outputdir)
    manifests = ("recordings.jsonl", "annotations.jsonl")
    prepared = all(name -> isfile(joinpath(outputdir, name)), manifests)
    prepared || ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    return dataset(outputdir, "")
end