Skip to content
Snippets Groups Projects
ina_diachrony.jl 4.61 KiB
Newer Older
  • Learn to ignore specific revisions
    # SPDX-License-Identifier: CECILL-C
    
    # Identifier for the "INA Diachrony" dataset, obtained from `get_nametype`. It is used
    # as the type parameter of `DatasetBuilder` in the `prepare` method of this file.
    # NOTE(review): `get_nametype` is defined elsewhere; the kind of value it returns is
    # not visible from here.
    const ina_diachrony_id = get_nametype("INA Diachrony")
    
    
    """
        ina_diachrony_recordings(dir::AbstractString)

    Collect one `Recording` per `.wav` file found under `dir` (searched recursively with
    `walkdir`).

    `dir` is first passed to `checkdir`. The result is a `Dict` keyed by the recording id
    `"ina_diachrony§<file name without extension>"`. Every recording is declared with
    `channels = [1]` and `samplerate = 16000`; these values are hard-coded here and are not
    read from the audio files.
    """
    function ina_diachrony_recordings(dir::AbstractString)
        checkdir(dir)

        recordings = Dict()
        for (root, _, files) in walkdir(dir)
            for file in files
                filename, ext = splitext(file)
                ext != ".wav" && continue

                id = "ina_diachrony§$filename"

                # `file` is only the base name; the audio source needs the location of the
                # file inside the directory currently being walked.
                path = joinpath(root, file)
                audio_src = AudioSource.FileAudioSource(path)

                recordings[id] = Recording(
                    id,
                    audio_src;
                    channels = [1],
                    samplerate = 16000
                )
            end
        end
        return recordings
    end
    
    
    
    """
        ina_diachrony_get_metadata(filename)

    Extract speaker metadata encoded in `filename`.

    `filename` is split on `"§"`; the first field is the time period, the second is
    `<age>_<sex>` and the third is the speaker. Additional fields are ignored.

    Return a `Dict` with the keys `"speaker"`, `"timeperiod"`, `"age"` and `"sex"`.

    # Throws
    - `ArgumentError` if `filename` has fewer than three `"§"`-separated fields, or if the
      second field does not contain an `"_"` separator.
    """
    function ina_diachrony_get_metadata(filename)
        fields = split(filename, "§")
        if length(fields) < 3
            throw(ArgumentError(
                "expected a name of the form <timeperiod>§<age>_<sex>§<speaker>, " *
                "got \"$filename\""
            ))
        end

        age_sex = split(fields[2], "_")
        if length(age_sex) < 2
            throw(ArgumentError(
                "expected <age>_<sex> in the second field of \"$filename\", " *
                "got \"$(fields[2])\""
            ))
        end
        age, sex = age_sex

        return Dict(
            "speaker" => fields[3],
            "timeperiod" => fields[1],
            "age" => age,
            "sex" => sex,
        )
    end

    """
        ina_diachrony_annotations_whole(dir)

    Build one whole-recording `Annotation` per `.wav` file found under `dir` (searched
    recursively with `walkdir`).

    For each audio file the metadata is parsed from the file name with
    `ina_diachrony_get_metadata`, and a `"text"` entry is added: the content of the `.txt`
    file with the same base name in the same directory (lines joined with `"\\n"`), or `""`
    when no such file exists.

    Return a `Dict` mapping each annotation id to its `Annotation`.
    """
    function ina_diachrony_annotations_whole(dir)
        checkdir(dir)

        annotations = Dict()
        for (root, _, files) in walkdir(dir)
            for file in files
                filename, ext = splitext(file)
                ext != ".wav" && continue

                # extract metadata from filename
                metadata = ina_diachrony_get_metadata(filename)

                # extract transcription text (same filename but .txt)
                textfilepath = joinpath(root, "$filename.txt")
                metadata["text"] =
                    isfile(textfilepath) ? join(readlines(textfilepath), "\n") : ""

                id = "ina_diachrony§$filename"
                # NOTE(review): the "§0" suffix is a convention chosen here to give the
                # whole-recording annotation an id distinct from the recording id; confirm
                # it matches what downstream consumers expect.
                annotation_id = id * "§0"

                # generate annotation
                annotations[annotation_id] = Annotation(
                    id, # audio id
                    annotation_id, # annotation id
                    -1,  # start and duration is -1 means that we take the whole
                    -1,  # recording
                    [1], # only 1 channel (mono recording)
                    metadata # additional information
                )
            end
        end
        return annotations
    end

    """
        ina_diachrony_annotations_csv(dir)

    Build one `Annotation` per data row of every `.csv` file found under `dir` (searched
    recursively with `walkdir`).

    The first line of each file is skipped as a header. Every following non-blank row is
    split on `","` into at most three fields: start time, end time and text (the text may
    itself contain commas). The annotation duration is `end - start`. File-level metadata
    is parsed from the file name with `ina_diachrony_get_metadata`; each annotation gets its
    own copy of it with the row text stored under `"text"`.

    Return a `Dict` mapping `"ina_diachrony§<file name>§<row number>"` to its `Annotation`,
    where rows are numbered from 1 starting after the header.

    # Throws
    - `ArgumentError` if a non-blank row has fewer than three fields, or (from `parse`) if
      a time field is not a valid `Float64`.
    """
    function ina_diachrony_annotations_csv(dir)
        checkdir(dir)

        annotations = Dict()
        for (root, _, files) in walkdir(dir)
            for file in files
                filename, ext = splitext(file)
                ext != ".csv" && continue

                # extract metadata from filename
                metadata = ina_diachrony_get_metadata(filename)
                id = "ina_diachrony§$filename"
                csvpath = joinpath(root, file)

                # generate annotation for each line in csv
                open(csvpath) do f
                    readline(f)  # skip the header line
                    for (line, current_line) in enumerate(eachline(f))
                        # Tolerate blank lines (e.g. a trailing newline at end of file).
                        isempty(strip(current_line)) && continue

                        fields = split(current_line, ","; limit=3)
                        if length(fields) != 3
                            throw(ArgumentError(
                                "$csvpath: row $line does not have the form " *
                                "<start>,<end>,<text>"
                            ))
                        end
                        start_time = parse(Float64, fields[1])
                        duration = parse(Float64, fields[2]) - start_time

                        # Copy so that each annotation keeps its own text instead of all
                        # rows sharing (and overwriting) one dictionary.
                        row_metadata = copy(metadata)
                        row_metadata["text"] = fields[3]

                        # One key per row; keying by the recording id alone would leave
                        # only the last row of each file in the result.
                        annotation_id = "$(id)§$(line)"

                        annotations[annotation_id] = Annotation(
                            id, # audio id
                            annotation_id, # annotation id
                            start_time,  # start
                            duration,  # duration
                            [1], # only 1 channel (mono recording)
                            row_metadata # additional information
                        )
                    end
                end
            end
        end
        return annotations
    end

    """
        prepare(::DatasetBuilder{ina_diachrony_id}, ina_wav_dir, outputdir; ina_csv_dir=nothing)

    Write the `recordings.jsonl` and `annotations.jsonl` manifests for the INA Diachrony
    dataset into `outputdir`.

    Recordings and whole-recording annotations are extracted from `ina_wav_dir`. When
    `ina_csv_dir` is given, the annotations extracted from it are merged in; on identical
    keys the CSV annotation replaces the whole-recording one (semantics of `merge`).
    `outputdir` is created if needed. Returns whatever the final `writemanifest` call
    returns.
    """
    function prepare(::DatasetBuilder{ina_diachrony_id}, ina_wav_dir, outputdir; ina_csv_dir=nothing)
        # Check each input directory that was actually supplied.
        isnothing(ina_wav_dir) || checkdir(ina_wav_dir)
        isnothing(ina_csv_dir) || checkdir(ina_csv_dir)

        # Make sure the destination exists and remove any previous recordings manifest.
        outputdir = mkpath(outputdir)
        rm(joinpath(outputdir, "recordings.jsonl"), force=true)

        # Recordings manifest
        @info "Extracting recordings from $ina_wav_dir"
        recordings = ina_diachrony_recordings(ina_wav_dir)
        open(io -> writemanifest(io, recordings), joinpath(outputdir, "recordings.jsonl"), "w")

        # Annotations manifest: whole-recording annotations, plus CSV ones when available.
        @info "Extracting annotations from $ina_wav_dir"
        annotations = ina_diachrony_annotations_whole(ina_wav_dir)
        if ina_csv_dir !== nothing
            @info "Extracting annotations from $ina_csv_dir"
            annotations = merge(annotations, ina_diachrony_annotations_csv(ina_csv_dir))
        end

        manifestpath = joinpath(outputdir, "annotations.jsonl")
        @info "Creating $manifestpath"
        return open(io -> writemanifest(io, annotations), manifestpath, "w")
    end