# speech2tex.jl

# SPDX-License-Identifier: CECILL-C

"""
    speech2tex_recordings(dir::AbstractString)

Walk `dir` recursively and build one `Recording` per file whose extension is exactly
`.wav`.

The recording id is the file name without its extension. Each recording wraps a
`FileAudioSource` pointing at the file and is declared with channel `1` and a sample
rate of 48000.

Returns a `Dict` mapping recording id to `Recording`. If two files in different
sub-directories share the same name, the one visited last wins.
"""
function speech2tex_recordings(dir::AbstractString)
    checkdir(dir)

    manifest = Dict()
    for (parent, _, entries) in walkdir(dir), entry in entries
        stem, suffix = splitext(entry)
        if suffix == ".wav"
            source = FileAudioSource(joinpath(parent, entry))
            manifest[stem] = Recording(
                stem,
                source;
                channels = [1],
                samplerate = 48000
            )
        end
    end
    manifest
end

"""
    extract_digits(str::AbstractString)

Return the decimal digits (0-9) of `str`, in their original order, with every other
character dropped.
"""
extract_digits(str::AbstractString) = filter(isdigit, str)

"""
    isnumber(str::AbstractString)

Return `true` when every character of `str` is a decimal digit. The empty string
counts as a number, since it contains no non-digit character.
"""
isnumber(str::AbstractString) = all(isdigit, str)

"""
    speech2tex_get_metadata(filename)

Parse the metadata encoded in a recording name (given without its extension).

The name is split on `_`. Supported layouts:

- `line123_p1`
- `line123_124_p1`
- `line123_p1_part2`
- `line123_124_p1_part2` (not observed in the data, but handled)

# Returns
A `Dict` with the keys
- `"line"`: the digits of the first field, followed by `_` and the second field when
  that second field consists only of digits;
- `"speaker"`: the field that follows the line number(s);
- `"part"`: the digits of the last field, present only when that field contains `part`.

# Throws
- `ArgumentError` if the name does not have enough `_`-separated fields to hold a line
  and a speaker (for instance a stray file that does not follow the naming scheme).
"""
function speech2tex_get_metadata(filename)
    fields = split(filename, "_")

    # A second field made only of digits means the recording spans two line numbers,
    # which pushes the speaker to the third field.
    spans_two_lines = length(fields) >= 2 && all(isdigit, fields[2])
    speaker_idx = spans_two_lines ? 3 : 2

    # Fail with a message that names the offending file instead of letting the
    # indexing below raise an anonymous BoundsError.
    if length(fields) < speaker_idx
        throw(ArgumentError(
            "cannot parse speech2tex metadata from \"$filename\": expected a name " *
            "like line123_p1, line123_124_p1 or line123_p1_part2"
        ))
    end

    metadata = Dict()
    first_line = filter(isdigit, fields[1])
    metadata["line"] = spans_two_lines ? first_line * "_" * fields[2] : first_line
    metadata["speaker"] = fields[speaker_idx]
    if occursin("part", fields[end])
        metadata["part"] = filter(isdigit, fields[end])
    end
    return metadata
end


"""
    speech2tex_annotations(audiodir, transcriptiondir, texdir)

Build one `Annotation` per `.wav` file found recursively under `audiodir`.

For a recording named `<name>.wav` the annotation metadata contains:
- the fields parsed from `<name>` by `speech2tex_get_metadata`;
- `"transcription"`: the content of `<name>.txt` in `transcriptiondir`;
- `"latex"`: the content of `<name>.txt` in `texdir`.

A text file that does not exist yields an empty string. Text files are looked up
directly inside `transcriptiondir` / `texdir`, whatever sub-directory of `audiodir`
the recording was found in.

Returns a `Dict` mapping the recording name to its `Annotation`.
"""
function speech2tex_annotations(audiodir, transcriptiondir, texdir)
    checkdir.([audiodir, transcriptiondir, texdir])

    # (metadata key, directory) pairs, built once. A tuple is used rather than a Dict
    # keyed by directory so that both entries survive when the two directories are the
    # same path.
    textsources = (("transcription", transcriptiondir), ("latex", texdir))

    annotations = Dict()

    for (root, subdirs, files) in walkdir(audiodir)
        for file in files
            filename, ext = splitext(file)
            ext != ".wav" && continue

            # Metadata is parsed from the file name itself.
            metadata = speech2tex_get_metadata(filename)

            # Transcription and LaTeX share the recording's name, with a .txt extension.
            for (label, d) in textsources
                textfilepath = joinpath(d, "$filename.txt")
                metadata[label] =
                    isfile(textfilepath) ? join(readlines(textfilepath), "\n") : ""
            end

            id = filename
            annotations[id] = Annotation(
                id, # audio id
                id, # annotation id
                -1,  # start and duration of -1 mean that we take the whole
                -1,  # recording
                [1], # only 1 channel (mono recording)
                metadata # additional information
            )
        end
    end
    return annotations
end

"""
    speech2tex_prepare(datadir, outputdir)

Generate the manifests of the dataset stored in `datadir` and write them to
`outputdir` (created if needed).

Recordings are read from `datadir/audio`, transcriptions from `datadir/sequences` and
LaTeX sources from `datadir/latex`. Two files are written with `writemanifest`:
`recordings.jsonl` and `annotations.jsonl`.
"""
function speech2tex_prepare(datadir, outputdir)
    # Validate the data directory
    checkdir(datadir)

    # Create the output directory and drop any previous recordings manifest before the
    # extraction starts.
    outputdir = mkpath(outputdir)
    recordings_manifest = joinpath(outputdir, "recordings.jsonl")
    annotations_manifest = joinpath(outputdir, "annotations.jsonl")
    rm(recordings_manifest, force=true)

    # Recordings
    recordings_path = joinpath(datadir, "audio")
    @info "Extracting recordings from $recordings_path"
    recordings = speech2tex_recordings(recordings_path)

    @info "Creating $recordings_manifest"
    open(recordings_manifest, "w") do f
        writemanifest(f, recordings)
    end

    # Annotations
    transcriptiondir = joinpath(datadir, "sequences")
    texdir = joinpath(datadir, "latex")
    @info "Extracting annotations from $transcriptiondir and $texdir"
    annotations = speech2tex_annotations(recordings_path, transcriptiondir, texdir)

    @info "Creating $annotations_manifest"
    open(annotations_manifest, "w") do f
        writemanifest(f, annotations)
    end
end


"""
    SPEECH2TEX(datadir, outputdir)

Load the dataset whose manifests live in `outputdir`.

When `recordings.jsonl` or `annotations.jsonl` is missing from `outputdir`, the
manifests are first generated from `datadir` with `speech2tex_prepare`. The result of
`dataset(outputdir, "")` is returned.
"""
function SPEECH2TEX(datadir, outputdir)
    manifests = (
        joinpath(outputdir, "recordings.jsonl"),
        joinpath(outputdir, "annotations.jsonl"),
    )
    # Only run the preparation step when at least one manifest is absent.
    all(isfile, manifests) || speech2tex_prepare(datadir, outputdir)
    return dataset(outputdir, "")
end