# SPDX-License-Identifier: CECILL-C

"""
    speech2tex_recordings(dir::AbstractString)

Walk `dir` recursively and return a `Dict` mapping the base name (file name
without extension) of every `.wav` file to a `Recording` backed by a
`FileAudioSource` pointing at that file.

`dir` is validated with `checkdir` first. Every recording is declared with
`channels = [1]` and `samplerate = 48000`. Because the base name is the key,
two files with the same name in different sub-directories map to the same
entry and the last one visited wins.
"""
function speech2tex_recordings(dir::AbstractString)
    checkdir(dir)

    recordings = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            id, ext = splitext(file)
            ext == ".wav" || continue

            audio_src = FileAudioSource(joinpath(root, file))
            recordings[id] = Recording(
                id,
                audio_src;
                channels = [1],
                samplerate = 48000
            )
        end
    end
    return recordings
end

"""
    extract_digits(str::AbstractString)

Return the digit characters of `str`, in order, with everything else removed.
"""
extract_digits(str::AbstractString) = filter(isdigit, str)

"""
    isnumber(str::AbstractString)

Return `true` when `str` contains nothing but digit characters. The empty
string also yields `true`.
"""
isnumber(str::AbstractString) = all(isdigit, str)

"""
    speech2tex_get_metadata(filename)

Parse a recording base name into a metadata `Dict`.

Supported layouts (tokens separated by `_`):
`line123_p1`, `line123_124_p1`, `line123_p1_part2` and `line123_124_p1_part2`.

The returned dictionary holds:
- `"line"`: the digits of the first token, followed by `_` and the second
  token when that second token is all digits;
- `"speaker"`: the token following the line token(s);
- `"part"`: the digits of the last token, only when a token follows the
  speaker and contains `part`.

# Throws
- `ArgumentError` when `filename` has too few tokens to hold a speaker.
"""
function speech2tex_get_metadata(filename)
    tokens = split(filename, "_")

    # A numeric second token means the line is a range ("line123_124_..."),
    # which shifts the speaker to the third token.
    has_line_range = length(tokens) >= 2 && isnumber(tokens[2])
    speaker_idx = has_line_range ? 3 : 2

    if length(tokens) < speaker_idx
        throw(ArgumentError(
            "cannot parse Speech2Tex file name \"$filename\": expected " *
            "<line>[_<line>]_<speaker>[_part<N>]"
        ))
    end

    metadata = Dict()
    first_line = extract_digits(tokens[1])
    metadata["line"] = has_line_range ? first_line * "_" * tokens[2] : first_line
    metadata["speaker"] = tokens[speaker_idx]

    # Only a token located after the speaker can be a part marker; otherwise a
    # speaker id containing "part" would be reported as a part number too.
    if length(tokens) > speaker_idx && occursin("part", tokens[end])
        metadata["part"] = extract_digits(tokens[end])
    end
    return metadata
end

"""
    speech2tex_annotations(audiodir, transcriptiondir, texdir)

Build one `Annotation` per `.wav` file found recursively under `audiodir` and
return them in a `Dict` keyed by the file's base name.

Each annotation's metadata is the result of `speech2tex_get_metadata` on the
base name, extended with:
- `"transcription"`: content of `<base name>.txt` in `transcriptiondir`;
- `"latex"`: content of `<base name>.txt` in `texdir`.

A missing text file yields an empty string. The three directories are
validated with `checkdir` first.
"""
function speech2tex_annotations(audiodir, transcriptiondir, texdir)
    foreach(checkdir, (audiodir, transcriptiondir, texdir))

    # A tuple of pairs (rather than a Dict keyed by directory) keeps both
    # labels even when the two directories are the same path, and makes the
    # processing order deterministic.
    text_sources = (transcriptiondir => "transcription", texdir => "latex")

    annotations = Dict()
    for (_, _, files) in walkdir(audiodir)
        for file in files
            filename, ext = splitext(file)
            ext == ".wav" || continue

            # Metadata encoded in the file name (line, speaker, part).
            metadata = speech2tex_get_metadata(filename)

            # Transcription and LaTeX live in same-named .txt files.
            for (d, label) in text_sources
                textfilepath = joinpath(d, "$filename.txt")
                metadata[label] = if isfile(textfilepath)
                    join(readlines(textfilepath), "\n")
                else
                    ""
                end
            end

            id = filename
            annotations[id] = Annotation(
                id,       # audio id
                id,       # annotation id
                -1,       # start and duration of -1 mean that we take the
                -1,       # whole recording
                [1],      # only 1 channel (mono recording)
                metadata  # additional information
            )
        end
    end
    return annotations
end

"""
    speech2tex_prepare(datadir, outputdir)

Generate the `recordings.jsonl` and `annotations.jsonl` manifests in
`outputdir` from the corpus stored in `datadir`.

Audio is read from `datadir/audio`, transcriptions from `datadir/sequences`
and LaTeX sources from `datadir/latex`. `datadir` is validated with `checkdir`
and `outputdir` is created when missing. Returns whatever the final manifest
write returns.
"""
function speech2tex_prepare(datadir, outputdir)
    # Validate the data directory.
    checkdir(datadir)

    # Create the output directory and drop any stale recordings manifest.
    outputdir = mkpath(outputdir)
    rm(joinpath(outputdir, "recordings.jsonl"), force=true)

    # Recordings
    recordings_path = joinpath(datadir, "audio")
    @info "Extracting recordings from $recordings_path"
    recordings = speech2tex_recordings(recordings_path)

    manifestpath = joinpath(outputdir, "recordings.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, recordings)
    end

    # Annotations
    transcriptiondir = joinpath(datadir, "sequences")
    texdir = joinpath(datadir, "latex")
    @info "Extracting annotations from $transcriptiondir and $texdir"
    annotations = speech2tex_annotations(recordings_path, transcriptiondir, texdir)

    manifestpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, annotations)
    end
end

"""
    SPEECH2TEX(datadir, outputdir)

Return `dataset(outputdir, "")`, first running `speech2tex_prepare` when
either `recordings.jsonl` or `annotations.jsonl` is missing from `outputdir`.
"""
function SPEECH2TEX(datadir, outputdir)
    manifests = (
        joinpath(outputdir, "recordings.jsonl"),
        joinpath(outputdir, "annotations.jsonl"),
    )
    all(isfile, manifests) || speech2tex_prepare(datadir, outputdir)
    return dataset(outputdir, "")
end