Newer
Older
# SPDX-License-Identifier: CECILL-C
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
    speech2tex_recordings(dir::AbstractString)

Collect every `.wav` file found under `dir` (sub-directories included) into a
dictionary of `Recording`s keyed by the file name without its extension.

`dir` is first passed to `checkdir`. Each recording is built from a
`FileAudioSource` on the file path, with channel `[1]` and a sample rate of
48000.
"""
function speech2tex_recordings(dir::AbstractString)
    checkdir(dir)

    recordings = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            id, ext = splitext(file)
            # Only audio files are turned into recordings.
            if ext == ".wav"
                recordings[id] = Recording(
                    id,
                    FileAudioSource(joinpath(root, file));
                    channels = [1],
                    samplerate = 48000
                )
            end
        end
    end
    return recordings
end
"""
    extract_digits(str::AbstractString)

Return the digit characters of `str`, in their original order, with every
other character removed.
"""
function extract_digits(str::AbstractString)
    return filter(isdigit, str)
end
"""
    isnumber(str::AbstractString)

Return `true` when every character of `str` is a digit, `false` otherwise.

The empty string contains no non-digit character and therefore yields `true`.
"""
# `all` stops at the first non-digit and avoids building a filtered copy of
# the string just to compare it with the input.
isnumber(str::AbstractString) = all(isdigit, str)
"""
    speech2tex_get_metadata(filename)

Parse the `_`-separated fields of a recording file name (without extension)
and return them as a dictionary.

Supported shapes: `line123_p1`, `line123_124_p1`, `line123_p1_part2` and
`line123_124_p1_part2` (the last one has not been observed in the data).

# Returns
A `Dict` with the keys
- `"line"`: the digits of the first field, followed by `_` and the second
  field when that second field is made of digits only;
- `"speaker"`: the field following the line information;
- `"part"`: the digits of the last field, only present when that field
  contains `part`.

# Throws
- `ArgumentError` if `filename` has too few `_`-separated fields to hold both
  a line and a speaker.
"""
function speech2tex_get_metadata(filename)
    fields = split(filename, "_")
    length(fields) >= 2 || throw(ArgumentError(
        "malformed SPEECH2TEX file name \"$filename\": expected `<line>_<speaker>`"
    ))

    metadata = Dict()
    first_line = extract_digits(fields[1])
    if isnumber(fields[2])
        # Two line numbers: the speaker is pushed to the third field.
        length(fields) >= 3 || throw(ArgumentError(
            "malformed SPEECH2TEX file name \"$filename\": missing speaker field"
        ))
        metadata["line"] = first_line * "_" * fields[2]
        metadata["speaker"] = fields[3]
    else
        metadata["line"] = first_line
        metadata["speaker"] = fields[2]
    end

    # The optional part marker, when present, is always the last field.
    if occursin("part", fields[end])
        metadata["part"] = extract_digits(fields[end])
    end
    return metadata
end
"""
    speech2tex_annotations(audiodir, transcriptiondir, texdir)

Build one `Annotation` per `.wav` file found under `audiodir` (sub-directories
included) and return them in a dictionary keyed by the file name without its
extension.

The annotation data is the metadata parsed from the file name by
`speech2tex_get_metadata`, plus two text entries read from `<filename>.txt`:
- `"transcription"`, looked up in `transcriptiondir`;
- `"latex"`, looked up in `texdir`.
A text file that does not exist yields an empty string.

All three directories are passed to `checkdir` before anything is read.
"""
function speech2tex_annotations(audiodir, transcriptiondir, texdir)
    checkdir.([audiodir, transcriptiondir, texdir])

    # (directory, metadata key) pairs. A tuple rather than a `Dict` keyed by
    # directory, so that both keys are still filled when the two directories
    # are the same, and built once rather than once per audio file.
    textsources = ((transcriptiondir, "transcription"), (texdir, "latex"))

    annotations = Dict()
    for (root, subdirs, files) in walkdir(audiodir)
        for file in files
            filename, ext = splitext(file)
            ext != ".wav" && continue

            # extract metadata from the audio file name
            metadata = speech2tex_get_metadata(filename)

            # extract transcription and tex (same filenames but .txt)
            for (d, label) in textsources
                textfilepath = joinpath(d, "$filename.txt")
                metadata[label] = isfile(textfilepath) ? join(readlines(textfilepath), "\n") : ""
            end

            id = filename
            # generate annotation
            annotations[id] = Annotation(
                id, # audio id
                id, # annotation id
                -1, # start and duration is -1 means that we take the whole
                -1, # recording
                [1], # only 1 channel (mono recording)
                metadata # additional informations
            )
        end
    end
    return annotations
end
"""
    speech2tex_prepare(datadir, outputdir)

Generate the `recordings.jsonl` and `annotations.jsonl` manifests of the
dataset stored in `datadir` and write them into `outputdir`.

`datadir` is passed to `checkdir`, and `outputdir` is created if needed.
Audio is read from `datadir/audio`, transcriptions from `datadir/sequences`
and LaTeX sources from `datadir/latex`.
"""
function speech2tex_prepare(datadir, outputdir)
    # Validate the data directory
    checkdir(datadir)

    # Create the output directory.
    outputdir = mkpath(outputdir)
    rm(joinpath(outputdir, "recordings.jsonl"), force=true)

    # Recordings
    recordings_path = joinpath(datadir, "audio")
    @info "Extracting recordings from $recordings_path"
    recordings = speech2tex_recordings(recordings_path)

    manifestpath = joinpath(outputdir, "recordings.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, recordings)
    end

    # Annotations
    transcriptiondir = joinpath(datadir, "sequences")
    texdir = joinpath(datadir, "latex")
    @info "Extracting annotations from $transcriptiondir and $texdir"
    annotations = speech2tex_annotations(recordings_path, transcriptiondir, texdir)

    manifestpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, annotations)
    end
end
"""
    SPEECH2TEX(datadir, outputdir)

Return `dataset(outputdir, "")`, first running
`speech2tex_prepare(datadir, outputdir)` unless both `recordings.jsonl` and
`annotations.jsonl` already exist in `outputdir`.
"""
function SPEECH2TEX(datadir, outputdir)
    manifests = ("recordings.jsonl", "annotations.jsonl")
    prepared = all(name -> isfile(joinpath(outputdir, name)), manifests)
    prepared || speech2tex_prepare(datadir, outputdir)
    return dataset(outputdir, "")
end