Skip to content
Snippets Groups Projects
Commit 2fea769a authored by Nicolas Denier's avatar Nicolas Denier
Browse files

fix text file reading, update readme

parent b24e4214
No related branches found
No related tags found
No related merge requests found
outputdir/ *outputdir/
...@@ -7,7 +7,7 @@ A Julia package to download and prepare speech corpora. ...@@ -7,7 +7,7 @@ A Julia package to download and prepare speech corpora.
Make sure to add the [FAST registry](https://gitlab.lisn.upsaclay.fr/fast/registry) Make sure to add the [FAST registry](https://gitlab.lisn.upsaclay.fr/fast/registry)
to your julia installation. Then, install the package as usual: to your julia installation. Then, install the package as usual:
``` ```
pkg> add SpeechCorpora pkg> add SpeechDatasets
``` ```
## Example ## Example
...@@ -18,9 +18,13 @@ julia> using SpeechDatasets ...@@ -18,9 +18,13 @@ julia> using SpeechDatasets
julia> dataset = MINILIBRISPEECH("outputdir", :train) # :dev | :test julia> dataset = MINILIBRISPEECH("outputdir", :train) # :dev | :test
... ...
julia> dataset = MINILIBRISPEECH("/path/to/timit/dir", "outputdir", :train) # :dev | :test julia> dataset = TIMIT("/path/to/timit/dir", "outputdir", :train) # :dev | :test
... ...
julia> dataset = INADIACHRONY("/path/to/ina_wav/dir", "outputdir", "/path/to/ina_csv/dir") # ina_csv dir optional
...
julia> for ((signal, fs), supervision) in dataset julia> for ((signal, fs), supervision) in dataset
# do something # do something
end end
...@@ -36,5 +40,5 @@ julia> TIMITDICT("/path/to/timit/dir") ...@@ -36,5 +40,5 @@ julia> TIMITDICT("/path/to/timit/dir")
## License ## License
This software is provided under the CeCILL 2.1 license (see the [`/LICENSE`](/LICENSE) This software is provided under the CeCILL 2.1 license (see the [`/LICENSE`](/LICENSE))
...@@ -3,9 +3,12 @@ ...@@ -3,9 +3,12 @@
const AUDIO_PATH = "/vol/work1/rilliard/diachronie/normal" const AUDIO_PATH = "/vol/work1/rilliard/diachronie/normal"
const TRANSCRIPTION_PATH = "/vol/work1/rilliard/diachronie/whisper_diachronik/fr/tc_trs__modified" const TRANSCRIPTION_PATH = "/vol/work1/rilliard/diachronie/whisper_diachronik/fr/tc_trs__modified"
"""
    checkdir(dir::AbstractString)

Ensure that `dir` refers to an existing directory.

Return `true` when the directory exists; otherwise throw an `ArgumentError`
naming the offending path.
"""
function checkdir(dir::AbstractString)
    if !isdir(dir)
        throw(ArgumentError("$dir is not an existing directory"))
    end
    return true
end
function ina_diachrony_recordings(dir::AbstractString) function ina_diachrony_recordings(dir::AbstractString)
! isdir(dir) && throw(ArgumentError("expected directory $dir")) checkdir(dir)
recordings = Dict() recordings = Dict()
for (root, subdirs, files) in walkdir(dir) for (root, subdirs, files) in walkdir(dir)
...@@ -13,7 +16,7 @@ function ina_diachrony_recordings(dir::AbstractString) ...@@ -13,7 +16,7 @@ function ina_diachrony_recordings(dir::AbstractString)
filename, ext = splitext(file) filename, ext = splitext(file)
ext != ".wav" && continue ext != ".wav" && continue
id = "ina_diachrony§$(filename)" id = "ina_diachrony§$filename"
path = joinpath(root, file) path = joinpath(root, file)
audio_src = FileAudioSource(path) audio_src = FileAudioSource(path)
...@@ -30,8 +33,17 @@ function ina_diachrony_recordings(dir::AbstractString) ...@@ -30,8 +33,17 @@ function ina_diachrony_recordings(dir::AbstractString)
end end
"""
    get_metadata(filename)

Split `filename` on the `§` separator and return the tuple
`(timeperiod, age, sex, speaker)`.

The first field is the time period, the second field is split on `_` into
age and sex, and the third field is the speaker. Any further fields are ignored.
"""
function get_metadata(filename)
    parts = split(filename, "§")
    timeperiod = parts[1]
    # The second field packs two values separated by an underscore.
    agesex = split(parts[2], "_")
    age, sex = agesex
    speaker = parts[3]
    return (timeperiod, age, sex, speaker)
end
function ina_diachrony_annotations_whole(dir) function ina_diachrony_annotations_whole(dir)
! isdir(dir) && throw(ArgumentError("expected directory $dir")) checkdir(dir)
annotations = Dict() annotations = Dict()
...@@ -39,26 +51,24 @@ function ina_diachrony_annotations_whole(dir) ...@@ -39,26 +51,24 @@ function ina_diachrony_annotations_whole(dir)
for file in files for file in files
filename, ext = splitext(file) filename, ext = splitext(file)
ext != ".wav" && continue ext != ".wav" && continue
metadata = split(filename, "§")
timeperiod = metadata[1]
age, sex = split(metadata[2], "_")
speaker = metadata[3]
id = "ina_diachrony§$(filename)"
# extract text
textfilename = "$(filename).txt"
text = isfile(textfilename) ? readlines(textfilename) : ""
# extract metadata from filename
timeperiod, age, sex, speaker = get_metadata(filename)
# extract transcription text (same filename but .txt)
textfilepath = joinpath(root, "$filename.txt")
text = isfile(textfilepath) ? join(readlines(textfilepath), "\n") : ""
id = "ina_diachrony§$filename"
annotation_id = id*"§0" annotation_id = id*"§0"
# generate annotation
annotations[annotation_id] = Annotation( annotations[annotation_id] = Annotation(
id, # audio id id, # audio id
annotation_id, # annotation id annotation_id, # annotation id
-1, # start and duration is -1 means that we take the whole -1, # start and duration is -1 means that we take the whole
-1, # recording -1, # recording
[1], # only 1 channel (mono recording) [1], # only 1 channel (mono recording)
Dict( Dict( # additional informations
"text" => text, "text" => text,
"speaker" => speaker, "speaker" => speaker,
"timeperiod" => timeperiod, "timeperiod" => timeperiod,
...@@ -71,8 +81,9 @@ function ina_diachrony_annotations_whole(dir) ...@@ -71,8 +81,9 @@ function ina_diachrony_annotations_whole(dir)
annotations annotations
end end
function ina_diachrony_annotations_csv(dir) function ina_diachrony_annotations_csv(dir)
! isdir(dir) && throw(ArgumentError("expected directory $dir")) checkdir(dir)
annotations = Dict() annotations = Dict()
...@@ -81,13 +92,11 @@ function ina_diachrony_annotations_csv(dir) ...@@ -81,13 +92,11 @@ function ina_diachrony_annotations_csv(dir)
filename, ext = splitext(file) filename, ext = splitext(file)
ext != ".csv" && continue ext != ".csv" && continue
metadata = split(filename, "§") # extract metadata from filename
timeperiod = metadata[1] timeperiod, age, sex, speaker = get_metadata(filename)
age, sex = split(metadata[2], "_")
speaker = metadata[3]
id = "ina_diachrony§$(filename)"
id = "ina_diachrony§$filename"
# generate annotation for each line in csv
open(joinpath(root, file)) do f open(joinpath(root, file)) do f
header = readline(f) header = readline(f)
line = 1 line = 1
...@@ -97,14 +106,14 @@ function ina_diachrony_annotations_csv(dir) ...@@ -97,14 +106,14 @@ function ina_diachrony_annotations_csv(dir)
start_time, end_time, text = split(current_line, ",", limit=3) start_time, end_time, text = split(current_line, ",", limit=3)
start_time = parse(Float64, start_time) start_time = parse(Float64, start_time)
duration = parse(Float64, end_time)-start_time duration = parse(Float64, end_time)-start_time
annotation_id = id*$(line)" annotation_id = id*$line"
annotations[id] = Annotation( annotations[id] = Annotation(
id, # audio id id, # audio id
annotation_id, # annotation id annotation_id, # annotation id
start_time, # start start_time, # start
duration, # duration duration, # duration
[1], # only 1 channel (mono recording) [1], # only 1 channel (mono recording)
Dict( Dict( # additional informations
"text" => text, "text" => text,
"speaker" => speaker, "speaker" => speaker,
"timeperiod" => timeperiod, "timeperiod" => timeperiod,
...@@ -125,7 +134,7 @@ end ...@@ -125,7 +134,7 @@ end
function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir) function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
# Validate the data directory # Validate the data directory
for d in [ina_wav_dir, ina_csv_dir] for d in [ina_wav_dir, ina_csv_dir]
! isdir(d) && throw(ArgumentError("invalid path $(d)")) isnothing(d) || checkdir(d)
end end
# Create the output directory. # Create the output directory.
...@@ -143,12 +152,13 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir) ...@@ -143,12 +152,13 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
# Annotations # Annotations
@info "Extracting annotations from $ina_wav_dir" @info "Extracting annotations from $ina_wav_dir"
whole_annotations = ina_diachrony_annotations_whole(ina_wav_dir) annotations = ina_diachrony_annotations_whole(ina_wav_dir)
#@info "Extracting annotations from $ina_csv_dir" if ! isnothing(ina_csv_dir)
#csv_annotations = ina_diachrony_annotations_csv(ina_csv_dir) @info "Extracting annotations from $ina_csv_dir"
#annotations = merge(whole_annotations, csv_annotations) csv_annotations = ina_diachrony_annotations_csv(ina_csv_dir)
annotations = whole_annotations annotations = merge(annotations, csv_annotations)
end
manifestpath = joinpath(outputdir, "annotations.jsonl") manifestpath = joinpath(outputdir, "annotations.jsonl")
@info "Creating $manifestpath" @info "Creating $manifestpath"
open(manifestpath, "w") do f open(manifestpath, "w") do f
...@@ -156,7 +166,7 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir) ...@@ -156,7 +166,7 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
end end
end end
function INADIACHRONY(ina_wav_dir, ina_csv_dir, outputdir) function INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) && if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl"))) isfile(joinpath(outputdir, "annotations.jsonl")))
ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir) ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment