fix text file reading, update readme

2fea769a · Nicolas Denier · b24e4214 · 2fea769a · 2fea769a · 2fea769a
Commit 2fea769a authored 9 months ago by Nicolas Denier
--- a/.gitignore
+++ b/.gitignore
-outputdir/
+*outputdir/
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ A Julia package to download and prepare speech corpus.
 Make sure to add the [FAST registry](https://gitlab.lisn.upsaclay.fr/fast/registry)
 to your julia installation. Then, install the package as usual:
 ```
-pkg> add SpeechCorpora
+pkg> add SpeechDatasets
 ```
 ## Example
@@ -18,9 +18,13 @@ julia> using SpeechDatasets
 julia> dataset = MINILIBRISPEECH("outputdir", :train) # :dev | :test
 ...
-julia> dataset = MINILIBRISPEECH("/path/to/timit/dir", "outputdir", :train) # :dev | :test
+julia> dataset = TIMIT("/path/to/timit/dir", "outputdir", :train) # :dev | :test
 ...
+julia> dataset = INADIACHRONY("/path/to/ina_wav/dir", "outputdir", "/path/to/ina_csv/dir") # ina_csv dir optional
+...
 julia> for ((signal, fs), supervision) in dataset
           # do something
       end
@@ -36,5 +40,5 @@ julia> TIMITDICT("/path/to/timit/dir")
 ## License
-This software is provided under the CeCILL 2.1 license (see the [`/LICENSE`](/LICENSE)
+This software is provided under the CeCILL 2.1 license (see the [`/LICENSE`](/LICENSE))
--- a/src/corpora/ina_diachrony.jl
+++ b/src/corpora/ina_diachrony.jl
@@ -3,9 +3,12 @@
 const AUDIO_PATH = "/vol/work1/rilliard/diachronie/normal"
 const TRANSCRIPTION_PATH = "/vol/work1/rilliard/diachronie/whisper_diachronik/fr/tc_trs__modified"
+# Validate that `dir` is an existing directory.
+#
+# Throws an `ArgumentError` naming `dir` when `isdir(dir)` is false. Otherwise the
+# short-circuit `||` evaluates to `true`, which becomes the return value.
+function checkdir(dir::AbstractString)
+    isdir(dir) || throw(ArgumentError("$dir is not an existing directory"))
+end
 function ina_diachrony_recordings(dir::AbstractString)
-    ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
+    checkdir(dir)
    recordings = Dict()
    for (root, subdirs, files) in walkdir(dir)
@@ -13,7 +16,7 @@ function ina_diachrony_recordings(dir::AbstractString)
            filename, ext = splitext(file)
            ext != ".wav" && continue
-            id = "ina_diachrony§$(filename)"
+            id = "ina_diachrony§$filename"
            path = joinpath(root, file)
            audio_src = FileAudioSource(path)
@@ -30,8 +33,17 @@ function ina_diachrony_recordings(dir::AbstractString)
 end
+# Split a recording file name into its metadata fields.
+#
+# The name is cut on "§": field 1 is bound to `timeperiod`, field 2 is cut again
+# on "_" and its first two parts are bound to `age` and `sex`, and field 3 is
+# bound to `speaker`. Any further "§" fields are ignored.
+# Returns the tuple `(timeperiod, age, sex, speaker)`.
+#
+# NOTE(review): there is no validation here — a name with fewer than three
+# "§"-separated fields, or a second field without "_", fails on indexing or
+# destructuring rather than with a descriptive error. Confirm callers only
+# pass well-formed names.
+function get_metadata(filename)
+    metadata = split(filename, "§")
+    timeperiod = metadata[1]
+    # destructuring takes the first two "_"-separated parts of field 2
+    age, sex = split(metadata[2], "_")
+    speaker = metadata[3]
+    return timeperiod, age, sex, speaker
+end
 function ina_diachrony_annotations_whole(dir)
-    ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
+    checkdir(dir)
    annotations = Dict()
@@ -39,26 +51,24 @@ function ina_diachrony_annotations_whole(dir)
        for file in files
            filename, ext = splitext(file)
            ext != ".wav" && continue
-            metadata = split(filename, "§")
-            timeperiod = metadata[1]
-            age, sex = split(metadata[2], "_")
-            speaker = metadata[3]
-            id = "ina_diachrony§$(filename)"
-            # extract text
-            textfilename = "$(filename).txt"
-            text = isfile(textfilename) ? readlines(textfilename) : ""
+            # extract metadata from filename
+            timeperiod, age, sex, speaker = get_metadata(filename)
+            # extract transcription text (same filename but .txt)
+            textfilepath = joinpath(root, "$filename.txt")
+            text = isfile(textfilepath) ? join(readlines(textfilepath), "\n") : ""
+            id = "ina_diachrony§$filename"
            annotation_id = id*"§0"
+            # generate annotation
            annotations[annotation_id] = Annotation(
                id, # audio id
                annotation_id, # annotation id
                -1,  # start and duration is -1 means that we take the whole
                -1,  # recording
                [1], # only 1 channel (mono recording)
-                Dict(
+                Dict( # additional informations
                     "text" => text,
                     "speaker" => speaker,
                     "timeperiod" => timeperiod,
@@ -71,8 +81,9 @@ function ina_diachrony_annotations_whole(dir)
    annotations
 end
 function ina_diachrony_annotations_csv(dir)
-    ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
+    checkdir(dir)
    annotations = Dict()
@@ -81,13 +92,11 @@ function ina_diachrony_annotations_csv(dir)
            filename, ext = splitext(file)
            ext != ".csv" && continue
-            metadata = split(filename, "§")
+            # extract metadata from filename
-            timeperiod = metadata[1]
+            timeperiod, age, sex, speaker = get_metadata(filename)
-            age, sex = split(metadata[2], "_")
-            speaker = metadata[3]
-            id = "ina_diachrony§$(filename)"
+            id = "ina_diachrony§$filename"
+            # generate annotation for each line in csv
            open(joinpath(root, file)) do f
                header = readline(f)   
                line = 1 
@@ -97,14 +106,14 @@ function ina_diachrony_annotations_csv(dir)
                    start_time, end_time, text = split(current_line, ",", limit=3)
                    start_time = parse(Float64, start_time)
                    duration = parse(Float64, end_time)-start_time
-                    annotation_id = id*"§$(line)"
+                    annotation_id = id*"§$line"
                    annotations[id] = Annotation(
                        id, # audio id
                        annotation_id, # annotation id
                        start_time,  # start
                        duration,  # duration
                        [1], # only 1 channel (mono recording)
-                        Dict(
+                        Dict( # additional informations
                            "text" => text,
                            "speaker" => speaker,
                            "timeperiod" => timeperiod,
@@ -125,7 +134,7 @@ end
 function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    # Validate the data directory
    for d in [ina_wav_dir, ina_csv_dir]
-        ! isdir(d) && throw(ArgumentError("invalid path $(d)"))
+        isnothing(d) || checkdir(d)
    end
    # Create the output directory.
@@ -143,12 +152,13 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    # Annotations
    @info "Extracting annotations from $ina_wav_dir"
-    whole_annotations = ina_diachrony_annotations_whole(ina_wav_dir)
+    annotations = ina_diachrony_annotations_whole(ina_wav_dir)
-    #@info "Extracting annotations from $ina_csv_dir"
+    if ! isnothing(ina_csv_dir)
-    #csv_annotations = ina_diachrony_annotations_csv(ina_csv_dir)
+        @info "Extracting annotations from $ina_csv_dir"
-    #annotations = merge(whole_annotations, csv_annotations)
+        csv_annotations = ina_diachrony_annotations_csv(ina_csv_dir)
-    annotations = whole_annotations
+        annotations = merge(annotations, csv_annotations)
+    end
    manifestpath = joinpath(outputdir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
@@ -156,7 +166,7 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
    end
 end
-function INADIACHRONY(ina_wav_dir, ina_csv_dir, outputdir)
+function INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)
    if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
          isfile(joinpath(outputdir, "annotations.jsonl")))
        ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)