Skip to content
Snippets Groups Projects
Commit 4f739461 authored by Simon Devauchelle's avatar Simon Devauchelle
Browse files

Merge branch 'synthetic_dataset' into 'main'

Adding "Synthetic Dataset" in the available set of corpora

See merge request !14
parents 17a60597 64b3a647
No related branches found
Tags v0.21.0
1 merge request!14Adding "Synthetic Dataset" in the available set of corpora
Pipeline #3624 passed
name = "SpeechDatasets"
uuid = "ae813453-fab8-46d9-ab8f-a64c05464021"
authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>", "Simon DEVAUCHELLE <simon.devauchelle@universite-paris-saclay.fr>", "Nicolas DENIER <nicolas.denier@cnrs.fr>", "Remi URO <uro@lisn.fr>"]
version = "0.20.2"
version = "0.21.0"
[deps]
AudioSources = "09fc2aa8-47ce-428a-ad90-e701fa7ea67f"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
SyntheticVowelDataset = "04fd9c8d-fed6-4abe-8fdc-87f03ef5a264"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[compat]
CSV = "0.10"
DataFrames = "1.7"
SyntheticVowelDataset = "0.1"
......@@ -6,6 +6,8 @@ using JSON
import AudioSources
using SpeechFeatures
import MLUtils
using DataFrames
using CSV
export
# ManifestItem
......@@ -50,6 +52,7 @@ include("corpora/timit.jl")
include("corpora/pfc_lisn.jl")
include("corpora/faetar_asr_challenge_2025.jl")
include("corpora/mini_librispeech.jl")
include("corpora/synthetic_vowel_dataset.jl")
#include.("corpora/".*filter(contains(r"\.jl$"), readdir(joinpath(@__DIR__, "corpora"))))
include("lexicons.jl")
......
......@@ -72,5 +72,15 @@
"authors": ["Michael Ong", "Sean Robertson", "Leo Peckham", "Alba Jorquera Jimenez de Aberasturi", "Paula Arkhangorodsky", "Robin Huo", "Aman Sakhardande", "Mark Hallap", "Naomi Nagy", "Ewan Dunbar"],
"description": "Data for the 2025 Faetar Low-Resource ASR Challenge",
"subsets": ["train", "test", "dev", "unlab"]
},
"SYNTHETIC_VOWEL_DATASET": {
"name": "SYNTHETIC_VOWEL_DATASET",
"lang": ["fra", "eng"],
"license": "proprietary",
"source": "https://gitlab.lisn.upsaclay.fr/PTAL/Datasets/SyntheticVowelDataset",
"authors": ["Simon Devauchelle", "Lucas Ondel Yang", "Albert Rilliard", "David Doukhan"],
"description": "Synthetic vowels dataset generated from formants tables",
"subsets": []
}
}
# SPDX-License-Identifier: CECILL-B
#######################################################################
"""
    prepare(::Val{:SYNTHETIC_VOWEL_DATASET}, synsetdir, odir)

Prepare the synthetic vowel dataset: extract the recordings and their
generation metadata from `synsetdir` and write them as JSONL manifests
(`recordings.jsonl` and `annotations.jsonl`) into `odir`.

Throws `ArgumentError` if `synsetdir` is not an existing directory.
"""
function prepare(::Val{:SYNTHETIC_VOWEL_DATASET}, synsetdir, odir)
    # Validate the data directory.
    isdir(synsetdir) || throw(ArgumentError("invalid path $(synsetdir)"))

    # Create the output directory (no-op if it already exists).
    mkpath(odir)

    ## Recordings
    @info "Extracting recordings from $synsetdir"
    recordings = synset_recordings(synsetdir)
    manifestpath = joinpath(odir, "recordings.jsonl")
    @info "Creating $manifestpath"
    # "w" truncates any stale manifest, replacing the original rm(...) +
    # open(..., "a") combination and matching the annotations manifest below.
    open(manifestpath, "w") do f
        writemanifest(f, recordings)
    end

    # Metadata
    @info "Extracting metadata from $synsetdir/$(basename(synsetdir))_detailed.csv"
    metadata = synset_metadata(synsetdir)
    manifestpath = joinpath(odir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, metadata)
    end
end
"""
    synset_recordings(dir::AbstractString)

Walk `dir` and build a `Dict` mapping recording ids to `Recording`s, one per
`.wav` file found. File stems are expected to follow the pattern
`<spk1>_<spk2>_<spk3>_<gender>_<vowel>_<sigid>` (the first three fields form
the speaker id).

Throws `ArgumentError` if `dir` is not an existing directory.
"""
function synset_recordings(dir::AbstractString)
    ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
    recordings = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            lname, ext = splitext(file)
            # Skip non-audio files *before* parsing the stem; the original
            # split the name first and filtered afterwards.
            ext != ".wav" && continue
            sname = split(lname, "_")
            spk = join(sname[1:3], "_")   # speaker id spans the first 3 fields
            gender, vowel, sigid = sname[4], sname[5], sname[6]
            path = joinpath(root, file)
            id = "$(spk)_$(gender)_$(vowel)_$(sigid)"
            audio_src = AudioSources.FileAudioSource(path)
            recordings[id] = Recording(
                id,
                audio_src;
                channels = [1],
                # NOTE(review): assumes all generated signals are 16 kHz mono —
                # confirm against the generator.
                samplerate = 16000
            )
        end
    end
    recordings
end
"""
    synset_metadata(dir)

Read `<dir>/<basename(dir)>_detailed.csv` and build a `Dict` mapping
annotation ids to `Annotation`s carrying the generation metadata of each
synthetic vowel: speaker, gender, vowel, signal id, fundamental frequency,
phase, vocal-tract length, filter coefficients, and pole angles/magnitudes.

Throws `ArgumentError` if `dir` is not an existing directory.
"""
function synset_metadata(dir)
    ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
    metadata = Dict()
    fpath = joinpath("$dir", "$(basename(dir))_detailed.csv")
    df = DataFrame(CSV.File(fpath))

    # Number of filter coefficient columns (a_1, a_2, ...).
    countfilter = count(col -> occursin(r"^a_\d+$", col), names(df))
    # Each conjugate pole pair contributes one angle and one magnitude.
    npoles = countfilter ÷ 2

    for row in eachrow(df)
        # Get metadata
        spk = row["fname"]
        gender = row["gender"]
        vowel = row["vowel"]
        sigid = split(row["signal"], "_")[end]
        f₀ = row["f0"]
        ϕ = row["ϕ"]
        vtl = row["vtl"]
        # Renamed from `filter` to avoid shadowing `Base.filter`.
        filtercoeffs = Dict(["a_$i" => row["a_$i"] for i in 1:countfilter])
        # NOTE(review): the original line was syntactically invalid (garbled
        # key prefix); angle columns are assumed to be named "ϕ$i", by symmetry
        # with the "r$i" magnitude columns — confirm against the CSV header.
        angles = Dict(["ϕ$i" => row["ϕ$i"] for i in 1:npoles])
        magnitudes = Dict(["r$i" => row["r$i"] for i in 1:npoles])
        id = "$(spk)_$(gender)_$(vowel)_$(sigid)"
        metadata[id] = Annotation(
            id, # recording id and annotation id are the same since we have
            id, # a one-to-one mapping
            -1, # start and duration is -1 means that we take the whole
            -1, # recording
            [1], # only 1 channel (mono recording)
            Dict(
                "spk" => spk,
                "gender" => gender,
                "vowel" => vowel,
                "sigid" => sigid,
                "f₀" => f₀,
                "ϕ" => ϕ,
                "vtl" => vtl,
                "filter" => filtercoeffs,
                "angles" => angles,
                "magnitudes" => magnitudes
            )
        )
    end
    metadata
end
using JSON
using SpeechDatasets
using SyntheticVowelDataset
using Test
## The following tests do not work on the Gitlab CI because the volumes are not mounted on the runner. TODO find better tests
# using JSON
# PATHS = JSON.parsefile(
# joinpath(@__DIR__, "..", "DatasetsDocumentation", "corpora.json")
# )
#PATHS = JSON.parsefile(
# joinpath(@__DIR__, "../src/corpora", "corpora.json")
#)
# rm("/tmp/pfc/recordings.jsonl", force=true)
......@@ -21,6 +21,10 @@ using Test
# rm("/tmp/FAETAR_ASR_CHALLENGE_2025/recordings.jsonl", force=true)
# rm("/tmp/FAETAR_ASR_CHALLENGE_2025/annotations.jsonl", force=true)
# rm("/tmp/synset/recordings.jsonl", force=true)
# rm("/tmp/synset/annotations.jsonl", force=true)
# println("Testing FAETAR_ASR_CHALLENGE_2025 loading")
# ds = SpeechDatasets.dataset(:FAETAR_ASR_CHALLENGE_2025, PATHS["FAETAR_ASR_CHALLENGE_2025"]["path"], "/tmp/FAETAR_ASR_CHALLENGE_2025")
......@@ -41,3 +45,14 @@ using Test
# @test isfile("/tmp/pfc/recordings.jsonl")
# @test isfile("/tmp/pfc/annotations.jsonl")
@testset "SYNTHETIC_VOWEL_DATASET" begin
    # Generate a minimal synthetic dataset in a scratch directory, run the
    # dataset preparation, and check that both manifests were written.
    srcdir = mktempdir(; cleanup = false)
    outdir = mktempdir(; cleanup = false)
    SyntheticVowelDataset.generate(srcdir, "calliope"; classes_number=1, signals_number=1)
    SpeechDatasets.dataset(:SYNTHETIC_VOWEL_DATASET, srcdir, outdir)
    for manifest in ("recordings.jsonl", "annotations.jsonl")
        @test isfile(joinpath(outdir, manifest))
    end
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment