diff --git a/Project.toml b/Project.toml
index 05695f7e6cbe36fa33032bf0c54499f14f6100aa..ed6c029fc1f31d933f81fc25205acbe15c40ed71 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,14 +1,20 @@
 name = "SpeechDatasets"
 uuid = "ae813453-fab8-46d9-ab8f-a64c05464021"
 authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>", "Simon DEVAUCHELLE <simon.devauchelle@universite-paris-saclay.fr>", "Nicolas DENIER <nicolas.denier@cnrs.fr>", "Remi URO <uro@lisn.fr>"]
-version = "0.20.2"
+version = "0.21.0"
 
 [deps]
 AudioSources = "09fc2aa8-47ce-428a-ad90-e701fa7ea67f"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
+SyntheticVowelDataset = "04fd9c8d-fed6-4abe-8fdc-87f03ef5a264"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 
+[compat]
+CSV = "0.10"
+DataFrames = "1.7"
+SyntheticVowelDataset = "0.1"
diff --git a/src/SpeechDatasets.jl b/src/SpeechDatasets.jl
index 0d25b02248e4e650af68375f21b7198804a02bed..80482c9d7a999a18a514762fc73d1de532d64bff 100644
--- a/src/SpeechDatasets.jl
+++ b/src/SpeechDatasets.jl
@@ -6,6 +6,8 @@ using JSON
 import AudioSources
 using SpeechFeatures
 import MLUtils
+using DataFrames
+using CSV
 
 export
     # ManifestItem
@@ -50,6 +52,7 @@ include("corpora/timit.jl")
 include("corpora/pfc_lisn.jl")
 include("corpora/faetar_asr_challenge_2025.jl")
 include("corpora/mini_librispeech.jl")
+include("corpora/synthetic_vowel_dataset.jl")
 #include.("corpora/".*filter(contains(r"\.jl$"), readdir(joinpath(@__DIR__, "corpora"))))
 
 include("lexicons.jl")
diff --git a/src/corpora/corpora.json b/src/corpora/corpora.json
index 33fbe4375357f64a1a1aa4a5767c4c214cbf52d6..889d8d5ad263569f02fa091e480070ac56fff9f4 100644
--- a/src/corpora/corpora.json
+++ b/src/corpora/corpora.json
@@ -72,5 +72,15 @@
         "authors": ["Michael Ong", "Sean Robertson", "Leo Peckham", "Alba Jorquera Jimenez de Aberasturi", "Paula Arkhangorodsky", "Robin Huo", "Aman Sakhardande", "Mark Hallap", "Naomi Nagy", "Ewan Dunbar"],
         "description": "Data for the 2025 Faetar Low-Resource ASR Challenge",
         "subsets": ["train", "test", "dev", "unlab"]
+    },
+
+    "SYNTHETIC_VOWEL_DATASET": {
+        "name": "SYNTHETIC_VOWEL_DATASET",
+        "lang": ["fra", "eng"],
+        "license": "proprietary",
+        "source": "https://gitlab.lisn.upsaclay.fr/PTAL/Datasets/SyntheticVowelDataset",
+        "authors": ["Simon Devauchelle", "Lucas Ondel Yang", "Albert Rilliard", "David Doukhan"],
+        "description": "Synthetic vowel dataset generated from formant tables",
+        "subsets": []
     }
 }
diff --git a/src/corpora/synthetic_vowel_dataset.jl b/src/corpora/synthetic_vowel_dataset.jl
new file mode 100644
index 0000000000000000000000000000000000000000..fe19e7afa3cf5bc0c32584442d55205657b7fdff
--- /dev/null
+++ b/src/corpora/synthetic_vowel_dataset.jl
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: CECILL-B
+
+#######################################################################
+
+function prepare(::Val{:SYNTHETIC_VOWEL_DATASET}, synsetdir, odir)
+    # Validate the data directory
+    ! isdir(synsetdir) && throw(ArgumentError("invalid path $(synsetdir)"))
+
+    # Create the output directory.
+    dir = mkpath(odir)
+    rm(joinpath(odir, "recordings.jsonl"), force=true)
+
+    ## Recordings
+    @info "Extracting recordings from $synsetdir"
+    recordings = synset_recordings(synsetdir)
+
+    manifestpath = joinpath(odir, "recordings.jsonl")
+    @info "Creating $manifestpath"
+    open(manifestpath, "a") do f
+        writemanifest(f, recordings)
+    end
+
+    # Metadata
+    @info "Extracting metadata from $synsetdir/$(basename(synsetdir))_detailed.csv"
+    metadata = synset_metadata(synsetdir)
+
+    manifestpath = joinpath(odir, "annotations.jsonl")
+    @info "Creating $manifestpath"
+    open(manifestpath, "w") do f
+        writemanifest(f, metadata)
+    end
+
+end
+
+
+function synset_recordings(dir::AbstractString)
+    ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
+
+    recordings = Dict()
+    for (root, subdirs, files) in walkdir(dir)
+        for file in files
+            lname, ext = splitext(file)
+            sname = split(lname, "_")
+
+            ext != ".wav" && continue
+
+            spk = join(sname[1:3], "_")
+            gender, vowel, sigid = sname[4], sname[5], sname[6]
+
+            path = joinpath(root, file)
+
+            id = "$(spk)_$(gender)_$(vowel)_$(sigid)"
+
+            audio_src = AudioSources.FileAudioSource(path)
+
+            recordings[id] = Recording(
+                id,
+                audio_src;
+                channels = [1],
+                samplerate = 16000
+            )
+        end
+    end
+    recordings
+end
+
+
+function synset_metadata(dir)
+    ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
+
+    metadata = Dict()
+
+    fpath = joinpath("$dir", "$(basename(dir))_detailed.csv")
+    df = DataFrame(CSV.File(fpath))
+
+    # Get number of filter coefficients
+    countfilter = count(col -> occursin(r"^a_\d+$", col), names(df))
+
+    for row in eachrow(df)
+
+        # Get metadata
+        spk = row["fname"]
+        gender = row["gender"]
+        vowel = row["vowel"]
+        sigid = split(row["signal"], "_")[end]
+        f₀ = row["f0"]
+        ϕ = row["ϕ"]
+        vtl = row["vtl"]
+        filter = Dict(["a_$i" => row["a_$i"] for i in 1:countfilter])
+        angles = Dict(["θ$i" => row["θ$i"] for i in 1:Int(countfilter/2)])
+        magnitudes = Dict(["r$i" => row["r$i"] for i in 1:Int(countfilter/2)])
+
+        id = "$(spk)_$(gender)_$(vowel)_$(sigid)"
+        metadata[id] = Annotation(
+            id,  # recording id and annotation id are the same since we have
+            id,  # a one-to-one mapping
+            -1,  # a start and duration of -1 means that we take the whole
+            -1,  # recording
+            [1], # only 1 channel (mono recording)
+            Dict(
+                "spk" => spk,
+                "gender" => gender,
+                "vowel" => vowel,
+                "sigid" => sigid,
+                "f₀" => f₀,
+                "ϕ" => ϕ,
+                "vtl" => vtl,
+                "filter" => filter,
+                "angles" => angles,
+                "magnitudes" => magnitudes
+            )
+        )
+    end
+    metadata
+end
+
diff --git a/test/runtests.jl b/test/runtests.jl
index 47f1b20ae87bd06d8bc59b06764a8bc07b7f182f..330eea72cd265b91805a6ae9bc6f5ff86952ccf6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,15 +1,15 @@
+
+using JSON
 using SpeechDatasets
+using SyntheticVowelDataset
 using Test
-
 ## The following tests do not work on the Gitlab CI because the volumes are not mounted on the runner. TODO find better tests
 
-# using JSON
-
-# PATHS = JSON.parsefile(
-#     joinpath(@__DIR__, "..", "DatasetsDocumentation", "corpora.json")
-# )
+#PATHS = JSON.parsefile(
+#    joinpath(@__DIR__, "../src/corpora", "corpora.json")
+#)
 
 
 
 
 # rm("/tmp/pfc/recordings.jsonl", force=true)
@@ -21,6 +21,10 @@ using Test
 # rm("/tmp/FAETAR_ASR_CHALLENGE_2025/recordings.jsonl", force=true)
 # rm("/tmp/FAETAR_ASR_CHALLENGE_2025/annotations.jsonl", force=true)
+# rm("/tmp/synset/recordings.jsonl", force=true)
+# rm("/tmp/synset/annotations.jsonl", force=true)
+
+
 
 # println("Testing FAETAR_ASR_CHALLENGE_2025 loading")
 
 # ds = SpeechDatasets.dataset(:FAETAR_ASR_CHALLENGE_2025, PATHS["FAETAR_ASR_CHALLENGE_2025"]["path"], "/tmp/FAETAR_ASR_CHALLENGE_2025")
@@ -41,3 +45,14 @@
 
 # @test isfile("/tmp/pfc/recordings.jsonl")
 # @test isfile("/tmp/pfc/annotations.jsonl")
+
+@testset "SYNTHETIC_VOWEL_DATASET" begin
+    datadir = mktempdir(; cleanup = false)
+    manifestdir = mktempdir(; cleanup = false)
+    SyntheticVowelDataset.generate(datadir, "calliope"; classes_number=1, signals_number=1)
+    ds = SpeechDatasets.dataset(:SYNTHETIC_VOWEL_DATASET, datadir, manifestdir)
+
+    @test isfile(joinpath(manifestdir, "recordings.jsonl"))
+    @test isfile(joinpath(manifestdir, "annotations.jsonl"))
+end
+
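
For reference, a minimal usage sketch (Julia) of the new corpus recipe. It mirrors the testset above: the "calliope" voice and the classes_number/signals_number keyword arguments are taken from the test, and the final manifest inspection via JSON.parse is only illustrative, not part of the package API.

# Minimal sketch, assuming SpeechDatasets, SyntheticVowelDataset and JSON are installed.
using JSON
using SpeechDatasets
using SyntheticVowelDataset

datadir = mktempdir()      # where the synthetic wav files and the *_detailed.csv are generated
manifestdir = mktempdir()  # where recordings.jsonl / annotations.jsonl are written

# Generate a tiny corpus and build the JSONL manifests for it.
SyntheticVowelDataset.generate(datadir, "calliope"; classes_number=1, signals_number=1)
ds = SpeechDatasets.dataset(:SYNTHETIC_VOWEL_DATASET, datadir, manifestdir)

# The annotations manifest is plain JSONL, one annotation per line; this only
# peeks at the first entry to check which metadata fields were extracted.
first_annotation = JSON.parse(first(eachline(joinpath(manifestdir, "annotations.jsonl"))))
@show keys(first_annotation)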