Skip to content
Snippets Groups Projects
Commit 4f739461 authored by Simon Devauchelle's avatar Simon Devauchelle
Browse files

Merge branch 'synthetic_dataset' into 'main'

Adding "Synthetic Dataset" in the available set of corpora

See merge request !14
parents 17a60597 64b3a647
No related branches found
Tags v0.21.0
1 merge request!14Adding "Synthetic Dataset" in the available set of corpora
Pipeline #3624 passed
name = "SpeechDatasets"
uuid = "ae813453-fab8-46d9-ab8f-a64c05464021"
authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>", "Simon DEVAUCHELLE <simon.devauchelle@universite-paris-saclay.fr>", "Nicolas DENIER <nicolas.denier@cnrs.fr>", "Remi URO <uro@lisn.fr>"]
version = "0.20.2"
version = "0.21.0"
[deps]
AudioSources = "09fc2aa8-47ce-428a-ad90-e701fa7ea67f"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
SyntheticVowelDataset = "04fd9c8d-fed6-4abe-8fdc-87f03ef5a264"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[compat]
CSV = "0.10"
DataFrames = "1.7"
SyntheticVowelDataset = "0.1"
......@@ -6,6 +6,8 @@ using JSON
import AudioSources
using SpeechFeatures
import MLUtils
using DataFrames
using CSV
export
# ManifestItem
......@@ -50,6 +52,7 @@ include("corpora/timit.jl")
include("corpora/pfc_lisn.jl")
include("corpora/faetar_asr_challenge_2025.jl")
include("corpora/mini_librispeech.jl")
include("corpora/synthetic_vowel_dataset.jl")
#include.("corpora/".*filter(contains(r"\.jl$"), readdir(joinpath(@__DIR__, "corpora"))))
include("lexicons.jl")
......
......@@ -72,5 +72,15 @@
"authors": ["Michael Ong", "Sean Robertson", "Leo Peckham", "Alba Jorquera Jimenez de Aberasturi", "Paula Arkhangorodsky", "Robin Huo", "Aman Sakhardande", "Mark Hallap", "Naomi Nagy", "Ewan Dunbar"],
"description": "Data for the 2025 Faetar Low-Resource ASR Challenge",
"subsets": ["train", "test", "dev", "unlab"]
},
"SYNTHETIC_VOWEL_DATASET": {
"name": "SYNTHETIC_VOWEL_DATASET",
"lang": ["fra", "eng"],
"license": "proprietary",
"source": "https://gitlab.lisn.upsaclay.fr/PTAL/Datasets/SyntheticVowelDataset",
"authors": ["Simon Devauchelle", "Lucas Ondel Yang", "Albert Rilliard", "David Doukhan"],
"description": "Synthetic vowels dataset generated from formants tables",
"subsets": []
}
}
# SPDX-License-Identifier: CECILL-B
#######################################################################
"""
    prepare(::Val{:SYNTHETIC_VOWEL_DATASET}, synsetdir, odir)

Prepare the synthetic vowel dataset: extract the recordings and their
generation metadata from `synsetdir` and write them as JSONL manifests
(`recordings.jsonl` and `annotations.jsonl`) into `odir`.

Throws `ArgumentError` if `synsetdir` is not an existing directory.
"""
function prepare(::Val{:SYNTHETIC_VOWEL_DATASET}, synsetdir, odir)
    # Validate the data directory.
    isdir(synsetdir) || throw(ArgumentError("invalid path $(synsetdir)"))

    # Create the output directory (no-op if it already exists).
    mkpath(odir)

    ## Recordings
    @info "Extracting recordings from $synsetdir"
    recordings = synset_recordings(synsetdir)
    manifestpath = joinpath(odir, "recordings.jsonl")
    @info "Creating $manifestpath"
    # "w" truncates any stale manifest, replacing the original rm(...) +
    # open(..., "a") combination and matching the annotations manifest below.
    open(manifestpath, "w") do f
        writemanifest(f, recordings)
    end

    # Metadata
    @info "Extracting metadata from $synsetdir/$(basename(synsetdir))_detailed.csv"
    metadata = synset_metadata(synsetdir)
    manifestpath = joinpath(odir, "annotations.jsonl")
    @info "Creating $manifestpath"
    open(manifestpath, "w") do f
        writemanifest(f, metadata)
    end
end
"""
    synset_recordings(dir::AbstractString)

Walk `dir` and build a `Dict` mapping recording ids to `Recording`s, one per
`.wav` file found. File stems are expected to follow the pattern
`<spk1>_<spk2>_<spk3>_<gender>_<vowel>_<sigid>` (the first three fields form
the speaker id).

Throws `ArgumentError` if `dir` is not an existing directory.
"""
function synset_recordings(dir::AbstractString)
    ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
    recordings = Dict()
    for (root, _, files) in walkdir(dir)
        for file in files
            lname, ext = splitext(file)
            # Skip non-audio files *before* parsing the stem; the original
            # split the name first and filtered afterwards.
            ext != ".wav" && continue
            sname = split(lname, "_")
            spk = join(sname[1:3], "_")   # speaker id spans the first 3 fields
            gender, vowel, sigid = sname[4], sname[5], sname[6]
            path = joinpath(root, file)
            id = "$(spk)_$(gender)_$(vowel)_$(sigid)"
            audio_src = AudioSources.FileAudioSource(path)
            recordings[id] = Recording(
                id,
                audio_src;
                channels = [1],
                # NOTE(review): assumes all generated signals are 16 kHz mono —
                # confirm against the generator.
                samplerate = 16000
            )
        end
    end
    recordings
end
"""
    synset_metadata(dir)

Read `<dir>/<basename(dir)>_detailed.csv` and build a `Dict` mapping
annotation ids to `Annotation`s carrying the generation metadata of each
synthetic vowel: speaker, gender, vowel, signal id, fundamental frequency,
phase, vocal-tract length, filter coefficients, and pole angles/magnitudes.

Throws `ArgumentError` if `dir` is not an existing directory.
"""
function synset_metadata(dir)
    ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
    metadata = Dict()
    fpath = joinpath("$dir", "$(basename(dir))_detailed.csv")
    df = DataFrame(CSV.File(fpath))

    # Number of filter coefficient columns (a_1, a_2, ...).
    countfilter = count(col -> occursin(r"^a_\d+$", col), names(df))
    # Each conjugate pole pair contributes one angle and one magnitude.
    npoles = countfilter ÷ 2

    for row in eachrow(df)
        # Get metadata
        spk = row["fname"]
        gender = row["gender"]
        vowel = row["vowel"]
        sigid = split(row["signal"], "_")[end]
        f₀ = row["f0"]
        ϕ = row["ϕ"]
        vtl = row["vtl"]
        # Renamed from `filter` to avoid shadowing `Base.filter`.
        filtercoeffs = Dict(["a_$i" => row["a_$i"] for i in 1:countfilter])
        # NOTE(review): the original line was syntactically invalid (garbled
        # key prefix); angle columns are assumed to be named "ϕ$i", by symmetry
        # with the "r$i" magnitude columns — confirm against the CSV header.
        angles = Dict(["ϕ$i" => row["ϕ$i"] for i in 1:npoles])
        magnitudes = Dict(["r$i" => row["r$i"] for i in 1:npoles])
        id = "$(spk)_$(gender)_$(vowel)_$(sigid)"
        metadata[id] = Annotation(
            id, # recording id and annotation id are the same since we have
            id, # a one-to-one mapping
            -1, # start and duration is -1 means that we take the whole
            -1, # recording
            [1], # only 1 channel (mono recording)
            Dict(
                "spk" => spk,
                "gender" => gender,
                "vowel" => vowel,
                "sigid" => sigid,
                "f₀" => f₀,
                "ϕ" => ϕ,
                "vtl" => vtl,
                "filter" => filtercoeffs,
                "angles" => angles,
                "magnitudes" => magnitudes
            )
        )
    end
    metadata
end
using JSON
using SpeechDatasets
using SyntheticVowelDataset
using Test
## The following tests do not work on the Gitlab CI because the volumes are not mounted on the runner. TODO find better tests
# using JSON
# PATHS = JSON.parsefile(
# joinpath(@__DIR__, "..", "DatasetsDocumentation", "corpora.json")
# )
#PATHS = JSON.parsefile(
# joinpath(@__DIR__, "../src/corpora", "corpora.json")
#)
# rm("/tmp/pfc/recordings.jsonl", force=true)
......@@ -21,6 +21,10 @@ using Test
# rm("/tmp/FAETAR_ASR_CHALLENGE_2025/recordings.jsonl", force=true)
# rm("/tmp/FAETAR_ASR_CHALLENGE_2025/annotations.jsonl", force=true)
# rm("/tmp/synset/recordings.jsonl", force=true)
# rm("/tmp/synset/annotations.jsonl", force=true)
# println("Testing FAETAR_ASR_CHALLENGE_2025 loading")
# ds = SpeechDatasets.dataset(:FAETAR_ASR_CHALLENGE_2025, PATHS["FAETAR_ASR_CHALLENGE_2025"]["path"], "/tmp/FAETAR_ASR_CHALLENGE_2025")
......@@ -41,3 +45,14 @@ using Test
# @test isfile("/tmp/pfc/recordings.jsonl")
# @test isfile("/tmp/pfc/annotations.jsonl")
@testset "SYNTHETIC_VOWEL_DATASET" begin
    # Generate a minimal synthetic dataset in a scratch directory, run the
    # dataset preparation, and check that both manifests were written.
    srcdir = mktempdir(; cleanup = false)
    outdir = mktempdir(; cleanup = false)
    SyntheticVowelDataset.generate(srcdir, "calliope"; classes_number=1, signals_number=1)
    SpeechDatasets.dataset(:SYNTHETIC_VOWEL_DATASET, srcdir, outdir)
    for manifest in ("recordings.jsonl", "annotations.jsonl")
        @test isfile(joinpath(outdir, manifest))
    end
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment