# SPDX-License-Identifier: CECILL-C

# Path of the JSON file describing the known corpora, resolved next to this source file.
const corpora_file = joinpath(@__DIR__, "corpora", "corpora.json")

"""
    SpeechDatasetInfos(; name, lang, license, source, authors, description)

Descriptive metadata attached to a speech dataset. Every field is optional and
defaults to an empty string (or an empty vector for `authors`).

# Fields
- `name`: dataset name.
- `lang`: a single language or a vector of languages.
- `license`: license string.
- `source`: source string.
- `authors`: list of authors.
- `description`: free-form description.
"""
@kwdef struct SpeechDatasetInfos
    name::AbstractString = ""
    lang::Union{AbstractString, Vector{AbstractString}} = ""
    license::AbstractString = ""
    source::AbstractString = ""
    authors::Vector{AbstractString} = AbstractString[]
    description::AbstractString = ""
end

"""
    SpeechDatasetInfos(infos::AbstractDict)

Build a `SpeechDatasetInfos` from a dictionary keyed by field name (as `String`).
Keys that are absent, or whose value is `nothing`, fall back to the field default.
Keys that do not match a field name are ignored.
"""
function SpeechDatasetInfos(infos::AbstractDict)
    kwargs = Pair{Symbol, Any}[]
    for key in fieldnames(SpeechDatasetInfos)
        val = get(infos, String(key), nothing)
        isnothing(val) && continue
        # Array values may arrive untyped (e.g. `Vector{Any}`); the Union-typed
        # `lang` field cannot convert those implicitly, so convert them here.
        # Both vector-valued fields of the struct are `Vector{AbstractString}`.
        if val isa AbstractVector
            val = convert(Vector{AbstractString}, val)
        end
        push!(kwargs, key => val)
    end
    # Splat as *keyword* arguments (note the `;`) so that missing fields use the
    # defaults; a positional splat only works when all fields are present.
    return SpeechDatasetInfos(; kwargs...)
end

"""
    SpeechDatasetInfos(name::AbstractString)

Parse `corpora_file` and build the infos from the first entry whose `"name"`
equals `name`. Indexing the (empty) match list fails if no entry matches.
"""
function SpeechDatasetInfos(name::AbstractString)
    corpora_infos = JSON.parsefile(corpora_file)
    infos = filter(x -> x["name"] == name, corpora_infos)[1]
    return SpeechDatasetInfos(infos)
end

"""
    SpeechDataset

Data container pairing recordings with annotations under shared string keys.

# Fields
- `infos`: dataset metadata.
- `idxs`: keys, in the order used for integer indexing.
- `annotations`: annotation for each key.
- `recordings`: recording for each key.
"""
struct SpeechDataset <: MLUtils.AbstractDataContainer
    infos::SpeechDatasetInfos
    idxs::Vector{AbstractString}
    annotations::Dict{AbstractString, Annotation}
    recordings::Dict{AbstractString, Recording}
end

"""
    SpeechDataset(infos, annotations, recordings)

Build a dataset whose index list is the collected keys of `annotations`.
"""
function SpeechDataset(
    infos::SpeechDatasetInfos,
    annotations::Dict{AbstractString, Annotation},
    recordings::Dict{AbstractString, Recording},
)
    idxs = collect(keys(annotations))
    return SpeechDataset(infos, idxs, annotations, recordings)
end

"""
    SpeechDataset(infos, manifestroot, partition)

Load a dataset from the manifests found in `manifestroot`: annotations are read
from `annotations-<partition>.jsonl` (or `annotations.jsonl` when `partition` is
empty) and recordings from `recordings.jsonl`.
"""
function SpeechDataset(
    infos::SpeechDatasetInfos,
    manifestroot::AbstractString,
    partition::AbstractString,
)
    partition_name = partition == "" ? "" : "-$(partition)"
    annot_path = joinpath(manifestroot, "annotations$(partition_name).jsonl")
    rec_path = joinpath(manifestroot, "recordings.jsonl")
    annotations = load_manifest(Annotation, annot_path)
    recordings = load_manifest(Recording, rec_path)
    return SpeechDataset(infos, annotations, recordings)
end

# Indexing by key returns the `(recording, annotation)` tuple stored under that key.
Base.getindex(d::SpeechDataset, key::AbstractString) = d.recordings[key], d.annotations[key]

# Indexing by position goes through `idxs` to find the key.
Base.getindex(d::SpeechDataset, idx::Integer) = getindex(d, d.idxs[idx])

# Fix1 -> partial function with fixed 1st argument
Base.getindex(d::SpeechDataset, idxs::AbstractVector) = map(Base.Fix1(getindex, d), idxs)

# The dataset length is the number of keys in `idxs`.
Base.length(d::SpeechDataset) = length(d.idxs)

"""
    filter(fn, d::SpeechDataset)

Return a new `SpeechDataset` keeping only the entries for which
`fn((recording, annotation))` is true. `fn` receives a single tuple argument.
The relative order of the kept keys in `idxs` is preserved.
"""
function Base.filter(fn, d::SpeechDataset)
    fidxs = filter(d.idxs) do i
        fn((d.recordings[i], d.annotations[i]))
    end
    # Set membership keeps the two dictionary passes below cheap.
    idset = Set(fidxs)
    fannotations = filter(d.annotations) do (k, v)
        k ∈ idset
    end
    frecs = filter(d.recordings) do (k, v)
        k ∈ idset
    end
    return SpeechDataset(d.infos, fidxs, fannotations, frecs)
end