Newer
Older
# SPDX-License-Identifier: CECILL-C
Nicolas Denier
committed
# Path of the `corpora/corpora.json` file located in the same directory as this source file.
# It is parsed by `SpeechDatasetInfos(name::AbstractString)` to look up a corpus by name.
const corpora_file = joinpath(@__DIR__, "corpora", "corpora.json")
"""
    SpeechDatasetInfos(; name, lang, license, source, authors, description)

Descriptive metadata of a speech dataset.

Every field is optional when using the keyword constructor: text fields default to the
empty string and `authors` defaults to an empty vector.
"""
@kwdef struct SpeechDatasetInfos
    name::AbstractString = ""
    # Either a single string or a vector of strings
    lang::Union{AbstractString, Vector{AbstractString}} = ""
    license::AbstractString = ""
    source::AbstractString = ""
    authors::Vector{AbstractString} = AbstractString[]
    description::AbstractString = ""
end
"""
    SpeechDatasetInfos(infos::AbstractDict)

Build a `SpeechDatasetInfos` from a dictionary keyed by field name (as `String`).

Each field of `SpeechDatasetInfos` is looked up in `infos` under its own name. Fields
whose key is absent (or whose value is `nothing`) are left out, so the keyword
constructor's default is used for them. Keys of `infos` that do not correspond to a
field are ignored.
"""
function SpeechDatasetInfos(infos::AbstractDict)
    kwargs = NamedTuple()
    for key in fieldnames(SpeechDatasetInfos)
        val = get(infos, String(key), nothing)
        # Only forward the fields that were actually found in `infos`
        if !isnothing(val)
            kwargs = (; kwargs..., key => val)
        end
    end
    # Splat after `;` so the entries are passed as keyword arguments. A positional
    # splat would drop the names and misplace values whenever a field is missing.
    return SpeechDatasetInfos(; kwargs...)
end
"""
    SpeechDatasetInfos(name::AbstractString)

Build a `SpeechDatasetInfos` for the corpus called `name`.

The JSON file at `corpora_file` is parsed, its entries are filtered on their `"name"`
key, and the first match is converted with the dictionary constructor. If no entry
matches, indexing the empty list of matches fails.
"""
function SpeechDatasetInfos(name::AbstractString)
    entries = JSON.parsefile(corpora_file)
    matches = filter(entries) do entry
        entry["name"] == name
    end
    return SpeechDatasetInfos(matches[1])
end
Nicolas Denier
committed
infos::SpeechDatasetInfos
Simon Devauchelle
committed
annotations::Dict{AbstractString, Annotation}
Nicolas Denier
committed
"""
    SpeechDataset(infos, annotations, recordings)

Construct a `SpeechDataset` whose list of indices is the collected keys of
`annotations`.
"""
function SpeechDataset(
    infos::SpeechDatasetInfos,
    annotations::Dict{AbstractString, Annotation},
    recordings::Dict{AbstractString, Recording},
)
    return SpeechDataset(infos, collect(keys(annotations)), annotations, recordings)
end
Nicolas Denier
committed
"""
    SpeechDataset(infos, manifestroot, partition)

Construct a `SpeechDataset` from the manifest files found in `manifestroot`.

Annotations are loaded from `annotations-<partition>.jsonl`, or from `annotations.jsonl`
when `partition` is the empty string. Recordings are always loaded from
`recordings.jsonl`.
"""
function SpeechDataset(
    infos::SpeechDatasetInfos, manifestroot::AbstractString, partition::AbstractString
)
    # An empty partition means the annotation file carries no "-<partition>" suffix
    partition_name = isempty(partition) ? "" : "-$(partition)"
    annot_path = joinpath(manifestroot, "annotations$(partition_name).jsonl")
    rec_path = joinpath(manifestroot, "recordings.jsonl")
    annotations = load_manifest(Annotation, annot_path)
    recordings = load_manifest(Recording, rec_path)
    return SpeechDataset(infos, annotations, recordings)
end
"""
    getindex(d::SpeechDataset, key::AbstractString)

Return the tuple `(d.recordings[key], d.annotations[key])`.
"""
function Base.getindex(d::SpeechDataset, key::AbstractString)
    rec = d.recordings[key]
    annot = d.annotations[key]
    return (rec, annot)
end
"""
    getindex(d::SpeechDataset, idx::Integer)

Return the dataset element whose key is stored at position `idx` of `d.idxs`.
"""
function Base.getindex(d::SpeechDataset, idx::Integer)
    key = d.idxs[idx]
    return d[key]
end
Nicolas Denier
committed
"""
    getindex(d::SpeechDataset, idxs::AbstractVector)

Return the result of indexing `d` with each element of `idxs`, in order.
"""
function Base.getindex(d::SpeechDataset, idxs::AbstractVector)
    return map(idxs) do i
        d[i]
    end
end
"""
    length(d::SpeechDataset)

Return the number of entries in `d.idxs`.
"""
function Base.length(d::SpeechDataset)
    return length(d.idxs)
end
fidxs = filter(d.idxs) do i
fn((d.recordings[i], d.annotations[i]))
end
idset = Set(fidxs)
fannotations = filter(d.annotations) do (k, v)
k ∈ idset
end
frecs = filter(d.recordings) do (k, v)
k ∈ idset
end
Nicolas Denier
committed
SpeechDataset(d.infos, fidxs, fannotations, frecs)