# SPDX-License-Identifier: CECILL-2.1

# Dataset of annotated speech recordings; subtypes `MLUtils.AbstractDataContainer`.
#
# Fields:
# - `idxs`: annotation keys; integer position `i` resolves to the key `idxs[i]`.
# - `annotations`: annotation key => `Annotation`.
# - `recordings`: recording id => `Recording`, looked up through `Annotation.recording_id`.
# - `partition`: partition name, e.g. `:train`, `:test`, or `:custom` after `filter`.
struct SpeechDataset <: MLUtils.AbstractDataContainer
    idxs::Vector{AbstractString}
    annotations::Dict{AbstractString, Annotation}
    recordings::Dict{AbstractString, Recording}
    partition::Symbol
end

"""
    dataset(manifestroot, partition)

Load a `SpeechDataset` from the manifest files stored in `manifestroot`.

The partition is selected by `partition`, e.g. `:train`, `:test`. Annotations are read
from `annotations-<partition>.jsonl` and recordings from `recordings.jsonl`, both located
in `manifestroot`.

Each item of the dataset is a nested tuple `((samples, sampling_rate), Annotation.data)`.

See also [`Annotation`](@ref).

# Examples
```julia-repl
julia> ds = dataset("./manifests", :train)
SpeechDataset(
...
)
julia> ds[1]
(
    (samples=[...], sampling_rate=16_000),
    Dict(
        "text" => "Annotation text here"
    )
)
```
"""
function dataset(manifestroot, partition)
    annot_path = joinpath(manifestroot, "annotations-$(partition).jsonl")
    rec_path = joinpath(manifestroot, "recordings.jsonl")
    annotations = load(Annotation, annot_path)
    recordings = load(Recording, rec_path)
    return dataset(annotations, recordings, partition)
end

# Build a dataset from already-loaded annotations and recordings.
# Integer indexing follows the iteration order of `keys(annotations)`.
function dataset(annotations, recordings, partition)
    idxs = collect(keys(annotations))
    return SpeechDataset(idxs, annotations, recordings, Symbol(partition))
end

# Fetch one item by annotation key: resolve the annotation, then its recording via
# `ann.recording_id`, and call `load(rec, ann)` to obtain the samples and sampling rate.
function Base.getindex(d::SpeechDataset, key::AbstractString)
    ann = d.annotations[key]
    rec = d.recordings[ann.recording_id]
    samples, sr = load(rec, ann)
    return (samples=samples, sampling_rate=sr), ann.data
end

# Integer indexing goes through the stored key order.
Base.getindex(d::SpeechDataset, idx::Integer) = getindex(d, d.idxs[idx])

# Fix1 -> partial function with fixed 1st argument
Base.getindex(d::SpeechDataset, idxs::AbstractVector) = map(Base.Fix1(getindex, d), idxs)

Base.length(d::SpeechDataset) = length(d.idxs)

# Return a new dataset containing only the items whose annotation key satisfies `fn`.
# The result is always tagged with the `:custom` partition.
function Base.filter(fn, d::SpeechDataset)
    fidxs = filter(fn, d.idxs)
    fannotations = filter(k_v -> fn(first(k_v)), d.annotations)
    # `fn` is a predicate on annotation keys, so it must not be applied to recording ids:
    # doing so could drop a recording that a surviving annotation still points to and make
    # `getindex` fail with a `KeyError`. Keep the recordings referenced by the surviving
    # annotations instead.
    rec_ids = Set(ann.recording_id for ann in values(fannotations))
    frecs = filter(k_v -> first(k_v) in rec_ids, d.recordings)
    return SpeechDataset(fidxs, fannotations, frecs, :custom)
end