Newer
Older
# SPDX-License-Identifier: CECILL-2.1
"""
    SpeechDataset <: MLUtils.AbstractDataContainer

Indexable collection of annotated speech items belonging to one partition.

# Fields
- `idxs`: annotation keys, in the order used when indexing with an integer.
- `annotations`: maps an annotation key to its `Annotation`.
- `recordings`: maps a recording id to its `Recording`; the id is read from the
  annotation's `recording_id` field when an item is fetched.
- `partition`: partition name, e.g. `:train` or `:test`.
"""
struct SpeechDataset <: MLUtils.AbstractDataContainer
    idxs::Vector{AbstractString}
    annotations::Dict{AbstractString, Annotation}
    recordings::Dict{AbstractString, Recording}
    partition::Symbol
end
"""
Load `SpeechDataset` from manifest files stored in `manifestroot`.
Partition is specified by `partition`, e.g. `:train`, `:test`.
Simon Devauchelle
committed
Each item of the dataset is a nested tuple `((samples, sampling_rate), Annotation.data)`.
Simon Devauchelle
committed
See also [`Annotation`](@ref).
# Examples
```julia-repl
julia> ds = dataset("./manifests", :train)
SpeechDataset(
...
)
julia> ds[1]
(
(samples=[...], sampling_rate=16_000),
Dict(
Simon Devauchelle
committed
"text" => "Annotation text here"
Simon Devauchelle
committed
annot_path = joinpath(manifestroot, "annotations-$(partition).jsonl")
rec_path = joinpath(manifestroot, "recordings.jsonl")
Simon Devauchelle
committed
annotations = load(Annotation, annot_path)
Simon Devauchelle
committed
dataset(annotations, recordings, partition)
Simon Devauchelle
committed
"""
    dataset(annotations, recordings, partition)

Build a `SpeechDataset` from already-loaded `annotations` and `recordings`.

The keys of `annotations` become the dataset's index list, and `partition` is
converted to a `Symbol`.
"""
function dataset(annotations, recordings, partition)
    partition_name = Symbol(partition)
    annotation_keys = annotations |> keys |> collect
    return SpeechDataset(annotation_keys, annotations, recordings, partition_name)
end
"""
    getindex(d::SpeechDataset, key::AbstractString)

Return the item stored under annotation `key` as the nested tuple
`((samples=..., sampling_rate=...), data)`, where `data` is the annotation's
`data` field.

# Throws
- `KeyError` if `key` is not in `d.annotations`, or if the annotation's
  `recording_id` is not in `d.recordings`.
"""
function Base.getindex(d::SpeechDataset, key::AbstractString)
    ann = d.annotations[key]
    # The recording is resolved through the annotation, not through `key`.
    rec = d.recordings[ann.recording_id]
    samples, sr = load(rec, ann)
    return (samples=samples, sampling_rate=sr), ann.data
end
# Positional access: translate the position into its annotation key, then
# delegate to the key-based method.
function Base.getindex(d::SpeechDataset, idx::Integer)
    key = d.idxs[idx]
    return getindex(d, key)
end
# Multi-item access: fetch every requested entry of `d`, one `getindex` call per
# element of `idxs`.
function Base.getindex(d::SpeechDataset, idxs::AbstractVector)
    return map(idxs) do i
        getindex(d, i)
    end
end
# Number of items in the dataset, i.e. the number of index keys it holds.
function Base.length(d::SpeechDataset)
    return length(d.idxs)
end
"""
    filter(fn, d::SpeechDataset)

Return a new `SpeechDataset` restricted to the items whose annotation key
satisfies the predicate `fn`. The result's partition is `:custom`.

Recordings are kept when at least one retained annotation refers to them through
its `recording_id`. `fn` is a predicate on annotation keys, so it is never applied
to recording ids: doing so could drop a recording that a retained annotation still
needs and make indexing the filtered dataset fail with a `KeyError`.
"""
function Base.filter(fn, d::SpeechDataset)
    fidxs = filter(fn, d.idxs)
    fannotations = filter(k_v -> fn(first(k_v)), d.annotations)
    # Recording ids still referenced after the annotations have been filtered.
    needed = Set(ann.recording_id for ann in values(fannotations))
    frecs = filter(k_v -> first(k_v) in needed, d.recordings)
    return SpeechDataset(fidxs, fannotations, frecs, :custom)
end