# dataset.jl
# SPDX-License-Identifier: CECILL-2.1

"""
    SpeechDataset

Data container that pairs annotations with the recordings they refer to.

# Fields
- `idxs`: annotation keys; integer indexing is resolved through this vector.
- `annotations`: annotations stored by key.
- `recordings`: recordings, looked up through an annotation's `recording_id`.
- `partition`: partition label, e.g. `:train` or `:test`.
"""
struct SpeechDataset <: MLUtils.AbstractDataContainer
    idxs::Vector{AbstractString}
    annotations::Dict{AbstractString, Annotation}
    recordings::Dict{AbstractString, Recording}
    partition::Symbol
end

"""
    dataset(manifestroot, partition)

Load a `SpeechDataset` from the manifest files stored in `manifestroot`.
The partition is selected by `partition`, e.g. `:train`, `:test`.

Annotations are read from `annotations-<partition>.jsonl` and recordings from
`recordings.jsonl`, both located directly under `manifestroot`.

Each item of the dataset is a nested tuple `((samples, sampling_rate), Annotation.data)`.

# Examples
```julia-repl
julia> ds = dataset("./manifests", :train)
SpeechDataset(
    ...
)
julia> ds[1]
(
    (samples=[...], sampling_rate=16_000),
    Dict(
        ...
    )
)
```
"""
function dataset(manifestroot, partition)
    annot_path = joinpath(manifestroot, "annotations-$(partition).jsonl")
    rec_path = joinpath(manifestroot, "recordings.jsonl")
    annotations = load(Annotation, annot_path)
    recordings = load(Recording, rec_path)
    return dataset(annotations, recordings, partition)
end

"""
    dataset(annotations, recordings, partition)

Build a `SpeechDataset` from already loaded `annotations` and `recordings`.

The keys of `annotations` become the positional index of the dataset, and
`partition` is converted with `Symbol(partition)` before being stored.
"""
function dataset(annotations, recordings, partition)
    # Materialize the keys once so that integer indexing has a fixed order.
    idxs = collect(keys(annotations))
    return SpeechDataset(idxs, annotations, recordings, Symbol(partition))
end

# Key-based access: return `((samples=..., sampling_rate=...), ann.data)` for the
# annotation stored under `key`.
# Throws a `KeyError` when `key` is not an annotation key, or when the annotation's
# `recording_id` has no entry in `d.recordings`.
function Base.getindex(d::SpeechDataset, key::AbstractString)
    ann = d.annotations[key]
    # The recording is resolved through the annotation, not through `key` itself.
    rec = d.recordings[ann.recording_id]
    samples, sr = load(rec, ann)
    return (samples=samples, sampling_rate=sr), ann.data
end
# Positional access: translate the position into its annotation key, then index by key.
function Base.getindex(d::SpeechDataset, idx::Integer)
    key = d.idxs[idx]
    return d[key]
end
# Batch access: index the dataset with every element of `idxs` in turn and
# collect the results with `map`.
function Base.getindex(d::SpeechDataset, idxs::AbstractVector)
    return map(idxs) do i
        d[i]
    end
end

# Number of items in the dataset, i.e. the number of positional indices.
function Base.length(d::SpeechDataset)
    return length(d.idxs)
end
# Return a new `SpeechDataset` (partition `:custom`) that keeps only the items whose
# annotation key satisfies the predicate `fn`.
#
# `fn` is evaluated once per annotation key. The positional index is then derived
# from the surviving annotations, so `idxs` and `annotations` always agree and the
# original order of `d.idxs` is kept.
#
# Recordings are kept when a surviving annotation refers to them. Items are loaded
# through `ann.recording_id`, so applying `fn` to recording ids could drop a
# recording that is still needed and make indexing fail with a `KeyError`.
function Base.filter(fn, d::SpeechDataset)
    fannotations = filter(kv -> fn(first(kv)), d.annotations)
    fidxs = filter(k -> haskey(fannotations, k), d.idxs)
    used = Set(ann.recording_id for ann in values(fannotations))
    frecs = filter(kv -> first(kv) in used, d.recordings)
    return SpeechDataset(fidxs, fannotations, frecs, :custom)
end