# dataset.jl

# SPDX-License-Identifier: CECILL-2.1

# Indexable collection of speech utterances. Subtypes `MLUtils.AbstractDataContainer`;
# items are retrieved with `getindex` and counted with `length`.
# NOTE(review): the containers use abstract element types (`AbstractString`), which
# boxes elements; parametrizing may help performance — confirm against what `load` returns.
struct SpeechDataset <: MLUtils.AbstractDataContainer
    # Supervision keys in a fixed order; integer indexing goes through this vector.
    idxs::Vector{AbstractString}
    # Supervision key => `Supervision`.
    supervisions::Dict{AbstractString, Supervision}
    # Recording id => `Recording`; looked up via a supervision's `recording_id`.
    recordings::Dict{AbstractString, Recording}
    # Name of the partition this dataset holds, e.g. `:train` or `:test`.
    partition::Symbol
end

"""
    dataset(manifestroot, partition)

Load `SpeechDataset` from manifest files stored in `manifestroot`.
Partition is specified by `partition`, e.g. `:train`, `:test`.

Each item of the dataset is a nested tuple `((samples, sampling_rate), Supervision.data)`.

See also [`Supervision`](@ref).

# Examples
```julia-repl
julia> ds = dataset("./manifests", :train)
SpeechDataset(
    ...
)
julia> ds[1]
(
    (samples=[...], sampling_rate=16_000),
    Dict(
        "text" => "Supervision text here"
    )
)
```
"""
function dataset(manifestroot::AbstractString, partition)
    # Supervisions are stored per partition ("supervisions-<partition>.jsonl"),
    # whereas a single "recordings.jsonl" is shared by all partitions.
    # (Previously interpolated the undefined name `subset`.)
    sup_path = joinpath(manifestroot, "supervisions-$(partition).jsonl")
    rec_path = joinpath(manifestroot, "recordings.jsonl")
    supervisions = load(Supervision, sup_path)
    recordings = load(Recording, rec_path)
    # Forward `partition` too: only the three-argument method
    # `dataset(supervisions, recordings, partition)` exists.
    return dataset(supervisions, recordings, partition)
end

"""
    dataset(supervisions, recordings, partition)

Build a `SpeechDataset` from already-loaded manifests.

The keys of `supervisions` are collected into the dataset's positional index and
`partition` is stored as a `Symbol`.
"""
function dataset(supervisions, recordings, partition)
    return SpeechDataset(
        collect(keys(supervisions)),
        supervisions,
        recordings,
        Symbol(partition),
    )
end

# Fetch one item by supervision key: look up the supervision, resolve its recording
# through `recording_id`, load the audio, and pair it with the supervision's data.
function Base.getindex(d::SpeechDataset, key::AbstractString)
    supervision = d.supervisions[key]
    recording = d.recordings[supervision.recording_id]
    audio, rate = load(recording, supervision)
    signal = (samples=audio, sampling_rate=rate)
    return (signal, supervision.data)
end
# Positional access: translate the integer position into its supervision key.
function Base.getindex(d::SpeechDataset, idx::Integer)
    key = d.idxs[idx]
    return getindex(d, key)
end
# Batched access: fetch every requested item, dispatching element by element to the
# scalar `getindex` methods.
function Base.getindex(d::SpeechDataset, idxs::AbstractVector)
    return map(i -> getindex(d, i), idxs)
end

# Number of items in the dataset, i.e. the number of indexed supervision keys.
function Base.length(d::SpeechDataset)
    return length(d.idxs)
end