# manifest_item.jl

# SPDX-License-Identifier: CECILL-2.1

"""
    abstract type AbstractAudioSource end

Base class for all audio source. Possible audio sources are:
* `FileAudioSource`
* `URLAudioSource`
* `CmdAudioSource`

You can load the data of an audio source with the internal function

    loadsoce(s::AbstractAudioSource, subrange)

"""
abstract type AbstractAudioSource end

"""
    struct FileAudioSource <: AbstractAudioSource
        path::AbstractString
    end

Audio source whose data is read from the file located at `path`.
"""
struct FileAudioSource <: AbstractAudioSource
    path::AbstractString
end

"""
    struct URLAudioSource <: AbstractAudioSource
        url::AbstractString
    end

Audio source whose data is fetched with an HTTP GET request on `url`.
"""
struct URLAudioSource <: AbstractAudioSource
    url::AbstractString
end

"""
    struct CmdAudioSource <: AbstractAudioSource
        cmd
    end

Audio source whose data is the output captured from running `cmd`.

# Constructors
    CmdAudioSource(cmd)
    CmdAudioSource(c::String)

When a `String` is given, it is split on whitespace and the resulting
words are used to build the `Cmd` to run.
"""
struct CmdAudioSource <: AbstractAudioSource
    cmd
end

function CmdAudioSource(c::String)
    # `split` yields substrings; `Cmd` is built from plain `String` words.
    words = map(String, split(c))
    return CmdAudioSource(Cmd(words))
end

# Hand the file at `s.path` to `wavread`, restricted to `subrange`.
function loadsource(s::FileAudioSource, subrange)
    return wavread(s.path; subrange = subrange)
end

# Fetch `s.url` with an HTTP GET and hand the response body to `wavread`.
function loadsource(s::URLAudioSource, subrange)
    response = HTTP.get(s.url)
    return wavread(IOBuffer(response.body); subrange = subrange)
end

# Run `s.cmd`, capture its output and hand the bytes to `wavread`.
function loadsource(s::CmdAudioSource, subrange)
    output = read(pipeline(s.cmd))
    return wavread(IOBuffer(output); subrange = subrange)
end

"""
    abstract type ManifestItem end

Base class for all manifest item. Every manifest item should have an
`id` attribute.
"""
abstract type ManifestItem end

"""
    struct Recording{Ts<:AbstractAudioSource} <: ManifestItem
        id::AbstractString
        source::Ts
        channels::Vector{Int}
        samplerate::Int
    end

A recording is an audio source associated with and id.

# Constructors
    Recording(id, source, channels, samplerate)
    Recording(id, source[; channels = missing, samplerate = missing])

If the channels or the sample rate are not provided then they will be
read from `source`.

!!! warning
    When preparing large corpus, not providing the channes and/or the
    sample rate can drastically reduce the speed as it forces to read
    source.
"""
struct Recording{Ts<:AbstractAudioSource} <: ManifestItem
    id::AbstractString
    source::Ts
    channels::Vector{Int}
    samplerate::Int
end

# Keyword constructor: any of `channels` / `samplerate` left as `missing`
# is filled in from the data returned by `loadsource` on the whole source.
function Recording(uttid, s::AbstractAudioSource; channels = missing, samplerate = missing)
    needsprobe = ismissing(channels) || ismissing(samplerate)
    if needsprobe
        signal, rate = loadsource(s, :)
        if ismissing(samplerate)
            samplerate = Int(rate)
        end
        if ismissing(channels)
            # One channel index per column of the loaded signal.
            channels = collect(1:size(signal, 2))
        end
    end
    return Recording(uttid, s, channels, samplerate)
end

"""
    struct Supervision <: ManifestItem
        id::AbstractString
        recording_id::AbstractString
        start::Float64
        duration::Float64
        channel::Int
        data::Dict
    end

A "supervision" defines a segment of a recording on a single channel.
The `data` field is an arbitrary dictionary holdin the nature of the
supervision.

# Constructor

    Supervision(id, recording_id, start, duration, channel, data)
    Supervision(id, recording_id[; channel = missing, start = -1, duration = -1, data = missing)

If `start` and/or `duration` are negative, the segment is considered to
be the whole sequence length of the recording.
"""
struct Supervision <: ManifestItem
    id::AbstractString
    recording_id::AbstractString
    start::Float64
    duration::Float64
    channel::Int
    data::Dict
end

Supervision(id, recid; channel = missing, start = -1, duration = -1, data = missing) =
    Supervision(id, recid, start, duration, channel, data)

"""
    load(recording[; start = -1, duration = -1, channels = recording.channels])
    load(recording, supervision)

Load the signal from a recording. `start`, `duration` (in seconds) can
be used to load only a segment. If a `supervision` is given, function
will return on the portion of the signal corresponding to the
supervision segment.

The function returns a tuple `(x, sr)` where `x` is a ``NxC`` array
- ``N`` is the length of the signal and ``C`` is the number of channels
- and `sr` is the sampling rate of the signal.
"""
function load(r::Recording; start = -1, duration = -1, channels = r.channels)
    if start >= 0 && duration >= 0
        s = Int(floor(start * r.samplerate + 1))
        e = Int(ceil(duration * r.samplerate))
        subrange = (s:e)
    else
        subrange = (:)
    end

    x, sr, _, _ = loadsource(r.source, subrange)
    x[:,channels], sr
end
load(r::Recording, s::Supervision) =
    load(r; start = s.start, duration = s.duration, channels = [s.channel])