# manifest_item.jl (4.08 KiB)
# Forked from FAST / SpeechDatasets.jl (56 commits behind the upstream repository).
# Authored by Lucas Ondel Yang.
# SPDX-License-Identifier: CECILL-2.1
"""
    abstract type AbstractAudioSource end

Base class for all audio sources. Possible audio sources are:
* `FileAudioSource`
* `URLAudioSource`
* `CmdAudioSource`

You can load the data of an audio source with the internal function

    loadsource(s::AbstractAudioSource, subrange)
"""
abstract type AbstractAudioSource end
"""
    FileAudioSource(path)

Audio source identified by a `path`. `loadsource` hands `path` directly to
`wavread`.
"""
struct FileAudioSource <: AbstractAudioSource
path::AbstractString
end
"""
    URLAudioSource(url)

Audio source identified by a `url`. `loadsource` fetches it with `HTTP.get`
and decodes the response body with `wavread`.
"""
struct URLAudioSource <: AbstractAudioSource
url::AbstractString
end
"""
    CmdAudioSource(cmd)
    CmdAudioSource(c::AbstractString)

Audio source whose data is produced by running a command. `loadsource` reads
the output of `pipeline(cmd)` and decodes it with `wavread`.

When built from a string, the string is split on whitespace and each piece
becomes one argument of the resulting `Cmd`. No shell-style quoting is
interpreted, so an argument containing spaces must be passed as a ready-made
`Cmd` instead.
"""
struct CmdAudioSource <: AbstractAudioSource
    cmd
end

# Accept any string type (e.g. a `SubString` obtained while parsing a manifest
# line), not only `String`; otherwise such a value would be stored un-split by
# the default constructor and only fail later, when the source is loaded.
CmdAudioSource(c::AbstractString) = CmdAudioSource(Cmd(String.(split(c))))
# Read the samples selected by `subrange` from each kind of audio source.
# All methods return whatever `wavread` returns.

# Local file: give the path straight to `wavread`.
function loadsource(s::FileAudioSource, subrange)
    return wavread(s.path; subrange = subrange)
end

# Remote file: download the response body, then decode it from memory.
function loadsource(s::URLAudioSource, subrange)
    response = HTTP.get(s.url)
    return wavread(IOBuffer(response.body); subrange = subrange)
end

# Command: run it, collect the bytes it outputs and decode them from memory.
function loadsource(s::CmdAudioSource, subrange)
    rawbytes = read(pipeline(s.cmd))
    return wavread(IOBuffer(rawbytes); subrange = subrange)
end
"""
    abstract type ManifestItem end

Base class for all manifest items. Every manifest item should have an
`id` attribute.
"""
abstract type ManifestItem end
"""
    struct Recording{Ts<:AbstractAudioSource} <: ManifestItem
        id::AbstractString
        source::Ts
        channels::Vector{Int}
        samplerate::Int
    end

A recording is an audio source associated with an id.

# Constructors

    Recording(id, source, channels, samplerate)
    Recording(id, source[; channels = missing, samplerate = missing])

If the channels or the sample rate are not provided then they will be
read from `source`.

!!! warning
    When preparing a large corpus, not providing the channels and/or the
    sample rate can drastically reduce the speed as it forces reading the
    source.
"""
struct Recording{Ts<:AbstractAudioSource} <: ManifestItem
id::AbstractString
source::Ts
channels::Vector{Int}
samplerate::Int
end
# Keyword constructor: any of `channels` / `samplerate` left `missing` is
# inferred by loading the whole source once.
function Recording(uttid, s::AbstractAudioSource; channels = missing, samplerate = missing)
    # Everything was supplied by the caller: no need to touch the audio data.
    if !ismissing(channels) && !ismissing(samplerate)
        return Recording(uttid, s, channels, samplerate)
    end

    signal, rate = loadsource(s, :)
    if ismissing(samplerate)
        samplerate = Int(rate)
    end
    if ismissing(channels)
        # One entry per column of the loaded signal.
        channels = collect(1:size(signal, 2))
    end
    return Recording(uttid, s, channels, samplerate)
end
"""
    struct Supervision <: ManifestItem
        id::AbstractString
        recording_id::AbstractString
        start::Float64
        duration::Float64
        channel::Int
        data::Dict
    end

A "supervision" defines a segment of a recording on a single channel.
The `data` field is an arbitrary dictionary holding the nature of the
supervision.

# Constructors

    Supervision(id, recording_id, start, duration, channel, data)
    Supervision(id, recording_id[; channel = missing, start = -1, duration = -1, data = missing])

If `start` and/or `duration` are negative, the segment is considered to
be the whole sequence length of the recording.

With the keyword constructor, a `missing` channel is replaced by channel `1`
and `missing` data by an empty `Dict`.
"""
struct Supervision <: ManifestItem
    id::AbstractString
    recording_id::AbstractString
    start::Float64
    duration::Float64
    channel::Int
    data::Dict
end

# `missing` cannot be converted to the `Int` / `Dict` field types, so forwarding
# the keyword defaults unchanged made `Supervision(id, recid)` throw a
# `MethodError`. Substitute concrete fallbacks before building the struct.
Supervision(id, recid; channel = missing, start = -1, duration = -1, data = missing) =
    Supervision(id, recid, start, duration, coalesce(channel, 1), coalesce(data, Dict()))
"""
    load(recording[; start = -1, duration = -1, channels = recording.channels])
    load(recording, supervision)

Load the signal from a recording. `start` and `duration` (in seconds) can
be used to load only a segment; if either is negative the whole recording
is loaded. If a `supervision` is given, the function will return only the
portion of the signal corresponding to the supervision segment.

The function returns a tuple `(x, sr)` where `x` is a ``N \\times C`` array
(``N`` is the length of the signal and ``C`` is the number of channels)
and `sr` is the sampling rate of the signal.
"""
function load(r::Recording; start = -1, duration = -1, channels = r.channels)
    if start >= 0 && duration >= 0
        # 1-based, inclusive sample indices of the window
        # [start, start + duration]. The end must be offset by `start`:
        # using `duration` alone yields a wrong (usually empty) range as soon
        # as `start > 0`.
        first_sample = floor(Int, start * r.samplerate) + 1
        last_sample = ceil(Int, (start + duration) * r.samplerate)
        subrange = first_sample:last_sample
    else
        subrange = (:)
    end
    x, sr, _, _ = loadsource(r.source, subrange)
    return x[:, channels], sr
end
# Load only the segment of `r` described by the supervision `s`, restricted to
# the supervision's single channel.
function load(r::Recording, s::Supervision)
    return load(r; start = s.start, duration = s.duration, channels = [s.channel])
end