Skip to content
Snippets Groups Projects
Commit 620e3571 authored by Martin Kocour's avatar Martin Kocour Committed by Lucas Ondel Yang
Browse files

Dataset

parent fcdc1d21
No related branches found
No related tags found
No related merge requests found
......@@ -7,6 +7,7 @@ version = "0.7.0"
Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
WAV = "8149f6b0-98f6-5db9-b78f-408fbbb8ef88"
......
......@@ -7,7 +7,9 @@ using HTTP
using JSON
using WAV
using SpeechFeatures: AbstractAudioSource, CmdAudioSource, FileAudioSource, URLAudioSource, loadaudio
using SpeechFeatures
import MLUtils
export
# ManifestItem
......@@ -28,11 +30,17 @@ export
# Corpora
MultilingualLibriSpeech,
MiniLibriSpeech,
TIMIT
TIMIT,
# Dataset
dataset
include("speechcorpus.jl")
include("manifest_item.jl")
include("manifest_io.jl")
include("dataset.jl")
# Supported corpora
include("corpora/multilingual_librispeech.jl")
include("corpora/mini_librispeech.jl")
include("corpora/timit.jl")
......
# SPDX-License-Identifier: CECILL-2.1
"""
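    SpeechDataset(idxs, superivions, recordings, partition)

Dataset backed by JSONL files (a.k.a. manifests). `idxs` fixes the order of the
supervision keys used for integer indexing, `superivions` and `recordings` map ids to
their manifest items, and `partition` names the loaded subset (e.g. `:train`).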
"""
struct SpeechDataset <: MLUtils.AbstractDataContainer
# Supervision keys in a fixed order; integer indexing goes through `idxs[i]`.
idxs::Vector{AbstractString}
# Supervisions by key. NOTE(review): the field name is misspelled ("superivions");
# `getindex` reads it under this exact name, so any rename must update both together.
superivions::Dict{AbstractString, Supervision}
# Recordings, looked up through a supervision's `recording_id`.
recordings::Dict{AbstractString, Recording}
# Name of the loaded subset; `dataset` stores `Symbol(subset)` here.
partition::Symbol
end
"""
dataset(manifestroot, subset)
Load `SpeechDataset` from manifest files stored in `manifestroot`.
Partition is specified by `subset`, e.g. `:train`, `:test`.
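Each item of the dataset is a tuple `((samples=..., sampling_rate=...), data)`: the first element is a named tuple holding the loaded audio, and `data` is the `data` field of the matching `Supervision`.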
See also [`Supervision`](@ref).
# Examples
```julia-repl
julia> ds = dataset("./manifests", :train)
SpeechDataset(
...
)
julia> ds[1]
(
(samples=[...], sampling_rate=16_000),
Dict(
"text" => "Supervision text here"
)
)
```
"""
function dataset(manifestroot::AbstractString, subset)
    # Both manifests are read from the same root and for the same subset.
    sups = load(Supervision, manifestroot, subset)
    recs = load(Recording, manifestroot, subset)

    # Materialise the supervision keys as a vector so that items can also be
    # addressed by integer position, not only by key.
    keyorder = collect(keys(sups))

    return SpeechDataset(keyorder, sups, recs, Symbol(subset))
end
function Base.getindex(d::SpeechDataset, key::AbstractString)
    # Resolve the supervision first; it names the recording it belongs to.
    supervision = d.superivions[key]
    recording = d.recordings[supervision.recording_id]

    # `load` yields the samples followed by the sampling rate.
    samples, rate = load(recording, supervision)
    audio = (samples=samples, sampling_rate=rate)

    return (audio, supervision.data)
end
function Base.getindex(d::SpeechDataset, idx::Integer)
    # Translate the position into its key, then defer to the key-based method.
    key = d.idxs[idx]
    return getindex(d, key)
end
# Batched access: one `getindex(d, i)` call per element of `idxs`, results gathered by `map`.
function Base.getindex(d::SpeechDataset, idxs::AbstractVector)
    return map(idxs) do i
        getindex(d, i)
    end
end
function Base.length(d::SpeechDataset)
    # The dataset has exactly one item per stored key.
    return length(d.idxs)
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment