Skip to content
Snippets Groups Projects
Commit dbe1036d authored by Lucas Ondel Yang's avatar Lucas Ondel Yang
Browse files

Merge branch 'dataset' into 'main'

Dataset

See merge request fast/speechcorpora.jl!3
parents fcdc1d21 620e3571
No related branches found
No related tags found
No related merge requests found
......@@ -7,6 +7,7 @@ version = "0.7.0"
Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
WAV = "8149f6b0-98f6-5db9-b78f-408fbbb8ef88"
......
......@@ -7,7 +7,9 @@ using HTTP
using JSON
using WAV
using SpeechFeatures: AbstractAudioSource, CmdAudioSource, FileAudioSource, URLAudioSource, loadaudio
using SpeechFeatures
import MLUtils
export
# ManifestItem
......@@ -28,11 +30,17 @@ export
# Corpora
MultilingualLibriSpeech,
MiniLibriSpeech,
TIMIT
TIMIT,
# Dataset
dataset
include("speechcorpus.jl")
include("manifest_item.jl")
include("manifest_io.jl")
include("dataset.jl")
# Supported corpora
include("corpora/multilingual_librispeech.jl")
include("corpora/mini_librispeech.jl")
include("corpora/timit.jl")
......
# SPDX-License-Identifier: CECILL-2.1
"""
FastDataset(supervisions, recordings, partition)
Constructor for dataset represented as JSONL files (a.k.a. manifests).
"""
struct SpeechDataset <: MLUtils.AbstractDataContainer
idxs::Vector{AbstractString}
superivions::Dict{AbstractString, Supervision}
recordings::Dict{AbstractString, Recording}
partition::Symbol
end
"""
dataset(manifestroot, subset)
Load `SpeechDataset` from manifest files stored in `manifestroot`.
Partition is specified by `subset`, e.g. `:train`, `:test`.
Each item of the dataset is a nested tuple `((samples, sampling_rate), Supervision.data)`.
See also [`Supervision`](@ref).
# Examples
```julia-repl
julia> ds = dataset("./manifests", :train)
SpeechDataset(
...
)
julia> ds[1]
(
(samples=[...], sampling_rate=16_000),
Dict(
"text" => "Supervision text here"
)
)
```
"""
function dataset(manifestroot::AbstractString, subset)
supervisions = load(Supervision, manifestroot, subset)
recordings = load(Recording, manifestroot, subset)
idxs = collect(keys(supervisions))
SpeechDataset(idxs, supervisions, recordings, Symbol(subset))
end
# Keyed lookup: resolve the supervision stored under `key`, find the recording
# it points to, load the audio, and pair it with the supervision's data.
function Base.getindex(d::SpeechDataset, key::AbstractString)
    supervision = d.superivions[key]
    recording = d.recordings[supervision.recording_id]
    waveform, rate = load(recording, supervision)
    audio = (samples=waveform, sampling_rate=rate)
    return (audio, supervision.data)
end
# Positional lookup: translate position `idx` into its key, then fetch by key.
function Base.getindex(d::SpeechDataset, idx::Integer)
    key = d.idxs[idx]
    return getindex(d, key)
end
# Batched lookup: fetch one item per entry of `idxs`, delegating each entry to
# the single-item `getindex` methods.
function Base.getindex(d::SpeechDataset, idxs::AbstractVector)
    return map(idxs) do i
        getindex(d, i)
    end
end
# Number of items in the dataset: one per entry of the position -> key table.
function Base.length(d::SpeechDataset)
    return length(d.idxs)
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment