Skip to content
Snippets Groups Projects
Commit c234333f authored by Martin Kocour's avatar Martin Kocour
Browse files

Timit data preparation

parent 1bee8c12
Branches
Tags
1 merge request!1Timit data preparation
......@@ -7,6 +7,7 @@ version = "0.7.0"
Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
WAV = "8149f6b0-98f6-5db9-b78f-408fbbb8ef88"
[compat]
......
......@@ -7,12 +7,10 @@ using HTTP
using JSON
using WAV
using SpeechFeatures: AbstractAudioSource, CmdAudioSource, FileAudioSource, URLAudioSource, loadaudio
export
# ManifestItem
CmdAudioSource,
FileAudioSource,
RawAudioSource,
URLAudioSource,
Recording,
Supervision,
load,
......
......@@ -228,52 +228,111 @@ end
# Language tag reported for the TIMIT corpus.
function lang(::TIMIT)
    return "eng"
end

# Short name reported for the TIMIT corpus.
function name(::TIMIT)
    return "timit"
end
"""
    prepare(timit::TIMIT, dir; audio_fmt="SPHERE")

Write the recording and supervision manifests of the TIMIT corpus stored in
`timit.datapath` into the directory `dir` (created when it does not exist).

One recording manifest and one supervision manifest are written for each of
the subsets `train`, `dev` and `test`. `audio_fmt` is forwarded to
`timit_recordings` as its `fmt` keyword.

Throws an `ArgumentError` when `timit.datapath` is not a directory.
"""
function prepare(timit::TIMIT, dir; audio_fmt="SPHERE")
    # Validate the data directory
    isdir(timit.datapath) || throw(ArgumentError("invalid path $(timit.datapath)"))

    # Create the output directory.
    dir = mkpath(dir)

    # NOTE(review): "dev" is read from the same sub-directory as "train", so
    # both subsets currently receive the same content -- confirm this is intended.
    for (subset, subdir) in [("train", "train"), ("dev", "train"), ("test", "test")]
        sdir = joinpath(timit.datapath, subdir)

        # Recordings
        manifestpath = joinpath(dir, manifestname(Recording, subset))
        @debug "preparing $manifestpath"
        recordings = timit_recordings(sdir; fmt=audio_fmt)
        open(manifestpath, "w") do f
            writemanifest(f, recordings)
        end

        # Supervision
        manifestpath = joinpath(dir, manifestname(Supervision, subset))
        @debug "Preparing $manifestpath"
        supervisions = timit_supervisions(sdir)
        open(manifestpath, "w") do f
            writemanifest(f, supervisions)
        end
    end
end
"""
    timit_recordings(dir; fmt="SPHERE")

Walk `dir` recursively and build one `Recording` per `.wav` file found.

Returns a dictionary mapping the recording id `timit_<spk>_<name>` to its
`Recording`, where `<spk>` is the name of the directory that contains the
file and `<name>` is the file name without extension.

When `fmt == "SPHERE"` the audio source is the command
`sph2pipe -f wav <path>`; for any other value the file itself is used as a
`FileAudioSource`. Every recording is declared with channel `1` and a sample
rate of 16000.

Throws an `ArgumentError` when `dir` is not a directory.
"""
function timit_recordings(dir::AbstractString; fmt="SPHERE")
    isdir(dir) || throw(ArgumentError("expected directory $dir"))

    recordings = Dict()
    for (root, subdirs, files) in walkdir(dir)
        # The speaker name is the directory that directly holds the audio files.
        spk = basename(root)

        for file in files
            name, ext = splitext(file)
            ext != ".wav" && continue

            path = joinpath(root, file)
            id = "timit_$(spk)_$(name)"

            audio_src = if fmt == "SPHERE"
                CmdAudioSource(`sph2pipe -f wav $path`)
            else
                FileAudioSource(path)
            end

            recordings[id] = Recording(
                id,
                audio_src;
                channels = [1],
                samplerate = 16000
            )
        end
    end
    return recordings
end
"""
    timit_supervisions(dir; samplerate = 16000)

Walk `dir` recursively and build one `Supervision` per `.wrd` file found.

Returns a dictionary mapping the id `timit_<spk>_<name>` to its
`Supervision`, where `<spk>` is the name of the directory that contains the
file, and `<name>` is the file name without extension. The name of the parent
of the speaker directory is stored as the dialect.

Each non-blank line of a `.wrd` file is expected to hold three
whitespace-separated fields: start, end and word. Start and end are parsed as
integers and divided by `samplerate` (presumably sample indices converted to
seconds). The supervision spans from the start of the first line to the end
of the last line, and its text is the words joined by a single space.

Files without any non-blank line are skipped.

Throws an `ArgumentError` when `dir` is not a directory or when a line does
not have three fields.
"""
function timit_supervisions(dir; samplerate = 16000)
    isdir(dir) || throw(ArgumentError("expected directory $dir"))

    supervisions = Dict()
    for (root, subdirs, files) in walkdir(dir)
        # Derive speaker/dialect from the directory layout rather than by
        # splitting on a hard-coded "/" so that any path separator works.
        spk = basename(root)
        dialect = basename(dirname(root))

        for file in files
            name, ext = splitext(file)
            ext != ".wrd" && continue

            path = joinpath(root, file)
            id = "timit_$(spk)_$(name)"

            slines = Vector{SubString{String}}[]
            for line in eachline(path)
                all(isspace, line) && continue
                fields = rsplit(line, limit=3)
                length(fields) == 3 ||
                    throw(ArgumentError("malformed line in $path: $line"))
                push!(slines, fields)
            end
            # An empty transcription has no start/end time to report.
            isempty(slines) && continue

            start_ts = parse(Int, first(slines)[1]) / samplerate
            end_ts = parse(Int, last(slines)[2]) / samplerate
            dur = end_ts - start_ts
            words = [fields[3] for fields in slines]

            # NOTE(review): the second `id` is presumably the recording id and
            # `1` the channel -- confirm against the `Supervision` constructor.
            supervisions[id] = Supervision(
                id, id, start_ts, dur, 1,
                Dict(
                    "text" => join(words, " "),
                    "dialect" => dialect,
                    "speaker" => spk,
                    "sex" => string(first(spk))
                )
            )
        end
    end
    return supervisions
end
# Read the lexicon of a TIMIT corpus from the corpus' own data path.
function timit_lexicon(t::TIMIT)
    return timit_lexicon(t.datapath)
end
"""
    timit_lexicon(dir)

Read the pronunciation dictionary `doc/timitdic.txt` located under `dir` and
return it as a vector of `word => pronunciation` pairs, in file order.

Blank lines and lines starting with `;` are skipped. For every other line the
first whitespace-separated field is the word and the remainder is the
pronunciation. Surrounding `/`, tabs and spaces are stripped from the
pronunciation, which is then upper-cased. If the word contains a `~`, only
the part before the first `~` is kept.
"""
function timit_lexicon(dir)
    dictfile = joinpath(dir, "doc", "timitdic.txt")
    lexicon = Pair{String, String}[]
    for line in eachline(dictfile)
        # Blank lines carry no entry (and `first("")` would throw);
        # a leading `;` marks a comment line.
        (all(isspace, line) || startswith(line, ";")) && continue

        wrd, pron = split(line, limit=2)
        pron = strip(pron, ['/', '\t', ' '])
        # Keep only the part before the first `~`; a word without `~` is
        # returned unchanged by `split`.
        wrd = first(split(wrd, "~", limit=2))
        push!(lexicon, wrd => uppercase(pron))
    end
    return lexicon
end
# SPDX-License-Identifier: CECILL-2.1
#=====================================================================#
# HTML pretty display
# Render an audio source as an HTML `<audio>` element whose `src` attribute is
# a base64 data URI. The audio is loaded with `load` and written to `io` as WAV
# data (8 bits, PCM) through a base64 encoding pipe.
function Base.show(io::IO, ::MIME"text/html", r::AbstractAudioSource)
    print(io, "<audio controls ", "src=\"data:audio/wav;base64,")

    samples, srate = load(r)
    encoder = Base64EncodePipe(io)
    wavwrite(
        samples,
        encoder,
        Fs = srate,
        nbits = 8,
        compression = WAV.WAVE_FORMAT_PCM,
    )
    # Closing the pipe writes out whatever the encoder still holds before the
    # attribute is terminated.
    close(encoder)

    println(io, "\" />")
end
#=====================================================================#
# JSON serialization of a manifest item
......@@ -141,12 +126,15 @@ end
# Some utilities
# File name of the manifest holding the items of type `T` for the subset `name`.
manifestname(::Type{<:Recording}, name) = "recordings-$name.jsonl"
manifestname(::Type{<:Supervision}, name) = "supervisions-$name.jsonl"

"""
    load(Supervision, manifestpath)
    load(Recording, manifestpath)
    load(Supervision, manifestroot, subset)
    load(Recording, manifestroot, subset)

Load a Recording/Supervision manifest. With two arguments the manifest is
read from the file `manifestpath`. With three arguments the file name is
derived from the item type and `subset` (see `manifestname`) and looked up in
the directory `manifestroot`.
"""
load(T::Type{<:Union{Recording, Supervision}}, manifestpath::AbstractString) =
    open(f -> readmanifest(f, T), manifestpath, "r")

load(T::Type{<:Union{Recording, Supervision}}, manifestroot::AbstractString, subset) =
    load(T, joinpath(manifestroot, manifestname(T, subset)))

# Corpus-based variants: the manifest directory is `path(corpus, dir)`; when
# `dir` is omitted, `corporadir` is used.
load(corpus::SpeechCorpus, dir, T, subset) =
    load(T, joinpath(path(corpus, dir), manifestname(T, subset)))

load(corpus::SpeechCorpus, T, subset) =
    load(corpus, corporadir, T, subset)
# SPDX-License-Identifier: CECILL-2.1
"""
    abstract type AbstractAudioSource end

Supertype of all audio sources. The available audio sources are:
  * `CmdAudioSource`
  * `FileAudioSource`
  * `RawAudioSource`
  * `URLAudioSource`

The data of an audio source is loaded with

    load(s::AbstractAudioSource, subrange = :)

which returns the samples and the sampling rate.
"""
abstract type AbstractAudioSource end

"""
    CmdAudioSource(cmd)
    CmdAudioSource(c::String)

Audio source whose WAV data is read from the output of the command `cmd`.
When given a `String`, the command is built from its whitespace-separated
words.
"""
struct CmdAudioSource <: AbstractAudioSource
    cmd
end

function CmdAudioSource(c::String)
    words = String[String(w) for w in split(c)]
    return CmdAudioSource(Cmd(words))
end

"""
    FileAudioSource(path)

Audio source whose WAV data is read from the file `path`.
"""
struct FileAudioSource <: AbstractAudioSource
    path::AbstractString
end

"""
    RawAudioSource(data, srate)

Audio source holding the samples `data` (a matrix) and their sampling rate
`srate` directly. A vector of samples is stored as a one-column matrix.
"""
struct RawAudioSource <: AbstractAudioSource
    data::AbstractMatrix
    srate::Integer
end

function RawAudioSource(x::AbstractVector, srate)
    return RawAudioSource(x[:, :], srate)
end

"""
    URLAudioSource(url)

Audio source whose WAV data is the body of the HTTP response for `url`.
"""
struct URLAudioSource <: AbstractAudioSource
    url::AbstractString
end

function load(s::CmdAudioSource, subrange = :)
    bytes = read(pipeline(s.cmd))
    samples, srate = wavread(IOBuffer(bytes); subrange)
    return samples, srate
end

function load(s::FileAudioSource, subrange = :)
    samples, srate = wavread(s.path; subrange)
    return samples, srate
end

function load(s::RawAudioSource, subrange = :)
    return s.data[subrange, :], s.srate
end

function load(s::URLAudioSource, subrange = :)
    response = HTTP.get(s.url)
    samples, srate = wavread(IOBuffer(response.body); subrange)
    return samples, srate
end
"""
abstract type ManifestItem end
......@@ -79,7 +39,7 @@ end
function Recording(uttid, s::AbstractAudioSource; channels = missing, samplerate = missing)
if ismissing(channels) || ismissing(samplerate)
x, sr = load(s)
x, sr = loadaudio(s)
samplerate = ismissing(samplerate) ? Int(sr) : samplerate
channels = ismissing(channels) ? collect(1:size(x,2)) : channels
end
......@@ -98,7 +58,8 @@ end
A "supervision" defines a segment of a recording on a single channel.
The `data` field is an arbitrary dictionary holding the nature of the
supervision.
supervision. `start` and `duration` (in seconds) define
where the segment is located within the recording `recording_id`.
# Constructor
......@@ -142,7 +103,7 @@ function load(r::Recording; start = -1, duration = -1, channels = r.channels)
subrange = (:)
end
x, sr = load(r.source, subrange)
x, sr = loadaudio(r.source, subrange)
x[:,channels], sr
end
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment