Skip to content
Snippets Groups Projects
Commit c234333f authored by Martin Kocour's avatar Martin Kocour
Browse files

Timit data preparation

parent 1bee8c12
No related branches found
No related tags found
No related merge requests found
......@@ -7,6 +7,7 @@ version = "0.7.0"
Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
WAV = "8149f6b0-98f6-5db9-b78f-408fbbb8ef88"
[compat]
......
......@@ -7,12 +7,10 @@ using HTTP
using JSON
using WAV
using SpeechFeatures: AbstractAudioSource, CmdAudioSource, FileAudioSource, URLAudioSource, loadaudio
export
# ManifestItem
CmdAudioSource,
FileAudioSource,
RawAudioSource,
URLAudioSource,
Recording,
Supervision,
load,
......
......@@ -228,52 +228,111 @@ end
lang(::TIMIT) = "eng"
name(::TIMIT) = "timit"
"""
    prepare(timit::TIMIT, dir; audio_fmt="SPHERE")

Write the recording and supervision manifests of the TIMIT corpus into `dir`
(created if needed).

One recording manifest and one supervision manifest are written for each of the
subsets `"train"`, `"dev"` and `"test"`; file names come from `manifestname`.

# Keywords
- `audio_fmt`: forwarded to `timit_recordings` as `fmt`.

# Throws
- `ArgumentError` if `timit.datapath` is not a directory.
"""
function prepare(timit::TIMIT, dir; audio_fmt="SPHERE")
    # Validate the data directory.
    isdir(timit.datapath) || throw(ArgumentError("invalid path $(timit.datapath)"))

    # Create the output directory.
    dir = mkpath(dir)

    # NOTE(review): "train" and "dev" are both built from the `train`
    # sub-directory, so the two subsets currently contain the same data.
    for (subset, subdir) in [("train", "train"), ("dev", "train"), ("test", "test")]
        sdir = joinpath(timit.datapath, subdir)

        # Recordings
        manifestpath = joinpath(dir, manifestname(Recording, subset))
        @debug "preparing $manifestpath"
        recordings = timit_recordings(sdir; fmt=audio_fmt)
        open(manifestpath, "w") do f
            writemanifest(f, recordings)
        end

        # Supervision
        manifestpath = joinpath(dir, manifestname(Supervision, subset))
        @debug "Preparing $manifestpath"
        supervisions = timit_supervisions(sdir)
        open(manifestpath, "w") do f
            writemanifest(f, supervisions)
        end
    end

    return nothing
end
"""
    timit_recordings(dir::AbstractString; fmt="SPHERE")

Walk `dir` recursively and return a dictionary mapping a recording id to a
`Recording` for every file whose extension is exactly `.wav`.

The id is `"timit_<spk>_<name>"`, where `<spk>` is the name of the directory
containing the file and `<name>` is the file name without extension. Every
recording is declared with `channels = [1]` and `samplerate = 16000`.

# Keywords
- `fmt`: when equal to `"SPHERE"` the audio source is the command
  `sph2pipe -f wav <path>`; for any other value it is a `FileAudioSource`
  pointing at the file itself.

# Throws
- `ArgumentError` if `dir` is not a directory.
"""
function timit_recordings(dir::AbstractString; fmt="SPHERE")
    isdir(dir) || throw(ArgumentError("expected directory $dir"))

    recordings = Dict()
    for (root, _, files) in walkdir(dir)
        # The speaker is identified by the directory holding the audio files.
        spk = basename(root)
        for file in files
            name, ext = splitext(file)
            ext == ".wav" || continue

            path = joinpath(root, file)
            id = "timit_$(spk)_$(name)"
            audio_src = if fmt == "SPHERE"
                CmdAudioSource(`sph2pipe -f wav $path`)
            else
                FileAudioSource(path)
            end

            recordings[id] = Recording(
                id,
                audio_src;
                channels = [1],
                samplerate = 16000
            )
        end
    end
    return recordings
end
"""
    timit_supervisions(dir)

Walk `dir` recursively and return a dictionary mapping a supervision id to a
`Supervision` for every `.wrd` file found.

Each non-blank line of a `.wrd` file is split from the right into three
whitespace-separated fields, read as `start end word` with `start` and `end`
expressed in samples (converted to seconds by dividing by 16000). The
supervision spans from the first start to the last end, and its data holds the
keys `"text"`, `"dialect"`, `"speaker"` and `"sex"`.

The id is `"timit_<spk>_<name>"`, the same scheme `timit_recordings` uses, and
is passed both as supervision id and as recording id.

# Throws
- `ArgumentError` if `dir` is not a directory.
"""
function timit_supervisions(dir)
    isdir(dir) || throw(ArgumentError("expected directory $dir"))
    splitline(line) = rsplit(line, limit=3)

    supervisions = Dict()
    for (root, _, files) in walkdir(dir)
        # Expected layout: <...>/<dialect>/<speaker>/<utterance>.wrd
        # `basename`/`dirname` are used instead of splitting on "/" so that the
        # platform's path separator is honoured.
        spk = basename(root)
        dialect = basename(dirname(root))

        for file in files
            name, ext = splitext(file)
            ext == ".wrd" || continue

            path = joinpath(root, file)
            id = "timit_$(spk)_$(name)"

            # Blank lines carry no (start, end, word) triple.
            slines = [splitline(line) for line in eachline(path) if !isempty(strip(line))]
            if isempty(slines)
                @warn "no word alignment found, skipping" path
                continue
            end

            start_ts = parse(Int, first(slines)[1]) / 16000
            end_ts = parse(Int, last(slines)[2]) / 16000
            dur = end_ts - start_ts
            words = [fields[3] for fields in slines]

            supervisions[id] = Supervision(
                id, id, start_ts, dur, 1,
                Dict(
                    "text" => join(words, " "),
                    "dialect" => dialect,
                    "speaker" => spk,
                    # First character of the speaker directory name.
                    "sex" => string(first(spk))
                )
            )
        end
    end
    return supervisions
end
"""
    timit_lexicon(t::TIMIT)

Forward to `timit_lexicon(dir)` with `dir = t.datapath`.
"""
function timit_lexicon(t::TIMIT)
    return timit_lexicon(t.datapath)
end
"""
    timit_lexicon(dir)

Read `<dir>/doc/timitdic.txt` and return the lexicon as a vector of
`word => pronunciation` pairs, in file order.

Lines starting with `;` and blank lines are skipped. Every other line is split
on whitespace into a word and the remainder; the remainder is stripped of
surrounding `/`, tabs and spaces and upper-cased. If the word contains `~`,
only the part before the first `~` is kept.
"""
function timit_lexicon(dir)
    dictfile = joinpath(dir, "doc", "timitdic.txt")
    lexicon = Pair{String, String}[]
    for line in eachline(dictfile)
        # `startswith` is safe on an empty string, unlike `first(line)`.
        (isempty(strip(line)) || startswith(line, ';')) && continue

        wrd, pron = split(line, limit=2)
        pron = strip(pron, ['/', '\t', ' '])
        wrd = first(split(wrd, '~', limit=2))
        push!(lexicon, wrd => uppercase(pron))
    end
    return lexicon
end
# SPDX-License-Identifier: CECILL-2.1
#=====================================================================#
# HTML pretty display
# Display an audio source as an HTML `<audio>` element whose `src` is a
# base64 `data:` URI holding the audio re-encoded as 8-bit PCM WAV.
function Base.show(io::IO, ::MIME"text/html", r::AbstractAudioSource)
    # Opening of the tag, up to the start of the base64 payload.
    print(io, "<audio controls ", "src=\"data:audio/wav;base64,")

    samples, srate = load(r)

    # Everything written to the pipe is base64-encoded onto `io`; closing it
    # flushes the final padding before the attribute is terminated.
    encoder = Base64EncodePipe(io)
    wavwrite(samples, encoder, Fs = srate, nbits = 8, compression = WAV.WAVE_FORMAT_PCM)
    close(encoder)

    println(io, "\" />")
end
#=====================================================================#
# JSON serialization of a manifest item
......@@ -141,12 +126,15 @@ end
# Some utilities

"""
    manifestname(Recording, name)
    manifestname(Supervision, name)

Return the manifest file name for the subset `name`:
`"recordings-<name>.jsonl"` or `"supervisions-<name>.jsonl"`.
"""
manifestname(::Type{<:Recording}, name) = "recordings-$name.jsonl"
manifestname(::Type{<:Supervision}, name) = "supervisions-$name.jsonl"

"""
    load(Supervision, manifestpath)
    load(Recording, manifestpath)
    load(T, manifestroot, subset)
    load(corpus, dir, T, subset)
    load(corpus, T, subset)

Load a Recording/Supervision manifest.

With a single path, the manifest is read from `manifestpath`. With a root
directory and a `subset`, the file `manifestname(T, subset)` inside
`manifestroot` is read. The `corpus` forms resolve the directory with
`path(corpus, dir)`; `dir` defaults to `corporadir`.
"""
load(T::Type{<:Union{Recording, Supervision}}, manifestpath::AbstractString) =
    open(f -> readmanifest(f, T), manifestpath, "r")

load(T::Type{<:Union{Recording, Supervision}}, manifestroot::AbstractString, subset) =
    load(T, joinpath(manifestroot, manifestname(T, subset)))

load(corpus::SpeechCorpus, dir, T, subset) =
    load(T, joinpath(path(corpus, dir), manifestname(T, subset)))

load(corpus::SpeechCorpus, T, subset) =
    load(corpus, corporadir, T, subset)
# SPDX-License-Identifier: CECILL-2.1
"""
abstract type AbstractAudioSource end
Base class for all audio source. Possible audio sources are:
* `CmdAudioSource`
* `FileAudioSource`
* `RawAudioSource`
* `URLAudioSource`
You can load the data of an audio source with the function
load(s::AbstractAudioSource, subrange = :)
"""
abstract type AbstractAudioSource end
"""
    CmdAudioSource(cmd)
    CmdAudioSource(c::String)

Audio source whose data is the output of a command: `load` runs `cmd`, reads
its output and decodes it with `wavread`.

When built from a `String`, the string is split on whitespace and each piece
becomes one argument of the command; no shell quoting is interpreted, so an
argument containing spaces must be passed through a `Cmd` instead.
"""
struct CmdAudioSource <: AbstractAudioSource
cmd
end
CmdAudioSource(c::String) = CmdAudioSource(Cmd(String.(split(c))))
"""
    FileAudioSource(path)

Audio source stored in a file; `load` passes `path` to `wavread`.
"""
struct FileAudioSource <: AbstractAudioSource
path::AbstractString
end
"""
    RawAudioSource(data::AbstractMatrix, srate)
    RawAudioSource(x::AbstractVector, srate)

Audio source holding its samples in memory together with the sampling rate
`srate`. `load` indexes `data` as `data[subrange, :]`, i.e. the first dimension
is the one `subrange` selects from.

A vector is stored as a one-column matrix.
"""
struct RawAudioSource <: AbstractAudioSource
data::AbstractMatrix
srate::Integer
end
RawAudioSource(x::AbstractVector, srate) = RawAudioSource(x[:,:], srate)
"""
    URLAudioSource(url)

Audio source located at `url`; `load` fetches it with `HTTP.get` and decodes
the response body with `wavread`.
"""
struct URLAudioSource <: AbstractAudioSource
url::AbstractString
end
# Run the command, capture its output and decode it as WAV; only the first
# two values returned by `wavread` are kept.
function load(s::CmdAudioSource, subrange = :)
    bytes = read(pipeline(s.cmd))
    return wavread(IOBuffer(bytes); subrange)[1:2]
end

# Decode the WAV file at `s.path`.
function load(s::FileAudioSource, subrange = :)
    return wavread(s.path; subrange)[1:2]
end

# Slice the in-memory samples and pair them with the stored sampling rate.
function load(s::RawAudioSource, subrange = :)
    return s.data[subrange, :], s.srate
end

# Download the resource and decode the response body as WAV.
function load(s::URLAudioSource, subrange = :)
    response = HTTP.get(s.url)
    return wavread(IOBuffer(response.body); subrange)[1:2]
end
"""
abstract type ManifestItem end
......@@ -79,7 +39,7 @@ end
function Recording(uttid, s::AbstractAudioSource; channels = missing, samplerate = missing)
if ismissing(channels) || ismissing(samplerate)
x, sr = load(s)
x, sr = loadaudio(s)
samplerate = ismissing(samplerate) ? Int(sr) : samplerate
channels = ismissing(channels) ? collect(1:size(x,2)) : channels
end
......@@ -98,7 +58,8 @@ end
A "supervision" defines a segment of a recording on a single channel.
The `data` field is an arbitrary dictionary holding the nature of the
supervision.
supervision. `start` and `duration` (in seconds) define
where the segment is located within the recording `recording_id`.
# Constructor
......@@ -142,7 +103,7 @@ function load(r::Recording; start = -1, duration = -1, channels = r.channels)
subrange = (:)
end
x, sr = load(r.source, subrange)
x, sr = loadaudio(r.source, subrange)
x[:,channels], sr
end
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment