Timit data preparation

c234333f · Martin Kocour · 1bee8c12 · c234333f · c234333f · c234333f
Commit c234333f authored 1 year ago by Martin Kocour
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ version = "0.7.0"
 Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
 WAV = "8149f6b0-98f6-5db9-b78f-408fbbb8ef88"

 [compat]

--- a/src/SpeechCorpora.jl
+++ b/src/SpeechCorpora.jl
@@ -7,12 +7,10 @@ using HTTP
 using JSON
 using WAV

+using SpeechFeatures: AbstractAudioSource, CmdAudioSource, FileAudioSource, URLAudioSource, loadaudio
+
 export
    # ManifestItem
-    CmdAudioSource,
-    FileAudioSource,
-    RawAudioSource,
-    URLAudioSource,
    Recording,
    Supervision,
    load,

--- a/src/corpora/timit.jl
+++ b/src/corpora/timit.jl
@@ -228,52 +228,111 @@ end
 lang(::TIMIT) = "eng"
 name(::TIMIT) = "timit"

-function prepare(timit::TIMIT, dir)
+function prepare(timit::TIMIT, dir; audio_fmt="SPHERE")
    # Validate the data directory
    ! isdir(timit.datapath) && throw(ArgumentError("invalid path $(timit.datapath)"))

    # Create the output directory.
    dir = mkpath(dir)

-    # Get the list of speakers for the train set.
-    @debug "extracting training speaker list"
-    TIMIT_TRAIN_SPK_LIST = Set([])
-    for (root, dirs, files) in walkdir(joinpath(timit.datapath, "train"))
+    for (subset, subdir) in [("train", "train"), ("dev", "train"), ("test", "test")]
+        sdir = joinpath(timit.datapath, subdir)
+
+        # Recordings
+        manifestpath = joinpath(dir, manifestname(Recording, subset))
+        @debug "preparing $manifestpath"
+        recordings = timit_recordings(sdir; fmt=audio_fmt)
+        open(manifestpath, "w") do f
+            writemanifest(f, recordings)
+        end
+
+        # Supervision
+        manifestpath = joinpath(dir, manifestname(Supervision, subset))
+        @debug "Preparing $manifestpath"
+        supervisions = timit_supervisions(sdir)
+        open(manifestpath, "w") do f
+            writemanifest(f, supervisions)
+        end
+    end
+end
+
+function timit_recordings(dir::AbstractString; fmt="SPHERE")
+    ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
+
+    recordings = Dict()
+    for (root, subdirs, files) in walkdir(dir)
        for file in files
-            _, ext = splitext(basename(file))
+            name, ext = splitext(file)
            ext != ".wav" && continue
-            push!(TIMIT_TRAIN_SPK_LIST, basename(root))
+            spk = basename(root)
+            path = joinpath(root, file)
+            id = "timit_$(spk)_$(name)"
+
+            audio_src = if fmt == "SPHERE" 
+                CmdAudioSource(`sph2pipe -f wav $path`)
+            else
+                FileAudioSource(path)
+            end
+
+            recordings[id] = Recording(
+                id,
+                audio_src;
+                channels = [1],
+                samplerate = 16000
+            )
        end
    end
+    recordings
+end

-    for (subset, subdir) in [("train", "train"), ("dev", "train"), ("test", "test")]
-        manifestpath = joinpath(dir, manifestname(Recording, subset))
-        @debug "preparing $manifestpath"
+function timit_supervisions(dir)
+    ! isdir(dir) && throw(ArgumentError("expected directory $dir"))
+    splitline(line) = rsplit(line, limit=3)

-        recordings = Dict()
-        sdir = joinpath(timit.datapath, subdir)
+    supervisions = Dict()
+    for (root, subdirs, files) in walkdir(dir)
+        for file in files
+            name, ext = splitext(file)
+            ext != ".wrd" && continue

-        ! isdir(sdir) && throw(ArgumentError("expected directory $sdir"))
+            _, dialect, spk = rsplit(root, "/", limit=3)
+            path = joinpath(root, file)
+            id = "timit_$(spk)_$(name)"

-        for (root, subdirs, files) in walkdir(sdir)
-            for file in files
-                name, ext = splitext(file)
-                ext != ".wav" && continue
-                spk = basename(root)
-                path = joinpath(root, file)
-                id = basename(root) * "_" * name
-                recordings[id] = Recording(
-                    id,
-                    CmdAudioSource(`sph2pipe -f wav $path`);
-                    channels = [1],
-                    samplerate = 16000
-                )
-            end
+            slines = map(splitline, eachline(path))
+            starts, ends, words = zip(slines...)
+            start_ts = parse(Int, first(starts)) / 16000
+            end_ts = parse(Int, last(ends)) / 16000
+            dur = end_ts - start_ts

-            open(manifestpath, "w") do f
-                writemanifest(f, recordings)
-            end
+            supervisions[id] = Supervision(
+                id, id, start_ts, dur, 1,
+                Dict(
+                     "text" => join(words, " "),
+                     "dialect" => dialect,
+                     "speaker" => spk,
+                     "sex" => string(first(spk))
+                )
+            )
        end
    end
+    supervisions
 end

+
+timit_lexicon(t::TIMIT) = timit_lexicon(t.datapath)
+function timit_lexicon(dir)
+    dictfile = joinpath(dir, "doc", "timitdic.txt")
+    iscomment(line) = first(line) == ';'
+
+    lexicon = Pair{String, String}[]
+    for line in eachline(dictfile)
+        iscomment(line) && continue
+
+        wrd, pron = split(line, limit=2)
+        pron = strip(pron, ['/', '\t', ' '])
+        wrd = '~' in wrd ? split(wrd, "~", limit=2)[1] : wrd
+        push!(lexicon, wrd => uppercase(pron))
+    end
+    lexicon
+end
--- a/src/manifest_io.jl
+++ b/src/manifest_io.jl
 # SPDX-License-Identifier: CECILL-2.1

-#=====================================================================#
-# HTML pretty display
-
-function Base.show(io::IO, ::MIME"text/html", r::AbstractAudioSource)
-    print(io, "<audio controls ")
-    print(io, "src=\"data:audio/wav;base64,")
-
-    x, s = load(r)
-    iob64_encode = Base64EncodePipe(io)
-    wavwrite(x, iob64_encode, Fs = s, nbits = 8, compression = WAV.WAVE_FORMAT_PCM)
-    close(iob64_encode)
-
-    println(io, "\" />")
-end
-
 #=====================================================================#
 # JSON serialization of a manifest item

@@ -141,12 +126,15 @@ end

 # Some utilities
 manifestname(::Type{<:Recording}, name) = "recordings-$name.jsonl"
-manifestname(::Type{<:Supervision}, name) = "supervisions-" * name * ".jsonl"
-
-load(T::Type{<:Union{Recording,Supervision}}, path::AbstractString) =
-    open(f -> readmanifest(f, T), path, "r")
-load(corpus::SpeechCorpus, dir, T, subset) =
-    load(T, joinpath(path(corpus, dir), manifestname(T, subset)))
-load(corpus::SpeechCorpus, T, subset) =
-    load(corpus, corporadir, T, subset)
-
+manifestname(::Type{<:Supervision}, name) = "supervisions-$name.jsonl"
+
+"""
+    load(Supervision, path)
+    load(Recording, path)
+
+Load a `Recording` or `Supervision` manifest from `path`.
+"""
+load(T::Type{<:Union{Recording,Supervision}}, manifestpath::AbstractString) =
+    open(f -> readmanifest(f, T), manifestpath, "r")
+load(T::Type{<:Union{Recording, Supervision}}, manifestroot::AbstractString, subset) =
+    load(T, joinpath(manifestroot, manifestname(T, subset)))
--- a/src/manifest_item.jl
+++ b/src/manifest_item.jl
 # SPDX-License-Identifier: CECILL-2.1

-"""
-    abstract type AbstractAudioSource end
-
-Base class for all audio source. Possible audio sources are:
-* `CmdAudioSource`
-* `FileAudioSource`
-* `RawAudioSource`
-* `URLAudioSource`
-
-You can load the data of an audio source with the function
-
-    load(s::AbstractAudioSource, subrange = :)
-
-"""
-abstract type AbstractAudioSource end
-
-struct CmdAudioSource <: AbstractAudioSource
-    cmd
-end
-CmdAudioSource(c::String) = CmdAudioSource(Cmd(String.(split(c))))
-
-struct FileAudioSource <: AbstractAudioSource
-    path::AbstractString
-end
-
-struct RawAudioSource <: AbstractAudioSource
-    data::AbstractMatrix
-    srate::Integer
-end
-RawAudioSource(x::AbstractVector, srate) = RawAudioSource(x[:,:], srate)
-
-struct URLAudioSource <: AbstractAudioSource
-    url::AbstractString
-end
-
-load(s::CmdAudioSource, subrange = :) = wavread(IOBuffer(read(pipeline(s.cmd))); subrange)[1:2]
-load(s::FileAudioSource, subrange = :) = wavread(s.path; subrange)[1:2]
-load(s::RawAudioSource, subrange = :) = s.data[subrange,:], s.srate
-load(s::URLAudioSource, subrange = :) = wavread(IOBuffer(HTTP.get(s.url).body); subrange)[1:2]
-
 """
    abstract type ManifestItem end

@@ -79,7 +39,7 @@ end

 function Recording(uttid, s::AbstractAudioSource; channels = missing, samplerate = missing)
    if ismissing(channels) || ismissing(samplerate)
-        x, sr = load(s)
+        x, sr = loadaudio(s)
        samplerate = ismissing(samplerate) ? Int(sr) : samplerate
        channels = ismissing(channels) ? collect(1:size(x,2)) : channels
    end
@@ -98,7 +58,8 @@ end

 A "supervision" defines a segment of a recording on a single channel.
 The `data` field is an arbitrary dictionary holdin the nature of the
-supervision.
+supervision. `start` and `duration` (in seconds) define
+where the segment is located within the recording `recording_id`.

 # Constructor

@@ -142,7 +103,7 @@ function load(r::Recording; start = -1, duration = -1, channels = r.channels)
        subrange = (:)
    end

-    x, sr = load(r.source, subrange)
+    x, sr = loadaudio(r.source, subrange)
    x[:,channels], sr
 end