Skip to content
Snippets Groups Projects
Commit ad8ab070 authored by Lucas Ondel Yang's avatar Lucas Ondel Yang
Browse files

Resolve "prepare lexicon"

parent e0bbe27d
No related branches found
No related tags found
No related merge requests found
name = "SpeechDatasets"
uuid = "ae813453-fab8-46d9-ab8f-a64c05464021"
authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>"]
version = "0.8.0"
version = "0.9.0"
[deps]
Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
WAV = "8149f6b0-98f6-5db9-b78f-408fbbb8ef88"
[compat]
JSON = "0.21"
WAV = "1.2"
julia = "1.8"
......@@ -13,28 +13,27 @@ pkg> add SpeechCorpora
## Example
```
julia> using SpeechCorpora
julia> using SpeechDatasets
julia> corpus = MultilingualLibriSpeech("fra") |> download |> prepare
julia> dataset = MINILIBRISPEECH("outputdir", :train) # :dev | :test
...
# Load the recording manifest.
julia> recs = load(corpus, Recording, "dev") # use "train", "dev" or "test"
julia> dataset = TIMIT("/path/to/timit/dir", "outputdir", :train) # :dev | :test
...
# Load the supervision manifest.
julia> sups = load(corpus, Supervision, "dev") # use "train", "dev" or "test"
julia> for ((signal, fs), supervision) in dataset
# do something
end
# Load the signal of the first supervision segment
julia> s = first(values(sups))
julia> x, samplerate = load(recs[s.recording_id], s)
# Lexicons
julia> CMUDICT("outputfile")
...
# Play the recording of the first supervision segment
julia> play(recs[s.recording_id], s)
julia> TIMITDICT("/path/to/timit/dir")
...
```
## Author
* Lucas ONDEL YANG (LISN, CNRS)
## License
This software is provided under the CeCILL 2.1 license (see the [`/LICENSE`](/LICENSE)
......
......@@ -2,13 +2,8 @@
module SpeechDatasets
using Base64
using HTTP
using JSON
using WAV
using SpeechFeatures
import MLUtils
export
......@@ -29,9 +24,13 @@ export
# Corpora
MultilingualLibriSpeech,
MiniLibriSpeech,
MINILIBRISPEECH,
TIMIT,
# Lexicon
CMUDICT,
TIMITDICT,
# Dataset
dataset
......@@ -45,4 +44,6 @@ include("corpora/multilingual_librispeech.jl")
include("corpora/mini_librispeech.jl")
include("corpora/timit.jl")
include("lexicons.jl")
end
......@@ -11,15 +11,9 @@ const MINILS_SUBSETS = Dict(
"dev" => "dev-clean-2"
)
const MINILS_LANG = "eng"
const MINILS_NAME = "mini_librispeech"
#######################################################################
struct MiniLibriSpeech <: SpeechCorpus
lang
name
struct MINILIBRISPEECH <: SpeechCorpus
recordings
train
dev
......@@ -58,8 +52,12 @@ function minils_supervisions(dir, subset)
open(joinpath(d2, "$(k1)-$(k2).trans.txt"), "r") do f
for line in eachline(f)
tokens = split(line)
s = Supervision(tokens[1], tokens[1]; channel = 1,
data = Dict("text" => join(tokens[2:end], " ")))
s = Supervision(
tokens[1], # supervision id
tokens[1]; # recording id
channels = [1],
data = Dict("text" => join(tokens[2:end], " "))
)
sups[s.id] = s
end
end
......@@ -89,7 +87,7 @@ end
function minils_prepare(dir)
# 1. Recording manifest.
out = joinpath(dir, "recording-manifest.jsonl")
out = joinpath(dir, "recordings.jsonl")
if ! isfile(out)
open(out, "w") do f
for subset in ["train", "dev"]
......@@ -101,8 +99,8 @@ function minils_prepare(dir)
end
# 2. Supervision manifests.
for subset in ["train", "dev"]
out = joinpath(dir, "supervision-manifest-$subset.jsonl")
for (subset, name) in [("train", "train"), ("dev", "dev"), ("dev", "test")]
out = joinpath(dir, "supervisions-$name.jsonl")
if ! isfile(out)
@debug "preparing supervision manifest ($subset) $out"
sups = minils_supervisions(dir, subset)
......@@ -113,20 +111,10 @@ function minils_prepare(dir)
end
end
function MiniLibriSpeech(outdir)
dir = joinpath(outdir, MINILS_LANG, MINILS_NAME)
function MINILIBRISPEECH(dir, subset)
minils_download(dir)
minils_prepare(dir)
MiniLibriSpeech(
MINILS_LANG,
MINILS_NAME,
load(Recording, joinpath(dir, "recording-manifest.jsonl")),
load(Supervision, joinpath(dir, "supervision-manifest-train.jsonl")),
load(Supervision, joinpath(dir, "supervision-manifest-dev.jsonl")),
load(Supervision, joinpath(dir, "supervision-manifest-dev.jsonl")),
)
dataset(dir, subset)
end
MiniLibriSpeech() = MiniLibriSpeech(SPEECH_CORPORA_ROOTDIR)
......@@ -2,17 +2,16 @@
#######################################################################
const TIMIT_SUBSETS = Dict(
"train" => "train",
"dev" => "dev",
"test" => "test"
)
const TIMIT_LANG = "eng"
const TIMIT_NAME = "timit"
const TIMIT_DEV_SPK_LIST = Set([
"faks0",
"faks0",
"fdac1",
"fjem0",
"mgwt0",
......@@ -64,6 +63,7 @@ const TIMIT_DEV_SPK_LIST = Set([
"mrws1"
])
const TIMIT_TEST_SPK_LIST = Set([
"mdab0",
"mwbt0",
......@@ -91,6 +91,7 @@ const TIMIT_TEST_SPK_LIST = Set([
"fmld0"
])
TIMIT_PHONE_MAP48 = Dict(
"aa" => "aa",
"ae" => "ae",
......@@ -155,6 +156,7 @@ TIMIT_PHONE_MAP48 = Dict(
"zh" => "zh"
)
TIMIT_PHONE_MAP39 = Dict(
"aa" => "aa",
"ae" => "ae",
......@@ -221,25 +223,19 @@ TIMIT_PHONE_MAP39 = Dict(
#######################################################################
struct TIMIT<: SpeechCorpus
datapath::AbstractString
end
lang(::TIMIT) = "eng"
name(::TIMIT) = "timit"
function prepare(timit::TIMIT, dir; audio_fmt="SPHERE")
function timit_prepare(timitdir, dir; audio_fmt="SPHERE")
# Validate the data directory
! isdir(timit.datapath) && throw(ArgumentError("invalid path $(timit.datapath)"))
! isdir(timitdir) && throw(ArgumentError("invalid path $(timitdir)"))
# Create the output directory.
dir = mkpath(dir)
for (subset, subdir) in [("train", "train"), ("dev", "train"), ("test", "test")]
sdir = joinpath(timit.datapath, subdir)
sdir = joinpath(timitdir, subdir)
# Recordings
manifestpath = joinpath(dir, manifestname(Recording, subset))
manifestpath = joinpath(dir, "recordings.jsonl")
@debug "preparing $manifestpath"
recordings = timit_recordings(sdir; fmt=audio_fmt)
open(manifestpath, "w") do f
......@@ -247,7 +243,7 @@ function prepare(timit::TIMIT, dir; audio_fmt="SPHERE")
end
# Supervision
manifestpath = joinpath(dir, manifestname(Supervision, subset))
manifestpath = joinpath(dir, "supervisions-$(subset).jsonl")
@debug "Preparing $manifestpath"
supervisions = timit_supervisions(sdir)
open(manifestpath, "w") do f
......@@ -256,6 +252,7 @@ function prepare(timit::TIMIT, dir; audio_fmt="SPHERE")
end
end
function timit_recordings(dir::AbstractString; fmt="SPHERE")
! isdir(dir) && throw(ArgumentError("expected directory $dir"))
......@@ -268,7 +265,7 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE")
path = joinpath(root, file)
id = "timit_$(spk)_$(name)"
audio_src = if fmt == "SPHERE"
audio_src = if fmt == "SPHERE"
CmdAudioSource(`sph2pipe -f wav $path`)
else
FileAudioSource(path)
......@@ -285,6 +282,7 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE")
recordings
end
function timit_supervisions(dir)
! isdir(dir) && throw(ArgumentError("expected directory $dir"))
splitline(line) = rsplit(line, limit=3)
......@@ -320,19 +318,13 @@ function timit_supervisions(dir)
end
timit_lexicon(t::TIMIT) = timit_lexicon(t.datapath)
function timit_lexicon(dir)
dictfile = joinpath(dir, "doc", "timitdic.txt")
iscomment(line) = first(line) == ';'
lexicon = Pair{String, String}[]
for line in eachline(dictfile)
iscomment(line) && continue
wrd, pron = split(line, limit=2)
pron = strip(pron, ['/', '\t', ' '])
wrd = '~' in wrd ? split(wrd, "~", limit=2)[1] : wrd
push!(lexicon, wrd => uppercase(pron))
function TIMIT(timitdir, dir, subset)
if ! (isfile(joinpath(dir, "recordings.jsonl")) &&
isfile(joinpath(dir, "supervisions-train.jsonl")) &&
isfile(joinpath(dir, "supervisions-dev.jsonl")) &&
isfile(joinpath(dir, "supervisions-test.jsonl")))
timit_prepare(timitdir, dir)
end
lexicon
dataset(dir, subset)
end
# SPDX-License-Identifier: CECILL-2.1
"""
FastDataset(supervisions, recordings, partition)
......@@ -7,7 +8,7 @@ Constructor for dataset represented as JSONL files (a.k.a. manifests).
"""
struct SpeechDataset <: MLUtils.AbstractDataContainer
idxs::Vector{AbstractString}
superivions::Dict{AbstractString, Supervision}
supervisions::Dict{AbstractString, Supervision}
recordings::Dict{AbstractString, Recording}
partition::Symbol
end
......@@ -38,14 +39,17 @@ julia> ds[1]
```
"""
function dataset(manifestroot::AbstractString, subset)
supervisions = load(Supervision, manifestroot, subset)
recordings = load(Recording, manifestroot, subset)
sup_path = joinpath(manifestroot, "supervisions-$(subset).jsonl")
rec_path = joinpath(manifestroot, "recordings.jsonl")
supervisions = load(Supervision, sup_path)
recordings = load(Recording, rec_path)
idxs = collect(keys(supervisions))
SpeechDataset(idxs, supervisions, recordings, Symbol(subset))
end
function Base.getindex(d::SpeechDataset, key::AbstractString)
sup = d.superivions[key]
sup = d.supervisions[key]
rec = d.recordings[sup.recording_id]
samples, sr = load(rec, sup)
(samples=samples, sampling_rate=sr), sup.data
......@@ -55,3 +59,4 @@ Base.getindex(d::SpeechDataset, idx::Integer) = getindex(d, d.idxs[idx])
Base.getindex(d::SpeechDataset, idxs::AbstractVector) = map(Base.Fix1(getindex, d), idxs)
Base.length(d::SpeechDataset) = length(d.idxs)
# SPDX-License-Identifier: CECILL-2.1
const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40"
"""
    normalizeword(word)

Return `word` converted to upper case, as a `String`.
"""
normalizeword(word) = word |> uppercase |> String
"""
    normalizephoneme(phoneme)

Return `phoneme` converted to upper case, as a `String`.
"""
normalizephoneme(phoneme) = String(uppercase(phoneme))
"""
    CMUDICT(path)

Return the pronunciation dictionary loaded from the CMU Sphinx dictionary.

If the file `path` does not exist, the CMU dictionary is downloaded with
`wget` and stored in `path`. Subsequent calls only read the file `path`
without downloading the data again.

The returned `Dict{String,Vector{Vector{String}}}` maps each word to the list
of its pronunciations, each pronunciation being a vector of phonemes.
Alternative-pronunciation entries such as `WORD(2)` are merged under `WORD`.
Blank lines are ignored.
"""
function CMUDICT(path)
    if ! isfile(path)
        dir = mktempdir()
        run(`wget -P $dir $CMUDICT_URL`)
        mv(joinpath(dir, "cmudict_SPHINX_40"), path)
    end

    lexicon = Dict{String,Vector{Vector{String}}}()
    open(path, "r") do f
        for line in eachline(f)
            tokens = split(line)
            # A blank line has no word to destructure: skip it.
            isempty(tokens) && continue

            # Strip any trailing variant marker "(N)", not only (1)..(4).
            word = replace(first(tokens), r"\(\d+\)$" => "")
            pron = String[t for t in Iterators.drop(tokens, 1)]
            push!(get!(() -> Vector{String}[], lexicon, word), pron)
        end
    end
    return lexicon
end
"""
    TIMITDICT(timitdir)

Return the pronunciation dictionary provided with the TIMIT corpus located in
`timitdir`. The dictionary is read from `doc/timitdic.txt` under `timitdir`.

The returned `Dict{String,Vector{Vector{String}}}` maps each normalized word
to the list of its pronunciations, each pronunciation being a vector of
normalized phonemes. Lines starting with `;` and blank lines are ignored. For
entries of the form `word~tag`, only the part before `~` is kept as the word.
"""
function TIMITDICT(timitdir)
    dictfile = joinpath(timitdir, "doc", "timitdic.txt")
    # `startswith` is safe on an empty line, unlike `first(line) == ';'`.
    iscomment(line) = startswith(line, ";")

    lexicon = Dict{String,Vector{Vector{String}}}()
    for line in eachline(dictfile)
        # Blank lines carry no entry and would break the destructuring below.
        (isempty(strip(line)) || iscomment(line)) && continue

        word, pron = split(line, limit=2)
        # Pronunciations are delimited by slashes: "/ax b aw1 t/".
        pron = strip(pron, ['/', '\t', ' '])
        # Keep only the part before '~' (whole word if there is no '~').
        word = normalizeword(first(split(word, "~", limit=2)))
        phonemes = normalizephoneme.(split(pron))
        push!(get!(() -> Vector{String}[], lexicon, word), phonemes)
    end
    return lexicon
end
......@@ -125,7 +125,7 @@ function readmanifest(io::IO, T)
end
# Some utilities
manifestname(::Type{<:Recording}, name) = "recordings-$name.jsonl"
manifestname(::Type{<:Recording}, name) = "recordings.jsonl"
manifestname(::Type{<:Supervision}, name) = "supervisions-$name.jsonl"
"""
......@@ -134,7 +134,5 @@ manifestname(::Type{<:Supervision}, name) = "supervisions-$name.jsonl"
Load Recording/Supervision manifest from `path`.
"""
load(T::Type{<:Union{Recording,Supervision}}, manifestpath::AbstractString) =
open(f -> readmanifest(f, T), manifestpath, "r")
load(T::Type{<:Union{Recording, Supervision}}, manifestroot::AbstractString, subset) =
load(T, joinpath(manifestroot, manifestname(T, subset)))
load(T::Type{<:Union{Recording,Supervision}}, path) = open(f -> readmanifest(f, T), path, "r")
......@@ -107,6 +107,5 @@ function load(r::Recording; start = -1, duration = -1, channels = r.channels)
x[:,channels], sr
end
load(r::Recording, s::Supervision) =
load(r; start = s.start, duration = s.duration, channels = s.channels)
load(r::Recording, s::Supervision) = load(r; start = s.start, duration = s.duration, channels = s.channels)
# SPDX-License-Identifier: CECILL-2.1
"""
abstract type SpeechCorpus
abstract type SpeechCorpus end
Abstract type for all speech corpora.
"""
abstract type SpeechCorpus end
"""
lang(corpus)
......@@ -14,6 +16,7 @@ Return the ISO 639-3 code of the language of the corpus.
"""
lang
"""
name(corpus)
......@@ -21,6 +24,7 @@ Return the name identifier of the corpus.
"""
name
"""
download(corpus, rootdir)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment