Newer
Older
# SPDX-License-Identifier: CECILL-2.1
# Location of the CMU Sphinx pronunciation dictionary fetched by `CMUDICT`.
const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40"
# Location of the French MFA pronunciation dictionary fetched by `MFAFRDICT`.
const FRMFA_DICT_URL = "https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/dictionary/french/mfa/french_mfa.dict"
"""
    normalizeword(word)

Return `word` converted to upper case, as a `String`.
"""
normalizeword(word) = uppercase(word) |> String
"""
    normalizephoneme(phoneme)

Return `phoneme` converted to upper case, as a `String`.
"""
normalizephoneme(phoneme) = uppercase(phoneme) |> String
"""
CMUDICT(path)
Return the dictionary of pronunciation loaded from the CMU sphinx dictionary.
The CMU dictionary will be downloaded and stored into `path`. Subsequent
calls will only read the file `path` without downloading the data again.
"""
function CMUDICT(path)
    # Fetch the dictionary only when it is not already cached at `path`.
    if !isfile(path)
        # Create the destination directory first, otherwise `mv` fails when
        # it does not exist yet.
        destdir = dirname(path)
        isempty(destdir) || mkpath(destdir)
        dir = mktempdir()
        run(`wget -P $dir $CMUDICT_URL`)
        # `wget -P` names the downloaded file after the last URL component.
        mv(joinpath(dir, basename(CMUDICT_URL)), path)
    end

    # word => list of pronunciations, each one a vector of phonemes.
    lexicon = Dict{String,Vector{Vector{String}}}()
    for line in eachline(path)
        fields = split(line)
        # Blank lines carry no entry; skip them instead of failing below.
        isempty(fields) && continue
        # Alternative pronunciations are written "WORD(n)": strip the marker
        # for any `n` so that every variant is gathered under "WORD".
        word = replace(first(fields), r"\(\d+\)$" => "")
        phonemes = Vector{String}(fields[2:end])
        push!(get!(() -> Vector{String}[], lexicon, word), phonemes)
    end
    return lexicon
end
"""
TIMITDICT(timitdir)
Return the dictionary of pronunciation as provided by TIMIT corpus (located
in `timitdir`).
"""
function TIMITDICT(timitdir)
    # The dictionary is read from "doc/timitdic.txt" under `timitdir`.
    dictfile = joinpath(timitdir, "doc", "timitdic.txt")

    # word => list of pronunciations, each one a vector of phonemes.
    lexicon = Dict{String,Vector{Vector{String}}}()
    for line in eachline(dictfile)
        # Skip blank lines and comments (lines starting with ';').
        # `startswith` is safe on an empty line, unlike `first(line)`.
        (isempty(strip(line)) || startswith(line, ";")) && continue

        # First token is the word, the remainder is the pronunciation.
        rawword, rawpron = split(line, limit=2)
        # Remove the surrounding '/' delimiters and whitespace.
        rawpron = strip(rawpron, ['/', '\t', ' '])
        # Keep only the part of the word preceding the first '~', if any.
        word = normalizeword(first(split(rawword, "~", limit=2)))
        phonemes = normalizephoneme.(split(rawpron))

        push!(get!(() -> Vector{String}[], lexicon, word), phonemes)
    end
    return lexicon
end
"""
MFAFRDICT(path)
Return the french dictionary of pronunciation as provided by MFA (french_mfa v2.0.0a) with OOVs words
generated by G2P model (french_mfa) on INA Diachronic Corpus.
"""
Simon Devauchelle
committed
function MFAFRDICT(path; oovs="")
if ! isfile(path)
mkpath(dirname(path))
dir = mktempdir()
run(`wget -P $dir $FRMFA_DICT_URL`)
mv(joinpath(dir, "french_mfa.dict"), path)
end
lexicon = Dict()
open(path, "r") do f
for line in eachline(f)
word, pron... = split(line)
prononciations = get(lexicon, word, [])
push!(prononciations, pron)
lexicon[word] = prononciations
end
end
Simon Devauchelle
committed
# Add out-of-vocabulary words
if isfile(oovs)
open(oovs, "r") do o
for line in eachline(o)
word, pron... = split(line)
prononciations = get(lexicon, word, [])
push!(prononciations, pron)
lexicon[word] = prononciations
end
end
end