# SPDX-License-Identifier: CECILL-2.1

const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40"

"""
    normalizeword(word)

Return `word` as an upper-case `String`.
"""
function normalizeword(word)
    String(uppercase(word))
end

"""
    normalizephoneme(phoneme)

Return `phoneme` as an upper-case `String`.
"""
function normalizephoneme(phoneme)
    String(uppercase(phoneme))
end

"""
    CMUDICT(path)

Return the dictionary of pronunciations loaded from the CMU sphinx dictionary.

The CMU dictionary is downloaded (with `wget`) and stored in `path` if that file
does not exist yet. Subsequent calls only read the file `path` without
downloading the data again.

The result maps each normalized (upper-case) word to the list of its
pronunciations, each pronunciation being a vector of normalized phonemes.
Alternative pronunciations, marked in the file as `WORD(2)`, `WORD(3)`, ...,
are gathered under the unmarked word.
"""
function CMUDICT(path)
    if !isfile(path)
        # Download into a scratch directory first so that an interrupted
        # transfer never leaves a partial file at `path`.
        mktempdir() do dir
            run(`wget -P $dir $CMUDICT_URL`)
            mv(joinpath(dir, "cmudict_SPHINX_40"), path)
        end
    end

    lexicon = Dict{String,Vector{Vector{String}}}()
    open(path, "r") do f
        for line in eachline(f)
            fields = split(line)
            # Skip blank lines and the `;;;` comment lines found in some
            # distributions of the dictionary.
            (isempty(fields) || startswith(first(fields), ";;;")) && continue

            # Strip any variant marker "(n)", not only (1) to (4).
            word = normalizeword(replace(first(fields), r"\(\d+\)" => ""))
            pron = String[normalizephoneme(p) for p in fields[2:end]]
            push!(get!(() -> Vector{String}[], lexicon, word), pron)
        end
    end
    lexicon
end

"""
    TIMITDICT(timitdir)

Return the dictionary of pronunciations as provided by the TIMIT corpus
(located in `timitdir`).

The dictionary is read from `doc/timitdic.txt` inside `timitdir`. Lines starting
with `;` are comments. The part of a word following a `~` is dropped, so such
entries are gathered under the same normalized word.
"""
function TIMITDICT(timitdir)
    dictfile = joinpath(timitdir, "doc", "timitdic.txt")
    # `startswith` (rather than `first`) is safe on empty lines.
    iscomment(line) = startswith(line, ';')

    lexicon = Dict{String,Vector{Vector{String}}}()
    for line in eachline(dictfile)
        iscomment(line) && continue

        fields = split(line, limit=2)
        # Skip blank or malformed lines that carry no pronunciation field.
        length(fields) == 2 || continue
        word, pron = fields

        pron = strip(pron, ['/', '\t', ' '])
        word = normalizeword(first(split(word, '~', limit=2)))
        phonemes = String[normalizephoneme(p) for p in split(pron)]
        push!(get!(() -> Vector{String}[], lexicon, word), phonemes)
    end
    lexicon
end