# lexicons.jl

# SPDX-License-Identifier: CECILL-2.1


# Location of the CMU Sphinx dictionary file that `CMUDICT` fetches with `wget`.
const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40"


"""
    normalizeword(word)

Return `word` converted to upper case, as a `String`.
"""
normalizeword(word) = String(uppercase(word))

"""
    normalizephoneme(phoneme)

Return `phoneme` converted to upper case, as a `String`.
"""
function normalizephoneme(phoneme)
    return phoneme |> uppercase |> String
end


"""
    CMUDICT(path)

Return the pronunciation dictionary loaded from the CMU Sphinx dictionary.

If the file `path` does not exist, the CMU dictionary is first downloaded
(using the external `wget` program) and stored at `path`. Subsequent calls
only read the file `path` without downloading the data again.

The result is a `Dict{String,Vector{Vector{String}}}` mapping each word to the
list of its pronunciations, each pronunciation being a vector of phonemes.
Alternative pronunciations, marked in the file by a trailing `(N)` on the word
(e.g. `WORD(2)`), are gathered under the bare word. Blank lines are ignored.
"""
function CMUDICT(path)
    if !isfile(path)
        # Download into a scratch directory that is removed as soon as the
        # file has been moved to its final location (or on failure).
        mktempdir() do dir
            run(`wget -P $dir $CMUDICT_URL`)
            mv(joinpath(dir, basename(CMUDICT_URL)), path)
        end
    end

    lexicon = Dict{String,Vector{Vector{String}}}()
    for line in eachline(path)
        fields = split(line)
        isempty(fields) && continue  # blank line: nothing to record

        # Strip the alternative-pronunciation marker whatever its index, and
        # only at the end of the word where the marker is written.
        word = replace(first(fields), r"\(\d+\)$" => "")
        pron = String[p for p in @view fields[2:end]]

        push!(get!(() -> Vector{String}[], lexicon, word), pron)
    end
    return lexicon
end


"""
    TIMITDICT(timitdir)

Return the pronunciation dictionary provided with the TIMIT corpus located in
`timitdir`. The dictionary is read from the file `doc/timitdic.txt` inside
`timitdir`.

The result is a `Dict{String,Vector{Vector{String}}}` mapping each word
(passed through `normalizeword`) to the list of its pronunciations, each
pronunciation being a vector of phonemes (passed through `normalizephoneme`).
Lines starting with `;` are treated as comments and skipped, as are blank
lines. Anything following a `~` in the word field is dropped, and the `/`
delimiters surrounding the pronunciation are removed.
"""
function TIMITDICT(timitdir)
    dictfile = joinpath(timitdir, "doc", "timitdic.txt")

    lexicon = Dict{String,Vector{Vector{String}}}()
    for line in eachline(dictfile)
        # `startswith` is safe on an empty line, unlike `first(line)`.
        (startswith(line, ";") || all(isspace, line)) && continue

        entry, rawpron = split(line, limit=2)
        phonemes = split(strip(rawpron, ['/', '\t', ' ']))

        # Keep only the part of the word field that precedes the first `~`.
        word = normalizeword(first(split(entry, "~", limit=2)))
        pron = String[normalizephoneme(p) for p in phonemes]

        push!(get!(() -> Vector{String}[], lexicon, word), pron)
    end
    return lexicon
end