Skip to content
Snippets Groups Projects
Commit 9feaabf5 authored by simon devauchelle's avatar simon devauchelle
Browse files

Merge branch 'add_timit_alignments' into ina_diachronic_corpus

merge
parents 0d04b5f9 c31c1938
No related branches found
No related tags found
No related merge requests found
......@@ -31,6 +31,7 @@ export
# Lexicon
CMUDICT,
TIMITDICT,
MFAFRDICT,
# Dataset
dataset
......
......@@ -2,7 +2,7 @@
# Remote location of the CMU pronouncing dictionary (Sphinx-format file).
const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40"
# Remote location of the French MFA pronunciation dictionary (`french_mfa.dict`);
# fetched by `MFAFRDICT` when no local copy exists at the requested path.
const FRMFA_DICT_URL = "https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/dictionary/french/mfa/french_mfa.dict"
function normalizeword(word)
String(uppercase(word))
......@@ -71,3 +71,42 @@ function TIMITDICT(timitdir)
end
lexicon
end
"""
    MFAFRDICT(path; oovs="")

Return the French pronunciation dictionary provided by MFA (french_mfa v2.0.0a),
optionally extended with out-of-vocabulary (OOV) words generated by the G2P model
(french_mfa) on the INA Diachronic Corpus.

If `path` is not an existing file, the dictionary is downloaded from
`FRMFA_DICT_URL` with `wget` (which must be available on the system) and stored
at `path`; missing parent directories are created.

# Arguments
- `path`: location of the dictionary file. Each non-blank line is expected to be
  a word followed by the whitespace-separated tokens of one pronunciation.

# Keywords
- `oovs`: path to an additional file in the same format whose entries are added
  after those of `path`. Ignored when it is not an existing file (the default
  `""` therefore disables it).

# Returns
A `Dict` mapping each word to the list of its pronunciations, in file order.
Each pronunciation is the vector of tokens that followed the word on its line.
"""
function MFAFRDICT(path; oovs="")
    if !isfile(path)
        mkpath(dirname(path))
        # Download into a scratch directory that is removed as soon as the
        # dictionary has been moved to its final location.
        mktempdir() do dir
            run(`wget -P $dir $FRMFA_DICT_URL`)
            mv(joinpath(dir, "french_mfa.dict"), path)
        end
    end

    # Untyped containers are kept on purpose so that the returned type stays
    # the same for existing callers.
    lexicon = Dict()
    _mfa_addentries!(lexicon, path)

    # Add out-of-vocabulary words
    if isfile(oovs)
        _mfa_addentries!(lexicon, oovs)
    end
    return lexicon
end

# Parse the "word token token ..." lines of the file at `path` and append each
# pronunciation to `lexicon[word]`, creating the entry when the word is new.
function _mfa_addentries!(lexicon, path)
    open(path, "r") do f
        for line in eachline(f)
            fields = split(line)
            # Blank or whitespace-only lines carry no entry; destructuring an
            # empty `split` result would throw, so skip them.
            isempty(fields) && continue
            word, pron = fields[1], fields[2:end]
            push!(get!(lexicon, word, []), pron)
        end
    end
    return lexicon
end
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment