Skip to content
Snippets Groups Projects
Commit a82df3ee authored by Simon Devauchelle's avatar Simon Devauchelle
Browse files

MFA French dictionary added to lexicons

parent 991ec3b9
No related branches found
No related tags found
No related merge requests found
...@@ -30,6 +30,7 @@ export ...@@ -30,6 +30,7 @@ export
# Lexicon # Lexicon
CMUDICT, CMUDICT,
TIMITDICT, TIMITDICT,
MFAFRDICT,
# Dataset # Dataset
dataset dataset
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40" const CMUDICT_URL = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40"
# URL of the French MFA pronunciation dictionary (`french_mfa.dict`), served as a raw file
# from the MontrealCorpusTools/mfa-models GitHub repository.
const FRMFA_DICT_URL = "https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/dictionary/french/mfa/french_mfa.dict"
function normalizeword(word) function normalizeword(word)
String(uppercase(word)) String(uppercase(word))
...@@ -71,3 +71,30 @@ function TIMITDICT(timitdir) ...@@ -71,3 +71,30 @@ function TIMITDICT(timitdir)
end end
lexicon lexicon
end end
"""
MFAFRDICT(path)
Return the french dictionary of pronunciation as provided by MFA (french_mfa v2.0.0a) with OOVs words
generated by G2P model (french_mfa) on INA Diachronic Corpus.
"""
function MFAFRDICT(path)
if ! isfile(path)
mkpath(dirname(path))
dir = mktempdir()
run(`wget -P $dir $FRMFA_DICT_URL`)
mv(joinpath(dir, "french_mfa.dict"), path)
end
lexicon = Dict()
open(path, "r") do f
for line in eachline(f)
word, pron... = split(line)
prononciations = get(lexicon, word, [])
push!(prononciations, pron)
lexicon[word] = prononciations
end
end
lexicon
end
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment