Commit 2b02063f authored by Karel Vesely

ngram-lm

- update loading function
- add scoring functions `ngram_get_score()` and `ngram_score_sentence()`
parent 55c03a7c
1 merge request: !60 [WIP] Resolve "Read IARPA format LM"
@@ -55,8 +55,9 @@ include("filter.jl")
include("ops.jl")
# arpa format
-export read_arpafile,
-       read_arpafile_as_dicts
+export ngram_read_arpafile,
+       ngram_get_score,
+       ngram_score_sentence
include("ngram_arpa.jl")
# standalone libraries
using GZip
mutable struct ArpaParsingStatus
order_header::Int # LM order according to the arpafile header
order::Int # current ngram order while reading the arpafile
n_ngram_header # per-order ngram counts declared in the header
n_ngram_current # per-order ngram counts read so far
function ArpaParsingStatus()
order_header = -1 # order of arpafile ngram
order = -1 # current order of read ngram
n_ngram_header = []
n_ngram_current = []
new(order_header, order, n_ngram_header, n_ngram_current)
end
end
"""
::Dict = read_arpafile(arpafile::String; output_order=-1::Int)
Read an SRILM ARPA LM in plain-text or gzipped (.gz) format.
It produces a dictionary of tuples like:
(0,0,1gram) => (logprob, backoff)
(0,1gram,2gram) => (logprob, backoff)
(1gram,2gram,3gram) => (logprob, backoff)
(1gram,2gram,3gram) => (logprob, backoff)
output_order - if set, build the output dictionary with keys padded to this order.
default=-1 - use the order from the ARPA LM header
"""
function read_arpafile(arpafile::String; output_order=-1)
if occursin(r".*\.gz$", arpafile)
f = GZip.open(arpafile, "r")
else
f = open(arpafile)
end
status = ArpaParsingStatus()
dct_arpa = Dict() # output ngram dictionary
for l in eachline(f)
splitline = split(l)
nsplit = length(splitline) # n of columns in line
if (nsplit == 0)
# Skip empty lines
# -----------------------
# reading header
# -----------------------
elseif nsplit == 1 && l == "\\data\\"
# Consume "\data\" line
status.order = 0
elseif (status.order == 0) && (splitline[1] == "ngram")
# Consume "ngram 1=200003" lines
s = match(r"ngram (\d+)=(\d+)", l) # ngram 1=6
order = parse(Int64, s[1]); n = parse(Int64, s[2])
push!(status.n_ngram_header, n)
@assert (status.n_ngram_header[order] == n) "ERROR: header is probably corrupted or wrongly ordered"
status.order_header = order
# -----------------------
# reading ngrams
# -----------------------
elseif nsplit == 1 && occursin(r"\\.*-grams:$",l)
# initial ngram line: \{N}-grams
# --- Setting up the "output_order" variable if it was not set,
# it is done right after reading the header
if (output_order < 0)
max_non_zero_order = 0
for (i, n) in enumerate(status.n_ngram_header)
if n > 0
max_non_zero_order = i
end
end
output_order = max_non_zero_order
end
# ---
s = match(r"\\(\d)+-grams.*", l)
order = parse(Int64, s[1])
status.order = order
push!(status.n_ngram_current, 0)
@assert (status.n_ngram_current[order] == 0) "ERROR: header is probably corrupted. Started reading a different ngram order than expected"
elseif nsplit > 1 && status.order > 0 && status.order <= output_order # ngrams
status.n_ngram_current[status.order] += 1
prob = parse(Float64, splitline[1])
# Detect/set the back-off weight (`nothing` if the ARPA line has no back-off column)
backoff = nsplit >= (status.order + 2) ? parse(Float64, splitline[status.order + 2]) : nothing
ngram = splitline[2:(status.order + 1)]
ngram = ( zeros(output_order - length(ngram))..., ngram...,)
dct_arpa[ngram] = (prob, backoff)
elseif nsplit > 1 && status.order > output_order # ngrams
println("WARNING: order of ngram: $(ngram) is higher than given $(output_order). Skipp")
end
end
close(f)
return dct_arpa
end
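# A minimal usage sketch for `read_arpafile` (hedged: the file name and words are
# hypothetical, the exact keys and values depend on the LM):
#
# dct_arpa = read_arpafile("lm.3gram.arpa.gz"; output_order=3)
#
# # lower-order entries are left-padded with zeros up to `output_order`:
# dct_arpa[(0.0, 0.0, "HELLO")]          # => (logprob, backoff) of the unigram
# dct_arpa[("HELLO", "WORLD", "AGAIN")]  # => (logprob, nothing) for a trigram without back-off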
"""
::Dict = read_ngram_dict(fd::IO, ngram_order::Int64)
Read one table of ngrams of the same order from an ARPA file.
"""
function read_ngram_dict(fd::T, ngram_order::Int64) where {T <: IO}
ngram_dict = Dict()
while true
# file ended ?
if eof(fd); break; end
# keyword follows ? (the `\2-grams:` line is not consumed)
if Char(peek(fd)) == '\\'; break; end
# readline
line = strip(readline(fd))
arpa_columns = split(line)
nsplit = length(arpa_columns)
# skip empty lines
if (nsplit == 0); continue; end
# parse the line
# TODO: convert 10-base to E-base ?
prob = parse(Float64, arpa_columns[1])
ngram = arpa_columns[2:(1 + ngram_order)]
backoff = nothing
if (nsplit == ngram_order + 2)
backoff = parse(Float64, arpa_columns[ngram_order + 2])
end
ngram_dict[(ngram...,)] = (prob, backoff)
end
return ngram_dict
end
"""
::Vector{Dict} = ngram_read_arpafile(arpafile::String)
Read an SRILM ARPA LM in plain-text or gzipped (.gz) format.
-It produces list of dictionaries of tuples like:
-[Dict{(w1) => (logprob, backoff)},
+It produces Vector of Dict(Tuple, Tuple) holding n-gram tables:
+[Dict{(w1,) => (logprob, backoff)},
Dict{(w1,w2) => (logprob, backoff)},
Dict{(w1,w2,w3) => (logprob, backoff)},
...
]
If the back-off value is not present in the ARPA file, it is stored as `nothing`.
"""
-function read_arpafile_as_dicts(arpafile::String)
+function ngram_read_arpafile(arpafile::String)
if occursin(r".*\.gz$", arpafile)
fd = GZip.open(arpafile, "r")
@@ -228,7 +82,7 @@ function read_arpafile_as_dicts(arpafile::String)
ngram_order = parse(Int64, s[1])
# read the ngrams
-ngram_dict = read_ngram_dict(fd, ngram_order)
+ngram_dict = _read_ngram_dict(fd, ngram_order)
if length(ngram_dict) != ngram_count_in_header[ngram_order]
println("""WARNING: number of ngrams does not match !
@@ -252,3 +106,139 @@ function read_arpafile_as_dicts(arpafile::String)
return ngram_dicts
end
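# A hedged sketch of the structure returned by `ngram_read_arpafile`
# (the file name and words are hypothetical, the exact values depend on the LM):
#
# ngram_dicts = ngram_read_arpafile("lm.3gram.arpa.gz")
# length(ngram_dicts)                      # => LM order, here 3
# ngram_dicts[1][("HELLO",)]               # => (logprob, backoff) of a unigram
# ngram_dicts[3][("A", "GENTLE", "KICK")]  # => (logprob, nothing), highest order has no back-off
#
# The log-probabilities are already converted from log10 (ARPA) to natural log
# by `_read_ngram_dict` below.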
"""
::Dict = _read_ngram_dict(fd::IO, ngram_order::Int64)
Read one table of ngrams of the same order from an ARPA file (log-probs converted to natural log).
"""
function _read_ngram_dict(fd::T, ngram_order::Int64) where {T <: IO}
ngram_dict = Dict()
log_base_10_to_e = log(10.) # = 1/log10(e); multiplying converts log10 values to natural log
while true
# file ended ?
if eof(fd); break; end
# keyword follows ? (the `\2-grams:` line is not consumed)
if Char(peek(fd)) == '\\'; break; end
# readline
line = strip(readline(fd))
arpa_columns = split(line)
nsplit = length(arpa_columns)
# skip empty lines
if (nsplit == 0); continue; end
# parse the line
prob = parse(Float64, arpa_columns[1])
ngram = arpa_columns[2:(1 + ngram_order)]
backoff = nothing
if (nsplit == ngram_order + 2)
backoff = parse(Float64, arpa_columns[ngram_order + 2])
end
# convert log-base: 10 -> e
prob *= log_base_10_to_e
if backoff != nothing
backoff *= log_base_10_to_e
end
ngram_dict[(ngram...,)] = (prob, backoff)
end
return ngram_dict
end
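# Worked example of the log-base conversion above: ARPA files store log10 probabilities,
# so a line "-0.30103 HELLO" (i.e. P = 0.5) is stored as
#   -0.30103 * log(10.) ≈ -0.6931 == log(0.5)
# in the natural-log domain used by the scoring functions below.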
"""
Get the LM score of a single ngram represented as a tuple:
`(w1, w2, w3)`
Out-of-vocabulary words are replaced by `unk`; missing ngrams are scored via back-off recursion.
"""
function ngram_get_score(ngram_dicts::Vector, ngram::Tuple; unk::String="<UNK>")
@assert length(ngram) <= length(ngram_dicts)
# replace OOVs with "<UNK>"
ngram_ = []
for w in ngram
haskey(ngram_dicts[1], (w,)) ? push!(ngram_, w) : push!(ngram_, unk)
end
ngram = (ngram_...,)
# get the score
if haskey(ngram_dicts[length(ngram)], ngram)
# n-gram recursion
return ngram_dicts[length(ngram)][ngram][1]
else
if length(ngram) == 1
@assert false "Key error for unigram `$(ngram)`. This sholud not happen..."
end
shorter_ngram = ngram[2:end]
prob = ngram_get_score(ngram_dicts, shorter_ngram; unk=unk)
backoff = let
history = ngram[begin:(end-1)]
len_minus_1 = length(ngram) - 1
if haskey(ngram_dicts[len_minus_1], history)
backoff = ngram_dicts[len_minus_1][history][2]
else
backoff = nothing
end
# 0.0 if ngram missing or no backoff value in arpa
(backoff == nothing ? 0.0 : backoff)
end
return backoff + prob # log scores
end
end
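# A hedged sketch of the back-off recursion performed by `ngram_get_score`,
# written in log space (the words are illustrative):
#
# score(("A", "GENTLE", "KICK"))
#   = logprob[("A","GENTLE","KICK")]                          ... if the trigram is in the LM
#   = backoff[("A","GENTLE")] + score(("GENTLE","KICK"))      ... otherwise
#
# where a missing history or a missing back-off weight contributes 0.0.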
"""
Score a sentence using the ngram model.
The `sentence` is split into words and wrapped with the `<s>` / `</s>` boundary symbols.
Per-ngram scores are printed and the total log score is returned.
"""
function ngram_score_sentence(ngram_dicts::Vector, sentence::String; unk::String="<UNK>")
ngram_order = length(ngram_dicts)
words = split(sentence)
pushfirst!(words, "<s>")
push!(words, "</s>")
scores = []
ngrams = []
for w_j in 2:length(words)
w_i = w_j - ngram_order + 1
if w_i < 1; w_i = 1; end
ngram = words[w_i:w_j]
ngram = (ngram...,) # tuple
push!(ngrams, ngram)
push!(scores, ngram_get_score(ngram_dicts, ngram; unk=unk))
end
println("LM scores $(collect(zip(ngrams, scores)))")
return sum(scores)
end
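# A hedged walk-through of the sliding window above, assuming a 3-gram model
# and the (hypothetical) sentence "A GENTLE KICK": after adding the boundary
# symbols, words = ["<s>", "A", "GENTLE", "KICK", "</s>"] and the scored ngrams are
# ("<s>","A"), ("<s>","A","GENTLE"), ("A","GENTLE","KICK"), ("GENTLE","KICK","</s>").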
# Example:
#
# ngram_dicts = ngram_read_arpafile("/mnt/matylda5/iveselyk/FAST-ASR_julia/lm_arpa_librispeech/3-gram.pruned.3e-7.arpa.gz")
#
# ngram_get_score(ngram_dicts, ("A", "GENTLE", "KICK"))
#
# ngram_score_sentence(ngram_dicts, "A GENTLE KICK FROM THE TALL BOY IN THE BENCH BEHIND URGED STEPHEN TO ASK A DIFFICULT QUESTION")