Commit 2b02063f authored by Karel Vesely

ngram-lm

- update loading function
- add scoring functions `ngram_get_score()` and `ngram_score_sentence()`
parent 55c03a7c
1 merge request: !60 [WIP] Resolve "Read IARPA format LM"
@@ -55,8 +55,9 @@ include("filter.jl")
include("ops.jl")
# arpa format
-export read_arpafile,
-       read_arpafile_as_dicts
+export ngram_read_arpafile,
+       ngram_get_score,
+       ngram_score_sentence
include("ngram_arpa.jl")
# standalone libraries
using GZip
mutable struct ArpaParsingStatus
order_header::Int # LM order according to the arpafile header
order::Int # current ngram order while reading the arpafile
n_ngram_header # per-order ngram counts declared in the header
n_ngram_current # per-order ngram counts read so far
function ArpaParsingStatus()
order_header = -1 # order of arpafile ngram
order = -1 # current order of read ngram
n_ngram_header = []
n_ngram_current = []
new(order_header, order, n_ngram_header, n_ngram_current)
end
end
"""
::Dict = read_arpafile(arpafile::String; output_order=-1::Int)
Read an SRILM ARPA LM in plain-text or gzipped (.gz) format.
It produces a dictionary of tuples like:
(0,0,1gram) => (logprob, backoff)
(0,1gram,2gram) => (logprob, backoff)
(1gram,2gram,3gram) => (logprob, backoff)
(1gram,2gram,3gram) => (logprob, backoff)
output_order - if set, build the output dictionary with keys padded to this order.
default=-1 - use the order from the ARPA LM header
"""
function read_arpafile(arpafile::String; output_order=-1)
if occursin(r".*\.gz$", arpafile)
f = GZip.open(arpafile, "r")
else
f = open(arpafile)
end
status = ArpaParsingStatus()
dct_arpa = Dict() # output ngram dictionary
for l in eachline(f)
splitline = split(l)
nsplit = length(splitline) # n of columns in line
if (nsplit == 0)
# Skip empty lines
# -----------------------
# reading header
# -----------------------
elseif nsplit == 1 && l == "\\data\\"
# Consume "\data\" line
status.order = 0
elseif (status.order == 0) && (splitline[1] == "ngram")
# Consume "ngram 1=200003" lines
s = match(r"ngram (\d+)=(\d+)", l) # ngram 1=6
order = parse(Int64, s[1]); n = parse(Int64, s[2])
push!(status.n_ngram_header, n)
@assert (status.n_ngram_header[order] == n) "ERROR: header is probably corrupted or wrongly ordered"
status.order_header = order
# -----------------------
# reading ngrams
# -----------------------
elseif nsplit == 1 && occursin(r"\\.*-grams:$",l)
# initial ngram line: \{N}-grams
# --- Setting up the "output_order" variable if it was not set,
# it is done right after reading the header
if (output_order < 0)
max_non_zero_order = 0
for (i, n) in enumerate(status.n_ngram_header)
if n > 0
max_non_zero_order = i
end
end
output_order = max_non_zero_order
end
# ---
s = match(r"\\(\d)+-grams.*", l)
order = parse(Int64, s[1])
status.order = order
push!(status.n_ngram_current, 0)
@assert (status.n_ngram_current[order] == 0) "ERROR: header is probably corrupted. Started reading a different ngram order than expected"
elseif nsplit > 1 && status.order > 0 && status.order <= output_order # ngrams
status.n_ngram_current[status.order] += 1
prob = parse(Float64, splitline[1])
# Detect/set the back-off weight (`nothing` if the ARPA line has no back-off column)
backoff = nsplit >= (status.order + 2) ? parse(Float64, splitline[status.order + 2]) : nothing
ngram = splitline[2:(status.order + 1)]
ngram = ( zeros(output_order - length(ngram))..., ngram...,)
dct_arpa[ngram] = (prob, backoff)
elseif nsplit > 1 && status.order > output_order # ngrams
println("WARNING: order of ngram: $(ngram) is higher than given $(output_order). Skipp")
end
end
close(f)
return dct_arpa
end
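# A minimal usage sketch for `read_arpafile` (hedged: the file name and words are
# hypothetical, the exact keys and values depend on the LM):
#
# dct_arpa = read_arpafile("lm.3gram.arpa.gz"; output_order=3)
#
# # lower-order entries are left-padded with zeros up to `output_order`:
# dct_arpa[(0.0, 0.0, "HELLO")]          # => (logprob, backoff) of the unigram
# dct_arpa[("HELLO", "WORLD", "AGAIN")]  # => (logprob, nothing) for a trigram without back-off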
"""
::Dict = read_ngram_dict(fd::IO, ngram_order::Int64)
Read one table of ngrams of the same order from an ARPA file.
"""
function read_ngram_dict(fd::T, ngram_order::Int64) where {T <: IO}
ngram_dict = Dict()
while true
# file ended ?
if eof(fd); break; end
# keyword follows ? (the `\2-grams:` line is not consumed)
if Char(peek(fd)) == '\\'; break; end
# readline
line = strip(readline(fd))
arpa_columns = split(line)
nsplit = length(arpa_columns)
# skip empty lines
if (nsplit == 0); continue; end
# parse the line
# TODO: convert 10-base to E-base ?
prob = parse(Float64, arpa_columns[1])
ngram = arpa_columns[2:(1 + ngram_order)]
backoff = nothing
if (nsplit == ngram_order + 2)
backoff = parse(Float64, arpa_columns[ngram_order + 2])
end
ngram_dict[(ngram...,)] = (prob, backoff)
end
return ngram_dict
end
"""
::Vector{Dict} = ngram_read_arpafile(arpafile::String)
Read an SRILM ARPA LM in plain-text or gzipped (.gz) format.
-It produces list of dictionaries of tuples like:
-[Dict{(w1) => (logprob, backoff)},
+It produces Vector of Dict(Tuple, Tuple) holding n-gram tables:
+[Dict{(w1,) => (logprob, backoff)},
Dict{(w1,w2) => (logprob, backoff)},
Dict{(w1,w2,w3) => (logprob, backoff)},
...
]
If the back-off value is not present in the ARPA file, it is stored as `nothing`.
"""
-function read_arpafile_as_dicts(arpafile::String)
+function ngram_read_arpafile(arpafile::String)
if occursin(r".*\.gz$", arpafile)
fd = GZip.open(arpafile, "r")
@@ -228,7 +82,7 @@ function read_arpafile_as_dicts(arpafile::String)
ngram_order = parse(Int64, s[1])
# read the ngrams
-ngram_dict = read_ngram_dict(fd, ngram_order)
+ngram_dict = _read_ngram_dict(fd, ngram_order)
if length(ngram_dict) != ngram_count_in_header[ngram_order]
println("""WARNING: number of ngrams does not match !
@@ -252,3 +106,139 @@ function read_arpafile_as_dicts(arpafile::String)
return ngram_dicts
end
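# A hedged sketch of the structure returned by `ngram_read_arpafile`
# (the file name and words are hypothetical, the exact values depend on the LM):
#
# ngram_dicts = ngram_read_arpafile("lm.3gram.arpa.gz")
# length(ngram_dicts)                      # => LM order, here 3
# ngram_dicts[1][("HELLO",)]               # => (logprob, backoff) of a unigram
# ngram_dicts[3][("A", "GENTLE", "KICK")]  # => (logprob, nothing), highest order has no back-off
#
# The log-probabilities are already converted from log10 (ARPA) to natural log
# by `_read_ngram_dict` below.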
"""
::Dict = _read_ngram_dict(fd::IO, ngram_order::Int64)
Read one table of ngrams of the same order from an ARPA file (log-probs converted to natural log).
"""
function _read_ngram_dict(fd::T, ngram_order::Int64) where {T <: IO}
ngram_dict = Dict()
log_base_10_to_e = log(10.) # = 1/log10(e); multiplying converts log10 values to natural log
while true
# file ended ?
if eof(fd); break; end
# keyword follows ? (the `\2-grams:` line is not consumed)
if Char(peek(fd)) == '\\'; break; end
# readline
line = strip(readline(fd))
arpa_columns = split(line)
nsplit = length(arpa_columns)
# skip empty lines
if (nsplit == 0); continue; end
# parse the line
prob = parse(Float64, arpa_columns[1])
ngram = arpa_columns[2:(1 + ngram_order)]
backoff = nothing
if (nsplit == ngram_order + 2)
backoff = parse(Float64, arpa_columns[ngram_order + 2])
end
# convert log-base: 10 -> e
prob *= log_base_10_to_e
if backoff != nothing
backoff *= log_base_10_to_e
end
ngram_dict[(ngram...,)] = (prob, backoff)
end
return ngram_dict
end
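# Worked example of the log-base conversion above: ARPA files store log10 probabilities,
# so a line "-0.30103 HELLO" (i.e. P = 0.5) is stored as
#   -0.30103 * log(10.) ≈ -0.6931 == log(0.5)
# in the natural-log domain used by the scoring functions below.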
"""
Get the LM score of a single ngram represented as a tuple:
`(w1, w2, w3)`
Out-of-vocabulary words are replaced by `unk`; missing ngrams are scored via back-off recursion.
"""
function ngram_get_score(ngram_dicts::Vector, ngram::Tuple; unk::String="<UNK>")
@assert length(ngram) <= length(ngram_dicts)
# replace OOVs with "<UNK>"
ngram_ = []
for w in ngram
haskey(ngram_dicts[1], (w,)) ? push!(ngram_, w) : push!(ngram_, unk)
end
ngram = (ngram_...,)
# get the score
if haskey(ngram_dicts[length(ngram)], ngram)
# n-gram recursion
return ngram_dicts[length(ngram)][ngram][1]
else
if length(ngram) == 1
@assert false "Key error for unigram `$(ngram)`. This sholud not happen..."
end
shorter_ngram = ngram[2:end]
prob = ngram_get_score(ngram_dicts, shorter_ngram; unk=unk)
backoff = let
history = ngram[begin:(end-1)]
len_minus_1 = length(ngram) - 1
if haskey(ngram_dicts[len_minus_1], history)
backoff = ngram_dicts[len_minus_1][history][2]
else
backoff = nothing
end
# 0.0 if ngram missing or no backoff value in arpa
(backoff == nothing ? 0.0 : backoff)
end
return backoff + prob # log scores
end
end
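# A hedged sketch of the back-off recursion performed by `ngram_get_score`,
# written in log space (the words are illustrative):
#
# score(("A", "GENTLE", "KICK"))
#   = logprob[("A","GENTLE","KICK")]                          ... if the trigram is in the LM
#   = backoff[("A","GENTLE")] + score(("GENTLE","KICK"))      ... otherwise
#
# where a missing history or a missing back-off weight contributes 0.0.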
"""
Score a sentence using the ngram model.
The `sentence` is split into words and wrapped with the `<s>` / `</s>` boundary symbols.
Per-ngram scores are printed and the total log score is returned.
"""
function ngram_score_sentence(ngram_dicts::Vector, sentence::String; unk::String="<UNK>")
ngram_order = length(ngram_dicts)
words = split(sentence)
pushfirst!(words, "<s>")
push!(words, "</s>")
scores = []
ngrams = []
for w_j in 2:length(words)
w_i = w_j - ngram_order + 1
if w_i < 1; w_i = 1; end
ngram = words[w_i:w_j]
ngram = (ngram...,) # tuple
push!(ngrams, ngram)
push!(scores, ngram_get_score(ngram_dicts, ngram; unk=unk))
end
println("LM scores $(collect(zip(ngrams, scores)))")
return sum(scores)
end
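# A hedged walk-through of the sliding window above, assuming a 3-gram model
# and the (hypothetical) sentence "A GENTLE KICK": after adding the boundary
# symbols, words = ["<s>", "A", "GENTLE", "KICK", "</s>"] and the scored ngrams are
# ("<s>","A"), ("<s>","A","GENTLE"), ("A","GENTLE","KICK"), ("GENTLE","KICK","</s>").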
# Example:
#
# ngram_dicts = ngram_read_arpafile("/mnt/matylda5/iveselyk/FAST-ASR_julia/lm_arpa_librispeech/3-gram.pruned.3e-7.arpa.gz")
#
# ngram_get_score(ngram_dicts, ("A", "GENTLE", "KICK"))
#
# ngram_score_sentence(ngram_dicts, "A GENTLE KICK FROM THE TALL BOY IN THE BENCH BEHIND URGED STEPHEN TO ASK A DIFFICULT QUESTION")