# SPDX-License-Identifier: CECILL-2.1

module SpeechRecognitionFSTs

using ..TensorFSTs
using ..TensorFSTs.LinearFSTs

export EmissionMapping, lexiconfst, tokenfst

"""
    struct EmissionMapping <: AbstractMatrix{Int}
        numpdfs::Int
        numtokens::Int
    end

Default emission mapping which assumes the same number of emissions per
token.

The mapping behaves as a `numpdfs × numtokens` matrix whose entry `(i, j)`
is `(j - 1) * numpdfs + i`; its entries therefore range over
`1:(numpdfs * numtokens)`.
"""
struct EmissionMapping <: AbstractMatrix{Int}
    numpdfs::Int
    numtokens::Int
end

Base.size(x::EmissionMapping) = (x.numpdfs, x.numtokens)
Base.getindex(x::EmissionMapping, i, j) = (j - 1) * x.numpdfs + i

# Closed-form extrema: entries enumerate 1, 2, ..., numpdfs * numtokens.
Base.minimum(x::EmissionMapping) = 1
Base.maximum(x::EmissionMapping) = x.numpdfs * x.numtokens

"""
    tokenfst(S, topo, initweights, finalweights, tokens, mapping)

Create a FST composed of a set of smaller FSTs, one per token. Every token
shares the same FST topology specified with
`topo = [(src, dest, weight), ...]`.

# Arguments
- `S`: semiring of the output FST; every weight `w` is converted with `S(w)`.
- `topo`: list of `(src, dest, weight)` arcs describing the per-token
  topology. State numbers are shifted by the number of states already
  created before being added to the result.
- `initweights`: iterable of `(state, weight)` pairs giving the initial
  states of the topology.
- `finalweights`: iterable of `(state, weight)` pairs giving the final
  states of the topology.
- `tokens`: list of token IDs. The first arc of each token's topology
  outputs the token ID; every other arc outputs `0`.
- `mapping`: indexable collection of input symbols. The input symbol of an
  arc is `mapping[k]`, where `k` is a running emission counter that starts
  at `1` and is incremented each time the destination of an arc differs
  from the destination of the preceding arc in `topo`. Defaults to
  `EmissionMapping(length(topo), length(tokens))`.

# Returns
`TensorFST(arcs, init, final)` built from the accumulated arcs, initial
weights and final weights.
"""
function tokenfst(
    S,
    topo,
    initweights,
    finalweights,
    tokens,
    mapping = EmissionMapping(length(topo), length(tokens))
)
    # Destination state of every topology arc; identical for all tokens, so
    # it is computed once outside the token loop.
    dests = [topo_arc[2] for topo_arc in topo]

    emission_count = 1
    states = Set{Int}()
    arcs, init, final = [], [], []

    for token in tokens
        # States of this token are numbered after all states created so far.
        offset = length(states)

        for (j, topo_arc) in enumerate(topo)
            # A new emission starts whenever the destination changes between
            # two consecutive arcs of the topology.
            # NOTE(review): the counter is not advanced when moving to the
            # next token, so the first arc of a token reuses the emission
            # index of the previous token's last arc - confirm this is
            # intended.
            if j > 1 && topo_arc[2] != dests[j - 1]
                emission_count += 1
            end

            src = offset + topo_arc[1]
            dest = offset + topo_arc[2]
            weight = topo_arc[3]

            arc = Arc(
                src = src,
                isym = mapping[emission_count],
                # Only the first arc of a token emits the token ID.
                osym = j == 1 ? token : 0,
                dest = dest,
                weight = S(weight)
            )

            push!(states, src)
            push!(states, dest)
            push!(arcs, arc)
        end

        for (state, weight) in initweights
            state = offset + state
            push!(states, state)
            push!(init, state => S(weight))
        end

        for (state, weight) in finalweights
            state = offset + state
            push!(states, state)
            push!(final, state => S(weight))
        end
    end

    return TensorFST(arcs, init, final)
end

include("trie.jl")
include("lexicon.jl")

end