# SPDX-License-Identifier: CECILL-2.1
module SpeechRecognitionFSTs
using ..TensorFSTs
using ..TensorFSTs.LinearFSTs
export EmissionMapping, lexiconfst, tokenfst
"""
    lexiconfst(S, encoded_lexicon)

Build an FST representation of a pronunciation lexicon as the union of
one linear FST per pronunciation.

All leaves of `encoded_lexicon` should be represented as `Integer`.

# Arguments
- `S`: the FST arc's weight type (e.g. a `Semiring`).
- `encoded_lexicon`: collection of `(token, [unit1, unit2, …])` pairs; a
  token may appear several times, once per alternative pronunciation.

Returns `nothing` when the lexicon is empty.

# Example
```juliarepl
julia> lexicon = [1 => [1, 2, 3], 1 => [3,4,1], 2 => [1, 2]]
julia> lexiconfst(LogSemiring{Float32, 1.0}, lexicon)
TensorFST(
    …
)
```
"""
function lexiconfst(S, lexicon)
    fst = nothing
    for (token, pronunciations) in lexicon
        for units in pronunciations
            n = length(units)
            # Input side spells out the units; the output side emits the
            # token on the first arc and the filler symbol 1 on the rest
            # (presumably epsilon — confirm against the symbol table).
            outsyms = vcat([token], ones(Int, n - 1))
            pronfst = linearfst(ones(S, n), units, outsyms)
            # Fold every pronunciation FST into a single union FST.
            fst = fst === nothing ? pronfst : union(fst, pronfst)
        end
    end
    return fst
end
"""
    struct EmissionMapping <: AbstractMatrix{Int}
        numstates::Int
        numtokens::Int
    end

Default emission mapping which assumes the same number of emissions per
token. Entry `(i, j)` is the emission symbol of state `i` of token `j`.
Symbols start at 2; symbol 1 appears to be reserved (presumably epsilon
— confirm).
"""
struct EmissionMapping <: AbstractMatrix{Int}
    numstates::Int
    numtokens::Int
end

# Lazy `numstates × numtokens` matrix: entries are computed on demand,
# nothing is stored besides the two dimensions.
function Base.size(m::EmissionMapping)
    return (m.numstates, m.numtokens)
end

# Column-major enumeration of the cells, shifted by one so that the
# smallest emitted symbol is 2.
function Base.getindex(m::EmissionMapping, row, col)
    return (col - 1) * m.numstates + row + 1
end

# Largest symbol, reached at `(numstates, numtokens)`.
function Base.maximum(m::EmissionMapping)
    return m.numstates * m.numtokens + 1
end
"""
    tokenfst(S, topo, initweights, finalweights, tokens,
             mapping = EmissionMapping(length(topo), length(tokens)))

Create a FST composed of a set of smaller FSTs, one per token. Each
token shares the same FST topology specified with
`topo = [(src, dest, weight), ...]`.

# Arguments
- `S`: `Semiring` (weight) type of the output FST.
- `topo`: shared arc topology, one `(src, dest, weight)` tuple per arc.
- `initweights` / `finalweights`: `(state, weight)` pairs giving the
  initial/final states of each per-token copy, in `topo`'s numbering.
- `tokens`: list of token IDs.
- `mapping`: `mapping[k]` is the *input* (emission) symbol of the `k`-th
  arc created overall.

The first arc of each token's copy outputs the token itself; every other
arc outputs symbol 1 (presumably epsilon — confirm).
"""
function tokenfst(
    S,
    topo,
    initweights,
    finalweights,
    tokens,
    mapping = EmissionMapping(length(topo), length(tokens))
)
    emission_count = 0
    # Typed containers instead of `[]` (`Vector{Any}`): keeps element
    # types concrete for the `TensorFST` constructor.
    states = Set{Int}()
    arcs = Arc[]
    init = Pair{Int,S}[]
    final = Pair{Int,S}[]
    for token in tokens
        # Shift this token's copy of the topology past every state
        # created so far.
        offset = length(states)
        for (j, topo_arc) in enumerate(topo)
            emission_count += 1
            src = offset + topo_arc[1]
            dest = offset + topo_arc[2]
            arc = Arc(
                src = src,
                isym = mapping[emission_count],
                # Only the first arc of the copy emits the token.
                osym = j == 1 ? token : 1,
                dest = dest,
                weight = S(topo_arc[3])
            )
            push!(states, src)
            push!(states, dest)
            push!(arcs, arc)
        end
        for (state, weight) in initweights
            state = offset + state
            push!(states, state)
            push!(init, state => S(weight))
        end
        for (state, weight) in finalweights
            state = offset + state
            push!(states, state)
            push!(final, state => S(weight))
        end
    end
    return TensorFST(arcs, init, final)
end
end