Date: Mon, 8 Apr 2024 12:09:04 -0300
Subject: [PATCH] benchmark update

 benchmark/Project.toml         |  17 +++
 benchmark/composition.jl       | 103 ++++++++++++++++++
 benchmark/run_benchmarks.jl    |   5 +
 benchmark/shortest_distance.jl | 106 ++++++++++++++++++
 benchmark/utils.jl             | 192 +++++++++++++++++++++++++++++++++
 5 files changed, 423 insertions(+)
 create mode 100644 benchmark/Project.toml
 create mode 100644 benchmark/composition.jl
 create mode 100644 benchmark/run_benchmarks.jl
 create mode 100644 benchmark/shortest_distance.jl
 create mode 100644 benchmark/utils.jl

diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 0000000..a4368ec
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,17 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
+IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
+LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
+NaNStatistics = "b946abbf-3ea7-4610-9019-9858bfdeaf2d"
+OpenFst = "3e215157-cce3-4b04-a8eb-cbfb077f1dc8"
+Pluto = "c3e4b0f8-55cb-11ea-2926-15256bba5781"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
+Semirings = "900aad66-9ca5-44d4-b043-321c62cb7767"
+SparseArrayKit = "a9a3c162-d163-4c15-8926-b8794fbefed2"
+SumSparseTensors = "472dc678-8c5a-4778-a6eb-28a4e9c7cb58"
+TensorFSTs = "9f8e39db-0d6b-46cc-a14d-daf437028eec"
+Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
diff --git a/benchmark/composition.jl b/benchmark/composition.jl
new file mode 100644
index 0000000..fd0b73b
--- /dev/null
+++ b/benchmark/composition.jl
@@ -0,0 +1,103 @@
+using DataFrames
+using Semirings
+using TensorFSTs
+using CSV
+using BenchmarkTools
+using OpenFst
+using Glob
+using IterTools
+# Machine-type pairs to compose, as (type of left operand, type of right operand).
+# Each entry is matched against the "type" column that the script derives from the
+# second path component of every machine file.
+# NOTE(review): this global shadows `Base.pairs` for the rest of the script.
+pairs = [("dense","topology"),
+         ("dense","charlm"),
+         ("dense","random"),
+         ("topology","dense")]
+"""
+    compbench(compfunc, machineA, machineB, seconds)
+
+Benchmark the call `compfunc(machineA, machineB)` and return the recorded sample
+times (`times` field of the BenchmarkTools trial).
+
+The run is limited to 100 samples and to a time budget of `seconds`, with exactly one
+evaluation per sample.
+"""
+function compbench(compfunc, machineA, machineB, seconds)
+    b = @benchmarkable $compfunc($machineA, $machineB)
+    # `tune!` is intentionally skipped: `evals` is fixed to 1 below.
+    t = run(b, samples=100, seconds=seconds, evals=1)
+    return t.times
+end
+"""
+    compbench2(compfunc, machineA, machineB, seconds)
+
+Benchmark `compfunc` on the sparse-array form of two machines and return the recorded
+sample times.
+
+Both machines are converted with `tensorFST2SparseArray`, which yields a 4-way array
+plus two further values each. The fourth dimension of the first array and the third
+dimension of the second are cropped to their common minimum before benchmarking
+`compfunc(AM, BM, Aa, Aw, Ba, Bw)`.
+
+NOTE(review): `tensorFST2SparseArray` is not defined in the visible part of this file;
+confirm where it comes from before enabling the commented-out caller.
+"""
+function compbench2(compfunc, machineA, machineB, seconds)
+    AM, Aa, Aw = tensorFST2SparseArray(machineA)
+    BM, Ba, Bw = tensorFST2SparseArray(machineB)
+    # Crop the two dimensions that are paired up to the same extent.
+    min_cd = min(AM.dims[4], BM.dims[3])
+    AM = AM[:, :, :, 1:min_cd]
+    BM = BM[:, :, 1:min_cd, :]
+    b = @benchmarkable $compfunc($AM, $BM, $Aa, $Aw, $Ba, $Bw)
+    # `tune!` is intentionally skipped: `evals` is fixed to 1 below.
+    t = run(b, samples=100, seconds=seconds, evals=1)
+    return t.times
+end
+"""
+    renamepath(path)
+
+Build a short `"<type>-<name>"` label from a machine file path.
+
+The second path component is used as the type; the third component, with every
+occurrence of `".fst"` removed, is used as the name. `path` must therefore have at
+least three components.
+"""
+function renamepath(path)
+    parts = splitpath(path)
+    type = parts[2]
+    name = replace(parts[3], ".fst" => "")
+    return type * "-" * name
+end
+# Location of MachineZoo.jl relative to the working directory.
+machinezoo_path = "../../MachineZoo.jl/"
+# Composed machines are written to this directory. `mkpath` creates any missing
+# parent directories and does nothing when the directory already exists.
+mkpath(joinpath(machinezoo_path, "machines", "composition"))
+# Benchmark time budgets in seconds: `tseconds` for TensorFSTs, `oseconds` for OpenFst.
+tseconds = 4
+oseconds = 1
+# Read the fstinfo.csv of every machine type and stack them into one table.
+dfs = DataFrame[DataFrame(CSV.File(path))
+                for path in glob(joinpath(machinezoo_path, "machines/*/fstinfo.csv"))]
+# `reduce(vcat, ...)` avoids splatting a collection of unknown length.
+df = reduce(vcat, dfs)
+# The machine type is the second path component of each entry in the "file" column.
+df[!, "type"] = map(x -> splitpath(x)[2], df[!, "file"])
+# Benchmark composition for every combination of machines whose types form one of the
+# configured pairs. One result row is collected per combination whose composition has
+# at least one state.
+# NOTE(review): `OF`, `TF`, `mean` and `std` are not brought into scope by any visible
+# line of this file — presumably aliases/imports that are missing here; confirm.
+results = []
+for pair in pairs
+    dfx = df[df.type.==pair[1],:]
+    dfy = df[df.type.==pair[2],:]
+    for p in product(dfx.file, dfy.file)
+        # NOTE(review): the right-hand sides of the next two assignments are truncated
+        # in this patch (`=, p[1]))` is not valid Julia). The call that loads the
+        # machine from its file must be restored before the script can run.
+        ofstA =, p[1]))
+        ofstB =, p[2]))
+        println(p)
+        ofstC = OF.compose(ofstA, ofstB)
+        # Combinations whose composition has no states are not benchmarked.
+        if OF.numstates(ofstC) == 0  
+            println("skipping")
+            continue
+        end
+        p1 = renamepath(p[1])
+        p2 = renamepath(p[2])
+        # Keep the composed machine next to the other machines.
+        OF.write(ofstC, joinpath(machinezoo_path, "machines", "composition", "$(p1)-x-$(p2).fst"))
+        otimes = compbench(OF.compose, ofstA, ofstB, oseconds)
+        print(" of: ")
+        print(mean(otimes))
+        ttimes = nothing
+        # Any exception raised by the TensorFSTs run is swallowed and recorded as a
+        # single `Inf` sample so that the row is still written.
+        try
+            ttimes = compbench(TF.compose,TF.TensorFST(ofstA), TF.TensorFST(ofstB), tseconds)
+            print(" tf: ")
+            print(mean(ttimes))
+        catch
+            println("tf failed")
+            ttimes = [Inf]
+        end
+        # Disabled Tullio-based variant:
+        # tutimes = compbench2(tulliocompose, TF.TensorFST(ofstA), TF.TensorFST(ofstB), tseconds)
+        # print(" tu: ")
+        # print(mean(tutimes))
+        println()
+        push!(results, (fileA=p[1], fileB=p[2],
+            tmin=minimum(ttimes), tmax=maximum(ttimes), tmean=mean(ttimes), tstd=std(ttimes), tlen=length(ttimes), 
+            omin=minimum(otimes), omax=maximum(otimes), omean=mean(otimes), ostd=std(otimes), olen=length(otimes),
+            # tumin=minimum(tutimes), tumax=maximum(tutimes), tumean=mean(tutimes), tustd=std(tutimes), tulen=length(tutimes)
+        ))
+    end
+# NOTE(review): no `end` closes the `for pair in pairs` loop before this line.
+CSV.write("composition_benchmark.csv", DataFrame(results))
diff --git a/benchmark/run_benchmarks.jl b/benchmark/run_benchmarks.jl
new file mode 100644
index 0000000..fb6f033
--- /dev/null
+++ b/benchmark/run_benchmarks.jl
@@ -0,0 +1,5 @@
+# NOTE(review): despite the .jl extension, this file contains shell commands, not Julia.
+# Point the dynamic loader at the OpenFst.jl directories (replaces any existing value
+# of LD_LIBRARY_PATH).
+export LD_LIBRARY_PATH=../../OpenFst.jl/src/:../../OpenFst.jl/openfst-1.8.3/src/lib
+# Run both benchmark scripts in the Julia project named by $JULIA_ENV.
+julia --project=$JULIA_ENV shortest_distance.jl
+julia --project=$JULIA_ENV composition.jl
\ No newline at end of file
diff --git a/benchmark/shortest_distance.jl b/benchmark/shortest_distance.jl
new file mode 100644
index 0000000..9694646
--- /dev/null
+++ b/benchmark/shortest_distance.jl
@@ -0,0 +1,106 @@
+using DataFrames
+using Semirings
+using TensorFSTs
+using CSV
+using BenchmarkTools
+using OpenFst
+using Glob
+using SparseArrays
+using CUDA
+using NaNStatistics
+"""
+    sdbench(sdfunc, machine, seconds)
+
+Benchmark the call `sdfunc(machine)` and return the recorded sample times (`times`
+field of the BenchmarkTools trial).
+
+The run is limited to 100 samples and to a time budget of `seconds`, with exactly one
+evaluation per sample.
+"""
+function sdbench(sdfunc, machine, seconds)
+    b = @benchmarkable $sdfunc($machine)
+    # `tune!` is intentionally skipped: `evals` is fixed to 1 below.
+    t = run(b, samples=100, seconds=seconds, evals=1)
+    return t.times
+end
+# Location of MachineZoo.jl relative to the working directory.
+machinezoo_path = "../../MachineZoo.jl/"
+# Benchmark time budgets in seconds.
+# NOTE(review): `oseconds` is not referenced by the visible benchmark loop.
+tseconds = 4
+oseconds = 1
+# Read every fstinfo.csv found two directory levels below machines/ and stack them
+# into one table.
+dfs = DataFrame[DataFrame(CSV.File(path))
+                for path in glob(joinpath(machinezoo_path, "machines/*/*/fstinfo.csv"))]
+# `reduce(vcat, ...)` avoids splatting a collection of unknown length.
+df = reduce(vcat, dfs)
+# Benchmark the shortest-distance implementations on every machine listed in `df`.
+# An implementation is timed only if its result is approximately equal to the OpenFst
+# result; otherwise a single NaN sample is recorded for it.
+# NOTE(review): `OF` and `TF` are not brought into scope by any visible line of this
+# file — presumably aliases for OpenFst and TensorFSTs; confirm.
+results = []
+for r in eachrow(df)
+	# NOTE(review): the right-hand side of the next assignment is truncated in this
+	# patch (`=, r["file"]))` is not valid Julia). The call that loads the machine from
+	# its file must be restored before the script can run.
+	ofst =, r["file"]))
+	# Skip machines without states.
+	if OF.numstates(ofst) == 0
+		continue
+	end
+	# Skip machines with more than 100000 arcs.
+	if r["# of arcs"] > 100000
+		continue
+	end
+	# Skip cyclic machines whose arc type is "log".
+	if r["cyclic"] == "y" && r["arc type"] == "log"
+		continue
+	end
+    println(r["file"])
+	tfst = TF.TensorFST(ofst)	
+	A_cpu, A_gpu = machine2matrices(tfst)
+	times = Dict()
+	# Compute every result once so each can be checked against the OpenFst reference.
+	sd0 = OF.shortestdistance(ofst)
+	sd1 = TF.shortestdistance(tfst)
+	sd2 = cpu_shortest_distance(A_cpu)
+	sd3 = cu_shortest_distance(A_gpu)
+	# NOTE(review): the OpenFst run uses `tseconds` here, while composition.jl uses
+	# `oseconds` for OpenFst — confirm which budget is intended.
+	times["ofst"] = sdbench(OF.shortestdistance,ofst, tseconds)
+	if isapprox(sd0,val.(sd1[:]))
+		times["tfst"] = sdbench(TF.shortestdistance,tfst, tseconds)
+	else
+		times["tfst"] = [NaN]
+	end
+	if isapprox(sd0, val.(sd2[:]))
+		times["cpufst"] = sdbench(cpu_shortest_distance, A_cpu, tseconds)
+	else
+		times["cpufst"] = [NaN]
+	end
+	# GPU results are copied back to the host with `Array` before comparison.
+	if isapprox(sd0, val.(Array(sd3)))
+		times["gpufst"] = sdbench(cu_shortest_distance, A_gpu, tseconds)
+	else
+		times["gpufst"] = [NaN]
+	end
+	# The acyclic variants are only run on machines marked as not cyclic.
+	if r["cyclic"]=="n"
+		sd4 = cpu_acyclic_shortest_distance(A_cpu)
+		sd5 = cu_acyclic_shortest_distance(A_gpu)
+		if isapprox(sd0, val.(sd4[:]))
+			times["cpufst_acyclic"] = sdbench(cpu_acyclic_shortest_distance, A_cpu, tseconds)
+		else
+			times["cpufst_acyclic"] = [NaN]
+		end
+		if isapprox(sd0, val.(Array(sd5)))
+			times["gpufst_acyclic"] = sdbench(cu_acyclic_shortest_distance, A_gpu, tseconds)
+		else
+			times["gpufst_acyclic"] = [NaN]
+		end
+	end
+	# Summarise each timing vector with NaN-aware statistics; `_len` counts non-NaN samples.
+	stats = Dict()
+	stats[Symbol("file")] = r["file"]
+	for (k,v) in times
+		stats[Symbol("$(k)_min")] = nanminimum(v)
+		stats[Symbol("$(k)_max")] = nanmaximum(v)
+		stats[Symbol("$(k)_mean")] = nanmean(v)
+		stats[Symbol("$(k)_std")] = nanstd(v)
+		stats[Symbol("$(k)_len")] = length(filter(!isnan, v))
+	end
+	push!(results, NamedTuple(stats))
+# NOTE(review): no `end` closes the `for r in eachrow(df)` loop before this line.
+# Attach the statistics to the machine metadata, matching rows on the "file" column.
+joined = innerjoin(df, DataFrame(results), on = :file)
+CSV.write("shortest_distance_benchmark.csv", joined)
diff --git a/benchmark/utils.jl b/benchmark/utils.jl
new file mode 100644
index 0000000..c47d666
--- /dev/null
+++ b/benchmark/utils.jl
@@ -0,0 +1,192 @@
+# Scaled log-add-exp: `logaddexp` applied to `b*x` and `b*y`, divided by `b`.
+function _logaddexp(b, x, y)
+    return inv(b) * logaddexp(b * x, b * y)
+end
+# Tropical semiring: addition keeps the smaller value, multiplication adds the values.
+function Base.:+(x::S, y::S) where S<:TropicalSemiring
+    return S(min(val(x), val(y)))
+end
+function Base.:*(x::S, y::S) where S<:TropicalSemiring
+    return S(val(x) + val(y))
+end
+# Log semiring: addition is the scaled log-add-exp of the values, multiplication adds them.
+function Base.:+(x::LogSemiring{T,b}, y::LogSemiring{T,b}) where {T,b}
+    return LogSemiring{T,b}(_logaddexp(b, val(x), val(y)))
+end
+function Base.:*(x::S, y::S) where S<:LogSemiring
+    return S(val(x) + val(y))
+end
+"""
+    cu_shortest_distance(A)
+
+Accumulate the vectors `u`, `A*u`, `A*(A*u)`, … on the GPU until the running sum
+stops changing, and return that sum as a `CuVector`.
+
+`A` is handed to `call_csr_spmv_vector_kernel` for every product. The start vector has
+`eltype(A)` elements, all produced by `zeros` except the first entry, which is
+assigned `1`. Iteration stops once the `val`s of the running sum are approximately
+equal (`≈`) to those of the previous iteration; there is no iteration cap.
+"""
+function cu_shortest_distance(A)
+    K = eltype(A)
+    xk = zeros(K, size(A)[2])
+    xk[1] = 1
+    u_n = CUDA.CuVector(xk)
+    res = similar(u_n)
+    prevres = similar(u_n)
+    copyto!(res, u_n)
+    copyto!(prevres, u_n)
+    stop = false
+    while !stop
+        u_n = call_csr_spmv_vector_kernel(A, u_n)
+        res += u_n
+        stop = all(val.(res) ≈ val.(prevres))
+        copyto!(prevres, res)
+    end
+    return res
+end
+"""
+    cpu_acyclic_shortest_distance(A)
+
+Accumulate the row vectors `u`, `u*A`, `u*A*A`, … over exactly `size(A)[1]` products
+and return the sum as a `1 × size(A)[1]` array.
+
+The start vector has `eltype(A)` elements, all produced by `zeros` except the first
+entry, which is assigned `1`.
+
+Unoptimized: the generic `*` is used; a custom CSR SpMV should be used instead.
+"""
+function cpu_acyclic_shortest_distance(A)
+    K = eltype(A)
+    u_n = zeros(K, (1, size(A)[1]))
+    u_n[1] = 1
+    res = copy(u_n)
+    for _ in 1:size(A)[1]
+        u_n = u_n * A
+        res += u_n
+    end
+    return res
+end
+"""
+    cu_acyclic_shortest_distance(A)
+
+GPU counterpart of the fixed-iteration accumulation: run `size(A)[1]` passes and
+return the accumulated `CuVector`.
+
+In pass `i`, `call_csr_spmv_vector_kernel2(A, u_n, i)` updates `u_n` in place, and
+the updated vector is then added to the running sum. The start vector has `eltype(A)`
+elements, all produced by `zeros` except the first entry, which is assigned `1`.
+"""
+function cu_acyclic_shortest_distance(A)
+    K = eltype(A)
+    xk = zeros(K, size(A)[2])
+    xk[1] = 1
+    u_n = CUDA.CuVector(xk)
+    res = similar(u_n)
+    copyto!(res, u_n)
+    for i in 1:size(A)[1]
+        # The return value is `u_n` itself, so it is not rebound here.
+        call_csr_spmv_vector_kernel2(A, u_n, i)
+        res += u_n
+    end
+    return res
+end
+"""
+    cpu_shortest_distance(A)
+
+Accumulate the row vectors `u`, `u*A`, `u*A*A`, … until the running sum stops
+changing, and return that sum as a `1 × size(A)[1]` array.
+
+The start vector has `eltype(A)` elements, all produced by `zeros` except the first
+entry, which is assigned `1`. Convergence is decided by `_has_changed`; there is no
+iteration cap.
+"""
+function cpu_shortest_distance(A)
+    K = eltype(A)
+    u_n = zeros(K, (1, size(A)[1]))
+    u_n[1] = 1
+    res = copy(u_n)
+    prevres = copy(u_n)
+    stop = false
+    while !stop
+        u_n = u_n * A
+        res += u_n
+        stop = !_has_changed(res, prevres)
+        copyto!(prevres, res)
+    end
+    return res
+end
+"""
+    _has_changed(x, y)
+
+Return `true` if, for any index of `x`, `val(x[i])` is not approximately equal (`≈`)
+to `val(y[i])`; return `false` otherwise, including when `x` is empty.
+"""
+function _has_changed(x, y)
+    # `any` short-circuits on the first differing entry.
+    return any(i -> !(val(x[i]) ≈ val(y[i])), eachindex(x))
+end
+"""
+    machine2matrices(tfst)
+
+Build a CPU sparse matrix and a GPU CSR matrix from `tfst.M` and return them as
+`(A_cpu, A_gpu)`.
+
+`tfst.M` is summed over its dimensions 3 and 4 and sorted; the first and last
+coordinates of every stored entry become the row and column indices. `A_cpu` holds the
+stored values themselves, with element type `semiring(tfst)` for `A_gpu`.
+
+NOTE(review): `A_gpu` reuses the row pointers and column indices of the CSR form of
+`transpose(A)` together with `A_cpu.nzval`. This presumably relies on the CSC layout of
+`A` matching the CSR layout of its transpose — confirm.
+"""
+function machine2matrices(tfst)
+    K = semiring(tfst)
+    Ma = sort(sum(tfst.M, dims=(3,4)), 1)
+    row_ids = Int32.(first.(Ma.nzcoo))
+    col_ids = Int32.(last.(Ma.nzcoo))
+    A_cpu = sparse(row_ids, col_ids, Ma.nzval, size(Ma)...)
+    # Same sparsity pattern with plain `val` weights, used only to obtain the CSR
+    # structure of the transpose on the GPU.
+    A = sparse(row_ids, col_ids, val.(Ma.nzval), size(Ma)...)
+    A_d = CUDA.CUSPARSE.CuSparseMatrixCSR(transpose(A))
+    # Rebuild the GPU matrix with element type `K` on top of that structure.
+    A_gpu = CUDA.CUSPARSE.CuSparseMatrixCSR{K}(
+        A_d.rowPtr,
+        A_d.colVal,
+        convert(CuVector{K}, A_cpu.nzval),
+        A_d.dims)
+    return A_cpu, A_gpu
+end
+"""
+    warp_reduce(x::T) where T <: Semiring
+
+Combine `x` across the lanes of a warp with the semiring `+` and return the result.
+
+The offset starts at half the warp size and is halved each step. At every step the
+`val` of `x` is fetched from the lane `offset` positions further down with
+`CUDA.shfl_down_sync`, rewrapped as `T`, and added to `x`.
+"""
+function warp_reduce(x::T) where T <: Semiring
+    offset = warpsize() ÷ 2
+    while offset > 0
+        x += T(CUDA.shfl_down_sync(CUDA.FULL_MASK, val(x), offset))
+        offset ÷= 2
+    end
+    return x
+end
+"""
+    _cukernel_mul_smdv!(c, rowptr, colval, nzval, b)
+
+CUDA kernel computing `c[r] = Σ nzval[i] * b[colval[i]]` over the stored entries of
+row `r`, with one warp assigned per row.
+
+Each lane starts at its own offset into the row's entry range
+(`rowptr[r]` to `rowptr[r+1] - 1`) and advances by the warp size. The partial results
+are combined with `warp_reduce`, and lane 1 writes the row result. Warps whose row
+index is not below `length(rowptr)` contribute `zero` and write nothing.
+"""
+function _cukernel_mul_smdv!(c, rowptr, colval, nzval, b)
+    threadid = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    warpid = (threadid - 1) ÷ warpsize() + 1
+    lane = ((threadid - 1) % warpsize()) + 1
+    r = warpid # assign one warp per row.
+    acc = zero(eltype(nzval))
+    if r < length(rowptr)
+        @inbounds for i in (rowptr[r] + lane - 1):warpsize():(rowptr[r+1] - 1)
+            acc += nzval[i] * b[colval[i]]
+        end
+    end
+    # Every thread takes part in the reduction, including those without a row.
+    acc = warp_reduce(acc)
+    if lane == 1 && r < length(rowptr)
+        @inbounds c[r] = acc
+    end
+    return
+end
+"""
+    call_csr_spmv_vector_kernel(A, x)
+
+Launch `_cukernel_mul_smdv!` on the CSR fields of `A` (`rowPtr`, `colVal`, `nzVal`)
+and the vector `x`, and return the freshly allocated result vector of length
+`A.dims[1]`.
+
+The kernel is compiled without launching, its launch configuration is queried, and
+the launch requests 32 threads per row, capped per block at the suggested thread count.
+"""
+function call_csr_spmv_vector_kernel(A, x)
+    K = eltype(A.nzVal)
+    n_rows = A.dims[1]
+    col_ids = A.colVal
+    data = A.nzVal
+    row_ptr = A.rowPtr
+    y = CUDA.zeros(K, n_rows)
+    # The kernel maps one warp to each row; 32 is assumed as the warp size here.
+    warp_size = 32
+    ckernel = @cuda launch=false _cukernel_mul_smdv!(y, row_ptr, col_ids, data, x)
+    # The argument of this call was truncated in the patch; `ckernel.fun` is the
+    # documented CUDA.jl occupancy-API usage.
+    config = launch_configuration(ckernel.fun)
+    threads = min(warp_size * n_rows, config.threads)
+    blocks = cld(warp_size * n_rows, threads)
+    ckernel(y, row_ptr, col_ids, data, x; threads=threads, blocks=blocks)
+    return y
+end
+"""
+    _cukernel_mul_smdv2!(c, rowptr, colval, nzval, min_row)
+
+In-place variant of the one-warp-per-row kernel: for every row `r` with
+`min_row <= r < length(rowptr)`, overwrite `c[r]` with `Σ nzval[i] * c[colval[i]]` over
+the stored entries of that row. Rows below `min_row` are left untouched.
+
+NOTE(review): entries of `c` are read while other warps of the same launch may be
+writing their own rows. This presumably relies on an ordering property of the matrix
+— confirm.
+"""
+function _cukernel_mul_smdv2!(c, rowptr, colval, nzval, min_row)
+    threadid = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    warpid = (threadid - 1) ÷ warpsize() + 1
+    lane = ((threadid - 1) % warpsize()) + 1
+    r = warpid # assign one warp per row.
+    acc = zero(eltype(nzval))
+    if r < length(rowptr) && r >= min_row
+        @inbounds for i in (rowptr[r] + lane - 1):warpsize():(rowptr[r+1] - 1)
+            acc += nzval[i] * c[colval[i]]
+        end
+    end
+    # Every thread takes part in the reduction, including those without a row.
+    acc = warp_reduce(acc)
+    if lane == 1 && r < length(rowptr) && r >= min_row
+        @inbounds c[r] = acc
+    end
+    return
+end
+"""
+    call_csr_spmv_vector_kernel2(A, x, min_row)
+
+Launch `_cukernel_mul_smdv2!` on the CSR fields of `A` (`rowPtr`, `colVal`, `nzVal`)
+with `x` as the in-place vector and `min_row` as the first row to update. Returns `x`.
+
+The kernel is compiled without launching, its launch configuration is queried, and
+the launch requests 32 threads per row, capped per block at the suggested thread count.
+"""
+function call_csr_spmv_vector_kernel2(A, x, min_row)
+    n_rows = A.dims[1]
+    col_ids = A.colVal
+    data = A.nzVal
+    row_ptr = A.rowPtr
+    # The kernel maps one warp to each row; 32 is assumed as the warp size here.
+    warp_size = 32
+    ckernel = @cuda launch=false _cukernel_mul_smdv2!(x, row_ptr, col_ids, data, min_row)
+    # The argument of this call was truncated in the patch; `ckernel.fun` is the
+    # documented CUDA.jl occupancy-API usage.
+    config = launch_configuration(ckernel.fun)
+    threads = min(warp_size * n_rows, config.threads)
+    blocks = cld(warp_size * n_rows, threads)
+    ckernel(x, row_ptr, col_ids, data, min_row; threads=threads, blocks=blocks)
+    return x
+end
\ No newline at end of file