From 77731c315f0f8b660d18de7ece904714b574e963 Mon Sep 17 00:00:00 2001
From: Pablo Riera <pablo.riera@gmail.com>
Date: Mon, 8 Apr 2024 12:09:04 -0300
Subject: [PATCH] benchmark update

---
 benchmark/Project.toml         |  17 +++
 benchmark/composition.jl       | 103 ++++++++++++++++++
 benchmark/run_benchmarks.jl    |   5 +
 benchmark/shortest_distance.jl | 106 ++++++++++++++++++
 benchmark/utils.jl             | 192 +++++++++++++++++++++++++++++++++
 5 files changed, 423 insertions(+)
 create mode 100644 benchmark/Project.toml
 create mode 100644 benchmark/composition.jl
 create mode 100644 benchmark/run_benchmarks.jl
 create mode 100644 benchmark/shortest_distance.jl
 create mode 100644 benchmark/utils.jl

diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 0000000..a4368ec
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,17 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
+IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
+LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
+NaNStatistics = "b946abbf-3ea7-4610-9019-9858bfdeaf2d"
+OpenFst = "3e215157-cce3-4b04-a8eb-cbfb077f1dc8"
+Pluto = "c3e4b0f8-55cb-11ea-2926-15256bba5781"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
+Semirings = "900aad66-9ca5-44d4-b043-321c62cb7767"
+SparseArrayKit = "a9a3c162-d163-4c15-8926-b8794fbefed2"
+SumSparseTensors = "472dc678-8c5a-4778-a6eb-28a4e9c7cb58"
+TensorFSTs = "9f8e39db-0d6b-46cc-a14d-daf437028eec"
+Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
diff --git a/benchmark/composition.jl b/benchmark/composition.jl
new file mode 100644
index 0000000..fd0b73b
--- /dev/null
+++ b/benchmark/composition.jl
@@ -0,0 +1,103 @@
+using DataFrames
+using Semirings
+using TensorFSTs
+using CSV
+using BenchmarkTools
+using OpenFst
+using Glob
+using IterTools
+
+include("../../TensorFSTs.jl/lib/OpenFstConvert.jl")
+include("tulliocompose.jl")
+
+pairs = [("dense","topology"),
+         ("dense","charlm"),
+         ("dense","random"),
+         ("topology","dense")]
+
+function compbench(compfunc, machineA, machineB, seconds)
+    b = @benchmarkable $compfunc($machineA, $machineB)
+    # tune!(b)
+    t = run(b, samples=100, seconds=seconds, evals=1)
+    t.times
+end
+
+function compbench2(compfunc, machineA, machineB, seconds)
+    AM, Aa, Aw = tensorFST2SparseArray(machineA)
+    BM, Ba, Bw = tensorFST2SparseArray(machineB)
+    min_cd = min(AM.dims[4], BM.dims[3])
+    AM = AM[:,:,:,1:min_cd]
+    BM = BM[:,:,1:min_cd,:]
+    b = @benchmarkable $compfunc($AM, $BM, $Aa, $Aw, $Ba, $Bw)
+    # tune!(b)
+    t = run(b, samples=100, seconds=seconds, evals=1)
+    t.times
+end
+
+function renamepath(path)
+    type = splitpath(path)[2]
+    name = replace(splitpath(path)[3], ".fst" => "")
+    type * "-" * name
+end
+
+machinezoo_path = "../../MachineZoo.jl/"
+
+if !isdir(joinpath(machinezoo_path, "machines", "composition"))
+    mkdir(joinpath(machinezoo_path, "machines", "composition"))
+end
+
+tseconds = 4
+oseconds = 1
+
+dfs = []
+for path in glob(joinpath(machinezoo_path, "machines/*/fstinfo.csv"))
+    df = DataFrame(CSV.File(path))
+    push!(dfs, df)
+end
+df = vcat(dfs...)
+df[!,"type"] = map(x -> splitpath(x)[2], df[!,"file"])
+
+results = []
+for pair in pairs
+    dfx = df[df.type.==pair[1],:]
+    dfy = df[df.type.==pair[2],:]
+    for p in product(dfx.file, dfy.file)
+        ofstA = OF.read(joinpath(machinezoo_path, p[1]))
+        ofstB = OF.read(joinpath(machinezoo_path, p[2]))
+        println(p)
+        ofstC = OF.compose(ofstA, ofstB)
+        if OF.numstates(ofstC) == 0
+            println("skipping")
+            continue
+        end
+        p1 = renamepath(p[1])
+        p2 = renamepath(p[2])
+        OF.write(ofstC, joinpath(machinezoo_path, "machines", "composition", "$(p1)-x-$(p2).fst"))
+
+        otimes = compbench(OF.compose, ofstA, ofstB, oseconds)
+        print(" of: ")
+        print(mean(otimes))
+        ttimes = nothing
+        try
+            ttimes = compbench(TF.compose, TF.TensorFST(ofstA), TF.TensorFST(ofstB), tseconds)
+            print(" tf: ")
+            print(mean(ttimes))
+        catch
+            println("tf failed")
+            ttimes = [Inf]
+        end
+        # tutimes = compbench2(tulliocompose, TF.TensorFST(ofstA), TF.TensorFST(ofstB), tseconds)
+        # print(" tu: ")
+        # print(mean(tutimes))
+        println()
+
+        push!(results, (fileA=p[1], fileB=p[2],
+            tmin=minimum(ttimes), tmax=maximum(ttimes), tmean=mean(ttimes), tstd=std(ttimes), tlen=length(ttimes),
+            omin=minimum(otimes), omax=maximum(otimes), omean=mean(otimes), ostd=std(otimes), olen=length(otimes),
+            # tumin=minimum(tutimes), tumax=maximum(tutimes), tumean=mean(tutimes), tustd=std(tutimes), tulen=length(tutimes)
+
+        ))
+    end
+end
+
+CSV.write("composition_benchmark.csv", DataFrame(results))
diff --git a/benchmark/run_benchmarks.jl b/benchmark/run_benchmarks.jl
new file mode 100644
index 0000000..fb6f033
--- /dev/null
+++ b/benchmark/run_benchmarks.jl
@@ -0,0 +1,5 @@
+JULIA_ENV=./
+export LD_LIBRARY_PATH=../../OpenFst.jl/src/:../../OpenFst.jl/openfst-1.8.3/src/lib
+
+julia --project=$JULIA_ENV shortest_distance.jl
+julia --project=$JULIA_ENV composition.jl
\ No newline at end of file
diff --git a/benchmark/shortest_distance.jl b/benchmark/shortest_distance.jl
new file mode 100644
index 0000000..9694646
--- /dev/null
+++ b/benchmark/shortest_distance.jl
@@ -0,0 +1,106 @@
+using DataFrames
+using Semirings
+using TensorFSTs
+using CSV
+using BenchmarkTools
+using OpenFst
+using Glob
+using SparseArrays
+using CUDA
+using NaNStatistics
+
+include("../../TensorFSTs.jl/lib/OpenFstConvert.jl")
+include("utils.jl")
+
+function sdbench(sdfunc, machine, seconds)
+    b = @benchmarkable $sdfunc($machine)
+    # tune!(b)
+    t = run(b, samples=100, seconds=seconds, evals=1)
+    t.times
+end
+
+machinezoo_path = "../../MachineZoo.jl/"
+tseconds = 4
+oseconds = 1
+
+dfs = []
+for path in glob(joinpath(machinezoo_path, "machines/*/*/fstinfo.csv"))
+    df = DataFrame(CSV.File(path))
+    push!(dfs, df)
+end
+df = vcat(dfs...)
+
+results = []
+for r in eachrow(df)
+    ofst = OF.read(joinpath(machinezoo_path, r["file"]))
+
+    if OF.numstates(ofst) == 0
+        continue
+    end
+    if r["# of arcs"] > 100000
+        continue
+    end
+
+    if r["cyclic"] == "y" && r["arc type"] == "log"
+        continue
+    end
+
+    println(r["file"])
+    tfst = TF.TensorFST(ofst)
+    A_cpu, A_gpu = machine2matrices(tfst)
+    times = Dict()
+
+    # check results
+
+    sd0 = OF.shortestdistance(ofst)
+    sd1 = TF.shortestdistance(tfst)
+    sd2 = cpu_shortest_distance(A_cpu)
+    sd3 = cu_shortest_distance(A_gpu)
+
+    times["ofst"] = sdbench(OF.shortestdistance, ofst, tseconds)
+
+    if isapprox(sd0, val.(sd1[:]))
+        times["tfst"] = sdbench(TF.shortestdistance, tfst, tseconds)
+    else
+        times["tfst"] = [NaN]
+    end
+    if isapprox(sd0, val.(sd2[:]))
+        times["cpufst"] = sdbench(cpu_shortest_distance, A_cpu, tseconds)
+    else
+        times["cpufst"] = [NaN]
+    end
+    if isapprox(sd0, val.(Array(sd3)))
+        times["gpufst"] = sdbench(cu_shortest_distance, A_gpu, tseconds)
+    else
+        times["gpufst"] = [NaN]
+    end
+
+    if r["cyclic"] == "n"
+        sd4 = cpu_acyclic_shortest_distance(A_cpu)
+        sd5 = cu_acyclic_shortest_distance(A_gpu)
+        if isapprox(sd0, val.(sd4[:]))
+            times["cpufst_acyclic"] = sdbench(cpu_acyclic_shortest_distance, A_cpu, tseconds)
+        else
+            times["cpufst_acyclic"] = [NaN]
+        end
+        if isapprox(sd0, val.(Array(sd5)))
+            times["gpufst_acyclic"] = sdbench(cu_acyclic_shortest_distance, A_gpu, tseconds)
+        else
+            times["gpufst_acyclic"] = [NaN]
+        end
+    end
+
+    stats = Dict()
+    stats[Symbol("file")] = r["file"]
+    for (k,v) in times
+        stats[Symbol("$(k)_min")] = nanminimum(v)
+        stats[Symbol("$(k)_max")] = nanmaximum(v)
+        stats[Symbol("$(k)_mean")] = nanmean(v)
+        stats[Symbol("$(k)_std")] = nanstd(v)
+        stats[Symbol("$(k)_len")] = length(filter(!isnan, v))
+    end
+    push!(results, NamedTuple(stats))
+end
+
+joined = innerjoin(df, DataFrame(results), on = :file)
+CSV.write("shortest_distance_benchmark.csv", joined)
diff --git a/benchmark/utils.jl b/benchmark/utils.jl
new file mode 100644
index 0000000..c47d666
--- /dev/null
+++ b/benchmark/utils.jl
@@ -0,0 +1,192 @@
+_logaddexp(b, x, y) = inv(b) * logaddexp(b*x, b*y)
+Base.:+(x::S, y::S) where S<:TropicalSemiring = S(min(val(x), val(y)))
+Base.:*(x::S, y::S) where S<:TropicalSemiring = S(val(x) + val(y))
+Base.:+(x::LogSemiring{T,b}, y::LogSemiring{T,b}) where {T,b} = LogSemiring{T,b}(_logaddexp(b, val(x), val(y)))
+Base.:*(x::S, y::S) where S<:LogSemiring = S(val(x) + val(y))
+
+function cu_shortest_distance(A)
+    K = eltype(A)
+    xk = zeros(K, size(A)[2])
+    xk[1] = 1
+    u_n = CUDA.CuVector(xk)
+    res = similar(u_n)
+    prevres = similar(u_n)
+    copyto!(res, u_n)
+    copyto!(prevres, u_n)
+    stop = false
+    while !stop
+        u_n = call_csr_spmv_vector_kernel(A, u_n)
+        res += u_n
+        stop = all(val.(res) ≈ val.(prevres))
+        copyto!(prevres, res)
+    end
+    res
+end
+
+## unoptimized (custom csr spmv should be used)
+function cpu_acyclic_shortest_distance(A)
+    K = eltype(A)
+    u_n = zeros(K, (1, size(A)[1]))
+    u_n[1] = 1
+    res = similar(u_n)
+    copyto!(res, u_n)
+    for i in 1:size(A)[1]
+        u_n = u_n*A
+        res += u_n
+    end
+    res
+end
+
+function cu_acyclic_shortest_distance(A)
+    K = eltype(A)
+    xk = zeros(K, size(A)[2])
+    xk[1] = 1
+    u_n = CUDA.CuVector(xk)
+    res = similar(u_n)
+    copyto!(res, u_n)
+    for i in 1:size(A)[1]
+        call_csr_spmv_vector_kernel2(A, u_n, i)
+        res += u_n
+    end
+    res
+end
+
+function cpu_shortest_distance(A)
+    K = eltype(A)
+    u_n = zeros(K, (1, size(A)[1]))
+    u_n[1] = 1
+    res = similar(u_n)
+    prevres = similar(u_n)
+    copyto!(res, u_n)
+    copyto!(prevres, u_n)
+    stop = false
+    c = 0
+    while !stop
+        c += 1
+        u_n = u_n*A
+        res += u_n
+        stop = !_has_changed(res, prevres)
+        copyto!(prevres, res)
+    end
+    res
+end
+
+function _has_changed(x, y)
+    changed = false
+    for i in eachindex(x)
+        if !(val(x[i]) ≈ val(y[i]))
+            changed = true
+            break
+        end
+    end
+    changed
+end
+
+function machine2matrices(tfst)
+    K = semiring(tfst)
+
+    Ma = sort(sum(tfst.M, dims=(3,4)), 1)
+
+    row_ids = Int32.(first.(Ma.nzcoo))
+    col_ids = Int32.(last.(Ma.nzcoo))
+    A_cpu = sparse(row_ids, col_ids, Ma.nzval, size(Ma)...)
+
+    # to transpose csr
+    A = sparse(row_ids, col_ids, val.(Ma.nzval), size(Ma)...)
+    A_d = CUDA.CUSPARSE.CuSparseMatrixCSR(transpose(A))
+
+    # cuda
+    A_gpu = CUDA.CUSPARSE.CuSparseMatrixCSR{K}(
+        A_d.rowPtr,
+        A_d.colVal,
+        convert(CuVector{K}, A_cpu.nzval),
+        A_d.dims);
+
+    A_cpu, A_gpu
+end
+
+function warp_reduce(x::T) where T <: Semiring
+    offset = warpsize() ÷ 2
+    while offset > 0
+        x += T(CUDA.shfl_down_sync(CUDA.FULL_MASK, val(x), offset))
+        offset ÷= 2
+    end
+    x
+end
+
+function _cukernel_mul_smdv!(c, rowptr, colval, nzval, b)
+
+    threadid = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    warpid = (threadid - 1) ÷ warpsize() + 1
+    lane = ((threadid - 1) % warpsize()) + 1
+
+    r = warpid # assign one warp per row.
+    sum = zero(eltype(nzval))
+    if r < length(rowptr)
+        @inbounds for i in (rowptr[r] + lane - 1):warpsize():(rowptr[r+1] - 1)
+            sum += nzval[i] * b[colval[i]]
+        end
+    end
+
+    sum = warp_reduce(sum)
+    if lane == 1 && r < length(rowptr)
+        @inbounds c[r] = sum
+    end
+
+    return
+end
+
+function call_csr_spmv_vector_kernel(A, x)
+    K = eltype(A.nzVal)
+    n_rows = A.dims[1]
+    col_ids = A.colVal
+    data = A.nzVal
+    row_ptr = A.rowPtr
+
+    y = CUDA.zeros(K, A.dims[1])
+    warp_size = 32
+    ckernel = @cuda launch=false _cukernel_mul_smdv!(y, row_ptr, col_ids, data, x)
+    config = launch_configuration(ckernel.fun)
+    threads = min(warp_size * n_rows, config.threads)
+    blocks = cld(warp_size * n_rows, threads)
+    ckernel(y, row_ptr, col_ids, data, x; threads=threads, blocks=blocks)
+    y
+end
+
+
+function _cukernel_mul_smdv2!(c, rowptr, colval, nzval, min_row)
+
+    threadid = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    warpid = (threadid - 1) ÷ warpsize() + 1
+    lane = ((threadid - 1) % warpsize()) + 1
+
+    r = warpid # assign one warp per row.
+    sum = zero(eltype(nzval))
+    if r < length(rowptr) && r >= min_row
+        @inbounds for i in (rowptr[r] + lane - 1):warpsize():(rowptr[r+1] - 1)
+            sum += nzval[i] * c[colval[i]]
+        end
+    end
+    sum = warp_reduce(sum)
+
+    if lane == 1 && r < length(rowptr) && r >= min_row
+        @inbounds c[r] = sum
+    end
+
+    return
+end
+
+function call_csr_spmv_vector_kernel2(A, x, min_row)
+    n_rows = A.dims[1]
+    col_ids = A.colVal
+    data = A.nzVal
+    row_ptr = A.rowPtr
+
+    warp_size = 32
+    ckernel = @cuda launch=false _cukernel_mul_smdv2!(x, row_ptr, col_ids, data, min_row)
+    config = launch_configuration(ckernel.fun)
+    threads = min(warp_size * n_rows, config.threads)
+    blocks = cld(warp_size * n_rows, threads)
+    ckernel(x, row_ptr, col_ids, data, min_row; threads=threads, blocks=blocks)
+    x
+end
\ No newline at end of file
--
GitLab
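
Note on the algorithm (illustration only, not part of the patch): the shortest-distance routines in benchmark/utils.jl all run the same semiring power iteration. They start with mass on the initial state, repeatedly apply u_n = u_n * A over the state-transition matrix, and accumulate res += u_n; the cyclic variants stop when res stops changing, while the acyclic variants simply run one pass per state. The sketch below shows that iteration in self-contained Julia over a hand-rolled tropical (min, +) semiring, so it runs without Semirings.jl, TensorFSTs.jl, or CUDA. The names Trop and shortest_distance_sketch are hypothetical and exist only for this illustration; it takes the acyclic, fixed-iteration approach.

    # Tropical semiring: "plus" is min, "times" is +, zero is Inf, one is 0.0.
    struct Trop
        v::Float64
    end
    Base.:+(a::Trop, b::Trop) = Trop(min(a.v, b.v))
    Base.:*(a::Trop, b::Trop) = Trop(a.v + b.v)
    tropzero() = Trop(Inf)   # neutral element of min (no path)
    tropone()  = Trop(0.0)   # neutral element of +   (empty path)

    # A[p, q] holds the arc weight from state p to state q (Trop(Inf) means no arc).
    # Each pass extends every accumulated path by one arc, mirroring the
    # u_n = u_n * A; res += u_n loop of cpu_shortest_distance in utils.jl.
    function shortest_distance_sketch(A::Matrix{Trop}; maxiter = size(A, 1))
        n = size(A, 1)
        u = [q == 1 ? tropone() : tropzero() for q in 1:n]   # mass on the start state
        res = copy(u)
        for _ in 1:maxiter
            u = [reduce(+, (u[p] * A[p, q] for p in 1:n)) for q in 1:n]   # u times A in the semiring
            res = res .+ u
        end
        res
    end

    # Tiny 3-state chain 1 -(0.5)-> 2 -(1.5)-> 3: distances are 0.0, 0.5, 2.0.
    A = fill(tropzero(), 3, 3)
    A[1, 2] = Trop(0.5)
    A[2, 3] = Trop(1.5)
    println([d.v for d in shortest_distance_sketch(A)])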