Skip to content
Snippets Groups Projects
Commit 77731c31 authored by Pablo Riera's avatar Pablo Riera
Browse files

benchmark update

parent 5f35cb2f
No related branches found
No related tags found
1 merge request: !67 benchmark update
This commit is part of merge request !67. Comments created here will be created in the context of that merge request.
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
NaNStatistics = "b946abbf-3ea7-4610-9019-9858bfdeaf2d"
OpenFst = "3e215157-cce3-4b04-a8eb-cbfb077f1dc8"
Pluto = "c3e4b0f8-55cb-11ea-2926-15256bba5781"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
Semirings = "900aad66-9ca5-44d4-b043-321c62cb7767"
SparseArrayKit = "a9a3c162-d163-4c15-8926-b8794fbefed2"
SumSparseTensors = "472dc678-8c5a-4778-a6eb-28a4e9c7cb58"
TensorFSTs = "9f8e39db-0d6b-46cc-a14d-daf437028eec"
Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
using DataFrames
using Semirings
using TensorFSTs
using CSV
using BenchmarkTools
using OpenFst
using Glob
using IterTools
include("../../TensorFSTs.jl/lib/OpenFstConvert.jl")
include("tulliocompose.jl")
# Machine-type pairs to compose, as (left type, right type). Each entry is matched
# against the `type` column of the machine table further down.
# NOTE(review): this global shares its name with `Base.pairs`.
pairs = [
    ("dense", "topology"),
    ("dense", "charlm"),
    ("dense", "random"),
    ("topology", "dense"),
]
"""
    compbench(compfunc, machineA, machineB, seconds)

Benchmark `compfunc(machineA, machineB)` and return the `times` field of the
resulting trial.

The benchmark is run with `samples=100`, `evals=1` and the given `seconds` budget;
the parameters are fixed explicitly instead of being tuned with `tune!`.
"""
function compbench(compfunc, machineA, machineB, seconds)
    bench = @benchmarkable $compfunc($machineA, $machineB)
    trial = run(bench; samples=100, seconds=seconds, evals=1)
    return trial.times
end
"""
    compbench2(compfunc, machineA, machineB, seconds)

Benchmark `compfunc` on the sparse-array form of two machines and return the `times`
field of the resulting trial.

Both machines are converted with `tensorFST2SparseArray`. The 4th dimension of the
first tensor and the 3rd dimension of the second are then cut to their common extent
before `compfunc` is called with the two tensors and the four remaining converted
values.
"""
function compbench2(compfunc, machineA, machineB, seconds)
    lhs, lhs_a, lhs_w = tensorFST2SparseArray(machineA)
    rhs, rhs_a, rhs_w = tensorFST2SparseArray(machineB)

    # Only the index range present in both operands is kept.
    shared = 1:min(lhs.dims[4], rhs.dims[3])
    lhs_cut = lhs[:, :, :, shared]
    rhs_cut = rhs[:, :, shared, :]

    bench = @benchmarkable $compfunc($lhs_cut, $rhs_cut, $lhs_a, $lhs_w, $rhs_a, $rhs_w)
    trial = run(bench; samples=100, seconds=seconds, evals=1)
    return trial.times
end
"""
    renamepath(path)

Build a short `"<type>-<name>"` label from a machine path.

The second path component is used as the type and the third as the name, with every
occurrence of `".fst"` removed from the name.
"""
function renamepath(path)
    parts = splitpath(path)
    kind = parts[2]
    stem = replace(parts[3], ".fst" => "")
    return string(kind, "-", stem)
end
# ---------------------------------------------------------------------------------------
# Composition benchmark driver.
#
# For every configured pair of machine types, compose each combination of machines with
# OpenFst, store the composed machine, time OpenFst and TensorFSTs composition on the
# same inputs, and finally write summary statistics to "composition_benchmark.csv".
# ---------------------------------------------------------------------------------------

# Root of the MachineZoo checkout, relative to the working directory.
machinezoo_path = "../../MachineZoo.jl/"

# Composed machines are written here. `mkpath` also creates missing parent directories
# and does nothing when the directory already exists.
mkpath(joinpath(machinezoo_path, "machines", "composition"))

# Time budgets (seconds) passed to `compbench` for TensorFSTs and OpenFst respectively.
tseconds = 4
oseconds = 1

# Collect every per-directory machine description into one table.
dfs = []
for path in glob(joinpath(machinezoo_path, "machines/*/fstinfo.csv"))
    info = DataFrame(CSV.File(path))
    push!(dfs, info)
end
# Fail with a clear message instead of an obscure error further down.
if isempty(dfs)
    error("no fstinfo.csv found under $(joinpath(machinezoo_path, "machines"))")
end
df = vcat(dfs...)

# The machine type is the second component of each machine's relative path.
df[!, "type"] = map(x -> splitpath(x)[2], df[!, "file"])

results = []
for pair in pairs
    dfx = df[df.type .== pair[1], :]
    dfy = df[df.type .== pair[2], :]
    for p in product(dfx.file, dfy.file)
        ofstA = OF.read(joinpath(machinezoo_path, p[1]))
        ofstB = OF.read(joinpath(machinezoo_path, p[2]))
        println(p)

        # Pairs whose composition has no states are not benchmarked.
        ofstC = OF.compose(ofstA, ofstB)
        if OF.numstates(ofstC) == 0
            println("skipping")
            continue
        end

        p1 = renamepath(p[1])
        p2 = renamepath(p[2])
        OF.write(
            ofstC,
            joinpath(machinezoo_path, "machines", "composition", "$(p1)-x-$(p2).fst"),
        )

        otimes = compbench(OF.compose, ofstA, ofstB, oseconds)
        print(" of: ")
        print(mean(otimes))

        ttimes = nothing
        try
            ttimes = compbench(
                TF.compose, TF.TensorFST(ofstA), TF.TensorFST(ofstB), tseconds
            )
            print(" tf: ")
            print(mean(ttimes))
        catch err
            # An interrupt (e.g. Ctrl-C in an interactive session) must stop the run
            # rather than be recorded as a failed benchmark.
            err isa InterruptException && rethrow()
            @warn "TensorFSTs composition failed" exception = (err, catch_backtrace())
            println("tf failed")
            # A failed run is recorded as a single infinite sample.
            ttimes = [Inf]
        end

        # tutimes = compbench2(tulliocompose, TF.TensorFST(ofstA), TF.TensorFST(ofstB), tseconds)
        # print(" tu: ")
        # print(mean(tutimes))
        println()

        push!(results, (fileA=p[1], fileB=p[2],
            tmin=minimum(ttimes), tmax=maximum(ttimes), tmean=mean(ttimes),
            tstd=std(ttimes), tlen=length(ttimes),
            omin=minimum(otimes), omax=maximum(otimes), omean=mean(otimes),
            ostd=std(otimes), olen=length(otimes),
            # tumin=minimum(tutimes), tumax=maximum(tutimes), tumean=mean(tutimes), tustd=std(tutimes), tulen=length(tutimes)
        ))
    end
end

CSV.write("composition_benchmark.csv", DataFrame(results))
# Julia project environment used for both benchmark scripts.
JULIA_ENV=./
# Put the locally built OpenFst libraries first on the loader path, keeping any
# entries the caller already had (previously the existing value was discarded).
export LD_LIBRARY_PATH="../../OpenFst.jl/src/:../../OpenFst.jl/openfst-1.8.3/src/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# Run the two benchmarks in sequence.
julia --project="$JULIA_ENV" shortest_distance.jl
julia --project="$JULIA_ENV" composition.jl
\ No newline at end of file
using DataFrames
using Semirings
using TensorFSTs
using CSV
using BenchmarkTools
using OpenFst
using Glob
using SparseArrays
using CUDA
using NaNStatistics
include("../../TensorFSTs.jl/lib/OpenFstConvert.jl")
include("utils.jl")
"""
    sdbench(sdfunc, machine, seconds)

Benchmark `sdfunc(machine)` and return the `times` field of the resulting trial.

The benchmark is run with `samples=100`, `evals=1` and the given `seconds` budget;
the parameters are fixed explicitly instead of being tuned with `tune!`.
"""
function sdbench(sdfunc, machine, seconds)
    bench = @benchmarkable $sdfunc($machine)
    trial = run(bench; samples=100, seconds=seconds, evals=1)
    return trial.times
end
# ---------------------------------------------------------------------------------------
# Shortest-distance benchmark driver.
#
# For every machine listed in the MachineZoo `fstinfo.csv` files, compute the shortest
# distance with OpenFst, TensorFSTs and the CPU/GPU matrix implementations, time every
# implementation whose result matches OpenFst, and write the statistics joined with the
# machine descriptions to "shortest_distance_benchmark.csv".
# ---------------------------------------------------------------------------------------

# Root of the MachineZoo checkout, relative to the working directory.
machinezoo_path = "../../MachineZoo.jl/"

# Time budget (seconds) given to every `sdbench` call below.
tseconds = 4
# NOTE(review): `oseconds` is not used in this script; the OpenFst benchmark below also
# runs with `tseconds`.
oseconds = 1

# Every benchmarked variant, in output-column order. Each result row carries the five
# statistics of every variant, so all rows share the same fields in the same order.
sd_methods = ("ofst", "tfst", "cpufst", "gpufst", "cpufst_acyclic", "gpufst_acyclic")

# Collect every machine description into one table.
dfs = []
for path in glob(joinpath(machinezoo_path, "machines/*/*/fstinfo.csv"))
    info = DataFrame(CSV.File(path))
    push!(dfs, info)
end
df = vcat(dfs...)

results = []
for r in eachrow(df)
    ofst = OF.read(joinpath(machinezoo_path, r["file"]))

    # Skipped: empty machines, machines with more than 100000 arcs, and cyclic machines
    # whose arc type is "log".
    if OF.numstates(ofst) == 0
        continue
    end
    if r["# of arcs"] > 100000
        continue
    end
    if r["cyclic"] == "y" && r["arc type"] == "log"
        continue
    end
    println(r["file"])

    tfst = TF.TensorFST(ofst)
    A_cpu, A_gpu = machine2matrices(tfst)
    times = Dict{String,Any}()

    # Check results: a variant is only timed when it reproduces the OpenFst answer;
    # otherwise a single NaN sample is recorded for it.
    sd0 = OF.shortestdistance(ofst)
    sd1 = TF.shortestdistance(tfst)
    sd2 = cpu_shortest_distance(A_cpu)
    sd3 = cu_shortest_distance(A_gpu)

    times["ofst"] = sdbench(OF.shortestdistance, ofst, tseconds)

    if isapprox(sd0, val.(sd1[:]))
        times["tfst"] = sdbench(TF.shortestdistance, tfst, tseconds)
    else
        times["tfst"] = [NaN]
    end

    if isapprox(sd0, val.(sd2[:]))
        times["cpufst"] = sdbench(cpu_shortest_distance, A_cpu, tseconds)
    else
        times["cpufst"] = [NaN]
    end

    if isapprox(sd0, val.(Array(sd3)))
        times["gpufst"] = sdbench(cu_shortest_distance, A_gpu, tseconds)
    else
        times["gpufst"] = [NaN]
    end

    if r["cyclic"] == "n"
        sd4 = cpu_acyclic_shortest_distance(A_cpu)
        sd5 = cu_acyclic_shortest_distance(A_gpu)

        if isapprox(sd0, val.(sd4[:]))
            times["cpufst_acyclic"] = sdbench(cpu_acyclic_shortest_distance, A_cpu, tseconds)
        else
            times["cpufst_acyclic"] = [NaN]
        end

        if isapprox(sd0, val.(Array(sd5)))
            times["gpufst_acyclic"] = sdbench(cu_acyclic_shortest_distance, A_gpu, tseconds)
        else
            times["gpufst_acyclic"] = [NaN]
        end
    else
        # The acyclic variants are not run here. They are still recorded (as NaN) so
        # that this row has the same fields as the rows of acyclic machines.
        times["cpufst_acyclic"] = [NaN]
        times["gpufst_acyclic"] = [NaN]
    end

    # Summarise each variant; an ordered list of pairs keeps the column order fixed.
    stats = Pair{Symbol,Any}[:file => r["file"]]
    for k in sd_methods
        v = times[k]
        push!(stats, Symbol("$(k)_min") => nanminimum(v))
        push!(stats, Symbol("$(k)_max") => nanmaximum(v))
        push!(stats, Symbol("$(k)_mean") => nanmean(v))
        push!(stats, Symbol("$(k)_std") => nanstd(v))
        push!(stats, Symbol("$(k)_len") => length(filter(!isnan, v)))
    end
    push!(results, NamedTuple(stats))
end

joined = innerjoin(df, DataFrame(results), on = :file)
CSV.write("shortest_distance_benchmark.csv", joined)
# `logaddexp` applied to the inputs scaled by `b`, with the result scaled back by
# `inv(b)`.
function _logaddexp(b, x, y)
    return inv(b) * logaddexp(b*x, b*y)
end

# Tropical semiring: `+` keeps the smaller underlying value, `*` adds the underlying
# values.
function Base.:+(x::S, y::S) where {S<:TropicalSemiring}
    return S(min(val(x), val(y)))
end

function Base.:*(x::S, y::S) where {S<:TropicalSemiring}
    return S(val(x) + val(y))
end

# Log semiring: `+` is `_logaddexp` with the type parameter `b` as scale, `*` adds the
# underlying values.
function Base.:+(x::LogSemiring{T,b}, y::LogSemiring{T,b}) where {T,b}
    return LogSemiring{T,b}(_logaddexp(b, val(x), val(y)))
end

function Base.:*(x::S, y::S) where {S<:LogSemiring}
    return S(val(x) + val(y))
end
"""
    cu_shortest_distance(A)

Iterative shortest-distance computation on the GPU.

Starting from a vector whose first entry is set from the literal `1` (all other
entries are `zero` of `eltype(A)`), the vector is repeatedly pushed through
`call_csr_spmv_vector_kernel(A, ·)` and accumulated. Iteration stops once the
underlying values of the accumulated result are approximately equal to those of the
previous iteration. Returns the accumulated device vector.
"""
function cu_shortest_distance(A)
    seed = zeros(eltype(A), size(A, 2))
    seed[1] = 1
    frontier = CUDA.CuVector(seed)
    total = copy(frontier)
    previous = copy(frontier)
    while true
        frontier = call_csr_spmv_vector_kernel(A, frontier)
        total += frontier
        converged = all(val.(total) ≈ val.(previous))
        # Remember this iteration's result before deciding whether to stop.
        copyto!(previous, total)
        converged && break
    end
    return total
end
## unoptimized (custom csr spmv should be used)
"""
    cpu_acyclic_shortest_distance(A)

Accumulate `u*A`, `u*A*A`, ... for a fixed number of steps equal to `size(A, 1)`.

`u` is a `1 × size(A, 1)` row whose first entry is set from the literal `1` and whose
other entries are `zero` of `eltype(A)`. The returned row is `u` plus every product
computed along the way.
"""
function cpu_acyclic_shortest_distance(A)
    n = size(A, 1)
    step = zeros(eltype(A), (1, n))
    step[1] = 1
    total = copy(step)
    for _ in 1:n
        step = step * A
        total += step
    end
    return total
end
"""
    cu_acyclic_shortest_distance(A)

GPU counterpart of the fixed-step accumulation.

A device vector whose first entry is set from the literal `1` is updated in place by
`call_csr_spmv_vector_kernel2(A, ·, i)` for `i = 1, …, size(A, 1)`, and the vector is
added to the running total after every update. Returns the total.
"""
function cu_acyclic_shortest_distance(A)
    seed = zeros(eltype(A), size(A, 2))
    seed[1] = 1
    frontier = CUDA.CuVector(seed)
    total = copy(frontier)
    for first_row in 1:size(A, 1)
        # Mutates `frontier`; the step index is forwarded as the kernel's `min_row`.
        call_csr_spmv_vector_kernel2(A, frontier, first_row)
        total += frontier
    end
    return total
end
"""
    cpu_shortest_distance(A)

Iterative shortest-distance computation on the CPU.

A `1 × size(A, 1)` row whose first entry is set from the literal `1` is repeatedly
multiplied by `A` and accumulated. Iteration stops as soon as `_has_changed` reports
no difference between the accumulated row and the one from the previous iteration.
Returns the accumulated row.
"""
function cpu_shortest_distance(A)
    step = zeros(eltype(A), (1, size(A, 1)))
    step[1] = 1
    total = copy(step)
    previous = copy(step)
    while true
        step = step * A
        total += step
        settled = !_has_changed(total, previous)
        # Remember this iteration's result before deciding whether to stop.
        copyto!(previous, total)
        settled && break
    end
    return total
end
"""
    _has_changed(x, y)

Return `true` if, at any index of `x`, the underlying values of `x` and `y` are not
approximately equal (`≈`), and `false` otherwise (including when `x` is empty).

The comparison is done entry by entry on `val(x[i])` and `val(y[i])`, using the same
`≈` test as the convergence check in `cu_shortest_distance`.
"""
function _has_changed(x, y)
    for i in eachindex(x)
        # Stop at the first entry that differs.
        if !(val(x[i]) ≈ val(y[i]))
            return true
        end
    end
    return false
end
"""
    machine2matrices(tfst)

Build a CPU sparse matrix and a GPU CSR matrix from `tfst`.

`tfst.M` is summed over dimensions 3 and 4 and sorted; the first and last components
of its stored coordinates become the row and column indices. Returns `(A_cpu, A_gpu)`:
`A_cpu` holds the semiring values, and `A_gpu` is a `CuSparseMatrixCSR{K}` (with
`K = semiring(tfst)`) assembled from the structure of the transposed matrix and the
stored values of `A_cpu`.
"""
function machine2matrices(tfst)
    K = semiring(tfst)
    summed = sort(sum(tfst.M, dims=(3,4)), 1)
    rows = Int32.(first.(summed.nzcoo))
    cols = Int32.(last.(summed.nzcoo))

    A_cpu = sparse(rows, cols, summed.nzval, size(summed)...)

    # to transpose csr: a plain-valued copy is only used to obtain the CSR index
    # structure of the transpose on the device.
    plain = sparse(rows, cols, val.(summed.nzval), size(summed)...)
    structure = CUDA.CUSPARSE.CuSparseMatrixCSR(transpose(plain))

    # cuda: reuse that structure with the semiring-typed values of `A_cpu`.
    # NOTE(review): presumably the stored-value order of `A_cpu` matches the CSR order
    # of the transpose — confirm.
    A_gpu = CUDA.CUSPARSE.CuSparseMatrixCSR{K}(
        structure.rowPtr,
        structure.colVal,
        convert(CuVector{K}, A_cpu.nzval),
        structure.dims,
    )
    return A_cpu, A_gpu
end
# Combine the values held by the lanes of a warp using the semiring `+`.
# In each round the raw value (`val(x)`) of the lane `offset` positions further along
# is fetched with `CUDA.shfl_down_sync`, rewrapped as `T`, and added to `x`.
# `offset` starts at half the warp size and is halved until it reaches zero.
# The callers in this file only use the value returned on lane 1.
function warp_reduce(x::T) where T <: Semiring
offset = warpsize() ÷ 2
while offset > 0
x += T(CUDA.shfl_down_sync(CUDA.FULL_MASK, val(x), offset))
offset ÷= 2
end
x
end
# Device kernel: for each row `r` described by (`rowptr`, `colval`, `nzval`), sum
# `nzval[i] * b[colval[i]]` over the row's stored entries (using the element type's
# `+` and `*`) and store the result in `c[r]`. Returns nothing.
function _cukernel_mul_smdv!(c, rowptr, colval, nzval, b)
# 1-based global thread index across the whole launch.
threadid = (blockIdx().x - 1) * blockDim().x + threadIdx().x
# 1-based warp index and 1-based lane within that warp.
warpid = (threadid - 1) ÷ warpsize() + 1
lane = ((threadid - 1) % warpsize()) + 1
r = warpid # assign one warp per row.
sum = zero(eltype(nzval))
# Warps beyond the last row skip the loop and keep `zero`.
if r < length(rowptr)
# Each lane starts at its own offset into the row and advances by the warp size.
@inbounds for i in (rowptr[r] + lane - 1):warpsize():(rowptr[r+1] - 1)
sum += nzval[i] * b[colval[i]]
end
end
# Called outside the row guard, so every thread takes part in the reduction.
sum = warp_reduce(sum)
# Only the first lane of a warp with a valid row writes the result.
if lane == 1 && r < length(rowptr)
@inbounds c[r] = sum
end
return
end
"""
    call_csr_spmv_vector_kernel(A,x)

Launch `_cukernel_mul_smdv!` on the CSR fields of `A` (`rowPtr`, `colVal`, `nzVal`)
and the vector `x`, and return the freshly allocated result vector of length
`A.dims[1]`.

The launch requests 32 threads per row, capped per block by the limit reported by
`launch_configuration`.
"""
function call_csr_spmv_vector_kernel(A,x)
    rowptr, colval, nzval = A.rowPtr, A.colVal, A.nzVal
    nrows = A.dims[1]
    out = CUDA.zeros(eltype(nzval), nrows)

    # Compile without launching so the launch limits can be queried first.
    kernel = @cuda launch=false _cukernel_mul_smdv!(out, rowptr, colval, nzval, x)
    limits = launch_configuration(kernel.fun)

    wanted = 32 * nrows
    nthreads = min(wanted, limits.threads)
    nblocks = cld(wanted, nthreads)
    kernel(out, rowptr, colval, nzval, x; threads=nthreads, blocks=nblocks)
    return out
end
# Device kernel: in-place variant of the row-times-vector product. For each row
# `r >= min_row` described by (`rowptr`, `colval`, `nzval`), sum
# `nzval[i] * c[colval[i]]` over the row's stored entries and write the result back
# into `c[r]`. Rows below `min_row` are left untouched. Returns nothing.
# NOTE(review): `c` is both read and written, and no synchronisation between warps is
# visible here — confirm that rows never read entries another warp is updating.
function _cukernel_mul_smdv2!(c, rowptr, colval, nzval, min_row)
# 1-based global thread index across the whole launch.
threadid = (blockIdx().x - 1) * blockDim().x + threadIdx().x
# 1-based warp index and 1-based lane within that warp.
warpid = (threadid - 1) ÷ warpsize() + 1
lane = ((threadid - 1) % warpsize()) + 1
r = warpid # assign one warp per row.
sum = zero(eltype(nzval))
# Warps outside the selected row range skip the loop and keep `zero`.
if r < length(rowptr) && r>=min_row
# Each lane starts at its own offset into the row and advances by the warp size.
@inbounds for i in (rowptr[r] + lane - 1):warpsize():(rowptr[r+1] - 1)
sum += nzval[i] * c[colval[i]]
end
end
# Called outside the row guard, so every thread takes part in the reduction.
sum = warp_reduce(sum)
# Only the first lane of a warp with a selected row writes the result.
if lane == 1 && r < length(rowptr) && r>=min_row
@inbounds c[r] = sum
end
return
end
"""
    call_csr_spmv_vector_kernel2(A,x,min_row)

Launch `_cukernel_mul_smdv2!` on the CSR fields of `A` (`rowPtr`, `colVal`, `nzVal`),
updating `x` in place for rows from `min_row` on, and return `x`.

The launch requests 32 threads per row, capped per block by the limit reported by
`launch_configuration`.
"""
function call_csr_spmv_vector_kernel2(A,x,min_row)
    rowptr, colval, nzval = A.rowPtr, A.colVal, A.nzVal
    wanted = 32 * A.dims[1]

    # Compile without launching so the launch limits can be queried first.
    kernel = @cuda launch=false _cukernel_mul_smdv2!(x, rowptr, colval, nzval, min_row)
    limits = launch_configuration(kernel.fun)

    nthreads = min(wanted, limits.threads)
    nblocks = cld(wanted, nthreads)
    kernel(x, rowptr, colval, nzval, min_row; threads=nthreads, blocks=nblocks)
    return x
end
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment