_logaddexp(b, x, y) = inv(b) * logaddexp(b*x, b*y) Base.:+(x::S, y::S) where S<:TropicalSemiring = S(min(val(x), val(y))) Base.:*(x::S, y::S) where S<:TropicalSemiring = S(val(x) + val(y)) Base.:+(x::LogSemiring{T,b}, y::LogSemiring{T,b}) where {T,b} = LogSemiring{T,b}(_logaddexp(b, val(x), val(y))) Base.:*(x::S, y::S) where S<:LogSemiring = S(val(x) + val(y)) function cu_shortest_distance(A) K = eltype(A) xk = zeros(K,size(A)[2]) xk[1] = 1 u_n = CUDA.CuVector(xk) res = similar(u_n) prevres = similar(u_n) copyto!(res, u_n) copyto!(prevres, u_n) stop = false while ! stop u_n = call_csr_spmv_vector_kernel(A, u_n) res += u_n stop = all(val.(res)≈val.(prevres)) copyto!(prevres, res) end res end ## unoptimized (custom csr spmv should be used) function cpu_acyclic_shortest_distance(A) K = eltype(A) u_n = zeros(K,(1,size(A)[1])) u_n[1] = 1 res = similar(u_n) copyto!(res, u_n) for i in 1:size(A)[1] u_n = u_n*A res += u_n end res end function cu_acyclic_shortest_distance(A) K = eltype(A) xk = zeros(K,size(A)[2]) xk[1] = 1 u_n = CUDA.CuVector(xk) res = similar(u_n) copyto!(res, u_n) for i in 1:size(A)[1] call_csr_spmv_vector_kernel2(A, u_n, i) res += u_n end res end function cpu_shortest_distance(A) K = eltype(A) u_n = zeros(K,(1,size(A)[1])) u_n[1] = 1 res = similar(u_n) prevres = similar(u_n) copyto!(res, u_n) copyto!(prevres, u_n) stop = false c=0 while ! stop c +=1 u_n = u_n*A res += u_n stop = !_has_changed(res, prevres) copyto!(prevres, res) end res end function _has_changed(x, y) changed = false for i in eachindex(x) if ! (val(x[i]) ≈ val(y[i])) changed = true break end end changed end function machine2matrices(tfst) K = semiring(tfst) Ma = sort(sum(tfst.M, dims=(3,4)),1) row_ids = Int32.(first.(Ma.nzcoo)) col_ids = Int32.(last.(Ma.nzcoo)) A_cpu = sparse(row_ids, col_ids, Ma.nzval, size(Ma)...) #to transpose csr A = sparse(row_ids, col_ids, val.(Ma.nzval), size(Ma)...) A_d = CUDA.CUSPARSE.CuSparseMatrixCSR(transpose(A)) #cuda A_gpu=CUDA.CUSPARSE.CuSparseMatrixCSR{K}( A_d.rowPtr, A_d.colVal, convert(CuVector{K}, A_cpu.nzval), A_d.dims); A_cpu, A_gpu end function warp_reduce(x::T) where T <: Semiring offset = warpsize() ÷ 2 while offset > 0 x += T(CUDA.shfl_down_sync(CUDA.FULL_MASK, val(x), offset)) offset ÷= 2 end x end function _cukernel_mul_smdv!(c, rowptr, colval, nzval, b) threadid = (blockIdx().x - 1) * blockDim().x + threadIdx().x warpid = (threadid - 1) ÷ warpsize() + 1 lane = ((threadid - 1) % warpsize()) + 1 r = warpid # assign one warp per row. sum = zero(eltype(nzval)) if r < length(rowptr) @inbounds for i in (rowptr[r] + lane - 1):warpsize():(rowptr[r+1] - 1) sum += nzval[i] * b[colval[i]] end end sum = warp_reduce(sum) if lane == 1 && r < length(rowptr) @inbounds c[r] = sum end return end function call_csr_spmv_vector_kernel(A,x) K = eltype(A.nzVal) n_rows = A.dims[1] col_ids = A.colVal data = A.nzVal row_ptr = A.rowPtr y = CUDA.zeros(K,A.dims[1]) warp_size = 32 ckernel = @cuda launch=false _cukernel_mul_smdv!(y, row_ptr, col_ids, data, x) config = launch_configuration(ckernel.fun) threads = min(warp_size * n_rows, config.threads) blocks = cld(warp_size * n_rows, threads) ckernel(y, row_ptr, col_ids, data, x; threads=threads,blocks=blocks) y end function _cukernel_mul_smdv2!(c, rowptr, colval, nzval, min_row) threadid = (blockIdx().x - 1) * blockDim().x + threadIdx().x warpid = (threadid - 1) ÷ warpsize() + 1 lane = ((threadid - 1) % warpsize()) + 1 r = warpid # assign one warp per row. sum = zero(eltype(nzval)) if r < length(rowptr) && r>=min_row @inbounds for i in (rowptr[r] + lane - 1):warpsize():(rowptr[r+1] - 1) sum += nzval[i] * c[colval[i]] end end sum = warp_reduce(sum) if lane == 1 && r < length(rowptr) && r>=min_row @inbounds c[r] = sum end return end function call_csr_spmv_vector_kernel2(A,x,min_row) n_rows = A.dims[1] col_ids = A.colVal data = A.nzVal row_ptr = A.rowPtr warp_size = 32 ckernel = @cuda launch=false _cukernel_mul_smdv2!(x, row_ptr, col_ids, data, min_row) config = launch_configuration(ckernel.fun) threads = min(warp_size * n_rows, config.threads) blocks = cld(warp_size * n_rows, threads) ckernel(x, row_ptr, col_ids, data,min_row; threads=threads,blocks=blocks) x end