utils.jl

_logaddexp(b, x, y) = inv(b) * logaddexp(b*x, b*y)
Base.:+(x::S, y::S) where S<:TropicalSemiring = S(min(val(x), val(y)))
Base.:*(x::S, y::S) where S<:TropicalSemiring = S(val(x) + val(y))
Base.:+(x::LogSemiring{T,b}, y::LogSemiring{T,b}) where {T,b} = LogSemiring{T,b}(_logaddexp(b, val(x), val(y)))
Base.:*(x::S, y::S) where S<:LogSemiring = S(val(x) + val(y))

function cu_shortest_distance(A)
	K = eltype(A)	
	xk = zeros(K,size(A)[2])
	xk[1] = 1
	u_n = CUDA.CuVector(xk)
    res = similar(u_n)
    prevres = similar(u_n)
    copyto!(res, u_n)
    copyto!(prevres, u_n)
    stop = false
    while ! stop
        u_n = call_csr_spmv_vector_kernel(A, u_n)
        res += u_n
        stop = all(val.(res)≈val.(prevres))
        copyto!(prevres, res)
    end
    res	
end

## unoptimized (custom csr spmv should be used)
function cpu_acyclic_shortest_distance(A)
	K = eltype(A)
	u_n = zeros(K,(1,size(A)[1]))
	u_n[1] = 1
	res = similar(u_n)
	copyto!(res, u_n)
    for i in 1:size(A)[1]
	    u_n = u_n*A
		res += u_n
    end
    res
end

function cu_acyclic_shortest_distance(A)
	K = eltype(A)	
	xk = zeros(K,size(A)[2])
	xk[1] = 1
	u_n = CUDA.CuVector(xk)
	res = similar(u_n)
	copyto!(res, u_n)
    for i in 1:size(A)[1]
        call_csr_spmv_vector_kernel2(A, u_n, i)
		res += u_n
    end
    res
end

function cpu_shortest_distance(A)
    K = eltype(A)
	u_n = zeros(K,(1,size(A)[1]))
	u_n[1] = 1
    res = similar(u_n)
    prevres = similar(u_n)
    copyto!(res, u_n)
    copyto!(prevres, u_n)
    stop = false
	c=0
    while ! stop
		c +=1 
        u_n = u_n*A
        res += u_n
		stop = !_has_changed(res, prevres)
        copyto!(prevres, res)
    end
    res	
end

function _has_changed(x, y)
    changed = false
    for i in eachindex(x)
        if ! (val(x[i]) ≈ val(y[i]))
            changed = true
            break
        end
    end
    changed
end

function machine2matrices(tfst)
	K = semiring(tfst)	
	
	Ma = sort(sum(tfst.M, dims=(3,4)),1)

	row_ids = Int32.(first.(Ma.nzcoo))
	col_ids = Int32.(last.(Ma.nzcoo))
	A_cpu = sparse(row_ids, col_ids, Ma.nzval, size(Ma)...)

	#to transpose csr
	A = sparse(row_ids, col_ids, val.(Ma.nzval), size(Ma)...)
	A_d = CUDA.CUSPARSE.CuSparseMatrixCSR(transpose(A))
	
	#cuda	
	A_gpu=CUDA.CUSPARSE.CuSparseMatrixCSR{K}(
		A_d.rowPtr,
		A_d.colVal,
		convert(CuVector{K}, A_cpu.nzval),
		A_d.dims);
	
	A_cpu, A_gpu 
end

function warp_reduce(x::T) where T <: Semiring
	offset = warpsize() ÷ 2
	while offset > 0
        x += T(CUDA.shfl_down_sync(CUDA.FULL_MASK, val(x), offset))
		offset ÷= 2
	end
    x
end

function _cukernel_mul_smdv!(c, rowptr, colval, nzval, b)

    threadid = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    warpid = (threadid - 1) ÷ warpsize() + 1
    lane = ((threadid - 1) % warpsize()) + 1

    r = warpid # assign one warp per row.
    sum = zero(eltype(nzval))
    if r < length(rowptr)
        @inbounds for i in (rowptr[r] + lane - 1):warpsize():(rowptr[r+1] - 1)
            sum += nzval[i] * b[colval[i]]
        end
    end

    sum = warp_reduce(sum)
    if lane == 1 && r < length(rowptr)
        @inbounds c[r] = sum
    end

    return
end

function call_csr_spmv_vector_kernel(A,x)
    K = eltype(A.nzVal)
    n_rows = A.dims[1]
    col_ids = A.colVal
    data = A.nzVal
    row_ptr = A.rowPtr
    
    y = CUDA.zeros(K,A.dims[1])
    warp_size = 32
    ckernel = @cuda launch=false _cukernel_mul_smdv!(y, row_ptr, col_ids, data, x)
    config = launch_configuration(ckernel.fun)
    threads = min(warp_size * n_rows, config.threads)
    blocks = cld(warp_size * n_rows, threads)
    ckernel(y, row_ptr, col_ids, data, x; threads=threads,blocks=blocks)
    y
end


function _cukernel_mul_smdv2!(c, rowptr, colval, nzval, min_row)

    threadid = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    warpid = (threadid - 1) ÷ warpsize() + 1
    lane = ((threadid - 1) % warpsize()) + 1

    r = warpid # assign one warp per row.
    sum = zero(eltype(nzval))
    if r < length(rowptr) && r>=min_row
        @inbounds for i in (rowptr[r] + lane - 1):warpsize():(rowptr[r+1] - 1)
            sum += nzval[i] * c[colval[i]]
        end
    end
    sum = warp_reduce(sum)
    
    if lane == 1 && r < length(rowptr) && r>=min_row
        @inbounds c[r] = sum
    end

    return
end 

function call_csr_spmv_vector_kernel2(A,x,min_row)  
    n_rows = A.dims[1]
    col_ids = A.colVal
    data = A.nzVal
    row_ptr = A.rowPtr
        
    warp_size = 32
    ckernel = @cuda launch=false  _cukernel_mul_smdv2!(x, row_ptr, col_ids, data, min_row)
    config = launch_configuration(ckernel.fun)
    threads = min(warp_size * n_rows, config.threads)
    blocks = cld(warp_size * n_rows, threads)
    ckernel(x, row_ptr, col_ids, data,min_row; threads=threads,blocks=blocks)
    x
end