Merge pull request #4 from pxl-th/pxl-th/cleanup

anicusan · web-flow · commit ea03d1661e78 · 2024-10-13T02:45:06.000+01:00
Remove redundant @Const and @inbounds macros
diff --git a/src/accumulate.jl b/src/accumulate.jl
@@ -12,8 +12,8 @@ const ACC_FLAG_P::Int8 = 1          # Only current block's prefix available
 end
 
 
-@kernel cpu=false inbounds=true function _accumulate_block!(@Const(op), v, @Const(init),
-                                                            @Const(inclusive),
+@kernel cpu=false inbounds=true function _accumulate_block!(op, v, init,
+                                                            inclusive,
                                                             flags, prefixes)  # one per block
 
     # NOTE: shmem_size MUST be greater than 2 * block_size
@@ -139,8 +139,7 @@ end
 end
 
 
-@kernel cpu=false inbounds=true function _accumulate_previous!(@Const(op), v, @Const(init),
-                                                               flags, @Const(prefixes))
+@kernel cpu=false inbounds=true function _accumulate_previous!(op, v, init, flags, @Const(prefixes))
 
     len = length(v)
     block_size = @groupsize()[1]
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
@@ -1,4 +1,4 @@
-@kernel inbounds=true cpu=false function _mapreduce_block!(@Const(src), dst, @Const(f), @Const(op), @Const(init))
+@kernel inbounds=true cpu=false function _mapreduce_block!(@Const(src), dst, f, op, init)
 
     N = @groupsize()[1]
     sdata = @localmem eltype(dst) (N,)
@@ -26,41 +26,41 @@
     @synchronize()
 
     if N >= 512
-        ithread < 256 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 256 + 1]))
+        ithread < 256 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 256 + 1]))
         @synchronize()
     end
     if N >= 256
-        ithread < 128 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 128 + 1]))
+        ithread < 128 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 128 + 1]))
         @synchronize()
     end
     if N >= 128
-        ithread < 64 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 64 + 1]))
+        ithread < 64 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 64 + 1]))
         @synchronize()
     end
 
     # CUDA has a warp size of 32, AMD a "wavefront" of 64, and Intel Graphics messes it up
     if N >= 64
-        ithread < 32 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 32 + 1]))
+        ithread < 32 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 32 + 1]))
         @synchronize()
     end
     if N >= 32
-        ithread < 16 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 16 + 1]))
+        ithread < 16 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 16 + 1]))
         @synchronize()
     end
     if N >= 16
-        ithread < 8 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 8 + 1]))
+        ithread < 8 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 8 + 1]))
         @synchronize()
     end
     if N >= 8
-        ithread < 4 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 4 + 1]))
+        ithread < 4 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 4 + 1]))
         @synchronize()
     end
     if N >= 4
-        ithread < 2 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 2 + 1]))
+        ithread < 2 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 2 + 1]))
         @synchronize()
     end
     if N >= 2
-        ithread < 1 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 1 + 1]))
+        ithread < 1 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 1 + 1]))
         @synchronize()
     end
 
diff --git a/src/reduce.jl b/src/reduce.jl
@@ -1,4 +1,4 @@
-@kernel inbounds=true cpu=false function _reduce_block!(@Const(src), dst, @Const(op), @Const(init))
+@kernel inbounds=true cpu=false function _reduce_block!(@Const(src), dst, op, init)
 
     N = @groupsize()[1]
     sdata = @localmem eltype(dst) (N,)
@@ -26,41 +26,41 @@
     @synchronize()
 
     if N >= 512
-        ithread < 256 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 256 + 1]))
+        ithread < 256 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 256 + 1]))
         @synchronize()
     end
     if N >= 256
-        ithread < 128 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 128 + 1]))
+        ithread < 128 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 128 + 1]))
         @synchronize()
     end
     if N >= 128
-        ithread < 64 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 64 + 1]))
+        ithread < 64 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 64 + 1]))
         @synchronize()
     end
 
     # CUDA has a warp size of 32, AMD a "wavefront" of 64, and Intel Graphics messes it up
     if N >= 64
-        ithread < 32 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 32 + 1]))
+        ithread < 32 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 32 + 1]))
         @synchronize()
     end
     if N >= 32
-        ithread < 16 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 16 + 1]))
+        ithread < 16 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 16 + 1]))
         @synchronize()
     end
     if N >= 16
-        ithread < 8 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 8 + 1]))
+        ithread < 8 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 8 + 1]))
         @synchronize()
     end
     if N >= 8
-        ithread < 4 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 4 + 1]))
+        ithread < 4 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 4 + 1]))
         @synchronize()
     end
     if N >= 4
-        ithread < 2 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 2 + 1]))
+        ithread < 2 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 2 + 1]))
         @synchronize()
     end
     if N >= 2
-        ithread < 1 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 1 + 1]))
+        ithread < 1 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 1 + 1]))
         @synchronize()
     end
 
diff --git a/src/truth.jl b/src/truth.jl
@@ -1,5 +1,5 @@
 # TODO: this hangs / dies on oneAPI. Test on CUDA
-@kernel cpu=false inbounds=true function _any_global!(out, @Const(pred), @Const(v))
+@kernel cpu=false inbounds=true function _any_global!(out, pred, @Const(v))
     temp = @localmem Int8 (1,)
     i = @index(Global, Linear)