Skip to content

Commit ea03d16

Browse files
authored
Merge pull request #4 from pxl-th/pxl-th/cleanup
Remove redundant Const and inbounds macros
2 parents: 8f240fe + ad52869; commit ea03d16

File tree

4 files changed

+24
-25
lines changed

4 files changed

+24
-25
lines changed

src/accumulate.jl

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,8 @@ const ACC_FLAG_P::Int8 = 1 # Only current block's prefix available
1212
end
1313

1414

15-
@kernel cpu=false inbounds=true function _accumulate_block!(@Const(op), v, @Const(init),
16-
@Const(inclusive),
15+
@kernel cpu=false inbounds=true function _accumulate_block!(op, v, init,
16+
inclusive,
1717
flags, prefixes) # one per block
1818

1919
# NOTE: shmem_size MUST be greater than 2 * block_size
@@ -139,8 +139,7 @@ end
139139
end
140140

141141

142-
@kernel cpu=false inbounds=true function _accumulate_previous!(@Const(op), v, @Const(init),
143-
flags, @Const(prefixes))
142+
@kernel cpu=false inbounds=true function _accumulate_previous!(op, v, init, flags, @Const(prefixes))
144143

145144
len = length(v)
146145
block_size = @groupsize()[1]

src/mapreduce.jl

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
@kernel inbounds=true cpu=false function _mapreduce_block!(@Const(src), dst, @Const(f), @Const(op), @Const(init))
1+
@kernel inbounds=true cpu=false function _mapreduce_block!(@Const(src), dst, f, op, init)
22

33
N = @groupsize()[1]
44
sdata = @localmem eltype(dst) (N,)
@@ -26,41 +26,41 @@
2626
@synchronize()
2727

2828
if N >= 512
29-
ithread < 256 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 256 + 1]))
29+
ithread < 256 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 256 + 1]))
3030
@synchronize()
3131
end
3232
if N >= 256
33-
ithread < 128 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 128 + 1]))
33+
ithread < 128 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 128 + 1]))
3434
@synchronize()
3535
end
3636
if N >= 128
37-
ithread < 64 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 64 + 1]))
37+
ithread < 64 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 64 + 1]))
3838
@synchronize()
3939
end
4040

4141
# CUDA has a warp size of 32, AMD a "wavefront" of 64, and Intel Graphics messes it up
4242
if N >= 64
43-
ithread < 32 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 32 + 1]))
43+
ithread < 32 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 32 + 1]))
4444
@synchronize()
4545
end
4646
if N >= 32
47-
ithread < 16 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 16 + 1]))
47+
ithread < 16 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 16 + 1]))
4848
@synchronize()
4949
end
5050
if N >= 16
51-
ithread < 8 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 8 + 1]))
51+
ithread < 8 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 8 + 1]))
5252
@synchronize()
5353
end
5454
if N >= 8
55-
ithread < 4 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 4 + 1]))
55+
ithread < 4 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 4 + 1]))
5656
@synchronize()
5757
end
5858
if N >= 4
59-
ithread < 2 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 2 + 1]))
59+
ithread < 2 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 2 + 1]))
6060
@synchronize()
6161
end
6262
if N >= 2
63-
ithread < 1 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 1 + 1]))
63+
ithread < 1 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 1 + 1]))
6464
@synchronize()
6565
end
6666

src/reduce.jl

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
@kernel inbounds=true cpu=false function _reduce_block!(@Const(src), dst, @Const(op), @Const(init))
1+
@kernel inbounds=true cpu=false function _reduce_block!(@Const(src), dst, op, init)
22

33
N = @groupsize()[1]
44
sdata = @localmem eltype(dst) (N,)
@@ -26,41 +26,41 @@
2626
@synchronize()
2727

2828
if N >= 512
29-
ithread < 256 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 256 + 1]))
29+
ithread < 256 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 256 + 1]))
3030
@synchronize()
3131
end
3232
if N >= 256
33-
ithread < 128 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 128 + 1]))
33+
ithread < 128 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 128 + 1]))
3434
@synchronize()
3535
end
3636
if N >= 128
37-
ithread < 64 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 64 + 1]))
37+
ithread < 64 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 64 + 1]))
3838
@synchronize()
3939
end
4040

4141
# CUDA has a warp size of 32, AMD a "wavefront" of 64, and Intel Graphics messes it up
4242
if N >= 64
43-
ithread < 32 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 32 + 1]))
43+
ithread < 32 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 32 + 1]))
4444
@synchronize()
4545
end
4646
if N >= 32
47-
ithread < 16 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 16 + 1]))
47+
ithread < 16 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 16 + 1]))
4848
@synchronize()
4949
end
5050
if N >= 16
51-
ithread < 8 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 8 + 1]))
51+
ithread < 8 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 8 + 1]))
5252
@synchronize()
5353
end
5454
if N >= 8
55-
ithread < 4 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 4 + 1]))
55+
ithread < 4 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 4 + 1]))
5656
@synchronize()
5757
end
5858
if N >= 4
59-
ithread < 2 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 2 + 1]))
59+
ithread < 2 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 2 + 1]))
6060
@synchronize()
6161
end
6262
if N >= 2
63-
ithread < 1 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 1 + 1]))
63+
ithread < 1 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 1 + 1]))
6464
@synchronize()
6565
end
6666

src/truth.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# TODO: this hangs / dies on oneAPI. Test on CUDA
2-
@kernel cpu=false inbounds=true function _any_global!(out, @Const(pred), @Const(v))
2+
@kernel cpu=false inbounds=true function _any_global!(out, pred, @Const(v))
33
temp = @localmem Int8 (1,)
44
i = @index(Global, Linear)
55

0 commit comments

Comments
 (0)