|
1 | | -@kernel inbounds=true cpu=false function _mapreduce_block!(@Const(src), dst, @Const(f), @Const(op), @Const(init)) |
| 1 | +@kernel inbounds=true cpu=false function _mapreduce_block!(@Const(src), dst, f, op, init) |
2 | 2 |
|
3 | 3 | N = @groupsize()[1] |
4 | 4 | sdata = @localmem eltype(dst) (N,) |
|
26 | 26 | @synchronize() |
27 | 27 |
|
28 | 28 | if N >= 512 |
29 | | - ithread < 256 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 256 + 1])) |
| 29 | + ithread < 256 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 256 + 1])) |
30 | 30 | @synchronize() |
31 | 31 | end |
32 | 32 | if N >= 256 |
33 | | - ithread < 128 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 128 + 1])) |
| 33 | + ithread < 128 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 128 + 1])) |
34 | 34 | @synchronize() |
35 | 35 | end |
36 | 36 | if N >= 128 |
37 | | - ithread < 64 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 64 + 1])) |
| 37 | + ithread < 64 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 64 + 1])) |
38 | 38 | @synchronize() |
39 | 39 | end |
40 | 40 |
|
41 | 41 | # CUDA has a warp size of 32, AMD a "wavefront" of 64, and Intel Graphics messes it up |
42 | 42 | if N >= 64 |
43 | | - ithread < 32 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 32 + 1])) |
| 43 | + ithread < 32 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 32 + 1])) |
44 | 44 | @synchronize() |
45 | 45 | end |
46 | 46 | if N >= 32 |
47 | | - ithread < 16 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 16 + 1])) |
| 47 | + ithread < 16 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 16 + 1])) |
48 | 48 | @synchronize() |
49 | 49 | end |
50 | 50 | if N >= 16 |
51 | | - ithread < 8 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 8 + 1])) |
| 51 | + ithread < 8 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 8 + 1])) |
52 | 52 | @synchronize() |
53 | 53 | end |
54 | 54 | if N >= 8 |
55 | | - ithread < 4 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 4 + 1])) |
| 55 | + ithread < 4 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 4 + 1])) |
56 | 56 | @synchronize() |
57 | 57 | end |
58 | 58 | if N >= 4 |
59 | | - ithread < 2 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 2 + 1])) |
| 59 | + ithread < 2 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 2 + 1])) |
60 | 60 | @synchronize() |
61 | 61 | end |
62 | 62 | if N >= 2 |
63 | | - ithread < 1 && (@inbounds sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 1 + 1])) |
| 63 | + ithread < 1 && (sdata[ithread + 1] = op(sdata[ithread + 1], sdata[ithread + 1 + 1])) |
64 | 64 | @synchronize() |
65 | 65 | end |
66 | 66 |
|
|
0 commit comments