|
| 1 | +/* |
| 2 | + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. |
| 3 | + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 4 | + * |
| 5 | + * This code is free software; you can redistribute it and/or modify it |
| 6 | + * under the terms of the GNU General Public License version 2 only, as |
| 7 | + * published by the Free Software Foundation. |
| 8 | + * |
| 9 | + * This code is distributed in the hope that it will be useful, but WITHOUT |
| 10 | + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 11 | + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 12 | + * version 2 for more details (a copy is included in the LICENSE file that |
| 13 | + * accompanied this code). |
| 14 | + * |
| 15 | + * You should have received a copy of the GNU General Public License version |
| 16 | + * 2 along with this work; if not, write to the Free Software Foundation, |
| 17 | + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 18 | + * |
| 19 | + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| 20 | + * or visit www.oracle.com if you need additional information or have any |
| 21 | + * questions. |
| 22 | + */ |
| 23 | +package org.openjdk.bench.vm.compiler; |
| 24 | + |
| 25 | +import org.openjdk.jmh.annotations.*; |
| 26 | +import org.openjdk.jmh.infra.*; |
| 27 | + |
| 28 | +import java.lang.invoke.*; |
| 29 | +import java.lang.foreign.*; |
| 30 | + |
| 31 | +import java.util.concurrent.TimeUnit; |
| 32 | + |
| 33 | +/* |
| 34 | +
|
| 35 | + The purpose of this benchmark is to see the effect of automatic alignment in auto vectorization. |
| 36 | + It is recommended to compare the results for different values of the SuperWordAutomaticAlignment flag. |
| 37 | +
|
| 38 | + Without automatic alignment, i.e. SuperWordAutomaticAlignment=0, we may get a plot like below, for bench1L1S: |
| 39 | +
|
| 40 | + OFFSET_STORE |
| 41 | + ^ |
| 42 | + | ###############|X |
| 43 | + | ---------------0- <--- store aligned |
| 44 | + | ##############X|# |
| 45 | + | #############X#|# |
| 46 | + | ############X##|# |
| 47 | + | ###########X###|# |
| 48 | + | ##########X####|# |
| 49 | + | #########X#####|# |
| 50 | + | ########X######|# |
| 51 | + | #######X#######|# |
| 52 | + | ######X########|# |
| 53 | + | #####X#########|# |
| 54 | + | ####X##########|# |
| 55 | + | ###X###########|# |
| 56 | + | ##X############|# |
| 57 | + | #X#############|# |
| 58 | + | X##############|# |
| 59 | + ---OFFSET_LOAD ----> |
| 60 | +
|
| 61 | + ^ |
| 62 | + loads aligned | |
| 63 | +
|
| 64 | + #: lowest performance, both misaligned and also relatively misaligned. |
| 65 | + X: low performance, both misaligned but relatively aligned. |
| 66 | + |: medium performance, load aligned, store misaligned. |
| 67 | + -: good performance, load misaligned, store aligned. |
| 68 | + 0: extreme performance, load and store aligned. |
| 69 | +
|
| 70 | + Why is case "-" better than "|"? I.e. why are misaligned stores worse than misaligned loads? |
| 71 | + Misalignment means that a load or store goes over a cache line, and is split into two loads |
| 72 | + or stores. Most CPUs can execute 2 loads and 1 store per cycle, which is at least a partial |
| 73 | + explanation why we are more limited on stores than loads. |
| 74 | + No splitting, full alignment -> 1 load and 1 store |
| 75 | + Split load, store aligned -> 2 loads and 1 store |
| 76 | + Split store, load aligned -> 1 load and 2 stores |
| 77 | +
|
| 78 | + The warmup and measurement time is relatively short, but the benchmark already takes 25 min |
| 79 | + to go over the whole grid. This leads to some noise, but the pattern is still clearly visible. |
| 80 | + Hence: this benchmark is more for visualization than for regression testing. |
| 81 | + For regression testing, please look at the related VectorAutoAlignment benchmark. |
| 82 | +
|
| 83 | + If you want to turn the JMH results into a table, then you may use this Java code. |
| 84 | +
|
| 85 | + import java.io.*; |
| 86 | + import java.util.ArrayList; |
| 87 | +
|
| 88 | + public class Extract { |
| 89 | + record Cell(int x, int y, float t) {} |
| 90 | +
|
| 91 | + public static void main(String[] args) throws Exception { |
| 92 | + String fileName = args[0]; |
| 93 | + System.out.println("Loading from file: " + fileName); |
| 94 | +
|
| 95 | + ArrayList<Cell> cells = new ArrayList<>(); |
| 96 | +
|
| 97 | + try(BufferedReader br = new BufferedReader(new FileReader(fileName))) { |
| 98 | + for(String line; (line = br.readLine()) != null; ) { |
| 99 | + System.out.println(line); |
| 100 | + String[] parts = line.split("[ ]+"); |
| 101 | + if (parts.length != 11) { continue; } |
| 102 | + System.out.println(String.join(" ", parts)); |
| 103 | + int x = Integer.parseInt(parts[2]); |
| 104 | + int y = Integer.parseInt(parts[3]); |
| 105 | + float t = Float.parseFloat(parts[7]); |
| 106 | + System.out.println("x=" + x + ", y=" + y + ", t=" + t); |
| 107 | + cells.add(new Cell(x, y, t)); |
| 108 | + } |
| 109 | + } |
| 110 | +
|
| 111 | + int maxX = cells.stream().mapToInt(c -> c.x).max().getAsInt(); |
| 112 | + int maxY = cells.stream().mapToInt(c -> c.y).max().getAsInt(); |
| 113 | + float[][] grid = new float[maxX + 1][maxY + 1]; |
| 114 | +
|
| 115 | + for (Cell c : cells) { |
| 116 | + grid[c.x][c.y] = c.t; |
| 117 | + } |
| 118 | +
|
| 119 | + for (int y = maxY; y >= 0; y--) { |
| 120 | + for (int x = 0; x <= maxX; x++) { |
| 121 | + System.out.print(String.format("%.5f ", grid[x][y])); |
| 122 | + } |
| 123 | + System.out.println(); |
| 124 | + } |
| 125 | + System.out.println("x-axis (->) LOAD_OFFSET"); |
| 126 | + System.out.println("y-axis (up) STORE_OFFSET"); |
| 127 | + } |
| 128 | + } |
| 129 | +
|
| 130 | + */ |
| 131 | + |
| 132 | +@BenchmarkMode(Mode.AverageTime) |
| 133 | +@OutputTimeUnit(TimeUnit.NANOSECONDS) |
| 134 | +@State(Scope.Thread) |
| 135 | +@Warmup(iterations = 2, time = 200, timeUnit = TimeUnit.MILLISECONDS) |
| 136 | +@Measurement(iterations = 3, time = 200, timeUnit = TimeUnit.MILLISECONDS) |
| 137 | +@Fork(value = 1) |
| 138 | +public class VectorAutoAlignmentVisualization { |
| 139 | + @Param({"2560"}) |
| 140 | + public int SIZE; |
| 141 | + |
| 142 | + @Param({ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", |
| 143 | + "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", |
| 144 | + "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", |
| 145 | + "30", "31"}) |
| 146 | + public int OFFSET_LOAD; |
| 147 | + |
| 148 | + @Param({ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", |
| 149 | + "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", |
| 150 | + "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", |
| 151 | + "30", "31"}) |
| 152 | + public int OFFSET_STORE; |
| 153 | + |
| 154 | + @Param({"2000"}) |
| 155 | + public int DISTANCE; |
| 156 | + |
| 157 | + // To get compile-time constants for OFFSET_LOAD, OFFSET_STORE, and DISTANCE |
| 158 | + static final MutableCallSite MUTABLE_CONSTANT_OFFSET_LOAD = new MutableCallSite(MethodType.methodType(int.class)); |
| 159 | + static final MethodHandle MUTABLE_CONSTANT_OFFSET_LOAD_HANDLE = MUTABLE_CONSTANT_OFFSET_LOAD.dynamicInvoker(); |
| 160 | + static final MutableCallSite MUTABLE_CONSTANT_OFFSET_STORE = new MutableCallSite(MethodType.methodType(int.class)); |
| 161 | + static final MethodHandle MUTABLE_CONSTANT_OFFSET_STORE_HANDLE = MUTABLE_CONSTANT_OFFSET_STORE.dynamicInvoker(); |
| 162 | + static final MutableCallSite MUTABLE_CONSTANT_DISTANCE = new MutableCallSite(MethodType.methodType(int.class)); |
| 163 | + static final MethodHandle MUTABLE_CONSTANT_DISTANCE_HANDLE = MUTABLE_CONSTANT_DISTANCE.dynamicInvoker(); |
| 164 | + |
| 165 | + private MemorySegment ms; |
| 166 | + |
| 167 | + @Setup |
| 168 | + public void init() throws Throwable { |
| 169 | + long totalSize = 4L * SIZE + 4L * DISTANCE; |
| 170 | + long alignment = 4 * 1024; // 4k = page size |
| 171 | + ms = Arena.ofAuto().allocate(totalSize, alignment); |
| 172 | + |
| 173 | + MethodHandle offset_load_con = MethodHandles.constant(int.class, OFFSET_LOAD); |
| 174 | + MUTABLE_CONSTANT_OFFSET_LOAD.setTarget(offset_load_con); |
| 175 | + MethodHandle offset_store_con = MethodHandles.constant(int.class, OFFSET_STORE); |
| 176 | + MUTABLE_CONSTANT_OFFSET_STORE.setTarget(offset_store_con); |
| 177 | + MethodHandle distance_con = MethodHandles.constant(int.class, DISTANCE); |
| 178 | + MUTABLE_CONSTANT_DISTANCE.setTarget(distance_con); |
| 179 | + } |
| 180 | + |
| 181 | + @CompilerControl(CompilerControl.Mode.INLINE) |
| 182 | + private int offset_load_con() throws Throwable { |
| 183 | + return (int) MUTABLE_CONSTANT_OFFSET_LOAD_HANDLE.invokeExact(); |
| 184 | + } |
| 185 | + |
| 186 | + @CompilerControl(CompilerControl.Mode.INLINE) |
| 187 | + private int offset_store_con() throws Throwable { |
| 188 | + return (int) MUTABLE_CONSTANT_OFFSET_STORE_HANDLE.invokeExact(); |
| 189 | + } |
| 190 | + |
| 191 | + @CompilerControl(CompilerControl.Mode.INLINE) |
| 192 | + private int distance_con() throws Throwable { |
| 193 | + return (int) MUTABLE_CONSTANT_DISTANCE_HANDLE.invokeExact(); |
| 194 | + } |
| 195 | + |
| 196 | + @Benchmark |
| 197 | + public void bench1L1S() throws Throwable { |
| 198 | + int offset_load = offset_load_con(); |
| 199 | + int offset_store = offset_store_con(); |
| 200 | + int distance = distance_con(); |
| 201 | + // Note: the offsets and distance are compile-time constants, which means |
| 202 | + // we can already prove non-aliasing of loads and stores at compile |
| 203 | + // time, which allows vectorization even without any aliasing runtime |
| 204 | + // checks. |
| 205 | + for (int i = 0; i < SIZE - /* slack for offset */ 32; i++) { |
| 206 | + int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, 4L * i + 4L * offset_load + 4L * distance); |
| 207 | + ms.set(ValueLayout.JAVA_INT_UNALIGNED, 4L * i + 4L * offset_store, v); |
| 208 | + } |
| 209 | + } |
| 210 | +} |
0 commit comments