Skip to content

Commit 277bb20

Browse files
committed
8355094: Performance drop in auto-vectorized kernel due to split store
Reviewed-by: vlivanov, thartmann
1 parent e6750a5 commit 277bb20

File tree

4 files changed

+341
-1
lines changed

4 files changed

+341
-1
lines changed

src/hotspot/share/opto/c2_globals.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,14 @@
367367
"loop iterations this detection spans.") \
368368
range(0, 4096) \
369369
\
370+
product(uint, SuperWordAutomaticAlignment, 1, DIAGNOSTIC, \
371+
"0 = Disabled (unless AlignVector is enabled). " \
372+
"Else: align with a load or store of the largest vector width," \
373+
" and if there are loads and stores of the largest width: " \
374+
"1 = Prefer alignment with vector store (default), " \
375+
"2 = Prefer alignment with vector load.") \
376+
range(0, 2) \
377+
\
370378
product(bool, UseCMoveUnconditionally, false, \
371379
"Use CMove (scalar and vector) ignoring profitability test.") \
372380
\

src/hotspot/share/opto/superword.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2665,7 +2665,18 @@ void VTransform::determine_mem_ref_and_aw_for_main_loop_alignment() {
26652665
MemNode* p0 = vtn->nodes().at(0)->as_Mem();
26662666

26672667
int vw = p0->memory_size() * vtn->nodes().length();
2668-
if (vw > max_aw) {
2668+
// Generally, we prefer to align with the largest memory op (load or store).
2669+
// If there are multiple, then SuperWordAutomaticAlignment determines if we
2670+
// prefer loads or stores.
2671+
// When a load or store is misaligned, this can lead to the load or store
2672+
// being split, when it goes over a cache line. Most CPUs can schedule
2673+
// more loads than stores per cycle (often 2 loads and 1 store). Hence,
2674+
// it is worse if a store is split, and less bad if a load is split.
2675+
// By default, we have SuperWordAutomaticAlignment=1, i.e. we align with a
2676+
// store if possible, to avoid splitting that store.
2677+
bool prefer_store = mem_ref != nullptr && SuperWordAutomaticAlignment == 1 && mem_ref->is_Load() && p0->is_Store();
2678+
bool prefer_load = mem_ref != nullptr && SuperWordAutomaticAlignment == 2 && mem_ref->is_Store() && p0->is_Load();
2679+
if (vw > max_aw || (vw == max_aw && (prefer_load || prefer_store))) {
26692680
max_aw = vw;
26702681
mem_ref = p0;
26712682
}
@@ -2692,6 +2703,16 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() {
26922703
determine_mem_ref_and_aw_for_main_loop_alignment();
26932704
const MemNode* align_to_ref = _mem_ref_for_main_loop_alignment;
26942705
const int aw = _aw_for_main_loop_alignment;
2706+
2707+
if (!VLoop::vectors_should_be_aligned() && SuperWordAutomaticAlignment == 0) {
2708+
#ifdef ASSERT
2709+
if (_trace._align_vector) {
2710+
tty->print_cr("\nVTransform::adjust_pre_loop_limit_to_align_main_loop_vectors: disabled.");
2711+
}
2712+
#endif
2713+
return;
2714+
}
2715+
26952716
assert(align_to_ref != nullptr && aw > 0, "must have alignment reference and aw");
26962717
assert(cl()->is_main_loop(), "can only do alignment for main loop");
26972718

@@ -2912,6 +2933,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() {
29122933
p.for_each_invar_summand([&] (const MemPointerSummand& s) {
29132934
Node* invar_variable = s.variable();
29142935
jint invar_scale = s.scale().value();
2936+
TRACE_ALIGN_VECTOR_NODE(invar_variable);
29152937
if (igvn().type(invar_variable)->isa_long()) {
29162938
// Computations are done % (vector width/element size) so it's
29172939
safe to simply convert invar to an int and lose the upper 32
@@ -2921,6 +2943,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() {
29212943
TRACE_ALIGN_VECTOR_NODE(invar_variable);
29222944
}
29232945
Node* invar_scale_con = igvn().intcon(invar_scale);
2946+
TRACE_ALIGN_VECTOR_NODE(invar_scale_con);
29242947
Node* invar_summand = new MulINode(invar_variable, invar_scale_con);
29252948
phase()->register_new_node(invar_summand, pre_ctrl);
29262949
TRACE_ALIGN_VECTOR_NODE(invar_summand);
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/*
2+
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*/
23+
package org.openjdk.bench.vm.compiler;
24+
25+
import org.openjdk.jmh.annotations.*;
26+
import org.openjdk.jmh.infra.*;
27+
28+
import java.lang.invoke.*;
29+
import java.lang.foreign.*;
30+
31+
import java.util.concurrent.TimeUnit;
32+
33+
/**
34+
* The purpose of this benchmark is to see the effect of automatic alignment in auto vectorization.
35+
*
36+
* Note: If you are interested in a nice visualization of load and store misalignment, please look
37+
* at the benchmark {@link VectorAutoAlignmentVisualization}.
38+
*/
39+
40+
@BenchmarkMode(Mode.AverageTime)
41+
@OutputTimeUnit(TimeUnit.NANOSECONDS)
42+
@State(Scope.Thread)
43+
@Warmup(iterations = 2, time = 2, timeUnit = TimeUnit.SECONDS)
44+
@Measurement(iterations = 5, time = 2, timeUnit = TimeUnit.SECONDS)
45+
@Fork(value = 1)
46+
public abstract class VectorAutoAlignment {
47+
@Param({"1024", "1152", "1280", "1408", "1536", "1664", "1792", "1920", "1984", "2048", "2114",
48+
"2176", "2304", "2432", "2560", "2688", "2816", "2944", "3072", "3200", "3328", "3456",
49+
"3584", "3712", "3840", "3968", "4096", "4224", "4352", "4480"})
50+
public int SIZE;
51+
52+
private MemorySegment ms;
53+
54+
@Setup
55+
public void init() throws Throwable {
56+
long totalSize = 4L * SIZE + 4L * SIZE;
57+
long alignment = 4 * 1024; // 4k = page size
58+
ms = Arena.ofAuto().allocate(totalSize, alignment);
59+
}
60+
61+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
62+
public void kernel1L1S(int offset_load, int offset_store) {
63+
for (int i = 0; i < SIZE - /* slack for offset */ 32; i++) {
64+
int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, 4L * i + 4L * offset_load + 4L * SIZE);
65+
ms.set(ValueLayout.JAVA_INT_UNALIGNED, 4L * i + 4L * offset_store, v);
66+
}
67+
}
68+
69+
@Benchmark
70+
public void bench1L1S() throws Throwable {
71+
// Go over all possible offsets, to get an average performance.
72+
for (int offset_load = 0; offset_load < 32; offset_load++) {
73+
for (int offset_store = 0; offset_store < 32; offset_store++) {
74+
kernel1L1S(offset_load, offset_store);
75+
}
76+
}
77+
}
78+
79+
@Fork(value = 1, jvmArgs = {
80+
"-XX:-UseSuperWord"
81+
})
82+
public static class NoVectorization extends VectorAutoAlignment {}
83+
84+
@Fork(value = 1, jvmArgs = {
85+
"-XX:+UnlockDiagnosticVMOptions", "-XX:SuperWordAutomaticAlignment=0"
86+
})
87+
public static class NoAutoAlign extends VectorAutoAlignment {}
88+
89+
@Fork(value = 1, jvmArgs = {
90+
"-XX:+UnlockDiagnosticVMOptions", "-XX:SuperWordAutomaticAlignment=1"
91+
})
92+
public static class AlignStore extends VectorAutoAlignment {}
93+
94+
95+
@Fork(value = 1, jvmArgs = {
96+
"-XX:+UnlockDiagnosticVMOptions", "-XX:SuperWordAutomaticAlignment=2"
97+
})
98+
public static class AlignLoad extends VectorAutoAlignment {}
99+
}
Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
/*
2+
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*/
23+
package org.openjdk.bench.vm.compiler;
24+
25+
import org.openjdk.jmh.annotations.*;
26+
import org.openjdk.jmh.infra.*;
27+
28+
import java.lang.invoke.*;
29+
import java.lang.foreign.*;
30+
31+
import java.util.concurrent.TimeUnit;
32+
33+
/*
34+
35+
The purpose of this benchmark is to see the effect of automatic alignment in auto vectorization.
36+
It is recommended to compare the results for the different values of SuperWordAutomaticAlignment (0, 1, 2).
37+
38+
Without automatic alignment, i.e. SuperWordAutomaticAlignment=0, we may get a plot like below, for bench1L1S:
39+
40+
OFFSET_STORE
41+
^
42+
| ###############|X
43+
| ---------------0- <--- store aligned
44+
| ##############X|#
45+
| #############X#|#
46+
| ############X##|#
47+
| ###########X###|#
48+
| ##########X####|#
49+
| #########X#####|#
50+
| ########X######|#
51+
| #######X#######|#
52+
| ######X########|#
53+
| #####X#########|#
54+
| ####X##########|#
55+
| ###X###########|#
56+
| ##X############|#
57+
| #X#############|#
58+
| X##############|#
59+
---OFFSET_LOAD ---->
60+
61+
^
62+
loads aligned |
63+
64+
#: lowest performance, both misaligned and also relatively misaligned.
65+
X: low performance, both misaligned but relatively aligned.
66+
|: medium performance, load aligned, store misaligned.
67+
-: good performance, load misaligned, store aligned.
68+
0: extreme performance, load and store aligned.
69+
70+
Why is case "-" better than "|"? I.e. why are misaligned stores worse than misaligned loads?
71+
Misalignment means that a load or store goes over a cache line, and is split into two loads
72+
or stores. Most CPUs can execute 2 loads and 1 store per cycle, which is at least a partial
73+
explanation why we are more limited on stores than loads.
74+
No splitting, full alignment -> 1 load and 1 store
75+
Split load, store aligned -> 2 loads and 1 store
76+
Split store, load aligned -> 1 load and 2 stores
77+
78+
The warmup and measurement time is relatively short, but the benchmark already takes 25 min
79+
to go over the whole grid. This leads to some noise, but the pattern is still clearly visible.
80+
Hence: this benchmark is more for visualization than for regression testing.
81+
For regression testing, please look at the related VectorAutoAlignment benchmark.
82+
83+
If you want to turn the JMH results into a table, then you may use this Java code.
84+
85+
import java.io.*;
86+
import java.util.ArrayList;
87+
88+
public class Extract {
89+
record Cell(int x, int y, float t) {}
90+
91+
public static void main(String[] args) throws Exception {
92+
String fileName = args[0];
93+
System.out.println("Loading from file: " + fileName);
94+
95+
ArrayList<Cell> cells = new ArrayList<>();
96+
97+
try(BufferedReader br = new BufferedReader(new FileReader(fileName))) {
98+
for(String line; (line = br.readLine()) != null; ) {
99+
System.out.println(line);
100+
String[] parts = line.split("[ ]+");
101+
if (parts.length != 11) { continue; }
102+
System.out.println(String.join(" ", parts));
103+
int x = Integer.parseInt(parts[2]);
104+
int y = Integer.parseInt(parts[3]);
105+
float t = Float.parseFloat(parts[7]);
106+
System.out.println("x=" + x + ", y=" + y + ", t=" + t);
107+
cells.add(new Cell(x, y, t));
108+
}
109+
}
110+
111+
int maxX = cells.stream().mapToInt(c -> c.x).max().getAsInt();
112+
int maxY = cells.stream().mapToInt(c -> c.y).max().getAsInt();
113+
float[][] grid = new float[maxX + 1][maxY + 1];
114+
115+
for (Cell c : cells) {
116+
grid[c.x][c.y] = c.t;
117+
}
118+
119+
for (int y = maxY; y >= 0; y--) {
120+
for (int x = 0; x <= maxX; x++) {
121+
System.out.print(String.format("%.5f ", grid[x][y]));
122+
}
123+
System.out.println();
124+
}
125+
System.out.println("x-axis (->) LOAD_OFFSET");
126+
System.out.println("y-axis (up) STORE_OFFSET");
127+
}
128+
}
129+
130+
*/
131+
132+
@BenchmarkMode(Mode.AverageTime)
133+
@OutputTimeUnit(TimeUnit.NANOSECONDS)
134+
@State(Scope.Thread)
135+
@Warmup(iterations = 2, time = 200, timeUnit = TimeUnit.MILLISECONDS)
136+
@Measurement(iterations = 3, time = 200, timeUnit = TimeUnit.MILLISECONDS)
137+
@Fork(value = 1)
138+
public class VectorAutoAlignmentVisualization {
139+
@Param({"2560"})
140+
public int SIZE;
141+
142+
@Param({ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
143+
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
144+
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
145+
"30", "31"})
146+
public int OFFSET_LOAD;
147+
148+
@Param({ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
149+
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
150+
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
151+
"30", "31"})
152+
public int OFFSET_STORE;
153+
154+
@Param({"2000"})
155+
public int DISTANCE;
156+
157+
// To get compile-time constants for OFFSET_LOAD, OFFSET_STORE, and DISTANCE
158+
static final MutableCallSite MUTABLE_CONSTANT_OFFSET_LOAD = new MutableCallSite(MethodType.methodType(int.class));
159+
static final MethodHandle MUTABLE_CONSTANT_OFFSET_LOAD_HANDLE = MUTABLE_CONSTANT_OFFSET_LOAD.dynamicInvoker();
160+
static final MutableCallSite MUTABLE_CONSTANT_OFFSET_STORE = new MutableCallSite(MethodType.methodType(int.class));
161+
static final MethodHandle MUTABLE_CONSTANT_OFFSET_STORE_HANDLE = MUTABLE_CONSTANT_OFFSET_STORE.dynamicInvoker();
162+
static final MutableCallSite MUTABLE_CONSTANT_DISTANCE = new MutableCallSite(MethodType.methodType(int.class));
163+
static final MethodHandle MUTABLE_CONSTANT_DISTANCE_HANDLE = MUTABLE_CONSTANT_DISTANCE.dynamicInvoker();
164+
165+
private MemorySegment ms;
166+
167+
@Setup
168+
public void init() throws Throwable {
169+
long totalSize = 4L * SIZE + 4L * DISTANCE;
170+
long alignment = 4 * 1024; // 4k = page size
171+
ms = Arena.ofAuto().allocate(totalSize, alignment);
172+
173+
MethodHandle offset_load_con = MethodHandles.constant(int.class, OFFSET_LOAD);
174+
MUTABLE_CONSTANT_OFFSET_LOAD.setTarget(offset_load_con);
175+
MethodHandle offset_store_con = MethodHandles.constant(int.class, OFFSET_STORE);
176+
MUTABLE_CONSTANT_OFFSET_STORE.setTarget(offset_store_con);
177+
MethodHandle distance_con = MethodHandles.constant(int.class, DISTANCE);
178+
MUTABLE_CONSTANT_DISTANCE.setTarget(distance_con);
179+
}
180+
181+
@CompilerControl(CompilerControl.Mode.INLINE)
182+
private int offset_load_con() throws Throwable {
183+
return (int) MUTABLE_CONSTANT_OFFSET_LOAD_HANDLE.invokeExact();
184+
}
185+
186+
@CompilerControl(CompilerControl.Mode.INLINE)
187+
private int offset_store_con() throws Throwable {
188+
return (int) MUTABLE_CONSTANT_OFFSET_STORE_HANDLE.invokeExact();
189+
}
190+
191+
@CompilerControl(CompilerControl.Mode.INLINE)
192+
private int distance_con() throws Throwable {
193+
return (int) MUTABLE_CONSTANT_DISTANCE_HANDLE.invokeExact();
194+
}
195+
196+
@Benchmark
197+
public void bench1L1S() throws Throwable {
198+
int offset_load = offset_load_con();
199+
int offset_store = offset_store_con();
200+
int distance = distance_con();
201+
// Note: the offsets and distance are compile-time constants, which means
202+
// we can already prove non-aliasing of loads and stores at compile
203+
// time, which allows vectorization even without any aliasing runtime
204+
// checks.
205+
for (int i = 0; i < SIZE - /* slack for offset */ 32; i++) {
206+
int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, 4L * i + 4L * offset_load + 4L * distance);
207+
ms.set(ValueLayout.JAVA_INT_UNALIGNED, 4L * i + 4L * offset_store, v);
208+
}
209+
}
210+
}

0 commit comments

Comments
 (0)