8355094: Performance drop in auto-vectorized kernel due to split store

eme64 · eme64 · commit 277bb208a2c6 · 2025-05-20T13:51:47.000Z
Reviewed-by: vlivanov, thartmann
diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp
@@ -367,6 +367,14 @@
           "loop iterations this detection spans.")                          \
           range(0, 4096)                                                    \
                                                                             \
+  product(uint, SuperWordAutomaticAlignment, 1, DIAGNOSTIC,                 \
+          "0 = Disabled (unless AlignVector is enabled). "                  \
+          "Else: align with a load or store of the largest vector width, "  \
+          "and if there are loads and stores of the largest width: "        \
+          "1 = Prefer alignment with vector store (default), "              \
+          "2 = Prefer alignment with vector load.")                         \
+          range(0, 2)                                                       \
+                                                                            \
   product(bool, UseCMoveUnconditionally, false,                             \
           "Use CMove (scalar and vector) ignoring profitability test.")     \
                                                                             \
diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
@@ -2665,7 +2665,18 @@ void VTransform::determine_mem_ref_and_aw_for_main_loop_alignment() {
     MemNode* p0 = vtn->nodes().at(0)->as_Mem();
 
     int vw = p0->memory_size() * vtn->nodes().length();
-    if (vw > max_aw) {
+    // Generally, we prefer to align with the largest memory op (load or store).
+    // If there are multiple, then SuperWordAutomaticAlignment determines if we
+    // prefer loads or stores.
+    // When a load or store is misaligned, this can lead to the load or store
+    // being split, when it goes over a cache line. Most CPUs can schedule
+    // more loads than stores per cycle (often 2 loads and 1 store). Hence,
+    // it is worse if a store is split, and less bad if a load is split.
+    // By default, we have SuperWordAutomaticAlignment=1, i.e. we align with a
+    // store if possible, to avoid splitting that store.
+    bool prefer_store = mem_ref != nullptr && SuperWordAutomaticAlignment == 1 && mem_ref->is_Load() && p0->is_Store();
+    bool prefer_load  = mem_ref != nullptr && SuperWordAutomaticAlignment == 2 && mem_ref->is_Store() && p0->is_Load();
+    if (vw > max_aw || (vw == max_aw && (prefer_load || prefer_store))) {
       max_aw = vw;
       mem_ref = p0;
     }
@@ -2692,6 +2703,16 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() {
   determine_mem_ref_and_aw_for_main_loop_alignment();
   const MemNode* align_to_ref = _mem_ref_for_main_loop_alignment;
   const int aw                = _aw_for_main_loop_alignment;
+
+  if (!VLoop::vectors_should_be_aligned() && SuperWordAutomaticAlignment == 0) {
+#ifdef ASSERT
+    if (_trace._align_vector) {
+      tty->print_cr("\nVTransform::adjust_pre_loop_limit_to_align_main_loop_vectors: disabled.");
+    }
+#endif
+    return;
+  }
+
   assert(align_to_ref != nullptr && aw > 0, "must have alignment reference and aw");
   assert(cl()->is_main_loop(), "can only do alignment for main loop");
 
@@ -2912,6 +2933,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() {
   p.for_each_invar_summand([&] (const MemPointerSummand& s) {
     Node* invar_variable = s.variable();
     jint  invar_scale    = s.scale().value();
+    TRACE_ALIGN_VECTOR_NODE(invar_variable);
     if (igvn().type(invar_variable)->isa_long()) {
       // Computations are done % (vector width/element size) so it's
       // safe to simply convert invar to an int and loose the upper 32
@@ -2921,6 +2943,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() {
       TRACE_ALIGN_VECTOR_NODE(invar_variable);
     }
     Node* invar_scale_con = igvn().intcon(invar_scale);
+    TRACE_ALIGN_VECTOR_NODE(invar_scale_con);
     Node* invar_summand = new MulINode(invar_variable, invar_scale_con);
     phase()->register_new_node(invar_summand, pre_ctrl);
     TRACE_ALIGN_VECTOR_NODE(invar_summand);
diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorAutoAlignment.java b/test/micro/org/openjdk/bench/vm/compiler/VectorAutoAlignment.java
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.*;
+
+import java.lang.invoke.*;
+import java.lang.foreign.*;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * The purpose of this benchmark is to see the effect of automatic alignment in auto vectorization.
+ *
+ * Note: If you are interested in a nice visualization of load and store misalignment, please look
+ *       at the benchmark {@link VectorAutoAlignmentVisualization}.
+ */
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+@Warmup(iterations = 2, time = 2, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 2, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 1)
+public abstract class VectorAutoAlignment {
+    @Param({"1024", "1152", "1280", "1408", "1536", "1664", "1792", "1920", "1984", "2048", "2112",
+            "2176", "2304", "2432", "2560", "2688", "2816", "2944", "3072", "3200", "3328", "3456",
+            "3584", "3712", "3840", "3968", "4096", "4224", "4352", "4480"})
+    public int SIZE; // Region size in ints: steps of 128, with additional steps of 64 around 2048.
+
+    private MemorySegment ms; // Stores go to the first 4 * SIZE bytes, loads come from the second 4 * SIZE bytes.
+
+    @Setup
+    public void init() throws Throwable {
+        long totalSize = 4L * SIZE + 4L * SIZE; // store region + load region, 4 bytes per int
+        long alignment = 4 * 1024; // 4k = page size
+        ms = Arena.ofAuto().allocate(totalSize, alignment);
+    }
+
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE) // presumably keeps the offsets as runtime values in the kernel
+    public void kernel1L1S(int offset_load, int offset_store) {
+        for (int i = 0; i < SIZE - /* slack for offset */ 32; i++) {
+            int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, 4L * i + 4L * offset_load + 4L * SIZE);
+            ms.set(ValueLayout.JAVA_INT_UNALIGNED, 4L * i + 4L * offset_store, v);
+        }
+    }
+
+    @Benchmark
+    public void bench1L1S() throws Throwable {
+        // Go over all possible offsets, to get an average performance.
+        for (int offset_load = 0; offset_load < 32; offset_load++) {
+            for (int offset_store = 0; offset_store < 32; offset_store++) {
+                kernel1L1S(offset_load, offset_store);
+            }
+        }
+    }
+
+    @Fork(value = 1, jvmArgs = {
+        "-XX:-UseSuperWord"
+    })
+    public static class NoVectorization extends VectorAutoAlignment {} // baseline: SuperWord disabled
+
+    @Fork(value = 1, jvmArgs = {
+        "-XX:+UnlockDiagnosticVMOptions", "-XX:SuperWordAutomaticAlignment=0"
+    })
+    public static class NoAutoAlign extends VectorAutoAlignment {} // automatic alignment disabled
+
+    @Fork(value = 1, jvmArgs = {
+        "-XX:+UnlockDiagnosticVMOptions", "-XX:SuperWordAutomaticAlignment=1"
+    })
+    public static class AlignStore extends VectorAutoAlignment {} // prefer aligning the vector store
+
+
+    @Fork(value = 1, jvmArgs = {
+        "-XX:+UnlockDiagnosticVMOptions", "-XX:SuperWordAutomaticAlignment=2"
+    })
+    public static class AlignLoad extends VectorAutoAlignment {} // prefer aligning the vector load
+}
diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorAutoAlignmentVisualization.java b/test/micro/org/openjdk/bench/vm/compiler/VectorAutoAlignmentVisualization.java
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.*;
+
+import java.lang.invoke.*;
+import java.lang.foreign.*;
+
+import java.util.concurrent.TimeUnit;
+
+/*
+
+  The purpose of this benchmark is to see the effect of automatic alignment in auto vectorization.
+  It is recommended to view the differing results when using SuperWordAutomaticAlignment.
+
+  Without automatic alignment, i.e. SuperWordAutomaticAlignment=0, we may get a plot like below, for bench1L1S:
+
+  OFFSET_STORE
+  ^
+  | ###############|X
+  | ---------------0-  <--- store aligned
+  | ##############X|#
+  | #############X#|#
+  | ############X##|#
+  | ###########X###|#
+  | ##########X####|#
+  | #########X#####|#
+  | ########X######|#
+  | #######X#######|#
+  | ######X########|#
+  | #####X#########|#
+  | ####X##########|#
+  | ###X###########|#
+  | ##X############|#
+  | #X#############|#
+  | X##############|#
+    ---OFFSET_LOAD ---->
+
+                   ^
+     loads aligned |
+
+  #: lowest performance, both misaligned and also relatively misaligned.
+  X: low performance, both misaligned but relatively aligned.
+  |: medium performance, load aligned, store misaligned.
+  -: good performance, load misaligned, store aligned.
+  0: extreme performance, load and store aligned.
+
+  Why is case "-" better than "|"? I.e. why are misaligned stores worse than misaligned loads?
+  Misalignment means that a load or store goes over a cache line, and is split into two loads
+  or stores. Most CPUs can execute 2 loads and 1 store per cycle, which is at least a partial
+  explanation why we are more limited on stores than loads.
+  No splitting, full alignment -> 1 load  and 1 store
+  Split load, store aligned    -> 2 loads and 1 store
+  Split store, load aligned    -> 1 load  and 2 stores
+
+  The warmup and measurement time is relatively short, but the benchmark already takes 25 min
+  to go over the whole grid. This leads to some noise, but the pattern is still clearly visible.
+  Hence: this benchmark is more for visualization than for regression testing.
+  For regression testing, please look at the related VectorAutoAlignment benchmark.
+
+  If you want to turn the JMH results into a table, then you may use this Java code.
+
+    import java.io.*;
+    import java.util.ArrayList;
+
+    public class Extract {
+        record Cell(int x, int y, float t) {}
+
+        public static void main(String[] args) throws Exception {
+            String fileName = args[0];
+            System.out.println("Loading from file: " + fileName);
+
+            ArrayList<Cell> cells = new ArrayList<>();
+
+            try(BufferedReader br = new BufferedReader(new FileReader(fileName))) {
+                for(String line; (line = br.readLine()) != null; ) {
+                    System.out.println(line);
+                    String[] parts = line.split("[ ]+");
+                    if (parts.length != 11) { continue; }
+                    System.out.println(String.join(" ", parts));
+                    int x = Integer.parseInt(parts[2]);
+                    int y = Integer.parseInt(parts[3]);
+                    float t = Float.parseFloat(parts[7]);
+                    System.out.println("x=" + x + ", y=" + y + ", t=" + t);
+                    cells.add(new Cell(x, y, t));
+                }
+            }
+
+            int maxX = cells.stream().mapToInt(c -> c.x).max().getAsInt();
+            int maxY = cells.stream().mapToInt(c -> c.y).max().getAsInt();
+            float[][] grid = new float[maxX + 1][maxY + 1];
+
+            for (Cell c : cells) {
+                grid[c.x][c.y] = c.t;
+            }
+
+            for (int y = maxY; y >= 0; y--) {
+                for (int x = 0; x <= maxX; x++) {
+                    System.out.print(String.format("%.5f ", grid[x][y]));
+                }
+                System.out.println();
+            }
+            System.out.println("x-axis  (->)  LOAD_OFFSET");
+            System.out.println("y-axis  (up)  STORE_OFFSET");
+        }
+    }
+
+ */
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+@Warmup(iterations = 2, time = 200, timeUnit = TimeUnit.MILLISECONDS)
+@Measurement(iterations = 3, time = 200, timeUnit = TimeUnit.MILLISECONDS)
+@Fork(value = 1)
+public class VectorAutoAlignmentVisualization {
+    @Param({"2560"})
+    public int SIZE; // In ints; the benchmark loop runs for SIZE - 32 iterations.
+
+    @Param({  "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",   "8",   "9",
+             "10",  "11",  "12",  "13",  "14",  "15",  "16",  "17",  "18",  "19",
+             "20",  "21",  "22",  "23",  "24",  "25",  "26",  "27",  "28",  "29",
+             "30",  "31"})
+    public int OFFSET_LOAD; // Offset of the load address, in ints (multiplied by 4 in bench1L1S).
+
+    @Param({  "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",   "8",   "9",
+             "10",  "11",  "12",  "13",  "14",  "15",  "16",  "17",  "18",  "19",
+             "20",  "21",  "22",  "23",  "24",  "25",  "26",  "27",  "28",  "29",
+             "30",  "31"})
+    public int OFFSET_STORE; // Offset of the store address, in ints (multiplied by 4 in bench1L1S).
+
+    @Param({"2000"})
+    public int DISTANCE; // Extra distance added to the load address only, in ints.
+
+    // To get compile-time constants for OFFSET_LOAD, OFFSET_STORE, and DISTANCE
+    static final MutableCallSite MUTABLE_CONSTANT_OFFSET_LOAD = new MutableCallSite(MethodType.methodType(int.class));
+    static final MethodHandle MUTABLE_CONSTANT_OFFSET_LOAD_HANDLE = MUTABLE_CONSTANT_OFFSET_LOAD.dynamicInvoker();
+    static final MutableCallSite MUTABLE_CONSTANT_OFFSET_STORE = new MutableCallSite(MethodType.methodType(int.class));
+    static final MethodHandle MUTABLE_CONSTANT_OFFSET_STORE_HANDLE = MUTABLE_CONSTANT_OFFSET_STORE.dynamicInvoker();
+    static final MutableCallSite MUTABLE_CONSTANT_DISTANCE = new MutableCallSite(MethodType.methodType(int.class));
+    static final MethodHandle MUTABLE_CONSTANT_DISTANCE_HANDLE = MUTABLE_CONSTANT_DISTANCE.dynamicInvoker();
+
+    private MemorySegment ms; // Single segment used for both the loads and the stores.
+
+    @Setup
+    public void init() throws Throwable {
+        long totalSize = 4L * SIZE + 4L * DISTANCE; // bytes: 4 per int
+        long alignment = 4 * 1024; // 4k = page size
+        ms = Arena.ofAuto().allocate(totalSize, alignment);
+
+        // Point each call site at a constant handle holding the current @Param value.
+        MethodHandle offset_load_con = MethodHandles.constant(int.class, OFFSET_LOAD);
+        MUTABLE_CONSTANT_OFFSET_LOAD.setTarget(offset_load_con);
+        MethodHandle offset_store_con = MethodHandles.constant(int.class, OFFSET_STORE);
+        MUTABLE_CONSTANT_OFFSET_STORE.setTarget(offset_store_con);
+        MethodHandle distance_con = MethodHandles.constant(int.class, DISTANCE);
+        MUTABLE_CONSTANT_DISTANCE.setTarget(distance_con);
+    }
+
+    @CompilerControl(CompilerControl.Mode.INLINE)
+    private int offset_load_con() throws Throwable { // reads OFFSET_LOAD through its call site
+        return (int) MUTABLE_CONSTANT_OFFSET_LOAD_HANDLE.invokeExact();
+    }
+
+    @CompilerControl(CompilerControl.Mode.INLINE)
+    private int offset_store_con() throws Throwable { // reads OFFSET_STORE through its call site
+        return (int) MUTABLE_CONSTANT_OFFSET_STORE_HANDLE.invokeExact();
+    }
+
+    @CompilerControl(CompilerControl.Mode.INLINE)
+    private int distance_con() throws Throwable { // reads DISTANCE through its call site
+        return (int) MUTABLE_CONSTANT_DISTANCE_HANDLE.invokeExact();
+    }
+
+    @Benchmark
+    public void bench1L1S() throws Throwable { // one int load and one int store per iteration
+        int offset_load = offset_load_con();
+        int offset_store = offset_store_con();
+        int distance = distance_con();
+        // Note: the offsets and distance are compile-time constants, which means
+        //       we can already prove non-aliasing of loads and stores at compile
+        //       time, which allows vectorization even without any aliasing runtime
+        //       checks.
+        for (int i = 0; i < SIZE - /* slack for offset */ 32; i++) {
+            int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, 4L * i + 4L * offset_load + 4L * distance);
+            ms.set(ValueLayout.JAVA_INT_UNALIGNED, 4L * i + 4L * offset_store, v);
+        }
+    }
+}