diff --git a/.github/workflows/apple_m.yml b/.github/workflows/apple_m.yml deleted file mode 100644 index e34eada86b..0000000000 --- a/.github/workflows/apple_m.yml +++ /dev/null @@ -1,149 +0,0 @@ -name: apple m - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read # to fetch code (actions/checkout) - -jobs: - build: - if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: macos-14 - - strategy: - fail-fast: false - matrix: - build: [cmake, make] - fortran: [gfortran] - openmp: [0, 1] - ilp64: [0, 1] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Print system information - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - cat /proc/cpuinfo - elif [ "$RUNNER_OS" == "macOS" ]; then - sysctl -a | grep machdep.cpu - else - echo "::error::$RUNNER_OS not supported" - exit 1 - fi - - - name: Install Dependencies - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - sudo apt-get install -y gfortran cmake ccache libtinfo5 - elif [ "$RUNNER_OS" == "macOS" ]; then - # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. - brew reinstall gcc - brew install coreutils cmake ccache - brew install llvm - else - echo "::error::$RUNNER_OS not supported" - exit 1 - fi - - - name: Compilation cache - uses: actions/cache@v3 - with: - path: ~/.ccache - # We include the commit sha in the cache key, as new cache entries are - # only created if there is no existing entry for the key yet. - # GNU make and cmake call the compilers differently. It looks like - # that causes the cache to mismatch. Keep the ccache for both build - # tools separate to avoid polluting each other. - key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} - # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. 
- restore-keys: | - ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }} - ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }} - ccache-${{ runner.os }}-${{ matrix.build }} - - - name: Configure ccache - run: | - if [ "${{ matrix.build }}" = "make" ]; then - # Add ccache to path - if [ "$RUNNER_OS" = "Linux" ]; then - echo "/usr/lib/ccache" >> $GITHUB_PATH - elif [ "$RUNNER_OS" = "macOS" ]; then - echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH - echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH - echo "" >>$GITHUB_PATH - else - echo "::error::$RUNNER_OS not supported" - exit 1 - fi - fi - # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). - test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 300M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf - ccache -s - - - name: Build OpenBLAS - run: | - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - export CC="/opt/homebrew/opt/llvm/bin/clang" - case "${{ matrix.build }}" in - "make") - make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}" - ;; - "cmake") - export LDFLAGS="$LDFLAGS -Wl,-ld_classic" - mkdir build && cd build - cmake -DDYNAMIC_ARCH=1 \ - -DUSE_OPENMP=${{matrix.openmp}} \ - -DINTERFACE64=${{matrix.ilp64}} \ - -DNOFORTRAN=0 \ - -DBUILD_WITHOUT_LAPACK=0 \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ - .. - cmake --build . 
- ;; - *) - echo "::error::Configuration not supported" - exit 1 - ;; - esac - - - name: Show ccache status - continue-on-error: true - run: ccache -s - - - name: Run tests - timeout-minutes: 60 - run: | - case "${{ matrix.build }}" in - "make") - MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' - echo "::group::Tests in 'test' directory" - make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" - echo "::endgroup::" - echo "::group::Tests in 'ctest' directory" - make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" - echo "::endgroup::" - echo "::group::Tests in 'utest' directory" - make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" - echo "::endgroup::" - ;; - "cmake") - cd build && ctest - ;; - *) - echo "::error::Configuration not supported" - exit 1 - ;; - esac diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml deleted file mode 100644 index 6928312b56..0000000000 --- a/.github/workflows/arm64_graviton.yml +++ /dev/null @@ -1,139 +0,0 @@ -name: arm64 graviton cirun - -on: - push: - branches: - - develop - - release-** - pull_request: - branches: - - develop - - release-** - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read # to fetch code (actions/checkout) - -jobs: - build: - if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: "cirun-aws-runner-graviton--${{ github.run_id }}" - - strategy: - fail-fast: false - matrix: - fortran: [gfortran] - build: [cmake, make] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Print system information - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - cat /proc/cpuinfo - else - echo "::error::$RUNNER_OS not supported" - exit 1 - fi - - - name: Install Dependencies - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - sudo apt update - sudo apt-get install -y gfortran cmake ccache libtinfo5 - else - echo "::error::$RUNNER_OS not supported" - exit 1 - 
fi - - - name: Compilation cache - uses: actions/cache@v3 - with: - path: ~/.ccache - # We include the commit sha in the cache key, as new cache entries are - # only created if there is no existing entry for the key yet. - # GNU make and cmake call the compilers differently. It looks like - # that causes the cache to mismatch. Keep the ccache for both build - # tools separate to avoid polluting each other. - key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} - # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. - restore-keys: | - ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }} - ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }} - ccache-${{ runner.os }}-${{ matrix.build }} - - - name: Configure ccache - run: | - if [ "${{ matrix.build }}" = "make" ]; then - # Add ccache to path - if [ "$RUNNER_OS" = "Linux" ]; then - echo "/usr/lib/ccache" >> $GITHUB_PATH - else - echo "::error::$RUNNER_OS not supported" - exit 1 - fi - fi - # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). - test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 300M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf - ccache -s - - - name: Build OpenBLAS - run: | - case "${{ matrix.build }}" in - "make") - make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" - ;; - "cmake") - mkdir build && cd build - cmake -DDYNAMIC_ARCH=1 \ - -DNOFORTRAN=0 \ - -DBUILD_WITHOUT_LAPACK=0 \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ - .. - cmake --build . 
- ;; - *) - echo "::error::Configuration not supported" - exit 1 - ;; - esac - - - name: Show ccache status - continue-on-error: true - run: ccache -s - - - name: Run tests - timeout-minutes: 60 - run: | - case "${{ matrix.build }}" in - "make") - MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' - echo "::group::Tests in 'test' directory" - make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" - echo "::endgroup::" - echo "::group::Tests in 'ctest' directory" - make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" - echo "::endgroup::" - echo "::group::Tests in 'utest' directory" - make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" - echo "::endgroup::" - ;; - "cmake") - cd build && ctest - ;; - *) - echo "::error::Configuration not supported" - exit 1 - ;; - esac diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml deleted file mode 100644 index a47ca1dce6..0000000000 --- a/.github/workflows/c910v.yml +++ /dev/null @@ -1,127 +0,0 @@ -name: c910v qemu test - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read # to fetch code (actions/checkout) - -jobs: - TEST: - if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: ubuntu-latest - env: - xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618 - toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz - strategy: - fail-fast: false - matrix: - include: - - target: RISCV64_GENERIC - triple: riscv64-linux-gnu - apt_triple: riscv64-linux-gnu - opts: NO_SHARED=1 TARGET=RISCV64_GENERIC - - target: C910V - triple: riscv64-unknown-linux-gnu - apt_triple: riscv64-linux-gnu - opts: NO_SHARED=1 TARGET=C910V - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: install build deps - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build make 
ccache \ - gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross - - - name: checkout qemu - uses: actions/checkout@v3 - with: - repository: T-head-Semi/qemu - path: qemu - ref: 1e692ebb43d396c52352406323fc782c1ac99a42 - - - name: build qemu - run: | - # Force use c910v qemu-user - wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch - cd qemu - patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system - make -j$(nproc) - make install - - - name: Compilation cache - uses: actions/cache@v3 - with: - path: ~/.ccache - key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} - restore-keys: | - ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} - ccache-${{ runner.os }}-${{ matrix.target }} - - - name: Configure ccache - run: | - test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 300M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf - ccache -s - - - name: build OpenBLAS - run: | - wget ${xuetie_toolchain}/${toolchain_file_name} - tar -xvf ${toolchain_file_name} -C /opt - export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH" - - make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH - qemu-riscv64 ./utest/openblas_utest - qemu-riscv64 ./utest/openblas_utest_ext - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat1 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat2 < ./ctest/sin2 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat2 < ./ctest/din2 - 
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat2 < ./ctest/cin2 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat2 < ./ctest/zin2 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat3 < ./ctest/sin3 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat3 < ./ctest/din3 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat3 < ./ctest/cin3 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat3 < ./ctest/zin3 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat1 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat1 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat1 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat1 - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat1 - rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat - rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat - rm -f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat - rm 
-f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat diff --git a/.github/workflows/cirun-asv.yml b/.github/workflows/cirun-asv.yml new file mode 100644 index 0000000000..94f8adea83 --- /dev/null +++ b/.github/workflows/cirun-asv.yml @@ -0,0 +1,91 @@ +name: ASV Benchmark on arm64 graviton cirun + +on: + push: + branches: + - develop + - release-** + pull_request: + branches: + - develop + - release-** + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +env: + # GITHUB_TOKEN: ${{ secrets.OB_BENCH_TOKEN }} + # BENCHMARKS_REPO: ev-br/ob-bench-asv + ASV_CONFIG: asv.conf.json + MACHINE_NAME: github-actions-cirun-graviton # to identify github actions machine as hostname changes everytime + +jobs: + build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: "cirun-aws-runner-graviton--${{ github.run_id }}" + + strategy: + fail-fast: false + matrix: + fortran: [gfortran] + build: [make] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # To fetch all commits to be able to generate benchmarks html + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Install system dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt update + sudo apt-get install -y gfortran cmake ccache libtinfo5 python3-pip pkg-config + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Install python dependencies + run: | + pip3 install numpy meson meson-python ninja build asv virtualenv + # pip3 install scipy_openblas32 + # 
install the nightly OpenBLAS wheel + pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scipy-openblas32 + python3 -c'import scipy_openblas32 as so; print(so.get_pkg_config())' > scipy_openblas.pc + export PKG_CONFIG_PATH=$PWD + echo ">>>> PKG_CONFIG" $PKG_CONFIG_PATH + cat scipy_openblas.pc + + - name: Set and log asv machine configuration + run: | + cd benchmark/pybench/asv + python3 -m asv machine --yes --config asv.conf.json + echo "Machine Configuration:" + cat ~/.asv-machine.json + rm ~/.asv-machine.json + + echo "Setting machine name to $MACHINE_NAME" + python3 -m asv machine --machine $MACHINE_NAME --yes --config $ASV_CONFIG -v + cat ~/.asv-machine.json + + - name: Run benchmarks + run: | + echo ${{ github.workspace}} + cd benchmark/pybench/asv + python3 -m asv run --config $ASV_CONFIG -v + env: + PKG_CONFIG_PATH: ${{ github.workspace }} + diff --git a/.github/workflows/codspeed-bench.yml b/.github/workflows/codspeed-bench.yml deleted file mode 100644 index 04befefa9e..0000000000 --- a/.github/workflows/codspeed-bench.yml +++ /dev/null @@ -1,150 +0,0 @@ -name: Run codspeed benchmarks - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read # to fetch code (actions/checkout) - -jobs: - benchmarks: - if: "github.repository == 'OpenMathLib/OpenBLAS'" - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - fortran: [gfortran] - build: [make] - pyver: ["3.12"] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.pyver }} - - - name: Print system information - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - cat /proc/cpuinfo - fi - - - name: Install Dependencies - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - sudo apt-get update - sudo apt-get install -y gfortran cmake ccache libtinfo5 - else - echo "::error::$RUNNER_OS 
not supported" - exit 1 - fi - - - name: Compilation cache - uses: actions/cache@v3 - with: - path: ~/.ccache - # We include the commit sha in the cache key, as new cache entries are - # only created if there is no existing entry for the key yet. - # GNU make and cmake call the compilers differently. It looks like - # that causes the cache to mismatch. Keep the ccache for both build - # tools separate to avoid polluting each other. - key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} - # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. - restore-keys: | - ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }} - ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }} - ccache-${{ runner.os }}-${{ matrix.build }} - - - name: Write out the .pc - run: | - cd benchmark/pybench - cat > openblas.pc << EOF - libdir=${{ github.workspace }} - includedir= ${{ github.workspace }} - openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64 - version=0.0.99 - extralib=-lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas - Name: openblas - Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version - Version: ${version} - URL: https://github.com/xianyi/OpenBLAS - Libs: ${{ github.workspace }}/libopenblas.so -Wl,-rpath,${{ github.workspace }} - Libs.private: -lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas - Cflags: -I${{ github.workspace}} - EOF - cat openblas.pc - - - name: Configure ccache - run: | - if [ "${{ matrix.build }}" = "make" ]; then - # Add ccache to path - if [ "$RUNNER_OS" = "Linux" ]; then - echo "/usr/lib/ccache" >> $GITHUB_PATH - elif [ "$RUNNER_OS" = "macOS" ]; then - echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH - else - echo "::error::$RUNNER_OS not supported" - exit 1 - fi - fi - # Limit the maximum size and switch on 
compression to avoid exceeding the total disk or cache quota (5 GB). - test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 300M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf - ccache -s - - - name: Build OpenBLAS - run: | - case "${{ matrix.build }}" in - "make") - make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" - ;; - "cmake") - mkdir build && cd build - cmake -DDYNAMIC_ARCH=1 \ - -DNOFORTRAN=0 \ - -DBUILD_WITHOUT_LAPACK=0 \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ - .. - cmake --build . - ;; - *) - echo "::error::Configuration not supported" - exit 1 - ;; - esac - - - name: Show ccache status - continue-on-error: true - run: ccache -s - - - name: Install benchmark dependencies - run: pip install meson ninja numpy pytest pytest-codspeed --user - - - name: Build the wrapper - run: | - cd benchmark/pybench - export PKG_CONFIG_PATH=$PWD - meson setup build --prefix=$PWD/build-install - meson install -C build - # - # sanity check - cd build/openblas_wrap - python -c'import _flapack; print(dir(_flapack))' - - - name: Run benchmarks - uses: CodSpeedHQ/action@v2 - with: - token: ${{ secrets.CODSPEED_TOKEN }} - run: | - cd benchmark/pybench - export PYTHONPATH=$PWD/build-install/lib/python${{matrix.pyver}}/site-packages/ - OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py --codspeed - diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index 203097812f..0000000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: Publish docs via GitHub Pages -on: - push: - branches: - - develop -jobs: - build: - name: Deploy docs - if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - 
python-version: "3.10" - - run: pip install mkdocs mkdocs-material - # mkdocs gh-deploy command only builds to the top-level, hence building then deploying ourselves - - run: mkdocs build - - name: Deploy docs - uses: peaceiris/actions-gh-pages@v3 - if: ${{ github.ref == 'refs/heads/develop' }} - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./site - destination_dir: docs/ diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml deleted file mode 100644 index 669aa81168..0000000000 --- a/.github/workflows/dynamic_arch.yml +++ /dev/null @@ -1,371 +0,0 @@ -name: continuous build - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read # to fetch code (actions/checkout) - -jobs: - build: - if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: ${{ matrix.os }} - - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, macos-latest] - fortran: [gfortran, flang] - build: [cmake, make] - exclude: - - os: macos-latest - fortran: flang - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Print system information - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - cat /proc/cpuinfo - elif [ "$RUNNER_OS" == "macOS" ]; then - sysctl -a | grep machdep.cpu - else - echo "::error::$RUNNER_OS not supported" - exit 1 - fi - - - name: Install Dependencies - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - sudo apt-get update - sudo apt-get install -y gfortran cmake ccache libtinfo5 - elif [ "$RUNNER_OS" == "macOS" ]; then - # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. 
- brew reinstall gcc - brew install coreutils cmake ccache - else - echo "::error::$RUNNER_OS not supported" - exit 1 - fi - - - name: Compilation cache - uses: actions/cache@v3 - with: - path: ~/.ccache - # We include the commit sha in the cache key, as new cache entries are - # only created if there is no existing entry for the key yet. - # GNU make and cmake call the compilers differently. It looks like - # that causes the cache to mismatch. Keep the ccache for both build - # tools separate to avoid polluting each other. - key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} - # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. - restore-keys: | - ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }} - ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }} - ccache-${{ runner.os }}-${{ matrix.build }} - - - name: Configure ccache - run: | - if [ "${{ matrix.build }}" = "make" ]; then - # Add ccache to path - if [ "$RUNNER_OS" = "Linux" ]; then - echo "/usr/lib/ccache" >> $GITHUB_PATH - elif [ "$RUNNER_OS" = "macOS" ]; then - echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH - else - echo "::error::$RUNNER_OS not supported" - exit 1 - fi - fi - # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). 
- test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 300M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf - ccache -s - - - name: Build OpenBLAS - run: | - if [ "${{ matrix.fortran }}" = "flang" ]; then - # download and install classic flang - cd /usr/ - sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz - sudo tar xf flang-20190329-x86-70.tgz - sudo rm flang-20190329-x86-70.tgz - cd - - fi - case "${{ matrix.build }}" in - "make") - make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" - ;; - "cmake") - mkdir build && cd build - cmake -DDYNAMIC_ARCH=1 \ - -DNOFORTRAN=0 \ - -DBUILD_WITHOUT_LAPACK=0 \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ - .. - cmake --build . - ;; - *) - echo "::error::Configuration not supported" - exit 1 - ;; - esac - - - name: Show ccache status - continue-on-error: true - run: ccache -s - - - name: Run tests - timeout-minutes: 60 - run: | - case "${{ matrix.build }}" in - "make") - MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' - echo "::group::Tests in 'test' directory" - make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" - echo "::endgroup::" - echo "::group::Tests in 'ctest' directory" - make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" - echo "::endgroup::" - echo "::group::Tests in 'utest' directory" - make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" - echo "::endgroup::" - ;; - "cmake") - cd build && ctest - ;; - *) - echo "::error::Configuration not supported" - exit 1 - ;; - esac - - - msys2: - if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: windows-latest - - strategy: - fail-fast: false - matrix: - msystem: [UCRT64, MINGW32, CLANG64, CLANG32] - idx: [int32, int64] - build-type: [Release] - include: - - msystem: 
UCRT64 - idx: int32 - target-prefix: mingw-w64-ucrt-x86_64 - fc-pkg: fc - - msystem: MINGW32 - idx: int32 - target-prefix: mingw-w64-i686 - fc-pkg: fc - - msystem: CLANG64 - idx: int32 - target-prefix: mingw-w64-clang-x86_64 - fc-pkg: fc - # Compiling with Flang 16 seems to cause test errors on machines - # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. - no-avx512-flags: -DNO_AVX512=1 - - msystem: CLANG32 - idx: int32 - target-prefix: mingw-w64-clang-i686 - fc-pkg: cc - c-lapack-flags: -DC_LAPACK=ON - - msystem: UCRT64 - idx: int64 - idx64-flags: -DBINARY=64 -DINTERFACE64=1 - target-prefix: mingw-w64-ucrt-x86_64 - fc-pkg: fc - - msystem: CLANG64 - idx: int64 - idx64-flags: -DBINARY=64 -DINTERFACE64=1 - target-prefix: mingw-w64-clang-x86_64 - fc-pkg: fc - # Compiling with Flang 16 seems to cause test errors on machines - # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. - no-avx512-flags: -DNO_AVX512=1 - - msystem: UCRT64 - idx: int32 - target-prefix: mingw-w64-ucrt-x86_64 - fc-pkg: fc - build-type: None - exclude: - - msystem: MINGW32 - idx: int64 - - msystem: CLANG32 - idx: int64 - - defaults: - run: - # Use MSYS2 bash as default shell - shell: msys2 {0} - - env: - CHERE_INVOKING: 1 - - steps: - - name: Get CPU name - shell: pwsh - run : | - Get-CIMInstance -Class Win32_Processor | Select-Object -Property Name - - - name: Install build dependencies - uses: msys2/setup-msys2@v2 - with: - msystem: ${{ matrix.msystem }} - update: true - release: false # Use pre-installed version - install: >- - base-devel - ${{ matrix.target-prefix }}-cc - ${{ matrix.target-prefix }}-${{ matrix.fc-pkg }} - ${{ matrix.target-prefix }}-cmake - ${{ matrix.target-prefix }}-ninja - ${{ matrix.target-prefix }}-ccache - - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Prepare ccache - # Get cache location of ccache - # Create key that is used in action/cache/restore and action/cache/save steps - id: ccache-prepare - run: | - 
echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT - # We include the commit sha in the cache key, as new cache entries are - # only created if there is no existing entry for the key yet. - echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT - - - name: Restore ccache - uses: actions/cache/restore@v3 - with: - path: ${{ steps.ccache-prepare.outputs.ccachedir }} - key: ${{ steps.ccache-prepare.outputs.key }} - # Restore a matching ccache cache entry. Prefer same branch. - restore-keys: | - ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }} - ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }} - - - name: Configure ccache - # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. - run: | - which ccache - test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }} - echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf - echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf - ccache -p - ccache -s - echo $HOME - cygpath -w $HOME - - - name: Configure OpenBLAS - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \ - -DBUILD_SHARED_LIBS=ON \ - -DBUILD_STATIC_LIBS=ON \ - -DDYNAMIC_ARCH=ON \ - -DUSE_THREAD=ON \ - -DNUM_THREADS=64 \ - -DTARGET=CORE2 \ - ${{ matrix.idx64-flags }} \ - ${{ matrix.c-lapack-flags }} \ - ${{ matrix.no-avx512-flags }} \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ - .. - - - name: Build OpenBLAS - run: cd build && cmake --build . 
- - - name: Show ccache status - continue-on-error: true - run: ccache -s - - - name: Save ccache - # Save the cache after we are done (successfully) building - uses: actions/cache/save@v3 - with: - path: ${{ steps.ccache-prepare.outputs.ccachedir }} - key: ${{ steps.ccache-prepare.outputs.key }} - - - name: Run tests - id: run-ctest - timeout-minutes: 60 - run: cd build && ctest - - - name: Re-run tests - if: always() && (steps.run-ctest.outcome == 'failure') - timeout-minutes: 60 - run: | - cd build - echo "::group::Re-run ctest" - ctest --rerun-failed --output-on-failure || true - echo "::endgroup::" - echo "::group::Log from these tests" - [ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log - echo "::endgroup::" - - - cross_build: - if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: ubuntu-22.04 - - strategy: - fail-fast: false - matrix: - include: - - target: mips64el - triple: mips64el-linux-gnuabi64 - opts: DYNAMIC_ARCH=1 TARGET=GENERIC - - target: riscv64 - triple: riscv64-linux-gnu - opts: TARGET=RISCV64_GENERIC - - target: mipsel - triple: mipsel-linux-gnu - opts: TARGET=MIPS1004K - - target: alpha - triple: alpha-linux-gnu - opts: TARGET=EV4 - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Install Dependencies - run: | - sudo apt-get update - sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross - - - name: Compilation cache - uses: actions/cache@v3 - with: - path: ~/.ccache - key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} - restore-keys: | - ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} - ccache-${{ runner.os }}-${{ matrix.target }} - - - name: Configure ccache - run: | - # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). 
- test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 300M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf - ccache -s - - - - name: Build OpenBLAS - run: | - make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }} diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml deleted file mode 100644 index da7f6c9a0c..0000000000 --- a/.github/workflows/loongarch64.yml +++ /dev/null @@ -1,133 +0,0 @@ -name: loongarch64 qemu test - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - TEST: - if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - include: - - target: LOONGSONGENERIC - triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC - - target: LOONGSON3R5 - triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 - - target: LOONGSON2K1000 - triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 - - target: DYNAMIC_ARCH - triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Install APT deps - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build make ccache - - - name: Download and install loongarch64-toolchain - run: | - wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz - #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz - tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt - - 
- name: Checkout qemu - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: master - - - name: Install qemu - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=loongarch64-linux-user --disable-system --static - make -j$(nproc) - make install - - - name: Set env - run: | - echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV - - - name: Compilation cache - uses: actions/cache@v3 - with: - path: ~/.ccache - key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} - restore-keys: | - ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} - ccache-${{ runner.os }}-${{ matrix.target }} - - - name: Configure ccache - run: | - test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 300M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf - ccache -s - - - name: Disable utest dsdot:dsdot_n_1 - run: | - echo -n > utest/test_dsdot.c - echo "Due to the qemu versions 7.2 causing utest cases to fail," - echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." 
- - - name: Build OpenBLAS - run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) - - - name: Test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH - qemu-loongarch64 ./utest/openblas_utest - qemu-loongarch64 ./utest/openblas_utest_ext - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat2 < ./ctest/sin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat2 < ./ctest/din2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat2 < ./ctest/cin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat2 < ./ctest/zin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat3 < ./ctest/sin3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat3 < ./ctest/din3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat3 < ./ctest/cin3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat3 < ./ctest/zin3 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat1 - rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat2 < 
./test/cblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat - rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat - rm -f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat - rm -f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat diff --git a/.github/workflows/loongarch64_clang.yml b/.github/workflows/loongarch64_clang.yml deleted file mode 100644 index d08e56f627..0000000000 --- a/.github/workflows/loongarch64_clang.yml +++ /dev/null @@ -1,135 +0,0 @@ -name: loongarch64 clang qemu test - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - TEST: - if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - include: - - target: LOONGSONGENERIC - opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC - - target: LOONGSON3R5 - opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 - - target: LOONGSON2K1000 - opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 - - target: 
DYNAMIC_ARCH - opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Install libffi6 - run: | - wget http://ftp.ca.debian.org/debian/pool/main/libf/libffi/libffi6_3.2.1-9_amd64.deb - sudo dpkg -i libffi6_3.2.1-9_amd64.deb - - - name: Install APT deps - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build make ccache - - - name: Download and install loongarch64-toolchain - run: | - wget https://github.com/XiWeiGu/loongarch64_toolchain/releases/download/V0.1/clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10.tar.gz - wget https://github.com/XiWeiGu/loongarch64_toolchain/releases/download/V0.1/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3.tar.xz - tar -xf clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10.tar.gz -C /opt - tar -xf loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3.tar.xz -C /opt - - - name: Checkout qemu - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: master - - - name: Install qemu - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=loongarch64-linux-user --disable-system --static - make -j$(nproc) - make install - - - name: Set env - run: | - echo "PATH=$GITHUB_WORKSPACE:/opt/clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10/bin:/opt/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3/bin:$PATH" >> $GITHUB_ENV - - - name: Compilation cache - uses: actions/cache@v3 - with: - path: ~/.ccache - key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} - restore-keys: | - ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} - ccache-${{ runner.os }}-${{ matrix.target }} - - - name: Configure ccache - run: | - test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 300M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf - ccache -s - - - name: Disable utest dsdot:dsdot_n_1 - 
run: | - echo -n > utest/test_dsdot.c - echo "Due to the qemu versions 7.2 causing utest cases to fail," - echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." - - - name: Build OpenBLAS - run: make CC='ccache clang --target=loongarch64-linux-gnu --sysroot=/opt/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3/loongarch64-linux-gnu/sysroot/ -static' FC='ccache loongarch64-linux-gnu-gfortran -static' HOSTCC='ccache clang' CROSS_SUFFIX=llvm- NO_SHARED=1 ${{ matrix.opts }} -j$(nproc) - - - name: Test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH - qemu-loongarch64 ./utest/openblas_utest - qemu-loongarch64 ./utest/openblas_utest_ext - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat2 < ./ctest/sin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat2 < ./ctest/din2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat2 < ./ctest/cin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat2 < ./ctest/zin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat3 < ./ctest/sin3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat3 < ./ctest/din3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat3 < ./ctest/cin3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat3 < ./ctest/zin3 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat1 - OPENBLAS_NUM_THREADS=2 
qemu-loongarch64 ./test/zblat1 - rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat - rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat - rm -f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat - rm -f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat - diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml deleted file mode 100644 index 1491aff78b..0000000000 --- a/.github/workflows/mips64.yml +++ /dev/null @@ -1,123 +0,0 @@ -name: mips64 qemu test - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read # to fetch code (actions/checkout) - -jobs: - TEST: - if: "github.repository == 'OpenMathLib/OpenBLAS'" 
- runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - include: - - target: MIPS64_GENERIC - triple: mips64el-linux-gnuabi64 - opts: NO_SHARED=1 TARGET=MIPS64_GENERIC - - target: SICORTEX - triple: mips64el-linux-gnuabi64 - opts: NO_SHARED=1 TARGET=SICORTEX - - target: I6400 - triple: mipsisa64r6el-linux-gnuabi64 - opts: NO_SHARED=1 TARGET=I6400 - - target: P6600 - triple: mipsisa64r6el-linux-gnuabi64 - opts: NO_SHARED=1 TARGET=P6600 - - target: I6500 - triple: mipsisa64r6el-linux-gnuabi64 - opts: NO_SHARED=1 TARGET=I6500 - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: install build deps - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ - gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross - - - name: checkout qemu - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 - - - name: build qemu - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system - make -j$(nproc) - make install - - - name: Compilation cache - uses: actions/cache@v3 - with: - path: ~/.ccache - key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} - restore-keys: | - ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} - ccache-${{ runner.os }}-${{ matrix.target }} - - - name: Configure ccache - run: | - test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 300M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf - ccache -s - - - name: build OpenBLAS - run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH - qemu-mips64el ./utest/openblas_utest - qemu-mips64el 
./utest/openblas_utest_ext - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat1 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat2 < ./ctest/sin2 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat2 < ./ctest/din2 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat2 < ./ctest/cin2 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat2 < ./ctest/zin2 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat3 < ./ctest/sin3 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat3 < ./ctest/din3 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat3 < ./ctest/cin3 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat3 < ./ctest/zin3 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat1 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat1 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat1 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat1 - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat1 - rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat - rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-mips64el 
./test/zblat2 < ./test/zblat2.dat - rm -f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat - rm -f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml deleted file mode 100644 index ca57fba709..0000000000 --- a/.github/workflows/nightly-Homebrew-build.yml +++ /dev/null @@ -1,90 +0,0 @@ -# Only the "head" branch of the OpenBLAS package is tested - -on: - push: - paths: - - '**/nightly-Homebrew-build.yml' - pull_request: - branches: - - develop - paths: - - '**/nightly-Homebrew-build.yml' - schedule: - - cron: 45 7 * * * -# This is 7:45 AM UTC daily, late at night in the USA - -# Since push and pull_request will still always be building and testing the `develop` branch, -# it only makes sense to test if this file has been changed - -name: Nightly-Homebrew-Build - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read # to fetch code (actions/checkout) - -jobs: - build-OpenBLAS-with-Homebrew: - if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: macos-latest - env: - DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer - HOMEBREW_DEVELOPER: "ON" - HOMEBREW_DISPLAY_INSTALL_TIMES: "ON" - HOMEBREW_NO_ANALYTICS: "ON" - HOMEBREW_NO_AUTO_UPDATE: "ON" - 
HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON" - HOMEBREW_NO_INSTALL_CLEANUP: "ON" - HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK: "ON" - HOMEBREW_NO_INSTALL_FROM_API: "ON" - - steps: - - name: Random delay for cron job - run: | - delay=$(( RANDOM % 600 )) - printf 'Delaying for %s seconds on event %s' ${delay} "${{ github.event_name }}" - sleep ${delay} - if: github.event_name == 'schedule' - - - uses: actions/checkout@v2 - # This isn't even needed, technically. Homebrew will get `develop` via git - - - name: Update Homebrew - if: github.event_name != 'pull_request' - run: brew update || true - - - name: Install prerequisites - run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas - - - name: Install and bottle OpenBLAS - run: brew install --fetch-HEAD --HEAD --build-bottle --keep-tmp openblas - # the HEAD flags tell Homebrew to build the develop branch fetch via git - - - name: Create bottle - run: | - brew bottle -v openblas - mkdir bottles - mv *.bottle.tar.gz bottles - - - name: Upload bottle - uses: actions/upload-artifact@v1 - with: - name: openblas--HEAD.catalina.bottle.tar.gz - path: bottles - - - name: Show linkage - run: brew linkage -v openblas - - - name: Test openblas - run: brew test --HEAD --verbose openblas - - - name: Audit openblas formula - run: | - brew audit --strict openblas - brew cat openblas - - - name: Post logs on failure - if: failure() - run: brew gist-logs --with-hostname -v openblas diff --git a/.github/workflows/riscv64_vector.yml b/.github/workflows/riscv64_vector.yml deleted file mode 100644 index dd6fe9ca80..0000000000 --- a/.github/workflows/riscv64_vector.yml +++ /dev/null @@ -1,253 +0,0 @@ -name: riscv64 zvl256b qemu test - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read # to fetch code (actions/checkout) - -jobs: - TEST: - if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: 
ubuntu-latest - env: - triple: riscv64-unknown-linux-gnu - riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain - riscv_gnu_toolchain_version: 13.2.0 - riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz - strategy: - fail-fast: false - matrix: - include: - - target: RISCV64_ZVL128B - opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64 - qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64 - - target: RISCV64_ZVL256B - opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64 - qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: install build deps - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build make \ - libgomp1-riscv64-cross ccache - wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path} - tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt - - - name: Compilation cache - uses: actions/cache@v3 - with: - path: ~/.ccache - key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} - restore-keys: | - ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} - ccache-${{ runner.os }}-${{ matrix.target }} - - - name: Configure ccache - run: | - test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 300M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf - ccache -s - - - name: build OpenBLAS libs - run: | - export PATH="/opt/riscv/bin:$PATH" - make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ - CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \ - AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ - RANLIB='ccache 
${triple}-ranlib' \ - FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ - HOSTCC=gcc HOSTFC=gfortran -j$(nproc) - - - name: build OpenBLAS tests - run: | - export PATH="/opt/riscv/bin:$PATH" - make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ - CC='${triple}-gcc' \ - AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ - RANLIB='ccache ${triple}-ranlib' \ - FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ - HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests - - - name: build lapack-netlib tests - working-directory: ./lapack-netlib/TESTING - run: | - export PATH="/opt/riscv/bin:$PATH" - make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ - CC='${triple}-gcc' \ - AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ - RANLIB='ccache ${triple}-ranlib' \ - FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ - HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \ - LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \ - LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \ - LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \ - - - name: OpenBLAS tests - shell: bash - run: | - export PATH="/opt/riscv/bin:$PATH" - export QEMU_CPU=${{ matrix.qemu_cpu }} - rm -rf ./test_out - mkdir -p ./test_out - run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; \ - echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \ - if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee $OUTPUT ; \ - else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee $OUTPUT ; fi ; \ - RV=$? 
; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \ - } - run_test test cblat1 & - run_test test cblat2 cblat2.dat & - run_test test cblat3 cblat3.dat & - run_test test dblat1 & - run_test test dblat2 dblat2.dat & - run_test test dblat3 dblat3.dat & - run_test test sblat1 & - run_test test sblat2 sblat2.dat & - run_test test sblat3 sblat3.dat & - run_test test zblat1 & - run_test test zblat2 zblat2.dat & - run_test test zblat3 zblat3.dat & - run_test ctest xccblat1 & - run_test ctest xccblat2 cin2 & - run_test ctest xccblat3 cin3 & - run_test ctest xdcblat1 & - run_test ctest xdcblat2 din2 & - run_test ctest xdcblat3 din3 & - run_test ctest xscblat1 & - run_test ctest xscblat2 sin2 & - run_test ctest xscblat3 sin3 & - run_test ctest xzcblat1 & - run_test ctest xzcblat2 zin2 & - run_test ctest xzcblat3 zin3 & - wait - while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) - if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi - - - name: netlib tests - shell: bash - run: | - : # these take a very long time - echo "Skipping netlib tests in CI" - exit 0 - : # comment out exit above to enable the tests - : # probably we want to identify a subset to run in CI - export PATH="/opt/riscv/bin:$PATH" - export QEMU_CPU=${{ matrix.qemu_cpu }} - rm -rf ./test_out - mkdir -p ./test_out - run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; \ - echo "$4" >> $OUTPUT; \ - echo "$CMD" >> $OUTPUT; \ - qemu-riscv64 $CMD < $DATA |& tee $OUTPUT; \ - RV=$? 
; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \ - if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \ - if grep -q rror $OUTPUT | grep -v -q "passed" | grep -v "largest error" ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \ - } - run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" & - run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" & - run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" & - run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" & - run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" & - run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" & - run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" & - run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" & - run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" & - run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" & - run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & - run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & - run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & - run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" & - run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" & - run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" & - 
run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" & - run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" & - run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" & - run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" & - run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" & - run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" & - run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" & - run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" & - run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" & - run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" & - run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" & - run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" & - run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" & - run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" & - run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & - run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & - run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & - run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" & - run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" & - run_test ced.out ced.in 
EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" & - run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" & - run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" & - run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" & - run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" & - run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" & - run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" & - run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & - run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" & - run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" & - run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" & - run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" & - run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" & - run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" & - run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" & - run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & - run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & - run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & - run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" & - run_test dec.out dec.in 
EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" & - run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" & - run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" & - run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" & - run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" & - run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" & - run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" & - run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" & - run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" & - run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" & - run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" & - run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" & - run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" & - run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" & - run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" & - run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" & - run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & - run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & - run_test zse2.out se2.in 
EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & - run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" & - run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" & - run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" & - run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" & - run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" & - run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" & - run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" & - run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" & - run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" & - run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & - run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" & - run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" & - run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" & - run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" & - run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" & - run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" & - run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" & - wait - while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) - python 
./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary - TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)" - NUMERICAL_ERRORS=-1 - OTHER_ERRORS=-1 - . <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary - if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi - if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi diff --git a/benchmark/pybench/asv/README.md b/benchmark/pybench/asv/README.md new file mode 100644 index 0000000000..1436e08d30 --- /dev/null +++ b/benchmark/pybench/asv/README.md @@ -0,0 +1 @@ +Benchmark graphs are at https://ev-br.github.io/ob-bench-asv/ diff --git a/benchmark/pybench/asv/asv.conf.json b/benchmark/pybench/asv/asv.conf.json new file mode 100644 index 0000000000..8323173318 --- /dev/null +++ b/benchmark/pybench/asv/asv.conf.json @@ -0,0 +1,195 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "OpenBLAS", + + // The project's homepage + "project_url": "https://github.com/OpenMathLib/OpenBLAS/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "../../..", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "benchmarks/pybench/asv", + + // Customizable commands for building the project. + // See asv.conf.json documentation. + // To build the package using pyproject.toml (PEP518), uncomment the following lines + "build_command": [ + "python -m pip install build", + "python -m build", + // https://github.com/scipy/scipy/issues/20574 + "PIP_NO_BUILD_ISOLATION=false python -m pip wheel . 
--no-deps --no-index -w {build_cache_dir} {build_dir}" + ], + // To build the package using setuptools and a setup.py file, uncomment the following lines + // "build_command": [ + // "python setup.py build", + // "python -mpip wheel -w {build_cache_dir} {build_dir}" + // ], + + // Customizable commands for installing and uninstalling the project. + // See asv.conf.json documentation. + "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], + "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + + // List of branches to benchmark. If not provided, defaults to "main" + // (for git) or "default" (for mercurial). + "branches": ["develop"], // for git + // "branches": ["default"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv", "mamba" (above 3.8) + // or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "virtualenv", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + //"install_timeout": 600, + + // the base URL to show a commit for the project. + // "show_commit_url": "http://github.com/owner/project/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["3.8", "3.12"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + // "conda_channels": ["conda-forge", "defaults"], + + // A conda environment file that is used for environment creation. 
+ // "conda_environment_file": "environment.yml", + + // The matrix of dependencies to test. Each key of the "req" + // requirements dictionary is the name of a package (in PyPI) and + // the values are version numbers. An empty list or empty string + // indicates to just test against the default (latest) + // version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed + // via pip (with all the conda available packages installed first, + // followed by the pip installed packages). + // + // The ``@env`` and ``@env_nobuild`` keys contain the matrix of + // environment variables to pass to build and benchmark commands. + // An environment will be created for every combination of the + // cartesian product of the "@env" variables in this matrix. + // Variables in "@env_nobuild" will be passed to every environment + // during the benchmark phase, but will not trigger creation of + // new environments. A value of ``null`` means that the variable + // will not be set for the current combination. + // + // "matrix": { + // "req": { + // "numpy": ["1.6", "1.7"], + // "six": ["", null], // test with and without six installed + // "pip+emcee": [""] // emcee is only available for install with pip. + // }, + // "env": {"ENV_VAR_1": ["val1", "val2"]}, + // "env_nobuild": {"ENV_VAR_2": ["val3", null]}, + // }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. 
The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. + // - req + // Required packages + // - env + // Environment variables + // - env_nobuild + // Non-build environment variables + // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "req": {"six": null}}, // don't run without six on conda + // {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1 + // ], + // + // "include": [ + // // additional env for python3.12 + // {"python": "3.12", "req": {"numpy": "1.26"}, "env_nobuild": {"FOO": "123"}}, + // // additional env if run on windows+conda + // {"platform": "win32", "environment_type": "conda", "python": "3.12", "req": {"libpython": ""}}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": ".asv/html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. 
This is + // the number of builds to keep, per environment. + // "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. + // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} diff --git a/benchmark/pybench/asv/benchmarks/__init__.py b/benchmark/pybench/asv/benchmarks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmark/pybench/asv/benchmarks/benchmarks.py b/benchmark/pybench/asv/benchmarks/benchmarks.py new file mode 100644 index 0000000000..120615a57c --- /dev/null +++ b/benchmark/pybench/asv/benchmarks/benchmarks.py @@ -0,0 +1,258 @@ +# Write the benchmarking functions here. +# See "Writing benchmarks" in the asv docs for more information. + +''' +class TimeSuite: + """ + An example benchmark that times the performance of various kinds + of iterating over dictionaries in Python. 
+ """ + def setup(self): + self.d = {} + for x in range(500): + self.d[x] = None + + def time_keys(self): + for key in self.d.keys(): + pass + + def time_values(self): + for value in self.d.values(): + pass + + def time_range(self): + d = self.d + for key in range(500): + d[key] + + +class MemSuite: + def mem_list(self): + return [0] * 256 +''' + + +import numpy as np +from openblas_wrap import ( + # level 1 + dnrm2, ddot, daxpy, + # level 3 + dgemm, dsyrk, + # lapack + dgesv, # linalg.solve + dgesdd, dgesdd_lwork, # linalg.svd + dsyev, dsyev_lwork, # linalg.eigh +) + +# ### BLAS level 1 ### + +# dnrm2 + +dnrm2_sizes = [100, 1000] + +def run_dnrm2(n, x, incx): + res = dnrm2(x, n, incx=incx) + return res + + + +class Nrm2: + + params = [100, 1000] + param_names = ["size"] + + def setup(self, n): + rndm = np.random.RandomState(1234) + self.x = rndm.uniform(size=(n,)).astype(float) + + def time_dnrm2(self, n): + run_dnrm2(n, self.x, 1) + + +# ddot + +ddot_sizes = [100, 1000] + +def run_ddot(x, y,): + res = ddot(x, y) + return res + + +class DDot: + params = ddot_sizes + param_names = ["size"] + + def setup(self, n): + rndm = np.random.RandomState(1234) + self.x = np.array(rndm.uniform(size=(n,)), dtype=float) + self.y = np.array(rndm.uniform(size=(n,)), dtype=float) + + def time_ddot(self, n): + run_ddot(self.x, self.y) + + + +# daxpy + +daxpy_sizes = [100, 1000] + +def run_daxpy(x, y,): + res = daxpy(x, y, a=2.0) + return res + + +class Daxpy: + params = daxpy_sizes + param_names = ["size"] + + def setup(self, n): + rndm = np.random.RandomState(1234) + self.x = np.array(rndm.uniform(size=(n,)), dtype=float) + self.y = np.array(rndm.uniform(size=(n,)), dtype=float) + + def time_daxpy(self, n): + run_daxpy(self.x, self.y) + + + +# ### BLAS level 3 ### + +# dgemm + +gemm_sizes = [100, 1000] + +def run_dgemm(a, b, c): + alpha = 1.0 + res = dgemm(alpha, a, b, c=c, overwrite_c=True) + return res + + +class Dgemm: + params = gemm_sizes + param_names = ["size"] + + def 
setup(self, n): + rndm = np.random.RandomState(1234) + self.a = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + self.b = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + self.c = np.empty((n, n), dtype=float, order='F') + + def time_dgemm(self, n): + run_dgemm(self.a, self.b, self.c) + + +# dsyrk + +syrk_sizes = [100, 1000] + + +def run_dsyrk(a, c): + res = dsyrk(1.0, a, c=c, overwrite_c=True) + return res + + +class DSyrk: + params = syrk_sizes + param_names = ["size"] + + def setup(self, n): + rndm = np.random.RandomState(1234) + self.a = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + self.c = np.empty((n, n), dtype=float, order='F') + + def time_dsyrk(self, n): + run_dsyrk(self.a, self.c) + + +# ### LAPACK ### + +# linalg.solve + +dgesv_sizes = [100, 1000] + + +def run_dgesv(a, b): + res = dgesv(a, b, overwrite_a=True, overwrite_b=True) + return res + + +class Dgesv: + params = dgesv_sizes + param_names = ["size"] + + def setup(self, n): + rndm = np.random.RandomState(1234) + self.a = (np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + + np.eye(n, order='F')) + self.b = np.array(rndm.uniform(size=(n, 1)), order='F') + + def time_dgesv(self, n): + run_dgesv(self.a, self.b) + + # XXX: how to run asserts? 
+ # lu, piv, x, info = benchmark(run_gesv, a, b) + # assert lu is a + # assert x is b + # assert info == 0 + + +# linalg.svd + +dgesdd_sizes = ["100, 5", "1000, 222"] + + +def run_dgesdd(a, lwork): + res = dgesdd(a, lwork=lwork, full_matrices=False, overwrite_a=False) + return res + + +class Dgesdd: + params = dgesdd_sizes + param_names = ["(m, n)"] + + def setup(self, mn): + m, n = (int(x) for x in mn.split(",")) + + rndm = np.random.RandomState(1234) + a = np.array(rndm.uniform(size=(m, n)), dtype=float, order='F') + + lwork, info = dgesdd_lwork(m, n) + lwork = int(lwork) + assert info == 0 + + self.a, self.lwork = a, lwork + + def time_dgesdd(self, mn): + run_dgesdd(self.a, self.lwork) + + +# linalg.eigh + +dsyev_sizes = [50, 200] + + +def run_dsyev(a, lwork): + res = dsyev(a, lwork=lwork, overwrite_a=True) + return res + + +class Dsyev: + params = dsyev_sizes + param_names = ["size"] + + def setup(self, n): + rndm = np.random.RandomState(1234) + a = rndm.uniform(size=(n, n)) + a = np.asarray(a + a.T, dtype=float, order='F') + a_ = a.copy() + + lwork, info = dsyev_lwork(n) + lwork = int(lwork) + assert info == 0 + + self.a = a_ + self.lwork = lwork + + def time_dsyev(self, n): + run_dsyev(self.a, self.lwork) + diff --git a/benchmark/pybench/asv/meson.build b/benchmark/pybench/asv/meson.build new file mode 100644 index 0000000000..88d05be7d7 --- /dev/null +++ b/benchmark/pybench/asv/meson.build @@ -0,0 +1,48 @@ +# +# Taken from SciPy (of course) +# +project( + 'openblas-wrap', + 'c', 'fortran', + version: '0.1', + license: 'BSD-3', + meson_version: '>= 1.1.0', + default_options: [ + 'buildtype=debugoptimized', + 'b_ndebug=if-release', + 'c_std=c17', + 'fortran_std=legacy', + ], +) + +py3 = import('python').find_installation(pure: false) +py3_dep = py3.dependency() + +cc = meson.get_compiler('c') + +_global_c_args = cc.get_supported_arguments( + '-Wno-unused-but-set-variable', + '-Wno-unused-function', + '-Wno-conversion', + '-Wno-misleading-indentation', +) 
+add_project_arguments(_global_c_args, language : 'c') + +# We need -lm for all C code (assuming it uses math functions, which is safe to +# assume for SciPy). For C++ it isn't needed, because libstdc++/libc++ is +# guaranteed to depend on it. For Fortran code, Meson already adds `-lm`. +m_dep = cc.find_library('m', required : false) +if m_dep.found() + add_project_link_arguments('-lm', language : 'c') +endif + +generate_f2pymod = find_program('openblas_wrap/generate_f2pymod.py') + +openblas = dependency('scipy_openblas', method: 'pkg-config', required: true) +openblas_dep = declare_dependency( + dependencies: openblas, + compile_args: [] +) + + +subdir('openblas_wrap') diff --git a/benchmark/pybench/asv/openblas_wrap/__init__.py b/benchmark/pybench/asv/openblas_wrap/__init__.py new file mode 100644 index 0000000000..9958423ac3 --- /dev/null +++ b/benchmark/pybench/asv/openblas_wrap/__init__.py @@ -0,0 +1,29 @@ +""" +Trampoline to hide the LAPACK details (scipy.lapack.linalg or scipy_openblas32 or...) +from benchmarking. +""" + +__version__ = "0.1" + +import scipy_openblas32 # preload symbols. 
typically done in _distributor_init.py + +#from scipy.linalg.blas import ( +from ._flapack import ( + # level 1 + scipy_dnrm2 as dnrm2, + scipy_ddot as ddot, + scipy_daxpy as daxpy, + # level 3 + scipy_dgemm as dgemm, + scipy_dsyrk as dsyrk, +) + +#from scipy.linalg.lapack import ( +from openblas_wrap._flapack import ( + # linalg.solve + scipy_dgesv as dgesv, + # linalg.svd + scipy_dgesdd as dgesdd, scipy_dgesdd_lwork as dgesdd_lwork, + # linalg.eigh + scipy_dsyev as dsyev, scipy_dsyev_lwork as dsyev_lwork +) diff --git a/benchmark/pybench/asv/openblas_wrap/_distributor_init.py b/benchmark/pybench/asv/openblas_wrap/_distributor_init.py new file mode 100644 index 0000000000..1779095c75 --- /dev/null +++ b/benchmark/pybench/asv/openblas_wrap/_distributor_init.py @@ -0,0 +1,4 @@ +''' +Helper to preload OpenBLAS from scipy_openblas32 +''' +import scipy_openblas32 diff --git a/benchmark/pybench/asv/openblas_wrap/blas_lapack.pyf.src b/benchmark/pybench/asv/openblas_wrap/blas_lapack.pyf.src new file mode 100644 index 0000000000..76278e0a30 --- /dev/null +++ b/benchmark/pybench/asv/openblas_wrap/blas_lapack.pyf.src @@ -0,0 +1,327 @@ +! +! Taken from scipy/linalg +! +! Shorthand notations +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! Level 1 BLAS +! + + +python module _flapack + usercode ''' +#define F_INT int +''' + +interface + + +subroutine axpy(n,a,x,offx,incx,y,offy,incy) + ! Calculate z = a*x+y, where a is scalar. 
+ + callstatement (*f2py_func)(&n,&a,x+offx,&incx,y+offy,&incy) + callprotoargument F_INT*,*,*,F_INT*,*,F_INT* + + dimension(*), intent(in) :: x + dimension(*), intent(in,out,out=z) :: y + optional, intent(in):: a=<1.0,\0,(1.0\,0.0),\2> + integer optional, intent(in),check(incx>0||incx<0) :: incx = 1 + integer optional, intent(in),check(incy>0||incy<0) :: incy = 1 + integer optional, intent(in),depend(x) :: offx=0 + integer optional, intent(in),depend(y) :: offy=0 + check(offx>=0 && offx=0 && offy(n-1)*abs(incx)) :: n + check(len(y)-offy>(n-1)*abs(incy)) :: n + +end subroutine axpy + +function scipy_ddot(n,x,offx,incx,y,offy,incy) result (xy) + ! Computes a vector-vector dot product. + + callstatement scipy_ddot_return_value = (*f2py_func)(&n,x+offx,&incx,y+offy,&incy) + callprotoargument F_INT*,double*,F_INT*,double*,F_INT* + intent(c) scipy_ddot + fortranname F_FUNC(scipy_ddot,DDOT) + + double precision dimension(*), intent(in) :: x + double precision dimension(*), intent(in) :: y + double precision ddot,xy + integer optional, intent(in),check(incx>0||incx<0) :: incx = 1 + integer optional, intent(in),check(incy>0||incy<0) :: incy = 1 + integer optional, intent(in),depend(x) :: offx=0 + integer optional, intent(in),depend(y) :: offy=0 + check(offx>=0 && offx=0 && offy(n-1)*abs(incx)) :: n + check(len(y)-offy>(n-1)*abs(incy)) :: n + +end function scipy_ddot + + +function nrm2(n,x,offx,incx) result(n2) + + nrm2, n2 + + callstatement nrm2_return_value = (*f2py_func)(&n,x+offx,&incx) + callprotoargument F_INT*,*,F_INT* + intent(c) nrm2 + fortranname F_FUNC(nrm2,NRM2) + + dimension(*),intent(in) :: x + + integer optional, intent(in),check(incx>0) :: incx = 1 + + integer optional,intent(in),depend(x) :: offx=0 + check(offx>=0 && offx(n-1)*abs(incx)) :: n + +end function nrm2 + +! +! Level 3 BLAS +! + + +subroutine gemm(m,n,k,alpha,a,b,beta,c,trans_a,trans_b,lda,ka,ldb,kb) + ! Computes a scalar-matrix-matrix product and adds the result to a + ! scalar-matrix product. 
+ ! + ! c = gemm(alpha,a,b,beta=0,c=0,trans_a=0,trans_b=0,overwrite_c=0) + ! Calculate C <- alpha * op(A) * op(B) + beta * C + + callstatement (*f2py_func)((trans_a?(trans_a==2?"C":"T"):"N"), & + (trans_b?(trans_b==2?"C":"T"):"N"),&m,&n,&k,&alpha,a,&lda,b,&ldb,&beta,c,&m) + callprotoargument char*,char*,F_INT*,F_INT*,F_INT*,*,*,F_INT*,*, & + F_INT*,*,*,F_INT* + + integer optional,intent(in),check(trans_a>=0 && trans_a <=2) :: trans_a = 0 + integer optional,intent(in),check(trans_b>=0 && trans_b <=2) :: trans_b = 0 + intent(in) :: alpha + intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2> + + dimension(lda,ka),intent(in) :: a + dimension(ldb,kb),intent(in) :: b + dimension(m,n),intent(in,out,copy),depend(m,n),optional :: c + check(shape(c,0)==m && shape(c,1)==n) :: c + + integer depend(a),intent(hide) :: lda = shape(a,0) + integer depend(a),intent(hide) :: ka = shape(a,1) + integer depend(b),intent(hide) :: ldb = shape(b,0) + integer depend(b),intent(hide) :: kb = shape(b,1) + + integer depend(a,trans_a,ka,lda),intent(hide):: m = (trans_a?ka:lda) + integer depend(a,trans_a,ka,lda),intent(hide):: k = (trans_a?lda:ka) + integer depend(b,trans_b,kb,ldb,k),intent(hide),check(trans_b?kb==k:ldb==k) :: & + n = (trans_b?ldb:kb) + +end subroutine gemm + + +subroutine rk(n,k,alpha,a,beta,c,trans,lower,lda,ka) + ! performs one of the symmetric rank k operations + ! C := alpha*A*A**T + beta*C, or C := alpha*A**T*A + beta*C, + ! + ! c = syrk(alpha,a,beta=0,c=0,trans=0,lower=0,overwrite_c=0) + ! 
+ callstatement (*f2py_func)((lower?"L":"U"), & + (trans?(trans==2?"C":"T"):"N"), &n,&k,&alpha,a,&lda,&beta,c,&n) + callprotoargument char*,char*,F_INT*,F_INT*,*,*,F_INT*,*, & + *,F_INT* + + integer optional, intent(in),check(lower==0||lower==1) :: lower = 0 + integer optional,intent(in),check(trans>=0 && trans <=2) :: trans = 0 + + intent(in) :: alpha + intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2,\2,\2> + + dimension(lda,ka),intent(in) :: a + dimension(n,n),intent(in,out,copy),depend(n),optional :: c + check(shape(c,0)==n && shape(c,1)==n) :: c + + integer depend(a),intent(hide) :: lda = shape(a,0) + integer depend(a),intent(hide) :: ka = shape(a,1) + + integer depend(a, trans, ka, lda), intent(hide) :: n = (trans ? ka : lda) + integer depend(a, trans, ka, lda), intent(hide) :: k = (trans ? lda : ka) + +end subroutine rk + + +! +! LAPACK +! + +subroutine gesv(n,nrhs,a,piv,b,info) + ! lu,piv,x,info = gesv(a,b,overwrite_a=0,overwrite_b=0) + ! Solve A * X = B. + ! A = P * L * U + ! U is upper diagonal triangular, L is unit lower triangular, + ! piv pivots columns. + + callstatement {F_INT i;(*f2py_func)(&n,&nrhs,a,&n,piv,b,&n,&info);for(i=0;i\*,F_INT*,F_INT*,*,F_INT*,F_INT* + + integer depend(a),intent(hide):: n = shape(a,0) + integer depend(b),intent(hide):: nrhs = shape(b,1) + dimension(n,n),check(shape(a,0)==shape(a,1)) :: a + integer dimension(n),depend(n),intent(out) :: piv + dimension(n,nrhs),check(shape(a,0)==shape(b,0)),depend(n) :: b + integer intent(out)::info + intent(in,out,copy,out=x) b + intent(in,out,copy,out=lu) a +end subroutine gesv + + +subroutine gesdd(m,n,minmn,u0,u1,vt0,vt1,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info) + ! u,s,vt,info = gesdd(a,compute_uv=1,lwork=..,overwrite_a=0) + ! Compute the singular value decomposition (SVD) using divide and conquer: + ! A = U * SIGMA * transpose(V) + ! A - M x N matrix + ! U - M x M matrix or min(M,N) x N if full_matrices=False + ! 
SIGMA - M x N zero matrix with a main diagonal filled with min(M,N) + ! singular values + ! transpose(V) - N x N matrix or N x min(M,N) if full_matrices=False + + callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,a,&m,s,u,&u0,vt,&vt0,work,&lwork,iwork,&info) + callprotoargument char*,F_INT*,F_INT*,*,F_INT*,*,*,F_INT*,*,F_INT*,*,F_INT*,F_INT*,F_INT* + + integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1 + integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1 + integer intent(hide),depend(a):: m = shape(a,0) + integer intent(hide),depend(a):: n = shape(a,1) + integer intent(hide),depend(m,n):: minmn = MIN(m,n) + integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: u1 = (compute_uv?(full_matrices?m:minmn):1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1) + integer intent(hide),depend(compute_uv,minmn) :: vt1 = (compute_uv?n:1) + dimension(m,n),intent(in,copy,aligned8) :: a + dimension(minmn),intent(out),depend(minmn) :: s + dimension(u0,u1),intent(out),depend(u0, u1) :: u + dimension(vt0,vt1),intent(out),depend(vt0, vt1) :: vt + dimension(lwork),intent(hide,cache),depend(lwork) :: work + integer optional,intent(in),depend(minmn,compute_uv) & + :: lwork = max((compute_uv?4*minmn*minmn+MAX(m,n)+9*minmn:MAX(14*minmn+4,10*minmn+2+25*(25+8))+MAX(m,n)),1) + integer intent(hide,cache),dimension(8*minmn),depend(minmn) :: iwork + integer intent(out)::info + +end subroutine gesdd + +subroutine gesdd_lwork(m,n,minmn,u0,vt0,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info) + ! 
LWORK computation for (S/D)GESDD + + fortranname gesdd + callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,&a,&m,&s,&u,&u0,&vt,&vt0,&work,&lwork,&iwork,&info) + callprotoargument char*,F_INT*,F_INT*,*,F_INT*,*,*,F_INT*,*,F_INT*,*,F_INT*,F_INT*,F_INT* + + integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1 + integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1 + integer intent(in) :: m + integer intent(in) :: n + integer intent(hide),depend(m,n):: minmn = MIN(m,n) + integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1) + intent(hide) :: a + intent(hide) :: s + intent(hide) :: u + intent(hide) :: vt + intent(out) :: work + integer intent(hide) :: lwork = -1 + integer intent(hide) :: iwork + integer intent(out) :: info + +end subroutine gesdd_lwork + + +subroutine syev(compute_v,lower,n,w,a,lda,work,lwork,info) + ! w,v,info = syev(a,compute_v=1,lower=0,lwork=3*n-1,overwrite_a=0) + ! Compute all eigenvalues and, optionally, eigenvectors of a + ! real symmetric matrix A. + ! + ! Performance tip: + ! If compute_v=0 then set also overwrite_a=1. 
+ + callstatement (*f2py_func)((compute_v?"V":"N"),(lower?"L":"U"),&n,a,&lda,w,work,&lwork,&info) + callprotoargument char*,char*,F_INT*,*,F_INT*,*,*,F_INT*,F_INT* + + integer optional,intent(in):: compute_v = 1 + check(compute_v==1||compute_v==0) compute_v + integer optional,intent(in),check(lower==0||lower==1) :: lower = 0 + + integer intent(hide),depend(a):: n = shape(a,0) + integer intent(hide),depend(a):: lda = MAX(1,shape(a,0)) + dimension(n,n),check(shape(a,0)==shape(a,1)) :: a + intent(in,copy,out,out=v) :: a + + dimension(n),intent(out),depend(n) :: w + + integer optional,intent(in),depend(n) :: lwork=max(3*n-1,1) + check(lwork>=3*n-1) :: lwork + dimension(lwork),intent(hide),depend(lwork) :: work + + integer intent(out) :: info + +end subroutine syev + + +subroutine syev_lwork(lower,n,w,a,lda,work,lwork,info) + ! LWORK routines for syev + + fortranname syev + + callstatement (*f2py_func)("N",(lower?"L":"U"),&n,&a,&lda,&w,&work,&lwork,&info) + callprotoargument char*,char*,F_INT*,*,F_INT*,*,*,F_INT*,F_INT* + + integer intent(in):: n + integer optional,intent(in),check(lower==0||lower==1) :: lower = 0 + + integer intent(hide),depend(n):: lda = MAX(1, n) + intent(hide):: a + intent(hide):: w + integer intent(hide):: lwork = -1 + + intent(out):: work + integer intent(out):: info + +end subroutine syev_lwork + +end interface + +end python module _flapack + + + diff --git a/benchmark/pybench/asv/openblas_wrap/generate_f2pymod.py b/benchmark/pybench/asv/openblas_wrap/generate_f2pymod.py new file mode 100644 index 0000000000..5a8ba13895 --- /dev/null +++ b/benchmark/pybench/asv/openblas_wrap/generate_f2pymod.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +""" +Process f2py template files (`filename.pyf.src` -> `filename.pyf`) + +Usage: python generate_pyf.py filename.pyf.src -o filename.pyf +""" + +import os +import sys +import re +import subprocess +import argparse + + +# START OF CODE VENDORED FROM `numpy.distutils.from_template` 
+#############################################################
+"""
+process_file(filename)
+
+  takes templated file .xxx.src and produces .xxx file where .xxx
+  is .pyf .f90 or .f using the following template rules:
+
+  '<..>' denotes a template.
+
+  All function and subroutine blocks in a source file with names that
+  contain '<..>' will be replicated according to the rules in '<..>'.
+
+  The number of comma-separated words in '<..>' will determine the number of
+  replicates.
+
+  '<..>' may have two different forms, named and short. For example,
+
+  named:
+   <p=d,s,z,c> where anywhere inside a block '<p>' will be replaced with
+   'd', 's', 'z', and 'c' for each replicate of the block.
+
+   <_c>  is already defined: <_c=s,d,c,z>
+   <_t>  is already defined: <_t=real,double precision,complex,double complex>
+
+  short:
+   <s,d,c,z>, a short form of the named, useful when no <p> appears inside
+   a block.
+
+  In general, '<..>' contains a comma separated list of arbitrary
+  expressions. If these expression must contain a comma|leftarrow|rightarrow,
+  then prepend the comma|leftarrow|rightarrow with a backslash.
+
+  If an expression matches '\\<index>' then it will be replaced
+  by <index>-th expression.
+
+  Note that all '<..>' forms in a block must have the same number of
+  comma-separated entries.
+
+  Predefined named template rules:
+  <prefix=s,d,c,z>
+  <ftype=real,double precision,complex,double complex>
+  <ftypereal=real,double precision,\\0,\\1>
+  <ctype=float,double,complex_float,complex_double>
+  <ctypereal=float,double,\\0,\\1>
+"""
+
+routine_start_re = re.compile(
+    r'(\n|\A)((     (\$|\*))|)\s*(subroutine|function)\b',
+    re.I
+)
+routine_end_re = re.compile(r'\n\s*end\s*(subroutine|function)\b.*(\n|\Z)', re.I)
+function_start_re = re.compile(r'\n     (\$|\*)\s*function\b', re.I)
+
+def parse_structure(astr):
+    """ Return a list of tuples for each function or subroutine each
+    tuple is the start and end of a subroutine or function to be
+    expanded.
+    """
+
+    spanlist = []
+    ind = 0
+    while True:
+        m = routine_start_re.search(astr, ind)
+        if m is None:
+            break
+        start = m.start()
+        if function_start_re.match(astr, start, m.end()):
+            while True:
+                i = astr.rfind('\n', ind, start)
+                if i==-1:
+                    break
+                start = i
+                if astr[i:i+7]!='\n     $':
+                    break
+            start += 1
+        m = routine_end_re.search(astr, m.end())
+        ind = end = m and m.end()-1 or len(astr)
+        spanlist.append((start, end))
+    return spanlist
+
+template_re = re.compile(r"<\s*(\w[\w\d]*)\s*>")
+named_re = re.compile(r"<\s*(\w[\w\d]*)\s*=\s*(.*?)\s*>")
+list_re = re.compile(r"<\s*((.*?))\s*>")
+
+def find_repl_patterns(astr):
+    reps = named_re.findall(astr)
+    names = {}
+    for rep in reps:
+        name = rep[0].strip() or unique_key(names)
+        repl = rep[1].replace(r'\,', '@comma@')
+        thelist = conv(repl)
+        names[name] = thelist
+    return names
+
+def find_and_remove_repl_patterns(astr):
+    names = find_repl_patterns(astr)
+    astr = re.subn(named_re, '', astr)[0]
+    return astr, names
+
+item_re = re.compile(r"\A\\(?P<index>\d+)\Z")
+def conv(astr):
+    b = astr.split(',')
+    l = [x.strip() for x in b]
+    for i in range(len(l)):
+        m = item_re.match(l[i])
+        if m:
+            j = int(m.group('index'))
+            l[i] = l[j]
+    return ','.join(l)
+
+def unique_key(adict):
+    """ Obtain a unique key given a dictionary."""
+    allkeys = list(adict.keys())
+    done = False
+    n = 1
+    while not done:
+        newkey = '__l%s' % (n)
+        if newkey in allkeys:
+            n += 1
+        else:
+            done = True
+    return newkey
+
+
+template_name_re = re.compile(r'\A\s*(\w[\w\d]*)\s*\Z')
+def expand_sub(substr, names):
+    substr = substr.replace(r'\>', '@rightarrow@')
+    substr = substr.replace(r'\<', '@leftarrow@')
+    lnames = find_repl_patterns(substr)
+    substr = named_re.sub(r"<\1>", substr)  # get rid of definition templates
+
+    def listrepl(mobj):
+        thelist = conv(mobj.group(1).replace(r'\,', '@comma@'))
+        if template_name_re.match(thelist):
+            return "<%s>" % (thelist)
+        name = None
+        for key in lnames.keys():    # see if list is already in dictionary
+            if lnames[key] == thelist:
+                name = key
+        if name is None:      # this list is not in the dictionary yet
+            name = unique_key(lnames)
+            lnames[name] = thelist
+        return "<%s>" % name
+
+    substr = list_re.sub(listrepl, substr)  # convert all lists to named templates
+                                            # newnames are constructed as needed
+
+    numsubs = None
+    base_rule = None
+    rules = {}
+    for r in template_re.findall(substr):
+        if r not in rules:
+            thelist = lnames.get(r, names.get(r, None))
+            if thelist is None:
+                raise ValueError('No replicates found for <%s>' % (r))
+            if r not in names and not thelist.startswith('_'):
+                names[r] = thelist
+            rule = [i.replace('@comma@', ',') for i in thelist.split(',')]
+            num = len(rule)
+
+            if numsubs is None:
+                numsubs = num
+                rules[r] = rule
+                base_rule = r
+            elif num == numsubs:
+                rules[r] = rule
+            else:
+                print("Mismatch in number of replacements (base <{}={}>) "
+                      "for <{}={}>. Ignoring."
+                      .format(base_rule, ','.join(rules[base_rule]), r, thelist))
+    if not rules:
+        return substr
+
+    def namerepl(mobj):
+        name = mobj.group(1)
+        return rules.get(name, (k+1)*[name])[k]
+
+    newstr = ''
+    for k in range(numsubs):
+        newstr += template_re.sub(namerepl, substr) + '\n\n'
+
+    newstr = newstr.replace('@rightarrow@', '>')
+    newstr = newstr.replace('@leftarrow@', '<')
+    return newstr
+
+def process_str(allstr):
+    newstr = allstr
+    writestr = ''
+
+    struct = parse_structure(newstr)
+
+    oldend = 0
+    names = {}
+    names.update(_special_names)
+    for sub in struct:
+        cleanedstr, defs = find_and_remove_repl_patterns(newstr[oldend:sub[0]])
+        writestr += cleanedstr
+        names.update(defs)
+        writestr += expand_sub(newstr[sub[0]:sub[1]], names)
+        oldend = sub[1]
+    writestr += newstr[oldend:]
+
+    return writestr
+
+include_src_re = re.compile(
+    r"(\n|\A)\s*include\s*['\"](?P<name>[\w\d./\\]+\.src)['\"]",
+    re.I
+)
+
+def resolve_includes(source):
+    d = os.path.dirname(source)
+    with open(source) as fid:
+        lines = []
+        for line in fid:
+            m = include_src_re.match(line)
+            if m:
+                fn = m.group('name')
+                if not os.path.isabs(fn):
+                    fn = os.path.join(d, fn)
+                if os.path.isfile(fn):
+                    lines.extend(resolve_includes(fn))
+                else:
+                    lines.append(line)
+            else:
+                lines.append(line)
+    return lines
+
+def process_file(source):
+    lines = resolve_includes(source)
+    return process_str(''.join(lines))
+
+_special_names = find_repl_patterns('''
+<_c=s,d,c,z>
+<_t=real,double precision,complex,double complex>
+<prefix=s,d,c,z>
+<ftype=real,double precision,complex,double complex>
+<ctype=float,double,complex_float,complex_double>
+<ftypereal=real,double precision,\\0,\\1>
+<ctypereal=float,double,\\0,\\1>
+''')
+
+# END OF CODE VENDORED FROM `numpy.distutils.from_template`
+###########################################################
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("infile", type=str,
+                        help="Path to the input file")
+    parser.add_argument("-o", "--outdir", type=str,
+                        help="Path to the output directory")
+    args = parser.parse_args()
+
+    if not args.infile.endswith(('.pyf', '.pyf.src', '.f.src')):
+        raise ValueError(f"Input file has unknown extension: {args.infile}")
+
+    outdir_abs = os.path.join(os.getcwd(), args.outdir)
+
+    # Write out the .pyf/.f file
+    if args.infile.endswith(('.pyf.src', '.f.src')):
+        code = process_file(args.infile)
+        fname_pyf = os.path.join(args.outdir,
+                                 os.path.splitext(os.path.split(args.infile)[1])[0])
+
+        with open(fname_pyf, 'w') as f:
+            f.write(code)
+    else:
+        fname_pyf = args.infile
+
+    # Now invoke f2py to generate the C API module file
+    if args.infile.endswith(('.pyf.src', '.pyf')):
+        p = subprocess.Popen([sys.executable, '-m', 'numpy.f2py', fname_pyf,
+                              '--build-dir', outdir_abs], #'--quiet'],
+                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                             cwd=os.getcwd())
+        out, err = p.communicate()
+        if not (p.returncode == 0):
+            # BUGFIX(review): was `args.outfile` (no such argparse attribute ->
+            # AttributeError while raising) and `r"{err}"` (raw literal, stderr
+            # never interpolated). Report the actual file and captured stderr.
+            raise RuntimeError(f"Writing {fname_pyf} with f2py failed!\n"
+                               f"{out}\n"
+                               f"{err}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/pybench/asv/openblas_wrap/meson.build b/benchmark/pybench/asv/openblas_wrap/meson.build
new file mode 100644
index 0000000000..9f1b717876
--- /dev/null
+++ b/benchmark/pybench/asv/openblas_wrap/meson.build
@@ -0,0 +1,50 @@
+# find numpy & f2py includes
+inc_numpy = run_command(py3,
+  ['-c', 'import os; os.chdir(".."); import numpy; print(numpy.get_include())'],
+  check : true
+).stdout().strip()
+
+inc_f2py = run_command(py3,
+  ['-c', 'import os; os.chdir(".."); import numpy.f2py; print(numpy.f2py.get_include())'],
+  check : true
+).stdout().strip()
+
+
+inc_np = include_directories(inc_numpy, inc_f2py)
+fortranobject_c = inc_f2py / 'fortranobject.c'
+
+
+fortranobject_lib = static_library('_fortranobject',
+  fortranobject_c,
+#  c_args: numpy_nodepr_api,
+  dependencies: py3_dep,
+  include_directories: [inc_np, inc_f2py],
+  gnu_symbol_visibility: 'hidden',
+)
+fortranobject_dep = declare_dependency(
+  link_with: fortranobject_lib,
+  include_directories: [inc_np, inc_f2py],
+)
+
+
+# f2py generated wrappers
+
+flapack_module = custom_target('flapack_module',
+  output: ['_flapackmodule.c'],
+  input: 'blas_lapack.pyf.src',
+  command: [generate_f2pymod, '@INPUT@', '-o', '@OUTDIR@'],
+)
+
+py3.extension_module('_flapack',
+  flapack_module,
+  link_args: [],  # version_link_args,
+  dependencies: [openblas_dep, fortranobject_dep],
+  install: true,
+  subdir: 'openblas_wrap'
+)
+
+
+py3.install_sources(
+  ['__init__.py'],
+  subdir: 'openblas_wrap'
+)
diff --git a/benchmark/pybench/asv/pyproject.toml b/benchmark/pybench/asv/pyproject.toml
new file mode 100644
index 0000000000..535f96425e
--- /dev/null
+++ b/benchmark/pybench/asv/pyproject.toml
@@ -0,0 +1,22 @@
+[build-system]
+build-backend = "mesonpy"
+requires = [
+    "meson-python>=0.16.0",
+    "numpy",
+    "scipy_openblas32"
+]
+
+
+
+[project]
+name = "openblas_wrap"
+version = "0.1"
+maintainers = [
+    {name = ".", email = ".@gmail.com"}
+]
+description = "a wrapper"
+requires-python = ">=3.10"
+dependencies = ["numpy>=1.23,<3",
+    "scipy_openblas32"
+]