diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a143738c..80fedf99 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -59,6 +59,25 @@ jobs:
           python-version: ${{ matrix.py_version }}
           allow-prereleases: true
 
+      - name: Free disk space
+        if: contains(matrix.os, 'ubuntu')
+        uses: descriptinc/free-disk-space@main
+        with:
+          tool-cache: true
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: false # This frees space on the wrong partition.
+
+      - uses: Jimver/cuda-toolkit@v0.2.15
+        if: contains(matrix.os, 'ubuntu')
+        id: cuda-toolkit
+        with:
+          cuda: '12.4.1'
+          linux-local-args: '["--toolkit"]'
+          log-file-suffix: "${{ matrix.os }}-${{ matrix.py_version }}.txt"
+
       - name: Install and configure
         shell: bash
         run: |
diff --git a/mlir/extras/dialects/ext/gpu.py b/mlir/extras/dialects/ext/gpu.py
index b2e4ec2c..8d7af826 100644
--- a/mlir/extras/dialects/ext/gpu.py
+++ b/mlir/extras/dialects/ext/gpu.py
@@ -166,9 +166,7 @@ def __init__(
             if isinstance(t, str):
                 targets[i] = Attribute.parse(t)
         _ods_context = get_default_loc_context(loc)
-        super().__init__(targets=ArrayAttr.get(targets), loc=loc, ip=ip)
-        self.regions[0].blocks.append()
-        self.operation.attributes["sym_name"] = (
+        sym_name = (
             sym_name
             if (
                 issubclass(type(sym_name), Attribute)
@@ -176,6 +174,10 @@ def __init__(
             )
             else AttrBuilder.get("SymbolNameAttr")(sym_name, context=_ods_context)
         )
+        super().__init__(
+            sym_name=sym_name, targets=ArrayAttr.get(targets), loc=loc, ip=ip
+        )
+        self.regions[0].blocks.append()
 
     @property
     def body(self):
diff --git a/tests/test_nvgpu_nvvm.py b/tests/test_nvgpu_nvvm.py
index 91badbc5..b4d34c3d 100644
--- a/tests/test_nvgpu_nvvm.py
+++ b/tests/test_nvgpu_nvvm.py
@@ -1,4 +1,5 @@
 import re
+import subprocess
 from pathlib import Path
 from textwrap import dedent
 
@@ -8,12 +9,12 @@
 from mlir.dialects.memref import cast
 from mlir.dialects.nvgpu import (
     TensorMapDescriptorType,
-    TensorMapSwizzleKind,
+    TensorMapInterleaveKind,
     TensorMapL2PromoKind,
     TensorMapOOBKind,
-    TensorMapInterleaveKind,
+    TensorMapSwizzleKind,
+    tma_create_descriptor,
 )
-from mlir.dialects.nvgpu import tma_create_descriptor
 from mlir.dialects.transform import any_op_t
 from mlir.dialects.transform.extras import named_sequence
 from mlir.dialects.transform.structured import MatchInterfaceEnum
@@ -21,15 +22,15 @@
 from mlir import _mlir_libs
 
 from mlir.extras.ast.canonicalize import canonicalize
-from mlir.extras.dialects.ext import arith, memref, scf, gpu, linalg, transform, nvgpu
+from mlir.extras.dialects.ext import arith, gpu, linalg, memref, nvgpu, scf, transform
 from mlir.extras.dialects.ext.func import func
 from mlir.extras.dialects.ext.gpu import smem_space
 from mlir.extras.dialects.ext.llvm import llvm_ptr_t
-from mlir.extras.runtime.passes import run_pipeline, Pipeline
+from mlir.extras.runtime.passes import Pipeline, run_pipeline
 from mlir.extras.runtime.refbackend import LLVMJITBackend
 
 # noinspection PyUnresolvedReferences
-from mlir.extras.testing import mlir_ctx as ctx, filecheck, MLIRContext
+from mlir.extras.testing import MLIRContext, filecheck, mlir_ctx as ctx
 from mlir.extras.util import find_ops
 
 # needed since the fix isn't defined here nor conftest.py
@@ -200,7 +201,8 @@ def payload():
     compute_linspace_val.emit()
 
     @func
-    def printMemrefF32(x: T.memref(T.f32())): ...
+    def printMemrefF32(x: T.memref(T.f32())):
+        ...
 
     printMemrefF32_.append(printMemrefF32)
 
@@ -421,8 +423,15 @@
 
 CUDA_RUNTIME_LIB_PATH = Path(_mlir_libs.__file__).parent / f"libmlir_cuda_runtime.so"
 
+NVIDIA_GPU = False  # set True below only if nvidia-smi runs successfully
+try:
+    subprocess.check_output("nvidia-smi")
+    NVIDIA_GPU = True
+except Exception:
+    print("No Nvidia GPU in system!")
+
 # based on https://github.com/llvm/llvm-project/blob/9cc2122bf5a81f7063c2a32b2cb78c8d615578a1/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir#L6
-@pytest.mark.skipif(not CUDA_RUNTIME_LIB_PATH.exists(), reason="no cuda library")
+@pytest.mark.skipif(not NVIDIA_GPU, reason="no NVIDIA GPU detected")
 def test_transform_mma_sync_matmul_f16_f16_accum_run(ctx: MLIRContext, capfd):
     range_ = scf.range_
 
@@ -549,7 +558,8 @@ def payload():
     compute_linspace_val.emit()
 
     @func
-    def printMemrefF32(x: T.memref(T.f32())): ...
+    def printMemrefF32(x: T.memref(T.f32())):
+        ...
 
     printMemrefF32_.append(printMemrefF32)
 