
Commit ad32f42

[CUDA] Fix cuda 12.8 build warnings (#26136)
Fix build warnings seen when building with CUDA 12.8 on Linux, such as:

```
<command-line>: error: "_FORTIFY_SOURCE" redefined [-Werror]

gather_block_quantized.cc:95:40: warning: comparison of integer expressions of different signedness: ‘int64_t’ {aka ‘long int’} and ‘long unsigned int’ [-Wsign-compare]
   95 |   for (int64_t i = gather_axis_ + 1; i < data_rank; ++i) {

attention_op_test.cc:304:85: warning: comparison of integer expressions of different signedness: ‘int’ and ‘std::vector<float>::size_type’ {aka ‘long unsigned int’} [-Wsign-compare]
  304 |     } else if (batch_size * q_num_heads * q_sequence_length * total_sequence_length == attn_mask.size()) {

/cast_op_test.cc:1487:24: warning: comparison of integer expressions of different signedness: ‘size_t’ {aka ‘long unsigned int’} and ‘int’ [-Wsign-compare]
 1487 |   for (size_t i = 0; i < num_pairs; ++i) {
      |                      ~~^~~~~~~~~~~
```

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 72e56e7 commit ad32f42
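All of the -Wsign-compare warnings above come from the same pattern: a signed index or product compared against an unsigned container size (size_t / std::vector::size_type). Below is a minimal, hypothetical sketch (the function name, file name, and values are invented for illustration, not taken from the repository) of the pattern and of the cast-to-one-signedness fix this commit applies:

```cpp
// sign_compare_sketch.cc -- hypothetical example, not part of the repository.
// Shows why GCC emits -Wsign-compare and how casting to a single type fixes it.
#include <cstdint>
#include <vector>

int64_t SumAfterAxis(const std::vector<int64_t>& dims, int64_t axis) {
  // Warns under -Wsign-compare: 'i' is signed int64_t, dims.size() is unsigned.
  //   for (int64_t i = axis + 1; i < dims.size(); ++i) { ... }
  // Fix: compare values of the same signedness.
  int64_t sum = 0;
  for (int64_t i = axis + 1; i < static_cast<int64_t>(dims.size()); ++i) {
    sum += dims[i];
  }
  return sum;
}

int main() {
  std::vector<int64_t> dims = {2, 3, 4, 5};
  return SumAfterAxis(dims, /*axis=*/1) == 9 ? 0 : 1;  // 4 + 5
}
```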

5 files changed: +25 -15 lines

onnxruntime/contrib_ops/cuda/quantization/gather_block_quantized.cc
Lines changed: 3 additions & 3 deletions

```diff
@@ -64,12 +64,12 @@ Status GatherBlockQuantized<T1, T2, Tind>::ComputeInternal(OpKernelContext* ctx)
   const Tensor* zero_points = ctx->Input<Tensor>(3);
 
   auto data_shape = data->Shape().GetDims();
-  auto data_rank = data->Shape().NumDimensions();
+  int64_t data_rank = data->Shape().NumDimensions();
 
   auto indices_shape = indices->Shape().GetDims();
   auto indices_rank = indices->Shape().NumDimensions();
 
-  ORT_ENFORCE(quantize_axis_ == data_rank - 1);
+  ORT_ENFORCE(quantize_axis_ == static_cast<int64_t>(data_rank) - 1);
 
   TensorShapeVector output_shape;
   output_shape.reserve(data_rank - 1 + indices_rank);
@@ -92,7 +92,7 @@ Status GatherBlockQuantized<T1, T2, Tind>::ComputeInternal(OpKernelContext* ctx)
   }
 
   // 3) dims after gather_axis
-  for (int64_t i = gather_axis_ + 1; i < data_rank; ++i) {
+  for (int64_t i = gather_axis_ + 1; i < static_cast<int64_t>(data_rank); ++i) {
     output_shape.push_back(data_shape[i]);
     after_gather_dim *= data_shape[i];
   }
```

onnxruntime/test/providers/cpu/llm/attention_op_test.cc
Lines changed: 8 additions & 6 deletions

```diff
@@ -220,10 +220,11 @@ static void RunTest3D(
   std::vector<int64_t> v_shape = {batch_size, kv_sequence_length, v_hidden_size};
 
   std::vector<int64_t> attn_mask_shape = {q_sequence_length, total_sequence_length};
-  if (q_sequence_length * total_sequence_length != attn_mask.size() && attn_mask.size() > 0) {
-    if (batch_size * q_sequence_length * total_sequence_length == attn_mask.size()) {
+  size_t expected_mask_size = static_cast<size_t>(q_sequence_length) * static_cast<size_t>(total_sequence_length);
+  if (expected_mask_size != attn_mask.size() && attn_mask.size() > 0) {
+    if (static_cast<size_t>(batch_size) * expected_mask_size == attn_mask.size()) {
       attn_mask_shape = {batch_size, 1, q_sequence_length, total_sequence_length};
-    } else if (batch_size * q_num_heads * q_sequence_length * total_sequence_length == attn_mask.size()) {
+    } else if (static_cast<size_t>(batch_size) * static_cast<size_t>(q_num_heads) * expected_mask_size == attn_mask.size()) {
       attn_mask_shape = {batch_size, q_num_heads, q_sequence_length, total_sequence_length};
     } else {
       ORT_THROW("Invalid attention mask size: ", attn_mask.size(),
@@ -298,10 +299,11 @@ static void RunTest4D(
   std::vector<int64_t> v_shape = {batch_size, kv_num_heads, kv_sequence_length, v_head_size};
 
   std::vector<int64_t> attn_mask_shape = {q_sequence_length, total_sequence_length};
-  if (q_sequence_length * total_sequence_length != attn_mask.size() && attn_mask.size() > 0) {
-    if (batch_size * q_sequence_length * total_sequence_length == attn_mask.size()) {
+  size_t expected_mask_size = static_cast<size_t>(q_sequence_length) * static_cast<size_t>(total_sequence_length);
+  if (expected_mask_size != attn_mask.size() && attn_mask.size() > 0) {
+    if (static_cast<size_t>(batch_size) * expected_mask_size == attn_mask.size()) {
       attn_mask_shape = {batch_size, 1, q_sequence_length, total_sequence_length};
-    } else if (batch_size * q_num_heads * q_sequence_length * total_sequence_length == attn_mask.size()) {
+    } else if (static_cast<size_t>(batch_size) * static_cast<size_t>(q_num_heads) * expected_mask_size == attn_mask.size()) {
       attn_mask_shape = {batch_size, q_num_heads, q_sequence_length, total_sequence_length};
     } else {
       ORT_THROW("Invalid attention mask size: ", attn_mask.size(),
```
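For context on the sizes being compared above: the test helpers infer the attention-mask shape purely from the flat length of attn_mask. A small hypothetical stand-alone version of that dispatch (function name and test values are invented here; this is a sketch, not the test code itself):

```cpp
// mask_shape_sketch.cc -- hypothetical illustration of the size-based dispatch.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> InferMaskShape(size_t mask_size, int64_t batch_size, int64_t q_num_heads,
                                    int64_t q_sequence_length, int64_t total_sequence_length) {
  const size_t expected =
      static_cast<size_t>(q_sequence_length) * static_cast<size_t>(total_sequence_length);
  if (mask_size == expected) {
    return {q_sequence_length, total_sequence_length};  // 2D mask, broadcast over batch and heads
  }
  if (mask_size == static_cast<size_t>(batch_size) * expected) {
    return {batch_size, 1, q_sequence_length, total_sequence_length};  // per-batch, broadcast over heads
  }
  if (mask_size == static_cast<size_t>(batch_size) * static_cast<size_t>(q_num_heads) * expected) {
    return {batch_size, q_num_heads, q_sequence_length, total_sequence_length};  // per-batch, per-head
  }
  return {};  // invalid size
}

int main() {
  // batch_size=2, q_num_heads=4, q_sequence_length=3, total_sequence_length=5
  assert(InferMaskShape(15, 2, 4, 3, 5).size() == 2);   // 3 * 5
  assert(InferMaskShape(30, 2, 4, 3, 5).size() == 4);   // 2 * 3 * 5
  assert(InferMaskShape(120, 2, 4, 3, 5).size() == 4);  // 2 * 4 * 3 * 5
  return 0;
}
```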

onnxruntime/test/providers/cpu/tensor/cast_op_test.cc
Lines changed: 2 additions & 2 deletions

```diff
@@ -1477,7 +1477,7 @@ template <typename F4>
 void CastOpTestFloatFloat4(std::vector<int64_t> shape,
                            std::vector<float> float_data,
                            bool is_fp4_input = false) {
-  int num_pairs = static_cast<int>(float_data.size()) / 2;
+  size_t num_pairs = float_data.size() / 2;
   int num_fp4_elements = static_cast<int>((float_data.size() + 1) / 2);
   bool is_odd_count = (float_data.size() % 2 != 0);
 
@@ -1489,7 +1489,7 @@ void CastOpTestFloatFloat4(std::vector<int64_t> shape,
   }
 
   if (is_odd_count) {
-    fp4_data.emplace_back(F4(float_data[num_pairs * 2], 0));  // Padding zero
+    fp4_data.emplace_back(F4(float_data.back(), 0));  // Padding zero
   }
 
   if (!is_fp4_input) {
```
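A quick check of the index arithmetic behind swapping float_data[num_pairs * 2] for float_data.back(): with num_pairs = float_data.size() / 2 using integer division, an odd element count n gives num_pairs * 2 == n - 1, so both expressions name the same trailing element. A tiny hypothetical sketch (file name and data invented for illustration):

```cpp
// fp4_padding_index_check.cc -- hypothetical; verifies the index identity only.
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<float> float_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};  // odd count
  size_t num_pairs = float_data.size() / 2;  // 2, by integer division
  bool is_odd_count = (float_data.size() % 2 != 0);
  if (is_odd_count) {
    // num_pairs * 2 == float_data.size() - 1, i.e. the element that gets
    // paired with a padding zero when packing into FP4 pairs.
    assert(float_data[num_pairs * 2] == float_data.back());
  }
  return 0;
}
```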

tools/ci_build/build.py
Lines changed: 6 additions & 3 deletions

```diff
@@ -1183,7 +1183,8 @@ def generate_build_tree(
         if config == "Release":
             cflags = [
                 "-DNDEBUG",
-                "-Wp,-D_FORTIFY_SOURCE=2",
+                "-U_FORTIFY_SOURCE",
+                "-D_FORTIFY_SOURCE=2",
                 "-Wp,-D_GLIBCXX_ASSERTIONS",
                 "-fstack-protector-strong",
                 "-O3",
@@ -1194,7 +1195,8 @@ def generate_build_tree(
         elif config == "RelWithDebInfo":
             cflags = [
                 "-DNDEBUG",
-                "-Wp,-D_FORTIFY_SOURCE=2",
+                "-U_FORTIFY_SOURCE",
+                "-D_FORTIFY_SOURCE=2",
                 "-Wp,-D_GLIBCXX_ASSERTIONS",
                 "-fstack-protector-strong",
                 "-O3",
@@ -1209,7 +1211,8 @@ def generate_build_tree(
         elif config == "MinSizeRel":
             cflags = [
                 "-DNDEBUG",
-                "-Wp,-D_FORTIFY_SOURCE=2",
+                "-U_FORTIFY_SOURCE",
+                "-D_FORTIFY_SOURCE=2",
                 "-Wp,-D_GLIBCXX_ASSERTIONS",
                 "-fstack-protector-strong",
                 "-Os",
```

tools/python/util/vcpkg_helpers.py
Lines changed: 6 additions & 1 deletion

```diff
@@ -362,7 +362,12 @@ def generate_triplet_for_posix_platform(
     cflags_release = ["-DNDEBUG", "-O3"]
 
     if enable_binskim:
-        cflags_release += ["-Wp,-D_FORTIFY_SOURCE=2", "-Wp,-D_GLIBCXX_ASSERTIONS", "-fstack-protector-strong"]
+        cflags_release += [
+            "-U_FORTIFY_SOURCE",
+            "-D_FORTIFY_SOURCE=2",
+            "-Wp,-D_GLIBCXX_ASSERTIONS",
+            "-fstack-protector-strong",
+        ]
         if target_abi == "x64":
             cflags_release += ["-fstack-clash-protection", "-fcf-protection"]
 
```
