From 25c5feaebb681fc8c91ce81348d02cd579f32495 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval@gmail.com>
Date: Wed, 30 Apr 2025 11:20:51 -0700
Subject: [PATCH] [flang][cuda] Use a reference for asyncObject

---
 .../flang-rt/runtime/allocator-registry.h     |  4 +--
 .../include/flang-rt/runtime/descriptor.h     |  6 ++---
 .../flang-rt/runtime/reduction-templates.h    |  2 +-
 flang-rt/lib/cuda/allocator.cpp               | 16 ++++++------
 flang-rt/lib/cuda/descriptor.cpp              |  2 +-
 flang-rt/lib/runtime/allocatable.cpp          | 12 ++++-----
 flang-rt/lib/runtime/array-constructor.cpp    |  4 +--
 flang-rt/lib/runtime/assign.cpp               |  4 +--
 flang-rt/lib/runtime/character.cpp            | 20 +++++++-------
 flang-rt/lib/runtime/copy.cpp                 |  4 +--
 flang-rt/lib/runtime/derived.cpp              |  6 ++---
 flang-rt/lib/runtime/descriptor.cpp           |  4 +--
 flang-rt/lib/runtime/extrema.cpp              |  4 +--
 flang-rt/lib/runtime/findloc.cpp              |  2 +-
 flang-rt/lib/runtime/matmul-transpose.cpp     |  2 +-
 flang-rt/lib/runtime/matmul.cpp               |  2 +-
 flang-rt/lib/runtime/misc-intrinsic.cpp       |  2 +-
 flang-rt/lib/runtime/pointer.cpp              |  2 +-
 flang-rt/lib/runtime/temporary-stack.cpp      |  2 +-
 flang-rt/lib/runtime/tools.cpp                |  2 +-
 flang-rt/lib/runtime/transformational.cpp     |  4 +--
 flang-rt/unittests/Evaluate/reshape.cpp       |  2 +-
 flang-rt/unittests/Runtime/Allocatable.cpp    |  4 +--
 .../unittests/Runtime/CUDA/Allocatable.cpp    |  3 ++-
 .../unittests/Runtime/CUDA/AllocatorCUF.cpp   |  4 +--
 flang-rt/unittests/Runtime/CUDA/Memory.cpp    |  4 +--
 flang-rt/unittests/Runtime/CharacterTest.cpp  |  2 +-
 flang-rt/unittests/Runtime/CommandTest.cpp    |  8 +++---
 flang-rt/unittests/Runtime/TemporaryStack.cpp |  4 +--
 flang-rt/unittests/Runtime/tools.h            |  2 +-
 .../flang/Optimizer/Dialect/CUF/CUFOps.td     | 11 ++++----
 .../include/flang/Runtime/CUDA/allocatable.h  |  8 +++---
 flang/include/flang/Runtime/CUDA/pointer.h    |  8 +++---
 flang/include/flang/Runtime/allocatable.h     |  7 ++---
 flang/lib/Lower/Allocatable.cpp               |  2 +-
 .../Optimizer/Builder/Runtime/Allocatable.cpp |  7 +++--
 flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp    | 22 ++++++++--------
 .../Optimizer/Transforms/CUFOpConversion.cpp  | 10 +++----
 flang/test/Fir/CUDA/cuda-allocate.fir         | 18 ++++++-------
 flang/test/Fir/cuf-invalid.fir                |  5 ++--
 flang/test/Fir/cuf.mlir                       |  7 +++--
 flang/test/HLFIR/elemental-codegen.fir        |  6 ++---
 flang/test/Lower/CUDA/cuda-allocatable.cuf    |  9 +++----
 .../acc-declare-unwrap-defaultbounds.f90      |  4 +--
 flang/test/Lower/OpenACC/acc-declare.f90      |  4 +--
 flang/test/Lower/allocatable-polymorphic.f90  | 26 +++++++++----------
 flang/test/Lower/allocatable-runtime.f90      |  4 +--
 flang/test/Lower/allocate-mold.f90            |  4 +--
 flang/test/Lower/polymorphic.f90              |  2 +-
 flang/test/Transforms/lower-repack-arrays.fir |  8 +++---
 50 files changed, 153 insertions(+), 158 deletions(-)

diff --git a/flang-rt/include/flang-rt/runtime/allocator-registry.h b/flang-rt/include/flang-rt/runtime/allocator-registry.h
index 33e8e2c7d7850..f0ba77a360736 100644
--- a/flang-rt/include/flang-rt/runtime/allocator-registry.h
+++ b/flang-rt/include/flang-rt/runtime/allocator-registry.h
@@ -19,7 +19,7 @@
 
 namespace Fortran::runtime {
 
-using AllocFct = void *(*)(std::size_t, std::int64_t);
+using AllocFct = void *(*)(std::size_t, std::int64_t *);
 using FreeFct = void (*)(void *);
 
 typedef struct Allocator_t {
@@ -28,7 +28,7 @@ typedef struct Allocator_t {
 } Allocator_t;
 
 static RT_API_ATTRS void *MallocWrapper(
-    std::size_t size, [[maybe_unused]] std::int64_t) {
+    std::size_t size, [[maybe_unused]] std::int64_t *) {
   return std::malloc(size);
 }
 #ifdef RT_DEVICE_COMPILATION
diff --git a/flang-rt/include/flang-rt/runtime/descriptor.h b/flang-rt/include/flang-rt/runtime/descriptor.h
index 9907e7866e7bf..19cfeeb1e9dd1 100644
--- a/flang-rt/include/flang-rt/runtime/descriptor.h
+++ b/flang-rt/include/flang-rt/runtime/descriptor.h
@@ -29,8 +29,8 @@
 #include <cstdio>
 #include <cstring>
 
-/// Value used for asyncId when no specific stream is specified.
-static constexpr std::int64_t kNoAsyncId = -1;
+/// Value used for asyncObject when no specific stream is specified.
+static constexpr void *kNoAsyncObject = nullptr;
 
 namespace Fortran::runtime {
 
@@ -372,7 +372,7 @@ class Descriptor {
   // before calling.  It (re)computes the byte strides after
   // allocation.  Does not allocate automatic components or
   // perform default component initialization.
-  RT_API_ATTRS int Allocate(std::int64_t asyncId);
+  RT_API_ATTRS int Allocate(std::int64_t *asyncObject);
   RT_API_ATTRS void SetByteStrides();
 
   // Deallocates storage; does not call FINAL subroutines or
diff --git a/flang-rt/include/flang-rt/runtime/reduction-templates.h b/flang-rt/include/flang-rt/runtime/reduction-templates.h
index 77f77a592a476..18412708b02c5 100644
--- a/flang-rt/include/flang-rt/runtime/reduction-templates.h
+++ b/flang-rt/include/flang-rt/runtime/reduction-templates.h
@@ -347,7 +347,7 @@ inline RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
     // as the element size of the source.
     result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr,
         CFI_attribute_allocatable);
-    if (int stat{result.Allocate(kNoAsyncId)}) {
+    if (int stat{result.Allocate(kNoAsyncObject)}) {
       terminator.Crash(
           "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
     }
diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index 51119ab251168..3a92c5b21c9af 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -136,7 +136,7 @@ void RTDEF(CUFRegisterAllocator)() {
 }
 
 void *CUFAllocPinned(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
   void *p;
   CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
   return p;
@@ -144,18 +144,18 @@ void *CUFAllocPinned(
 
 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
 
-void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
+void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) {
   void *p;
   if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
     CUDA_REPORT_IF_ERROR(
         cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
   } else {
-    if (asyncId == kNoAsyncId) {
+    if (asyncObject == kNoAsyncObject) {
       CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
     } else {
       CUDA_REPORT_IF_ERROR(
-          cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
-      insertAllocation(p, sizeInBytes, asyncId);
+          cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject));
+      insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject);
     }
   }
   return p;
@@ -174,7 +174,7 @@ void CUFFreeDevice(void *p) {
 }
 
 void *CUFAllocManaged(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
   void *p;
   CUDA_REPORT_IF_ERROR(
       cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -184,9 +184,9 @@ void *CUFAllocManaged(
 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
 void *CUFAllocUnified(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
   // Call alloc managed for the time being.
-  return CUFAllocManaged(sizeInBytes, asyncId);
+  return CUFAllocManaged(sizeInBytes, asyncObject);
 }
 
 void CUFFreeUnified(void *p) {
diff --git a/flang-rt/lib/cuda/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp
index 175e8c0ef8438..7b768f91af29d 100644
--- a/flang-rt/lib/cuda/descriptor.cpp
+++ b/flang-rt/lib/cuda/descriptor.cpp
@@ -21,7 +21,7 @@ RT_EXT_API_GROUP_BEGIN
 Descriptor *RTDEF(CUFAllocDescriptor)(
     std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
   return reinterpret_cast<Descriptor *>(
-      CUFAllocManaged(sizeInBytes, /*asyncId*/ -1));
+      CUFAllocManaged(sizeInBytes, /*asyncObject=*/nullptr));
 }
 
 void RTDEF(CUFFreeDescriptor)(
diff --git a/flang-rt/lib/runtime/allocatable.cpp b/flang-rt/lib/runtime/allocatable.cpp
index 6acce34eb9a9e..ef18da6ea0786 100644
--- a/flang-rt/lib/runtime/allocatable.cpp
+++ b/flang-rt/lib/runtime/allocatable.cpp
@@ -133,17 +133,17 @@ void RTDEF(AllocatableApplyMold)(
   }
 }
 
-int RTDEF(AllocatableAllocate)(Descriptor &descriptor, std::int64_t asyncId,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+int RTDEF(AllocatableAllocate)(Descriptor &descriptor,
+    std::int64_t *asyncObject, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   if (!descriptor.IsAllocatable()) {
     return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
   } else if (descriptor.IsAllocated()) {
     return ReturnError(terminator, StatBaseNotNull, errMsg, hasStat);
   } else {
-    int stat{
-        ReturnError(terminator, descriptor.Allocate(asyncId), errMsg, hasStat)};
+    int stat{ReturnError(
+        terminator, descriptor.Allocate(asyncObject), errMsg, hasStat)};
     if (stat == StatOk) {
       if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
         if (const auto *derived{addendum->derivedType()}) {
@@ -162,7 +162,7 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc,
     const Descriptor &source, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(AllocatableAllocate)(
-      alloc, /*asyncId=*/-1, hasStat, errMsg, sourceFile, sourceLine)};
+      alloc, /*asyncObject=*/nullptr, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     DoFromSourceAssign(alloc, source, terminator);
diff --git a/flang-rt/lib/runtime/array-constructor.cpp b/flang-rt/lib/runtime/array-constructor.cpp
index 67b3b5e1e0f50..858fac7bf2b39 100644
--- a/flang-rt/lib/runtime/array-constructor.cpp
+++ b/flang-rt/lib/runtime/array-constructor.cpp
@@ -50,7 +50,7 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
           initialAllocationSize(fromElements, to.ElementBytes())};
       to.GetDimension(0).SetBounds(1, allocationSize);
       RTNAME(AllocatableAllocate)
-      (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
+      (to, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
           vector.sourceFile, vector.sourceLine);
       to.GetDimension(0).SetBounds(1, fromElements);
       vector.actualAllocationSize = allocationSize;
@@ -59,7 +59,7 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
       // first value: there should be no reallocation.
       RUNTIME_CHECK(terminator, previousToElements >= fromElements);
       RTNAME(AllocatableAllocate)
-      (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
+      (to, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
           vector.sourceFile, vector.sourceLine);
       vector.actualAllocationSize = previousToElements;
     }
diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
index 4a813cd489022..8a4fa36c91479 100644
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -99,7 +99,7 @@ static RT_API_ATTRS int AllocateAssignmentLHS(
     toDim.SetByteStride(stride);
     stride *= toDim.Extent();
   }
-  int result{ReturnError(terminator, to.Allocate(kNoAsyncId))};
+  int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))};
   if (result == StatOk && derived && !derived->noInitializationNeeded()) {
     result = ReturnError(terminator, Initialize(to, *derived, terminator));
   }
@@ -277,7 +277,7 @@ RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
       // entity, otherwise, the Deallocate() below will not
       // free the descriptor memory.
       newFrom.raw().attribute = CFI_attribute_allocatable;
-      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncId))};
+      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))};
       if (stat == StatOk) {
         if (HasDynamicComponent(from)) {
           // If 'from' has allocatable/automatic component, we cannot
diff --git a/flang-rt/lib/runtime/character.cpp b/flang-rt/lib/runtime/character.cpp
index d1152ee1caefb..f140d202e118e 100644
--- a/flang-rt/lib/runtime/character.cpp
+++ b/flang-rt/lib/runtime/character.cpp
@@ -118,7 +118,7 @@ static RT_API_ATTRS void Compare(Descriptor &result, const Descriptor &x,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("Compare: could not allocate storage for result");
   }
   std::size_t xChars{x.ElementBytes() >> shift<CHAR>};
@@ -173,7 +173,7 @@ static RT_API_ATTRS void AdjustLRHelper(Descriptor &result,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("ADJUSTL/R: could not allocate storage for result");
   }
   for (SubscriptValue resultAt{0}; elements-- > 0;
@@ -227,7 +227,7 @@ static RT_API_ATTRS void LenTrim(Descriptor &result, const Descriptor &string,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("LEN_TRIM: could not allocate storage for result");
   }
   std::size_t stringElementChars{string.ElementBytes() >> shift<CHAR>};
@@ -427,7 +427,7 @@ static RT_API_ATTRS void GeneralCharFunc(Descriptor &result,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("SCAN/VERIFY: could not allocate storage for result");
   }
   std::size_t stringElementChars{string.ElementBytes() >> shift<CHAR>};
@@ -530,7 +530,8 @@ static RT_API_ATTRS void MaxMinHelper(Descriptor &accumulator,
     for (int j{0}; j < rank; ++j) {
       accumulator.GetDimension(j).SetBounds(1, ub[j]);
     }
-    RUNTIME_CHECK(terminator, accumulator.Allocate(kNoAsyncId) == CFI_SUCCESS);
+    RUNTIME_CHECK(
+        terminator, accumulator.Allocate(kNoAsyncObject) == CFI_SUCCESS);
   }
   for (CHAR *result{accumulator.OffsetElement<CHAR>()}; elements-- > 0;
        accumData += accumChars, result += chars, x.IncrementSubscripts(xAt)) {
@@ -606,7 +607,7 @@ void RTDEF(CharacterConcatenate)(Descriptor &accumulator,
   for (int j{0}; j < rank; ++j) {
     accumulator.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (accumulator.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (accumulator.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash(
         "CharacterConcatenate: could not allocate storage for result");
   }
@@ -629,7 +630,8 @@ void RTDEF(CharacterConcatenateScalar1)(
   accumulator.set_base_addr(nullptr);
   std::size_t oldLen{accumulator.ElementBytes()};
   accumulator.raw().elem_len += chars;
-  RUNTIME_CHECK(terminator, accumulator.Allocate(kNoAsyncId) == CFI_SUCCESS);
+  RUNTIME_CHECK(
+      terminator, accumulator.Allocate(kNoAsyncObject) == CFI_SUCCESS);
   std::memcpy(accumulator.OffsetElement<char>(oldLen), from, chars);
   FreeMemory(old);
 }
@@ -831,7 +833,7 @@ void RTDEF(Repeat)(Descriptor &result, const Descriptor &string,
   std::size_t origBytes{string.ElementBytes()};
   result.Establish(string.type(), origBytes * ncopies, nullptr, 0, nullptr,
       CFI_attribute_allocatable);
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("REPEAT could not allocate storage for result");
   }
   const char *from{string.OffsetElement()};
@@ -865,7 +867,7 @@ void RTDEF(Trim)(Descriptor &result, const Descriptor &string,
   }
   result.Establish(string.type(), resultBytes, nullptr, 0, nullptr,
       CFI_attribute_allocatable);
-  RUNTIME_CHECK(terminator, result.Allocate(kNoAsyncId) == CFI_SUCCESS);
+  RUNTIME_CHECK(terminator, result.Allocate(kNoAsyncObject) == CFI_SUCCESS);
   std::memcpy(result.OffsetElement(), string.OffsetElement(), resultBytes);
 }
 
diff --git a/flang-rt/lib/runtime/copy.cpp b/flang-rt/lib/runtime/copy.cpp
index 3a0f98cf8d376..f990f46e0be66 100644
--- a/flang-rt/lib/runtime/copy.cpp
+++ b/flang-rt/lib/runtime/copy.cpp
@@ -171,8 +171,8 @@ RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
                 *reinterpret_cast<Descriptor *>(toPtr + component->offset())};
             if (toDesc.raw().base_addr != nullptr) {
               toDesc.set_base_addr(nullptr);
-              RUNTIME_CHECK(
-                  terminator, toDesc.Allocate(/*asyncId=*/-1) == CFI_SUCCESS);
+              RUNTIME_CHECK(terminator,
+                  toDesc.Allocate(/*asyncObject=*/nullptr) == CFI_SUCCESS);
               const Descriptor &fromDesc{*reinterpret_cast<const Descriptor *>(
                   fromPtr + component->offset())};
               copyStack.emplace(toDesc, fromDesc);
diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index c46ea806a430a..35037036f63e7 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -52,7 +52,7 @@ RT_API_ATTRS int Initialize(const Descriptor &instance,
         allocDesc.raw().attribute = CFI_attribute_allocatable;
         if (comp.genre() == typeInfo::Component::Genre::Automatic) {
           stat = ReturnError(
-              terminator, allocDesc.Allocate(kNoAsyncId), errMsg, hasStat);
+              terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
           if (stat == StatOk) {
             if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) {
               if (const auto *derived{addendum->derivedType()}) {
@@ -153,7 +153,7 @@ RT_API_ATTRS int InitializeClone(const Descriptor &clone,
         if (origDesc.IsAllocated()) {
           cloneDesc.ApplyMold(origDesc, origDesc.rank());
           stat = ReturnError(
-              terminator, cloneDesc.Allocate(kNoAsyncId), errMsg, hasStat);
+              terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
           if (stat == StatOk) {
             if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) {
               if (const typeInfo::DerivedType *
@@ -260,7 +260,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
         copy.raw().attribute = CFI_attribute_allocatable;
         Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0};
         RUNTIME_CHECK(terminator ? *terminator : stubTerminator,
-            copy.Allocate(kNoAsyncId) == CFI_SUCCESS);
+            copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
         ShallowCopyDiscontiguousToContiguous(copy, descriptor);
         argDescriptor = &copy;
       }
diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp
index 3debf53bb5290..67336d01380e0 100644
--- a/flang-rt/lib/runtime/descriptor.cpp
+++ b/flang-rt/lib/runtime/descriptor.cpp
@@ -158,7 +158,7 @@ RT_API_ATTRS static inline int MapAllocIdx(const Descriptor &desc) {
 #endif
 }
 
-RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
+RT_API_ATTRS int Descriptor::Allocate(std::int64_t *asyncObject) {
   std::size_t elementBytes{ElementBytes()};
   if (static_cast<std::int64_t>(elementBytes) < 0) {
     // F'2023 7.4.4.2 p5: "If the character length parameter value evaluates
@@ -170,7 +170,7 @@ RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
   // Zero size allocation is possible in Fortran and the resulting
   // descriptor must be allocated/associated. Since std::malloc(0)
   // result is implementation defined, always allocate at least one byte.
-  void *p{alloc(byteSize ? byteSize : 1, asyncId)};
+  void *p{alloc(byteSize ? byteSize : 1, asyncObject)};
   if (!p) {
     return CFI_ERROR_MEM_ALLOCATION;
   }
diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp
index 4c7f8e8b99e8f..03e574a8fbff1 100644
--- a/flang-rt/lib/runtime/extrema.cpp
+++ b/flang-rt/lib/runtime/extrema.cpp
@@ -152,7 +152,7 @@ inline RT_API_ATTRS void CharacterMaxOrMinLoc(const char *intrinsic,
       CFI_attribute_allocatable);
   result.GetDimension(0).SetBounds(1, extent[0]);
   Terminator terminator{source, line};
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
     terminator.Crash(
         "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
   }
@@ -181,7 +181,7 @@ inline RT_API_ATTRS void TotalNumericMaxOrMinLoc(const char *intrinsic,
       CFI_attribute_allocatable);
   result.GetDimension(0).SetBounds(1, extent[0]);
   Terminator terminator{source, line};
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
     terminator.Crash(
         "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
   }
diff --git a/flang-rt/lib/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp
index e3e98953b0cfc..5485f4b97bd2f 100644
--- a/flang-rt/lib/runtime/findloc.cpp
+++ b/flang-rt/lib/runtime/findloc.cpp
@@ -220,7 +220,7 @@ void RTDEF(Findloc)(Descriptor &result, const Descriptor &x,
       CFI_attribute_allocatable);
   result.GetDimension(0).SetBounds(1, extent[0]);
   Terminator terminator{source, line};
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
     terminator.Crash(
         "FINDLOC: could not allocate memory for result; STAT=%d", stat);
   }
diff --git a/flang-rt/lib/runtime/matmul-transpose.cpp b/flang-rt/lib/runtime/matmul-transpose.cpp
index 17987fb73d943..c9e21502b629e 100644
--- a/flang-rt/lib/runtime/matmul-transpose.cpp
+++ b/flang-rt/lib/runtime/matmul-transpose.cpp
@@ -183,7 +183,7 @@ inline static RT_API_ATTRS void DoMatmulTranspose(
     for (int j{0}; j < resRank; ++j) {
       result.GetDimension(j).SetBounds(1, extent[j]);
     }
-    if (int stat{result.Allocate(kNoAsyncId)}) {
+    if (int stat{result.Allocate(kNoAsyncObject)}) {
       terminator.Crash(
           "MATMUL-TRANSPOSE: could not allocate memory for result; STAT=%d",
           stat);
diff --git a/flang-rt/lib/runtime/matmul.cpp b/flang-rt/lib/runtime/matmul.cpp
index 0ff92cecbbcb8..5acb345725212 100644
--- a/flang-rt/lib/runtime/matmul.cpp
+++ b/flang-rt/lib/runtime/matmul.cpp
@@ -255,7 +255,7 @@ static inline RT_API_ATTRS void DoMatmul(
     for (int j{0}; j < resRank; ++j) {
       result.GetDimension(j).SetBounds(1, extent[j]);
     }
-    if (int stat{result.Allocate(kNoAsyncId)}) {
+    if (int stat{result.Allocate(kNoAsyncObject)}) {
       terminator.Crash(
           "MATMUL: could not allocate memory for result; STAT=%d", stat);
     }
diff --git a/flang-rt/lib/runtime/misc-intrinsic.cpp b/flang-rt/lib/runtime/misc-intrinsic.cpp
index 2fde859869ef0..a8797f48fa667 100644
--- a/flang-rt/lib/runtime/misc-intrinsic.cpp
+++ b/flang-rt/lib/runtime/misc-intrinsic.cpp
@@ -30,7 +30,7 @@ static RT_API_ATTRS void TransferImpl(Descriptor &result,
   if (const DescriptorAddendum * addendum{mold.Addendum()}) {
     *result.Addendum() = *addendum;
   }
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
     Terminator{sourceFile, line}.Crash(
         "TRANSFER: could not allocate memory for result; STAT=%d", stat);
   }
diff --git a/flang-rt/lib/runtime/pointer.cpp b/flang-rt/lib/runtime/pointer.cpp
index fd2427f4124b5..7331f7bbc3a75 100644
--- a/flang-rt/lib/runtime/pointer.cpp
+++ b/flang-rt/lib/runtime/pointer.cpp
@@ -129,7 +129,7 @@ RT_API_ATTRS void *AllocateValidatedPointerPayload(
   byteSize = ((byteSize + align - 1) / align) * align;
   std::size_t total{byteSize + sizeof(std::uintptr_t)};
   AllocFct alloc{allocatorRegistry.GetAllocator(allocatorIdx)};
-  void *p{alloc(total, /*asyncId=*/-1)};
+  void *p{alloc(total, /*asyncObject=*/nullptr)};
   if (p && allocatorIdx == 0) {
     // Fill the footer word with the XOR of the ones' complement of
     // the base address, which is a value that would be highly unlikely
diff --git a/flang-rt/lib/runtime/temporary-stack.cpp b/flang-rt/lib/runtime/temporary-stack.cpp
index 3a952b1fdbcca..3f6fd8ee15a80 100644
--- a/flang-rt/lib/runtime/temporary-stack.cpp
+++ b/flang-rt/lib/runtime/temporary-stack.cpp
@@ -148,7 +148,7 @@ void DescriptorStorage<COPY_VALUES>::push(const Descriptor &source) {
   if constexpr (COPY_VALUES) {
     // copy the data pointed to by the box
     box.set_base_addr(nullptr);
-    box.Allocate(kNoAsyncId);
+    box.Allocate(kNoAsyncObject);
     RTNAME(AssignTemporary)
     (box, source, terminator_.sourceFileName(), terminator_.sourceLine());
   }
diff --git a/flang-rt/lib/runtime/tools.cpp b/flang-rt/lib/runtime/tools.cpp
index 5d6e35faca70a..1f965b0b151ce 100644
--- a/flang-rt/lib/runtime/tools.cpp
+++ b/flang-rt/lib/runtime/tools.cpp
@@ -261,7 +261,7 @@ RT_API_ATTRS void CreatePartialReductionResult(Descriptor &result,
   for (int j{0}; j + 1 < xRank; ++j) {
     result.GetDimension(j).SetBounds(1, resultExtent[j]);
   }
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
     terminator.Crash(
         "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
   }
diff --git a/flang-rt/lib/runtime/transformational.cpp b/flang-rt/lib/runtime/transformational.cpp
index a7d5a48530ee9..3df314a4e966b 100644
--- a/flang-rt/lib/runtime/transformational.cpp
+++ b/flang-rt/lib/runtime/transformational.cpp
@@ -132,7 +132,7 @@ static inline RT_API_ATTRS std::size_t AllocateResult(Descriptor &result,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, extent[j]);
   }
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
     terminator.Crash(
         "%s: Could not allocate memory for result (stat=%d)", function, stat);
   }
@@ -157,7 +157,7 @@ static inline RT_API_ATTRS std::size_t AllocateBesselResult(Descriptor &result,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, extent[j]);
   }
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
     terminator.Crash(
         "%s: Could not allocate memory for result (stat=%d)", function, stat);
   }
diff --git a/flang-rt/unittests/Evaluate/reshape.cpp b/flang-rt/unittests/Evaluate/reshape.cpp
index 67a0be124e8e0..f84de443965d1 100644
--- a/flang-rt/unittests/Evaluate/reshape.cpp
+++ b/flang-rt/unittests/Evaluate/reshape.cpp
@@ -26,7 +26,7 @@ int main() {
   for (int j{0}; j < 3; ++j) {
     source->GetDimension(j).SetBounds(1, sourceExtent[j]);
   }
-  TEST(source->Allocate(kNoAsyncId) == CFI_SUCCESS);
+  TEST(source->Allocate(kNoAsyncObject) == CFI_SUCCESS);
   TEST(source->IsAllocated());
   MATCH(2, source->GetDimension(0).Extent());
   MATCH(3, source->GetDimension(1).Extent());
diff --git a/flang-rt/unittests/Runtime/Allocatable.cpp b/flang-rt/unittests/Runtime/Allocatable.cpp
index a6fcdd0d1423c..b394312e5bc5a 100644
--- a/flang-rt/unittests/Runtime/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/Allocatable.cpp
@@ -26,7 +26,7 @@ TEST(AllocatableTest, MoveAlloc) {
   auto b{createAllocatable(TypeCategory::Integer, 4)};
   // ALLOCATE(a(20))
   a->GetDimension(0).SetBounds(1, 20);
-  a->Allocate(kNoAsyncId);
+  a->Allocate(kNoAsyncObject);
 
   EXPECT_TRUE(a->IsAllocated());
   EXPECT_FALSE(b->IsAllocated());
@@ -46,7 +46,7 @@ TEST(AllocatableTest, MoveAlloc) {
   // move_alloc with errMsg
   auto errMsg{Descriptor::Create(
       sizeof(char), 64, nullptr, 0, nullptr, CFI_attribute_allocatable)};
-  errMsg->Allocate(kNoAsyncId);
+  errMsg->Allocate(kNoAsyncObject);
   RTNAME(MoveAlloc)(*b, *a, nullptr, false, errMsg.get(), __FILE__, __LINE__);
   EXPECT_FALSE(a->IsAllocated());
   EXPECT_TRUE(b->IsAllocated());
diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
index 89649aa95ad93..202119cf2ea20 100644
--- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
@@ -42,7 +42,8 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
   CUDA_REPORT_IF_ERROR(cudaMalloc(&device_desc, a->SizeInBytes()));
 
   RTNAME(AllocatableAllocate)
-  (*a, kNoAsyncId, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  (*a, kNoAsyncObject, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+      __LINE__);
   EXPECT_TRUE(a->IsAllocated());
   RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__);
   cudaDeviceSynchronize();
diff --git a/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp
index 2f1dc64dc8c5a..f1f931e87a86e 100644
--- a/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -35,7 +35,7 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
   EXPECT_FALSE(a->HasAddendum());
   RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
   RTNAME(AllocatableAllocate)
-  (*a, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+  (*a, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
       __LINE__);
   EXPECT_TRUE(a->IsAllocated());
   RTNAME(AllocatableDeallocate)
@@ -54,7 +54,7 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) {
   EXPECT_FALSE(a->HasAddendum());
   RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
   RTNAME(AllocatableAllocate)
-  (*a, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+  (*a, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
       __LINE__);
   EXPECT_TRUE(a->IsAllocated());
   RTNAME(AllocatableDeallocate)
diff --git a/flang-rt/unittests/Runtime/CUDA/Memory.cpp b/flang-rt/unittests/Runtime/CUDA/Memory.cpp
index b3612073657ab..7915baca6c203 100644
--- a/flang-rt/unittests/Runtime/CUDA/Memory.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Memory.cpp
@@ -50,8 +50,8 @@ TEST(MemoryCUFTest, CUFDataTransferDescDesc) {
   EXPECT_EQ((int)kDeviceAllocatorPos, dev->GetAllocIdx());
   RTNAME(AllocatableSetBounds)(*dev, 0, 1, 10);
   RTNAME(AllocatableAllocate)
-  (*dev, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
-      __LINE__);
+  (*dev, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
+      __FILE__, __LINE__);
   EXPECT_TRUE(dev->IsAllocated());
 
   // Create temp array to transfer to device.
diff --git a/flang-rt/unittests/Runtime/CharacterTest.cpp b/flang-rt/unittests/Runtime/CharacterTest.cpp
index 0f28e883671bc..2c7af27b9da77 100644
--- a/flang-rt/unittests/Runtime/CharacterTest.cpp
+++ b/flang-rt/unittests/Runtime/CharacterTest.cpp
@@ -35,7 +35,7 @@ OwningPtr<Descriptor> CreateDescriptor(const std::vector<SubscriptValue> &shape,
   for (int j{0}; j < rank; ++j) {
     descriptor->GetDimension(j).SetBounds(2, shape[j] + 1);
   }
-  if (descriptor->Allocate(kNoAsyncId) != 0) {
+  if (descriptor->Allocate(kNoAsyncObject) != 0) {
     return nullptr;
   }
 
diff --git a/flang-rt/unittests/Runtime/CommandTest.cpp b/flang-rt/unittests/Runtime/CommandTest.cpp
index 9d0da4ce8dd4e..6919a98105b8a 100644
--- a/flang-rt/unittests/Runtime/CommandTest.cpp
+++ b/flang-rt/unittests/Runtime/CommandTest.cpp
@@ -26,7 +26,7 @@ template <std::size_t n = 64>
 static OwningPtr<Descriptor> CreateEmptyCharDescriptor() {
   OwningPtr<Descriptor> descriptor{Descriptor::Create(
       sizeof(char), n, nullptr, 0, nullptr, CFI_attribute_allocatable)};
-  if (descriptor->Allocate(kNoAsyncId) != 0) {
+  if (descriptor->Allocate(kNoAsyncObject) != 0) {
     return nullptr;
   }
   return descriptor;
@@ -36,7 +36,7 @@ static OwningPtr<Descriptor> CharDescriptor(const char *value) {
   std::size_t n{std::strlen(value)};
   OwningPtr<Descriptor> descriptor{Descriptor::Create(
       sizeof(char), n, nullptr, 0, nullptr, CFI_attribute_allocatable)};
-  if (descriptor->Allocate(kNoAsyncId) != 0) {
+  if (descriptor->Allocate(kNoAsyncObject) != 0) {
     return nullptr;
   }
   std::memcpy(descriptor->OffsetElement(), value, n);
@@ -47,7 +47,7 @@ template <int kind = sizeof(std::int64_t)>
 static OwningPtr<Descriptor> EmptyIntDescriptor() {
   OwningPtr<Descriptor> descriptor{Descriptor::Create(TypeCategory::Integer,
       kind, nullptr, 0, nullptr, CFI_attribute_allocatable)};
-  if (descriptor->Allocate(kNoAsyncId) != 0) {
+  if (descriptor->Allocate(kNoAsyncObject) != 0) {
     return nullptr;
   }
   return descriptor;
@@ -57,7 +57,7 @@ template <int kind = sizeof(std::int64_t)>
 static OwningPtr<Descriptor> IntDescriptor(const int &value) {
   OwningPtr<Descriptor> descriptor{Descriptor::Create(TypeCategory::Integer,
       kind, nullptr, 0, nullptr, CFI_attribute_allocatable)};
-  if (descriptor->Allocate(kNoAsyncId) != 0) {
+  if (descriptor->Allocate(kNoAsyncObject) != 0) {
     return nullptr;
   }
   std::memcpy(descriptor->OffsetElement<int>(), &value, sizeof(int));
diff --git a/flang-rt/unittests/Runtime/TemporaryStack.cpp b/flang-rt/unittests/Runtime/TemporaryStack.cpp
index 3291794f22fc1..65725840459ab 100644
--- a/flang-rt/unittests/Runtime/TemporaryStack.cpp
+++ b/flang-rt/unittests/Runtime/TemporaryStack.cpp
@@ -59,7 +59,7 @@ TEST(TemporaryStack, ValueStackBasic) {
   Descriptor &outputDesc2{testDescriptorStorage[2].descriptor()};
   inputDesc.Establish(code, elementBytes, descriptorPtr, rank, extent);
 
-  inputDesc.Allocate(kNoAsyncId);
+  inputDesc.Allocate(kNoAsyncObject);
   ASSERT_EQ(inputDesc.IsAllocated(), true);
   uint32_t *inputData = static_cast<uint32_t *>(inputDesc.raw().base_addr);
   for (std::size_t i = 0; i < inputDesc.Elements(); ++i) {
@@ -123,7 +123,7 @@ TEST(TemporaryStack, ValueStackMultiSize) {
       boxDims.extent = extent[dim];
       boxDims.sm = elementBytes;
     }
-    desc->Allocate(kNoAsyncId);
+    desc->Allocate(kNoAsyncObject);
 
     // fill the array with some data to test
     for (uint32_t i = 0; i < desc->Elements(); ++i) {
diff --git a/flang-rt/unittests/Runtime/tools.h b/flang-rt/unittests/Runtime/tools.h
index a1eba45647a80..4ada862df110b 100644
--- a/flang-rt/unittests/Runtime/tools.h
+++ b/flang-rt/unittests/Runtime/tools.h
@@ -42,7 +42,7 @@ static OwningPtr<Descriptor> MakeArray(const std::vector<int> &shape,
   for (int j{0}; j < rank; ++j) {
     result->GetDimension(j).SetBounds(1, shape[j]);
   }
-  int stat{result->Allocate(kNoAsyncId)};
+  int stat{result->Allocate(kNoAsyncObject)};
   EXPECT_EQ(stat, 0) << stat;
   EXPECT_LE(data.size(), result->Elements());
   char *p{result->OffsetElement<char>()};
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 46cc59cda1612..e38738230ffbc 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -95,12 +95,11 @@ def cuf_AllocateOp : cuf_Op<"allocate", [AttrSizedOperandSegments,
   }];
 
   let arguments = (ins Arg<fir_ReferenceType, "", [MemRead, MemWrite]>:$box,
-                       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
-                       Optional<AnyIntegerType>:$stream,
-                       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$pinned,
-                       Arg<Optional<AnyRefOrBoxType>, "", [MemRead]>:$source,
-                       cuf_DataAttributeAttr:$data_attr,
-                       UnitAttr:$hasStat);
+      Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
+      Optional<fir_ReferenceType>:$stream,
+      Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$pinned,
+      Arg<Optional<AnyRefOrBoxType>, "", [MemRead]>:$source,
+      cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat);
 
   let results = (outs AnyIntegerType:$stat);
 
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 822f2d4a2b297..6c97afa9e10e8 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -17,14 +17,14 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 
 /// Perform allocation of the descriptor.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t stream = -1,
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t stream = -1,
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
@@ -32,14 +32,14 @@ int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t stream = -1,
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
 
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index 7fbd8f8e061f2..bdfc3268e0814 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -17,14 +17,14 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 
 /// Perform allocation of the descriptor.
-int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t stream = -1,
+int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t stream = -1,
+int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
@@ -32,14 +32,14 @@ int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t stream = -1,
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
 
diff --git a/flang/include/flang/Runtime/allocatable.h b/flang/include/flang/Runtime/allocatable.h
index 6895f8af5e2a8..863c07494e7c3 100644
--- a/flang/include/flang/Runtime/allocatable.h
+++ b/flang/include/flang/Runtime/allocatable.h
@@ -94,9 +94,10 @@ int RTDECL(AllocatableCheckLengthParameter)(Descriptor &,
 // Successfully allocated memory is initialized if the allocatable has a
 // derived type, and is always initialized by AllocatableAllocateSource().
 // Performs all necessary coarray synchronization and validation actions.
-int RTDECL(AllocatableAllocate)(Descriptor &, std::int64_t asyncId = -1,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+int RTDECL(AllocatableAllocate)(Descriptor &,
+    std::int64_t *asyncObject = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 8d0444a6e5bd4..af8169c8e7f7b 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -773,7 +773,7 @@ class AllocateStmtHelper {
     mlir::Value errmsg = errMsgExpr ? errorManager.errMsgAddr : nullptr;
     mlir::Value stream =
         streamExpr
-            ? fir::getBase(converter.genExprValue(loc, *streamExpr, stmtCtx))
+            ? fir::getBase(converter.genExprAddr(loc, *streamExpr, stmtCtx))
             : nullptr;
     mlir::Value pinned =
         pinnedExpr
diff --git a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
index 28452d3b486da..cd5f1f6d098c3 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
@@ -76,8 +76,7 @@ void fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder,
   mlir::func::FuncOp func{
       fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc, builder)};
   mlir::FunctionType fTy{func.getFunctionType()};
-  mlir::Value asyncId =
-      builder.createIntegerConstant(loc, builder.getI64Type(), -1);
+  mlir::Value asyncObject = builder.createNullConstant(loc);
   mlir::Value sourceFile{fir::factory::locationToFilename(builder, loc)};
   mlir::Value sourceLine{
       fir::factory::locationToLineNo(builder, loc, fTy.getInput(5))};
@@ -88,7 +87,7 @@ void fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder,
     errMsg = builder.create<fir::AbsentOp>(loc, boxNoneTy).getResult();
   }
   llvm::SmallVector<mlir::Value> args{
-      fir::runtime::createArguments(builder, loc, fTy, desc, asyncId, hasStat,
-                                    errMsg, sourceFile, sourceLine)};
+      fir::runtime::createArguments(builder, loc, fTy, desc, asyncObject,
+                                    hasStat, errMsg, sourceFile, sourceLine)};
   builder.create<fir::CallOp>(loc, func, args);
 }
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 24033bc15b8eb..687007d957225 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -76,6 +76,16 @@ llvm::LogicalResult cuf::FreeOp::verify() { return checkCudaAttr(*this); }
 // AllocateOp
 //===----------------------------------------------------------------------===//
 
+template <typename OpTy>
+static llvm::LogicalResult checkStreamType(OpTy op) {
+  if (!op.getStream())
+    return mlir::success();
+  if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType()))
+    if (!refTy.getEleTy().isInteger(64))
+      return op.emitOpError("stream is expected to be an i64 reference");
+  return mlir::success();
+}
+
 llvm::LogicalResult cuf::AllocateOp::verify() {
   if (getPinned() && getStream())
     return emitOpError("pinned and stream cannot appears at the same time");
@@ -92,7 +102,7 @@ llvm::LogicalResult cuf::AllocateOp::verify() {
         "expect errmsg to be a reference to/or a box type value");
   if (getErrmsg() && !getHasStat())
     return emitOpError("expect stat attribute when errmsg is provided");
-  return mlir::success();
+  return checkStreamType(*this);
 }
 
 //===----------------------------------------------------------------------===//
@@ -143,16 +153,6 @@ llvm::LogicalResult cuf::DeallocateOp::verify() {
 // KernelLaunchOp
 //===----------------------------------------------------------------------===//
 
-template <typename OpTy>
-static llvm::LogicalResult checkStreamType(OpTy op) {
-  if (!op.getStream())
-    return mlir::success();
-  if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType()))
-    if (!refTy.getEleTy().isInteger(64))
-      return op.emitOpError("stream is expected to be an i64 reference");
-  return mlir::success();
-}
-
 llvm::LogicalResult cuf::KernelLaunchOp::verify() {
   return checkStreamType(*this);
 }
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index e70ceb3a67d98..3a3eab9e8e37b 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -128,17 +128,15 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
                            mlir::IntegerType::get(op.getContext(), 1)));
     if (op.getSource()) {
       mlir::Value stream =
-          op.getStream()
-              ? op.getStream()
-              : builder.createIntegerConstant(loc, fTy.getInput(2), -1);
+          op.getStream() ? op.getStream()
+                         : builder.createNullConstant(loc, fTy.getInput(2));
       args = fir::runtime::createArguments(
           builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
           hasStat, errmsg, sourceFile, sourceLine);
     } else {
       mlir::Value stream =
-          op.getStream()
-              ? op.getStream()
-              : builder.createIntegerConstant(loc, fTy.getInput(1), -1);
+          op.getStream() ? op.getStream()
+                         : builder.createNullConstant(loc, fTy.getInput(1));
       args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
                                            stream, pinned, hasStat, errmsg,
                                            sourceFile, sourceLine);
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 095ad92d5deb5..ea7890c9aac52 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
 // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
 // CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
 }
 
 // CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
 // CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 // CHECK: _FortranAAllocatableSetBounds
 // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
 // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
 // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -170,16 +170,14 @@ func.func @_QQallocate_stream() {
   %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
   %2 = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
   %3 = fir.declare %2 {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
-  %4 = fir.load %3 : !fir.ref<i64>
-  %5 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> stream(%4 : i64) {data_attr = #cuf.cuda<device>} -> i32
+  %5 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> stream(%3 : !fir.ref<i64>) {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
 // CHECK-LABEL: func.func @_QQallocate_stream() 
 // CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
 // CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
-// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 func.func @_QPp_alloc() {
@@ -268,6 +266,6 @@ func.func @_QQpinned() attributes {fir.bindc_name = "testasync"} {
 // CHECK: %[[PINNED:.*]] = fir.alloca !fir.logical<4> {bindc_name = "pinnedflag", uniq_name = "_QFEpinnedflag"}
 // CHECK: %[[DECL_PINNED:.*]] = fir.declare %[[PINNED]] {uniq_name = "_QFEpinnedflag"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>>
 // CHECK: %[[CONV_PINNED:.*]] = fir.convert %[[DECL_PINNED]] : (!fir.ref<!fir.logical<4>>) -> !fir.ref<i1>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %{{.*}}, %[[CONV_PINNED]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %{{.*}}, %[[CONV_PINNED]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 } // end of module
diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir
index a3b9be3ee8223..dceb8f6fde236 100644
--- a/flang/test/Fir/cuf-invalid.fir
+++ b/flang/test/Fir/cuf-invalid.fir
@@ -2,13 +2,12 @@
 
 func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
-  %1 = fir.alloca i32
+  %s = fir.alloca i64
   %pinned = fir.alloca i1
   %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-  %s = fir.load %1 : !fir.ref<i32>
   // expected-error@+1{{'cuf.allocate' op pinned and stream cannot appears at the same time}}
-  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> stream(%s : i32) pinned(%pinned : !fir.ref<i1>) {data_attr = #cuf.cuda<device>} -> i32
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> stream(%s : !fir.ref<i64>) pinned(%pinned : !fir.ref<i1>) {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
diff --git a/flang/test/Fir/cuf.mlir b/flang/test/Fir/cuf.mlir
index d38b26a4548ed..f80a70eca34a3 100644
--- a/flang/test/Fir/cuf.mlir
+++ b/flang/test/Fir/cuf.mlir
@@ -18,15 +18,14 @@ func.func @_QPsub1() {
 
 func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
-  %1 = fir.alloca i32
+  %1 = fir.alloca i64
   %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-  %s = fir.load %1 : !fir.ref<i32>
-  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> stream(%s : i32) {data_attr = #cuf.cuda<device>} -> i32
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> stream(%1 : !fir.ref<i64>) {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
-// CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<none>> stream(%{{.*}} : i32) {data_attr = #cuf.cuda<device>} -> i32
+// CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<none>> stream(%{{.*}} : !fir.ref<i64>) {data_attr = #cuf.cuda<device>} -> i32
 
 // -----
 
diff --git a/flang/test/HLFIR/elemental-codegen.fir b/flang/test/HLFIR/elemental-codegen.fir
index a715479f16115..67af4261470f7 100644
--- a/flang/test/HLFIR/elemental-codegen.fir
+++ b/flang/test/HLFIR/elemental-codegen.fir
@@ -191,7 +191,7 @@ func.func @test_polymorphic(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.bindc_
 // CHECK:           %[[VAL_35:.*]] = fir.absent !fir.box<none>
 // CHECK:           %[[VAL_36:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:           %[[VAL_37:.*]] = fir.convert %[[VAL_31]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:           %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %{{.*}}, %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:           %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %{{.*}}, %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:           %[[VAL_12:.*]] = arith.constant true
 // CHECK:           %[[VAL_39:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
 // CHECK:           %[[VAL_40:.*]] = arith.constant 1 : index
@@ -275,7 +275,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.b
 // CHECK:           %[[VAL_36:.*]] = fir.absent !fir.box<none>
 // CHECK:           %[[VAL_37:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:           %[[VAL_38:.*]] = fir.convert %[[VAL_32]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:           %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %{{.*}}, %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], %[[VAL_34]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:           %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %{{.*}}, %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], %[[VAL_34]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:           %[[VAL_13:.*]] = arith.constant true
 // CHECK:           %[[VAL_40:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
 // CHECK:           %[[VAL_41:.*]] = arith.constant 1 : index
@@ -328,7 +328,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.b
 // CHECK:           %[[VAL_85:.*]] = fir.absent !fir.box<none>
 // CHECK:           %[[VAL_86:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:           %[[VAL_87:.*]] = fir.convert %[[VAL_81]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:           %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %{{.*}}, %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:           %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %{{.*}}, %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:           %[[VAL_62:.*]] = arith.constant true
 // CHECK:           %[[VAL_89:.*]] = fir.load %[[VAL_63]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
 // CHECK:           %[[VAL_90:.*]] = arith.constant 1 : index
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
index a570f636b8db1..cec10dda839e9 100644
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -90,7 +90,7 @@ end subroutine
 
 subroutine sub4()
   real, allocatable, device :: a(:)
-  integer :: istream
+  integer(8) :: istream
   allocate(a(10), stream=istream)
 end subroutine
 
@@ -98,11 +98,10 @@ end subroutine
 ! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub4Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK: fir.embox {{.*}} {allocator_idx = 2 : i32}
 ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
-! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"}
-! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ISTREAM:.*]] = fir.alloca i64 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"}
+! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: fir.call @_FortranAAllocatableSetBounds
-! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref<i32>
-! CHECK: %{{.*}} = cuf.allocate %[[BOX_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {data_attr = #cuf.cuda<device>} -> i32
+! CHECK: %{{.*}} = cuf.allocate %[[BOX_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[ISTREAM_DECL]]#0 : !fir.ref<i64>) {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: fir.if %{{.*}} {
 ! CHECK:   %{{.*}} = cuf.deallocate %[[BOX_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: }
diff --git a/flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90 b/flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90
index 5bb1ae3797346..6869af863644d 100644
--- a/flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90
+++ b/flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90
@@ -473,6 +473,6 @@ subroutine init()
 end module
 
 ! CHECK-LABEL: func.func @_QMacc_declare_post_action_statPinit()
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK: fir.if
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90
index 889cdef51f4ce..4d95ffa10edaf 100644
--- a/flang/test/Lower/OpenACC/acc-declare.f90
+++ b/flang/test/Lower/OpenACC/acc-declare.f90
@@ -434,6 +434,6 @@ subroutine init()
 end module
 
 ! CHECK-LABEL: func.func @_QMacc_declare_post_action_statPinit()
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK: fir.if
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index dd8671daeaf8e..cbd7876203424 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -267,7 +267,7 @@ subroutine test_allocatable()
 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[P_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[P_CAST:.*]] = fir.convert %[[P_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[TYPE_DESC_P1:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}>
 ! CHECK: %[[C1_CAST:.*]] = fir.convert %[[C1_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
@@ -276,7 +276,7 @@ subroutine test_allocatable()
 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C1_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[C1_CAST:.*]] = fir.convert %[[C1_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C1_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C1_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}>
 ! CHECK: %[[C2_CAST:.*]] = fir.convert %[[C2_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
@@ -285,7 +285,7 @@ subroutine test_allocatable()
 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C2_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[C2_CAST:.*]] = fir.convert %[[C2_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C2_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C2_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[TYPE_DESC_P1:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}>
 ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
@@ -300,7 +300,7 @@ subroutine test_allocatable()
 ! CHECK: %[[C10_I64:.*]] = fir.convert %[[C10]] : (i32) -> i64
 ! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[C3_CAST]], %[[C0]], %[[C1_I64]], %[[C10_I64]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}>
 ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
@@ -316,7 +316,7 @@ subroutine test_allocatable()
 ! CHECK: %[[C20_I64:.*]] = fir.convert %[[C20]] : (i32) -> i64
 ! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[C4_CAST]], %[[C0]], %[[C1_I64]], %[[C20_I64]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[C1_LOAD1:.*]] = fir.load %[[C1_DECL]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>
 ! CHECK: fir.dispatch "proc1"(%[[C1_LOAD1]] : !fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>)
@@ -390,7 +390,7 @@ subroutine test_unlimited_polymorphic_with_intrinsic_type_spec()
 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitIntrinsicForAllocate(%[[BOX_NONE]], %[[CAT]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, i32, i32, i32, i32) -> ()
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[P_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<none>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[PTR_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<none>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: %[[CAT:.*]] = arith.constant 2 : i32
@@ -573,7 +573,7 @@ subroutine test_allocatable_up_character()
 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitCharacterForAllocate(%[[A_NONE]], %[[LEN]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i32, i32, i32) -> ()
 ! CHECK: %[[A_NONE:.*]] = fir.convert %[[A_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<none>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 end module
 
@@ -592,17 +592,17 @@ program test_alloc
 ! LLVM-LABEL: define void @_QMpolyPtest_allocatable()
 
 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 1, i32 0)
 ! LLVM: call void @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 10)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 1, i32 0)
 ! LLVM: call void @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 20)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM-COUNT-2:  call void %{{[0-9]*}}()
 
 ! LLVM: call void @llvm.memcpy.p0.p0.i32
@@ -683,5 +683,5 @@ program test_alloc
 ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 8, i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyEXdtXp1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]]
 ! LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %[[ALLOCA2:[0-9]+]], ptr %[[ALLOCA1]], i32 40, i1 false)
 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyEXdtXp1, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableDeallocatePolymorphic(ptr %[[ALLOCA2]], ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
diff --git a/flang/test/Lower/allocatable-runtime.f90 b/flang/test/Lower/allocatable-runtime.f90
index 37272c90656cc..c63252c68974e 100644
--- a/flang/test/Lower/allocatable-runtime.f90
+++ b/flang/test/Lower/allocatable-runtime.f90
@@ -31,7 +31,7 @@ subroutine foo()
   ! CHECK: fir.call @{{.*}}AllocatableSetBounds(%[[xBoxCast2]], %c0{{.*}}, %[[xlbCast]], %[[xubCast]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
   ! CHECK-DAG: %[[xBoxCast3:.*]] = fir.convert %[[xBoxAddr]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
   ! CHECK-DAG: %[[sourceFile:.*]] = fir.convert %{{.*}} -> !fir.ref<i8>
-  ! CHECK: fir.call @{{.*}}AllocatableAllocate(%[[xBoxCast3]], %{{.*}}, %false{{.*}}, %[[errMsg]], %[[sourceFile]], %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+  ! CHECK: fir.call @{{.*}}AllocatableAllocate(%[[xBoxCast3]], %{{.*}}, %false{{.*}}, %[[errMsg]], %[[sourceFile]], %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
   ! Simply check that we are emitting the right numebr of set bound for y and z. Otherwise, this is just like x.
   ! CHECK: fir.convert %[[yBoxAddr]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
@@ -180,4 +180,4 @@ subroutine mold_allocation()
 ! CHECK: %[[M_BOX_NONE:.*]] = fir.convert %[[EMBOX_M]] : (!fir.box<!fir.array<10xi32>>) -> !fir.box<none>
 ! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[M_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> ()
 ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/allocate-mold.f90 b/flang/test/Lower/allocate-mold.f90
index c7985b11397ce..9427c8b08786f 100644
--- a/flang/test/Lower/allocate-mold.f90
+++ b/flang/test/Lower/allocate-mold.f90
@@ -16,7 +16,7 @@ subroutine scalar_mold_allocation()
 ! CHECK: %[[A_REF_BOX_NONE1:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[A_REF_BOX_NONE1]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> ()
 ! CHECK: %[[A_REF_BOX_NONE2:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_REF_BOX_NONE2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_REF_BOX_NONE2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 subroutine array_scalar_mold_allocation()
   real, allocatable :: a(:)
@@ -40,4 +40,4 @@ end subroutine array_scalar_mold_allocation
 ! CHECK: %[[REF_BOX_A1:.*]] = fir.convert %1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[REF_BOX_A1]], {{.*}},{{.*}}, {{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK: %[[REF_BOX_A2:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[REF_BOX_A2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[REF_BOX_A2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90
index 485861a838ff6..b7be5f685d9e3 100644
--- a/flang/test/Lower/polymorphic.f90
+++ b/flang/test/Lower/polymorphic.f90
@@ -1149,7 +1149,7 @@ program test
 ! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test"} {
 ! CHECK: %[[ADDR_O:.*]] = fir.address_of(@_QFEo) : !fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ADDR_O]] : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK: %[[O:.*]] = fir.load %[[ADDR_O]] : !fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>
 ! CHECK: %[[COORD_INNER:.*]] = fir.coordinate_of %[[O]], inner : (!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>) -> !fir.ref<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
 ! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%arg1 = %{{.*}}) -> (!fir.array<5x!fir.logical<4>>) {
diff --git a/flang/test/Transforms/lower-repack-arrays.fir b/flang/test/Transforms/lower-repack-arrays.fir
index bbae7ba5b0e0b..0b323b1bb0697 100644
--- a/flang/test/Transforms/lower-repack-arrays.fir
+++ b/flang/test/Transforms/lower-repack-arrays.fir
@@ -840,7 +840,7 @@ func.func @_QPtest6(%arg0: !fir.class<!fir.array<?x?x!fir.type<_QMmTt>>> {fir.bi
 // CHECK:               %[[VAL_34:.*]] = fir.absent !fir.box<none>
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>
 // CHECK:               %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> !fir.box<none>
@@ -928,7 +928,7 @@ func.func @_QPtest6_stack(%arg0: !fir.class<!fir.array<?x?x!fir.type<_QMmTt>>> {
 // CHECK:               %[[VAL_34:.*]] = fir.absent !fir.box<none>
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>
 // CHECK:               %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> !fir.box<none>
@@ -1015,7 +1015,7 @@ func.func @_QPtest7(%arg0: !fir.class<!fir.array<?x?xnone>> {fir.bindc_name = "x
 // CHECK:               %[[VAL_34:.*]] = fir.absent !fir.box<none>
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>
 // CHECK:               %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !fir.box<none>
@@ -1103,7 +1103,7 @@ func.func @_QPtest7_stack(%arg0: !fir.class<!fir.array<?x?xnone>> {fir.bindc_nam
 // CHECK:               %[[VAL_34:.*]] = fir.absent !fir.box<none>
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>
 // CHECK:               %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !fir.box<none>