Skip to content

Commit 9684c87

Browse files
authored
[flang][runtime] Fixed performance regression in CopyElement. (#102081)
Polyhedron/capacita, protein and CPU2000/facerec, wupwise showed up to 60% regression on x86 after #101421. The memcpy loops of the toAt and fromAt arrays that are run to create the initial work item end up being encoded as 'rep mov', and they add noticeable overhead compared to the total amount of work. 'rep mov' is not the best choice for small-size memcpy (e.g. when the array rank is 1 or 2, it would be quite slow). Moreover, the rest of the stack-related setup is also noticeable for the simple cases. I added a shortcut for the simple copy case, and also got rid of the initial toAt/fromAt copies by allowing the CopyDescriptor to use external subscript storage.
1 parent b809671 commit 9684c87

File tree

1 file changed

+39
-10
lines changed

1 file changed

+39
-10
lines changed

flang/runtime/copy.cpp

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,17 @@ using StaticDescTy = StaticDescriptor<maxRank, true, 0>;
2323
// for CopyElement.
2424
struct CopyDescriptor {
2525
// A constructor specifying all members explicitly.
26+
// The toAt and fromAt specify subscript storages that might be
27+
// external to CopyElement, and cannot be modified.
28+
// The copy descriptor only establishes toAtPtr_ and fromAtPtr_
29+
// pointers to point to these storages.
2630
RT_API_ATTRS CopyDescriptor(const Descriptor &to, const SubscriptValue toAt[],
2731
const Descriptor &from, const SubscriptValue fromAt[],
2832
std::size_t elements, bool usesStaticDescriptors = false)
2933
: to_(to), from_(from), elements_(elements),
3034
usesStaticDescriptors_(usesStaticDescriptors) {
31-
for (int dim{0}; dim < to.rank(); ++dim) {
32-
toAt_[dim] = toAt[dim];
33-
}
34-
for (int dim{0}; dim < from.rank(); ++dim) {
35-
fromAt_[dim] = fromAt[dim];
36-
}
35+
toAtPtr_ = toAt;
36+
fromAtPtr_ = fromAt;
3737
}
3838
// The number of elements to copy is initialized from the to descriptor.
3939
// The current element subscripts are initialized from the lower bounds
@@ -46,14 +46,32 @@ struct CopyDescriptor {
4646
from.GetLowerBounds(fromAt_);
4747
}
4848

49+
// Increment the toAt_ and fromAt_ subscripts to the next
50+
// element.
51+
RT_API_ATTRS void IncrementSubscripts(Terminator &terminator) {
52+
// This method must not be called for copy descriptors
53+
// using external non-modifiable subscript storage.
54+
RUNTIME_CHECK(terminator, toAt_ == toAtPtr_ && fromAt_ == fromAtPtr_);
55+
to_.IncrementSubscripts(toAt_);
56+
from_.IncrementSubscripts(fromAt_);
57+
}
58+
4959
// Descriptor of the destination.
5060
const Descriptor &to_;
5161
// A subscript specifying the current element position to copy to.
5262
SubscriptValue toAt_[maxRank];
63+
// A pointer to the storage of the 'to' subscript.
64+
// It may point to toAt_ or to an external non-modifiable
65+
// subscript storage.
66+
const SubscriptValue *toAtPtr_{toAt_};
5367
// Descriptor of the source.
5468
const Descriptor &from_;
5569
// A subscript specifying the current element position to copy from.
5670
SubscriptValue fromAt_[maxRank];
71+
// A pointer to the storage of the 'from' subscript.
72+
// It may point to fromAt_ or to an external non-modifiable
73+
// subscript storage.
74+
const SubscriptValue *fromAtPtr_{fromAt_};
5775
// Number of elements left to copy.
5876
std::size_t elements_;
5977
// Must be true, if the to and from descriptors are allocated
@@ -75,6 +93,18 @@ RT_OFFLOAD_API_GROUP_BEGIN
7593
RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
7694
const Descriptor &from, const SubscriptValue fromAt[],
7795
Terminator &terminator) {
96+
if (!to.Addendum()) {
97+
// Avoid the overhead of creating the work stacks below
98+
// for the simple non-derived type cases, because the overhead
99+
// might be noticeable over the total amount of work that
100+
// needs to be done for the copy.
101+
char *toPtr{to.Element<char>(toAt)};
102+
char *fromPtr{from.Element<char>(fromAt)};
103+
RUNTIME_CHECK(terminator, to.ElementBytes() == from.ElementBytes());
104+
std::memcpy(toPtr, fromPtr, to.ElementBytes());
105+
return;
106+
}
107+
78108
#if !defined(RT_DEVICE_COMPILATION)
79109
constexpr unsigned copyStackReserve{16};
80110
constexpr unsigned descriptorStackReserve{6};
@@ -108,9 +138,9 @@ RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
108138
continue;
109139
}
110140
const Descriptor &curTo{currentCopy.to_};
111-
SubscriptValue *curToAt{currentCopy.toAt_};
141+
const SubscriptValue *curToAt{currentCopy.toAtPtr_};
112142
const Descriptor &curFrom{currentCopy.from_};
113-
SubscriptValue *curFromAt{currentCopy.fromAt_};
143+
const SubscriptValue *curFromAt{currentCopy.fromAtPtr_};
114144
char *toPtr{curTo.Element<char>(curToAt)};
115145
char *fromPtr{curFrom.Element<char>(curFromAt)};
116146
RUNTIME_CHECK(terminator, curTo.ElementBytes() == curFrom.ElementBytes());
@@ -121,8 +151,7 @@ RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
121151
std::memcpy(toPtr, fromPtr, curTo.ElementBytes());
122152
--elements;
123153
if (elements != 0) {
124-
curTo.IncrementSubscripts(curToAt);
125-
curFrom.IncrementSubscripts(curFromAt);
154+
currentCopy.IncrementSubscripts(terminator);
126155
}
127156

128157
// Deep copy allocatable and automatic components if any.

0 commit comments

Comments
 (0)