Skip to content

Commit 534453e

Browse files
dcharkesCommit Queue
authored and
Commit Queue
committed
[vm] MemoryCopyInstr Remove constant length loops
Removes loops with constant length, taking code size into account. On ia32 and x64, only removes single-iteration loops (removing the branch). This speeds up single byte copies. On arm, arm64, and risc-v, removes loops up to 4 iterations, shrinking code size. No speedups were measured on these platforms. TEST=runtime/vm/compiler/backend/memory_copy_test.cc Bug: #51031 Change-Id: I292ebde023b3ec2c3a9ce872e0c9543ac43371b9 Cq-Include-Trybots: luci.dart.try:vm-precomp-ffi-qemu-linux-release-riscv64-try,vm-precomp-ffi-qemu-linux-release-arm-try,vm-ffi-android-debug-arm64c-try,vm-ffi-android-debug-arm-try,vm-kernel-nnbd-mac-debug-arm64-try,vm-kernel-nnbd-win-debug-x64-try,vm-kernel-win-debug-x64c-try,vm-kernel-win-debug-ia32-try,vm-kernel-nnbd-linux-debug-ia32-try,vm-reload-rollback-linux-debug-x64-try Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/279178 Reviewed-by: Aske Simon Christensen <[email protected]> Reviewed-by: Alexander Markov <[email protected]>
1 parent e51b24a commit 534453e

File tree

7 files changed

+258
-27
lines changed

7 files changed

+258
-27
lines changed

runtime/vm/compiler/backend/il_arm.cc

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,8 @@ LocationSummary* MemoryCopyInstr::MakeLocationSummary(Zone* zone,
167167
locs->set_in(kDestPos, Location::WritableRegister());
168168
locs->set_in(kSrcStartPos, LocationRegisterOrConstant(src_start()));
169169
locs->set_in(kDestStartPos, LocationRegisterOrConstant(dest_start()));
170-
locs->set_in(kLengthPos, Location::WritableRegister());
170+
locs->set_in(kLengthPos,
171+
LocationWritableRegisterOrSmiConstant(length(), 0, 4));
171172
for (intptr_t i = 0; i < kNumTemps; i++) {
172173
locs->set_temp(i, Location::RequiresRegister());
173174
}
@@ -179,7 +180,8 @@ void MemoryCopyInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
179180
const Register dest_reg = locs()->in(kDestPos).reg();
180181
const Location src_start_loc = locs()->in(kSrcStartPos);
181182
const Location dest_start_loc = locs()->in(kDestStartPos);
182-
const Register length_reg = locs()->in(kLengthPos).reg();
183+
const Location length_loc = locs()->in(kLengthPos);
184+
const bool constant_length = length_loc.IsConstant();
183185

184186
const Register temp_reg = locs()->temp(0).reg();
185187
RegList temp_regs = 0;
@@ -190,6 +192,39 @@ void MemoryCopyInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
190192
EmitComputeStartPointer(compiler, src_cid_, src_reg, src_start_loc);
191193
EmitComputeStartPointer(compiler, dest_cid_, dest_reg, dest_start_loc);
192194

195+
if (constant_length) {
196+
const intptr_t mov_repeat =
197+
Integer::Cast(length_loc.constant()).AsInt64Value();
198+
for (intptr_t i = 0; i < mov_repeat; i++) {
199+
compiler::Address src_address =
200+
compiler::Address(src_reg, element_size_ * i);
201+
compiler::Address dest_address =
202+
compiler::Address(dest_reg, element_size_ * i);
203+
switch (element_size_) {
204+
case 1:
205+
__ ldrb(temp_reg, src_address);
206+
__ strb(temp_reg, dest_address);
207+
break;
208+
case 2:
209+
__ ldrh(temp_reg, src_address);
210+
__ strh(temp_reg, dest_address);
211+
break;
212+
case 4:
213+
__ ldr(temp_reg, src_address);
214+
__ str(temp_reg, dest_address);
215+
break;
216+
case 8:
217+
case 16:
218+
__ ldm(BlockAddressMode::IA_W, src_reg, temp_regs);
219+
__ stm(BlockAddressMode::IA_W, dest_reg, temp_regs);
220+
break;
221+
}
222+
}
223+
return;
224+
}
225+
226+
const Register length_reg = length_loc.reg();
227+
193228
compiler::Label loop, done;
194229

195230
compiler::Address src_address =
@@ -4331,8 +4366,8 @@ LocationSummary* BoxInt64Instr::MakeLocationSummary(Zone* zone,
43314366
object_store->allocate_mint_without_fpu_regs_stub()
43324367
->untag()
43334368
->InVMIsolateHeap();
4334-
const bool shared_slow_path_call = SlowPathSharingSupported(opt) &&
4335-
!stubs_in_vm_isolate;
4369+
const bool shared_slow_path_call =
4370+
SlowPathSharingSupported(opt) && !stubs_in_vm_isolate;
43364371
LocationSummary* summary = new (zone) LocationSummary(
43374372
zone, kNumInputs, kNumTemps,
43384373
ValueFitsSmi()

runtime/vm/compiler/backend/il_arm64.cc

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,8 @@ LocationSummary* MemoryCopyInstr::MakeLocationSummary(Zone* zone,
164164
locs->set_in(kDestPos, Location::WritableRegister());
165165
locs->set_in(kSrcStartPos, LocationRegisterOrConstant(src_start()));
166166
locs->set_in(kDestStartPos, LocationRegisterOrConstant(dest_start()));
167-
locs->set_in(kLengthPos, Location::WritableRegister());
167+
locs->set_in(kLengthPos,
168+
LocationWritableRegisterOrSmiConstant(length(), 0, 4));
168169
locs->set_temp(0, element_size_ == 16
169170
? Location::Pair(Location::RequiresRegister(),
170171
Location::RequiresRegister())
@@ -177,7 +178,8 @@ void MemoryCopyInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
177178
const Register dest_reg = locs()->in(kDestPos).reg();
178179
const Location src_start_loc = locs()->in(kSrcStartPos);
179180
const Location dest_start_loc = locs()->in(kDestStartPos);
180-
const Register length_reg = locs()->in(kLengthPos).reg();
181+
const Location length_loc = locs()->in(kLengthPos);
182+
const bool constant_length = length_loc.IsConstant();
181183

182184
Register temp_reg, temp_reg2;
183185
if (locs()->temp(0).IsPairLocation()) {
@@ -192,6 +194,42 @@ void MemoryCopyInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
192194
EmitComputeStartPointer(compiler, src_cid_, src_reg, src_start_loc);
193195
EmitComputeStartPointer(compiler, dest_cid_, dest_reg, dest_start_loc);
194196

197+
if (constant_length) {
198+
const intptr_t mov_repeat =
199+
Integer::Cast(length_loc.constant()).AsInt64Value();
200+
for (intptr_t i = 0; i < mov_repeat; i++) {
201+
compiler::Address src_address =
202+
compiler::Address(src_reg, element_size_ * i);
203+
compiler::Address dest_address =
204+
compiler::Address(dest_reg, element_size_ * i);
205+
switch (element_size_) {
206+
case 1:
207+
__ ldr(temp_reg, src_address, compiler::kUnsignedByte);
208+
__ str(temp_reg, dest_address, compiler::kUnsignedByte);
209+
break;
210+
case 2:
211+
__ ldr(temp_reg, src_address, compiler::kUnsignedTwoBytes);
212+
__ str(temp_reg, dest_address, compiler::kUnsignedTwoBytes);
213+
break;
214+
case 4:
215+
__ ldr(temp_reg, src_address, compiler::kUnsignedFourBytes);
216+
__ str(temp_reg, dest_address, compiler::kUnsignedFourBytes);
217+
break;
218+
case 8:
219+
__ ldr(temp_reg, src_address, compiler::kEightBytes);
220+
__ str(temp_reg, dest_address, compiler::kEightBytes);
221+
break;
222+
case 16:
223+
__ ldp(temp_reg, temp_reg2, src_address, compiler::kEightBytes);
224+
__ stp(temp_reg, temp_reg2, dest_address, compiler::kEightBytes);
225+
break;
226+
}
227+
}
228+
return;
229+
}
230+
231+
const Register length_reg = length_loc.reg();
232+
195233
compiler::Label loop, done;
196234

197235
compiler::Address src_address =
@@ -225,6 +263,7 @@ void MemoryCopyInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
225263
__ stp(temp_reg, temp_reg2, dest_address, compiler::kEightBytes);
226264
break;
227265
}
266+
228267
__ subs(length_reg, length_reg, compiler::Operand(loop_subtract),
229268
compiler::kObjectBytes);
230269
__ b(&loop, NOT_ZERO);

runtime/vm/compiler/backend/il_ia32.cc

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,29 +79,73 @@ DEFINE_BACKEND(TailCall,
7979

8080
LocationSummary* MemoryCopyInstr::MakeLocationSummary(Zone* zone,
8181
bool opt) const {
82+
const bool remove_loop =
83+
length()->BindsToSmiConstant() && length()->BoundSmiConstant() <= 4;
8284
const intptr_t kNumInputs = 5;
83-
const intptr_t kNumTemps = 0;
85+
const intptr_t kNumTemps = remove_loop ? 1 : 0;
8486
LocationSummary* locs = new (zone)
8587
LocationSummary(zone, kNumInputs, kNumTemps, LocationSummary::kNoCall);
8688
locs->set_in(kSrcPos, Location::RequiresRegister());
8789
locs->set_in(kDestPos, Location::RegisterLocation(EDI));
8890
locs->set_in(kSrcStartPos, LocationRegisterOrConstant(src_start()));
8991
locs->set_in(kDestStartPos, LocationRegisterOrConstant(dest_start()));
90-
locs->set_in(kLengthPos, Location::RegisterLocation(ECX));
92+
if (remove_loop) {
93+
locs->set_in(
94+
kLengthPos,
95+
Location::Constant(
96+
length()->definition()->OriginalDefinition()->AsConstant()));
97+
// Needs a valid ByteRegister for single byte moves, and a temp register
98+
// for more than one move. We could potentially optimize the 2 and 4 byte
99+
// single moves to overwrite the src_reg.
100+
locs->set_temp(0, Location::RegisterLocation(ECX));
101+
} else {
102+
locs->set_in(kLengthPos, Location::RegisterLocation(ECX));
103+
}
91104
return locs;
92105
}
93106

94107
void MemoryCopyInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
95108
const Register src_reg = locs()->in(kSrcPos).reg();
109+
const Register dest_reg = locs()->in(kDestPos).reg();
96110
const Location src_start_loc = locs()->in(kSrcStartPos);
97111
const Location dest_start_loc = locs()->in(kDestStartPos);
112+
const Location length_loc = locs()->in(kLengthPos);
113+
114+
EmitComputeStartPointer(compiler, src_cid_, src_reg, src_start_loc);
115+
EmitComputeStartPointer(compiler, dest_cid_, dest_reg, dest_start_loc);
116+
117+
if (length_loc.IsConstant()) {
118+
const intptr_t num_bytes =
119+
Integer::Cast(length_loc.constant()).AsInt64Value() * element_size_;
120+
const intptr_t mov_size = Utils::Minimum(element_size_, 4);
121+
const intptr_t mov_repeat = num_bytes / mov_size;
122+
ASSERT(num_bytes % mov_size == 0);
123+
124+
const Register temp_reg = locs()->temp(0).reg();
125+
for (intptr_t i = 0; i < mov_repeat; i++) {
126+
const intptr_t disp = mov_size * i;
127+
switch (mov_size) {
128+
case 1:
129+
__ movzxb(temp_reg, compiler::Address(src_reg, disp));
130+
__ movb(compiler::Address(dest_reg, disp), ByteRegisterOf(temp_reg));
131+
break;
132+
case 2:
133+
__ movzxw(temp_reg, compiler::Address(src_reg, disp));
134+
__ movw(compiler::Address(dest_reg, disp), temp_reg);
135+
break;
136+
case 4:
137+
__ movl(temp_reg, compiler::Address(src_reg, disp));
138+
__ movl(compiler::Address(dest_reg, disp), temp_reg);
139+
break;
140+
}
141+
}
142+
return;
143+
}
98144

99145
// Save ESI which is THR.
100146
__ pushl(ESI);
101147
__ movl(ESI, src_reg);
102148

103-
EmitComputeStartPointer(compiler, src_cid_, ESI, src_start_loc);
104-
EmitComputeStartPointer(compiler, dest_cid_, EDI, dest_start_loc);
105149
if (element_size_ <= compiler::target::kWordSize) {
106150
if (!unboxed_length_) {
107151
__ SmiUntag(ECX);

runtime/vm/compiler/backend/il_riscv.cc

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ LocationSummary* MemoryCopyInstr::MakeLocationSummary(Zone* zone,
181181
locs->set_in(kDestPos, Location::WritableRegister());
182182
locs->set_in(kSrcStartPos, LocationRegisterOrConstant(src_start()));
183183
locs->set_in(kDestStartPos, LocationRegisterOrConstant(dest_start()));
184-
locs->set_in(kLengthPos, Location::WritableRegister());
184+
locs->set_in(kLengthPos,
185+
LocationWritableRegisterOrSmiConstant(length(), 0, 4));
185186
return locs;
186187
}
187188

@@ -190,11 +191,55 @@ void MemoryCopyInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
190191
const Register dest_reg = locs()->in(kDestPos).reg();
191192
const Location src_start_loc = locs()->in(kSrcStartPos);
192193
const Location dest_start_loc = locs()->in(kDestStartPos);
193-
const Register length_reg = locs()->in(kLengthPos).reg();
194+
const Location length_loc = locs()->in(kLengthPos);
195+
const bool constant_length = length_loc.IsConstant();
196+
const Register length_reg = constant_length ? kNoRegister : length_loc.reg();
194197

195198
EmitComputeStartPointer(compiler, src_cid_, src_reg, src_start_loc);
196199
EmitComputeStartPointer(compiler, dest_cid_, dest_reg, dest_start_loc);
197200

201+
if (constant_length) {
202+
const intptr_t num_bytes =
203+
Integer::Cast(length_loc.constant()).AsInt64Value() * element_size_;
204+
const intptr_t mov_size =
205+
Utils::Minimum(element_size_, static_cast<intptr_t>(XLEN / 8));
206+
const intptr_t mov_repeat = num_bytes / mov_size;
207+
ASSERT(num_bytes % mov_size == 0);
208+
for (intptr_t i = 0; i < mov_repeat; i++) {
209+
switch (mov_size) {
210+
case 1:
211+
__ lb(TMP, compiler::Address(src_reg, mov_size * i));
212+
__ sb(TMP, compiler::Address(dest_reg, mov_size * i));
213+
break;
214+
case 2:
215+
__ lh(TMP, compiler::Address(src_reg, mov_size * i));
216+
__ sh(TMP, compiler::Address(dest_reg, mov_size * i));
217+
break;
218+
case 4:
219+
__ lw(TMP, compiler::Address(src_reg, mov_size * i));
220+
__ sw(TMP, compiler::Address(dest_reg, mov_size * i));
221+
break;
222+
case 8:
223+
#if XLEN == 64
224+
__ ld(TMP, compiler::Address(src_reg, mov_size * i));
225+
__ sd(TMP, compiler::Address(dest_reg, mov_size * i));
226+
#else
227+
UNREACHABLE();
228+
#endif
229+
break;
230+
case 16:
231+
#if XLEN == 128
232+
__ lq(TMP, compiler::Address(src_reg, mov_size * i));
233+
__ sq(TMP, compiler::Address(dest_reg, mov_size * i));
234+
#else
235+
UNREACHABLE();
236+
#endif
237+
break;
238+
}
239+
}
240+
return;
241+
}
242+
198243
compiler::Label loop, done;
199244

200245
const intptr_t loop_subtract = unboxed_length_ ? 1 : Smi::RawValue(1);
@@ -262,6 +307,7 @@ void MemoryCopyInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
262307
#endif
263308
break;
264309
}
310+
265311
__ subi(length_reg, length_reg, loop_subtract);
266312
__ bnez(length_reg, &loop);
267313
__ Bind(&done);

runtime/vm/compiler/backend/il_x64.cc

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -162,16 +162,59 @@ LocationSummary* MemoryCopyInstr::MakeLocationSummary(Zone* zone,
162162
locs->set_in(kDestPos, Location::RegisterLocation(RDI));
163163
locs->set_in(kSrcStartPos, LocationRegisterOrConstant(src_start()));
164164
locs->set_in(kDestStartPos, LocationRegisterOrConstant(dest_start()));
165-
locs->set_in(kLengthPos, Location::RegisterLocation(RCX));
165+
if (length()->BindsToSmiConstant() && length()->BoundSmiConstant() <= 4) {
166+
locs->set_in(
167+
kLengthPos,
168+
Location::Constant(
169+
length()->definition()->OriginalDefinition()->AsConstant()));
170+
} else {
171+
locs->set_in(kLengthPos, Location::RegisterLocation(RCX));
172+
}
166173
return locs;
167174
}
168175

169176
void MemoryCopyInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
177+
const Register src_reg = locs()->in(kSrcPos).reg();
178+
const Register dest_reg = locs()->in(kDestPos).reg();
170179
const Location src_start_loc = locs()->in(kSrcStartPos);
171180
const Location dest_start_loc = locs()->in(kDestStartPos);
181+
const Location length_loc = locs()->in(kLengthPos);
182+
183+
EmitComputeStartPointer(compiler, src_cid_, src_reg, src_start_loc);
184+
EmitComputeStartPointer(compiler, dest_cid_, dest_reg, dest_start_loc);
185+
186+
if (length_loc.IsConstant()) {
187+
const intptr_t num_bytes =
188+
Integer::Cast(length_loc.constant()).AsInt64Value() * element_size_;
189+
const intptr_t mov_size =
190+
Utils::Minimum(element_size_, static_cast<intptr_t>(8));
191+
const intptr_t mov_repeat = num_bytes / mov_size;
192+
ASSERT(num_bytes % mov_size == 0);
193+
194+
for (intptr_t i = 0; i < mov_repeat; i++) {
195+
const intptr_t disp = mov_size * i;
196+
switch (mov_size) {
197+
case 1:
198+
__ movzxb(TMP, compiler::Address(src_reg, disp));
199+
__ movb(compiler::Address(dest_reg, disp), ByteRegisterOf(TMP));
200+
break;
201+
case 2:
202+
__ movzxw(TMP, compiler::Address(src_reg, disp));
203+
__ movw(compiler::Address(dest_reg, disp), TMP);
204+
break;
205+
case 4:
206+
__ movl(TMP, compiler::Address(src_reg, disp));
207+
__ movl(compiler::Address(dest_reg, disp), TMP);
208+
break;
209+
case 8:
210+
__ movq(TMP, compiler::Address(src_reg, disp));
211+
__ movq(compiler::Address(dest_reg, disp), TMP);
212+
break;
213+
}
214+
}
215+
return;
216+
}
172217

173-
EmitComputeStartPointer(compiler, src_cid_, RSI, src_start_loc);
174-
EmitComputeStartPointer(compiler, dest_cid_, RDI, dest_start_loc);
175218
if (element_size_ <= compiler::target::kWordSize) {
176219
if (!unboxed_length_) {
177220
__ SmiUntag(RCX);

0 commit comments

Comments
 (0)