Skip to content

Commit e4371fb

Browse files
committed
time: optimize Now on darwin, windows
Fetch both monotonic and wall time together when possible.
Avoids skew and is cheaper.

Also shave a few ns off in conversion in package time.

Compared to current implementation (after monotonic changes):

name  old time/op  new time/op  delta
Now   19.6ns ± 1%   9.7ns ± 1%  -50.63%  (p=0.000 n=41+49)  darwin/amd64
Now   23.5ns ± 4%  10.6ns ± 5%  -54.61%  (p=0.000 n=30+28)  windows/amd64
Now   54.5ns ± 5%  29.8ns ± 9%  -45.40%  (p=0.000 n=27+29)  windows/386

More importantly, compared to Go 1.8:

name  old time/op  new time/op  delta
Now    9.5ns ± 1%   9.7ns ± 1%   +1.94%  (p=0.000 n=41+49)  darwin/amd64
Now   12.9ns ± 5%  10.6ns ± 5%  -17.73%  (p=0.000 n=30+28)  windows/amd64
Now   15.3ns ± 5%  29.8ns ± 9%  +94.36%  (p=0.000 n=30+29)  windows/386

This brings time.Now back in line with Go 1.8 on darwin/amd64 and windows/amd64.

It's not obvious why windows/386 is still noticeably worse than Go 1.8,
but it's better than before this CL. The windows/386 speed is not too
important; the changes just keep the two architectures similar.

Change-Id: If69b94970c8a1a57910a371ee91e0d4e82e46c5d
Reviewed-on: https://go-review.googlesource.com/36428
Run-TryBot: Russ Cox <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Ian Lance Taylor <[email protected]>
1 parent 3a6842a commit e4371fb

15 files changed

+284
-137
lines changed

src/runtime/heapdump.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,7 @@ func dumpmemstats() {
548548
dumpint(memstats.gc_sys)
549549
dumpint(memstats.other_sys)
550550
dumpint(memstats.next_gc)
551-
dumpint(memstats.last_gc)
551+
dumpint(memstats.last_gc_unix)
552552
dumpint(memstats.pause_total_ns)
553553
for i := 0; i < 256; i++ {
554554
dumpint(memstats.pause_ns[i])

src/runtime/mgc.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1291,10 +1291,13 @@ func gcMarkTermination() {
12911291
}
12921292

12931293
// Update timing memstats
1294-
now, unixNow := nanotime(), unixnanotime()
1294+
now := nanotime()
1295+
sec, nsec, _ := time_now()
1296+
unixNow := sec*1e9 + int64(nsec)
12951297
work.pauseNS += now - work.pauseStart
12961298
work.tEnd = now
1297-
atomic.Store64(&memstats.last_gc, uint64(unixNow)) // must be Unix time to make sense to user
1299+
atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
1300+
atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
12981301
memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
12991302
memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
13001303
memstats.pause_total_ns += uint64(work.pauseNS)

src/runtime/mstats.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ type mstats struct {
7272
// Statistics about garbage collector.
7373
// Protected by mheap or stopping the world during GC.
7474
next_gc uint64 // goal heap_live for when next GC ends; ^0 if disabled
75-
last_gc uint64 // last gc (in absolute time)
75+
last_gc_unix uint64 // last gc (in unix time)
7676
pause_total_ns uint64
7777
pause_ns [256]uint64 // circular buffer of recent gc pause lengths
7878
pause_end [256]uint64 // circular buffer of recent gc end times (nanoseconds since 1970)
@@ -92,7 +92,8 @@ type mstats struct {
9292

9393
// Statistics below here are not exported to MemStats directly.
9494

95-
tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
95+
last_gc_nanotime uint64 // last gc (monotonic time)
96+
tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
9697

9798
// gc_trigger is the heap size that triggers marking.
9899
//
@@ -497,7 +498,7 @@ func readGCStats_m(pauses *[]uint64) {
497498
p[n+i] = memstats.pause_end[j]
498499
}
499500

500-
p[n+n] = memstats.last_gc
501+
p[n+n] = memstats.last_gc_unix
501502
p[n+n+1] = uint64(memstats.numgc)
502503
p[n+n+2] = memstats.pause_total_ns
503504
unlock(&mheap_.lock)

src/runtime/os_windows.go

Lines changed: 1 addition & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -578,50 +578,7 @@ func unminit() {
578578
*tp = 0
579579
}
580580

581-
// Described in http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
582-
type _KSYSTEM_TIME struct {
583-
LowPart uint32
584-
High1Time int32
585-
High2Time int32
586-
}
587-
588-
const (
589-
_INTERRUPT_TIME = 0x7ffe0008
590-
_SYSTEM_TIME = 0x7ffe0014
591-
)
592-
593-
//go:nosplit
594-
func systime(addr uintptr) int64 {
595-
timeaddr := (*_KSYSTEM_TIME)(unsafe.Pointer(addr))
596-
597-
var t _KSYSTEM_TIME
598-
for i := 1; i < 10000; i++ {
599-
// these fields must be read in that order (see URL above)
600-
t.High1Time = timeaddr.High1Time
601-
t.LowPart = timeaddr.LowPart
602-
t.High2Time = timeaddr.High2Time
603-
if t.High1Time == t.High2Time {
604-
return int64(t.High1Time)<<32 | int64(t.LowPart)
605-
}
606-
if (i % 100) == 0 {
607-
osyield()
608-
}
609-
}
610-
systemstack(func() {
611-
throw("interrupt/system time is changing too fast")
612-
})
613-
return 0
614-
}
615-
616-
//go:nosplit
617-
func unixnano() int64 {
618-
return (systime(_SYSTEM_TIME) - 116444736000000000) * 100
619-
}
620-
621-
//go:nosplit
622-
func nanotime() int64 {
623-
return systime(_INTERRUPT_TIME) * 100
624-
}
581+
func nanotime() int64
625582

626583
// Calling stdcall on os stack.
627584
// May run during STW, so write barriers are not allowed.

src/runtime/proc.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3818,7 +3818,6 @@ func sysmon() {
38183818
// poll network if not polled for more than 10ms
38193819
lastpoll := int64(atomic.Load64(&sched.lastpoll))
38203820
now := nanotime()
3821-
unixnow := unixnanotime()
38223821
if lastpoll != 0 && lastpoll+10*1000*1000 < now {
38233822
atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
38243823
gp := netpoll(false) // non-blocking - returns list of goroutines
@@ -3843,8 +3842,8 @@ func sysmon() {
38433842
idle++
38443843
}
38453844
// check if we need to force a GC
3846-
lastgc := int64(atomic.Load64(&memstats.last_gc))
3847-
if gcphase == _GCoff && lastgc != 0 && unixnow-lastgc > forcegcperiod && atomic.Load(&forcegc.idle) != 0 {
3845+
lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
3846+
if gcphase == _GCoff && lastgc != 0 && now-lastgc > forcegcperiod && atomic.Load(&forcegc.idle) != 0 {
38483847
lock(&forcegc.lock)
38493848
forcegc.idle = 0
38503849
forcegc.g.schedlink = 0

src/runtime/stubs.go

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -241,8 +241,6 @@ func stackBarrier()
241241
// in asm_*.s
242242
func return0()
243243

244-
func walltime() (sec int64, nsec int32)
245-
246244
// in asm_*.s
247245
// not called directly; definitions here supply type information for traceback.
248246
func call32(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
@@ -279,11 +277,6 @@ func prefetcht1(addr uintptr)
279277
func prefetcht2(addr uintptr)
280278
func prefetchnta(addr uintptr)
281279

282-
func unixnanotime() int64 {
283-
sec, nsec := walltime()
284-
return sec*1e9 + int64(nsec)
285-
}
286-
287280
// round n up to a multiple of a. a must be a power of 2.
288281
func round(n, a uintptr) uintptr {
289282
return (n + a - 1) &^ (a - 1)

src/runtime/sys_darwin_386.s

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,16 @@ TEXT runtime·setitimer(SB),NOSPLIT,$0
114114
// 64-bit unix nanoseconds returned in DX:AX.
115115
// I'd much rather write this in C but we need
116116
// assembly for the 96-bit multiply and RDTSC.
117+
//
118+
// Note that we could arrange to return monotonic time here
119+
// as well, but we don't bother, for two reasons:
120+
// 1. macOS only supports 64-bit systems, so no one should
121+
// be using the 32-bit code in production.
122+
// This code is only maintained to make it easier for developers
123+
// using Macs to test the 32-bit compiler.
124+
// 2. On some (probably now unsupported) CPUs,
125+
// the code falls back to the system call always,
126+
// so it can't even use the comm page at all.
117127
TEXT runtime·now(SB),NOSPLIT,$40
118128
MOVL $0xffff0000, BP /* comm page base */
119129

@@ -217,9 +227,15 @@ inreg:
217227
ADCL $0, DX
218228
RET
219229

220-
// func walltime() (sec int64, nsec int32)
221-
TEXT runtime·walltime(SB),NOSPLIT,$0
230+
// func now() (sec int64, nsec int32, mono uint64)
231+
TEXT time·now(SB),NOSPLIT,$0-20
222232
CALL runtime·now(SB)
233+
MOVL AX, BX
234+
MOVL DX, BP
235+
SUBL runtime·startNano(SB), BX
236+
SBBL runtime·startNano+4(SB), BP
237+
MOVL BX, mono+12(FP)
238+
MOVL BP, mono+16(FP)
223239
MOVL $1000000000, CX
224240
DIVL CX
225241
MOVL AX, sec+0(FP)
@@ -230,6 +246,8 @@ TEXT runtime·walltime(SB),NOSPLIT,$0
230246
// func nanotime() int64
231247
TEXT runtime·nanotime(SB),NOSPLIT,$0
232248
CALL runtime·now(SB)
249+
SUBL runtime·startNano(SB), AX
250+
SBBL runtime·startNano+4(SB), DX
233251
MOVL AX, ret_lo+0(FP)
234252
MOVL DX, ret_hi+4(FP)
235253
RET

src/runtime/sys_darwin_amd64.s

Lines changed: 59 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -117,35 +117,44 @@ TEXT runtime·madvise(SB), NOSPLIT, $0
117117
#define gtod_ns_base 0x70
118118
#define gtod_sec_base 0x78
119119

120-
TEXT monotonictime<>(SB), NOSPLIT, $32
121-
MOVQ $0x7fffffe00000, SI // comm page base
122-
120+
TEXT runtime·nanotime(SB),NOSPLIT,$0-8
121+
MOVQ $0x7fffffe00000, BP /* comm page base */
122+
// Loop trying to take a consistent snapshot
123+
// of the time parameters.
123124
timeloop:
124-
MOVL nt_generation(SI), R8
125-
TESTL R8, R8
126-
JZ timeloop
125+
MOVL nt_generation(BP), R9
126+
TESTL R9, R9
127+
JZ timeloop
127128
RDTSC
128-
SHLQ $32, DX
129-
ORQ DX, AX
130-
MOVL nt_shift(SI), CX
131-
SUBQ nt_tsc_base(SI), AX
132-
SHLQ CX, AX
133-
MOVL nt_scale(SI), CX
134-
MULQ CX
135-
SHRQ $32, AX:DX
136-
ADDQ nt_ns_base(SI), AX
137-
CMPL nt_generation(SI), R8
138-
JNE timeloop
139-
RET
140-
141-
TEXT nanotime<>(SB), NOSPLIT, $32
129+
MOVQ nt_tsc_base(BP), R10
130+
MOVL nt_scale(BP), R11
131+
MOVQ nt_ns_base(BP), R12
132+
CMPL nt_generation(BP), R9
133+
JNE timeloop
134+
135+
// Gathered all the data we need. Compute monotonic time:
136+
// ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base
137+
// The multiply and shift extracts the top 64 bits of the 96-bit product.
138+
SHLQ $32, DX
139+
ADDQ DX, AX
140+
SUBQ R10, AX
141+
MULQ R11
142+
SHRQ $32, AX:DX
143+
ADDQ R12, AX
144+
MOVQ runtime·startNano(SB), CX
145+
SUBQ CX, AX
146+
MOVQ AX, ret+0(FP)
147+
RET
148+
149+
TEXT time·now(SB), NOSPLIT, $32-24
150+
// Note: The 32 bytes of stack frame requested on the TEXT line
151+
// are used in the systime fallback, as the timeval address
152+
// filled in by the system call.
142153
MOVQ $0x7fffffe00000, BP /* comm page base */
143154
// Loop trying to take a consistent snapshot
144155
// of the time parameters.
145156
timeloop:
146157
MOVL gtod_generation(BP), R8
147-
TESTL R8, R8
148-
JZ systime
149158
MOVL nt_generation(BP), R9
150159
TESTL R9, R9
151160
JZ timeloop
@@ -160,18 +169,42 @@ timeloop:
160169
CMPL gtod_generation(BP), R8
161170
JNE timeloop
162171

163-
// Gathered all the data we need. Compute time.
164-
// ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base - gtod_ns_base + gtod_sec_base*1e9
172+
// Gathered all the data we need. Compute:
173+
// monotonic_time = ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base
165174
// The multiply and shift extracts the top 64 bits of the 96-bit product.
166175
SHLQ $32, DX
167176
ADDQ DX, AX
168177
SUBQ R10, AX
169178
MULQ R11
170179
SHRQ $32, AX:DX
171180
ADDQ R12, AX
181+
MOVQ AX, BX
182+
MOVQ runtime·startNano(SB), CX
183+
SUBQ CX, BX
184+
MOVQ BX, monotonic+16(FP)
185+
186+
// Compute:
187+
// wall_time = monotonic time - gtod_ns_base + gtod_sec_base*1e9
188+
// or, if gtod_generation==0, invoke the system call.
189+
TESTL R8, R8
190+
JZ systime
172191
SUBQ R13, AX
173192
IMULQ $1000000000, R14
174193
ADDQ R14, AX
194+
195+
// Split wall time into sec, nsec.
196+
// generated code for
197+
// func f(x uint64) (uint64, uint64) { return x/1e9, x%1e9 }
198+
// adapted to reduce duplication
199+
MOVQ AX, CX
200+
SHRQ $9, AX
201+
MOVQ $19342813113834067, DX
202+
MULQ DX
203+
SHRQ $11, DX
204+
MOVQ DX, sec+0(FP)
205+
IMULQ $1000000000, DX
206+
SUBQ DX, CX
207+
MOVL CX, nsec+8(FP)
175208
RET
176209

177210
systime:
@@ -187,34 +220,9 @@ systime:
187220
MOVL 8(SP), DX
188221
inreg:
189222
// sec is in AX, usec in DX
190-
// return nsec in AX
191-
IMULQ $1000000000, AX
192223
IMULQ $1000, DX
193-
ADDQ DX, AX
194-
RET
195-
196-
TEXT runtime·nanotime(SB),NOSPLIT,$0-8
197-
CALL monotonictime<>(SB)
198-
MOVQ AX, ret+0(FP)
199-
RET
200-
201-
// func walltime() (sec int64, nsec int32)
202-
TEXT runtime·walltime(SB),NOSPLIT,$0-12
203-
CALL nanotime<>(SB)
204-
205-
// generated code for
206-
// func f(x uint64) (uint64, uint64) { return x/1000000000, x%100000000 }
207-
// adapted to reduce duplication
208-
MOVQ AX, CX
209-
MOVQ $1360296554856532783, AX
210-
MULQ CX
211-
ADDQ CX, DX
212-
RCRQ $1, DX
213-
SHRQ $29, DX
214-
MOVQ DX, sec+0(FP)
215-
IMULQ $1000000000, DX
216-
SUBQ DX, CX
217-
MOVL CX, nsec+8(FP)
224+
MOVQ AX, sec+0(FP)
225+
MOVL DX, nsec+8(FP)
218226
RET
219227

220228
TEXT runtime·sigprocmask(SB),NOSPLIT,$0

0 commit comments

Comments
 (0)