Skip to content

Commit 22540ab

Browse files
author
Martin Möhrmann
committed
runtime: use RDTSCP for instruction stream serialized read of TSC
To ensure that all previous instructions have completed before the time stamp counter is read with RDTSC, an instruction sequence with instruction-stream-serializing properties is needed, i.e. one that guarantees waiting until all previous instructions have been executed. This does not necessarily mean waiting for all stores to be globally visible. This CL aims to replace vendor-specific logic for determining the instruction sequence with CPU feature flag checks that are CPU vendor independent. On Intel, LFENCE has had the wanted properties at least since it was introduced together with SSE2 support. On AMD, instruction-stream-serializing LFENCE is supported by setting MSR C001_1029[1]=1 on AMD family 10h/12h/14h/15h/16h/17h processors. AMD family 0Fh/11h processors always support LFENCE as serializing. AMD plans support for this MSR and access to this bit for all future processors. Source: https://developer.amd.com/wp-content/resources/Managing-Speculation-on-AMD-Processors.pdf Reading the MSR to determine LFENCE properties is not always possible or reliable (hypervisors). The Linux kernel has been relying on serializing LFENCE on AMD CPUs since a commit in July 2019: https://lkml.org/lkml/2019/7/22/295 and the MSR C001_1029 bit that enables serialization has been set by default with the Spectre v1 mitigations. Using MFENCE on AMD waits for previous instructions to have been executed but in addition also flushes store buffers. To align the serialization properties without runtime detection of CPU manufacturers, we can use the newer RDTSCP instruction, which waits until all previous instructions have been executed. RDTSCP has been available on Intel CPUs since around 2008 and on AMD CPUs since around 2006. Support for RDTSCP can be checked independently of manufacturer by checking CPUID bits. Using RDTSCP is the default in Linux to read TSC in program order when the instruction is available. 
https://github.com/torvalds/linux/blob/e22ce8eb631bdc47a4a4ea7ecf4e4ba499db4f93/arch/x86/include/asm/msr.h#L231 Change-Id: Ifa841843b9abb2816f8f0754a163ebf01385306d Reviewed-on: https://go-review.googlesource.com/c/go/+/344429 Reviewed-by: Keith Randall <[email protected]> Trust: Martin Möhrmann <[email protected]> Run-TryBot: Martin Möhrmann <[email protected]> TryBot-Result: Go Bot <[email protected]>
1 parent fa34678 commit 22540ab

File tree

6 files changed

+64
-28
lines changed

6 files changed

+64
-28
lines changed

src/internal/cpu/cpu.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ var X86 struct {
3636
HasOSXSAVE bool
3737
HasPCLMULQDQ bool
3838
HasPOPCNT bool
39+
HasRDTSCP bool
3940
HasSSE2 bool
4041
HasSSE3 bool
4142
HasSSSE3 bool

src/internal/cpu/cpu_x86.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ const (
3737
cpuid_BMI2 = 1 << 8
3838
cpuid_ERMS = 1 << 9
3939
cpuid_ADX = 1 << 19
40+
41+
// edx bits for CPUID 0x80000001
42+
cpuid_RDTSCP = 1 << 27
4043
)
4144

4245
var maxExtendedFunctionInformation uint32
@@ -53,6 +56,7 @@ func doinit() {
5356
{Name: "fma", Feature: &X86.HasFMA},
5457
{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
5558
{Name: "popcnt", Feature: &X86.HasPOPCNT},
59+
{Name: "rdtscp", Feature: &X86.HasRDTSCP},
5660
{Name: "sse3", Feature: &X86.HasSSE3},
5761
{Name: "sse41", Feature: &X86.HasSSE41},
5862
{Name: "sse42", Feature: &X86.HasSSE42},
@@ -112,6 +116,16 @@ func doinit() {
112116
X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
113117
X86.HasERMS = isSet(ebx7, cpuid_ERMS)
114118
X86.HasADX = isSet(ebx7, cpuid_ADX)
119+
120+
var maxExtendedInformation uint32
121+
maxExtendedInformation, _, _, _ = cpuid(0x80000000, 0)
122+
123+
if maxExtendedInformation < 0x80000001 {
124+
return
125+
}
126+
127+
_, _, _, edxExt1 := cpuid(0x80000001, 0)
128+
X86.HasRDTSCP = isSet(edxExt1, cpuid_RDTSCP)
115129
}
116130

117131
func isSet(hwc uint32, value uint32) bool {

src/runtime/asm_386.s

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -137,17 +137,13 @@ has_cpuid:
137137
CMPL AX, $0
138138
JE nocpuinfo
139139

140-
// Figure out how to serialize RDTSC.
141-
// On Intel processors LFENCE is enough. AMD requires MFENCE.
142-
// Don't know about the rest, so let's do MFENCE.
143140
CMPL BX, $0x756E6547 // "Genu"
144141
JNE notintel
145142
CMPL DX, $0x49656E69 // "ineI"
146143
JNE notintel
147144
CMPL CX, $0x6C65746E // "ntel"
148145
JNE notintel
149146
MOVB $1, runtime·isIntel(SB)
150-
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
151147
notintel:
152148

153149
// Load EAX=1 cpuid flags
@@ -838,19 +834,36 @@ TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
838834

839835
// func cputicks() int64
840836
TEXT runtime·cputicks(SB),NOSPLIT,$0-8
837+
// LFENCE/MFENCE instruction support is dependent on SSE2.
838+
// When no SSE2 support is present do not enforce any serialization
839+
// since using CPUID to serialize the instruction stream is
840+
// very costly.
841841
CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
842-
JNE done
843-
CMPB runtime·lfenceBeforeRdtsc(SB), $1
844-
JNE mfence
845-
LFENCE
846-
JMP done
847-
mfence:
848-
MFENCE
842+
JNE rdtsc
843+
CMPB internal∕cpu·X86+const_offsetX86HasRDTSCP(SB), $1
844+
JNE fences
845+
// Instruction stream serializing RDTSCP is supported.
846+
// RDTSCP is supported by Intel Nehalem (2008) and
847+
// AMD K8 Rev. F (2006) and newer.
848+
RDTSCP
849849
done:
850-
RDTSC
851850
MOVL AX, ret_lo+0(FP)
852851
MOVL DX, ret_hi+4(FP)
853852
RET
853+
fences:
854+
// MFENCE is instruction stream serializing and flushes the
855+
// store buffers on AMD. The serialization semantics of LFENCE on AMD
856+
// are dependent on MSR C001_1029 and CPU generation.
857+
// LFENCE on Intel does wait for all previous instructions to have executed.
858+
// Intel recommends MFENCE;LFENCE in its manuals before RDTSC to have all
859+
// previous instructions executed and all previous loads and stores to be globally visible.
860+
// Using MFENCE;LFENCE here aligns the serializing properties without
861+
// runtime detection of CPU manufacturer.
862+
MFENCE
863+
LFENCE
864+
rdtsc:
865+
RDTSC
866+
JMP done
854867

855868
TEXT ldt0setup<>(SB),NOSPLIT,$16-0
856869
// set up ldt 7 to point at m0.tls

src/runtime/asm_amd64.s

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -103,17 +103,13 @@ TEXT runtime·rt0_go(SB),NOSPLIT|TOPFRAME,$0
103103
CMPL AX, $0
104104
JE nocpuinfo
105105

106-
// Figure out how to serialize RDTSC.
107-
// On Intel processors LFENCE is enough. AMD requires MFENCE.
108-
// Don't know about the rest, so let's do MFENCE.
109106
CMPL BX, $0x756E6547 // "Genu"
110107
JNE notintel
111108
CMPL DX, $0x49656E69 // "ineI"
112109
JNE notintel
113110
CMPL CX, $0x6C65746E // "ntel"
114111
JNE notintel
115112
MOVB $1, runtime·isIntel(SB)
116-
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
117113
notintel:
118114

119115
// Load EAX=1 cpuid flags
@@ -928,18 +924,30 @@ TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
928924

929925
// func cputicks() int64
930926
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
931-
CMPB runtime·lfenceBeforeRdtsc(SB), $1
932-
JNE mfence
933-
LFENCE
934-
JMP done
935-
mfence:
936-
MFENCE
927+
CMPB internal∕cpu·X86+const_offsetX86HasRDTSCP(SB), $1
928+
JNE fences
929+
// Instruction stream serializing RDTSCP is supported.
930+
// RDTSCP is supported by Intel Nehalem (2008) and
931+
// AMD K8 Rev. F (2006) and newer.
932+
RDTSCP
937933
done:
938-
RDTSC
939934
SHLQ $32, DX
940935
ADDQ DX, AX
941936
MOVQ AX, ret+0(FP)
942937
RET
938+
fences:
939+
// MFENCE is instruction stream serializing and flushes the
940+
// store buffers on AMD. The serialization semantics of LFENCE on AMD
941+
// are dependent on MSR C001_1029 and CPU generation.
942+
// LFENCE on Intel does wait for all previous instructions to have executed.
943+
// Intel recommends MFENCE;LFENCE in its manuals before RDTSC to have all
944+
// previous instructions executed and all previous loads and stores to be globally visible.
945+
// Using MFENCE;LFENCE here aligns the serializing properties without
946+
// runtime detection of CPU manufacturer.
947+
MFENCE
948+
LFENCE
949+
RDTSC
950+
JMP done
943951

944952
// func memhash(p unsafe.Pointer, h, s uintptr) uintptr
945953
// hash function using AES hardware instructions

src/runtime/cpuflags.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@ import (
1111

1212
// Offsets into internal/cpu records for use in assembly.
1313
const (
14-
offsetX86HasAVX = unsafe.Offsetof(cpu.X86.HasAVX)
15-
offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
16-
offsetX86HasERMS = unsafe.Offsetof(cpu.X86.HasERMS)
17-
offsetX86HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
14+
offsetX86HasAVX = unsafe.Offsetof(cpu.X86.HasAVX)
15+
offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
16+
offsetX86HasERMS = unsafe.Offsetof(cpu.X86.HasERMS)
17+
offsetX86HasRDTSCP = unsafe.Offsetof(cpu.X86.HasRDTSCP)
18+
offsetX86HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
1819

1920
offsetARMHasIDIVA = unsafe.Offsetof(cpu.ARM.HasIDIVA)
2021

src/runtime/runtime2.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1128,7 +1128,6 @@ var (
11281128
// Set on startup in asm_{386,amd64}.s
11291129
processorVersionInfo uint32
11301130
isIntel bool
1131-
lfenceBeforeRdtsc bool
11321131

11331132
goarm uint8 // set by cmd/link on arm systems
11341133
)

0 commit comments

Comments
 (0)