// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"
#include "asm_amd64.h"

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
	// ABIInternal register arguments:
	//   AX = ptr (start of region; no alignment assumed)
	//   BX = n   (length in bytes)
	// Register roles for the rest of the function:
	//   DI  = current destination pointer (also the implicit REP STOSQ target)
	//   BX  = bytes remaining to clear
	//   AX  = 0, scalar zero source for MOVB/MOVW/MOVL/MOVQ and REP STOSQ
	//   X15 = 0, the fixed zero SIMD register under Go's internal ABI,
	//         used as the source for the 16-byte MOVOU stores below.
	MOVQ	AX, DI	// DI = ptr
	XORQ	AX, AX	// AX = 0 (dependency-breaking zeroing idiom)

	// MOVOU seems always faster than REP STOSQ when Enhanced REP STOSQ is not available.
tail:
	// Size dispatch. Every case up to 256 bytes is straight-line code using
	// possibly-overlapping stores: a block of size n in (k, 2k] is cleared
	// with k bytes at ptr and k bytes ending at ptr+n, which is correct
	// even when the two ranges overlap. Note that n can re-enter here from
	// the bulk loops below with any value < 256 (including 0).
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256

	// n > 256: choose a bulk-clearing strategy based on CPU features
	// (flags detected at startup by internal/cpu) and size thresholds.
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	skip_erms

	// If the size is less than 2kb, do not use ERMS as it has a big start-up cost.
	// Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
	// in the Intel Optimization Guide shows better performance for ERMSB starting
	// from 2KB. Benchmarks show the similar threshold for REP STOS vs AVX.
	CMPQ    BX, $2048
	JAE	loop_preheader_erms

skip_erms:
#ifndef hasAVX2
	// hasAVX2 is defined by asm_amd64.h when the build target (GOAMD64)
	// guarantees AVX2; in that case this runtime check and the SSE
	// fallback loop below are compiled out entirely.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

loop:
	// SSE fallback: clear 256 bytes per iteration with 16-byte stores
	// from X15 (zero), then fall back to tail for the last < 256 bytes.
	MOVOU	X15, 0(DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, 128(DI)
	MOVOU	X15, 144(DI)
	MOVOU	X15, 160(DI)
	MOVOU	X15, 176(DI)
	MOVOU	X15, 192(DI)
	MOVOU	X15, 208(DI)
	MOVOU	X15, 224(DI)
	MOVOU	X15, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail
#endif

loop_preheader_avx2:
	VPXOR X0, X0, X0	// Y0 = 0 (a VEX write to X0 zeroes the full YMM register)
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
	CMPQ    BX, $0x2000000	// 32 MB: above this, use non-temporal stores
	JAE	loop_preheader_avx2_huge

loop_avx2:
	// Clear 128 bytes per iteration with 32-byte AVX stores.
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
	// BX < 128 remains. Clear the final 128 bytes ending exactly at
	// DI+BX; these stores may overlap bytes already cleared by the loop,
	// which is harmless and avoids a branchy tail. Safe because the
	// region is known to be > 256 bytes long on entry.
	VMOVDQU  Y0, -32(DI)(BX*1)
	VMOVDQU  Y0, -64(DI)(BX*1)
	VMOVDQU  Y0, -96(DI)(BX*1)
	VMOVDQU  Y0, -128(DI)(BX*1)
	VZEROUPPER	// leave AVX state clean before returning to non-AVX code
	RET

loop_preheader_erms:
#ifndef hasAVX2
	// Without guaranteed AVX2, fall through to REP STOSQ unless the CPU
	// has AVX2 (checked at runtime), in which case huge clears may still
	// prefer non-temporal AVX2 stores below.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	loop_erms
#endif

	VPXOR X0, X0, X0	// Y0 = 0, in case we take the huge-clear path
	// At this point both ERMS and AVX2 is supported. While REP STOS can use a no-RFO
	// write protocol, ERMS could show the same or slower performance comparing to
	// Non-Temporal Stores when the size is bigger than LLC depending on hardware.
	CMPQ	BX, $0x2000000	// 32 MB heuristic for "bigger than LLC"
	JAE	loop_preheader_avx2_huge

loop_erms:
	// STOSQ is used to guarantee that the whole zeroed pointer-sized word is visible
	// for a memory subsystem as the GC requires this.
	MOVQ	BX, CX	// CX = n/8 quadwords for REP STOSQ
	SHRQ	$3, CX
	ANDQ	$7, BX	// BX = n%8 leftover bytes
	REP;	STOSQ	// store CX copies of AX (=0) at [DI], advancing DI
	JMP	tail	// DI now points at the remainder; clear the last 0..7 bytes

loop_preheader_avx2_huge:
	// Align to 32 byte boundary
	// Clear the first (unaligned) 32 bytes, then round DI up to the next
	// 32-byte boundary so the non-temporal stores below are aligned
	// (VMOVNTDQ requires it). SI = oldDI - newDI is in [-32, -1], so BX
	// is reduced by the 1..32 bytes already cleared. BX >= 32 MB here,
	// so the adjustment cannot underflow.
	VMOVDQU  Y0, 0(DI)
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI
	SUBQ	DI, SI
	ADDQ	SI, BX
loop_avx2_huge:
	// Non-temporal stores bypass the cache: best for regions larger than
	// the last-level cache, which would otherwise be fully evicted.
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// Same overlapping-tail technique as loop_avx2, with ordinary
	// (cached) stores for the final < 128 bytes ending at DI+BX.
	VMOVDQU  Y0, -32(DI)(BX*1)
	VMOVDQU  Y0, -64(DI)(BX*1)
	VMOVDQU  Y0, -96(DI)(BX*1)
	VMOVDQU  Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

_1or2:
	// 1 or 2 bytes: a byte at the start and a byte at the end
	// (the two stores coincide when n == 1).
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	// 3 or 4 bytes: two overlapping 2-byte stores.
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	// 5..7 bytes: two overlapping 4-byte stores.
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	// 9..16 bytes: two overlapping 8-byte stores.
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	// 17..32 bytes: two overlapping 16-byte stores.
	MOVOU	X15, (DI)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_33through64:
	// 33..64 bytes: 32 bytes at the start, 32 bytes ending at ptr+n.
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_65through128:
	// 65..128 bytes: 64 bytes at the start, 64 bytes ending at ptr+n.
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_129through256:
	// 129..256 bytes: 128 bytes at the start, 128 bytes ending at ptr+n.
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, -128(DI)(BX*1)
	MOVOU	X15, -112(DI)(BX*1)
	MOVOU	X15, -96(DI)(BX*1)
	MOVOU	X15, -80(DI)(BX*1)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
219