1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package amd64
6
7import (
8	"cmd/compile/internal/ir"
9	"cmd/compile/internal/objw"
10	"cmd/compile/internal/types"
11	"cmd/internal/obj"
12	"cmd/internal/obj/x86"
13	"internal/buildcfg"
14)
15
// isPlan9 reports whether buildcfg.GOOS is "plan9".
// There is no floating point in note handlers on Plan 9, so zerorange
// skips its MOVUPS-sequence and DUFFZERO strategies when this is set.
var isPlan9 = buildcfg.GOOS == "plan9"
18
// DUFFZERO is built from repeated blocks, each consisting of 4 MOVUPS
// instructions followed by a LEAQ. See runtime/mkduff.go.
// The constants below describe that layout; dzOff and dzDI use them to
// pick an entry point and a starting DI value.
const (
	// Shape of the routine.
	dzBlocks   = 16 // number of MOVUPS/LEAQ blocks
	dzBlockLen = 4  // number of clears (MOVUPS) per block

	// Instruction sizes.
	dzMovSize   = 5  // size of a single MOV instruction with an offset
	dzLeaqSize  = 4  // size of a single LEAQ instruction
	dzBlockSize = 23 // size of all the instructions in a single block

	// Bytes of memory cleared.
	dzClearStep = 16                       // by each MOV instruction
	dzClearLen  = dzClearStep * dzBlockLen // by one whole block

	// Size of the instructions of all blocks together.
	dzSize = dzBlocks * dzBlockSize
)
32
33// dzOff returns the offset for a jump into DUFFZERO.
34// b is the number of bytes to zero.
35func dzOff(b int64) int64 {
36	off := int64(dzSize)
37	off -= b / dzClearLen * dzBlockSize
38	tailLen := b % dzClearLen
39	if tailLen >= dzClearStep {
40		off -= dzLeaqSize + dzMovSize*(tailLen/dzClearStep)
41	}
42	return off
43}
44
45// duffzeroDI returns the pre-adjustment to DI for a call to DUFFZERO.
46// b is the number of bytes to zero.
47func dzDI(b int64) int64 {
48	tailLen := b % dzClearLen
49	if tailLen < dzClearStep {
50		return 0
51	}
52	tailSteps := tailLen / dzClearStep
53	return -dzClearStep * (dzBlockLen - tailSteps)
54}
55
56func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog {
57	const (
58		r13 = 1 << iota // if R13 is already zeroed.
59	)
60
61	if cnt == 0 {
62		return p
63	}
64
65	if cnt == 8 {
66		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
67	} else if !isPlan9 && cnt <= int64(8*types.RegSize) {
68		for i := int64(0); i < cnt/16; i++ {
69			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16)
70		}
71
72		if cnt%16 != 0 {
73			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+cnt-int64(16))
74		}
75	} else if !isPlan9 && (cnt <= int64(128*types.RegSize)) {
76		// Save DI to r12. With the amd64 Go register abi, DI can contain
77		// an incoming parameter, whereas R12 is always scratch.
78		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
79		// Emit duffzero call
80		p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0)
81		p = pp.Append(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt))
82		p.To.Sym = ir.Syms.Duffzero
83		if cnt%16 != 0 {
84			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8))
85		}
86		// Restore DI from r12
87		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
88
89	} else {
90		// When the register ABI is in effect, at this point in the
91		// prolog we may have live values in all of RAX,RDI,RCX. Save
92		// them off to registers before the REPSTOSQ below, then
93		// restore. Note that R12 and R13 are always available as
94		// scratch regs; here we also use R15 (this is safe to do
95		// since there won't be any globals accessed in the prolog).
96		// See rewriteToUseGot() in obj6.go for more on r15 use.
97
98		// Save rax/rdi/rcx
99		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
100		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_R13, 0)
101		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_CX, 0, obj.TYPE_REG, x86.REG_R15, 0)
102
103		// Set up the REPSTOSQ and kick it off.
104		p = pp.Append(p, x86.AXORL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_AX, 0)
105		p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(types.RegSize), obj.TYPE_REG, x86.REG_CX, 0)
106		p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0)
107		p = pp.Append(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
108		p = pp.Append(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
109
110		// Restore rax/rdi/rcx
111		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
112		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_REG, x86.REG_AX, 0)
113		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R15, 0, obj.TYPE_REG, x86.REG_CX, 0)
114
115		// Record the fact that r13 is no longer zero.
116		*state &= ^uint32(r13)
117	}
118
119	return p
120}
121
122func ginsnop(pp *objw.Progs) *obj.Prog {
123	// This is a hardware nop (1-byte 0x90) instruction,
124	// even though we describe it as an explicit XCHGL here.
125	// Particularly, this does not zero the high 32 bits
126	// like typical *L opcodes.
127	// (gas assembles "xchg %eax,%eax" to 0x87 0xc0, which
128	// does zero the high 32 bits.)
129	p := pp.Prog(x86.AXCHGL)
130	p.From.Type = obj.TYPE_REG
131	p.From.Reg = x86.REG_AX
132	p.To.Type = obj.TYPE_REG
133	p.To.Reg = x86.REG_AX
134	return p
135}
136