/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Assembler to produce x86-64 instructions. Somewhat influenced by the V8 assembler.

#ifndef BERBERIS_ASSEMBLER_X86_64_H_
#define BERBERIS_ASSEMBLER_X86_64_H_

#include <type_traits>  // std::is_same

#include "berberis/assembler/x86_32_and_x86_64.h"
#include "berberis/base/logging.h"

namespace berberis {

class MachineCode;

namespace x86_64 {

class Assembler : public x86_32_and_x86_64::Assembler<Assembler> {
 public:
  using BaseAssembler = x86_32_and_x86_64::Assembler<Assembler>;
  using FinalAssembler = Assembler;

  explicit Assembler(MachineCode* code) : BaseAssembler(code) {}

  static constexpr Register no_register{0x80};
  static constexpr Register rax{0};
  static constexpr Register rcx{1};
  static constexpr Register rdx{2};
  static constexpr Register rbx{3};
  static constexpr Register rsp{4};
  static constexpr Register rbp{5};
  static constexpr Register rsi{6};
  static constexpr Register rdi{7};
  static constexpr Register r8{8};
  static constexpr Register r9{9};
  static constexpr Register r10{10};
  static constexpr Register r11{11};
  static constexpr Register r12{12};
  static constexpr Register r13{13};
  static constexpr Register r14{14};
  static constexpr Register r15{15};

  static constexpr XMMRegister no_xmm_register{0x80};
  static constexpr XMMRegister xmm0{0};
  static constexpr XMMRegister xmm1{1};
  static constexpr XMMRegister xmm2{2};
  static constexpr XMMRegister xmm3{3};
  static constexpr XMMRegister xmm4{4};
  static constexpr XMMRegister xmm5{5};
  static constexpr XMMRegister xmm6{6};
  static constexpr XMMRegister xmm7{7};
  static constexpr XMMRegister xmm8{8};
  static constexpr XMMRegister xmm9{9};
  static constexpr XMMRegister xmm10{10};
  static constexpr XMMRegister xmm11{11};
  static constexpr XMMRegister xmm12{12};
  static constexpr XMMRegister xmm13{13};
  static constexpr XMMRegister xmm14{14};
  static constexpr XMMRegister xmm15{15};

  // Macroassembler uses these names to support both x86-32 and x86-64 modes.
  static constexpr Register gpr_a{0};
  static constexpr Register gpr_c{1};
  static constexpr Register gpr_d{2};
  static constexpr Register gpr_s{4};

  // Instructions.
#include "berberis/assembler/gen_assembler_x86_64-inl.h"  // NOLINT generated file!

  // Historical curiosity: x86-32 mode has a Movq for memory-to-xmm operations.
  // x86-64 added another one with a different opcode, but since they are functionally
  // equivalent, GNU Assembler and Clang use the old one in both 32-bit and 64-bit mode,
  // and we do the same.

  // Unhide Decl(Mem) hidden by Decl(Reg).
  using BaseAssembler::Decl;

  // Unhide Decw(Mem) hidden by Decw(Reg).
  using BaseAssembler::Decw;

  // Unhide Incl(Mem) hidden by Incl(Reg).
  using BaseAssembler::Incl;

  // Unhide Incw(Mem) hidden by Incw(Reg).
  using BaseAssembler::Incw;

  // Unhide Movq(Mem, XMMReg) and Movq(XMMReg, Mem) hidden by Movq(Reg, Imm) and many others.
  using BaseAssembler::Movq;

  // Unhide Xchgl(Mem, Reg) hidden by the modified version below.
  using BaseAssembler::Xchgl;

  // Unhide Vmov*(Mem, Reg) hidden by Vmov*(Reg, Reg).
  using BaseAssembler::Vmovapd;
  using BaseAssembler::Vmovaps;
  using BaseAssembler::Vmovdqa;
  using BaseAssembler::Vmovdqu;
  using BaseAssembler::Vmovq;
  using BaseAssembler::Vmovsd;
  using BaseAssembler::Vmovss;

  void Xchgl(Register dest, Register src) {
    // In 32-bit mode "xchgl %eax, %eax" did nothing and was often reused as "nop".
    //
    // On x86-64 "xchgl %eax, %eax" clears the top half of the %rax register, but having a
    // single-byte nop is too convenient. Thus, as a special exception, 0x90 is not interpreted
    // as "xchgl %eax, %eax" and is kept as "nop", so a longer encoding for "xchgl %eax, %eax"
    // must be used.

    if (IsAccumulator(src) && IsAccumulator(dest)) {
      Emit16(0xc087);
    } else {
      BaseAssembler::Xchgl(dest, src);
    }
  }
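
  // Example: Xchgl(gpr_a, gpr_a) therefore emits the explicit two-byte form 87 c0 instead of
  // the single-byte 90, which stays reserved for "nop"; any other register pair falls through
  // to the regular BaseAssembler::Xchgl encoding.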

  // TODO(b/127356868): decide what to do with these functions when cross-arch assembler is used.

#ifdef __amd64__

  // Unhide Call(Reg), hidden by the special version below.
  using BaseAssembler::Call;

  void Call(const void* target) {
    // There is no call instruction with the properties we need, thus we emulate it.
    // This is what the following code looks like when decoded with objdump (if the
    // target address is 0x123456789abcdef0):
    //   0: ff 15 02 00 00 00        callq  *0x2(%rip)  # 0x8
    //   6: eb 08                    jmp    0x10
    //   8: f0 de bc 9a 78 56 34 12  lock fidivrs 0x12345678(%rdx,%rbx,4)
    // First we do the call - with the address taken from the last 8 bytes - then we jump over
    // these 8 bytes.
    Emit64(0x08eb0000000215ff);
    Emit64(bit_cast<int64_t>(target));
  }
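
  // Usage sketch (hypothetical target address, for illustration only):
  //   as.Call(reinterpret_cast<const void*>(0x123456789abcdef0));
  // emits ff 15 02 00 00 00 eb 08 followed by the raw 8-byte address, exactly as in the
  // objdump listing above; the indirect callq reads the address stored after the short jmp.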

  // Unhide Jcc(Label), hidden by the special version below.
  using BaseAssembler::Jcc;

  // Make sure only type void* can be passed to function below, not Label* or any other pointer.
  template <typename T>
  auto Jcc(Condition cc, T* target) -> void = delete;

  template <typename T>
  auto Jcc(Condition cc, T target)
      -> std::enable_if_t<std::is_integral_v<T> && sizeof(uintptr_t) < sizeof(T)> = delete;

  void Jcc(Condition cc, uintptr_t target) {
    if (cc == Condition::kAlways) {
      Jmp(target);
      return;
    } else if (cc == Condition::kNever) {
      return;
    }
    CHECK_EQ(0, static_cast<uint8_t>(cc) & 0xF0);
    // There is no Jcc instruction with the properties we need, thus we emulate it.
    // This is what the following code looks like when decoded with objdump (if the
    // target address is 0x123456789abcdef0):
    //   0: 75 0e                    jne    0x10
    //   2: ff 25 00 00 00 00        jmpq   *0x0(%rip)  # 0x8
    //   8: f0 de bc 9a 78 56 34 12  lock fidivrs 0x12345678(%rdx,%rbx,4)
    // We do a relative jump on the inverted condition (because Jcc can only jump ±2GiB,
    // which in 64-bit mode is not enough to reach an arbitrary address), then a jmpq with
    // the address stored right after it.
    Emit64(0x0000000025ff'0e70 | static_cast<int8_t>(ToReverseCond(cc)));
    Emit64(bit_cast<int64_t>(target));
  }

  void Jcc(Condition cc, const void* target) { Jcc(cc, bit_cast<uintptr_t>(target)); }
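
  // Usage sketch (illustration; assumes the Condition values mirror the hardware condition
  // codes, so kEqual inverts to kNotEqual):
  //   as.Jcc(Condition::kEqual, reinterpret_cast<const void*>(0x123456789abcdef0));
  // emits 75 0e (jne) followed by ff 25 00 00 00 00 and the raw 8-byte address; the inverted
  // short jump skips the indirect jmpq and the stored address when the condition is false.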

  // Unhide Jmp(Reg), hidden by the special version below.
  using BaseAssembler::Jmp;

  // Make sure only type void* can be passed to function below, not Label* or any other pointer.
  template <typename T>
  auto Jmp(T* target) -> void = delete;

  template <typename T>
  auto Jmp(T target)
      -> std::enable_if_t<std::is_integral_v<T> && sizeof(uintptr_t) < sizeof(T)> = delete;

  void Jmp(uintptr_t target) {
    // There is no jump instruction with the properties we need, thus we emulate it.
    // This is what the following code looks like when decoded with objdump (if the
    // target address is 0x123456789abcdef0):
    //   0: ff 25 00 00 00 00        jmpq   *0x0(%rip)  # 0x6
    //   6: f0 de bc 9a 78 56 34 12  lock fidivrs 0x12345678(%rdx,%rbx,4)
    // We jump to the address stored right after the jmpq, using %rip-relative addressing
    // (with offset 0).
    Emit16(0x25ff);
    Emit32(0x00000000);
    Emit64(bit_cast<int64_t>(target));
  }

  void Jmp(const void* target) { Jmp(bit_cast<uintptr_t>(target)); }

#endif

 private:
  Assembler() = delete;
  Assembler(const Assembler&) = delete;
  Assembler(Assembler&&) = delete;
  void operator=(const Assembler&) = delete;
  void operator=(Assembler&&) = delete;
  using DerivedAssemblerType = Assembler;

  static Register Accumulator() { return rax; }
  static bool IsAccumulator(Register reg) { return reg == rax; }

  struct Register64Bit {
    explicit constexpr Register64Bit(Register reg) : num_(reg.num_) {}
    uint8_t num_;
  };

  struct Memory64Bit {
    explicit Memory64Bit(const Operand& op) : operand(op) {}
    Operand operand;
  };

  struct Label64Bit {
    explicit Label64Bit(const LabelOperand& l) : label(l.label) {}
    const Label& label;
  };

  // This type is only used by CmpXchg16b and acts similarly to Memory64Bit there.
  using Memory128Bit = Memory64Bit;
  using Label128Bit = Label64Bit;

  // Check if a given type is "a register with size" (for EmitInstruction).
  template <typename ArgumentType>
  struct IsRegister {
    static constexpr bool value = std::is_same_v<ArgumentType, Register8Bit> ||
                                  std::is_same_v<ArgumentType, Register32Bit> ||
                                  std::is_same_v<ArgumentType, Register64Bit>;
  };

  // Check if a given type is "a memory operand with size" (for EmitInstruction).
  template <typename ArgumentType>
  struct IsMemoryOperand {
    static constexpr bool value =
        std::is_same_v<ArgumentType, Memory32Bit> || std::is_same_v<ArgumentType, Memory64Bit>;
  };

  template <typename ArgumentType>
  struct IsLabelOperand {
    static constexpr bool value =
        std::is_same_v<ArgumentType, Label32Bit> || std::is_same_v<ArgumentType, Label64Bit>;
  };

  template <typename... ArgumentsTypes>
  void EmitRex(ArgumentsTypes... arguments) {
    constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
    constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
    static_assert(registers_count + operands_count <= 2,
                  "Only two-argument instructions are supported, not VEX or EVEX");
    uint8_t rex = 0;
    if constexpr (registers_count == 2) {
      rex = Rex<0b0100>(ArgumentByType<0, IsRegister>(arguments...)) |
            Rex<0b0001>(ArgumentByType<1, IsRegister>(arguments...));
    } else if constexpr (registers_count == 1 && operands_count == 1) {
      rex = Rex<0b0100>(ArgumentByType<0, IsRegister>(arguments...)) |
            Rex(ArgumentByType<0, IsMemoryOperand>(arguments...));
    } else if constexpr (registers_count == 1) {
      rex = Rex<0b0001>(ArgumentByType<0, IsRegister>(arguments...));
    } else if constexpr (operands_count == 1) {
      rex = Rex(ArgumentByType<0, IsMemoryOperand>(arguments...));
    }
    if (rex) {
      Emit8(rex);
    }
  }

  template <uint8_t base_rex, typename ArgumentType>
  uint8_t Rex(ArgumentType argument) {
    if (argument.num_ & 0b1000) {
      // A high register needs its extension bit; a 64-bit argument additionally needs REX.W.
      if (std::is_same_v<ArgumentType, Register64Bit>) {
        return 0b0100'1000 | base_rex;
      }
      return 0b0100'0000 | base_rex;
    }
    // An 8-bit argument with num_ > 3 (%spl, %bpl, %sil, %dil) requires a REX prefix even
    // without any extension bits set.
    if (std::is_same_v<ArgumentType, Register8Bit> && argument.num_ > 3) {
      return 0b0100'0000;
    }
    if (std::is_same_v<ArgumentType, Register64Bit>) {
      return 0b0100'1000;
    }
    return 0;
  }
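
  // Illustration of the values produced above: a 64-bit use of r9 in the r/m slot yields
  // REX 0x49 (REX.W plus REX.B), a 64-bit use of rcx yields 0x48 (REX.W alone), and an
  // 8-bit use of rsi yields the bare 0x40 prefix that selects %sil instead of %dh.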

  uint8_t Rex(Operand operand) {
    // REX.B and REX.X always come from the operand.
    uint8_t rex = ((operand.base.num_ & 0b1000) >> 3) | ((operand.index.num_ & 0b1000) >> 2);
    if (rex) {
      // We actually need the REX byte here.
      return 0b0100'0000 | rex;
    } else {
      return 0;
    }
  }

  uint8_t Rex(Memory32Bit operand) { return Rex(operand.operand); }

  uint8_t Rex(Memory64Bit operand) {
    // A 64-bit argument requires the REX.W bit - and thus REX itself.
    return 0b0100'1000 | Rex(operand.operand);
  }

  template <typename RegisterType>
  [[nodiscard]] static bool IsSwapProfitable(RegisterType rm_arg, RegisterType vex_arg) {
    // In 64-bit mode we may use a more compact encoding if the operand encoded in rm is a
    // low register. Return true if we can achieve that by swapping the arguments.
    return rm_arg.num_ >= 8 && vex_arg.num_ < 8;
  }
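
  // The same reasoning drives the hand-written Vmov* overrides further below: for example,
  // Vmovapd(xmm1, xmm9) switches to the store-form opcode so that xmm9 lands in the reg field
  // (VEX.R) rather than in ModRM.rm, which lets EmitVex fall back to the two-byte C5 prefix
  // instead of the three-byte C4 form.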

  template <uint8_t byte1,
            uint8_t byte2,
            uint8_t byte3,
            bool reg_is_opcode_extension,
            typename... ArgumentsTypes>
  void EmitVex(ArgumentsTypes... arguments) {
    constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
    constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
    constexpr auto labels_count = kCountArguments<IsLabelOperand, ArgumentsTypes...>;
    constexpr auto vvvv_parameter = 2 - reg_is_opcode_extension - operands_count - labels_count;
    int vvvv = 0;
    if constexpr (registers_count > vvvv_parameter) {
      vvvv = ArgumentByType<vvvv_parameter, IsRegister>(arguments...).num_;
    }
    auto vex2 = byte2 | 0b111'00000;
    if constexpr (operands_count == 1) {
      auto operand = ArgumentByType<0, IsMemoryOperand>(arguments...);
      vex2 ^= (operand.operand.base.num_ & 0b1000) << 2;
      vex2 ^= (operand.operand.index.num_ & 0b1000) << 3;
      if constexpr (!reg_is_opcode_extension) {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 4;
      }
    } else if constexpr (labels_count == 1) {
      if constexpr (!reg_is_opcode_extension) {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 4;
      }
    } else if constexpr (registers_count > 0) {
      if constexpr (reg_is_opcode_extension) {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 2;
      } else {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 4;
        vex2 ^= (ArgumentByType<1, IsRegister>(arguments...).num_ & 0b1000) << 2;
      }
    }
    if (byte1 == 0xC4 && (vex2 & 0b0'1'1'11111) == 0b0'1'1'00001 && (byte3 & 0b1'0000'0'00) == 0) {
      Emit16((0xc5 | ((vex2 & 0b1'0'0'00000) << 8) | (byte3 << 8) | 0b0'1111'000'00000000) ^
             (vvvv << 11));
    } else {
      Emit8(byte1);
      Emit16((vex2 | (byte3 << 8) | 0b0'1111'000'00000000) ^ (vvvv << 11));
    }
  }
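
  // Encoding note (illustrative): the final branch collapses the three-byte C4 prefix into the
  // two-byte C5 form whenever only the 0F opcode map is needed, the X and B extension bits are
  // unused, and byte3 does not request VEX.W - exactly the situation the register swap above
  // tries to create.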

  template <typename ArgumentType>
  void EmitRegisterInOpcode(uint8_t opcode, ArgumentType argument) {
    Emit8(opcode | (argument.num_ & 0b111));
  }

  template <typename ArgumentType1, typename ArgumentType2>
  void EmitModRM(ArgumentType1 argument1, ArgumentType2 argument2) {
    Emit8(0xC0 | ((argument1.num_ & 0b111) << 3) | (argument2.num_ & 0b111));
  }

  template <typename ArgumentType>
  void EmitModRM(uint8_t opcode_extension, ArgumentType argument) {
    CHECK_LE(opcode_extension, 0b111);
    Emit8(0xC0 | (opcode_extension << 3) | (argument.num_ & 0b111));
  }

  template <typename ArgumentType>
  void EmitOperandOp(ArgumentType argument, Operand operand) {
    EmitOperandOp(static_cast<int>(argument.num_ & 0b111), operand);
  }

  template <size_t kImmediatesSize, typename ArgumentType>
  void EmitRipOp(ArgumentType argument, const Label& label) {
    EmitRipOp<kImmediatesSize>(static_cast<int>(argument.num_) & 0b111, label);
  }

  // Emit the ModR/M byte, and optionally the SIB byte and
  // 1- or 4-byte offset for a memory operand. Also used to encode
  // a three-bit opcode extension into the ModR/M byte.
  void EmitOperandOp(int number, const Operand& addr);
  // Helper functions to handle various ModR/M and SIB combinations.
  // Should *only* be called from EmitOperandOp!
  void EmitIndexDispOperand(int reg, const Operand& addr);
  template <typename ArgType, void (AssemblerBase::*)(ArgType)>
  void EmitBaseIndexDispOperand(int base_modrm_and_sib, const Operand& addr);
  // Emit ModR/M for rip-addressing.
  template <size_t kImmediatesSize>
  void EmitRipOp(int num_, const Label& label);

  friend BaseAssembler;
};

// This function looks big, but when we are emitting an Operand with fixed registers
// (which is the most common case) all the "if"s below are evaluated statically, which
// makes the effective size of this function very small.
//
// But for this to happen the function has to be inline and defined in the header.
inline void Assembler::EmitOperandOp(int number, const Operand& addr) {
  // Additional info (register number, etc.) is limited to 3 bits.
  CHECK_LE(unsigned(number), 7);

  // The reg field must be shifted by 3 bits.
  int reg = number << 3;

  // On x86 %rsp cannot be used as an index - only as a base.
  CHECK(addr.index != rsp);

  // If the base is not %rsp/%r12 and we don't have an index, then we don't need a SIB byte.
  // All other cases need both ModR/M and SIB bytes.
  if (addr.base != rsp && addr.base != r12 && addr.index == no_register) {
    // If we have a base register then we can use the same logic as for the other common cases.
    if (addr.base != no_register) {
      EmitBaseIndexDispOperand<uint8_t, &Assembler::Emit8>((addr.base.num_ & 7) | reg, addr);
    } else {
      Emit16(0x2504 | reg);
      Emit32(addr.disp);
    }
  } else if (addr.index == no_register) {
    // Note: when ModR/M and SIB are used, "no index" is encoded as if %rsp were used in place
    // of the index (that's why %rsp can't be used as an index - see the check above).
    EmitBaseIndexDispOperand<int16_t, &Assembler::Emit16>(
        0x2004 | ((addr.base.num_ & 7) << 8) | reg, addr);
  } else if (addr.base == no_register) {
    EmitIndexDispOperand(reg, addr);
  } else {
    EmitBaseIndexDispOperand<int16_t, &Assembler::Emit16>(
        0x04 | (addr.scale << 14) | ((addr.index.num_ & 7) << 11) | ((addr.base.num_ & 7) << 8) |
            reg,
        addr);
  }
}
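
// Worked example (illustrative): with number == 2 and an operand whose base is rsi, with no
// index and zero displacement, the first branch emits the single ModR/M byte 0x16 (mod=00,
// reg=2, rm=rsi); the same operand with base rsp instead takes the SIB path and comes out as
// the two bytes 0x14 0x24.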

inline void Assembler::EmitIndexDispOperand(int reg, const Operand& addr) {
  // We only have an index here, no base; use SIB but put %rbp in the "base" field.
  Emit16(0x0504 | (addr.scale << 14) | ((addr.index.num_ & 7) << 11) | reg);
  Emit32(addr.disp);
}

template <size_t kImmediatesSize>
inline void Assembler::EmitRipOp(int num_, const Label& label) {
  Emit8(0x05 | (num_ << 3));
  jumps_.push_back(Jump{&label, pc(), false});
  Emit32(0xfffffffc - kImmediatesSize);
}

template <typename ArgType, void (AssemblerBase::*EmitBase)(ArgType)>
inline void Assembler::EmitBaseIndexDispOperand(int base_modrm_and_sib, const Operand& addr) {
  if (addr.disp == 0 && addr.base != rbp && addr.base != r13) {
    // We can omit a zero displacement only if the base isn't %rbp/%r13.
    (this->*EmitBase)(base_modrm_and_sib);
  } else if (IsInRange<int8_t>(addr.disp)) {
    // If disp fits in a byte then use the byte-disp form.
    (this->*EmitBase)(base_modrm_and_sib | 0x40);
    Emit8(addr.disp);
  } else {
    // Otherwise use the full 32-bit disp.
    (this->*EmitBase)(base_modrm_and_sib | 0x80);
    Emit32(addr.disp);
  }
}
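
// Note (illustrative): a zero displacement still costs a byte when the base is %rbp or %r13,
// because mod=00 with that base encoding is interpreted as rip-relative or disp32-only
// addressing; so a plain [%rbp] is emitted in the disp8 form with an explicit 0x00.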

inline void Assembler::Movq(Register dest, int64_t imm64) {
  if (IsInRange<uint32_t>(imm64)) {
    // Shorter encoding.
    Movl(dest, static_cast<uint32_t>(imm64));
  } else if (IsInRange<int32_t>(imm64)) {
    // Slightly longer encoding.
    EmitInstruction<0xc7, 0x00>(Register64Bit(dest), static_cast<int32_t>(imm64));
  } else {
    // Longest encoding.
    EmitInstruction<0xb8>(Register64Bit(dest), imm64);
  }
}
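
// Size sketch (standard x86-64 encodings, for illustration): Movq(rax, 1) takes the Movl path
// and drops the REX.W prefix entirely; Movq(rax, -1) uses the sign-extended imm32 form,
// 48 c7 c0 ff ff ff ff (7 bytes); Movq(rax, 0x123456789abcdef0) needs the full 10-byte form
// 48 b8 f0 de bc 9a 78 56 34 12.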

inline void Assembler::Vmovapd(XMMRegister arg0, XMMRegister arg1) {
  if (arg0.num_ < 8 && arg1.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x01, 0x29>(VectorRegister128Bit(arg1),
                                                   VectorRegister128Bit(arg0));
  }
  EmitInstruction<0xc4, 0x01, 0x01, 0x28>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
}

inline void Assembler::Vmovaps(XMMRegister arg0, XMMRegister arg1) {
  if (arg0.num_ < 8 && arg1.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x00, 0x29>(VectorRegister128Bit(arg1),
                                                   VectorRegister128Bit(arg0));
  }
  EmitInstruction<0xc4, 0x01, 0x00, 0x28>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
}

inline void Assembler::Vmovdqa(XMMRegister arg0, XMMRegister arg1) {
  if (arg0.num_ < 8 && arg1.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x01, 0x7F>(VectorRegister128Bit(arg1),
                                                   VectorRegister128Bit(arg0));
  }
  EmitInstruction<0xc4, 0x01, 0x01, 0x6F>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
}

inline void Assembler::Vmovdqu(XMMRegister arg0, XMMRegister arg1) {
  if (arg0.num_ < 8 && arg1.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x02, 0x7F>(VectorRegister128Bit(arg1),
                                                   VectorRegister128Bit(arg0));
  }
  EmitInstruction<0xc4, 0x01, 0x02, 0x6F>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
}

inline void Assembler::Vmovsd(XMMRegister arg0, XMMRegister arg1, XMMRegister arg2) {
  if (arg0.num_ < 8 && arg2.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x03, 0x11>(
        VectorRegister128Bit(arg2), VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
  }
  EmitInstruction<0xc4, 0x01, 0x03, 0x10>(
      VectorRegister128Bit(arg0), VectorRegister128Bit(arg2), VectorRegister128Bit(arg1));
}

inline void Assembler::Vmovss(XMMRegister arg0, XMMRegister arg1, XMMRegister arg2) {
  if (arg0.num_ < 8 && arg2.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x02, 0x11>(
        VectorRegister128Bit(arg2), VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
  }
  EmitInstruction<0xc4, 0x01, 0x02, 0x10>(
      VectorRegister128Bit(arg0), VectorRegister128Bit(arg2), VectorRegister128Bit(arg1));
}

inline void Assembler::Xchgq(Register dest, Register src) {
  // We compare our output to clang's and thus want to produce the same code. 0x48 0x90 is a
  // suboptimal encoding for that operation (a plain 0x90 does the same and is what gcc + gas
  // produce), but it is what clang <= 8 does.
  if (IsAccumulator(src) && IsAccumulator(dest)) {
    Emit8(0x90);
  } else if (IsAccumulator(src) || IsAccumulator(dest)) {
    Register other = IsAccumulator(src) ? dest : src;
    EmitInstruction<0x90>(Register64Bit(other));
  } else {
    // Clang 8 (after r330298) puts dest before src. We compare output to clang in an
    // exhaustive test, thus we want to match clang's behavior exactly.
    EmitInstruction<0x87>(Register64Bit(dest), Register64Bit(src));
  }
}
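
// Encoding sketch (illustrative, matching the clang output described above): Xchgq(rax, rcx)
// takes the accumulator branch and should come out as 48 91 (REX.W plus 0x90|reg), while a
// pair of non-accumulator registers gets the generic REX.W 0x87 ModR/M form.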

}  // namespace x86_64

}  // namespace berberis

#endif  // BERBERIS_ASSEMBLER_X86_64_H_