/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Assembler to produce x86-64 instructions. Somewhat influenced by V8 assembler.

#ifndef BERBERIS_ASSEMBLER_X86_64_H_
#define BERBERIS_ASSEMBLER_X86_64_H_

#include <type_traits>  // std::is_same

#include "berberis/assembler/x86_32_and_x86_64.h"
#include "berberis/base/logging.h"

namespace berberis {

class MachineCode;

namespace x86_64 {

class Assembler : public x86_32_and_x86_64::Assembler<Assembler> {
 public:
  using BaseAssembler = x86_32_and_x86_64::Assembler<Assembler>;
  using FinalAssembler = Assembler;

  explicit Assembler(MachineCode* code) : BaseAssembler(code) {}

  static constexpr Register no_register{0x80};
  static constexpr Register rax{0};
  static constexpr Register rcx{1};
  static constexpr Register rdx{2};
  static constexpr Register rbx{3};
  static constexpr Register rsp{4};
  static constexpr Register rbp{5};
  static constexpr Register rsi{6};
  static constexpr Register rdi{7};
  static constexpr Register r8{8};
  static constexpr Register r9{9};
  static constexpr Register r10{10};
  static constexpr Register r11{11};
  static constexpr Register r12{12};
  static constexpr Register r13{13};
  static constexpr Register r14{14};
  static constexpr Register r15{15};

  static constexpr XMMRegister no_xmm_register{0x80};
  static constexpr XMMRegister xmm0{0};
  static constexpr XMMRegister xmm1{1};
  static constexpr XMMRegister xmm2{2};
  static constexpr XMMRegister xmm3{3};
  static constexpr XMMRegister xmm4{4};
  static constexpr XMMRegister xmm5{5};
  static constexpr XMMRegister xmm6{6};
  static constexpr XMMRegister xmm7{7};
  static constexpr XMMRegister xmm8{8};
  static constexpr XMMRegister xmm9{9};
  static constexpr XMMRegister xmm10{10};
  static constexpr XMMRegister xmm11{11};
  static constexpr XMMRegister xmm12{12};
  static constexpr XMMRegister xmm13{13};
  static constexpr XMMRegister xmm14{14};
  static constexpr XMMRegister xmm15{15};

  // Macroassembler uses these names to support both x86-32 and x86-64 modes.
  static constexpr Register gpr_a{0};
  static constexpr Register gpr_c{1};
  static constexpr Register gpr_d{2};
  static constexpr Register gpr_s{4};

// Instructions.
#include "berberis/assembler/gen_assembler_x86_64-inl.h"  // NOLINT generated file!
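
  // Usage sketch (illustrative only; the instruction mnemonics come from the generated
  // header included above, and MachineCode is the code buffer from berberis/assembler):
  //   MachineCode code;
  //   Assembler as(&code);
  //   as.Movq(Assembler::rax, int64_t{42});
  //   as.Xchgq(Assembler::rax, Assembler::rbx);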

  // Historical curiosity: x86-32 mode has a Movq for memory-to-xmm operations.
  // x86-64 added another one with a different opcode, but since they are functionally
  // equivalent, GNU Assembler and Clang use the old one in both 32-bit and 64-bit mode,
  // thus we do the same.

  // Unhide Decl(Mem) hidden by Decl(Reg).
  using BaseAssembler::Decl;

  // Unhide Decw(Mem) hidden by Decw(Reg).
  using BaseAssembler::Decw;

  // Unhide Incl(Mem) hidden by Incl(Reg).
  using BaseAssembler::Incl;

  // Unhide Incw(Mem) hidden by Incw(Reg).
  using BaseAssembler::Incw;

  // Unhide Movq(Mem, XMMReg) and Movq(XMMReg, Mem) hidden by Movq(Reg, Imm) and many others.
  using BaseAssembler::Movq;

  // Unhide Xchgl(Mem, Reg) hidden by modified version below.
  using BaseAssembler::Xchgl;

  // Unhide Vmov*(Mem, Reg) hidden by Vmov*(Reg, Reg).
  using BaseAssembler::Vmovapd;
  using BaseAssembler::Vmovaps;
  using BaseAssembler::Vmovdqa;
  using BaseAssembler::Vmovdqu;
  using BaseAssembler::Vmovq;
  using BaseAssembler::Vmovsd;
  using BaseAssembler::Vmovss;

  void Xchgl(Register dest, Register src) {
    // In 32-bit mode "xchgl %eax, %eax" does nothing and is often reused as "nop".
    //
    // On x86-64 "xchgl %eax, %eax" clears the top half of %rax, but having a single-byte nop
    // is too convenient, thus, as a special exception, 0x90 is not interpreted as
    // "xchgl %eax, %eax" but kept as "nop" - hence a longer encoding must be used for
    // "xchgl %eax, %eax".

    if (IsAccumulator(src) && IsAccumulator(dest)) {
      Emit16(0xc087);
    } else {
      BaseAssembler::Xchgl(dest, src);
    }
  }
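
  // Note: Emit16(0xc087) above emits, little-endian, the bytes 0x87 0xc0, i.e.
  // "xchg %eax, %eax" with an explicit ModR/M byte instead of the one-byte 0x90 form.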

  // TODO(b/127356868): decide what to do with these functions when cross-arch assembler is used.

#ifdef __amd64__

  // Unhide Call(Reg), hidden by special version below.
  using BaseAssembler::Call;

  void Call(const void* target) {
    // There is no call instruction with the properties we need, thus we emulate it.
    // This is what the following code looks like when decoded with objdump (if the
    // target address is 0x123456789abcdef0):
    //   0: ff 15 02 00 00 00        callq  *0x2(%rip) # 0x8
    //   6: eb 08                    jmp    0x10
    //   8: f0 de bc 9a 78 56 34 12  lock fidivrs 0x12345678(%rdx,%rbx,4)
    // First we do the call - with the address taken from the last 8 bytes - then we
    // jump over these 8 bytes.
    Emit64(0x08eb0000000215ff);
    Emit64(bit_cast<int64_t>(target));
  }
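
  // Note: Emit64(0x08eb0000000215ff) above emits, little-endian, exactly the bytes
  // ff 15 02 00 00 00 eb 08 shown in the objdump listing; the second Emit64 appends
  // the 8-byte target address they refer to.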

  // Unhide Jcc(Label), hidden by special version below.
  using BaseAssembler::Jcc;

  // Make sure only type void* can be passed to function below, not Label* or any other pointer.
  template <typename T>
  auto Jcc(Condition cc, T* target) -> void = delete;

  template <typename T>
  auto Jcc(Condition cc, T target)
      -> std::enable_if_t<std::is_integral_v<T> && sizeof(uintptr_t) < sizeof(T)> = delete;

  void Jcc(Condition cc, uintptr_t target) {
    if (cc == Condition::kAlways) {
      Jmp(target);
      return;
    } else if (cc == Condition::kNever) {
      return;
    }
    CHECK_EQ(0, static_cast<uint8_t>(cc) & 0xF0);
    // There is no Jcc instruction with the properties we need, thus we emulate it.
    // This is what the following code looks like when decoded with objdump (if the
    // target address is 0x123456789abcdef0):
    //   0: 75 0e                   jne    0x10
    //   2: ff 25 00 00 00 00       jmpq   *0x0(%rip) # 0x8
    //   8: f0 de bc 9a 78 56 34 12 lock fidivrs 0x12345678(%rdx,%rbx,4)
    // We do a short relative jump over the sequence for the inverted condition (because
    // Jcc can only jump ±2GiB, even in 64-bit mode, which is not enough to reach an
    // arbitrary address), then a jmpq through the address stored right after it.
    Emit64(0x0000000025ff'0e70 | static_cast<int8_t>(ToReverseCond(cc)));
    Emit64(bit_cast<int64_t>(target));
  }
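
  // Note: 0x70 is the base opcode of the short Jcc forms, so OR-ing in the inverted
  // condition yields the leading "7x 0e" (jcc over the next 14 bytes) followed,
  // little-endian, by ff 25 00 00 00 00 - the %rip-relative jmpq from the listing above.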

  void Jcc(Condition cc, const void* target) { Jcc(cc, bit_cast<uintptr_t>(target)); }

  // Unhide Jmp(Reg), hidden by special version below.
  using BaseAssembler::Jmp;

  // Make sure only type void* can be passed to function below, not Label* or any other pointer.
  template <typename T>
  auto Jmp(T* target) -> void = delete;

  template <typename T>
  auto Jmp(T target)
      -> std::enable_if_t<std::is_integral_v<T> && sizeof(uintptr_t) < sizeof(T)> = delete;

  void Jmp(uintptr_t target) {
    // There is no jump instruction with the properties we need, thus we emulate it.
    // This is what the following code looks like when decoded with objdump (if the
    // target address is 0x123456789abcdef0):
    //   0: ff 25 00 00 00 00       jmpq   *0x0(%rip) # 0x6
    //   6: f0 de bc 9a 78 56 34 12 lock fidivrs 0x12345678(%rdx,%rbx,4)
    // We jump to the address stored right after the jmpq, using %rip-relative
    // addressing (with offset 0).
    Emit16(0x25ff);
    Emit32(0x00000000);
    Emit64(bit_cast<int64_t>(target));
  }

  void Jmp(const void* target) { Jmp(bit_cast<uintptr_t>(target)); }

#endif

 private:
  Assembler() = delete;
  Assembler(const Assembler&) = delete;
  Assembler(Assembler&&) = delete;
  void operator=(const Assembler&) = delete;
  void operator=(Assembler&&) = delete;
  using DerivedAssemblerType = Assembler;

  static Register Accumulator() { return rax; }
  static bool IsAccumulator(Register reg) { return reg == rax; }

  struct Register64Bit {
    explicit constexpr Register64Bit(Register reg) : num_(reg.num_) {}
    uint8_t num_;
  };

  struct Memory64Bit {
    explicit Memory64Bit(const Operand& op) : operand(op) {}
    Operand operand;
  };

  struct Label64Bit {
    explicit Label64Bit(const LabelOperand& l) : label(l.label) {}
    const Label& label;
  };

  // This type is only used by CmpXchg16b and acts similarly to Memory64Bit there.
  using Memory128Bit = Memory64Bit;
  using Label128Bit = Label64Bit;

  // Check if a given type is "a register with size" (for EmitInstruction).
  template <typename ArgumentType>
  struct IsRegister {
    static constexpr bool value = std::is_same_v<ArgumentType, Register8Bit> ||
                                  std::is_same_v<ArgumentType, Register32Bit> ||
                                  std::is_same_v<ArgumentType, Register64Bit>;
  };

  // Check if a given type is "a memory operand with size" (for EmitInstruction).
  template <typename ArgumentType>
  struct IsMemoryOperand {
    static constexpr bool value =
        std::is_same_v<ArgumentType, Memory32Bit> || std::is_same_v<ArgumentType, Memory64Bit>;
  };

  template <typename ArgumentType>
  struct IsLabelOperand {
    static constexpr bool value =
        std::is_same_v<ArgumentType, Label32Bit> || std::is_same_v<ArgumentType, Label64Bit>;
  };

  template <typename... ArgumentsTypes>
  void EmitRex(ArgumentsTypes... arguments) {
    constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
    constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
    static_assert(registers_count + operands_count <= 2,
                  "Only two-argument instructions are supported, not VEX or EVEX");
    uint8_t rex = 0;
    if constexpr (registers_count == 2) {
      rex = Rex<0b0100>(ArgumentByType<0, IsRegister>(arguments...)) |
            Rex<0b0001>(ArgumentByType<1, IsRegister>(arguments...));
    } else if constexpr (registers_count == 1 && operands_count == 1) {
      rex = Rex<0b0100>(ArgumentByType<0, IsRegister>(arguments...)) |
            Rex(ArgumentByType<0, IsMemoryOperand>(arguments...));
    } else if constexpr (registers_count == 1) {
      rex = Rex<0b0001>(ArgumentByType<0, IsRegister>(arguments...));
    } else if constexpr (operands_count == 1) {
      rex = Rex(ArgumentByType<0, IsMemoryOperand>(arguments...));
    }
    if (rex) {
      Emit8(rex);
    }
  }
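
  // For example (sketching the logic above): Register64Bit arguments (r8, rax) yield the
  // single byte 0x4C (REX.W + REX.R), (rax, r8) yield 0x49 (REX.W + REX.B), and two low
  // Register32Bit arguments yield no REX prefix at all.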

  template <uint8_t base_rex, typename ArgumentType>
  uint8_t Rex(ArgumentType argument) {
    if (argument.num_ & 0b1000) {
      // 64-bit argument requires REX.W bit
      if (std::is_same_v<ArgumentType, Register64Bit>) {
        return 0b0100'1000 | base_rex;
      }
      return 0b0100'0000 | base_rex;
    }
    // 8-bit argument requires REX (even if without any bits).
    if (std::is_same_v<ArgumentType, Register8Bit> && argument.num_ > 3) {
      return 0b0100'0000;
    }
    if (std::is_same_v<ArgumentType, Register64Bit>) {
      return 0b0100'1000;
    }
    return 0;
  }

  uint8_t Rex(Operand operand) {
    // REX.B and REX.X always come from operand.
    uint8_t rex = ((operand.base.num_ & 0b1000) >> 3) | ((operand.index.num_ & 0b1000) >> 2);
    if (rex) {
      // We actually need rex byte here.
      return 0b0100'0000 | rex;
    } else {
      return 0;
    }
  }

  uint8_t Rex(Memory32Bit operand) { return Rex(operand.operand); }

  uint8_t Rex(Memory64Bit operand) {
    // 64-bit argument requires REX.W bit - and thus REX itself.
    return 0b0100'1000 | Rex(operand.operand);
  }

  template <typename RegisterType>
  [[nodiscard]] static bool IsSwapProfitable(RegisterType rm_arg, RegisterType vex_arg) {
    // In 64-bit mode we may use a more compact encoding if the operand encoded in rm is a
    // low register. Return true if we may achieve that by swapping the arguments.
    return rm_arg.num_ >= 8 && vex_arg.num_ < 8;
  }
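
  // Why a swap helps: the two-byte VEX prefix can encode VEX.R but not VEX.B, so keeping
  // extended registers out of the rm field allows the shorter 0xC5 form that EmitVex
  // below falls back to.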

  template <uint8_t byte1,
            uint8_t byte2,
            uint8_t byte3,
            bool reg_is_opcode_extension,
            typename... ArgumentsTypes>
  void EmitVex(ArgumentsTypes... arguments) {
    constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
    constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
    constexpr auto labels_count = kCountArguments<IsLabelOperand, ArgumentsTypes...>;
    constexpr auto vvvv_parameter = 2 - reg_is_opcode_extension - operands_count - labels_count;
    int vvvv = 0;
    if constexpr (registers_count > vvvv_parameter) {
      vvvv = ArgumentByType<vvvv_parameter, IsRegister>(arguments...).num_;
    }
    auto vex2 = byte2 | 0b111'00000;
    if constexpr (operands_count == 1) {
      auto operand = ArgumentByType<0, IsMemoryOperand>(arguments...);
      vex2 ^= (operand.operand.base.num_ & 0b1000) << 2;
      vex2 ^= (operand.operand.index.num_ & 0b1000) << 3;
      if constexpr (!reg_is_opcode_extension) {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 4;
      }
    } else if constexpr (labels_count == 1) {
      if constexpr (!reg_is_opcode_extension) {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 4;
      }
    } else if constexpr (registers_count > 0) {
      if constexpr (reg_is_opcode_extension) {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 2;
      } else {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 4;
        vex2 ^= (ArgumentByType<1, IsRegister>(arguments...).num_ & 0b1000) << 2;
      }
    }
    if (byte1 == 0xC4 && (vex2 & 0b0'1'1'11111) == 0b0'1'1'00001 && (byte3 & 0b1'0000'0'00) == 0) {
      Emit16((0xc5 | ((vex2 & 0b1'0'0'00000) << 8) | (byte3 << 8) |
              0b0'1111'000'00000000) ^ (vvvv << 11));
    } else {
      Emit8(byte1);
      Emit16((vex2 | (byte3 << 8) | 0b0'1111'000'00000000) ^ (vvvv << 11));
    }
  }
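
  // The branch above picks the two-byte 0xC5 VEX prefix whenever it can express the
  // instruction: that requires the 0F opcode map, no VEX.X/VEX.B extension bits and a
  // clear VEX.W bit; otherwise the full three-byte 0xC4 form is emitted.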

  template <typename ArgumentType>
  void EmitRegisterInOpcode(uint8_t opcode, ArgumentType argument) {
    Emit8(opcode | (argument.num_ & 0b111));
  }

  template <typename ArgumentType1, typename ArgumentType2>
  void EmitModRM(ArgumentType1 argument1, ArgumentType2 argument2) {
    Emit8(0xC0 | ((argument1.num_ & 0b111) << 3) | (argument2.num_ & 0b111));
  }

  template <typename ArgumentType>
  void EmitModRM(uint8_t opcode_extension, ArgumentType argument) {
    CHECK_LE(opcode_extension, 0b111);
    Emit8(0xC0 | (opcode_extension << 3) | (argument.num_ & 0b111));
  }

  template <typename ArgumentType>
  void EmitOperandOp(ArgumentType argument, Operand operand) {
    EmitOperandOp(static_cast<int>(argument.num_ & 0b111), operand);
  }

  template <size_t kImmediatesSize, typename ArgumentType>
  void EmitRipOp(ArgumentType argument, const Label& label) {
    EmitRipOp<kImmediatesSize>(static_cast<int>(argument.num_) & 0b111, label);
  }

  // Emit the ModR/M byte, and optionally the SIB byte and
  // 1- or 4-byte offset for a memory operand.  Also used to encode
  // a three-bit opcode extension into the ModR/M byte.
  void EmitOperandOp(int number, const Operand& addr);
  // Helper functions to handle various ModR/M and SIB combinations.
  // Should *only* be called from EmitOperandOp!
  void EmitIndexDispOperand(int reg, const Operand& addr);
  template <typename ArgType, void (AssemblerBase::*)(ArgType)>
  void EmitBaseIndexDispOperand(int base_modrm_and_sib, const Operand& addr);
  // Emit ModR/M for rip-addressing.
  template <size_t kImmediatesSize>
  void EmitRipOp(int num_, const Label& label);

  friend BaseAssembler;
};

// This function looks big, but when we are emitting an Operand with fixed registers
// (which is the most common case) all the "if"s below are evaluated statically, which
// makes the effective size of this function very small.
//
// But for this to happen the function has to be inline and in the header.
inline void Assembler::EmitOperandOp(int number, const Operand& addr) {
  // Additional info (register number, etc) is limited to 3 bits.
  CHECK_LE(unsigned(number), 7);

  // Reg field must be shifted by 3 bits.
  int reg = number << 3;

  // On x86 %rsp cannot be an index, only a base.
  CHECK(addr.index != rsp);

  // If base is not %rsp/%r12 and we don't have an index, then we don't have a SIB byte.
  // All other cases have both "ModR/M" and SIB bytes.
  if (addr.base != rsp && addr.base != r12 && addr.index == no_register) {
    // If we have a base register then we can use the same logic as for other common cases.
    if (addr.base != no_register) {
      EmitBaseIndexDispOperand<uint8_t, &Assembler::Emit8>((addr.base.num_ & 7) | reg, addr);
    } else {
      Emit16(0x2504 | reg);
      Emit32(addr.disp);
    }
  } else if (addr.index == no_register) {
    // Note: when ModR/M and SIB are used, "no index" is encoded as if %rsp were used in place
    // of the index (that's why %rsp can't be used as an index - see the check above).
    EmitBaseIndexDispOperand<int16_t, &Assembler::Emit16>(
        0x2004 | ((addr.base.num_ & 7) << 8) | reg, addr);
  } else if (addr.base == no_register) {
    EmitIndexDispOperand(reg, addr);
  } else {
    EmitBaseIndexDispOperand<int16_t, &Assembler::Emit16>(0x04 | (addr.scale << 14) |
                                                              ((addr.index.num_ & 7) << 11) |
                                                              ((addr.base.num_ & 7) << 8) | reg,
                                                          addr);
  }
}
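
// A few resulting encodings (sketch, reg == 0 in all cases): a plain [%rax] operand becomes
// just the ModR/M byte 0x00; [%rsp] or [%r12] need a SIB byte (ModR/M 0x04 + SIB); an
// absolute address with neither base nor index becomes ModR/M 0x04, SIB 0x25 and a 32-bit
// displacement - exactly the 0x2504 emitted above.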

inline void Assembler::EmitIndexDispOperand(int reg, const Operand& addr) {
  // We only have an index here, no base: use SIB but put %rbp in the "base" field.
  Emit16(0x0504 | (addr.scale << 14) | ((addr.index.num_ & 7) << 11) | reg);
  Emit32(addr.disp);
}

template <size_t kImmediatesSize>
inline void Assembler::EmitRipOp(int num_, const Label& label) {
  Emit8(0x05 | (num_ << 3));
  jumps_.push_back(Jump{&label, pc(), false});
  Emit32(0xfffffffc - kImmediatesSize);
}
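
// Note: the placeholder written here is -4 - kImmediatesSize; this appears to account for
// the fact that a %rip-relative displacement is counted from the end of the instruction
// (past the 4 displacement bytes and any trailing immediates) when the recorded jump is
// later resolved by the base assembler.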

template <typename ArgType, void (AssemblerBase::*EmitBase)(ArgType)>
inline void Assembler::EmitBaseIndexDispOperand(int base_modrm_and_sib, const Operand& addr) {
  if (addr.disp == 0 && addr.base != rbp && addr.base != r13) {
    // We can omit a zero displacement only if the base isn't %rbp/%r13.
    (this->*EmitBase)(base_modrm_and_sib);
  } else if (IsInRange<int8_t>(addr.disp)) {
    // If disp fits in a byte then use the byte-sized disp.
    (this->*EmitBase)(base_modrm_and_sib | 0x40);
    Emit8(addr.disp);
  } else {
    // Otherwise use the full disp.
    (this->*EmitBase)(base_modrm_and_sib | 0x80);
    Emit32(addr.disp);
  }
}

inline void Assembler::Movq(Register dest, int64_t imm64) {
  if (IsInRange<uint32_t>(imm64)) {
    // Shorter encoding.
    Movl(dest, static_cast<uint32_t>(imm64));
  } else if (IsInRange<int32_t>(imm64)) {
    // Slightly longer encoding.
    EmitInstruction<0xc7, 0x00>(Register64Bit(dest), static_cast<int32_t>(imm64));
  } else {
    // Longest encoding.
    EmitInstruction<0xb8>(Register64Bit(dest), imm64);
  }
}
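
// Rough sizes of the three forms above (sketch): Movq(rax, 1) degrades to the 5-byte movl;
// Movq(rax, -1) uses the 7-byte sign-extended imm32 form; anything that fits neither takes
// the full 10-byte movabs encoding.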

inline void Assembler::Vmovapd(XMMRegister arg0, XMMRegister arg1) {
  if (arg0.num_ < 8 && arg1.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x01, 0x29>(VectorRegister128Bit(arg1),
                                                   VectorRegister128Bit(arg0));
  }
  EmitInstruction<0xc4, 0x01, 0x01, 0x28>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
}

inline void Assembler::Vmovaps(XMMRegister arg0, XMMRegister arg1) {
  if (arg0.num_ < 8 && arg1.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x00, 0x29>(VectorRegister128Bit(arg1),
                                                   VectorRegister128Bit(arg0));
  }
  EmitInstruction<0xc4, 0x01, 0x00, 0x28>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
}

inline void Assembler::Vmovdqa(XMMRegister arg0, XMMRegister arg1) {
  if (arg0.num_ < 8 && arg1.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x01, 0x7F>(VectorRegister128Bit(arg1),
                                                   VectorRegister128Bit(arg0));
  }
  EmitInstruction<0xc4, 0x01, 0x01, 0x6F>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
}

inline void Assembler::Vmovdqu(XMMRegister arg0, XMMRegister arg1) {
  if (arg0.num_ < 8 && arg1.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x02, 0x7F>(VectorRegister128Bit(arg1),
                                                   VectorRegister128Bit(arg0));
  }
  EmitInstruction<0xc4, 0x01, 0x02, 0x6F>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
}

inline void Assembler::Vmovsd(XMMRegister arg0, XMMRegister arg1, XMMRegister arg2) {
  if (arg0.num_ < 8 && arg2.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x03, 0x11>(
        VectorRegister128Bit(arg2), VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
  }
  EmitInstruction<0xc4, 0x01, 0x03, 0x10>(
      VectorRegister128Bit(arg0), VectorRegister128Bit(arg2), VectorRegister128Bit(arg1));
}

inline void Assembler::Vmovss(XMMRegister arg0, XMMRegister arg1, XMMRegister arg2) {
  if (arg0.num_ < 8 && arg2.num_ >= 8) {
    return EmitInstruction<0xc4, 0x01, 0x02, 0x11>(
        VectorRegister128Bit(arg2), VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
  }
  EmitInstruction<0xc4, 0x01, 0x02, 0x10>(
      VectorRegister128Bit(arg0), VectorRegister128Bit(arg2), VectorRegister128Bit(arg1));
}


inline void Assembler::Xchgq(Register dest, Register src) {
  // We compare our output to clang's and thus want to produce the same code.
  // 0x48 0x90 is a suboptimal encoding for this operation (plain 0x90 does the same
  // and is what gcc + gas produce), but it is what clang <= 8 does.
  if (IsAccumulator(src) && IsAccumulator(dest)) {
    Emit8(0x90);
  } else if (IsAccumulator(src) || IsAccumulator(dest)) {
    Register other = IsAccumulator(src) ? dest : src;
    EmitInstruction<0x90>(Register64Bit(other));
  } else {
    // Clang 8 (after r330298) puts dest before src.  We compare our output to clang's
    // in an exhaustive test, thus we want to match clang's behavior exactly.
    EmitInstruction<0x87>(Register64Bit(dest), Register64Bit(src));
  }
}

}  // namespace x86_64

}  // namespace berberis

#endif  // BERBERIS_ASSEMBLER_X86_64_H_