1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef BERBERIS_ASSEMBLER_X86_32_AND_X86_64_H_
18 #define BERBERIS_ASSEMBLER_X86_32_AND_X86_64_H_
19 
20 #include <cstddef>  // std::size_t
21 #include <cstdint>
22 #include <type_traits>  // std::enable_if, std::is_integral
23 
24 #include "berberis/assembler/common.h"
25 #include "berberis/base/bit_util.h"
26 #include "berberis/base/checks.h"
27 
28 namespace berberis {
29 
30 // Assembler includes implementation of most x86 assembler instructions.
31 //
32 // x86-32 and x86-64 assemblers are nearly identical, but difference lies in handling
33 // of very low-level instruction details: almost all instructions on x86-64 could include
34 // REX byte which is needed if new registers (%r8 to %r15 or %xmm8 to %xmm15) are used.
35 //
// To handle that difference efficiently Assembler is a CRTP class: it's parameterized
// by its own descendant and pulls certain functions (e.g. GetHighBit or Rex8Size) from
// its implementation.
39 //
40 // Certain functions are only implemented by its descendant (since there are instructions
41 // which only exist in x86-32 mode and instructions which only exist in x86-64 mode).
42 
43 namespace x86_32 {
44 
45 class Assembler;
46 
47 }  // namespace x86_32
48 
49 namespace x86_64 {
50 
51 class Assembler;
52 
53 }  // namespace x86_64
54 
55 namespace x86_32_and_x86_64 {
56 
57 template <typename DerivedAssemblerType>
58 class Assembler : public AssemblerBase {
59  public:
Assembler(MachineCode * code)60   explicit Assembler(MachineCode* code) : AssemblerBase(code) {}
61 
62   enum class Condition {
63     kInvalidCondition = -1,
64 
65     kOverflow = 0,
66     kNoOverflow = 1,
67     kBelow = 2,
68     kAboveEqual = 3,
69     kEqual = 4,
70     kNotEqual = 5,
71     kBelowEqual = 6,
72     kAbove = 7,
73     kNegative = 8,
74     kPositiveOrZero = 9,
75     kParityEven = 10,
76     kParityOdd = 11,
77     kLess = 12,
78     kGreaterEqual = 13,
79     kLessEqual = 14,
80     kGreater = 15,
81     kAlways = 16,
82     kNever = 17,
83 
84     // aka...
85     kCarry = kBelow,
86     kNotCarry = kAboveEqual,
87     kZero = kEqual,
88     kNotZero = kNotEqual,
89     kSign = kNegative,
90     kNotSign = kPositiveOrZero
91   };
92 
GetCondName(Condition cond)93   friend constexpr const char* GetCondName(Condition cond) {
94     switch (cond) {
95       case Condition::kOverflow:
96         return "O";
97       case Condition::kNoOverflow:
98         return "NO";
99       case Condition::kBelow:
100         return "B";
101       case Condition::kAboveEqual:
102         return "AE";
103       case Condition::kEqual:
104         return "Z";
105       case Condition::kNotEqual:
106         return "NZ";
107       case Condition::kBelowEqual:
108         return "BE";
109       case Condition::kAbove:
110         return "A";
111       case Condition::kNegative:
112         return "N";
113       case Condition::kPositiveOrZero:
114         return "PL";
115       case Condition::kParityEven:
116         return "PE";
117       case Condition::kParityOdd:
118         return "PO";
119       case Condition::kLess:
120         return "LS";
121       case Condition::kGreaterEqual:
122         return "GE";
123       case Condition::kLessEqual:
124         return "LE";
125       case Condition::kGreater:
126         return "GT";
127       default:
128         return "??";
129     }
130   }
131 
132   class Register {
133    public:
134     constexpr bool operator==(const Register& reg) const { return num_ == reg.num_; }
135     constexpr bool operator!=(const Register& reg) const { return num_ != reg.num_; }
GetPhysicalIndex()136     constexpr uint8_t GetPhysicalIndex() { return num_; }
ValueForFmtSpec(Register value)137     friend constexpr uint8_t ValueForFmtSpec(Register value) { return value.num_; }
138     friend class Assembler<DerivedAssemblerType>;
139     friend class x86_32::Assembler;
140     friend class x86_64::Assembler;
141 
142    private:
Register(uint8_t num)143     explicit constexpr Register(uint8_t num) : num_(num) {}
144     uint8_t num_;
145   };
146 
147   class X87Register {
148    public:
149     constexpr bool operator==(const Register& reg) const { return num_ == reg.num_; }
150     constexpr bool operator!=(const Register& reg) const { return num_ != reg.num_; }
GetPhysicalIndex()151     constexpr uint8_t GetPhysicalIndex() { return num_; }
ValueForFmtSpec(X87Register value)152     friend constexpr uint8_t ValueForFmtSpec(X87Register value) { return value.num_; }
153     friend class Assembler<DerivedAssemblerType>;
154     friend class x86_32::Assembler;
155     friend class x86_64::Assembler;
156 
157    private:
X87Register(uint8_t num)158     explicit constexpr X87Register(uint8_t num) : num_(num) {}
159     uint8_t num_;
160   };
161 
162   static constexpr X87Register st{0};
163   static constexpr X87Register st0{0};
164   static constexpr X87Register st1{1};
165   static constexpr X87Register st2{2};
166   static constexpr X87Register st3{3};
167   static constexpr X87Register st4{4};
168   static constexpr X87Register st5{5};
169   static constexpr X87Register st6{6};
170   static constexpr X87Register st7{7};
171 
172   template <int kBits>
173   class SIMDRegister {
174    public:
175     constexpr bool operator==(const SIMDRegister& reg) const { return num_ == reg.num_; }
176     constexpr bool operator!=(const SIMDRegister& reg) const { return num_ != reg.num_; }
GetPhysicalIndex()177     constexpr uint8_t GetPhysicalIndex() { return num_; }
ValueForFmtSpec(SIMDRegister value)178     friend constexpr uint8_t ValueForFmtSpec(SIMDRegister value) { return value.num_; }
179     friend class Assembler<DerivedAssemblerType>;
180     friend class x86_32::Assembler;
181     friend class x86_64::Assembler;
182     friend class SIMDRegister<384 - kBits>;
183 
To128Bit()184     constexpr auto To128Bit() const {
185       return std::enable_if_t<kBits != 128, SIMDRegister<128>>{num_};
186     }
To256Bit()187     constexpr auto To256Bit() const {
188       return std::enable_if_t<kBits != 256, SIMDRegister<256>>{num_};
189     }
190 
191    private:
SIMDRegister(uint8_t num)192     explicit constexpr SIMDRegister(uint8_t num) : num_(num) {}
193     uint8_t num_;
194   };
195 
196   using XMMRegister = SIMDRegister<128>;
197   using YMMRegister = SIMDRegister<256>;
198 
199   enum ScaleFactor { kTimesOne = 0, kTimesTwo = 1, kTimesFour = 2, kTimesEight = 3 };
200 
201   struct Operand {
rexOperand202     constexpr uint8_t rex() const {
203       return DerivedAssemblerType::kIsX86_64
204                  ? ((index.num_ & 0x08) >> 2) | ((base.num_ & 0x08) >> 3)
205                  : 0;
206     }
207 
RequiresRexOperand208     constexpr bool RequiresRex() const {
209       return DerivedAssemblerType::kIsX86_64 ? ((index.num_ & 0x08) | (base.num_ & 0x08)) : false;
210     }
211 
212     Register base = DerivedAssemblerType::no_register;
213     Register index = DerivedAssemblerType::no_register;
214     ScaleFactor scale = kTimesOne;
215     int32_t disp = 0;
216   };
217 
218   struct LabelOperand {
219     const Label& label;
220   };
221 
222   // Macro operations.
Finalize()223   void Finalize() { ResolveJumps(); }
224 
P2Align(uint32_t m)225   void P2Align(uint32_t m) {
226     uint32_t mask = m - 1;
227     uint32_t addr = pc();
228     Nop((m - (addr & mask)) & mask);
229   }
230 
Nop(uint32_t bytes)231   void Nop(uint32_t bytes) {
232     static const uint32_t kNumNops = 15;
233     static const uint8_t nop1[] = {0x90};
234     static const uint8_t nop2[] = {0x66, 0x90};
235     static const uint8_t nop3[] = {0x0f, 0x1f, 0x00};
236     static const uint8_t nop4[] = {0x0f, 0x1f, 0x40, 0x00};
237     static const uint8_t nop5[] = {0x0f, 0x1f, 0x44, 0x00, 0x00};
238     static const uint8_t nop6[] = {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x0};
239     static const uint8_t nop7[] = {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x0, 0x00};
240     static const uint8_t nop8[] = {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
241     static const uint8_t nop9[] = {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
242     static const uint8_t nop10[] = {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
243     static const uint8_t nop11[] = {
244         0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
245     static const uint8_t nop12[] = {
246         0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
247     static const uint8_t nop13[] = {
248         0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
249     static const uint8_t nop14[] = {
250         0x66, 0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
251     static const uint8_t nop15[] = {
252         0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
253 
254     static const uint8_t* nops[kNumNops] = {nop1,
255                                             nop2,
256                                             nop3,
257                                             nop4,
258                                             nop5,
259                                             nop6,
260                                             nop7,
261                                             nop8,
262                                             nop9,
263                                             nop10,
264                                             nop11,
265                                             nop12,
266                                             nop13,
267                                             nop14,
268                                             nop15};
269     // Common case.
270     if (bytes == 1) {
271       Emit8(nop1[0]);
272       return;
273     }
274 
275     while (bytes > 0) {
276       uint32_t len = bytes;
277       if (len > kNumNops) {
278         len = kNumNops;
279       }
280       EmitSequence(nops[len - 1], len);
281       bytes -= len;
282     }
283   }
284 
285 // Instructions.
286 #include "berberis/assembler/gen_assembler_x86_32_and_x86_64-inl.h"  // NOLINT generated file
287 
288   // Flow control.
JmpRel(int32_t offset)289   void JmpRel(int32_t offset) {
290     CHECK_GE(offset, INT32_MIN + 2);
291     int32_t short_offset = offset - 2;
292     if (IsInRange<int8_t>(short_offset)) {
293       Emit8(0xeb);
294       Emit8(static_cast<int8_t>(short_offset));
295     } else {
296       CHECK_GE(offset, INT32_MIN + 5);
297       Emit8(0xe9);
298       Emit32(offset - 5);
299     }
300   }
301 
Call(int32_t offset)302   void Call(int32_t offset) {
303     CHECK_GE(offset, INT32_MIN + 5);
304     Emit8(0xe8);
305     Emit32(offset - 5);
306   }
307 
JccRel(Condition cc,int32_t offset)308   void JccRel(Condition cc, int32_t offset) {
309     if (cc == Condition::kAlways) {
310       JmpRel(offset);
311       return;
312     }
313     if (cc == Condition::kNever) {
314       return;
315     }
316     CHECK_EQ(0, static_cast<uint8_t>(cc) & 0xf0);
317     CHECK_GE(offset, INT32_MIN + 2);
318     int32_t short_offset = offset - 2;
319     if (IsInRange<int8_t>(short_offset)) {
320       Emit8(0x70 | static_cast<uint8_t>(cc));
321       Emit8(static_cast<int8_t>(short_offset));
322     } else {
323       CHECK_GE(offset, INT32_MIN + 6);
324       Emit8(0x0f);
325       Emit8(0x80 | static_cast<uint8_t>(cc));
326       Emit32(offset - 6);
327     }
328   }
329 
330  protected:
331   // Helper types to distinguish argument types.
332   struct Register8Bit {
Register8BitRegister8Bit333     explicit constexpr Register8Bit(Register reg) : num_(reg.num_) {}
334     uint8_t num_;
335   };
336 
337   // Any register number that doesn't need special processing.
338   struct SizeAgnosticRegister {
SizeAgnosticRegisterSizeAgnosticRegister339     explicit constexpr SizeAgnosticRegister(Register reg) : num_(reg.num_) {}
SizeAgnosticRegisterSizeAgnosticRegister340     explicit constexpr SizeAgnosticRegister(XMMRegister reg) : num_(reg.num_) {}
SizeAgnosticRegisterSizeAgnosticRegister341     explicit constexpr SizeAgnosticRegister(YMMRegister reg) : num_(reg.num_) {}
342     uint8_t num_;
343   };
344 
345   // 16-bit, 32bit, 128-bit, and 256bit vector registers don't need special rules.
346   using Register16Bit = SizeAgnosticRegister;
347   using Register32Bit = SizeAgnosticRegister;
348   using VectorRegister128Bit = SizeAgnosticRegister;
349   using VectorRegister256Bit = SizeAgnosticRegister;
350   // Certain instructions (Enter/Leave, Jcc/Jmp/Loop, Call/Ret, Push/Pop) always operate
351   // on registers of default size (32-bit in 32-bit mode, 64-bit in 64-bit mode (see
352   // "Instructions Not Requiring REX Prefix in 64-Bit Mode" table in 24594 AMD Manual)
353   // Map these to SizeAgnosticRegister, too, since they don't need REX.W even in 64-bit mode.
354   //
355   // x87 instructions fall into that category, too, since they were not expanded in x86-64 mode.
356   using RegisterDefaultBit = SizeAgnosticRegister;
357 
358   // Any memory address that doesn't need special processing.
359   struct SizeAgnosticMemory {
SizeAgnosticMemorySizeAgnosticMemory360     explicit SizeAgnosticMemory(const Operand& op) : operand(op) {}
361     Operand operand;
362   };
363 
364   // 8-bit, 16-bit, 128-bit memory behave the same as 32-bit memory.
365   // Only 64-bit memory is different.
366   using Memory8Bit = SizeAgnosticMemory;
367   using Memory16Bit = SizeAgnosticMemory;
368   using Memory32Bit = SizeAgnosticMemory;
369   // Some instructions have memory operand that have unspecified size (lea, prefetch, etc),
370   // they are encoded like SizeAgnosticMemory, anyway.
371   using MemoryDefaultBit = SizeAgnosticMemory;
372   // X87 instructions always use the same encoding - even for 64-bit or 28-bytes
373   // memory operands (like in fldenv/fnstenv)
374   using MemoryX87 = SizeAgnosticMemory;
375   using MemoryX8716Bit = SizeAgnosticMemory;
376   using MemoryX8732Bit = SizeAgnosticMemory;
377   using MemoryX8764Bit = SizeAgnosticMemory;
378   using MemoryX8780Bit = SizeAgnosticMemory;
379   // Most vector instructions don't need to use REX.W to access 64-bit or 128-bit memory.
380   using VectorMemory32Bit = SizeAgnosticMemory;
381   using VectorMemory64Bit = SizeAgnosticMemory;
382   using VectorMemory128Bit = SizeAgnosticMemory;
383   using VectorMemory256Bit = SizeAgnosticMemory;
384 
385   // Labels types for memory quantities.  Note that names are similar to the ones before because
386   // they are autogenerated.  E.g. VectorLabel32Bit should be read as “VECTOR's operation LABEL
387   // for 32-BIT quantity in memory”.
388   struct Label32Bit {
Label32BitLabel32Bit389     explicit Label32Bit(const struct LabelOperand& l) : label(l.label) {}
390     const Label& label;
391   };
392 
393   // 8-bit, 16-bit, 128-bit memory behave the same as 32-bit memory.
394   // Only 64-bit memory is different.
395   using Label8Bit = Label32Bit;
396   using Label16Bit = Label32Bit;
397   // Some instructions have memory operand that have unspecified size (lea, prefetch, etc),
398   // they are encoded like Label32Bit, anyway.
399   using LabelDefaultBit = Label32Bit;
400   // X87 instructions always use the same encoding - even for 64-bit or 28-bytes
401   // memory operands (like in fldenv/fnstenv)
402   using LabelX87 = Label32Bit;
403   using LabelX8716Bit = Label32Bit;
404   using LabelX8732Bit = Label32Bit;
405   using LabelX8764Bit = Label32Bit;
406   using LabelX8780Bit = Label32Bit;
407   // Most vector instructions don't need to use REX.W to access 64-bit or 128-bit memory.
408   using VectorLabel32Bit = Label32Bit;
409   using VectorLabel64Bit = Label32Bit;
410   using VectorLabel128Bit = Label32Bit;
411 
IsLegacyPrefix(int code)412   static constexpr bool IsLegacyPrefix(int code) {
413     // Legacy prefixes used as opcode extensions in SSE.
414     // Lock is used by cmpxchg.
415     return (code == 0x66) || (code == 0xf2) || (code == 0xf3) || (code == 0xf0);
416   }
417 
418   // Delegate check to Assembler::template IsRegister.
419   template <typename ArgumentType>
420   struct IsCondition {
421     static constexpr bool value = std::is_same_v<ArgumentType, Condition>;
422   };
423 
424   template <typename ArgumentType>
425   struct IsRegister {
426     static constexpr bool value = DerivedAssemblerType::template IsRegister<ArgumentType>::value ||
427                                   std::is_same_v<ArgumentType, X87Register>;
428   };
429 
430   template <typename ArgumentType>
431   struct IsMemoryOperand {
432     static constexpr bool value =
433         DerivedAssemblerType::template IsMemoryOperand<ArgumentType>::value;
434   };
435 
436   template <typename ArgumentType>
437   struct IsLabelOperand {
438     static constexpr bool value =
439         DerivedAssemblerType::template IsLabelOperand<ArgumentType>::value;
440   };
441 
442   template <typename ArgumentType>
443   struct IsImmediate {
444     static constexpr bool value =
445         std::is_integral_v<ArgumentType> &&
446         ((sizeof(ArgumentType) == sizeof(int8_t)) || (sizeof(ArgumentType) == sizeof(int16_t)) ||
447          (sizeof(ArgumentType) == sizeof(int32_t)) || (sizeof(ArgumentType) == sizeof(int64_t)));
448   };
449 
450   // Count number of arguments selected by Predicate.
451   template <template <typename> typename Predicate, typename... ArgumentTypes>
452   static constexpr std::size_t kCountArguments =
453       ((Predicate<ArgumentTypes>::value ? 1 : 0) + ... + 0);
454 
455   // Extract arguments selected by Predicate.
456   //
457   // Note: This interface begs for the trick used in EmitFunctionTypeHelper in make_intrinsics.cc
458   // in conjunction with structured bindings.
459   //
460   // Unfortunately returning std::tuple slows down AssemblerTest by about 30% when libc++ and clang
461   // are used together (no slowdown on GCC, no slowdown on clang+libstdc++).
462   //
463   // TODO(http://b/140721204): refactor when it would be safe to return std::tuple from function.
464   //
465   template <std::size_t index,
466             template <typename>
467             typename Predicate,
468             typename ArgumentType,
469             typename... ArgumentTypes>
ArgumentByType(ArgumentType argument,ArgumentTypes...arguments)470   static constexpr auto ArgumentByType(ArgumentType argument, ArgumentTypes... arguments) {
471     if constexpr (Predicate<std::decay_t<ArgumentType>>::value) {
472       if constexpr (index == 0) {
473         return argument;
474       } else {
475         return ArgumentByType<index - 1, Predicate>(arguments...);
476       }
477     } else {
478       return ArgumentByType<index, Predicate>(arguments...);
479     }
480   }
481 
482   // Emit immediates - they always come at the end and don't affect anything except rip-addressig.
EmitImmediates()483   static constexpr void EmitImmediates() {}
484 
485   template <typename FirstArgumentType, typename... ArgumentTypes>
EmitImmediates(FirstArgumentType first_argument,ArgumentTypes...other_arguments)486   void EmitImmediates(FirstArgumentType first_argument, ArgumentTypes... other_arguments) {
487     if constexpr (std::is_integral_v<FirstArgumentType> &&
488                   sizeof(FirstArgumentType) == sizeof(int8_t)) {
489       Emit8(first_argument);
490     } else if constexpr (std::is_integral_v<FirstArgumentType> &&
491                          sizeof(FirstArgumentType) == sizeof(int16_t)) {
492       Emit16(first_argument);
493     } else if constexpr (std::is_integral_v<FirstArgumentType> &&
494                          sizeof(FirstArgumentType) == sizeof(int32_t)) {
495       Emit32(first_argument);
496     } else if constexpr (std::is_integral_v<FirstArgumentType> &&
497                          sizeof(FirstArgumentType) == sizeof(int64_t)) {
498       Emit64(first_argument);
499     }
500     EmitImmediates(other_arguments...);
501   }
502 
503   template <typename ArgumentType>
ImmediateSize()504   static constexpr size_t ImmediateSize() {
505     if constexpr (std::is_integral_v<ArgumentType> && sizeof(ArgumentType) == sizeof(int8_t)) {
506       return 1;
507     } else if constexpr (std::is_integral_v<ArgumentType> &&
508                          sizeof(ArgumentType) == sizeof(int16_t)) {
509       return 2;
510     } else if constexpr (std::is_integral_v<ArgumentType> &&
511                          sizeof(ArgumentType) == sizeof(int32_t)) {
512       return 4;
513     } else if constexpr (std::is_integral_v<ArgumentType> &&
514                          sizeof(ArgumentType) == sizeof(int64_t)) {
515       return 8;
516     } else {
517       static_assert(!std::is_integral_v<ArgumentType>);
518       return 0;
519     }
520   }
521 
522   template <typename... ArgumentTypes>
ImmediatesSize()523   static constexpr size_t ImmediatesSize() {
524     return (ImmediateSize<ArgumentTypes>() + ... + 0);
525   }
526 
527   // Note: We may need separate x87 EmitInstruction if we would want to support
528   // full set of x86 instructions.
529   //
  // That's because 8087 was a completely separate piece of silicon which was only
  // partially driven by 8086:
532   //     https://en.wikipedia.org/wiki/Intel_8087
533   //
534   // In particular it had the following properties:
535   //   1. It had its own separate subset of opcodes - because it did its own decoding.
536   //   2. It had separate set of registers and could *only* access these.
537   //   2a. The 8086, in turn, *couldn't* access these registers at all.
538   //   3. To access memory it was designed to take address from address bus.
539   //
540   // This means that:
541   //   1. x87 instructions are easily recognizable - all instructions with opcodes 0xd8
542   //      to 0xdf are x87 instructions, all instructions with other opcodes are not.
543   //   2. We could be sure that x87 registers would only be used with x87 instructions
544   //      and other types of registers wouldn't be used with these.
545   //   3. We still would use normal registers for memory access, but REX.W bit wouldn't
546   //      be used for 64-bit quantities, whether they are floating point numbers or integers.
547   //
548   // Right now we only use EmitInstruction to emit x87 instructions which are using memory
549   // operands - and it works well enough for that because of #3.
550 
551   // If you want to understand how this function works (and how helper function like Vex and
552   // Rex work), you need good understanding of AMD/Intel Instruction format.
553   //
554   // Intel manual includes the most precise explanation, but it's VERY hard to read.
555   //
556   // AMD manual is much easier to read, but it doesn't include description of EVEX
557   // instructions and is less precise. Diagram on page 2 of Volume 3 is especially helpful:
558   //   https://www.amd.com/system/files/TechDocs/24594.pdf#page=42
559   //
  // And the most concise (albeit unofficial) one is on the osdev Wiki:
561   //   https://wiki.osdev.org/X86-64_Instruction_Encoding
562 
563   // Note: if you change this function (or any of the helper functions) then remove --fast
564   // option from ExhaustiveAssemblerTest to run full blackbox comparison to clang.
565 
566   template <uint8_t... kOpcodes, typename... ArgumentsTypes>
EmitInstruction(ArgumentsTypes...arguments)567   void EmitInstruction(ArgumentsTypes... arguments) {
    // Emits one instruction. kOpcodes lists the encoding bytes in order: legacy prefixes, then
    // either a three-byte VEX/XOP group or optional 0x0F / 0x0F 0x38 / 0x0F 0x3A escape bytes,
    // then the main opcode byte and at most one more byte. |arguments| are the operands; they
    // are classified below with IsCondition, IsMemoryOperand, IsLabelOperand, IsRegister and
    // IsImmediate.
568     static constexpr auto kOpcodesArray = std::array{kOpcodes...};
    // Number of leading bytes accepted by IsLegacyPrefix; they are emitted first, unchanged.
569     static constexpr size_t kLegacyPrefixesCount = []() {
570       size_t legacy_prefixes_count = 0;
571       for (legacy_prefixes_count = 0; IsLegacyPrefix(kOpcodesArray[legacy_prefixes_count]);
572            ++legacy_prefixes_count) {
573       }
574       return legacy_prefixes_count;
575     }();
576     for (size_t legacy_prefixes_index = 0; legacy_prefixes_index < kLegacyPrefixesCount;
577          ++legacy_prefixes_index) {
578       Emit8(kOpcodesArray[legacy_prefixes_index]);
579     }
580     // We don't yet support any XOP-encoded instructions, but they are 100% identical to vex ones,
581     // except they are using 0x8F prefix, not 0xC4 prefix.
582     constexpr auto kVexOrXop = []() {
583       if constexpr (std::size(kOpcodesArray) < kLegacyPrefixesCount + 3) {
584         return false;
585         // Note that JSON files use AMD approach: bytes are specified as in AMD manual (only we are
586         // replacing ¬R/¬X/¬B and vvvv bits with zeros).
587         //
588         // In particular it means that vex-encoded instructions should be specified with 0xC4 even
589         // if they are always emitted with 0xC4-to-0xC5 folding.
590       } else if constexpr (kOpcodesArray[kLegacyPrefixesCount] == 0xC4 ||
591                            kOpcodesArray[kLegacyPrefixesCount] == 0x8F) {
592         return true;
593       }
594       return false;
595     }();
    // How many arguments of each kind were passed.
596     constexpr auto conditions_count = kCountArguments<IsCondition, ArgumentsTypes...>;
597     constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
598     constexpr auto labels_count = kCountArguments<IsLabelOperand, ArgumentsTypes...>;
599     constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
600     // We need to know if Reg field (in ModRM byte) is an opcode extension or if opcode extension
601     // goes into the immediate field.
602     constexpr auto reg_is_opcode_extension =
603         (registers_count + operands_count > 0) &&
604         (registers_count + operands_count + labels_count <
605          2 + kVexOrXop * (std::size(kOpcodesArray) - kLegacyPrefixesCount - 4));
606     static_assert((registers_count + operands_count + labels_count + conditions_count +
607                    kCountArguments<IsImmediate, ArgumentsTypes...>) == sizeof...(ArgumentsTypes),
608                   "Only registers (with specified size), Operands (with specified size), "
609                   "Conditions, and Immediates are supported.");
610     static_assert(operands_count <= 1, "Only one operand is allowed in instruction.");
611     static_assert(labels_count <= 1, "Only one label is allowed in instruction.");
612     // 0x0f is an opcode extension, if it's not there then we only have one byte opcode.
    // The value computed here is the index of the main opcode byte in kOpcodesArray.
613     const size_t kPrefixesAndOpcodeExtensionsCount = []() {
614       if constexpr (kVexOrXop) {
615         static_assert(conditions_count == 0,
616                       "No conditionals are supported in vex/xop instructions.");
617         static_assert((registers_count + operands_count + labels_count) <= 4,
618                       "Up to four-arguments in vex/xop instructions are supported.");
619         return kLegacyPrefixesCount + 3;
620       } else {
621         static_assert(conditions_count <= 1, "Only one condition is allowed in instruction.");
622         static_assert((registers_count + operands_count + labels_count) <= 2,
623                       "Only two-arguments legacy instructions are supported.");
624         if constexpr (kOpcodesArray[kLegacyPrefixesCount] == 0x0F) {
625           if constexpr (kOpcodesArray[kLegacyPrefixesCount + 1] == 0x38 ||
626                         kOpcodesArray[kLegacyPrefixesCount + 1] == 0x3A) {
627             return kLegacyPrefixesCount + 2;
628           }
629           return kLegacyPrefixesCount + 1;
630         }
631         return kLegacyPrefixesCount;
632       }
633     }();
    // VEX/XOP: the derived assembler's EmitVex gets the three bytes of the group as template
    // arguments. Otherwise EmitRex is called first and the escape bytes found between the legacy
    // prefixes and the main opcode are emitted after it.
634     if constexpr (kVexOrXop) {
635       static_cast<DerivedAssemblerType*>(this)
636           ->template EmitVex<kOpcodesArray[kLegacyPrefixesCount],
637                              kOpcodesArray[kLegacyPrefixesCount + 1],
638                              kOpcodesArray[kLegacyPrefixesCount + 2],
639                              reg_is_opcode_extension>(arguments...);
640     } else {
641       static_cast<DerivedAssemblerType*>(this)->EmitRex(arguments...);
642       for (size_t extension_opcode_index = kLegacyPrefixesCount;
643            extension_opcode_index < kPrefixesAndOpcodeExtensionsCount;
644            ++extension_opcode_index) {
645         Emit8(kOpcodesArray[extension_opcode_index]);
646       }
647     }
648     // These are older 8086 instructions which encode register number in the opcode itself.
649     if constexpr (registers_count == 1 && operands_count == 0 && labels_count == 0 &&
650                   std::size(kOpcodesArray) == kPrefixesAndOpcodeExtensionsCount + 1) {
651       static_cast<DerivedAssemblerType*>(this)->EmitRegisterInOpcode(
652           kOpcodesArray[kPrefixesAndOpcodeExtensionsCount],
653           ArgumentByType<0, IsRegister>(arguments...));
654       EmitImmediates(arguments...);
655     } else {
656       // Emit "main" single-byte opcode.
657       if constexpr (conditions_count == 1) {
        // The condition code (checked to fit into four bits) is ORed into the opcode byte.
658         auto condition_code = static_cast<uint8_t>(ArgumentByType<0, IsCondition>(arguments...));
659         CHECK_EQ(0, condition_code & 0xF0);
660         Emit8(kOpcodesArray[kPrefixesAndOpcodeExtensionsCount] | condition_code);
661       } else {
662         Emit8(kOpcodesArray[kPrefixesAndOpcodeExtensionsCount]);
663       }
      // When the reg field is an opcode extension, the byte after the main opcode is passed to
      // the derived assembler in place of a register argument.
664       if constexpr (reg_is_opcode_extension) {
665         if constexpr (operands_count == 1) {
666           static_cast<DerivedAssemblerType*>(this)->EmitOperandOp(
667               static_cast<int>(kOpcodesArray[kPrefixesAndOpcodeExtensionsCount + 1]),
668               ArgumentByType<0, IsMemoryOperand>(arguments...).operand);
669         } else if constexpr (labels_count == 1) {
670           static_cast<DerivedAssemblerType*>(this)
671               ->template EmitRipOp<ImmediatesSize<ArgumentsTypes...>()>(
672                   static_cast<int>(kOpcodesArray[kPrefixesAndOpcodeExtensionsCount + 1]),
673                   ArgumentByType<0, IsLabelOperand>(arguments...).label);
674         } else {
675           static_cast<DerivedAssemblerType*>(this)->EmitModRM(
676               kOpcodesArray[kPrefixesAndOpcodeExtensionsCount + 1],
677               ArgumentByType<0, IsRegister>(arguments...));
678         }
679       } else if constexpr (registers_count > 0) {
680         if constexpr (operands_count == 1) {
681           static_cast<DerivedAssemblerType*>(this)->EmitOperandOp(
682               ArgumentByType<0, IsRegister>(arguments...),
683               ArgumentByType<0, IsMemoryOperand>(arguments...).operand);
684         } else if constexpr (labels_count == 1) {
685           static_cast<DerivedAssemblerType*>(this)
686               ->template EmitRipOp<ImmediatesSize<ArgumentsTypes...>()>(
687                   ArgumentByType<0, IsRegister>(arguments...),
688                   ArgumentByType<0, IsLabelOperand>(arguments...).label);
689         } else {
690           static_cast<DerivedAssemblerType*>(this)->EmitModRM(
691               ArgumentByType<0, IsRegister>(arguments...),
692               ArgumentByType<1, IsRegister>(arguments...));
693         }
694       }
695       // If reg is an opcode extension then we already used that element.
696       if constexpr (reg_is_opcode_extension) {
697         static_assert(std::size(kOpcodesArray) == kPrefixesAndOpcodeExtensionsCount + 2);
698       } else if constexpr (std::size(kOpcodesArray) > kPrefixesAndOpcodeExtensionsCount + 1) {
699         // Final opcode byte(s) - they are in the place where immediate is expected.
700         // Cmpsps/Cmppd and 3DNow! instructions are using it.
701         static_assert(std::size(kOpcodesArray) == kPrefixesAndOpcodeExtensionsCount + 2);
702         Emit8(kOpcodesArray[kPrefixesAndOpcodeExtensionsCount + 1]);
703       }
      // With four register/memory/label arguments, the number of the last register goes into
      // the high nibble of one trailing byte, ORed with the immediate when there is one.
      // In every other case the immediates are emitted as they are.
704       if constexpr (registers_count + operands_count + labels_count == 4) {
705         if constexpr (kCountArguments<IsImmediate, ArgumentsTypes...> == 1) {
706           Emit8((ArgumentByType<registers_count - 1, IsRegister>(arguments...).num_ << 4) |
707                 ArgumentByType<0, IsImmediate>(arguments...));
708         } else {
709           static_assert(kCountArguments<IsImmediate, ArgumentsTypes...> == 0);
710           Emit8(ArgumentByType<registers_count - 1, IsRegister>(arguments...).num_ << 4);
711         }
712       } else {
713         EmitImmediates(arguments...);
714       }
715     }
716   }
717 
  // Normally instruction arguments come in the following order: vex, rm, reg, imm.
  // But certain instructions take their arguments in a different order.
  // In addition, there is a special case where two arguments may be swapped
  // to reduce encoding size.
722 
723   template <uint8_t... kOpcodes,
724             typename ArgumentsType0,
725             typename ArgumentsType1,
726             typename... ArgumentsTypes>
EmitRegToRmInstruction(ArgumentsType0 && argument0,ArgumentsType1 && argument1,ArgumentsTypes &&...arguments)727   void EmitRegToRmInstruction(ArgumentsType0&& argument0,
728                               ArgumentsType1&& argument1,
729                               ArgumentsTypes&&... arguments) {
730     return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType1>(argument1),
731                                         std::forward<ArgumentsType0>(argument0),
732                                         std::forward<ArgumentsTypes>(arguments)...);
733   }
734 
735   template <uint8_t... kOpcodes,
736             typename ArgumentsType0,
737             typename ArgumentsType1,
738             typename... ArgumentsTypes>
EmitRmToVexInstruction(ArgumentsType0 && argument0,ArgumentsType1 && argument1,ArgumentsTypes &&...arguments)739   void EmitRmToVexInstruction(ArgumentsType0&& argument0,
740                               ArgumentsType1&& argument1,
741                               ArgumentsTypes&&... arguments) {
742     return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType1>(argument1),
743                                         std::forward<ArgumentsType0>(argument0),
744                                         std::forward<ArgumentsTypes>(arguments)...);
745   }
746 
747   // If vex operand is one of first 8 registers and rm operand is not then swapping these two
748   // operands produces more compact encoding.
749   // This only works with commutative instructions from first opcode map.
750   template <uint8_t... kOpcodes,
751             typename ArgumentsType0,
752             typename ArgumentsType1,
753             typename ArgumentsType2,
754             typename... ArgumentsTypes>
EmitOptimizableUsingCommutationInstruction(ArgumentsType0 && argument0,ArgumentsType1 && argument1,ArgumentsType2 && argument2,ArgumentsTypes &&...arguments)755   void EmitOptimizableUsingCommutationInstruction(ArgumentsType0&& argument0,
756                                                   ArgumentsType1&& argument1,
757                                                   ArgumentsType2&& argument2,
758                                                   ArgumentsTypes&&... arguments) {
759     if constexpr (std::is_same_v<ArgumentsType2, ArgumentsType1>) {
760       if (DerivedAssemblerType::IsSwapProfitable(std::forward<ArgumentsType2>(argument2),
761                                                  std::forward<ArgumentsType1>(argument1))) {
762         return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType0>(argument0),
763                                             std::forward<ArgumentsType1>(argument1),
764                                             std::forward<ArgumentsType2>(argument2),
765                                             std::forward<ArgumentsTypes>(arguments)...);
766       }
767     }
768     return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType0>(argument0),
769                                         std::forward<ArgumentsType2>(argument2),
770                                         std::forward<ArgumentsType1>(argument1),
771                                         std::forward<ArgumentsTypes>(arguments)...);
772   }
773 
774   template <uint8_t... kOpcodes,
775             typename ArgumentsType0,
776             typename ArgumentsType1,
777             typename ArgumentsType2,
778             typename ArgumentsType3,
779             typename... ArgumentsTypes>
EmitVexImmRmToRegInstruction(ArgumentsType0 && argument0,ArgumentsType1 && argument1,ArgumentsType2 && argument2,ArgumentsType3 && argument3,ArgumentsTypes &&...arguments)780   void EmitVexImmRmToRegInstruction(ArgumentsType0&& argument0,
781                                     ArgumentsType1&& argument1,
782                                     ArgumentsType2&& argument2,
783                                     ArgumentsType3&& argument3,
784                                     ArgumentsTypes&&... arguments) {
785     return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType0>(argument0),
786                                         std::forward<ArgumentsType3>(argument3),
787                                         std::forward<ArgumentsType1>(argument1),
788                                         std::forward<ArgumentsType2>(argument2),
789                                         std::forward<ArgumentsTypes>(arguments)...);
790   }
791 
792   template <uint8_t... kOpcodes,
793             typename ArgumentsType0,
794             typename ArgumentsType1,
795             typename ArgumentsType2,
796             typename ArgumentsType3,
797             typename... ArgumentsTypes>
EmitVexRmImmToRegInstruction(ArgumentsType0 && argument0,ArgumentsType1 && argument1,ArgumentsType2 && argument2,ArgumentsType3 && argument3,ArgumentsTypes &&...arguments)798   void EmitVexRmImmToRegInstruction(ArgumentsType0&& argument0,
799                                     ArgumentsType1&& argument1,
800                                     ArgumentsType2&& argument2,
801                                     ArgumentsType3&& argument3,
802                                     ArgumentsTypes&&... arguments) {
803     return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType0>(argument0),
804                                         std::forward<ArgumentsType2>(argument2),
805                                         std::forward<ArgumentsType1>(argument1),
806                                         std::forward<ArgumentsType3>(argument3),
807                                         std::forward<ArgumentsTypes>(arguments)...);
808   }
809 
810   template <uint8_t... kOpcodes,
811             typename ArgumentsType0,
812             typename ArgumentsType1,
813             typename ArgumentsType2,
814             typename... ArgumentsTypes>
EmitVexRmToRegInstruction(ArgumentsType0 && argument0,ArgumentsType1 && argument1,ArgumentsType2 && argument2,ArgumentsTypes &&...arguments)815   void EmitVexRmToRegInstruction(ArgumentsType0&& argument0,
816                                  ArgumentsType1&& argument1,
817                                  ArgumentsType2&& argument2,
818                                  ArgumentsTypes&&... arguments) {
819     return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType0>(argument0),
820                                         std::forward<ArgumentsType2>(argument2),
821                                         std::forward<ArgumentsType1>(argument1),
822                                         std::forward<ArgumentsTypes>(arguments)...);
823   }
824 
  // Processes every entry recorded in jumps_ (see the definition later in this file). All
  // labels referenced by those entries must already be bound when this is called; the
  // definition CHECKs that.
  void ResolveJumps();
826 
827  private:
828   Assembler() = delete;
829   Assembler(const Assembler&) = delete;
830   Assembler(Assembler&&) = delete;
831   void operator=(const Assembler&) = delete;
832   void operator=(Assembler&&) = delete;
833 };
834 
835 template <typename DerivedAssemblerType>
Pmov(XMMRegister dest,XMMRegister src)836 inline void Assembler<DerivedAssemblerType>::Pmov(XMMRegister dest, XMMRegister src) {
837   // SSE does not have operations for register-to-register integer move and
838   // Intel explicitly recommends to use pshufd instead on Pentium4:
839   //   See https://software.intel.com/en-us/articles/
840   //               fast-simd-integer-move-for-the-intel-pentiumr-4-processor
841   // These recommendations are CPU-dependent, though, thus we will need to
842   // investigate this question further before we could decide when to use
843   // movaps (or movapd) and when to use pshufd.
844   //
845   // TODO(khim): investigate performance problems related to integer MOVs
846   Movaps(dest, src);
847 }
848 
849 template <typename DerivedAssemblerType>
Call(const Label & label)850 inline void Assembler<DerivedAssemblerType>::Call(const Label& label) {
851   if (label.IsBound()) {
852     int32_t offset = label.position() - pc();
853     Call(offset);
854   } else {
855     Emit8(0xe8);
856     Emit32(0xfffffffc);
857     jumps_.push_back(Jump{&label, pc() - 4, false});
858   }
859 }
860 
861 template <typename DerivedAssemblerType>
Jcc(Condition cc,const Label & label)862 inline void Assembler<DerivedAssemblerType>::Jcc(Condition cc, const Label& label) {
863   if (cc == Condition::kAlways) {
864     Jmp(label);
865     return;
866   } else if (cc == Condition::kNever) {
867     return;
868   }
869   CHECK_EQ(0, static_cast<uint8_t>(cc) & 0xF0);
870   // TODO(eaeltsin): may be remove IsBound case?
871   // Then jcc by label will be of fixed size (5 bytes)
872   if (label.IsBound()) {
873     int32_t offset = label.position() - pc();
874     JccRel(cc, offset);
875   } else {
876     Emit16(0x800f | (static_cast<uint8_t>(cc) << 8));
877     Emit32(0xfffffffc);
878     jumps_.push_back(Jump{&label, pc() - 4, false});
879   }
880 }
881 
882 template <typename DerivedAssemblerType>
Jmp(const Label & label)883 inline void Assembler<DerivedAssemblerType>::Jmp(const Label& label) {
884   // TODO(eaeltsin): may be remove IsBound case?
885   // Then jmp by label will be of fixed size (5 bytes)
886   if (label.IsBound()) {
887     int32_t offset = label.position() - pc();
888     JmpRel(offset);
889   } else {
890     Emit8(0xe9);
891     Emit32(0xfffffffc);
892     jumps_.push_back(Jump{&label, pc() - 4, false});
893   }
894 }
895 
896 template <typename DerivedAssemblerType>
ResolveJumps()897 inline void Assembler<DerivedAssemblerType>::ResolveJumps() {
898   for (const auto& jump : jumps_) {
899     const Label* label = jump.label;
900     uint32_t pc = jump.pc;
901     CHECK(label->IsBound());
902     if (jump.is_recovery) {
903       // Add pc -> label correspondence to recovery map.
904       AddRelocation(0, RelocationType::RelocRecoveryPoint, pc, label->position());
905     } else {
906       int32_t offset = label->position() - pc;
907       *AddrAs<int32_t>(pc) += offset;
908     }
909   }
910 }
911 
912 // Code size optimized instructions: they have different variants depending on registers used.
913 
914 template <typename DerivedAssemblerType>
Xchgl(Register dest,Register src)915 inline void Assembler<DerivedAssemblerType>::Xchgl(Register dest, Register src) {
916   if (DerivedAssemblerType::IsAccumulator(src) || DerivedAssemblerType::IsAccumulator(dest)) {
917     Register other = DerivedAssemblerType::IsAccumulator(src) ? dest : src;
918     EmitInstruction<0x90>(SizeAgnosticRegister(other));
919   } else {
920     // Clang 8 (after r330298) puts dest before src.  We are comparing output
921     // to clang in exhaustive test thus we want to match clang behavior exactly.
922     EmitInstruction<0x87>(SizeAgnosticRegister(dest), SizeAgnosticRegister(src));
923   }
924 }
925 
926 }  // namespace x86_32_and_x86_64
927 
928 }  // namespace berberis
929 
930 #endif  // BERBERIS_ASSEMBLER_X86_32_AND_X86_64_H_
931