1 /*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef BERBERIS_ASSEMBLER_X86_32_AND_X86_64_H_
18 #define BERBERIS_ASSEMBLER_X86_32_AND_X86_64_H_
19
20 #include <cstddef> // std::size_t
21 #include <cstdint>
22 #include <type_traits> // std::enable_if, std::is_integral
23
24 #include "berberis/assembler/common.h"
25 #include "berberis/base/bit_util.h"
26 #include "berberis/base/checks.h"
27
28 namespace berberis {
29
30 // Assembler includes implementation of most x86 assembler instructions.
31 //
32 // x86-32 and x86-64 assemblers are nearly identical, but difference lies in handling
33 // of very low-level instruction details: almost all instructions on x86-64 could include
34 // REX byte which is needed if new registers (%r8 to %r15 or %xmm8 to %xmm15) are used.
35 //
36 // To handle that difference efficiently Assembler is CRTP class: it's parameterized
37 // by its own descendant and pull certain functions (e.g. GetHighBit or Rex8Size) from
38 // its implementation.
39 //
40 // Certain functions are only implemented by its descendant (since there are instructions
41 // which only exist in x86-32 mode and instructions which only exist in x86-64 mode).
42
43 namespace x86_32 {
44
45 class Assembler;
46
47 } // namespace x86_32
48
49 namespace x86_64 {
50
51 class Assembler;
52
53 } // namespace x86_64
54
55 namespace x86_32_and_x86_64 {
56
57 template <typename DerivedAssemblerType>
58 class Assembler : public AssemblerBase {
59 public:
  // Binds the assembler to the MachineCode buffer that all instructions are emitted into.
  explicit Assembler(MachineCode* code) : AssemblerBase(code) {}
61
  // x86 condition codes.  Values 0-15 match the hardware encoding used in the low
  // nibble of the Jcc/SETcc/CMOVcc opcode bytes (see JccRel below, which ORs the
  // value into 0x70/0x80).  kAlways/kNever have no hardware encoding and are
  // handled specially by JccRel; kInvalidCondition is a sentinel.
  enum class Condition {
    kInvalidCondition = -1,

    kOverflow = 0,
    kNoOverflow = 1,
    kBelow = 2,
    kAboveEqual = 3,
    kEqual = 4,
    kNotEqual = 5,
    kBelowEqual = 6,
    kAbove = 7,
    kNegative = 8,
    kPositiveOrZero = 9,
    kParityEven = 10,
    kParityOdd = 11,
    kLess = 12,
    kGreaterEqual = 13,
    kLessEqual = 14,
    kGreater = 15,
    kAlways = 16,
    kNever = 17,

    // aka... alternative names for the same encodings.
    kCarry = kBelow,
    kNotCarry = kAboveEqual,
    kZero = kEqual,
    kNotZero = kNotEqual,
    kSign = kNegative,
    kNotSign = kPositiveOrZero
  };
92
GetCondName(Condition cond)93 friend constexpr const char* GetCondName(Condition cond) {
94 switch (cond) {
95 case Condition::kOverflow:
96 return "O";
97 case Condition::kNoOverflow:
98 return "NO";
99 case Condition::kBelow:
100 return "B";
101 case Condition::kAboveEqual:
102 return "AE";
103 case Condition::kEqual:
104 return "Z";
105 case Condition::kNotEqual:
106 return "NZ";
107 case Condition::kBelowEqual:
108 return "BE";
109 case Condition::kAbove:
110 return "A";
111 case Condition::kNegative:
112 return "N";
113 case Condition::kPositiveOrZero:
114 return "PL";
115 case Condition::kParityEven:
116 return "PE";
117 case Condition::kParityOdd:
118 return "PO";
119 case Condition::kLess:
120 return "LS";
121 case Condition::kGreaterEqual:
122 return "GE";
123 case Condition::kLessEqual:
124 return "LE";
125 case Condition::kGreater:
126 return "GT";
127 default:
128 return "??";
129 }
130 }
131
  // General-purpose register operand.  Instances can only be created by the
  // assembler classes themselves (private constructor); users refer to the
  // register constants declared by the derived assemblers.
  class Register {
   public:
    constexpr bool operator==(const Register& reg) const { return num_ == reg.num_; }
    constexpr bool operator!=(const Register& reg) const { return num_ != reg.num_; }
    // Hardware encoding of the register number.
    constexpr uint8_t GetPhysicalIndex() { return num_; }
    // Hook for the berberis format-spec machinery.
    friend constexpr uint8_t ValueForFmtSpec(Register value) { return value.num_; }
    friend class Assembler<DerivedAssemblerType>;
    friend class x86_32::Assembler;
    friend class x86_64::Assembler;

   private:
    explicit constexpr Register(uint8_t num) : num_(num) {}
    uint8_t num_;
  };
146
147 class X87Register {
148 public:
149 constexpr bool operator==(const Register& reg) const { return num_ == reg.num_; }
150 constexpr bool operator!=(const Register& reg) const { return num_ != reg.num_; }
GetPhysicalIndex()151 constexpr uint8_t GetPhysicalIndex() { return num_; }
ValueForFmtSpec(X87Register value)152 friend constexpr uint8_t ValueForFmtSpec(X87Register value) { return value.num_; }
153 friend class Assembler<DerivedAssemblerType>;
154 friend class x86_32::Assembler;
155 friend class x86_64::Assembler;
156
157 private:
X87Register(uint8_t num)158 explicit constexpr X87Register(uint8_t num) : num_(num) {}
159 uint8_t num_;
160 };
161
  // x87 stack register constants; st is the conventional alias for st0 (top of stack).
  static constexpr X87Register st{0};
  static constexpr X87Register st0{0};
  static constexpr X87Register st1{1};
  static constexpr X87Register st2{2};
  static constexpr X87Register st3{3};
  static constexpr X87Register st4{4};
  static constexpr X87Register st5{5};
  static constexpr X87Register st6{6};
  static constexpr X87Register st7{7};
171
  // SIMD vector register: kBits == 128 is XMM, kBits == 256 is YMM (see the
  // aliases below).  Like Register, only the assembler classes may construct one.
  template <int kBits>
  class SIMDRegister {
   public:
    constexpr bool operator==(const SIMDRegister& reg) const { return num_ == reg.num_; }
    constexpr bool operator!=(const SIMDRegister& reg) const { return num_ != reg.num_; }
    // Hardware encoding of the register number.
    constexpr uint8_t GetPhysicalIndex() { return num_; }
    // Hook for the berberis format-spec machinery.
    friend constexpr uint8_t ValueForFmtSpec(SIMDRegister value) { return value.num_; }
    friend class Assembler<DerivedAssemblerType>;
    friend class x86_32::Assembler;
    friend class x86_64::Assembler;
    // Befriend the "other" width of the same register file: 384 - 128 == 256 and
    // 384 - 256 == 128, so each instantiation can construct its counterpart in
    // the conversions below despite the private constructor.
    friend class SIMDRegister<384 - kBits>;

    // View this register as its 128-bit (XMM) form.  The enable_if_t makes the
    // conversion ill-formed - but only if actually instantiated - when the
    // register already is 128-bit.
    constexpr auto To128Bit() const {
      return std::enable_if_t<kBits != 128, SIMDRegister<128>>{num_};
    }
    // View this register as its 256-bit (YMM) form; same enable_if_t trick.
    constexpr auto To256Bit() const {
      return std::enable_if_t<kBits != 256, SIMDRegister<256>>{num_};
    }

   private:
    explicit constexpr SIMDRegister(uint8_t num) : num_(num) {}
    uint8_t num_;
  };
195
  using XMMRegister = SIMDRegister<128>;
  using YMMRegister = SIMDRegister<256>;

  // Scale applied to the index register of a memory operand: index << scale.
  enum ScaleFactor { kTimesOne = 0, kTimesTwo = 1, kTimesFour = 2, kTimesEight = 3 };
200
201 struct Operand {
rexOperand202 constexpr uint8_t rex() const {
203 return DerivedAssemblerType::kIsX86_64
204 ? ((index.num_ & 0x08) >> 2) | ((base.num_ & 0x08) >> 3)
205 : 0;
206 }
207
RequiresRexOperand208 constexpr bool RequiresRex() const {
209 return DerivedAssemblerType::kIsX86_64 ? ((index.num_ & 0x08) | (base.num_ & 0x08)) : false;
210 }
211
212 Register base = DerivedAssemblerType::no_register;
213 Register index = DerivedAssemblerType::no_register;
214 ScaleFactor scale = kTimesOne;
215 int32_t disp = 0;
216 };
217
  // Wraps a Label so that label arguments are distinguishable from other
  // argument kinds in EmitInstruction (see IsLabelOperand below).
  struct LabelOperand {
    const Label& label;
  };
221
  // Macro operations.

  // Must be called once, after all instructions have been emitted, to resolve
  // recorded jump targets (see ResolveJumps, defined out of line).
  void Finalize() { ResolveJumps(); }
224
P2Align(uint32_t m)225 void P2Align(uint32_t m) {
226 uint32_t mask = m - 1;
227 uint32_t addr = pc();
228 Nop((m - (addr & mask)) & mask);
229 }
230
  // Emits |bytes| bytes of padding using multi-byte NOP encodings (up to 15
  // bytes per instruction) so the padding executes in as few instructions as
  // possible.  The tables below are the recommended 0x0f 0x1f "long NOP" forms
  // with operand-size (0x66) and segment (0x2e) prefixes used as filler.
  void Nop(uint32_t bytes) {
    static const uint32_t kNumNops = 15;
    static const uint8_t nop1[] = {0x90};
    static const uint8_t nop2[] = {0x66, 0x90};
    static const uint8_t nop3[] = {0x0f, 0x1f, 0x00};
    static const uint8_t nop4[] = {0x0f, 0x1f, 0x40, 0x00};
    static const uint8_t nop5[] = {0x0f, 0x1f, 0x44, 0x00, 0x00};
    static const uint8_t nop6[] = {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x0};
    static const uint8_t nop7[] = {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x0, 0x00};
    static const uint8_t nop8[] = {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
    static const uint8_t nop9[] = {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
    static const uint8_t nop10[] = {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
    static const uint8_t nop11[] = {
        0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
    static const uint8_t nop12[] = {
        0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
    static const uint8_t nop13[] = {
        0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
    static const uint8_t nop14[] = {
        0x66, 0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};
    static const uint8_t nop15[] = {
        0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00};

    // nops[i] is the (i + 1)-byte encoding.
    static const uint8_t* nops[kNumNops] = {nop1,
                                            nop2,
                                            nop3,
                                            nop4,
                                            nop5,
                                            nop6,
                                            nop7,
                                            nop8,
                                            nop9,
                                            nop10,
                                            nop11,
                                            nop12,
                                            nop13,
                                            nop14,
                                            nop15};
    // Common case.
    if (bytes == 1) {
      Emit8(nop1[0]);
      return;
    }

    // Emit the longest fitting NOP until the requested byte count is covered.
    while (bytes > 0) {
      uint32_t len = bytes;
      if (len > kNumNops) {
        len = kNumNops;
      }
      EmitSequence(nops[len - 1], len);
      bytes -= len;
    }
  }
284
285 // Instructions.
286 #include "berberis/assembler/gen_assembler_x86_32_and_x86_64-inl.h" // NOLINT generated file
287
288 // Flow control.
JmpRel(int32_t offset)289 void JmpRel(int32_t offset) {
290 CHECK_GE(offset, INT32_MIN + 2);
291 int32_t short_offset = offset - 2;
292 if (IsInRange<int8_t>(short_offset)) {
293 Emit8(0xeb);
294 Emit8(static_cast<int8_t>(short_offset));
295 } else {
296 CHECK_GE(offset, INT32_MIN + 5);
297 Emit8(0xe9);
298 Emit32(offset - 5);
299 }
300 }
301
Call(int32_t offset)302 void Call(int32_t offset) {
303 CHECK_GE(offset, INT32_MIN + 5);
304 Emit8(0xe8);
305 Emit32(offset - 5);
306 }
307
JccRel(Condition cc,int32_t offset)308 void JccRel(Condition cc, int32_t offset) {
309 if (cc == Condition::kAlways) {
310 JmpRel(offset);
311 return;
312 }
313 if (cc == Condition::kNever) {
314 return;
315 }
316 CHECK_EQ(0, static_cast<uint8_t>(cc) & 0xf0);
317 CHECK_GE(offset, INT32_MIN + 2);
318 int32_t short_offset = offset - 2;
319 if (IsInRange<int8_t>(short_offset)) {
320 Emit8(0x70 | static_cast<uint8_t>(cc));
321 Emit8(static_cast<int8_t>(short_offset));
322 } else {
323 CHECK_GE(offset, INT32_MIN + 6);
324 Emit8(0x0f);
325 Emit8(0x80 | static_cast<uint8_t>(cc));
326 Emit32(offset - 6);
327 }
328 }
329
330 protected:
  // Helper types to distinguish argument types.

  // 8-bit register operand.  Kept as a distinct wrapper because 8-bit registers
  // need dedicated handling in the derived assemblers (cf. the Rex8Size hook
  // mentioned in the header comment) — TODO confirm details in the descendants.
  struct Register8Bit {
    explicit constexpr Register8Bit(Register reg) : num_(reg.num_) {}
    uint8_t num_;
  };
336
  // Any register number that doesn't need special processing: just carries the
  // physical register number, whatever kind of register it came from.
  struct SizeAgnosticRegister {
    explicit constexpr SizeAgnosticRegister(Register reg) : num_(reg.num_) {}
    explicit constexpr SizeAgnosticRegister(XMMRegister reg) : num_(reg.num_) {}
    explicit constexpr SizeAgnosticRegister(YMMRegister reg) : num_(reg.num_) {}
    uint8_t num_;
  };
344
  // 16-bit, 32-bit, 128-bit, and 256-bit vector registers don't need special rules.
  using Register16Bit = SizeAgnosticRegister;
  using Register32Bit = SizeAgnosticRegister;
  using VectorRegister128Bit = SizeAgnosticRegister;
  using VectorRegister256Bit = SizeAgnosticRegister;
  // Certain instructions (Enter/Leave, Jcc/Jmp/Loop, Call/Ret, Push/Pop) always operate
  // on registers of default size (32-bit in 32-bit mode, 64-bit in 64-bit mode; see the
  // "Instructions Not Requiring REX Prefix in 64-Bit Mode" table in the 24594 AMD Manual).
  // Map these to SizeAgnosticRegister, too, since they don't need REX.W even in 64-bit mode.
  //
  // x87 instructions fall into that category, too, since they were not expanded in x86-64 mode.
  using RegisterDefaultBit = SizeAgnosticRegister;
357
  // Any memory address that doesn't need special processing: simply wraps the
  // Operand so memory arguments are distinguishable in EmitInstruction.
  struct SizeAgnosticMemory {
    explicit SizeAgnosticMemory(const Operand& op) : operand(op) {}
    Operand operand;
  };
363
  // 8-bit, 16-bit, 128-bit memory behave the same as 32-bit memory.
  // Only 64-bit memory is different (it needs REX.W, handled by the descendants).
  using Memory8Bit = SizeAgnosticMemory;
  using Memory16Bit = SizeAgnosticMemory;
  using Memory32Bit = SizeAgnosticMemory;
  // Some instructions have memory operands that have unspecified size (lea, prefetch, etc);
  // they are encoded like SizeAgnosticMemory, anyway.
  using MemoryDefaultBit = SizeAgnosticMemory;
  // X87 instructions always use the same encoding - even for 64-bit or 28-byte
  // memory operands (like in fldenv/fnstenv).
  using MemoryX87 = SizeAgnosticMemory;
  using MemoryX8716Bit = SizeAgnosticMemory;
  using MemoryX8732Bit = SizeAgnosticMemory;
  using MemoryX8764Bit = SizeAgnosticMemory;
  using MemoryX8780Bit = SizeAgnosticMemory;
  // Most vector instructions don't need to use REX.W to access 64-bit or 128-bit memory.
  using VectorMemory32Bit = SizeAgnosticMemory;
  using VectorMemory64Bit = SizeAgnosticMemory;
  using VectorMemory128Bit = SizeAgnosticMemory;
  using VectorMemory256Bit = SizeAgnosticMemory;
384
  // Label types for memory quantities. Note that names are similar to the ones before because
  // they are autogenerated. E.g. VectorLabel32Bit should be read as “VECTOR's operation LABEL
  // for 32-BIT quantity in memory”.
  struct Label32Bit {
    explicit Label32Bit(const struct LabelOperand& l) : label(l.label) {}
    const Label& label;
  };
392
  // 8-bit, 16-bit, 128-bit memory behave the same as 32-bit memory.
  // Only 64-bit memory is different (handled by the descendants).
  using Label8Bit = Label32Bit;
  using Label16Bit = Label32Bit;
  // Some instructions have memory operands that have unspecified size (lea, prefetch, etc);
  // they are encoded like Label32Bit, anyway.
  using LabelDefaultBit = Label32Bit;
  // X87 instructions always use the same encoding - even for 64-bit or 28-byte
  // memory operands (like in fldenv/fnstenv).
  using LabelX87 = Label32Bit;
  using LabelX8716Bit = Label32Bit;
  using LabelX8732Bit = Label32Bit;
  using LabelX8764Bit = Label32Bit;
  using LabelX8780Bit = Label32Bit;
  // Most vector instructions don't need to use REX.W to access 64-bit or 128-bit memory.
  using VectorLabel32Bit = Label32Bit;
  using VectorLabel64Bit = Label32Bit;
  using VectorLabel128Bit = Label32Bit;
411
IsLegacyPrefix(int code)412 static constexpr bool IsLegacyPrefix(int code) {
413 // Legacy prefixes used as opcode extensions in SSE.
414 // Lock is used by cmpxchg.
415 return (code == 0x66) || (code == 0xf2) || (code == 0xf3) || (code == 0xf0);
416 }
417
  // Argument-classification predicates used by EmitInstruction / kCountArguments
  // to sort variadic arguments into conditions, registers, memory operands,
  // labels, and immediates.

  template <typename ArgumentType>
  struct IsCondition {
    static constexpr bool value = std::is_same_v<ArgumentType, Condition>;
  };

  // Delegate the check to DerivedAssemblerType::template IsRegister (the set of
  // register wrapper types differs between x86-32 and x86-64); X87Register is
  // shared between the two and thus handled here.
  template <typename ArgumentType>
  struct IsRegister {
    static constexpr bool value = DerivedAssemblerType::template IsRegister<ArgumentType>::value ||
                                  std::is_same_v<ArgumentType, X87Register>;
  };

  template <typename ArgumentType>
  struct IsMemoryOperand {
    static constexpr bool value =
        DerivedAssemblerType::template IsMemoryOperand<ArgumentType>::value;
  };

  template <typename ArgumentType>
  struct IsLabelOperand {
    static constexpr bool value =
        DerivedAssemblerType::template IsLabelOperand<ArgumentType>::value;
  };

  // Immediates are integral arguments of width 1, 2, 4, or 8 bytes.
  template <typename ArgumentType>
  struct IsImmediate {
    static constexpr bool value =
        std::is_integral_v<ArgumentType> &&
        ((sizeof(ArgumentType) == sizeof(int8_t)) || (sizeof(ArgumentType) == sizeof(int16_t)) ||
         (sizeof(ArgumentType) == sizeof(int32_t)) || (sizeof(ArgumentType) == sizeof(int64_t)));
  };
449
  // Count number of arguments selected by Predicate (fold over the pack; the
  // trailing "+ 0" keeps the fold valid for an empty pack).
  template <template <typename> typename Predicate, typename... ArgumentTypes>
  static constexpr std::size_t kCountArguments =
      ((Predicate<ArgumentTypes>::value ? 1 : 0) + ... + 0);
454
  // Extract arguments selected by Predicate: returns the index-th argument
  // (0-based) among those whose decayed type satisfies Predicate, recursively
  // skipping the ones that don't.
  //
  // Note: This interface begs for the trick used in EmitFunctionTypeHelper in make_intrinsics.cc
  // in conjunction with structured bindings.
  //
  // Unfortunately returning std::tuple slows down AssemblerTest by about 30% when libc++ and clang
  // are used together (no slowdown on GCC, no slowdown on clang+libstdc++).
  //
  // TODO(http://b/140721204): refactor when it would be safe to return std::tuple from function.
  //
  template <std::size_t index,
            template <typename>
            typename Predicate,
            typename ArgumentType,
            typename... ArgumentTypes>
  static constexpr auto ArgumentByType(ArgumentType argument, ArgumentTypes... arguments) {
    if constexpr (Predicate<std::decay_t<ArgumentType>>::value) {
      if constexpr (index == 0) {
        return argument;
      } else {
        // Matched, but not the one we want yet: keep looking for index - 1.
        return ArgumentByType<index - 1, Predicate>(arguments...);
      }
    } else {
      // Not selected by Predicate: skip without consuming index.
      return ArgumentByType<index, Predicate>(arguments...);
    }
  }
481
  // Emit immediates - they always come at the end and don't affect anything except rip-addressing.
  static constexpr void EmitImmediates() {}

  // Emits every integral argument with the width matching its type; all
  // non-integral arguments (registers, operands, labels, ...) are skipped.
  template <typename FirstArgumentType, typename... ArgumentTypes>
  void EmitImmediates(FirstArgumentType first_argument, ArgumentTypes... other_arguments) {
    if constexpr (std::is_integral_v<FirstArgumentType> &&
                  sizeof(FirstArgumentType) == sizeof(int8_t)) {
      Emit8(first_argument);
    } else if constexpr (std::is_integral_v<FirstArgumentType> &&
                         sizeof(FirstArgumentType) == sizeof(int16_t)) {
      Emit16(first_argument);
    } else if constexpr (std::is_integral_v<FirstArgumentType> &&
                         sizeof(FirstArgumentType) == sizeof(int32_t)) {
      Emit32(first_argument);
    } else if constexpr (std::is_integral_v<FirstArgumentType> &&
                         sizeof(FirstArgumentType) == sizeof(int64_t)) {
      Emit64(first_argument);
    }
    EmitImmediates(other_arguments...);
  }
502
  // Size in bytes that EmitImmediates would emit for ArgumentType: 1/2/4/8 for
  // integral types, 0 for everything else.
  template <typename ArgumentType>
  static constexpr size_t ImmediateSize() {
    if constexpr (std::is_integral_v<ArgumentType> && sizeof(ArgumentType) == sizeof(int8_t)) {
      return 1;
    } else if constexpr (std::is_integral_v<ArgumentType> &&
                         sizeof(ArgumentType) == sizeof(int16_t)) {
      return 2;
    } else if constexpr (std::is_integral_v<ArgumentType> &&
                         sizeof(ArgumentType) == sizeof(int32_t)) {
      return 4;
    } else if constexpr (std::is_integral_v<ArgumentType> &&
                         sizeof(ArgumentType) == sizeof(int64_t)) {
      return 8;
    } else {
      static_assert(!std::is_integral_v<ArgumentType>);
      return 0;
    }
  }

  // Total immediate size, in bytes, over all arguments; used to adjust
  // rip-relative displacements (see the EmitRipOp calls in EmitInstruction).
  template <typename... ArgumentTypes>
  static constexpr size_t ImmediatesSize() {
    return (ImmediateSize<ArgumentTypes>() + ... + 0);
  }
526
527 // Note: We may need separate x87 EmitInstruction if we would want to support
528 // full set of x86 instructions.
529 //
  // That's because the 8087 was a completely separate piece of silicon which was
  // only partially driven by the 8086:
532 // https://en.wikipedia.org/wiki/Intel_8087
533 //
534 // In particular it had the following properties:
535 // 1. It had its own separate subset of opcodes - because it did its own decoding.
536 // 2. It had separate set of registers and could *only* access these.
537 // 2a. The 8086, in turn, *couldn't* access these registers at all.
538 // 3. To access memory it was designed to take address from address bus.
539 //
540 // This means that:
541 // 1. x87 instructions are easily recognizable - all instructions with opcodes 0xd8
542 // to 0xdf are x87 instructions, all instructions with other opcodes are not.
543 // 2. We could be sure that x87 registers would only be used with x87 instructions
544 // and other types of registers wouldn't be used with these.
545 // 3. We still would use normal registers for memory access, but REX.W bit wouldn't
546 // be used for 64-bit quantities, whether they are floating point numbers or integers.
547 //
548 // Right now we only use EmitInstruction to emit x87 instructions which are using memory
549 // operands - and it works well enough for that because of #3.
550
551 // If you want to understand how this function works (and how helper function like Vex and
552 // Rex work), you need good understanding of AMD/Intel Instruction format.
553 //
554 // Intel manual includes the most precise explanation, but it's VERY hard to read.
555 //
556 // AMD manual is much easier to read, but it doesn't include description of EVEX
557 // instructions and is less precise. Diagram on page 2 of Volume 3 is especially helpful:
558 // https://www.amd.com/system/files/TechDocs/24594.pdf#page=42
559 //
  // And the most concise (albeit unofficial) one is on the osdev Wiki:
561 // https://wiki.osdev.org/X86-64_Instruction_Encoding
562
563 // Note: if you change this function (or any of the helper functions) then remove --fast
564 // option from ExhaustiveAssemblerTest to run full blackbox comparison to clang.
565
  // Encodes and emits one instruction.  kOpcodes is the full opcode byte list
  // (legacy prefixes, optional VEX/XOP lead byte, escapes, main opcode, and an
  // optional trailing byte); arguments are classified via the Is* predicates
  // above and dispatched to the derived assembler's encoding helpers
  // (EmitRex/EmitVex/EmitOperandOp/EmitRipOp/EmitModRM/EmitRegisterInOpcode).
  template <uint8_t... kOpcodes, typename... ArgumentsTypes>
  void EmitInstruction(ArgumentsTypes... arguments) {
    static constexpr auto kOpcodesArray = std::array{kOpcodes...};
    // Count the leading legacy prefixes (0x66/0xf0/0xf2/0xf3) in the opcode
    // list; they are emitted verbatim before everything else.
    static constexpr size_t kLegacyPrefixesCount = []() {
      size_t legacy_prefixes_count = 0;
      for (legacy_prefixes_count = 0; IsLegacyPrefix(kOpcodesArray[legacy_prefixes_count]);
           ++legacy_prefixes_count) {
      }
      return legacy_prefixes_count;
    }();
    for (size_t legacy_prefixes_index = 0; legacy_prefixes_index < kLegacyPrefixesCount;
         ++legacy_prefixes_index) {
      Emit8(kOpcodesArray[legacy_prefixes_index]);
    }
    // We don't yet support any XOP-encoded instructions, but they are 100% identical to vex ones,
    // except they are using 0x8F prefix, not 0xC4 prefix.
    constexpr auto kVexOrXop = []() {
      if constexpr (std::size(kOpcodesArray) < kLegacyPrefixesCount + 3) {
        return false;
        // Note that JSON files use AMD approach: bytes are specified as in AMD manual (only we are
        // replacing ¬R/¬X/¬B and vvvv bits with zeros).
        //
        // In particular it means that vex-encoded instructions should be specified with 0xC4 even
        // if they are always emitted with 0xC4-to-0xC5 folding.
      } else if constexpr (kOpcodesArray[kLegacyPrefixesCount] == 0xC4 ||
                           kOpcodesArray[kLegacyPrefixesCount] == 0x8F) {
        return true;
      }
      return false;
    }();
    constexpr auto conditions_count = kCountArguments<IsCondition, ArgumentsTypes...>;
    constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
    constexpr auto labels_count = kCountArguments<IsLabelOperand, ArgumentsTypes...>;
    constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
    // We need to know if Reg field (in ModRM byte) is an opcode extension or if opcode extension
    // goes into the immediate field.
    constexpr auto reg_is_opcode_extension =
        (registers_count + operands_count > 0) &&
        (registers_count + operands_count + labels_count <
         2 + kVexOrXop * (std::size(kOpcodesArray) - kLegacyPrefixesCount - 4));
    static_assert((registers_count + operands_count + labels_count + conditions_count +
                   kCountArguments<IsImmediate, ArgumentsTypes...>) == sizeof...(ArgumentsTypes),
                  "Only registers (with specified size), Operands (with specified size), "
                  "Conditions, and Immediates are supported.");
    static_assert(operands_count <= 1, "Only one operand is allowed in instruction.");
    static_assert(labels_count <= 1, "Only one label is allowed in instruction.");
    // 0x0f is an opcode extension, if it's not there then we only have one byte opcode.
    const size_t kPrefixesAndOpcodeExtensionsCount = []() {
      if constexpr (kVexOrXop) {
        static_assert(conditions_count == 0,
                      "No conditionals are supported in vex/xop instructions.");
        static_assert((registers_count + operands_count + labels_count) <= 4,
                      "Up to four-arguments in vex/xop instructions are supported.");
        return kLegacyPrefixesCount + 3;
      } else {
        static_assert(conditions_count <= 1, "Only one condition is allowed in instruction.");
        static_assert((registers_count + operands_count + labels_count) <= 2,
                      "Only two-arguments legacy instructions are supported.");
        if constexpr (kOpcodesArray[kLegacyPrefixesCount] == 0x0F) {
          // 0x0f 0x38 / 0x0f 0x3a are the two-byte escape sequences.
          if constexpr (kOpcodesArray[kLegacyPrefixesCount + 1] == 0x38 ||
                        kOpcodesArray[kLegacyPrefixesCount + 1] == 0x3A) {
            return kLegacyPrefixesCount + 2;
          }
          return kLegacyPrefixesCount + 1;
        }
        return kLegacyPrefixesCount;
      }
    }();
    if constexpr (kVexOrXop) {
      static_cast<DerivedAssemblerType*>(this)
          ->template EmitVex<kOpcodesArray[kLegacyPrefixesCount],
                             kOpcodesArray[kLegacyPrefixesCount + 1],
                             kOpcodesArray[kLegacyPrefixesCount + 2],
                             reg_is_opcode_extension>(arguments...);
    } else {
      static_cast<DerivedAssemblerType*>(this)->EmitRex(arguments...);
      for (size_t extension_opcode_index = kLegacyPrefixesCount;
           extension_opcode_index < kPrefixesAndOpcodeExtensionsCount;
           ++extension_opcode_index) {
        Emit8(kOpcodesArray[extension_opcode_index]);
      }
    }
    // These are older 8086 instructions which encode register number in the opcode itself.
    if constexpr (registers_count == 1 && operands_count == 0 && labels_count == 0 &&
                  std::size(kOpcodesArray) == kPrefixesAndOpcodeExtensionsCount + 1) {
      static_cast<DerivedAssemblerType*>(this)->EmitRegisterInOpcode(
          kOpcodesArray[kPrefixesAndOpcodeExtensionsCount],
          ArgumentByType<0, IsRegister>(arguments...));
      EmitImmediates(arguments...);
    } else {
      // Emit "main" single-byte opcode.
      if constexpr (conditions_count == 1) {
        // Conditional opcodes carry the condition in their low nibble.
        auto condition_code = static_cast<uint8_t>(ArgumentByType<0, IsCondition>(arguments...));
        CHECK_EQ(0, condition_code & 0xF0);
        Emit8(kOpcodesArray[kPrefixesAndOpcodeExtensionsCount] | condition_code);
      } else {
        Emit8(kOpcodesArray[kPrefixesAndOpcodeExtensionsCount]);
      }
      if constexpr (reg_is_opcode_extension) {
        if constexpr (operands_count == 1) {
          static_cast<DerivedAssemblerType*>(this)->EmitOperandOp(
              static_cast<int>(kOpcodesArray[kPrefixesAndOpcodeExtensionsCount + 1]),
              ArgumentByType<0, IsMemoryOperand>(arguments...).operand);
        } else if constexpr (labels_count == 1) {
          static_cast<DerivedAssemblerType*>(this)
              ->template EmitRipOp<ImmediatesSize<ArgumentsTypes...>()>(
                  static_cast<int>(kOpcodesArray[kPrefixesAndOpcodeExtensionsCount + 1]),
                  ArgumentByType<0, IsLabelOperand>(arguments...).label);
        } else {
          static_cast<DerivedAssemblerType*>(this)->EmitModRM(
              kOpcodesArray[kPrefixesAndOpcodeExtensionsCount + 1],
              ArgumentByType<0, IsRegister>(arguments...));
        }
      } else if constexpr (registers_count > 0) {
        if constexpr (operands_count == 1) {
          static_cast<DerivedAssemblerType*>(this)->EmitOperandOp(
              ArgumentByType<0, IsRegister>(arguments...),
              ArgumentByType<0, IsMemoryOperand>(arguments...).operand);
        } else if constexpr (labels_count == 1) {
          static_cast<DerivedAssemblerType*>(this)
              ->template EmitRipOp<ImmediatesSize<ArgumentsTypes...>()>(
                  ArgumentByType<0, IsRegister>(arguments...),
                  ArgumentByType<0, IsLabelOperand>(arguments...).label);
        } else {
          static_cast<DerivedAssemblerType*>(this)->EmitModRM(
              ArgumentByType<0, IsRegister>(arguments...),
              ArgumentByType<1, IsRegister>(arguments...));
        }
      }
      // If reg is an opcode extension then we already used that element.
      if constexpr (reg_is_opcode_extension) {
        static_assert(std::size(kOpcodesArray) == kPrefixesAndOpcodeExtensionsCount + 2);
      } else if constexpr (std::size(kOpcodesArray) > kPrefixesAndOpcodeExtensionsCount + 1) {
        // Final opcode byte(s) - they are in the place where immediate is expected.
        // Cmpps/Cmppd and 3DNow! instructions are using it.
        static_assert(std::size(kOpcodesArray) == kPrefixesAndOpcodeExtensionsCount + 2);
        Emit8(kOpcodesArray[kPrefixesAndOpcodeExtensionsCount + 1]);
      }
      if constexpr (registers_count + operands_count + labels_count == 4) {
        // Four-operand vex instructions encode the last register in the top
        // nibble of the immediate byte (is4 encoding).
        if constexpr (kCountArguments<IsImmediate, ArgumentsTypes...> == 1) {
          Emit8((ArgumentByType<registers_count - 1, IsRegister>(arguments...).num_ << 4) |
                ArgumentByType<0, IsImmediate>(arguments...));
        } else {
          static_assert(kCountArguments<IsImmediate, ArgumentsTypes...> == 0);
          Emit8(ArgumentByType<registers_count - 1, IsRegister>(arguments...).num_ << 4);
        }
      } else {
        EmitImmediates(arguments...);
      }
    }
  }
717
718 // Normally instruction arguments come in the following order: vex, rm, reg, imm.
719 // But certain instructions can have swapped arguments in a different order.
720 // In addition to that we have special case where two arguments may need to be swapped
721 // to reduce encoding size.
722
723 template <uint8_t... kOpcodes,
724 typename ArgumentsType0,
725 typename ArgumentsType1,
726 typename... ArgumentsTypes>
EmitRegToRmInstruction(ArgumentsType0 && argument0,ArgumentsType1 && argument1,ArgumentsTypes &&...arguments)727 void EmitRegToRmInstruction(ArgumentsType0&& argument0,
728 ArgumentsType1&& argument1,
729 ArgumentsTypes&&... arguments) {
730 return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType1>(argument1),
731 std::forward<ArgumentsType0>(argument0),
732 std::forward<ArgumentsTypes>(arguments)...);
733 }
734
735 template <uint8_t... kOpcodes,
736 typename ArgumentsType0,
737 typename ArgumentsType1,
738 typename... ArgumentsTypes>
EmitRmToVexInstruction(ArgumentsType0 && argument0,ArgumentsType1 && argument1,ArgumentsTypes &&...arguments)739 void EmitRmToVexInstruction(ArgumentsType0&& argument0,
740 ArgumentsType1&& argument1,
741 ArgumentsTypes&&... arguments) {
742 return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType1>(argument1),
743 std::forward<ArgumentsType0>(argument0),
744 std::forward<ArgumentsTypes>(arguments)...);
745 }
746
  // If vex operand is one of first 8 registers and rm operand is not then swapping these two
  // operands produces more compact encoding.
  // This only works with commutative instructions from first opcode map.
  //
  // Default path passes (argument0, argument2, argument1); when the runtime
  // check says swapping pays off (and the two operands have the same type so the
  // swap is legal), the original order is kept instead.
  //
  // NOTE(review): argument1/argument2 are forwarded in the condition and again
  // afterwards; benign here since the register wrappers in this header are
  // trivially copyable single-byte values, so forwarding never actually moves.
  template <uint8_t... kOpcodes,
            typename ArgumentsType0,
            typename ArgumentsType1,
            typename ArgumentsType2,
            typename... ArgumentsTypes>
  void EmitOptimizableUsingCommutationInstruction(ArgumentsType0&& argument0,
                                                  ArgumentsType1&& argument1,
                                                  ArgumentsType2&& argument2,
                                                  ArgumentsTypes&&... arguments) {
    if constexpr (std::is_same_v<ArgumentsType2, ArgumentsType1>) {
      if (DerivedAssemblerType::IsSwapProfitable(std::forward<ArgumentsType2>(argument2),
                                                 std::forward<ArgumentsType1>(argument1))) {
        return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType0>(argument0),
                                            std::forward<ArgumentsType1>(argument1),
                                            std::forward<ArgumentsType2>(argument2),
                                            std::forward<ArgumentsTypes>(arguments)...);
      }
    }
    return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType0>(argument0),
                                        std::forward<ArgumentsType2>(argument2),
                                        std::forward<ArgumentsType1>(argument1),
                                        std::forward<ArgumentsTypes>(arguments)...);
  }
773
774 template <uint8_t... kOpcodes,
775 typename ArgumentsType0,
776 typename ArgumentsType1,
777 typename ArgumentsType2,
778 typename ArgumentsType3,
779 typename... ArgumentsTypes>
EmitVexImmRmToRegInstruction(ArgumentsType0 && argument0,ArgumentsType1 && argument1,ArgumentsType2 && argument2,ArgumentsType3 && argument3,ArgumentsTypes &&...arguments)780 void EmitVexImmRmToRegInstruction(ArgumentsType0&& argument0,
781 ArgumentsType1&& argument1,
782 ArgumentsType2&& argument2,
783 ArgumentsType3&& argument3,
784 ArgumentsTypes&&... arguments) {
785 return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType0>(argument0),
786 std::forward<ArgumentsType3>(argument3),
787 std::forward<ArgumentsType1>(argument1),
788 std::forward<ArgumentsType2>(argument2),
789 std::forward<ArgumentsTypes>(arguments)...);
790 }
791
792 template <uint8_t... kOpcodes,
793 typename ArgumentsType0,
794 typename ArgumentsType1,
795 typename ArgumentsType2,
796 typename ArgumentsType3,
797 typename... ArgumentsTypes>
EmitVexRmImmToRegInstruction(ArgumentsType0 && argument0,ArgumentsType1 && argument1,ArgumentsType2 && argument2,ArgumentsType3 && argument3,ArgumentsTypes &&...arguments)798 void EmitVexRmImmToRegInstruction(ArgumentsType0&& argument0,
799 ArgumentsType1&& argument1,
800 ArgumentsType2&& argument2,
801 ArgumentsType3&& argument3,
802 ArgumentsTypes&&... arguments) {
803 return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType0>(argument0),
804 std::forward<ArgumentsType2>(argument2),
805 std::forward<ArgumentsType1>(argument1),
806 std::forward<ArgumentsType3>(argument3),
807 std::forward<ArgumentsTypes>(arguments)...);
808 }
809
810 template <uint8_t... kOpcodes,
811 typename ArgumentsType0,
812 typename ArgumentsType1,
813 typename ArgumentsType2,
814 typename... ArgumentsTypes>
EmitVexRmToRegInstruction(ArgumentsType0 && argument0,ArgumentsType1 && argument1,ArgumentsType2 && argument2,ArgumentsTypes &&...arguments)815 void EmitVexRmToRegInstruction(ArgumentsType0&& argument0,
816 ArgumentsType1&& argument1,
817 ArgumentsType2&& argument2,
818 ArgumentsTypes&&... arguments) {
819 return EmitInstruction<kOpcodes...>(std::forward<ArgumentsType0>(argument0),
820 std::forward<ArgumentsType2>(argument2),
821 std::forward<ArgumentsType1>(argument1),
822 std::forward<ArgumentsTypes>(arguments)...);
823 }
824
  // Back-patches every jump recorded through an unbound Label (see Call/Jcc/Jmp
  // definitions below): fixes placeholder displacements and registers recovery points.
  // Must be called after all labels are bound.
  void ResolveJumps();

 private:
  // This class serves only as a CRTP base: it is not default-constructible,
  // copyable, or movable on its own.
  Assembler() = delete;
  Assembler(const Assembler&) = delete;
  Assembler(Assembler&&) = delete;
  void operator=(const Assembler&) = delete;
  void operator=(Assembler&&) = delete;
};
834
template <typename DerivedAssemblerType>
inline void Assembler<DerivedAssemblerType>::Pmov(XMMRegister dest, XMMRegister src) {
  // Register-to-register "integer move" pseudo-instruction. SSE has no dedicated
  // operation for it, and Intel explicitly recommends using pshufd instead on Pentium4:
  // See https://software.intel.com/en-us/articles/
  //       fast-simd-integer-move-for-the-intel-pentiumr-4-processor
  // These recommendations are CPU-dependent, though, thus we will need to investigate
  // this question further before we could decide when to use movaps (or movapd) and
  // when to use pshufd. For now movaps is used unconditionally.
  //
  // TODO(khim): investigate performance problems related to integer MOVs
  Movaps(dest, src);
}
848
849 template <typename DerivedAssemblerType>
Call(const Label & label)850 inline void Assembler<DerivedAssemblerType>::Call(const Label& label) {
851 if (label.IsBound()) {
852 int32_t offset = label.position() - pc();
853 Call(offset);
854 } else {
855 Emit8(0xe8);
856 Emit32(0xfffffffc);
857 jumps_.push_back(Jump{&label, pc() - 4, false});
858 }
859 }
860
861 template <typename DerivedAssemblerType>
Jcc(Condition cc,const Label & label)862 inline void Assembler<DerivedAssemblerType>::Jcc(Condition cc, const Label& label) {
863 if (cc == Condition::kAlways) {
864 Jmp(label);
865 return;
866 } else if (cc == Condition::kNever) {
867 return;
868 }
869 CHECK_EQ(0, static_cast<uint8_t>(cc) & 0xF0);
870 // TODO(eaeltsin): may be remove IsBound case?
871 // Then jcc by label will be of fixed size (5 bytes)
872 if (label.IsBound()) {
873 int32_t offset = label.position() - pc();
874 JccRel(cc, offset);
875 } else {
876 Emit16(0x800f | (static_cast<uint8_t>(cc) << 8));
877 Emit32(0xfffffffc);
878 jumps_.push_back(Jump{&label, pc() - 4, false});
879 }
880 }
881
882 template <typename DerivedAssemblerType>
Jmp(const Label & label)883 inline void Assembler<DerivedAssemblerType>::Jmp(const Label& label) {
884 // TODO(eaeltsin): may be remove IsBound case?
885 // Then jmp by label will be of fixed size (5 bytes)
886 if (label.IsBound()) {
887 int32_t offset = label.position() - pc();
888 JmpRel(offset);
889 } else {
890 Emit8(0xe9);
891 Emit32(0xfffffffc);
892 jumps_.push_back(Jump{&label, pc() - 4, false});
893 }
894 }
895
896 template <typename DerivedAssemblerType>
ResolveJumps()897 inline void Assembler<DerivedAssemblerType>::ResolveJumps() {
898 for (const auto& jump : jumps_) {
899 const Label* label = jump.label;
900 uint32_t pc = jump.pc;
901 CHECK(label->IsBound());
902 if (jump.is_recovery) {
903 // Add pc -> label correspondence to recovery map.
904 AddRelocation(0, RelocationType::RelocRecoveryPoint, pc, label->position());
905 } else {
906 int32_t offset = label->position() - pc;
907 *AddrAs<int32_t>(pc) += offset;
908 }
909 }
910 }
911
912 // Code size optimized instructions: they have different variants depending on registers used.
913
914 template <typename DerivedAssemblerType>
Xchgl(Register dest,Register src)915 inline void Assembler<DerivedAssemblerType>::Xchgl(Register dest, Register src) {
916 if (DerivedAssemblerType::IsAccumulator(src) || DerivedAssemblerType::IsAccumulator(dest)) {
917 Register other = DerivedAssemblerType::IsAccumulator(src) ? dest : src;
918 EmitInstruction<0x90>(SizeAgnosticRegister(other));
919 } else {
920 // Clang 8 (after r330298) puts dest before src. We are comparing output
921 // to clang in exhaustive test thus we want to match clang behavior exactly.
922 EmitInstruction<0x87>(SizeAgnosticRegister(dest), SizeAgnosticRegister(src));
923 }
924 }
925
926 } // namespace x86_32_and_x86_64
927
928 } // namespace berberis
929
930 #endif // BERBERIS_ASSEMBLER_X86_32_AND_X86_64_H_
931