xref: /aosp_15_r20/external/FP16/include/fp16/avx.py (revision 5f32b7105932ea8520a0e8811c640f936367d707)
1*5f32b710SXin Lifrom peachpy import *
2*5f32b710SXin Lifrom peachpy.x86_64 import *
3*5f32b710SXin Li
4*5f32b710SXin Li
def fp16_alt_xmm_to_fp32_xmm(xmm_half):
	"""Emit the AVX sequence that widens four half-precision lanes to fp32.

	Any input whose 5-bit exponent is non-zero is treated as a normalized
	number (there is no Inf/NaN branch); inputs with a zero exponent go
	through a separate subnormal path and the two results are blended.
	Presumably this matches the "alternative" half-precision layout that
	the function name refers to -- confirm against the callers.

	NOTE(review): the ``Constant`` assignments below are kept textually as
	they were, in case PeachPy derives constant names from the assigned
	variable -- TODO confirm before renaming them. Instruction and register
	allocation order is also unchanged on purpose.

	Args:
		xmm_half: XMM operand holding the 16-bit inputs. Only the low four
			words are consumed, because VPUNPCKLWD reads the low quadword.

	Returns:
		The newly allocated XMMRegister that receives the four 32-bit
		single-precision bit patterns.
	"""
	# All-zero register used as the low half of each widened lane.
	zeros = XMMRegister()
	VPXOR(zeros, zeros, zeros)

	# Interleave: every 32-bit lane becomes (half << 16).
	widened = XMMRegister()
	VPUNPCKLWD(widened, zeros, xmm_half)

	# x + x == x << 1; on 16-bit lanes this pushes the sign bit out.
	half_doubled = XMMRegister()
	VPADDW(half_doubled, xmm_half, xmm_half)

	# Same doubling on the 32-bit lanes: exponent and mantissa remain,
	# the sign bit is shifted out of bit 31.
	widened_doubled = XMMRegister()
	VPADDD(widened_doubled, widened, widened)

	# -0.0f has only bit 31 set, so the AND isolates the sign.
	sign_mask = Constant.float32x4(-0.0)

	sign_bits = XMMRegister()
	VANDPS(sign_bits, widened, sign_mask)

	# (<< 1) followed by (>> 4) is a net right shift by 3 of the sign-free
	# value: the half exponent lands in bits 27..23 and the mantissa in
	# bits 22..13, i.e. inside the fp32 exponent/mantissa fields.
	magnitude = XMMRegister()
	VPSRLD(magnitude, widened_doubled, 4)

	# 0x38000000 == (127 - 15) << 23: the difference between the two
	# exponent biases, added directly to the exponent field.
	exp_offset = Constant.uint32x4(0x38000000)

	normal_bits = XMMRegister()
	VPADDD(normal_bits, magnitude, exp_offset)

	# Subnormal path: build the float 0x3E80_0000 | (half << 1). The upper
	# word 0x3E80 is the top half of 0.25f, so for a zero-exponent input
	# the lane holds 0.25 * (1 + 2*m / 2**23); subtracting 0.25 leaves
	# m * 2**-24. Lanes with a non-zero exponent produce a value that the
	# blend below discards.
	magic_mask = Constant.uint16x8(0x3E80)
	subnormal_bits = XMMRegister()
	VPUNPCKLWD(subnormal_bits, half_doubled, magic_mask)

	magic_bias = Constant.float32x4(0.25)
	VSUBPS(subnormal_bits, subnormal_bits, magic_bias)

	# 0x00800000 is the smallest magnitude with a non-zero exponent field;
	# anything below it has exponent bits 27..23 all clear.
	subnormal_limit = XMMRegister()
	VMOVDQA(subnormal_limit, Constant.uint32x4(0x00800000))

	# Signed compare is fine here: the magnitude has its top four bits
	# clear after the logical shift, so it is never negative.
	is_subnormal = XMMRegister()
	VPCMPGTD(is_subnormal, subnormal_limit, magnitude)

	# Per lane: take the subnormal result where the mask is set, otherwise
	# the rebias-ed normal result.
	unsigned_result = XMMRegister()
	VBLENDVPS(unsigned_result, normal_bits, subnormal_bits, is_subnormal)

	# Put the original sign back on top of the magnitude.
	result = XMMRegister()
	VORPS(result, unsigned_result, sign_bits)

	return result
51