from peachpy import *
from peachpy.x86_64 import *


def fp16_alt_xmm_to_fp32_xmm(xmm_half):
    """Emit AVX code converting packed half-precision values to float32.

    The low four 16-bit lanes of ``xmm_half`` are widened to four 32-bit
    single-precision lanes. Inputs with a zero exponent field (zero and
    denormals) go through a dedicated magic-bias path; every other exponent
    value, including the all-ones exponent, is rebias-ed as a normal number.
    There is no Inf/NaN special case in this routine.

    NOTE(review): the "alt" in the name presumably refers to the ARM
    alternative half-precision format (no Inf/NaN) -- confirm with callers.

    Each instruction call below appends one instruction in program order, so
    statement order is significant and must not be rearranged casually.

    :param xmm_half: XMM register holding the half-precision inputs in its
        low 16-bit lanes. It is only read, never written.
    :return: a newly allocated XMM register holding the four float32 results.
    """
    # All-zero register: xor of a register with itself.
    xmm_zero = XMMRegister()
    VPXOR(xmm_zero, xmm_zero, xmm_zero)

    # Interleave zero (low word) with the input (high word), so that every
    # 32-bit lane holds half << 16.
    xmm_word = XMMRegister()
    VPUNPCKLWD(xmm_word, xmm_zero, xmm_half)

    # x + x == x << 1 on 16-bit lanes: drops the sign bit of each half.
    # Used only by the denormal path below.
    xmm_shl1_half = XMMRegister()
    VPADDW(xmm_shl1_half, xmm_half, xmm_half)

    # Same trick on 32-bit lanes: shifts the sign bit out of the word.
    xmm_shl1_nonsign = XMMRegister()
    VPADDD(xmm_shl1_nonsign, xmm_word, xmm_word)

    # -0.0f has only bit 31 set, so the AND isolates the sign bit.
    sign_mask = Constant.float32x4(-0.0)

    xmm_sign = XMMRegister()
    VANDPS(xmm_sign, xmm_word, sign_mask)

    # (word << 1) >> 4 is a net right shift by 3 of the sign-less word: the
    # 5-bit half exponent lands in bits 23-27 and the 10-bit mantissa in
    # bits 13-22, i.e. in the float32 exponent/mantissa field positions.
    xmm_shr3_nonsign = XMMRegister()
    VPSRLD(xmm_shr3_nonsign, xmm_shl1_nonsign, 4)

    # 0x38000000 == (127 - 15) << 23: converts the half exponent bias (15)
    # to the float32 exponent bias (127).
    exp_offset = Constant.uint32x4(0x38000000)

    xmm_norm_nonsign = XMMRegister()
    VPADDD(xmm_norm_nonsign, xmm_shr3_nonsign, exp_offset)

    # Denormal path. Placing 0x3E80 in the high word builds the float
    # 0x3E800000 (0.25f) with (half << 1) in its low mantissa bits, which
    # equals 0.25 + mantissa * 2**-24 when the half exponent is zero.
    magic_mask = Constant.uint16x8(0x3E80)
    xmm_denorm_nonsign = XMMRegister()
    VPUNPCKLWD(xmm_denorm_nonsign, xmm_shl1_half, magic_mask)

    # Subtracting the 0.25 bias leaves mantissa * 2**-24, the value of a
    # half-precision denormal. For non-denormal inputs this lane holds an
    # unused value that the blend below discards.
    magic_bias = Constant.float32x4(0.25)
    VSUBPS(xmm_denorm_nonsign, xmm_denorm_nonsign, magic_bias)

    # 0x00800000 == 1 << 23 is the smallest value with a non-zero exponent
    # field; anything below it is a zero or a denormal.
    xmm_denorm_cutoff = XMMRegister()
    VMOVDQA(xmm_denorm_cutoff, Constant.uint32x4(0x00800000))

    # Lane mask is all-ones where cutoff > value, i.e. exponent field == 0.
    xmm_denorm_mask = XMMRegister()
    VPCMPGTD(xmm_denorm_mask, xmm_denorm_cutoff, xmm_shr3_nonsign)

    # Per lane: take the denormal result where the mask is set, otherwise
    # the normal result.
    xmm_nonsign = XMMRegister()
    VBLENDVPS(xmm_nonsign, xmm_norm_nonsign, xmm_denorm_nonsign, xmm_denorm_mask)

    # Re-attach the sign bit extracted earlier.
    xmm_float = XMMRegister()
    VORPS(xmm_float, xmm_nonsign, xmm_sign)

    return xmm_float