;
; jidctflt.asm - floating-point IDCT (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Helper macros for the second phase of the 4x4 float transpose.
;
; Both wrap SHUFPS, whose result takes its two low lanes from the first
; operand and its two high lanes from the second operand:
;   imm8 0x44 (01 00 01 00b) -> lanes 0,1 of %1 followed by lanes 0,1 of %2
;   imm8 0xEE (11 10 11 10b) -> lanes 2,3 of %1 followed by lanes 2,3 of %2
; i.e. they interleave 64-bit halves rather than single 32-bit elements.
; %1 is overwritten with the result; %2 is left unchanged.

%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    shufps      %1, %2, 0x44
%endmacro

%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
; Constant table.  Each PD_* entry is one 16-byte vector holding the same
; single-precision value in all four lanes, so it can be used directly as a
; packed memory operand (the IDCT code reads them through GOTOFF(ebx, ...)).
; alignz / GLOBAL_DATA / EXTN are not defined in this file - presumably they
; come from jsimdext.inc.
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_idct_float_sse2)

EXTN(jconst_idct_float_sse2):

; sqrt(2); scales (in2 - in6) in the even part and (z11 - z13) in the odd part
PD_1_414        times 4  dd  1.414213562373095048801689
; multiplier applied to (z10 + z12) to form z5
PD_1_847        times 4  dd  1.847759065022573512256366
; multiplier applied to z12
PD_1_082        times 4  dd  1.082392200292393968799446
; multiplier applied to z10 (note the negative sign)
PD_M2_613       times 4  dd -2.613125929752753055713286
; 1.5 * 2^26.  A float of this magnitude has a unit-in-the-last-place of 8,
; so adding it to a sample rounds the sample to a multiple of 8 and leaves
; round(sample / 8) in the low bits of the mantissa, where the row pass
; picks it up with integer mask/shift instructions.
PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
; CENTERJSAMPLE replicated into all 16 bytes; added with paddb after the
; results are packed to bytes.  The value of CENTERJSAMPLE is defined
; outside this file.
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
; Pad the end of the constant section out to a 32-byte boundary.
    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;

; Argument accessors.  'b' must be the stack pointer value captured right
; after "push ebp" in the prologue: [b+0] is the saved ebp, [b+4] the return
; address, and the four 32-bit arguments follow from [b+8].
%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

; Local frame, addressed from the 16-byte-aligned ebp set up by the prologue
; (addresses decrease downwards):
;   [ebp + 0]           pointer to the unaligned frame (the 'b' value above)
;   wk(WK_NUM-1)..wk(0) WK_NUM xmmword scratch slots directly below ebp
;   workspace           DCTSIZE2 FAST_FLOATs directly below wk(0); holds the
;                       pass-1 output that pass 2 reads back
; These are textual %defines expanded at the point of use, so wk(i) may
; refer to WK_NUM even though WK_NUM is defined after it.
%define original_ebp   ebp + 0
%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM         2
%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
76*dfc6aa5cSAndroid Build Coastguard Worker 77*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_idct_float_sse2): 78*dfc6aa5cSAndroid Build Coastguard Worker push ebp 79*dfc6aa5cSAndroid Build Coastguard Worker mov eax, esp ; eax = original ebp 80*dfc6aa5cSAndroid Build Coastguard Worker sub esp, byte 4 81*dfc6aa5cSAndroid Build Coastguard Worker and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits 82*dfc6aa5cSAndroid Build Coastguard Worker mov [esp], eax 83*dfc6aa5cSAndroid Build Coastguard Worker mov ebp, esp ; ebp = aligned ebp 84*dfc6aa5cSAndroid Build Coastguard Worker lea esp, [workspace] 85*dfc6aa5cSAndroid Build Coastguard Worker push ebx 86*dfc6aa5cSAndroid Build Coastguard Worker; push ecx ; need not be preserved 87*dfc6aa5cSAndroid Build Coastguard Worker; push edx ; need not be preserved 88*dfc6aa5cSAndroid Build Coastguard Worker push esi 89*dfc6aa5cSAndroid Build Coastguard Worker push edi 90*dfc6aa5cSAndroid Build Coastguard Worker 91*dfc6aa5cSAndroid Build Coastguard Worker get_GOT ebx ; get GOT address 92*dfc6aa5cSAndroid Build Coastguard Worker 93*dfc6aa5cSAndroid Build Coastguard Worker ; ---- Pass 1: process columns from input, store into work array. 
94*dfc6aa5cSAndroid Build Coastguard Worker 95*dfc6aa5cSAndroid Build Coastguard Worker; mov eax, [original_ebp] 96*dfc6aa5cSAndroid Build Coastguard Worker mov edx, POINTER [dct_table(eax)] ; quantptr 97*dfc6aa5cSAndroid Build Coastguard Worker mov esi, JCOEFPTR [coef_block(eax)] ; inptr 98*dfc6aa5cSAndroid Build Coastguard Worker lea edi, [workspace] ; FAST_FLOAT *wsptr 99*dfc6aa5cSAndroid Build Coastguard Worker mov ecx, DCTSIZE/4 ; ctr 100*dfc6aa5cSAndroid Build Coastguard Worker alignx 16, 7 101*dfc6aa5cSAndroid Build Coastguard Worker.columnloop: 102*dfc6aa5cSAndroid Build Coastguard Worker%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE 103*dfc6aa5cSAndroid Build Coastguard Worker mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] 104*dfc6aa5cSAndroid Build Coastguard Worker or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] 105*dfc6aa5cSAndroid Build Coastguard Worker jnz near .columnDCT 106*dfc6aa5cSAndroid Build Coastguard Worker 107*dfc6aa5cSAndroid Build Coastguard Worker movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] 108*dfc6aa5cSAndroid Build Coastguard Worker movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] 109*dfc6aa5cSAndroid Build Coastguard Worker movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] 110*dfc6aa5cSAndroid Build Coastguard Worker movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] 111*dfc6aa5cSAndroid Build Coastguard Worker movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] 112*dfc6aa5cSAndroid Build Coastguard Worker movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] 113*dfc6aa5cSAndroid Build Coastguard Worker movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] 114*dfc6aa5cSAndroid Build Coastguard Worker por xmm1, xmm2 115*dfc6aa5cSAndroid Build Coastguard Worker por xmm3, xmm4 116*dfc6aa5cSAndroid Build Coastguard Worker por xmm5, xmm6 117*dfc6aa5cSAndroid Build Coastguard Worker por xmm1, xmm3 118*dfc6aa5cSAndroid Build Coastguard Worker por xmm5, xmm7 119*dfc6aa5cSAndroid Build Coastguard Worker por xmm1, xmm5 
120*dfc6aa5cSAndroid Build Coastguard Worker packsswb xmm1, xmm1 121*dfc6aa5cSAndroid Build Coastguard Worker movd eax, xmm1 122*dfc6aa5cSAndroid Build Coastguard Worker test eax, eax 123*dfc6aa5cSAndroid Build Coastguard Worker jnz short .columnDCT 124*dfc6aa5cSAndroid Build Coastguard Worker 125*dfc6aa5cSAndroid Build Coastguard Worker ; -- AC terms all zero 126*dfc6aa5cSAndroid Build Coastguard Worker 127*dfc6aa5cSAndroid Build Coastguard Worker movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] 128*dfc6aa5cSAndroid Build Coastguard Worker 129*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) 130*dfc6aa5cSAndroid Build Coastguard Worker psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) 131*dfc6aa5cSAndroid Build Coastguard Worker cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) 132*dfc6aa5cSAndroid Build Coastguard Worker 133*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 134*dfc6aa5cSAndroid Build Coastguard Worker 135*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, xmm0 136*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm2, xmm0 137*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm0 138*dfc6aa5cSAndroid Build Coastguard Worker 139*dfc6aa5cSAndroid Build Coastguard Worker shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00) 140*dfc6aa5cSAndroid Build Coastguard Worker shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01) 141*dfc6aa5cSAndroid Build Coastguard Worker shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02) 142*dfc6aa5cSAndroid Build Coastguard Worker shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03) 143*dfc6aa5cSAndroid Build Coastguard Worker 144*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 145*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 146*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 
147*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 148*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 149*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 150*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 151*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 152*dfc6aa5cSAndroid Build Coastguard Worker jmp near .nextcolumn 153*dfc6aa5cSAndroid Build Coastguard Worker alignx 16, 7 154*dfc6aa5cSAndroid Build Coastguard Worker%endif 155*dfc6aa5cSAndroid Build Coastguard Worker.columnDCT: 156*dfc6aa5cSAndroid Build Coastguard Worker 157*dfc6aa5cSAndroid Build Coastguard Worker ; -- Even part 158*dfc6aa5cSAndroid Build Coastguard Worker 159*dfc6aa5cSAndroid Build Coastguard Worker movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] 160*dfc6aa5cSAndroid Build Coastguard Worker movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] 161*dfc6aa5cSAndroid Build Coastguard Worker movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] 162*dfc6aa5cSAndroid Build Coastguard Worker movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] 163*dfc6aa5cSAndroid Build Coastguard Worker 164*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) 165*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23) 166*dfc6aa5cSAndroid Build Coastguard Worker psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) 167*dfc6aa5cSAndroid Build Coastguard Worker psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) 168*dfc6aa5cSAndroid Build Coastguard Worker cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) 169*dfc6aa5cSAndroid Build Coastguard Worker cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23) 170*dfc6aa5cSAndroid Build Coastguard Worker 171*dfc6aa5cSAndroid Build 
Coastguard Worker punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43) 172*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63) 173*dfc6aa5cSAndroid Build Coastguard Worker psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) 174*dfc6aa5cSAndroid Build Coastguard Worker psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) 175*dfc6aa5cSAndroid Build Coastguard Worker cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43) 176*dfc6aa5cSAndroid Build Coastguard Worker cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63) 177*dfc6aa5cSAndroid Build Coastguard Worker 178*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 179*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 180*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 181*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 182*dfc6aa5cSAndroid Build Coastguard Worker 183*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm4, xmm0 184*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, xmm1 185*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm2 ; xmm0=tmp11 186*dfc6aa5cSAndroid Build Coastguard Worker subps xmm1, xmm3 187*dfc6aa5cSAndroid Build Coastguard Worker addps xmm4, xmm2 ; xmm4=tmp10 188*dfc6aa5cSAndroid Build Coastguard Worker addps xmm5, xmm3 ; xmm5=tmp13 189*dfc6aa5cSAndroid Build Coastguard Worker 190*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm1, [GOTOFF(ebx,PD_1_414)] 191*dfc6aa5cSAndroid Build Coastguard Worker subps xmm1, xmm5 ; xmm1=tmp12 192*dfc6aa5cSAndroid Build Coastguard Worker 193*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm6, xmm4 194*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm7, xmm0 195*dfc6aa5cSAndroid Build Coastguard Worker subps xmm4, xmm5 ; xmm4=tmp3 196*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm1 
; xmm0=tmp2 197*dfc6aa5cSAndroid Build Coastguard Worker addps xmm6, xmm5 ; xmm6=tmp0 198*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm1 ; xmm7=tmp1 199*dfc6aa5cSAndroid Build Coastguard Worker 200*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(1)], xmm4 ; tmp3 201*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(0)], xmm0 ; tmp2 202*dfc6aa5cSAndroid Build Coastguard Worker 203*dfc6aa5cSAndroid Build Coastguard Worker ; -- Odd part 204*dfc6aa5cSAndroid Build Coastguard Worker 205*dfc6aa5cSAndroid Build Coastguard Worker movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] 206*dfc6aa5cSAndroid Build Coastguard Worker movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] 207*dfc6aa5cSAndroid Build Coastguard Worker movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] 208*dfc6aa5cSAndroid Build Coastguard Worker movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] 209*dfc6aa5cSAndroid Build Coastguard Worker 210*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13) 211*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33) 212*dfc6aa5cSAndroid Build Coastguard Worker psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) 213*dfc6aa5cSAndroid Build Coastguard Worker psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) 214*dfc6aa5cSAndroid Build Coastguard Worker cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13) 215*dfc6aa5cSAndroid Build Coastguard Worker cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33) 216*dfc6aa5cSAndroid Build Coastguard Worker 217*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53) 218*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73) 219*dfc6aa5cSAndroid Build Coastguard Worker psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) 220*dfc6aa5cSAndroid Build Coastguard Worker psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) 
221*dfc6aa5cSAndroid Build Coastguard Worker cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53) 222*dfc6aa5cSAndroid Build Coastguard Worker cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73) 223*dfc6aa5cSAndroid Build Coastguard Worker 224*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 225*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 226*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 227*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 228*dfc6aa5cSAndroid Build Coastguard Worker 229*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm4, xmm2 230*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm5 231*dfc6aa5cSAndroid Build Coastguard Worker addps xmm2, xmm1 ; xmm2=z11 232*dfc6aa5cSAndroid Build Coastguard Worker addps xmm5, xmm3 ; xmm5=z13 233*dfc6aa5cSAndroid Build Coastguard Worker subps xmm4, xmm1 ; xmm4=z12 234*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm3 ; xmm0=z10 235*dfc6aa5cSAndroid Build Coastguard Worker 236*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, xmm2 237*dfc6aa5cSAndroid Build Coastguard Worker subps xmm2, xmm5 238*dfc6aa5cSAndroid Build Coastguard Worker addps xmm1, xmm5 ; xmm1=tmp7 239*dfc6aa5cSAndroid Build Coastguard Worker 240*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 241*dfc6aa5cSAndroid Build Coastguard Worker 242*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm0 243*dfc6aa5cSAndroid Build Coastguard Worker addps xmm0, xmm4 244*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 245*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) 246*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) 247*dfc6aa5cSAndroid 
Build Coastguard Worker addps xmm3, xmm0 ; xmm3=tmp12 248*dfc6aa5cSAndroid Build Coastguard Worker subps xmm4, xmm0 ; xmm4=tmp10 249*dfc6aa5cSAndroid Build Coastguard Worker 250*dfc6aa5cSAndroid Build Coastguard Worker ; -- Final output stage 251*dfc6aa5cSAndroid Build Coastguard Worker 252*dfc6aa5cSAndroid Build Coastguard Worker subps xmm3, xmm1 ; xmm3=tmp6 253*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, xmm6 254*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm7 255*dfc6aa5cSAndroid Build Coastguard Worker addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03) 256*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13) 257*dfc6aa5cSAndroid Build Coastguard Worker subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73) 258*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63) 259*dfc6aa5cSAndroid Build Coastguard Worker subps xmm2, xmm3 ; xmm2=tmp5 260*dfc6aa5cSAndroid Build Coastguard Worker 261*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, xmm6 ; transpose coefficients(phase 1) 262*dfc6aa5cSAndroid Build Coastguard Worker unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) 263*dfc6aa5cSAndroid Build Coastguard Worker unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13) 264*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm0 ; transpose coefficients(phase 1) 265*dfc6aa5cSAndroid Build Coastguard Worker unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71) 266*dfc6aa5cSAndroid Build Coastguard Worker unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73) 267*dfc6aa5cSAndroid Build Coastguard Worker 268*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 269*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 270*dfc6aa5cSAndroid Build Coastguard Worker 271*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) 272*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) 273*dfc6aa5cSAndroid Build Coastguard Worker 
274*dfc6aa5cSAndroid Build Coastguard Worker addps xmm4, xmm2 ; xmm4=tmp4 275*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm7 276*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm5 277*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23) 278*dfc6aa5cSAndroid Build Coastguard Worker addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43) 279*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53) 280*dfc6aa5cSAndroid Build Coastguard Worker subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33) 281*dfc6aa5cSAndroid Build Coastguard Worker 282*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm2, xmm7 ; transpose coefficients(phase 1) 283*dfc6aa5cSAndroid Build Coastguard Worker unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31) 284*dfc6aa5cSAndroid Build Coastguard Worker unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33) 285*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm4, xmm5 ; transpose coefficients(phase 1) 286*dfc6aa5cSAndroid Build Coastguard Worker unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51) 287*dfc6aa5cSAndroid Build Coastguard Worker unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) 288*dfc6aa5cSAndroid Build Coastguard Worker 289*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm6 ; transpose coefficients(phase 2) 290*dfc6aa5cSAndroid Build Coastguard Worker unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) 291*dfc6aa5cSAndroid Build Coastguard Worker unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) 292*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm1 ; transpose coefficients(phase 2) 293*dfc6aa5cSAndroid Build Coastguard Worker unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) 294*dfc6aa5cSAndroid Build Coastguard Worker unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) 295*dfc6aa5cSAndroid Build Coastguard Worker 296*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) 297*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) 298*dfc6aa5cSAndroid Build Coastguard 
Worker 299*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 300*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 301*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 302*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 303*dfc6aa5cSAndroid Build Coastguard Worker 304*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm6, xmm5 ; transpose coefficients(phase 2) 305*dfc6aa5cSAndroid Build Coastguard Worker unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) 306*dfc6aa5cSAndroid Build Coastguard Worker unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) 307*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm4 ; transpose coefficients(phase 2) 308*dfc6aa5cSAndroid Build Coastguard Worker unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) 309*dfc6aa5cSAndroid Build Coastguard Worker unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) 310*dfc6aa5cSAndroid Build Coastguard Worker 311*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 312*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 313*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 314*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 315*dfc6aa5cSAndroid Build Coastguard Worker 316*dfc6aa5cSAndroid Build Coastguard Worker.nextcolumn: 317*dfc6aa5cSAndroid Build Coastguard Worker add esi, byte 4*SIZEOF_JCOEF ; coef_block 318*dfc6aa5cSAndroid Build Coastguard Worker add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr 319*dfc6aa5cSAndroid Build Coastguard Worker add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr 320*dfc6aa5cSAndroid Build Coastguard Worker dec ecx ; ctr 321*dfc6aa5cSAndroid Build Coastguard Worker jnz near .columnloop 322*dfc6aa5cSAndroid Build 
Coastguard Worker 323*dfc6aa5cSAndroid Build Coastguard Worker ; -- Prefetch the next coefficient block 324*dfc6aa5cSAndroid Build Coastguard Worker 325*dfc6aa5cSAndroid Build Coastguard Worker prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] 326*dfc6aa5cSAndroid Build Coastguard Worker prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] 327*dfc6aa5cSAndroid Build Coastguard Worker prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] 328*dfc6aa5cSAndroid Build Coastguard Worker prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] 329*dfc6aa5cSAndroid Build Coastguard Worker 330*dfc6aa5cSAndroid Build Coastguard Worker ; ---- Pass 2: process rows from work array, store into output array. 331*dfc6aa5cSAndroid Build Coastguard Worker 332*dfc6aa5cSAndroid Build Coastguard Worker mov eax, [original_ebp] 333*dfc6aa5cSAndroid Build Coastguard Worker lea esi, [workspace] ; FAST_FLOAT *wsptr 334*dfc6aa5cSAndroid Build Coastguard Worker mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) 335*dfc6aa5cSAndroid Build Coastguard Worker mov eax, JDIMENSION [output_col(eax)] 336*dfc6aa5cSAndroid Build Coastguard Worker mov ecx, DCTSIZE/4 ; ctr 337*dfc6aa5cSAndroid Build Coastguard Worker alignx 16, 7 338*dfc6aa5cSAndroid Build Coastguard Worker.rowloop: 339*dfc6aa5cSAndroid Build Coastguard Worker 340*dfc6aa5cSAndroid Build Coastguard Worker ; -- Even part 341*dfc6aa5cSAndroid Build Coastguard Worker 342*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] 343*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] 344*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] 345*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] 346*dfc6aa5cSAndroid Build Coastguard Worker 347*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm4, xmm0 348*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, 
xmm1 349*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm2 ; xmm0=tmp11 350*dfc6aa5cSAndroid Build Coastguard Worker subps xmm1, xmm3 351*dfc6aa5cSAndroid Build Coastguard Worker addps xmm4, xmm2 ; xmm4=tmp10 352*dfc6aa5cSAndroid Build Coastguard Worker addps xmm5, xmm3 ; xmm5=tmp13 353*dfc6aa5cSAndroid Build Coastguard Worker 354*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm1, [GOTOFF(ebx,PD_1_414)] 355*dfc6aa5cSAndroid Build Coastguard Worker subps xmm1, xmm5 ; xmm1=tmp12 356*dfc6aa5cSAndroid Build Coastguard Worker 357*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm6, xmm4 358*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm7, xmm0 359*dfc6aa5cSAndroid Build Coastguard Worker subps xmm4, xmm5 ; xmm4=tmp3 360*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm1 ; xmm0=tmp2 361*dfc6aa5cSAndroid Build Coastguard Worker addps xmm6, xmm5 ; xmm6=tmp0 362*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm1 ; xmm7=tmp1 363*dfc6aa5cSAndroid Build Coastguard Worker 364*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(1)], xmm4 ; tmp3 365*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(0)], xmm0 ; tmp2 366*dfc6aa5cSAndroid Build Coastguard Worker 367*dfc6aa5cSAndroid Build Coastguard Worker ; -- Odd part 368*dfc6aa5cSAndroid Build Coastguard Worker 369*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] 370*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] 371*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] 372*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] 373*dfc6aa5cSAndroid Build Coastguard Worker 374*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm4, xmm2 375*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm5 376*dfc6aa5cSAndroid Build Coastguard Worker addps xmm2, xmm1 ; xmm2=z11 377*dfc6aa5cSAndroid 
Build Coastguard Worker addps xmm5, xmm3 ; xmm5=z13 378*dfc6aa5cSAndroid Build Coastguard Worker subps xmm4, xmm1 ; xmm4=z12 379*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm3 ; xmm0=z10 380*dfc6aa5cSAndroid Build Coastguard Worker 381*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, xmm2 382*dfc6aa5cSAndroid Build Coastguard Worker subps xmm2, xmm5 383*dfc6aa5cSAndroid Build Coastguard Worker addps xmm1, xmm5 ; xmm1=tmp7 384*dfc6aa5cSAndroid Build Coastguard Worker 385*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 386*dfc6aa5cSAndroid Build Coastguard Worker 387*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm0 388*dfc6aa5cSAndroid Build Coastguard Worker addps xmm0, xmm4 389*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 390*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) 391*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) 392*dfc6aa5cSAndroid Build Coastguard Worker addps xmm3, xmm0 ; xmm3=tmp12 393*dfc6aa5cSAndroid Build Coastguard Worker subps xmm4, xmm0 ; xmm4=tmp10 394*dfc6aa5cSAndroid Build Coastguard Worker 395*dfc6aa5cSAndroid Build Coastguard Worker ; -- Final output stage 396*dfc6aa5cSAndroid Build Coastguard Worker 397*dfc6aa5cSAndroid Build Coastguard Worker subps xmm3, xmm1 ; xmm3=tmp6 398*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, xmm6 399*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm7 400*dfc6aa5cSAndroid Build Coastguard Worker addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30) 401*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31) 402*dfc6aa5cSAndroid Build Coastguard Worker subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37) 403*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36) 404*dfc6aa5cSAndroid Build Coastguard Worker subps xmm2, xmm3 ; xmm2=tmp5 
405*dfc6aa5cSAndroid Build Coastguard Worker 406*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] 407*dfc6aa5cSAndroid Build Coastguard Worker pcmpeqd xmm3, xmm3 408*dfc6aa5cSAndroid Build Coastguard Worker psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} 409*dfc6aa5cSAndroid Build Coastguard Worker 410*dfc6aa5cSAndroid Build Coastguard Worker addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) 411*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) 412*dfc6aa5cSAndroid Build Coastguard Worker addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) 413*dfc6aa5cSAndroid Build Coastguard Worker addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) 414*dfc6aa5cSAndroid Build Coastguard Worker 415*dfc6aa5cSAndroid Build Coastguard Worker pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) 416*dfc6aa5cSAndroid Build Coastguard Worker pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) 417*dfc6aa5cSAndroid Build Coastguard Worker pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) 418*dfc6aa5cSAndroid Build Coastguard Worker pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) 419*dfc6aa5cSAndroid Build Coastguard Worker por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31) 420*dfc6aa5cSAndroid Build Coastguard Worker por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37) 421*dfc6aa5cSAndroid Build Coastguard Worker 422*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 423*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 424*dfc6aa5cSAndroid Build Coastguard Worker 425*dfc6aa5cSAndroid Build Coastguard Worker addps xmm4, xmm2 ; xmm4=tmp4 426*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm7, xmm1 427*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, xmm3 428*dfc6aa5cSAndroid Build Coastguard Worker addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32) 
429*dfc6aa5cSAndroid Build Coastguard Worker addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34) 430*dfc6aa5cSAndroid Build Coastguard Worker subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35) 431*dfc6aa5cSAndroid Build Coastguard Worker subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33) 432*dfc6aa5cSAndroid Build Coastguard Worker 433*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] 434*dfc6aa5cSAndroid Build Coastguard Worker pcmpeqd xmm4, xmm4 435*dfc6aa5cSAndroid Build Coastguard Worker psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} 436*dfc6aa5cSAndroid Build Coastguard Worker 437*dfc6aa5cSAndroid Build Coastguard Worker addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) 438*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) 439*dfc6aa5cSAndroid Build Coastguard Worker addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) 440*dfc6aa5cSAndroid Build Coastguard Worker addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) 441*dfc6aa5cSAndroid Build Coastguard Worker 442*dfc6aa5cSAndroid Build Coastguard Worker pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) 443*dfc6aa5cSAndroid Build Coastguard Worker pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) 444*dfc6aa5cSAndroid Build Coastguard Worker pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) 445*dfc6aa5cSAndroid Build Coastguard Worker pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) 446*dfc6aa5cSAndroid Build Coastguard Worker por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35) 447*dfc6aa5cSAndroid Build Coastguard Worker por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33) 448*dfc6aa5cSAndroid Build Coastguard Worker 449*dfc6aa5cSAndroid Build Coastguard Worker movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] 450*dfc6aa5cSAndroid Build Coastguard Worker 451*dfc6aa5cSAndroid Build Coastguard Worker packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 
21 30 31 04 05 14 15 24 25 34 35) 452*dfc6aa5cSAndroid Build Coastguard Worker packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) 453*dfc6aa5cSAndroid Build Coastguard Worker paddb xmm6, xmm2 454*dfc6aa5cSAndroid Build Coastguard Worker paddb xmm1, xmm2 455*dfc6aa5cSAndroid Build Coastguard Worker 456*dfc6aa5cSAndroid Build Coastguard Worker movdqa xmm4, xmm6 ; transpose coefficients(phase 2) 457*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) 458*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) 459*dfc6aa5cSAndroid Build Coastguard Worker 460*dfc6aa5cSAndroid Build Coastguard Worker movdqa xmm7, xmm6 ; transpose coefficients(phase 3) 461*dfc6aa5cSAndroid Build Coastguard Worker punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) 462*dfc6aa5cSAndroid Build Coastguard Worker punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) 463*dfc6aa5cSAndroid Build Coastguard Worker 464*dfc6aa5cSAndroid Build Coastguard Worker pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) 465*dfc6aa5cSAndroid Build Coastguard Worker pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) 466*dfc6aa5cSAndroid Build Coastguard Worker 467*dfc6aa5cSAndroid Build Coastguard Worker pushpic ebx ; save GOT address 468*dfc6aa5cSAndroid Build Coastguard Worker 469*dfc6aa5cSAndroid Build Coastguard Worker mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] 470*dfc6aa5cSAndroid Build Coastguard Worker mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] 471*dfc6aa5cSAndroid Build Coastguard Worker movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 472*dfc6aa5cSAndroid Build Coastguard Worker movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 473*dfc6aa5cSAndroid Build Coastguard Worker mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] 474*dfc6aa5cSAndroid Build 
Coastguard Worker mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] 475*dfc6aa5cSAndroid Build Coastguard Worker movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 476*dfc6aa5cSAndroid Build Coastguard Worker movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 477*dfc6aa5cSAndroid Build Coastguard Worker 478*dfc6aa5cSAndroid Build Coastguard Worker poppic ebx ; restore GOT address 479*dfc6aa5cSAndroid Build Coastguard Worker 480*dfc6aa5cSAndroid Build Coastguard Worker add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr 481*dfc6aa5cSAndroid Build Coastguard Worker add edi, byte 4*SIZEOF_JSAMPROW 482*dfc6aa5cSAndroid Build Coastguard Worker dec ecx ; ctr 483*dfc6aa5cSAndroid Build Coastguard Worker jnz near .rowloop 484*dfc6aa5cSAndroid Build Coastguard Worker 485*dfc6aa5cSAndroid Build Coastguard Worker pop edi 486*dfc6aa5cSAndroid Build Coastguard Worker pop esi 487*dfc6aa5cSAndroid Build Coastguard Worker; pop edx ; need not be preserved 488*dfc6aa5cSAndroid Build Coastguard Worker; pop ecx ; need not be preserved 489*dfc6aa5cSAndroid Build Coastguard Worker pop ebx 490*dfc6aa5cSAndroid Build Coastguard Worker mov esp, ebp ; esp <- aligned ebp 491*dfc6aa5cSAndroid Build Coastguard Worker pop esp ; esp <- original ebp 492*dfc6aa5cSAndroid Build Coastguard Worker pop ebp 493*dfc6aa5cSAndroid Build Coastguard Worker ret 494*dfc6aa5cSAndroid Build Coastguard Worker 495*dfc6aa5cSAndroid Build Coastguard Worker; For some reason, the OS X linker does not honor the request to align the 496*dfc6aa5cSAndroid Build Coastguard Worker; segment unless we do this. 497*dfc6aa5cSAndroid Build Coastguard Worker align 32 498