;
; jidctflt.asm - floating-point IDCT (SSE & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see jidctflt.c for more details.
%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; Shuffle helpers used by "transpose coefficients(phase 2)" in the column
; pass.  Each one merges two 4-float registers into the first operand:
; unpcklps2 keeps the low pair of %1 and appends the low pair of %2,
; unpckhps2 does the same with the two high pairs.
%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    shufps      %1, %2, 0x44
%endmacro

%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

; Constant table exported as jconst_idct_float_sse.  Every PD_* entry holds
; the same single-precision value four times so it can be used directly as
; a packed operand of mulps/movaps.  The function addresses the entries
; through GOTOFF(ebx, ...) after get_GOT has set up ebx.
    alignz      32
    GLOBAL_DATA(jconst_idct_float_sse)

EXTN(jconst_idct_float_sse):

; PD_1_414  : scales (in2 - in6) in the even part and (z11 - z13) in the
;             odd part (the latter gives tmp11)
; PD_1_847  : scales (z10 + z12) to give z5
; PD_1_082  : scales z12 before z5 is subtracted (gives tmp10)
; PD_M2_613 : scales z10 before z5 is added (gives tmp12)
; PD_0_125  : final 1/8 descale applied in pass 2
; PB_CENTERJSAMP : eight bytes of CENTERJSAMPLE; its use is outside the
;             visible part of the function -- presumably the level shift
;             applied to the packed output samples (TODO confirm)
PD_1_414        times 4 dd  1.414213562373095048801689
PD_1_847        times 4 dd  1.847759065022573512256366
PD_1_082        times 4 dd  1.082392200292393968799446
PD_M2_613       times 4 dd -2.613125929752753055713286
PD_0_125        times 4 dd  0.125        ; 1/8
PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block,
;                      JSAMPARRAY output_buf, JDIMENSION output_col)
;

; Argument accessors.  (b) is the stack pointer value captured immediately
; after `push ebp` in the prologue, so [b+0] is the saved ebp, [b+4] is the
; return address and the four arguments follow at 4-byte intervals.
%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

; Local frame layout, relative to the 16-byte-aligned ebp set up by the
; prologue:
;   [original_ebp] : slot where the prologue stores the pre-alignment stack
;                    pointer (reloaded into eax before pass 2)
;   wk(0..WK_NUM-1): xmmword scratch slots directly below the aligned ebp
;   workspace      : DCTSIZE2 FAST_FLOATs directly below wk(0); pass 1
;                    writes it and pass 2 reads it
; WK_NUM is defined after wk(i) on purpose-agnostic grounds: %define bodies
; are expanded where the macro is used, so the later definition is visible
; by the time wk() is invoked in the function body.
%define original_ebp   ebp + 0
%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM         2
%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_sse)

77*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_idct_float_sse): 78*dfc6aa5cSAndroid Build Coastguard Worker push ebp 79*dfc6aa5cSAndroid Build Coastguard Worker mov eax, esp ; eax = original ebp 80*dfc6aa5cSAndroid Build Coastguard Worker sub esp, byte 4 81*dfc6aa5cSAndroid Build Coastguard Worker and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits 82*dfc6aa5cSAndroid Build Coastguard Worker mov [esp], eax 83*dfc6aa5cSAndroid Build Coastguard Worker mov ebp, esp ; ebp = aligned ebp 84*dfc6aa5cSAndroid Build Coastguard Worker lea esp, [workspace] 85*dfc6aa5cSAndroid Build Coastguard Worker push ebx 86*dfc6aa5cSAndroid Build Coastguard Worker; push ecx ; need not be preserved 87*dfc6aa5cSAndroid Build Coastguard Worker; push edx ; need not be preserved 88*dfc6aa5cSAndroid Build Coastguard Worker push esi 89*dfc6aa5cSAndroid Build Coastguard Worker push edi 90*dfc6aa5cSAndroid Build Coastguard Worker 91*dfc6aa5cSAndroid Build Coastguard Worker get_GOT ebx ; get GOT address 92*dfc6aa5cSAndroid Build Coastguard Worker 93*dfc6aa5cSAndroid Build Coastguard Worker ; ---- Pass 1: process columns from input, store into work array. 
94*dfc6aa5cSAndroid Build Coastguard Worker 95*dfc6aa5cSAndroid Build Coastguard Worker; mov eax, [original_ebp] 96*dfc6aa5cSAndroid Build Coastguard Worker mov edx, POINTER [dct_table(eax)] ; quantptr 97*dfc6aa5cSAndroid Build Coastguard Worker mov esi, JCOEFPTR [coef_block(eax)] ; inptr 98*dfc6aa5cSAndroid Build Coastguard Worker lea edi, [workspace] ; FAST_FLOAT *wsptr 99*dfc6aa5cSAndroid Build Coastguard Worker mov ecx, DCTSIZE/4 ; ctr 100*dfc6aa5cSAndroid Build Coastguard Worker alignx 16, 7 101*dfc6aa5cSAndroid Build Coastguard Worker.columnloop: 102*dfc6aa5cSAndroid Build Coastguard Worker%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE 103*dfc6aa5cSAndroid Build Coastguard Worker mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] 104*dfc6aa5cSAndroid Build Coastguard Worker or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] 105*dfc6aa5cSAndroid Build Coastguard Worker jnz near .columnDCT 106*dfc6aa5cSAndroid Build Coastguard Worker 107*dfc6aa5cSAndroid Build Coastguard Worker movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] 108*dfc6aa5cSAndroid Build Coastguard Worker movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] 109*dfc6aa5cSAndroid Build Coastguard Worker por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] 110*dfc6aa5cSAndroid Build Coastguard Worker por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] 111*dfc6aa5cSAndroid Build Coastguard Worker por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] 112*dfc6aa5cSAndroid Build Coastguard Worker por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] 113*dfc6aa5cSAndroid Build Coastguard Worker por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] 114*dfc6aa5cSAndroid Build Coastguard Worker por mm1, mm0 115*dfc6aa5cSAndroid Build Coastguard Worker packsswb mm1, mm1 116*dfc6aa5cSAndroid Build Coastguard Worker movd eax, mm1 117*dfc6aa5cSAndroid Build Coastguard Worker test eax, eax 118*dfc6aa5cSAndroid Build Coastguard Worker jnz short .columnDCT 119*dfc6aa5cSAndroid Build Coastguard Worker 120*dfc6aa5cSAndroid Build Coastguard Worker ; -- AC terms all 
zero 121*dfc6aa5cSAndroid Build Coastguard Worker 122*dfc6aa5cSAndroid Build Coastguard Worker movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] 123*dfc6aa5cSAndroid Build Coastguard Worker 124*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd mm1, mm0 ; mm1=(** 02 ** 03) 125*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd mm0, mm0 ; mm0=(00 00 01 01) 126*dfc6aa5cSAndroid Build Coastguard Worker psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03) 127*dfc6aa5cSAndroid Build Coastguard Worker psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) 128*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm3, mm1 ; xmm3=(02 03 ** **) 129*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **) 130*dfc6aa5cSAndroid Build Coastguard Worker movlhps xmm0, xmm3 ; xmm0=in0=(00 01 02 03) 131*dfc6aa5cSAndroid Build Coastguard Worker 132*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 133*dfc6aa5cSAndroid Build Coastguard Worker 134*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, xmm0 135*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm2, xmm0 136*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm0 137*dfc6aa5cSAndroid Build Coastguard Worker 138*dfc6aa5cSAndroid Build Coastguard Worker shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00) 139*dfc6aa5cSAndroid Build Coastguard Worker shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01) 140*dfc6aa5cSAndroid Build Coastguard Worker shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02) 141*dfc6aa5cSAndroid Build Coastguard Worker shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03) 142*dfc6aa5cSAndroid Build Coastguard Worker 143*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 144*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 145*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 146*dfc6aa5cSAndroid Build Coastguard 
Worker movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 147*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 148*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 149*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 150*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 151*dfc6aa5cSAndroid Build Coastguard Worker jmp near .nextcolumn 152*dfc6aa5cSAndroid Build Coastguard Worker alignx 16, 7 153*dfc6aa5cSAndroid Build Coastguard Worker%endif 154*dfc6aa5cSAndroid Build Coastguard Worker.columnDCT: 155*dfc6aa5cSAndroid Build Coastguard Worker 156*dfc6aa5cSAndroid Build Coastguard Worker ; -- Even part 157*dfc6aa5cSAndroid Build Coastguard Worker 158*dfc6aa5cSAndroid Build Coastguard Worker movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] 159*dfc6aa5cSAndroid Build Coastguard Worker movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] 160*dfc6aa5cSAndroid Build Coastguard Worker movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] 161*dfc6aa5cSAndroid Build Coastguard Worker movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] 162*dfc6aa5cSAndroid Build Coastguard Worker 163*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd mm4, mm0 ; mm4=(** 02 ** 03) 164*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd mm0, mm0 ; mm0=(00 00 01 01) 165*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd mm5, mm1 ; mm5=(** 22 ** 23) 166*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd mm1, mm1 ; mm1=(20 20 21 21) 167*dfc6aa5cSAndroid Build Coastguard Worker 168*dfc6aa5cSAndroid Build Coastguard Worker psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03) 169*dfc6aa5cSAndroid Build Coastguard Worker psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) 170*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm4, mm4 ; xmm4=(02 03 ** **) 171*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps 
xmm0, mm0 ; xmm0=(00 01 ** **) 172*dfc6aa5cSAndroid Build Coastguard Worker psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23) 173*dfc6aa5cSAndroid Build Coastguard Worker psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21) 174*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm5, mm5 ; xmm5=(22 23 ** **) 175*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm1, mm1 ; xmm1=(20 21 ** **) 176*dfc6aa5cSAndroid Build Coastguard Worker 177*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd mm6, mm2 ; mm6=(** 42 ** 43) 178*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd mm2, mm2 ; mm2=(40 40 41 41) 179*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd mm7, mm3 ; mm7=(** 62 ** 63) 180*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd mm3, mm3 ; mm3=(60 60 61 61) 181*dfc6aa5cSAndroid Build Coastguard Worker 182*dfc6aa5cSAndroid Build Coastguard Worker psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43) 183*dfc6aa5cSAndroid Build Coastguard Worker psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41) 184*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm6, mm6 ; xmm6=(42 43 ** **) 185*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm2, mm2 ; xmm2=(40 41 ** **) 186*dfc6aa5cSAndroid Build Coastguard Worker psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63) 187*dfc6aa5cSAndroid Build Coastguard Worker psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61) 188*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm7, mm7 ; xmm7=(62 63 ** **) 189*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm3, mm3 ; xmm3=(60 61 ** **) 190*dfc6aa5cSAndroid Build Coastguard Worker 191*dfc6aa5cSAndroid Build Coastguard Worker movlhps xmm0, xmm4 ; xmm0=in0=(00 01 02 03) 192*dfc6aa5cSAndroid Build Coastguard Worker movlhps xmm1, xmm5 ; xmm1=in2=(20 21 22 23) 193*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 194*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 
195*dfc6aa5cSAndroid Build Coastguard Worker 196*dfc6aa5cSAndroid Build Coastguard Worker movlhps xmm2, xmm6 ; xmm2=in4=(40 41 42 43) 197*dfc6aa5cSAndroid Build Coastguard Worker movlhps xmm3, xmm7 ; xmm3=in6=(60 61 62 63) 198*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 199*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 200*dfc6aa5cSAndroid Build Coastguard Worker 201*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm4, xmm0 202*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, xmm1 203*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm2 ; xmm0=tmp11 204*dfc6aa5cSAndroid Build Coastguard Worker subps xmm1, xmm3 205*dfc6aa5cSAndroid Build Coastguard Worker addps xmm4, xmm2 ; xmm4=tmp10 206*dfc6aa5cSAndroid Build Coastguard Worker addps xmm5, xmm3 ; xmm5=tmp13 207*dfc6aa5cSAndroid Build Coastguard Worker 208*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm1, [GOTOFF(ebx,PD_1_414)] 209*dfc6aa5cSAndroid Build Coastguard Worker subps xmm1, xmm5 ; xmm1=tmp12 210*dfc6aa5cSAndroid Build Coastguard Worker 211*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm6, xmm4 212*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm7, xmm0 213*dfc6aa5cSAndroid Build Coastguard Worker subps xmm4, xmm5 ; xmm4=tmp3 214*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm1 ; xmm0=tmp2 215*dfc6aa5cSAndroid Build Coastguard Worker addps xmm6, xmm5 ; xmm6=tmp0 216*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm1 ; xmm7=tmp1 217*dfc6aa5cSAndroid Build Coastguard Worker 218*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(1)], xmm4 ; tmp3 219*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(0)], xmm0 ; tmp2 220*dfc6aa5cSAndroid Build Coastguard Worker 221*dfc6aa5cSAndroid Build Coastguard Worker ; -- Odd part 222*dfc6aa5cSAndroid Build Coastguard Worker 223*dfc6aa5cSAndroid Build Coastguard Worker movq mm4, MMWORD 
[MMBLOCK(1,0,esi,SIZEOF_JCOEF)] 224*dfc6aa5cSAndroid Build Coastguard Worker movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] 225*dfc6aa5cSAndroid Build Coastguard Worker movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] 226*dfc6aa5cSAndroid Build Coastguard Worker movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] 227*dfc6aa5cSAndroid Build Coastguard Worker 228*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd mm6, mm4 ; mm6=(** 12 ** 13) 229*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd mm4, mm4 ; mm4=(10 10 11 11) 230*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd mm2, mm0 ; mm2=(** 32 ** 33) 231*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd mm0, mm0 ; mm0=(30 30 31 31) 232*dfc6aa5cSAndroid Build Coastguard Worker 233*dfc6aa5cSAndroid Build Coastguard Worker psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13) 234*dfc6aa5cSAndroid Build Coastguard Worker psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11) 235*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm4, mm6 ; xmm4=(12 13 ** **) 236*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm2, mm4 ; xmm2=(10 11 ** **) 237*dfc6aa5cSAndroid Build Coastguard Worker psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33) 238*dfc6aa5cSAndroid Build Coastguard Worker psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31) 239*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm0, mm2 ; xmm0=(32 33 ** **) 240*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm3, mm0 ; xmm3=(30 31 ** **) 241*dfc6aa5cSAndroid Build Coastguard Worker 242*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd mm7, mm5 ; mm7=(** 52 ** 53) 243*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd mm5, mm5 ; mm5=(50 50 51 51) 244*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd mm3, mm1 ; mm3=(** 72 ** 73) 245*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd mm1, mm1 ; mm1=(70 70 71 71) 246*dfc6aa5cSAndroid Build Coastguard Worker 247*dfc6aa5cSAndroid Build Coastguard Worker movlhps xmm2, xmm4 ; xmm2=in1=(10 11 12 13) 248*dfc6aa5cSAndroid 
Build Coastguard Worker movlhps xmm3, xmm0 ; xmm3=in3=(30 31 32 33) 249*dfc6aa5cSAndroid Build Coastguard Worker 250*dfc6aa5cSAndroid Build Coastguard Worker psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53) 251*dfc6aa5cSAndroid Build Coastguard Worker psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51) 252*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm4, mm7 ; xmm4=(52 53 ** **) 253*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm5, mm5 ; xmm5=(50 51 ** **) 254*dfc6aa5cSAndroid Build Coastguard Worker psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73) 255*dfc6aa5cSAndroid Build Coastguard Worker psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71) 256*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm0, mm3 ; xmm0=(72 73 ** **) 257*dfc6aa5cSAndroid Build Coastguard Worker cvtpi2ps xmm1, mm1 ; xmm1=(70 71 ** **) 258*dfc6aa5cSAndroid Build Coastguard Worker 259*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 260*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 261*dfc6aa5cSAndroid Build Coastguard Worker 262*dfc6aa5cSAndroid Build Coastguard Worker movlhps xmm5, xmm4 ; xmm5=in5=(50 51 52 53) 263*dfc6aa5cSAndroid Build Coastguard Worker movlhps xmm1, xmm0 ; xmm1=in7=(70 71 72 73) 264*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 265*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 266*dfc6aa5cSAndroid Build Coastguard Worker 267*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm4, xmm2 268*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm5 269*dfc6aa5cSAndroid Build Coastguard Worker addps xmm2, xmm1 ; xmm2=z11 270*dfc6aa5cSAndroid Build Coastguard Worker addps xmm5, xmm3 ; xmm5=z13 271*dfc6aa5cSAndroid Build Coastguard Worker subps xmm4, xmm1 ; xmm4=z12 272*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm3 ; xmm0=z10 
273*dfc6aa5cSAndroid Build Coastguard Worker 274*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, xmm2 275*dfc6aa5cSAndroid Build Coastguard Worker subps xmm2, xmm5 276*dfc6aa5cSAndroid Build Coastguard Worker addps xmm1, xmm5 ; xmm1=tmp7 277*dfc6aa5cSAndroid Build Coastguard Worker 278*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 279*dfc6aa5cSAndroid Build Coastguard Worker 280*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm0 281*dfc6aa5cSAndroid Build Coastguard Worker addps xmm0, xmm4 282*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 283*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) 284*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) 285*dfc6aa5cSAndroid Build Coastguard Worker addps xmm3, xmm0 ; xmm3=tmp12 286*dfc6aa5cSAndroid Build Coastguard Worker subps xmm4, xmm0 ; xmm4=tmp10 287*dfc6aa5cSAndroid Build Coastguard Worker 288*dfc6aa5cSAndroid Build Coastguard Worker ; -- Final output stage 289*dfc6aa5cSAndroid Build Coastguard Worker 290*dfc6aa5cSAndroid Build Coastguard Worker subps xmm3, xmm1 ; xmm3=tmp6 291*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, xmm6 292*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm7 293*dfc6aa5cSAndroid Build Coastguard Worker addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03) 294*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13) 295*dfc6aa5cSAndroid Build Coastguard Worker subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73) 296*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63) 297*dfc6aa5cSAndroid Build Coastguard Worker subps xmm2, xmm3 ; xmm2=tmp5 298*dfc6aa5cSAndroid Build Coastguard Worker 299*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, xmm6 ; transpose coefficients(phase 1) 300*dfc6aa5cSAndroid Build Coastguard Worker unpcklps xmm6, 
xmm7 ; xmm6=(00 10 01 11) 301*dfc6aa5cSAndroid Build Coastguard Worker unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13) 302*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm0 ; transpose coefficients(phase 1) 303*dfc6aa5cSAndroid Build Coastguard Worker unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71) 304*dfc6aa5cSAndroid Build Coastguard Worker unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73) 305*dfc6aa5cSAndroid Build Coastguard Worker 306*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 307*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 308*dfc6aa5cSAndroid Build Coastguard Worker 309*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) 310*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) 311*dfc6aa5cSAndroid Build Coastguard Worker 312*dfc6aa5cSAndroid Build Coastguard Worker addps xmm4, xmm2 ; xmm4=tmp4 313*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm7 314*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm5 315*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23) 316*dfc6aa5cSAndroid Build Coastguard Worker addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43) 317*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53) 318*dfc6aa5cSAndroid Build Coastguard Worker subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33) 319*dfc6aa5cSAndroid Build Coastguard Worker 320*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm2, xmm7 ; transpose coefficients(phase 1) 321*dfc6aa5cSAndroid Build Coastguard Worker unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31) 322*dfc6aa5cSAndroid Build Coastguard Worker unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33) 323*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm4, xmm5 ; transpose coefficients(phase 1) 324*dfc6aa5cSAndroid Build Coastguard Worker unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51) 325*dfc6aa5cSAndroid Build Coastguard Worker unpckhps xmm4, xmm0 ; 
xmm4=(42 52 43 53) 326*dfc6aa5cSAndroid Build Coastguard Worker 327*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm6 ; transpose coefficients(phase 2) 328*dfc6aa5cSAndroid Build Coastguard Worker unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) 329*dfc6aa5cSAndroid Build Coastguard Worker unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) 330*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm1 ; transpose coefficients(phase 2) 331*dfc6aa5cSAndroid Build Coastguard Worker unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) 332*dfc6aa5cSAndroid Build Coastguard Worker unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) 333*dfc6aa5cSAndroid Build Coastguard Worker 334*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) 335*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) 336*dfc6aa5cSAndroid Build Coastguard Worker 337*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 338*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 339*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 340*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 341*dfc6aa5cSAndroid Build Coastguard Worker 342*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm6, xmm5 ; transpose coefficients(phase 2) 343*dfc6aa5cSAndroid Build Coastguard Worker unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) 344*dfc6aa5cSAndroid Build Coastguard Worker unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) 345*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm4 ; transpose coefficients(phase 2) 346*dfc6aa5cSAndroid Build Coastguard Worker unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) 347*dfc6aa5cSAndroid Build Coastguard Worker unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) 348*dfc6aa5cSAndroid Build Coastguard Worker 349*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD 
[XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 350*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 351*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 352*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 353*dfc6aa5cSAndroid Build Coastguard Worker 354*dfc6aa5cSAndroid Build Coastguard Worker.nextcolumn: 355*dfc6aa5cSAndroid Build Coastguard Worker add esi, byte 4*SIZEOF_JCOEF ; coef_block 356*dfc6aa5cSAndroid Build Coastguard Worker add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr 357*dfc6aa5cSAndroid Build Coastguard Worker add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr 358*dfc6aa5cSAndroid Build Coastguard Worker dec ecx ; ctr 359*dfc6aa5cSAndroid Build Coastguard Worker jnz near .columnloop 360*dfc6aa5cSAndroid Build Coastguard Worker 361*dfc6aa5cSAndroid Build Coastguard Worker ; -- Prefetch the next coefficient block 362*dfc6aa5cSAndroid Build Coastguard Worker 363*dfc6aa5cSAndroid Build Coastguard Worker prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] 364*dfc6aa5cSAndroid Build Coastguard Worker prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] 365*dfc6aa5cSAndroid Build Coastguard Worker prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] 366*dfc6aa5cSAndroid Build Coastguard Worker prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] 367*dfc6aa5cSAndroid Build Coastguard Worker 368*dfc6aa5cSAndroid Build Coastguard Worker ; ---- Pass 2: process rows from work array, store into output array. 
369*dfc6aa5cSAndroid Build Coastguard Worker 370*dfc6aa5cSAndroid Build Coastguard Worker mov eax, [original_ebp] 371*dfc6aa5cSAndroid Build Coastguard Worker lea esi, [workspace] ; FAST_FLOAT *wsptr 372*dfc6aa5cSAndroid Build Coastguard Worker mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) 373*dfc6aa5cSAndroid Build Coastguard Worker mov eax, JDIMENSION [output_col(eax)] 374*dfc6aa5cSAndroid Build Coastguard Worker mov ecx, DCTSIZE/4 ; ctr 375*dfc6aa5cSAndroid Build Coastguard Worker alignx 16, 7 376*dfc6aa5cSAndroid Build Coastguard Worker.rowloop: 377*dfc6aa5cSAndroid Build Coastguard Worker 378*dfc6aa5cSAndroid Build Coastguard Worker ; -- Even part 379*dfc6aa5cSAndroid Build Coastguard Worker 380*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] 381*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] 382*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] 383*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] 384*dfc6aa5cSAndroid Build Coastguard Worker 385*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm4, xmm0 386*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, xmm1 387*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm2 ; xmm0=tmp11 388*dfc6aa5cSAndroid Build Coastguard Worker subps xmm1, xmm3 389*dfc6aa5cSAndroid Build Coastguard Worker addps xmm4, xmm2 ; xmm4=tmp10 390*dfc6aa5cSAndroid Build Coastguard Worker addps xmm5, xmm3 ; xmm5=tmp13 391*dfc6aa5cSAndroid Build Coastguard Worker 392*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm1, [GOTOFF(ebx,PD_1_414)] 393*dfc6aa5cSAndroid Build Coastguard Worker subps xmm1, xmm5 ; xmm1=tmp12 394*dfc6aa5cSAndroid Build Coastguard Worker 395*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm6, xmm4 396*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm7, xmm0 397*dfc6aa5cSAndroid Build Coastguard 
Worker subps xmm4, xmm5 ; xmm4=tmp3 398*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm1 ; xmm0=tmp2 399*dfc6aa5cSAndroid Build Coastguard Worker addps xmm6, xmm5 ; xmm6=tmp0 400*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm1 ; xmm7=tmp1 401*dfc6aa5cSAndroid Build Coastguard Worker 402*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(1)], xmm4 ; tmp3 403*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [wk(0)], xmm0 ; tmp2 404*dfc6aa5cSAndroid Build Coastguard Worker 405*dfc6aa5cSAndroid Build Coastguard Worker ; -- Odd part 406*dfc6aa5cSAndroid Build Coastguard Worker 407*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] 408*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] 409*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] 410*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] 411*dfc6aa5cSAndroid Build Coastguard Worker 412*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm4, xmm2 413*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm5 414*dfc6aa5cSAndroid Build Coastguard Worker addps xmm2, xmm1 ; xmm2=z11 415*dfc6aa5cSAndroid Build Coastguard Worker addps xmm5, xmm3 ; xmm5=z13 416*dfc6aa5cSAndroid Build Coastguard Worker subps xmm4, xmm1 ; xmm4=z12 417*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm3 ; xmm0=z10 418*dfc6aa5cSAndroid Build Coastguard Worker 419*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, xmm2 420*dfc6aa5cSAndroid Build Coastguard Worker subps xmm2, xmm5 421*dfc6aa5cSAndroid Build Coastguard Worker addps xmm1, xmm5 ; xmm1=tmp7 422*dfc6aa5cSAndroid Build Coastguard Worker 423*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 424*dfc6aa5cSAndroid Build Coastguard Worker 425*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, xmm0 
426*dfc6aa5cSAndroid Build Coastguard Worker addps xmm0, xmm4 427*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 428*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) 429*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) 430*dfc6aa5cSAndroid Build Coastguard Worker addps xmm3, xmm0 ; xmm3=tmp12 431*dfc6aa5cSAndroid Build Coastguard Worker subps xmm4, xmm0 ; xmm4=tmp10 432*dfc6aa5cSAndroid Build Coastguard Worker 433*dfc6aa5cSAndroid Build Coastguard Worker ; -- Final output stage 434*dfc6aa5cSAndroid Build Coastguard Worker 435*dfc6aa5cSAndroid Build Coastguard Worker subps xmm3, xmm1 ; xmm3=tmp6 436*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, xmm6 437*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm7 438*dfc6aa5cSAndroid Build Coastguard Worker addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30) 439*dfc6aa5cSAndroid Build Coastguard Worker addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31) 440*dfc6aa5cSAndroid Build Coastguard Worker subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37) 441*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36) 442*dfc6aa5cSAndroid Build Coastguard Worker subps xmm2, xmm3 ; xmm2=tmp5 443*dfc6aa5cSAndroid Build Coastguard Worker 444*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, [GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125] 445*dfc6aa5cSAndroid Build Coastguard Worker 446*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm6, xmm1 ; descale(1/8) 447*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm7, xmm1 ; descale(1/8) 448*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm5, xmm1 ; descale(1/8) 449*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm0, xmm1 ; descale(1/8) 450*dfc6aa5cSAndroid Build Coastguard Worker 451*dfc6aa5cSAndroid Build Coastguard Worker movhlps xmm3, xmm6 452*dfc6aa5cSAndroid Build Coastguard Worker movhlps xmm1, xmm7 453*dfc6aa5cSAndroid 
Build Coastguard Worker cvtps2pi mm0, xmm6 ; round to int32, mm0=data0L=(00 10) 454*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm1, xmm7 ; round to int32, mm1=data1L=(01 11) 455*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm2, xmm3 ; round to int32, mm2=data0H=(20 30) 456*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm3, xmm1 ; round to int32, mm3=data1H=(21 31) 457*dfc6aa5cSAndroid Build Coastguard Worker packssdw mm0, mm2 ; mm0=data0=(00 10 20 30) 458*dfc6aa5cSAndroid Build Coastguard Worker packssdw mm1, mm3 ; mm1=data1=(01 11 21 31) 459*dfc6aa5cSAndroid Build Coastguard Worker 460*dfc6aa5cSAndroid Build Coastguard Worker movhlps xmm6, xmm5 461*dfc6aa5cSAndroid Build Coastguard Worker movhlps xmm7, xmm0 462*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm4, xmm5 ; round to int32, mm4=data7L=(07 17) 463*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm5, xmm0 ; round to int32, mm5=data6L=(06 16) 464*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm6, xmm6 ; round to int32, mm6=data7H=(27 37) 465*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm7, xmm7 ; round to int32, mm7=data6H=(26 36) 466*dfc6aa5cSAndroid Build Coastguard Worker packssdw mm4, mm6 ; mm4=data7=(07 17 27 37) 467*dfc6aa5cSAndroid Build Coastguard Worker packssdw mm5, mm7 ; mm5=data6=(06 16 26 36) 468*dfc6aa5cSAndroid Build Coastguard Worker 469*dfc6aa5cSAndroid Build Coastguard Worker packsswb mm0, mm5 ; mm0=(00 10 20 30 06 16 26 36) 470*dfc6aa5cSAndroid Build Coastguard Worker packsswb mm1, mm4 ; mm1=(01 11 21 31 07 17 27 37) 471*dfc6aa5cSAndroid Build Coastguard Worker 472*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2 473*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 474*dfc6aa5cSAndroid Build Coastguard Worker 475*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm6, [GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125] 476*dfc6aa5cSAndroid Build Coastguard Worker 477*dfc6aa5cSAndroid Build Coastguard 
Worker addps xmm4, xmm2 ; xmm4=tmp4 478*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm5, xmm3 479*dfc6aa5cSAndroid Build Coastguard Worker movaps xmm0, xmm1 480*dfc6aa5cSAndroid Build Coastguard Worker addps xmm3, xmm2 ; xmm3=data2=(02 12 22 32) 481*dfc6aa5cSAndroid Build Coastguard Worker addps xmm1, xmm4 ; xmm1=data4=(04 14 24 34) 482*dfc6aa5cSAndroid Build Coastguard Worker subps xmm5, xmm2 ; xmm5=data5=(05 15 25 35) 483*dfc6aa5cSAndroid Build Coastguard Worker subps xmm0, xmm4 ; xmm0=data3=(03 13 23 33) 484*dfc6aa5cSAndroid Build Coastguard Worker 485*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm3, xmm6 ; descale(1/8) 486*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm1, xmm6 ; descale(1/8) 487*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm5, xmm6 ; descale(1/8) 488*dfc6aa5cSAndroid Build Coastguard Worker mulps xmm0, xmm6 ; descale(1/8) 489*dfc6aa5cSAndroid Build Coastguard Worker 490*dfc6aa5cSAndroid Build Coastguard Worker movhlps xmm7, xmm3 491*dfc6aa5cSAndroid Build Coastguard Worker movhlps xmm2, xmm1 492*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm2, xmm3 ; round to int32, mm2=data2L=(02 12) 493*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm3, xmm1 ; round to int32, mm3=data4L=(04 14) 494*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm6, xmm7 ; round to int32, mm6=data2H=(22 32) 495*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm7, xmm2 ; round to int32, mm7=data4H=(24 34) 496*dfc6aa5cSAndroid Build Coastguard Worker packssdw mm2, mm6 ; mm2=data2=(02 12 22 32) 497*dfc6aa5cSAndroid Build Coastguard Worker packssdw mm3, mm7 ; mm3=data4=(04 14 24 34) 498*dfc6aa5cSAndroid Build Coastguard Worker 499*dfc6aa5cSAndroid Build Coastguard Worker movhlps xmm4, xmm5 500*dfc6aa5cSAndroid Build Coastguard Worker movhlps xmm6, xmm0 501*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm5, xmm5 ; round to int32, mm5=data5L=(05 15) 502*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm4, xmm0 ; round to int32, mm4=data3L=(03 
13) 503*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm6, xmm4 ; round to int32, mm6=data5H=(25 35) 504*dfc6aa5cSAndroid Build Coastguard Worker cvtps2pi mm7, xmm6 ; round to int32, mm7=data3H=(23 33) 505*dfc6aa5cSAndroid Build Coastguard Worker packssdw mm5, mm6 ; mm5=data5=(05 15 25 35) 506*dfc6aa5cSAndroid Build Coastguard Worker packssdw mm4, mm7 ; mm4=data3=(03 13 23 33) 507*dfc6aa5cSAndroid Build Coastguard Worker 508*dfc6aa5cSAndroid Build Coastguard Worker movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] 509*dfc6aa5cSAndroid Build Coastguard Worker 510*dfc6aa5cSAndroid Build Coastguard Worker packsswb mm2, mm3 ; mm2=(02 12 22 32 04 14 24 34) 511*dfc6aa5cSAndroid Build Coastguard Worker packsswb mm4, mm5 ; mm4=(03 13 23 33 05 15 25 35) 512*dfc6aa5cSAndroid Build Coastguard Worker 513*dfc6aa5cSAndroid Build Coastguard Worker paddb mm0, mm6 514*dfc6aa5cSAndroid Build Coastguard Worker paddb mm1, mm6 515*dfc6aa5cSAndroid Build Coastguard Worker paddb mm2, mm6 516*dfc6aa5cSAndroid Build Coastguard Worker paddb mm4, mm6 517*dfc6aa5cSAndroid Build Coastguard Worker 518*dfc6aa5cSAndroid Build Coastguard Worker movq mm7, mm0 ; transpose coefficients(phase 1) 519*dfc6aa5cSAndroid Build Coastguard Worker punpcklbw mm0, mm1 ; mm0=(00 01 10 11 20 21 30 31) 520*dfc6aa5cSAndroid Build Coastguard Worker punpckhbw mm7, mm1 ; mm7=(06 07 16 17 26 27 36 37) 521*dfc6aa5cSAndroid Build Coastguard Worker movq mm3, mm2 ; transpose coefficients(phase 1) 522*dfc6aa5cSAndroid Build Coastguard Worker punpcklbw mm2, mm4 ; mm2=(02 03 12 13 22 23 32 33) 523*dfc6aa5cSAndroid Build Coastguard Worker punpckhbw mm3, mm4 ; mm3=(04 05 14 15 24 25 34 35) 524*dfc6aa5cSAndroid Build Coastguard Worker 525*dfc6aa5cSAndroid Build Coastguard Worker movq mm5, mm0 ; transpose coefficients(phase 2) 526*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd mm0, mm2 ; mm0=(00 01 02 03 10 11 12 13) 527*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd mm5, mm2 ; mm5=(20 21 22 23 30 31 32 
33) 528*dfc6aa5cSAndroid Build Coastguard Worker movq mm6, mm3 ; transpose coefficients(phase 2) 529*dfc6aa5cSAndroid Build Coastguard Worker punpcklwd mm3, mm7 ; mm3=(04 05 06 07 14 15 16 17) 530*dfc6aa5cSAndroid Build Coastguard Worker punpckhwd mm6, mm7 ; mm6=(24 25 26 27 34 35 36 37) 531*dfc6aa5cSAndroid Build Coastguard Worker 532*dfc6aa5cSAndroid Build Coastguard Worker movq mm1, mm0 ; transpose coefficients(phase 3) 533*dfc6aa5cSAndroid Build Coastguard Worker punpckldq mm0, mm3 ; mm0=(00 01 02 03 04 05 06 07) 534*dfc6aa5cSAndroid Build Coastguard Worker punpckhdq mm1, mm3 ; mm1=(10 11 12 13 14 15 16 17) 535*dfc6aa5cSAndroid Build Coastguard Worker movq mm4, mm5 ; transpose coefficients(phase 3) 536*dfc6aa5cSAndroid Build Coastguard Worker punpckldq mm5, mm6 ; mm5=(20 21 22 23 24 25 26 27) 537*dfc6aa5cSAndroid Build Coastguard Worker punpckhdq mm4, mm6 ; mm4=(30 31 32 33 34 35 36 37) 538*dfc6aa5cSAndroid Build Coastguard Worker 539*dfc6aa5cSAndroid Build Coastguard Worker pushpic ebx ; save GOT address 540*dfc6aa5cSAndroid Build Coastguard Worker 541*dfc6aa5cSAndroid Build Coastguard Worker mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] 542*dfc6aa5cSAndroid Build Coastguard Worker mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] 543*dfc6aa5cSAndroid Build Coastguard Worker movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 544*dfc6aa5cSAndroid Build Coastguard Worker movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 545*dfc6aa5cSAndroid Build Coastguard Worker mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] 546*dfc6aa5cSAndroid Build Coastguard Worker mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] 547*dfc6aa5cSAndroid Build Coastguard Worker movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 548*dfc6aa5cSAndroid Build Coastguard Worker movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 549*dfc6aa5cSAndroid Build Coastguard Worker 550*dfc6aa5cSAndroid Build Coastguard Worker poppic ebx ; restore GOT address 551*dfc6aa5cSAndroid Build Coastguard Worker 552*dfc6aa5cSAndroid Build Coastguard Worker add esi, byte 
4*SIZEOF_FAST_FLOAT ; wsptr 553*dfc6aa5cSAndroid Build Coastguard Worker add edi, byte 4*SIZEOF_JSAMPROW 554*dfc6aa5cSAndroid Build Coastguard Worker dec ecx ; ctr 555*dfc6aa5cSAndroid Build Coastguard Worker jnz near .rowloop 556*dfc6aa5cSAndroid Build Coastguard Worker 557*dfc6aa5cSAndroid Build Coastguard Worker emms ; empty MMX state 558*dfc6aa5cSAndroid Build Coastguard Worker 559*dfc6aa5cSAndroid Build Coastguard Worker pop edi 560*dfc6aa5cSAndroid Build Coastguard Worker pop esi 561*dfc6aa5cSAndroid Build Coastguard Worker; pop edx ; need not be preserved 562*dfc6aa5cSAndroid Build Coastguard Worker; pop ecx ; need not be preserved 563*dfc6aa5cSAndroid Build Coastguard Worker pop ebx 564*dfc6aa5cSAndroid Build Coastguard Worker mov esp, ebp ; esp <- aligned ebp 565*dfc6aa5cSAndroid Build Coastguard Worker pop esp ; esp <- original ebp 566*dfc6aa5cSAndroid Build Coastguard Worker pop ebp 567*dfc6aa5cSAndroid Build Coastguard Worker ret 568*dfc6aa5cSAndroid Build Coastguard Worker 569*dfc6aa5cSAndroid Build Coastguard Worker; For some reason, the OS X linker does not honor the request to align the 570*dfc6aa5cSAndroid Build Coastguard Worker; segment unless we do this. 571*dfc6aa5cSAndroid Build Coastguard Worker align 32 572