; xref: /aosp_15_r20/external/lzma/Asm/x86/7zCrcOpt.asm (revision f6dc9357d832569d4d1f5d24eacdb3935a1ae8e6)
; 7zCrcOpt.asm -- CRC32 calculation : optimized version
; 2023-12-08 : Igor Pavlov : Public domain

include 7zAsm.asm

MY_ASM_START

; Tuning constants for the slicing algorithm:
;   NUM_WORDS  - number of 32-bit data words folded per CRC_ITER
;                (the CRC table must provide NUM_WORDS * 4 slices)
;   UNROLL_CNT - number of CRC_ITER copies unrolled in the main loop
NUM_WORDS       equ     3
UNROLL_CNT      equ     2

; Compile-time validation of the tuning constants.
if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
.err <NUM_WORDS_IS_INCORRECT>
endif
if (UNROLL_CNT lt 1)
.err <UNROLL_CNT_IS_INCORRECT>
endif

; Register roles (aliases from 7zAsm.asm):
;   rD - data pointer (in the main loop it holds data - rN, see SRCDAT_* below)
;   rN - byte counter / loop control
;   rT - CRC lookup-table base
rD      equ  r2
rD_x    equ  x2
rN      equ  r7
rT      equ  r5

; 32-bit builds pass parameters on the stack; REG_SIZE * 5 presumably
; accounts for the return address plus registers pushed by
; MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 -- confirm in 7zAsm.asm.
ifndef x64
    if (IS_CDECL gt 0)
        crc_OFFS    equ (REG_SIZE * 5)
        data_OFFS   equ (REG_SIZE + crc_OFFS)
        size_OFFS   equ (REG_SIZE + data_OFFS)
    else
        size_OFFS   equ (REG_SIZE * 5)
    endif
        table_OFFS  equ (REG_SIZE + size_OFFS)
endif

; rN + rD is same speed as rD, but we reduce one instruction in loop
; NOTE: these are deliberately incomplete address expressions; the trailing
; "1 *" / "4 *" is completed by the index written at the use site, e.g.
; [SRCDAT_4 (expr)] expands to [rN + rD * 1 + 4 * (expr)].
SRCDAT_1        equ     rN + rD * 1 + 1 *
SRCDAT_4        equ     rN + rD * 1 + 4 *

; CRC op, dest, src, t:
;   op dest, table_slice_t[src]  -- one lookup in slice (t) of the CRC table.
; @CatStr(src, _R) selects the full-width alias of register src for addressing;
; each slice holds 256 dwords, hence the 0400h byte stride per slice.
CRC macro op:req, dest:req, src:req, t:req
        op      dest, dword ptr [rT + @CatStr(src, _R) * 4 + 0400h * (t)]
endm

; dest ^= table_slice_t[src]
CRC_XOR macro dest:req, src:req, t:req
        CRC     xor, dest, src, t
endm

; dest = table_slice_t[src]
CRC_MOV macro dest:req, src:req, t:req
        CRC     mov, dest, src, t
endm

; dest = zero-extended low byte of src (uses the _L byte alias, e.g. x3 -> x3_L)
MOVZXLO macro dest:req, src:req
        movzx   dest, @CatStr(src, _L)
endm

; dest = zero-extended second byte (bits 8..15) of src (uses the _H byte alias)
MOVZXHI macro dest:req, src:req
        movzx   dest, @CatStr(src, _H)
endm

; movzx x0, x0_L - is slow in some cpus (ivb), if same register for src and dest
; movzx x3, x0_L sometimes is 0   cycles latency (not always)
; movzx x3, x0_L sometimes is 0.5 cycles latency
; movzx x3, x0_H is 2 cycles latency in some cpus

; Process one input byte (classic byte-at-a-time CRC step):
;   x0 = (x0 >> 8) ^ table[(x0 ^ *rD) & 0xFF] ;  rD++ ;  rN--
; Leaves the ZF from "dec rN" for the caller's loop branch.
CRC1b macro
        movzx   x6, byte ptr [rD]
        MOVZXLO x3, x0
        inc     rD
        shr     x0, 8
        xor     x6, x3
        CRC_XOR x0, x6, 0
        dec     rN
endm

; dest = zero-extended byte of word (t) in iteration (iter), byte offset (index)
LOAD_1 macro dest:req, t:req, iter:req, index:req
        movzx   dest, byte ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
endm

; dest = zero-extended 16-bit word of word (t) in iteration (iter), byte offset (index)
LOAD_2 macro dest:req, t:req, iter:req, index:req
        movzx   dest, word ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
endm

; Fold one 32-bit data word (word index t of iteration iter) into nn,
; XOR-ing each of its 4 bytes through table slices (t*4+3) .. (t*4+0).
; Several load strategies are kept; the "elseif 0" variants are disabled
; alternatives retained for tuning.
CRC_QUAD macro nn, t:req, iter:req
ifdef x64
        ; paired memory loads give 1-3% speed gain, but it uses more registers
        LOAD_2  x3, t, iter, 0
        LOAD_2  x9, t, iter, 2
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 3
        MOVZXLO x6, x9
        shr     x9, 8
        CRC_XOR nn, x3, t * 4 + 2
        CRC_XOR nn, x6, t * 4 + 1
        CRC_XOR nn, x9, t * 4 + 0
elseif 0
        ; two 16-bit loads, single scratch register
        LOAD_2  x3, t, iter, 0
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 3
        CRC_XOR nn, x3, t * 4 + 2
        LOAD_2  x3, t, iter, 2
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 1
        CRC_XOR nn, x3, t * 4 + 0
elseif 0
        ; four independent 8-bit loads
        LOAD_1  x3, t, iter, 0
        LOAD_1  x6, t, iter, 1
        CRC_XOR nn, x3, t * 4 + 3
        CRC_XOR nn, x6, t * 4 + 2
        LOAD_1  x3, t, iter, 2
        LOAD_1  x6, t, iter, 3
        CRC_XOR nn, x3, t * 4 + 1
        CRC_XOR nn, x6, t * 4 + 0
else
        ; 32-bit load is better if there is only one read port (core2)
        ; but that code can be slower if there are 2 read ports (snb)
        mov     x3, dword ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter *  NUM_WORDS) + 0)]
        MOVZXLO x6, x3
        CRC_XOR nn, x6, t * 4 + 3
        MOVZXHI x6, x3
        shr     x3, 16
        CRC_XOR nn, x6, t * 4 + 2
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 1
        CRC_XOR nn, x3, t * 4 + 0
endif
endm

; Table-slice base index for the last (lowest-numbered-slice) data word.
LAST    equ     (4 * (NUM_WORDS - 1))

; One unrolled iteration over NUM_WORDS data words:
;   nn  = next 32-bit word, prefetched for the following iteration
;   then the first NUM_WORDS-1 words are folded into nn via CRC_QUAD,
;   and the current accumulator qq is folded byte-by-byte through
;   slices LAST+3 .. LAST+0.
; When UNROLL_CNT is odd, the last iteration must leave the result back
; in qq (x0), so the final fold is done with CRC_MOV + xor instead.
CRC_ITER macro qq, nn, iter
        mov     nn, [SRCDAT_4 (NUM_WORDS * (1 + iter))]

    i = 0
    rept NUM_WORDS - 1
        CRC_QUAD nn, i, iter
        i = i + 1
    endm

        MOVZXLO x6, qq
        mov     x3, qq
        shr     x3, 24
        CRC_XOR nn, x6, LAST + 3
        CRC_XOR nn, x3, LAST + 0
        ror     qq, 16
        MOVZXLO x6, qq
        shr     qq, 24
        CRC_XOR nn, x6, LAST + 1
if ((UNROLL_CNT and 1) eq 1) and (iter eq (UNROLL_CNT - 1))
        CRC_MOV qq, qq, LAST + 2
        xor     qq, nn
else
        CRC_XOR nn, qq, LAST + 2
endif
endm

; Minimum input size for the unrolled loop.
; + 4 for prefetching next 4-bytes after current iteration
NUM_BYTES_LIMIT equ     (NUM_WORDS * 4 * UNROLL_CNT + 4)
ALIGN_MASK      equ     3


;-----------------------------------------------------------------------
; CrcUpdateT<NUM_WORDS*4> -- CRC32 update over a buffer.
; Presumed C signature (confirm against the 7-Zip C headers):
;   UInt32 CrcUpdateT12(UInt32 crc, const void *data, size_t size,
;                       const UInt32 *table);
; In:  x0 = crc, rD = data, rN = size, rT = table (slicing table with
;      NUM_WORDS * 4 slices of 256 dwords each)
; Out: x0 = updated crc
;-----------------------------------------------------------------------
; MY_PROC @CatStr(CrcUpdateT, 12), 4
MY_PROC @CatStr(CrcUpdateT, %(NUM_WORDS * 4)), 4
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
    ifdef x64
        mov     x0, REG_ABI_PARAM_0_x   ; x0 = x1(win) / x7(linux)
        mov     rT, REG_ABI_PARAM_3     ; r5 = r9(win) / x1(linux)
        mov     rN, REG_ABI_PARAM_2     ; r7 = r8(win) / r2(linux)
        ; mov     rD, REG_ABI_PARAM_1     ; r2 = r2(win)
      if  (IS_LINUX gt 0)
        mov     rD, REG_ABI_PARAM_1     ; r2 = r6
      endif
    else
      if  (IS_CDECL gt 0)
        mov     x0, [r4 + crc_OFFS]
        mov     rD, [r4 + data_OFFS]
      else
        mov     x0, REG_ABI_PARAM_0_x
      endif
        mov     rN, [r4 + size_OFFS]
        mov     rT, [r4 + table_OFFS]
    endif

        ; small buffers go straight to the byte-at-a-time tail loop
        cmp     rN, NUM_BYTES_LIMIT + ALIGN_MASK
        jb      crc_end
@@:
        ; consume bytes until rD is 4-byte aligned
        test    rD_x, ALIGN_MASK    ; test    rD, ALIGN_MASK
        jz      @F
        CRC1b
        jmp     @B
@@:
        ; prefetch first word into the accumulator; then bias the pointers:
        ; rN = end-of-unrolled-region marker, rD = data - rN, so that
        ; [rN + rD] (SRCDAT_*) addresses the data and "add rD" carries out
        ; exactly when the unrolled region is exhausted.
        xor     x0, dword ptr [rD]
        lea     rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
        sub     rD, rN

align 16
@@:
; emit UNROLL_CNT iterations, ping-ponging the accumulator between x0 and x1
unr_index = 0
while unr_index lt UNROLL_CNT
    if (unr_index and 1) eq 0
        CRC_ITER x0, x1, unr_index
    else
        CRC_ITER x1, x0, unr_index
    endif
        unr_index = unr_index + 1
endm

        add     rD, NUM_WORDS * 4 * UNROLL_CNT
        jnc     @B

if 0
        ; byte version
        add     rD, rN
        xor     x0, dword ptr [rD]
        add     rN, NUM_BYTES_LIMIT - 1
else
        ; 4-byte version
        add     rN, 4 * NUM_WORDS * UNROLL_CNT
        sub     rD, 4 * NUM_WORDS * UNROLL_CNT
@@:
        ; fold one 32-bit word of the accumulator through slices 3..0
        MOVZXLO x3, x0
        MOVZXHI x1, x0
        shr     x0, 16
        MOVZXLO x6, x0
        shr     x0, 8
        CRC_MOV x0, x0, 0
        CRC_XOR x0, x3, 3
        CRC_XOR x0, x1, 2
        CRC_XOR x0, x6, 1

        add     rD, 4
if (NUM_WORDS * UNROLL_CNT) ne 1
        jc      @F
        xor     x0, [SRCDAT_4 0]
        jmp     @B
@@:
endif
        ; restore rD to a real data pointer and rN to the remaining byte count
        add     rD, rN
        add     rN, 4 - 1

endif

        sub     rN, rD
crc_end:
        test    rN, rN
        jz      func_end
@@:
        ; tail: remaining bytes one at a time
        CRC1b
        jnz     @B

func_end:
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
MY_ENDP

end
