xref: /aosp_15_r20/external/lzma/Asm/x86/LzFindOpt.asm (revision f6dc9357d832569d4d1f5d24eacdb3935a1ae8e6)
1; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function
2; 2024-06-18: Igor Pavlov : Public domain
3;
4
5ifndef x64
6; x64=1
7; .err <x64_IS_REQUIRED>
8endif
9
10include 7zAsm.asm
11
12MY_ASM_START
13
14ifndef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
15if (IS_LINUX gt 0)
16  Z7_LZ_FIND_OPT_ASM_USE_SEGMENT equ 1
17else
18  Z7_LZ_FIND_OPT_ASM_USE_SEGMENT equ 1
19endif
20endif
21
22ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
23_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'
24MY_ALIGN macro num:req
25        align  num
26        ; align  16
27endm
28else
29MY_ALIGN macro num:req
30        ; We expect that ".text" is aligned for 16-bytes.
31        ; So we don't need large alignment inside our function.
32        align  16
33endm
34endif
35
36
37MY_ALIGN_16 macro
38        MY_ALIGN 16
39endm
40
41MY_ALIGN_32 macro
42        MY_ALIGN 32
43endm
44
45MY_ALIGN_64 macro
46        MY_ALIGN 64
47endm
48
49
50t0_L    equ x0_L
51t0_x    equ x0
52t0      equ r0
53t1_x    equ x3
54t1      equ r3
55
56cp_x    equ t1_x
57cp_r    equ t1
58m       equ x5
59m_r     equ r5
60len_x   equ x6
61len     equ r6
62diff_x  equ x7
63diff    equ r7
64len0    equ r10
65len1_x  equ x11
66len1    equ r11
67maxLen_x equ x12
68maxLen  equ r12
69d       equ r13
70ptr0    equ r14
71ptr1    equ r15
72
73d_lim       equ m_r
74cycSize     equ len_x
75hash_lim    equ len0
76delta1_x    equ len1_x
77delta1_r    equ len1
78delta_x     equ maxLen_x
79delta_r     equ maxLen
80hash        equ ptr0
81src         equ ptr1
82
83
84
85if (IS_LINUX gt 0)
86
87; r1 r2  r8 r9        : win32
88; r7 r6  r2 r1  r8 r9 : linux
89
90lenLimit        equ r8
91lenLimit_x      equ x8
92; pos_r           equ r2
93pos             equ x2
94cur             equ r1
95son             equ r9
96
97else
98
99lenLimit        equ REG_ABI_PARAM_2
100lenLimit_x      equ REG_ABI_PARAM_2_x
101pos             equ REG_ABI_PARAM_1_x
102cur             equ REG_ABI_PARAM_0
103son             equ REG_ABI_PARAM_3
104
105endif
106
107
108if (IS_LINUX gt 0)
109    maxLen_OFFS         equ  (REG_SIZE * (6 + 1))
110else
111    cutValue_OFFS       equ  (REG_SIZE * (8 + 1 + 4))
112    d_OFFS              equ  (REG_SIZE + cutValue_OFFS)
113    maxLen_OFFS         equ  (REG_SIZE + d_OFFS)
114endif
115    hash_OFFS           equ  (REG_SIZE + maxLen_OFFS)
116    limit_OFFS          equ  (REG_SIZE + hash_OFFS)
117    size_OFFS           equ  (REG_SIZE + limit_OFFS)
118    cycPos_OFFS         equ  (REG_SIZE + size_OFFS)
119    cycSize_OFFS        equ  (REG_SIZE + cycPos_OFFS)
120    posRes_OFFS         equ  (REG_SIZE + cycSize_OFFS)
121
122if (IS_LINUX gt 0)
123else
124    cutValue_PAR        equ  [r0 + cutValue_OFFS]
125    d_PAR               equ  [r0 + d_OFFS]
126endif
127    maxLen_PAR          equ  [r0 + maxLen_OFFS]
128    hash_PAR            equ  [r0 + hash_OFFS]
129    limit_PAR           equ  [r0 + limit_OFFS]
130    size_PAR            equ  [r0 + size_OFFS]
131    cycPos_PAR          equ  [r0 + cycPos_OFFS]
132    cycSize_PAR         equ  [r0 + cycSize_OFFS]
133    posRes_PAR          equ  [r0 + posRes_OFFS]
134
135
136    cutValue_VAR        equ  DWORD PTR [r4 + 8 * 0]
137    cutValueCur_VAR     equ  DWORD PTR [r4 + 8 * 0 + 4]
138    cycPos_VAR          equ  DWORD PTR [r4 + 8 * 1 + 0]
139    cycSize_VAR         equ  DWORD PTR [r4 + 8 * 1 + 4]
140    hash_VAR            equ  QWORD PTR [r4 + 8 * 2]
141    limit_VAR           equ  QWORD PTR [r4 + 8 * 3]
142    size_VAR            equ  QWORD PTR [r4 + 8 * 4]
143    distances           equ  QWORD PTR [r4 + 8 * 5]
144    maxLen_VAR          equ  QWORD PTR [r4 + 8 * 6]
145
146    Old_RSP             equ  QWORD PTR [r4 + 8 * 7]
147    LOCAL_SIZE          equ  8 * 8
148
149COPY_VAR_32 macro dest_var, src_var
150        mov     x3, src_var
151        mov     dest_var, x3
152endm
153
154COPY_VAR_64 macro dest_var, src_var
155        mov     r3, src_var
156        mov     dest_var, r3
157endm
158
159
160ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
161; MY_ALIGN_64
162else
163  MY_ALIGN_16
164endif
165MY_PROC GetMatchesSpecN_2, 13
166MY_PUSH_PRESERVED_ABI_REGS
167        mov     r0, RSP
168        lea     r3, [r0 - LOCAL_SIZE]
169        and     r3, -64
170        mov     RSP, r3
171        mov     Old_RSP, r0
172
173if (IS_LINUX gt 0)
174        mov     d,            REG_ABI_PARAM_5       ; r13 = r9
175        mov     cutValue_VAR, REG_ABI_PARAM_4_x     ;     = r8
176        mov     son,          REG_ABI_PARAM_3       ;  r9 = r1
177        mov     r8,           REG_ABI_PARAM_2       ;  r8 = r2
178        mov     pos,          REG_ABI_PARAM_1_x     ;  r2 = x6
179        mov     r1,           REG_ABI_PARAM_0       ;  r1 = r7
180else
181        COPY_VAR_32 cutValue_VAR, cutValue_PAR
182        mov     d, d_PAR
183endif
184
185        COPY_VAR_64 limit_VAR, limit_PAR
186
187        mov     hash_lim, size_PAR
188        mov     size_VAR, hash_lim
189
190        mov     cp_x, cycPos_PAR
191        mov     hash, hash_PAR
192
193        mov     cycSize, cycSize_PAR
194        mov     cycSize_VAR, cycSize
195
196        ; we want cur in (rcx). So we change the cur and lenLimit variables
197        sub     lenLimit, cur
198        neg     lenLimit_x
199        inc     lenLimit_x
200
201        mov     t0_x, maxLen_PAR
202        sub     t0, lenLimit
203        mov     maxLen_VAR, t0
204
205        jmp     main_loop
206
207MY_ALIGN_64
208fill_empty:
209        ; ptr0 = *ptr1 = kEmptyHashValue;
210        mov     QWORD PTR [ptr1], 0
211        inc     pos
212        inc     cp_x
213        mov     DWORD PTR [d - 4], 0
214        cmp     d, limit_VAR
215        jae     fin
216        cmp     hash, hash_lim
217        je      fin
218
219; MY_ALIGN_64
220main_loop:
221        ; UInt32 delta = *hash++;
222        mov     diff_x, [hash]  ; delta
223        add     hash, 4
224        ; mov     cycPos_VAR, cp_x
225
226        inc     cur
227        add     d, 4
228        mov     m, pos
229        sub     m, diff_x;      ; matchPos
230
231        ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;
232        lea     ptr1, [son + 8 * cp_r]
233        ; mov     cycSize, cycSize_VAR
234        cmp     pos, cycSize
235        jb      directMode      ; if (pos < cycSize_VAR)
236
237        ; CYC MODE
238
239        cmp     diff_x, cycSize
240        jae     fill_empty      ; if (delta >= cycSize_VAR)
241
242        xor     t0_x, t0_x
243        mov     cycPos_VAR, cp_x
244        sub     cp_x, diff_x
245        ; jae     prepare_for_tree_loop
246        ; add     cp_x, cycSize
247        cmovb   t0_x, cycSize
248        add     cp_x, t0_x      ; cp_x +=  (cycPos < delta ? cycSize : 0)
249        jmp     prepare_for_tree_loop
250
251
252directMode:
253        cmp     diff_x,  pos
254        je      fill_empty      ; if (delta == pos)
255        jae     fin_error       ; if (delta >= pos)
256
257        mov     cycPos_VAR, cp_x
258        mov     cp_x, m
259
260prepare_for_tree_loop:
261        mov     len0, lenLimit
262        mov     hash_VAR, hash
263        ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;
264        lea     ptr0, [ptr1 + 4]
265        ; UInt32 *_distances = ++d;
266        mov     distances, d
267
268        neg     len0
269        mov     len1, len0
270
271        mov     t0_x, cutValue_VAR
272        mov     maxLen, maxLen_VAR
273        mov     cutValueCur_VAR, t0_x
274
275MY_ALIGN_32
276tree_loop:
277        neg     diff
278        mov     len, len0
279        cmp     len1, len0
280        cmovb   len, len1       ; len = (len1 < len0 ? len1 : len0);
281        add     diff, cur
282
283        mov     t0_x, [son + cp_r * 8]  ; prefetch
284        movzx   t0_x, BYTE PTR [diff + 1 * len]
285        lea     cp_r, [son + cp_r * 8]
286        cmp     [cur + 1 * len], t0_L
287        je      matched_1
288
289        jb      left_0
290
291        mov     [ptr1], m
292        mov        m, [cp_r + 4]
293        lea     ptr1, [cp_r + 4]
294        sub     diff, cur ; FIX32
295        jmp     next_node
296
297MY_ALIGN_32
298left_0:
299        mov     [ptr0], m
300        mov        m, [cp_r]
301        mov     ptr0, cp_r
302        sub     diff, cur ; FIX32
303        ; jmp     next_node
304
305; ------------ NEXT NODE ------------
306; MY_ALIGN_32
307next_node:
308        mov     cycSize, cycSize_VAR
309        dec     cutValueCur_VAR
310        je      finish_tree
311
312        add     diff_x, pos     ; prev_match = pos + diff
313        cmp     m, diff_x
314        jae     fin_error       ; if (new_match >= prev_match)
315
316        mov     diff_x, pos
317        sub     diff_x, m       ; delta = pos - new_match
318        cmp     pos, cycSize
319        jae     cyc_mode_2      ; if (pos >= cycSize)
320
321        mov     cp_x, m
322        test    m, m
323        jne     tree_loop       ; if (m != 0)
324
325finish_tree:
326        ; ptr0 = *ptr1 = kEmptyHashValue;
327        mov     DWORD PTR [ptr0], 0
328        mov     DWORD PTR [ptr1], 0
329
330        inc     pos
331
332        ; _distances[-1] = (UInt32)(d - _distances);
333        mov     t0, distances
334        mov     t1, d
335        sub     t1, t0
336        shr     t1_x, 2
337        mov     [t0 - 4], t1_x
338
339        cmp     d, limit_VAR
340        jae     fin             ; if (d >= limit)
341
342        mov     cp_x, cycPos_VAR
343        mov     hash, hash_VAR
344        mov     hash_lim, size_VAR
345        inc     cp_x
346        cmp     hash, hash_lim
347        jne     main_loop       ; if (hash != size)
348        jmp     fin
349
350
351MY_ALIGN_32
352cyc_mode_2:
353        cmp     diff_x, cycSize
354        jae     finish_tree     ; if (delta >= cycSize)
355
356        mov     cp_x, cycPos_VAR
357        xor     t0_x, t0_x
358        sub     cp_x, diff_x    ; cp_x = cycPos - delta
359        cmovb   t0_x, cycSize
360        add     cp_x, t0_x      ; cp_x += (cycPos < delta ? cycSize : 0)
361        jmp     tree_loop
362
363
364MY_ALIGN_32
365matched_1:
366
367        inc     len
368        ; cmp     len_x, lenLimit_x
369        je      short lenLimit_reach
370        movzx   t0_x, BYTE PTR [diff + 1 * len]
371        cmp     [cur + 1 * len], t0_L
372        jne     mismatch
373
374
375MY_ALIGN_32
376match_loop:
377        ;  while (++len != lenLimit)  (len[diff] != len[0]) ;
378
379        inc     len
380        ; cmp     len_x, lenLimit_x
381        je      short lenLimit_reach
382        movzx   t0_x, BYTE PTR [diff + 1 * len]
383        cmp     BYTE PTR [cur + 1 * len], t0_L
384        je      match_loop
385
386mismatch:
387        jb      left_2
388
389        mov     [ptr1], m
390        mov        m, [cp_r + 4]
391        lea     ptr1, [cp_r + 4]
392        mov     len1, len
393
394        jmp     max_update
395
396MY_ALIGN_32
397left_2:
398        mov     [ptr0], m
399        mov        m, [cp_r]
400        mov     ptr0, cp_r
401        mov     len0, len
402
403max_update:
404        sub     diff, cur       ; restore diff
405
406        cmp     maxLen, len
407        jae     next_node
408
409        mov     maxLen, len
410        add     len, lenLimit
411        mov     [d], len_x
412        mov     t0_x, diff_x
413        not     t0_x
414        mov     [d + 4], t0_x
415        add     d, 8
416
417        jmp     next_node
418
419
420
421MY_ALIGN_32
422lenLimit_reach:
423
424        mov     delta_r, cur
425        sub     delta_r, diff
426        lea     delta1_r, [delta_r - 1]
427
428        mov     t0_x, [cp_r]
429        mov     [ptr1], t0_x
430        mov     t0_x, [cp_r + 4]
431        mov     [ptr0], t0_x
432
433        mov     [d], lenLimit_x
434        mov     [d + 4], delta1_x
435        add     d, 8
436
437        ; _distances[-1] = (UInt32)(d - _distances);
438        mov     t0, distances
439        mov     t1, d
440        sub     t1, t0
441        shr     t1_x, 2
442        mov     [t0 - 4], t1_x
443
444        mov     hash, hash_VAR
445        mov     hash_lim, size_VAR
446
447        inc     pos
448        mov     cp_x, cycPos_VAR
449        inc     cp_x
450
451        mov     d_lim, limit_VAR
452        mov     cycSize, cycSize_VAR
453        ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)
454        ;    break;
455        cmp     hash, hash_lim
456        je      fin
457        cmp     d, d_lim
458        jae     fin
459        cmp     delta_x, [hash]
460        jne     main_loop
461        movzx   t0_x, BYTE PTR [diff]
462        cmp     [cur], t0_L
463        jne     main_loop
464
465        ; jmp     main_loop     ; bypass for debug
466
467        mov     cycPos_VAR, cp_x
468        shl     len, 3          ; cycSize * 8
469        sub     diff, cur       ; restore diff
470        xor     t0_x, t0_x
471        cmp     cp_x, delta_x   ; cmp (cycPos_VAR, delta)
472        lea     cp_r, [son + 8 * cp_r]  ; dest
473        lea     src, [cp_r + 8 * diff]
474        cmovb   t0, len         ; t0 =  (cycPos_VAR < delta ? cycSize * 8 : 0)
475        add     src, t0
476        add     len, son        ; len = son + cycSize * 8
477
478
479MY_ALIGN_32
480long_loop:
481        add     hash, 4
482
483        ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];
484
485        mov     t0, [src]
486        add     src, 8
487        mov     [cp_r], t0
488        add     cp_r, 8
489        cmp     src, len
490        cmove   src, son       ; if end of (son) buffer is reached, we wrap to begin
491
492        mov     DWORD PTR [d], 2
493        mov     [d + 4], lenLimit_x
494        mov     [d + 8], delta1_x
495        add     d, 12
496
497        inc     cur
498
499        cmp     hash, hash_lim
500        je      long_footer
501        cmp     delta_x, [hash]
502        jne     long_footer
503        movzx   t0_x, BYTE PTR [diff + 1 * cur]
504        cmp     [cur], t0_L
505        jne     long_footer
506        cmp     d, d_lim
507        jb      long_loop
508
509long_footer:
510        sub     cp_r, son
511        shr     cp_r, 3
512        add     pos, cp_x
513        sub     pos, cycPos_VAR
514        mov     cycSize, cycSize_VAR
515
516        cmp     d, d_lim
517        jae     fin
518        cmp     hash, hash_lim
519        jne     main_loop
520        jmp     fin
521
522
523
524fin_error:
525        xor     d, d
526
527fin:
528        mov     RSP, Old_RSP
529        mov     t0, [r4 + posRes_OFFS]
530        mov     [t0], pos
531        mov     r0, d
532
533MY_POP_PRESERVED_ABI_REGS
534MY_ENDP
535
536ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
537_TEXT$LZFINDOPT ENDS
538endif
539
540end
541