1; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function 2; 2024-06-18: Igor Pavlov : Public domain 3; 4 5ifndef x64 6; x64=1 7; .err <x64_IS_REQUIRED> 8endif 9 10include 7zAsm.asm 11 12MY_ASM_START 13 14ifndef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT 15if (IS_LINUX gt 0) 16 Z7_LZ_FIND_OPT_ASM_USE_SEGMENT equ 1 17else 18 Z7_LZ_FIND_OPT_ASM_USE_SEGMENT equ 1 19endif 20endif 21 22ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT 23_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE' 24MY_ALIGN macro num:req 25 align num 26 ; align 16 27endm 28else 29MY_ALIGN macro num:req 30 ; We expect that ".text" is aligned for 16-bytes. 31 ; So we don't need large alignment inside our function. 32 align 16 33endm 34endif 35 36 37MY_ALIGN_16 macro 38 MY_ALIGN 16 39endm 40 41MY_ALIGN_32 macro 42 MY_ALIGN 32 43endm 44 45MY_ALIGN_64 macro 46 MY_ALIGN 64 47endm 48 49 50t0_L equ x0_L 51t0_x equ x0 52t0 equ r0 53t1_x equ x3 54t1 equ r3 55 56cp_x equ t1_x 57cp_r equ t1 58m equ x5 59m_r equ r5 60len_x equ x6 61len equ r6 62diff_x equ x7 63diff equ r7 64len0 equ r10 65len1_x equ x11 66len1 equ r11 67maxLen_x equ x12 68maxLen equ r12 69d equ r13 70ptr0 equ r14 71ptr1 equ r15 72 73d_lim equ m_r 74cycSize equ len_x 75hash_lim equ len0 76delta1_x equ len1_x 77delta1_r equ len1 78delta_x equ maxLen_x 79delta_r equ maxLen 80hash equ ptr0 81src equ ptr1 82 83 84 85if (IS_LINUX gt 0) 86 87; r1 r2 r8 r9 : win32 88; r7 r6 r2 r1 r8 r9 : linux 89 90lenLimit equ r8 91lenLimit_x equ x8 92; pos_r equ r2 93pos equ x2 94cur equ r1 95son equ r9 96 97else 98 99lenLimit equ REG_ABI_PARAM_2 100lenLimit_x equ REG_ABI_PARAM_2_x 101pos equ REG_ABI_PARAM_1_x 102cur equ REG_ABI_PARAM_0 103son equ REG_ABI_PARAM_3 104 105endif 106 107 108if (IS_LINUX gt 0) 109 maxLen_OFFS equ (REG_SIZE * (6 + 1)) 110else 111 cutValue_OFFS equ (REG_SIZE * (8 + 1 + 4)) 112 d_OFFS equ (REG_SIZE + cutValue_OFFS) 113 maxLen_OFFS equ (REG_SIZE + d_OFFS) 114endif 115 hash_OFFS equ (REG_SIZE + maxLen_OFFS) 116 limit_OFFS equ (REG_SIZE + hash_OFFS) 117 size_OFFS equ (REG_SIZE + limit_OFFS) 118 cycPos_OFFS equ (REG_SIZE + size_OFFS) 119 cycSize_OFFS equ (REG_SIZE + cycPos_OFFS) 120 posRes_OFFS equ (REG_SIZE + cycSize_OFFS) 121 122if (IS_LINUX gt 0) 123else 124 cutValue_PAR equ [r0 + cutValue_OFFS] 125 d_PAR equ [r0 + d_OFFS] 126endif 127 maxLen_PAR equ [r0 + maxLen_OFFS] 128 hash_PAR equ [r0 + hash_OFFS] 129 limit_PAR equ [r0 + limit_OFFS] 130 size_PAR equ [r0 + size_OFFS] 131 cycPos_PAR equ [r0 + cycPos_OFFS] 132 cycSize_PAR equ [r0 + cycSize_OFFS] 133 posRes_PAR equ [r0 + posRes_OFFS] 134 135 136 cutValue_VAR equ DWORD PTR [r4 + 8 * 0] 137 cutValueCur_VAR equ DWORD PTR [r4 + 8 * 0 + 4] 138 cycPos_VAR equ DWORD PTR [r4 + 8 * 1 + 0] 139 cycSize_VAR equ DWORD PTR [r4 + 8 * 1 + 4] 140 hash_VAR equ QWORD PTR [r4 + 8 * 2] 141 limit_VAR equ QWORD PTR [r4 + 8 * 3] 142 size_VAR equ QWORD PTR [r4 + 8 * 4] 143 distances equ QWORD PTR [r4 + 8 * 5] 144 maxLen_VAR equ QWORD PTR [r4 + 8 * 6] 145 146 Old_RSP equ QWORD PTR [r4 + 8 * 7] 147 LOCAL_SIZE equ 8 * 8 148 149COPY_VAR_32 macro dest_var, src_var 150 mov x3, src_var 151 mov dest_var, x3 152endm 153 154COPY_VAR_64 macro dest_var, src_var 155 mov r3, src_var 156 mov dest_var, r3 157endm 158 159 160ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT 161; MY_ALIGN_64 162else 163 MY_ALIGN_16 164endif 165MY_PROC GetMatchesSpecN_2, 13 166MY_PUSH_PRESERVED_ABI_REGS 167 mov r0, RSP 168 lea r3, [r0 - LOCAL_SIZE] 169 and r3, -64 170 mov RSP, r3 171 mov Old_RSP, r0 172 173if (IS_LINUX gt 0) 174 mov d, REG_ABI_PARAM_5 ; r13 = r9 175 mov cutValue_VAR, REG_ABI_PARAM_4_x ; = r8 176 mov son, REG_ABI_PARAM_3 ; r9 = r1 177 mov r8, REG_ABI_PARAM_2 ; r8 = r2 178 mov pos, REG_ABI_PARAM_1_x ; r2 = x6 179 mov r1, REG_ABI_PARAM_0 ; r1 = r7 180else 181 COPY_VAR_32 cutValue_VAR, cutValue_PAR 182 mov d, d_PAR 183endif 184 185 COPY_VAR_64 limit_VAR, limit_PAR 186 187 mov hash_lim, size_PAR 188 mov size_VAR, hash_lim 189 190 mov cp_x, cycPos_PAR 191 mov hash, hash_PAR 192 193 mov cycSize, cycSize_PAR 194 mov cycSize_VAR, cycSize 195 196 ; we want cur in (rcx). So we change the cur and lenLimit variables 197 sub lenLimit, cur 198 neg lenLimit_x 199 inc lenLimit_x 200 201 mov t0_x, maxLen_PAR 202 sub t0, lenLimit 203 mov maxLen_VAR, t0 204 205 jmp main_loop 206 207MY_ALIGN_64 208fill_empty: 209 ; ptr0 = *ptr1 = kEmptyHashValue; 210 mov QWORD PTR [ptr1], 0 211 inc pos 212 inc cp_x 213 mov DWORD PTR [d - 4], 0 214 cmp d, limit_VAR 215 jae fin 216 cmp hash, hash_lim 217 je fin 218 219; MY_ALIGN_64 220main_loop: 221 ; UInt32 delta = *hash++; 222 mov diff_x, [hash] ; delta 223 add hash, 4 224 ; mov cycPos_VAR, cp_x 225 226 inc cur 227 add d, 4 228 mov m, pos 229 sub m, diff_x; ; matchPos 230 231 ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; 232 lea ptr1, [son + 8 * cp_r] 233 ; mov cycSize, cycSize_VAR 234 cmp pos, cycSize 235 jb directMode ; if (pos < cycSize_VAR) 236 237 ; CYC MODE 238 239 cmp diff_x, cycSize 240 jae fill_empty ; if (delta >= cycSize_VAR) 241 242 xor t0_x, t0_x 243 mov cycPos_VAR, cp_x 244 sub cp_x, diff_x 245 ; jae prepare_for_tree_loop 246 ; add cp_x, cycSize 247 cmovb t0_x, cycSize 248 add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0) 249 jmp prepare_for_tree_loop 250 251 252directMode: 253 cmp diff_x, pos 254 je fill_empty ; if (delta == pos) 255 jae fin_error ; if (delta >= pos) 256 257 mov cycPos_VAR, cp_x 258 mov cp_x, m 259 260prepare_for_tree_loop: 261 mov len0, lenLimit 262 mov hash_VAR, hash 263 ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1; 264 lea ptr0, [ptr1 + 4] 265 ; UInt32 *_distances = ++d; 266 mov distances, d 267 268 neg len0 269 mov len1, len0 270 271 mov t0_x, cutValue_VAR 272 mov maxLen, maxLen_VAR 273 mov cutValueCur_VAR, t0_x 274 275MY_ALIGN_32 276tree_loop: 277 neg diff 278 mov len, len0 279 cmp len1, len0 280 cmovb len, len1 ; len = (len1 < len0 ? len1 : len0); 281 add diff, cur 282 283 mov t0_x, [son + cp_r * 8] ; prefetch 284 movzx t0_x, BYTE PTR [diff + 1 * len] 285 lea cp_r, [son + cp_r * 8] 286 cmp [cur + 1 * len], t0_L 287 je matched_1 288 289 jb left_0 290 291 mov [ptr1], m 292 mov m, [cp_r + 4] 293 lea ptr1, [cp_r + 4] 294 sub diff, cur ; FIX32 295 jmp next_node 296 297MY_ALIGN_32 298left_0: 299 mov [ptr0], m 300 mov m, [cp_r] 301 mov ptr0, cp_r 302 sub diff, cur ; FIX32 303 ; jmp next_node 304 305; ------------ NEXT NODE ------------ 306; MY_ALIGN_32 307next_node: 308 mov cycSize, cycSize_VAR 309 dec cutValueCur_VAR 310 je finish_tree 311 312 add diff_x, pos ; prev_match = pos + diff 313 cmp m, diff_x 314 jae fin_error ; if (new_match >= prev_match) 315 316 mov diff_x, pos 317 sub diff_x, m ; delta = pos - new_match 318 cmp pos, cycSize 319 jae cyc_mode_2 ; if (pos >= cycSize) 320 321 mov cp_x, m 322 test m, m 323 jne tree_loop ; if (m != 0) 324 325finish_tree: 326 ; ptr0 = *ptr1 = kEmptyHashValue; 327 mov DWORD PTR [ptr0], 0 328 mov DWORD PTR [ptr1], 0 329 330 inc pos 331 332 ; _distances[-1] = (UInt32)(d - _distances); 333 mov t0, distances 334 mov t1, d 335 sub t1, t0 336 shr t1_x, 2 337 mov [t0 - 4], t1_x 338 339 cmp d, limit_VAR 340 jae fin ; if (d >= limit) 341 342 mov cp_x, cycPos_VAR 343 mov hash, hash_VAR 344 mov hash_lim, size_VAR 345 inc cp_x 346 cmp hash, hash_lim 347 jne main_loop ; if (hash != size) 348 jmp fin 349 350 351MY_ALIGN_32 352cyc_mode_2: 353 cmp diff_x, cycSize 354 jae finish_tree ; if (delta >= cycSize) 355 356 mov cp_x, cycPos_VAR 357 xor t0_x, t0_x 358 sub cp_x, diff_x ; cp_x = cycPos - delta 359 cmovb t0_x, cycSize 360 add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0) 361 jmp tree_loop 362 363 364MY_ALIGN_32 365matched_1: 366 367 inc len 368 ; cmp len_x, lenLimit_x 369 je short lenLimit_reach 370 movzx t0_x, BYTE PTR [diff + 1 * len] 371 cmp [cur + 1 * len], t0_L 372 jne mismatch 373 374 375MY_ALIGN_32 376match_loop: 377 ; while (++len != lenLimit) (len[diff] != len[0]) ; 378 379 inc len 380 ; cmp len_x, lenLimit_x 381 je short lenLimit_reach 382 movzx t0_x, BYTE PTR [diff + 1 * len] 383 cmp BYTE PTR [cur + 1 * len], t0_L 384 je match_loop 385 386mismatch: 387 jb left_2 388 389 mov [ptr1], m 390 mov m, [cp_r + 4] 391 lea ptr1, [cp_r + 4] 392 mov len1, len 393 394 jmp max_update 395 396MY_ALIGN_32 397left_2: 398 mov [ptr0], m 399 mov m, [cp_r] 400 mov ptr0, cp_r 401 mov len0, len 402 403max_update: 404 sub diff, cur ; restore diff 405 406 cmp maxLen, len 407 jae next_node 408 409 mov maxLen, len 410 add len, lenLimit 411 mov [d], len_x 412 mov t0_x, diff_x 413 not t0_x 414 mov [d + 4], t0_x 415 add d, 8 416 417 jmp next_node 418 419 420 421MY_ALIGN_32 422lenLimit_reach: 423 424 mov delta_r, cur 425 sub delta_r, diff 426 lea delta1_r, [delta_r - 1] 427 428 mov t0_x, [cp_r] 429 mov [ptr1], t0_x 430 mov t0_x, [cp_r + 4] 431 mov [ptr0], t0_x 432 433 mov [d], lenLimit_x 434 mov [d + 4], delta1_x 435 add d, 8 436 437 ; _distances[-1] = (UInt32)(d - _distances); 438 mov t0, distances 439 mov t1, d 440 sub t1, t0 441 shr t1_x, 2 442 mov [t0 - 4], t1_x 443 444 mov hash, hash_VAR 445 mov hash_lim, size_VAR 446 447 inc pos 448 mov cp_x, cycPos_VAR 449 inc cp_x 450 451 mov d_lim, limit_VAR 452 mov cycSize, cycSize_VAR 453 ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) 454 ; break; 455 cmp hash, hash_lim 456 je fin 457 cmp d, d_lim 458 jae fin 459 cmp delta_x, [hash] 460 jne main_loop 461 movzx t0_x, BYTE PTR [diff] 462 cmp [cur], t0_L 463 jne main_loop 464 465 ; jmp main_loop ; bypass for debug 466 467 mov cycPos_VAR, cp_x 468 shl len, 3 ; cycSize * 8 469 sub diff, cur ; restore diff 470 xor t0_x, t0_x 471 cmp cp_x, delta_x ; cmp (cycPos_VAR, delta) 472 lea cp_r, [son + 8 * cp_r] ; dest 473 lea src, [cp_r + 8 * diff] 474 cmovb t0, len ; t0 = (cycPos_VAR < delta ? cycSize * 8 : 0) 475 add src, t0 476 add len, son ; len = son + cycSize * 8 477 478 479MY_ALIGN_32 480long_loop: 481 add hash, 4 482 483 ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff]; 484 485 mov t0, [src] 486 add src, 8 487 mov [cp_r], t0 488 add cp_r, 8 489 cmp src, len 490 cmove src, son ; if end of (son) buffer is reached, we wrap to begin 491 492 mov DWORD PTR [d], 2 493 mov [d + 4], lenLimit_x 494 mov [d + 8], delta1_x 495 add d, 12 496 497 inc cur 498 499 cmp hash, hash_lim 500 je long_footer 501 cmp delta_x, [hash] 502 jne long_footer 503 movzx t0_x, BYTE PTR [diff + 1 * cur] 504 cmp [cur], t0_L 505 jne long_footer 506 cmp d, d_lim 507 jb long_loop 508 509long_footer: 510 sub cp_r, son 511 shr cp_r, 3 512 add pos, cp_x 513 sub pos, cycPos_VAR 514 mov cycSize, cycSize_VAR 515 516 cmp d, d_lim 517 jae fin 518 cmp hash, hash_lim 519 jne main_loop 520 jmp fin 521 522 523 524fin_error: 525 xor d, d 526 527fin: 528 mov RSP, Old_RSP 529 mov t0, [r4 + posRes_OFFS] 530 mov [t0], pos 531 mov r0, d 532 533MY_POP_PRESERVED_ABI_REGS 534MY_ENDP 535 536ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT 537_TEXT$LZFINDOPT ENDS 538endif 539 540end 541