; 7zCrcOpt.asm -- CRC32 calculation : optimized version
; 2023-12-08 : Igor Pavlov : Public domain

include 7zAsm.asm

MY_ASM_START

NUM_WORDS       equ     3       ; 32-bit words processed per CRC_ITER
UNROLL_CNT      equ     2       ; unroll factor of the main loop

if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
.err <NUM_WORDS_IS_INCORRECT>
endif
if (UNROLL_CNT lt 1)
.err <UNROLL_CNT_IS_INCORRECT>
endif

rD      equ  r2         ; data pointer
rD_x    equ  x2
rN      equ  r7         ; number of bytes
rT      equ  r5         ; pointer to the CRC tables

ifndef x64
 if (IS_CDECL gt 0)
    crc_OFFS    equ (REG_SIZE * 5)
    data_OFFS   equ (REG_SIZE + crc_OFFS)
    size_OFFS   equ (REG_SIZE + data_OFFS)
 else
    size_OFFS   equ (REG_SIZE * 5)
 endif
    table_OFFS  equ (REG_SIZE + size_OFFS)
endif

; rN + rD is same speed as rD, but we reduce one instruction in loop
SRCDAT_1        equ     rN + rD * 1 + 1 *
SRCDAT_4        equ     rN + rD * 1 + 4 *

; one table lookup: dest op= table[t][src byte]
CRC macro op:req, dest:req, src:req, t:req
        op      dest, dword ptr [rT + @CatStr(src, _R) * 4 + 0400h * (t)]
endm

CRC_XOR macro dest:req, src:req, t:req
        CRC     xor, dest, src, t
endm

CRC_MOV macro dest:req, src:req, t:req
        CRC     mov, dest, src, t
endm

MOVZXLO macro dest:req, src:req
        movzx   dest, @CatStr(src, _L)
endm

MOVZXHI macro dest:req, src:req
        movzx   dest, @CatStr(src, _H)
endm

; movzx x0, x0_L - is slow in some cpus (ivb), if same register for src and dest
; movzx x3, x0_L sometimes is 0 cycles latency (not always)
; movzx x3, x0_L sometimes is 0.5 cycles latency
; movzx x3, x0_H is 2 cycles latency in some cpus

; fold a single input byte into the CRC in x0
CRC1b macro
        movzx   x6, byte ptr [rD]
        MOVZXLO x3, x0
        inc     rD
        shr     x0, 8
        xor     x6, x3
        CRC_XOR x0, x6, 0
        dec     rN
endm

LOAD_1 macro dest:req, t:req, iter:req, index:req
        movzx   dest, byte ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
endm

LOAD_2 macro dest:req, t:req, iter:req, index:req
        movzx   dest, word ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
endm

CRC_QUAD macro nn, t:req, iter:req
ifdef x64
        ; paired memory loads give 1-3% speed gain, but it uses more registers
        LOAD_2  x3, t, iter, 0
        LOAD_2  x9, t, iter, 2
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 3
        MOVZXLO x6, x9
        shr     x9, 8
        CRC_XOR nn, x3, t * 4 + 2
        CRC_XOR nn, x6, t * 4 + 1
        CRC_XOR nn, x9, t * 4 + 0
elseif 0
        LOAD_2  x3, t, iter, 0
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 3
        CRC_XOR nn, x3, t * 4 + 2
        LOAD_2  x3, t, iter, 2
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 1
        CRC_XOR nn, x3, t * 4 + 0
elseif 0
        LOAD_1  x3, t, iter, 0
        LOAD_1  x6, t, iter, 1
        CRC_XOR nn, x3, t * 4 + 3
        CRC_XOR nn, x6, t * 4 + 2
        LOAD_1  x3, t, iter, 2
        LOAD_1  x6, t, iter, 3
        CRC_XOR nn, x3, t * 4 + 1
        CRC_XOR nn, x6, t * 4 + 0
else
        ; 32-bit load is better if there is only one read port (core2)
        ; but that code can be slower if there are 2 read ports (snb)
        mov     x3, dword ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + 0)]
        MOVZXLO x6, x3
        CRC_XOR nn, x6, t * 4 + 3
        MOVZXHI x6, x3
        shr     x3, 16
        CRC_XOR nn, x6, t * 4 + 2
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 1
        CRC_XOR nn, x3, t * 4 + 0
endif
endm


LAST    equ     (4 * (NUM_WORDS - 1))

CRC_ITER macro qq, nn, iter
        mov     nn, [SRCDAT_4 (NUM_WORDS * (1 + iter))]

    i = 0
    rept NUM_WORDS - 1
        CRC_QUAD nn, i, iter
        i = i + 1
    endm

        MOVZXLO x6, qq
        mov     x3, qq
        shr     x3, 24
        CRC_XOR nn, x6, LAST + 3
        CRC_XOR nn, x3, LAST + 0
        ror     qq, 16
        MOVZXLO x6, qq
        shr     qq, 24
        CRC_XOR nn, x6, LAST + 1
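        ; qq now holds the one remaining byte (byte 1) of the previous CRC
        ; word. Normally its table term is xor-ed into the accumulator nn,
        ; because consecutive iterations swap the x0/x1 roles (see the
        ; "while unr_index" loop below) and the result travels through nn.
        ; When UNROLL_CNT is odd, the last iteration has no following swap,
        ; so it loads the term with CRC_MOV and folds nn into qq instead,
        ; keeping the running CRC in x0 at the bottom of the main loop.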
if ((UNROLL_CNT and 1) eq 1) and (iter eq (UNROLL_CNT - 1))
        CRC_MOV qq, qq, LAST + 2
        xor     qq, nn
else
        CRC_XOR nn, qq, LAST + 2
endif
endm


; + 4 for prefetching next 4-bytes after current iteration
NUM_BYTES_LIMIT equ     (NUM_WORDS * 4 * UNROLL_CNT + 4)
ALIGN_MASK      equ     3


; MY_PROC @CatStr(CrcUpdateT, 12), 4
MY_PROC @CatStr(CrcUpdateT, %(NUM_WORDS * 4)), 4
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
 ifdef x64
        mov     x0, REG_ABI_PARAM_0_x   ; x0 = x1(win) / x7(linux)
        mov     rT, REG_ABI_PARAM_3     ; r5 = r9(win) / r1(linux)
        mov     rN, REG_ABI_PARAM_2     ; r7 = r8(win) / r2(linux)
        ; mov     rD, REG_ABI_PARAM_1   ; r2 = r2(win)
  if (IS_LINUX gt 0)
        mov     rD, REG_ABI_PARAM_1     ; r2 = r6
  endif
 else
  if (IS_CDECL gt 0)
        mov     x0, [r4 + crc_OFFS]
        mov     rD, [r4 + data_OFFS]
  else
        mov     x0, REG_ABI_PARAM_0_x
  endif
        mov     rN, [r4 + size_OFFS]
        mov     rT, [r4 + table_OFFS]
 endif

        cmp     rN, NUM_BYTES_LIMIT + ALIGN_MASK
        jb      crc_end
@@:
        test    rD_x, ALIGN_MASK        ; test rD, ALIGN_MASK
        jz      @F
        CRC1b
        jmp     @B
@@:
        xor     x0, dword ptr [rD]
        lea     rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
        sub     rD, rN

align 16
@@:
unr_index = 0
while unr_index lt UNROLL_CNT
  if (unr_index and 1) eq 0
        CRC_ITER x0, x1, unr_index
  else
        CRC_ITER x1, x0, unr_index
  endif
  unr_index = unr_index + 1
endm

        add     rD, NUM_WORDS * 4 * UNROLL_CNT
        jnc     @B

if 0
        ; byte version
        add     rD, rN
        xor     x0, dword ptr [rD]
        add     rN, NUM_BYTES_LIMIT - 1
else
        ; 4-byte version
        add     rN, 4 * NUM_WORDS * UNROLL_CNT
        sub     rD, 4 * NUM_WORDS * UNROLL_CNT
@@:
        MOVZXLO x3, x0
        MOVZXHI x1, x0
        shr     x0, 16
        MOVZXLO x6, x0
        shr     x0, 8
        CRC_MOV x0, x0, 0
        CRC_XOR x0, x3, 3
        CRC_XOR x0, x1, 2
        CRC_XOR x0, x6, 1

        add     rD, 4
if (NUM_WORDS * UNROLL_CNT) ne 1
        jc      @F
        xor     x0, [SRCDAT_4 0]
        jmp     @B
@@:
endif
        add     rD, rN
        add     rN, 4 - 1

endif

        sub     rN, rD
crc_end:
        test    rN, rN
        jz      func_end
@@:
        CRC1b
        jnz     @B

func_end:
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
MY_ENDP

end
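; ---------------------------------------------------------------------------
; Reference sketch (placed after END, so the assembler ignores it).
; A minimal C equivalent of the fold done by the "4-byte version" tail loop
; above, assuming only the table layout that the CRC macro itself encodes:
; "table" is a flat array of NUM_WORDS * 4 sub-tables of 256 UInt32 entries
; (0400h bytes) each, indexed as [rT + byte * 4 + 0400h * t]. The helper
; name crc_fold_4 is hypothetical and used here for illustration only.
;
;   typedef unsigned int UInt32;  /* 32-bit, as in 7-Zip's 7zTypes.h */
;
;   /* fold one 32-bit word that has already been xor-ed into crc */
;   static UInt32 crc_fold_4(UInt32 crc, const UInt32 *table)
;   {
;     return table[0x300 + ( crc        & 0xFF)]    /* byte 0 -> t = 3 */
;          ^ table[0x200 + ((crc >>  8) & 0xFF)]    /* byte 1 -> t = 2 */
;          ^ table[0x100 + ((crc >> 16) & 0xFF)]    /* byte 2 -> t = 1 */
;          ^ table[0x000 + ( crc >> 24        )];   /* byte 3 -> t = 0 */
;   }
;
; The unrolled main loop performs the same kind of fold across NUM_WORDS
; words at once (12 bytes per CRC_ITER when NUM_WORDS = 3), spreading the
; lookups over sub-tables LAST + 3 down to 0.
; ---------------------------------------------------------------------------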