// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go && (ppc64 || ppc64le)

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used throughout this file:
//   - R0 is read as the constant zero (it is stored as c=0 and used as
//     the zero operand of ADDZE/SUBE).
//   - Multi-word carries are propagated through the CA bit. The ADD
//     (immediate), CMP, MOVD/MOVDU, MULLD/MULHDU and BDNZ instructions
//     placed between the carry-consuming instructions leave CA untouched.
//   - MOVDU is a load/store with update: it also advances the base
//     register by the displacement, which is how the array pointers move.

// func addVV(z, x, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
// The number of words processed is len(z); c is the final carry (0 or 1).
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7   // R7 = z_len
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y[]
	MOVD  z+0(FP), R10      // R10 = z[]

	// If z_len = 0, we are done
	CMP   R7, $0
	MOVD  R0, R4            // R4 = c = 0
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11        // R11 = x[i]
	MOVD  0(R9), R12        // R12 = y[i]
	ADD   $-1, R7           // R7 = z_len - 1
	ADDC  R12, R11, R15     // R15 = x[i] + y[i], set CA
	CMP   R7, $0
	MOVD  R15, 0(R10)       // z[i]
	BEQ   final             // If z_len was 1, we are done

	SRD   $2, R7, R5        // R5 = (z_len-1)/4 = number of unrolled iterations
	CMP   R5, $0
	MOVD  R5, CTR           // Set up loop counter
	BEQ   tail              // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.90x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).

	PCALIGN $16
loop:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3]
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R9), R19       // R19 = y[i+3]
	ADDE  R11, R16, R20     // R20 = x[i] + y[i] + CA
	ADDE  R12, R17, R21     // R21 = x[i+1] + y[i+1] + CA
	ADDE  R14, R18, R22     // R22 = x[i+2] + y[i+2] + CA
	ADDE  R15, R19, R23     // R23 = x[i+3] + y[i+3] + CA
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3]
	ADD   $-4, R7           // R7 = remaining - 4
	BDNZ  loop

	// We may have more elements to read
	CMP   R7, $0
	BEQ   final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU 8(R8), R11        // R11 = x[i]
	MOVDU 8(R9), R16        // R16 = y[i]
	ADD   $-1, R7           // R7 = remaining - 1
	ADDE  R11, R16, R20     // R20 = x[i] + y[i] + CA
	CMP   R7, $0
	MOVDU R20, 8(R10)       // z[i]
	BEQ   final             // If R7 = 0, we are done

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	ADD   $-1, R7
	ADDE  R11, R16, R20
	CMP   R7, $0
	MOVDU R20, 8(R10)
	BEQ   final

	MOVD  8(R8), R11
	MOVD  8(R9), R16
	ADDE  R11, R16, R20
	MOVD  R20, 8(R10)

final:
	ADDZE R4                // Capture CA: R4 = 0 + CA

done:
	MOVD  R4, c+72(FP)
	RET

// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, borrowing
// The number of words processed is len(z); c is the final borrow (0 or 1).
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7   // R7 = z_len
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y[]
	MOVD  z+0(FP), R10      // R10 = z[]

	// If z_len = 0, we are done
	CMP   R7, $0
	MOVD  R0, R4            // R4 = c = 0
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11        // R11 = x[i]
	MOVD  0(R9), R12        // R12 = y[i]
	ADD   $-1, R7           // R7 = z_len - 1
	SUBC  R12, R11, R15     // R15 = x[i] - y[i], set CA
	CMP   R7, $0
	MOVD  R15, 0(R10)       // z[i]
	BEQ   final             // If z_len was 1, we are done

	SRD   $2, R7, R5        // R5 = (z_len-1)/4 = number of unrolled iterations
	CMP   R5, $0
	MOVD  R5, CTR           // Set up loop counter
	BEQ   tail              // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.92x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).

	PCALIGN $16
loop:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3]
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R9), R19       // R19 = y[i+3]
	SUBE  R16, R11, R20     // R20 = x[i] - y[i] + CA
	SUBE  R17, R12, R21     // R21 = x[i+1] - y[i+1] + CA
	SUBE  R18, R14, R22     // R22 = x[i+2] - y[i+2] + CA
	SUBE  R19, R15, R23     // R23 = x[i+3] - y[i+3] + CA
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3]
	ADD   $-4, R7           // R7 = remaining - 4
	BDNZ  loop

	// We may have more elements to read
	CMP   R7, $0
	BEQ   final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU 8(R8), R11        // R11 = x[i]
	MOVDU 8(R9), R16        // R16 = y[i]
	ADD   $-1, R7           // R7 = remaining - 1
	SUBE  R16, R11, R20     // R20 = x[i] - y[i] + CA
	CMP   R7, $0
	MOVDU R20, 8(R10)       // z[i]
	BEQ   final             // If R7 = 0, we are done

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	ADD   $-1, R7
	SUBE  R16, R11, R20
	CMP   R7, $0
	MOVDU R20, 8(R10)
	BEQ   final

	MOVD  8(R8), R11
	MOVD  8(R9), R16
	SUBE  R16, R11, R20
	MOVD  R20, 8(R10)

final:
	// After a subtract, CA = 1 means "no borrow". Capture CA into R4
	// (which holds 0) and invert it so that c = 1 signals a borrow.
	ADDZE R4
	XOR   $1, R4

done:
	MOVD  R4, c+72(FP)
	RET

// func addVW(z, x []Word, y Word) (c Word)
// z = x + y, where y is a single word added at x[0] and the carry is
// propagated upwards. Processes len(z) words; returns the final carry.
// If len(z) is 0, y is returned unchanged as c.
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = z[]
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R4      // R4 = y = c
	MOVD  z_len+8(FP), R11  // R11 = z_len

	CMP   R11, $0           // If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20        // R20 = x[i]
	ADD   $-1, R11          // R11 = z_len - 1
	ADDC  R20, R4, R6       // R6 = x[i] + c
	CMP   R11, $0           // If z_len was 1, we are done
	MOVD  R6, 0(R10)        // z[i]
	BEQ   final

	// We will read 4 elements per iteration
	SRDCC $2, R11, R9       // R9 = (z_len-1)/4, sets CR0 for the BEQ below
	DCBT  (R8)
	MOVD  R9, CTR           // Set up the loop counter
	BEQ   tail              // If R9 = 0, we can't use the loop
	PCALIGN $16

loop:
	MOVD  8(R8), R20        // R20 = x[i]
	MOVD  16(R8), R21       // R21 = x[i+1]
	MOVD  24(R8), R22       // R22 = x[i+2]
	MOVDU 32(R8), R23       // R23 = x[i+3]
	ADDZE R20, R24          // R24 = x[i] + CA
	ADDZE R21, R25          // R25 = x[i+1] + CA
	ADDZE R22, R26          // R26 = x[i+2] + CA
	ADDZE R23, R27          // R27 = x[i+3] + CA
	MOVD  R24, 8(R10)       // z[i]
	MOVD  R25, 16(R10)      // z[i+1]
	MOVD  R26, 24(R10)      // z[i+2]
	MOVDU R27, 32(R10)      // z[i+3]
	ADD   $-4, R11          // R11 = remaining - 4
	BDNZ  loop

	// We may have some elements to read
	CMP   R11, $0
	BEQ   final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU 8(R8), R20        // R20 = x[i]
	ADDZE R20, R24          // R24 = x[i] + CA
	ADD   $-1, R11
	MOVDU R24, 8(R10)       // z[i]
	CMP   R11, $0
	BEQ   final

	MOVDU 8(R8), R20
	ADDZE R20, R24
	ADD   $-1, R11
	MOVDU R24, 8(R10)
	CMP   R11, $0
	BEQ   final

	MOVD  8(R8), R20
	ADDZE R20, R24
	MOVD  R24, 8(R10)

final:
	ADDZE R0, R4            // c = CA
done:
	MOVD  R4, c+56(FP)
	RET

// func subVW(z, x []Word, y Word) (c Word)
// z = x - y, where y is a single word subtracted at x[0] and the borrow
// is propagated upwards. Processes len(z) words; returns the final borrow.
// If len(z) is 0, y is returned unchanged as c.
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = z[]
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R4      // R4 = y = c
	MOVD  z_len+8(FP), R11  // R11 = z_len

	CMP   R11, $0           // If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20        // R20 = x[i]
	ADD   $-1, R11          // R11 = z_len - 1
	SUBC  R4, R20, R6       // R6 = x[i] - c
	CMP   R11, $0           // If z_len was 1, we are done
	MOVD  R6, 0(R10)        // z[i]
	BEQ   final

	// We will read 4 elements per iteration
	SRDCC $2, R11, R9       // R9 = (z_len-1)/4, sets CR0 for the BEQ below
	DCBT  (R8)
	MOVD  R9, CTR           // Set up the loop counter
	BEQ   tail              // If R9 = 0, we can't use the loop

	// The loop here is almost the same as the one used in s390x, but
	// we don't need to capture CA every iteration because we've already
	// done that above.

	PCALIGN $16
loop:
	MOVD  8(R8), R20        // R20 = x[i]
	MOVD  16(R8), R21       // R21 = x[i+1]
	MOVD  24(R8), R22       // R22 = x[i+2]
	MOVDU 32(R8), R23       // R23 = x[i+3]
	SUBE  R0, R20           // R20 = x[i] - 0 - borrow
	SUBE  R0, R21           // R21 = x[i+1] - 0 - borrow
	SUBE  R0, R22           // R22 = x[i+2] - 0 - borrow
	SUBE  R0, R23           // R23 = x[i+3] - 0 - borrow
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3]
	ADD   $-4, R11          // R11 = remaining - 4
	BDNZ  loop

	// We may have some elements to read
	CMP   R11, $0
	BEQ   final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU 8(R8), R20        // R20 = x[i]
	SUBE  R0, R20           // R20 = x[i] - 0 - borrow
	ADD   $-1, R11
	MOVDU R20, 8(R10)       // z[i]
	CMP   R11, $0
	BEQ   final

	MOVDU 8(R8), R20
	SUBE  R0, R20
	ADD   $-1, R11
	MOVDU R20, 8(R10)
	CMP   R11, $0
	BEQ   final

	MOVD  8(R8), R20
	SUBE  R0, R20
	MOVD  R20, 8(R10)

final:
	// Capture CA
	// R4 - R4 with borrow-in yields 0 when CA = 1 (no borrow) and -1
	// when CA = 0 (borrow); negating turns that into c = 0 or c = 1.
	SUBE  R4, R4
	NEG   R4, R4

done:
	MOVD  R4, c+56(FP)
	RET

// func shlVU(z, x []Word, s uint) (c Word)
// z = x << s, processed from the most significant word down to z[0].
// Returns the bits shifted out of the top word, x[len(z)-1] >> (64-s).
// x is indexed using len(z). When s == 0 this is copy(z, x) and c = 0.
TEXT ·shlVU(SB), NOSPLIT, $0
	MOVD  z+0(FP), R3       // R3 = z[]
	MOVD  x+24(FP), R6      // R6 = x[]
	MOVD  s+48(FP), R9      // R9 = s
	MOVD  z_len+8(FP), R4   // R4 = len(z)
	MOVD  x_len+32(FP), R7  // R7 = len(x)
	CMP   R9, $0            // s==0 copy(z,x)
	BEQ   zeroshift
	CMP   R4, $0            // len(z)==0 return
	BEQ   done

	ADD   $-1, R4, R5       // len(z)-1
	SUBC  R9, $64, R4       // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
	SLD   $3, R5, R7        // byte offset of the last word
	ADD   R6, R7, R15       // save starting address &x[len(z)-1]
	ADD   R3, R7, R16       // save starting address &z[len(z)-1]
	MOVD  (R6)(R7), R14
	SRD   R4, R14, R7       // compute x[len(z)-1]>>ŝ into R7
	CMP   R5, $0            // iterate from i=len(z)-1 to 0
	BEQ   loopexit          // Already at end?
	MOVD  0(R15), R10       // x[i]
	PCALIGN $16
shloop:
	SLD   R9, R10, R10      // x[i]<<s
	MOVDU -8(R15), R14      // R14 = x[i-1], R15 = &x[i-1]
	SRD   R4, R14, R11      // x[i-1]>>ŝ
	OR    R11, R10, R10
	MOVD  R10, 0(R16)       // z[i]=x[i]<<s | x[i-1]>>ŝ
	MOVD  R14, R10          // reuse x[i-1] for next iteration
	ADD   $-8, R16          // i--
	CMP   R15, R6           // &x[i-1]>&x[0]?
	BGT   shloop
loopexit:
	MOVD  0(R6), R4
	SLD   R9, R4, R4
	MOVD  R4, 0(R3)         // z[0]=x[0]<<s
	MOVD  R7, c+56(FP)      // store pre-computed x[len(z)-1]>>ŝ into c
	RET

zeroshift:
	CMP   R6, $0            // x is null, nothing to copy
	BEQ   done
	CMP   R6, R3            // if x is same as z, nothing to copy
	BEQ   done
	CMP   R7, R4
	ISEL  $0, R7, R4, R7    // Take the lower bound of lengths of x,z
	// The copy loops below test their bound only after the first
	// load/store, so a zero-length copy must be filtered out here;
	// otherwise one word would be read and written out of bounds.
	CMP   R7, $0
	BEQ   done
	SLD   $3, R7, R7        // R7 = number of bytes to copy
	SUB   R6, R3, R11       // dest - src
	CMPU  R11, R7, CR2      // < len?
	BLT   CR2, backward     // there is overlap, copy backwards
	MOVD  $0, R14
	// shlVU processes backwards, but added a forward copy option
	// since it's faster on POWER
repeat:
	MOVD  (R6)(R14), R15    // Copy 8 bytes at a time
	MOVD  R15, (R3)(R14)
	ADD   $8, R14
	CMP   R14, R7           // More 8 bytes left?
	BLT   repeat
	BR    done
backward:
	ADD   $-8, R7, R14      // R14 = byte offset of the last word
repeatback:
	MOVD  (R6)(R14), R15    // copy x into z backwards
	MOVD  R15, (R3)(R14)    // copy 8 bytes at a time
	SUB   $8, R14
	CMP   R14, $-8          // More 8 bytes left?
	BGT   repeatback

done:
	MOVD  R0, c+56(FP)      // c=0
	RET

// func shrVU(z, x []Word, s uint) (c Word)
// z = x >> s, processed from z[0] up to the most significant word.
// Returns the bits shifted out of the bottom word, x[0] << (64-s).
// x is indexed using len(z). When s == 0 this is copy(z, x) and c = 0.
TEXT ·shrVU(SB), NOSPLIT, $0
	MOVD  z+0(FP), R3       // R3 = z[]
	MOVD  x+24(FP), R6      // R6 = x[]
	MOVD  s+48(FP), R9      // R9 = s
	MOVD  z_len+8(FP), R4   // R4 = len(z)
	MOVD  x_len+32(FP), R7  // R7 = len(x)

	CMP   R9, $0            // s==0, copy(z,x)
	BEQ   zeroshift
	CMP   R4, $0            // len(z)==0 return
	BEQ   done
	SUBC  R9, $64, R5       // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)

	MOVD  0(R6), R7
	SLD   R5, R7, R7        // compute x[0]<<ŝ
	MOVD  $1, R8            // iterate from i=1 to i<len(z)
	CMP   R8, R4
	BGE   loopexit          // Already at end?

	// vectorize if len(z) is >=3, else jump to scalar loop
	CMP   R4, $3
	BLT   scalar
	MTVSRD R9, VS38         // s
	VSPLTB $7, V6, V4       // V4 = s splatted across the vector
	MTVSRD R5, VS39         // ŝ
	VSPLTB $7, V7, V2       // V2 = ŝ splatted across the vector
	ADD   $-2, R4, R16      // R16 = len(z)-2, last i with a full pair left
	PCALIGN $16
loopback:
	ADD   $-1, R8, R10
	SLD   $3, R10           // R10 = byte offset of x[i-1]
	LXVD2X (R6)(R10), VS32  // load x[i-1], x[i]
	SLD   $3, R8, R12       // R12 = byte offset of x[i]
	LXVD2X (R6)(R12), VS33  // load x[i], x[i+1]

	VSRD  V0, V4, V3        // x[i-1]>>s, x[i]>>s
	VSLD  V1, V2, V5        // x[i]<<ŝ, x[i+1]<<ŝ
	VOR   V3, V5, V5        // Or(|) the two registers together
	STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
	ADD   $2, R8            // Done processing 2 entries, i and i+1
	CMP   R8, R16           // Are there at least a couple of more entries left?
	BLE   loopback
	CMP   R8, R4            // Are we at the last element?
	BEQ   loopexit
scalar:
	ADD   $-1, R8, R10
	SLD   $3, R10
	MOVD  (R6)(R10), R11
	SRD   R9, R11, R11      // x[len(z)-2] >> s
	SLD   $3, R8, R12
	MOVD  (R6)(R12), R12
	SLD   R5, R12, R12      // x[len(z)-1]<<ŝ
	OR    R12, R11, R11     // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
	MOVD  R11, (R3)(R10)    // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
loopexit:
	ADD   $-1, R4
	SLD   $3, R4            // R4 = byte offset of the last word
	MOVD  (R6)(R4), R5
	SRD   R9, R5, R5        // x[len(z)-1]>>s
	MOVD  R5, (R3)(R4)      // z[len(z)-1]=x[len(z)-1]>>s
	MOVD  R7, c+56(FP)      // store pre-computed x[0]<<ŝ into c
	RET

zeroshift:
	CMP   R6, $0            // x is null, nothing to copy
	BEQ   done
	CMP   R6, R3            // if x is same as z, nothing to copy
	BEQ   done
	CMP   R7, R4
	ISEL  $0, R7, R4, R7    // Take the lower bounds of lengths of x, z
	// The copy loop below tests its bound only after the first
	// load/store, so a zero-length copy must be filtered out here;
	// otherwise one word would be read and written out of bounds.
	CMP   R7, $0
	BEQ   done
	SLD   $3, R7, R7        // R7 = number of bytes to copy
	MOVD  $0, R14
repeat:
	MOVD  (R6)(R14), R15    // copy 8 bytes at a time
	MOVD  R15, (R3)(R14)    // shrVU processes bytes only forwards
	ADD   $8, R14
	CMP   R14, R7           // More 8 bytes left?
	BLT   repeat
done:
	MOVD  R0, c+56(FP)      // c=0
	RET

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// z = x*y + r, where y and r are single words. Processes len(z) words
// and returns the final carry word. If len(z) is 0, r is returned as c.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = z[]
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y
	MOVD  r+56(FP), R4      // R4 = r = c
	MOVD  z_len+8(FP), R11  // R11 = z_len

	CMP   R11, $0
	BEQ   done

	// Process the first element out of the loop so the loop and tail
	// can use MOVDU with an 8-byte pre-increment.
	MOVD  0(R8), R20
	ADD   $-1, R11
	MULLD  R9, R20, R6      // R6 = z0 = Low-order(x[i]*y)
	MULHDU R9, R20, R7      // R7 = z1 = High-order(x[i]*y)
	ADDC  R4, R6            // R6 = z0 + r
	ADDZE R7, R4            // R4 = z1 + CA
	CMP   R11, $0
	MOVD  R6, 0(R10)        // z[i]
	BEQ   done

	// We will read 4 elements per iteration
	SRDCC $2, R11, R14      // R14 = (z_len-1)/4, sets CR0 for the BEQ below
	DCBT  (R8)
	MOVD  R14, CTR          // Set up the loop counter
	BEQ   tail              // If R14 = 0, we can't use the loop
	PCALIGN $16

loop:
	MOVD  8(R8), R20        // R20 = x[i]
	MOVD  16(R8), R21       // R21 = x[i+1]
	MOVD  24(R8), R22       // R22 = x[i+2]
	MOVDU 32(R8), R23       // R23 = x[i+3]
	MULLD  R9, R20, R24     // R24 = z0[i]
	MULHDU R9, R20, R20     // R20 = z1[i]
	ADDC  R4, R24           // R24 = z0[i] + c
	MULLD  R9, R21, R25     // R25 = z0[i+1]
	MULHDU R9, R21, R21     // R21 = z1[i+1]
	ADDE  R20, R25          // R25 = z0[i+1] + z1[i] + CA
	MULLD  R9, R22, R26     // R26 = z0[i+2]
	MULHDU R9, R22, R22     // R22 = z1[i+2]
	MULLD  R9, R23, R27     // R27 = z0[i+3]
	MULHDU R9, R23, R23     // R23 = z1[i+3]
	ADDE  R21, R26          // R26 = z0[i+2] + z1[i+1] + CA
	MOVD  R24, 8(R10)       // z[i]
	MOVD  R25, 16(R10)      // z[i+1]
	ADDE  R22, R27          // R27 = z0[i+3] + z1[i+2] + CA
	ADDZE R23, R4           // update carry
	MOVD  R26, 24(R10)      // z[i+2]
	MOVDU R27, 32(R10)      // z[i+3]
	ADD   $-4, R11          // R11 = remaining - 4
	BDNZ  loop

	// We may have some elements to read
	CMP   R11, $0
	BEQ   done

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU 8(R8), R20        // R20 = x[i]
	MULLD  R9, R20, R24     // R24 = z0[i]
	MULHDU R9, R20, R25     // R25 = z1[i]
	ADD   $-1, R11          // R11 = remaining - 1
	ADDC  R4, R24           // R24 = z0[i] + c
	ADDZE R25, R4           // c = z1[i] + CA
	MOVDU R24, 8(R10)       // z[i]
	CMP   R11, $0
	BEQ   done              // If R11 = 0, we are done

	MOVDU 8(R8), R20
	MULLD  R9, R20, R24
	MULHDU R9, R20, R25
	ADD   $-1, R11
	ADDC  R4, R24
	ADDZE R25, R4
	MOVDU R24, 8(R10)
	CMP   R11, $0
	BEQ   done

	MOVD  8(R8), R20
	MULLD  R9, R20, R24
	MULHDU R9, R20, R25
	ADD   $-1, R11
	ADDC  R4, R24
	ADDZE R25, R4
	MOVD  R24, 8(R10)

done:
	MOVD  R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// z += x*y, where y is a single word. Processes len(z) words and
// returns the final carry word.
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R3       // R3 = z[]
	MOVD  x+24(FP), R4      // R4 = x[]
	MOVD  y+48(FP), R5      // R5 = y
	MOVD  z_len+8(FP), R6   // R6 = z_len

	CMP   R6, $4
	MOVD  R0, R9            // R9 = c = 0
	BLT   tail              // Fewer than 4 words: skip the unrolled loop
	SRD   $2, R6, R7        // R7 = z_len/4
	MOVD  R7, CTR           // Initialize loop counter
	PCALIGN $16

loop:
	MOVD  0(R4), R14        // x[i]
	MOVD  8(R4), R16        // x[i+1]
	MOVD  16(R4), R18       // x[i+2]
	MOVD  24(R4), R20       // x[i+3]
	MOVD  0(R3), R15        // z[i]
	MOVD  8(R3), R17        // z[i+1]
	MOVD  16(R3), R19       // z[i+2]
	MOVD  24(R3), R21       // z[i+3]
	MULLD  R5, R14, R10     // low x[i]*y
	MULHDU R5, R14, R11     // high x[i]*y
	ADDC  R15, R10          // low += z[i]
	ADDZE R11               // high += CA
	ADDC  R9, R10           // low += c
	ADDZE R11, R9           // c = high + CA
	MULLD  R5, R16, R14     // low x[i+1]*y
	MULHDU R5, R16, R15     // high x[i+1]*y
	ADDC  R17, R14
	ADDZE R15
	ADDC  R9, R14
	ADDZE R15, R9
	MULLD  R5, R18, R16     // low x[i+2]*y
	MULHDU R5, R18, R17     // high x[i+2]*y
	ADDC  R19, R16
	ADDZE R17
	ADDC  R9, R16
	ADDZE R17, R9
	MULLD  R5, R20, R18     // low x[i+3]*y
	MULHDU R5, R20, R19     // high x[i+3]*y
	ADDC  R21, R18
	ADDZE R19
	ADDC  R9, R18
	ADDZE R19, R9
	MOVD  R10, 0(R3)        // z[i]
	MOVD  R14, 8(R3)        // z[i+1]
	MOVD  R16, 16(R3)       // z[i+2]
	MOVD  R18, 24(R3)       // z[i+3]
	ADD   $32, R3
	ADD   $32, R4
	BDNZ  loop

	ANDCC $3, R6            // R6 = z_len % 4 = words left for the tail
tail:
	CMP   R6, $0
	BEQ   done
	MOVD  R6, CTR
	PCALIGN $16
tailloop:
	MOVD  0(R4), R14        // x[i]
	MOVD  0(R3), R15        // z[i]
	MULLD  R5, R14, R10     // low x[i]*y
	MULHDU R5, R14, R11     // high x[i]*y
	ADDC  R15, R10          // low += z[i]
	ADDZE R11               // high += CA
	ADDC  R9, R10           // low += c
	ADDZE R11, R9           // c = high + CA
	MOVD  R10, 0(R3)        // z[i]
	ADD   $8, R3
	ADD   $8, R4
	BDNZ  tailloop

done:
	MOVD  R9, c+56(FP)
	RET