1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert MR % 4 == 0 7$assert NR % 4 == 0 8$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 9#include <assert.h> 10 11#include <wasm_simd128.h> 12 13#include <xnnpack/ppmm.h> 14 15 16$WASM_F32X4_MIN={"MINMAX": "wasm_f32x4_min", "PMINMAX": "wasm_f32x4_pmin"}[MINMAX] 17$WASM_F32X4_MAX={"MINMAX": "wasm_f32x4_max", "PMINMAX": "wasm_f32x4_pmax"}[MINMAX] 18$ARCH_SUFFIX = "_x86" if MINMAX == "PMINMAX" else "_arm" 19void xnn_f32_ppmm_minmax_ukernel_${MR}x${NR}__wasmsimd${ARCH_SUFFIX}_splat( 20 size_t mr, 21 size_t nc, 22 size_t kc, 23 const float*restrict a, 24 const float*restrict w, 25 float*restrict c, 26 size_t cm_stride, 27 size_t cn_stride, 28 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) 29{ 30 assert(mr != 0); 31 assert(mr <= ${MR}); 32 assert(nc != 0); 33 assert(kc != 0); 34 assert(kc % sizeof(float) == 0); 35 36 float* c0 = c; 37 $for M in range(1, MR): 38 float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride); 39 $if M % 2 == 0: 40 if XNN_UNPREDICTABLE(mr <= ${M}) { 41 c${M} = c${M-1}; 42 } 43 $elif M + 1 == MR: 44 if XNN_UNPREDICTABLE(mr != ${M+1}) { 45 c${M} = c${M-1}; 46 } 47 $else: 48 if XNN_UNPREDICTABLE(mr < ${M+1}) { 49 c${M} = c${M-1}; 50 } 51 52 const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); 53 const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); 54 do { 55 v128_t vacc0x${ABC[0:4]} = wasm_v128_load(w); 56 $for N in range(4, NR, 4): 57 v128_t vacc0x${ABC[N:N+4]} = wasm_v128_load(w + ${N}); 58 $for M in range(1, MR): 59 $for N in range(0, NR, 4): 60 v128_t vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]}; 61 w += ${NR}; 62 63 size_t k = kc; 64 do { 65 const v128_t va${ABC[0:4]} = wasm_v128_load(a); 66 $for M in range(4, MR, 4): 67 const v128_t va${ABC[M:M+4]} = wasm_v128_load(a + ${M}); 68 a += ${MR}; 69 70 const v128_t vb${ABC[0:4]} = wasm_v128_load(w); 71 $for N in range(4, NR, 4): 72 const v128_t vb${ABC[N:N+4]} = wasm_v128_load(w + ${N}); 73 w += ${NR}; 74 75 $for M in range(MR): 76 $MMMM = str(M) * 4 77 const v128_t va${MMMM} = wasm_v32x4_shuffle(va${ABC[M&-4:4+M&-4]}, va${ABC[M&-4:4+M&-4]}, ${M}, ${M}, ${M}, ${M}); 78 79 $for N in range(0, NR, 4): 80 $for M in range(MR): 81 $MMMM = str(M) * 4 82 vacc${M}x${ABC[N:N+4]} = wasm_f32x4_add(vacc${M}x${ABC[N:N+4]}, wasm_f32x4_mul(va${MMMM}, vb${ABC[N:N+4]})); 83 84 k -= sizeof(float); 85 } while (k != 0); 86 87 $for N in range(0, NR, 4): 88 $for M in range(MR): 89 vacc${M}x${ABC[N:N+4]} = ${WASM_F32X4_MAX}(vmin, vacc${M}x${ABC[N:N+4]}); 90 91 $for N in range(0, NR, 4): 92 $for M in range(MR): 93 vacc${M}x${ABC[N:N+4]} = ${WASM_F32X4_MIN}(vmax, vacc${M}x${ABC[N:N+4]}); 94 95 if XNN_LIKELY(nc >= ${NR}) { 96 $for M in reversed(range(MR)): 97 wasm_v128_store(c${M}, vacc${M}x${ABC[0:4]}); 98 $for N in range(4, NR, 4): 99 wasm_v128_store(c${M} + ${N}, vacc${M}x${ABC[N:N+4]}); 100 101 a = (const float*) ((uintptr_t) a - kc * ${MR}); 102 103 $for M in reversed(range(MR)): 104 c${M} = (float*) ((uintptr_t) c${M} + cn_stride); 105 106 nc -= ${NR}; 107 } else { 108 $for LOG2N in reversed(range(NR.bit_length())): 109 $if NR != 1 << LOG2N: 110 if (nc & ${1 << LOG2N}) { 111 $if LOG2N >= 2: 112 $for M in reversed(range(MR)): 113 wasm_v128_store(c${M}, vacc${M}x${ABC[0:4]}); 114 $for N in range(4, 1 << LOG2N, 4): 115 wasm_v128_store(c${M} + ${N}, vacc${M}x${ABC[N:N+4]}); 116 117 $for M in reversed(range(MR)): 118 $for N in range(0, 1 << (LOG2N - 1), 4): 119 vacc${M}x${ABC[N:N+4]} = vacc${M}x${ABC[N + (1 << LOG2N):N + (1 << LOG2N)+4]}; 120 121 $for M in reversed(range(MR)): 122 c${M} += ${1 << LOG2N}; 123 $elif LOG2N == 1: 124 $for M in reversed(range(MR)): 125 *((double*) c${M}) = wasm_f64x2_extract_lane(vacc${M}x${ABC[0:4]}, 0); 126 127 $for M in reversed(range(MR)): 128 vacc${M}x${ABC[0:4]} = wasm_v32x4_shuffle(vacc${M}x${ABC[0:4]}, vacc${M}x${ABC[0:4]}, 2, 3, 2, 3); 129 130 $for M in reversed(range(MR)): 131 c${M} += 2; 132 $elif LOG2N == 0: 133 $for M in reversed(range(MR)): 134 *c${M} = wasm_f32x4_extract_lane(vacc${M}x${ABC[0:4]}, 0); 135 } 136 137 nc = 0; 138 } 139 } while (nc != 0); 140} 141