/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24*c217d954SCole Faust #include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
25*c217d954SCole Faust
26*c217d954SCole Faust #include "arm_compute/core/Error.h"
27*c217d954SCole Faust #include "arm_compute/core/Helpers.h"
28*c217d954SCole Faust #include "arm_compute/core/ITensor.h"
29*c217d954SCole Faust #include "arm_compute/core/TensorInfo.h"
30*c217d954SCole Faust #include "arm_compute/core/Types.h"
31*c217d954SCole Faust #include "arm_compute/core/Utils.h"
32*c217d954SCole Faust #include "arm_compute/core/Validate.h"
33*c217d954SCole Faust #include "arm_compute/core/Window.h"
34*c217d954SCole Faust #include "src/core/helpers/AutoConfiguration.h"
35*c217d954SCole Faust #include "src/core/helpers/WindowHelpers.h"
36*c217d954SCole Faust
37*c217d954SCole Faust #include <arm_neon.h>
38*c217d954SCole Faust
39*c217d954SCole Faust namespace arm_compute
40*c217d954SCole Faust {
41*c217d954SCole Faust namespace cpu
42*c217d954SCole Faust {
43*c217d954SCole Faust namespace kernels
44*c217d954SCole Faust {
45*c217d954SCole Faust namespace
46*c217d954SCole Faust {
/** Load 16 consecutive uint8_t values and zero-extend each of them to 16 bits.
 *
 * @param[in] row Pointer to the first of the 16 bytes to read.
 *
 * @return The widened values in element order: val[0] holds elements 0-3, val[1] elements 4-7,
 *         val[2] elements 8-11 and val[3] elements 12-15.
 */
inline uint16x4x4_t load_widened_u8x16(const uint8_t *row)
{
    const uint8x16_t bytes = vld1q_u8(row);
    const uint16x8_t lower = vmovl_u8(vget_low_u8(bytes));
    const uint16x8_t upper = vmovl_u8(vget_high_u8(bytes));

    const uint16x4x4_t widened =
    {
        {
            vget_low_u16(lower),
            vget_high_u16(lower),
            vget_low_u16(upper),
            vget_high_u16(upper)
        }
    };
    return widened;
}

/** Accumulate one widened row, multiplied by a single lane of @p scale, into 16 uint32_t accumulators.
 *
 * The lane index is a template argument because vmlal_lane_u16 needs it as a compile-time constant.
 *
 * @tparam lane Lane of @p scale to multiply with (0-3).
 *
 * @param[in,out] acc   The 16 running sums, updated in place.
 * @param[in]     row   16 widened values as returned by load_widened_u8x16.
 * @param[in]     scale Vector holding the multiplier in lane @p lane.
 */
template <int lane>
inline void mla_lane_u16x16(uint32x4x4_t &acc, const uint16x4x4_t &row, uint16x4_t scale)
{
    acc.val[0] = vmlal_lane_u16(acc.val[0], row.val[0], scale, lane);
    acc.val[1] = vmlal_lane_u16(acc.val[1], row.val[1], scale, lane);
    acc.val[2] = vmlal_lane_u16(acc.val[2], row.val[2], scale, lane);
    acc.val[3] = vmlal_lane_u16(acc.val[3], row.val[3], scale, lane);
}

/** Store the first @p count of 16 uint32_t accumulators to an int32_t destination.
 *
 * The accumulators are spilled to a local array first so that individual elements can be copied
 * without relying on compiler-specific subscripting of NEON vector types.
 *
 * @param[out] dst   Destination of the first element.
 * @param[in]  acc   The 16 accumulators, in element order.
 * @param[in]  count Number of leading elements to store. Values <= 0 store nothing, values above 16 are clamped.
 */
inline void store_leading_u32x16_as_s32(int32_t *dst, const uint32x4x4_t &acc, int count)
{
    int32_t lanes[16];
    vst1q_s32(lanes + 0, vreinterpretq_s32_u32(acc.val[0]));
    vst1q_s32(lanes + 4, vreinterpretq_s32_u32(acc.val[1]));
    vst1q_s32(lanes + 8, vreinterpretq_s32_u32(acc.val[2]));
    vst1q_s32(lanes + 12, vreinterpretq_s32_u32(acc.val[3]));

    for(int i = 0; (i < count) && (i < 16); ++i)
    {
        dst[i] = lanes[i];
    }
}

/** Multiply a vector of uint8_t values by a uint8_t matrix and write the sums as int32_t.
 *
 * At every window position a tile of up to 16 consecutive output values is produced: element i of
 * vector A scales the 16 bytes found at row i of matrix B, and the products are summed over all
 * @p width_a elements of A.
 *
 * NOTE(review): each row of B is always read 16 bytes wide, even when fewer than 16 outputs are
 * stored - presumably the caller guarantees those bytes are readable; confirm.
 *
 * @param[in]  ina       Iterator whose current pointer is read as vector A (@p width_a uint8_t values).
 * @param[in]  inb       Iterator whose current pointer is read as row 0 of the current 16-column slice of B.
 * @param[out] out       Iterator whose current pointer receives the int32_t results.
 * @param[in]  width_a   Number of elements of A to accumulate over (one row of B is consumed per element).
 * @param[in]  width_b   Window positions whose x coordinate is greater than this value are skipped.
 * @param[in]  width_out Number of valid output columns; limits how many of the 16 results are stored.
 * @param[in]  stride_b  Distance, in uint8_t elements, between consecutive rows of B.
 * @param[in]  window    Window handed to execute_window_loop.
 */
void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
{
    execute_window_loop(window, [&](const Coordinates & id)
    {
        // Nothing to compute for positions that start beyond width_b
        if(id.x() > width_b)
        {
            return;
        }

        // Inputs are unsigned, so the 16 running sums can be kept as uint32_t
        uint32x4x4_t c0 =
        {
            {
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0)
            }
        };

        auto       vec_a          = reinterpret_cast<const uint8_t *>(ina.ptr());
        auto       matrix_b       = reinterpret_cast<const uint8_t *>(inb.ptr());
        const auto vec_a_end_addr = vec_a + width_a;

        // Main loop: consume 8 elements of A (and 8 rows of B) per iteration.
        // The remaining-length form avoids computing a pointer in front of A when width_a < 8.
        while((vec_a_end_addr - vec_a) >= 8)
        {
            const uint16x8_t a_u16 = vmovl_u8(vld1_u8(vec_a));
            const uint16x4_t a_lo  = vget_low_u16(a_u16);  // elements 0-3 of this chunk of A
            const uint16x4_t a_hi  = vget_high_u16(a_u16); // elements 4-7 of this chunk of A

            mla_lane_u16x16<0>(c0, load_widened_u8x16(matrix_b + 0 * stride_b), a_lo);
            mla_lane_u16x16<1>(c0, load_widened_u8x16(matrix_b + 1 * stride_b), a_lo);
            mla_lane_u16x16<2>(c0, load_widened_u8x16(matrix_b + 2 * stride_b), a_lo);
            mla_lane_u16x16<3>(c0, load_widened_u8x16(matrix_b + 3 * stride_b), a_lo);
            mla_lane_u16x16<0>(c0, load_widened_u8x16(matrix_b + 4 * stride_b), a_hi);
            mla_lane_u16x16<1>(c0, load_widened_u8x16(matrix_b + 5 * stride_b), a_hi);
            mla_lane_u16x16<2>(c0, load_widened_u8x16(matrix_b + 6 * stride_b), a_hi);
            mla_lane_u16x16<3>(c0, load_widened_u8x16(matrix_b + 7 * stride_b), a_hi);

            vec_a += 8;
            matrix_b += 8 * stride_b;
        }

        // Left-over loop: one element of A (one row of B) per iteration
        while(vec_a < vec_a_end_addr)
        {
            // Broadcast the single element of A, then widen it; only lane 0 is used
            const uint16x4_t a_u16 = vget_low_u16(vmovl_u8(vld1_dup_u8(vec_a)));

            mla_lane_u16x16<0>(c0, load_widened_u8x16(matrix_b), a_u16);

            vec_a += 1;
            matrix_b += stride_b;
        }

        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
        if(id.x() + 16 <= width_out)
        {
            // A full tile fits (including the one ending exactly at width_out): vector stores
            vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
            vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
            vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
            vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
        }
        else
        {
            // Partial tile: store only the columns that are still inside the output row
            store_leading_u32x16_as_s32(vec_out, c0, width_out - id.x());
        }
    },
    ina, inb, out);
}
277*c217d954SCole Faust
/** Load 16 consecutive int8_t values and sign-extend each of them to 16 bits.
 *
 * @param[in] row Pointer to the first of the 16 bytes to read.
 *
 * @return The widened values in element order: val[0] holds elements 0-3, val[1] elements 4-7,
 *         val[2] elements 8-11 and val[3] elements 12-15.
 */
inline int16x4x4_t load_widened_s8x16(const int8_t *row)
{
    const int8x16_t bytes = vld1q_s8(row);
    const int16x8_t lower = vmovl_s8(vget_low_s8(bytes));
    const int16x8_t upper = vmovl_s8(vget_high_s8(bytes));

    const int16x4x4_t widened =
    {
        {
            vget_low_s16(lower),
            vget_high_s16(lower),
            vget_low_s16(upper),
            vget_high_s16(upper)
        }
    };
    return widened;
}

/** Accumulate one widened row, multiplied by a single lane of @p scale, into 16 int32_t accumulators.
 *
 * The lane index is a template argument because vmlal_lane_s16 needs it as a compile-time constant.
 *
 * @tparam lane Lane of @p scale to multiply with (0-3).
 *
 * @param[in,out] acc   The 16 running sums, updated in place.
 * @param[in]     row   16 widened values as returned by load_widened_s8x16.
 * @param[in]     scale Vector holding the multiplier in lane @p lane.
 */
template <int lane>
inline void mla_lane_s16x16(int32x4x4_t &acc, const int16x4x4_t &row, int16x4_t scale)
{
    acc.val[0] = vmlal_lane_s16(acc.val[0], row.val[0], scale, lane);
    acc.val[1] = vmlal_lane_s16(acc.val[1], row.val[1], scale, lane);
    acc.val[2] = vmlal_lane_s16(acc.val[2], row.val[2], scale, lane);
    acc.val[3] = vmlal_lane_s16(acc.val[3], row.val[3], scale, lane);
}

/** Store the first @p count of 16 int32_t accumulators.
 *
 * The accumulators are spilled to a local array first so that individual elements can be copied
 * without relying on compiler-specific subscripting of NEON vector types.
 *
 * @param[out] dst   Destination of the first element.
 * @param[in]  acc   The 16 accumulators, in element order.
 * @param[in]  count Number of leading elements to store. Values <= 0 store nothing, values above 16 are clamped.
 */
inline void store_leading_s32x16(int32_t *dst, const int32x4x4_t &acc, int count)
{
    int32_t lanes[16];
    vst1q_s32(lanes + 0, acc.val[0]);
    vst1q_s32(lanes + 4, acc.val[1]);
    vst1q_s32(lanes + 8, acc.val[2]);
    vst1q_s32(lanes + 12, acc.val[3]);

    for(int i = 0; (i < count) && (i < 16); ++i)
    {
        dst[i] = lanes[i];
    }
}

/** Multiply a vector of int8_t values by an int8_t matrix and write the sums as int32_t.
 *
 * At every window position a tile of up to 16 consecutive output values is produced: element i of
 * vector A scales the 16 bytes found at row i of matrix B, and the products are summed over all
 * @p width_a elements of A.
 *
 * NOTE(review): each row of B is always read 16 bytes wide, even when fewer than 16 outputs are
 * stored - presumably the caller guarantees those bytes are readable; confirm.
 *
 * @param[in]  ina       Iterator whose current pointer is read as vector A (@p width_a int8_t values).
 * @param[in]  inb       Iterator whose current pointer is read as row 0 of the current 16-column slice of B.
 * @param[out] out       Iterator whose current pointer receives the int32_t results.
 * @param[in]  width_a   Number of elements of A to accumulate over (one row of B is consumed per element).
 * @param[in]  width_b   Window positions whose x coordinate is greater than this value are skipped.
 * @param[in]  width_out Number of valid output columns; limits how many of the 16 results are stored.
 * @param[in]  stride_b  Distance, in int8_t elements, between consecutive rows of B.
 * @param[in]  window    Window handed to execute_window_loop.
 */
void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
{
    execute_window_loop(window, [&](const Coordinates & id)
    {
        // Nothing to compute for positions that start beyond width_b
        if(id.x() > width_b)
        {
            return;
        }

        // The 16 running sums of the current output tile
        int32x4x4_t c0 =
        {
            {
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0)
            }
        };

        auto       vec_a          = reinterpret_cast<const int8_t *>(ina.ptr());
        auto       matrix_b       = reinterpret_cast<const int8_t *>(inb.ptr());
        const auto vec_a_end_addr = vec_a + width_a;

        // Main loop: consume 8 elements of A (and 8 rows of B) per iteration.
        // The remaining-length form avoids computing a pointer in front of A when width_a < 8.
        while((vec_a_end_addr - vec_a) >= 8)
        {
            const int16x8_t a_s16 = vmovl_s8(vld1_s8(vec_a));
            const int16x4_t a_lo  = vget_low_s16(a_s16);  // elements 0-3 of this chunk of A
            const int16x4_t a_hi  = vget_high_s16(a_s16); // elements 4-7 of this chunk of A

            mla_lane_s16x16<0>(c0, load_widened_s8x16(matrix_b + 0 * stride_b), a_lo);
            mla_lane_s16x16<1>(c0, load_widened_s8x16(matrix_b + 1 * stride_b), a_lo);
            mla_lane_s16x16<2>(c0, load_widened_s8x16(matrix_b + 2 * stride_b), a_lo);
            mla_lane_s16x16<3>(c0, load_widened_s8x16(matrix_b + 3 * stride_b), a_lo);
            mla_lane_s16x16<0>(c0, load_widened_s8x16(matrix_b + 4 * stride_b), a_hi);
            mla_lane_s16x16<1>(c0, load_widened_s8x16(matrix_b + 5 * stride_b), a_hi);
            mla_lane_s16x16<2>(c0, load_widened_s8x16(matrix_b + 6 * stride_b), a_hi);
            mla_lane_s16x16<3>(c0, load_widened_s8x16(matrix_b + 7 * stride_b), a_hi);

            vec_a += 8;
            matrix_b += 8 * stride_b;
        }

        // Left-over loop: one element of A (one row of B) per iteration
        while(vec_a < vec_a_end_addr)
        {
            // Broadcast the single element of A, then sign-extend it to int16_t; only lane 0 is used
            const int16x4_t a_s16 = vget_low_s16(vmovl_s8(vld1_dup_s8(vec_a)));

            mla_lane_s16x16<0>(c0, load_widened_s8x16(matrix_b), a_s16);

            vec_a += 1;
            matrix_b += stride_b;
        }

        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
        if(id.x() + 16 <= width_out)
        {
            // A full tile fits (including the one ending exactly at width_out): vector stores
            vst1q_s32(vec_out + 0, c0.val[0]);
            vst1q_s32(vec_out + 4, c0.val[1]);
            vst1q_s32(vec_out + 8, c0.val[2]);
            vst1q_s32(vec_out + 12, c0.val[3]);
        }
        else
        {
            // Partial tile: store only the columns that are still inside the output row
            store_leading_s32x16(vec_out, c0, width_out - id.x());
        }
    },
    ina, inb, out);
}
507*c217d954SCole Faust
matrix_multiply_u8(Iterator & ina,Iterator & inb,Iterator & out,int width_b,const TensorInfo & out_info,const Window & window)508*c217d954SCole Faust void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
509*c217d954SCole Faust {
510*c217d954SCole Faust const auto width_out = static_cast<int>(out_info.dimension(0));
511*c217d954SCole Faust const auto height_out = static_cast<int>(out_info.dimension(1));
512*c217d954SCole Faust const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
513*c217d954SCole Faust execute_window_loop(window, [&](const Coordinates & id)
514*c217d954SCole Faust {
515*c217d954SCole Faust const uint8_t *mtx_a0 = ina.ptr();
516*c217d954SCole Faust const uint8_t *mtx_b0 = inb.ptr();
517*c217d954SCole Faust
518*c217d954SCole Faust // Note: Since the input are all positives, we can use uint32_t
519*c217d954SCole Faust // Accumulators for the block 0
520*c217d954SCole Faust uint32x4x4_t c0 =
521*c217d954SCole Faust {
522*c217d954SCole Faust {
523*c217d954SCole Faust vdupq_n_u32(0),
524*c217d954SCole Faust vdupq_n_u32(0),
525*c217d954SCole Faust vdupq_n_u32(0),
526*c217d954SCole Faust vdupq_n_u32(0)
527*c217d954SCole Faust }
528*c217d954SCole Faust };
529*c217d954SCole Faust
530*c217d954SCole Faust // Accumulators for the block 1
531*c217d954SCole Faust uint32x4x4_t c1 =
532*c217d954SCole Faust {
533*c217d954SCole Faust {
534*c217d954SCole Faust vdupq_n_u32(0),
535*c217d954SCole Faust vdupq_n_u32(0),
536*c217d954SCole Faust vdupq_n_u32(0),
537*c217d954SCole Faust vdupq_n_u32(0)
538*c217d954SCole Faust }
539*c217d954SCole Faust };
540*c217d954SCole Faust
541*c217d954SCole Faust // Accumulators for the block 2
542*c217d954SCole Faust uint32x4x4_t c2 =
543*c217d954SCole Faust {
544*c217d954SCole Faust {
545*c217d954SCole Faust vdupq_n_u32(0),
546*c217d954SCole Faust vdupq_n_u32(0),
547*c217d954SCole Faust vdupq_n_u32(0),
548*c217d954SCole Faust vdupq_n_u32(0)
549*c217d954SCole Faust }
550*c217d954SCole Faust };
551*c217d954SCole Faust
552*c217d954SCole Faust // Accumulators for the block 3
553*c217d954SCole Faust uint32x4x4_t c3 =
554*c217d954SCole Faust {
555*c217d954SCole Faust {
556*c217d954SCole Faust vdupq_n_u32(0),
557*c217d954SCole Faust vdupq_n_u32(0),
558*c217d954SCole Faust vdupq_n_u32(0),
559*c217d954SCole Faust vdupq_n_u32(0)
560*c217d954SCole Faust }
561*c217d954SCole Faust };
562*c217d954SCole Faust
563*c217d954SCole Faust for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
564*c217d954SCole Faust {
565*c217d954SCole Faust const uint8x8_t a00_u8 = vld1_u8(mtx_a0);
566*c217d954SCole Faust const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
567*c217d954SCole Faust
568*c217d954SCole Faust // Convert a00_u8 to uint16_t and get the lower part
569*c217d954SCole Faust const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
570*c217d954SCole Faust
571*c217d954SCole Faust // Convert b00_s8 to uint16_t
572*c217d954SCole Faust const uint16x4x4_t b00_u16 =
573*c217d954SCole Faust {
574*c217d954SCole Faust {
575*c217d954SCole Faust vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
576*c217d954SCole Faust vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
577*c217d954SCole Faust vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
578*c217d954SCole Faust vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
579*c217d954SCole Faust }
580*c217d954SCole Faust };
581*c217d954SCole Faust
582*c217d954SCole Faust // 4x4 block 0
583*c217d954SCole Faust c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
584*c217d954SCole Faust c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
585*c217d954SCole Faust c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
586*c217d954SCole Faust c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
587*c217d954SCole Faust
588*c217d954SCole Faust // 4x4 block 1
589*c217d954SCole Faust c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
590*c217d954SCole Faust c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
591*c217d954SCole Faust c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
592*c217d954SCole Faust c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);
593*c217d954SCole Faust
594*c217d954SCole Faust // 4x4 block 2
595*c217d954SCole Faust c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
596*c217d954SCole Faust c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
597*c217d954SCole Faust c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
598*c217d954SCole Faust c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);
599*c217d954SCole Faust
600*c217d954SCole Faust // 4x4 block 3
601*c217d954SCole Faust c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
602*c217d954SCole Faust c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
603*c217d954SCole Faust c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
604*c217d954SCole Faust c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
605*c217d954SCole Faust }
606*c217d954SCole Faust
607*c217d954SCole Faust auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
608*c217d954SCole Faust
609*c217d954SCole Faust if(id.y() < height_out && id.x() < (width_out - 16))
610*c217d954SCole Faust {
611*c217d954SCole Faust vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
612*c217d954SCole Faust vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
613*c217d954SCole Faust vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
614*c217d954SCole Faust vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
615*c217d954SCole Faust if(id.y() + 1 < height_out)
616*c217d954SCole Faust {
617*c217d954SCole Faust vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
618*c217d954SCole Faust vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
619*c217d954SCole Faust vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
620*c217d954SCole Faust vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
621*c217d954SCole Faust if(id.y() + 2 < height_out)
622*c217d954SCole Faust {
623*c217d954SCole Faust vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
624*c217d954SCole Faust vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
625*c217d954SCole Faust vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
626*c217d954SCole Faust vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
627*c217d954SCole Faust if(id.y() + 3 < height_out)
628*c217d954SCole Faust {
629*c217d954SCole Faust vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
630*c217d954SCole Faust vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
631*c217d954SCole Faust vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
632*c217d954SCole Faust vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
633*c217d954SCole Faust }
634*c217d954SCole Faust }
635*c217d954SCole Faust }
636*c217d954SCole Faust }
637*c217d954SCole Faust else
638*c217d954SCole Faust {
639*c217d954SCole Faust const auto left_over_value = width_out - id.x();
640*c217d954SCole Faust auto left_over = left_over_value;
641*c217d954SCole Faust for(auto k = 0; k < 4 && left_over; ++k)
642*c217d954SCole Faust {
643*c217d954SCole Faust for(auto j = 0; j < 4 && left_over; ++j, --left_over)
644*c217d954SCole Faust {
645*c217d954SCole Faust *(mtx_out + k * 4 + j) = c0.val[k][j];
646*c217d954SCole Faust }
647*c217d954SCole Faust }
648*c217d954SCole Faust if(id.y() + 1 < height_out)
649*c217d954SCole Faust {
650*c217d954SCole Faust left_over = left_over_value;
651*c217d954SCole Faust for(auto k = 0; k < 4 && left_over; ++k)
652*c217d954SCole Faust {
653*c217d954SCole Faust for(auto j = 0; j < 4 && left_over; ++j, --left_over)
654*c217d954SCole Faust {
655*c217d954SCole Faust *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
656*c217d954SCole Faust }
657*c217d954SCole Faust }
658*c217d954SCole Faust if(id.y() + 2 < height_out)
659*c217d954SCole Faust {
660*c217d954SCole Faust left_over = left_over_value;
661*c217d954SCole Faust for(auto k = 0; k < 4 && left_over; ++k)
662*c217d954SCole Faust {
663*c217d954SCole Faust for(auto j = 0; j < 4 && left_over; ++j, --left_over)
664*c217d954SCole Faust {
665*c217d954SCole Faust *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
666*c217d954SCole Faust }
667*c217d954SCole Faust }
668*c217d954SCole Faust if(id.y() + 3 < height_out)
669*c217d954SCole Faust {
670*c217d954SCole Faust left_over = left_over_value;
671*c217d954SCole Faust for(auto k = 0; k < 4 && left_over; ++k)
672*c217d954SCole Faust {
673*c217d954SCole Faust for(auto j = 0; j < 4 && left_over; ++j, --left_over)
674*c217d954SCole Faust {
675*c217d954SCole Faust *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
676*c217d954SCole Faust }
677*c217d954SCole Faust }
678*c217d954SCole Faust }
679*c217d954SCole Faust }
680*c217d954SCole Faust }
681*c217d954SCole Faust }
682*c217d954SCole Faust },
683*c217d954SCole Faust ina, inb, out);
684*c217d954SCole Faust }
685*c217d954SCole Faust
matrix_multiply_s8(Iterator & ina,Iterator & inb,Iterator & out,int width_b,const TensorInfo & out_info,const Window & window)686*c217d954SCole Faust void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
687*c217d954SCole Faust {
688*c217d954SCole Faust const auto width_out = static_cast<int>(out_info.dimension(0));
689*c217d954SCole Faust const auto height_out = static_cast<int>(out_info.dimension(1));
690*c217d954SCole Faust const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
691*c217d954SCole Faust // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW
692*c217d954SCole Faust // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
693*c217d954SCole Faust // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
694*c217d954SCole Faust execute_window_loop(window, [&](const Coordinates & id)
695*c217d954SCole Faust {
696*c217d954SCole Faust auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
697*c217d954SCole Faust auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
698*c217d954SCole Faust
699*c217d954SCole Faust // Note: Since the input are all positives, we can use uint32_t
700*c217d954SCole Faust // Accumulators for the block 0
701*c217d954SCole Faust int32x4x4_t c0 =
702*c217d954SCole Faust {
703*c217d954SCole Faust {
704*c217d954SCole Faust vdupq_n_s32(0),
705*c217d954SCole Faust vdupq_n_s32(0),
706*c217d954SCole Faust vdupq_n_s32(0),
707*c217d954SCole Faust vdupq_n_s32(0)
708*c217d954SCole Faust }
709*c217d954SCole Faust };
710*c217d954SCole Faust
711*c217d954SCole Faust // Accumulators for the block 1
712*c217d954SCole Faust int32x4x4_t c1 =
713*c217d954SCole Faust {
714*c217d954SCole Faust {
715*c217d954SCole Faust vdupq_n_s32(0),
716*c217d954SCole Faust vdupq_n_s32(0),
717*c217d954SCole Faust vdupq_n_s32(0),
718*c217d954SCole Faust vdupq_n_s32(0)
719*c217d954SCole Faust }
720*c217d954SCole Faust };
721*c217d954SCole Faust
722*c217d954SCole Faust // Accumulators for the block 2
723*c217d954SCole Faust int32x4x4_t c2 =
724*c217d954SCole Faust {
725*c217d954SCole Faust {
726*c217d954SCole Faust vdupq_n_s32(0),
727*c217d954SCole Faust vdupq_n_s32(0),
728*c217d954SCole Faust vdupq_n_s32(0),
729*c217d954SCole Faust vdupq_n_s32(0)
730*c217d954SCole Faust }
731*c217d954SCole Faust };
732*c217d954SCole Faust
733*c217d954SCole Faust // Accumulators for the block 3
734*c217d954SCole Faust int32x4x4_t c3 =
735*c217d954SCole Faust {
736*c217d954SCole Faust {
737*c217d954SCole Faust vdupq_n_s32(0),
738*c217d954SCole Faust vdupq_n_s32(0),
739*c217d954SCole Faust vdupq_n_s32(0),
740*c217d954SCole Faust vdupq_n_s32(0)
741*c217d954SCole Faust }
742*c217d954SCole Faust };
743*c217d954SCole Faust
744*c217d954SCole Faust for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
745*c217d954SCole Faust {
746*c217d954SCole Faust const int8x8_t a00_s8 = vld1_s8(mtx_a0);
747*c217d954SCole Faust const int8x16_t b00_s8 = vld1q_s8(mtx_b0);
748*c217d954SCole Faust
749*c217d954SCole Faust // Convert a00_s8 to uint16_t and get the lower part
750*c217d954SCole Faust const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
751*c217d954SCole Faust
752*c217d954SCole Faust // Convert b00_s8 to int16_t
753*c217d954SCole Faust const int16x4x4_t b00_s16 =
754*c217d954SCole Faust {
755*c217d954SCole Faust {
756*c217d954SCole Faust vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
757*c217d954SCole Faust vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
758*c217d954SCole Faust vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
759*c217d954SCole Faust vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
760*c217d954SCole Faust }
761*c217d954SCole Faust };
762*c217d954SCole Faust
763*c217d954SCole Faust // 4x4 block 0
764*c217d954SCole Faust c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
765*c217d954SCole Faust c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
766*c217d954SCole Faust c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
767*c217d954SCole Faust c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
768*c217d954SCole Faust
769*c217d954SCole Faust // 4x4 block 1
770*c217d954SCole Faust c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
771*c217d954SCole Faust c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
772*c217d954SCole Faust c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
773*c217d954SCole Faust c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);
774*c217d954SCole Faust
775*c217d954SCole Faust // 4x4 block 2
776*c217d954SCole Faust c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
777*c217d954SCole Faust c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
778*c217d954SCole Faust c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
779*c217d954SCole Faust c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);
780*c217d954SCole Faust
781*c217d954SCole Faust // 4x4 block 3
782*c217d954SCole Faust c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
783*c217d954SCole Faust c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
784*c217d954SCole Faust c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
785*c217d954SCole Faust c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
786*c217d954SCole Faust }
787*c217d954SCole Faust auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
788*c217d954SCole Faust if(id.y() < height_out && id.x() < (width_out - 16))
789*c217d954SCole Faust {
790*c217d954SCole Faust vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
791*c217d954SCole Faust vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
792*c217d954SCole Faust vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
793*c217d954SCole Faust vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
794*c217d954SCole Faust if(id.y() + 1 < height_out)
795*c217d954SCole Faust {
796*c217d954SCole Faust vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
797*c217d954SCole Faust vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
798*c217d954SCole Faust vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
799*c217d954SCole Faust vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
800*c217d954SCole Faust if(id.y() + 2 < height_out)
801*c217d954SCole Faust {
802*c217d954SCole Faust vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
803*c217d954SCole Faust vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
804*c217d954SCole Faust vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
805*c217d954SCole Faust vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
806*c217d954SCole Faust if(id.y() + 3 < height_out)
807*c217d954SCole Faust {
808*c217d954SCole Faust vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
809*c217d954SCole Faust vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
810*c217d954SCole Faust vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
811*c217d954SCole Faust vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
812*c217d954SCole Faust }
813*c217d954SCole Faust }
814*c217d954SCole Faust }
815*c217d954SCole Faust }
816*c217d954SCole Faust else if(id.y() < height_out)
817*c217d954SCole Faust {
818*c217d954SCole Faust const auto left_over_value = width_out - id.x();
819*c217d954SCole Faust auto left_over = left_over_value;
820*c217d954SCole Faust for(auto k = 0; k < 4 && left_over; ++k)
821*c217d954SCole Faust {
822*c217d954SCole Faust for(auto j = 0; j < 4 && left_over; ++j, --left_over)
823*c217d954SCole Faust {
824*c217d954SCole Faust *(mtx_out + k * 4 + j) = c0.val[k][j];
825*c217d954SCole Faust }
826*c217d954SCole Faust }
827*c217d954SCole Faust if(id.y() + 1 < height_out)
828*c217d954SCole Faust {
829*c217d954SCole Faust left_over = left_over_value;
830*c217d954SCole Faust for(auto k = 0; k < 4 && left_over; ++k)
831*c217d954SCole Faust {
832*c217d954SCole Faust for(auto j = 0; j < 4 && left_over; ++j, --left_over)
833*c217d954SCole Faust {
834*c217d954SCole Faust *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
835*c217d954SCole Faust }
836*c217d954SCole Faust }
837*c217d954SCole Faust if(id.y() + 2 < height_out)
838*c217d954SCole Faust {
839*c217d954SCole Faust left_over = left_over_value;
840*c217d954SCole Faust for(auto k = 0; k < 4 && left_over; ++k)
841*c217d954SCole Faust {
842*c217d954SCole Faust for(auto j = 0; j < 4 && left_over; ++j, --left_over)
843*c217d954SCole Faust {
844*c217d954SCole Faust *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
845*c217d954SCole Faust }
846*c217d954SCole Faust }
847*c217d954SCole Faust if(id.y() + 3 < height_out)
848*c217d954SCole Faust {
849*c217d954SCole Faust left_over = left_over_value;
850*c217d954SCole Faust for(auto k = 0; k < 4 && left_over; ++k)
851*c217d954SCole Faust {
852*c217d954SCole Faust for(auto j = 0; j < 4 && left_over; ++j, --left_over)
853*c217d954SCole Faust {
854*c217d954SCole Faust *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
855*c217d954SCole Faust }
856*c217d954SCole Faust }
857*c217d954SCole Faust }
858*c217d954SCole Faust }
859*c217d954SCole Faust }
860*c217d954SCole Faust }
861*c217d954SCole Faust
862*c217d954SCole Faust },
863*c217d954SCole Faust ina, inb, out);
864*c217d954SCole Faust }
865*c217d954SCole Faust
validate_arguments(const ITensorInfo * src0,const ITensorInfo * src1,const ITensorInfo * dst)866*c217d954SCole Faust Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
867*c217d954SCole Faust {
868*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
869*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
870*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
871*c217d954SCole Faust
872*c217d954SCole Faust TensorShape in0_shape = src0->tensor_shape();
873*c217d954SCole Faust TensorShape in1_shape = src1->tensor_shape();
874*c217d954SCole Faust TensorShape out_shape = dst->tensor_shape();
875*c217d954SCole Faust
876*c217d954SCole Faust // Check vector-by-matrix case
877*c217d954SCole Faust if(out_shape[1] == 1)
878*c217d954SCole Faust {
879*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows");
880*c217d954SCole Faust }
881*c217d954SCole Faust else
882*c217d954SCole Faust {
883*c217d954SCole Faust in0_shape.collapse(2);
884*c217d954SCole Faust in1_shape.collapse(2);
885*c217d954SCole Faust out_shape.collapse(2);
886*c217d954SCole Faust
887*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
888*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
889*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16");
890*c217d954SCole Faust }
891*c217d954SCole Faust
892*c217d954SCole Faust return Status{};
893*c217d954SCole Faust }
894*c217d954SCole Faust } // namespace
895*c217d954SCole Faust
configure(const ITensorInfo * src0,const ITensorInfo * src1,ITensorInfo * dst)896*c217d954SCole Faust void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
897*c217d954SCole Faust {
898*c217d954SCole Faust ARM_COMPUTE_UNUSED(src0);
899*c217d954SCole Faust ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
900*c217d954SCole Faust ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst));
901*c217d954SCole Faust
902*c217d954SCole Faust TensorShape in1_shape = src1->tensor_shape();
903*c217d954SCole Faust in1_shape.collapse(2);
904*c217d954SCole Faust
905*c217d954SCole Faust _slide_matrix_b = in1_shape[2] != 1;
906*c217d954SCole Faust
907*c217d954SCole Faust constexpr unsigned int num_elems_processed_per_iteration_x = 16;
908*c217d954SCole Faust constexpr unsigned int num_elems_processed_per_iteration_y = 4;
909*c217d954SCole Faust
910*c217d954SCole Faust Window win;
911*c217d954SCole Faust // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication
912*c217d954SCole Faust if((dst->dimension(1) == 1))
913*c217d954SCole Faust {
914*c217d954SCole Faust // Configure kernel window
915*c217d954SCole Faust win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x));
916*c217d954SCole Faust }
917*c217d954SCole Faust else
918*c217d954SCole Faust {
919*c217d954SCole Faust win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
920*c217d954SCole Faust }
921*c217d954SCole Faust
922*c217d954SCole Faust ICpuKernel::configure(win);
923*c217d954SCole Faust }
924*c217d954SCole Faust
validate(const ITensorInfo * src0,const ITensorInfo * src1,const ITensorInfo * dst)925*c217d954SCole Faust Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
926*c217d954SCole Faust {
927*c217d954SCole Faust ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst));
928*c217d954SCole Faust return Status{};
929*c217d954SCole Faust }
930*c217d954SCole Faust
run_op(ITensorPack & tensors,const Window & window,const ThreadInfo & info)931*c217d954SCole Faust void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
932*c217d954SCole Faust {
933*c217d954SCole Faust ARM_COMPUTE_UNUSED(info);
934*c217d954SCole Faust ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
935*c217d954SCole Faust ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
936*c217d954SCole Faust
937*c217d954SCole Faust auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
938*c217d954SCole Faust auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
939*c217d954SCole Faust auto dst = tensors.get_tensor(TensorType::ACL_DST);
940*c217d954SCole Faust
941*c217d954SCole Faust // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication path
942*c217d954SCole Faust if((dst->info()->dimension(1) == 1))
943*c217d954SCole Faust {
944*c217d954SCole Faust const auto width_matrix_a = static_cast<int>(src0->info()->dimension(0));
945*c217d954SCole Faust const auto width_matrix_b = static_cast<int>(src1->info()->dimension(0));
946*c217d954SCole Faust const auto width_out = static_cast<int>(dst->info()->dimension(0));
947*c217d954SCole Faust const auto in_b_stride = static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type()));
948*c217d954SCole Faust
949*c217d954SCole Faust // The implementation computes 16 elements per iteration
950*c217d954SCole Faust const int window_start_x = 16 * info.thread_id;
951*c217d954SCole Faust const int window_step_x = 16 * info.num_threads;
952*c217d954SCole Faust // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
953*c217d954SCole Faust const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
954*c217d954SCole Faust
955*c217d954SCole Faust Window win_out(window);
956*c217d954SCole Faust win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
957*c217d954SCole Faust win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
958*c217d954SCole Faust
959*c217d954SCole Faust Window win_a(window);
960*c217d954SCole Faust win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
961*c217d954SCole Faust win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
962*c217d954SCole Faust
963*c217d954SCole Faust Window win_b;
964*c217d954SCole Faust // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
965*c217d954SCole Faust // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
966*c217d954SCole Faust if(src1->info()->num_dimensions() >= 3)
967*c217d954SCole Faust {
968*c217d954SCole Faust win_b = window;
969*c217d954SCole Faust }
970*c217d954SCole Faust win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
971*c217d954SCole Faust win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
972*c217d954SCole Faust
973*c217d954SCole Faust Iterator ina(src0, win_a);
974*c217d954SCole Faust Iterator inb(src1, win_b);
975*c217d954SCole Faust Iterator out(dst, win_out);
976*c217d954SCole Faust
977*c217d954SCole Faust switch(src0->info()->data_type())
978*c217d954SCole Faust {
979*c217d954SCole Faust case DataType::S8:
980*c217d954SCole Faust case DataType::QASYMM8_SIGNED:
981*c217d954SCole Faust {
982*c217d954SCole Faust vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
983*c217d954SCole Faust break;
984*c217d954SCole Faust }
985*c217d954SCole Faust case DataType::U8:
986*c217d954SCole Faust case DataType::QASYMM8:
987*c217d954SCole Faust {
988*c217d954SCole Faust vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
989*c217d954SCole Faust break;
990*c217d954SCole Faust }
991*c217d954SCole Faust default:
992*c217d954SCole Faust {
993*c217d954SCole Faust ARM_COMPUTE_ERROR("Not supported");
994*c217d954SCole Faust break;
995*c217d954SCole Faust }
996*c217d954SCole Faust }
997*c217d954SCole Faust }
998*c217d954SCole Faust else
999*c217d954SCole Faust {
1000*c217d954SCole Faust const size_t in_b_stride = src1->info()->strides_in_bytes()[1];
1001*c217d954SCole Faust const int width_b = src1->info()->dimension(0);
1002*c217d954SCole Faust
1003*c217d954SCole Faust // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
1004*c217d954SCole Faust Window win_a(window);
1005*c217d954SCole Faust win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
1006*c217d954SCole Faust win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1));
1007*c217d954SCole Faust
1008*c217d954SCole Faust // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix
1009*c217d954SCole Faust Window win_b;
1010*c217d954SCole Faust // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
1011*c217d954SCole Faust // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
1012*c217d954SCole Faust if(_slide_matrix_b)
1013*c217d954SCole Faust {
1014*c217d954SCole Faust win_b = window;
1015*c217d954SCole Faust }
1016*c217d954SCole Faust win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride));
1017*c217d954SCole Faust win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
1018*c217d954SCole Faust
1019*c217d954SCole Faust // The step x and step y for the output matrix has been already set using in configure()
1020*c217d954SCole Faust Iterator ina(src0, win_a);
1021*c217d954SCole Faust Iterator inb(src1, win_b);
1022*c217d954SCole Faust Iterator out(dst, window);
1023*c217d954SCole Faust
1024*c217d954SCole Faust switch(src0->info()->data_type())
1025*c217d954SCole Faust {
1026*c217d954SCole Faust case DataType::S8:
1027*c217d954SCole Faust case DataType::QASYMM8_SIGNED:
1028*c217d954SCole Faust {
1029*c217d954SCole Faust matrix_multiply_s8(ina, inb, out, width_b, *dst->info(), window);
1030*c217d954SCole Faust break;
1031*c217d954SCole Faust }
1032*c217d954SCole Faust case DataType::U8:
1033*c217d954SCole Faust case DataType::QASYMM8:
1034*c217d954SCole Faust {
1035*c217d954SCole Faust matrix_multiply_u8(ina, inb, out, width_b, *dst->info(), window);
1036*c217d954SCole Faust break;
1037*c217d954SCole Faust }
1038*c217d954SCole Faust default:
1039*c217d954SCole Faust {
1040*c217d954SCole Faust ARM_COMPUTE_ERROR("Not supported");
1041*c217d954SCole Faust break;
1042*c217d954SCole Faust }
1043*c217d954SCole Faust }
1044*c217d954SCole Faust }
1045*c217d954SCole Faust }
1046*c217d954SCole Faust
name() const1047*c217d954SCole Faust const char *CpuGemmLowpMatrixMultiplyKernel::name() const
1048*c217d954SCole Faust {
1049*c217d954SCole Faust return "CpuGemmLowpMatrixMultiplyKernel";
1050*c217d954SCole Faust }
1051*c217d954SCole Faust } // namespace kernels
1052*c217d954SCole Faust } // namespace cpu
1053*c217d954SCole Faust } // namespace arm_compute