/*
 * Copyright 2017 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10*4e366538SXin Li
11*4e366538SXin Li #include "libyuv/basic_types.h"
12*4e366538SXin Li
13*4e366538SXin Li #include "libyuv/compare_row.h"
14*4e366538SXin Li #include "libyuv/row.h"
15*4e366538SXin Li
// This module is for GCC MSA
17*4e366538SXin Li #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
18*4e366538SXin Li #include "libyuv/macros_msa.h"
19*4e366538SXin Li
20*4e366538SXin Li #ifdef __cplusplus
21*4e366538SXin Li namespace libyuv {
22*4e366538SXin Li extern "C" {
23*4e366538SXin Li #endif
24*4e366538SXin Li
HammingDistance_MSA(const uint8_t * src_a,const uint8_t * src_b,int count)25*4e366538SXin Li uint32_t HammingDistance_MSA(const uint8_t* src_a,
26*4e366538SXin Li const uint8_t* src_b,
27*4e366538SXin Li int count) {
28*4e366538SXin Li uint32_t diff = 0u;
29*4e366538SXin Li int i;
30*4e366538SXin Li v16u8 src0, src1, src2, src3;
31*4e366538SXin Li v2i64 vec0 = {0}, vec1 = {0};
32*4e366538SXin Li
33*4e366538SXin Li for (i = 0; i < count; i += 32) {
34*4e366538SXin Li src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
35*4e366538SXin Li src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
36*4e366538SXin Li src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
37*4e366538SXin Li src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
38*4e366538SXin Li src0 ^= src2;
39*4e366538SXin Li src1 ^= src3;
40*4e366538SXin Li vec0 += __msa_pcnt_d((v2i64)src0);
41*4e366538SXin Li vec1 += __msa_pcnt_d((v2i64)src1);
42*4e366538SXin Li src_a += 32;
43*4e366538SXin Li src_b += 32;
44*4e366538SXin Li }
45*4e366538SXin Li
46*4e366538SXin Li vec0 += vec1;
47*4e366538SXin Li diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0);
48*4e366538SXin Li diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2);
49*4e366538SXin Li return diff;
50*4e366538SXin Li }
51*4e366538SXin Li
SumSquareError_MSA(const uint8_t * src_a,const uint8_t * src_b,int count)52*4e366538SXin Li uint32_t SumSquareError_MSA(const uint8_t* src_a,
53*4e366538SXin Li const uint8_t* src_b,
54*4e366538SXin Li int count) {
55*4e366538SXin Li uint32_t sse = 0u;
56*4e366538SXin Li int i;
57*4e366538SXin Li v16u8 src0, src1, src2, src3;
58*4e366538SXin Li v8i16 vec0, vec1, vec2, vec3;
59*4e366538SXin Li v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0};
60*4e366538SXin Li v2i64 tmp0;
61*4e366538SXin Li
62*4e366538SXin Li for (i = 0; i < count; i += 32) {
63*4e366538SXin Li src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
64*4e366538SXin Li src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
65*4e366538SXin Li src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
66*4e366538SXin Li src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
67*4e366538SXin Li vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
68*4e366538SXin Li vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
69*4e366538SXin Li vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
70*4e366538SXin Li vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
71*4e366538SXin Li vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
72*4e366538SXin Li vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
73*4e366538SXin Li vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
74*4e366538SXin Li vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
75*4e366538SXin Li reg0 = __msa_dpadd_s_w(reg0, vec0, vec0);
76*4e366538SXin Li reg1 = __msa_dpadd_s_w(reg1, vec1, vec1);
77*4e366538SXin Li reg2 = __msa_dpadd_s_w(reg2, vec2, vec2);
78*4e366538SXin Li reg3 = __msa_dpadd_s_w(reg3, vec3, vec3);
79*4e366538SXin Li src_a += 32;
80*4e366538SXin Li src_b += 32;
81*4e366538SXin Li }
82*4e366538SXin Li
83*4e366538SXin Li reg0 += reg1;
84*4e366538SXin Li reg2 += reg3;
85*4e366538SXin Li reg0 += reg2;
86*4e366538SXin Li tmp0 = __msa_hadd_s_d(reg0, reg0);
87*4e366538SXin Li sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0);
88*4e366538SXin Li sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2);
89*4e366538SXin Li return sse;
90*4e366538SXin Li }
91*4e366538SXin Li
92*4e366538SXin Li #ifdef __cplusplus
93*4e366538SXin Li } // extern "C"
94*4e366538SXin Li } // namespace libyuv
95*4e366538SXin Li #endif
96*4e366538SXin Li
97*4e366538SXin Li #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
98