1 #[cfg(target_arch = "x86")]
2 use core::arch::x86::*;
3 #[cfg(target_arch = "x86_64")]
4 use core::arch::x86_64::*;
5 
6 #[repr(C)]
7 union UnionCast {
8     u32x4: [u32; 4],
9     f32x4: [f32; 4],
10     m128: __m128,
11 }
12 
m128_from_f32x4(f32x4: [f32; 4]) -> __m12813 pub const fn m128_from_f32x4(f32x4: [f32; 4]) -> __m128 {
14     unsafe { UnionCast { f32x4 }.m128 }
15 }
16 
m128_from_u32x4(u32x4: [u32; 4]) -> __m12817 const fn m128_from_u32x4(u32x4: [u32; 4]) -> __m128 {
18     unsafe { UnionCast { u32x4 }.m128 }
19 }
20 
21 const PS_ABS_MASK: __m128 = m128_from_u32x4([0x7fffffff; 4]);
22 const PS_INV_SIGN_MASK: __m128 = m128_from_u32x4([!0x8000_0000; 4]);
23 const PS_SIGN_MASK: __m128 = m128_from_u32x4([0x8000_0000; 4]);
24 const PS_NO_FRACTION: __m128 = m128_from_f32x4([8388608.0; 4]);
25 const PS_NEGATIVE_ZERO: __m128 = m128_from_u32x4([0x8000_0000; 4]);
26 const PS_PI: __m128 = m128_from_f32x4([core::f32::consts::PI; 4]);
27 const PS_HALF_PI: __m128 = m128_from_f32x4([core::f32::consts::FRAC_PI_2; 4]);
28 const PS_SIN_COEFFICIENTS0: __m128 =
29     m128_from_f32x4([-0.16666667, 0.008_333_331, -0.00019840874, 2.752_556_2e-6]);
30 const PS_SIN_COEFFICIENTS1: __m128 = m128_from_f32x4([
31     -2.388_985_9e-8,
32     -0.16665852,      /*Est1*/
33     0.008_313_95,     /*Est2*/
34     -0.000_185_246_7, /*Est3*/
35 ]);
36 const PS_ONE: __m128 = m128_from_f32x4([1.0; 4]);
37 const PS_TWO_PI: __m128 = m128_from_f32x4([core::f32::consts::TAU; 4]);
38 const PS_RECIPROCAL_TWO_PI: __m128 = m128_from_f32x4([0.159_154_94; 4]);
39 
40 /// Calculates the vector 3 dot product and returns answer in x lane of __m128.
41 #[inline(always)]
dot3_in_x(lhs: __m128, rhs: __m128) -> __m12842 pub(crate) unsafe fn dot3_in_x(lhs: __m128, rhs: __m128) -> __m128 {
43     let x2_y2_z2_w2 = _mm_mul_ps(lhs, rhs);
44     let y2_0_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, 0b00_00_00_01);
45     let z2_0_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, 0b00_00_00_10);
46     let x2y2_0_0_0 = _mm_add_ss(x2_y2_z2_w2, y2_0_0_0);
47     _mm_add_ss(x2y2_0_0_0, z2_0_0_0)
48 }
49 
50 /// Calculates the vector 4 dot product and returns answer in x lane of __m128.
51 #[inline(always)]
dot4_in_x(lhs: __m128, rhs: __m128) -> __m12852 pub(crate) unsafe fn dot4_in_x(lhs: __m128, rhs: __m128) -> __m128 {
53     let x2_y2_z2_w2 = _mm_mul_ps(lhs, rhs);
54     let z2_w2_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, 0b00_00_11_10);
55     let x2z2_y2w2_0_0 = _mm_add_ps(x2_y2_z2_w2, z2_w2_0_0);
56     let y2w2_0_0_0 = _mm_shuffle_ps(x2z2_y2w2_0_0, x2z2_y2w2_0_0, 0b00_00_00_01);
57     _mm_add_ps(x2z2_y2w2_0_0, y2w2_0_0_0)
58 }
59 
60 #[inline]
dot3(lhs: __m128, rhs: __m128) -> f3261 pub(crate) unsafe fn dot3(lhs: __m128, rhs: __m128) -> f32 {
62     _mm_cvtss_f32(dot3_in_x(lhs, rhs))
63 }
64 
65 #[inline]
dot3_into_m128(lhs: __m128, rhs: __m128) -> __m12866 pub(crate) unsafe fn dot3_into_m128(lhs: __m128, rhs: __m128) -> __m128 {
67     let dot_in_x = dot3_in_x(lhs, rhs);
68     _mm_shuffle_ps(dot_in_x, dot_in_x, 0b00_00_00_00)
69 }
70 
71 #[inline]
dot4(lhs: __m128, rhs: __m128) -> f3272 pub(crate) unsafe fn dot4(lhs: __m128, rhs: __m128) -> f32 {
73     _mm_cvtss_f32(dot4_in_x(lhs, rhs))
74 }
75 
76 #[inline]
dot4_into_m128(lhs: __m128, rhs: __m128) -> __m12877 pub(crate) unsafe fn dot4_into_m128(lhs: __m128, rhs: __m128) -> __m128 {
78     let dot_in_x = dot4_in_x(lhs, rhs);
79     _mm_shuffle_ps(dot_in_x, dot_in_x, 0b00_00_00_00)
80 }
81 
82 #[inline]
m128_floor(v: __m128) -> __m12883 pub(crate) unsafe fn m128_floor(v: __m128) -> __m128 {
84     // Based on https://github.com/microsoft/DirectXMath `XMVectorFloor`
85     // To handle NAN, INF and numbers greater than 8388608, use masking
86     let test = _mm_and_si128(_mm_castps_si128(v), _mm_castps_si128(PS_INV_SIGN_MASK));
87     let test = _mm_cmplt_epi32(test, _mm_castps_si128(PS_NO_FRACTION));
88     // Truncate
89     let vint = _mm_cvttps_epi32(v);
90     let result = _mm_cvtepi32_ps(vint);
91     let larger = _mm_cmpgt_ps(result, v);
92     // 0 -> 0, 0xffffffff -> -1.0f
93     let larger = _mm_cvtepi32_ps(_mm_castps_si128(larger));
94     let result = _mm_add_ps(result, larger);
95     // All numbers less than 8388608 will use the round to int
96     let result = _mm_and_ps(result, _mm_castsi128_ps(test));
97     // All others, use the ORIGINAL value
98     let test = _mm_andnot_si128(test, _mm_castps_si128(v));
99     _mm_or_ps(result, _mm_castsi128_ps(test))
100 }
101 
102 #[inline]
m128_ceil(v: __m128) -> __m128103 pub(crate) unsafe fn m128_ceil(v: __m128) -> __m128 {
104     // Based on https://github.com/microsoft/DirectXMath `XMVectorCeil`
105     // To handle NAN, INF and numbers greater than 8388608, use masking
106     let test = _mm_and_si128(_mm_castps_si128(v), _mm_castps_si128(PS_INV_SIGN_MASK));
107     let test = _mm_cmplt_epi32(test, _mm_castps_si128(PS_NO_FRACTION));
108     // Truncate
109     let vint = _mm_cvttps_epi32(v);
110     let result = _mm_cvtepi32_ps(vint);
111     let smaller = _mm_cmplt_ps(result, v);
112     // 0 -> 0, 0xffffffff -> -1.0f
113     let smaller = _mm_cvtepi32_ps(_mm_castps_si128(smaller));
114     let result = _mm_sub_ps(result, smaller);
115     // All numbers less than 8388608 will use the round to int
116     let result = _mm_and_ps(result, _mm_castsi128_ps(test));
117     // All others, use the ORIGINAL value
118     let test = _mm_andnot_si128(test, _mm_castps_si128(v));
119     _mm_or_ps(result, _mm_castsi128_ps(test))
120 }
121 
122 #[inline]
m128_abs(v: __m128) -> __m128123 pub(crate) unsafe fn m128_abs(v: __m128) -> __m128 {
124     _mm_and_ps(v, _mm_castsi128_ps(_mm_set1_epi32(0x7f_ff_ff_ff)))
125 }
126 
127 #[inline(always)]
m128_mul_add(a: __m128, b: __m128, c: __m128) -> __m128128 pub(crate) unsafe fn m128_mul_add(a: __m128, b: __m128, c: __m128) -> __m128 {
129     // Only enable fused multiply-adds here if "fast-math" is enabled and the
130     // platform supports it. Otherwise this may break cross-platform determinism.
131     #[cfg(all(feature = "fast-math", target_feature = "fma"))]
132     {
133         _mm_fmadd_ps(a, b, c)
134     }
135 
136     #[cfg(any(not(feature = "fast-math"), not(target_feature = "fma")))]
137     {
138         _mm_add_ps(_mm_mul_ps(a, b), c)
139     }
140 }
141 
142 #[inline(always)]
m128_neg_mul_sub(a: __m128, b: __m128, c: __m128) -> __m128143 pub(crate) unsafe fn m128_neg_mul_sub(a: __m128, b: __m128, c: __m128) -> __m128 {
144     _mm_sub_ps(c, _mm_mul_ps(a, b))
145 }
146 
147 #[inline]
m128_round(v: __m128) -> __m128148 pub(crate) unsafe fn m128_round(v: __m128) -> __m128 {
149     // Based on https://github.com/microsoft/DirectXMath `XMVectorRound`
150     let sign = _mm_and_ps(v, PS_SIGN_MASK);
151     let s_magic = _mm_or_ps(PS_NO_FRACTION, sign);
152     let r1 = _mm_add_ps(v, s_magic);
153     let r1 = _mm_sub_ps(r1, s_magic);
154     let r2 = _mm_and_ps(v, PS_INV_SIGN_MASK);
155     let mask = _mm_cmple_ps(r2, PS_NO_FRACTION);
156     let r2 = _mm_andnot_ps(mask, v);
157     let r1 = _mm_and_ps(r1, mask);
158     _mm_xor_ps(r1, r2)
159 }
160 
161 #[inline]
m128_trunc(v: __m128) -> __m128162 pub(crate) unsafe fn m128_trunc(v: __m128) -> __m128 {
163     // Based on https://github.com/microsoft/DirectXMath `XMVectorTruncate`
164     // To handle NAN, INF and numbers greater than 8388608, use masking
165     // Get the abs value
166     let mut vtest = _mm_and_si128(_mm_castps_si128(v), _mm_castps_si128(PS_ABS_MASK));
167     // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
168     vtest = _mm_cmplt_epi32(vtest, _mm_castps_si128(PS_NO_FRACTION));
169     // Convert to int and back to float for rounding with truncation
170     let vint = _mm_cvttps_epi32(v);
171     // Convert back to floats
172     let mut vresult = _mm_cvtepi32_ps(vint);
173     // All numbers less than 8388608 will use the round to int
174     vresult = _mm_and_ps(vresult, _mm_castsi128_ps(vtest));
175     // All others, use the ORIGINAL value
176     vtest = _mm_andnot_si128(vtest, _mm_castps_si128(v));
177     _mm_or_ps(vresult, _mm_castsi128_ps(vtest))
178 }
179 
180 /// Returns a vector whose components are the corresponding components of Angles modulo 2PI.
181 #[inline]
m128_mod_angles(angles: __m128) -> __m128182 pub(crate) unsafe fn m128_mod_angles(angles: __m128) -> __m128 {
183     // Based on https://github.com/microsoft/DirectXMath `XMVectorModAngles`
184     let v = _mm_mul_ps(angles, PS_RECIPROCAL_TWO_PI);
185     let v = m128_round(v);
186     m128_neg_mul_sub(PS_TWO_PI, v, angles)
187 }
188 
189 /// Computes the sine of the angle in each lane of `v`. Values outside
190 /// the bounds of PI may produce an increasing error as the input angle
191 /// drifts from `[-PI, PI]`.
192 #[inline]
m128_sin(v: __m128) -> __m128193 pub(crate) unsafe fn m128_sin(v: __m128) -> __m128 {
194     // Based on https://github.com/microsoft/DirectXMath `XMVectorSin`
195 
196     // 11-degree minimax approximation
197 
198     // Force the value within the bounds of pi
199     let mut x = m128_mod_angles(v);
200 
201     // Map in [-pi/2,pi/2] with sin(y) = sin(x).
202     let sign = _mm_and_ps(x, PS_NEGATIVE_ZERO);
203     // pi when x >= 0, -pi when x < 0
204     let c = _mm_or_ps(PS_PI, sign);
205     // |x|
206     let absx = _mm_andnot_ps(sign, x);
207     let rflx = _mm_sub_ps(c, x);
208     let comp = _mm_cmple_ps(absx, PS_HALF_PI);
209     let select0 = _mm_and_ps(comp, x);
210     let select1 = _mm_andnot_ps(comp, rflx);
211     x = _mm_or_ps(select0, select1);
212 
213     let x2 = _mm_mul_ps(x, x);
214 
215     // Compute polynomial approximation
216     const SC1: __m128 = PS_SIN_COEFFICIENTS1;
217     let v_constants_b = _mm_shuffle_ps(SC1, SC1, 0b00_00_00_00);
218 
219     const SC0: __m128 = PS_SIN_COEFFICIENTS0;
220     let mut v_constants = _mm_shuffle_ps(SC0, SC0, 0b11_11_11_11);
221     let mut result = m128_mul_add(v_constants_b, x2, v_constants);
222 
223     v_constants = _mm_shuffle_ps(SC0, SC0, 0b10_10_10_10);
224     result = m128_mul_add(result, x2, v_constants);
225 
226     v_constants = _mm_shuffle_ps(SC0, SC0, 0b01_01_01_01);
227     result = m128_mul_add(result, x2, v_constants);
228 
229     v_constants = _mm_shuffle_ps(SC0, SC0, 0b00_00_00_00);
230     result = m128_mul_add(result, x2, v_constants);
231 
232     result = m128_mul_add(result, x2, PS_ONE);
233     result = _mm_mul_ps(result, x);
234 
235     result
236 }
237 
238 #[test]
test_sse2_m128_sin()239 fn test_sse2_m128_sin() {
240     use crate::Vec4;
241     use core::f32::consts::PI;
242 
243     fn test_sse2_m128_sin_angle(a: f32) {
244         let v = unsafe { m128_sin(_mm_set_ps1(a)) };
245         let v = Vec4(v);
246         let a_sin = a.sin();
247         // dbg!((a, a_sin, v));
248         assert!(v.abs_diff_eq(Vec4::splat(a_sin), 1e-6));
249     }
250 
251     let mut a = -PI;
252     let end = PI;
253     let step = PI / 8192.0;
254 
255     while a <= end {
256         test_sse2_m128_sin_angle(a);
257         a += step;
258     }
259 }
260