EIC Software
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HelixHough_kappaRange_sse.cpp
Go to the documentation of this file. Or view the newest version in sPHENIX GitHub for file HelixHough_kappaRange_sse.cpp
1 #include "vector_math_inline.h"
2 #include "HelixHough.h"
3 #include <cmath>
4 #include <iostream>
5 
6 using namespace std;
7 
8 
9 static const unsigned int allones[4] __attribute__((aligned(16))) = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
10 static const unsigned int allzeroes[4] __attribute__((aligned(16))) = {0,0,0,0};
11 static const __m128 one_o_2 = {0.5,0.5,0.5,0.5};
12 
13 
14 static inline void __attribute__((always_inline)) _vec_select_ux_uy(__m128 v_phi, __m128& ux, __m128& uy, __m128& tanphi, __m128& phisel)
15 {
16  //extract the sign of phi and set pos_phi = |phi|
17  __m128i vec_sgn = _mm_load_si128((__m128i*)sign_int);
18  __m128 phi_sign = _mm_and_ps((__m128)vec_sgn, v_phi);
19  __m128 pos_phi = _mm_xor_ps(phi_sign, v_phi);
20 
21  //|phi - pi|
22  __m128 v_phi_m_pi = _mm_sub_ps(v_phi, pi);
23  phi_sign = _mm_and_ps((__m128)vec_sgn, v_phi_m_pi);
24  __m128 pos_phi_m_pi = _mm_xor_ps(phi_sign, v_phi_m_pi);
25 
26  //|twopi - phi|
27  __m128 v_twopi_m_phi = _mm_sub_ps(twopi, v_phi);
28  phi_sign = _mm_and_ps((__m128)vec_sgn, v_twopi_m_phi);
29  __m128 pos_twopi_m_phi = _mm_xor_ps(phi_sign, v_twopi_m_phi);
30 
31  phisel = _mm_cmplt_ps(pos_phi, pi_over_four);
32  __m128 tmp2 = _mm_cmplt_ps(pos_phi_m_pi, pi_over_four);
33  phisel = _mm_or_ps(phisel, tmp2);
34  tmp2 = _mm_cmplt_ps(pos_twopi_m_phi, pi_over_four);
35  phisel = _mm_or_ps(phisel, tmp2);
36 
37  __m128 sinval = {0.,0.,0.,0.};
38  __m128 cosval = {0.,0.,0.,0.};
39  _vec_sin_cos_ps(v_phi, sinval, cosval);
40 
41  __m128 num = _mm_and_ps(phisel, sinval);
42  tmp2 = _mm_andnot_ps(phisel, cosval);
43  num = _mm_xor_ps(num, tmp2);
44  __m128 denom = _mm_and_ps(phisel, cosval);
45  tmp2 = _mm_andnot_ps(phisel, sinval);
46  denom = _mm_xor_ps(denom, tmp2);
47  tanphi = _mm_div_ps(num, denom);
48  __m128 v_phi_p_pi02 = _mm_add_ps(v_phi, pi_over_two);
49  __m128 phi_compare = _mm_and_ps(phisel, v_phi_p_pi02);
50  tmp2 = _mm_andnot_ps(phisel, v_phi);
51  phi_compare = _mm_xor_ps(phi_compare, tmp2);
52  __m128 tmp3 = _mm_mul_ps(tanphi, tanphi);
53  tmp3 = _mm_add_ps(one, tmp3);
54  tmp3 = _mm_div_ps(one, tmp3);
55  tmp3 = _mm_sqrt_ps(tmp3);
56  tmp2 = _mm_cmpgt_ps(phi_compare, pi);
57  __m128 tmp4 = _mm_cmplt_ps(phi_compare, twopi);
58  tmp2 = _mm_and_ps(tmp2, tmp4);
59  tmp2 = _mm_and_ps((__m128)vec_sgn, tmp2);
60  tmp3 = _mm_xor_ps(tmp3, tmp2);
61  tmp4 = _mm_mul_ps(tanphi, tmp3);
62  ux = _mm_and_ps(phisel, tmp3);
63  tmp2 = _mm_andnot_ps(phisel, tmp4);
64  ux = _mm_xor_ps(ux, tmp2);
65  uy = _mm_and_ps(phisel, tmp4);
66  tmp2 = _mm_andnot_ps(phisel, tmp3);
67  uy = _mm_xor_ps(uy, tmp2);
68 }
69 
70 
71 static inline void __attribute__((always_inline)) _vec_invRRangeOnePhi(__m128 x, __m128 y, __m128 v_phi, __m128 v_mind, __m128 v_maxd, __m128& v_mink, __m128& v_maxk, __m128& reflection_min, __m128& reflection_max, __m128& incorrect_quadrant, __m128& correct_quadrant)
72 {
73  __m128 dx = {0.,0.,0.,0.};
74  __m128 dy = {0.,0.,0.,0.};
75  __m128 ux = {0.,0.,0.,0.};
76  __m128 uy = {0.,0.,0.,0.};
77  __m128 tanphi = {0.,0.,0.,0.};
78  __m128 phisel = {0.,0.,0.,0.};
79  __m128 t = {0.,0.,0.,0.};
80  __m128 tP = {0.,0.,0.,0.};
81  __m128 mx = {0.,0.,0.,0.};
82  __m128 my = {0.,0.,0.,0.};
83  __m128 Ix = {0.,0.,0.,0.};
84  __m128 Iy = {0.,0.,0.,0.};
85  __m128 r1 = {0.,0.,0.,0.};
86  __m128 r2 = {0.,0.,0.,0.};
87 
88  __m128i vec_ones = _mm_load_si128((__m128i*)allones);
89  __m128i vec_sgn = _mm_load_si128((__m128i*)sign_int);
90 
91  _vec_select_ux_uy(v_phi, ux, uy, tanphi, phisel);
92 
93  tP = _mm_mul_ps(ux, x);
94  __m128 tmp1 = _mm_mul_ps(uy, y);
95  tP = _mm_add_ps(tP, tmp1);
96  dx = _mm_mul_ps(ux, v_mind);
97  dy = _mm_mul_ps(uy, v_mind);
98 
99  tmp1 = _mm_cmplt_ps(v_mind, tP);
100  reflection_min = _mm_or_ps(reflection_min, tmp1);
101  tmp1 = _mm_xor_ps(tmp1, (__m128)vec_ones);
102  reflection_max = _mm_or_ps(reflection_max, tmp1);
103  mx = _mm_add_ps(dx, x);
104  mx = _mm_mul_ps(mx, one_o_2);
105  my = _mm_add_ps(dy, y);
106  my = _mm_mul_ps(my, one_o_2);
107 
108  __m128 tmp2 = _mm_mul_ps(mx, tanphi);
109  tmp2 = _mm_sub_ps(my, tmp2);
110  tmp1 = _mm_sub_ps(y, dy);
111  __m128 tmp3 = _mm_mul_ps(tanphi, tmp1);
112  tmp1 = _mm_sub_ps(x, dx);
113  tmp1 = _mm_add_ps(tmp1, tmp3);
114  tmp1 = _mm_div_ps(tmp2, tmp1);
115  tmp2 = _mm_mul_ps(tanphi, my);
116  tmp2 = _mm_sub_ps(tmp2, mx);
117  tmp3 = _mm_sub_ps(x, dx);
118  tmp3 = _mm_mul_ps(tmp3, tanphi);
119  __m128 tmp4 = _mm_sub_ps(y, dy);
120  tmp3 = _mm_add_ps(tmp3, tmp4);
121  tmp2 = _mm_div_ps(tmp2, tmp3);
122  t = _mm_and_ps(phisel, tmp1);
123  tmp3 = _mm_andnot_ps(phisel, tmp2);
124  t = _mm_xor_ps(t, tmp3);
125 
126  tmp1 = _mm_sub_ps(y,dy);
127  tmp1 = _mm_mul_ps(tmp1, t);
128  Ix = _mm_add_ps(mx, tmp1);
129  tmp2 = _mm_sub_ps(dx,x);
130  tmp2 = _mm_mul_ps(tmp2, t);
131  Iy = _mm_add_ps(my, tmp2);
132 
133  //extract sign of Ix
134  __m128 Ix_sign = _mm_and_ps((__m128)vec_sgn, Ix);
135  //sign of ux
136  __m128 ux_sign = _mm_and_ps((__m128)vec_sgn, ux);
137  //compare signs of Ix and ux
138  tmp1 = _mm_cmpeq_ps(Ix_sign, ux_sign);
139  //now for Iy,uy
140  __m128 Iy_sign = _mm_and_ps((__m128)vec_sgn, Iy);
141  __m128 uy_sign = _mm_and_ps((__m128)vec_sgn, uy);
142  tmp2 = _mm_cmpeq_ps(Iy_sign, uy_sign);
143  tmp3 = _mm_and_ps(tmp1, tmp2);
144  correct_quadrant = _mm_or_ps(correct_quadrant, tmp3);
145  tmp3 = _mm_xor_ps(tmp3, (__m128)vec_ones);
146  incorrect_quadrant = _mm_or_ps(incorrect_quadrant, tmp3);
147 
148  tmp1 = _mm_sub_ps(Ix, dx);
149  tmp1 = _mm_mul_ps(tmp1, tmp1);
150  tmp2 = _mm_sub_ps(Iy, dy);
151  tmp2 = _mm_mul_ps(tmp2, tmp2);
152  r1 = _mm_add_ps(tmp1, tmp2);
153  r1 = _mm_sqrt_ps(r1);
154 
155  //now repeat for maxd
156  dx = _mm_mul_ps(ux, v_maxd);
157  dy = _mm_mul_ps(uy, v_maxd);
158 
159  tmp1 = _mm_cmplt_ps(v_maxd, tP);
160  reflection_min = _mm_or_ps(reflection_min, tmp1);
161  tmp1 = _mm_xor_ps(tmp1, (__m128)vec_ones);
162  reflection_max = _mm_or_ps(reflection_max, tmp1);
163  mx = _mm_add_ps(dx, x);
164  mx = _mm_mul_ps(mx, one_o_2);
165  my = _mm_add_ps(dy, y);
166  my = _mm_mul_ps(my, one_o_2);
167 
168  tmp2 = _mm_mul_ps(mx, tanphi);
169  tmp2 = _mm_sub_ps(my, tmp2);
170  tmp1 = _mm_sub_ps(y, dy);
171  tmp3 = _mm_mul_ps(tanphi, tmp1);
172  tmp1 = _mm_sub_ps(x, dx);
173  tmp1 = _mm_add_ps(tmp1, tmp3);
174  tmp1 = _mm_div_ps(tmp2, tmp1);
175  tmp2 = _mm_mul_ps(tanphi, my);
176  tmp2 = _mm_sub_ps(tmp2, mx);
177  tmp3 = _mm_sub_ps(x, dx);
178  tmp3 = _mm_mul_ps(tmp3, tanphi);
179  tmp4 = _mm_sub_ps(y, dy);
180  tmp3 = _mm_add_ps(tmp3, tmp4);
181  tmp2 = _mm_div_ps(tmp2, tmp3);
182  t = _mm_and_ps(phisel, tmp1);
183  tmp3 = _mm_andnot_ps(phisel, tmp2);
184  t = _mm_xor_ps(t, tmp3);
185 
186  tmp1 = _mm_sub_ps(y,dy);
187  tmp1 = _mm_mul_ps(tmp1, t);
188  Ix = _mm_add_ps(mx, tmp1);
189  tmp2 = _mm_sub_ps(dx,x);
190  tmp2 = _mm_mul_ps(tmp2, t);
191  Iy = _mm_add_ps(my, tmp2);
192 
193  //extract sign of Ix
194  Ix_sign = _mm_and_ps((__m128)vec_sgn, Ix);
195  //sign of ux
196  ux_sign = _mm_and_ps((__m128)vec_sgn, ux);
197  //compare signs of Ix and ux
198  tmp1 = _mm_cmpeq_ps(Ix_sign, ux_sign);
199  //now for Iy,uy
200  Iy_sign = _mm_and_ps((__m128)vec_sgn, Iy);
201  uy_sign = _mm_and_ps((__m128)vec_sgn, uy);
202  tmp2 = _mm_cmpeq_ps(Iy_sign, uy_sign);
203  tmp3 = _mm_and_ps(tmp1, tmp2);
204  correct_quadrant = _mm_or_ps(correct_quadrant, tmp3);
205  tmp3 = _mm_xor_ps(tmp3, (__m128)vec_ones);
206  incorrect_quadrant = _mm_or_ps(incorrect_quadrant, tmp3);
207 
208  tmp1 = _mm_sub_ps(Ix, dx);
209  tmp1 = _mm_mul_ps(tmp1, tmp1);
210  tmp2 = _mm_sub_ps(Iy, dy);
211  tmp2 = _mm_mul_ps(tmp2, tmp2);
212  r2 = _mm_add_ps(tmp1, tmp2);
213  r2 = _mm_sqrt_ps(r2);
214 
215  __m128 invr1 = _mm_div_ps(one, r1);
216  __m128 invr2 = _mm_div_ps(one, r2);
217  tmp1 = _mm_cmplt_ps(r1, r2);
218  v_mink = _mm_and_ps(tmp1, invr2);
219  tmp2 = _mm_andnot_ps(tmp1, invr1);
220  v_mink = _mm_xor_ps(v_mink, tmp2);
221  v_maxk = _mm_and_ps(tmp1, invr1);
222  tmp2 = _mm_andnot_ps(tmp1, invr2);
223  v_maxk = _mm_xor_ps(v_maxk, tmp2);
224 }
225 
226 
227 void HelixHough::kappaRange_sse(const SimpleHit3D& hit, float* minphi, float* maxphi, float* mind, float* maxd, float* min_invR, float* max_invR)
228 {
229  __m128 v_minphi = _mm_load_ps(minphi);
230  __m128 v_maxphi = _mm_load_ps(maxphi);
231  __m128 v_mind = _mm_load_ps(mind);
232  __m128 v_maxd = _mm_load_ps(maxd);
233  __m128 v_mink = _mm_load_ps(min_invR);
234  __m128 v_maxk = _mm_load_ps(max_invR);
235 
236  float x_arr[4] __attribute__((aligned(16))) = {hit.x,hit.x,hit.x,hit.x};
237  float y_arr[4] __attribute__((aligned(16))) = {hit.y,hit.y,hit.y,hit.y};
238 
239  // __m128 x = _mm_load1_ps(&(hit.x));
240  // __m128 y = _mm_load1_ps(&(hit.y));
241  __m128 x = _mm_load_ps(x_arr);
242  __m128 y = _mm_load_ps(y_arr);
243 
244  __m128i vec_zeroes = _mm_load_si128((__m128i*)allzeroes);
245 
246  __m128 reflection_min = (__m128)(vec_zeroes);
247  __m128 reflection_max = (__m128)(vec_zeroes);
248  __m128 correct_quadrant = (__m128)(vec_zeroes);
249  __m128 incorrect_quadrant = (__m128)(vec_zeroes);
250 
251  __m128 v_mink1 = {0.,0.,0.,0.};
252  __m128 v_maxk1 = {0.,0.,0.,0.};
253  _vec_invRRangeOnePhi(x, y, v_minphi, v_mind, v_maxd, v_mink1, v_maxk1, reflection_min, reflection_max, incorrect_quadrant, correct_quadrant);
254 
255  __m128 v_mink2 = {0.,0.,0.,0.};
256  __m128 v_maxk2 = {0.,0.,0.,0.};
257  _vec_invRRangeOnePhi(x, y, v_maxphi, v_mind, v_maxd, v_mink2, v_maxk2, reflection_min, reflection_max, incorrect_quadrant, correct_quadrant);
258 
259  __m128 tmp1 = _mm_cmplt_ps(v_mink1, v_mink2);
260  v_mink = _mm_and_ps(tmp1, v_mink1);
261  __m128 tmp2 = _mm_andnot_ps(tmp1, v_mink2);
262  v_mink = _mm_xor_ps(v_mink, tmp2);
263 
264  tmp1 = _mm_cmpgt_ps(v_maxk1, v_maxk2);
265  v_maxk = _mm_and_ps(tmp1, v_maxk1);
266  tmp2 = _mm_andnot_ps(tmp1, v_maxk2);
267  v_maxk = _mm_xor_ps(v_maxk, tmp2);
268 
269  tmp1 = _mm_and_ps(reflection_min, reflection_max);
270  v_mink = _mm_andnot_ps(tmp1, v_mink);
271  tmp2 = _mm_and_ps(tmp1, zero);
272  v_mink = _mm_xor_ps(v_mink, tmp2);
273 
274  tmp1 = _mm_and_ps(correct_quadrant, incorrect_quadrant);
275  v_mink = _mm_andnot_ps(tmp1, v_mink);
276  tmp2 = _mm_and_ps(tmp1, zero);
277  v_mink = _mm_xor_ps(v_mink, tmp2);
278 
279  _mm_store_ps(min_invR, v_mink);
280  _mm_store_ps(max_invR, v_maxk);
281 }
282 
283