EIC Software
HelixHough_phiRange_sse.cpp
Go to the documentation of this file, or view the newest version in the sPHENIX GitHub repository.
#include "HelixHough.h"
#include "vector_math_inline.h"

#include <emmintrin.h>
#include <xmmintrin.h>

using namespace std;


static const __m128 two = {2., 2., 2., 2.};
static const __m128 three_pi_over_two = {3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f};
static const __m128 SIGNMASK = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
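
// Note: the __m128 constants zero, one, one_o_2, twopi and pi_over_two used
// below are assumed to be provided by vector_math_inline.h, along with
// _vec_atan2_ps, _vec_sqrt_ps and _vec_rec_ps.
//
// phiRange_sse: for four hits at a time (one per SSE lane), evaluate the
// azimuth of the candidate track-circle center at each corner of the
// (d, k) = (DCA, curvature) search box and store the per-lane min/max over
// the corners. The two output pairs (min_phi_1/max_phi_1 and
// min_phi_2/max_phi_2) correspond to the two mirror-image center solutions,
// i.e. the two helicities.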
void HelixHough::phiRange_sse(float* hit_x, float* hit_y, float* min_d, float* max_d, float* min_k, float* max_k, float* min_phi_1, float* max_phi_1, float* min_phi_2, float* max_phi_2)
{
  __m128 x = _mm_load_ps(hit_x);
  __m128 y = _mm_load_ps(hit_y);

  __m128 hit_phi = _vec_atan2_ps(y,x);
  //if phi < 0, phi += 2*pi
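  // Branchless select idiom used throughout this file:
  //   tmp = mask & a;  r = andnot(mask, r);  r = tmp ^ r;
  // which amounts to r = mask ? a : r, where mask is a per-lane
  // all-ones/all-zeros comparison result.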
  __m128 tmp1 = _mm_cmplt_ps(hit_phi, zero);
  __m128 tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  hit_phi = _mm_add_ps(hit_phi, tmp1);

  // min_d, min_k
  __m128 d = _mm_load_ps(min_d);
  __m128 k = _mm_load_ps(min_k);
  __m128 D = _mm_mul_ps(x,x);
  tmp1 = _mm_mul_ps(y,y);
  D = _mm_add_ps(D,tmp1);
  D = _vec_sqrt_ps(D);
  __m128 D_inv = _vec_rec_ps(D);
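  // Geometry: a circle of curvature k at signed DCA d from the origin has its
  // center at distance (1/k + d); requiring it to also pass through the hit at
  // radius D fixes the center's components along and transverse to the hit
  // direction. The ak and hk computed below work out to those two components
  // scaled by k, so the atan2 further down gives the azimuth of the candidate
  // center.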
  __m128 ak = _mm_mul_ps(d, two);
  tmp1 = _mm_mul_ps(d,d);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  tmp1 = _mm_mul_ps(D,D);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  ak = _mm_mul_ps(ak, D_inv);
  ak = _mm_mul_ps(ak, one_o_2);
  __m128 hk = _mm_mul_ps(d,k);
  hk = _mm_add_ps(hk, one);
  hk = _mm_mul_ps(hk,hk);
  tmp1 = _mm_mul_ps(ak,ak);
  hk = _mm_sub_ps(hk, tmp1);
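  // If (1+dk)^2 - ak^2 <= 0 there is no real intersection: no circle with
  // this (d, k) can reach a hit at radius D. Flag those lanes in 'neg' and
  // fall back to the hit azimuth below.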
  __m128 neg = _mm_cmple_ps(hk, zero);
  hk = _vec_sqrt_ps(hk);

  __m128 xk1 = _mm_mul_ps(ak, x);
  tmp1 = _mm_mul_ps(hk,y);
  __m128 xk2 = _mm_sub_ps(xk1, tmp1);
  xk1 = _mm_add_ps(xk1, tmp1);
  xk1 = _mm_mul_ps(xk1, D_inv);
  xk2 = _mm_mul_ps(xk2, D_inv);

  __m128 yk1 = _mm_mul_ps(ak, y);
  tmp1 = _mm_mul_ps(hk,x);
  __m128 yk2 = _mm_add_ps(yk1, tmp1);
  yk1 = _mm_sub_ps(yk1, tmp1);
  yk1 = _mm_mul_ps(yk1, D_inv);
  yk2 = _mm_mul_ps(yk2, D_inv);
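  // (xk1,yk1) and (xk2,yk2) are proportional to the two mirror-image center
  // candidates, one per helicity; phi_r_* and phi_l_* are their azimuths.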

  __m128 phi_r_1 = _vec_atan2_ps(yk1, xk1);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_r_1, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_r_1 = _mm_add_ps(phi_r_1, tmp1);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi);
  phi_r_1 = _mm_andnot_ps(neg, phi_r_1);
  phi_r_1 = _mm_xor_ps(tmp1, phi_r_1);

  __m128 phi_l_1 = _vec_atan2_ps(yk2, xk2);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_l_1, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_l_1 = _mm_add_ps(phi_l_1, tmp1);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi);
  phi_l_1 = _mm_andnot_ps(neg, phi_l_1);
  phi_l_1 = _mm_xor_ps(tmp1, phi_l_1);


  // min_d, max_k
  d = _mm_load_ps(min_d);
  k = _mm_load_ps(max_k);
  D = _mm_mul_ps(x,x);
  tmp1 = _mm_mul_ps(y,y);
  D = _mm_add_ps(D,tmp1);
  D = _vec_sqrt_ps(D);
  D_inv = _vec_rec_ps(D);
  ak = _mm_mul_ps(d, two);
  tmp1 = _mm_mul_ps(d,d);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  tmp1 = _mm_mul_ps(D,D);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  ak = _mm_mul_ps(ak, D_inv);
  ak = _mm_mul_ps(ak, one_o_2);
  hk = _mm_mul_ps(d,k);
  hk = _mm_add_ps(hk, one);
  hk = _mm_mul_ps(hk,hk);
  tmp1 = _mm_mul_ps(ak,ak);
  hk = _mm_sub_ps(hk, tmp1);
  neg = _mm_cmple_ps(hk, zero);
  hk = _vec_sqrt_ps(hk);

  xk1 = _mm_mul_ps(ak, x);
  tmp1 = _mm_mul_ps(hk,y);
  xk2 = _mm_sub_ps(xk1, tmp1);
  xk1 = _mm_add_ps(xk1, tmp1);
  xk1 = _mm_mul_ps(xk1, D_inv);
  xk2 = _mm_mul_ps(xk2, D_inv);

  yk1 = _mm_mul_ps(ak, y);
  tmp1 = _mm_mul_ps(hk,x);
  yk2 = _mm_add_ps(yk1, tmp1);
  yk1 = _mm_sub_ps(yk1, tmp1);
  yk1 = _mm_mul_ps(yk1, D_inv);
  yk2 = _mm_mul_ps(yk2, D_inv);

  __m128 phi_r_2 = _vec_atan2_ps(yk1, xk1);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_r_2, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_r_2 = _mm_add_ps(phi_r_2, tmp1);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi);
  phi_r_2 = _mm_andnot_ps(neg, phi_r_2);
  phi_r_2 = _mm_xor_ps(tmp1, phi_r_2);

  __m128 phi_l_2 = _vec_atan2_ps(yk2, xk2);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_l_2, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_l_2 = _mm_add_ps(phi_l_2, tmp1);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi);
  phi_l_2 = _mm_andnot_ps(neg, phi_l_2);
  phi_l_2 = _mm_xor_ps(tmp1, phi_l_2);

  // max_d, min_k
  d = _mm_load_ps(max_d);
  k = _mm_load_ps(min_k);
  D = _mm_mul_ps(x,x);
  tmp1 = _mm_mul_ps(y,y);
  D = _mm_add_ps(D,tmp1);
  D = _vec_sqrt_ps(D);
  D_inv = _vec_rec_ps(D);
  ak = _mm_mul_ps(d, two);
  tmp1 = _mm_mul_ps(d,d);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  tmp1 = _mm_mul_ps(D,D);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  ak = _mm_mul_ps(ak, D_inv);
  ak = _mm_mul_ps(ak, one_o_2);
  hk = _mm_mul_ps(d,k);
  hk = _mm_add_ps(hk, one);
  hk = _mm_mul_ps(hk,hk);
  tmp1 = _mm_mul_ps(ak,ak);
  hk = _mm_sub_ps(hk, tmp1);
  neg = _mm_cmple_ps(hk, zero);
  hk = _vec_sqrt_ps(hk);

  xk1 = _mm_mul_ps(ak, x);
  tmp1 = _mm_mul_ps(hk,y);
  xk2 = _mm_sub_ps(xk1, tmp1);
  xk1 = _mm_add_ps(xk1, tmp1);
  xk1 = _mm_mul_ps(xk1, D_inv);
  xk2 = _mm_mul_ps(xk2, D_inv);

  yk1 = _mm_mul_ps(ak, y);
  tmp1 = _mm_mul_ps(hk,x);
  yk2 = _mm_add_ps(yk1, tmp1);
  yk1 = _mm_sub_ps(yk1, tmp1);
  yk1 = _mm_mul_ps(yk1, D_inv);
  yk2 = _mm_mul_ps(yk2, D_inv);

  __m128 phi_r_3 = _vec_atan2_ps(yk1, xk1);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_r_3, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_r_3 = _mm_add_ps(phi_r_3, tmp1);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi);
  phi_r_3 = _mm_andnot_ps(neg, phi_r_3);
  phi_r_3 = _mm_xor_ps(tmp1, phi_r_3);

  __m128 phi_l_3 = _vec_atan2_ps(yk2, xk2);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_l_3, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_l_3 = _mm_add_ps(phi_l_3, tmp1);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi);
  phi_l_3 = _mm_andnot_ps(neg, phi_l_3);
  phi_l_3 = _mm_xor_ps(tmp1, phi_l_3);

  // max_d, max_k
  d = _mm_load_ps(max_d);
  k = _mm_load_ps(max_k);
  D = _mm_mul_ps(x,x);
  tmp1 = _mm_mul_ps(y,y);
  D = _mm_add_ps(D,tmp1);
  D = _vec_sqrt_ps(D);
  D_inv = _vec_rec_ps(D);
  ak = _mm_mul_ps(d, two);
  tmp1 = _mm_mul_ps(d,d);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  tmp1 = _mm_mul_ps(D,D);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  ak = _mm_mul_ps(ak, D_inv);
  ak = _mm_mul_ps(ak, one_o_2);
  hk = _mm_mul_ps(d,k);
  hk = _mm_add_ps(hk, one);
  hk = _mm_mul_ps(hk,hk);
  tmp1 = _mm_mul_ps(ak,ak);
  hk = _mm_sub_ps(hk, tmp1);
  neg = _mm_cmple_ps(hk, zero);
  hk = _vec_sqrt_ps(hk);

  xk1 = _mm_mul_ps(ak, x);
  tmp1 = _mm_mul_ps(hk,y);
  xk2 = _mm_sub_ps(xk1, tmp1);
  xk1 = _mm_add_ps(xk1, tmp1);
  xk1 = _mm_mul_ps(xk1, D_inv);
  xk2 = _mm_mul_ps(xk2, D_inv);

  yk1 = _mm_mul_ps(ak, y);
  tmp1 = _mm_mul_ps(hk,x);
  yk2 = _mm_add_ps(yk1, tmp1);
  yk1 = _mm_sub_ps(yk1, tmp1);
  yk1 = _mm_mul_ps(yk1, D_inv);
  yk2 = _mm_mul_ps(yk2, D_inv);

  __m128 phi_r_4 = _vec_atan2_ps(yk1, xk1);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_r_4, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_r_4 = _mm_add_ps(phi_r_4, tmp1);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi);
  phi_r_4 = _mm_andnot_ps(neg, phi_r_4);
  phi_r_4 = _mm_xor_ps(tmp1, phi_r_4);

  __m128 phi_l_4 = _vec_atan2_ps(yk2, xk2);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_l_4, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_l_4 = _mm_add_ps(phi_l_4, tmp1);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi);
  phi_l_4 = _mm_andnot_ps(neg, phi_l_4);
  phi_l_4 = _mm_xor_ps(tmp1, phi_l_4);

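  // Seam handling: if the corner phis straddle the 0/2pi wrap (at least one
  // below pi/2 and at least one above 3*pi/2), shift the large ones down by
  // 2*pi so that a plain min/max over the corners yields a contiguous range.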
  // check if phi_r overlaps the 0,2pi jump
  tmp1 = _mm_cmplt_ps(phi_r_1, pi_over_two);
  tmp2 = _mm_cmplt_ps(phi_r_2, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2);
  tmp2 = _mm_cmplt_ps(phi_r_3, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2);
  tmp2 = _mm_cmplt_ps(phi_r_4, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2);

  tmp2 = _mm_cmpgt_ps(phi_r_1, three_pi_over_two);
  __m128 tmp3 = _mm_cmpgt_ps(phi_r_2, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3);
  tmp3 = _mm_cmpgt_ps(phi_r_3, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3);
  tmp3 = _mm_cmpgt_ps(phi_r_4, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3);

  tmp1 = _mm_and_ps(tmp1, tmp2);

  // tmp1 is now all ones if phi_r overlaps the jump, all zeros otherwise
  // if tmp1 is true, then subtract 2*pi from all of the phi_r values > 3*pi/2
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp3 = _mm_andnot_ps(tmp1, zero);
  tmp2 = _mm_xor_ps(tmp2, tmp3);

  __m128 tmp4 = _mm_cmpgt_ps(phi_r_1, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  __m128 tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_r_1 = _mm_sub_ps(phi_r_1, tmp3);

  tmp4 = _mm_cmpgt_ps(phi_r_2, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_r_2 = _mm_sub_ps(phi_r_2, tmp3);

  tmp4 = _mm_cmpgt_ps(phi_r_3, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_r_3 = _mm_sub_ps(phi_r_3, tmp3);

  tmp4 = _mm_cmpgt_ps(phi_r_4, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_r_4 = _mm_sub_ps(phi_r_4, tmp3);


  // find the minimum phi_r
  __m128 phi_r_min = phi_r_1;
  tmp2 = _mm_cmplt_ps(phi_r_2, phi_r_min);
  tmp3 = _mm_and_ps(tmp2, phi_r_2);
  phi_r_min = _mm_andnot_ps(tmp2, phi_r_min);
  phi_r_min = _mm_xor_ps(phi_r_min, tmp3);
  tmp2 = _mm_cmplt_ps(phi_r_3, phi_r_min);
  tmp3 = _mm_and_ps(tmp2, phi_r_3);
  phi_r_min = _mm_andnot_ps(tmp2, phi_r_min);
  phi_r_min = _mm_xor_ps(phi_r_min, tmp3);
  tmp2 = _mm_cmplt_ps(phi_r_4, phi_r_min);
  tmp3 = _mm_and_ps(tmp2, phi_r_4);
  phi_r_min = _mm_andnot_ps(tmp2, phi_r_min);
  phi_r_min = _mm_xor_ps(phi_r_min, tmp3);

  // find the maximum phi_r
  __m128 phi_r_max = phi_r_1;
  tmp2 = _mm_cmpgt_ps(phi_r_2, phi_r_max);
  tmp3 = _mm_and_ps(tmp2, phi_r_2);
  phi_r_max = _mm_andnot_ps(tmp2, phi_r_max);
  phi_r_max = _mm_xor_ps(phi_r_max, tmp3);
  tmp2 = _mm_cmpgt_ps(phi_r_3, phi_r_max);
  tmp3 = _mm_and_ps(tmp2, phi_r_3);
  phi_r_max = _mm_andnot_ps(tmp2, phi_r_max);
  phi_r_max = _mm_xor_ps(phi_r_max, tmp3);
  tmp2 = _mm_cmpgt_ps(phi_r_4, phi_r_max);
  tmp3 = _mm_and_ps(tmp2, phi_r_4);
  phi_r_max = _mm_andnot_ps(tmp2, phi_r_max);
  phi_r_max = _mm_xor_ps(phi_r_max, tmp3);

  _mm_store_ps(min_phi_1, phi_r_min);
  _mm_store_ps(max_phi_1, phi_r_max);


  // check if phi_l overlaps the 0,2pi jump
  tmp1 = _mm_cmplt_ps(phi_l_1, pi_over_two);
  tmp2 = _mm_cmplt_ps(phi_l_2, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2);
  tmp2 = _mm_cmplt_ps(phi_l_3, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2);
  tmp2 = _mm_cmplt_ps(phi_l_4, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2);

  tmp2 = _mm_cmpgt_ps(phi_l_1, three_pi_over_two);
  tmp3 = _mm_cmpgt_ps(phi_l_2, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3);
  tmp3 = _mm_cmpgt_ps(phi_l_3, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3);
  tmp3 = _mm_cmpgt_ps(phi_l_4, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3);

  tmp1 = _mm_and_ps(tmp1, tmp2);

  // tmp1 is now all ones if phi_l overlaps the jump, all zeros otherwise
  // if tmp1 is true, then subtract 2*pi from all of the phi_l values > 3*pi/2
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp3 = _mm_andnot_ps(tmp1, zero);
  tmp2 = _mm_xor_ps(tmp2, tmp3);

  tmp4 = _mm_cmpgt_ps(phi_l_1, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_l_1 = _mm_sub_ps(phi_l_1, tmp3);

  tmp4 = _mm_cmpgt_ps(phi_l_2, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_l_2 = _mm_sub_ps(phi_l_2, tmp3);

  tmp4 = _mm_cmpgt_ps(phi_l_3, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_l_3 = _mm_sub_ps(phi_l_3, tmp3);

  tmp4 = _mm_cmpgt_ps(phi_l_4, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_l_4 = _mm_sub_ps(phi_l_4, tmp3);


  // find the minimum phi_l
  __m128 phi_l_min = phi_l_1;
  tmp2 = _mm_cmplt_ps(phi_l_2, phi_l_min);
  tmp3 = _mm_and_ps(tmp2, phi_l_2);
  phi_l_min = _mm_andnot_ps(tmp2, phi_l_min);
  phi_l_min = _mm_xor_ps(phi_l_min, tmp3);
  tmp2 = _mm_cmplt_ps(phi_l_3, phi_l_min);
  tmp3 = _mm_and_ps(tmp2, phi_l_3);
  phi_l_min = _mm_andnot_ps(tmp2, phi_l_min);
  phi_l_min = _mm_xor_ps(phi_l_min, tmp3);
  tmp2 = _mm_cmplt_ps(phi_l_4, phi_l_min);
  tmp3 = _mm_and_ps(tmp2, phi_l_4);
  phi_l_min = _mm_andnot_ps(tmp2, phi_l_min);
  phi_l_min = _mm_xor_ps(phi_l_min, tmp3);

  // find the maximum phi_l
  __m128 phi_l_max = phi_l_1;
  tmp2 = _mm_cmpgt_ps(phi_l_2, phi_l_max);
  tmp3 = _mm_and_ps(tmp2, phi_l_2);
  phi_l_max = _mm_andnot_ps(tmp2, phi_l_max);
  phi_l_max = _mm_xor_ps(phi_l_max, tmp3);
  tmp2 = _mm_cmpgt_ps(phi_l_3, phi_l_max);
  tmp3 = _mm_and_ps(tmp2, phi_l_3);
  phi_l_max = _mm_andnot_ps(tmp2, phi_l_max);
  phi_l_max = _mm_xor_ps(phi_l_max, tmp3);
  tmp2 = _mm_cmpgt_ps(phi_l_4, phi_l_max);
  tmp3 = _mm_and_ps(tmp2, phi_l_4);
  phi_l_max = _mm_andnot_ps(tmp2, phi_l_max);
  phi_l_max = _mm_xor_ps(phi_l_max, tmp3);

  _mm_store_ps(min_phi_2, phi_l_min);
  _mm_store_ps(max_phi_2, phi_l_max);

}
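
// For reference, each SSE lane above computes, per (d, k) corner, the
// equivalent of the following scalar sketch (illustrative only: the name
// corner_phi and the bool flag are ours, not part of this file; uses
// std::sqrt and std::atan2 from <cmath>):
//
//   float corner_phi(float x, float y, float d, float k, bool left)
//   {
//     float D  = std::sqrt(x*x + y*y);
//     float ak = 0.5f*(2.f*d + d*d*k + D*D*k)/D;
//     float s  = (1.f + d*k)*(1.f + d*k) - ak*ak;
//     float phi;
//     if (s <= 0.f) {                 // no valid center: use the hit azimuth
//       phi = std::atan2(y, x);
//     } else {
//       float hk = std::sqrt(s);
//       float xc = left ? (ak*x - hk*y)/D : (ak*x + hk*y)/D;
//       float yc = left ? (ak*y + hk*x)/D : (ak*y - hk*x)/D;
//       phi = std::atan2(yc, xc);
//     }
//     return (phi < 0.f) ? phi + 2.f*3.14159265f : phi;  // wrap to [0, 2*pi)
//   }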
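// compare_sign: per lane, returns all ones if a and b have the same sign bit
// and all zeros otherwise (xor the sign bits, broadcast the result with an
// arithmetic right shift, then invert).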
static inline __m128 compare_sign(__m128 a, __m128 b)
{
  const __m128i MASK = _mm_set1_epi32(0xffffffff);

  __m128 f = _mm_xor_ps(a,b);
  __m128i i = _mm_castps_si128(f);

  i = _mm_srai_epi32(i,31);
  i = _mm_xor_si128(i,MASK);

  f = _mm_castsi128_ps(i);

  return f;
}

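// This overload fixes the helicity: of the two center candidates it keeps the
// one whose cross product x*yk - y*xk matches the sign of hel, so each of the
// four (d, k) corners contributes a single phi. The (min_d, max_k) and
// (max_d, max_k) corner phis are also exported through phi_3_out and
// phi_4_out, presumably so that a later call on the adjacent k bin can reuse
// them as its min_k corners.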
void HelixHough::phiRange_sse(float* hit_x, float* hit_y, float* min_d, float* max_d, float* min_k, float* max_k, float* min_phi, float* max_phi, float hel, __m128& phi_3_out, __m128& phi_4_out)
{
  __m128 helicity_vec = _mm_load1_ps(&(hel));

  __m128 x = _mm_load_ps(hit_x);
  __m128 y = _mm_load_ps(hit_y);

  __m128 d_min = _mm_load_ps(min_d);
  __m128 d_max = _mm_load_ps(max_d);
  __m128 k_min = _mm_load_ps(min_k);
  __m128 k_max = _mm_load_ps(max_k);

  __m128 hit_phi = _vec_atan2_ps(y,x);
  //if phi < 0, phi += 2*pi
  __m128 tmp1 = _mm_cmplt_ps(hit_phi, zero);
  __m128 tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  hit_phi = _mm_add_ps(hit_phi, tmp1);

  // min_d, max_k
  __m128 d = d_min;
  __m128 k = k_max;
  __m128 D = _mm_mul_ps(x,x);
  tmp1 = _mm_mul_ps(y,y);
  D = _mm_add_ps(D,tmp1);
  D = _vec_sqrt_ps(D);
  __m128 D_inv = _vec_rec_ps(D);
  __m128 ak = _mm_mul_ps(d, two);
  tmp1 = _mm_mul_ps(d,d);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  tmp1 = _mm_mul_ps(D,D);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  ak = _mm_mul_ps(ak, D_inv);
  ak = _mm_mul_ps(ak, one_o_2);
  __m128 hk = _mm_mul_ps(d,k);
  hk = _mm_add_ps(hk, one);
  hk = _mm_mul_ps(hk,hk);
  tmp1 = _mm_mul_ps(ak,ak);
  hk = _mm_sub_ps(hk, tmp1);
  __m128 neg = _mm_cmple_ps(hk, zero);
  hk = _vec_sqrt_ps(hk);

  __m128 xk1 = _mm_mul_ps(ak, x);
  tmp1 = _mm_mul_ps(hk,y);
  __m128 xk2 = _mm_sub_ps(xk1, tmp1);
  xk1 = _mm_add_ps(xk1, tmp1);
  xk1 = _mm_mul_ps(xk1, D_inv);
  xk2 = _mm_mul_ps(xk2, D_inv);

  __m128 yk1 = _mm_mul_ps(ak, y);
  tmp1 = _mm_mul_ps(hk,x);
  __m128 yk2 = _mm_add_ps(yk1, tmp1);
  yk1 = _mm_sub_ps(yk1, tmp1);
  yk1 = _mm_mul_ps(yk1, D_inv);
  yk2 = _mm_mul_ps(yk2, D_inv);

  __m128 crossproduct = _mm_mul_ps(x, yk1);
  __m128 crosstemp = _mm_mul_ps(y, xk1);
  crossproduct = _mm_sub_ps(crossproduct, crosstemp);
  __m128 correct_helicity = compare_sign(crossproduct, helicity_vec);
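  // The sign of the z component of (hit) x (center) distinguishes the two
  // helicities; keep whichever candidate matches the requested one.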
537 
538  __m128 xk = _mm_and_ps(correct_helicity, xk1);
539  tmp1 = _mm_andnot_ps(correct_helicity, xk2);
540  xk = _mm_xor_ps(xk, tmp1);
541  __m128 yk = _mm_and_ps(correct_helicity, yk1);
542  tmp1 = _mm_andnot_ps(correct_helicity, yk2);
543  yk = _mm_xor_ps(yk, tmp1);
544 
545 
546  __m128 phi_1 = _vec_atan2_ps(yk, xk);
547  //if phi < 0, phi += 2*pi
548  tmp1 = _mm_cmplt_ps(phi_1, zero);
549  tmp2 = _mm_and_ps(tmp1, twopi);
550  tmp1 = _mm_andnot_ps(tmp1, zero);
551  tmp1 = _mm_xor_ps(tmp1, tmp2);
552  phi_1 = _mm_add_ps(phi_1, tmp1);
553  // if neg==true, phi = hit_phi
554  tmp1 = _mm_and_ps(neg, hit_phi);
555  phi_1 = _mm_andnot_ps(neg, phi_1);
556  phi_1 = _mm_xor_ps(tmp1, phi_1);
557  phi_3_out = phi_1;
558 
559  // max_d, max_k
560  d = d_max;
561  k = k_max;
562  ak = d;
563  ak = _mm_mul_ps(d, two);
564  tmp1 = _mm_mul_ps(d,d);
565  tmp1 = _mm_mul_ps(tmp1, k);
566  ak = _mm_add_ps(ak, tmp1);
567  tmp1 = _mm_mul_ps(D,D);
568  tmp1 = _mm_mul_ps(tmp1, k);
569  ak = _mm_add_ps(ak, tmp1);
570  ak = _mm_mul_ps(ak, D_inv);
571  ak = _mm_mul_ps(ak, one_o_2);
572  hk = _mm_mul_ps(d,k);
573  hk = _mm_add_ps(hk, one);
574  hk = _mm_mul_ps(hk,hk);
575  tmp1 = _mm_mul_ps(ak,ak);
576  hk = _mm_sub_ps(hk, tmp1);
577  neg = _mm_cmple_ps(hk, zero);
578  hk = _vec_sqrt_ps(hk);
579 
580  xk1 = _mm_mul_ps(ak, x);
581  tmp1 = _mm_mul_ps(hk,y);
582  xk2 = _mm_sub_ps(xk1, tmp1);
583  xk1 = _mm_add_ps(xk1, tmp1);
584  xk1 = _mm_mul_ps(xk1, D_inv);
585  xk2 = _mm_mul_ps(xk2, D_inv);
586 
587  yk1 = _mm_mul_ps(ak, y);
588  tmp1 = _mm_mul_ps(hk,x);
589  yk2 = _mm_add_ps(yk1, tmp1);
590  yk1 = _mm_sub_ps(yk1, tmp1);
591  yk1 = _mm_mul_ps(yk1, D_inv);
592  yk2 = _mm_mul_ps(yk2, D_inv);
593 
594  xk = _mm_and_ps(correct_helicity, xk1);
595  tmp1 = _mm_andnot_ps(correct_helicity, xk2);
596  xk = _mm_xor_ps(xk, tmp1);
597  yk = _mm_and_ps(correct_helicity, yk1);
598  tmp1 = _mm_andnot_ps(correct_helicity, yk2);
599  yk = _mm_xor_ps(yk, tmp1);
600 
601  __m128 phi_2 = _vec_atan2_ps(yk, xk);
602  //if phi < 0, phi += 2*pi
603  tmp1 = _mm_cmplt_ps(phi_2, zero);
604  tmp2 = _mm_and_ps(tmp1, twopi);
605  tmp1 = _mm_andnot_ps(tmp1, zero);
606  tmp1 = _mm_xor_ps(tmp1, tmp2);
607  phi_2 = _mm_add_ps(phi_2, tmp1);
608  // if neg==true, phi = hit_phi
609  tmp1 = _mm_and_ps(neg, hit_phi);
610  phi_2 = _mm_andnot_ps(neg, phi_2);
611  phi_2 = _mm_xor_ps(tmp1, phi_2);
612  phi_4_out = phi_2;
613 
614 
615  // min_d, min_k
616  d = d_min;
617  k = k_min;
618  ak = d;
619  ak = _mm_mul_ps(d, two);
620  tmp1 = _mm_mul_ps(d,d);
621  tmp1 = _mm_mul_ps(tmp1, k);
622  ak = _mm_add_ps(ak, tmp1);
623  tmp1 = _mm_mul_ps(D,D);
624  tmp1 = _mm_mul_ps(tmp1, k);
625  ak = _mm_add_ps(ak, tmp1);
626  ak = _mm_mul_ps(ak, D_inv);
627  ak = _mm_mul_ps(ak, one_o_2);
628  hk = _mm_mul_ps(d,k);
629  hk = _mm_add_ps(hk, one);
630  hk = _mm_mul_ps(hk,hk);
631  tmp1 = _mm_mul_ps(ak,ak);
632  hk = _mm_sub_ps(hk, tmp1);
633  neg = _mm_cmple_ps(hk, zero);
634  hk = _vec_sqrt_ps(hk);
635 
636  xk1 = _mm_mul_ps(ak, x);
637  tmp1 = _mm_mul_ps(hk,y);
638  xk2 = _mm_sub_ps(xk1, tmp1);
639  xk1 = _mm_add_ps(xk1, tmp1);
640  xk1 = _mm_mul_ps(xk1, D_inv);
641  xk2 = _mm_mul_ps(xk2, D_inv);
642 
643  yk1 = _mm_mul_ps(ak, y);
644  tmp1 = _mm_mul_ps(hk,x);
645  yk2 = _mm_add_ps(yk1, tmp1);
646  yk1 = _mm_sub_ps(yk1, tmp1);
647  yk1 = _mm_mul_ps(yk1, D_inv);
648  yk2 = _mm_mul_ps(yk2, D_inv);
649 
650  xk = _mm_and_ps(correct_helicity, xk1);
651  tmp1 = _mm_andnot_ps(correct_helicity, xk2);
652  xk = _mm_xor_ps(xk, tmp1);
653  yk = _mm_and_ps(correct_helicity, yk1);
654  tmp1 = _mm_andnot_ps(correct_helicity, yk2);
655  yk = _mm_xor_ps(yk, tmp1);
656 
657  __m128 phi_3 = _vec_atan2_ps(yk, xk);
658  //if phi < 0, phi += 2*pi
659  tmp1 = _mm_cmplt_ps(phi_3, zero);
660  tmp2 = _mm_and_ps(tmp1, twopi);
661  tmp1 = _mm_andnot_ps(tmp1, zero);
662  tmp1 = _mm_xor_ps(tmp1, tmp2);
663  phi_3 = _mm_add_ps(phi_3, tmp1);
664  // if neg==true, phi = hit_phi
665  tmp1 = _mm_and_ps(neg, hit_phi);
666  phi_3 = _mm_andnot_ps(neg, phi_3);
667  phi_3 = _mm_xor_ps(tmp1, phi_3);
668 
669 
670  // max_d, min_k
671  d = d_max;
672  k = k_min;
673  ak = d;
674  ak = _mm_mul_ps(d, two);
675  tmp1 = _mm_mul_ps(d,d);
676  tmp1 = _mm_mul_ps(tmp1, k);
677  ak = _mm_add_ps(ak, tmp1);
678  tmp1 = _mm_mul_ps(D,D);
679  tmp1 = _mm_mul_ps(tmp1, k);
680  ak = _mm_add_ps(ak, tmp1);
681  ak = _mm_mul_ps(ak, D_inv);
682  ak = _mm_mul_ps(ak, one_o_2);
683  hk = _mm_mul_ps(d,k);
684  hk = _mm_add_ps(hk, one);
685  hk = _mm_mul_ps(hk,hk);
686  tmp1 = _mm_mul_ps(ak,ak);
687  hk = _mm_sub_ps(hk, tmp1);
688  neg = _mm_cmple_ps(hk, zero);
689  hk = _vec_sqrt_ps(hk);
690 
691  xk1 = _mm_mul_ps(ak, x);
692  tmp1 = _mm_mul_ps(hk,y);
693  xk2 = _mm_sub_ps(xk1, tmp1);
694  xk1 = _mm_add_ps(xk1, tmp1);
695  xk1 = _mm_mul_ps(xk1, D_inv);
696  xk2 = _mm_mul_ps(xk2, D_inv);
697 
698  yk1 = _mm_mul_ps(ak, y);
699  tmp1 = _mm_mul_ps(hk,x);
700  yk2 = _mm_add_ps(yk1, tmp1);
701  yk1 = _mm_sub_ps(yk1, tmp1);
702  yk1 = _mm_mul_ps(yk1, D_inv);
703  yk2 = _mm_mul_ps(yk2, D_inv);
704 
705  xk = _mm_and_ps(correct_helicity, xk1);
706  tmp1 = _mm_andnot_ps(correct_helicity, xk2);
707  xk = _mm_xor_ps(xk, tmp1);
708  yk = _mm_and_ps(correct_helicity, yk1);
709  tmp1 = _mm_andnot_ps(correct_helicity, yk2);
710  yk = _mm_xor_ps(yk, tmp1);
711 
712  __m128 phi_4 = _vec_atan2_ps(yk, xk);
713  //if phi < 0, phi += 2*pi
714  tmp1 = _mm_cmplt_ps(phi_4, zero);
715  tmp2 = _mm_and_ps(tmp1, twopi);
716  tmp1 = _mm_andnot_ps(tmp1, zero);
717  tmp1 = _mm_xor_ps(tmp1, tmp2);
718  phi_4 = _mm_add_ps(phi_4, tmp1);
719  // if neg==true, phi = hit_phi
720  tmp1 = _mm_and_ps(neg, hit_phi);
721  phi_4 = _mm_andnot_ps(neg, phi_4);
722  phi_4 = _mm_xor_ps(tmp1, phi_4);
723 
725 
726  // check if phi overlaps the 0,2pi jump
727  tmp1 = _mm_cmplt_ps(phi_1, pi_over_two);
728  tmp2 = _mm_cmplt_ps(phi_2, pi_over_two);
729  tmp1 = _mm_or_ps(tmp1, tmp2);
730  tmp2 = _mm_cmplt_ps(phi_3, pi_over_two);
731  tmp1 = _mm_or_ps(tmp1, tmp2);
732  tmp2 = _mm_cmplt_ps(phi_4, pi_over_two);
733  tmp1 = _mm_or_ps(tmp1, tmp2);
734 
735  tmp2 = _mm_cmpgt_ps(phi_1, three_pi_over_two);
736  __m128 tmp3 = _mm_cmpgt_ps(phi_2, three_pi_over_two);
737  tmp2 = _mm_or_ps(tmp2, tmp3);
738  tmp3 = _mm_cmpgt_ps(phi_3, three_pi_over_two);
739  tmp2 = _mm_or_ps(tmp2, tmp3);
740  tmp3 = _mm_cmpgt_ps(phi_4, three_pi_over_two);
741  tmp2 = _mm_or_ps(tmp2, tmp3);
742 
743  tmp1 = _mm_and_ps(tmp1, tmp2);
744 
745  // tmp1 is now all ones if phi overlaps the jump, all zeros otherwise
746  // if tmp1 is true, then subtract 2*pi from all of the phi values > 3*pi/2
747  tmp2 = _mm_and_ps(tmp1, twopi);
748  tmp3 = _mm_andnot_ps(tmp1, zero);
749  tmp2 = _mm_xor_ps(tmp2, tmp3);
750 
751  __m128 tmp4 = _mm_cmpgt_ps(phi_1, three_pi_over_two);
752  tmp3 = _mm_and_ps(tmp4, tmp2);
753  __m128 tmp5 = _mm_andnot_ps(tmp4, zero);
754  tmp3 = _mm_xor_ps(tmp3, tmp5);
755  phi_1 = _mm_sub_ps(phi_1, tmp3);
756 
757  tmp4 = _mm_cmpgt_ps(phi_2, three_pi_over_two);
758  tmp3 = _mm_and_ps(tmp4, tmp2);
759  tmp5 = _mm_andnot_ps(tmp4, zero);
760  tmp3 = _mm_xor_ps(tmp3, tmp5);
761  phi_2 = _mm_sub_ps(phi_2, tmp3);
762 
763  tmp4 = _mm_cmpgt_ps(phi_3, three_pi_over_two);
764  tmp3 = _mm_and_ps(tmp4, tmp2);
765  tmp5 = _mm_andnot_ps(tmp4, zero);
766  tmp3 = _mm_xor_ps(tmp3, tmp5);
767  phi_3 = _mm_sub_ps(phi_3, tmp3);
768 
769  tmp4 = _mm_cmpgt_ps(phi_4, three_pi_over_two);
770  tmp3 = _mm_and_ps(tmp4, tmp2);
771  tmp5 = _mm_andnot_ps(tmp4, zero);
772  tmp3 = _mm_xor_ps(tmp3, tmp5);
773  phi_4 = _mm_sub_ps(phi_4, tmp3);
774 
775 
776  // find the minimum phi
777  __m128 phi_min = phi_1;
778  tmp2 = _mm_cmplt_ps(phi_2, phi_min);
779  tmp3 = _mm_and_ps(tmp2, phi_2);
780  phi_min = _mm_andnot_ps(tmp2, phi_min);
781  phi_min = _mm_xor_ps(phi_min, tmp3);
782  tmp2 = _mm_cmplt_ps(phi_3, phi_min);
783  tmp3 = _mm_and_ps(tmp2, phi_3);
784  phi_min = _mm_andnot_ps(tmp2, phi_min);
785  phi_min = _mm_xor_ps(phi_min, tmp3);
786  tmp2 = _mm_cmplt_ps(phi_4, phi_min);
787  tmp3 = _mm_and_ps(tmp2, phi_4);
788  phi_min = _mm_andnot_ps(tmp2, phi_min);
789  phi_min = _mm_xor_ps(phi_min, tmp3);
790 
791  // find the maximum phi
792  __m128 phi_max = phi_1;
793  tmp2 = _mm_cmpgt_ps(phi_2, phi_max);
794  tmp3 = _mm_and_ps(tmp2, phi_2);
795  phi_max = _mm_andnot_ps(tmp2, phi_max);
796  phi_max = _mm_xor_ps(phi_max, tmp3);
797  tmp2 = _mm_cmpgt_ps(phi_3, phi_max);
798  tmp3 = _mm_and_ps(tmp2, phi_3);
799  phi_max = _mm_andnot_ps(tmp2, phi_max);
800  phi_max = _mm_xor_ps(phi_max, tmp3);
801  tmp2 = _mm_cmpgt_ps(phi_4, phi_max);
802  tmp3 = _mm_and_ps(tmp2, phi_4);
803  phi_max = _mm_andnot_ps(tmp2, phi_max);
804  phi_max = _mm_xor_ps(phi_max, tmp3);
805 
806 
807  _mm_store_ps(min_phi, phi_min);
808  _mm_store_ps(max_phi, phi_max);
809 }
810 
811 
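// As above, but the two min_k corner phis are passed in through phi_3 and
// phi_4 (and seam-adjusted in place) instead of being recomputed, so only the
// two max_k corners are evaluated here.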
void HelixHough::phiRange_sse(float* hit_x, float* hit_y, float* min_d, float* max_d, float* max_k, float* min_phi, float* max_phi, float hel, __m128& phi_3, __m128& phi_4, __m128& phi_3_out, __m128& phi_4_out)
{
  __m128 helicity_vec = _mm_load1_ps(&(hel));

  __m128 x = _mm_load_ps(hit_x);
  __m128 y = _mm_load_ps(hit_y);

  __m128 d_min = _mm_load_ps(min_d);
  __m128 d_max = _mm_load_ps(max_d);
  __m128 k_max = _mm_load_ps(max_k);

  __m128 hit_phi = _vec_atan2_ps(y,x);
  //if phi < 0, phi += 2*pi
  __m128 tmp1 = _mm_cmplt_ps(hit_phi, zero);
  __m128 tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  hit_phi = _mm_add_ps(hit_phi, tmp1);

  // min_d, max_k
  __m128 d = d_min;
  __m128 k = k_max;
  __m128 D = _mm_mul_ps(x,x);
  tmp1 = _mm_mul_ps(y,y);
  D = _mm_add_ps(D,tmp1);
  D = _vec_sqrt_ps(D);
  __m128 D_inv = _vec_rec_ps(D);
  __m128 ak = _mm_mul_ps(d, two);
  tmp1 = _mm_mul_ps(d,d);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  tmp1 = _mm_mul_ps(D,D);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  ak = _mm_mul_ps(ak, D_inv);
  ak = _mm_mul_ps(ak, one_o_2);
  __m128 hk = _mm_mul_ps(d,k);
  hk = _mm_add_ps(hk, one);
  hk = _mm_mul_ps(hk,hk);
  tmp1 = _mm_mul_ps(ak,ak);
  hk = _mm_sub_ps(hk, tmp1);
  __m128 neg = _mm_cmple_ps(hk, zero);
  hk = _vec_sqrt_ps(hk);

  __m128 xk1 = _mm_mul_ps(ak, x);
  tmp1 = _mm_mul_ps(hk,y);
  __m128 xk2 = _mm_sub_ps(xk1, tmp1);
  xk1 = _mm_add_ps(xk1, tmp1);
  xk1 = _mm_mul_ps(xk1, D_inv);
  xk2 = _mm_mul_ps(xk2, D_inv);

  __m128 yk1 = _mm_mul_ps(ak, y);
  tmp1 = _mm_mul_ps(hk,x);
  __m128 yk2 = _mm_add_ps(yk1, tmp1);
  yk1 = _mm_sub_ps(yk1, tmp1);
  yk1 = _mm_mul_ps(yk1, D_inv);
  yk2 = _mm_mul_ps(yk2, D_inv);

  __m128 crossproduct = _mm_mul_ps(x, yk1);
  __m128 crosstemp = _mm_mul_ps(y, xk1);
  crossproduct = _mm_sub_ps(crossproduct, crosstemp);
  __m128 correct_helicity = compare_sign(crossproduct, helicity_vec);

  __m128 xk = _mm_and_ps(correct_helicity, xk1);
  tmp1 = _mm_andnot_ps(correct_helicity, xk2);
  xk = _mm_xor_ps(xk, tmp1);
  __m128 yk = _mm_and_ps(correct_helicity, yk1);
  tmp1 = _mm_andnot_ps(correct_helicity, yk2);
  yk = _mm_xor_ps(yk, tmp1);


  __m128 phi_1 = _vec_atan2_ps(yk, xk);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_1, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_1 = _mm_add_ps(phi_1, tmp1);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi);
  phi_1 = _mm_andnot_ps(neg, phi_1);
  phi_1 = _mm_xor_ps(tmp1, phi_1);
  phi_3_out = phi_1;

  // max_d, max_k
  d = d_max;
  k = k_max;
  ak = _mm_mul_ps(d, two);
  tmp1 = _mm_mul_ps(d,d);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  tmp1 = _mm_mul_ps(D,D);
  tmp1 = _mm_mul_ps(tmp1, k);
  ak = _mm_add_ps(ak, tmp1);
  ak = _mm_mul_ps(ak, D_inv);
  ak = _mm_mul_ps(ak, one_o_2);
  hk = _mm_mul_ps(d,k);
  hk = _mm_add_ps(hk, one);
  hk = _mm_mul_ps(hk,hk);
  tmp1 = _mm_mul_ps(ak,ak);
  hk = _mm_sub_ps(hk, tmp1);
  neg = _mm_cmple_ps(hk, zero);
  hk = _vec_sqrt_ps(hk);

  xk1 = _mm_mul_ps(ak, x);
  tmp1 = _mm_mul_ps(hk,y);
  xk2 = _mm_sub_ps(xk1, tmp1);
  xk1 = _mm_add_ps(xk1, tmp1);
  xk1 = _mm_mul_ps(xk1, D_inv);
  xk2 = _mm_mul_ps(xk2, D_inv);

  yk1 = _mm_mul_ps(ak, y);
  tmp1 = _mm_mul_ps(hk,x);
  yk2 = _mm_add_ps(yk1, tmp1);
  yk1 = _mm_sub_ps(yk1, tmp1);
  yk1 = _mm_mul_ps(yk1, D_inv);
  yk2 = _mm_mul_ps(yk2, D_inv);

  xk = _mm_and_ps(correct_helicity, xk1);
  tmp1 = _mm_andnot_ps(correct_helicity, xk2);
  xk = _mm_xor_ps(xk, tmp1);
  yk = _mm_and_ps(correct_helicity, yk1);
  tmp1 = _mm_andnot_ps(correct_helicity, yk2);
  yk = _mm_xor_ps(yk, tmp1);

  __m128 phi_2 = _vec_atan2_ps(yk, xk);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_2, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_2 = _mm_add_ps(phi_2, tmp1);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi);
  phi_2 = _mm_andnot_ps(neg, phi_2);
  phi_2 = _mm_xor_ps(tmp1, phi_2);
  phi_4_out = phi_2;

  // check if phi overlaps the 0,2pi jump
  tmp1 = _mm_cmplt_ps(phi_1, pi_over_two);
  tmp2 = _mm_cmplt_ps(phi_2, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2);
  tmp2 = _mm_cmplt_ps(phi_3, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2);
  tmp2 = _mm_cmplt_ps(phi_4, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2);

  tmp2 = _mm_cmpgt_ps(phi_1, three_pi_over_two);
  __m128 tmp3 = _mm_cmpgt_ps(phi_2, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3);
  tmp3 = _mm_cmpgt_ps(phi_3, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3);
  tmp3 = _mm_cmpgt_ps(phi_4, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3);

  tmp1 = _mm_and_ps(tmp1, tmp2);

  // tmp1 is now all ones if phi overlaps the jump, all zeros otherwise
  // if tmp1 is true, then subtract 2*pi from all of the phi values > 3*pi/2
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp3 = _mm_andnot_ps(tmp1, zero);
  tmp2 = _mm_xor_ps(tmp2, tmp3);

  __m128 tmp4 = _mm_cmpgt_ps(phi_1, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  __m128 tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_1 = _mm_sub_ps(phi_1, tmp3);

  tmp4 = _mm_cmpgt_ps(phi_2, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_2 = _mm_sub_ps(phi_2, tmp3);

  tmp4 = _mm_cmpgt_ps(phi_3, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_3 = _mm_sub_ps(phi_3, tmp3);

  tmp4 = _mm_cmpgt_ps(phi_4, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_4 = _mm_sub_ps(phi_4, tmp3);


  // find the minimum phi
  __m128 phi_min = phi_1;
  tmp2 = _mm_cmplt_ps(phi_2, phi_min);
  tmp3 = _mm_and_ps(tmp2, phi_2);
  phi_min = _mm_andnot_ps(tmp2, phi_min);
  phi_min = _mm_xor_ps(phi_min, tmp3);
  tmp2 = _mm_cmplt_ps(phi_3, phi_min);
  tmp3 = _mm_and_ps(tmp2, phi_3);
  phi_min = _mm_andnot_ps(tmp2, phi_min);
  phi_min = _mm_xor_ps(phi_min, tmp3);
  tmp2 = _mm_cmplt_ps(phi_4, phi_min);
  tmp3 = _mm_and_ps(tmp2, phi_4);
  phi_min = _mm_andnot_ps(tmp2, phi_min);
  phi_min = _mm_xor_ps(phi_min, tmp3);

  // find the maximum phi
  __m128 phi_max = phi_1;
  tmp2 = _mm_cmpgt_ps(phi_2, phi_max);
  tmp3 = _mm_and_ps(tmp2, phi_2);
  phi_max = _mm_andnot_ps(tmp2, phi_max);
  phi_max = _mm_xor_ps(phi_max, tmp3);
  tmp2 = _mm_cmpgt_ps(phi_3, phi_max);
  tmp3 = _mm_and_ps(tmp2, phi_3);
  phi_max = _mm_andnot_ps(tmp2, phi_max);
  phi_max = _mm_xor_ps(phi_max, tmp3);
  tmp2 = _mm_cmpgt_ps(phi_4, phi_max);
  tmp3 = _mm_and_ps(tmp2, phi_4);
  phi_max = _mm_andnot_ps(tmp2, phi_max);
  phi_max = _mm_xor_ps(phi_max, tmp3);


  _mm_store_ps(min_phi, phi_min);
  _mm_store_ps(max_phi, phi_max);
}

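// Same as the four-corner helicity variant above, but processing two blocks
// of four hits at once (the _2 suffix), with the two streams interleaved
// statement by statement to expose instruction-level parallelism.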
void HelixHough::phiRange_sse(float* hit_x, float* hit_y, float* min_d, float* max_d, float* min_k, float* max_k, float* min_phi, float* max_phi, float* min_phi_2, float* max_phi_2, float hel, __m128& phi_3_out, __m128& phi_4_out, float* hit_x_2, float* hit_y_2, __m128& phi_3_out_2, __m128& phi_4_out_2)
{
  __m128 helicity_vec = _mm_load1_ps(&(hel));

  __m128 x = _mm_load_ps(hit_x); __m128 x_2 = _mm_load_ps(hit_x_2);
  __m128 y = _mm_load_ps(hit_y); __m128 y_2 = _mm_load_ps(hit_y_2);

  __m128 d_min = _mm_load_ps(min_d);
  __m128 d_max = _mm_load_ps(max_d);
  __m128 k_min = _mm_load_ps(min_k);
  __m128 k_max = _mm_load_ps(max_k);

  __m128 hit_phi = _vec_atan2_ps(y,x); __m128 hit_phi_2 = _vec_atan2_ps(y_2,x_2);
  //if phi < 0, phi += 2*pi
  __m128 tmp1 = _mm_cmplt_ps(hit_phi, zero); __m128 tmp1_2 = _mm_cmplt_ps(hit_phi_2, zero);
  __m128 tmp2 = _mm_and_ps(tmp1, twopi); __m128 tmp2_2 = _mm_and_ps(tmp1_2, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero); tmp1_2 = _mm_andnot_ps(tmp1_2, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
  hit_phi = _mm_add_ps(hit_phi, tmp1); hit_phi_2 = _mm_add_ps(hit_phi_2, tmp1_2);

  // min_d, max_k
  __m128 d = d_min;
  __m128 k = k_max;
  __m128 D = _mm_mul_ps(x,x); __m128 D_2 = _mm_mul_ps(x_2,x_2);
  tmp1 = _mm_mul_ps(y,y); tmp1_2 = _mm_mul_ps(y_2,y_2);
  D = _mm_add_ps(D,tmp1); D_2 = _mm_add_ps(D_2,tmp1_2);
  D = _vec_sqrt_ps(D); D_2 = _vec_sqrt_ps(D_2);
  __m128 D_inv = _vec_rec_ps(D); __m128 D_inv_2 = _vec_rec_ps(D_2);
  __m128 ak = _mm_mul_ps(d, two); __m128 ak_2 = _mm_mul_ps(d, two);
  tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
  tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
  ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
  ak = _mm_mul_ps(ak, one_o_2); ak_2 = _mm_mul_ps(ak_2, one_o_2);
  __m128 hk = _mm_mul_ps(d,k); __m128 hk_2 = _mm_mul_ps(d,k);
  hk = _mm_add_ps(hk, one); hk_2 = _mm_add_ps(hk_2, one);
  hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
  tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
  hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
  __m128 neg = _mm_cmple_ps(hk, zero); __m128 neg_2 = _mm_cmple_ps(hk_2, zero);
  hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);

  __m128 xk1 = _mm_mul_ps(ak, x); __m128 xk1_2 = _mm_mul_ps(ak_2, x_2);
  tmp1 = _mm_mul_ps(hk,y); tmp1_2 = _mm_mul_ps(hk_2,y_2);
  __m128 xk2 = _mm_sub_ps(xk1, tmp1); __m128 xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
  xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
  xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
  xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);

  __m128 yk1 = _mm_mul_ps(ak, y); __m128 yk1_2 = _mm_mul_ps(ak_2, y_2);
  tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2,x_2);
  __m128 yk2 = _mm_add_ps(yk1, tmp1); __m128 yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
  yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
  yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
  yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);

  __m128 crossproduct = _mm_mul_ps(x, yk1); __m128 crossproduct_2 = _mm_mul_ps(x_2, yk1_2);
  __m128 crosstemp = _mm_mul_ps(y, xk1); __m128 crosstemp_2 = _mm_mul_ps(y_2, xk1_2);
  crossproduct = _mm_sub_ps(crossproduct, crosstemp); crossproduct_2 = _mm_sub_ps(crossproduct_2, crosstemp_2);
  __m128 correct_helicity = compare_sign(crossproduct, helicity_vec); __m128 correct_helicity_2 = compare_sign(crossproduct_2, helicity_vec);

  __m128 xk = _mm_and_ps(correct_helicity, xk1); __m128 xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
  tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
  xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
  __m128 yk = _mm_and_ps(correct_helicity, yk1); __m128 yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
  tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
  yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);


  __m128 phi_1 = _vec_atan2_ps(yk, xk); __m128 phi_1_2 = _vec_atan2_ps(yk_2, xk_2);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_1, zero); tmp1_2 = _mm_cmplt_ps(phi_1_2, zero);
  tmp2 = _mm_and_ps(tmp1, twopi); tmp2_2 = _mm_and_ps(tmp1_2, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero); tmp1_2 = _mm_andnot_ps(tmp1_2, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
  phi_1 = _mm_add_ps(phi_1, tmp1); phi_1_2 = _mm_add_ps(phi_1_2, tmp1_2);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
  phi_1 = _mm_andnot_ps(neg, phi_1); phi_1_2 = _mm_andnot_ps(neg_2, phi_1_2);
  phi_1 = _mm_xor_ps(tmp1, phi_1); phi_1_2 = _mm_xor_ps(tmp1_2, phi_1_2);
  phi_3_out = phi_1; phi_3_out_2 = phi_1_2;

  // max_d, max_k
  d = d_max;
  k = k_max;
  ak = _mm_mul_ps(d, two); ak_2 = _mm_mul_ps(d, two);
  tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
  tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
  ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
  ak = _mm_mul_ps(ak, one_o_2); ak_2 = _mm_mul_ps(ak_2, one_o_2);
  hk = _mm_mul_ps(d,k); hk_2 = _mm_mul_ps(d,k);
  hk = _mm_add_ps(hk, one); hk_2 = _mm_add_ps(hk_2, one);
  hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
  tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
  hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
  neg = _mm_cmple_ps(hk, zero); neg_2 = _mm_cmple_ps(hk_2, zero);
  hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);

  xk1 = _mm_mul_ps(ak, x); xk1_2 = _mm_mul_ps(ak_2, x_2);
  tmp1 = _mm_mul_ps(hk,y); tmp1_2 = _mm_mul_ps(hk_2,y_2);
  xk2 = _mm_sub_ps(xk1, tmp1); xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
  xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
  xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
  xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);

  yk1 = _mm_mul_ps(ak, y); yk1_2 = _mm_mul_ps(ak_2, y_2);
  tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2, x_2);
  yk2 = _mm_add_ps(yk1, tmp1); yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
  yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
  yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
  yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);

  xk = _mm_and_ps(correct_helicity, xk1); xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
  tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
  xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
  yk = _mm_and_ps(correct_helicity, yk1); yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
  tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
  yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);

  __m128 phi_2 = _vec_atan2_ps(yk, xk); __m128 phi_2_2 = _vec_atan2_ps(yk_2, xk_2);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_2, zero); tmp1_2 = _mm_cmplt_ps(phi_2_2, zero);
  tmp2 = _mm_and_ps(tmp1, twopi); tmp2_2 = _mm_and_ps(tmp1_2, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero); tmp1_2 = _mm_andnot_ps(tmp1_2, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
  phi_2 = _mm_add_ps(phi_2, tmp1); phi_2_2 = _mm_add_ps(phi_2_2, tmp1_2);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
  phi_2 = _mm_andnot_ps(neg, phi_2); phi_2_2 = _mm_andnot_ps(neg_2, phi_2_2);
  phi_2 = _mm_xor_ps(tmp1, phi_2); phi_2_2 = _mm_xor_ps(tmp1_2, phi_2_2);
  phi_4_out = phi_2; phi_4_out_2 = phi_2_2;


  // min_d, min_k
  d = d_min;
  k = k_min;
  ak = _mm_mul_ps(d, two); ak_2 = _mm_mul_ps(d, two);
  tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
  tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
  ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
  ak = _mm_mul_ps(ak, one_o_2); ak_2 = _mm_mul_ps(ak_2, one_o_2);
  hk = _mm_mul_ps(d,k); hk_2 = _mm_mul_ps(d,k);
  hk = _mm_add_ps(hk, one); hk_2 = _mm_add_ps(hk_2, one);
  hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
  tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
  hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
  neg = _mm_cmple_ps(hk, zero); neg_2 = _mm_cmple_ps(hk_2, zero);
  hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);

  xk1 = _mm_mul_ps(ak, x); xk1_2 = _mm_mul_ps(ak_2, x_2);
  tmp1 = _mm_mul_ps(hk, y); tmp1_2 = _mm_mul_ps(hk_2, y_2);
  xk2 = _mm_sub_ps(xk1, tmp1); xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
  xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
  xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
  xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);

  yk1 = _mm_mul_ps(ak, y); yk1_2 = _mm_mul_ps(ak_2, y_2);
  tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2, x_2);
  yk2 = _mm_add_ps(yk1, tmp1); yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
  yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
  yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
  yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);

  xk = _mm_and_ps(correct_helicity, xk1); xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
  tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
  xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
  yk = _mm_and_ps(correct_helicity, yk1); yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
  tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
  yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);

  __m128 phi_3 = _vec_atan2_ps(yk, xk); __m128 phi_3_2 = _vec_atan2_ps(yk_2, xk_2);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_3, zero); tmp1_2 = _mm_cmplt_ps(phi_3_2, zero);
  tmp2 = _mm_and_ps(tmp1, twopi); tmp2_2 = _mm_and_ps(tmp1_2, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero); tmp1_2 = _mm_andnot_ps(tmp1_2, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
  phi_3 = _mm_add_ps(phi_3, tmp1); phi_3_2 = _mm_add_ps(phi_3_2, tmp1_2);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
  phi_3 = _mm_andnot_ps(neg, phi_3); phi_3_2 = _mm_andnot_ps(neg_2, phi_3_2);
  phi_3 = _mm_xor_ps(tmp1, phi_3); phi_3_2 = _mm_xor_ps(tmp1_2, phi_3_2);


  // max_d, min_k
  d = d_max;
  k = k_min;
  ak = _mm_mul_ps(d, two); ak_2 = _mm_mul_ps(d, two);
  tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
  tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
  ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
  ak = _mm_mul_ps(ak, one_o_2); ak_2 = _mm_mul_ps(ak_2, one_o_2);
  hk = _mm_mul_ps(d,k); hk_2 = _mm_mul_ps(d,k);
  hk = _mm_add_ps(hk, one); hk_2 = _mm_add_ps(hk_2, one);
  hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
  tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
  hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
  neg = _mm_cmple_ps(hk, zero); neg_2 = _mm_cmple_ps(hk_2, zero);
  hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);

  xk1 = _mm_mul_ps(ak, x); xk1_2 = _mm_mul_ps(ak_2, x_2);
  tmp1 = _mm_mul_ps(hk, y); tmp1_2 = _mm_mul_ps(hk_2, y_2);
  xk2 = _mm_sub_ps(xk1, tmp1); xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
  xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
  xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
  xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);

  yk1 = _mm_mul_ps(ak, y); yk1_2 = _mm_mul_ps(ak_2, y_2);
  tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2, x_2);
  yk2 = _mm_add_ps(yk1, tmp1); yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
  yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
  yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
  yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);

  xk = _mm_and_ps(correct_helicity, xk1); xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
  tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
  xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
  yk = _mm_and_ps(correct_helicity, yk1); yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
  tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
  yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);

  __m128 phi_4 = _vec_atan2_ps(yk, xk); __m128 phi_4_2 = _vec_atan2_ps(yk_2, xk_2);
  //if phi < 0, phi += 2*pi
  tmp1 = _mm_cmplt_ps(phi_4, zero); tmp1_2 = _mm_cmplt_ps(phi_4_2, zero);
  tmp2 = _mm_and_ps(tmp1, twopi); tmp2_2 = _mm_and_ps(tmp1_2, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero); tmp1_2 = _mm_andnot_ps(tmp1_2, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
  phi_4 = _mm_add_ps(phi_4, tmp1); phi_4_2 = _mm_add_ps(phi_4_2, tmp1_2);
  // if neg==true, phi = hit_phi
  tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
  phi_4 = _mm_andnot_ps(neg, phi_4); phi_4_2 = _mm_andnot_ps(neg_2, phi_4_2);
  phi_4 = _mm_xor_ps(tmp1, phi_4); phi_4_2 = _mm_xor_ps(tmp1_2, phi_4_2);


  // check if phi overlaps the 0,2pi jump
  tmp1 = _mm_cmplt_ps(phi_1, pi_over_two); tmp1_2 = _mm_cmplt_ps(phi_1_2, pi_over_two);
  tmp2 = _mm_cmplt_ps(phi_2, pi_over_two); tmp2_2 = _mm_cmplt_ps(phi_2_2, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
  tmp2 = _mm_cmplt_ps(phi_3, pi_over_two); tmp2_2 = _mm_cmplt_ps(phi_3_2, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
  tmp2 = _mm_cmplt_ps(phi_4, pi_over_two); tmp2_2 = _mm_cmplt_ps(phi_4_2, pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);

  tmp2 = _mm_cmpgt_ps(phi_1, three_pi_over_two); tmp2_2 = _mm_cmpgt_ps(phi_1_2, three_pi_over_two);
  __m128 tmp3 = _mm_cmpgt_ps(phi_2, three_pi_over_two); __m128 tmp3_2 = _mm_cmpgt_ps(phi_2_2, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
  tmp3 = _mm_cmpgt_ps(phi_3, three_pi_over_two); tmp3_2 = _mm_cmpgt_ps(phi_3_2, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
  tmp3 = _mm_cmpgt_ps(phi_4, three_pi_over_two); tmp3_2 = _mm_cmpgt_ps(phi_4_2, three_pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);

  tmp1 = _mm_and_ps(tmp1, tmp2); tmp1_2 = _mm_and_ps(tmp1_2, tmp2_2);

  // tmp1 is now all ones if phi overlaps the jump, all zeros otherwise
  // if tmp1 is true, then subtract 2*pi from all of the phi values > 3*pi/2
  tmp2 = _mm_and_ps(tmp1, twopi); tmp2_2 = _mm_and_ps(tmp1_2, twopi);
  tmp3 = _mm_andnot_ps(tmp1, zero); tmp3_2 = _mm_andnot_ps(tmp1_2, zero);
  tmp2 = _mm_xor_ps(tmp2, tmp3); tmp2_2 = _mm_xor_ps(tmp2_2, tmp3_2);

  __m128 tmp4 = _mm_cmpgt_ps(phi_1, three_pi_over_two); __m128 tmp4_2 = _mm_cmpgt_ps(phi_1_2, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
  __m128 tmp5 = _mm_andnot_ps(tmp4, zero); __m128 tmp5_2 = _mm_andnot_ps(tmp4_2, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
  phi_1 = _mm_sub_ps(phi_1, tmp3); phi_1_2 = _mm_sub_ps(phi_1_2, tmp3_2);

  tmp4 = _mm_cmpgt_ps(phi_2, three_pi_over_two); tmp4_2 = _mm_cmpgt_ps(phi_2_2, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
  tmp5 = _mm_andnot_ps(tmp4, zero); tmp5_2 = _mm_andnot_ps(tmp4_2, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
  phi_2 = _mm_sub_ps(phi_2, tmp3); phi_2_2 = _mm_sub_ps(phi_2_2, tmp3_2);

  tmp4 = _mm_cmpgt_ps(phi_3, three_pi_over_two); tmp4_2 = _mm_cmpgt_ps(phi_3_2, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
  tmp5 = _mm_andnot_ps(tmp4, zero); tmp5_2 = _mm_andnot_ps(tmp4_2, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
  phi_3 = _mm_sub_ps(phi_3, tmp3); phi_3_2 = _mm_sub_ps(phi_3_2, tmp3_2);

  tmp4 = _mm_cmpgt_ps(phi_4, three_pi_over_two); tmp4_2 = _mm_cmpgt_ps(phi_4_2, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
  tmp5 = _mm_andnot_ps(tmp4, zero); tmp5_2 = _mm_andnot_ps(tmp4_2, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
  phi_4 = _mm_sub_ps(phi_4, tmp3); phi_4_2 = _mm_sub_ps(phi_4_2, tmp3_2);


  // find the minimum phi
  __m128 phi_min = phi_1; __m128 phi_min_2 = phi_1_2;
  tmp2 = _mm_cmplt_ps(phi_2, phi_min); tmp2_2 = _mm_cmplt_ps(phi_2_2, phi_min_2);
  tmp3 = _mm_and_ps(tmp2, phi_2); tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
  phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
  phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
  tmp2 = _mm_cmplt_ps(phi_3, phi_min); tmp2_2 = _mm_cmplt_ps(phi_3_2, phi_min_2);
  tmp3 = _mm_and_ps(tmp2, phi_3); tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
  phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
  phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
  tmp2 = _mm_cmplt_ps(phi_4, phi_min); tmp2_2 = _mm_cmplt_ps(phi_4_2, phi_min_2);
  tmp3 = _mm_and_ps(tmp2, phi_4); tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
  phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
  phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);

  // find the maximum phi
  __m128 phi_max = phi_1; __m128 phi_max_2 = phi_1_2;
  tmp2 = _mm_cmpgt_ps(phi_2, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_2_2, phi_max_2);
  tmp3 = _mm_and_ps(tmp2, phi_2); tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
  phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
  phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
  tmp2 = _mm_cmpgt_ps(phi_3, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_3_2, phi_max_2);
  tmp3 = _mm_and_ps(tmp2, phi_3); tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
  phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
  phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
  tmp2 = _mm_cmpgt_ps(phi_4, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_4_2, phi_max_2);
  tmp3 = _mm_and_ps(tmp2, phi_4); tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
  phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
  phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);


  _mm_store_ps(min_phi, phi_min); _mm_store_ps(min_phi_2, phi_min_2);
  _mm_store_ps(max_phi, phi_max); _mm_store_ps(max_phi_2, phi_max_2);
}

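// Combination of the two preceding variants: two hit blocks at once, with the
// min_k corner phis for both blocks passed in (phi_3/phi_4 and
// phi_3_2/phi_4_2) rather than recomputed; the min_k argument itself is
// unused.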
1377 void HelixHough::phiRange_sse(float* hit_x, float* hit_y, float* min_d, float* max_d, float* /*min_k*/, float* max_k, float* min_phi, float* max_phi, float* min_phi_2, float* max_phi_2, float hel, __m128& phi_3, __m128& phi_4, __m128& phi_3_out, __m128& phi_4_out, float* hit_x_2, float* hit_y_2, __m128& phi_3_2, __m128& phi_4_2, __m128& phi_3_out_2, __m128& phi_4_out_2)
1378 {
1379  __m128 helicity_vec = _mm_load1_ps(&(hel));
1380 
1381  __m128 x = _mm_load_ps(hit_x); __m128 x_2 = _mm_load_ps(hit_x_2);
1382  __m128 y = _mm_load_ps(hit_y); __m128 y_2 = _mm_load_ps(hit_y_2);
1383 
1384  __m128 d_min = _mm_load_ps(min_d);
1385  __m128 d_max = _mm_load_ps(max_d);
1386  __m128 k_max = _mm_load_ps(max_k);
1387 
1388  __m128 hit_phi = _vec_atan2_ps(y,x); __m128 hit_phi_2 = _vec_atan2_ps(y_2,x_2);
1389  //if phi < 0, phi += 2*pi
1390  __m128 tmp1 = _mm_cmplt_ps(hit_phi, zero); __m128 tmp1_2 = _mm_cmplt_ps(hit_phi_2, zero);
1391  __m128 tmp2 = _mm_and_ps(tmp1, twopi); __m128 tmp2_2 = _mm_and_ps(tmp1_2, twopi);
1392  tmp1 = _mm_andnot_ps(tmp1, zero); tmp1_2 = _mm_andnot_ps(tmp1_2, zero);
1393  tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
1394  hit_phi = _mm_add_ps(hit_phi, tmp1); hit_phi_2 = _mm_add_ps(hit_phi_2, tmp1_2);
1395 
1396  // min_d, max_k
1397  __m128 d = d_min;
1398  __m128 k = k_max;
1399  __m128 D = _mm_mul_ps(x,x); __m128 D_2 = _mm_mul_ps(x_2,x_2);
1400  tmp1 = _mm_mul_ps(y,y); tmp1_2 = _mm_mul_ps(y_2,y_2);
1401  D = _mm_add_ps(D,tmp1); D_2 = _mm_add_ps(D_2,tmp1_2);
1402  D = _vec_sqrt_ps(D); D_2 = _vec_sqrt_ps(D_2);
1403  __m128 D_inv = _vec_rec_ps(D); __m128 D_inv_2 = _vec_rec_ps(D_2);
1404  __m128 ak = d; __m128 ak_2 = d;
1405  ak = _mm_mul_ps(d, two); ak_2 = _mm_mul_ps(d, two);
1406  tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
1407  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1408  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1409  tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
1410  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1411  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1412  ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
1413  ak = _mm_mul_ps(ak, one_o_2); ak_2 = _mm_mul_ps(ak_2, one_o_2);
1414  __m128 hk = _mm_mul_ps(d,k); __m128 hk_2 = _mm_mul_ps(d,k);
1415  hk = _mm_add_ps(hk, one); hk_2 = _mm_add_ps(hk_2, one);
1416  hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
1417  tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
1418  hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
1419  __m128 neg = _mm_cmple_ps(hk, zero); __m128 neg_2 = _mm_cmple_ps(hk_2, zero);
1420  hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);
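  // Geometry behind ak and hk: a track with signed DCA d and curvature k
  // is a circle of radius 1/k whose center sits at distance R = d + 1/k
  // from the beam line. For a hit at distance D, the law of cosines in
  // the triangle (origin, hit, center) gives the angle theta between the
  // hit direction and the center direction:
  //   cos(theta) = (D^2 + R^2 - 1/k^2) / (2*D*R)
  // Expanding shows ak = k*R*cos(theta) and, before the sqrt,
  // hk = (k*R)^2 - ak^2 = (k*R*sin(theta))^2. Hence "neg" (hk <= 0)
  // flags lanes where no such circle passes through the hit; those lanes
  // later fall back to the hit azimuth itself.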
1421 
1422  __m128 xk1 = _mm_mul_ps(ak, x); __m128 xk1_2 = _mm_mul_ps(ak_2, x_2);
1423  tmp1 = _mm_mul_ps(hk,y); tmp1_2 = _mm_mul_ps(hk_2,y_2);
1424  __m128 xk2 = _mm_sub_ps(xk1, tmp1); __m128 xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
1425  xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
1426  xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
1427  xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
1428 
1429  __m128 yk1 = _mm_mul_ps(ak, y); __m128 yk1_2 = _mm_mul_ps(ak_2, y_2);
1430  tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2,x_2);
1431  __m128 yk2 = _mm_add_ps(yk1, tmp1); __m128 yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
1432  yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
1433  yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
1434  yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
1435 
1436  __m128 crossproduct = _mm_mul_ps(x, yk1); __m128 crossproduct_2 = _mm_mul_ps(x_2, yk1_2);
1437  __m128 crosstemp = _mm_mul_ps(y, xk1); __m128 crosstemp_2 = _mm_mul_ps(y_2, xk1_2);
1438  crossproduct = _mm_sub_ps(crossproduct, crosstemp); crossproduct_2 = _mm_sub_ps(crossproduct_2, crosstemp_2);
1439  __m128 correct_helicity = compare_sign(crossproduct, helicity_vec); __m128 correct_helicity_2 = compare_sign(crossproduct_2, helicity_vec);
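  // crossproduct is the z-component of hit x (candidate center
  // direction); for the first branch it reduces algebraically to
  //   x*yk1 - y*xk1 = -hk*D,
  // so its sign distinguishes the two intersection branches, i.e. the
  // two senses of rotation. compare_sign (defined elsewhere) presumably
  // returns an all-ones lane mask where the signs agree; the select
  // below then keeps (xk1,yk1) or (xk2,yk2) accordingly.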
1440 
1441  __m128 xk = _mm_and_ps(correct_helicity, xk1); __m128 xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
1442  tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
1443  xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
1444  __m128 yk = _mm_and_ps(correct_helicity, yk1); __m128 yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
1445  tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
1446  yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);
1447 
1448 
1449  __m128 phi_1 = _vec_atan2_ps(yk, xk); __m128 phi_1_2 = _vec_atan2_ps(yk_2, xk_2);
1450  //if phi < 0, phi += 2*pi
1451  tmp1 = _mm_cmplt_ps(phi_1, zero); tmp1_2 = _mm_cmplt_ps(phi_1_2, zero);
1452  tmp2 = _mm_and_ps(tmp1, twopi); tmp2_2 = _mm_and_ps(tmp1_2, twopi);
1453  tmp1 = _mm_andnot_ps(tmp1, zero); tmp1_2 = _mm_andnot_ps(tmp1_2, zero);
1454  tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
1455  phi_1 = _mm_add_ps(phi_1, tmp1); phi_1_2 = _mm_add_ps(phi_1_2, tmp1_2);
1456  // if neg==true, phi = hit_phi
1457  tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
1458  phi_1 = _mm_andnot_ps(neg, phi_1); phi_1_2 = _mm_andnot_ps(neg_2, phi_1_2);
1459  phi_1 = _mm_xor_ps(tmp1, phi_1); phi_1_2 = _mm_xor_ps(tmp1_2, phi_1_2);
1460  phi_3_out = phi_1; phi_3_out_2 = phi_1_2;
1461 
1462  // max_d, max_k
1463  d = d_max;
1464  k = k_max;
1465  ak = d; ak_2 = d;
1466  ak = _mm_mul_ps(d, two); ak_2 = _mm_mul_ps(d, two);
1467  tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
1468  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1469  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1470  tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
1471  tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1472  ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1473  ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
1474  ak = _mm_mul_ps(ak, one_o_2); ak_2 = _mm_mul_ps(ak_2, one_o_2);
1475  hk = _mm_mul_ps(d,k); hk_2 = _mm_mul_ps(d,k);
1476  hk = _mm_add_ps(hk, one); hk_2 = _mm_add_ps(hk_2, one);
1477  hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
1478  tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
1479  hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
1480  neg = _mm_cmple_ps(hk, zero); neg_2 = _mm_cmple_ps(hk_2, zero);
1481  hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);
1482 
1483  xk1 = _mm_mul_ps(ak, x); xk1_2 = _mm_mul_ps(ak_2, x_2);
1484  tmp1 = _mm_mul_ps(hk, y); tmp1_2 = _mm_mul_ps(hk_2, y_2);
1485  xk2 = _mm_sub_ps(xk1, tmp1); xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
1486  xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
1487  xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
1488  xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
1489 
1490  yk1 = _mm_mul_ps(ak, y); yk1_2 = _mm_mul_ps(ak_2, y_2);
1491  tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2, x_2);
1492  yk2 = _mm_add_ps(yk1, tmp1); yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
1493  yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
1494  yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
1495  yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
1496 
1497  xk = _mm_and_ps(correct_helicity, xk1); xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
1498  tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
1499  xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
1500  yk = _mm_and_ps(correct_helicity, yk1); yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
1501  tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
1502  yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);
1503 
1504  __m128 phi_2 = _vec_atan2_ps(yk, xk); __m128 phi_2_2 = _vec_atan2_ps(yk_2, xk_2);
1505  //if phi < 0, phi += 2*pi
1506  tmp1 = _mm_cmplt_ps(phi_2, zero); tmp1_2 = _mm_cmplt_ps(phi_2_2, zero);
1507  tmp2 = _mm_and_ps(tmp1, twopi); tmp2_2 = _mm_and_ps(tmp1_2, twopi);
1508  tmp1 = _mm_andnot_ps(tmp1, zero); tmp1_2 = _mm_andnot_ps(tmp1_2, zero);
1509  tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
1510  phi_2 = _mm_add_ps(phi_2, tmp1); phi_2_2 = _mm_add_ps(phi_2_2, tmp1_2);
1511  // if neg==true, phi = hit_phi
1512  tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
1513  phi_2 = _mm_andnot_ps(neg, phi_2); phi_2_2 = _mm_andnot_ps(neg_2, phi_2_2);
1514  phi_2 = _mm_xor_ps(tmp1, phi_2); phi_2_2 = _mm_xor_ps(tmp1_2, phi_2_2);
1515  phi_4_out = phi_2; phi_4_out_2 = phi_2_2;
1516 
1517 
1518 
1519 
1520  // check if phi overlaps the 0,2pi jump
1521  tmp1 = _mm_cmplt_ps(phi_1, pi_over_two); tmp1_2 = _mm_cmplt_ps(phi_1_2, pi_over_two);
1522  tmp2 = _mm_cmplt_ps(phi_2, pi_over_two); tmp2_2 = _mm_cmplt_ps(phi_2_2, pi_over_two);
1523  tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
1524  tmp2 = _mm_cmplt_ps(phi_3, pi_over_two); tmp2_2 = _mm_cmplt_ps(phi_3_2, pi_over_two);
1525  tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
1526  tmp2 = _mm_cmplt_ps(phi_4, pi_over_two); tmp2_2 = _mm_cmplt_ps(phi_4_2, pi_over_two);
1527  tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
1528 
1529  tmp2 = _mm_cmpgt_ps(phi_1, three_pi_over_two); tmp2_2 = _mm_cmpgt_ps(phi_1_2, three_pi_over_two);
1530  __m128 tmp3 = _mm_cmpgt_ps(phi_2, three_pi_over_two); __m128 tmp3_2 = _mm_cmpgt_ps(phi_2_2, three_pi_over_two);
1531  tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
1532  tmp3 = _mm_cmpgt_ps(phi_3, three_pi_over_two); tmp3_2 = _mm_cmpgt_ps(phi_3_2, three_pi_over_two);
1533  tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
1534  tmp3 = _mm_cmpgt_ps(phi_4, three_pi_over_two); tmp3_2 = _mm_cmpgt_ps(phi_4_2, three_pi_over_two);
1535  tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
1536 
1537  tmp1 = _mm_and_ps(tmp1, tmp2); tmp1_2 = _mm_and_ps(tmp1_2, tmp2_2);
1538 
1539  // tmp1 is now all ones if phi overlaps the jump, all zeros otherwise
1540  // if tmp1 is true, then subtract 2*pi from all of the phi values > 3*pi/2
1541  tmp2 = _mm_and_ps(tmp1, twopi); tmp2_2 = _mm_and_ps(tmp1_2, twopi);
1542  tmp3 = _mm_andnot_ps(tmp1, zero); tmp3_2 = _mm_andnot_ps(tmp1_2, zero);
1543  tmp2 = _mm_xor_ps(tmp2, tmp3); tmp2_2 = _mm_xor_ps(tmp2_2, tmp3_2);
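  // Scalar equivalent per lane: tmp2 = wrap ? 2*pi : 0, followed below by
  //   if (phi > 3*pi/2) phi -= tmp2;
  // for each of the four phi values.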
1544 
1545  __m128 tmp4 = _mm_cmpgt_ps(phi_1, three_pi_over_two); __m128 tmp4_2 = _mm_cmpgt_ps(phi_1_2, three_pi_over_two);
1546  tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1547  __m128 tmp5 = _mm_andnot_ps(tmp4, zero); __m128 tmp5_2 = _mm_andnot_ps(tmp4_2, zero);
1548  tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1549  phi_1 = _mm_sub_ps(phi_1, tmp3); phi_1_2 = _mm_sub_ps(phi_1_2, tmp3_2);
1550 
1551  tmp4 = _mm_cmpgt_ps(phi_2, three_pi_over_two); tmp4_2 = _mm_cmpgt_ps(phi_2_2, three_pi_over_two);
1552  tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1553  tmp5 = _mm_andnot_ps(tmp4, zero); tmp5_2 = _mm_andnot_ps(tmp4_2, zero);
1554  tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1555  phi_2 = _mm_sub_ps(phi_2, tmp3); phi_2_2 = _mm_sub_ps(phi_2_2, tmp3_2);
1556 
1557  tmp4 = _mm_cmpgt_ps(phi_3, three_pi_over_two); tmp4_2 = _mm_cmpgt_ps(phi_3_2, three_pi_over_two);
1558  tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1559  tmp5 = _mm_andnot_ps(tmp4, zero); tmp5_2 = _mm_andnot_ps(tmp4_2, zero);
1560  tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1561  phi_3 = _mm_sub_ps(phi_3, tmp3); phi_3_2 = _mm_sub_ps(phi_3_2, tmp3_2);
1562 
1563  tmp4 = _mm_cmpgt_ps(phi_4, three_pi_over_two); tmp4_2 = _mm_cmpgt_ps(phi_4_2, three_pi_over_two);
1564  tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1565  tmp5 = _mm_andnot_ps(tmp4, zero); tmp5_2 = _mm_andnot_ps(tmp4_2, zero);
1566  tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1567  phi_4 = _mm_sub_ps(phi_4, tmp3); phi_4_2 = _mm_sub_ps(phi_4_2, tmp3_2);
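  // After the conditional shifts, values just below 2*pi in a wrapping
  // lane have become small negative angles, so the min/max scans below
  // yield a tight range across the 0/2*pi boundary instead of a spurious
  // near-[0, 2*pi] interval.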
1568 
1569 
1570  // find the minimum phi
1571  __m128 phi_min = phi_1; __m128 phi_min_2 = phi_1_2;
1572  tmp2 = _mm_cmplt_ps(phi_2, phi_min); tmp2_2 = _mm_cmplt_ps(phi_2_2, phi_min_2);
1573  tmp3 = _mm_and_ps(tmp2, phi_2); tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
1574  phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
1575  phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
1576  tmp2 = _mm_cmplt_ps(phi_3, phi_min); tmp2_2 = _mm_cmplt_ps(phi_3_2, phi_min_2);
1577  tmp3 = _mm_and_ps(tmp2, phi_3); tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
1578  phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
1579  phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
1580  tmp2 = _mm_cmplt_ps(phi_4, phi_min); tmp2_2 = _mm_cmplt_ps(phi_4_2, phi_min_2);
1581  tmp3 = _mm_and_ps(tmp2, phi_4); tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
1582  phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
1583  phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
1584 
1585  // find the maximum phi
1586  __m128 phi_max = phi_1; __m128 phi_max_2 = phi_1_2;
1587  tmp2 = _mm_cmpgt_ps(phi_2, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_2_2, phi_max_2);
1588  tmp3 = _mm_and_ps(tmp2, phi_2); tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
1589  phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
1590  phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
1591  tmp2 = _mm_cmpgt_ps(phi_3, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_3_2, phi_max_2);
1592  tmp3 = _mm_and_ps(tmp2, phi_3); tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
1593  phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
1594  phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
1595  tmp2 = _mm_cmpgt_ps(phi_4, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_4_2, phi_max_2);
1596  tmp3 = _mm_and_ps(tmp2, phi_4); tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
1597  phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
1598  phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
1599 
1600 
1601  _mm_store_ps(min_phi, phi_min); _mm_store_ps(min_phi_2, phi_min_2);
1602  _mm_store_ps(max_phi, phi_max); _mm_store_ps(max_phi_2, phi_max_2);
1603 }
1604 
1605