// File-scope SSE constants, initialized with GCC's vector-literal extension.
// NOTE(review): companion constants used below (zero, one, twopi) are declared
// on lines not visible in this chunk — confirm against the full file.
// Vector of 2.0f in all four lanes.
10 static const __m128
two = {2., 2., 2., 2.};
// Vector of 3*pi/2 (pi given as a hex float literal) in all four lanes.
11 static const __m128
three_pi_over_two = {3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f};
// Mask with only the IEEE-754 sign bit set in each 32-bit lane; built via an
// integer splat reinterpreted as float to avoid a -0.0f literal.
12 static const __m128
SIGNMASK = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
// Compute, for 4 hits at once (one per SSE lane), the range of track phi
// compatible with each hit over the (d, k) parameter box, where d is the
// signed distance-of-closest-approach and k the curvature. The four corners
// (min_d,min_k), (min_d,max_k), (max_d,max_k) are evaluated (see NOTE below),
// each yielding two circle/hit intersection solutions ("r" and "l" branches);
// the per-branch min/max over the corners is stored to min/max_phi_1 and
// min/max_phi_2. All float* arguments must be 16-byte aligned (_mm_load_ps /
// _mm_store_ps).
//
// NOTE(review): this chunk is a lossy extraction — the function's braces,
// the declarations of zero/one/twopi/tmp2/tmp3/tmp4/ak/d/k, a likely
// "D = _vec_sqrt_ps(D);" between original lines 32 and 34, a likely
// "D = _mm_mul_ps(x,x);" reset before original line 96, and the comparisons
// that feed the _mm_or_ps chains (original lines ~285-301, ~372-386) are not
// visible here. Comments below describe only what the visible lines do.
//
// Branch-free select idiom used throughout:
//   blend = (mask & a) ^ (~mask & 0) ^ ... == mask ? a : b
// i.e. and/andnot/xor in place of a conditional move.
14 void HelixHough::phiRange_sse(
float* hit_x,
float* hit_y,
float* min_d,
float* max_d,
float* min_k,
float* max_k,
float* min_phi_1,
float* max_phi_1,
float* min_phi_2,
float* max_phi_2)
16 __m128
x = _mm_load_ps(hit_x);
17 __m128
y = _mm_load_ps(hit_y);
// Azimuth of each hit; then wrap negative angles into [0, 2pi) by adding
// twopi where hit_phi < 0 (tmp2 = mask & twopi is on a line not visible here).
19 __m128 hit_phi = _vec_atan2_ps(y,x);
21 __m128
tmp1 = _mm_cmplt_ps(hit_phi,
zero);
23 tmp1 = _mm_andnot_ps(tmp1,
zero);
24 tmp1 = _mm_xor_ps(tmp1, tmp2);
25 hit_phi = _mm_add_ps(hit_phi, tmp1);
// ---- Corner 1: (d, k) = (min_d, min_k) ----
28 __m128
d = _mm_load_ps(min_d);
29 __m128
k = _mm_load_ps(min_k);
// D accumulates x^2 + y^2; presumably followed by a sqrt on the missing
// original line 33 so that D is the hit's distance from the origin — confirm.
30 __m128 D = _mm_mul_ps(x,x);
31 tmp1 = _mm_mul_ps(y,y);
32 D = _mm_add_ps(D,tmp1);
34 __m128 D_inv = _vec_rec_ps(D);
// ak = (2d + d^2*k + D^2*k) / D  (intermediate of the circle/hit geometry).
36 ak = _mm_mul_ps(d,
two);
37 tmp1 = _mm_mul_ps(d,d);
38 tmp1 = _mm_mul_ps(tmp1, k);
39 ak = _mm_add_ps(ak, tmp1);
40 tmp1 = _mm_mul_ps(D,D);
41 tmp1 = _mm_mul_ps(tmp1, k);
42 ak = _mm_add_ps(ak, tmp1);
43 ak = _mm_mul_ps(ak, D_inv);
// hk = sqrt((1 + d*k)^2 - ak^2); 'neg' marks lanes where the discriminant
// is <= 0 (no real intersection) — those lanes later fall back to hit_phi.
45 __m128 hk = _mm_mul_ps(d,k);
46 hk = _mm_add_ps(hk,
one);
47 hk = _mm_mul_ps(hk,hk);
48 tmp1 = _mm_mul_ps(ak,ak);
49 hk = _mm_sub_ps(hk, tmp1);
50 __m128 neg = _mm_cmple_ps(hk,
zero);
51 hk = _vec_sqrt_ps(hk);
// Two intersection solutions: (xk1,yk1) = (ak*x + hk*y, ak*y - hk*x)/D and
// (xk2,yk2) = (ak*x - hk*y, ak*y + hk*x)/D.
53 __m128 xk1 = _mm_mul_ps(ak, x);
54 tmp1 = _mm_mul_ps(hk,y);
55 __m128 xk2 = _mm_sub_ps(xk1, tmp1);
56 xk1 = _mm_add_ps(xk1, tmp1);
57 xk1 = _mm_mul_ps(xk1, D_inv);
58 xk2 = _mm_mul_ps(xk2, D_inv);
60 __m128 yk1 = _mm_mul_ps(ak, y);
61 tmp1 = _mm_mul_ps(hk,x);
62 __m128 yk2 = _mm_add_ps(yk1, tmp1);
63 yk1 = _mm_sub_ps(yk1, tmp1);
64 yk1 = _mm_mul_ps(yk1, D_inv);
65 yk2 = _mm_mul_ps(yk2, D_inv);
// phi for branch "r", corner 1: atan2 wrapped into [0, 2pi), then lanes with
// no real solution ('neg') replaced by hit_phi via the select idiom.
67 __m128 phi_r_1 = _vec_atan2_ps(yk1, xk1);
69 tmp1 = _mm_cmplt_ps(phi_r_1,
zero);
70 tmp2 = _mm_and_ps(tmp1,
twopi);
71 tmp1 = _mm_andnot_ps(tmp1,
zero);
72 tmp1 = _mm_xor_ps(tmp1, tmp2);
73 phi_r_1 = _mm_add_ps(phi_r_1, tmp1);
75 tmp1 = _mm_and_ps(neg, hit_phi);
76 phi_r_1 = _mm_andnot_ps(neg, phi_r_1);
77 phi_r_1 = _mm_xor_ps(tmp1, phi_r_1);
// phi for branch "l", corner 1: same wrap + fallback treatment.
79 __m128 phi_l_1 = _vec_atan2_ps(yk2, xk2);
81 tmp1 = _mm_cmplt_ps(phi_l_1,
zero);
82 tmp2 = _mm_and_ps(tmp1,
twopi);
83 tmp1 = _mm_andnot_ps(tmp1,
zero);
84 tmp1 = _mm_xor_ps(tmp1, tmp2);
85 phi_l_1 = _mm_add_ps(phi_l_1, tmp1);
87 tmp1 = _mm_and_ps(neg, hit_phi);
88 phi_l_1 = _mm_andnot_ps(neg, phi_l_1);
89 phi_l_1 = _mm_xor_ps(tmp1, phi_l_1);
// ---- Corner 2: (d, k) = (min_d, max_k); same pipeline as corner 1 ----
// NOTE(review): the "D = _mm_mul_ps(x,x);" reset (original line 95) is not
// visible here; as shown, D would keep accumulating — confirm in full file.
93 d = _mm_load_ps(min_d);
94 k = _mm_load_ps(max_k);
96 tmp1 = _mm_mul_ps(y,y);
97 D = _mm_add_ps(D,tmp1);
99 D_inv = _vec_rec_ps(D);
101 ak = _mm_mul_ps(d,
two);
102 tmp1 = _mm_mul_ps(d,d);
103 tmp1 = _mm_mul_ps(tmp1, k);
104 ak = _mm_add_ps(ak, tmp1);
105 tmp1 = _mm_mul_ps(D,D);
106 tmp1 = _mm_mul_ps(tmp1, k);
107 ak = _mm_add_ps(ak, tmp1);
108 ak = _mm_mul_ps(ak, D_inv);
110 hk = _mm_mul_ps(d,k);
111 hk = _mm_add_ps(hk,
one);
112 hk = _mm_mul_ps(hk,hk);
113 tmp1 = _mm_mul_ps(ak,ak);
114 hk = _mm_sub_ps(hk, tmp1);
115 neg = _mm_cmple_ps(hk,
zero);
116 hk = _vec_sqrt_ps(hk);
118 xk1 = _mm_mul_ps(ak, x);
119 tmp1 = _mm_mul_ps(hk,y);
120 xk2 = _mm_sub_ps(xk1, tmp1);
121 xk1 = _mm_add_ps(xk1, tmp1);
122 xk1 = _mm_mul_ps(xk1, D_inv);
123 xk2 = _mm_mul_ps(xk2, D_inv);
125 yk1 = _mm_mul_ps(ak, y);
126 tmp1 = _mm_mul_ps(hk,x);
127 yk2 = _mm_add_ps(yk1, tmp1);
128 yk1 = _mm_sub_ps(yk1, tmp1);
129 yk1 = _mm_mul_ps(yk1, D_inv);
130 yk2 = _mm_mul_ps(yk2, D_inv);
// phi_r_2 / phi_l_2: wrap to [0, 2pi) and apply the 'neg' fallback.
132 __m128 phi_r_2 = _vec_atan2_ps(yk1, xk1);
134 tmp1 = _mm_cmplt_ps(phi_r_2,
zero);
135 tmp2 = _mm_and_ps(tmp1,
twopi);
136 tmp1 = _mm_andnot_ps(tmp1,
zero);
137 tmp1 = _mm_xor_ps(tmp1, tmp2);
138 phi_r_2 = _mm_add_ps(phi_r_2, tmp1);
140 tmp1 = _mm_and_ps(neg, hit_phi);
141 phi_r_2 = _mm_andnot_ps(neg, phi_r_2);
142 phi_r_2 = _mm_xor_ps(tmp1, phi_r_2);
144 __m128 phi_l_2 = _vec_atan2_ps(yk2, xk2);
146 tmp1 = _mm_cmplt_ps(phi_l_2,
zero);
147 tmp2 = _mm_and_ps(tmp1,
twopi);
148 tmp1 = _mm_andnot_ps(tmp1,
zero);
149 tmp1 = _mm_xor_ps(tmp1, tmp2);
150 phi_l_2 = _mm_add_ps(phi_l_2, tmp1);
152 tmp1 = _mm_and_ps(neg, hit_phi);
153 phi_l_2 = _mm_andnot_ps(neg, phi_l_2);
154 phi_l_2 = _mm_xor_ps(tmp1, phi_l_2);
// ---- Corner 3: (d, k) = (max_d, max_k) ----
157 d = _mm_load_ps(max_d);
158 k = _mm_load_ps(max_k);
160 tmp1 = _mm_mul_ps(y,y);
161 D = _mm_add_ps(D,tmp1);
163 D_inv = _vec_rec_ps(D);
165 ak = _mm_mul_ps(d,
two);
166 tmp1 = _mm_mul_ps(d,d);
167 tmp1 = _mm_mul_ps(tmp1, k);
168 ak = _mm_add_ps(ak, tmp1);
169 tmp1 = _mm_mul_ps(D,D);
170 tmp1 = _mm_mul_ps(tmp1, k);
171 ak = _mm_add_ps(ak, tmp1);
172 ak = _mm_mul_ps(ak, D_inv);
174 hk = _mm_mul_ps(d,k);
175 hk = _mm_add_ps(hk,
one);
176 hk = _mm_mul_ps(hk,hk);
177 tmp1 = _mm_mul_ps(ak,ak);
178 hk = _mm_sub_ps(hk, tmp1);
179 neg = _mm_cmple_ps(hk,
zero);
180 hk = _vec_sqrt_ps(hk);
182 xk1 = _mm_mul_ps(ak, x);
183 tmp1 = _mm_mul_ps(hk,y);
184 xk2 = _mm_sub_ps(xk1, tmp1);
185 xk1 = _mm_add_ps(xk1, tmp1);
186 xk1 = _mm_mul_ps(xk1, D_inv);
187 xk2 = _mm_mul_ps(xk2, D_inv);
189 yk1 = _mm_mul_ps(ak, y);
190 tmp1 = _mm_mul_ps(hk,x);
191 yk2 = _mm_add_ps(yk1, tmp1);
192 yk1 = _mm_sub_ps(yk1, tmp1);
193 yk1 = _mm_mul_ps(yk1, D_inv);
194 yk2 = _mm_mul_ps(yk2, D_inv);
196 __m128 phi_r_3 = _vec_atan2_ps(yk1, xk1);
198 tmp1 = _mm_cmplt_ps(phi_r_3,
zero);
199 tmp2 = _mm_and_ps(tmp1,
twopi);
200 tmp1 = _mm_andnot_ps(tmp1,
zero);
201 tmp1 = _mm_xor_ps(tmp1, tmp2);
202 phi_r_3 = _mm_add_ps(phi_r_3, tmp1);
204 tmp1 = _mm_and_ps(neg, hit_phi);
205 phi_r_3 = _mm_andnot_ps(neg, phi_r_3);
206 phi_r_3 = _mm_xor_ps(tmp1, phi_r_3);
208 __m128 phi_l_3 = _vec_atan2_ps(yk2, xk2);
210 tmp1 = _mm_cmplt_ps(phi_l_3,
zero);
211 tmp2 = _mm_and_ps(tmp1,
twopi);
212 tmp1 = _mm_andnot_ps(tmp1,
zero);
213 tmp1 = _mm_xor_ps(tmp1, tmp2);
214 phi_l_3 = _mm_add_ps(phi_l_3, tmp1);
216 tmp1 = _mm_and_ps(neg, hit_phi);
217 phi_l_3 = _mm_andnot_ps(neg, phi_l_3);
218 phi_l_3 = _mm_xor_ps(tmp1, phi_l_3);
// ---- Corner 4: as visible, loads (max_d, max_k) again — presumably the
// intended corner is (max_d, min_k); the min_k load may be on a dropped
// line. TODO confirm against the full file. ----
221 d = _mm_load_ps(max_d);
222 k = _mm_load_ps(max_k);
224 tmp1 = _mm_mul_ps(y,y);
225 D = _mm_add_ps(D,tmp1);
227 D_inv = _vec_rec_ps(D);
229 ak = _mm_mul_ps(d,
two);
230 tmp1 = _mm_mul_ps(d,d);
231 tmp1 = _mm_mul_ps(tmp1, k);
232 ak = _mm_add_ps(ak, tmp1);
233 tmp1 = _mm_mul_ps(D,D);
234 tmp1 = _mm_mul_ps(tmp1, k);
235 ak = _mm_add_ps(ak, tmp1);
236 ak = _mm_mul_ps(ak, D_inv);
238 hk = _mm_mul_ps(d,k);
239 hk = _mm_add_ps(hk,
one);
240 hk = _mm_mul_ps(hk,hk);
241 tmp1 = _mm_mul_ps(ak,ak);
242 hk = _mm_sub_ps(hk, tmp1);
243 neg = _mm_cmple_ps(hk,
zero);
244 hk = _vec_sqrt_ps(hk);
246 xk1 = _mm_mul_ps(ak, x);
247 tmp1 = _mm_mul_ps(hk,y);
248 xk2 = _mm_sub_ps(xk1, tmp1);
249 xk1 = _mm_add_ps(xk1, tmp1);
250 xk1 = _mm_mul_ps(xk1, D_inv);
251 xk2 = _mm_mul_ps(xk2, D_inv);
253 yk1 = _mm_mul_ps(ak, y);
254 tmp1 = _mm_mul_ps(hk,x);
255 yk2 = _mm_add_ps(yk1, tmp1);
256 yk1 = _mm_sub_ps(yk1, tmp1);
257 yk1 = _mm_mul_ps(yk1, D_inv);
258 yk2 = _mm_mul_ps(yk2, D_inv);
260 __m128 phi_r_4 = _vec_atan2_ps(yk1, xk1);
262 tmp1 = _mm_cmplt_ps(phi_r_4,
zero);
263 tmp2 = _mm_and_ps(tmp1,
twopi);
264 tmp1 = _mm_andnot_ps(tmp1,
zero);
265 tmp1 = _mm_xor_ps(tmp1, tmp2);
266 phi_r_4 = _mm_add_ps(phi_r_4, tmp1);
268 tmp1 = _mm_and_ps(neg, hit_phi);
269 phi_r_4 = _mm_andnot_ps(neg, phi_r_4);
270 phi_r_4 = _mm_xor_ps(tmp1, phi_r_4);
272 __m128 phi_l_4 = _vec_atan2_ps(yk2, xk2);
274 tmp1 = _mm_cmplt_ps(phi_l_4,
zero);
275 tmp2 = _mm_and_ps(tmp1,
twopi);
276 tmp1 = _mm_andnot_ps(tmp1,
zero);
277 tmp1 = _mm_xor_ps(tmp1, tmp2);
278 phi_l_4 = _mm_add_ps(phi_l_4, tmp1);
280 tmp1 = _mm_and_ps(neg, hit_phi);
281 phi_l_4 = _mm_andnot_ps(neg, phi_l_4);
282 phi_l_4 = _mm_xor_ps(tmp1, phi_l_4);
// ---- Branch-cut detection for the "r" branch ----
// NOTE(review): the comparisons producing the tmp1/tmp2/tmp3 inputs of these
// OR chains (original lines ~285-301) are not visible in this chunk;
// presumably they test which phi_r_* lie near the 0/2pi seam.
289 tmp1 = _mm_or_ps(tmp1, tmp2);
291 tmp1 = _mm_or_ps(tmp1, tmp2);
293 tmp1 = _mm_or_ps(tmp1, tmp2);
297 tmp2 = _mm_or_ps(tmp2, tmp3);
299 tmp2 = _mm_or_ps(tmp2, tmp3);
301 tmp2 = _mm_or_ps(tmp2, tmp3);
303 tmp1 = _mm_and_ps(tmp1, tmp2);
// Where the range straddles the seam (tmp1), subtract twopi from the
// large-angle members (per-value masks tmp4 are built on lines not visible).
307 tmp2 = _mm_and_ps(tmp1,
twopi);
308 tmp3 = _mm_andnot_ps(tmp1,
zero);
309 tmp2 = _mm_xor_ps(tmp2, tmp3);
312 tmp3 = _mm_and_ps(tmp4, tmp2);
313 __m128
tmp5 = _mm_andnot_ps(tmp4,
zero);
314 tmp3 = _mm_xor_ps(tmp3, tmp5);
315 phi_r_1 = _mm_sub_ps(phi_r_1, tmp3);
318 tmp3 = _mm_and_ps(tmp4, tmp2);
319 tmp5 = _mm_andnot_ps(tmp4,
zero);
320 tmp3 = _mm_xor_ps(tmp3, tmp5);
321 phi_r_2 = _mm_sub_ps(phi_r_2, tmp3);
324 tmp3 = _mm_and_ps(tmp4, tmp2);
325 tmp5 = _mm_andnot_ps(tmp4,
zero);
326 tmp3 = _mm_xor_ps(tmp3, tmp5);
327 phi_r_3 = _mm_sub_ps(phi_r_3, tmp3);
330 tmp3 = _mm_and_ps(tmp4, tmp2);
331 tmp5 = _mm_andnot_ps(tmp4,
zero);
332 tmp3 = _mm_xor_ps(tmp3, tmp5);
333 phi_r_4 = _mm_sub_ps(phi_r_4, tmp3);
// Lane-wise min over the four "r" phis (branch-free blend per comparison).
337 __m128 phi_r_min = phi_r_1;
338 tmp2 = _mm_cmplt_ps(phi_r_2, phi_r_min);
339 tmp3 = _mm_and_ps(tmp2, phi_r_2);
340 phi_r_min = _mm_andnot_ps(tmp2, phi_r_min);
341 phi_r_min = _mm_xor_ps(phi_r_min, tmp3);
342 tmp2 = _mm_cmplt_ps(phi_r_3, phi_r_min);
343 tmp3 = _mm_and_ps(tmp2, phi_r_3);
344 phi_r_min = _mm_andnot_ps(tmp2, phi_r_min);
345 phi_r_min = _mm_xor_ps(phi_r_min, tmp3);
346 tmp2 = _mm_cmplt_ps(phi_r_4, phi_r_min);
347 tmp3 = _mm_and_ps(tmp2, phi_r_4);
348 phi_r_min = _mm_andnot_ps(tmp2, phi_r_min);
349 phi_r_min = _mm_xor_ps(phi_r_min, tmp3);
// Lane-wise max over the four "r" phis.
352 __m128 phi_r_max = phi_r_1;
353 tmp2 = _mm_cmpgt_ps(phi_r_2, phi_r_max);
354 tmp3 = _mm_and_ps(tmp2, phi_r_2);
355 phi_r_max = _mm_andnot_ps(tmp2, phi_r_max);
356 phi_r_max = _mm_xor_ps(phi_r_max, tmp3);
357 tmp2 = _mm_cmpgt_ps(phi_r_3, phi_r_max);
358 tmp3 = _mm_and_ps(tmp2, phi_r_3);
359 phi_r_max = _mm_andnot_ps(tmp2, phi_r_max);
360 phi_r_max = _mm_xor_ps(phi_r_max, tmp3);
361 tmp2 = _mm_cmpgt_ps(phi_r_4, phi_r_max);
362 tmp3 = _mm_and_ps(tmp2, phi_r_4);
363 phi_r_max = _mm_andnot_ps(tmp2, phi_r_max);
364 phi_r_max = _mm_xor_ps(phi_r_max, tmp3);
// Output range for the "r" intersection branch.
366 _mm_store_ps(min_phi_1, phi_r_min);
367 _mm_store_ps(max_phi_1, phi_r_max);
// ---- Same seam handling and min/max reduction for the "l" branch ----
// (feeding comparisons again on lines not visible in this chunk).
374 tmp1 = _mm_or_ps(tmp1, tmp2);
376 tmp1 = _mm_or_ps(tmp1, tmp2);
378 tmp1 = _mm_or_ps(tmp1, tmp2);
382 tmp2 = _mm_or_ps(tmp2, tmp3);
384 tmp2 = _mm_or_ps(tmp2, tmp3);
386 tmp2 = _mm_or_ps(tmp2, tmp3);
388 tmp1 = _mm_and_ps(tmp1, tmp2);
392 tmp2 = _mm_and_ps(tmp1,
twopi);
393 tmp3 = _mm_andnot_ps(tmp1,
zero);
394 tmp2 = _mm_xor_ps(tmp2, tmp3);
397 tmp3 = _mm_and_ps(tmp4, tmp2);
398 tmp5 = _mm_andnot_ps(tmp4,
zero);
399 tmp3 = _mm_xor_ps(tmp3, tmp5);
400 phi_l_1 = _mm_sub_ps(phi_l_1, tmp3);
403 tmp3 = _mm_and_ps(tmp4, tmp2);
404 tmp5 = _mm_andnot_ps(tmp4,
zero);
405 tmp3 = _mm_xor_ps(tmp3, tmp5);
406 phi_l_2 = _mm_sub_ps(phi_l_2, tmp3);
409 tmp3 = _mm_and_ps(tmp4, tmp2);
410 tmp5 = _mm_andnot_ps(tmp4,
zero);
411 tmp3 = _mm_xor_ps(tmp3, tmp5);
412 phi_l_3 = _mm_sub_ps(phi_l_3, tmp3);
415 tmp3 = _mm_and_ps(tmp4, tmp2);
416 tmp5 = _mm_andnot_ps(tmp4,
zero);
417 tmp3 = _mm_xor_ps(tmp3, tmp5);
418 phi_l_4 = _mm_sub_ps(phi_l_4, tmp3);
// Lane-wise min over the four "l" phis.
422 __m128 phi_l_min = phi_l_1;
423 tmp2 = _mm_cmplt_ps(phi_l_2, phi_l_min);
424 tmp3 = _mm_and_ps(tmp2, phi_l_2);
425 phi_l_min = _mm_andnot_ps(tmp2, phi_l_min);
426 phi_l_min = _mm_xor_ps(phi_l_min, tmp3);
427 tmp2 = _mm_cmplt_ps(phi_l_3, phi_l_min);
428 tmp3 = _mm_and_ps(tmp2, phi_l_3);
429 phi_l_min = _mm_andnot_ps(tmp2, phi_l_min);
430 phi_l_min = _mm_xor_ps(phi_l_min, tmp3);
431 tmp2 = _mm_cmplt_ps(phi_l_4, phi_l_min);
432 tmp3 = _mm_and_ps(tmp2, phi_l_4);
433 phi_l_min = _mm_andnot_ps(tmp2, phi_l_min);
434 phi_l_min = _mm_xor_ps(phi_l_min, tmp3);
// Lane-wise max over the four "l" phis.
437 __m128 phi_l_max = phi_l_1;
438 tmp2 = _mm_cmpgt_ps(phi_l_2, phi_l_max);
439 tmp3 = _mm_and_ps(tmp2, phi_l_2);
440 phi_l_max = _mm_andnot_ps(tmp2, phi_l_max);
441 phi_l_max = _mm_xor_ps(phi_l_max, tmp3);
442 tmp2 = _mm_cmpgt_ps(phi_l_3, phi_l_max);
443 tmp3 = _mm_and_ps(tmp2, phi_l_3);
444 phi_l_max = _mm_andnot_ps(tmp2, phi_l_max);
445 phi_l_max = _mm_xor_ps(phi_l_max, tmp3);
446 tmp2 = _mm_cmpgt_ps(phi_l_4, phi_l_max);
447 tmp3 = _mm_and_ps(tmp2, phi_l_4);
448 phi_l_max = _mm_andnot_ps(tmp2, phi_l_max);
449 phi_l_max = _mm_xor_ps(phi_l_max, tmp3);
// Output range for the "l" intersection branch.
451 _mm_store_ps(min_phi_2, phi_l_min);
452 _mm_store_ps(max_phi_2, phi_l_max);
// Body of a sign-comparison helper (its signature and return statement are on
// lines not visible in this chunk; callers use it as
// compare_sign(__m128 a, __m128 b)).
// Produces a per-lane mask: a ^ b has its sign bit set exactly where a and b
// have opposite signs; the arithmetic shift by 31 broadcasts that bit across
// the lane; XOR with all-ones inverts, so lanes where a and b share a sign
// become 0xffffffff and the rest become 0.
459 const __m128i MASK = _mm_set1_epi32(0xffffffff);
461 __m128 f = _mm_xor_ps(a,b);
462 __m128i i = _mm_castps_si128(f);
// Arithmetic right shift replicates the sign bit into all 32 bits.
464 i = _mm_srai_epi32(i,31);
465 i = _mm_xor_si128(i,MASK);
467 f = _mm_castsi128_ps(i);
// Overload: phi range for 4 hits over the (d, k) box, selecting at each
// corner the circle/hit intersection whose rotation sense matches the
// requested helicity 'hel' (broadcast to helicity_vec). phi_3/phi_4 are also
// exported via the __m128& out-parameters for reuse by a later call (see the
// sibling overload that takes phi_3/phi_4 as inputs). float* arguments must
// be 16-byte aligned.
//
// NOTE(review): lossy extraction — braces, declarations of
// zero/one/twopi/tmp2/tmp3/tmp4/ak/d/k, the per-corner d/k assignments
// (e.g. originals 494-495, 561-562, 617-618, 672-673), a probable
// "D = _vec_sqrt_ps(D);" near original 499, the comparisons feeding the
// _mm_or_ps chains (~727-741), and the "phi_3_out = phi_3;" /
// "phi_4_out = phi_4;" stores are not visible here. Comments describe only
// the visible lines. Select idiom: result = (mask & a) ^ (~mask & b).
473 void HelixHough::phiRange_sse(
float* hit_x,
float* hit_y,
float* min_d,
float* max_d,
float* min_k,
float* max_k,
float* min_phi,
float* max_phi,
float hel, __m128& phi_3_out, __m128& phi_4_out)
475 __m128 helicity_vec = _mm_load1_ps(&(hel));
477 __m128
x = _mm_load_ps(hit_x);
478 __m128
y = _mm_load_ps(hit_y);
480 __m128 d_min = _mm_load_ps(min_d);
481 __m128 d_max = _mm_load_ps(max_d);
482 __m128 k_min = _mm_load_ps(min_k);
483 __m128 k_max = _mm_load_ps(max_k);
// Hit azimuth wrapped into [0, 2pi) (tmp2 = mask & twopi on a missing line).
485 __m128 hit_phi = _vec_atan2_ps(y,x);
487 __m128
tmp1 = _mm_cmplt_ps(hit_phi,
zero);
489 tmp1 = _mm_andnot_ps(tmp1,
zero);
490 tmp1 = _mm_xor_ps(tmp1, tmp2);
491 hit_phi = _mm_add_ps(hit_phi, tmp1);
// ---- Corner 1 (d/k set on lines not visible): geometry intermediates ----
496 __m128 D = _mm_mul_ps(x,x);
497 tmp1 = _mm_mul_ps(y,y);
498 D = _mm_add_ps(D,tmp1);
500 __m128 D_inv = _vec_rec_ps(D);
// ak = (2d + d^2*k + D^2*k) / D
502 ak = _mm_mul_ps(d,
two);
503 tmp1 = _mm_mul_ps(d,d);
504 tmp1 = _mm_mul_ps(tmp1, k);
505 ak = _mm_add_ps(ak, tmp1);
506 tmp1 = _mm_mul_ps(D,D);
507 tmp1 = _mm_mul_ps(tmp1, k);
508 ak = _mm_add_ps(ak, tmp1);
509 ak = _mm_mul_ps(ak, D_inv);
// hk = sqrt((1 + d*k)^2 - ak^2); 'neg' marks no-real-solution lanes.
511 __m128 hk = _mm_mul_ps(d,k);
512 hk = _mm_add_ps(hk,
one);
513 hk = _mm_mul_ps(hk,hk);
514 tmp1 = _mm_mul_ps(ak,ak);
515 hk = _mm_sub_ps(hk, tmp1);
516 __m128 neg = _mm_cmple_ps(hk,
zero);
517 hk = _vec_sqrt_ps(hk);
// Two intersection candidates (xk1,yk1) / (xk2,yk2).
519 __m128 xk1 = _mm_mul_ps(ak, x);
520 tmp1 = _mm_mul_ps(hk,y);
521 __m128 xk2 = _mm_sub_ps(xk1, tmp1);
522 xk1 = _mm_add_ps(xk1, tmp1);
523 xk1 = _mm_mul_ps(xk1, D_inv);
524 xk2 = _mm_mul_ps(xk2, D_inv);
526 __m128 yk1 = _mm_mul_ps(ak, y);
527 tmp1 = _mm_mul_ps(hk,x);
528 __m128 yk2 = _mm_add_ps(yk1, tmp1);
529 yk1 = _mm_sub_ps(yk1, tmp1);
530 yk1 = _mm_mul_ps(yk1, D_inv);
531 yk2 = _mm_mul_ps(yk2, D_inv);
// Sign of the 2D cross product x*yk1 - y*xk1 gives candidate 1's rotation
// sense; lanes where it matches 'hel' (via compare_sign) pick candidate 1,
// the rest pick candidate 2. This mask is computed once and reused for all
// four corners.
533 __m128 crossproduct = _mm_mul_ps(x, yk1);
534 __m128 crosstemp = _mm_mul_ps(y, xk1);
535 crossproduct = _mm_sub_ps(crossproduct, crosstemp);
536 __m128 correct_helicity =
compare_sign(crossproduct, helicity_vec);
538 __m128 xk = _mm_and_ps(correct_helicity, xk1);
539 tmp1 = _mm_andnot_ps(correct_helicity, xk2);
540 xk = _mm_xor_ps(xk, tmp1);
541 __m128 yk = _mm_and_ps(correct_helicity, yk1);
542 tmp1 = _mm_andnot_ps(correct_helicity, yk2);
543 yk = _mm_xor_ps(yk, tmp1);
// phi_1: wrap to [0, 2pi), then 'neg' lanes fall back to hit_phi.
546 __m128
phi_1 = _vec_atan2_ps(yk, xk);
548 tmp1 = _mm_cmplt_ps(phi_1,
zero);
549 tmp2 = _mm_and_ps(tmp1,
twopi);
550 tmp1 = _mm_andnot_ps(tmp1,
zero);
551 tmp1 = _mm_xor_ps(tmp1, tmp2);
552 phi_1 = _mm_add_ps(phi_1, tmp1);
554 tmp1 = _mm_and_ps(neg, hit_phi);
555 phi_1 = _mm_andnot_ps(neg, phi_1);
556 phi_1 = _mm_xor_ps(tmp1, phi_1);
// ---- Corner 2 (d/k reassigned on lines not visible) ----
563 ak = _mm_mul_ps(d,
two);
564 tmp1 = _mm_mul_ps(d,d);
565 tmp1 = _mm_mul_ps(tmp1, k);
566 ak = _mm_add_ps(ak, tmp1);
567 tmp1 = _mm_mul_ps(D,D);
568 tmp1 = _mm_mul_ps(tmp1, k);
569 ak = _mm_add_ps(ak, tmp1);
570 ak = _mm_mul_ps(ak, D_inv);
572 hk = _mm_mul_ps(d,k);
573 hk = _mm_add_ps(hk,
one);
574 hk = _mm_mul_ps(hk,hk);
575 tmp1 = _mm_mul_ps(ak,ak);
576 hk = _mm_sub_ps(hk, tmp1);
577 neg = _mm_cmple_ps(hk,
zero);
578 hk = _vec_sqrt_ps(hk);
580 xk1 = _mm_mul_ps(ak, x);
581 tmp1 = _mm_mul_ps(hk,y);
582 xk2 = _mm_sub_ps(xk1, tmp1);
583 xk1 = _mm_add_ps(xk1, tmp1);
584 xk1 = _mm_mul_ps(xk1, D_inv);
585 xk2 = _mm_mul_ps(xk2, D_inv);
587 yk1 = _mm_mul_ps(ak, y);
588 tmp1 = _mm_mul_ps(hk,x);
589 yk2 = _mm_add_ps(yk1, tmp1);
590 yk1 = _mm_sub_ps(yk1, tmp1);
591 yk1 = _mm_mul_ps(yk1, D_inv);
592 yk2 = _mm_mul_ps(yk2, D_inv);
// Helicity-based candidate selection (mask reused from corner 1).
594 xk = _mm_and_ps(correct_helicity, xk1);
595 tmp1 = _mm_andnot_ps(correct_helicity, xk2);
596 xk = _mm_xor_ps(xk, tmp1);
597 yk = _mm_and_ps(correct_helicity, yk1);
598 tmp1 = _mm_andnot_ps(correct_helicity, yk2);
599 yk = _mm_xor_ps(yk, tmp1);
601 __m128
phi_2 = _vec_atan2_ps(yk, xk);
603 tmp1 = _mm_cmplt_ps(phi_2,
zero);
604 tmp2 = _mm_and_ps(tmp1,
twopi);
605 tmp1 = _mm_andnot_ps(tmp1,
zero);
606 tmp1 = _mm_xor_ps(tmp1, tmp2);
607 phi_2 = _mm_add_ps(phi_2, tmp1);
609 tmp1 = _mm_and_ps(neg, hit_phi);
610 phi_2 = _mm_andnot_ps(neg, phi_2);
611 phi_2 = _mm_xor_ps(tmp1, phi_2);
// ---- Corner 3 ----
619 ak = _mm_mul_ps(d,
two);
620 tmp1 = _mm_mul_ps(d,d);
621 tmp1 = _mm_mul_ps(tmp1, k);
622 ak = _mm_add_ps(ak, tmp1);
623 tmp1 = _mm_mul_ps(D,D);
624 tmp1 = _mm_mul_ps(tmp1, k);
625 ak = _mm_add_ps(ak, tmp1);
626 ak = _mm_mul_ps(ak, D_inv);
628 hk = _mm_mul_ps(d,k);
629 hk = _mm_add_ps(hk,
one);
630 hk = _mm_mul_ps(hk,hk);
631 tmp1 = _mm_mul_ps(ak,ak);
632 hk = _mm_sub_ps(hk, tmp1);
633 neg = _mm_cmple_ps(hk,
zero);
634 hk = _vec_sqrt_ps(hk);
636 xk1 = _mm_mul_ps(ak, x);
637 tmp1 = _mm_mul_ps(hk,y);
638 xk2 = _mm_sub_ps(xk1, tmp1);
639 xk1 = _mm_add_ps(xk1, tmp1);
640 xk1 = _mm_mul_ps(xk1, D_inv);
641 xk2 = _mm_mul_ps(xk2, D_inv);
643 yk1 = _mm_mul_ps(ak, y);
644 tmp1 = _mm_mul_ps(hk,x);
645 yk2 = _mm_add_ps(yk1, tmp1);
646 yk1 = _mm_sub_ps(yk1, tmp1);
647 yk1 = _mm_mul_ps(yk1, D_inv);
648 yk2 = _mm_mul_ps(yk2, D_inv);
650 xk = _mm_and_ps(correct_helicity, xk1);
651 tmp1 = _mm_andnot_ps(correct_helicity, xk2);
652 xk = _mm_xor_ps(xk, tmp1);
653 yk = _mm_and_ps(correct_helicity, yk1);
654 tmp1 = _mm_andnot_ps(correct_helicity, yk2);
655 yk = _mm_xor_ps(yk, tmp1);
657 __m128 phi_3 = _vec_atan2_ps(yk, xk);
659 tmp1 = _mm_cmplt_ps(phi_3,
zero);
660 tmp2 = _mm_and_ps(tmp1,
twopi);
661 tmp1 = _mm_andnot_ps(tmp1,
zero);
662 tmp1 = _mm_xor_ps(tmp1, tmp2);
663 phi_3 = _mm_add_ps(phi_3, tmp1);
665 tmp1 = _mm_and_ps(neg, hit_phi);
666 phi_3 = _mm_andnot_ps(neg, phi_3);
667 phi_3 = _mm_xor_ps(tmp1, phi_3);
// ---- Corner 4 ----
674 ak = _mm_mul_ps(d,
two);
675 tmp1 = _mm_mul_ps(d,d);
676 tmp1 = _mm_mul_ps(tmp1, k);
677 ak = _mm_add_ps(ak, tmp1);
678 tmp1 = _mm_mul_ps(D,D);
679 tmp1 = _mm_mul_ps(tmp1, k);
680 ak = _mm_add_ps(ak, tmp1);
681 ak = _mm_mul_ps(ak, D_inv);
683 hk = _mm_mul_ps(d,k);
684 hk = _mm_add_ps(hk,
one);
685 hk = _mm_mul_ps(hk,hk);
686 tmp1 = _mm_mul_ps(ak,ak);
687 hk = _mm_sub_ps(hk, tmp1);
688 neg = _mm_cmple_ps(hk,
zero);
689 hk = _vec_sqrt_ps(hk);
691 xk1 = _mm_mul_ps(ak, x);
692 tmp1 = _mm_mul_ps(hk,y);
693 xk2 = _mm_sub_ps(xk1, tmp1);
694 xk1 = _mm_add_ps(xk1, tmp1);
695 xk1 = _mm_mul_ps(xk1, D_inv);
696 xk2 = _mm_mul_ps(xk2, D_inv);
698 yk1 = _mm_mul_ps(ak, y);
699 tmp1 = _mm_mul_ps(hk,x);
700 yk2 = _mm_add_ps(yk1, tmp1);
701 yk1 = _mm_sub_ps(yk1, tmp1);
702 yk1 = _mm_mul_ps(yk1, D_inv);
703 yk2 = _mm_mul_ps(yk2, D_inv);
705 xk = _mm_and_ps(correct_helicity, xk1);
706 tmp1 = _mm_andnot_ps(correct_helicity, xk2);
707 xk = _mm_xor_ps(xk, tmp1);
708 yk = _mm_and_ps(correct_helicity, yk1);
709 tmp1 = _mm_andnot_ps(correct_helicity, yk2);
710 yk = _mm_xor_ps(yk, tmp1);
712 __m128 phi_4 = _vec_atan2_ps(yk, xk);
714 tmp1 = _mm_cmplt_ps(phi_4,
zero);
715 tmp2 = _mm_and_ps(tmp1,
twopi);
716 tmp1 = _mm_andnot_ps(tmp1,
zero);
717 tmp1 = _mm_xor_ps(tmp1, tmp2);
718 phi_4 = _mm_add_ps(phi_4, tmp1);
720 tmp1 = _mm_and_ps(neg, hit_phi);
721 phi_4 = _mm_andnot_ps(neg, phi_4);
722 phi_4 = _mm_xor_ps(tmp1, phi_4);
// ---- 0/2pi seam detection (feeding comparisons on lines not visible) ----
729 tmp1 = _mm_or_ps(tmp1, tmp2);
731 tmp1 = _mm_or_ps(tmp1, tmp2);
733 tmp1 = _mm_or_ps(tmp1, tmp2);
737 tmp2 = _mm_or_ps(tmp2, tmp3);
739 tmp2 = _mm_or_ps(tmp2, tmp3);
741 tmp2 = _mm_or_ps(tmp2, tmp3);
743 tmp1 = _mm_and_ps(tmp1, tmp2);
// Subtract twopi from the large-angle phis in seam-straddling lanes
// (per-value masks tmp4 are built on lines not visible here).
747 tmp2 = _mm_and_ps(tmp1,
twopi);
748 tmp3 = _mm_andnot_ps(tmp1,
zero);
749 tmp2 = _mm_xor_ps(tmp2, tmp3);
752 tmp3 = _mm_and_ps(tmp4, tmp2);
753 __m128
tmp5 = _mm_andnot_ps(tmp4,
zero);
754 tmp3 = _mm_xor_ps(tmp3, tmp5);
755 phi_1 = _mm_sub_ps(phi_1, tmp3);
758 tmp3 = _mm_and_ps(tmp4, tmp2);
759 tmp5 = _mm_andnot_ps(tmp4,
zero);
760 tmp3 = _mm_xor_ps(tmp3, tmp5);
761 phi_2 = _mm_sub_ps(phi_2, tmp3);
764 tmp3 = _mm_and_ps(tmp4, tmp2);
765 tmp5 = _mm_andnot_ps(tmp4,
zero);
766 tmp3 = _mm_xor_ps(tmp3, tmp5);
767 phi_3 = _mm_sub_ps(phi_3, tmp3);
770 tmp3 = _mm_and_ps(tmp4, tmp2);
771 tmp5 = _mm_andnot_ps(tmp4,
zero);
772 tmp3 = _mm_xor_ps(tmp3, tmp5);
773 phi_4 = _mm_sub_ps(phi_4, tmp3);
// Lane-wise minimum of phi_1..phi_4 (branch-free blends).
777 __m128 phi_min =
phi_1;
778 tmp2 = _mm_cmplt_ps(phi_2, phi_min);
779 tmp3 = _mm_and_ps(tmp2, phi_2);
780 phi_min = _mm_andnot_ps(tmp2, phi_min);
781 phi_min = _mm_xor_ps(phi_min, tmp3);
782 tmp2 = _mm_cmplt_ps(phi_3, phi_min);
783 tmp3 = _mm_and_ps(tmp2, phi_3);
784 phi_min = _mm_andnot_ps(tmp2, phi_min);
785 phi_min = _mm_xor_ps(phi_min, tmp3);
786 tmp2 = _mm_cmplt_ps(phi_4, phi_min);
787 tmp3 = _mm_and_ps(tmp2, phi_4);
788 phi_min = _mm_andnot_ps(tmp2, phi_min);
789 phi_min = _mm_xor_ps(phi_min, tmp3);
// Lane-wise maximum of phi_1..phi_4.
792 __m128 phi_max =
phi_1;
793 tmp2 = _mm_cmpgt_ps(phi_2, phi_max);
794 tmp3 = _mm_and_ps(tmp2, phi_2);
795 phi_max = _mm_andnot_ps(tmp2, phi_max);
796 phi_max = _mm_xor_ps(phi_max, tmp3);
797 tmp2 = _mm_cmpgt_ps(phi_3, phi_max);
798 tmp3 = _mm_and_ps(tmp2, phi_3);
799 phi_max = _mm_andnot_ps(tmp2, phi_max);
800 phi_max = _mm_xor_ps(phi_max, tmp3);
801 tmp2 = _mm_cmpgt_ps(phi_4, phi_max);
802 tmp3 = _mm_and_ps(tmp2, phi_4);
803 phi_max = _mm_andnot_ps(tmp2, phi_max);
804 phi_max = _mm_xor_ps(phi_max, tmp3);
// Store the final per-hit phi window.
807 _mm_store_ps(min_phi, phi_min);
808 _mm_store_ps(max_phi, phi_max);
// Overload: phi range for 4 hits where two of the four corner phis
// (phi_3, phi_4) were already computed by a previous call on a neighboring
// parameter box and are passed in by reference — only phi_1 and phi_2 are
// recomputed here, halving the per-box work. Candidate selection again uses
// the helicity sign. float* arguments must be 16-byte aligned.
//
// NOTE(review): lossy extraction — braces, declarations of
// zero/one/twopi/tmp2/tmp3/tmp4/ak/d/k, per-corner d/k assignments, a
// probable "D = _vec_sqrt_ps(D);" near original 837, the comparisons feeding
// the _mm_or_ps chains (~953-967), and any trailing phi_3_out/phi_4_out
// stores are not visible here. Comments describe only the visible lines.
// Select idiom throughout: result = (mask & a) ^ (~mask & b).
812 void HelixHough::phiRange_sse(
float* hit_x,
float* hit_y,
float* min_d,
float* max_d,
float* max_k,
float* min_phi,
float* max_phi,
float hel, __m128& phi_3, __m128& phi_4, __m128& phi_3_out, __m128& phi_4_out)
814 __m128 helicity_vec = _mm_load1_ps(&(hel));
816 __m128
x = _mm_load_ps(hit_x);
817 __m128
y = _mm_load_ps(hit_y);
819 __m128 d_min = _mm_load_ps(min_d);
820 __m128 d_max = _mm_load_ps(max_d);
821 __m128 k_max = _mm_load_ps(max_k);
// Hit azimuth wrapped into [0, 2pi) (tmp2 = mask & twopi on a missing line).
823 __m128 hit_phi = _vec_atan2_ps(y,x);
825 __m128
tmp1 = _mm_cmplt_ps(hit_phi,
zero);
827 tmp1 = _mm_andnot_ps(tmp1,
zero);
828 tmp1 = _mm_xor_ps(tmp1, tmp2);
829 hit_phi = _mm_add_ps(hit_phi, tmp1);
// ---- Corner 1 (d/k set on lines not visible): circle/hit geometry ----
834 __m128 D = _mm_mul_ps(x,x);
835 tmp1 = _mm_mul_ps(y,y);
836 D = _mm_add_ps(D,tmp1);
838 __m128 D_inv = _vec_rec_ps(D);
// ak = (2d + d^2*k + D^2*k) / D
840 ak = _mm_mul_ps(d,
two);
841 tmp1 = _mm_mul_ps(d,d);
842 tmp1 = _mm_mul_ps(tmp1, k);
843 ak = _mm_add_ps(ak, tmp1);
844 tmp1 = _mm_mul_ps(D,D);
845 tmp1 = _mm_mul_ps(tmp1, k);
846 ak = _mm_add_ps(ak, tmp1);
847 ak = _mm_mul_ps(ak, D_inv);
// hk = sqrt((1 + d*k)^2 - ak^2); 'neg' marks no-real-solution lanes.
849 __m128 hk = _mm_mul_ps(d,k);
850 hk = _mm_add_ps(hk,
one);
851 hk = _mm_mul_ps(hk,hk);
852 tmp1 = _mm_mul_ps(ak,ak);
853 hk = _mm_sub_ps(hk, tmp1);
854 __m128 neg = _mm_cmple_ps(hk,
zero);
855 hk = _vec_sqrt_ps(hk);
// Two intersection candidates.
857 __m128 xk1 = _mm_mul_ps(ak, x);
858 tmp1 = _mm_mul_ps(hk,y);
859 __m128 xk2 = _mm_sub_ps(xk1, tmp1);
860 xk1 = _mm_add_ps(xk1, tmp1);
861 xk1 = _mm_mul_ps(xk1, D_inv);
862 xk2 = _mm_mul_ps(xk2, D_inv);
864 __m128 yk1 = _mm_mul_ps(ak, y);
865 tmp1 = _mm_mul_ps(hk,x);
866 __m128 yk2 = _mm_add_ps(yk1, tmp1);
867 yk1 = _mm_sub_ps(yk1, tmp1);
868 yk1 = _mm_mul_ps(yk1, D_inv);
869 yk2 = _mm_mul_ps(yk2, D_inv);
// Rotation sense of candidate 1 via 2D cross product; pick the candidate
// whose sense matches 'hel' (mask reused for corner 2).
871 __m128 crossproduct = _mm_mul_ps(x, yk1);
872 __m128 crosstemp = _mm_mul_ps(y, xk1);
873 crossproduct = _mm_sub_ps(crossproduct, crosstemp);
874 __m128 correct_helicity =
compare_sign(crossproduct, helicity_vec);
876 __m128 xk = _mm_and_ps(correct_helicity, xk1);
877 tmp1 = _mm_andnot_ps(correct_helicity, xk2);
878 xk = _mm_xor_ps(xk, tmp1);
879 __m128 yk = _mm_and_ps(correct_helicity, yk1);
880 tmp1 = _mm_andnot_ps(correct_helicity, yk2);
881 yk = _mm_xor_ps(yk, tmp1);
// phi_1: wrap to [0, 2pi); 'neg' lanes fall back to hit_phi.
884 __m128
phi_1 = _vec_atan2_ps(yk, xk);
886 tmp1 = _mm_cmplt_ps(phi_1,
zero);
887 tmp2 = _mm_and_ps(tmp1,
twopi);
888 tmp1 = _mm_andnot_ps(tmp1,
zero);
889 tmp1 = _mm_xor_ps(tmp1, tmp2);
890 phi_1 = _mm_add_ps(phi_1, tmp1);
892 tmp1 = _mm_and_ps(neg, hit_phi);
893 phi_1 = _mm_andnot_ps(neg, phi_1);
894 phi_1 = _mm_xor_ps(tmp1, phi_1);
// ---- Corner 2 (d/k reassigned on lines not visible) ----
901 ak = _mm_mul_ps(d,
two);
902 tmp1 = _mm_mul_ps(d,d);
903 tmp1 = _mm_mul_ps(tmp1, k);
904 ak = _mm_add_ps(ak, tmp1);
905 tmp1 = _mm_mul_ps(D,D);
906 tmp1 = _mm_mul_ps(tmp1, k);
907 ak = _mm_add_ps(ak, tmp1);
908 ak = _mm_mul_ps(ak, D_inv);
910 hk = _mm_mul_ps(d,k);
911 hk = _mm_add_ps(hk,
one);
912 hk = _mm_mul_ps(hk,hk);
913 tmp1 = _mm_mul_ps(ak,ak);
914 hk = _mm_sub_ps(hk, tmp1);
915 neg = _mm_cmple_ps(hk,
zero);
916 hk = _vec_sqrt_ps(hk);
918 xk1 = _mm_mul_ps(ak, x);
919 tmp1 = _mm_mul_ps(hk,y);
920 xk2 = _mm_sub_ps(xk1, tmp1);
921 xk1 = _mm_add_ps(xk1, tmp1);
922 xk1 = _mm_mul_ps(xk1, D_inv);
923 xk2 = _mm_mul_ps(xk2, D_inv);
925 yk1 = _mm_mul_ps(ak, y);
926 tmp1 = _mm_mul_ps(hk,x);
927 yk2 = _mm_add_ps(yk1, tmp1);
928 yk1 = _mm_sub_ps(yk1, tmp1);
929 yk1 = _mm_mul_ps(yk1, D_inv);
930 yk2 = _mm_mul_ps(yk2, D_inv);
932 xk = _mm_and_ps(correct_helicity, xk1);
933 tmp1 = _mm_andnot_ps(correct_helicity, xk2);
934 xk = _mm_xor_ps(xk, tmp1);
935 yk = _mm_and_ps(correct_helicity, yk1);
936 tmp1 = _mm_andnot_ps(correct_helicity, yk2);
937 yk = _mm_xor_ps(yk, tmp1);
939 __m128
phi_2 = _vec_atan2_ps(yk, xk);
941 tmp1 = _mm_cmplt_ps(phi_2,
zero);
942 tmp2 = _mm_and_ps(tmp1,
twopi);
943 tmp1 = _mm_andnot_ps(tmp1,
zero);
944 tmp1 = _mm_xor_ps(tmp1, tmp2);
945 phi_2 = _mm_add_ps(phi_2, tmp1);
947 tmp1 = _mm_and_ps(neg, hit_phi);
948 phi_2 = _mm_andnot_ps(neg, phi_2);
949 phi_2 = _mm_xor_ps(tmp1, phi_2);
// ---- 0/2pi seam detection over phi_1, phi_2 and the caller-supplied
// phi_3, phi_4 (feeding comparisons on lines not visible) ----
955 tmp1 = _mm_or_ps(tmp1, tmp2);
957 tmp1 = _mm_or_ps(tmp1, tmp2);
959 tmp1 = _mm_or_ps(tmp1, tmp2);
963 tmp2 = _mm_or_ps(tmp2, tmp3);
965 tmp2 = _mm_or_ps(tmp2, tmp3);
967 tmp2 = _mm_or_ps(tmp2, tmp3);
969 tmp1 = _mm_and_ps(tmp1, tmp2);
// Subtract twopi from large-angle phis in seam-straddling lanes. Note this
// writes through the phi_3/phi_4 references, mutating the caller's values.
973 tmp2 = _mm_and_ps(tmp1,
twopi);
974 tmp3 = _mm_andnot_ps(tmp1,
zero);
975 tmp2 = _mm_xor_ps(tmp2, tmp3);
978 tmp3 = _mm_and_ps(tmp4, tmp2);
979 __m128
tmp5 = _mm_andnot_ps(tmp4,
zero);
980 tmp3 = _mm_xor_ps(tmp3, tmp5);
981 phi_1 = _mm_sub_ps(phi_1, tmp3);
984 tmp3 = _mm_and_ps(tmp4, tmp2);
985 tmp5 = _mm_andnot_ps(tmp4,
zero);
986 tmp3 = _mm_xor_ps(tmp3, tmp5);
987 phi_2 = _mm_sub_ps(phi_2, tmp3);
990 tmp3 = _mm_and_ps(tmp4, tmp2);
991 tmp5 = _mm_andnot_ps(tmp4,
zero);
992 tmp3 = _mm_xor_ps(tmp3, tmp5);
993 phi_3 = _mm_sub_ps(phi_3, tmp3);
996 tmp3 = _mm_and_ps(tmp4, tmp2);
997 tmp5 = _mm_andnot_ps(tmp4,
zero);
998 tmp3 = _mm_xor_ps(tmp3, tmp5);
999 phi_4 = _mm_sub_ps(phi_4, tmp3);
// Lane-wise minimum of phi_1..phi_4 (branch-free blends).
1003 __m128 phi_min =
phi_1;
1004 tmp2 = _mm_cmplt_ps(phi_2, phi_min);
1005 tmp3 = _mm_and_ps(tmp2, phi_2);
1006 phi_min = _mm_andnot_ps(tmp2, phi_min);
1007 phi_min = _mm_xor_ps(phi_min, tmp3);
1008 tmp2 = _mm_cmplt_ps(phi_3, phi_min);
1009 tmp3 = _mm_and_ps(tmp2, phi_3);
1010 phi_min = _mm_andnot_ps(tmp2, phi_min);
1011 phi_min = _mm_xor_ps(phi_min, tmp3);
1012 tmp2 = _mm_cmplt_ps(phi_4, phi_min);
1013 tmp3 = _mm_and_ps(tmp2, phi_4);
1014 phi_min = _mm_andnot_ps(tmp2, phi_min);
1015 phi_min = _mm_xor_ps(phi_min, tmp3);
// Lane-wise maximum of phi_1..phi_4.
1018 __m128 phi_max =
phi_1;
1019 tmp2 = _mm_cmpgt_ps(phi_2, phi_max);
1020 tmp3 = _mm_and_ps(tmp2, phi_2);
1021 phi_max = _mm_andnot_ps(tmp2, phi_max);
1022 phi_max = _mm_xor_ps(phi_max, tmp3);
1023 tmp2 = _mm_cmpgt_ps(phi_3, phi_max);
1024 tmp3 = _mm_and_ps(tmp2, phi_3);
1025 phi_max = _mm_andnot_ps(tmp2, phi_max);
1026 phi_max = _mm_xor_ps(phi_max, tmp3);
1027 tmp2 = _mm_cmpgt_ps(phi_4, phi_max);
1028 tmp3 = _mm_and_ps(tmp2, phi_4);
1029 phi_max = _mm_andnot_ps(tmp2, phi_max);
1030 phi_max = _mm_xor_ps(phi_max, tmp3);
// Store the final per-hit phi window.
1033 _mm_store_ps(min_phi, phi_min);
1034 _mm_store_ps(max_phi, phi_max);
// ---------------------------------------------------------------------------
// NOTE(review): this chunk is a damaged extraction of the original source.
// Braces, blank lines, and several statements are missing, and the leading
// "10xx/11xx" numbers on many lines are remnants of the original file's line
// numbering, not code.  In particular, the statements that set `d` and `k`
// before each of the four solution passes, and the comparisons that produce
// the masks `tmp4` (and the OR-chains near "1294-1308"), were lost.  Code is
// kept byte-identical here; only comments are added.  Reconcile against the
// pristine file before compiling.
//
// Purpose (as evidenced by the visible code): for four hits at a time (one
// per SSE lane), plus a second batch of four hits processed in lockstep (all
// `_2`-suffixed variables), intersect a helix circle -- parameterized by a
// distance-of-closest-approach `d` and curvature `k` -- with each hit's
// position, pick the intersection consistent with the requested helicity
// `hel`, and store per-lane [phi_min, phi_max] windows to min_phi/max_phi
// (and min_phi_2/max_phi_2 for the second batch).  phi_3_out/phi_4_out
// export the first two phi solutions to the caller by reference.
// ---------------------------------------------------------------------------
1038 void HelixHough::phiRange_sse(
float* hit_x,
float* hit_y,
float* min_d,
float* max_d,
float* min_k,
float* max_k,
float* min_phi,
float* max_phi,
float* min_phi_2,
float* max_phi_2,
float hel, __m128& phi_3_out, __m128& phi_4_out,
float* hit_x_2,
float* hit_y_2, __m128& phi_3_out_2, __m128& phi_4_out_2)
// broadcast the scalar helicity sign into all four lanes
1040 __m128 helicity_vec = _mm_load1_ps(&(hel));
1042 __m128
x = _mm_load_ps(hit_x); __m128 x_2 = _mm_load_ps(hit_x_2);
1043 __m128
y = _mm_load_ps(hit_y); __m128 y_2 = _mm_load_ps(hit_y_2);
// d/k range endpoints; NOTE(review): the statements that copy these into the
// working `d` and `k` used below are missing from this extraction
1045 __m128 d_min = _mm_load_ps(min_d);
1046 __m128 d_max = _mm_load_ps(max_d);
1047 __m128 k_min = _mm_load_ps(min_k);
1048 __m128 k_max = _mm_load_ps(max_k);
// hit azimuth; the and/andnot/xor triple below emulates a per-lane select to
// branchlessly add 2*pi where phi < 0, wrapping (-pi,pi] into [0, 2*pi)
1050 __m128 hit_phi = _vec_atan2_ps(y,x); __m128 hit_phi_2 = _vec_atan2_ps(y_2,x_2);
1052 __m128
tmp1 = _mm_cmplt_ps(hit_phi,
zero); __m128 tmp1_2 = _mm_cmplt_ps(hit_phi_2,
zero);
1053 __m128
tmp2 = _mm_and_ps(tmp1,
twopi); __m128 tmp2_2 = _mm_and_ps(tmp1_2,
twopi);
1054 tmp1 = _mm_andnot_ps(tmp1,
zero); tmp1_2 = _mm_andnot_ps(tmp1_2,
zero);
1055 tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
1056 hit_phi = _mm_add_ps(hit_phi, tmp1); hit_phi_2 = _mm_add_ps(hit_phi_2, tmp1_2);
// D = sqrt(x^2 + y^2): distance from the origin to the hit, plus 1/D
1061 __m128 D = _mm_mul_ps(x,x); __m128 D_2 = _mm_mul_ps(x_2,x_2);
1062 tmp1 = _mm_mul_ps(y,y); tmp1_2 = _mm_mul_ps(y_2,y_2);
1063 D = _mm_add_ps(D,tmp1); D_2 = _mm_add_ps(D_2,tmp1_2);
1064 D = _vec_sqrt_ps(D); D_2 = _vec_sqrt_ps(D_2);
1065 __m128 D_inv = _vec_rec_ps(D); __m128 D_inv_2 = _vec_rec_ps(D_2);
// ---- solution pass 1 ----
// ak = (2*d + (d*d + D*D)*k) / D
// NOTE(review): `d` and `k` are referenced but their defining statements
// (presumably one of the d_min/d_max x k_min/k_max combinations per pass)
// are not present in this chunk -- confirm against the original file
1066 __m128 ak =
d; __m128 ak_2 =
d;
1067 ak = _mm_mul_ps(d,
two); ak_2 = _mm_mul_ps(d,
two);
1068 tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
1069 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1070 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1071 tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
1072 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1073 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1074 ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
// hk = sqrt((1 + d*k)^2 - ak^2); `neg` flags lanes whose discriminant is
// <= 0 (circle does not reach the hit -- handled by the `neg` fallback below)
1076 __m128 hk = _mm_mul_ps(d,k); __m128 hk_2 = _mm_mul_ps(d,k);
1077 hk = _mm_add_ps(hk,
one); hk_2 = _mm_add_ps(hk_2,
one);
1078 hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
1079 tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
1080 hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
1081 __m128 neg = _mm_cmple_ps(hk,
zero); __m128 neg_2 = _mm_cmple_ps(hk_2,
zero);
1082 hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);
// the two candidate intersection points (xk1,yk1) and (xk2,yk2), both
// scaled by 1/D
1084 __m128 xk1 = _mm_mul_ps(ak, x); __m128 xk1_2 = _mm_mul_ps(ak_2, x_2);
1085 tmp1 = _mm_mul_ps(hk,y); tmp1_2 = _mm_mul_ps(hk_2,y_2);
1086 __m128 xk2 = _mm_sub_ps(xk1, tmp1); __m128 xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
1087 xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
1088 xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
1089 xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
1091 __m128 yk1 = _mm_mul_ps(ak, y); __m128 yk1_2 = _mm_mul_ps(ak_2, y_2);
1092 tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2,x_2);
1093 __m128 yk2 = _mm_add_ps(yk1, tmp1); __m128 yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
1094 yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
1095 yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
1096 yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
// choose between the two candidates by the sign of the cross product
// z = x*yk1 - y*xk1 versus the requested helicity; the resulting all-ones /
// all-zeros mask drives the branchless selects below
1098 __m128 crossproduct = _mm_mul_ps(x, yk1); __m128 crossproduct_2 = _mm_mul_ps(x_2, yk1_2);
1099 __m128 crosstemp = _mm_mul_ps(y, xk1); __m128 crosstemp_2 = _mm_mul_ps(y_2, xk1_2);
1100 crossproduct = _mm_sub_ps(crossproduct, crosstemp); crossproduct_2 = _mm_sub_ps(crossproduct_2, crosstemp_2);
1101 __m128 correct_helicity =
compare_sign(crossproduct, helicity_vec); __m128 correct_helicity_2 =
compare_sign(crossproduct_2, helicity_vec);
1103 __m128 xk = _mm_and_ps(correct_helicity, xk1); __m128 xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
1104 tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
1105 xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
1106 __m128 yk = _mm_and_ps(correct_helicity, yk1); __m128 yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
1107 tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
1108 yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);
// phi_1 = atan2 of the chosen intersection, wrapped into [0, 2*pi)
1111 __m128
phi_1 = _vec_atan2_ps(yk, xk); __m128 phi_1_2 = _vec_atan2_ps(yk_2, xk_2);
1113 tmp1 = _mm_cmplt_ps(phi_1,
zero); tmp1_2 = _mm_cmplt_ps(phi_1_2,
zero);
1114 tmp2 = _mm_and_ps(tmp1,
twopi); tmp2_2 = _mm_and_ps(tmp1_2,
twopi);
1115 tmp1 = _mm_andnot_ps(tmp1,
zero); tmp1_2 = _mm_andnot_ps(tmp1_2,
zero);
1116 tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
1117 phi_1 = _mm_add_ps(phi_1, tmp1); phi_1_2 = _mm_add_ps(phi_1_2, tmp1_2);
// lanes with no real intersection (neg mask) fall back to the hit's own phi
1119 tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
1120 phi_1 = _mm_andnot_ps(neg, phi_1); phi_1_2 = _mm_andnot_ps(neg_2, phi_1_2);
1121 phi_1 = _mm_xor_ps(tmp1, phi_1); phi_1_2 = _mm_xor_ps(tmp1_2, phi_1_2);
// export the first solution to the caller
1122 phi_3_out =
phi_1; phi_3_out_2 = phi_1_2;
// ---- solution pass 2 ----
// NOTE(review): the d/k re-selection statements between passes are missing
// from this extraction; the pipeline below repeats pass 1 verbatim
1128 ak = _mm_mul_ps(d,
two); ak_2 = _mm_mul_ps(d,
two);
1129 tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
1130 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1131 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1132 tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
1133 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1134 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1135 ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
1137 hk = _mm_mul_ps(d,k); hk_2 = _mm_mul_ps(d,k);
1138 hk = _mm_add_ps(hk,
one); hk_2 = _mm_add_ps(hk_2,
one);
1139 hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
1140 tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
1141 hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
1142 neg = _mm_cmple_ps(hk,
zero); neg_2 = _mm_cmple_ps(hk_2,
zero);
1143 hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);
1145 xk1 = _mm_mul_ps(ak, x); xk1_2 = _mm_mul_ps(ak_2, x_2);
1146 tmp1 = _mm_mul_ps(hk,y); tmp1_2 = _mm_mul_ps(hk_2,y_2);
1147 xk2 = _mm_sub_ps(xk1, tmp1); xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
1148 xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
1149 xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
1150 xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
1152 yk1 = _mm_mul_ps(ak, y); yk1_2 = _mm_mul_ps(ak_2, y_2);
1153 tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2, x_2);
1154 yk2 = _mm_add_ps(yk1, tmp1); yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
1155 yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
1156 yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
1157 yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
// note: the helicity mask from pass 1 is reused here (not recomputed)
1159 xk = _mm_and_ps(correct_helicity, xk1); xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
1160 tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
1161 xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
1162 yk = _mm_and_ps(correct_helicity, yk1); yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
1163 tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
1164 yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);
1166 __m128
phi_2 = _vec_atan2_ps(yk, xk); __m128 phi_2_2 = _vec_atan2_ps(yk_2, xk_2);
1168 tmp1 = _mm_cmplt_ps(phi_2,
zero); tmp1_2 = _mm_cmplt_ps(phi_2_2,
zero);
1169 tmp2 = _mm_and_ps(tmp1,
twopi); tmp2_2 = _mm_and_ps(tmp1_2,
twopi);
1170 tmp1 = _mm_andnot_ps(tmp1,
zero); tmp1_2 = _mm_andnot_ps(tmp1_2,
zero);
1171 tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
1172 phi_2 = _mm_add_ps(phi_2, tmp1); phi_2_2 = _mm_add_ps(phi_2_2, tmp1_2);
1174 tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
1175 phi_2 = _mm_andnot_ps(neg, phi_2); phi_2_2 = _mm_andnot_ps(neg_2, phi_2_2);
1176 phi_2 = _mm_xor_ps(tmp1, phi_2); phi_2_2 = _mm_xor_ps(tmp1_2, phi_2_2);
// export the second solution to the caller
1177 phi_4_out =
phi_2; phi_4_out_2 = phi_2_2;
// ---- solution pass 3 (phi_3), same pipeline ----
1184 ak = _mm_mul_ps(d,
two); ak_2 = _mm_mul_ps(d,
two);
1185 tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
1186 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1187 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1188 tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
1189 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1190 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1191 ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
1193 hk = _mm_mul_ps(d,k); hk_2 = _mm_mul_ps(d,k);
1194 hk = _mm_add_ps(hk,
one); hk_2 = _mm_add_ps(hk_2,
one);
1195 hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
1196 tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
1197 hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
1198 neg = _mm_cmple_ps(hk,
zero); neg_2 = _mm_cmple_ps(hk_2,
zero);
1199 hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);
1201 xk1 = _mm_mul_ps(ak, x); xk1_2 = _mm_mul_ps(ak_2, x_2);
1202 tmp1 = _mm_mul_ps(hk, y); tmp1_2 = _mm_mul_ps(hk_2, y_2);
1203 xk2 = _mm_sub_ps(xk1, tmp1); xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
1204 xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
1205 xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
1206 xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
1208 yk1 = _mm_mul_ps(ak, y); yk1_2 = _mm_mul_ps(ak_2, y_2);
1209 tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2, x_2);
1210 yk2 = _mm_add_ps(yk1, tmp1); yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
1211 yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
1212 yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
1213 yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
1215 xk = _mm_and_ps(correct_helicity, xk1); xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
1216 tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
1217 xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
1218 yk = _mm_and_ps(correct_helicity, yk1); yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
1219 tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
1220 yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);
1222 __m128 phi_3 = _vec_atan2_ps(yk, xk); __m128 phi_3_2 = _vec_atan2_ps(yk_2, xk_2);
1224 tmp1 = _mm_cmplt_ps(phi_3,
zero); tmp1_2 = _mm_cmplt_ps(phi_3_2,
zero);
1225 tmp2 = _mm_and_ps(tmp1,
twopi); tmp2_2 = _mm_and_ps(tmp1_2,
twopi);
1226 tmp1 = _mm_andnot_ps(tmp1,
zero); tmp1_2 = _mm_andnot_ps(tmp1_2,
zero);
1227 tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
1228 phi_3 = _mm_add_ps(phi_3, tmp1); phi_3_2 = _mm_add_ps(phi_3_2, tmp1_2);
1230 tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
1231 phi_3 = _mm_andnot_ps(neg, phi_3); phi_3_2 = _mm_andnot_ps(neg_2, phi_3_2);
1232 phi_3 = _mm_xor_ps(tmp1, phi_3); phi_3_2 = _mm_xor_ps(tmp1_2, phi_3_2);
// ---- solution pass 4 (phi_4), same pipeline ----
1239 ak = _mm_mul_ps(d,
two); ak_2 = _mm_mul_ps(d,
two);
1240 tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
1241 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1242 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1243 tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
1244 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1245 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1246 ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
1248 hk = _mm_mul_ps(d,k); hk_2 = _mm_mul_ps(d,k);
1249 hk = _mm_add_ps(hk,
one); hk_2 = _mm_add_ps(hk_2,
one);
1250 hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
1251 tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
1252 hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
1253 neg = _mm_cmple_ps(hk,
zero); neg_2 = _mm_cmple_ps(hk_2,
zero);
1254 hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);
1256 xk1 = _mm_mul_ps(ak, x); xk1_2 = _mm_mul_ps(ak_2, x_2);
1257 tmp1 = _mm_mul_ps(hk, y); tmp1_2 = _mm_mul_ps(hk_2, y_2);
1258 xk2 = _mm_sub_ps(xk1, tmp1); xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
1259 xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
1260 xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
1261 xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
1263 yk1 = _mm_mul_ps(ak, y); yk1_2 = _mm_mul_ps(ak_2, y_2);
1264 tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2, x_2);
1265 yk2 = _mm_add_ps(yk1, tmp1); yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
1266 yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
1267 yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
1268 yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
1270 xk = _mm_and_ps(correct_helicity, xk1); xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
1271 tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
1272 xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
1273 yk = _mm_and_ps(correct_helicity, yk1); yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
1274 tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
1275 yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);
1277 __m128 phi_4 = _vec_atan2_ps(yk, xk); __m128 phi_4_2 = _vec_atan2_ps(yk_2, xk_2);
1279 tmp1 = _mm_cmplt_ps(phi_4,
zero); tmp1_2 = _mm_cmplt_ps(phi_4_2,
zero);
1280 tmp2 = _mm_and_ps(tmp1,
twopi); tmp2_2 = _mm_and_ps(tmp1_2,
twopi);
1281 tmp1 = _mm_andnot_ps(tmp1,
zero); tmp1_2 = _mm_andnot_ps(tmp1_2,
zero);
1282 tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
1283 phi_4 = _mm_add_ps(phi_4, tmp1); phi_4_2 = _mm_add_ps(phi_4_2, tmp1_2);
1285 tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
1286 phi_4 = _mm_andnot_ps(neg, phi_4); phi_4_2 = _mm_andnot_ps(neg_2, phi_4_2);
1287 phi_4 = _mm_xor_ps(tmp1, phi_4); phi_4_2 = _mm_xor_ps(tmp1_2, phi_4_2);
// ---- 0/2*pi seam handling ----
// NOTE(review): the comparisons whose results are OR-ed/AND-ed below
// (apparently detecting lanes where the four phis straddle the wrap-around
// seam) are missing from this extraction
1294 tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
1296 tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
1298 tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
1302 tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
1304 tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
1306 tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
1308 tmp1 = _mm_and_ps(tmp1, tmp2); tmp1_2 = _mm_and_ps(tmp1_2, tmp2_2);
// where the seam mask tmp1 is set, tmp2 becomes 2*pi (else 0)
1312 tmp2 = _mm_and_ps(tmp1,
twopi); tmp2_2 = _mm_and_ps(tmp1_2,
twopi);
1313 tmp3 = _mm_andnot_ps(tmp1,
zero); tmp3_2 = _mm_andnot_ps(tmp1_2,
zero);
1314 tmp2 = _mm_xor_ps(tmp2, tmp3); tmp2_2 = _mm_xor_ps(tmp2_2, tmp3_2);
// subtract 2*pi from each phi that tmp4 flags -- NOTE(review): tmp4's
// defining comparison is also missing from this chunk
1317 tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1318 __m128
tmp5 = _mm_andnot_ps(tmp4,
zero); __m128 tmp5_2 = _mm_andnot_ps(tmp4_2,
zero);
1319 tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1320 phi_1 = _mm_sub_ps(phi_1, tmp3); phi_1_2 = _mm_sub_ps(phi_1_2, tmp3_2);
1323 tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1324 tmp5 = _mm_andnot_ps(tmp4,
zero); tmp5_2 = _mm_andnot_ps(tmp4_2,
zero);
1325 tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1326 phi_2 = _mm_sub_ps(phi_2, tmp3); phi_2_2 = _mm_sub_ps(phi_2_2, tmp3_2);
1329 tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1330 tmp5 = _mm_andnot_ps(tmp4,
zero); tmp5_2 = _mm_andnot_ps(tmp4_2,
zero);
1331 tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1332 phi_3 = _mm_sub_ps(phi_3, tmp3); phi_3_2 = _mm_sub_ps(phi_3_2, tmp3_2);
1335 tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1336 tmp5 = _mm_andnot_ps(tmp4,
zero); tmp5_2 = _mm_andnot_ps(tmp4_2,
zero);
1337 tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1338 phi_4 = _mm_sub_ps(phi_4, tmp3); phi_4_2 = _mm_sub_ps(phi_4_2, tmp3_2);
// ---- per-lane phi_min = min(phi_1..phi_4), branchless select chain ----
1342 __m128 phi_min =
phi_1; __m128 phi_min_2 = phi_1_2;
1343 tmp2 = _mm_cmplt_ps(phi_2, phi_min); tmp2_2 = _mm_cmplt_ps(phi_2_2, phi_min_2);
1344 tmp3 = _mm_and_ps(tmp2, phi_2); tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
1345 phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
1346 phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
1347 tmp2 = _mm_cmplt_ps(phi_3, phi_min); tmp2_2 = _mm_cmplt_ps(phi_3_2, phi_min_2);
1348 tmp3 = _mm_and_ps(tmp2, phi_3); tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
1349 phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
1350 phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
1351 tmp2 = _mm_cmplt_ps(phi_4, phi_min); tmp2_2 = _mm_cmplt_ps(phi_4_2, phi_min_2);
1352 tmp3 = _mm_and_ps(tmp2, phi_4); tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
1353 phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
1354 phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
// ---- per-lane phi_max = max(phi_1..phi_4) ----
1357 __m128 phi_max =
phi_1; __m128 phi_max_2 = phi_1_2;
1358 tmp2 = _mm_cmpgt_ps(phi_2, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_2_2, phi_max_2);
1359 tmp3 = _mm_and_ps(tmp2, phi_2); tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
1360 phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
1361 phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
1362 tmp2 = _mm_cmpgt_ps(phi_3, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_3_2, phi_max_2);
1363 tmp3 = _mm_and_ps(tmp2, phi_3); tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
1364 phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
1365 phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
1366 tmp2 = _mm_cmpgt_ps(phi_4, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_4_2, phi_max_2);
1367 tmp3 = _mm_and_ps(tmp2, phi_4); tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
1368 phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
1369 phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
// store the per-lane phi windows for both hit batches
1372 _mm_store_ps(min_phi, phi_min); _mm_store_ps(min_phi_2, phi_min_2);
1373 _mm_store_ps(max_phi, phi_max); _mm_store_ps(max_phi_2, phi_max_2);
// ---------------------------------------------------------------------------
// NOTE(review): same extraction damage as the overload above -- braces and
// several statements (the d/k selection per pass, the tmp4 comparison masks,
// and the function's closing brace) are missing, and the embedded "13xx-15xx"
// numbers are remnants of the original file's line numbering.  Code is kept
// byte-identical; only comments are added.
//
// This overload differs from the one above in that phi_3 and phi_4 are
// supplied by the caller (in-out references) rather than recomputed: only
// two new solutions (phi_1, phi_2) are computed here, then all four phis are
// combined into per-lane [phi_min, phi_max] windows.  The fifth parameter
// (min_k) is intentionally unnamed/unused; only max_k is loaded.
// ---------------------------------------------------------------------------
1377 void HelixHough::phiRange_sse(
float* hit_x,
float* hit_y,
float* min_d,
float* max_d,
float* ,
float* max_k,
float* min_phi,
float* max_phi,
float* min_phi_2,
float* max_phi_2,
float hel, __m128& phi_3, __m128& phi_4, __m128& phi_3_out, __m128& phi_4_out,
float* hit_x_2,
float* hit_y_2, __m128& phi_3_2, __m128& phi_4_2, __m128& phi_3_out_2, __m128& phi_4_out_2)
// broadcast the scalar helicity sign into all four lanes
1379 __m128 helicity_vec = _mm_load1_ps(&(hel));
1381 __m128
x = _mm_load_ps(hit_x); __m128 x_2 = _mm_load_ps(hit_x_2);
1382 __m128
y = _mm_load_ps(hit_y); __m128 y_2 = _mm_load_ps(hit_y_2);
// NOTE(review): d_min/d_max/k_max are loaded but the statements assigning
// the working `d` and `k` used below are missing from this extraction
1384 __m128 d_min = _mm_load_ps(min_d);
1385 __m128 d_max = _mm_load_ps(max_d);
1386 __m128 k_max = _mm_load_ps(max_k);
// hit azimuth, wrapped branchlessly into [0, 2*pi): the and/andnot/xor
// triple emulates a per-lane select that adds 2*pi where phi < 0
1388 __m128 hit_phi = _vec_atan2_ps(y,x); __m128 hit_phi_2 = _vec_atan2_ps(y_2,x_2);
1390 __m128
tmp1 = _mm_cmplt_ps(hit_phi,
zero); __m128 tmp1_2 = _mm_cmplt_ps(hit_phi_2,
zero);
1391 __m128
tmp2 = _mm_and_ps(tmp1,
twopi); __m128 tmp2_2 = _mm_and_ps(tmp1_2,
twopi);
1392 tmp1 = _mm_andnot_ps(tmp1,
zero); tmp1_2 = _mm_andnot_ps(tmp1_2,
zero);
1393 tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
1394 hit_phi = _mm_add_ps(hit_phi, tmp1); hit_phi_2 = _mm_add_ps(hit_phi_2, tmp1_2);
// D = sqrt(x^2 + y^2): hit distance from the origin, plus 1/D
1399 __m128 D = _mm_mul_ps(x,x); __m128 D_2 = _mm_mul_ps(x_2,x_2);
1400 tmp1 = _mm_mul_ps(y,y); tmp1_2 = _mm_mul_ps(y_2,y_2);
1401 D = _mm_add_ps(D,tmp1); D_2 = _mm_add_ps(D_2,tmp1_2);
1402 D = _vec_sqrt_ps(D); D_2 = _vec_sqrt_ps(D_2);
1403 __m128 D_inv = _vec_rec_ps(D); __m128 D_inv_2 = _vec_rec_ps(D_2);
// ---- solution pass 1 (phi_1) ----
// ak = (2*d + (d*d + D*D)*k) / D; `d`/`k` defined by lost statements
1404 __m128 ak =
d; __m128 ak_2 =
d;
1405 ak = _mm_mul_ps(d,
two); ak_2 = _mm_mul_ps(d,
two);
1406 tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
1407 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1408 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1409 tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
1410 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1411 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1412 ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
// hk = sqrt((1 + d*k)^2 - ak^2); `neg` flags non-intersecting lanes
1414 __m128 hk = _mm_mul_ps(d,k); __m128 hk_2 = _mm_mul_ps(d,k);
1415 hk = _mm_add_ps(hk,
one); hk_2 = _mm_add_ps(hk_2,
one);
1416 hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
1417 tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
1418 hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
1419 __m128 neg = _mm_cmple_ps(hk,
zero); __m128 neg_2 = _mm_cmple_ps(hk_2,
zero);
1420 hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);
// the two candidate intersection points, scaled by 1/D
1422 __m128 xk1 = _mm_mul_ps(ak, x); __m128 xk1_2 = _mm_mul_ps(ak_2, x_2);
1423 tmp1 = _mm_mul_ps(hk,y); tmp1_2 = _mm_mul_ps(hk_2,y_2);
1424 __m128 xk2 = _mm_sub_ps(xk1, tmp1); __m128 xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
1425 xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
1426 xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
1427 xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
1429 __m128 yk1 = _mm_mul_ps(ak, y); __m128 yk1_2 = _mm_mul_ps(ak_2, y_2);
1430 tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2,x_2);
1431 __m128 yk2 = _mm_add_ps(yk1, tmp1); __m128 yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
1432 yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
1433 yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
1434 yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
// select the candidate whose cross product z = x*yk1 - y*xk1 matches the
// sign of the requested helicity (branchless select via and/andnot/xor)
1436 __m128 crossproduct = _mm_mul_ps(x, yk1); __m128 crossproduct_2 = _mm_mul_ps(x_2, yk1_2);
1437 __m128 crosstemp = _mm_mul_ps(y, xk1); __m128 crosstemp_2 = _mm_mul_ps(y_2, xk1_2);
1438 crossproduct = _mm_sub_ps(crossproduct, crosstemp); crossproduct_2 = _mm_sub_ps(crossproduct_2, crosstemp_2);
1439 __m128 correct_helicity =
compare_sign(crossproduct, helicity_vec); __m128 correct_helicity_2 =
compare_sign(crossproduct_2, helicity_vec);
1441 __m128 xk = _mm_and_ps(correct_helicity, xk1); __m128 xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
1442 tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
1443 xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
1444 __m128 yk = _mm_and_ps(correct_helicity, yk1); __m128 yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
1445 tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
1446 yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);
// phi_1 = atan2 of the chosen intersection, wrapped into [0, 2*pi)
1449 __m128
phi_1 = _vec_atan2_ps(yk, xk); __m128 phi_1_2 = _vec_atan2_ps(yk_2, xk_2);
1451 tmp1 = _mm_cmplt_ps(phi_1,
zero); tmp1_2 = _mm_cmplt_ps(phi_1_2,
zero);
1452 tmp2 = _mm_and_ps(tmp1,
twopi); tmp2_2 = _mm_and_ps(tmp1_2,
twopi);
1453 tmp1 = _mm_andnot_ps(tmp1,
zero); tmp1_2 = _mm_andnot_ps(tmp1_2,
zero);
1454 tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
1455 phi_1 = _mm_add_ps(phi_1, tmp1); phi_1_2 = _mm_add_ps(phi_1_2, tmp1_2);
// lanes with no real intersection fall back to the hit's own phi
1457 tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
1458 phi_1 = _mm_andnot_ps(neg, phi_1); phi_1_2 = _mm_andnot_ps(neg_2, phi_1_2);
1459 phi_1 = _mm_xor_ps(tmp1, phi_1); phi_1_2 = _mm_xor_ps(tmp1_2, phi_1_2);
// export the first new solution
1460 phi_3_out =
phi_1; phi_3_out_2 = phi_1_2;
// ---- solution pass 2 (phi_2), same pipeline; d/k re-selection missing ----
1466 ak = _mm_mul_ps(d,
two); ak_2 = _mm_mul_ps(d,
two);
1467 tmp1 = _mm_mul_ps(d,d); tmp1_2 = _mm_mul_ps(d,d);
1468 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1469 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1470 tmp1 = _mm_mul_ps(D,D); tmp1_2 = _mm_mul_ps(D_2,D_2);
1471 tmp1 = _mm_mul_ps(tmp1, k); tmp1_2 = _mm_mul_ps(tmp1_2, k);
1472 ak = _mm_add_ps(ak, tmp1); ak_2 = _mm_add_ps(ak_2, tmp1_2);
1473 ak = _mm_mul_ps(ak, D_inv); ak_2 = _mm_mul_ps(ak_2, D_inv_2);
1475 hk = _mm_mul_ps(d,k); hk_2 = _mm_mul_ps(d,k);
1476 hk = _mm_add_ps(hk,
one); hk_2 = _mm_add_ps(hk_2,
one);
1477 hk = _mm_mul_ps(hk,hk); hk_2 = _mm_mul_ps(hk_2,hk_2);
1478 tmp1 = _mm_mul_ps(ak,ak); tmp1_2 = _mm_mul_ps(ak_2,ak_2);
1479 hk = _mm_sub_ps(hk, tmp1); hk_2 = _mm_sub_ps(hk_2, tmp1_2);
1480 neg = _mm_cmple_ps(hk,
zero); neg_2 = _mm_cmple_ps(hk_2,
zero);
1481 hk = _vec_sqrt_ps(hk); hk_2 = _vec_sqrt_ps(hk_2);
1483 xk1 = _mm_mul_ps(ak, x); xk1_2 = _mm_mul_ps(ak_2, x_2);
1484 tmp1 = _mm_mul_ps(hk, y); tmp1_2 = _mm_mul_ps(hk_2, y_2);
1485 xk2 = _mm_sub_ps(xk1, tmp1); xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
1486 xk1 = _mm_add_ps(xk1, tmp1); xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
1487 xk1 = _mm_mul_ps(xk1, D_inv); xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
1488 xk2 = _mm_mul_ps(xk2, D_inv); xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
1490 yk1 = _mm_mul_ps(ak, y); yk1_2 = _mm_mul_ps(ak_2, y_2);
1491 tmp1 = _mm_mul_ps(hk,x); tmp1_2 = _mm_mul_ps(hk_2, x_2);
1492 yk2 = _mm_add_ps(yk1, tmp1); yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
1493 yk1 = _mm_sub_ps(yk1, tmp1); yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
1494 yk1 = _mm_mul_ps(yk1, D_inv); yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
1495 yk2 = _mm_mul_ps(yk2, D_inv); yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
// helicity mask from pass 1 is reused (not recomputed)
1497 xk = _mm_and_ps(correct_helicity, xk1); xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
1498 tmp1 = _mm_andnot_ps(correct_helicity, xk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
1499 xk = _mm_xor_ps(xk, tmp1); xk_2 = _mm_xor_ps(xk_2, tmp1_2);
1500 yk = _mm_and_ps(correct_helicity, yk1); yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
1501 tmp1 = _mm_andnot_ps(correct_helicity, yk2); tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
1502 yk = _mm_xor_ps(yk, tmp1); yk_2 = _mm_xor_ps(yk_2, tmp1_2);
1504 __m128
phi_2 = _vec_atan2_ps(yk, xk); __m128 phi_2_2 = _vec_atan2_ps(yk_2, xk_2);
1506 tmp1 = _mm_cmplt_ps(phi_2,
zero); tmp1_2 = _mm_cmplt_ps(phi_2_2,
zero);
1507 tmp2 = _mm_and_ps(tmp1,
twopi); tmp2_2 = _mm_and_ps(tmp1_2,
twopi);
1508 tmp1 = _mm_andnot_ps(tmp1,
zero); tmp1_2 = _mm_andnot_ps(tmp1_2,
zero);
1509 tmp1 = _mm_xor_ps(tmp1, tmp2); tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
1510 phi_2 = _mm_add_ps(phi_2, tmp1); phi_2_2 = _mm_add_ps(phi_2_2, tmp1_2);
1512 tmp1 = _mm_and_ps(neg, hit_phi); tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
1513 phi_2 = _mm_andnot_ps(neg, phi_2); phi_2_2 = _mm_andnot_ps(neg_2, phi_2_2);
1514 phi_2 = _mm_xor_ps(tmp1, phi_2); phi_2_2 = _mm_xor_ps(tmp1_2, phi_2_2);
// export the second new solution
1515 phi_4_out =
phi_2; phi_4_out_2 = phi_2_2;
// ---- 0/2*pi seam handling over phi_1..phi_4 ----
// NOTE(review): the comparisons feeding the OR/AND chains below, and the
// tmp4 masks used after them, are missing from this extraction
1523 tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
1525 tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
1527 tmp1 = _mm_or_ps(tmp1, tmp2); tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
1531 tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
1533 tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
1535 tmp2 = _mm_or_ps(tmp2, tmp3); tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
1537 tmp1 = _mm_and_ps(tmp1, tmp2); tmp1_2 = _mm_and_ps(tmp1_2, tmp2_2);
// where the seam mask tmp1 is set, tmp2 becomes 2*pi (else 0)
1541 tmp2 = _mm_and_ps(tmp1,
twopi); tmp2_2 = _mm_and_ps(tmp1_2,
twopi);
1542 tmp3 = _mm_andnot_ps(tmp1,
zero); tmp3_2 = _mm_andnot_ps(tmp1_2,
zero);
1543 tmp2 = _mm_xor_ps(tmp2, tmp3); tmp2_2 = _mm_xor_ps(tmp2_2, tmp3_2);
// subtract 2*pi from each phi flagged by tmp4 (definition missing here);
// note phi_3/phi_4 are the caller-supplied references, modified in place
1546 tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1547 __m128
tmp5 = _mm_andnot_ps(tmp4,
zero); __m128 tmp5_2 = _mm_andnot_ps(tmp4_2,
zero);
1548 tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1549 phi_1 = _mm_sub_ps(phi_1, tmp3); phi_1_2 = _mm_sub_ps(phi_1_2, tmp3_2);
1552 tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1553 tmp5 = _mm_andnot_ps(tmp4,
zero); tmp5_2 = _mm_andnot_ps(tmp4_2,
zero);
1554 tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1555 phi_2 = _mm_sub_ps(phi_2, tmp3); phi_2_2 = _mm_sub_ps(phi_2_2, tmp3_2);
1558 tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1559 tmp5 = _mm_andnot_ps(tmp4,
zero); tmp5_2 = _mm_andnot_ps(tmp4_2,
zero);
1560 tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1561 phi_3 = _mm_sub_ps(phi_3, tmp3); phi_3_2 = _mm_sub_ps(phi_3_2, tmp3_2);
1564 tmp3 = _mm_and_ps(tmp4, tmp2); tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
1565 tmp5 = _mm_andnot_ps(tmp4,
zero); tmp5_2 = _mm_andnot_ps(tmp4_2,
zero);
1566 tmp3 = _mm_xor_ps(tmp3, tmp5); tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
1567 phi_4 = _mm_sub_ps(phi_4, tmp3); phi_4_2 = _mm_sub_ps(phi_4_2, tmp3_2);
// ---- per-lane phi_min = min(phi_1..phi_4), branchless select chain ----
1571 __m128 phi_min =
phi_1; __m128 phi_min_2 = phi_1_2;
1572 tmp2 = _mm_cmplt_ps(phi_2, phi_min); tmp2_2 = _mm_cmplt_ps(phi_2_2, phi_min_2);
1573 tmp3 = _mm_and_ps(tmp2, phi_2); tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
1574 phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
1575 phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
1576 tmp2 = _mm_cmplt_ps(phi_3, phi_min); tmp2_2 = _mm_cmplt_ps(phi_3_2, phi_min_2);
1577 tmp3 = _mm_and_ps(tmp2, phi_3); tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
1578 phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
1579 phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
1580 tmp2 = _mm_cmplt_ps(phi_4, phi_min); tmp2_2 = _mm_cmplt_ps(phi_4_2, phi_min_2);
1581 tmp3 = _mm_and_ps(tmp2, phi_4); tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
1582 phi_min = _mm_andnot_ps(tmp2, phi_min); phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
1583 phi_min = _mm_xor_ps(phi_min, tmp3); phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
// ---- per-lane phi_max = max(phi_1..phi_4) ----
1586 __m128 phi_max =
phi_1; __m128 phi_max_2 = phi_1_2;
1587 tmp2 = _mm_cmpgt_ps(phi_2, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_2_2, phi_max_2);
1588 tmp3 = _mm_and_ps(tmp2, phi_2); tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
1589 phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
1590 phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
1591 tmp2 = _mm_cmpgt_ps(phi_3, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_3_2, phi_max_2);
1592 tmp3 = _mm_and_ps(tmp2, phi_3); tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
1593 phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
1594 phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
1595 tmp2 = _mm_cmpgt_ps(phi_4, phi_max); tmp2_2 = _mm_cmpgt_ps(phi_4_2, phi_max_2);
1596 tmp3 = _mm_and_ps(tmp2, phi_4); tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
1597 phi_max = _mm_andnot_ps(tmp2, phi_max); phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
1598 phi_max = _mm_xor_ps(phi_max, tmp3); phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
// store the per-lane phi windows for both hit batches
1601 _mm_store_ps(min_phi, phi_min); _mm_store_ps(min_phi_2, phi_min_2);
1602 _mm_store_ps(max_phi, phi_max); _mm_store_ps(max_phi_2, phi_max_2);