10 static const __m128 
two = {2., 2., 2., 2.};
 
   11 static const __m128 
three_pi_over_two = {3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f};
 
   12 static const __m128 
SIGNMASK = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
   14 void HelixHough::phiRange_sse(
float* hit_x, 
float* hit_y, 
float* min_d, 
float* max_d, 
float* min_k, 
float* max_k, 
float* min_phi_1, 
float* max_phi_1, 
float* min_phi_2, 
float* max_phi_2)
 
   16   __m128 
x = _mm_load_ps(hit_x);
 
   17   __m128 
y = _mm_load_ps(hit_y);
 
   19   __m128 hit_phi = _vec_atan2_ps(y,x);
 
   21   __m128 
tmp1 = _mm_cmplt_ps(hit_phi, 
zero);
 
   23   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
   24   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
   25   hit_phi = _mm_add_ps(hit_phi, tmp1);
 
   28   __m128 
d = _mm_load_ps(min_d);
 
   29   __m128 
k = _mm_load_ps(min_k);
 
   30   __m128 D = _mm_mul_ps(x,x);
 
   31   tmp1 = _mm_mul_ps(y,y);
 
   32   D = _mm_add_ps(D,tmp1);
 
   34   __m128 D_inv = _vec_rec_ps(D);
 
   36   ak = _mm_mul_ps(d, 
two);
 
   37   tmp1 = _mm_mul_ps(d,d);
 
   38   tmp1 = _mm_mul_ps(tmp1, k);
 
   39   ak = _mm_add_ps(ak, tmp1);
 
   40   tmp1 = _mm_mul_ps(D,D);
 
   41   tmp1 = _mm_mul_ps(tmp1, k);
 
   42   ak = _mm_add_ps(ak, tmp1);
 
   43   ak = _mm_mul_ps(ak, D_inv);
 
   45   __m128 hk = _mm_mul_ps(d,k);
 
   46   hk = _mm_add_ps(hk, 
one);
 
   47   hk = _mm_mul_ps(hk,hk);
 
   48   tmp1 = _mm_mul_ps(ak,ak);
 
   49   hk = _mm_sub_ps(hk, tmp1);
 
   50   __m128 neg = _mm_cmple_ps(hk, 
zero);
 
   51   hk = _vec_sqrt_ps(hk);
 
   53   __m128 xk1 = _mm_mul_ps(ak, x);
 
   54   tmp1 = _mm_mul_ps(hk,y);
 
   55   __m128 xk2 = _mm_sub_ps(xk1, tmp1);
 
   56   xk1 = _mm_add_ps(xk1, tmp1);
 
   57   xk1 = _mm_mul_ps(xk1, D_inv);
 
   58   xk2 = _mm_mul_ps(xk2, D_inv);
 
   60   __m128 yk1 = _mm_mul_ps(ak, y);
 
   61   tmp1 = _mm_mul_ps(hk,x);
 
   62   __m128 yk2 = _mm_add_ps(yk1, tmp1);
 
   63   yk1 = _mm_sub_ps(yk1, tmp1);
 
   64   yk1 = _mm_mul_ps(yk1, D_inv);
 
   65   yk2 = _mm_mul_ps(yk2, D_inv);
 
   67   __m128 phi_r_1 = _vec_atan2_ps(yk1, xk1);
 
   69   tmp1 = _mm_cmplt_ps(phi_r_1, 
zero);
 
   70   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
   71   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
   72   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
   73   phi_r_1 = _mm_add_ps(phi_r_1, tmp1);
 
   75   tmp1 = _mm_and_ps(neg, hit_phi);
 
   76   phi_r_1 = _mm_andnot_ps(neg, phi_r_1);
 
   77   phi_r_1 = _mm_xor_ps(tmp1, phi_r_1);
 
   79   __m128 phi_l_1 = _vec_atan2_ps(yk2, xk2);
 
   81   tmp1 = _mm_cmplt_ps(phi_l_1, 
zero);
 
   82   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
   83   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
   84   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
   85   phi_l_1 = _mm_add_ps(phi_l_1, tmp1);
 
   87   tmp1 = _mm_and_ps(neg, hit_phi);
 
   88   phi_l_1 = _mm_andnot_ps(neg, phi_l_1);
 
   89   phi_l_1 = _mm_xor_ps(tmp1, phi_l_1);
 
   93   d = _mm_load_ps(min_d);
 
   94   k = _mm_load_ps(max_k);
 
   96   tmp1 = _mm_mul_ps(y,y);
 
   97   D = _mm_add_ps(D,tmp1);
 
   99   D_inv = _vec_rec_ps(D);
 
  101   ak = _mm_mul_ps(d, 
two);
 
  102   tmp1 = _mm_mul_ps(d,d);
 
  103   tmp1 = _mm_mul_ps(tmp1, k);
 
  104   ak = _mm_add_ps(ak, tmp1);
 
  105   tmp1 = _mm_mul_ps(D,D);
 
  106   tmp1 = _mm_mul_ps(tmp1, k);
 
  107   ak = _mm_add_ps(ak, tmp1);
 
  108   ak = _mm_mul_ps(ak, D_inv);
 
  110   hk = _mm_mul_ps(d,k);
 
  111   hk = _mm_add_ps(hk, 
one);
 
  112   hk = _mm_mul_ps(hk,hk);
 
  113   tmp1 = _mm_mul_ps(ak,ak);
 
  114   hk = _mm_sub_ps(hk, tmp1);
 
  115   neg = _mm_cmple_ps(hk, 
zero);
 
  116   hk = _vec_sqrt_ps(hk);
 
  118   xk1 = _mm_mul_ps(ak, x);
 
  119   tmp1 = _mm_mul_ps(hk,y);
 
  120   xk2 = _mm_sub_ps(xk1, tmp1);
 
  121   xk1 = _mm_add_ps(xk1, tmp1);
 
  122   xk1 = _mm_mul_ps(xk1, D_inv);
 
  123   xk2 = _mm_mul_ps(xk2, D_inv);
 
  125   yk1 = _mm_mul_ps(ak, y);
 
  126   tmp1 = _mm_mul_ps(hk,x);
 
  127   yk2 = _mm_add_ps(yk1, tmp1);
 
  128   yk1 = _mm_sub_ps(yk1, tmp1);
 
  129   yk1 = _mm_mul_ps(yk1, D_inv);
 
  130   yk2 = _mm_mul_ps(yk2, D_inv);
 
  132   __m128 phi_r_2 = _vec_atan2_ps(yk1, xk1);
 
  134   tmp1 = _mm_cmplt_ps(phi_r_2, 
zero);
 
  135   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  136   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  137   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  138   phi_r_2 = _mm_add_ps(phi_r_2, tmp1);
 
  140   tmp1 = _mm_and_ps(neg, hit_phi);
 
  141   phi_r_2 = _mm_andnot_ps(neg, phi_r_2);
 
  142   phi_r_2 = _mm_xor_ps(tmp1, phi_r_2);
 
  144   __m128 phi_l_2 = _vec_atan2_ps(yk2, xk2);
 
  146   tmp1 = _mm_cmplt_ps(phi_l_2, 
zero);
 
  147   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  148   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  149   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  150   phi_l_2 = _mm_add_ps(phi_l_2, tmp1);
 
  152   tmp1 = _mm_and_ps(neg, hit_phi);
 
  153   phi_l_2 = _mm_andnot_ps(neg, phi_l_2);
 
  154   phi_l_2 = _mm_xor_ps(tmp1, phi_l_2);
 
  157   d = _mm_load_ps(max_d);
 
  158   k = _mm_load_ps(max_k);
 
  160   tmp1 = _mm_mul_ps(y,y);
 
  161   D = _mm_add_ps(D,tmp1);
 
  163   D_inv = _vec_rec_ps(D);
 
  165   ak = _mm_mul_ps(d, 
two);
 
  166   tmp1 = _mm_mul_ps(d,d);
 
  167   tmp1 = _mm_mul_ps(tmp1, k);
 
  168   ak = _mm_add_ps(ak, tmp1);
 
  169   tmp1 = _mm_mul_ps(D,D);
 
  170   tmp1 = _mm_mul_ps(tmp1, k);
 
  171   ak = _mm_add_ps(ak, tmp1);
 
  172   ak = _mm_mul_ps(ak, D_inv);
 
  174   hk = _mm_mul_ps(d,k);
 
  175   hk = _mm_add_ps(hk, 
one);
 
  176   hk = _mm_mul_ps(hk,hk);
 
  177   tmp1 = _mm_mul_ps(ak,ak);
 
  178   hk = _mm_sub_ps(hk, tmp1);
 
  179   neg = _mm_cmple_ps(hk, 
zero);
 
  180   hk = _vec_sqrt_ps(hk);
 
  182   xk1 = _mm_mul_ps(ak, x);
 
  183   tmp1 = _mm_mul_ps(hk,y);
 
  184   xk2 = _mm_sub_ps(xk1, tmp1);
 
  185   xk1 = _mm_add_ps(xk1, tmp1);
 
  186   xk1 = _mm_mul_ps(xk1, D_inv);
 
  187   xk2 = _mm_mul_ps(xk2, D_inv);
 
  189   yk1 = _mm_mul_ps(ak, y);
 
  190   tmp1 = _mm_mul_ps(hk,x);
 
  191   yk2 = _mm_add_ps(yk1, tmp1);
 
  192   yk1 = _mm_sub_ps(yk1, tmp1);
 
  193   yk1 = _mm_mul_ps(yk1, D_inv);
 
  194   yk2 = _mm_mul_ps(yk2, D_inv);
 
  196   __m128 phi_r_3 = _vec_atan2_ps(yk1, xk1);
 
  198   tmp1 = _mm_cmplt_ps(phi_r_3, 
zero);
 
  199   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  200   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  201   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  202   phi_r_3 = _mm_add_ps(phi_r_3, tmp1);
 
  204   tmp1 = _mm_and_ps(neg, hit_phi);
 
  205   phi_r_3 = _mm_andnot_ps(neg, phi_r_3);
 
  206   phi_r_3 = _mm_xor_ps(tmp1, phi_r_3);
 
  208   __m128 phi_l_3 = _vec_atan2_ps(yk2, xk2);
 
  210   tmp1 = _mm_cmplt_ps(phi_l_3, 
zero);
 
  211   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  212   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  213   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  214   phi_l_3 = _mm_add_ps(phi_l_3, tmp1);
 
  216   tmp1 = _mm_and_ps(neg, hit_phi);
 
  217   phi_l_3 = _mm_andnot_ps(neg, phi_l_3);
 
  218   phi_l_3 = _mm_xor_ps(tmp1, phi_l_3);
 
  221   d = _mm_load_ps(max_d);
 
  222   k = _mm_load_ps(max_k);
 
  224   tmp1 = _mm_mul_ps(y,y);
 
  225   D = _mm_add_ps(D,tmp1);
 
  227   D_inv = _vec_rec_ps(D);
 
  229   ak = _mm_mul_ps(d, 
two);
 
  230   tmp1 = _mm_mul_ps(d,d);
 
  231   tmp1 = _mm_mul_ps(tmp1, k);
 
  232   ak = _mm_add_ps(ak, tmp1);
 
  233   tmp1 = _mm_mul_ps(D,D);
 
  234   tmp1 = _mm_mul_ps(tmp1, k);
 
  235   ak = _mm_add_ps(ak, tmp1);
 
  236   ak = _mm_mul_ps(ak, D_inv);
 
  238   hk = _mm_mul_ps(d,k);
 
  239   hk = _mm_add_ps(hk, 
one);
 
  240   hk = _mm_mul_ps(hk,hk);
 
  241   tmp1 = _mm_mul_ps(ak,ak);
 
  242   hk = _mm_sub_ps(hk, tmp1);
 
  243   neg = _mm_cmple_ps(hk, 
zero);
 
  244   hk = _vec_sqrt_ps(hk);
 
  246   xk1 = _mm_mul_ps(ak, x);
 
  247   tmp1 = _mm_mul_ps(hk,y);
 
  248   xk2 = _mm_sub_ps(xk1, tmp1);
 
  249   xk1 = _mm_add_ps(xk1, tmp1);
 
  250   xk1 = _mm_mul_ps(xk1, D_inv);
 
  251   xk2 = _mm_mul_ps(xk2, D_inv);
 
  253   yk1 = _mm_mul_ps(ak, y);
 
  254   tmp1 = _mm_mul_ps(hk,x);
 
  255   yk2 = _mm_add_ps(yk1, tmp1);
 
  256   yk1 = _mm_sub_ps(yk1, tmp1);
 
  257   yk1 = _mm_mul_ps(yk1, D_inv);
 
  258   yk2 = _mm_mul_ps(yk2, D_inv);
 
  260   __m128 phi_r_4 = _vec_atan2_ps(yk1, xk1);
 
  262   tmp1 = _mm_cmplt_ps(phi_r_4, 
zero);
 
  263   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  264   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  265   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  266   phi_r_4 = _mm_add_ps(phi_r_4, tmp1);
 
  268   tmp1 = _mm_and_ps(neg, hit_phi);
 
  269   phi_r_4 = _mm_andnot_ps(neg, phi_r_4);
 
  270   phi_r_4 = _mm_xor_ps(tmp1, phi_r_4);
 
  272   __m128 phi_l_4 = _vec_atan2_ps(yk2, xk2);
 
  274   tmp1 = _mm_cmplt_ps(phi_l_4, 
zero);
 
  275   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  276   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  277   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  278   phi_l_4 = _mm_add_ps(phi_l_4, tmp1);
 
  280   tmp1 = _mm_and_ps(neg, hit_phi);
 
  281   phi_l_4 = _mm_andnot_ps(neg, phi_l_4);
 
  282   phi_l_4 = _mm_xor_ps(tmp1, phi_l_4);
 
  289   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  291   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  293   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  297   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  299   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  301   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  303   tmp1 = _mm_and_ps(tmp1, tmp2);
 
  307   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  308   tmp3 = _mm_andnot_ps(tmp1, 
zero);
 
  309   tmp2 = _mm_xor_ps(tmp2, tmp3);
 
  312   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  313   __m128 
tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  314   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  315   phi_r_1 = _mm_sub_ps(phi_r_1, tmp3);
 
  318   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  319   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  320   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  321   phi_r_2 = _mm_sub_ps(phi_r_2, tmp3);
 
  324   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  325   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  326   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  327   phi_r_3 = _mm_sub_ps(phi_r_3, tmp3);
 
  330   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  331   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  332   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  333   phi_r_4 = _mm_sub_ps(phi_r_4, tmp3);
 
  337   __m128 phi_r_min = phi_r_1;
 
  338   tmp2 = _mm_cmplt_ps(phi_r_2, phi_r_min);
 
  339   tmp3 = _mm_and_ps(tmp2, phi_r_2);
 
  340   phi_r_min = _mm_andnot_ps(tmp2, phi_r_min);
 
  341   phi_r_min = _mm_xor_ps(phi_r_min, tmp3);
 
  342   tmp2 = _mm_cmplt_ps(phi_r_3, phi_r_min);
 
  343   tmp3 = _mm_and_ps(tmp2, phi_r_3);
 
  344   phi_r_min = _mm_andnot_ps(tmp2, phi_r_min);
 
  345   phi_r_min = _mm_xor_ps(phi_r_min, tmp3);
 
  346   tmp2 = _mm_cmplt_ps(phi_r_4, phi_r_min);
 
  347   tmp3 = _mm_and_ps(tmp2, phi_r_4);
 
  348   phi_r_min = _mm_andnot_ps(tmp2, phi_r_min);
 
  349   phi_r_min = _mm_xor_ps(phi_r_min, tmp3);
 
  352   __m128 phi_r_max = phi_r_1;
 
  353   tmp2 = _mm_cmpgt_ps(phi_r_2, phi_r_max);
 
  354   tmp3 = _mm_and_ps(tmp2, phi_r_2);
 
  355   phi_r_max = _mm_andnot_ps(tmp2, phi_r_max);
 
  356   phi_r_max = _mm_xor_ps(phi_r_max, tmp3);
 
  357   tmp2 = _mm_cmpgt_ps(phi_r_3, phi_r_max);
 
  358   tmp3 = _mm_and_ps(tmp2, phi_r_3);
 
  359   phi_r_max = _mm_andnot_ps(tmp2, phi_r_max);
 
  360   phi_r_max = _mm_xor_ps(phi_r_max, tmp3);
 
  361   tmp2 = _mm_cmpgt_ps(phi_r_4, phi_r_max);
 
  362   tmp3 = _mm_and_ps(tmp2, phi_r_4);
 
  363   phi_r_max = _mm_andnot_ps(tmp2, phi_r_max);
 
  364   phi_r_max = _mm_xor_ps(phi_r_max, tmp3);
 
  366   _mm_store_ps(min_phi_1, phi_r_min);
 
  367   _mm_store_ps(max_phi_1, phi_r_max);
 
  374   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  376   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  378   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  382   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  384   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  386   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  388   tmp1 = _mm_and_ps(tmp1, tmp2);
 
  392   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  393   tmp3 = _mm_andnot_ps(tmp1, 
zero);
 
  394   tmp2 = _mm_xor_ps(tmp2, tmp3);
 
  397   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  398   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  399   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  400   phi_l_1 = _mm_sub_ps(phi_l_1, tmp3);
 
  403   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  404   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  405   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  406   phi_l_2 = _mm_sub_ps(phi_l_2, tmp3);
 
  409   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  410   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  411   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  412   phi_l_3 = _mm_sub_ps(phi_l_3, tmp3);
 
  415   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  416   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  417   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  418   phi_l_4 = _mm_sub_ps(phi_l_4, tmp3);
 
  422   __m128 phi_l_min = phi_l_1;
 
  423   tmp2 = _mm_cmplt_ps(phi_l_2, phi_l_min);
 
  424   tmp3 = _mm_and_ps(tmp2, phi_l_2);
 
  425   phi_l_min = _mm_andnot_ps(tmp2, phi_l_min);
 
  426   phi_l_min = _mm_xor_ps(phi_l_min, tmp3);
 
  427   tmp2 = _mm_cmplt_ps(phi_l_3, phi_l_min);
 
  428   tmp3 = _mm_and_ps(tmp2, phi_l_3);
 
  429   phi_l_min = _mm_andnot_ps(tmp2, phi_l_min);
 
  430   phi_l_min = _mm_xor_ps(phi_l_min, tmp3);
 
  431   tmp2 = _mm_cmplt_ps(phi_l_4, phi_l_min);
 
  432   tmp3 = _mm_and_ps(tmp2, phi_l_4);
 
  433   phi_l_min = _mm_andnot_ps(tmp2, phi_l_min);
 
  434   phi_l_min = _mm_xor_ps(phi_l_min, tmp3);
 
  437   __m128 phi_l_max = phi_l_1;
 
  438   tmp2 = _mm_cmpgt_ps(phi_l_2, phi_l_max);
 
  439   tmp3 = _mm_and_ps(tmp2, phi_l_2);
 
  440   phi_l_max = _mm_andnot_ps(tmp2, phi_l_max);
 
  441   phi_l_max = _mm_xor_ps(phi_l_max, tmp3);
 
  442   tmp2 = _mm_cmpgt_ps(phi_l_3, phi_l_max);
 
  443   tmp3 = _mm_and_ps(tmp2, phi_l_3);
 
  444   phi_l_max = _mm_andnot_ps(tmp2, phi_l_max);
 
  445   phi_l_max = _mm_xor_ps(phi_l_max, tmp3);
 
  446   tmp2 = _mm_cmpgt_ps(phi_l_4, phi_l_max);
 
  447   tmp3 = _mm_and_ps(tmp2, phi_l_4);
 
  448   phi_l_max = _mm_andnot_ps(tmp2, phi_l_max);
 
  449   phi_l_max = _mm_xor_ps(phi_l_max, tmp3);
 
  451   _mm_store_ps(min_phi_2, phi_l_min);
 
  452   _mm_store_ps(max_phi_2, phi_l_max);
 
  459   const __m128i MASK = _mm_set1_epi32(0xffffffff);
 
  461   __m128  f = _mm_xor_ps(a,b);
 
  462   __m128i i = _mm_castps_si128(f);
 
  464   i = _mm_srai_epi32(i,31);
 
  465   i = _mm_xor_si128(i,MASK);
 
  467   f = _mm_castsi128_ps(i);
 
  473 void HelixHough::phiRange_sse(
float* hit_x, 
float* hit_y, 
float* min_d, 
float* max_d, 
float* min_k, 
float* max_k, 
float* min_phi, 
float* max_phi, 
float hel, __m128& phi_3_out, __m128& phi_4_out)
 
  475   __m128 helicity_vec = _mm_load1_ps(&(hel));
 
  477   __m128 
x = _mm_load_ps(hit_x);
 
  478   __m128 
y = _mm_load_ps(hit_y);
 
  480   __m128 d_min = _mm_load_ps(min_d);
 
  481   __m128 d_max = _mm_load_ps(max_d);
 
  482   __m128 k_min = _mm_load_ps(min_k);
 
  483   __m128 k_max = _mm_load_ps(max_k);
 
  485   __m128 hit_phi = _vec_atan2_ps(y,x);
 
  487   __m128 
tmp1 = _mm_cmplt_ps(hit_phi, 
zero);
 
  489   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  490   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  491   hit_phi = _mm_add_ps(hit_phi, tmp1);
 
  496   __m128 D = _mm_mul_ps(x,x);
 
  497   tmp1 = _mm_mul_ps(y,y);
 
  498   D = _mm_add_ps(D,tmp1);
 
  500   __m128 D_inv = _vec_rec_ps(D);
 
  502   ak = _mm_mul_ps(d, 
two);
 
  503   tmp1 = _mm_mul_ps(d,d);
 
  504   tmp1 = _mm_mul_ps(tmp1, k);
 
  505   ak = _mm_add_ps(ak, tmp1);
 
  506   tmp1 = _mm_mul_ps(D,D);
 
  507   tmp1 = _mm_mul_ps(tmp1, k);
 
  508   ak = _mm_add_ps(ak, tmp1);
 
  509   ak = _mm_mul_ps(ak, D_inv);
 
  511   __m128 hk = _mm_mul_ps(d,k);
 
  512   hk = _mm_add_ps(hk, 
one);
 
  513   hk = _mm_mul_ps(hk,hk);
 
  514   tmp1 = _mm_mul_ps(ak,ak);
 
  515   hk = _mm_sub_ps(hk, tmp1);
 
  516   __m128 neg = _mm_cmple_ps(hk, 
zero);
 
  517   hk = _vec_sqrt_ps(hk);
 
  519   __m128 xk1 = _mm_mul_ps(ak, x);
 
  520   tmp1 = _mm_mul_ps(hk,y);
 
  521   __m128 xk2 = _mm_sub_ps(xk1, tmp1);
 
  522   xk1 = _mm_add_ps(xk1, tmp1);
 
  523   xk1 = _mm_mul_ps(xk1, D_inv);
 
  524   xk2 = _mm_mul_ps(xk2, D_inv);
 
  526   __m128 yk1 = _mm_mul_ps(ak, y);
 
  527   tmp1 = _mm_mul_ps(hk,x);
 
  528   __m128 yk2 = _mm_add_ps(yk1, tmp1);
 
  529   yk1 = _mm_sub_ps(yk1, tmp1);
 
  530   yk1 = _mm_mul_ps(yk1, D_inv);
 
  531   yk2 = _mm_mul_ps(yk2, D_inv);
 
  533   __m128 crossproduct = _mm_mul_ps(x, yk1);
 
  534   __m128 crosstemp = _mm_mul_ps(y, xk1);
 
  535   crossproduct = _mm_sub_ps(crossproduct, crosstemp);
 
  536   __m128 correct_helicity = 
compare_sign(crossproduct, helicity_vec);
 
  538   __m128 xk = _mm_and_ps(correct_helicity, xk1);
 
  539   tmp1 = _mm_andnot_ps(correct_helicity, xk2);
 
  540   xk = _mm_xor_ps(xk, tmp1);
 
  541   __m128 yk = _mm_and_ps(correct_helicity, yk1);
 
  542   tmp1 = _mm_andnot_ps(correct_helicity, yk2);
 
  543   yk = _mm_xor_ps(yk, tmp1);
 
  546   __m128 
phi_1 = _vec_atan2_ps(yk, xk);
 
  548   tmp1 = _mm_cmplt_ps(phi_1, 
zero);
 
  549   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  550   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  551   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  552   phi_1 = _mm_add_ps(phi_1, tmp1);
 
  554   tmp1 = _mm_and_ps(neg, hit_phi);
 
  555   phi_1 = _mm_andnot_ps(neg, phi_1);
 
  556   phi_1 = _mm_xor_ps(tmp1, phi_1);
 
  563   ak = _mm_mul_ps(d, 
two);
 
  564   tmp1 = _mm_mul_ps(d,d);
 
  565   tmp1 = _mm_mul_ps(tmp1, k);
 
  566   ak = _mm_add_ps(ak, tmp1);
 
  567   tmp1 = _mm_mul_ps(D,D);
 
  568   tmp1 = _mm_mul_ps(tmp1, k);
 
  569   ak = _mm_add_ps(ak, tmp1);
 
  570   ak = _mm_mul_ps(ak, D_inv);
 
  572   hk = _mm_mul_ps(d,k);
 
  573   hk = _mm_add_ps(hk, 
one);
 
  574   hk = _mm_mul_ps(hk,hk);
 
  575   tmp1 = _mm_mul_ps(ak,ak);
 
  576   hk = _mm_sub_ps(hk, tmp1);
 
  577   neg = _mm_cmple_ps(hk, 
zero);
 
  578   hk = _vec_sqrt_ps(hk);
 
  580   xk1 = _mm_mul_ps(ak, x);
 
  581   tmp1 = _mm_mul_ps(hk,y);
 
  582   xk2 = _mm_sub_ps(xk1, tmp1);
 
  583   xk1 = _mm_add_ps(xk1, tmp1);
 
  584   xk1 = _mm_mul_ps(xk1, D_inv);
 
  585   xk2 = _mm_mul_ps(xk2, D_inv);
 
  587   yk1 = _mm_mul_ps(ak, y);
 
  588   tmp1 = _mm_mul_ps(hk,x);
 
  589   yk2 = _mm_add_ps(yk1, tmp1);
 
  590   yk1 = _mm_sub_ps(yk1, tmp1);
 
  591   yk1 = _mm_mul_ps(yk1, D_inv);
 
  592   yk2 = _mm_mul_ps(yk2, D_inv);
 
  594   xk = _mm_and_ps(correct_helicity, xk1);
 
  595   tmp1 = _mm_andnot_ps(correct_helicity, xk2);
 
  596   xk = _mm_xor_ps(xk, tmp1);
 
  597   yk = _mm_and_ps(correct_helicity, yk1);
 
  598   tmp1 = _mm_andnot_ps(correct_helicity, yk2);
 
  599   yk = _mm_xor_ps(yk, tmp1);
 
  601   __m128 
phi_2 = _vec_atan2_ps(yk, xk);
 
  603   tmp1 = _mm_cmplt_ps(phi_2, 
zero);
 
  604   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  605   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  606   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  607   phi_2 = _mm_add_ps(phi_2, tmp1);
 
  609   tmp1 = _mm_and_ps(neg, hit_phi);
 
  610   phi_2 = _mm_andnot_ps(neg, phi_2);
 
  611   phi_2 = _mm_xor_ps(tmp1, phi_2);
 
  619   ak = _mm_mul_ps(d, 
two);
 
  620   tmp1 = _mm_mul_ps(d,d);
 
  621   tmp1 = _mm_mul_ps(tmp1, k);
 
  622   ak = _mm_add_ps(ak, tmp1);
 
  623   tmp1 = _mm_mul_ps(D,D);
 
  624   tmp1 = _mm_mul_ps(tmp1, k);
 
  625   ak = _mm_add_ps(ak, tmp1);
 
  626   ak = _mm_mul_ps(ak, D_inv);
 
  628   hk = _mm_mul_ps(d,k);
 
  629   hk = _mm_add_ps(hk, 
one);
 
  630   hk = _mm_mul_ps(hk,hk);
 
  631   tmp1 = _mm_mul_ps(ak,ak);
 
  632   hk = _mm_sub_ps(hk, tmp1);
 
  633   neg = _mm_cmple_ps(hk, 
zero);
 
  634   hk = _vec_sqrt_ps(hk);
 
  636   xk1 = _mm_mul_ps(ak, x);
 
  637   tmp1 = _mm_mul_ps(hk,y);
 
  638   xk2 = _mm_sub_ps(xk1, tmp1);
 
  639   xk1 = _mm_add_ps(xk1, tmp1);
 
  640   xk1 = _mm_mul_ps(xk1, D_inv);
 
  641   xk2 = _mm_mul_ps(xk2, D_inv);
 
  643   yk1 = _mm_mul_ps(ak, y);
 
  644   tmp1 = _mm_mul_ps(hk,x);
 
  645   yk2 = _mm_add_ps(yk1, tmp1);
 
  646   yk1 = _mm_sub_ps(yk1, tmp1);
 
  647   yk1 = _mm_mul_ps(yk1, D_inv);
 
  648   yk2 = _mm_mul_ps(yk2, D_inv);
 
  650   xk = _mm_and_ps(correct_helicity, xk1);
 
  651   tmp1 = _mm_andnot_ps(correct_helicity, xk2);
 
  652   xk = _mm_xor_ps(xk, tmp1);
 
  653   yk = _mm_and_ps(correct_helicity, yk1);
 
  654   tmp1 = _mm_andnot_ps(correct_helicity, yk2);
 
  655   yk = _mm_xor_ps(yk, tmp1);
 
  657   __m128 phi_3 = _vec_atan2_ps(yk, xk);
 
  659   tmp1 = _mm_cmplt_ps(phi_3, 
zero);
 
  660   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  661   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  662   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  663   phi_3 = _mm_add_ps(phi_3, tmp1);
 
  665   tmp1 = _mm_and_ps(neg, hit_phi);
 
  666   phi_3 = _mm_andnot_ps(neg, phi_3);
 
  667   phi_3 = _mm_xor_ps(tmp1, phi_3);
 
  674   ak = _mm_mul_ps(d, 
two);
 
  675   tmp1 = _mm_mul_ps(d,d);
 
  676   tmp1 = _mm_mul_ps(tmp1, k);
 
  677   ak = _mm_add_ps(ak, tmp1);
 
  678   tmp1 = _mm_mul_ps(D,D);
 
  679   tmp1 = _mm_mul_ps(tmp1, k);
 
  680   ak = _mm_add_ps(ak, tmp1);
 
  681   ak = _mm_mul_ps(ak, D_inv);
 
  683   hk = _mm_mul_ps(d,k);
 
  684   hk = _mm_add_ps(hk, 
one);
 
  685   hk = _mm_mul_ps(hk,hk);
 
  686   tmp1 = _mm_mul_ps(ak,ak);
 
  687   hk = _mm_sub_ps(hk, tmp1);
 
  688   neg = _mm_cmple_ps(hk, 
zero);
 
  689   hk = _vec_sqrt_ps(hk);
 
  691   xk1 = _mm_mul_ps(ak, x);
 
  692   tmp1 = _mm_mul_ps(hk,y);
 
  693   xk2 = _mm_sub_ps(xk1, tmp1);
 
  694   xk1 = _mm_add_ps(xk1, tmp1);
 
  695   xk1 = _mm_mul_ps(xk1, D_inv);
 
  696   xk2 = _mm_mul_ps(xk2, D_inv);
 
  698   yk1 = _mm_mul_ps(ak, y);
 
  699   tmp1 = _mm_mul_ps(hk,x);
 
  700   yk2 = _mm_add_ps(yk1, tmp1);
 
  701   yk1 = _mm_sub_ps(yk1, tmp1);
 
  702   yk1 = _mm_mul_ps(yk1, D_inv);
 
  703   yk2 = _mm_mul_ps(yk2, D_inv);
 
  705   xk = _mm_and_ps(correct_helicity, xk1);
 
  706   tmp1 = _mm_andnot_ps(correct_helicity, xk2);
 
  707   xk = _mm_xor_ps(xk, tmp1);
 
  708   yk = _mm_and_ps(correct_helicity, yk1);
 
  709   tmp1 = _mm_andnot_ps(correct_helicity, yk2);
 
  710   yk = _mm_xor_ps(yk, tmp1);
 
  712   __m128 phi_4 = _vec_atan2_ps(yk, xk);
 
  714   tmp1 = _mm_cmplt_ps(phi_4, 
zero);
 
  715   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  716   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  717   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  718   phi_4 = _mm_add_ps(phi_4, tmp1);
 
  720   tmp1 = _mm_and_ps(neg, hit_phi);
 
  721   phi_4 = _mm_andnot_ps(neg, phi_4);
 
  722   phi_4 = _mm_xor_ps(tmp1, phi_4);
 
  729   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  731   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  733   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  737   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  739   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  741   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  743   tmp1 = _mm_and_ps(tmp1, tmp2);
 
  747   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  748   tmp3 = _mm_andnot_ps(tmp1, 
zero);
 
  749   tmp2 = _mm_xor_ps(tmp2, tmp3);
 
  752   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  753   __m128 
tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  754   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  755   phi_1 = _mm_sub_ps(phi_1, tmp3);
 
  758   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  759   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  760   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  761   phi_2 = _mm_sub_ps(phi_2, tmp3);
 
  764   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  765   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  766   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  767   phi_3 = _mm_sub_ps(phi_3, tmp3);
 
  770   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  771   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  772   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  773   phi_4 = _mm_sub_ps(phi_4, tmp3);
 
  777   __m128 phi_min = 
phi_1;
 
  778   tmp2 = _mm_cmplt_ps(phi_2, phi_min);
 
  779   tmp3 = _mm_and_ps(tmp2, phi_2);
 
  780   phi_min = _mm_andnot_ps(tmp2, phi_min);
 
  781   phi_min = _mm_xor_ps(phi_min, tmp3);
 
  782   tmp2 = _mm_cmplt_ps(phi_3, phi_min);
 
  783   tmp3 = _mm_and_ps(tmp2, phi_3);
 
  784   phi_min = _mm_andnot_ps(tmp2, phi_min);
 
  785   phi_min = _mm_xor_ps(phi_min, tmp3);
 
  786   tmp2 = _mm_cmplt_ps(phi_4, phi_min);
 
  787   tmp3 = _mm_and_ps(tmp2, phi_4);
 
  788   phi_min = _mm_andnot_ps(tmp2, phi_min);
 
  789   phi_min = _mm_xor_ps(phi_min, tmp3);
 
  792   __m128 phi_max = 
phi_1;
 
  793   tmp2 = _mm_cmpgt_ps(phi_2, phi_max);
 
  794   tmp3 = _mm_and_ps(tmp2, phi_2);
 
  795   phi_max = _mm_andnot_ps(tmp2, phi_max);
 
  796   phi_max = _mm_xor_ps(phi_max, tmp3);
 
  797   tmp2 = _mm_cmpgt_ps(phi_3, phi_max);
 
  798   tmp3 = _mm_and_ps(tmp2, phi_3);
 
  799   phi_max = _mm_andnot_ps(tmp2, phi_max);
 
  800   phi_max = _mm_xor_ps(phi_max, tmp3);
 
  801   tmp2 = _mm_cmpgt_ps(phi_4, phi_max);
 
  802   tmp3 = _mm_and_ps(tmp2, phi_4);
 
  803   phi_max = _mm_andnot_ps(tmp2, phi_max);
 
  804   phi_max = _mm_xor_ps(phi_max, tmp3);
 
  807   _mm_store_ps(min_phi, phi_min);
 
  808   _mm_store_ps(max_phi, phi_max);
 
  812 void HelixHough::phiRange_sse(
float* hit_x, 
float* hit_y, 
float* min_d, 
float* max_d, 
float* max_k, 
float* min_phi, 
float* max_phi, 
float hel, __m128& phi_3, __m128& phi_4, __m128& phi_3_out, __m128& phi_4_out)
 
  814   __m128 helicity_vec = _mm_load1_ps(&(hel));
 
  816   __m128 
x = _mm_load_ps(hit_x);
 
  817   __m128 
y = _mm_load_ps(hit_y);
 
  819   __m128 d_min = _mm_load_ps(min_d);
 
  820   __m128 d_max = _mm_load_ps(max_d);
 
  821   __m128 k_max = _mm_load_ps(max_k);
 
  823   __m128 hit_phi = _vec_atan2_ps(y,x);
 
  825   __m128 
tmp1 = _mm_cmplt_ps(hit_phi, 
zero);
 
  827   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  828   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  829   hit_phi = _mm_add_ps(hit_phi, tmp1);
 
  834   __m128 D = _mm_mul_ps(x,x);
 
  835   tmp1 = _mm_mul_ps(y,y);
 
  836   D = _mm_add_ps(D,tmp1);
 
  838   __m128 D_inv = _vec_rec_ps(D);
 
  840   ak = _mm_mul_ps(d, 
two);
 
  841   tmp1 = _mm_mul_ps(d,d);
 
  842   tmp1 = _mm_mul_ps(tmp1, k);
 
  843   ak = _mm_add_ps(ak, tmp1);
 
  844   tmp1 = _mm_mul_ps(D,D);
 
  845   tmp1 = _mm_mul_ps(tmp1, k);
 
  846   ak = _mm_add_ps(ak, tmp1);
 
  847   ak = _mm_mul_ps(ak, D_inv);
 
  849   __m128 hk = _mm_mul_ps(d,k);
 
  850   hk = _mm_add_ps(hk, 
one);
 
  851   hk = _mm_mul_ps(hk,hk);
 
  852   tmp1 = _mm_mul_ps(ak,ak);
 
  853   hk = _mm_sub_ps(hk, tmp1);
 
  854   __m128 neg = _mm_cmple_ps(hk, 
zero);
 
  855   hk = _vec_sqrt_ps(hk);
 
  857   __m128 xk1 = _mm_mul_ps(ak, x);
 
  858   tmp1 = _mm_mul_ps(hk,y);
 
  859   __m128 xk2 = _mm_sub_ps(xk1, tmp1);
 
  860   xk1 = _mm_add_ps(xk1, tmp1);
 
  861   xk1 = _mm_mul_ps(xk1, D_inv);
 
  862   xk2 = _mm_mul_ps(xk2, D_inv);
 
  864   __m128 yk1 = _mm_mul_ps(ak, y);
 
  865   tmp1 = _mm_mul_ps(hk,x);
 
  866   __m128 yk2 = _mm_add_ps(yk1, tmp1);
 
  867   yk1 = _mm_sub_ps(yk1, tmp1);
 
  868   yk1 = _mm_mul_ps(yk1, D_inv);
 
  869   yk2 = _mm_mul_ps(yk2, D_inv);
 
  871   __m128 crossproduct = _mm_mul_ps(x, yk1);
 
  872   __m128 crosstemp = _mm_mul_ps(y, xk1);
 
  873   crossproduct = _mm_sub_ps(crossproduct, crosstemp);
 
  874   __m128 correct_helicity = 
compare_sign(crossproduct, helicity_vec);
 
  876   __m128 xk = _mm_and_ps(correct_helicity, xk1);
 
  877   tmp1 = _mm_andnot_ps(correct_helicity, xk2);
 
  878   xk = _mm_xor_ps(xk, tmp1);
 
  879   __m128 yk = _mm_and_ps(correct_helicity, yk1);
 
  880   tmp1 = _mm_andnot_ps(correct_helicity, yk2);
 
  881   yk = _mm_xor_ps(yk, tmp1);
 
  884   __m128 
phi_1 = _vec_atan2_ps(yk, xk);
 
  886   tmp1 = _mm_cmplt_ps(phi_1, 
zero);
 
  887   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  888   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  889   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  890   phi_1 = _mm_add_ps(phi_1, tmp1);
 
  892   tmp1 = _mm_and_ps(neg, hit_phi);
 
  893   phi_1 = _mm_andnot_ps(neg, phi_1);
 
  894   phi_1 = _mm_xor_ps(tmp1, phi_1);
 
  901   ak = _mm_mul_ps(d, 
two);
 
  902   tmp1 = _mm_mul_ps(d,d);
 
  903   tmp1 = _mm_mul_ps(tmp1, k);
 
  904   ak = _mm_add_ps(ak, tmp1);
 
  905   tmp1 = _mm_mul_ps(D,D);
 
  906   tmp1 = _mm_mul_ps(tmp1, k);
 
  907   ak = _mm_add_ps(ak, tmp1);
 
  908   ak = _mm_mul_ps(ak, D_inv);
 
  910   hk = _mm_mul_ps(d,k);
 
  911   hk = _mm_add_ps(hk, 
one);
 
  912   hk = _mm_mul_ps(hk,hk);
 
  913   tmp1 = _mm_mul_ps(ak,ak);
 
  914   hk = _mm_sub_ps(hk, tmp1);
 
  915   neg = _mm_cmple_ps(hk, 
zero);
 
  916   hk = _vec_sqrt_ps(hk);
 
  918   xk1 = _mm_mul_ps(ak, x);
 
  919   tmp1 = _mm_mul_ps(hk,y);
 
  920   xk2 = _mm_sub_ps(xk1, tmp1);
 
  921   xk1 = _mm_add_ps(xk1, tmp1);
 
  922   xk1 = _mm_mul_ps(xk1, D_inv);
 
  923   xk2 = _mm_mul_ps(xk2, D_inv);
 
  925   yk1 = _mm_mul_ps(ak, y);
 
  926   tmp1 = _mm_mul_ps(hk,x);
 
  927   yk2 = _mm_add_ps(yk1, tmp1);
 
  928   yk1 = _mm_sub_ps(yk1, tmp1);
 
  929   yk1 = _mm_mul_ps(yk1, D_inv);
 
  930   yk2 = _mm_mul_ps(yk2, D_inv);
 
  932   xk = _mm_and_ps(correct_helicity, xk1);
 
  933   tmp1 = _mm_andnot_ps(correct_helicity, xk2);
 
  934   xk = _mm_xor_ps(xk, tmp1);
 
  935   yk = _mm_and_ps(correct_helicity, yk1);
 
  936   tmp1 = _mm_andnot_ps(correct_helicity, yk2);
 
  937   yk = _mm_xor_ps(yk, tmp1);
 
  939   __m128 
phi_2 = _vec_atan2_ps(yk, xk);
 
  941   tmp1 = _mm_cmplt_ps(phi_2, 
zero);
 
  942   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  943   tmp1 = _mm_andnot_ps(tmp1, 
zero);
 
  944   tmp1 = _mm_xor_ps(tmp1, tmp2);
 
  945   phi_2 = _mm_add_ps(phi_2, tmp1);
 
  947   tmp1 = _mm_and_ps(neg, hit_phi);
 
  948   phi_2 = _mm_andnot_ps(neg, phi_2);
 
  949   phi_2 = _mm_xor_ps(tmp1, phi_2);
 
  955   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  957   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  959   tmp1 = _mm_or_ps(tmp1, tmp2);
 
  963   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  965   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  967   tmp2 = _mm_or_ps(tmp2, tmp3);
 
  969   tmp1 = _mm_and_ps(tmp1, tmp2);
 
  973   tmp2 = _mm_and_ps(tmp1, 
twopi);
 
  974   tmp3 = _mm_andnot_ps(tmp1, 
zero);
 
  975   tmp2 = _mm_xor_ps(tmp2, tmp3);
 
  978   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  979   __m128 
tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  980   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  981   phi_1 = _mm_sub_ps(phi_1, tmp3);
 
  984   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  985   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  986   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  987   phi_2 = _mm_sub_ps(phi_2, tmp3);
 
  990   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  991   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  992   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  993   phi_3 = _mm_sub_ps(phi_3, tmp3);
 
  996   tmp3 = _mm_and_ps(tmp4, tmp2);
 
  997   tmp5 = _mm_andnot_ps(tmp4, 
zero);
 
  998   tmp3 = _mm_xor_ps(tmp3, tmp5);
 
  999   phi_4 = _mm_sub_ps(phi_4, tmp3);
 
 1003   __m128 phi_min = 
phi_1;
 
 1004   tmp2 = _mm_cmplt_ps(phi_2, phi_min);
 
 1005   tmp3 = _mm_and_ps(tmp2, phi_2);
 
 1006   phi_min = _mm_andnot_ps(tmp2, phi_min);
 
 1007   phi_min = _mm_xor_ps(phi_min, tmp3);
 
 1008   tmp2 = _mm_cmplt_ps(phi_3, phi_min);
 
 1009   tmp3 = _mm_and_ps(tmp2, phi_3);
 
 1010   phi_min = _mm_andnot_ps(tmp2, phi_min);
 
 1011   phi_min = _mm_xor_ps(phi_min, tmp3);
 
 1012   tmp2 = _mm_cmplt_ps(phi_4, phi_min);
 
 1013   tmp3 = _mm_and_ps(tmp2, phi_4);
 
 1014   phi_min = _mm_andnot_ps(tmp2, phi_min);
 
 1015   phi_min = _mm_xor_ps(phi_min, tmp3);
 
 1018   __m128 phi_max = 
phi_1;
 
 1019   tmp2 = _mm_cmpgt_ps(phi_2, phi_max);
 
 1020   tmp3 = _mm_and_ps(tmp2, phi_2);
 
 1021   phi_max = _mm_andnot_ps(tmp2, phi_max);
 
 1022   phi_max = _mm_xor_ps(phi_max, tmp3);
 
 1023   tmp2 = _mm_cmpgt_ps(phi_3, phi_max);
 
 1024   tmp3 = _mm_and_ps(tmp2, phi_3);
 
 1025   phi_max = _mm_andnot_ps(tmp2, phi_max);
 
 1026   phi_max = _mm_xor_ps(phi_max, tmp3);
 
 1027   tmp2 = _mm_cmpgt_ps(phi_4, phi_max);
 
 1028   tmp3 = _mm_and_ps(tmp2, phi_4);
 
 1029   phi_max = _mm_andnot_ps(tmp2, phi_max);
 
 1030   phi_max = _mm_xor_ps(phi_max, tmp3);
 
 1033   _mm_store_ps(min_phi, phi_min);
 
 1034   _mm_store_ps(max_phi, phi_max);
 
 // Computes, for each SIMD lane of two 4-hit bundles (hit_x/hit_y and
 // hit_x_2/hit_y_2), candidate track-phi values for combinations of the helix
 // parameter ranges [min_d,max_d] x [min_k,max_k], reduces them to a per-lane
 // [min_phi, max_phi] interval, and stores the intervals for both bundles.
 // Two of the intermediate phi candidates are also exported through the
 // reference parameters phi_3_out/phi_4_out (and the *_2 twins).
 //
 // NOTE(review): this listing is a doxygen/HTML extraction. The embedded
 // numbers (1038, 1040, ...) are the original file's line numbers, long
 // statements are wrapped across lines, and the extractor dropped the braces
 // plus several interior lines (originals 1057-1060, 1123-1127, 1178-1183,
 // 1233-1238, 1288-1293, 1299-1301, 1315-1316, ...) — presumably the
 // assignments that select the current (d, k) corner of the parameter box and
 // the seam-comparison masks; recover them from the original source before
 // compiling. Claims below about the dropped code are hedged accordingly.
 1038 void HelixHough::phiRange_sse(
float* hit_x, 
float* hit_y, 
float* min_d, 
float* max_d, 
float* min_k, 
float* max_k, 
float* min_phi, 
float* max_phi, 
float* min_phi_2, 
float* max_phi_2, 
float hel, __m128& phi_3_out, __m128& phi_4_out, 
float* hit_x_2, 
float* hit_y_2, __m128& phi_3_out_2, __m128& phi_4_out_2)
 
 // Broadcast the scalar helicity (+1/-1) into all four lanes.
 1040   __m128 helicity_vec = _mm_load1_ps(&(hel));
 
 // Load the two independent 4-hit bundles; throughout this function every
 // statement is duplicated column-wise, the right column operating on the
 // "_2" bundle.
 1042   __m128 
x = _mm_load_ps(hit_x);                                    __m128 x_2 = _mm_load_ps(hit_x_2);
 
 1043   __m128 
y = _mm_load_ps(hit_y);                                    __m128 y_2 = _mm_load_ps(hit_y_2);
 
 1045   __m128 d_min = _mm_load_ps(min_d);
 
 1046   __m128 d_max = _mm_load_ps(max_d);
 
 1047   __m128 k_min = _mm_load_ps(min_k);
 
 1048   __m128 k_max = _mm_load_ps(max_k);
 
 // Azimuth of each hit, then a branchless wrap from atan2's (-pi, pi] into
 // [0, 2*pi): build a mask of negative lanes, AND it with twopi, and add.
 // The and/andnot/xor triple is the mask-select idiom used file-wide.
 1050   __m128 hit_phi = _vec_atan2_ps(y,x);                               __m128 hit_phi_2 = _vec_atan2_ps(y_2,x_2);
 
 1052   __m128 
tmp1 = _mm_cmplt_ps(hit_phi, 
zero);                         __m128 tmp1_2 = _mm_cmplt_ps(hit_phi_2, 
zero);
 
 1053   __m128 
tmp2 = _mm_and_ps(tmp1, 
twopi);                             __m128 tmp2_2 = _mm_and_ps(tmp1_2, 
twopi);
 
 1054   tmp1 = _mm_andnot_ps(tmp1, 
zero);                                  tmp1_2 = _mm_andnot_ps(tmp1_2, 
zero);
 
 1055   tmp1 = _mm_xor_ps(tmp1, tmp2);                                     tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
 
 1056   hit_phi = _mm_add_ps(hit_phi, tmp1);                               hit_phi_2 = _mm_add_ps(hit_phi_2, tmp1_2);
 
 // --- First (d, k) corner ---------------------------------------------------
 // D = distance of the hit from the origin. The lines choosing the current
 // d/k values (originals 1057-1060) were dropped by the extractor;
 // presumably d = d_min, k = k_min here — confirm against the original file.
 1061   __m128 D = _mm_mul_ps(x,x);                                        __m128 D_2 = _mm_mul_ps(x_2,x_2);
 
 1062   tmp1 = _mm_mul_ps(y,y);                                            tmp1_2 = _mm_mul_ps(y_2,y_2);
 
 1063   D = _mm_add_ps(D,tmp1);                                            D_2 = _mm_add_ps(D_2,tmp1_2);
 
 1064   D = _vec_sqrt_ps(D);                                                D_2 = _vec_sqrt_ps(D_2);
 
 1065   __m128 D_inv = _vec_rec_ps(D);                                     __m128 D_inv_2 = _vec_rec_ps(D_2);
 
 // ak = (2*d + d*d*k + D*D*k) / D; the initializer "= d" on line 1066 is
 // dead — it is overwritten immediately on line 1067.
 1066   __m128 ak = 
d;                                                     __m128 ak_2 = 
d;
 
 1067   ak = _mm_mul_ps(d, 
two);                                           ak_2 = _mm_mul_ps(d, 
two);
 
 1068   tmp1 = _mm_mul_ps(d,d);                                            tmp1_2 = _mm_mul_ps(d,d);
 
 1069   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1070   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1071   tmp1 = _mm_mul_ps(D,D);                                            tmp1_2 = _mm_mul_ps(D_2,D_2);
 
 1072   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1073   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1074   ak = _mm_mul_ps(ak, D_inv);                                        ak_2 = _mm_mul_ps(ak_2, D_inv_2);
 
 // hk^2 = (1 + d*k)^2 - ak^2. Lanes where hk^2 <= 0 (mask "neg") have no
 // real solution and fall back to hit_phi further below.
 1076   __m128 hk = _mm_mul_ps(d,k);                                       __m128 hk_2 = _mm_mul_ps(d,k);
 
 1077   hk = _mm_add_ps(hk, 
one);                                          hk_2 = _mm_add_ps(hk_2, 
one);
 
 1078   hk = _mm_mul_ps(hk,hk);                                            hk_2 = _mm_mul_ps(hk_2,hk_2);
 
 1079   tmp1 = _mm_mul_ps(ak,ak);                                          tmp1_2 = _mm_mul_ps(ak_2,ak_2);
 
 1080   hk = _mm_sub_ps(hk, tmp1);                                         hk_2 = _mm_sub_ps(hk_2, tmp1_2);
 
 1081   __m128 neg = _mm_cmple_ps(hk, 
zero);                               __m128 neg_2 = _mm_cmple_ps(hk_2, 
zero);
 
 1082   hk = _vec_sqrt_ps(hk);                                              hk_2 = _vec_sqrt_ps(hk_2);
 
 // Two candidate solutions (xk1,yk1) / (xk2,yk2): ak along the hit
 // direction +/- hk perpendicular to it, both scaled by 1/D.
 1084   __m128 xk1 = _mm_mul_ps(ak, x);                                    __m128 xk1_2 = _mm_mul_ps(ak_2, x_2);
 
 1085   tmp1 = _mm_mul_ps(hk,y);                                           tmp1_2 = _mm_mul_ps(hk_2,y_2);
 
 1086   __m128 xk2 = _mm_sub_ps(xk1, tmp1);                                __m128 xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
 
 1087   xk1 = _mm_add_ps(xk1, tmp1);                                       xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
 
 1088   xk1 = _mm_mul_ps(xk1, D_inv);                                      xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
 
 1089   xk2 = _mm_mul_ps(xk2, D_inv);                                      xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
 
 1091   __m128 yk1 = _mm_mul_ps(ak, y);                                    __m128 yk1_2 = _mm_mul_ps(ak_2, y_2);
 
 1092   tmp1 = _mm_mul_ps(hk,x);                                           tmp1_2 = _mm_mul_ps(hk_2,x_2);
 
 1093   __m128 yk2 = _mm_add_ps(yk1, tmp1);                                __m128 yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
 
 1094   yk1 = _mm_sub_ps(yk1, tmp1);                                       yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
 
 1095   yk1 = _mm_mul_ps(yk1, D_inv);                                      yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
 
 1096   yk2 = _mm_mul_ps(yk2, D_inv);                                      yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
 
 // Pick the solution whose winding matches the requested helicity: the z
 // component of hit x candidate-1 is sign-compared against helicity_vec.
 // NOTE(review): compare_sign presumably yields an all-ones lane mask when
 // the signs agree — confirm against its definition elsewhere in the file.
 1098   __m128 crossproduct = _mm_mul_ps(x, yk1);                               __m128 crossproduct_2 = _mm_mul_ps(x_2, yk1_2);
 
 1099   __m128 crosstemp = _mm_mul_ps(y, xk1);                                  __m128 crosstemp_2 = _mm_mul_ps(y_2, xk1_2);
 
 1100   crossproduct = _mm_sub_ps(crossproduct, crosstemp);                     crossproduct_2 = _mm_sub_ps(crossproduct_2, crosstemp_2);
 
 1101   __m128 correct_helicity = 
compare_sign(crossproduct, helicity_vec);     __m128 correct_helicity_2 = 
compare_sign(crossproduct_2, helicity_vec);
 
 1103   __m128 xk = _mm_and_ps(correct_helicity, xk1);                     __m128 xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);  
 
 1104   tmp1 = _mm_andnot_ps(correct_helicity, xk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
 
 1105   xk = _mm_xor_ps(xk, tmp1);                                         xk_2 = _mm_xor_ps(xk_2, tmp1_2);
 
 1106   __m128 yk = _mm_and_ps(correct_helicity, yk1);                     __m128 yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
 
 1107   tmp1 = _mm_andnot_ps(correct_helicity, yk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
 
 1108   yk = _mm_xor_ps(yk, tmp1);                                         yk_2 = _mm_xor_ps(yk_2, tmp1_2);
 
 // phi_1: azimuth of the selected solution, wrapped into [0, 2*pi); lanes
 // flagged by "neg" are replaced by hit_phi (mask-select).
 1111   __m128 
phi_1 = _vec_atan2_ps(yk, xk);                              __m128 phi_1_2 = _vec_atan2_ps(yk_2, xk_2);
 
 1113   tmp1 = _mm_cmplt_ps(phi_1, 
zero);                                  tmp1_2 = _mm_cmplt_ps(phi_1_2, 
zero);
 
 1114   tmp2 = _mm_and_ps(tmp1, 
twopi);                                    tmp2_2 = _mm_and_ps(tmp1_2, 
twopi);
 
 1115   tmp1 = _mm_andnot_ps(tmp1, 
zero);                                  tmp1_2 = _mm_andnot_ps(tmp1_2, 
zero);
 
 1116   tmp1 = _mm_xor_ps(tmp1, tmp2);                                     tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
 
 1117   phi_1 = _mm_add_ps(phi_1, tmp1);                                   phi_1_2 = _mm_add_ps(phi_1_2, tmp1_2);
 
 1119   tmp1 = _mm_and_ps(neg, hit_phi);                                   tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
 
 1120   phi_1 = _mm_andnot_ps(neg, phi_1);                                 phi_1_2 = _mm_andnot_ps(neg_2, phi_1_2);
 
 1121   phi_1 = _mm_xor_ps(tmp1, phi_1);                                   phi_1_2 = _mm_xor_ps(tmp1_2, phi_1_2);
 
 1122   phi_3_out = 
phi_1;                                                 phi_3_out_2 = phi_1_2;
 
 // --- Second (d, k) corner --------------------------------------------------
 // Identical pipeline; the d/k switching assignments (originals 1123-1127)
 // were lost in extraction. Result is phi_2, also exported as phi_4_out.
 1128   ak = _mm_mul_ps(d, 
two);                                           ak_2 = _mm_mul_ps(d, 
two);
 
 1129   tmp1 = _mm_mul_ps(d,d);                                            tmp1_2 = _mm_mul_ps(d,d);
 
 1130   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1131   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1132   tmp1 = _mm_mul_ps(D,D);                                            tmp1_2 = _mm_mul_ps(D_2,D_2);
 
 1133   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1134   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1135   ak = _mm_mul_ps(ak, D_inv);                                        ak_2 = _mm_mul_ps(ak_2, D_inv_2);
 
 1137   hk = _mm_mul_ps(d,k);                                              hk_2 = _mm_mul_ps(d,k);
 
 1138   hk = _mm_add_ps(hk, 
one);                                          hk_2 = _mm_add_ps(hk_2, 
one);
 
 1139   hk = _mm_mul_ps(hk,hk);                                            hk_2 = _mm_mul_ps(hk_2,hk_2);
 
 1140   tmp1 = _mm_mul_ps(ak,ak);                                          tmp1_2 = _mm_mul_ps(ak_2,ak_2);
 
 1141   hk = _mm_sub_ps(hk, tmp1);                                         hk_2 = _mm_sub_ps(hk_2, tmp1_2);
 
 1142   neg = _mm_cmple_ps(hk, 
zero);                                      neg_2 = _mm_cmple_ps(hk_2, 
zero);
 
 1143   hk = _vec_sqrt_ps(hk);                                              hk_2 = _vec_sqrt_ps(hk_2);
 
 1145   xk1 = _mm_mul_ps(ak, x);                                           xk1_2 = _mm_mul_ps(ak_2, x_2);
 
 1146   tmp1 = _mm_mul_ps(hk,y);                                           tmp1_2 = _mm_mul_ps(hk_2,y_2);
 
 1147   xk2 = _mm_sub_ps(xk1, tmp1);                                       xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
 
 1148   xk1 = _mm_add_ps(xk1, tmp1);                                       xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
 
 1149   xk1 = _mm_mul_ps(xk1, D_inv);                                      xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
 
 1150   xk2 = _mm_mul_ps(xk2, D_inv);                                      xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
 
 1152   yk1 = _mm_mul_ps(ak, y);                                           yk1_2 = _mm_mul_ps(ak_2, y_2);
 
 1153   tmp1 = _mm_mul_ps(hk,x);                                           tmp1_2 = _mm_mul_ps(hk_2, x_2);
 
 1154   yk2 = _mm_add_ps(yk1, tmp1);                                       yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
 
 1155   yk1 = _mm_sub_ps(yk1, tmp1);                                       yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
 
 1156   yk1 = _mm_mul_ps(yk1, D_inv);                                      yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
 
 1157   yk2 = _mm_mul_ps(yk2, D_inv);                                      yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
 
 // Helicity masks are reused from the first corner (not recomputed).
 1159   xk = _mm_and_ps(correct_helicity, xk1);                            xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
 
 1160   tmp1 = _mm_andnot_ps(correct_helicity, xk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
 
 1161   xk = _mm_xor_ps(xk, tmp1);                                         xk_2 = _mm_xor_ps(xk_2, tmp1_2);
 
 1162   yk = _mm_and_ps(correct_helicity, yk1);                            yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
 
 1163   tmp1 = _mm_andnot_ps(correct_helicity, yk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
 
 1164   yk = _mm_xor_ps(yk, tmp1);                                         yk_2 = _mm_xor_ps(yk_2, tmp1_2);
 
 1166   __m128 
phi_2 = _vec_atan2_ps(yk, xk);                              __m128 phi_2_2 = _vec_atan2_ps(yk_2, xk_2);
 
 1168   tmp1 = _mm_cmplt_ps(phi_2, 
zero);                                  tmp1_2 = _mm_cmplt_ps(phi_2_2, 
zero);
 
 1169   tmp2 = _mm_and_ps(tmp1, 
twopi);                                    tmp2_2 = _mm_and_ps(tmp1_2, 
twopi);
 
 1170   tmp1 = _mm_andnot_ps(tmp1, 
zero);                                  tmp1_2 = _mm_andnot_ps(tmp1_2, 
zero);
 
 1171   tmp1 = _mm_xor_ps(tmp1, tmp2);                                     tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
 
 1172   phi_2 = _mm_add_ps(phi_2, tmp1);                                   phi_2_2 = _mm_add_ps(phi_2_2, tmp1_2);
 
 1174   tmp1 = _mm_and_ps(neg, hit_phi);                                   tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
 
 1175   phi_2 = _mm_andnot_ps(neg, phi_2);                                 phi_2_2 = _mm_andnot_ps(neg_2, phi_2_2);
 
 1176   phi_2 = _mm_xor_ps(tmp1, phi_2);                                   phi_2_2 = _mm_xor_ps(tmp1_2, phi_2_2);
 
 1177   phi_4_out = 
phi_2;                                                 phi_4_out_2 = phi_2_2;
 
 // --- Third (d, k) corner ---------------------------------------------------
 // Same pipeline again (d/k switching, originals 1178-1183, lost). Produces
 // phi_3 (local, not exported).
 1184   ak = _mm_mul_ps(d, 
two);                                           ak_2 = _mm_mul_ps(d, 
two);
 
 1185   tmp1 = _mm_mul_ps(d,d);                                            tmp1_2 = _mm_mul_ps(d,d);
 
 1186   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1187   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1188   tmp1 = _mm_mul_ps(D,D);                                            tmp1_2 = _mm_mul_ps(D_2,D_2);
 
 1189   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1190   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1191   ak = _mm_mul_ps(ak, D_inv);                                        ak_2 = _mm_mul_ps(ak_2, D_inv_2);
 
 1193   hk = _mm_mul_ps(d,k);                                              hk_2 = _mm_mul_ps(d,k);
 
 1194   hk = _mm_add_ps(hk, 
one);                                          hk_2 = _mm_add_ps(hk_2, 
one);
 
 1195   hk = _mm_mul_ps(hk,hk);                                            hk_2 = _mm_mul_ps(hk_2,hk_2);
 
 1196   tmp1 = _mm_mul_ps(ak,ak);                                          tmp1_2 = _mm_mul_ps(ak_2,ak_2);
 
 1197   hk = _mm_sub_ps(hk, tmp1);                                         hk_2 = _mm_sub_ps(hk_2, tmp1_2);
 
 1198   neg = _mm_cmple_ps(hk, 
zero);                                      neg_2 = _mm_cmple_ps(hk_2, 
zero);
 
 1199   hk = _vec_sqrt_ps(hk);                                              hk_2 = _vec_sqrt_ps(hk_2);
 
 1201   xk1 = _mm_mul_ps(ak, x);                                           xk1_2 = _mm_mul_ps(ak_2, x_2);
 
 1202   tmp1 = _mm_mul_ps(hk, y);                                           tmp1_2 = _mm_mul_ps(hk_2, y_2);
 
 1203   xk2 = _mm_sub_ps(xk1, tmp1);                                       xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
 
 1204   xk1 = _mm_add_ps(xk1, tmp1);                                       xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
 
 1205   xk1 = _mm_mul_ps(xk1, D_inv);                                      xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
 
 1206   xk2 = _mm_mul_ps(xk2, D_inv);                                      xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
 
 1208   yk1 = _mm_mul_ps(ak, y);                                           yk1_2 = _mm_mul_ps(ak_2, y_2);
 
 1209   tmp1 = _mm_mul_ps(hk,x);                                           tmp1_2 = _mm_mul_ps(hk_2, x_2);
 
 1210   yk2 = _mm_add_ps(yk1, tmp1);                                       yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
 
 1211   yk1 = _mm_sub_ps(yk1, tmp1);                                       yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
 
 1212   yk1 = _mm_mul_ps(yk1, D_inv);                                      yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
 
 1213   yk2 = _mm_mul_ps(yk2, D_inv);                                      yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
 
 1215   xk = _mm_and_ps(correct_helicity, xk1);                            xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
 
 1216   tmp1 = _mm_andnot_ps(correct_helicity, xk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
 
 1217   xk = _mm_xor_ps(xk, tmp1);                                         xk_2 = _mm_xor_ps(xk_2, tmp1_2);
 
 1218   yk = _mm_and_ps(correct_helicity, yk1);                            yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
 
 1219   tmp1 = _mm_andnot_ps(correct_helicity, yk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
 
 1220   yk = _mm_xor_ps(yk, tmp1);                                         yk_2 = _mm_xor_ps(yk_2, tmp1_2);
 
 1222   __m128 phi_3 = _vec_atan2_ps(yk, xk);                              __m128 phi_3_2 = _vec_atan2_ps(yk_2, xk_2);
 
 1224   tmp1 = _mm_cmplt_ps(phi_3, 
zero);                                  tmp1_2 = _mm_cmplt_ps(phi_3_2, 
zero);
 
 1225   tmp2 = _mm_and_ps(tmp1, 
twopi);                                    tmp2_2 = _mm_and_ps(tmp1_2, 
twopi);
 
 1226   tmp1 = _mm_andnot_ps(tmp1, 
zero);                                  tmp1_2 = _mm_andnot_ps(tmp1_2, 
zero);
 
 1227   tmp1 = _mm_xor_ps(tmp1, tmp2);                                     tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
 
 1228   phi_3 = _mm_add_ps(phi_3, tmp1);                                   phi_3_2 = _mm_add_ps(phi_3_2, tmp1_2);
 
 1230   tmp1 = _mm_and_ps(neg, hit_phi);                                   tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
 
 1231   phi_3 = _mm_andnot_ps(neg, phi_3);                                 phi_3_2 = _mm_andnot_ps(neg_2, phi_3_2);
 
 1232   phi_3 = _mm_xor_ps(tmp1, phi_3);                                   phi_3_2 = _mm_xor_ps(tmp1_2, phi_3_2);
 
 // --- Fourth (d, k) corner --------------------------------------------------
 // Same pipeline (d/k switching, originals 1233-1238, lost). Produces phi_4.
 1239   ak = _mm_mul_ps(d, 
two);                                           ak_2 = _mm_mul_ps(d, 
two);
 
 1240   tmp1 = _mm_mul_ps(d,d);                                            tmp1_2 = _mm_mul_ps(d,d);
 
 1241   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1242   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1243   tmp1 = _mm_mul_ps(D,D);                                            tmp1_2 = _mm_mul_ps(D_2,D_2);
 
 1244   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1245   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1246   ak = _mm_mul_ps(ak, D_inv);                                        ak_2 = _mm_mul_ps(ak_2, D_inv_2);
 
 1248   hk = _mm_mul_ps(d,k);                                              hk_2 = _mm_mul_ps(d,k);
 
 1249   hk = _mm_add_ps(hk, 
one);                                          hk_2 = _mm_add_ps(hk_2, 
one);
 
 1250   hk = _mm_mul_ps(hk,hk);                                            hk_2 = _mm_mul_ps(hk_2,hk_2);
 
 1251   tmp1 = _mm_mul_ps(ak,ak);                                          tmp1_2 = _mm_mul_ps(ak_2,ak_2);
 
 1252   hk = _mm_sub_ps(hk, tmp1);                                         hk_2 = _mm_sub_ps(hk_2, tmp1_2);
 
 1253   neg = _mm_cmple_ps(hk, 
zero);                                      neg_2 = _mm_cmple_ps(hk_2, 
zero);
 
 1254   hk = _vec_sqrt_ps(hk);                                              hk_2 = _vec_sqrt_ps(hk_2);
 
 1256   xk1 = _mm_mul_ps(ak, x);                                           xk1_2 = _mm_mul_ps(ak_2, x_2);
 
 1257   tmp1 = _mm_mul_ps(hk, y);                                           tmp1_2 = _mm_mul_ps(hk_2, y_2);
 
 1258   xk2 = _mm_sub_ps(xk1, tmp1);                                       xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
 
 1259   xk1 = _mm_add_ps(xk1, tmp1);                                       xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
 
 1260   xk1 = _mm_mul_ps(xk1, D_inv);                                      xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
 
 1261   xk2 = _mm_mul_ps(xk2, D_inv);                                      xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
 
 1263   yk1 = _mm_mul_ps(ak, y);                                           yk1_2 = _mm_mul_ps(ak_2, y_2);
 
 1264   tmp1 = _mm_mul_ps(hk,x);                                           tmp1_2 = _mm_mul_ps(hk_2, x_2);
 
 1265   yk2 = _mm_add_ps(yk1, tmp1);                                       yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
 
 1266   yk1 = _mm_sub_ps(yk1, tmp1);                                       yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
 
 1267   yk1 = _mm_mul_ps(yk1, D_inv);                                      yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
 
 1268   yk2 = _mm_mul_ps(yk2, D_inv);                                      yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
 
 1270   xk = _mm_and_ps(correct_helicity, xk1);                            xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
 
 1271   tmp1 = _mm_andnot_ps(correct_helicity, xk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
 
 1272   xk = _mm_xor_ps(xk, tmp1);                                         xk_2 = _mm_xor_ps(xk_2, tmp1_2);
 
 1273   yk = _mm_and_ps(correct_helicity, yk1);                            yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
 
 1274   tmp1 = _mm_andnot_ps(correct_helicity, yk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
 
 1275   yk = _mm_xor_ps(yk, tmp1);                                         yk_2 = _mm_xor_ps(yk_2, tmp1_2);
 
 1277   __m128 phi_4 = _vec_atan2_ps(yk, xk);                              __m128 phi_4_2 = _vec_atan2_ps(yk_2, xk_2);
 
 1279   tmp1 = _mm_cmplt_ps(phi_4, 
zero);                                  tmp1_2 = _mm_cmplt_ps(phi_4_2, 
zero);
 
 1280   tmp2 = _mm_and_ps(tmp1, 
twopi);                                    tmp2_2 = _mm_and_ps(tmp1_2, 
twopi);
 
 1281   tmp1 = _mm_andnot_ps(tmp1, 
zero);                                  tmp1_2 = _mm_andnot_ps(tmp1_2, 
zero);
 
 1282   tmp1 = _mm_xor_ps(tmp1, tmp2);                                     tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
 
 1283   phi_4 = _mm_add_ps(phi_4, tmp1);                                   phi_4_2 = _mm_add_ps(phi_4_2, tmp1_2);
 
 1285   tmp1 = _mm_and_ps(neg, hit_phi);                                   tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
 
 1286   phi_4 = _mm_andnot_ps(neg, phi_4);                                 phi_4_2 = _mm_andnot_ps(neg_2, phi_4_2);
 
 1287   phi_4 = _mm_xor_ps(tmp1, phi_4);                                   phi_4_2 = _mm_xor_ps(tmp1_2, phi_4_2);
 
 // Detect lanes whose four candidates straddle the 0/2*pi seam. The
 // comparison statements feeding tmp1/tmp2/tmp3 (originals 1288-1293,
 // 1299-1301) were dropped by the extractor — presumably cmp of each phi
 // against fixed thresholds near 0 and 2*pi; confirm in the original file.
 1294   tmp1 = _mm_or_ps(tmp1, tmp2);                                      tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
 
 1296   tmp1 = _mm_or_ps(tmp1, tmp2);                                      tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
 
 1298   tmp1 = _mm_or_ps(tmp1, tmp2);                                      tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
 
 1302   tmp2 = _mm_or_ps(tmp2, tmp3);                                      tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
 
 1304   tmp2 = _mm_or_ps(tmp2, tmp3);                                      tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
 
 1306   tmp2 = _mm_or_ps(tmp2, tmp3);                                      tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
 
 1308   tmp1 = _mm_and_ps(tmp1, tmp2);                                     tmp1_2 = _mm_and_ps(tmp1_2, tmp2_2);
 
 // tmp2 = twopi in seam-straddling lanes, 0 elsewhere; below it is
 // subtracted from each candidate that lies above the seam (per-candidate
 // masks tmp4 come from dropped lines, originals 1315-1316 etc.).
 1312   tmp2 = _mm_and_ps(tmp1, 
twopi);                                    tmp2_2 = _mm_and_ps(tmp1_2, 
twopi);
 
 1313   tmp3 = _mm_andnot_ps(tmp1, 
zero);                                  tmp3_2 = _mm_andnot_ps(tmp1_2, 
zero);
 
 1314   tmp2 = _mm_xor_ps(tmp2, tmp3);                                     tmp2_2 = _mm_xor_ps(tmp2_2, tmp3_2);
 
 1317   tmp3 = _mm_and_ps(tmp4, tmp2);                                     tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
 
 1318   __m128 
tmp5 = _mm_andnot_ps(tmp4, 
zero);                           __m128 tmp5_2 = _mm_andnot_ps(tmp4_2, 
zero);
 
 1319   tmp3 = _mm_xor_ps(tmp3, tmp5);                                     tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
 
 1320   phi_1 = _mm_sub_ps(phi_1, tmp3);                                   phi_1_2 = _mm_sub_ps(phi_1_2, tmp3_2);
 
 1323   tmp3 = _mm_and_ps(tmp4, tmp2);                                     tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
 
 1324   tmp5 = _mm_andnot_ps(tmp4, 
zero);                                  tmp5_2 = _mm_andnot_ps(tmp4_2, 
zero);
 
 1325   tmp3 = _mm_xor_ps(tmp3, tmp5);                                     tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
 
 1326   phi_2 = _mm_sub_ps(phi_2, tmp3);                                   phi_2_2 = _mm_sub_ps(phi_2_2, tmp3_2);
 
 1329   tmp3 = _mm_and_ps(tmp4, tmp2);                                     tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
 
 1330   tmp5 = _mm_andnot_ps(tmp4, 
zero);                                  tmp5_2 = _mm_andnot_ps(tmp4_2, 
zero);
 
 1331   tmp3 = _mm_xor_ps(tmp3, tmp5);                                     tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
 
 1332   phi_3 = _mm_sub_ps(phi_3, tmp3);                                   phi_3_2 = _mm_sub_ps(phi_3_2, tmp3_2);
 
 1335   tmp3 = _mm_and_ps(tmp4, tmp2);                                     tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
 
 1336   tmp5 = _mm_andnot_ps(tmp4, 
zero);                                  tmp5_2 = _mm_andnot_ps(tmp4_2, 
zero);
 
 1337   tmp3 = _mm_xor_ps(tmp3, tmp5);                                     tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
 
 1338   phi_4 = _mm_sub_ps(phi_4, tmp3);                                   phi_4_2 = _mm_sub_ps(phi_4_2, tmp3_2);
 
 // Branchless 4-way minimum of phi_1..phi_4 per lane: for each candidate,
 // cmplt builds a mask and the and/andnot/xor triple selects the smaller.
 1342   __m128 phi_min = 
phi_1;                                            __m128 phi_min_2 = phi_1_2;
 
 1343   tmp2 = _mm_cmplt_ps(phi_2, phi_min);                               tmp2_2 = _mm_cmplt_ps(phi_2_2, phi_min_2);
 
 1344   tmp3 = _mm_and_ps(tmp2, phi_2);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
 
 1345   phi_min = _mm_andnot_ps(tmp2, phi_min);                            phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
 
 1346   phi_min = _mm_xor_ps(phi_min, tmp3);                               phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
 
 1347   tmp2 = _mm_cmplt_ps(phi_3, phi_min);                               tmp2_2 = _mm_cmplt_ps(phi_3_2, phi_min_2);
 
 1348   tmp3 = _mm_and_ps(tmp2, phi_3);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
 
 1349   phi_min = _mm_andnot_ps(tmp2, phi_min);                            phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
 
 1350   phi_min = _mm_xor_ps(phi_min, tmp3);                               phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
 
 1351   tmp2 = _mm_cmplt_ps(phi_4, phi_min);                               tmp2_2 = _mm_cmplt_ps(phi_4_2, phi_min_2);
 
 1352   tmp3 = _mm_and_ps(tmp2, phi_4);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
 
 1353   phi_min = _mm_andnot_ps(tmp2, phi_min);                            phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
 
 1354   phi_min = _mm_xor_ps(phi_min, tmp3);                               phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
 
 // Branchless 4-way maximum of phi_1..phi_4 per lane (mirror of the above
 // with cmpgt).
 1357   __m128 phi_max = 
phi_1;                                            __m128 phi_max_2 = phi_1_2;
 
 1358   tmp2 = _mm_cmpgt_ps(phi_2, phi_max);                               tmp2_2 = _mm_cmpgt_ps(phi_2_2, phi_max_2);
 
 1359   tmp3 = _mm_and_ps(tmp2, phi_2);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
 
 1360   phi_max = _mm_andnot_ps(tmp2, phi_max);                            phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
 
 1361   phi_max = _mm_xor_ps(phi_max, tmp3);                               phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
 
 1362   tmp2 = _mm_cmpgt_ps(phi_3, phi_max);                               tmp2_2 = _mm_cmpgt_ps(phi_3_2, phi_max_2);
 
 1363   tmp3 = _mm_and_ps(tmp2, phi_3);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
 
 1364   phi_max = _mm_andnot_ps(tmp2, phi_max);                            phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
 
 1365   phi_max = _mm_xor_ps(phi_max, tmp3);                               phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
 
 1366   tmp2 = _mm_cmpgt_ps(phi_4, phi_max);                               tmp2_2 = _mm_cmpgt_ps(phi_4_2, phi_max_2);
 
 1367   tmp3 = _mm_and_ps(tmp2, phi_4);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
 
 1368   phi_max = _mm_andnot_ps(tmp2, phi_max);                            phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
 
 1369   phi_max = _mm_xor_ps(phi_max, tmp3);                               phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
 
 // Write the per-lane phi intervals for both hit bundles back to the
 // caller's (16-byte-aligned, per _mm_store_ps) output arrays.
 1372   _mm_store_ps(min_phi, phi_min);                                    _mm_store_ps(min_phi_2, phi_min_2);
 
 1373   _mm_store_ps(max_phi, phi_max);                                    _mm_store_ps(max_phi_2, phi_max_2);
 
 
 1377 void HelixHough::phiRange_sse(
float* hit_x, 
float* hit_y, 
float* min_d, 
float* max_d, 
float* , 
float* max_k, 
float* min_phi, 
float* max_phi, 
float* min_phi_2, 
float* max_phi_2, 
float hel, __m128& phi_3, __m128& phi_4, __m128& phi_3_out, __m128& phi_4_out, 
float* hit_x_2, 
float* hit_y_2, __m128& phi_3_2, __m128& phi_4_2, __m128& phi_3_out_2, __m128& phi_4_out_2)
 
 1379   __m128 helicity_vec = _mm_load1_ps(&(hel));
 
 1381   __m128 
x = _mm_load_ps(hit_x);                                    __m128 x_2 = _mm_load_ps(hit_x_2);
 
 1382   __m128 
y = _mm_load_ps(hit_y);                                    __m128 y_2 = _mm_load_ps(hit_y_2);
 
 1384   __m128 d_min = _mm_load_ps(min_d);
 
 1385   __m128 d_max = _mm_load_ps(max_d);
 
 1386   __m128 k_max = _mm_load_ps(max_k);
 
 1388   __m128 hit_phi = _vec_atan2_ps(y,x);                               __m128 hit_phi_2 = _vec_atan2_ps(y_2,x_2);
 
 1390   __m128 
tmp1 = _mm_cmplt_ps(hit_phi, 
zero);                         __m128 tmp1_2 = _mm_cmplt_ps(hit_phi_2, 
zero);
 
 1391   __m128 
tmp2 = _mm_and_ps(tmp1, 
twopi);                             __m128 tmp2_2 = _mm_and_ps(tmp1_2, 
twopi);
 
 1392   tmp1 = _mm_andnot_ps(tmp1, 
zero);                                  tmp1_2 = _mm_andnot_ps(tmp1_2, 
zero);
 
 1393   tmp1 = _mm_xor_ps(tmp1, tmp2);                                     tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
 
 1394   hit_phi = _mm_add_ps(hit_phi, tmp1);                               hit_phi_2 = _mm_add_ps(hit_phi_2, tmp1_2);
 
 1399   __m128 D = _mm_mul_ps(x,x);                                        __m128 D_2 = _mm_mul_ps(x_2,x_2);
 
 1400   tmp1 = _mm_mul_ps(y,y);                                            tmp1_2 = _mm_mul_ps(y_2,y_2);
 
 1401   D = _mm_add_ps(D,tmp1);                                            D_2 = _mm_add_ps(D_2,tmp1_2);
 
 1402   D = _vec_sqrt_ps(D);                                                D_2 = _vec_sqrt_ps(D_2);
 
 1403   __m128 D_inv = _vec_rec_ps(D);                                     __m128 D_inv_2 = _vec_rec_ps(D_2);
 
 1404   __m128 ak = 
d;                                                     __m128 ak_2 = 
d;
 
 1405   ak = _mm_mul_ps(d, 
two);                                           ak_2 = _mm_mul_ps(d, 
two);
 
 1406   tmp1 = _mm_mul_ps(d,d);                                            tmp1_2 = _mm_mul_ps(d,d);
 
 1407   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1408   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1409   tmp1 = _mm_mul_ps(D,D);                                            tmp1_2 = _mm_mul_ps(D_2,D_2);
 
 1410   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1411   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1412   ak = _mm_mul_ps(ak, D_inv);                                        ak_2 = _mm_mul_ps(ak_2, D_inv_2);
 
 1414   __m128 hk = _mm_mul_ps(d,k);                                       __m128 hk_2 = _mm_mul_ps(d,k);
 
 1415   hk = _mm_add_ps(hk, 
one);                                          hk_2 = _mm_add_ps(hk_2, 
one);
 
 1416   hk = _mm_mul_ps(hk,hk);                                            hk_2 = _mm_mul_ps(hk_2,hk_2);
 
 1417   tmp1 = _mm_mul_ps(ak,ak);                                          tmp1_2 = _mm_mul_ps(ak_2,ak_2);
 
 1418   hk = _mm_sub_ps(hk, tmp1);                                         hk_2 = _mm_sub_ps(hk_2, tmp1_2);
 
 1419   __m128 neg = _mm_cmple_ps(hk, 
zero);                               __m128 neg_2 = _mm_cmple_ps(hk_2, 
zero);
 
 1420   hk = _vec_sqrt_ps(hk);                                              hk_2 = _vec_sqrt_ps(hk_2);
 
 1422   __m128 xk1 = _mm_mul_ps(ak, x);                                    __m128 xk1_2 = _mm_mul_ps(ak_2, x_2);
 
 1423   tmp1 = _mm_mul_ps(hk,y);                                           tmp1_2 = _mm_mul_ps(hk_2,y_2);
 
 1424   __m128 xk2 = _mm_sub_ps(xk1, tmp1);                                __m128 xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
 
 1425   xk1 = _mm_add_ps(xk1, tmp1);                                       xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
 
 1426   xk1 = _mm_mul_ps(xk1, D_inv);                                      xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
 
 1427   xk2 = _mm_mul_ps(xk2, D_inv);                                      xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
 
 1429   __m128 yk1 = _mm_mul_ps(ak, y);                                    __m128 yk1_2 = _mm_mul_ps(ak_2, y_2);
 
 1430   tmp1 = _mm_mul_ps(hk,x);                                           tmp1_2 = _mm_mul_ps(hk_2,x_2);
 
 1431   __m128 yk2 = _mm_add_ps(yk1, tmp1);                                __m128 yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
 
 1432   yk1 = _mm_sub_ps(yk1, tmp1);                                       yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
 
 1433   yk1 = _mm_mul_ps(yk1, D_inv);                                      yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
 
 1434   yk2 = _mm_mul_ps(yk2, D_inv);                                      yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
 
 1436   __m128 crossproduct = _mm_mul_ps(x, yk1);                               __m128 crossproduct_2 = _mm_mul_ps(x_2, yk1_2);
 
 1437   __m128 crosstemp = _mm_mul_ps(y, xk1);                                  __m128 crosstemp_2 = _mm_mul_ps(y_2, xk1_2);
 
 1438   crossproduct = _mm_sub_ps(crossproduct, crosstemp);                     crossproduct_2 = _mm_sub_ps(crossproduct_2, crosstemp_2);
 
 1439   __m128 correct_helicity = 
compare_sign(crossproduct, helicity_vec);     __m128 correct_helicity_2 = 
compare_sign(crossproduct_2, helicity_vec);
 
 1441   __m128 xk = _mm_and_ps(correct_helicity, xk1);                     __m128 xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);  
 
 1442   tmp1 = _mm_andnot_ps(correct_helicity, xk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
 
 1443   xk = _mm_xor_ps(xk, tmp1);                                         xk_2 = _mm_xor_ps(xk_2, tmp1_2);
 
 1444   __m128 yk = _mm_and_ps(correct_helicity, yk1);                     __m128 yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
 
 1445   tmp1 = _mm_andnot_ps(correct_helicity, yk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
 
 1446   yk = _mm_xor_ps(yk, tmp1);                                         yk_2 = _mm_xor_ps(yk_2, tmp1_2);
 
 1449   __m128 
phi_1 = _vec_atan2_ps(yk, xk);                              __m128 phi_1_2 = _vec_atan2_ps(yk_2, xk_2);
 
 1451   tmp1 = _mm_cmplt_ps(phi_1, 
zero);                                  tmp1_2 = _mm_cmplt_ps(phi_1_2, 
zero);
 
 1452   tmp2 = _mm_and_ps(tmp1, 
twopi);                                    tmp2_2 = _mm_and_ps(tmp1_2, 
twopi);
 
 1453   tmp1 = _mm_andnot_ps(tmp1, 
zero);                                  tmp1_2 = _mm_andnot_ps(tmp1_2, 
zero);
 
 1454   tmp1 = _mm_xor_ps(tmp1, tmp2);                                     tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
 
 1455   phi_1 = _mm_add_ps(phi_1, tmp1);                                   phi_1_2 = _mm_add_ps(phi_1_2, tmp1_2);
 
 1457   tmp1 = _mm_and_ps(neg, hit_phi);                                   tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
 
 1458   phi_1 = _mm_andnot_ps(neg, phi_1);                                 phi_1_2 = _mm_andnot_ps(neg_2, phi_1_2);
 
 1459   phi_1 = _mm_xor_ps(tmp1, phi_1);                                   phi_1_2 = _mm_xor_ps(tmp1_2, phi_1_2);
 
 1460   phi_3_out = 
phi_1;                                                 phi_3_out_2 = phi_1_2;
 
 1466   ak = _mm_mul_ps(d, 
two);                                           ak_2 = _mm_mul_ps(d, 
two);
 
 1467   tmp1 = _mm_mul_ps(d,d);                                            tmp1_2 = _mm_mul_ps(d,d);
 
 1468   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1469   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1470   tmp1 = _mm_mul_ps(D,D);                                            tmp1_2 = _mm_mul_ps(D_2,D_2);
 
 1471   tmp1 = _mm_mul_ps(tmp1, k);                                        tmp1_2 = _mm_mul_ps(tmp1_2, k);
 
 1472   ak = _mm_add_ps(ak, tmp1);                                         ak_2 = _mm_add_ps(ak_2, tmp1_2);
 
 1473   ak = _mm_mul_ps(ak, D_inv);                                        ak_2 = _mm_mul_ps(ak_2, D_inv_2);
 
 1475   hk = _mm_mul_ps(d,k);                                              hk_2 = _mm_mul_ps(d,k);
 
 1476   hk = _mm_add_ps(hk, 
one);                                          hk_2 = _mm_add_ps(hk_2, 
one);
 
 1477   hk = _mm_mul_ps(hk,hk);                                            hk_2 = _mm_mul_ps(hk_2,hk_2);
 
 1478   tmp1 = _mm_mul_ps(ak,ak);                                          tmp1_2 = _mm_mul_ps(ak_2,ak_2);
 
 1479   hk = _mm_sub_ps(hk, tmp1);                                         hk_2 = _mm_sub_ps(hk_2, tmp1_2);
 
 1480   neg = _mm_cmple_ps(hk, 
zero);                                      neg_2 = _mm_cmple_ps(hk_2, 
zero);
 
 1481   hk = _vec_sqrt_ps(hk);                                              hk_2 = _vec_sqrt_ps(hk_2);
 
 1483   xk1 = _mm_mul_ps(ak, x);                                           xk1_2 = _mm_mul_ps(ak_2, x_2);
 
 1484   tmp1 = _mm_mul_ps(hk, y);                                           tmp1_2 = _mm_mul_ps(hk_2, y_2);
 
 1485   xk2 = _mm_sub_ps(xk1, tmp1);                                       xk2_2 = _mm_sub_ps(xk1_2, tmp1_2);
 
 1486   xk1 = _mm_add_ps(xk1, tmp1);                                       xk1_2 = _mm_add_ps(xk1_2, tmp1_2);
 
 1487   xk1 = _mm_mul_ps(xk1, D_inv);                                      xk1_2 = _mm_mul_ps(xk1_2, D_inv_2);
 
 1488   xk2 = _mm_mul_ps(xk2, D_inv);                                      xk2_2 = _mm_mul_ps(xk2_2, D_inv_2);
 
 1490   yk1 = _mm_mul_ps(ak, y);                                           yk1_2 = _mm_mul_ps(ak_2, y_2);
 
 1491   tmp1 = _mm_mul_ps(hk,x);                                           tmp1_2 = _mm_mul_ps(hk_2, x_2);
 
 1492   yk2 = _mm_add_ps(yk1, tmp1);                                       yk2_2 = _mm_add_ps(yk1_2, tmp1_2);
 
 1493   yk1 = _mm_sub_ps(yk1, tmp1);                                       yk1_2 = _mm_sub_ps(yk1_2, tmp1_2);
 
 1494   yk1 = _mm_mul_ps(yk1, D_inv);                                      yk1_2 = _mm_mul_ps(yk1_2, D_inv_2);
 
 1495   yk2 = _mm_mul_ps(yk2, D_inv);                                      yk2_2 = _mm_mul_ps(yk2_2, D_inv_2);
 
 1497   xk = _mm_and_ps(correct_helicity, xk1);                            xk_2 = _mm_and_ps(correct_helicity_2, xk1_2);
 
 1498   tmp1 = _mm_andnot_ps(correct_helicity, xk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, xk2_2);
 
 1499   xk = _mm_xor_ps(xk, tmp1);                                         xk_2 = _mm_xor_ps(xk_2, tmp1_2);
 
 1500   yk = _mm_and_ps(correct_helicity, yk1);                            yk_2 = _mm_and_ps(correct_helicity_2, yk1_2);
 
 1501   tmp1 = _mm_andnot_ps(correct_helicity, yk2);                       tmp1_2 = _mm_andnot_ps(correct_helicity_2, yk2_2);
 
 1502   yk = _mm_xor_ps(yk, tmp1);                                         yk_2 = _mm_xor_ps(yk_2, tmp1_2);
 
 1504   __m128 
phi_2 = _vec_atan2_ps(yk, xk);                              __m128 phi_2_2 = _vec_atan2_ps(yk_2, xk_2);
 
 1506   tmp1 = _mm_cmplt_ps(phi_2, 
zero);                                  tmp1_2 = _mm_cmplt_ps(phi_2_2, 
zero);
 
 1507   tmp2 = _mm_and_ps(tmp1, 
twopi);                                    tmp2_2 = _mm_and_ps(tmp1_2, 
twopi);
 
 1508   tmp1 = _mm_andnot_ps(tmp1, 
zero);                                  tmp1_2 = _mm_andnot_ps(tmp1_2, 
zero);
 
 1509   tmp1 = _mm_xor_ps(tmp1, tmp2);                                     tmp1_2 = _mm_xor_ps(tmp1_2, tmp2_2);
 
 1510   phi_2 = _mm_add_ps(phi_2, tmp1);                                   phi_2_2 = _mm_add_ps(phi_2_2, tmp1_2);
 
 1512   tmp1 = _mm_and_ps(neg, hit_phi);                                   tmp1_2 = _mm_and_ps(neg_2, hit_phi_2);
 
 1513   phi_2 = _mm_andnot_ps(neg, phi_2);                                 phi_2_2 = _mm_andnot_ps(neg_2, phi_2_2);
 
 1514   phi_2 = _mm_xor_ps(tmp1, phi_2);                                   phi_2_2 = _mm_xor_ps(tmp1_2, phi_2_2);
 
 1515   phi_4_out = 
phi_2;                                                 phi_4_out_2 = phi_2_2;
 
 1523   tmp1 = _mm_or_ps(tmp1, tmp2);                                      tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
 
 1525   tmp1 = _mm_or_ps(tmp1, tmp2);                                      tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
 
 1527   tmp1 = _mm_or_ps(tmp1, tmp2);                                      tmp1_2 = _mm_or_ps(tmp1_2, tmp2_2);
 
 1531   tmp2 = _mm_or_ps(tmp2, tmp3);                                      tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
 
 1533   tmp2 = _mm_or_ps(tmp2, tmp3);                                      tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
 
 1535   tmp2 = _mm_or_ps(tmp2, tmp3);                                      tmp2_2 = _mm_or_ps(tmp2_2, tmp3_2);
 
 1537   tmp1 = _mm_and_ps(tmp1, tmp2);                                     tmp1_2 = _mm_and_ps(tmp1_2, tmp2_2);
 
 1541   tmp2 = _mm_and_ps(tmp1, 
twopi);                                    tmp2_2 = _mm_and_ps(tmp1_2, 
twopi);
 
 1542   tmp3 = _mm_andnot_ps(tmp1, 
zero);                                  tmp3_2 = _mm_andnot_ps(tmp1_2, 
zero);
 
 1543   tmp2 = _mm_xor_ps(tmp2, tmp3);                                     tmp2_2 = _mm_xor_ps(tmp2_2, tmp3_2);
 
 1546   tmp3 = _mm_and_ps(tmp4, tmp2);                                     tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
 
 1547   __m128 
tmp5 = _mm_andnot_ps(tmp4, 
zero);                           __m128 tmp5_2 = _mm_andnot_ps(tmp4_2, 
zero);
 
 1548   tmp3 = _mm_xor_ps(tmp3, tmp5);                                     tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
 
 1549   phi_1 = _mm_sub_ps(phi_1, tmp3);                                   phi_1_2 = _mm_sub_ps(phi_1_2, tmp3_2);
 
 1552   tmp3 = _mm_and_ps(tmp4, tmp2);                                     tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
 
 1553   tmp5 = _mm_andnot_ps(tmp4, 
zero);                                  tmp5_2 = _mm_andnot_ps(tmp4_2, 
zero);
 
 1554   tmp3 = _mm_xor_ps(tmp3, tmp5);                                     tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
 
 1555   phi_2 = _mm_sub_ps(phi_2, tmp3);                                   phi_2_2 = _mm_sub_ps(phi_2_2, tmp3_2);
 
 1558   tmp3 = _mm_and_ps(tmp4, tmp2);                                     tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
 
 1559   tmp5 = _mm_andnot_ps(tmp4, 
zero);                                  tmp5_2 = _mm_andnot_ps(tmp4_2, 
zero);
 
 1560   tmp3 = _mm_xor_ps(tmp3, tmp5);                                     tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
 
 1561   phi_3 = _mm_sub_ps(phi_3, tmp3);                                   phi_3_2 = _mm_sub_ps(phi_3_2, tmp3_2);
 
 1564   tmp3 = _mm_and_ps(tmp4, tmp2);                                     tmp3_2 = _mm_and_ps(tmp4_2, tmp2_2);
 
 1565   tmp5 = _mm_andnot_ps(tmp4, 
zero);                                  tmp5_2 = _mm_andnot_ps(tmp4_2, 
zero);
 
 1566   tmp3 = _mm_xor_ps(tmp3, tmp5);                                     tmp3_2 = _mm_xor_ps(tmp3_2, tmp5_2);
 
 1567   phi_4 = _mm_sub_ps(phi_4, tmp3);                                   phi_4_2 = _mm_sub_ps(phi_4_2, tmp3_2);
 
 1571   __m128 phi_min = 
phi_1;                                            __m128 phi_min_2 = phi_1_2;
 
 1572   tmp2 = _mm_cmplt_ps(phi_2, phi_min);                               tmp2_2 = _mm_cmplt_ps(phi_2_2, phi_min_2);
 
 1573   tmp3 = _mm_and_ps(tmp2, phi_2);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
 
 1574   phi_min = _mm_andnot_ps(tmp2, phi_min);                            phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
 
 1575   phi_min = _mm_xor_ps(phi_min, tmp3);                               phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
 
 1576   tmp2 = _mm_cmplt_ps(phi_3, phi_min);                               tmp2_2 = _mm_cmplt_ps(phi_3_2, phi_min_2);
 
 1577   tmp3 = _mm_and_ps(tmp2, phi_3);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
 
 1578   phi_min = _mm_andnot_ps(tmp2, phi_min);                            phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
 
 1579   phi_min = _mm_xor_ps(phi_min, tmp3);                               phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
 
 1580   tmp2 = _mm_cmplt_ps(phi_4, phi_min);                               tmp2_2 = _mm_cmplt_ps(phi_4_2, phi_min_2);
 
 1581   tmp3 = _mm_and_ps(tmp2, phi_4);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
 
 1582   phi_min = _mm_andnot_ps(tmp2, phi_min);                            phi_min_2 = _mm_andnot_ps(tmp2_2, phi_min_2);
 
 1583   phi_min = _mm_xor_ps(phi_min, tmp3);                               phi_min_2 = _mm_xor_ps(phi_min_2, tmp3_2);
 
 1586   __m128 phi_max = 
phi_1;                                            __m128 phi_max_2 = phi_1_2;
 
 1587   tmp2 = _mm_cmpgt_ps(phi_2, phi_max);                               tmp2_2 = _mm_cmpgt_ps(phi_2_2, phi_max_2);
 
 1588   tmp3 = _mm_and_ps(tmp2, phi_2);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_2_2);
 
 1589   phi_max = _mm_andnot_ps(tmp2, phi_max);                            phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
 
 1590   phi_max = _mm_xor_ps(phi_max, tmp3);                               phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
 
 1591   tmp2 = _mm_cmpgt_ps(phi_3, phi_max);                               tmp2_2 = _mm_cmpgt_ps(phi_3_2, phi_max_2);
 
 1592   tmp3 = _mm_and_ps(tmp2, phi_3);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_3_2);
 
 1593   phi_max = _mm_andnot_ps(tmp2, phi_max);                            phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
 
 1594   phi_max = _mm_xor_ps(phi_max, tmp3);                               phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
 
 1595   tmp2 = _mm_cmpgt_ps(phi_4, phi_max);                               tmp2_2 = _mm_cmpgt_ps(phi_4_2, phi_max_2);
 
 1596   tmp3 = _mm_and_ps(tmp2, phi_4);                                    tmp3_2 = _mm_and_ps(tmp2_2, phi_4_2);
 
 1597   phi_max = _mm_andnot_ps(tmp2, phi_max);                            phi_max_2 = _mm_andnot_ps(tmp2_2, phi_max_2);
 
 1598   phi_max = _mm_xor_ps(phi_max, tmp3);                               phi_max_2 = _mm_xor_ps(phi_max_2, tmp3_2);
 
 1601   _mm_store_ps(min_phi, phi_min);                                    _mm_store_ps(min_phi_2, phi_min_2);
 
 1602   _mm_store_ps(max_phi, phi_max);                                    _mm_store_ps(max_phi_2, phi_max_2);