EIC Software
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HelixHough_dzdlRange_sse.cpp
Go to the documentation of this file. Or view the newest version in sPHENIX GitHub for file HelixHough_dzdlRange_sse.cpp
1 #include "HelixHough.h"
2 
3 #include "vector_math_inline.h"
4 
5 #include <emmintrin.h>
6 #include <xmmintrin.h>
7 
8 using namespace std;
9 
10 
11 static const __m128 one_o_100 = {0.01,0.01,0.01,0.01};
12 static const __m128 close_one = {0.999,0.999,0.999,0.999};
13 static const __m128 two = {2., 2., 2., 2.};
14 static const __m128 four = {4., 4., 4., 4.};
15 static const __m128 one_o_3 = {0.3333333333333333333, 0.3333333333333333333, 0.3333333333333333333, 0.3333333333333333333};
16 static const __m128 _3_o_20 = {0.15, 0.15, 0.15, 0.15};
17 static const __m128 _5_o_56 = {8.92857142857142877e-02, 8.92857142857142877e-02, 8.92857142857142877e-02, 8.92857142857142877e-02};
18 static const __m128 SIGNMASK = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
19 
20 
21 void HelixHough::dzdlRange_sse(float* x_a, float* y_a, float* z_a, float cosphi1, float sinphi1, float cosphi2, float sinphi2, float min_k_val, float max_k_val, float min_d_val, float max_d_val, float* min_z0_val, float* max_z0_val, float* min_dzdl_a, float* max_dzdl_a)
22 {
23  __m128 x = _mm_load_ps(x_a);
24  __m128 y = _mm_load_ps(y_a);
25  __m128 z = _mm_load_ps(z_a);
26 
27  __m128 cosphi_min = _mm_load1_ps(&(cosphi1));
28  __m128 cosphi_max = _mm_load1_ps(&(cosphi2));
29  __m128 sinphi_min = _mm_load1_ps(&(sinphi1));
30  __m128 sinphi_max = _mm_load1_ps(&(sinphi2));
31 
32  __m128 min_k = _mm_load1_ps(&(min_k_val));
33  __m128 max_k = _mm_load1_ps(&(max_k_val));
34  __m128 min_d = _mm_load1_ps(&(min_d_val));
35  __m128 max_d = _mm_load1_ps(&(max_d_val));
36  __m128 min_z0 = _mm_load_ps((min_z0_val));
37  __m128 max_z0 = _mm_load_ps((max_z0_val));
38 
39  __m128 d = min_d; __m128 d_2 = max_d;
40  __m128 k = min_k; __m128 k_2 = max_k;
41  __m128 dx = _mm_mul_ps(cosphi_min, d); __m128 dx_2 = _mm_mul_ps(cosphi_max,d_2);
42  __m128 dy = _mm_mul_ps(sinphi_min,d); __m128 dy_2 = _mm_mul_ps(sinphi_max,d_2);
43  __m128 D = _mm_sub_ps(x, dx); __m128 D_2 = _mm_sub_ps(x, dx_2);
44  D = _mm_mul_ps(D, D); D_2 = _mm_mul_ps(D_2, D_2);
45  __m128 tmp1 = _mm_sub_ps(y, dy); __m128 tmp1_2 = _mm_sub_ps(y, dy_2);
46  tmp1 = _mm_mul_ps(tmp1, tmp1); tmp1_2 = _mm_mul_ps(tmp1_2, tmp1_2);
47  D = _mm_add_ps(D, tmp1); D_2 = _mm_add_ps(D_2, tmp1_2);
48  D = _vec_sqrt_ps(D); D_2 = _vec_sqrt_ps(D_2);
49  __m128 v = _mm_mul_ps(one_o_2, k); __m128 v_2 = _mm_mul_ps(one_o_2, k_2);
50  v = _mm_mul_ps(v, D); v_2 = _mm_mul_ps(v_2, D_2);
51  //if(v > 0.999){v = 0.999;} //if(v > 0.999){v = 0.999;}
52  tmp1 = _mm_cmpgt_ps(v, close_one); tmp1_2 = _mm_cmpgt_ps(v_2, close_one);
53  __m128 tmp2 = _mm_and_ps(tmp1, close_one); __m128 tmp2_2 = _mm_and_ps(tmp1_2, close_one);
54  __m128 tmp3 = _mm_andnot_ps(tmp1, v); __m128 tmp3_2 = _mm_andnot_ps(tmp1_2, v_2);
55  v = _mm_xor_ps(tmp2, tmp3); v_2 = _mm_xor_ps(tmp2_2, tmp3_2);
56  __m128 one_o_v = _vec_rec_ps(v); __m128 one_o_v_2 = _vec_rec_ps(v_2);
57  //power series assuming v_v is small //power series assuming v_v is small
58  __m128 s = zero; __m128 s_2 = zero;
59  __m128 temp1 = _mm_mul_ps(v, v); __m128 temp1_2 = _mm_mul_ps(v_2, v_2);
60  __m128 temp2 = _mm_mul_ps(one_o_2, D); __m128 temp2_2 = _mm_mul_ps(one_o_2, D_2);
61  tmp1 = _mm_mul_ps(two, temp2); tmp1_2 = _mm_mul_ps(two, temp2_2);
62  s = _mm_add_ps(s, tmp1); s_2 = _mm_add_ps(s_2, tmp1_2);
63  temp2 = _mm_mul_ps(temp2, temp1); temp2_2 = _mm_mul_ps(temp2_2, temp1_2);
64  tmp1 = _mm_mul_ps(temp2, one_o_3); tmp1_2 = _mm_mul_ps(temp2_2, one_o_3);
65  s = _mm_add_ps(s, tmp1); s_2 = _mm_add_ps(s_2, tmp1_2);
66  temp2 = _mm_mul_ps(temp2, temp1); temp2_2 = _mm_mul_ps(temp2_2, temp1_2);
67  tmp1 = _mm_mul_ps(temp2, _3_o_20); tmp1_2 = _mm_mul_ps(temp2_2, _3_o_20);
68  s = _mm_add_ps(s, tmp1); s_2 = _mm_add_ps(s_2, tmp1_2);
69  temp2 = _mm_mul_ps(temp2, temp1); temp2_2 = _mm_mul_ps(temp2_2, temp1_2);
70  tmp1 = _mm_mul_ps(temp2, _5_o_56); tmp1_2 = _mm_mul_ps(temp2_2, _5_o_56);
71  s = _mm_add_ps(s, tmp1); s_2 = _mm_add_ps(s_2, tmp1_2);
73  //otherwise we calculate an arcsin //otherwise we calculate an arcsin
74  //asin(x) = 2*atan( x/( 1 + sqrt( 1 - x*x ) ) ) //asin(x) = 2*atan( x/( 1 + sqrt( 1 - x*x ) ) )
75  //s = 2*asin(v)/k //s = 2*asin(v)/k
76  tmp1 = _mm_mul_ps(v, v); tmp1_2 = _mm_mul_ps(v_2, v_2);
77  tmp1 = _mm_sub_ps(one, tmp1); tmp1_2 = _mm_sub_ps(one, tmp1_2);
78  tmp1 = _vec_sqrt_ps(tmp1); tmp1_2 = _vec_sqrt_ps(tmp1_2);
79  tmp1 = _mm_add_ps(one, tmp1); tmp1_2 = _mm_add_ps(one, tmp1_2);
80  tmp1 = _mm_mul_ps(tmp1, one_o_v); tmp1_2 = _mm_mul_ps(tmp1_2, one_o_v_2);
81  tmp2 = _vec_atan_ps(tmp1); tmp2_2 = _vec_atan_ps(tmp1_2);
82  tmp2 = _mm_sub_ps(pi_over_two, tmp2); tmp2_2 = _mm_sub_ps(pi_over_two, tmp2_2);
83  tmp2 = _mm_mul_ps(four, tmp2); tmp2_2 = _mm_mul_ps(four, tmp2_2);
84  tmp2 = _mm_mul_ps(tmp2, one_o_v); tmp2_2 = _mm_mul_ps(tmp2_2, one_o_v_2);
85  tmp2 = _mm_mul_ps(tmp2, D); tmp2_2 = _mm_mul_ps(tmp2_2, D_2);
86  tmp2 = _mm_mul_ps(tmp2, one_o_2); tmp2_2 = _mm_mul_ps(tmp2_2, one_o_2);
88  //choose between the two methods to calculate s //choose between the two methods to calculate s
89  tmp1 = _mm_cmpgt_ps(v, one_o_100); tmp1_2 = _mm_cmpgt_ps(v_2, one_o_100);
90  tmp3 = _mm_and_ps(tmp1, tmp2); tmp3_2 = _mm_and_ps(tmp1_2, tmp2_2);
91  tmp2 = _mm_andnot_ps(tmp1, s); tmp2_2 = _mm_andnot_ps(tmp1_2, s_2);
92  __m128 s1 = _mm_xor_ps(tmp3, tmp2); __m128 s2 = _mm_xor_ps(tmp3_2, tmp2_2);
93 
94 
95 
96 
98  __m128 dz2 = _mm_sub_ps(z, max_z0);
99  dz2 = _mm_mul_ps(dz2, dz2);
100  tmp1 = _mm_mul_ps(s1, s1);
101  tmp1 = _mm_add_ps(tmp1, dz2);
102  __m128 dzdl_1 = _mm_div_ps(dz2, tmp1);
103  dzdl_1 = _vec_sqrt_ps(dzdl_1);
104  //if z < max_z0, dzdl = -dzdl_1
105  tmp1 = _mm_cmplt_ps(z, max_z0);
106  tmp2 = _mm_xor_ps(dzdl_1, SIGNMASK);
107  tmp3 = _mm_and_ps(tmp1, tmp2);
108  dzdl_1 = _mm_andnot_ps(tmp1, dzdl_1);
109  dzdl_1 = _mm_xor_ps(dzdl_1, tmp3);
110 
112  dz2 = _mm_sub_ps(z, min_z0);
113  dz2 = _mm_mul_ps(dz2, dz2);
114  tmp1 = _mm_mul_ps(s1, s1);
115  tmp1 = _mm_add_ps(tmp1, dz2);
116  __m128 dzdl_2 = _mm_div_ps(dz2, tmp1);
117  dzdl_2 = _vec_sqrt_ps(dzdl_2);
118  //if z < min_z0, dzdl = -dzdl_2
119  tmp1 = _mm_cmplt_ps(z, min_z0);
120  tmp2 = _mm_xor_ps(dzdl_2, SIGNMASK);
121  tmp3 = _mm_and_ps(tmp1, tmp2);
122  dzdl_2 = _mm_andnot_ps(tmp1, dzdl_2);
123  dzdl_2 = _mm_xor_ps(dzdl_2, tmp3);
124 
126  dz2 = _mm_sub_ps(z, max_z0);
127  dz2 = _mm_mul_ps(dz2, dz2);
128  tmp1 = _mm_mul_ps(s2, s2);
129  tmp1 = _mm_add_ps(tmp1, dz2);
130  __m128 dzdl_3 = _mm_div_ps(dz2, tmp1);
131  dzdl_3 = _vec_sqrt_ps(dzdl_3);
132  //if z < max_z0, dzdl = -dzdl_3
133  tmp1 = _mm_cmplt_ps(z, max_z0);
134  tmp2 = _mm_xor_ps(dzdl_3, SIGNMASK);
135  tmp3 = _mm_and_ps(tmp1, tmp2);
136  dzdl_3 = _mm_andnot_ps(tmp1, dzdl_3);
137  dzdl_3 = _mm_xor_ps(dzdl_3, tmp3);
138 
140  dz2 = _mm_sub_ps(z, min_z0);
141  dz2 = _mm_mul_ps(dz2, dz2);
142  tmp1 = _mm_mul_ps(s2, s2);
143  tmp1 = _mm_add_ps(tmp1, dz2);
144  __m128 dzdl_4 = _mm_div_ps(dz2, tmp1);
145  dzdl_4 = _vec_sqrt_ps(dzdl_4);
146  //if z < min_z0, dzdl = -dzdl_4
147  tmp1 = _mm_cmplt_ps(z, min_z0);
148  tmp2 = _mm_xor_ps(dzdl_4, SIGNMASK);
149  tmp3 = _mm_and_ps(tmp1, tmp2);
150  dzdl_4 = _mm_andnot_ps(tmp1, dzdl_4);
151  dzdl_4 = _mm_xor_ps(dzdl_4, tmp3);
152 
153 
154 
155 
156  __m128 dzdl_max = dzdl_1;
157  tmp1 = _mm_cmpgt_ps(dzdl_2, dzdl_max);
158  tmp2 = _mm_and_ps(tmp1, dzdl_2);
159  tmp3 = _mm_andnot_ps(tmp1, dzdl_max);
160  dzdl_max = _mm_xor_ps(tmp2, tmp3);
161  tmp1 = _mm_cmpgt_ps(dzdl_3, dzdl_max);
162  tmp2 = _mm_and_ps(tmp1, dzdl_3);
163  tmp3 = _mm_andnot_ps(tmp1, dzdl_max);
164  dzdl_max = _mm_xor_ps(tmp2, tmp3);
165  tmp1 = _mm_cmpgt_ps(dzdl_4, dzdl_max);
166  tmp2 = _mm_and_ps(tmp1, dzdl_4);
167  tmp3 = _mm_andnot_ps(tmp1, dzdl_max);
168  dzdl_max = _mm_xor_ps(tmp2, tmp3);
169 
170  __m128 dzdl_min = dzdl_1;
171  tmp1 = _mm_cmplt_ps(dzdl_2, dzdl_min);
172  tmp2 = _mm_and_ps(tmp1, dzdl_2);
173  tmp3 = _mm_andnot_ps(tmp1, dzdl_min);
174  dzdl_min = _mm_xor_ps(tmp2, tmp3);
175  tmp1 = _mm_cmplt_ps(dzdl_3, dzdl_min);
176  tmp2 = _mm_and_ps(tmp1, dzdl_3);
177  tmp3 = _mm_andnot_ps(tmp1, dzdl_min);
178  dzdl_min = _mm_xor_ps(tmp2, tmp3);
179  tmp1 = _mm_cmplt_ps(dzdl_4, dzdl_min);
180  tmp2 = _mm_and_ps(tmp1, dzdl_4);
181  tmp3 = _mm_andnot_ps(tmp1, dzdl_min);
182  dzdl_min = _mm_xor_ps(tmp2, tmp3);
183 
184 
185 
186  _mm_store_ps(min_dzdl_a, dzdl_min);
187  _mm_store_ps(max_dzdl_a, dzdl_max);
188 }