// 16-byte-aligned all-ones bit pattern.  Loaded as an __m128i and used to
// invert SSE comparison masks via XOR (logical NOT of a mask).
// NOTE(review): the original chunk carried a stray leading line-number
// artifact ("9 ") that made this declaration invalid; removed here.
static const unsigned int allones[4]
__attribute__((aligned(16))) = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
// 16-byte-aligned all-zeroes pattern.  Loaded as an __m128i to zero-initialize
// the accumulator masks (reflection_*, *_quadrant) in kappaRange_sse.
// NOTE(review): stray leading line-number artifact ("10 ") removed.
static const unsigned int allzeroes[4]
__attribute__((aligned(16))) = {0, 0, 0, 0};
// Vector of four 0.5f lanes ("one over 2").
// NOTE(review): stray leading line-number artifact ("11 ") removed.  Brace
// initialization of __m128 relies on the GCC/Clang vector-extension
// representation of the type, consistent with the rest of this file.
static const __m128
one_o_2 = {0.5, 0.5, 0.5, 0.5};
// NOTE(review): this span is the interior of a function whose signature and a
// number of intermediate statements fall outside the visible chunk (the bare
// decimals such as "17" are an extraction artifact of the original line
// numbering, not code).  From what is visible, it appears to compute a unit
// direction vector (ux, uy) from the angle vector v_phi using branch-free SSE
// mask/select arithmetic -- TODO confirm against the full file.
17 __m128i
// sign_int is defined outside this chunk -- presumably four copies of the
// IEEE-754 sign-bit mask 0x80000000; verify against its declaration.
vec_sgn = _mm_load_si128((__m128i*)sign_int);
18 __m128
// Isolate the sign bits of v_phi.
phi_sign = _mm_and_ps((__m128)vec_sgn, v_phi);
19 __m128
// XOR-ing the sign bits back out yields |v_phi|.
pos_phi = _mm_xor_ps(phi_sign, v_phi);
// Lines between the statements below are missing from this chunk; phi_sign is
// recomputed for two range-reduced angle candidates (phi - pi, 2*pi - phi).
23 phi_sign = _mm_and_ps((__m128)vec_sgn, v_phi_m_pi);
28 phi_sign = _mm_and_ps((__m128)vec_sgn, v_twopi_m_phi);
// phisel accumulates comparison masks (the cmp producing tmp2 is in missing
// lines); per-lane it selects between the sin-based and cos-based formulas.
33 phisel = _mm_or_ps(phisel, tmp2);
35 phisel = _mm_or_ps(phisel, tmp2);
41 __m128
// Branch-free select: num = phisel ? sinval : cosval.  (AND/ANDNOT produce
// disjoint lanes, so the XOR below acts as an OR.)
num = _mm_and_ps(phisel, sinval);
42 tmp2 = _mm_andnot_ps(phisel, cosval);
43 num = _mm_xor_ps(num, tmp2);
44 __m128
// denom = phisel ? cosval : sinval -- the complementary selection, so
// tanphi holds tan(phi) in some lanes and cot(phi) in the others.
denom = _mm_and_ps(phisel, cosval);
45 tmp2 = _mm_andnot_ps(phisel, sinval);
46 denom = _mm_xor_ps(denom, tmp2);
47 tanphi = _mm_div_ps(num, denom);
// phi_compare = phisel ? phi_compare : v_phi (its other half is set in a
// missing line).
50 tmp2 = _mm_andnot_ps(phisel, v_phi);
51 phi_compare = _mm_xor_ps(phi_compare, tmp2);
52 __m128
// tmp3 = sqrt(1 / (1 + tanphi^2)); with the trig identity this is the
// magnitude of cos(phi) (or sin(phi) in the complementary lanes).
tmp3 = _mm_mul_ps(tanphi, tanphi);
53 tmp3 = _mm_add_ps(
one, tmp3);
54 tmp3 = _mm_div_ps(
one, tmp3);
55 tmp3 = _mm_sqrt_ps(tmp3);
// Flip the sign of tmp3 in lanes where phi_compare > pi.  tmp4 here comes
// from a missing line (original line 57) -- its contents are not visible.
56 tmp2 = _mm_cmpgt_ps(phi_compare,
pi);
58 tmp2 = _mm_and_ps(tmp2, tmp4);
59 tmp2 = _mm_and_ps((__m128)vec_sgn, tmp2);
60 tmp3 = _mm_xor_ps(tmp3, tmp2);
61 tmp4 = _mm_mul_ps(tanphi, tmp3);
// Final selects: ux = phisel ? tmp3 : tmp4 and uy = phisel ? tmp4 : tmp3,
// i.e. the cos/sin components routed to the correct lane per the mask.
62 ux = _mm_and_ps(phisel, tmp3);
63 tmp2 = _mm_andnot_ps(phisel, tmp4);
64 ux = _mm_xor_ps(ux, tmp2);
65 uy = _mm_and_ps(phisel, tmp4);
66 tmp2 = _mm_andnot_ps(phisel, tmp3);
67 uy = _mm_xor_ps(uy, tmp2);
// For a single phi bound, compute per-lane min/max inverse-radius (curvature)
// results for the hit at (x, y) over the signed distance window
// [v_mind, v_maxd].  Outputs are written through the __m128& references;
// the mask outputs (reflection_min/max, correct/incorrect_quadrant) are
// OR-accumulated so the caller can combine the min-phi and max-phi passes.
// NOTE(review): this chunk is missing interior lines (the bare decimals are
// an extraction artifact of the original numbering); the opening/closing
// braces, the ux/uy/tanphi/phisel setup, the first definition of tmp4, and
// the invr1/invr2 computations all live in lines not visible here.
71 static inline void __attribute__((always_inline)) _vec_invRRangeOnePhi(__m128
x, __m128
y, __m128 v_phi, __m128
v_mind, __m128
v_maxd, __m128&
v_mink, __m128&
v_maxk, __m128&
reflection_min, __m128&
reflection_max, __m128&
incorrect_quadrant, __m128&
correct_quadrant)
// Zero-initialize all vector temporaries.
73 __m128
dx = {0.,0.,0.,0.};
74 __m128
dy = {0.,0.,0.,0.};
75 __m128 ux = {0.,0.,0.,0.};
76 __m128 uy = {0.,0.,0.,0.};
77 __m128 tanphi = {0.,0.,0.,0.};
78 __m128 phisel = {0.,0.,0.,0.};
79 __m128
t = {0.,0.,0.,0.};
80 __m128
tP = {0.,0.,0.,0.};
81 __m128
mx = {0.,0.,0.,0.};
82 __m128
my = {0.,0.,0.,0.};
83 __m128
Ix = {0.,0.,0.,0.};
84 __m128
Iy = {0.,0.,0.,0.};
85 __m128
r1 = {0.,0.,0.,0.};
86 __m128
r2 = {0.,0.,0.,0.};
// vec_ones: all-ones mask (XOR with it negates a comparison mask).
// vec_sgn: presumably the IEEE sign-bit mask loaded from sign_int, which is
// declared outside this chunk -- TODO confirm.
88 __m128i
vec_ones = _mm_load_si128((__m128i*)allones);
89 __m128i vec_sgn = _mm_load_si128((__m128i*)sign_int);
// tP = ux*x + uy*y: projection of the hit onto the unit direction (ux, uy)
// (set up from v_phi in lines missing from this chunk).
93 tP = _mm_mul_ps(ux, x);
94 __m128
tmp1 = _mm_mul_ps(uy, y);
95 tP = _mm_add_ps(tP, tmp1);
// ---- pass 1: the v_mind edge of the distance window ----
// (dx, dy) = v_mind * (ux, uy): displacement to the min-d point.
96 dx = _mm_mul_ps(ux, v_mind);
97 dy = _mm_mul_ps(uy, v_mind);
// Lanes with v_mind < tP feed reflection_min; the negated mask (XOR with
// all-ones) feeds reflection_max.
99 tmp1 = _mm_cmplt_ps(v_mind, tP);
100 reflection_min = _mm_or_ps(reflection_min, tmp1);
101 tmp1 = _mm_xor_ps(tmp1, (__m128)vec_ones);
102 reflection_max = _mm_or_ps(reflection_max, tmp1);
// Midpoint-like point (mx, my) = hit + displacement.
103 mx = _mm_add_ps(dx, x);
105 my = _mm_add_ps(dy, y);
// Solve for the line-intersection parameter two ways (tmp1 uses the
// tan-based form, tmp2 the cot-based form) and pick per lane with phisel
// via the AND/ANDNOT/XOR select idiom.
108 __m128 tmp2 = _mm_mul_ps(mx, tanphi);
109 tmp2 = _mm_sub_ps(my, tmp2);
110 tmp1 = _mm_sub_ps(y, dy);
111 __m128 tmp3 = _mm_mul_ps(tanphi, tmp1);
112 tmp1 = _mm_sub_ps(x, dx);
113 tmp1 = _mm_add_ps(tmp1, tmp3);
114 tmp1 = _mm_div_ps(tmp2, tmp1);
115 tmp2 = _mm_mul_ps(tanphi, my);
116 tmp2 = _mm_sub_ps(tmp2, mx);
117 tmp3 = _mm_sub_ps(x, dx);
118 tmp3 = _mm_mul_ps(tmp3, tanphi);
119 __m128 tmp4 = _mm_sub_ps(y, dy);
120 tmp3 = _mm_add_ps(tmp3, tmp4);
121 tmp2 = _mm_div_ps(tmp2, tmp3);
// t = phisel ? tmp1 : tmp2.
122 t = _mm_and_ps(phisel, tmp1);
123 tmp3 = _mm_andnot_ps(phisel, tmp2);
124 t = _mm_xor_ps(t, tmp3);
// Intersection point: (Ix, Iy) = (mx, my) + t * (y-dy, dx-x), i.e. a step of
// length t along the perpendicular of (x-dx, y-dy).
126 tmp1 = _mm_sub_ps(y,dy);
127 tmp1 = _mm_mul_ps(tmp1, t);
128 Ix = _mm_add_ps(mx, tmp1);
129 tmp2 = _mm_sub_ps(dx,x);
130 tmp2 = _mm_mul_ps(tmp2, t);
131 Iy = _mm_add_ps(my, tmp2);
// Quadrant check: the intersection is in the "correct" quadrant when the
// sign bits of (Ix, Iy) match those of (ux, uy) componentwise.
134 __m128
Ix_sign = _mm_and_ps((__m128)vec_sgn, Ix);
136 __m128
ux_sign = _mm_and_ps((__m128)vec_sgn, ux);
138 tmp1 = _mm_cmpeq_ps(Ix_sign, ux_sign);
140 __m128
Iy_sign = _mm_and_ps((__m128)vec_sgn, Iy);
141 __m128
uy_sign = _mm_and_ps((__m128)vec_sgn, uy);
142 tmp2 = _mm_cmpeq_ps(Iy_sign, uy_sign);
143 tmp3 = _mm_and_ps(tmp1, tmp2);
144 correct_quadrant = _mm_or_ps(correct_quadrant, tmp3);
145 tmp3 = _mm_xor_ps(tmp3, (__m128)vec_ones);
146 incorrect_quadrant = _mm_or_ps(incorrect_quadrant, tmp3);
// r1 = |(Ix, Iy) - (dx, dy)|: candidate circle radius for the min-d edge.
148 tmp1 = _mm_sub_ps(Ix, dx);
149 tmp1 = _mm_mul_ps(tmp1, tmp1);
150 tmp2 = _mm_sub_ps(Iy, dy);
151 tmp2 = _mm_mul_ps(tmp2, tmp2);
152 r1 = _mm_add_ps(tmp1, tmp2);
153 r1 = _mm_sqrt_ps(r1);
// ---- pass 2: identical computation for the v_maxd edge, yielding r2 ----
156 dx = _mm_mul_ps(ux, v_maxd);
157 dy = _mm_mul_ps(uy, v_maxd);
159 tmp1 = _mm_cmplt_ps(v_maxd, tP);
160 reflection_min = _mm_or_ps(reflection_min, tmp1);
161 tmp1 = _mm_xor_ps(tmp1, (__m128)vec_ones);
162 reflection_max = _mm_or_ps(reflection_max, tmp1);
163 mx = _mm_add_ps(dx, x);
165 my = _mm_add_ps(dy, y);
168 tmp2 = _mm_mul_ps(mx, tanphi);
169 tmp2 = _mm_sub_ps(my, tmp2);
170 tmp1 = _mm_sub_ps(y, dy);
171 tmp3 = _mm_mul_ps(tanphi, tmp1);
172 tmp1 = _mm_sub_ps(x, dx);
173 tmp1 = _mm_add_ps(tmp1, tmp3);
174 tmp1 = _mm_div_ps(tmp2, tmp1);
175 tmp2 = _mm_mul_ps(tanphi, my);
176 tmp2 = _mm_sub_ps(tmp2, mx);
177 tmp3 = _mm_sub_ps(x, dx);
178 tmp3 = _mm_mul_ps(tmp3, tanphi);
179 tmp4 = _mm_sub_ps(y, dy);
180 tmp3 = _mm_add_ps(tmp3, tmp4);
181 tmp2 = _mm_div_ps(tmp2, tmp3);
182 t = _mm_and_ps(phisel, tmp1);
183 tmp3 = _mm_andnot_ps(phisel, tmp2);
184 t = _mm_xor_ps(t, tmp3);
186 tmp1 = _mm_sub_ps(y,dy);
187 tmp1 = _mm_mul_ps(tmp1, t);
188 Ix = _mm_add_ps(mx, tmp1);
189 tmp2 = _mm_sub_ps(dx,x);
190 tmp2 = _mm_mul_ps(tmp2, t);
191 Iy = _mm_add_ps(my, tmp2);
194 Ix_sign = _mm_and_ps((__m128)vec_sgn, Ix);
196 ux_sign = _mm_and_ps((__m128)vec_sgn, ux);
198 tmp1 = _mm_cmpeq_ps(Ix_sign, ux_sign);
200 Iy_sign = _mm_and_ps((__m128)vec_sgn, Iy);
201 uy_sign = _mm_and_ps((__m128)vec_sgn, uy);
202 tmp2 = _mm_cmpeq_ps(Iy_sign, uy_sign);
203 tmp3 = _mm_and_ps(tmp1, tmp2);
204 correct_quadrant = _mm_or_ps(correct_quadrant, tmp3);
205 tmp3 = _mm_xor_ps(tmp3, (__m128)vec_ones);
206 incorrect_quadrant = _mm_or_ps(incorrect_quadrant, tmp3);
208 tmp1 = _mm_sub_ps(Ix, dx);
209 tmp1 = _mm_mul_ps(tmp1, tmp1);
210 tmp2 = _mm_sub_ps(Iy, dy);
211 tmp2 = _mm_mul_ps(tmp2, tmp2);
212 r2 = _mm_add_ps(tmp1, tmp2);
213 r2 = _mm_sqrt_ps(r2);
// Final ordering: invr1/invr2 (computed from r1/r2 in lines missing from this
// chunk; presumably their reciprocals) are routed so the larger inverse
// radius becomes v_mink... wait -- per the selects below, lanes with r1 < r2
// take v_mink = invr2 and v_maxk = invr1; otherwise the reverse.
217 tmp1 = _mm_cmplt_ps(r1, r2);
218 v_mink = _mm_and_ps(tmp1, invr2);
219 tmp2 = _mm_andnot_ps(tmp1, invr1);
220 v_mink = _mm_xor_ps(v_mink, tmp2);
221 v_maxk = _mm_and_ps(tmp1, invr1);
222 tmp2 = _mm_andnot_ps(tmp1, invr2);
223 v_maxk = _mm_xor_ps(v_maxk, tmp2);
// Public entry point: for four hypothesis lanes at once, compute the range of
// track curvature (inverse radius, "kappa") consistent with a single hit and
// the per-lane phi window [minphi, maxphi] and distance window [mind, maxd].
// Results are stored back through min_invR / max_invR.  All float* arguments
// must point to 16-byte-aligned arrays of 4 floats (_mm_load_ps/_mm_store_ps).
// NOTE(review): this chunk is missing interior lines (the bare decimals are
// original-line-number extraction artifacts); the function braces, the
// definitions of sign_int and zero, and a few statements are not visible here.
227 void HelixHough::kappaRange_sse(
const SimpleHit3D& hit,
float* minphi,
float* maxphi,
float* mind,
float* maxd,
float* min_invR,
float* max_invR)
// Load the four-lane input windows.
229 __m128 v_minphi = _mm_load_ps(minphi);
230 __m128 v_maxphi = _mm_load_ps(maxphi);
231 __m128 v_mind = _mm_load_ps(mind);
232 __m128 v_maxd = _mm_load_ps(maxd);
// NOTE(review): these two loads appear to be dead -- v_mink/v_maxk are fully
// overwritten by the selects further below before first use; possibly
// vestigial.  Confirm against the full file before removing.
233 __m128 v_mink = _mm_load_ps(min_invR);
234 __m128 v_maxk = _mm_load_ps(max_invR);
// Broadcast the hit coordinates to all four lanes.
236 float x_arr[4]
__attribute__((aligned(16))) = {hit.x,hit.x,hit.x,hit.x};
237 float y_arr[4]
__attribute__((aligned(16))) = {hit.y,hit.y,hit.y,hit.y};
241 __m128 x = _mm_load_ps(x_arr);
242 __m128 y = _mm_load_ps(y_arr);
// Zero the accumulator masks that _vec_invRRangeOnePhi ORs into.
244 __m128i vec_zeroes = _mm_load_si128((__m128i*)allzeroes);
246 __m128 reflection_min = (__m128)(vec_zeroes);
247 __m128 reflection_max = (__m128)(vec_zeroes);
248 __m128 correct_quadrant = (__m128)(vec_zeroes);
249 __m128 incorrect_quadrant = (__m128)(vec_zeroes);
// Evaluate the curvature range at each phi edge of the window.
251 __m128 v_mink1 = {0.,0.,0.,0.};
252 __m128 v_maxk1 = {0.,0.,0.,0.};
253 _vec_invRRangeOnePhi(x, y, v_minphi, v_mind, v_maxd, v_mink1, v_maxk1, reflection_min, reflection_max, incorrect_quadrant, correct_quadrant);
255 __m128 v_mink2 = {0.,0.,0.,0.};
256 __m128 v_maxk2 = {0.,0.,0.,0.};
257 _vec_invRRangeOnePhi(x, y, v_maxphi, v_mind, v_maxd, v_mink2, v_maxk2, reflection_min, reflection_max, incorrect_quadrant, correct_quadrant);
// v_mink = min(v_mink1, v_mink2) per lane, via the AND/ANDNOT/XOR select.
259 __m128 tmp1 = _mm_cmplt_ps(v_mink1, v_mink2);
260 v_mink = _mm_and_ps(tmp1, v_mink1);
261 __m128 tmp2 = _mm_andnot_ps(tmp1, v_mink2);
262 v_mink = _mm_xor_ps(v_mink, tmp2);
// v_maxk = max(v_maxk1, v_maxk2) per lane.
264 tmp1 = _mm_cmpgt_ps(v_maxk1, v_maxk2);
265 v_maxk = _mm_and_ps(tmp1, v_maxk1);
266 tmp2 = _mm_andnot_ps(tmp1, v_maxk2);
267 v_maxk = _mm_xor_ps(v_maxk, tmp2);
// Where both reflection cases occurred, force v_mink to "zero" (a vector
// constant defined outside this chunk -- presumably four 0.0f lanes; the
// curvature range then starts at zero, i.e. a straight track is allowed).
269 tmp1 = _mm_and_ps(reflection_min, reflection_max);
270 v_mink = _mm_andnot_ps(tmp1, v_mink);
271 tmp2 = _mm_and_ps(tmp1,
zero);
272 v_mink = _mm_xor_ps(v_mink, tmp2);
// Same zeroing where both quadrant outcomes occurred across the phi edges.
274 tmp1 = _mm_and_ps(correct_quadrant, incorrect_quadrant);
275 v_mink = _mm_andnot_ps(tmp1, v_mink);
276 tmp2 = _mm_and_ps(tmp1,
zero);
277 v_mink = _mm_xor_ps(v_mink, tmp2);
// Store the combined range back to the caller's arrays.
279 _mm_store_ps(min_invR, v_mink);
280 _mm_store_ps(max_invR, v_maxk);