// Broadcast constants used below.  one_o_3, _3_o_20 and _5_o_56 feed the
// arc-length power series in the function; SIGNMASK has only the sign bit set
// in each lane.  The constants one and one_o_2 and the _vec_sqrt_ps /
// _vec_rec_ps / _vec_atan_ps helpers used below are presumably defined earlier
// in the file or in an included vector-math header (not shown in this excerpt).
static const __m128 one_o_100 = {0.01, 0.01, 0.01, 0.01};
static const __m128 close_one = {0.999, 0.999, 0.999, 0.999};
static const __m128 two = {2., 2., 2., 2.};
static const __m128 four = {4., 4., 4., 4.};
static const __m128 one_o_3 = {0.3333333333333333333, 0.3333333333333333333, 0.3333333333333333333, 0.3333333333333333333};
static const __m128 _3_o_20 = {0.15, 0.15, 0.15, 0.15};
static const __m128 _5_o_56 = {8.92857142857142877e-02, 8.92857142857142877e-02, 8.92857142857142877e-02, 8.92857142857142877e-02};
static const __m128 SIGNMASK = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
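
// Note on the idiom used throughout this routine: per-lane conditionals are
// written branchlessly with the standard SSE select pattern.  Given a
// comparison mask m (all ones in lanes where the condition holds),
//
//   result = (m & a) | (~m & b)   // "m ? a : b" in every lane
//
// is built from _mm_and_ps and _mm_andnot_ps, with _mm_xor_ps standing in for
// the OR (safe because the two halves never have overlapping bits).  Combining
// the same pattern with SIGNMASK, which flips only the sign bit, gives a
// conditional negation.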
void HelixHough::dzdlRange_sse(float* x_a, float* y_a, float* z_a,
                               float cosphi1, float sinphi1,
                               float cosphi2, float sinphi2,
                               float min_k_val, float max_k_val,
                               float min_d_val, float max_d_val,
                               float* min_z0_val, float* max_z0_val,
                               float* min_dzdl_a, float* max_dzdl_a)
{
  // Load four hits (x, y, z) at a time; _mm_load_ps needs 16-byte-aligned pointers.
  __m128 x = _mm_load_ps(x_a);
  __m128 y = _mm_load_ps(y_a);
  __m128 z = _mm_load_ps(z_a);

  // Broadcast the scalar bin boundaries to all four lanes.
  __m128 cosphi_min = _mm_load1_ps(&(cosphi1));
  __m128 cosphi_max = _mm_load1_ps(&(cosphi2));
  __m128 sinphi_min = _mm_load1_ps(&(sinphi1));
  __m128 sinphi_max = _mm_load1_ps(&(sinphi2));

  __m128 min_k = _mm_load1_ps(&(min_k_val));
  __m128 max_k = _mm_load1_ps(&(max_k_val));
  __m128 min_d = _mm_load1_ps(&(min_d_val));
  __m128 max_d = _mm_load1_ps(&(max_d_val));
  __m128 min_z0 = _mm_load_ps(min_z0_val);
  __m128 max_z0 = _mm_load_ps(max_z0_val);
  // Work on two corners of the (phi, d, k) parameter bin at once: the plain
  // variables use (cosphi1, sinphi1, min_d, min_k), the *_2 variables use
  // (cosphi2, sinphi2, max_d, max_k).
  __m128 d = min_d;  __m128 d_2 = max_d;
  __m128 k = min_k;  __m128 k_2 = max_k;
  // Transverse point (d*cosphi, d*sinphi) picked out by the phi and d parameters.
  __m128 dx = _mm_mul_ps(cosphi_min, d);  __m128 dx_2 = _mm_mul_ps(cosphi_max, d_2);
  __m128 dy = _mm_mul_ps(sinphi_min, d);  __m128 dy_2 = _mm_mul_ps(sinphi_max, d_2);
  // D = transverse (chord) distance from the hit to that point.
  __m128 D = _mm_sub_ps(x, dx);  __m128 D_2 = _mm_sub_ps(x, dx_2);
  D = _mm_mul_ps(D, D);  D_2 = _mm_mul_ps(D_2, D_2);
  __m128 tmp1 = _mm_sub_ps(y, dy);  __m128 tmp1_2 = _mm_sub_ps(y, dy_2);
  tmp1 = _mm_mul_ps(tmp1, tmp1);  tmp1_2 = _mm_mul_ps(tmp1_2, tmp1_2);
  D = _mm_add_ps(D, tmp1);  D_2 = _mm_add_ps(D_2, tmp1_2);
  D = _vec_sqrt_ps(D);  D_2 = _vec_sqrt_ps(D_2);
  // v = k*D/2; for curvature k = 1/R this is the sine of half the turning
  // angle subtended by the chord D.
  __m128 v = _mm_mul_ps(one_o_2, k);  __m128 v_2 = _mm_mul_ps(one_o_2, k_2);
  v = _mm_mul_ps(v, D);  v_2 = _mm_mul_ps(v_2, D_2);
  // Clamp v to just below 1 so the sqrt and reciprocal below stay well
  // behaved.  The two mask-building lines are missing from the excerpt and are
  // reconstructed here from the close_one constant and the select that follows.
  tmp1 = _mm_cmpgt_ps(v, close_one);  tmp1_2 = _mm_cmpgt_ps(v_2, close_one);
  __m128 tmp2 = _mm_and_ps(tmp1, close_one);  __m128 tmp2_2 = _mm_and_ps(tmp1_2, close_one);
  __m128 tmp3 = _mm_andnot_ps(tmp1, v);  __m128 tmp3_2 = _mm_andnot_ps(tmp1_2, v_2);
  v = _mm_xor_ps(tmp2, tmp3);  v_2 = _mm_xor_ps(tmp2_2, tmp3_2);
  __m128 one_o_v = _vec_rec_ps(v);  __m128 one_o_v_2 = _vec_rec_ps(v_2);
  // Arc length s corresponding to chord D at curvature k, branch 1: a power
  // series in v, used where v is small.  The initializations of s/s_2 and
  // temp2/temp2_2 are missing from the excerpt; they are reconstructed here as
  // s = 0 and temp2 = D/2, which is what the coefficients below require.
  __m128 s = _mm_setzero_ps();  __m128 s_2 = _mm_setzero_ps();
  __m128 temp1 = _mm_mul_ps(v, v);  __m128 temp1_2 = _mm_mul_ps(v_2, v_2);
  __m128 temp2 = _mm_mul_ps(one_o_2, D);  __m128 temp2_2 = _mm_mul_ps(one_o_2, D_2);
  tmp1 = _mm_mul_ps(two, temp2);  tmp1_2 = _mm_mul_ps(two, temp2_2);
  s = _mm_add_ps(s, tmp1);  s_2 = _mm_add_ps(s_2, tmp1_2);
  temp2 = _mm_mul_ps(temp2, temp1);  temp2_2 = _mm_mul_ps(temp2_2, temp1_2);
  tmp1 = _mm_mul_ps(temp2, one_o_3);  tmp1_2 = _mm_mul_ps(temp2_2, one_o_3);
  s = _mm_add_ps(s, tmp1);  s_2 = _mm_add_ps(s_2, tmp1_2);
  temp2 = _mm_mul_ps(temp2, temp1);  temp2_2 = _mm_mul_ps(temp2_2, temp1_2);
  tmp1 = _mm_mul_ps(temp2, _3_o_20);  tmp1_2 = _mm_mul_ps(temp2_2, _3_o_20);
  s = _mm_add_ps(s, tmp1);  s_2 = _mm_add_ps(s_2, tmp1_2);
  temp2 = _mm_mul_ps(temp2, temp1);  temp2_2 = _mm_mul_ps(temp2_2, temp1_2);
  tmp1 = _mm_mul_ps(temp2, _5_o_56);  tmp1_2 = _mm_mul_ps(temp2_2, _5_o_56);
  s = _mm_add_ps(s, tmp1);  s_2 = _mm_add_ps(s_2, tmp1_2);
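
  // Written out, the series just built is (a scalar paraphrase for reference,
  // not part of the original source):
  //
  //   v = k*D/2,   s = D * (1 + v^2/6 + 3*v^4/40 + 15*v^6/336)
  //
  // i.e. the truncated Taylor expansion of (2/k)*asin(k*D/2), converting the
  // chord length D into the corresponding arc length at curvature k:
  //
  //   float arc_from_chord(float k, float D) {   // illustrative helper only
  //     float v  = 0.5f * k * D;
  //     float v2 = v * v;
  //     return D * (1.f + v2*(1.f/6.f + v2*(3.f/40.f + v2*(15.f/336.f))));
  //   }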
  // Branch 2: evaluate the arc length via an arctangent identity, used where v
  // is not small.
  tmp1 = _mm_mul_ps(v, v);  tmp1_2 = _mm_mul_ps(v_2, v_2);
  tmp1 = _mm_sub_ps(one, tmp1);  tmp1_2 = _mm_sub_ps(one, tmp1_2);
  tmp1 = _vec_sqrt_ps(tmp1);  tmp1_2 = _vec_sqrt_ps(tmp1_2);
  tmp1 = _mm_add_ps(one, tmp1);  tmp1_2 = _mm_add_ps(one, tmp1_2);
  tmp1 = _mm_mul_ps(tmp1, one_o_v);  tmp1_2 = _mm_mul_ps(tmp1_2, one_o_v_2);
  tmp2 = _vec_atan_ps(tmp1);  tmp2_2 = _vec_atan_ps(tmp1_2);
  // ... (one line of the original is not shown here)
  tmp2 = _mm_mul_ps(four, tmp2);  tmp2_2 = _mm_mul_ps(four, tmp2_2);
  tmp2 = _mm_mul_ps(tmp2, one_o_v);  tmp2_2 = _mm_mul_ps(tmp2_2, one_o_v_2);
  tmp2 = _mm_mul_ps(tmp2, D);  tmp2_2 = _mm_mul_ps(tmp2_2, D_2);
  tmp2 = _mm_mul_ps(tmp2, one_o_2);  tmp2_2 = _mm_mul_ps(tmp2_2, one_o_2);
  // Select between the two branches: keep the closed-form result (tmp2) where
  // v > 0.01 and the power series (s) elsewhere.  The comparison producing the
  // mask is missing from the excerpt and is reconstructed here from the
  // otherwise unused one_o_100 constant.
  tmp1 = _mm_cmpgt_ps(v, one_o_100);  tmp1_2 = _mm_cmpgt_ps(v_2, one_o_100);
  tmp3 = _mm_and_ps(tmp1, tmp2);  tmp3_2 = _mm_and_ps(tmp1_2, tmp2_2);
  tmp2 = _mm_andnot_ps(tmp1, s);  tmp2_2 = _mm_andnot_ps(tmp1_2, s_2);
  __m128 s1 = _mm_xor_ps(tmp3, tmp2);  __m128 s2 = _mm_xor_ps(tmp3_2, tmp2_2);
  // dzdl for (s1, max_z0); see the scalar paraphrase after the fourth block.
  __m128 dz2 = _mm_sub_ps(z, max_z0);
  dz2 = _mm_mul_ps(dz2, dz2);
  tmp1 = _mm_mul_ps(s1, s1);
  tmp1 = _mm_add_ps(tmp1, dz2);
  __m128 dzdl_1 = _mm_div_ps(dz2, tmp1);
  dzdl_1 = _vec_sqrt_ps(dzdl_1);
  // Negate where z < z0: flip the sign bit with SIGNMASK under the mask.
  tmp1 = _mm_cmplt_ps(z, max_z0);
  tmp2 = _mm_xor_ps(dzdl_1, SIGNMASK);
  tmp3 = _mm_and_ps(tmp1, tmp2);
  dzdl_1 = _mm_andnot_ps(tmp1, dzdl_1);
  dzdl_1 = _mm_xor_ps(dzdl_1, tmp3);
  // dzdl for (s1, min_z0).
  dz2 = _mm_sub_ps(z, min_z0);
  dz2 = _mm_mul_ps(dz2, dz2);
  tmp1 = _mm_mul_ps(s1, s1);
  tmp1 = _mm_add_ps(tmp1, dz2);
  __m128 dzdl_2 = _mm_div_ps(dz2, tmp1);
  dzdl_2 = _vec_sqrt_ps(dzdl_2);
  tmp1 = _mm_cmplt_ps(z, min_z0);
  tmp2 = _mm_xor_ps(dzdl_2, SIGNMASK);
  tmp3 = _mm_and_ps(tmp1, tmp2);
  dzdl_2 = _mm_andnot_ps(tmp1, dzdl_2);
  dzdl_2 = _mm_xor_ps(dzdl_2, tmp3);

  // dzdl for (s2, max_z0).
  dz2 = _mm_sub_ps(z, max_z0);
  dz2 = _mm_mul_ps(dz2, dz2);
  tmp1 = _mm_mul_ps(s2, s2);
  tmp1 = _mm_add_ps(tmp1, dz2);
  __m128 dzdl_3 = _mm_div_ps(dz2, tmp1);
  dzdl_3 = _vec_sqrt_ps(dzdl_3);
  tmp1 = _mm_cmplt_ps(z, max_z0);
  tmp2 = _mm_xor_ps(dzdl_3, SIGNMASK);
  tmp3 = _mm_and_ps(tmp1, tmp2);
  dzdl_3 = _mm_andnot_ps(tmp1, dzdl_3);
  dzdl_3 = _mm_xor_ps(dzdl_3, tmp3);

  // dzdl for (s2, min_z0).
  dz2 = _mm_sub_ps(z, min_z0);
  dz2 = _mm_mul_ps(dz2, dz2);
  tmp1 = _mm_mul_ps(s2, s2);
  tmp1 = _mm_add_ps(tmp1, dz2);
  __m128 dzdl_4 = _mm_div_ps(dz2, tmp1);
  dzdl_4 = _vec_sqrt_ps(dzdl_4);
  tmp1 = _mm_cmplt_ps(z, min_z0);
  tmp2 = _mm_xor_ps(dzdl_4, SIGNMASK);
  tmp3 = _mm_and_ps(tmp1, tmp2);
  dzdl_4 = _mm_andnot_ps(tmp1, dzdl_4);
  dzdl_4 = _mm_xor_ps(dzdl_4, tmp3);
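
  // Scalar paraphrase of each of the four blocks above (illustrative only, not
  // part of the original source): with dz = z - z0 and s the transverse arc
  // length, each block computes
  //
  //   float dzdl(float s, float z, float z0) {
  //     float dz = z - z0;
  //     return dz / sqrtf(s * s + dz * dz);  // sign supplied by the mask trick
  //   }
  //
  // evaluated for the four corner combinations (s1, max_z0), (s1, min_z0),
  // (s2, max_z0) and (s2, min_z0).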
  // Largest of the four dzdl values, lane by lane (branchless max).
  __m128 dzdl_max = dzdl_1;
  tmp1 = _mm_cmpgt_ps(dzdl_2, dzdl_max);
  tmp2 = _mm_and_ps(tmp1, dzdl_2);
  tmp3 = _mm_andnot_ps(tmp1, dzdl_max);
  dzdl_max = _mm_xor_ps(tmp2, tmp3);
  tmp1 = _mm_cmpgt_ps(dzdl_3, dzdl_max);
  tmp2 = _mm_and_ps(tmp1, dzdl_3);
  tmp3 = _mm_andnot_ps(tmp1, dzdl_max);
  dzdl_max = _mm_xor_ps(tmp2, tmp3);
  tmp1 = _mm_cmpgt_ps(dzdl_4, dzdl_max);
  tmp2 = _mm_and_ps(tmp1, dzdl_4);
  tmp3 = _mm_andnot_ps(tmp1, dzdl_max);
  dzdl_max = _mm_xor_ps(tmp2, tmp3);

  // Smallest of the four dzdl values, lane by lane (branchless min).
  __m128 dzdl_min = dzdl_1;
  tmp1 = _mm_cmplt_ps(dzdl_2, dzdl_min);
  tmp2 = _mm_and_ps(tmp1, dzdl_2);
  tmp3 = _mm_andnot_ps(tmp1, dzdl_min);
  dzdl_min = _mm_xor_ps(tmp2, tmp3);
  tmp1 = _mm_cmplt_ps(dzdl_3, dzdl_min);
  tmp2 = _mm_and_ps(tmp1, dzdl_3);
  tmp3 = _mm_andnot_ps(tmp1, dzdl_min);
  dzdl_min = _mm_xor_ps(tmp2, tmp3);
  tmp1 = _mm_cmplt_ps(dzdl_4, dzdl_min);
  tmp2 = _mm_and_ps(tmp1, dzdl_4);
  tmp3 = _mm_andnot_ps(tmp1, dzdl_min);
  dzdl_min = _mm_xor_ps(tmp2, tmp3);
  // Store the per-hit dzdl range (16-byte-aligned destinations).
  _mm_store_ps(min_dzdl_a, dzdl_min);
  _mm_store_ps(max_dzdl_a, dzdl_max);
}
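
// Illustrative call site (not part of the original source; variable names are
// assumptions).  _mm_load_ps / _mm_store_ps require every float* argument to
// point at 16-byte-aligned storage holding four values, one per hit, and the
// HelixHough object is assumed to be set up elsewhere:
//
//   alignas(16) float x[4], y[4], z[4];           // four hit positions
//   alignas(16) float min_z0[4], max_z0[4];       // per-hit z0 bounds
//   alignas(16) float min_dzdl[4], max_dzdl[4];   // outputs
//   // ... fill x, y, z, min_z0, max_z0 ...
//   hough.dzdlRange_sse(x, y, z, cosphi1, sinphi1, cosphi2, sinphi2,
//                       min_k, max_k, min_d, max_d,
//                       min_z0, max_z0, min_dzdl, max_dzdl);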