EIC Software
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CreateSeedsForGroupSycl.cpp
Go to the documentation of this file. Or view the newest version in sPHENIX GitHub for file CreateSeedsForGroupSycl.cpp
1 // This file is part of the Acts project.
2 //
3 // Copyright (C) 2020 CERN for the benefit of the Acts project
4 //
5 // This Source Code Form is subject to the terms of the Mozilla Public
6 // License, v. 2.0. If a copy of the MPL was not distributed with this
7 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 
9 // System include(s)
10 #include <algorithm>
11 #include <cstdint>
12 #include <cstring>
13 #include <exception>
14 #include <functional>
15 #include <vector>
16 
17 // Acts include(s)
19 
20 // SYCL plugin include(s)
24 
25 // SYCL include
26 #include <CL/sycl.hpp>
27 
28 namespace Acts::Sycl {
29 // Kernel classes in order of execution.
30 class duplet_search_bottom_kernel;
31 class duplet_search_top_kernel;
32 class ind_copy_bottom_kernel;
33 class ind_copy_top_kernel;
34 class transform_coord_bottom_kernel;
35 class transform_coord_top_kernel;
36 class triplet_search_kernel;
37 class filter_2sp_fixed_kernel;
38 
40  const QueueWrapper& wrappedQueue,
41  const detail::DeviceSeedfinderConfig& seedfinderConfig,
42  const DeviceExperimentCuts& deviceCuts,
43  const std::vector<detail::DeviceSpacePoint>& bottomSPs,
44  const std::vector<detail::DeviceSpacePoint>& middleSPs,
45  const std::vector<detail::DeviceSpacePoint>& topSPs,
46  std::vector<std::vector<detail::SeedData>>& seeds) {
47  // Each vector stores data of space points in simplified
48  // structures of float variables
49  // M: number of middle space points
50  // B: number of bottom space points
51  // T: number of top space points
52  const uint32_t M = middleSPs.size();
53  const uint32_t B = bottomSPs.size();
54  const uint32_t T = topSPs.size();
55 
56  // Up to the Nth space point, the sum of compatible bottom/top space points.
57  // We need these for indexing other vectors later in the algorithm.
58  // These are prefix sum arrays, with a leading zero.
59  std::vector<uint32_t> sumBotCompUptoMid(M + 1, 0);
60  std::vector<uint32_t> sumTopCompUptoMid(M + 1, 0);
61  std::vector<uint32_t> sumBotTopCombined(M + 1, 0);
62 
63  // After completing the duplet search, we'll have successfully contructed
64  // two bipartite graphs for bottom-middle and top-middle space points.
65  // We store the indices of the middle space points of the corresponding
66  // edges.
67  std::vector<uint32_t> indMidBotComp;
68  std::vector<uint32_t> indMidTopComp;
69 
70  // Number of edges for middle-bottom and middle-top duplet bipartite graphs.
71  uint64_t edgesBottom = 0;
72  uint64_t edgesTop = 0;
73  // Number of possible compatible triplets. This is the sum of the
74  // combination of the number of compatible bottom and compatible top duplets
75  // per middle space point. (nb0*nt0 + nb1*nt1 + ... where nbk is the number
76  // of comp. bot. SPs for the kth middle SP)
77  uint64_t edgesComb = 0;
78 
79  try {
80  auto* q = wrappedQueue.getQueue();
81  uint64_t globalBufferSize =
82  q->get_device().get_info<cl::sycl::info::device::global_mem_size>();
83  uint64_t maxWorkGroupSize =
84  q->get_device().get_info<cl::sycl::info::device::max_work_group_size>();
85 
86  // Device allocations
87  detail::DeviceSpacePoint* deviceBottomSPs =
88  cl::sycl::malloc_device<detail::DeviceSpacePoint>(B, *q);
89  detail::DeviceSpacePoint* deviceMiddleSPs =
90  cl::sycl::malloc_device<detail::DeviceSpacePoint>(M, *q);
91  detail::DeviceSpacePoint* deviceTopSPs =
92  cl::sycl::malloc_device<detail::DeviceSpacePoint>(T, *q);
93 
94  // The limit of compatible bottom [top] space points per middle space
95  // point is B [T]. Temporarily we reserve buffers of this size (M*B and
96  // M*T). We store the indices of bottom [top] space points in bottomSPs
97  // [topSPs]. We move the indices to optimal size vectors for easier
98  // indexing.
99  uint32_t* deviceTmpIndBot = cl::sycl::malloc_device<uint32_t>(M * B, *q);
100  uint32_t* deviceTmpIndTop = cl::sycl::malloc_device<uint32_t>(M * T, *q);
101 
102  q->memcpy(deviceBottomSPs, bottomSPs.data(),
103  sizeof(detail::DeviceSpacePoint) * (B));
104  q->memcpy(deviceMiddleSPs, middleSPs.data(),
105  sizeof(detail::DeviceSpacePoint) * (M));
106  q->memcpy(deviceTopSPs, topSPs.data(),
107  sizeof(detail::DeviceSpacePoint) * (T));
108 
109  // Calculate 2 dimensional range of bottom-middle duplet search kernel
110  // We'll have a total of M*B threads globally, but we need to give the
111  // nd_range the global dimensions so that they are an exact multiple of
112  // the local dimensions. That's why we need this calculation.
113 
114  cl::sycl::nd_range<2> bottomDupletNDRange =
115  calculate2DimNDRange(M, B, maxWorkGroupSize);
116  cl::sycl::nd_range<2> topDupletNDRange =
117  calculate2DimNDRange(M, T, maxWorkGroupSize);
118 
119  // Count compatible bottom/top space points per middle space point.
120  std::vector<uint32_t> countBotDuplets(M, 0);
121  std::vector<uint32_t> countTopDuplets(M, 0);
122 
123  //*********************************************//
124  // ********** DUPLET SEARCH - BEGIN ********** //
125  //*********************************************//
126 
127  {
128  sycl::buffer<uint32_t> countBotBuf(countBotDuplets.data(), M);
129  sycl::buffer<uint32_t> countTopBuf(countTopDuplets.data(), M);
130  q->submit([&](cl::sycl::handler& h) {
131  auto countBotDupletsAcc =
132  sycl::ONEAPI::atomic_accessor<uint32_t, 1,
133  sycl::ONEAPI::memory_order::relaxed,
134  sycl::ONEAPI::memory_scope::device>(
135  countBotBuf, h);
136  h.parallel_for<duplet_search_bottom_kernel>(
137  bottomDupletNDRange, [=](cl::sycl::nd_item<2> item) {
138  const auto mid = item.get_global_id(0);
139  const auto bot = item.get_global_id(1);
140  // We check whether this thread actually makes sense (within
141  // bounds).
142  // The number of threads is usually a factor of 2, or 3*2^k (k
143  // \in N), etc. Without this check we may index out of arrays.
144  if (mid < M && bot < B) {
145  const auto midSP = deviceMiddleSPs[mid];
146  const auto botSP = deviceBottomSPs[bot];
147 
148  const auto deltaR = midSP.r - botSP.r;
149  const auto cotTheta = (midSP.z - botSP.z) / deltaR;
150  const auto zOrigin = midSP.z - midSP.r * cotTheta;
151 
152  if (!(deltaR < seedfinderConfig.deltaRMin) &&
153  !(deltaR > seedfinderConfig.deltaRMax) &&
154  !(cl::sycl::abs(cotTheta) > seedfinderConfig.cotThetaMax) &&
155  !(zOrigin < seedfinderConfig.collisionRegionMin) &&
156  !(zOrigin > seedfinderConfig.collisionRegionMax)) {
157  // We keep counting duplets with atomic access.
158  const auto ind = countBotDupletsAcc[mid].fetch_add(1);
159  deviceTmpIndBot[mid * B + ind] = bot;
160  }
161  }
162  });
163  });
164 
165  q->submit([&](cl::sycl::handler& h) {
166  auto countTopDupletsAcc =
167  sycl::ONEAPI::atomic_accessor<uint32_t, 1,
168  sycl::ONEAPI::memory_order::relaxed,
169  sycl::ONEAPI::memory_scope::device>(
170  countTopBuf, h);
171  h.parallel_for<duplet_search_top_kernel>(
172  topDupletNDRange, [=](cl::sycl::nd_item<2> item) {
173  const auto mid = item.get_global_id(0);
174  const auto top = item.get_global_id(1);
175  // We check whether this thread actually makes sense (within
176  // bounds).
177  if (mid < M && top < T) {
178  const auto midSP = deviceMiddleSPs[mid];
179  const auto topSP = deviceTopSPs[top];
180 
181  const auto deltaR = topSP.r - midSP.r;
182  const auto cotTheta = (topSP.z - midSP.z) / deltaR;
183  const auto zOrigin = midSP.z - midSP.r * cotTheta;
184 
185  if (!(deltaR < seedfinderConfig.deltaRMin) &&
186  !(deltaR > seedfinderConfig.deltaRMax) &&
187  !(cl::sycl::abs(cotTheta) > seedfinderConfig.cotThetaMax) &&
188  !(zOrigin < seedfinderConfig.collisionRegionMin) &&
189  !(zOrigin > seedfinderConfig.collisionRegionMax)) {
190  // We keep counting duplets with atomic access.
191  const auto ind = countTopDupletsAcc[mid].fetch_add(1);
192  deviceTmpIndTop[mid * T + ind] = top;
193  }
194  }
195  });
196  });
197  } // sync (buffers get destroyed and duplet counts are copied back to
198  // countBotDuplets and countTopDuplets)
199 
200  //*********************************************//
201  // *********** DUPLET SEARCH - END *********** //
202  //*********************************************//
203 
204  // retrieve results from counting duplets
205  {
206  // Construct prefix sum arrays of duplet counts.
207  // These will later be used to index other arrays based on middle SP
208  // indices.
209  for (uint32_t i = 1; i < M + 1; ++i) {
210  sumBotCompUptoMid[i] +=
211  sumBotCompUptoMid[i - 1] + countBotDuplets[i - 1];
212  sumTopCompUptoMid[i] +=
213  sumTopCompUptoMid[i - 1] + countTopDuplets[i - 1];
214  sumBotTopCombined[i] += sumBotTopCombined[i - 1] +
215  countTopDuplets[i - 1] * countBotDuplets[i - 1];
216  }
217 
218  edgesBottom = sumBotCompUptoMid[M];
219  edgesTop = sumTopCompUptoMid[M];
220  edgesComb = sumBotTopCombined[M];
221 
222  indMidBotComp.reserve(edgesBottom);
223  indMidTopComp.reserve(edgesTop);
224 
225  // Fill arrays of middle SP indices of found duplets (bottom and top).
226  for (uint32_t mid = 0; mid < M; ++mid) {
227  std::fill_n(std::back_inserter(indMidBotComp), countBotDuplets[mid],
228  mid);
229  std::fill_n(std::back_inserter(indMidTopComp), countTopDuplets[mid],
230  mid);
231  }
232  }
233 
234  if (edgesBottom > 0 && edgesTop > 0) {
235  // Calcualte global and local range of execution for edgesBottom number of
236  // threads. Local range is the same as block size in CUDA.
237  cl::sycl::nd_range<1> edgesBotNdRange =
238  calculate1DimNDRange(edgesBottom, maxWorkGroupSize);
239 
240  // Global and local range of execution for edgesTop number of threads.
241  cl::sycl::nd_range<1> edgesTopNdRange =
242  calculate1DimNDRange(edgesTop, maxWorkGroupSize);
243 
244  // EXPLANATION OF INDEXING (fisrt part)
245  /*
246  (for bottom-middle duplets, but it is the same for middle-tops)
247 
248  In case we have 4 middle SP and 5 bottom SP, our temporary array of
249  the compatible bottom duplet indices would look like this:
250  ---------------------
251  mid0 | 0 | 3 | 4 | 1 | - | Indices in the columns correspond to
252  mid1 | 3 | 2 | - | - | - | bottom SP indices in the bottomSPs
253  mid2 | - | - | - | - | - | array. Threads are executed concurrently,
254  mid3 | 4 | 2 | 1 | - | - | so the order of indices is random.
255  ---------------------
256  We will refer to this structure as a bipartite graph, as it can be
257  described by a graph of nodes for middle and bottom SPs, and edges
258  between one middle and one bottom SP, but never two middle or two
259  bottom SPs.
260 
261  We will flatten this matrix out, and store the indices the
262  following way (this is deviceIndBot):
263  -------------------------------------
264  | 0 | 3 | 4 | 1 | 3 | 2 | 4 | 2 | 1 |
265  -------------------------------------
266 
267  Also the length of this array is equal to edgesBottom, which is 9 in
268  this example. It is the number of the edges of the bottom-middle
269  bipartite graph.
270 
271  To find out where the indices of bottom SPs start for a particular
272  middle SP, we use prefix sum arrays.
273  We know how many duplets were found for each middle SP (this is
274  deviceCountBotDuplets).
275  -----------------
276  | 4 | 2 | 0 | 3 |
277  -----------------
278 
279  We will make a prefix sum array of these counts, with a leading zero:
280  (this is deviceSumBot)
281  ---------------------
282  | 0 | 4 | 6 | 6 | 9 |
283  ---------------------
284 
285  If we have the middle SP with index 1, then we know that the indices
286  of the compatible bottom SPs are in the range (left closed, right
287  open) [deviceSumBot[1] , deviceSumBot[2] ) of deviceIndBot. In this
288  case, these indices are 3 and 2, so we'd use these to index
289  deviceBottomSPs to gather data about the bottom SP.
290 
291  To be able to get the indices of middle SPs in constant time inside
292  kernels, we will also prepare arrays that store the indices of the
293  middleSPs of the edges (deviceMidIndPerBot).
294  -------------------------------------
295  | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 3 | 3 |
296  -------------------------------------
297 
298  (For the same purpose, we could also do a binary search on the
299  deviceSumBot array, and we will do exactly that later, in the triplet
300  search kernel.)
301 
302  We will execute the coordinate transformation on edgesBottom threads,
303  or 9 in our example.
304 
305  The size of the array storing our transformed coordinates
306  (deviceLinBot) is also edgesBottom, the sum of bottom duplets we
307  found so far.
308  */
309 
310  sycl::buffer<uint32_t> numTopDupletsBuf(countTopDuplets.data(), M);
311 
312  // We store the indices of the BOTTOM/TOP space points of the edges of
313  // the bottom-middle and top-middle bipartite duplet graphs. They index
314  // the bottomSPs and topSPs vectors.
315  uint32_t* deviceIndBot =
316  cl::sycl::malloc_device<uint32_t>(edgesBottom, *q);
317  uint32_t* deviceIndTop = cl::sycl::malloc_device<uint32_t>(edgesTop, *q);
318 
319  // We store the indices of the MIDDLE space points of the edges of the
320  // bottom-middle and top-middle bipartite duplet graphs.
321  // They index the middleSP vector.
322  uint32_t* deviceMidIndPerBot =
323  cl::sycl::malloc_device<uint32_t>(edgesBottom, *q);
324  uint32_t* deviceMidIndPerTop =
325  cl::sycl::malloc_device<uint32_t>(edgesTop, *q);
326 
327  // Partial sum arrays of deviceNumBot and deviceNum
328  uint32_t* deviceSumBot = cl::sycl::malloc_device<uint32_t>(M + 1, *q);
329  uint32_t* deviceSumTop = cl::sycl::malloc_device<uint32_t>(M + 1, *q);
330 
331  // Partial sum array of the combinations of compatible bottom and top
332  // space points per middle space point.
333  uint32_t* deviceSumComb = cl::sycl::malloc_device<uint32_t>(M + 1, *q);
334 
335  // Allocations for coordinate transformation.
336  detail::DeviceLinEqCircle* deviceLinBot =
337  cl::sycl::malloc_device<detail::DeviceLinEqCircle>(edgesBottom, *q);
338  detail::DeviceLinEqCircle* deviceLinTop =
339  cl::sycl::malloc_device<detail::DeviceLinEqCircle>(edgesTop, *q);
340 
341  q->memcpy(deviceMidIndPerBot, indMidBotComp.data(),
342  sizeof(uint32_t) * edgesBottom);
343  q->memcpy(deviceMidIndPerTop, indMidTopComp.data(),
344  sizeof(uint32_t) * edgesTop);
345  q->memcpy(deviceSumBot, sumBotCompUptoMid.data(),
346  sizeof(uint32_t) * (M + 1));
347  q->memcpy(deviceSumTop, sumTopCompUptoMid.data(),
348  sizeof(uint32_t) * (M + 1));
349  q->memcpy(deviceSumComb, sumBotTopCombined.data(),
350  sizeof(uint32_t) * (M + 1));
351  q->wait();
352 
353  // Copy indices from temporary matrices to final, optimal size vectors.
354  // We will use these for easier indexing.
355  {
356  q->submit([&](cl::sycl::handler& h) {
357  h.parallel_for<ind_copy_bottom_kernel>(
358  edgesBotNdRange, [=](cl::sycl::nd_item<1> item) {
359  auto idx = item.get_global_linear_id();
360  if (idx < edgesBottom) {
361  auto mid = deviceMidIndPerBot[idx];
362  auto ind = deviceTmpIndBot[mid * B + idx - deviceSumBot[mid]];
363  deviceIndBot[idx] = ind;
364  }
365  });
366  });
367 
368  q->submit([&](cl::sycl::handler& h) {
369  h.parallel_for<ind_copy_top_kernel>(
370  edgesTopNdRange, [=](cl::sycl::nd_item<1> item) {
371  auto idx = item.get_global_linear_id();
372  if (idx < edgesTop) {
373  auto mid = deviceMidIndPerTop[idx];
374  auto ind = deviceTmpIndTop[mid * T + idx - deviceSumTop[mid]];
375  deviceIndTop[idx] = ind;
376  }
377  });
378  });
379  } // sync
380 
381  //************************************************//
382  // *** LINEAR EQUATION TRANSFORMATION - BEGIN *** //
383  //************************************************//
384 
385  // transformation of circle equation (x,y) into linear equation (u,v)
386  // x^2 + y^2 - 2x_0*x - 2y_0*y = 0
387  // is transformed into
388  // 1 - 2x_0*u - 2y_0*v = 0
389 
390  // coordinate transformation middle-bottom pairs
391  auto linB = q->submit([&](cl::sycl::handler& h) {
392  h.parallel_for<transform_coord_bottom_kernel>(
393  edgesBotNdRange, [=](cl::sycl::nd_item<1> item) {
394  auto idx = item.get_global_linear_id();
395  if (idx < edgesBottom) {
396  const auto midSP = deviceMiddleSPs[deviceMidIndPerBot[idx]];
397  const auto botSP = deviceBottomSPs[deviceIndBot[idx]];
398 
399  const auto xM = midSP.x;
400  const auto yM = midSP.y;
401  const auto zM = midSP.z;
402  const auto rM = midSP.r;
403  const auto varianceZM = midSP.varZ;
404  const auto varianceRM = midSP.varR;
405  const auto cosPhiM = xM / rM;
406  const auto sinPhiM = yM / rM;
407 
408  const auto deltaX = botSP.x - xM;
409  const auto deltaY = botSP.y - yM;
410  const auto deltaZ = botSP.z - zM;
411 
412  const auto x = deltaX * cosPhiM + deltaY * sinPhiM;
413  const auto y = deltaY * cosPhiM - deltaX * sinPhiM;
414  const auto iDeltaR2 = 1.f / (deltaX * deltaX + deltaY * deltaY);
415  const auto iDeltaR = cl::sycl::sqrt(iDeltaR2);
416  const auto cot_theta = -(deltaZ * iDeltaR);
417 
419  L.cotTheta = cot_theta;
420  L.zo = zM - rM * cot_theta;
421  L.iDeltaR = iDeltaR;
422  L.u = x * iDeltaR2;
423  L.v = y * iDeltaR2;
424  L.er = ((varianceZM + botSP.varZ) +
425  (cot_theta * cot_theta) * (varianceRM + botSP.varR)) *
426  iDeltaR2;
427 
428  deviceLinBot[idx] = L;
429  }
430  });
431  });
432 
433  // coordinate transformation middle-top pairs
434  auto linT = q->submit([&](cl::sycl::handler& h) {
435  h.parallel_for<transform_coord_top_kernel>(
436  edgesTopNdRange, [=](cl::sycl::nd_item<1> item) {
437  auto idx = item.get_global_linear_id();
438  if (idx < edgesTop) {
439  const auto midSP = deviceMiddleSPs[deviceMidIndPerTop[idx]];
440  const auto topSP = deviceTopSPs[deviceIndTop[idx]];
441 
442  const auto xM = midSP.x;
443  const auto yM = midSP.y;
444  const auto zM = midSP.z;
445  const auto rM = midSP.r;
446  const auto varianceZM = midSP.varZ;
447  const auto varianceRM = midSP.varR;
448  const auto cosPhiM = xM / rM;
449  const auto sinPhiM = yM / rM;
450 
451  const auto deltaX = topSP.x - xM;
452  const auto deltaY = topSP.y - yM;
453  const auto deltaZ = topSP.z - zM;
454 
455  const auto x = deltaX * cosPhiM + deltaY * sinPhiM;
456  const auto y = deltaY * cosPhiM - deltaX * sinPhiM;
457  const auto iDeltaR2 = 1.f / (deltaX * deltaX + deltaY * deltaY);
458  const auto iDeltaR = cl::sycl::sqrt(iDeltaR2);
459  const auto cot_theta = deltaZ * iDeltaR;
460 
462  L.cotTheta = cot_theta;
463  L.zo = zM - rM * cot_theta;
464  L.iDeltaR = iDeltaR;
465  L.u = x * iDeltaR2;
466  L.v = y * iDeltaR2;
467  L.er = ((varianceZM + topSP.varZ) +
468  (cot_theta * cot_theta) * (varianceRM + topSP.varR)) *
469  iDeltaR2;
470 
471  deviceLinTop[idx] = L;
472  }
473  });
474  });
475 
476  //************************************************//
477  // **** LINEAR EQUATION TRANSFORMATION - END **** //
478  //************************************************//
479 
480  //************************************************//
481  // *********** TRIPLET SEARCH - BEGIN *********** //
482  //************************************************//
483 
484  // EXPLANATION OF INDEXING (second part)
485  /*
486  For the triplet search, we calculate the upper limit of constructible
487  triplets.
488 
489  For this, we multiply the number of compatible bottom and compatible
490  top SPs for each middle SP, and add these together.
491  (nb0*nt0 + nb1*nt1 + ... where nbk is the number of compatible bottom
492  SPs for the kth middle SP, similarly ntb is for tops)
493 
494  sumBotTopCombined is a prefix sum array (of length M+1) of the
495  calculated combinations.
496 
497  sumBotTopCombined:
498  ________________________________________________________
499  | | | | | M | M = number
500  | 0 | nb0*nt0 | nb0*nt0 + nb1*nt1 | ... | ∑ nbi+nti | of middle
501  |_____|_________|___________________|_____|_i=0________| space points
502 
503  We will start kernels and reserve memory for these combinations but
504  only so much we can fit into memory at once.
505 
506  We limit our memory usage to globalBufferSize/2, this is currently
507  hard-coded, but it could be configured. Actually, it would be better
508  to use a separate object that manages memory allocations and
509  deallocations and we could ask it to lend us as much memory as it is
510  happy to give.
511 
512  For later, let maxMemoryAllocation be maximum allocatable memory for
513  triplet search.
514 
515  We start by adding up summing the combinations, until we arrive at a
516  k which for:
517 
518  k+1
519  ∑ nbi+nti > maxMemoryAllocation
520  i=0
521  (or k == M).
522 
523  So we know, that we need to start our first kernel for the first k
524  middle SPs.
525 
526  Inside the triplet search kernel we start with a binary search, to
527  find out which middle SP the thread corresponds to. Note, that
528  sumBotTopCombined is a monotone increasing series of values which
529  allows us to do a binary search on it.
530 
531  Inside the triplet search kernel we count the triplets for fixed
532  bottom and middle SP. This is deviceCountTriplets.
533 
534  The triplet filter kernel is calculated on threads equal to all possible
535  bottom-middle combinations for the first k middle SPs, which are
536  the sum of bottom-middle duplets. (For the next kernel it would be the
537  bottom-middle combinations from the (k+1)th middle SP to another jth
538  middle SP j<=M.)
539 
540  This will be numTripletFilterThreads =
541  sumBotCompUptoMid[lastMiddle] - sumBotCompUptoMid[firstMiddle]
542 
543  If the triplet search and triplet filter kernel finished, we continue
544  summing up possible triplet combinations from the (k+1)th middle SP.
545 
546  Inside the kernels we need to use offset because of this, to be able to
547  map threads to space point indices.
548 
549  This offset is sumCombUptoFirstMiddle.
550  */
551 
552  const auto maxMemoryAllocation =
553  std::min(edgesComb,
554  globalBufferSize / uint64_t((sizeof(detail::DeviceTriplet) +
555  sizeof(detail::SeedData)) *
556  2));
557 
558  detail::DeviceTriplet* deviceCurvImpact =
559  cl::sycl::malloc_device<detail::DeviceTriplet>(maxMemoryAllocation,
560  (*q));
561 
562  // Reserve memory in advance for seed indices and weight
563  // Other way around would allocating it inside the loop
564  // -> less memory usage, but more frequent allocation and deallocation
565  detail::SeedData* deviceSeedArray =
566  cl::sycl::malloc_device<detail::SeedData>(maxMemoryAllocation, *q);
567 
568  // Counting the seeds in the second kernel allows us to copy back the
569  // right number of seeds, and no more.
570 
571  seeds.resize(M);
572  uint32_t sumSeeds = 0;
573  std::vector<uint32_t> deviceCountTriplets(edgesBottom, 0);
574 
575  // Do the triplet search and triplet filter for 2 sp fixed for middle
576  // space points in the interval [firstMiddle, lastMiddle).
577 
578  uint32_t lastMiddle = 0;
579  for (uint32_t firstMiddle = 0; firstMiddle < M;
580  firstMiddle = lastMiddle) {
581  // Determine the interval [firstMiddle, lastMiddle) right end based
582  // on memory requirements.
583  while (lastMiddle + 1 <= M && (sumBotTopCombined[lastMiddle + 1] -
584  sumBotTopCombined[firstMiddle] <
585  maxMemoryAllocation)) {
586  ++lastMiddle;
587  }
588 
589  const auto numTripletSearchThreads =
590  sumBotTopCombined[lastMiddle] - sumBotTopCombined[firstMiddle];
591 
592  if (numTripletSearchThreads == 0)
593  continue;
594 
595  sumSeeds = 0;
596  deviceCountTriplets.resize(edgesBottom, 0);
597 
598  const auto numTripletFilterThreads =
599  sumBotCompUptoMid[lastMiddle] - sumBotCompUptoMid[firstMiddle];
600 
601  const auto sumCombUptoFirstMiddle = sumBotTopCombined[firstMiddle];
602 
603  // Nd_range with maximum block size for triplet search and filter.
604  // (global and local range is already given)
605  cl::sycl::nd_range<1> tripletSearchNDRange =
606  calculate1DimNDRange(numTripletSearchThreads, maxWorkGroupSize);
607 
608  cl::sycl::nd_range<1> tripletFilterNDRange =
609  calculate1DimNDRange(numTripletFilterThreads, maxWorkGroupSize);
610 
611  sycl::buffer<uint32_t> countTripletsBuf(deviceCountTriplets.data(),
612  edgesBottom);
613 
614  auto tripletKernel = q->submit([&](cl::sycl::handler& h) {
615  h.depends_on({linB, linT});
616  auto countTripletsAcc =
617  sycl::ONEAPI::atomic_accessor<uint32_t, 1,
618  sycl::ONEAPI::memory_order::relaxed,
619  sycl::ONEAPI::memory_scope::device>(
620  countTripletsBuf, h);
621  auto numTopDupletsAcc = numTopDupletsBuf.get_access<
622  sycl::access::mode::read, sycl::access::target::global_buffer>(h);
623  h.parallel_for<triplet_search_kernel>(
624  tripletSearchNDRange, [=](cl::sycl::nd_item<1> item) {
625  const uint32_t idx = item.get_global_linear_id();
626  if (idx < numTripletSearchThreads) {
627  // Retrieve the index of the corresponding middle
628  // space point by binary search
629  auto L = firstMiddle;
630  auto R = lastMiddle;
631  auto mid = L;
632  while (L < R - 1) {
633  mid = (L + R) / 2;
634  // To be able to search in deviceSumComb, we need
635  // to use an offset (sumCombUptoFirstMiddle).
636  if (idx + sumCombUptoFirstMiddle < deviceSumComb[mid]) {
637  R = mid;
638  } else {
639  L = mid;
640  }
641  }
642  mid = L;
643 
644  const auto numT = numTopDupletsAcc[mid];
645  const auto threadIdxForMiddleSP =
646  (idx - deviceSumComb[mid] + sumCombUptoFirstMiddle);
647 
648  // NOTES ON THREAD MAPPING TO SPACE POINTS
649  /*
650  We need to map bottom and top SP indices to this
651  thread.
652 
653  So we are mapping one bottom and one top SP to this thread
654  (we already have a middle SP) which gives us a tiplet.
655 
656  This is done in the following way: We
657  calculated the number of possible triplet
658  combinations for this middle SP (let it be
659  num_comp_bot*num_comp_top). Let num_comp_bot = 2
660  and num_comp_top=3 in this example. So we have 2
661  compatible bottom and 3 compatible top SP for this
662  middle SP.
663 
664  That gives us 6 threads altogether:
665  ===========================================
666  thread: | 0 | 1 | 2 | 3 | 4 | 5 |
667  bottom id: | bot0 | bot0 | bot0 | bot1 | bot1 | bot1 |
668  top id: | top0 | top1 | top2 | top0 | top1 | top2 |
669  ===========================================
670 
671  If we divide 6 by the number of compatible top SP
672  for this middle SP, or deviceNumTopDuplets[mid]
673  which is 3 now, we get the id for the bottom SP.
674  Similarly, if we take modulo
675  deviceNumTopDuplets[mid], we get the id for the
676  top SP.
677 
678  So if threadIdxForMiddleSP = 3, then ib = 1 and it = 0.
679 
680  We can use these ids together with
681  deviceSumBot[mid] and deviceSumTop[mid] to be able
682  to index our other arrays.
683 
684  These other arrays are deviceIndBot and deviceIndTop.
685 
686  So to retrieve the bottom SP index for this thread, we'd
687  have to index the deviceIndBot array at
688  deviceSumBot[mid] + ib
689  which is the id for the bottom SP that we just calculated
690  (ib = 1 in the example).
691  */
692 
693  const auto ib =
694  deviceSumBot[mid] + (threadIdxForMiddleSP / numT);
695  const auto it =
696  deviceSumTop[mid] + (threadIdxForMiddleSP % numT);
697 
698  const auto linBotEq = deviceLinBot[ib];
699  const auto linTopEq = deviceLinTop[it];
700  const auto midSP = deviceMiddleSPs[mid];
701 
702  const auto Vb = linBotEq.v;
703  const auto Ub = linBotEq.u;
704  const auto Erb = linBotEq.er;
705  const auto cotThetab = linBotEq.cotTheta;
706  const auto iDeltaRb = linBotEq.iDeltaR;
707 
708  const auto Vt = linTopEq.v;
709  const auto Ut = linTopEq.u;
710  const auto Ert = linTopEq.er;
711  const auto cotThetat = linTopEq.cotTheta;
712  const auto iDeltaRt = linTopEq.iDeltaR;
713 
714  const auto rM = midSP.r;
715  const auto varianceRM = midSP.varR;
716  const auto varianceZM = midSP.varZ;
717 
718  auto iSinTheta2 = (1.f + cotThetab * cotThetab);
719  auto scatteringInRegion2 =
720  seedfinderConfig.maxScatteringAngle2 * iSinTheta2;
721  scatteringInRegion2 *= seedfinderConfig.sigmaScattering *
722  seedfinderConfig.sigmaScattering;
723  auto error2 =
724  Ert + Erb +
725  2.f * (cotThetab * cotThetat * varianceRM + varianceZM) *
726  iDeltaRb * iDeltaRt;
727  auto deltaCotTheta = cotThetab - cotThetat;
728  auto deltaCotTheta2 = deltaCotTheta * deltaCotTheta;
729 
730  deltaCotTheta = cl::sycl::abs(deltaCotTheta);
731  auto error = cl::sycl::sqrt(error2);
732  auto dCotThetaMinusError2 =
733  deltaCotTheta2 + error2 - 2.f * deltaCotTheta * error;
734  auto dU = Ut - Ub;
735 
736  if ((!(deltaCotTheta2 - error2 > 0.f) ||
737  !(dCotThetaMinusError2 > scatteringInRegion2)) &&
738  !(dU == 0.f)) {
739  auto A = (Vt - Vb) / dU;
740  auto S2 = 1.f + A * A;
741  auto B = Vb - A * Ub;
742  auto B2 = B * B;
743 
744  auto iHelixDiameter2 = B2 / S2;
745  auto pT2scatter =
746  4.f * iHelixDiameter2 * seedfinderConfig.pT2perRadius;
747  auto p2scatter = pT2scatter * iSinTheta2;
748  auto Im = cl::sycl::abs((A - B * rM) * rM);
749 
750  if (!(S2 < B2 * seedfinderConfig.minHelixDiameter2) &&
751  !((deltaCotTheta2 - error2 > 0.f) &&
752  (dCotThetaMinusError2 >
753  p2scatter * seedfinderConfig.sigmaScattering *
754  seedfinderConfig.sigmaScattering)) &&
755  !(Im > seedfinderConfig.impactMax)) {
756  const auto top = deviceIndTop[it];
757  // this will be the t-th top space point for
758  // fixed middle and bottom SP
759  auto t = countTripletsAcc[ib].fetch_add(1);
760  /*
761  deviceSumComb[mid] - sumCombUptoFirstMiddle:
762  gives the memory location reserved for this
763  middle SP
764 
765  (idx-deviceSumComb[mid]+sumCombUptoFirstMiddle:
766  this is the nth thread for this middle SP
767 
768  (idx-deviceSumComb[mid]+sumCombUptoFirstMiddle)/numT:
769  this is the mth bottom SP for this middle SP
770 
771  multiplying this by numT gives the memory
772  location for this middle and bottom SP
773 
774  and by adding t to it, we will end up storing
775  compatible triplet candidates for this middle
776  and bottom SP right next to each other
777  starting from the given memory location
778  */
779  const auto tripletIdx = deviceSumComb[mid] -
780  sumCombUptoFirstMiddle +
781  (((idx - deviceSumComb[mid] +
782  sumCombUptoFirstMiddle) /
783  numT) *
784  numT) +
785  t;
786 
788  T.curvature = B / cl::sycl::sqrt(S2);
789  T.impact = Im;
790  T.topSPIndex = top;
791  deviceCurvImpact[tripletIdx] = T;
792  }
793  }
794  }
795  });
796  });
797 
798  {
799  sycl::buffer<uint32_t> countSeedsBuf(&sumSeeds, 1);
800  q->submit([&](cl::sycl::handler& h) {
801  h.depends_on(tripletKernel);
802  auto countSeedsAcc = sycl::ONEAPI::atomic_accessor<
803  uint32_t, 1, sycl::ONEAPI::memory_order::relaxed,
804  sycl::ONEAPI::memory_scope::device>(countSeedsBuf, h);
805  auto countTripletsAcc = countTripletsBuf.get_access<
806  sycl::access::mode::read, sycl::access::target::global_buffer>(
807  h);
808  auto numTopDupletsAcc = numTopDupletsBuf.get_access<
809  sycl::access::mode::read, sycl::access::target::global_buffer>(
810  h);
811  h.parallel_for<filter_2sp_fixed_kernel>(
812  tripletFilterNDRange, [=](cl::sycl::nd_item<1> item) {
813  if (item.get_global_linear_id() < numTripletFilterThreads) {
814  const auto idx =
815  deviceSumBot[firstMiddle] + item.get_global_linear_id();
816  const auto mid = deviceMidIndPerBot[idx];
817  const auto bot = deviceIndBot[idx];
818 
819  const auto tripletBegin =
820  deviceSumComb[mid] - sumCombUptoFirstMiddle +
821  (idx - deviceSumBot[mid]) * numTopDupletsAcc[mid];
822  const auto tripletEnd =
823  tripletBegin + countTripletsAcc[idx];
824 
825  for (auto i1 = tripletBegin; i1 < tripletEnd; ++i1) {
826  const auto current = deviceCurvImpact[i1];
827  const auto top = current.topSPIndex;
828 
829  const auto invHelixDiameter = current.curvature;
830  const auto lowerLimitCurv =
831  invHelixDiameter -
832  seedfinderConfig.deltaInvHelixDiameter;
833  const auto upperLimitCurv =
834  invHelixDiameter +
835  seedfinderConfig.deltaInvHelixDiameter;
836  const auto currentTop_r = deviceTopSPs[top].r;
837  auto weight = -(current.impact *
838  seedfinderConfig.impactWeightFactor);
839 
840  uint32_t compatCounter = 0;
841 
842  // By default compatSeedLimit is 2 -> 2 is
843  // currently hard coded, because variable length
844  // arrays are not supported in SYCL kernels.
845  float compatibleSeedR[2];
846  for (auto i2 = tripletBegin;
847  i2 < tripletEnd &&
848  compatCounter < seedfinderConfig.compatSeedLimit;
849  ++i2) {
850  const auto other = deviceCurvImpact[i2];
851 
852  const auto otherCurv = other.curvature;
853  const auto otherTop_r =
854  deviceTopSPs[other.topSPIndex].r;
855  const float deltaR =
856  cl::sycl::abs(currentTop_r - otherTop_r);
857  if (deltaR >= seedfinderConfig.filterDeltaRMin &&
858  otherCurv >= lowerLimitCurv &&
859  otherCurv <= upperLimitCurv) {
860  uint32_t c = 0;
861  for (;
862  c < compatCounter &&
863  cl::sycl::abs(compatibleSeedR[c] - otherTop_r) >=
864  seedfinderConfig.filterDeltaRMin;
865  ++c) {
866  }
867  if (c == compatCounter) {
868  compatibleSeedR[c] = otherTop_r;
869  ++compatCounter;
870  }
871  }
872  }
873 
874  weight +=
875  compatCounter * seedfinderConfig.compatSeedWeight;
876 
877  const auto bottomSP = deviceBottomSPs[bot];
878  const auto middleSP = deviceMiddleSPs[mid];
879  const auto topSP = deviceTopSPs[top];
880 
881  weight +=
882  deviceCuts.seedWeight(bottomSP, middleSP, topSP);
883 
884  if (deviceCuts.singleSeedCut(weight, bottomSP, middleSP,
885  topSP)) {
886  const auto i = countSeedsAcc[0].fetch_add(1);
888  D.bottom = bot;
889  D.top = top;
890  D.middle = mid;
891  D.weight = weight;
892  deviceSeedArray[i] = D;
893  }
894  }
895  }
896  });
897  });
898  } // sync, countSeedsBuf gets destroyed and its value is copied back to
899  // sumSeeds
900 
901  if (sumSeeds != 0) {
902  std::vector<detail::SeedData> hostSeedArray(sumSeeds);
903  auto e0 = q->memcpy(&hostSeedArray[0], deviceSeedArray,
904  sumSeeds * sizeof(detail::SeedData));
905  e0.wait();
906 
907  for (uint32_t t = 0; t < sumSeeds; ++t) {
908  auto m = hostSeedArray[t].middle;
909  seeds[m].push_back(hostSeedArray[t]);
910  }
911  }
912  }
913 
914  //************************************************//
915  // ************ TRIPLET SEARCH - END ************ //
916  //************************************************//
917 
918  // NOTES ON MEMORY MANAGEMENT
919  /*
920  Note that memory management is very naive, but this should only be a
921  temporary solution. A better idea is to use a separate memory
922  manager, see the CUDA (2) implementation.
923  */
924 
925  cl::sycl::free(deviceLinBot, *q);
926  cl::sycl::free(deviceLinTop, *q);
927 
928  cl::sycl::free(deviceIndBot, *q);
929  cl::sycl::free(deviceIndTop, *q);
930  cl::sycl::free(deviceMidIndPerBot, *q);
931  cl::sycl::free(deviceMidIndPerTop, *q);
932  cl::sycl::free(deviceSumBot, *q);
933  cl::sycl::free(deviceSumTop, *q);
934  cl::sycl::free(deviceSumComb, *q);
935 
936  cl::sycl::free(deviceCurvImpact, *q);
937  cl::sycl::free(deviceSeedArray, *q);
938  }
939 
940  cl::sycl::free(deviceTmpIndBot, *q);
941  cl::sycl::free(deviceTmpIndTop, *q);
942  cl::sycl::free(deviceBottomSPs, *q);
943  cl::sycl::free(deviceMiddleSPs, *q);
944  cl::sycl::free(deviceTopSPs, *q);
945  } catch (cl::sycl::exception const& e) {
948  ACTS_FATAL("Caught synchronous SYCL exception:\n" << e.what())
949  exit(0);
950  }
951 };
952 } // namespace Acts::Sycl