209 int nObject = (int)
object.size();
213 const unsigned int threads = (
unsigned int)(omp_get_max_threads());
215 std::vector<unsigned int> mortonCodesKeyTemp(nObject);
216 std::vector<int> mortonCodesIdxTemp(nObject);
217 std::vector<unsigned int> s(256 * threads);
225#pragma omp parallel num_threads(threads)
228 unsigned int* destKey = mortonCodesKeyTemp.data();
231 int* destIdx = mortonCodesIdxTemp.data();
233 const unsigned int tid = (
unsigned int)(omp_get_thread_num());
234 const unsigned int nt = (
unsigned int)(omp_get_num_threads());
237 const unsigned int div = nObject / nt;
238 const unsigned int mod = nObject % nt;
240 const unsigned int chunk = div + (tid < mod ? 1u : 0u);
243 const unsigned int left_index = (tid < mod) ? tid * (div + 1u) : mod * (div + 1u) + (tid - mod) * div;
244 const unsigned int right_index = (chunk == 0u) ? left_index : (left_index + chunk - 1u);
246 for (
unsigned int digit = 0; digit <
sizeof(
unsigned int); ++digit)
248 unsigned int s_sum[256] = { 0 };
249 unsigned int s0[256] = { 0 };
259 for (
unsigned int i = left_index; i <= right_index; ++i)
261 const unsigned int bucket = (sourceKey[i] >> (8u * digit)) & 0xFFu;
268 for (
unsigned int b = 0; b < 256u; ++b)
269 s[256u * tid + b] = s0[b];
276 for (
unsigned int t = 0; t < nt; ++t)
277 for (
unsigned int b = 0; b < 256u; ++b)
279 s_sum[b] += s[256u * t + b];
282 s0[b] += s[256u * t + b];
288 for (
unsigned int b = 1; b < 256u; ++b)
290 s_sum[b] += s_sum[b - 1];
291 s0[b] += s_sum[b - 1];
297 for (
int i = (
int)(right_index); i >= (int)(left_index); --i)
299 const unsigned int bucket = (sourceKey[i] >> (8u * digit)) & 0xFFu;
302 const unsigned int pos = --s0[bucket];
306 destKey[pos] = sourceKey[i];
307 destIdx[pos] = sourceIdx[i];
316 std::swap(sourceKey, destKey);
317 std::swap(sourceIdx, destIdx);
325 int nObject = (int)
object.size();
334 std::vector<int> levelTemp(n);
335 std::vector<int> indexTemp(n);
337 const unsigned int threads = (
unsigned int)(omp_get_max_threads());
338 std::vector<unsigned int> s(256 * threads);
340#pragma omp parallel num_threads(threads)
343 int* destKey = levelTemp.data();
346 int* destIdx = indexTemp.data();
348 const unsigned int tid = (
unsigned int)(omp_get_thread_num());
349 const unsigned int nt = (
unsigned int)(omp_get_num_threads());
351 const unsigned int div = n / nt;
352 const unsigned int mod = n % nt;
354 const unsigned int chunk = div + (tid < mod ? 1u : 0u);
356 const unsigned int left_index = (tid < mod) ? tid * (div + 1u) : mod * (div + 1u) + (tid - mod) * div;
358 const unsigned int right_index = (chunk == 0u) ? left_index : (left_index + chunk - 1u);
361 for (
unsigned int digit = 0; digit <
sizeof(int); ++digit)
363 unsigned int s_sum[256] = { 0 };
364 unsigned int s0[256] = { 0 };
368 for (
unsigned int i = left_index; i <= right_index; ++i)
370 const unsigned int bucket =
371 ((
unsigned int)(sourceKey[i]) >> (8u * digit)) & 0xFFu;
376 for (
unsigned int b = 0; b < 256u; ++b)
377 s[256u * tid + b] = s0[b];
381 for (
unsigned int t = 0; t < nt; ++t)
383 for (
unsigned int b = 0; b < 256u; ++b)
385 s_sum[b] += s[256u * t + b];
387 s0[b] += s[256u * t + b];
391 for (
unsigned int b = 1; b < 256u; ++b)
393 s_sum[b] += s_sum[b - 1];
394 s0[b] += s_sum[b - 1];
399 for (
int i = (
int)(right_index);
400 i >= (int)(left_index); --i)
402 const unsigned int bucket = ((
unsigned int)(sourceKey[i]) >> (8u * digit)) & 0xFFu;
404 const unsigned int pos = --s0[bucket];
406 destKey[pos] = sourceKey[i];
407 destIdx[pos] = sourceIdx[i];
412 std::swap(sourceKey, destKey);
413 std::swap(sourceIdx, destIdx);
419#pragma omp parallel for
420 for (
int k = 0; k < n; ++k)
429 using int2 = std::pair<int, int>;
453 const int nnodes = 2 * (int)
object.size() - 1;
454 const int nbodies = (int)
object.size();
457 for (
int k = nbodies; k < nnodes; ++k)
477 int2 chdPair =
child[srt];
479 for (i = 0; i < 2; i++) {
480 int chd = i * chdPair.second + (1 - i) * chdPair.first;
482 ch = (chd >= nbodies) ? (chd - nbodies) : ((nnodes - 1) -
indexSortT[chd]);
484 if ((chd >= nbodies) || (
mass[nnodes - 1 - ch] >= 0))
496 for (i = 0; i < 2; i++)
498 const int chd = i * chdPair.second + (1 - i) * chdPair.first;
506 ::fmin(xyAB[0], xyAB[2]), ::fmin(xyAB[1], xyAB[3]),
507 ::fmax(xyAB[0], xyAB[2]), ::fmax(xyAB[1], xyAB[3])
520 ::fmin(lu[0][0], lu[1][0]), ::fmin(lu[0][1], lu[1][1]),
521 ::fmax(lu[0][2], lu[1][2]), ::fmax(lu[0][3], lu[1][3])
527 const double2 zero = { 0.0, 0.0 };
528 double2 momh0 = zero;
529 double2 momh1 = zero;
530 double2 momh2 = zero;
531 double2 momh3 = zero;
532 double2 momh4 = zero;
533 double2 momh5 = zero;
534 double2 momh6 = zero;
535 double2 momh7 = zero;
536 double2 momh8 = zero;
537 double2 momh9 = zero;
538 double2 momh10 = zero;
539 double2 momh11 = zero;
542 for (i = 0; i < 2; i++)
544 const int chd = i * chdPair.second + (1 - i) * chdPair.first;
552 mom0 = double2{
gamma[sortedBody], 0.0 };
554 mom1 = mom2 = mom3 = mom4 = mom5 = mom6 = mom7 = mom8 = mom9 = mom10 = mom11 = double2{ 0.0, 0.0 };
555 double2 pos =
object[sortedBody];
632 ch = (nnodes - 1) - srtT;
636 mom0 = double2{
moms[nch + 0][0], (double)0 };
637 mom1 =
moms[nch + 1];
638 mom2 =
moms[nch + 2];
639 mom3 =
moms[nch + 3];
640 mom4 =
moms[nch + 4];
641 mom5 =
moms[nch + 5];
642 mom6 =
moms[nch + 6];
643 mom7 =
moms[nch + 7];
644 mom8 =
moms[nch + 8];
645 mom9 =
moms[nch + 9];
646 mom10 =
moms[nch + 10];
647 mom11 =
moms[nch + 11];
669 momh1 += multz(mom0, z);
670 momh2 += 2 * multz(mom1, z);
671 momh3 += 3 * multz(mom2, z);
672 momh4 += 4 * multz(mom3, z);
673 momh5 += 5 * multz(mom4, z);
674 momh6 += 6 * multz(mom5, z);
675 momh7 += 7 * multz(mom6, z);
676 momh8 += 8 * multz(mom7, z);
677 momh9 += 9 * multz(mom8, z);
678 momh10 += 10 * multz(mom9, z);
679 momh11 += 11 * multz(mom10, z);
683 momh2 += multz(mom0, z);
684 momh3 += 3 * multz(mom1, z);
685 momh4 += 6.0 * multz(mom2, z);
686 momh5 += 10.0 * multz(mom3, z);
687 momh6 += 15.0 * multz(mom4, z);
688 momh7 += 21.0 * multz(mom5, z);
689 momh8 += 28.0 * multz(mom6, z);
690 momh9 += 36.0 * multz(mom7, z);
691 momh10 += 45.0 * multz(mom8, z);
692 momh11 += 55.0 * multz(mom9, z);
696 momh3 += multz(mom0, z);
697 momh4 += 4 * multz(mom1, z);
698 momh5 += 10.0 * multz(mom2, z);
699 momh6 += 20.0 * multz(mom3, z);
700 momh7 += 35.0 * multz(mom4, z);
701 momh8 += 56.0 * multz(mom5, z);
702 momh9 += 84.0 * multz(mom6, z);
703 momh10 += 120.0 * multz(mom7, z);
704 momh11 += 165.0 * multz(mom8, z);
708 momh4 += multz(mom0, z);
709 momh5 += 5 * multz(mom1, z);
710 momh6 += 15.0 * multz(mom2, z);
711 momh7 += 35.0 * multz(mom3, z);
712 momh8 += 70.0 * multz(mom4, z);
713 momh9 += 126.0 * multz(mom5, z);
714 momh10 += 210.0 * multz(mom6, z);
715 momh11 += 330.0 * multz(mom7, z);
719 momh5 += multz(mom0, z);
720 momh6 += 6 * multz(mom1, z);
721 momh7 += 21.0 * multz(mom2, z);
722 momh8 += 56.0 * multz(mom3, z);
723 momh9 += 126.0 * multz(mom4, z);
724 momh10 += 252.0 * multz(mom5, z);
725 momh11 += 462.0 * multz(mom6, z);
729 momh6 += multz(mom0, z);
730 momh7 += 7 * multz(mom1, z);
731 momh8 += 28.0 * multz(mom2, z);
732 momh9 += 84.0 * multz(mom3, z);
733 momh10 += 210.0 * multz(mom4, z);
734 momh11 += 462.0 * multz(mom5, z);
738 momh7 += multz(mom0, z);
739 momh8 += 8 * multz(mom1, z);
740 momh9 += 36.0 * multz(mom2, z);
741 momh10 += 120.0 * multz(mom3, z);
742 momh11 += 330.0 * multz(mom4, z);
746 momh8 += multz(mom0, z);
747 momh9 += 9 * multz(mom1, z);
748 momh10 += 45.0 * multz(mom2, z);
749 momh11 += 165.0 * multz(mom3, z);
753 momh9 += multz(mom0, z);
754 momh10 += 10 * multz(mom1, z);
755 momh11 += 55.0 * multz(mom2, z);
759 momh10 += multz(mom0, z);
760 momh11 += 11 * multz(mom1, z);
764 momh11 += multz(mom0, z);
769 moms[kch + 0] = momh0;
770 moms[kch + 1] = momh1;
771 moms[kch + 2] = momh2;
772 moms[kch + 3] = momh3;
773 moms[kch + 4] = momh4;
774 moms[kch + 5] = momh5;
775 moms[kch + 6] = momh6;
776 moms[kch + 7] = momh7;
777 moms[kch + 8] = momh8;
778 moms[kch + 9] = momh9;
779 moms[kch + 10] = momh10;
780 moms[kch + 11] = momh11;
791 mass[nnodes - 1 - k] = cm;
807 using int2 = std::pair<int, int>;
815 const int nnodes = 2 * (int)
object.size() - 1;
816 const int nbodies = (int)
object.size();
818 for (
int k = nbodies; k < nnodes; ++k)
836 const int kch = ((nnodes - 1) - k);
838 int2 chdPair =
child[srt];
844 for (i = 0; i < 2; i++)
846 int chd = i * chdPair.second + (1 - i) * chdPair.first;
847 ch = (chd >= nbodies) ? (chd - nbodies) : ((nnodes - 1) -
indexSortT[chd]);
848 if ((chd >= nbodies) || (
mass[nnodes - 1 - ch] >= 0))
855 for (i = 0; i < 2; i++)
857 const int chd = i * chdPair.second + (1 - i) * chdPair.first;
865 ::fmin(xyAB[0], xyAB[2]), ::fmin(xyAB[1], xyAB[3]),
866 ::fmax(xyAB[0], xyAB[2]), ::fmax(xyAB[1], xyAB[3])
879 const double4 loup = double4{
880 ::fmin(lu[0][0], lu[1][0]),
881 ::fmin(lu[0][1], lu[1][1]),
882 ::fmax(lu[0][2], lu[1][2]),
883 ::fmax(lu[0][3], lu[1][3]) };
889 0.5 * (loup[0] + loup[2]),
890 0.5 * (loup[1] + loup[3])
899 mass[nnodes - 1 - k] = m[0] + m[1];
908 float t1 = (float)omp_get_wtime();
909 int nObject = (int)
object.size();
914 auto minmaxX = std::minmax_element(
object.begin(),
object.end(), Point2D::cmp<
'x'>);
915 auto minmaxY = std::minmax_element(
object.begin(),
object.end(), Point2D::cmp<
'y'>);
916 minr =
Point2D({ (*minmaxX.first)[0], (*minmaxY.first)[1] });
917 maxr =
Point2D({ (*minmaxX.second)[0], (*minmaxY.second)[1] });
920 double lmax, quadSideFactor;
923 quadSideFactor =
rbound / lmax;
925#pragma omp parallel for
926 for (
int bdy = 0; bdy < nObject; ++bdy)
930 unsigned int xx =
ExpandBits((
unsigned int)rScaled[0]);
931 unsigned int yy =
ExpandBits((
unsigned int)rScaled[1]);
942 for (
int i = 0; i < nObject - 1; ++i)
946 int Deltap1 =
Delta(i, i + 1);
947 int Deltam1 =
Delta(i, i - 1);
949 int d = (Deltap1 - Deltam1 > 0) ? 1 : -1;
951 int delta_min = (d > 0) ? Deltam1 : Deltap1;
954 int pos = i + Lmax * d;
956 while (
Delta(i, pos) > delta_min)
963 for (
int t = (Lmax >> 1); t >= 1; t >>= 1)
965 pos = i + (L + t) * d;
967 if (
Delta(i, pos) > delta_min)
974 int delta_node =
Delta(i, j);
980 for (
int p = 1, t =
ceilhalf(L); L > (1 << (p - 1)); ++p, t =
ceilpow2(L, p))
982 pos = i + (s + t) * d;
984 int dl =
Delta(i, pos);
990 int gammaPos = i + s * d + d * (d < 0);
992 int Mmin = std::min(i, j);
993 int Mmax = std::max(i, j);
996 int right = gammaPos + 1;
999 int childLeft = (Mmin == gammaPos) * nObject + left;
1000 range[childLeft] = { Mmin, gammaPos };
1004 int childRight = (Mmax == gammaPos + 1) * nObject + right;
1005 range[childRight] = { gammaPos + 1, Mmax };
1008 child[i] = { childLeft, childRight };
1014 float t2 = (float)omp_get_wtime();
1040 float t1 = (float)omp_get_wtime();
1044 using int2 = std::pair<int, int>;
1046 if (
object.size() > 0)
1048 double itolsq = 1.0 / (theta * theta);
1050 const int nbodies = (int)
object.size();
1051 const int nnodes = 2 * nbodies - 1;
1052 const int npoints = (int)cntrTree.
object.size();
1059 double d_1, d_2, d_3, dst23, dst12;
1064 double2 v{ 0.0,0.0 };
1073 const int maxDepth = 32;
1075 int posStack[maxDepth];
1076 int nodeStack[maxDepth];
1080#pragma omp parallel for
1081 for (
int k = 0; k < npoints; ++k)
1083 d_1 = d_2 = d_3 = 1e+5;
1085 p = cntrTree.
object[indexOfPoint];
1089 nodeStack[0] = nnodes - 1;
1093 pd = posStack[depth];
1094 nd = nodeStack[depth];
1100 const int chd = (pd == 0) ? chBoth.first : chBoth.second;
1103 posStack[depth] = pd;
1105 isVortex = (chd >= nbodies);
1113 ps =
object[vortexIndex];
1114 gm =
gamma[vortexIndex];
1120 n = (nnodes - 1) - srtT;
1125 sumSide2 = (gab[2] - gab[0] + gab[3] - gab[1]);
1126 sumSide2 *= sumSide2;
1130 r2 = dr[0] * dr[0] + dr[1] * dr[1];
1131 if (isVortex || ((sumSide2 + eps2) * itolsq < r2))
1142 if ((r2 < d_3) && (r2 > 0.0))
1144 dst23 = std::fmin(r2, d_2);
1145 d_3 = std::fmax(r2, d_2);
1147 dst12 = std::fmin(dst23, d_1);
1148 d_2 = std::fmax(dst23, d_1);
1153 const double f = gm / std::fmax(r2, eps2);
1157 if ((!isVortex) && (order > 1) && r2 > 0.0)
1159 Point2D cftr = (1.0 / r2) * dr;
1162 for (
int s = 1; s < order; ++s)
1164 th = multz(th, cftr);
1166 Point2D add = multzA(th, mom[s]);
1174 if (depth + 1 < maxDepth)
1177 posStack[depth] = 0;
1178 nodeStack[depth] = n;
1194 float t2 = (float)omp_get_wtime();