fastm/BHgpu  1.5
Быстрые алгоритмы в вихревых методах моделирования плоских течений
main.cpp
См. документацию.
1 /*--------------------------------*- BHgpu -*----------------*---------------*\
2 | ##### ## ## | | Version 1.5 |
3 | ## ## ## ## #### ## ## | BHgpu: Barnes-Hut method | 2023/08/29 |
4 | ##### ###### ## ## ## | for 2D vortex particles *----------------*
5 | ## ## ## ## ## ## ## | Open Source Code |
6 | ##### ## ## #### #### | https://www.github.com/vortexmethods/fastm |
7 | |
8 | Copyright (C) 2020-2023 I. Marchevsky, E. Ryatina, A. Kolganova |
9 | Copyright (C) 2013, Texas State University-San Marcos. All rights reserved. |
10 *-----------------------------------------------------------------------------*
11 | File name: main.cpp |
12 | Info: Source code of BHgpu |
13 | |
14 | This file is part of BHgpu. |
15 | BHcu is free software: you can redistribute it and/or modify it |
16 | under the terms of the GNU General Public License as published by |
17 | the Free Software Foundation, either version 3 of the License, or |
18 | (at your option) any later version. |
19 | |
20 | BHcu is distributed in the hope that it will be useful, but WITHOUT |
21 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
22 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
23 | for more details. |
24 | |
25 | You should have received a copy of the GNU General Public License |
26 | along with BHgpu. If not, see <http://www.gnu.org/licenses/>. |
27 \*---------------------------------------------------------------------------*/
28 
29 /*
30  * Portions of this program were originally released under the following license
31  *
32  * CUDA BarnesHut v3.1: Simulation of the gravitational forces
33  * in a galactic cluster using the Barnes-Hut n-body algorithm
34  *
35  * Copyright (c) 2013, Texas State University-San Marcos. All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without modification,
38  * are permitted for academic, research, experimental, or personal use provided that
39  * the following conditions are met:
40  *
41  * * Redistributions of source code must retain the above copyright notice,
42  * this list of conditions and the following disclaimer.
43  * * Redistributions in binary form must reproduce the above copyright notice,
44  * this list of conditions and the following disclaimer in the documentation
45  * and/or other materials provided with the distribution.
46  * * Neither the name of Texas State University-San Marcos nor the names of its
47  * contributors may be used to endorse or promote products derived from this
48  * software without specific prior written permission.
49  *
50  * For all other uses, please contact the Office for Commercialization and Industry
51  * Relations at Texas State University-San Marcos <http://www.txstate.edu/ocir/>.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
55  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED
56  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
57  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
58  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
59  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
60  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
61  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
62  * OF THE POSSIBILITY OF SUCH DAMAGE.
63  *
64  * Author: Martin Burtscher <burtscher@txstate.edu>
65  *
66  */
67 
78 #include <algorithm>
79 #include <fstream>
80 #include <iostream>
81 #include <math.h>
82 #include <stdlib.h>
83 #include <stdio.h>
84 #include <vector>
85 
86 #include "omp.h"
87 
88 #include "cuKernels.cuh"
89 #include "Logo.h"
90 #include "Params.h"
91 #include "Point2D.h"
92 #include "Vortex2D.h"
93 
94 
95 using namespace BHcu;
96 
97 const real IDPI = (real)0.15915494309189534;
98 
99 /******************************************************************************/
100 /******************************************************************************/
101 void setBinom(std::vector<int>& cft);
102 
103 //Контроль правильности построения дерева
104 /*
105  void traverseT(int v, int* TEMPchild)
106  {
107  int v0 = TEMPchild[2*v+0];
108  int v1 = TEMPchild[2*v+1];
109  if (v0 > 0)
110  traverseT(v0, TEMPchild);
111  if (v1 > 0)
112  traverseT(v1, TEMPchild);
113  }
114 
115 
116  //for (int i = 0; i < nbodies - 1; ++i)
117  // traverse(i, Mchildh.data(), nbodies);
118  //std::cout << "Tree is correct!" << std::endl;
119 */
120 
121 
122 int main(int argc, char** argv)
123 {
125  //без буквы "l" на конце - на host,
126  //c буквой "l" на конце - на device
127  std::vector<realVortex> vtx; //вихри из файла
128  realVortex* vtxl;
129 
130  int* massl; //массы (единица для точечного вихря, число вихрей для ячейки) //todo зачем они на хосте?
131 
132  std::vector<realPoint> vel; //для вычисляемых скоростей
133  realPoint* vell; // - быстрым методом
134  realPoint* vellBS; // - прямым методом Био--Савара
135 
136  std::vector<int> cft; //биномиальные коэффициенты
137 
138  realPoint* maxrl, *minrl; //габаритный прямоугольник
139 
140  realPoint* momsl; //мультипольные моменты всех ячеек; хранятся в виде <mom_0x, mom_0y=0 mom_1x, mom_1y, ..., mom_px, mom_py>, <для второй ячейки> ...
141 
142  //Число мультипроцессоров, заполняется функцией setBlocks(blocks)
143  int blocks;
144 
145  //Число ячеек дерева и тел
146  int nnodes, nbodies;
147 
148  //Радиус вихря и параметр близости и их квадраты
149  real epssq = (real)(EPS * EPS);
150  real itolsq = (real)(1 / (THETA * THETA));
151 
152  //Статистика
153  float timing[7], mintiming[7] = { 1e+10, 1e+10, 1e+10, 1e+10, 1e+10, 1e+10, 1e+10 }, avtiming[7]{};
154  double runtime, minruntime = 1e+10, avruntime = 0;
155 
156 
157  PrintLogoToStream(std::cout);
158 
159  //Проверка функционирования видеокарты
160  CudaSelect(dev);
161  setBlocks(blocks); //"достает" число блоков, равное числу мультипроцессоров (blocks - по ссылке)
162 
163  //Массивы исходных данных и результата на host
164  //Указатели на массивы, хранящиеся на device
165 
166  //For Morton tree
167  int* MmortonCodesKeyUnsortl;
168  int* MmortonCodesKeyl;
169 
170  int* MmortonCodesIdxUnsortl; //0 1 2 3 ... nbodies-1
171  int* MmortonCodesIdxl;
172 
173  int* MlevelUnsortl;
174  int* MlevelSortl;
175 
176  int* MindexUnsortl; //0 1 2 3 ... nbodies-2
177  int* MindexSortl;
178  int* MindexSortTl;
179 
180 
181  realPoint* Mposl; //Положения внутренних узлов в дерева Карраса
182  realPoint* Msizel; //Размеры внутренних ячеек
183  int* Mparentl; //Номер ячейки-родителя
184  intPair* Mchildl; //Потомки внутренних ячеек
185  intPair* Mrangel; //Диапазон частиц во внутренней ячейке
186 
187  // generate input
188  std::ifstream infile;
189  infile.open(nameFile);
190 
191  infile >> nbodies;
192  nnodes = nbodies * 2;
193  if (nnodes < 1024 * blocks)
194  nnodes = 1024 * blocks;
195  while ((nnodes & (32 - 1)) != 0) // 32 - это размер варпа
196  nnodes++;
197  nnodes--;
198 
199  try {
200  vel.resize(nbodies);
201 
202  cft.resize(order * (order + 1));
203  setBinom(cft);
204  }
205  catch (...)
206  {
207  fprintf(stderr, "cannot allocate host memory\n"); exit(-1);
208  }
209 
210  vtx.resize(nbodies);
211 
212  for (int i = 0; i < nbodies; i++) {
213  infile >> vtx[i].r()[0] >> vtx[i].r()[1] >> vtx[i].g();
214  }
215 
219 
220  KernelsOptimization();
221 
222  PrintConfiguration(nbodies);
223 
224  for (int run = 0; run < runs; run++)
225  {
226  for (int i = 0; i < 6; i++)
227  timing[i] = 0;
228 
229  // allocate memory
230  if (run == 0) {
231  unsigned long long int mem = 0;
232 
233  massl = (int*)cudaNew(nbodies - 1, sizeof(int));
234  mem += (nbodies - 1) * sizeof(int);
235 
236  vtxl = (realVortex*)cudaNew(nbodies, sizeof(realVortex));
237  mem += nbodies * sizeof(realVortex);
238 
239  momsl = (realPoint*)cudaNew((nbodies-1) * order, sizeof(realPoint));
240  mem += (nbodies - 1) * order * sizeof(realPoint);
241 
242 
243 
244  vell = (realPoint*)cudaNew(nbodies, sizeof(realPoint));
245  mem += nbodies * sizeof(realPoint);
246 
247  vellBS = (realPoint*)cudaNew(nbodies, sizeof(realPoint));
248  mem += nbodies * sizeof(realPoint);
249 
250  maxrl = (realPoint*)cudaNew(blocks * FACTOR1, sizeof(realPoint));
251  minrl = (realPoint*)cudaNew(blocks * FACTOR1, sizeof(realPoint));
252  mem += 2 * blocks * FACTOR1 * sizeof(realPoint);
253 
255  MmortonCodesKeyUnsortl = (int*)cudaNew(nbodies, sizeof(int));
256  MmortonCodesKeyl = (int*)cudaNew(nbodies, sizeof(int));
257  MmortonCodesIdxUnsortl = (int*)cudaNew(nbodies, sizeof(int));
258  MmortonCodesIdxl = (int*)cudaNew(nbodies, sizeof(int));
259  mem += 4 * nbodies * sizeof(int);
260 
261  Mposl = (realPoint*)cudaNew(nbodies - 1, sizeof(realPoint));
262  Msizel = (realPoint*)cudaNew(nbodies - 1, sizeof(realPoint));
263  mem += 2 * (nbodies - 1) * sizeof(realPoint);
264 
265  Mparentl = (int*)cudaNew(nnodes, sizeof(int));
266  mem += nnodes * sizeof(int);
267 
268  Mchildl = (intPair*)cudaNew(nbodies-1, sizeof(intPair));
269  mem += (nbodies - 1) * sizeof(intPair);
270 
271  Mrangel = (intPair*)cudaNew(nnodes, sizeof(intPair)); //Нужно ли для всех?
272  mem += nnodes * sizeof(intPair);
273 
274  MlevelUnsortl = (int*)cudaNew(nbodies - 1, sizeof(int));
275  MlevelSortl = (int*)cudaNew(nbodies - 1, sizeof(int));
276  MindexUnsortl = (int*)cudaNew(nbodies - 1, sizeof(int));
277  MindexSortl = (int*)cudaNew(nbodies - 1, sizeof(int));
278  MindexSortTl = (int*)cudaNew(nbodies - 1, sizeof(int));
279  mem += 5 * (nbodies - 1) * sizeof(int);
280 
281  printf("Total allocated memory: %llu bytes = %llu Mbytes\n", mem, mem >> 20);
282  }
283 
284  if (run == 0)
285  {
286  cudaCopyVecToDevice(vtx.data(), vtxl, nbodies, sizeof(realVortex));
287  //cudaCopyVecToDevice(vel.data(), vell, nbodies, sizeof(realPoint));
288  //cudaCopyVecToDevice(vel.data(), vellBS, nbodies, sizeof(realPoint));
289  setBinomCftConst(cft.data());
290  }
291 
292  // run timesteps (launch GPU kernels)
293 
294  double starttime, endtime;
295  starttime = omp_get_wtime();
296 
297 
298  timing[0] += cuInitializationKernel();
299 
309  timing[1] += McuBoundingBoxKernel(nbodies, vtxl, Mposl, maxrl, minrl);
310 
311  //realPoint maxrh, minrh;
312  //cudaCopyVecFromDevice(maxrl, &maxrh, 2, sizeof(real));
313  //cudaCopyVecFromDevice(minrl, &minrh, 2, sizeof(real));
314  //
315  //std::cout << "x : " << minrh[0] << "..." << maxrh[0] << "\n";
316  //std::cout << "y : " << minrh[1] << "..." << maxrh[1] << "\n";
317 
329  auto ttt1 = McuMortonCodesKernel(nbodies, vtxl, MmortonCodesKeyUnsortl, MmortonCodesIdxUnsortl, MmortonCodesKeyl, MmortonCodesIdxl, Mrangel);
330  timing[2] += ttt1;
331 // std::cout << "McuMortonCodesKernel = " << ttt1 << std::endl;
332 
342  auto ttt2 = McuMortonInternalNodesKernel(nbodies, MmortonCodesKeyl, Mparentl, Mchildl, Mrangel);
343  timing[2] += ttt2;
344 // std::cout << "McuMortonInternalNodesKernel = " << ttt2 << std::endl;
345 
361  auto ttt3 = McuMortonInternalCellsGeometryKernel(nbodies, nnodes, MmortonCodesKeyl, Mposl, Msizel, Mrangel, MlevelUnsortl, MlevelSortl, MindexUnsortl, MindexSortl, MindexSortTl);
362  timing[2] += ttt3;
363 // std::cout << "McuMortonInternalCellsGeometryKernel = " << ttt3 << std::endl;
364 
373  auto ttt4 = cuClearKernel2(nnodes, nbodies, massl, momsl);
374  timing[3] += ttt4;
375 // std::cout << "McuCleearKernel23 = " << ttt4 << std::endl;
376 
391  timing[4] += cuSummarizationKernel2(nnodes, nbodies, Mchildl, massl, momsl, vtxl, MmortonCodesIdxl, Mposl, MindexSortl, MindexSortTl);
392 
410  timing[5] += cuForceCalculationKernel2(nnodes, nbodies, itolsq, epssq, Mchildl, momsl, vtxl, MmortonCodesIdxl, Mposl, MindexSortl, MindexSortTl, vell, Msizel);
411 
412  endtime = omp_get_wtime();
413  runtime = endtime - starttime;
414  if (minruntime > runtime)
415  minruntime = runtime;
416  avruntime += runtime;
417 
418  timing[6] = timing[1] + timing[2] + timing[3] + timing[4] + timing[5];
419 
420  for (int i = 1; i <= 6; ++i)
421  {
422  if (mintiming[i] > timing[i])
423  mintiming[i] = timing[i];
424  avtiming[i] += timing[i];
425  }
426 
427  PrintStatistics(run, runs, timing, mintiming, avtiming, runtime, minruntime, avruntime);
428  }
429 
430  if (compare || save)
431  {
432  cudaCopyVecFromDevice(vell, vel.data(), nbodies, sizeof(realPoint));
433  for (int i = 0; i < nbodies; ++i)
434  vel[i] *= IDPI;
435  }
436 
437  if (compare)
438  {
440 
441  std::vector<realPoint> veloBS(nbodies);
442 
443  bool realBSfromFile = BSfromFile;
444  if (BSfromFile)
445  {
446  std::ifstream f("../../res/velBS" + task + ".txt");
447  if (realBSfromFile = f.good())
448  std::cout << "File with Biot-Savart results is found, loading it... " << std::flush;
449  else
450  std::cout << "File with Biot-Savart results is not found, computing it... " << std::flush;
451  }
452 
453  if (!realBSfromFile)
454  {
455  float tBS = cuForceDirectCalculationKernel(nnodes, nbodies, epssq, vtxl, vellBS);
456  std::cout << "done!" << std::endl;
457  std::cout << "Time (Biot-Savart): " << tBS << " ms" << std::endl;
458 
459  cudaCopyVecFromDevice(vellBS, veloBS.data(), nbodies, sizeof(realPoint));
460 
461  for (int i = 0; i < nbodies; ++i)
462  veloBS[i] *= IDPI;
463 
464  std::ofstream velFileBS("../../res/velBS" + task + ".txt");
465  velFileBS.precision(16);
466  for (int i = 0; i < nbodies; i++)
467  velFileBS << veloBS[i][0] << " " << veloBS[i][1] << std::endl;
468  velFileBS.close();
469  }
470  else
471  {
472  std::ifstream velFileBS("../../res/velBS" + task + ".txt");
473  for (int i = 0; i < nbodies; i++)
474  velFileBS >> veloBS[i][0] >> veloBS[i][1];
475  velFileBS.close();
476  std::cout << "done!" << std::endl;
477  }
478 
479  real err = 0, absVel = 0;
480  for (int i = 0; i < nbodies; i++)
481  {
482  err += (vel[i] - veloBS[i]).length<real>();
483  absVel = std::max(absVel, veloBS[i].length<real>());
484  //err += (vel[i] - veloBS[i]).length2();
485  }
486  err /= nbodies;
487 
488  //err = sqrt(err)/nbodies;
489  //absVel = 1.0;
490 
491  PrintAccuracyError(err / absVel);
492 
493  //printf("BHcu: %.6e %.6e\n", vel[0][0], vel[0][1]);
494  //printf("BScu: %.6e %.6e\n", veloBS[0][0], veloBS[0][1]);
495  }
496 
497  if (save)
498  {
499  std::ofstream velFile("../../res/BH" + task + ".txt");
500  velFile.precision(16);
501  for (int i = 0; i < nbodies; i++)
502  velFile << vel[i][0] << " " << vel[i][1] << std::endl;
503  velFile.close();
504  }
505  std::cout << "Goodbye! " << std::endl;
506 
507  cudaDelete(massl);
508  cudaDelete(vtxl);
509  cudaDelete(vell);
510  cudaDelete(vellBS);
511  cudaDelete(momsl);
512 
513  cudaDelete(maxrl);
514  cudaDelete(minrl);
515 
517  cudaDelete(MmortonCodesKeyUnsortl);
518  cudaDelete(MmortonCodesIdxUnsortl);
519  cudaDelete(MmortonCodesKeyl);
520  cudaDelete(MmortonCodesIdxl);
521 
522  cudaDelete(Mposl);
523  cudaDelete(Msizel);
524  cudaDelete(Mparentl);
525  cudaDelete(Mchildl);
526  cudaDelete(Mrangel);
527 
528  cudaDelete(MlevelUnsortl);
529  cudaDelete(MlevelSortl);
530  cudaDelete(MindexUnsortl);
531  cudaDelete(MindexSortl);
532  return 0;
533 }
534 
535 void setBinom(std::vector<int>& cft)
536 {
537  cft[0] = 1;
538 
539  for (int i = 1; i < order; ++i)
540  {
541  cft[i * order + 0] = 1;
542  cft[i * order + i] = 1;
543  for (int j = 1; j < i; ++j)
544  cft[i * order + j] = cft[(i - 1) * order + j] + cft[(i - 1) * order + (j - 1)];
545  }
546 }
void PrintStatistics(int run, int runs, const float *timing, const float *mintiming, const float *avtiming, double runtime, double minruntime, double avruntime)
Definition: Logo.h:79
void PrintLogoToStream(std::ostream &str)
Definition: Logo.h:48
static const std::string nameFile
Definition: Params.h:72
#define realVortex
Definition: Params.h:120
int main(int argc, char **argv)
Definition: main.cpp:122
Definition: Logo.h:45
void PrintConfiguration(int nbodies)
Definition: Logo.h:64
void PrintAccuracyHead()
Definition: Logo.h:128
#define realPoint
Definition: Params.h:119
static const int dev
Definition: Params.h:98
const real IDPI
Definition: main.cpp:97
static const std::string task
Definition: Params.h:74
#define intPair
Definition: Params.h:121
#define THETA
Параметр точности
Definition: Params.h:63
Параметры решаемой задачи
static const bool save
Definition: Params.h:83
void PrintAccuracyError(real val)
Definition: Logo.h:135
static const bool BSfromFile
Definition: Params.h:81
#define EPS
Радиус вихревого элемента
Definition: Params.h:47
void setBinom(std::vector< int > &cft)
Definition: main.cpp:535
static const int order
Definition: Params.h:69
#define real
Definition: Params.h:113
static const int runs
Definition: Params.h:86
Логотип
static const bool compare
Definition: Params.h:77