Skip to content

Commit

Permalink
基本完成
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexZFX committed Jun 2, 2019
1 parent 4d8cbf9 commit 1415d79
Show file tree
Hide file tree
Showing 14 changed files with 797 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.vscode
.idea
38 changes: 37 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,37 @@
# cuda-nbody
# cuda-nbody

大三下分布式并行计算实验代码,实验在Nvidia的Courses上进行,利用cuda对 nbody 算法进行优化。

网上搜索nbody的cuda优化已经有很多相应的思路介绍,我这里基于网上的各种思路实现和尝试,使用Nvidia的环境进行编写和测试,给出了最基础的并行版本和我的最终版本代码,代码中也有一定的注释说明。

代码中的check函数之类的是 Nvidia 平台上测试使用的。

## 优化思路如下

1. 首先将原有代码并行化,每个线程处理一个位置的body(参考[nbody_parallel.cu](./src/nbody_parallel.cu))。
2. 自己控制内存的copy,申请和释放,消除缺页异常等。(cudaMallocManaged -> cudaMalloc / cudaMallocHost)
3. 调整 BLOCK_SIZE 得到相对最佳的值(测试环境上为32)。
4. 使用 shared_memory 进行优化,一个线程块共用一块 shared_memory,每个线程取部分数据提高数据访存效率。
5. 观察 body_force 函数,发现主要处理最后为加法,故将原本的每个线程处理一个 body 进行改进,不同块中的多个线程共同处理一个 body 的数据信息,进一步提升并行率。
6. 测试了使用 shuffle 特性,但在我的测试条件下性能不仅没有提升,反而略有下降,下文会附图说明。

基本就像上面描述的,从 parallel 版本到 shared 版本改动比较多,写了一小部分注释,不太明白的可以联系我来说明。有一些优化是有一点针对性的所以不一定能通用,像shuffle版本修改 block_size 似乎就结果不正确了,因为性能不高就没有深究了。

## 测试说明
测试环境 V-100 信息
![v100](./pic/V100.png)

未加速时CPU程序性能
![ori](./pic/ori.png)

基础并行版本性能
![parallel](./pic/parallel.png)

shuffle 版本性能
![shuffle](./pic/shuffle.png)

使用 shared_memory 的性能
![shared](./pic/shared.png)

偷偷再加了一点针对4096个body的优化的最佳结果
![best](./pic/best.png)
26 changes: 26 additions & 0 deletions header/check.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#include <stdio.h>
#include <math.h>

/*
 * Sanity-check the simulated positions for the default problem size.
 *
 * p       - flat array of floats holding the body data; the floats at
 *           indices 9*6, 9*6 + 1 and 9*6 + 2 are inspected (presumably the
 *           x/y/z of body 9 with 6 floats per body - confirm against caller).
 * nBodies - number of simulated bodies; the check only runs for 2<<11.
 *
 * Prints a single line to stdout stating whether the three sampled values,
 * truncated to int, equal -11, 3 and 10. For any other body count the
 * function returns without reading `p` or printing anything.
 */
void checkAccuracy(float *p, int nBodies)
{
    if (nBodies == 2<<11)
    {
        // Reference coordinates of the sampled particle for this run size:
        //   x = -11.943975, y = 3.198896, z = 10.517184
        // Only the integer part (cast truncates toward zero) is compared.
        const float *sample = p + 9*6;

        const int xMatches = ((int)sample[0] == -11);
        const int yMatches = ((int)sample[1] == 3);
        const int zMatches = ((int)sample[2] == 10);

        const char *verdict;
        if (xMatches && yMatches && zMatches)
        {
            verdict = "Simulator is calculating positions correctly.";
        }
        else
        {
            verdict = "Simulator is not calculating positions correctly.";
        }

        printf("%s\n", verdict);
    }
}
54 changes: 54 additions & 0 deletions header/timer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#ifndef TIMER_H
#define TIMER_H

#include <stdlib.h>

#ifdef WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#else
#ifndef __USE_BSD
#define __USE_BSD
#endif
#include <sys/time.h>
#endif

/*
 * Timer state shared by StartTimer() and GetTimer().
 * NOTE(review): these are non-static definitions inside a header, so including
 * timer.h from more than one translation unit would presumably produce
 * duplicate-symbol link errors - confirm it is only included once per program.
 */
#ifdef WIN32
// Performance-counter frequency divided by 1000; set by StartTimer().
double PCFreq = 0.0;
// Performance-counter reading captured by the last StartTimer() call.
__int64 timerStart = 0;
#else
// Time of day captured by the last StartTimer() call via gettimeofday().
struct timeval timerStart;
#endif

/*
 * Record "now" as the reference point that GetTimer() measures from.
 *
 * Windows build: stores the performance-counter frequency divided by 1000 in
 * PCFreq and the current counter value in timerStart. A failed frequency query
 * is reported on stdout but execution continues.
 * Other builds: stores the current gettimeofday() value in timerStart.
 */
void StartTimer()
{
#if defined(WIN32)
    LARGE_INTEGER frequency;
    LARGE_INTEGER counterNow;

    if (QueryPerformanceFrequency(&frequency) == 0)
    {
        printf("QueryPerformanceFrequency failed!\n");
    }

    // Dividing by 1000 lets GetTimer() return milliseconds directly.
    PCFreq = (double)frequency.QuadPart / 1000.0;

    QueryPerformanceCounter(&counterNow);
    timerStart = counterNow.QuadPart;
#else
    gettimeofday(&timerStart, NULL);
#endif
}

/*
 * Return the time elapsed since the last StartTimer() call, in milliseconds.
 *
 * Windows build: difference of performance-counter readings divided by PCFreq.
 * Other builds: difference between the current gettimeofday() value and
 * timerStart.
 *
 * StartTimer() must have been called first; otherwise the result is measured
 * from whatever timerStart currently holds.
 */
double GetTimer()
{
#ifdef WIN32
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (double)(li.QuadPart - timerStart) / PCFreq;
#else
    struct timeval timerStop;
    gettimeofday(&timerStop, NULL);

    // Subtract the fields directly rather than via timersub(): that macro is a
    // BSD extension which <sys/time.h> only exposes under non-standard feature
    // macros. A negative microsecond difference needs no borrow handling
    // because both terms are folded into one double below.
    const double elapsedSec = (double)(timerStop.tv_sec - timerStart.tv_sec);
    const double elapsedUsec = (double)(timerStop.tv_usec - timerStart.tv_usec);

    return elapsedSec * 1000.0 + elapsedUsec / 1000.0;
#endif
}

#endif // TIMER_H
Binary file added pic/V100.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added pic/best.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added pic/ori.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added pic/parallel.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added pic/shared.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added pic/shuffle.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
158 changes: 158 additions & 0 deletions src/nbody_ori.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "check.h"
#include <cuda_runtime.h>

#define SOFTENING 1e-9f

/*
 * One simulated body: a position (x, y, z) followed by a velocity
 * (vx, vy, vz), six floats in that order. The field order matters because
 * main() fills the bodies through a flat float buffer cast to Body*.
 */

typedef struct
{
    float x;  // position
    float y;
    float z;
    float vx; // velocity
    float vy;
    float vz;
} Body;

/*
 * Do not modify this function. A constraint of this exercise is
 * that it remain a host function.
 *
 * Fills data[0] .. data[n-1] with pseudo-random floats in [-1.0, 1.0]
 * drawn from rand(). No srand() call is made here, so the sequence depends
 * on the C library's current rand() state.
 *
 * data - destination array with room for at least n floats.
 * n    - number of floats to write; nothing is written when n <= 0.
 */

void randomizeBodies(float *data, int n)
{
for (int i = 0; i < n; i++)
{
// rand() / RAND_MAX lies in [0, 1]; scale by 2 and shift by -1 to get [-1, 1].
data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
}
}

/*
 * Apply one velocity update to every body from the pull of all bodies.
 *
 * For each body i this sums, over every body j (including j == i),
 * (p[j] - p[i]) / |p[j] - p[i]|^3 per axis and then adds dt times that sum to
 * body i's velocity. Positions are only read, never written, so updating the
 * velocities in place during the sweep does not affect later bodies.
 *
 * p  - array of n bodies; vx/vy/vz are updated in place.
 * dt - time step multiplied into the accumulated sum.
 * n  - number of bodies; nothing happens when n <= 0.
 *
 * SOFTENING is added to the squared distance so the reciprocal square root
 * never sees zero; the j == i term then contributes exactly zero because its
 * displacement is zero.
 * NOTE(review): rsqrtf is not part of standard <math.h>; presumably it comes
 * from the CUDA headers this file includes - confirm when building elsewhere.
 */

void bodyForce(Body *p, float dt, int n)
{
    int target = 0;
    while (target < n)
    {
        Body *self = &p[target];
        float sumX = 0.0f;
        float sumY = 0.0f;
        float sumZ = 0.0f;

        for (int other = 0; other < n; other++)
        {
            const float dx = p[other].x - self->x;
            const float dy = p[other].y - self->y;
            const float dz = p[other].z - self->z;

            const float distSqr = dx * dx + dy * dy + dz * dz + SOFTENING;
            const float invDist = rsqrtf(distSqr);
            const float invDistCubed = invDist * invDist * invDist;

            sumX += dx * invDistCubed;
            sumY += dy * invDistCubed;
            sumZ += dz * invDistCubed;
        }

        self->vx += dt * sumX;
        self->vy += dt * sumY;
        self->vz += dt * sumZ;

        ++target;
    }
}

/*
 * Entry point of the CPU reference n-body simulation.
 *
 * Usage: <program> [exponent [salt]]
 *   exponent - optional; the body count is 2 << exponent. Must be an integer
 *              in [0, 27]. Defaults to 11, i.e. 2 << 11 bodies.
 *   salt     - optional integer passed to checkPerformance() in ASSESS builds.
 *
 * Fills the bodies with random data, runs nIters timed iterations of
 * bodyForce() followed by position integration, and then either calls
 * checkPerformance() (ASSESS builds) or calls checkAccuracy() and prints the
 * average number of billions of interactions per second.
 *
 * Returns 0 on success, or EXIT_FAILURE when the exponent argument is invalid
 * or the body buffer cannot be allocated.
 */
int main(const int argc, const char **argv)
{

    /*
     * Do not change the value for `nBodies` here. If you would like to modify it,
     * pass values into the command line.
     */

    int nBodies = 2 << 11;
    int salt = 0;
    if (argc > 1)
    {
        /*
         * Validate the exponent before shifting: a negative or oversized shift
         * count is undefined behavior, and `6 * nBodies` further down must
         * still fit in an int, which caps the exponent at 27 for a 32-bit int.
         */
        const long maxExponent = 27;
        char *end = NULL;
        const long exponent = strtol(argv[1], &end, 10);

        if (end == argv[1] || *end != '\0' || exponent < 0 || exponent > maxExponent)
        {
            fprintf(stderr, "Invalid body-count exponent '%s' (expected an integer in [0, %ld])\n",
                    argv[1], maxExponent);
            return EXIT_FAILURE;
        }

        nBodies = 2 << (int)exponent;
    }

    /*
     * This salt is for assessment reasons. Tampering with it will result in automatic failure.
     */

    if (argc > 2)
        salt = atoi(argv[2]);

    const float dt = 0.01f; // time step
    const int nIters = 10;  // simulation iterations

    // Compute the size in size_t so the product is never narrowed to int.
    const size_t bytes = (size_t)nBodies * sizeof(Body);

    // The cast is required because this file is compiled as C++ by nvcc.
    float *buf = (float *)malloc(bytes);
    if (buf == NULL)
    {
        fprintf(stderr, "Failed to allocate %zu bytes for %d bodies\n", bytes, nBodies);
        return EXIT_FAILURE;
    }

    // The same storage viewed as bodies: 6 consecutive floats per Body.
    Body *p = (Body *)buf;

    /*
     * As a constraint of this exercise, `randomizeBodies` must remain a host function.
     */

    randomizeBodies(buf, 6 * nBodies); // Init pos / vel data

    double totalTime = 0.0;

    /*
     * This simulation will run for 10 cycles of time, calculating gravitational
     * interaction amongst bodies, and adjusting their positions to reflect.
     */

    /*******************************************************************/
    // Do not modify these 2 lines of code.
    for (int iter = 0; iter < nIters; iter++)
    {
        StartTimer();
        /*******************************************************************/

        /*
         * You will likely wish to refactor the work being done in `bodyForce`,
         * as well as the work to integrate the positions.
         */

        bodyForce(p, dt, nBodies); // compute interbody forces

        /*
         * This position integration cannot occur until this round of `bodyForce` has completed.
         * Also, the next round of `bodyForce` cannot begin until the integration is complete.
         */

        for (int i = 0; i < nBodies; i++)
        { // integrate position
            p[i].x += p[i].vx * dt;
            p[i].y += p[i].vy * dt;
            p[i].z += p[i].vz * dt;
        }

        /*******************************************************************/
        // Do not modify the code in this section.
        const double tElapsed = GetTimer() / 1000.0;
        totalTime += tElapsed;
    }

    double avgTime = totalTime / (double)(nIters);
    float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;

#ifdef ASSESS
    checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
    checkAccuracy(buf, nBodies);
    printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond);
    salt += 1;
#endif
    /*******************************************************************/

    /*
     * Feel free to modify code below.
     */

    free(buf);
    return 0;
}
Loading

0 comments on commit 1415d79

Please sign in to comment.