-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
AlexZFX
committed
Jun 2, 2019
1 parent
4d8cbf9
commit 1415d79
Showing
14 changed files
with
797 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
.vscode | ||
.idea |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,37 @@ | ||
# cuda-nbody | ||
# cuda-nbody | ||
|
||
大三下分布式并行计算实验代码,实验在Nvidia的Courses上进行,利用cuda对 nbody 算法进行优化。 | ||
|
||
网上搜索nbody的cuda优化已经有很多相应的思路介绍,我这里基于网上的各种思路实现和尝试,使用Nvidia的环境进行编写和测试,给出了最基础的并行版本和我的最终版本代码,代码中也有一定的注释说明。 | ||
|
||
代码中的check函数之类的是 Nvidia 平台上测试使用的。 | ||
|
||
## 优化思路如下 | ||
|
||
1. 首先将原有代码并行化,每个线程处理一个位置的body(参考[nbody_parallel.cu](./src/noby_parallel.cu))。 | ||
2. 手动管理内存的拷贝、申请和释放,以消除缺页异常等开销(cudaMallocManaged -> cudaMalloc / cudaMallocHost)。 | ||
3. 调整 BLOCK_SIZE 得到相对最佳的值(测试环境上为32)。 | ||
4. 使用 shared_memory 进行优化,一个线程块共用一块 shared_memory,每个线程取部分数据提高数据访存效率。 | ||
5. 观察 body_force 函数,发现其主要计算最终归结为加法(累加),因此将原本"每个线程处理一个 body"的方式改为由不同块中的多个线程共同处理同一个 body 的数据,进一步提升并行度。 | ||
6. 测试了 shuffle 特性,但在我的测试条件下性能没有提升,反而略有下降,后文会附图说明。 | ||
|
||
基本就像上面描述的,从 parallel 版本到 shared 版本改动比较多,写了一小部分注释,不太明白的可以联系我来说明。有一些优化是有一点针对性的所以不一定能通用,像shuffle版本修改 block_size 似乎就结果不正确了,因为性能不高就没有深究了。 | ||
|
||
## 测试说明 | ||
测试环境 V-100 信息 | ||
![v100](./pic/v100.png) | ||
|
||
未加速时CPU程序性能 | ||
![ori](./pic/ori.png) | ||
|
||
基础并行版本性能 | ||
![parallel](./pic/parallel.png) | ||
|
||
shuffle 版本性能 | ||
![shuffle](./pic/shuffle.png) | ||
|
||
使用 shared_memory 的性能 | ||
![shared](./pic/shared.png) | ||
|
||
偷偷再加了一点针对4096个body的优化的最佳结果 | ||
![best](./pic/best.png) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#include <stdio.h> | ||
#include <math.h> | ||
|
||
/*
 * Spot-check the simulation output and print a one-line verdict.
 *
 * p       - flat float buffer holding the simulation state; the three
 *           floats starting at index 9*6 are inspected (presumably the
 *           x, y, z of body 9 with six floats per body - confirm against
 *           the caller's layout).
 * nBodies - number of simulated bodies.
 *
 * The check only applies to the default problem size (2<<11 bodies);
 * for any other size the function returns without printing anything.
 * The sampled coordinates are expected to be -11.943975, 3.198896 and
 * 10.517184; only their truncated integer parts are compared.
 */
void checkAccuracy(float *p, int nBodies)
{
    if (nBodies == 2 << 11)
    {
        const float *sample = p + 9 * 6;
        const int expected[3] = { -11, 3, 10 };
        int matches = 0;

        for (int axis = 0; axis < 3; ++axis)
        {
            if ((int)sample[axis] == expected[axis])
                ++matches;
        }

        if (matches == 3)
            puts("Simulator is calculating positions correctly.");
        else
            puts("Simulator is not calculating positions correctly.");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#ifndef TIMER_H | ||
#define TIMER_H | ||
|
||
#include <stdlib.h> | ||
|
||
#ifdef WIN32 | ||
#define WIN32_LEAN_AND_MEAN | ||
#include <windows.h> | ||
#else | ||
#ifndef __USE_BSD | ||
#define __USE_BSD | ||
#endif | ||
#include <sys/time.h> | ||
#endif | ||
|
||
#ifdef WIN32
/* Performance-counter ticks per millisecond; written by StartTimer(). */
double PCFreq = 0.0;
/* Performance-counter reading captured by the last StartTimer() call. */
__int64 timerStart = 0;
#else
/* Wall-clock time captured by the last StartTimer() call. */
struct timeval timerStart;
#endif

/*
 * Record "now" as the reference point for later GetTimer() calls.
 *
 * NOTE(review): the WIN32 branch calls printf(), but this header does not
 * include <stdio.h> itself; it relies on the including file having done so.
 */
void StartTimer(void)
{
#ifdef WIN32
    LARGE_INTEGER li;
    if (!QueryPerformanceFrequency(&li))
        printf("QueryPerformanceFrequency failed!\n");

    /* Convert ticks-per-second into ticks-per-millisecond. */
    PCFreq = (double)li.QuadPart / 1000.0;

    QueryPerformanceCounter(&li);
    timerStart = li.QuadPart;
#else
    gettimeofday(&timerStart, NULL);
#endif
}

/*
 * Return the time elapsed since the last StartTimer() call, in milliseconds.
 */
double GetTimer(void)
{
#ifdef WIN32
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (double)(li.QuadPart - timerStart) / PCFreq;
#else
    struct timeval timerStop;
    gettimeofday(&timerStop, NULL);

    /*
     * Subtract field by field instead of using timersub(): that macro is a
     * BSD extension and is not declared in strict ISO C mode. A negative
     * microsecond difference is compensated by the whole-second term, so
     * no explicit borrow is needed.
     */
    return (double)(timerStop.tv_sec - timerStart.tv_sec) * 1000.0
         + (double)(timerStop.tv_usec - timerStart.tv_usec) / 1000.0;
#endif
}
|
||
#endif // TIMER_H |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
#include <math.h> | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include "timer.h" | ||
#include "check.h" | ||
#include <cuda_runtime.h> | ||
|
||
#define SOFTENING 1e-9f | ||
|
||
/* | ||
* Each body contains x, y, and z coordinate positions, | ||
* as well as velocities in the x, y, and z directions. | ||
*/ | ||
|
||
typedef struct
{
    float x;  /* position, x coordinate */
    float y;  /* position, y coordinate */
    float z;  /* position, z coordinate */
    float vx; /* velocity along x */
    float vy; /* velocity along y */
    float vz; /* velocity along z */
} Body;
|
||
/* | ||
* Do not modify this function. A constraint of this exercise is | ||
* that it remain a host function. | ||
*/ | ||
|
||
/*
 * Fill data[0 .. n-1] with pseudo-random floats in the range [-1, 1].
 *
 * data - destination buffer with room for at least n floats
 * n    - number of floats to write; nothing is written when n <= 0
 *
 * Values come from rand(), so the sequence depends on the current seed.
 */
void randomizeBodies(float *data, int n)
{
    int filled = 0;

    while (filled < n)
    {
        /* Scale the raw draw into [0, 1] before mapping it to [-1, 1]. */
        const float unit = rand() / (float)RAND_MAX;
        data[filled] = 2.0f * unit - 1.0f;
        ++filled;
    }
}
|
||
/* | ||
* This function calculates the gravitational impact of all bodies in the system | ||
* on all others, but does not update their positions. | ||
*/ | ||
|
||
/*
 * Update the velocity of every body from the pairwise attraction of all
 * bodies (including the body itself, whose contribution is zero because
 * its displacement is zero). Positions are read but never written here.
 *
 * p  - array of n bodies; vx, vy, vz are updated in place
 * dt - time step used to scale the accumulated force
 * n  - number of bodies
 */
void bodyForce(Body *p, float dt, int n)
{
    for (int i = 0; i < n; ++i)
    {
        Body *const self = &p[i];

        /* Positions are not modified in this function, so cache them. */
        const float xi = self->x;
        const float yi = self->y;
        const float zi = self->z;

        float ax = 0.0f;
        float ay = 0.0f;
        float az = 0.0f;

        for (const Body *other = p; other != p + n; ++other)
        {
            const float dx = other->x - xi;
            const float dy = other->y - yi;
            const float dz = other->z - zi;

            /* SOFTENING keeps the squared distance non-zero for the i == j pair. */
            const float distSqr = dx * dx + dy * dy + dz * dz + SOFTENING;
            const float invDist = rsqrtf(distSqr);
            const float invDist3 = invDist * invDist * invDist;

            ax += dx * invDist3;
            ay += dy * invDist3;
            az += dz * invDist3;
        }

        self->vx += dt * ax;
        self->vy += dt * ay;
        self->vz += dt * az;
    }
}
|
||
int main(const int argc, const char **argv) | ||
{ | ||
|
||
/* | ||
* Do not change the value for `nBodies` here. If you would like to modify it, | ||
* pass values into the command line. | ||
*/ | ||
|
||
int nBodies = 2 << 11; | ||
int salt = 0; | ||
if (argc > 1) | ||
nBodies = 2 << atoi(argv[1]); | ||
|
||
/* | ||
* This salt is for assessment reasons. Tampering with it will result in automatic failure. | ||
*/ | ||
|
||
if (argc > 2) | ||
salt = atoi(argv[2]); | ||
|
||
const float dt = 0.01f; // time step | ||
const int nIters = 10; // simulation iterations | ||
|
||
int bytes = nBodies * sizeof(Body); | ||
float *buf; | ||
|
||
buf = (float *)malloc(bytes); | ||
|
||
Body *p = (Body *)buf; | ||
|
||
/* | ||
* As a constraint of this exercise, `randomizeBodies` must remain a host function. | ||
*/ | ||
|
||
randomizeBodies(buf, 6 * nBodies); // Init pos / vel data | ||
|
||
double totalTime = 0.0; | ||
|
||
/* | ||
* This simulation will run for 10 cycles of time, calculating gravitational | ||
* interaction amongst bodies, and adjusting their positions to reflect. | ||
*/ | ||
|
||
/*******************************************************************/ | ||
// Do not modify these 2 lines of code. | ||
for (int iter = 0; iter < nIters; iter++) | ||
{ | ||
StartTimer(); | ||
/*******************************************************************/ | ||
|
||
/* | ||
* You will likely wish to refactor the work being done in `bodyForce`, | ||
* as well as the work to integrate the positions. | ||
*/ | ||
|
||
bodyForce(p, dt, nBodies); // compute interbody forces | ||
|
||
/* | ||
* This position integration cannot occur until this round of `bodyForce` has completed. | ||
* Also, the next round of `bodyForce` cannot begin until the integration is complete. | ||
*/ | ||
|
||
for (int i = 0; i < nBodies; i++) | ||
{ // integrate position | ||
p[i].x += p[i].vx * dt; | ||
p[i].y += p[i].vy * dt; | ||
p[i].z += p[i].vz * dt; | ||
} | ||
|
||
/*******************************************************************/ | ||
// Do not modify the code in this section. | ||
const double tElapsed = GetTimer() / 1000.0; | ||
totalTime += tElapsed; | ||
} | ||
|
||
double avgTime = totalTime / (double)(nIters); | ||
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime; | ||
|
||
#ifdef ASSESS | ||
checkPerformance(buf, billionsOfOpsPerSecond, salt); | ||
#else | ||
checkAccuracy(buf, nBodies); | ||
printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond); | ||
salt += 1; | ||
#endif | ||
/*******************************************************************/ | ||
|
||
/* | ||
* Feel free to modify code below. | ||
*/ | ||
|
||
free(buf); | ||
} |
Oops, something went wrong.