Commit

CUDA learning materials (cuda学习资料)
SeventhBlue committed Apr 4, 2020
1 parent dbb5565 commit 2ce7599
Showing 32 changed files with 1,764 additions and 0 deletions.
Binary file added 第一课/第一课.pptx
Binary file not shown.
Binary file added 第七课/第七课.pptx
Binary file not shown.
243 changes: 243 additions & 0 deletions 第七课/课程代码/deviceQuery_simplified.cpp
@@ -0,0 +1,243 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <string>

int *pArgc = NULL;
char **pArgv = NULL;

// This function wraps the CUDA Driver API into a template function
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);

    if (CUDA_SUCCESS != error)
    {
        fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
                error, __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
}


// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
    typedef struct
    {
        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] =
    {
        { 0x10,   8 }, // Tesla Generation  (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation  (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation  (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation  (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation  (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation  (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
        { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
        {   -1,  -1 }
    };

    int index = 0;

    // Pack major/minor into the 0xMm key, e.g. SM 3.5 -> (3 << 4) + 5 = 0x35 -> 192 cores/SM
    while (nGpuArchCoresPerSM[index].SM != -1)
    {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
        {
            return nGpuArchCoresPerSM[index].Cores;
        }

        index++;
    }

    // If we don't find the value, default to the last defined entry so the sample still runs
    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores);
    return nGpuArchCoresPerSM[7].Cores;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev, driverVersion = 0, runtimeVersion = 0;

    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        sprintf(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf("  (%2d) Multiprocessors x (%3d) CUDA Cores/MP:    %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf("  GPU Clock rate:                                %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);


#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf("  Memory Clock rate:                             %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n", deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This is only available in CUDA 4.0-4.2 (these attributes were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf("  Memory Clock rate:                             %.0f MHz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }
#endif

printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n",
deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1],
deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);

printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
printf(" Warp size: %d\n", deviceProp.warpSize);
printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
deviceProp.maxThreadsDim[0],
deviceProp.maxThreadsDim[1],
deviceProp.maxThreadsDim[2]);
printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
deviceProp.maxGridSize[0],
deviceProp.maxGridSize[1],
deviceProp.maxGridSize[2]);
printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID);

const char *sComputeMode[] =
{
"Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
"Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
"Prohibited (no host thread can use ::cudaSetDevice() with this device)",
"Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
"Unknown",
NULL
};
printf(" Compute Mode:\n");
printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
}

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
    sprintf(cTemp, "%d", deviceCount);
    sProfileString += cTemp;

    // Print out all device names
    for (dev = 0; dev < deviceCount; ++dev)
    {
        sprintf(cTemp, ", Device%d = ", dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }

    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    // finish
    exit(EXIT_SUCCESS);
}
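
As an aside: on CUDA 5.0 and later, the runtime API can also query individual attributes directly via cudaDeviceGetAttribute, so the Driver API fallback in the #else branch above is only needed on CUDA 4.x toolkits. A minimal sketch, not part of the commit (assumes device 0 exists; values are in kHz, as with the Driver API):

#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h>

int main()
{
    // Query the memory clock of device 0 through the runtime API (CUDA 5.0+)
    int memoryClockKHz = 0;
    cudaError_t err = cudaDeviceGetAttribute(&memoryClockKHz, cudaDevAttrMemoryClockRate, 0);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceGetAttribute failed: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }

    printf("Memory Clock rate: %.0f MHz\n", memoryClockKHz * 1e-3f);
    return EXIT_SUCCESS;
}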
40 changes: 40 additions & 0 deletions 第七课/课程代码/gputimer.h
@@ -0,0 +1,40 @@
#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__

#include <cuda_runtime.h>  // cudaEvent_* APIs; needed when included from a plain C++ translation unit

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);  // record on the default stream
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);  // wait until the stop event has completed
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;  // milliseconds
    }
};

#endif /* __GPU_TIMER_H__ */
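
A minimal usage sketch for GpuTimer (the fill kernel, its launch configuration, and the surrounding main are illustrative assumptions, not part of this commit): bracket a kernel launch with Start()/Stop(), then read Elapsed(), which returns milliseconds.

#include <stdio.h>
#include "gputimer.h"

// Illustrative kernel: each thread writes its global index.
__global__ void fill(int *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = i;
}

int main()
{
    const int n = 1 << 20;
    int *d_out;
    cudaMalloc(&d_out, n * sizeof(int));

    GpuTimer timer;
    timer.Start();
    fill<<<(n + 255) / 256, 256>>>(d_out, n);
    timer.Stop();

    // Elapsed() synchronizes on the stop event before reading the time.
    printf("kernel time: %f ms\n", timer.Elapsed());

    cudaFree(d_out);
    return 0;
}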
2 changes: 2 additions & 0 deletions 第七课/课程代码/nvcc_commands.txt
@@ -0,0 +1,2 @@
nvcc -o transpose transpose.cu
nvcc -o deviceQuery_simplified deviceQuery_simplified.cpp
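
If the GpuTimer sketch above were saved as timer_demo.cu (a hypothetical file name, not in this commit), it would build the same way:

nvcc -o timer_demo timer_demo.cu

Note that deviceQuery_simplified.cpp builds with nvcc despite being a .cpp file: nvcc forwards host-only code to the host C++ compiler and links the CUDA runtime automatically.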