-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
dbb5565
commit 2ce7599
Showing
32 changed files
with
1,764 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,243 @@ | ||
/* | ||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved. | ||
* | ||
* Please refer to the NVIDIA end user license agreement (EULA) associated | ||
* with this source code for terms and conditions that govern your use of | ||
* this software. Any use, reproduction, disclosure, or distribution of | ||
* this software and related documentation outside the terms of the EULA | ||
* is strictly prohibited. | ||
* | ||
*/ | ||
/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */ | ||
|
||
#include <cuda.h> | ||
#include <cuda_runtime_api.h> | ||
#include <stdio.h> | ||
#include <string> | ||
|
||
int *pArgc = NULL; | ||
char **pArgv = NULL; | ||
|
||
// This function wraps the CUDA Driver API into a template function | ||
template <class T> | ||
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) | ||
{ | ||
CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device); | ||
|
||
if (CUDA_SUCCESS != error) | ||
{ | ||
fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", | ||
error, __FILE__, __LINE__); | ||
exit(EXIT_FAILURE); | ||
} | ||
} | ||
|
||
|
||
// Beginning of GPU Architecture definitions
// Maps an SM version (major.minor) to the number of CUDA cores per
// multiprocessor. Unknown SM versions fall back to the newest (last)
// architecture in the table.
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
    typedef struct
    {
        int SM;    // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores; // CUDA cores per multiprocessor for that SM version
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] =
    {
        { 0x10, 8 },   // Tesla Generation (SM 1.0) G80 class
        { 0x11, 8 },   // Tesla Generation (SM 1.1) G8x class
        { 0x12, 8 },   // Tesla Generation (SM 1.2) G9x class
        { 0x13, 8 },   // Tesla Generation (SM 1.3) GT200 class
        { 0x20, 32 },  // Fermi Generation (SM 2.0) GF100 class
        { 0x21, 48 },  // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
        { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
        { -1, -1 }     // sentinel: terminates the lookup loop
    };

    int index = 0;

    while (nGpuArchCoresPerSM[index].SM != -1)
    {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
        {
            return nGpuArchCoresPerSM[index].Cores;
        }

        index++;
    }

    // If we don't find the values, default to the newest architecture in the
    // table so the caller can still run. Fixed: the fallback used a hard-coded
    // index (7) that silently breaks whenever an entry is added to the table;
    // derive the last valid index from the sentinel position instead.
    int last = index - 1;
    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[last].Cores);
    return nGpuArchCoresPerSM[last].Cores;
}
|
||
////////////////////////////////////////////////////////////////////////////////
// Program main
//
// Enumerates every CUDA-capable device visible to the runtime API and prints
// a detailed property report per device, then emits a single CSV-style
// summary line for log scraping. Exits with EXIT_FAILURE only when the
// device count itself cannot be queried.
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    // Expose argc/argv through the file-scope globals; nothing in this
    // translation unit reads them back (kept for sample parity).
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    // Abort early if the runtime itself cannot be queried (e.g. no driver
    // installed or driver/runtime version mismatch).
    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev, driverVersion = 0, runtimeVersion = 0;

    // Per-device report: one paragraph of properties for each device.
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log
        // Versions are encoded as 1000*major + 10*minor, hence the /1000 and
        // (%100)/10 decoding below.
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
        printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);

        // Format into a local buffer first; 256 bytes comfortably holds the
        // fixed text plus two numeric fields.
        char msg[256];
        sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf(" (%2d) Multiprocessors x (%3d) CUDA Cores/MP: %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        // clockRate is reported by the runtime in kHz; scale to MHz / GHz.
        printf(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize)
        {
            printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in
        // the CUDA Driver API), so fall back to getCudaAttribute<>().
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize)
        {
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }
#endif

        printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
               deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1],
               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);

        printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
        printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf(" Warp size: %d\n", deviceProp.warpSize);
        printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
        printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
        printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
        printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
        printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID);

        // Human-readable names for cudaComputeMode values, indexed directly
        // by deviceProp.computeMode.
        // NOTE(review): computeMode is used as a raw index with no bounds
        // check -- verify it can never exceed 4 on the targeted runtimes.
        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        printf(" Compute Mode:\n");
        printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    // NOTE(review): cTemp is 16 bytes; ", Device%d = " plus a 4+ digit device
    // index would overflow -- confirm device counts stay small.
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
    sprintf(cTemp, "%d", deviceCount);
    sProfileString += cTemp;

    // Print Out all device Names
    for (dev = 0; dev < deviceCount; ++dev)
    {
        sprintf(cTemp, ", Device%d = ", dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }

    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    // finish
    exit(EXIT_SUCCESS);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#ifndef __GPU_TIMER_H__ | ||
#define __GPU_TIMER_H__ | ||
|
||
struct GpuTimer | ||
{ | ||
cudaEvent_t start; | ||
cudaEvent_t stop; | ||
|
||
GpuTimer() | ||
{ | ||
cudaEventCreate(&start); | ||
cudaEventCreate(&stop); | ||
} | ||
|
||
~GpuTimer() | ||
{ | ||
cudaEventDestroy(start); | ||
cudaEventDestroy(stop); | ||
} | ||
|
||
void Start() | ||
{ | ||
cudaEventRecord(start, 0); | ||
} | ||
|
||
void Stop() | ||
{ | ||
cudaEventRecord(stop, 0); | ||
} | ||
|
||
float Elapsed() | ||
{ | ||
float elapsed; | ||
cudaEventSynchronize(stop); | ||
cudaEventElapsedTime(&elapsed, start, stop); | ||
return elapsed; | ||
} | ||
}; | ||
|
||
#endif /* __GPU_TIMER_H__ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Build script: compiles both samples with the CUDA compiler driver.
# Requires the CUDA toolkit's nvcc on PATH.
# The transpose sample contains device code (.cu); the deviceQuery sample is
# host-only runtime-API code (.cpp) but is still built with nvcc so the CUDA
# headers and libraries are picked up automatically.
nvcc -o transpose transpose.cu
nvcc -o deviceQuery_simplified deviceQuery_simplified.cpp
Oops, something went wrong.