Commit

CUDA learning materials (cuda学习资料)
SeventhBlue committed Apr 4, 2020
1 parent dbb5565 commit 2ce7599
Showing 32 changed files with 1,764 additions and 0 deletions.
Binary file added 第一课/第一课.pptx
Binary file not shown.
Binary file added 第七课/第七课.pptx
Binary file not shown.
243 changes: 243 additions & 0 deletions 第七课/课程代码/deviceQuery_simplified.cpp
@@ -0,0 +1,243 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <string>

int *pArgc = NULL;
char **pArgv = NULL;

// This function wraps the CUDA Driver API into a template function
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);

    if (CUDA_SUCCESS != error)
    {
        fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
                error, __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
}


// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
    typedef struct
    {
        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] =
    {
        { 0x10,   8 }, // Tesla Generation  (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation  (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation  (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation  (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation  (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation  (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
        { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
        {   -1,  -1 }
    };

    int index = 0;

    // Pack major/minor into the 0xMm key, e.g. SM 3.5 -> (3 << 4) + 5 = 0x35 -> 192 cores/SM
    while (nGpuArchCoresPerSM[index].SM != -1)
    {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
        {
            return nGpuArchCoresPerSM[index].Cores;
        }

        index++;
    }

    // If we don't find the value, default to the last defined entry so the sample still runs
    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores);
    return nGpuArchCoresPerSM[7].Cores;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev, driverVersion = 0, runtimeVersion = 0;

    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        sprintf(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf("  (%2d) Multiprocessors x (%3d) CUDA Cores/MP:    %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf("  GPU Clock rate:                                %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);


#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf("  Memory Clock rate:                             %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n", deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This is only available in CUDA 4.0-4.2 (these attributes were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf("  Memory Clock rate:                             %.0f MHz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }
#endif

printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n",
deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1],
deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);

printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
printf(" Warp size: %d\n", deviceProp.warpSize);
printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
deviceProp.maxThreadsDim[0],
deviceProp.maxThreadsDim[1],
deviceProp.maxThreadsDim[2]);
printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
deviceProp.maxGridSize[0],
deviceProp.maxGridSize[1],
deviceProp.maxGridSize[2]);
printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID);

const char *sComputeMode[] =
{
"Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
"Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
"Prohibited (no host thread can use ::cudaSetDevice() with this device)",
"Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
"Unknown",
NULL
};
printf(" Compute Mode:\n");
printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
}

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
    sprintf(cTemp, "%d", deviceCount);
    sProfileString += cTemp;

    // Print out all device names
    for (dev = 0; dev < deviceCount; ++dev)
    {
        sprintf(cTemp, ", Device%d = ", dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }

    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    // finish
    exit(EXIT_SUCCESS);
}
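
As an aside: on CUDA 5.0 and later, the runtime API can also query individual attributes directly via cudaDeviceGetAttribute, so the Driver API fallback in the #else branch above is only needed on CUDA 4.x toolkits. A minimal sketch, not part of the commit (assumes device 0 exists; values are in kHz, as with the Driver API):

#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h>

int main()
{
    // Query the memory clock of device 0 through the runtime API (CUDA 5.0+)
    int memoryClockKHz = 0;
    cudaError_t err = cudaDeviceGetAttribute(&memoryClockKHz, cudaDevAttrMemoryClockRate, 0);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceGetAttribute failed: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }

    printf("Memory Clock rate: %.0f MHz\n", memoryClockKHz * 1e-3f);
    return EXIT_SUCCESS;
}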
40 changes: 40 additions & 0 deletions 第七课/课程代码/gputimer.h
@@ -0,0 +1,40 @@
#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__

#include <cuda_runtime.h>  // cudaEvent_* APIs; needed when included from a plain C++ translation unit

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);  // record on the default stream
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);  // wait until the stop event has completed
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;  // milliseconds
    }
};

#endif /* __GPU_TIMER_H__ */
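
A minimal usage sketch for GpuTimer (the fill kernel, its launch configuration, and the surrounding main are illustrative assumptions, not part of this commit): bracket a kernel launch with Start()/Stop(), then read Elapsed(), which returns milliseconds.

#include <stdio.h>
#include "gputimer.h"

// Illustrative kernel: each thread writes its global index.
__global__ void fill(int *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = i;
}

int main()
{
    const int n = 1 << 20;
    int *d_out;
    cudaMalloc(&d_out, n * sizeof(int));

    GpuTimer timer;
    timer.Start();
    fill<<<(n + 255) / 256, 256>>>(d_out, n);
    timer.Stop();

    // Elapsed() synchronizes on the stop event before reading the time.
    printf("kernel time: %f ms\n", timer.Elapsed());

    cudaFree(d_out);
    return 0;
}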
2 changes: 2 additions & 0 deletions 第七课/课程代码/nvcc_commands.txt
@@ -0,0 +1,2 @@
nvcc -o transpose transpose.cu
nvcc -o deviceQuery_simplified deviceQuery_simplified.cpp
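
If the GpuTimer sketch above were saved as timer_demo.cu (a hypothetical file name, not in this commit), it would build the same way:

nvcc -o timer_demo timer_demo.cu

Note that deviceQuery_simplified.cpp builds with nvcc despite being a .cpp file: nvcc forwards host-only code to the host C++ compiler and links the CUDA runtime automatically.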