Skip to content

Commit

Permalink
Update first way to calc pixel sum for image and add second one.
Browse files Browse the repository at this point in the history
  • Loading branch information
volodymyrkochyn committed May 11, 2019
1 parent c498c48 commit d8ec52c
Show file tree
Hide file tree
Showing 2 changed files with 254 additions and 19 deletions.
46 changes: 27 additions & 19 deletions cuda.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
%%cu

// BMP-related data types based on Microsoft's own

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
Expand Down Expand Up @@ -106,6 +104,7 @@ int sum(const unsigned char* image, size_t size)

start_t = clock();
int sum = 0;
// count only one color channel
for (size_t i = 0; i < size; i+=3)
{
sum += image[i];
Expand All @@ -132,17 +131,20 @@ int sum(const unsigned char* image, size_t size)
} while (0)


__global__ void sum_simple(unsigned char *g_ivec, int *g_ovec){
//extern __shared__ int sdata[];
int idx = threadIdx.x;
/*sdata[idx] = g_ivec[idx];
for (unsigned int s=1; s < blockDim.x ; s *= 2) {
__global__ void sum_simple(unsigned char *g_ivec, int *g_ovec, int index)
{
extern __shared__ int sdata[];
int idx = blockDim.x * blockIdx.x + threadIdx.x;
sdata[idx] = g_ivec[(idx+index*blockDim.x)*3];

__syncthreads();
for (unsigned int s=1; s < blockDim.x; s *= 2) {
if (idx % (2*s) == 0) {
sdata[idx] += sdata[idx + s];
}
__syncthreads();
}*/
g_ovec[0] += g_ivec[idx];
}
g_ovec[0] += sdata[0];
}

int main()
Expand All @@ -163,29 +165,35 @@ int main()
cudaEventCreate(&start);
cudaEventCreate(&stop);

unsigned char *d_image;
int *d_result, *h_result ;

//ALLOCATE HOST MEM
h_result = (int *) malloc(sizeof(int));
int *h_result = (int *) malloc(sizeof(int));

//ALLOCATE MEM
cudaMalloc(&d_image, 1024);
cudaMalloc(&d_result, sizeof(int));
int *d_result;
unsigned char *d_image;
cudaMalloc(&d_image, size);
cudaMalloc(&d_result, sizeof(int) * 2);
cudaCheckErrors("cudaMalloc fail \n");

cudaEventRecord(start, 0);

cudaCheckErrors("Kernel CALL fail \n");
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
//printf ("Time for the filling kernel: %f ms\n", time);

cudaMemcpy(image, d_image, 1024*sizeof(unsigned char), cudaMemcpyDeviceToHost);
// copy from host to device
cudaMemcpy(d_image, image, size*sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaCheckErrors("Memory copying filled image fail \n");

cudaEventRecord(start, 0);
sum_simple <<< 1, 1024 >>> (d_image, d_result);
size_t threadsPerBlock = 1024;

// calculate for one color channel
int memSize = threadsPerBlock * 12; // use maximum count of shared memory
int bound = size / memSize / 3;
for (int i = 0; i < bound; ++i)
sum_simple <<< 1, threadsPerBlock, memSize*sizeof(int) >>> (d_image, d_result, i);
cudaCheckErrors("Kernel sum_reduce_simple CALL fail \n");
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
Expand All @@ -200,7 +208,7 @@ int main()
cudaFree(d_result);
cudaCheckErrors("cudaFree fail \n");

printf ("SUM is: %d\n",h_result[0]);
printf ("SUM is: %d\n",h_result[0]);

free(image);
return 0;
Expand Down
227 changes: 227 additions & 0 deletions cuda_second.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
%%cu

// BMP-related data types based on Microsoft's own

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// aliases for C/C++ primitive data types
// https://msdn.microsoft.com/en-us/library/cc230309.aspx
// Fixed-width aliases used by the BMP header structs in this file, so that
// every header field has the same width regardless of the platform's native
// integer sizes.
// 8-bit unsigned integer
typedef uint8_t BYTE;
// 32-bit unsigned integer
typedef uint32_t DWORD;
// 32-bit signed integer
typedef int32_t LONG;
// 16-bit unsigned integer
typedef uint16_t WORD;

// information about the type, size, and layout of a file
// https://msdn.microsoft.com/en-us/library/dd183374(v=vs.85).aspx
// loadImage() fills this struct with one fread() of sizeof(BITMAPFILEHEADER)
// bytes, so it is declared packed to keep the compiler from inserting padding
// between the fields.
typedef struct
{
// loadImage() rejects the file unless this equals 0x4d42
WORD bfType;
DWORD bfSize;
WORD bfReserved1;
WORD bfReserved2;
// loadImage() rejects the file unless this equals 54
DWORD bfOffBits;
} __attribute__((__packed__))
BITMAPFILEHEADER;

// information about the dimensions and color format
// https://msdn.microsoft.com/en-us/library/dd183376(v=vs.85).aspx
// loadImage() reads this struct directly after BITMAPFILEHEADER with one
// fread() of sizeof(BITMAPINFOHEADER) bytes; it is declared packed so no
// compiler padding is inserted between the fields.
typedef struct
{
// loadImage() rejects the file unless this equals 40
DWORD biSize;
// loadImage() uses this as the number of pixels per scanline
LONG biWidth;
// may be negative; loadImage() uses its absolute value as the scanline count
LONG biHeight;
WORD biPlanes;
// loadImage() rejects the file unless this equals 24
WORD biBitCount;
// loadImage() rejects the file unless this equals 0
DWORD biCompression;
DWORD biSizeImage;
LONG biXPelsPerMeter;
LONG biYPelsPerMeter;
DWORD biClrUsed;
DWORD biClrImportant;
} __attribute__((__packed__))
BITMAPINFOHEADER;

// relative intensities of red, green, and blue
// https://msdn.microsoft.com/en-us/library/dd162939(v=vs.85).aspx
// Three one-byte channels declared in the order blue, green, red and packed,
// so sizeof(RGBTRIPLE) is 3. loadImage() uses sizeof(RGBTRIPLE) to work out
// how many bytes one scanline occupies.
typedef struct
{
BYTE rgbtBlue;
BYTE rgbtGreen;
BYTE rgbtRed;
} __attribute__((__packed__))
RGBTRIPLE;


// Loads the pixel data of a 24-bit uncompressed BMP file into a heap buffer.
//
// fileNama: path of the BMP file to read.
// size:     output. Set to the number of bytes in the returned buffer, i.e.
//           |biHeight| * biWidth * sizeof(RGBTRIPLE); set to 0 on failure.
//
// Returns a malloc()ed buffer containing the scanlines back to back in file
// order with the per-row alignment padding removed (3 bytes per pixel), or
// NULL on any failure (a message is printed to stderr). The caller owns the
// buffer and must free() it.
unsigned char* loadImage(const char *fileNama, size_t &size)
{
    size = 0;

    // BMP is a binary format, so open in binary mode to avoid any
    // text-mode translation of the pixel bytes.
    FILE *inptr = fopen(fileNama, "rb");
    if (inptr == NULL)
    {
        fprintf(stderr, "Could not open %s.\n", fileNama);
        return NULL;
    }

    // read infile's BITMAPFILEHEADER and BITMAPINFOHEADER
    BITMAPFILEHEADER bf;
    BITMAPINFOHEADER bi;
    if (fread(&bf, sizeof(BITMAPFILEHEADER), 1, inptr) != 1 ||
        fread(&bi, sizeof(BITMAPINFOHEADER), 1, inptr) != 1)
    {
        fclose(inptr);
        fprintf(stderr, "Unsupported file format.\n");
        return NULL;
    }

    // ensure infile is (likely) a 24-bit uncompressed BMP 4.0 with sane dimensions
    if (bf.bfType != 0x4d42 || bf.bfOffBits != 54 || bi.biSize != 40 ||
        bi.biBitCount != 24 || bi.biCompression != 0 ||
        bi.biWidth <= 0 || bi.biHeight == 0)
    {
        fclose(inptr);
        fprintf(stderr, "Unsupported file format.\n");
        return NULL;
    }

    // Take the absolute height in 64 bits: abs() on INT32_MIN is undefined.
    int64_t signedRows = bi.biHeight;
    if (signedRows < 0)
        signedRows = -signedRows;
    const size_t rows = (size_t)signedRows;

    // Each pixel occupies sizeof(RGBTRIPLE) bytes; scanlines are padded in
    // the file to a multiple of 4 bytes.
    const size_t rowBytes = (size_t)bi.biWidth * sizeof(RGBTRIPLE);
    const long padding = (long)((4 - rowBytes % 4) % 4);

    // Refuse dimensions whose byte count does not fit in size_t.
    if (rowBytes > ((size_t)-1) / rows)
    {
        fclose(inptr);
        fprintf(stderr, "Unsupported file format.\n");
        return NULL;
    }

    const size_t totalBytes = rows * rowBytes;
    unsigned char* data = (unsigned char*)malloc(totalBytes);
    if (data == NULL)
    {
        fclose(inptr);
        fprintf(stderr, "Could not allocate %zu bytes for %s.\n", totalBytes, fileNama);
        return NULL;
    }

    // iterate over infile's scanlines; every row gets its own slot in the buffer
    for (size_t row = 0; row < rows; row++)
    {
        if (fread(data + row * rowBytes, sizeof(unsigned char), rowBytes, inptr) != rowBytes)
        {
            free(data);
            fclose(inptr);
            fprintf(stderr, "Unexpected end of pixel data in %s.\n", fileNama);
            return NULL;
        }
        // skip over padding, if any
        if (padding != 0)
            fseek(inptr, padding, SEEK_CUR);
    }

    fclose(inptr);
    size = totalBytes;
    return data;
}

// Host reference: adds up every third byte of `image` (offsets 0, 3, 6, ...
// below `size`), i.e. a single colour channel of 3-byte pixels, and prints
// the elapsed CPU time of the loop in milliseconds to stdout.
//
// image: buffer to read; must hold at least `size` bytes.
// size:  number of bytes in `image`.
//
// Returns the accumulated total as an int.
int sum(const unsigned char* image, size_t size)
{
    const clock_t began = clock();

    int total = 0;
    size_t offset = 0;
    // count only one color channel
    while (offset < size)
    {
        total += image[offset];
        offset += 3;
    }

    const clock_t finished = clock();

    // clock ticks -> milliseconds
    const double elapsed_ms = ((double)(finished - began) / CLOCKS_PER_SEC) * 1000;
    printf("Sum time, ms\t %.4f \t\n", elapsed_ms);
    return total;
}


// Fetches (and thereby clears) the most recent CUDA runtime error via
// cudaGetLastError(). If it is anything other than cudaSuccess, prints `msg`,
// the CUDA error string and the source location to stderr and terminates the
// process with exit code 1. Wrapped in do { } while (0) so it behaves as a
// single statement at the call site.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t lastCudaStatus = cudaGetLastError(); \
        if (cudaSuccess != lastCudaStatus) { \
            fprintf(stderr, \
                    "Fatal error at runtime: %s (%s at %s:%d)\n", \
                    msg, \
                    cudaGetErrorString(lastCudaStatus), \
                    __FILE__, \
                    __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


// Per-block sum of one colour channel of an interleaved 3-bytes-per-pixel image.
//
// Launch layout: 1-D grid of 1-D blocks.
// Dynamic shared memory required: blockDim.x * sizeof(int).
//
// g_ivec: device pointer to the pixel bytes. Each thread reads the byte at
//         offset 3 * pixel, i.e. the first channel of its pixel.
// g_ovec: device pointer to the partial sums, one int per block.
// index:  number of the first block-sized run of pixels handled by this
//         launch. Block b sums pixels
//           [(index + b) * blockDim.x, (index + b + 1) * blockDim.x)
//         and stores the result in g_ovec[index + b].
//
// Preconditions (not checked, the kernel receives no lengths):
//   g_ivec holds at least 3 * (index + gridDim.x) * blockDim.x bytes and
//   g_ovec holds at least index + gridDim.x ints.
__global__ void sum_simple(unsigned char *g_ivec, int *g_ovec, int index)
{
    extern __shared__ int sdata[];

    const unsigned int tid = threadIdx.x;

    // Use the actual block size instead of a hard-coded 1024, and do the
    // index arithmetic in size_t so large images cannot wrap 32-bit integers.
    const size_t chunk = (size_t)index + blockIdx.x;
    const size_t pixel = chunk * blockDim.x + tid;

    // each thread loads one element from global to shared memory
    sdata[tid] = g_ivec[pixel * 3];
    __syncthreads();

    // do the reduction in shared memory (interleaved addressing)
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        // The second condition keeps the partner inside the block when
        // blockDim.x is not a power of two.
        if (tid % (2 * s) == 0 && tid + s < blockDim.x) {
            sdata[tid] += sdata[tid + s];
        }
        // reached by every thread of the block on every iteration
        __syncthreads();
    }

    // write the result for this block to global memory
    if (tid == 0)
        g_ovec[chunk] = sdata[0];
}

// Loads a BMP image, sums its first colour channel on the host (reference)
// and then on the GPU, printing both results and the timings.
//
// GPU part: every full run of `threadsPerBlock` samples is reduced by one
// block of sum_simple in a single launch; the per-block partial sums are
// copied back and added on the host. Samples beyond the last full block are
// added on the host as well, so the GPU total covers the same samples as
// sum().
//
// Returns 0 on success, 1 if the image could not be loaded or a host
// allocation failed. CUDA failures terminate the process through
// cudaCheckErrors.
int main()
{
    const char *file = "/content/drive/My Drive/111.bmp";
    size_t size = 0;
    unsigned char* image = loadImage(file, size);
    if (image == NULL)
        return 1;

    printf("Image size: %zu\n", size);
    printf("Pixel sum: %d\n", sum(image, size));

    /*-----------------------------------------*/

    const size_t threadsPerBlock = 1024;
    // divided by 3 means one color channel: one partial sum per full block
    const size_t resultSize = size / threadsPerBlock / 3;
    // number of samples sum() visits (offsets 0, 3, 6, ... below size)
    const size_t samples = (size + 2) / 3;

    int total = 0;

    // A grid of zero blocks is not a valid launch, so the GPU path only runs
    // when there is at least one full block of samples.
    if (resultSize > 0)
    {
        //ALLOCATE HOST MEM
        int *h_result = (int *) malloc(sizeof(int) * resultSize);
        if (h_result == NULL)
        {
            fprintf(stderr, "Could not allocate host result buffer.\n");
            free(image);
            return 1;
        }

        cudaEvent_t start, stop;
        float elapsedMs = 0.0f;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaCheckErrors("cudaEventCreate fail \n");

        //ALLOCATE MEM
        int *d_result = NULL;
        unsigned char *d_image = NULL;
        cudaMalloc(&d_image, size);
        cudaMalloc(&d_result, sizeof(int) * resultSize);
        cudaCheckErrors("cudaMalloc fail \n");

        // copy from host to device
        cudaMemcpy(d_image, image, size*sizeof(unsigned char), cudaMemcpyHostToDevice);
        cudaCheckErrors("Memory copying filled image fail \n");

        cudaEventRecord(start, 0);

        // One launch: block b reduces samples [b*1024, (b+1)*1024) and writes
        // d_result[b], so every element of d_result is written exactly once.
        sum_simple <<< (unsigned int)resultSize, (unsigned int)threadsPerBlock, threadsPerBlock*sizeof(int) >>> (d_image, d_result, 0);
        cudaCheckErrors("Kernel sum_reduce_simple CALL fail \n");
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsedMs, start, stop);
        printf ("Time for the sum_reduce_simple kernel: %f ms\n", elapsedMs);

        cudaMemcpy(h_result, d_result, sizeof(int) * resultSize, cudaMemcpyDeviceToHost);
        cudaCheckErrors("Memory copying result fail \n");

        // add up the per-block partial sums
        for (size_t block = 0; block < resultSize; ++block)
            total += h_result[block];

        //FREE MEM
        cudaFree(d_image);
        cudaFree(d_result);
        cudaCheckErrors("cudaFree fail \n");

        cudaEventDestroy(start);
        cudaEventDestroy(stop);

        free(h_result);
    }

    // Samples that do not fill a whole block are not seen by the kernel;
    // add them here so the result matches the host reference.
    for (size_t sample = resultSize * threadsPerBlock; sample < samples; ++sample)
        total += image[sample * 3];

    printf ("SUM is: %d\n",total);

    free(image);
    return 0;
}

0 comments on commit d8ec52c

Please sign in to comment.