-
Notifications
You must be signed in to change notification settings - Fork 0
/
2dConvolution.cu
106 lines (84 loc) · 3.54 KB
/
2dConvolution.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define TILE_WIDTH 16
#define Mask_width 5
#define Mask_radius (Mask_width / 2)
#define w (TILE_WIDTH + Mask_width - 1)
__global__ void convolution_tiled(float *I, const float *M, float *P, int channels, int width, int height) {
__shared__ float N_ds[w][w];
for (int k = 0; k < channels; k++) {
// First phase: Main tile loading
int dest = threadIdx.y * TILE_WIDTH + threadIdx.x;// tile index
int tileY = dest / w; //row in shared Memory
int tileX = dest % w; //col in shared Memory
int row = blockIdx.y * TILE_WIDTH + tileY - Mask_radius; // index of pixel in input image
int col = blockIdx.x * TILE_WIDTH + tileX - Mask_radius; // index of pixel in input image
if (row >= 0 && row < height && col >= 0 && col < width) {
N_ds[tileY][tileX] = I[(row * width + col) * channels + k];
} else {
N_ds[tileY][tileX] = 0.0f;
}
// Second phase: Loading edge pixels or halo regions
dest = threadIdx.y * TILE_WIDTH + threadIdx.x + TILE_WIDTH * TILE_WIDTH;
tileY = dest / w;
tileX = dest % w;
row = blockIdx.y * TILE_WIDTH + tileY - Mask_radius;
col = blockIdx.x * TILE_WIDTH + tileX - Mask_radius;
if (tileY < w) {
if (row >= 0 && row < height && col >= 0 && col < width) {
N_ds[tileY][tileX] = I[(row * width + col) * channels + k];
} else {
N_ds[tileY][tileX] = 0.0f;
}
}
__syncthreads();
// Convolution: Multiply and accumulate
float accum = 0.0f;
for (int y = 0; y < Mask_width; y++) {
for (int x = 0; x < Mask_width; x++) {
accum += N_ds[threadIdx.y + y][threadIdx.x + x] * devMask[y * Mask_width + x];
}
}
// Store result to output
int y = blockIdx.y * TILE_WIDTH + threadIdx.y;
int x = blockIdx.x * TILE_WIDTH + threadIdx.x;
if (y < height && x < width) {
P[(y * width + x) * channels + k] = accum;
}
__syncthreads();
}
}
__constant__ float devMask[Mask_width * Mask_width];
int main() {
// Input dimensions
int channels = 3, width = 1024, height = 1024;
// Allocate host memory
float *h_I = (float *)malloc(width * height * channels * sizeof(float));
float *h_M = (float *)malloc(Mask_width * Mask_width * sizeof(float));
float *h_P = (float *)malloc(width * height * channels * sizeof(float));
// Initialize data
// (Fill h_I and h_M with values)
// Allocate device memory
float *d_I, *d_P;
cudaMalloc((void **)&d_I, width * height * channels * sizeof(float));
cudaMalloc((void **)&d_P, width * height * channels * sizeof(float));
// Copy data to device
cudaMemcpy(d_I, h_I, width * height * channels * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(devMask, h_M, Mask_width * Mask_width * sizeof(float));
// Kernel launch configuration
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
dim3 dimGrid((width + TILE_WIDTH - 1) / TILE_WIDTH, (height + TILE_WIDTH - 1) / TILE_WIDTH, 1);
// Launch kernel
convolution_tiled<<<dimGrid, dimBlock>>>(d_I, devMask, d_P, channels, width, height);
// Copy result back to host
cudaMemcpy(h_P, d_P, width * height * channels * sizeof(float), cudaMemcpyDeviceToHost);
// Clean up
free(h_I);
free(h_M);
free(h_P);
cudaFree(d_I);
cudaFree(d_P);
return 0;
}