forked from jwetzl/MAPSuperresolution
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
133 changed files
with
3,598 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
cmake_minimum_required(VERSION 2.8) | ||
project(Superresolution) | ||
|
||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}") | ||
|
||
find_package(CUDA REQUIRED) | ||
find_package(FreeImage REQUIRED) | ||
find_package(CudaLBFGS REQUIRED) | ||
|
||
include(CheckComputeCapability.cmake) | ||
|
||
include_directories(${CUDALBFGS_INCLUDE_DIRS}) | ||
include_directories(${FreeImage_INCLUDE_DIRS}) | ||
|
||
# --- | ||
|
||
option(SUPERRES_ERROR_CHECKING "Enable CUDA error checking for the superres project.") | ||
if (SUPERRES_ERROR_CHECKING) | ||
add_definitions(-DSUPERRES_ERROR_CHECKING) | ||
endif() | ||
|
||
option(SUPERRES_TIMING "Enable timing for the superres project.") | ||
if (SUPERRES_TIMING) | ||
add_definitions(-DSUPERRES_TIMING) | ||
endif() | ||
|
||
option(SUPERRES_STORE_TRANSPOSE "Store transpose of system matrix for faster A^Tx multiplication." ON) | ||
|
||
if (SUPERRES_STORE_TRANSPOSE) | ||
add_definitions(-DSUPERRES_STORE_TRANSPOSE) | ||
endif() | ||
|
||
# --- | ||
|
||
file(GLOB CU_FILES *.cu) | ||
file(GLOB H_FILES *.h) | ||
file(GLOB CPP_FILES *.cpp) | ||
|
||
source_group("CUDA Sources" FILES ${CU_FILES}) | ||
source_group("CPP Sources" FILES ${CPP_FILES}) | ||
source_group("Headers" FILES ${H_FILES}) | ||
|
||
if (NOT DEFINED CUDA_cusparse_LIBRARY OR NOT ${CUDA_cusparse_LIBRARY}) | ||
find_library(CUDA_cusparse_LIBRARY NAMES cusparse HINTS /usr/local/cuda/lib) | ||
endif() | ||
|
||
cuda_add_executable(superres ${CU_FILES} ${CPP_FILES} ${H_FILES}) | ||
target_link_libraries(superres | ||
${CUDALBFGS_LIBRARIES} ${FreeImage_LIBRARY} | ||
${CUDA_cusparse_LIBRARY}) | ||
cuda_add_cublas_to_target(superres) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
############################# | ||
# Check for GPUs present and their compute capability | ||
# based on http://stackoverflow.com/questions/2285185/easiest-way-to-test-for-existence-of-cuda-capable-gpu-from-cmake/2297877#2297877 (Christopher Bruns) | ||
if(CUDA_FOUND) | ||
message(STATUS "${CMAKE_SOURCE_DIR}/cuda_compute_capability.c") | ||
try_run(RUN_RESULT_VAR COMPILE_RESULT_VAR | ||
${CMAKE_BINARY_DIR} | ||
${CMAKE_SOURCE_DIR}/cuda_compute_capability.c | ||
CMAKE_FLAGS | ||
-DINCLUDE_DIRECTORIES:STRING=${CUDA_TOOLKIT_INCLUDE} | ||
-DLINK_LIBRARIES:STRING=${CUDA_CUDART_LIBRARY} | ||
COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR | ||
RUN_OUTPUT_VARIABLE RUN_OUTPUT_VAR) | ||
message(STATUS "Compile: ${RUN_OUTPUT_VAR}") | ||
if (COMPILE_RESULT_VAR) | ||
message(STATUS "compiled -> " ${RUN_RESULT_VAR}) | ||
else() | ||
message(STATUS "didn't compile") | ||
endif() | ||
# COMPILE_RESULT_VAR is TRUE when compile succeeds | ||
# RUN_RESULT_VAR is zero when a GPU is found | ||
if(COMPILE_RESULT_VAR AND NOT RUN_RESULT_VAR) | ||
message(STATUS "worked") | ||
set(CUDA_HAVE_GPU TRUE CACHE BOOL "Whether CUDA-capable GPU is present") | ||
set(CUDA_COMPUTE_CAPABILITY ${RUN_OUTPUT_VAR} CACHE STRING "Compute capability of CUDA-capable GPU present") | ||
set(CUDA_GENERATE_CODE "arch=compute_${CUDA_COMPUTE_CAPABILITY},code=sm_${CUDA_COMPUTE_CAPABILITY}" CACHE STRING "Which GPU architectures to generate code for (each arch/code pair will be passed as --generate-code option to nvcc, separate multiple pairs by ;)") | ||
mark_as_advanced(CUDA_COMPUTE_CAPABILITY CUDA_GENERATE_CODE) | ||
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch compute_${CUDA_COMPUTE_CAPABILITY}) | ||
else() | ||
message(STATUS "didn't work") | ||
set(CUDA_HAVE_GPU FALSE CACHE BOOL "Whether CUDA-capable GPU is present") | ||
endif() | ||
endif() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
find_path(FreeImage_INCLUDE_DIR FreeImage.h HINTS /usr/include /usr/local/include /opt/local/include ${FreeImage_DIR}/include) | ||
find_library(FreeImage_LIBRARY NAMES freeimage HINTS /usr/lib /usr/local/lib /opt/local/lib ${FreeImage_DIR}/lib) | ||
|
||
set(FreeImage_INCLUDE_DIRS ${FreeImage_INCLUDE_DIR}) | ||
set(FreeImage_LIBRARIES ${FreeImage_LIBRARY}) | ||
|
||
include(FindPackageHandleStandardArgs) | ||
find_package_handle_standard_args(FreeImage DEFAULT_MSG FreeImage_INCLUDE_DIR FreeImage_LIBRARY) | ||
|
||
mark_as_advanced(FreeImage_INCLUDE_DIR FreeImage_LIBRARY) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/** | ||
* ___ _ _ ___ _ __ __ _ ___ | ||
* / __| | | | \ /_\ | \/ | /_\ | _ \ | ||
* | (__| |_| | |) / _ \ | |\/| |/ _ \| _/ | ||
* \___|\___/|___/_/_\_\_|_|__|_/_/_\_\_|_ ___ | ||
* / __| | | | _ \ __| _ \___| _ \ __/ __| | ||
* \__ \ |_| | _/ _|| /___| / _|\__ \ | ||
* |___/\___/|_| |___|_|_\ |_|_\___|___/ | ||
* 2012 | ||
* | ||
* by Jens Wetzl ([email protected]) | ||
* and Oliver Taubmann ([email protected]) | ||
* | ||
* This work is licensed under a Creative Commons | ||
* Attribution 3.0 Unported License. (CC-BY) | ||
* http://creativecommons.org/licenses/by/3.0/ | ||
* | ||
**/ | ||
|
||
#ifndef GPU_HANDLES_H | ||
#define GPU_HANDLES_H | ||
|
||
#include "cudalbfgs_error_checking.h" | ||
|
||
// A small wrapper for cuBLAS and cuSPARSE states with | ||
// automatic initialization and cleanup. | ||
struct GPUHandles | ||
{ | ||
cublasHandle_t cublasHandle; | ||
cusparseHandle_t cusparseHandle; | ||
cusparseMatDescr_t cusparseDescriptor; | ||
|
||
GPUHandles() | ||
{ | ||
CublasSafeCall ( cublasCreate (&cublasHandle) ); | ||
CusparseSafeCall( cusparseCreate (&cusparseHandle) ); | ||
CusparseSafeCall( cusparseCreateMatDescr(&cusparseDescriptor) ); | ||
|
||
CusparseSafeCall( cusparseSetMatType (cusparseDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL) ); | ||
CusparseSafeCall( cusparseSetMatIndexBase(cusparseDescriptor, CUSPARSE_INDEX_BASE_ZERO) ); | ||
} | ||
|
||
~GPUHandles() | ||
{ | ||
CublasSafeCall ( cublasDestroy (cublasHandle) ); | ||
CusparseSafeCall( cusparseDestroy (cusparseHandle) ); | ||
CusparseSafeCall( cusparseDestroyMatDescr(cusparseDescriptor) ); | ||
} | ||
}; | ||
|
||
#endif // GPU_HANDLES_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
/** | ||
* ___ _ _ ___ _ __ __ _ ___ | ||
* / __| | | | \ /_\ | \/ | /_\ | _ \ | ||
* | (__| |_| | |) / _ \ | |\/| |/ _ \| _/ | ||
* \___|\___/|___/_/_\_\_|_|__|_/_/_\_\_|_ ___ | ||
* / __| | | | _ \ __| _ \___| _ \ __/ __| | ||
* \__ \ |_| | _/ _|| /___| / _|\__ \ | ||
* |___/\___/|_| |___|_|_\ |_|_\___|___/ | ||
* 2012 | ||
* | ||
* by Jens Wetzl ([email protected]) | ||
* and Oliver Taubmann ([email protected]) | ||
* | ||
* This work is licensed under a Creative Commons | ||
* Attribution 3.0 Unported License. (CC-BY) | ||
* http://creativecommons.org/licenses/by/3.0/ | ||
* | ||
**/ | ||
|
||
#include "HuberLaplacian.h" | ||
#include "Reduction.h" | ||
|
||
#include "CudaLBFGS/error_checking.h" | ||
|
||
namespace gpu_HuberLaplacian | ||
{ | ||
__global__ void laplacian(float *dst, const float *src, const size_t width, const size_t height, | ||
const size_t pixelsPerThread); | ||
__global__ void huber(float *x, const size_t width, const size_t height, const float alpha, | ||
const float strength, const size_t pixelsPerThread, float *f); | ||
|
||
float *d_tmp; | ||
} | ||
|
||
HuberLaplacian::HuberLaplacian(const size_t height, const size_t width, | ||
const float alpha, const float strength) | ||
: cost_function(height * width) | ||
, m_height(height) | ||
, m_width(width) | ||
, m_alpha(alpha) | ||
, m_strength(strength) | ||
{ | ||
CudaSafeCall( cudaMalloc(&gpu_HuberLaplacian::d_tmp, m_numDimensions * sizeof(float)) ); | ||
CudaSafeCall( cudaMalloc( (void**) &m_reductionArray, width * height * sizeof(float)) ); | ||
CudaSafeCall( cudaMalloc( (void**) &m_reductionArray2, 1024 * sizeof(float)) ); | ||
|
||
#ifdef SUPERRES_TIMING | ||
m_atomic = new timer("priorOther"); | ||
m_filter = new timer("priorFilter"); | ||
#endif | ||
} | ||
|
||
HuberLaplacian::~HuberLaplacian() | ||
{ | ||
CudaSafeCall( cudaFree(gpu_HuberLaplacian::d_tmp) ); | ||
CudaSafeCall( cudaFree(m_reductionArray) ); | ||
CudaSafeCall( cudaFree(m_reductionArray2) ); | ||
|
||
#ifdef SUPERRES_TIMING | ||
m_atomic->saveMeasurement(); | ||
m_filter->saveMeasurement(); | ||
|
||
delete m_atomic; | ||
delete m_filter; | ||
#endif | ||
} | ||
|
||
void HuberLaplacian::f_gradf(const float *d_x, float *d_f, float *d_gradf) | ||
{ | ||
using namespace gpu_HuberLaplacian; | ||
|
||
dim3 blockDim(512); | ||
|
||
const size_t pixelsPerThread = 8; | ||
size_t threadsPerColumn = (m_height % pixelsPerThread == 0) ? (m_height / pixelsPerThread) | ||
: (m_height / pixelsPerThread) + 1; | ||
size_t threads = threadsPerColumn * m_width; | ||
|
||
dim3 gridDim = (threads % blockDim.x == 0) ? (threads / blockDim.x) | ||
: (threads / blockDim.x) + 1; | ||
|
||
#ifdef SUPERRES_TIMING | ||
m_filter->start(); | ||
#endif | ||
|
||
// Compute image Laplacian | ||
laplacian<<<gridDim, blockDim>>>(d_tmp, d_x, m_width, m_height, pixelsPerThread); | ||
|
||
CudaCheckError(); | ||
cudaDeviceSynchronize(); | ||
|
||
#ifdef SUPERRES_TIMING | ||
m_filter->stop(); | ||
m_atomic->start(); | ||
#endif | ||
|
||
CudaSafeCall( cudaMemset(m_reductionArray, 0, m_width * m_height * sizeof(float)) ); | ||
|
||
// Compute prior function value and gradient without final filtering | ||
huber<<<gridDim, blockDim>>>(d_tmp, m_width, m_height, m_alpha, m_strength, pixelsPerThread, m_reductionArray); | ||
|
||
CudaCheckError(); | ||
cudaDeviceSynchronize(); | ||
|
||
Reduction::sumReduction(m_reductionArray, m_width, m_height, m_width, d_f, m_reductionArray2); | ||
|
||
#ifdef SUPERRES_TIMING | ||
m_atomic->stop(); | ||
m_filter->start(); | ||
#endif | ||
|
||
// Compute Laplacian of the gradient | ||
laplacian<<<gridDim, blockDim>>>(d_gradf, d_tmp, m_width, m_height, pixelsPerThread); | ||
|
||
CudaCheckError(); | ||
cudaDeviceSynchronize(); | ||
|
||
#ifdef SUPERRES_TIMING | ||
m_filter->stop(); | ||
#endif | ||
} | ||
|
||
|
||
namespace gpu_HuberLaplacian | ||
{ | ||
|
||
__global__ void laplacian(float *dst, const float *src, const size_t width, const size_t height, | ||
const size_t pixelsPerThread) | ||
{ | ||
const size_t col = (blockIdx.x * blockDim.x + threadIdx.x) % width; | ||
const size_t crow = (blockIdx.x * blockDim.x + threadIdx.x) / width * pixelsPerThread; | ||
|
||
if (col >= width || crow >= height) | ||
return; | ||
|
||
const size_t srow = crow + 1; | ||
const size_t erow = min((unsigned int)(crow + pixelsPerThread - 1), (unsigned int)(height - 1)); | ||
|
||
// First element | ||
|
||
const size_t firstIdx = crow * width + col; | ||
|
||
dst[firstIdx] = src[firstIdx]; | ||
|
||
if (crow + 1 < height) dst[firstIdx] -= 0.25f * src[firstIdx + width]; // S | ||
if (crow >= 1) dst[firstIdx] -= 0.25f * src[firstIdx - width]; // N | ||
if (col + 1 < width) dst[firstIdx] -= 0.25f * src[firstIdx + 1]; // E | ||
if (col >= 1) dst[firstIdx] -= 0.25f * src[firstIdx - 1]; // W | ||
|
||
// Inner elements | ||
|
||
for (int row = srow; row < erow; ++row) | ||
{ | ||
const size_t cIdx = row * width + col; | ||
|
||
// C, S, N (always exist) | ||
dst[cIdx] = src[cIdx] - 0.25f * (src[cIdx + width] + src[cIdx - width]); | ||
|
||
if (col + 1 < width) dst[cIdx] -= 0.25f * src[cIdx + 1]; // E | ||
if (col >= 1) dst[cIdx] -= 0.25f * src[cIdx - 1]; // W | ||
} | ||
|
||
if (erow <= crow) | ||
return; | ||
|
||
// Last element | ||
|
||
const size_t lastIdx = erow * width + col; | ||
|
||
dst[lastIdx] = src[lastIdx] - 0.25f * src[lastIdx - width]; // C, N | ||
|
||
if (erow + 1 < height) dst[lastIdx] -= 0.25f * src[lastIdx + width]; // S | ||
if (col + 1 < width) dst[lastIdx] -= 0.25f * src[lastIdx + 1]; // E | ||
if (col >= 1) dst[lastIdx] -= 0.25f * src[lastIdx - 1]; // W | ||
} | ||
|
||
__global__ void huber(float *a, const size_t width, const size_t height, const float alpha, | ||
const float strength, const size_t pixelsPerThread, float *f) | ||
{ | ||
const size_t col = (blockIdx.x * blockDim.x + threadIdx.x) % width; | ||
const size_t crow = (blockIdx.x * blockDim.x + threadIdx.x) / width * pixelsPerThread; | ||
|
||
if (col >= width || crow >= height) | ||
return; | ||
|
||
const size_t erow = min((unsigned int)(crow + pixelsPerThread), (unsigned int)height); | ||
|
||
const float alpha2 = alpha * alpha; | ||
|
||
float colF = 0.0f; | ||
|
||
for (size_t row = crow; row < erow; ++row) | ||
{ | ||
const size_t idx = row * width + col; | ||
|
||
// Pseudo-Huber loss function | ||
const float root = sqrtf(1.0f + a[idx]*a[idx] / alpha2); | ||
colF += alpha2 * (root - 1.0f); | ||
a[idx] *= strength / root; | ||
} | ||
|
||
colF *= strength; | ||
f[blockIdx.x * blockDim.x + threadIdx.x] = colF; | ||
} | ||
|
||
} |
Oops, something went wrong.