MAPSuperresolution
jwetzl committed Jan 2, 2013
1 parent 53d7686 commit f1f9382
Showing 133 changed files with 3,598 additions and 4 deletions.
51 changes: 51 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,51 @@
cmake_minimum_required(VERSION 2.8)
project(Superresolution)

set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}")

find_package(CUDA REQUIRED)
find_package(FreeImage REQUIRED)
find_package(CudaLBFGS REQUIRED)

include(CheckComputeCapability.cmake)

include_directories(${CUDALBFGS_INCLUDE_DIRS})
include_directories(${FreeImage_INCLUDE_DIRS})

# ---

option(SUPERRES_ERROR_CHECKING "Enable CUDA error checking for the superres project.")
if (SUPERRES_ERROR_CHECKING)
add_definitions(-DSUPERRES_ERROR_CHECKING)
endif()

option(SUPERRES_TIMING "Enable timing for the superres project.")
if (SUPERRES_TIMING)
add_definitions(-DSUPERRES_TIMING)
endif()

option(SUPERRES_STORE_TRANSPOSE "Store transpose of system matrix for faster A^Tx multiplication." ON)

if (SUPERRES_STORE_TRANSPOSE)
add_definitions(-DSUPERRES_STORE_TRANSPOSE)
endif()

# ---

file(GLOB CU_FILES *.cu)
file(GLOB H_FILES *.h)
file(GLOB CPP_FILES *.cpp)

source_group("CUDA Sources" FILES ${CU_FILES})
source_group("CPP Sources" FILES ${CPP_FILES})
source_group("Headers" FILES ${H_FILES})

if (NOT CUDA_cusparse_LIBRARY)
find_library(CUDA_cusparse_LIBRARY NAMES cusparse HINTS /usr/local/cuda/lib)
endif()

cuda_add_executable(superres ${CU_FILES} ${CPP_FILES} ${H_FILES})
target_link_libraries(superres
${CUDALBFGS_LIBRARIES} ${FreeImage_LIBRARY}
${CUDA_cusparse_LIBRARY})
cuda_add_cublas_to_target(superres)
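
The options above only inject preprocessor definitions; the sources then guard optional code with #ifdef, as HuberLaplacian.cu does further down for SUPERRES_TIMING. A small sketch of that pattern follows; the include path and the "exampleSection" label are assumptions for illustration, not part of this commit.

// Sketch (not from this commit): how a -DSUPERRES_TIMING definition is consumed.
#include "CudaLBFGS/timer.h"      // assumed location of the 'timer' helper used below

void timedSection()
{
#ifdef SUPERRES_TIMING
	timer t("exampleSection");    // illustrative measurement label
	t.start();
#endif

	// ... kernel launches and cuBLAS/cuSPARSE calls ...

#ifdef SUPERRES_TIMING
	t.stop();
	t.saveMeasurement();          // persist the timing, as the destructors below do
#endif
}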
33 changes: 33 additions & 0 deletions CheckComputeCapability.cmake
@@ -0,0 +1,33 @@
#############################
# Check for GPUs present and their compute capability
# based on http://stackoverflow.com/questions/2285185/easiest-way-to-test-for-existence-of-cuda-capable-gpu-from-cmake/2297877#2297877 (Christopher Bruns)
if(CUDA_FOUND)
message(STATUS "${CMAKE_SOURCE_DIR}/cuda_compute_capability.c")
try_run(RUN_RESULT_VAR COMPILE_RESULT_VAR
${CMAKE_BINARY_DIR}
${CMAKE_SOURCE_DIR}/cuda_compute_capability.c
CMAKE_FLAGS
-DINCLUDE_DIRECTORIES:STRING=${CUDA_TOOLKIT_INCLUDE}
-DLINK_LIBRARIES:STRING=${CUDA_CUDART_LIBRARY}
COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR
RUN_OUTPUT_VARIABLE RUN_OUTPUT_VAR)
message(STATUS "Compile: ${RUN_OUTPUT_VAR}")
if (COMPILE_RESULT_VAR)
message(STATUS "compiled -> " ${RUN_RESULT_VAR})
else()
message(STATUS "didn't compile")
endif()
# COMPILE_RESULT_VAR is TRUE when compile succeeds
# RUN_RESULT_VAR is zero when a GPU is found
if(COMPILE_RESULT_VAR AND NOT RUN_RESULT_VAR)
message(STATUS "worked")
set(CUDA_HAVE_GPU TRUE CACHE BOOL "Whether CUDA-capable GPU is present")
set(CUDA_COMPUTE_CAPABILITY ${RUN_OUTPUT_VAR} CACHE STRING "Compute capability of CUDA-capable GPU present")
set(CUDA_GENERATE_CODE "arch=compute_${CUDA_COMPUTE_CAPABILITY},code=sm_${CUDA_COMPUTE_CAPABILITY}" CACHE STRING "Which GPU architectures to generate code for (each arch/code pair will be passed as --generate-code option to nvcc, separate multiple pairs by ;)")
mark_as_advanced(CUDA_COMPUTE_CAPABILITY CUDA_GENERATE_CODE)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch compute_${CUDA_COMPUTE_CAPABILITY})
else()
message(STATUS "didn't work")
set(CUDA_HAVE_GPU FALSE CACHE BOOL "Whether CUDA-capable GPU is present")
endif()
endif()
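
The try_run above compiles and executes cuda_compute_capability.c, which is not among the files shown in this excerpt. Following the Stack Overflow recipe cited in the header, such a probe is only a few CUDA runtime API calls; the sketch below is an assumption about its shape, not the file from this commit. It prints the capability digits that land in RUN_OUTPUT_VAR and returns 0 only when a usable GPU is found, which is what RUN_RESULT_VAR checks.

/* Sketch of a compute-capability probe (not the actual cuda_compute_capability.c). */
#include <stdio.h>
#include <cuda_runtime_api.h>

int main()
{
	int count = 0;

	/* Non-zero exit code means "no usable GPU" to the CMake script. */
	if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0)
		return 1;

	struct cudaDeviceProp prop;
	if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess)
		return 1;

	/* e.g. "20" for compute capability 2.0 -> CUDA_COMPUTE_CAPABILITY */
	printf("%d%d", prop.major, prop.minor);
	return 0;
}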
10 changes: 10 additions & 0 deletions FindFreeImage.cmake
@@ -0,0 +1,10 @@
find_path(FreeImage_INCLUDE_DIR FreeImage.h HINTS /usr/include /usr/local/include /opt/local/include ${FreeImage_DIR}/include)
find_library(FreeImage_LIBRARY NAMES freeimage HINTS /usr/lib /usr/local/lib /opt/local/lib ${FreeImage_DIR}/lib)

set(FreeImage_INCLUDE_DIRS ${FreeImage_INCLUDE_DIR})
set(FreeImage_LIBRARIES ${FreeImage_LIBRARY})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(FreeImage DEFAULT_MSG FreeImage_INCLUDE_DIR FreeImage_LIBRARY)

mark_as_advanced(FreeImage_INCLUDE_DIR FreeImage_LIBRARY)
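
This module only locates the FreeImage headers and library that the project links for image I/O. As a purely hypothetical illustration of the API being searched for: the function names below are real FreeImage calls, but the helper, its error handling, and the normalisation are invented and not taken from this commit.

// Hypothetical example: load an image as 8-bit greyscale and copy it into a float buffer.
#include <FreeImage.h>
#include <vector>

std::vector<float> loadGrey(const char *path, unsigned &width, unsigned &height)
{
	// Error checks omitted for brevity; FreeImage stores scanlines bottom-up,
	// which is ignored here.
	FREE_IMAGE_FORMAT fif = FreeImage_GetFileType(path, 0);
	FIBITMAP *img  = FreeImage_Load(fif, path, 0);
	FIBITMAP *grey = FreeImage_ConvertToGreyscale(img);

	width  = FreeImage_GetWidth(grey);
	height = FreeImage_GetHeight(grey);

	std::vector<float> pixels(width * height);
	for (unsigned y = 0; y < height; ++y)
	{
		BYTE *line = FreeImage_GetScanLine(grey, y);
		for (unsigned x = 0; x < width; ++x)
			pixels[y * width + x] = line[x] / 255.0f;
	}

	FreeImage_Unload(grey);
	FreeImage_Unload(img);
	return pixels;
}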
51 changes: 51 additions & 0 deletions GPUHandles.h
@@ -0,0 +1,51 @@
/**
* ___ _ _ ___ _ __ __ _ ___
* / __| | | | \ /_\ | \/ | /_\ | _ \
* | (__| |_| | |) / _ \ | |\/| |/ _ \| _/
* \___|\___/|___/_/_\_\_|_|__|_/_/_\_\_|_ ___
* / __| | | | _ \ __| _ \___| _ \ __/ __|
* \__ \ |_| | _/ _|| /___| / _|\__ \
* |___/\___/|_| |___|_|_\ |_|_\___|___/
* 2012
*
* by Jens Wetzl ([email protected])
* and Oliver Taubmann ([email protected])
*
* This work is licensed under a Creative Commons
* Attribution 3.0 Unported License. (CC-BY)
* http://creativecommons.org/licenses/by/3.0/
*
**/

#ifndef GPU_HANDLES_H
#define GPU_HANDLES_H

#include "cudalbfgs_error_checking.h"

// A small wrapper for cuBLAS and cuSPARSE states with
// automatic initialization and cleanup.
struct GPUHandles
{
cublasHandle_t cublasHandle;
cusparseHandle_t cusparseHandle;
cusparseMatDescr_t cusparseDescriptor;

GPUHandles()
{
CublasSafeCall ( cublasCreate (&cublasHandle) );
CusparseSafeCall( cusparseCreate (&cusparseHandle) );
CusparseSafeCall( cusparseCreateMatDescr(&cusparseDescriptor) );

CusparseSafeCall( cusparseSetMatType (cusparseDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL) );
CusparseSafeCall( cusparseSetMatIndexBase(cusparseDescriptor, CUSPARSE_INDEX_BASE_ZERO) );
}

~GPUHandles()
{
CublasSafeCall ( cublasDestroy (cublasHandle) );
CusparseSafeCall( cusparseDestroy (cusparseHandle) );
CusparseSafeCall( cusparseDestroyMatDescr(cusparseDescriptor) );
}
};

#endif // GPU_HANDLES_H
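
A minimal usage sketch of GPUHandles: construct it once, pass the contained handles to cuBLAS/cuSPARSE calls, and let the destructor release them. The function, device vector, and scale factor below are invented for illustration.

#include <cublas_v2.h>
#include "GPUHandles.h"

// Hypothetical helper: scale a device vector in place using the wrapped cuBLAS handle.
void scaleOnGPU(float *d_x, int n, float factor)
{
	GPUHandles handles;   // cublasCreate/cusparseCreate run in the constructor

	// d_x <- factor * d_x
	CublasSafeCall( cublasSscal(handles.cublasHandle, n, &factor, d_x, 1) );

	// cublasDestroy/cusparseDestroy run when 'handles' leaves scope
}

In practice a single instance would be created up front and reused, since creating and destroying the handles is comparatively expensive.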
206 changes: 206 additions & 0 deletions HuberLaplacian.cu
@@ -0,0 +1,206 @@
/**
* ___ _ _ ___ _ __ __ _ ___
* / __| | | | \ /_\ | \/ | /_\ | _ \
* | (__| |_| | |) / _ \ | |\/| |/ _ \| _/
* \___|\___/|___/_/_\_\_|_|__|_/_/_\_\_|_ ___
* / __| | | | _ \ __| _ \___| _ \ __/ __|
* \__ \ |_| | _/ _|| /___| / _|\__ \
* |___/\___/|_| |___|_|_\ |_|_\___|___/
* 2012
*
* by Jens Wetzl ([email protected])
* and Oliver Taubmann ([email protected])
*
* This work is licensed under a Creative Commons
* Attribution 3.0 Unported License. (CC-BY)
* http://creativecommons.org/licenses/by/3.0/
*
**/

#include "HuberLaplacian.h"
#include "Reduction.h"

#include "CudaLBFGS/error_checking.h"

namespace gpu_HuberLaplacian
{
__global__ void laplacian(float *dst, const float *src, const size_t width, const size_t height,
const size_t pixelsPerThread);
__global__ void huber(float *x, const size_t width, const size_t height, const float alpha,
const float strength, const size_t pixelsPerThread, float *f);

float *d_tmp;
}

HuberLaplacian::HuberLaplacian(const size_t height, const size_t width,
const float alpha, const float strength)
: cost_function(height * width)
, m_height(height)
, m_width(width)
, m_alpha(alpha)
, m_strength(strength)
{
CudaSafeCall( cudaMalloc(&gpu_HuberLaplacian::d_tmp, m_numDimensions * sizeof(float)) );
CudaSafeCall( cudaMalloc( (void**) &m_reductionArray, width * height * sizeof(float)) );
CudaSafeCall( cudaMalloc( (void**) &m_reductionArray2, 1024 * sizeof(float)) );

#ifdef SUPERRES_TIMING
m_atomic = new timer("priorOther");
m_filter = new timer("priorFilter");
#endif
}

HuberLaplacian::~HuberLaplacian()
{
CudaSafeCall( cudaFree(gpu_HuberLaplacian::d_tmp) );
CudaSafeCall( cudaFree(m_reductionArray) );
CudaSafeCall( cudaFree(m_reductionArray2) );

#ifdef SUPERRES_TIMING
m_atomic->saveMeasurement();
m_filter->saveMeasurement();

delete m_atomic;
delete m_filter;
#endif
}

void HuberLaplacian::f_gradf(const float *d_x, float *d_f, float *d_gradf)
{
using namespace gpu_HuberLaplacian;

dim3 blockDim(512);

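// Launch geometry: each thread handles a run of pixelsPerThread pixels within
// one image column, so round up both the per-column thread count and the grid
// size to cover every pixel.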
const size_t pixelsPerThread = 8;
size_t threadsPerColumn = (m_height % pixelsPerThread == 0) ? (m_height / pixelsPerThread)
: (m_height / pixelsPerThread) + 1;
size_t threads = threadsPerColumn * m_width;

dim3 gridDim = (threads % blockDim.x == 0) ? (threads / blockDim.x)
: (threads / blockDim.x) + 1;

#ifdef SUPERRES_TIMING
m_filter->start();
#endif

// Compute image Laplacian
laplacian<<<gridDim, blockDim>>>(d_tmp, d_x, m_width, m_height, pixelsPerThread);

CudaCheckError();
cudaDeviceSynchronize();

#ifdef SUPERRES_TIMING
m_filter->stop();
m_atomic->start();
#endif

CudaSafeCall( cudaMemset(m_reductionArray, 0, m_width * m_height * sizeof(float)) );

// Compute prior function value and gradient without final filtering
huber<<<gridDim, blockDim>>>(d_tmp, m_width, m_height, m_alpha, m_strength, pixelsPerThread, m_reductionArray);

CudaCheckError();
cudaDeviceSynchronize();

Reduction::sumReduction(m_reductionArray, m_width, m_height, m_width, d_f, m_reductionArray2);

#ifdef SUPERRES_TIMING
m_atomic->stop();
m_filter->start();
#endif

// Compute Laplacian of the gradient
laplacian<<<gridDim, blockDim>>>(d_gradf, d_tmp, m_width, m_height, pixelsPerThread);

CudaCheckError();
cudaDeviceSynchronize();

#ifdef SUPERRES_TIMING
m_filter->stop();
#endif
}


namespace gpu_HuberLaplacian
{

__global__ void laplacian(float *dst, const float *src, const size_t width, const size_t height,
const size_t pixelsPerThread)
{
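// Thread-to-pixel mapping: each thread owns a vertical run of up to
// pixelsPerThread pixels; 'col' is its image column, 'crow' the first row of its run.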
const size_t col = (blockIdx.x * blockDim.x + threadIdx.x) % width;
const size_t crow = (blockIdx.x * blockDim.x + threadIdx.x) / width * pixelsPerThread;

if (col >= width || crow >= height)
return;

const size_t srow = crow + 1;
const size_t erow = min((unsigned int)(crow + pixelsPerThread - 1), (unsigned int)(height - 1));

// First element

const size_t firstIdx = crow * width + col;

dst[firstIdx] = src[firstIdx];

if (crow + 1 < height) dst[firstIdx] -= 0.25f * src[firstIdx + width]; // S
if (crow >= 1) dst[firstIdx] -= 0.25f * src[firstIdx - width]; // N
if (col + 1 < width) dst[firstIdx] -= 0.25f * src[firstIdx + 1]; // E
if (col >= 1) dst[firstIdx] -= 0.25f * src[firstIdx - 1]; // W

// Inner elements

for (size_t row = srow; row < erow; ++row)
{
const size_t cIdx = row * width + col;

// C, S, N (always exist)
dst[cIdx] = src[cIdx] - 0.25f * (src[cIdx + width] + src[cIdx - width]);

if (col + 1 < width) dst[cIdx] -= 0.25f * src[cIdx + 1]; // E
if (col >= 1) dst[cIdx] -= 0.25f * src[cIdx - 1]; // W
}

if (erow <= crow)
return;

// Last element

const size_t lastIdx = erow * width + col;

dst[lastIdx] = src[lastIdx] - 0.25f * src[lastIdx - width]; // C, N

if (erow + 1 < height) dst[lastIdx] -= 0.25f * src[lastIdx + width]; // S
if (col + 1 < width) dst[lastIdx] -= 0.25f * src[lastIdx + 1]; // E
if (col >= 1) dst[lastIdx] -= 0.25f * src[lastIdx - 1]; // W
}

__global__ void huber(float *a, const size_t width, const size_t height, const float alpha,
const float strength, const size_t pixelsPerThread, float *f)
{
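// Each thread applies the pseudo-Huber penalty to its run of Laplacian responses,
// accumulates the per-thread partial sum in colF, and stores it in f for the
// subsequent sum reduction; a[idx] is overwritten with the (scaled) derivative.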
const size_t col = (blockIdx.x * blockDim.x + threadIdx.x) % width;
const size_t crow = (blockIdx.x * blockDim.x + threadIdx.x) / width * pixelsPerThread;

if (col >= width || crow >= height)
return;

const size_t erow = min((unsigned int)(crow + pixelsPerThread), (unsigned int)height);

const float alpha2 = alpha * alpha;

float colF = 0.0f;

for (size_t row = crow; row < erow; ++row)
{
const size_t idx = row * width + col;

// Pseudo-Huber loss function
const float root = sqrtf(1.0f + a[idx]*a[idx] / alpha2);
colF += alpha2 * (root - 1.0f);
a[idx] *= strength / root;
}

colF *= strength;
f[blockIdx.x * blockDim.x + threadIdx.x] = colF;
}

}
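
For reference, the per-pixel math in the huber kernel is the pseudo-Huber penalty phi(a) = alpha^2 * (sqrt(1 + a^2/alpha^2) - 1), whose derivative is a / sqrt(1 + a^2/alpha^2). Below is a small host-side sketch of the same computation; the helper is hypothetical and useful only as a sanity check against the kernel.

// Hypothetical CPU reference for one pixel of the huber kernel above.
// Given the Laplacian response a, it returns the penalty value and
// overwrites a with strength * dphi/da, mirroring "a[idx] *= strength / root".
#include <cmath>

float pseudoHuber(float &a, float alpha, float strength)
{
	const float alpha2 = alpha * alpha;
	const float root   = std::sqrt(1.0f + a * a / alpha2);

	const float value = strength * alpha2 * (root - 1.0f);  // contribution to f
	a *= strength / root;                                   // gradient w.r.t. the Laplacian response

	return value;
}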