From d8c5dfc852363b1847ab781f178a1d51e0c3ac49 Mon Sep 17 00:00:00 2001
From: Raaj <yaadhavraaj@gmail.com>
Date: Mon, 22 Jan 2018 21:56:09 -0500
Subject: [PATCH] Alpha CPU_ONLY version (#394)

* Converted Caffe to submodule git

* added support for display of .hpp files in qtcreator

* added function to display the array size as a string

* added logging code

* added a caffeutil function class to display blob size

* wip resize cpu

* implemented resizeAndMerge cpu

* completion of cpu nms and resize functions

* added warp affine function, but it's very slow so kept commented out

* WIP. Intel Caffe compile

* wip. cpu version

* wip. intel mkl

* Bug fix revert cpu gpu on resize

* NMS Border Patrol

* Bug fix for cmake to check if caffe pulled

* hands extractor cpu

* Face extractor cpu made to work

* Removed caffeutil

* changed array toString function

* cleanup

* added explanation for logic nms

* update installation text for cpu prelim

* Removed duplicated `if (BUILD_DOCS)`

* Removed duplicated include(cmake/Utils.cmake)

* Recovered deleted text from OP master branch

* Removed duplicated `find_package(CuDNN)`

* Update CMakeLists.txt

* Update CMakeLists.txt

* OpenCL not available yet (removed)

* Updated spacing

* Recovering original format

* Fixed typo

* Fixed messages bug

* Fixed message bug

* printSize working on Windows

* updates to doc on cpu version

* OpenMP only on CPU_ONLY. Cout override, std::memcpy instead of memcpy

* additional fixes to profiler cmake. change to std::copy

* Fixed pthread leftovers

* Fixed minor typos and format

* NMS remove if 18, resize convert to smart pointer

* update of mkl lib for new mkl version

* Fix to select cpu only if no cuda available

* Some updates to the Doc for CPU Version

* moved ostream to cpp. remove mkl from util

* Add gines CMake

* remove cpu doc (temp)

* add try catch to print blocks

* move cout overload outside class

* Bug fix to cout overload. Not possible to use error function as it's outside scope

* More clean cout and fixed typo
---
 doc/installation_cmake.md                   |  11 +-
 examples/openpose/CMakeLists.txt            |   4 +-
 examples/tests/CMakeLists.txt               |   3 +-
 examples/tutorial_add_module/CMakeLists.txt |  16 +-
 examples/tutorial_pose/CMakeLists.txt       |   4 +-
 examples/tutorial_thread/CMakeLists.txt     |   4 +-
 examples/tutorial_wrapper/CMakeLists.txt    |   4 +-
 include/openpose/core/array.hpp             | 115 +++++++----
 include/openpose/core/common.hpp            |   5 +-
 include/openpose/core/datum.hpp             |   6 +
 include/openpose/core/macros.hpp            |  28 +--
 include/openpose/core/point.hpp             |  11 +-
 include/openpose/core/rectangle.hpp         |   8 +-
 src/openpose/CMakeLists.txt                 |   8 +-
 src/openpose/core/CMakeLists.txt            |   9 +-
 src/openpose/core/array.cpp                 |  24 ++-
 src/openpose/core/maximumBase.cpp           |  80 ++++----
 src/openpose/core/netCaffe.cpp              |  12 +-
 src/openpose/core/nmsBase.cpp               | 200 +++++++++++++++-----
 src/openpose/core/point.cpp                 |   2 +-
 src/openpose/core/rectangle.cpp             |   1 -
 src/openpose/core/resizeAndMergeBase.cpp    | 133 +++++++++----
 src/openpose/face/CMakeLists.txt            |   6 +-
 src/openpose/face/faceExtractorCaffe.cpp    |  47 +++--
 src/openpose/hand/CMakeLists.txt            |   6 +-
 src/openpose/hand/handExtractorCaffe.cpp    |  47 +++--
 src/openpose/pose/CMakeLists.txt            |   6 +-
 src/openpose/pose/poseExtractorCaffe.cpp    |  10 +-
 src/openpose/utilities/CMakeLists.txt       |   7 +-
 29 files changed, 559 insertions(+), 258 deletions(-)

diff --git a/doc/installation_cmake.md b/doc/installation_cmake.md
index c03a1f27d..057c698f4 100644
--- a/doc/installation_cmake.md
+++ b/doc/installation_cmake.md
@@ -10,11 +10,11 @@ OpenPose - Installation using CMake
 6. [Uninstallation](#uninstallation)
 7. [Optional Settings](#optional-settings)
     1. [MPI Model](#mpi-model)
-    2. [Custom Caffe (Ubuntu Only)](#custom-caffe-ubuntu-only)
-    3. [Custom OpenCV (Ubuntu Only)](#custom-opencv-ubuntu-only)
-    4. [OpenPose 3D Reconstruction Demo (Windows Only)](#openpose-3d-reconstruction-demo-windows-only)
-    5. [Doxygen Documentation Autogeneration (Ubuntu Only)](#doxygen-documentation-autogeneration-ubuntu-only)
-    6. [CMake Command Line Configuration (Ubuntu Only)](#cmake-command-line-configuration-ubuntu-only)
+    2. [Custom Caffe (Ubuntu Only)](#custom-caffe-ubuntu-only)
+    3. [Custom OpenCV (Ubuntu Only)](#custom-opencv-ubuntu-only)
+    4. [OpenPose 3D Reconstruction Demo (Windows Only)](#openpose-3d-reconstruction-demo-windows-only)
+    5. [Doxygen Documentation Autogeneration (Ubuntu Only)](#doxygen-documentation-autogeneration-ubuntu-only)
+    6. [CMake Command Line Configuration (Ubuntu Only)](#cmake-command-line-configuration-ubuntu-only)
 
 
 
@@ -216,3 +216,4 @@ If Caffe is not already present but OpenCV is, then use the below command.
 ```bash
 cmake -DOpenCV_DIR=/home/"${USER}"/softwares/opencv/build
 ```
+
diff --git a/examples/openpose/CMakeLists.txt b/examples/openpose/CMakeLists.txt
index 2a4a7c134..905630f9c 100644
--- a/examples/openpose/CMakeLists.txt
+++ b/examples/openpose/CMakeLists.txt
@@ -13,7 +13,7 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
 
   message(STATUS "Adding Example ${EXE_NAME}")
   add_executable(${EXE_NAME} ${EXAMPLE_FILE})
-  target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS})
+  target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS})
   
   if (WIN32)
     set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples")
@@ -21,4 +21,4 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
         ${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY) 
   endif (WIN32)
 
-endforeach()
\ No newline at end of file
+endforeach()
diff --git a/examples/tests/CMakeLists.txt b/examples/tests/CMakeLists.txt
index 7e9df8758..f1f67edda 100644
--- a/examples/tests/CMakeLists.txt
+++ b/examples/tests/CMakeLists.txt
@@ -13,7 +13,7 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
 
   message(STATUS "Adding Example ${EXE_NAME}")
   add_executable(${EXE_NAME} ${EXAMPLE_FILE})
-  target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS})
+  target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS})
   
   if (WIN32)
     set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Tests")
@@ -22,3 +22,4 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
   endif (WIN32)
 
 endforeach()
+
diff --git a/examples/tutorial_add_module/CMakeLists.txt b/examples/tutorial_add_module/CMakeLists.txt
index bd6305ef1..3fb74a359 100644
--- a/examples/tutorial_add_module/CMakeLists.txt
+++ b/examples/tutorial_add_module/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(EXAMPLE_FILES 
+set(EXAMPLE_FILES
     1_custom_post_processing.cpp)
 
 include(${CMAKE_SOURCE_DIR}/cmake/Utils.cmake)
@@ -6,7 +6,7 @@ include(${CMAKE_SOURCE_DIR}/cmake/Utils.cmake)
 foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
 
   get_filename_component(SOURCE_NAME ${EXAMPLE_FILE} NAME_WE)
-  
+
   if (UNIX AND NOT APPLE)
     set(EXE_NAME "${SOURCE_NAME}.bin")
   elseif (WIN32)
@@ -15,13 +15,13 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
 
   message(STATUS "Adding Example ${EXE_NAME}")
   add_executable(${EXE_NAME} ${EXAMPLE_FILE})
-  target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} 
-      ${Caffe_LIBS})
-  
+  target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY}
+      ${Caffe_LIBS} ${MKL_LIBS})
+
   if (WIN32)
     set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/AddModule")
-    configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user 
-        ${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY) 
+    configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
+        ${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
   endif (WIN32)
 
-endforeach()
\ No newline at end of file
+endforeach()
diff --git a/examples/tutorial_pose/CMakeLists.txt b/examples/tutorial_pose/CMakeLists.txt
index 71a285d9b..f5cf0a66b 100644
--- a/examples/tutorial_pose/CMakeLists.txt
+++ b/examples/tutorial_pose/CMakeLists.txt
@@ -14,7 +14,7 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
 
   message(STATUS "Adding Example ${EXE_NAME}")
   add_executable(${EXE_NAME} ${EXAMPLE_FILE})
-  target_link_libraries(${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS})
+  target_link_libraries(${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS})
   
   if (WIN32)
     set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Pose")
@@ -22,4 +22,4 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
         ${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY) 
   endif (WIN32)
 
-endforeach()
\ No newline at end of file
+endforeach()
diff --git a/examples/tutorial_thread/CMakeLists.txt b/examples/tutorial_thread/CMakeLists.txt
index df4ac8c52..e6a620700 100644
--- a/examples/tutorial_thread/CMakeLists.txt
+++ b/examples/tutorial_thread/CMakeLists.txt
@@ -16,7 +16,7 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
 
   message(STATUS "Adding Example ${EXE_NAME}")
   add_executable(${EXE_NAME} ${EXAMPLE_FILE})
-  target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS})
+  target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS})
   
   if (WIN32)
     set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Thread")
@@ -24,4 +24,4 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
         ${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY) 
   endif (WIN32)
 
-endforeach()
\ No newline at end of file
+endforeach()
diff --git a/examples/tutorial_wrapper/CMakeLists.txt b/examples/tutorial_wrapper/CMakeLists.txt
index 84feb1033..9c1e10411 100644
--- a/examples/tutorial_wrapper/CMakeLists.txt
+++ b/examples/tutorial_wrapper/CMakeLists.txt
@@ -17,7 +17,7 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
 
   message(STATUS "Adding Example ${EXE_NAME}")
   add_executable(${EXE_NAME} ${EXAMPLE_FILE})
-  target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS})
+  target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS})
   
   if (WIN32)
     set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Wrapper")
@@ -25,4 +25,4 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
         ${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY) 
   endif (WIN32)
 
-endforeach()
\ No newline at end of file
+endforeach()
diff --git a/include/openpose/core/array.hpp b/include/openpose/core/array.hpp
index 2d478b7ca..62e65fea6 100644
--- a/include/openpose/core/array.hpp
+++ b/include/openpose/core/array.hpp
@@ -4,40 +4,44 @@
 #include <memory> // std::shared_ptr
 #include <vector>
 #include <opencv2/core/core.hpp> // cv::Mat
+#include <openpose/core/macros.hpp>
 
 namespace op
 {
     /**
      * Array<T>: The OpenPose Basic Raw Data Container
-     * This template class implements a multidimensional data array. It is our basic data container, analogous to cv::Mat in OpenCV, Tensor in
-     * Torch/TensorFlow or Blob in Caffe.
-     * It wraps a cv::Mat and a std::shared_ptr, both of them pointing to the same raw data. I.e. they both share the same memory, so we can read
-     * and modify this data in both formats with no performance impact.
+     * This template class implements a multidimensional data array. It is our basic data container, analogous to
+     * cv::Mat in OpenCV, Tensor in Torch/TensorFlow or Blob in Caffe.
+     * It wraps a cv::Mat and a std::shared_ptr, both of them pointing to the same raw data. I.e. they both share the
+     * same memory, so we can read and modify this data in both formats with no performance impact.
      * Hence, it keeps high performance while adding high-level functions.
      */
     template<typename T>
     class Array
     {
     public:
-        // -------------------------------------------------- Constructors and Data Allocator Functions -------------------------------------------------- //
+        // ------------------------------ Constructors and Data Allocator Functions ------------------------------ //
         /**
          * Array constructor.
          * Equivalent to default constructor + reset(const int size).
-         * @param size Integer with the number of T element to be allocated. E.g. size = 5 is internally similar to: new T[5].
+         * @param size Integer with the number of T element to be allocated. E.g. size = 5 is internally similar to
+         * `new T[5]`.
          */
         explicit Array(const int size);
 
         /**
          * Array constructor.
          * Equivalent to default constructor + reset(const std::vector<int>& size = {}).
-         * @param sizes Vector with the size of each dimension. E.g. size = {3, 5, 2} is internally similar to: new T[3*5*2].
+         * @param sizes Vector with the size of each dimension. E.g. size = {3, 5, 2} is internally similar to
+         * `new T[3*5*2]`.
          */
         explicit Array(const std::vector<int>& sizes = {});
 
         /**
          * Array constructor.
          * Equivalent to default constructor + reset(const int size, const T value).
-         * @param size Integer with the number of T element to be allocated. E.g. size = 5 is internally similar to: new T[5].
+         * @param size Integer with the number of T element to be allocated. E.g. size = 5 is internally similar to
+         * `new T[5]`.
          * @param value Initial value for each component of the Array.
          */
         Array(const int size, const T value);
@@ -45,14 +49,16 @@ namespace op
         /**
          * Array constructor.
          * Equivalent to default constructor + reset(const std::vector<int>& size, const T value).
-         * @param sizes Vector with the size of each dimension. E.g. size = {3, 5, 2} is internally similar to: new T[3*5*2].
+         * @param sizes Vector with the size of each dimension. E.g. size = {3, 5, 2} is internally similar to:
+         * `new T[3*5*2]`.
          * @param value Initial value for each component of the Array.
          */
         Array(const std::vector<int>& sizes, const T value);
 
         /**
          * Copy constructor.
-         * It performs `fast copy`: For performance purpose, copying a Array<T> or Datum or cv::Mat just copies the reference, it still shares the same internal data.
+         * It performs `fast copy`: For performance purpose, copying a Array<T> or Datum or cv::Mat just copies the
+         * reference, it still shares the same internal data.
          * Modifying the copied element will modify the original one.
          * Use clone() for a slower but real copy, similarly to cv::Mat and Array<T>.
          * @param array Array to be copied.
@@ -85,7 +91,8 @@ namespace op
         /**
          * Clone function.
          * Similar to cv::Mat::clone and Datum::clone.
-         * It performs a real but slow copy of the data, i.e., even if the copied element is modified, the original one is not.
+         * It performs a real but slow copy of the data, i.e., even if the copied element is modified, the original
+         * one is not.
          * @return The resulting Array.
          */
         Array<T> clone() const;
@@ -93,29 +100,35 @@ namespace op
         /**
          * Data allocation function.
          * It allocates the required space for the memory (it does not initialize that memory).
-         * @param size Integer with the number of T element to be allocated. E.g. size = 5 is internally similar to: new T[5].
+         * @param size Integer with the number of T element to be allocated. E.g. size = 5 is internally similar to
+         * `new T[5]`.
          */
         void reset(const int size);
 
         /**
          * Data allocation function.
-         * Similar to reset(const int size), but it allocates a multi-dimensional array of dimensions each of the values of the argument.
-         * @param sizes Vector with the size of each dimension. E.g. size = {3, 5, 2} is internally similar to: new T[3*5*2].
+         * Similar to reset(const int size), but it allocates a multi-dimensional array of dimensions each of the
+         * values of the argument.
+         * @param sizes Vector with the size of each dimension. E.g. size = {3, 5, 2} is internally similar to
+         * `new T[3*5*2]`.
          */
         void reset(const std::vector<int>& sizes = {});
 
         /**
          * Data allocation function.
          * Similar to reset(const int size), but initializing the data to the value specified by the second argument.
-         * @param size Integer with the number of T element to be allocated. E.g. size = 5 is internally similar to: new T[5].
+         * @param size Integer with the number of T element to be allocated. E.g. size = 5 is internally similar to
+         * `new T[5]`.
          * @param value Initial value for each component of the Array.
          */
         void reset(const int size, const T value);
 
         /**
          * Data allocation function.
-         * Similar to reset(const std::vector<int>& size), but initializing the data to the value specified by the second argument.
-         * @param sizes Vector with the size of each dimension. E.g. size = {3, 5, 2} is internally similar to: new T[3*5*2].
+         * Similar to reset(const std::vector<int>& size), but initializing the data to the value specified by the
+         * second argument.
+         * @param sizes Vector with the size of each dimension. E.g. size = {3, 5, 2} is internally similar to
+         * `new T[3*5*2]`.
          * @param value Initial value for each component of the Array.
          */
         void reset(const std::vector<int>& sizes, const T value);
@@ -136,7 +149,7 @@ namespace op
 
 
 
-        // -------------------------------------------------- Data Information Functions -------------------------------------------------- //
+        // ------------------------------ Data Information Functions ------------------------------ //
         /**
          * Check whether memory has been allocated.
          * @return True if no memory has been allocated, false otherwise.
@@ -148,17 +161,26 @@ namespace op
 
         /**
          * Return a vector with the size of each dimension allocated.
-         * @return A std::vector<int> with the size of each dimension. If no memory has been allocated, it will return an empty std::vector.
+         * @return A std::vector<int> with the size of each dimension. If no memory has been allocated, it will return
+         * an empty std::vector.
          */
         inline std::vector<int> getSize() const
         {
             return mSize;
         }
 
+        /**
+         * Return a string with the size of each dimension allocated.
+         * @return A std::string with the size of each dimension. If no memory has been allocated, it will return an
+         * empty string.
+         */
+        std::string printSize() const;
+
         /**
          * Return a vector with the size of the desired dimension.
          * @param index Dimension to check its size.
-         * @return Size of the desired dimension. It will return 0 if the requested dimension is higher than the number of dimensions.
+         * @return Size of the desired dimension. It will return 0 if the requested dimension is higher than the number
+         * of dimensions.
          */
         int getSize(const int index) const;
 
@@ -183,14 +205,16 @@ namespace op
 
         /**
          * Similar to getVolume(), but in this case it just returns the volume between the desired dimensions.
-         * E.g. for a Array<T> of size = {2,5,3}, the volume or total number of elements for getVolume(1,2) is: 5x3 = 15.
-         * @return The total volume of the allocated data between the desired dimensions. If the index are out of bounds, it throws an error.
+         * E.g. for a Array<T> of size = {2,5,3}, the volume or total number of elements for getVolume(1,2) is
+         * 5x3 = 15.
+         * @return The total volume of the allocated data between the desired dimensions. If the index are out of
+         * bounds, it throws an error.
          */
         size_t getVolume(const int indexA, const int indexB) const;
 
 
 
-        // -------------------------------------------------- Data Access Functions And Operators -------------------------------------------------- //
+        // ------------------------------ Data Access Functions And Operators ------------------------------ //
         /**
          * Return a raw pointer to the data. Similar to: std::shared_ptr::get().
          * Note: if you modify the pointer data, you will directly modify it in the Array<T> instance too.
@@ -213,8 +237,10 @@ namespace op
 
         /**
          * Return a cv::Mat wrapper to the data. It forbids the data to be modified.
-         * OpenCV only admits unsigned char, signed char, int, float & double. If the T class is not supported by OpenCV, it will throw an error.
-         * Note: Array<T> does not return an editable cv::Mat because some OpenCV functions reallocate memory and it would not longer point to the Array<T> instance.
+         * OpenCV only admits unsigned char, signed char, int, float & double. If the T class is not supported by
+         * OpenCV, it will throw an error.
+         * Note: Array<T> does not return an editable cv::Mat because some OpenCV functions reallocate memory and it
+         * would no longer point to the Array<T> instance.
          * If you want to perform some OpenCV operation on the Array data, you can use: 
          *     editedCvMat = array.getConstCvMat().clone();
          *     // modify data
@@ -226,7 +252,8 @@ namespace op
         /**
          * Analogous to getConstCvMat, but in this case it returns a editable cv::Mat.
          * Very important: Only allowed functions which do not provoke data reallocation.
-         * E.g. resizing functions will not work and they would provoke an undefined behaviour and/or execution crashes.
+         * E.g. resizing functions will not work and they would provoke an undefined behaviour and/or execution
+         * crashes.
          * @return A cv::Mat pointing to the data.
          */
         cv::Mat& getCvMat();
@@ -234,7 +261,8 @@ namespace op
         /**
          * [] operator
          * Similar to the [] operator for raw pointer data.
-         * If debug mode is enabled, then it will check that the desired index is in the data range, and it will throw an exception otherwise (similar to the at operator).
+         * If debug mode is enabled, then it will check that the desired index is in the data range, and it will throw
+         * an exception otherwise (similar to the at operator).
          * @param index The desired memory location.
          * @return A editable reference to the data on the desired index location.
          */
@@ -249,7 +277,8 @@ namespace op
 
         /**
          * [] operator
-         * Same functionality as operator[](const int index), but it forbids modifying the value. Otherwise, const functions would not be able to call the [] operator.
+         * Same functionality as operator[](const int index), but it forbids modifying the value. Otherwise, const
+         * functions would not be able to call the [] operator.
          * @param index The desired memory location.
          * @return A non-editable reference to the data on the desired index location.
          */
@@ -264,7 +293,8 @@ namespace op
 
         /**
          * [] operator
-         * Same functionality as operator[](const int index), but it lets the user introduce the multi-dimensional index.
+         * Same functionality as operator[](const int index), but it lets the user introduce the multi-dimensional
+         * index.
          * E.g. given a (10 x 10 x 10) array, array[11] is equivalent to array[{1,1,0}]
          * @param indexes Vector with the desired memory location.
          * @return A editable reference to the data on the desired index location.
@@ -276,7 +306,8 @@ namespace op
 
         /**
          * [] operator
-         * Same functionality as operator[](const std::vector<int>& indexes), but it forbids modifying the value. Otherwise, const functions would not be able to call the [] operator.
+         * Same functionality as operator[](const std::vector<int>& indexes), but it forbids modifying the value.
+         * Otherwise, const functions would not be able to call the [] operator.
          * @param indexes Vector with the desired memory location.
          * @return A non-editable reference to the data on the desired index location.
          */
@@ -287,7 +318,8 @@ namespace op
 
         /**
          * at() function
-         * Same functionality as operator[](const int index), but it always check whether the indexes are within the data bounds. Otherwise, it will throw an error.
+         * Same functionality as operator[](const int index), but it always checks whether the indexes are within the
+         * data bounds. Otherwise, it will throw an error.
          * @param index The desired memory location.
          * @return A editable reference to the data on the desired index location.
          */
@@ -298,7 +330,8 @@ namespace op
 
         /**
          * at() function
-         * Same functionality as operator[](const int index) const, but it always check whether the indexes are within the data bounds. Otherwise, it will throw an error.
+         * Same functionality as operator[](const int index) const, but it always checks whether the indexes are within
+         * the data bounds. Otherwise, it will throw an error.
          * @param index The desired memory location.
          * @return A non-editable reference to the data on the desired index location.
          */
@@ -309,7 +342,8 @@ namespace op
 
         /**
          * at() function
-         * Same functionality as operator[](const std::vector<int>& indexes), but it always check whether the indexes are within the data bounds. Otherwise, it will throw an error.
+         * Same functionality as operator[](const std::vector<int>& indexes), but it always checks whether the indexes
+         * are within the data bounds. Otherwise, it will throw an error.
          * @param indexes Vector with the desired memory location.
          * @return A editable reference to the data on the desired index location.
          */
@@ -320,7 +354,8 @@ namespace op
 
         /**
          * at() function
-         * Same functionality as operator[](const std::vector<int>& indexes) const, but it always check whether the indexes are within the data bounds. Otherwise, it will throw an error.
+         * Same functionality as operator[](const std::vector<int>& indexes) const, but it always checks whether the
+         * indexes are within the data bounds. Otherwise, it will throw an error.
          * @param indexes Vector with the desired memory location.
          * @return A non-editable reference to the data on the desired index location.
          */
@@ -350,7 +385,8 @@ namespace op
         std::pair<bool, cv::Mat> mCvMatData;
 
         /**
-         * Auxiliar function that both operator[](const std::vector<int>& indexes) and operator[](const std::vector<int>& indexes) const use.
+         * Auxiliary function that both operator[](const std::vector<int>& indexes) and
+         * operator[](const std::vector<int>& indexes) const use.
          * It turn the multi-dimensions indexes into the 1-dimension equivalent index.
          * @param indexes Vector with the desired memory location.
          * @return The equivalent 1-D index.
@@ -358,7 +394,8 @@ namespace op
         int getIndex(const std::vector<int>& indexes) const;
 
         /**
-         * Similar to getIndex(const std::vector<int>& indexes) const, but used for at(const std::vector<int>& indexes) and at(const std::vector<int>& indexes) const.
+         * Similar to getIndex(const std::vector<int>& indexes) const, but used for at(const std::vector<int>& indexes)
+         * and at(const std::vector<int>& indexes) const.
          * It also checks whether the index is within the allocated memory.
          * @param indexes Vector with the desired memory location.
          * @return The equivalent 1-D index.
@@ -373,10 +410,14 @@ namespace op
         T& commonAt(const int index) const;
 
         /**
-         * Private auxiliar function that sets the cv::Mat wrapper and makes it point to the same data than std::shared_ptr points to.
+         * Private auxiliary function that sets the cv::Mat wrapper and makes it point to the same data that
+         * std::shared_ptr points to.
          */
         void setCvMatFromSharedPtr();
     };
+
+    // Static methods
+    OVERLOAD_C_OUT(Array)
 }
 
 #endif // OPENPOSE_CORE_ARRAY_HPP
diff --git a/include/openpose/core/common.hpp b/include/openpose/core/common.hpp
index 4cec181c2..db2703035 100644
--- a/include/openpose/core/common.hpp
+++ b/include/openpose/core/common.hpp
@@ -8,11 +8,12 @@
 #include <vector>
 // OpenPose most used classes
 #include <openpose/core/array.hpp>
+#include <openpose/core/macros.hpp>
 #include <openpose/core/point.hpp>
 #include <openpose/core/rectangle.hpp>
 #include <openpose/utilities/errorAndLog.hpp>
 #include <openpose/utilities/profiler.hpp>
-// Macros at the end, otherwise circular dependency with array, point & rectangle
-#include <openpose/core/macros.hpp>
+// Datum at the end, otherwise circular dependency with array, point & rectangle
+#include <openpose/core/datum.hpp>
 
 #endif // OPENPOSE_CORE_COMMON_HPP
diff --git a/include/openpose/core/datum.hpp b/include/openpose/core/datum.hpp
index b184c0e33..072227206 100644
--- a/include/openpose/core/datum.hpp
+++ b/include/openpose/core/datum.hpp
@@ -304,6 +304,12 @@ namespace op
             return id != datum.id;
         }
     };
+
+    // Defines for Datum. Added here rather than in `macros.hpp` to avoid circular dependencies
+    #define DATUM_BASE_NO_PTR std::vector<Datum>
+    #define DATUM_BASE std::shared_ptr<DATUM_BASE_NO_PTR>
+    #define DEFINE_TEMPLATE_DATUM(templateName) template class OP_API templateName<DATUM_BASE>
+    #define COMPILE_TEMPLATE_DATUM(templateName) extern DEFINE_TEMPLATE_DATUM(templateName)
 }
 
 #endif // OPENPOSE_CORE_DATUM_HPP
diff --git a/include/openpose/core/macros.hpp b/include/openpose/core/macros.hpp
index 0e4ee35ab..1b21b162c 100644
--- a/include/openpose/core/macros.hpp
+++ b/include/openpose/core/macros.hpp
@@ -1,6 +1,10 @@
 #ifndef OPENPOSE_CORE_MACROS_HPP
 #define OPENPOSE_CORE_MACROS_HPP
 
+#include <memory> // std::shared_ptr
+#include <ostream>
+#include <vector>
+
 #ifndef _WIN32
     #define OP_API
 #elif defined OP_EXPORTS
@@ -15,11 +19,6 @@
     #pragma warning( disable: 4275 ) // non dll-interface structXXX used as base
 #endif
 
-#define DATUM_BASE_NO_PTR std::vector<Datum>
-#define DATUM_BASE std::shared_ptr<DATUM_BASE_NO_PTR>
-#define DEFINE_TEMPLATE_DATUM(templateName) template class OP_API templateName<DATUM_BASE>
-#define COMPILE_TEMPLATE_DATUM(templateName) extern DEFINE_TEMPLATE_DATUM(templateName)
-
 #define UNUSED(unusedVariable) (void)(unusedVariable)
 
 #define DELETE_COPY(className) \
@@ -45,6 +44,17 @@
     template classType OP_API className<double>; \
     template classType OP_API className<long double>
 
+/**
+ * Defines a templated `operator<<` that writes `obj.toString()` of an `op::className<T>` into the stream.
+ * @return The std::ostream reference that was passed in (after the insertion), so calls can be chained.
+ */
+#define OVERLOAD_C_OUT(className) \
+    template<typename T> std::ostream &operator<<(std::ostream& ostream, const op::className<T>& obj) \
+    { \
+        ostream << obj.toString(); \
+        return ostream; \
+    }
+
 // Instantiate a class with float and double specifications
 #define COMPILE_TEMPLATE_FLOATING_TYPES_CLASS(className) COMPILE_TEMPLATE_FLOATING_TYPES(className, class)
 #define COMPILE_TEMPLATE_FLOATING_TYPES_STRUCT(className) COMPILE_TEMPLATE_FLOATING_TYPES(className, struct)
@@ -65,12 +75,4 @@ namespace boost
     template <typename T> class shared_ptr; // E.g., boost::shared_ptr<caffe::Blob<float>>
 }
 
-// Includes at the end, since this macros class does not need them, but the files that call this
-// file. However, keeping the files at the beginning might create a circular include linking problem.
-#include <memory> // std::shared_ptr
-#include <vector>
-#include <openpose/core/datum.hpp>
-#include <openpose/core/point.hpp>
-#include <openpose/core/rectangle.hpp>
-
 #endif // OPENPOSE_CORE_MACROS_HPP
diff --git a/include/openpose/core/point.hpp b/include/openpose/core/point.hpp
index 55f62000e..f5486dade 100644
--- a/include/openpose/core/point.hpp
+++ b/include/openpose/core/point.hpp
@@ -2,6 +2,7 @@
 #define OPENPOSE_CORE_POINT_HPP
 
 #include <string>
+#include <openpose/core/macros.hpp>
 
 namespace op
 {
@@ -15,7 +16,8 @@ namespace op
 
         /**
          * Copy constructor.
-         * It performs `fast copy`: For performance purpose, copying a Point<T> or Point<T> or cv::Mat just copies the reference, it still shares the same internal data.
+         * It performs `fast copy`: For performance purpose, copying a Point<T> or cv::Mat just copies the
+         * reference, it still shares the same internal data.
          * Modifying the copied element will modify the original one.
          * Use clone() for a slower but real copy, similarly to cv::Mat and Point<T>.
          * @param point Point to be copied.
@@ -61,7 +63,7 @@ namespace op
 
 
 
-        // -------------------------------------------------- Comparison operators -------------------------------------------------- //
+        // ------------------------------ Comparison operators ------------------------------ //
         /**
          * Less comparison operator.
          * @param point Point<T> to be compared.
@@ -126,7 +128,7 @@ namespace op
 
 
 
-        // -------------------------------------------------- Basic Operators -------------------------------------------------- //
+        // ------------------------------ Basic Operators ------------------------------ //
         Point<T>& operator+=(const Point<T>& point);
 
         Point<T> operator+(const Point<T>& point) const;
@@ -151,6 +153,9 @@ namespace op
 
         Point<T> operator/(const T value) const;
     };
+
+    // Static methods
+    OVERLOAD_C_OUT(Point)
 }
 
 #endif // OPENPOSE_CORE_POINT_HPP
diff --git a/include/openpose/core/rectangle.hpp b/include/openpose/core/rectangle.hpp
index 9887f847e..b5ffe0a5e 100644
--- a/include/openpose/core/rectangle.hpp
+++ b/include/openpose/core/rectangle.hpp
@@ -2,6 +2,7 @@
 #define OPENPOSE_CORE_RECTANGLE_HPP
 
 #include <string>
+#include <openpose/core/macros.hpp>
 #include <openpose/core/point.hpp>
 
 namespace op
@@ -18,7 +19,8 @@ namespace op
 
         /**
          * Copy constructor.
-         * It performs `fast copy`: For performance purpose, copying a Rectangle<T> or Datum or cv::Mat just copies the reference, it still shares the same internal data.
+         * It performs `fast copy`: For performance purpose, copying a Rectangle<T> or Datum or cv::Mat just copies
+         * the reference, it still shares the same internal data.
          * Modifying the copied element will modify the original one.
          * Use clone() for a slower but real copy, similarly to cv::Mat and Rectangle<T>.
          * @param rectangle Rectangle to be copied.
@@ -71,7 +73,7 @@ namespace op
          */
         std::string toString() const;
 
-        // -------------------------------------------------- Basic Operators -------------------------------------------------- //
+        // ------------------------------ Basic Operators ------------------------------ //
         Rectangle<T>& operator*=(const T value);
 
         Rectangle<T> operator*(const T value) const;
@@ -84,6 +86,8 @@ namespace op
     // Static methods
     template<typename T>
     Rectangle<T> recenter(const Rectangle<T>& rectangle, const T newWidth, const T newHeight);
+
+    OVERLOAD_C_OUT(Rectangle)
 }
 
 #endif // OPENPOSE_CORE_RECTANGLE_HPP
diff --git a/src/openpose/CMakeLists.txt b/src/openpose/CMakeLists.txt
index a32c516c6..576e83317 100644
--- a/src/openpose/CMakeLists.txt
+++ b/src/openpose/CMakeLists.txt
@@ -16,12 +16,16 @@ foreach (FILE ${OP_HEADERS_UNFILTERED})
   endif ()
 endforeach(FILE ${OP_HEADERS_UNFILTERED})
 
-cuda_add_library(openpose ${SOURCES_OPENPOSE} ${OP_HEADERS})
+if (GPU_MODE MATCHES "CUDA")  # Bare name: still valid if GPU_MODE is empty/unset, and no double dereference
+    cuda_add_library(openpose ${SOURCES_OPENPOSE} ${OP_HEADERS})
+else()
+    add_library(openpose ${SOURCES_OPENPOSE} ${OP_HEADERS})
+endif ()
 
 # Ubuntu
 if(UNIX AND NOT APPLE)
   target_link_libraries(openpose ${OpenCV_LIBS} ${Caffe_LIBS}
-      ${GFLAGS_LIBRARY} ${GLOG_LIBRARY})
+      ${GFLAGS_LIBRARY} ${GLOG_LIBRARY} ${MKL_LIBS})
   if (CMAKE_COMPILER_IS_GNUCXX)
     foreach (SUB_DIR ${SUB_DIRS})
         set_target_properties(openpose_${SUB_DIR} PROPERTIES COMPILE_FLAGS ${OP_CXX_FLAGS})
diff --git a/src/openpose/core/CMakeLists.txt b/src/openpose/core/CMakeLists.txt
index 28fe32ff8..505f353a1 100644
--- a/src/openpose/core/CMakeLists.txt
+++ b/src/openpose/core/CMakeLists.txt
@@ -28,10 +28,15 @@ set(SOURCES_OP_CORE_WITH_CP ${SOURCES_OP_CORE_WITH_CP} PARENT_SCOPE)
 set(SOURCES_OPENPOSE ${SOURCES_OPENPOSE} ${SOURCES_OP_CORE_WITH_CP} PARENT_SCOPE)
 
 if (UNIX AND NOT APPLE)
-  cuda_add_library(openpose_core ${SOURCES_OP_CORE})
+  if (GPU_MODE MATCHES "CUDA")  # Bare name: still valid if GPU_MODE is empty/unset, and no double dereference
+    cuda_add_library(openpose_core ${SOURCES_OP_CORE})
+  else()
+    add_library(openpose_core ${SOURCES_OP_CORE})
+  endif ()
+
   add_library(caffe SHARED IMPORTED)
   set_property(TARGET caffe PROPERTY IMPORTED_LOCATION ${Caffe_LIBS}) 
-  target_link_libraries(openpose_core caffe)
+  target_link_libraries(openpose_core caffe ${MKL_LIBS})
 
   if (BUILD_CAFFE)
     add_dependencies(openpose_core openpose_caffe)
diff --git a/src/openpose/core/array.cpp b/src/openpose/core/array.cpp
index e40619edf..97492a74f 100644
--- a/src/openpose/core/array.cpp
+++ b/src/openpose/core/array.cpp
@@ -1,6 +1,5 @@
 #include <typeinfo> // typeid
 #include <numeric> // std::accumulate
-#include <openpose/core/macros.hpp>
 #include <openpose/utilities/errorAndLog.hpp>
 #include <openpose/core/array.hpp>
 
@@ -396,6 +395,29 @@ namespace op
         }
     }
 
+    template<typename T>
+    std::string Array<T>::printSize() const
+    {
+        try
+        {
+            auto counter = 0u;
+            std::string sizeString = "[ ";
+            for (const auto& i : mSize)
+            {
+                sizeString += std::to_string(i);
+                if (++counter < mSize.size())
+                    sizeString += " x ";
+            }
+            sizeString += " ]";
+            return sizeString;
+        }
+        catch (const std::exception& e)
+        {
+            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+            return "";
+        }
+    }
+
     template<typename T>
     int Array<T>::getIndex(const std::vector<int>& indexes) const
     {
diff --git a/src/openpose/core/maximumBase.cpp b/src/openpose/core/maximumBase.cpp
index eded94480..38908ac16 100644
--- a/src/openpose/core/maximumBase.cpp
+++ b/src/openpose/core/maximumBase.cpp
@@ -9,51 +9,45 @@ namespace op
     {
         try
         {
-            UNUSED(targetPtr);
-            UNUSED(sourcePtr);
-            UNUSED(targetSize);
-            UNUSED(sourceSize);
-            error("CPU version not completely implemented.", __LINE__, __FUNCTION__, __FILE__);
-
             // // TODO: ideally done, try, debug & compare to *.cu
-            // TODO: (maybe): remove thrust dependencies for computers without CUDA?
-            // const auto height = sourceSize[2];
-            // const auto width = sourceSize[3];
-            // const auto imageOffset = height * width;
-            // const auto num = targetSize[0];
-            // const auto channels = targetSize[1];
-            // const auto numberParts = targetSize[2];
-            // const auto numberSubparts = targetSize[3];
+            const auto height = sourceSize[2];
+            const auto width = sourceSize[3];
+            const auto imageOffset = height * width;
+            const auto num = targetSize[0];
+            const auto channels = targetSize[1];
+            const auto numberParts = targetSize[2];
+            const auto numberSubparts = targetSize[3];
+
+            // log("sourceSize[0]: " + std::to_string(sourceSize[0])); // = 1
+            // log("sourceSize[1]: " + std::to_string(sourceSize[1])); // = #body_parts+bck=22(hands) or 71(face)
+            // log("sourceSize[2]: " + std::to_string(sourceSize[2])); // = 368 = height
+            // log("sourceSize[3]: " + std::to_string(sourceSize[3])); // = 368 = width
+            // log("targetSize[0]: " + std::to_string(targetSize[0])); // = 1
+            // log("targetSize[1]: " + std::to_string(targetSize[1])); // = 1
+            // log("targetSize[2]: " + std::to_string(targetSize[2])); // = 21(hands) or 70 (face)
+            // log("targetSize[3]: " + std::to_string(targetSize[3])); // = 3 = [x, y, score]
+            // log(" ");
 
-            // // log("sourceSize[0]: " + std::to_string(sourceSize[0])); // = 1
-            // // log("sourceSize[1]: " + std::to_string(sourceSize[1])); // = #body_parts+bck=22(hands) or 71(face)
-            // // log("sourceSize[2]: " + std::to_string(sourceSize[2])); // = 368 = height
-            // // log("sourceSize[3]: " + std::to_string(sourceSize[3])); // = 368 = width
-            // // log("targetSize[0]: " + std::to_string(targetSize[0])); // = 1
-            // // log("targetSize[1]: " + std::to_string(targetSize[1])); // = 1
-            // // log("targetSize[2]: " + std::to_string(targetSize[2])); // = 21(hands) or 70 (face)
-            // // log("targetSize[3]: " + std::to_string(targetSize[3])); // = 3 = [x, y, score]
-            // // log(" ");
-            // for (auto n = 0; n < num; n++)
-            // {
-            //     for (auto c = 0; c < channels; c++)
-            //     {
-            //         // // Parameters
-            //         const auto offsetChannel = (n * channels + c);
-            //         for (auto part = 0; part < numberParts; part++)
-            //         {
-            //             auto* targetPtrOffsetted = targetPtr + (offsetChannel + part) * numberSubparts;
-            //             const auto* const sourcePtrOffsetted = sourcePtr + (offsetChannel + part) * imageOffset;
-            //             // Option a - 6.3 fps
-            //             const auto sourceIndexIterator = thrust::max_element(thrust::host, sourcePtrOffsetted,
-            //                                                                  sourcePtrOffsetted + imageOffset);
-            //             const auto sourceIndex = (int)(sourceIndexIterator - sourcePtrOffsetted);
-            //             targetPtrOffsetted[0] = sourceIndex % width;
-            //             targetPtrOffsetted[1] = sourceIndex / width;
-            //             targetPtrOffsetted[2] = sourcePtrOffsetted[sourceIndex];
-            //         }
-            //     }
-            // }
+            for (auto n = 0; n < num; n++)
+            {
+                for (auto c = 0; c < channels; c++)
+                {
+                    // Parameters
+                    const auto offsetChannel = (n * channels + c);
+                    for (auto part = 0; part < numberParts; part++)
+                    {
+                        auto* targetPtrOffsetted = targetPtr + (offsetChannel + part) * numberSubparts;
+                        const auto* const sourcePtrOffsetted = sourcePtr + (offsetChannel + part) * imageOffset;
+                        cv::Mat source(cv::Size(width, height), CV_32FC1, const_cast<T*>(sourcePtrOffsetted));
+                        double minVal, maxVal;
+                        cv::Point minLoc, maxLoc;
+                        cv::minMaxLoc(source, &minVal, &maxVal, &minLoc, &maxLoc);
+                        targetPtrOffsetted[0] = maxLoc.x;
+                        targetPtrOffsetted[1] = maxLoc.y;
+                        targetPtrOffsetted[2] = maxVal;
+                    }
+                }
+            }
         }
         catch (const std::exception& e)
         {
diff --git a/src/openpose/core/netCaffe.cpp b/src/openpose/core/netCaffe.cpp
index 00fd4c30a..1452c41f2 100644
--- a/src/openpose/core/netCaffe.cpp
+++ b/src/openpose/core/netCaffe.cpp
@@ -64,7 +64,9 @@ namespace op
             {
                 caffeNet->blobs()[0]->Reshape(dimensions);
                 caffeNet->Reshape();
-                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #ifdef USE_CUDA
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #endif
             }
             catch (const std::exception& e)
             {
@@ -116,13 +118,17 @@ namespace op
                 #endif
                 upImpl->upCaffeNet.reset(new caffe::Net<float>{upImpl->mCaffeProto, caffe::TEST});
                 upImpl->upCaffeNet->CopyTrainedLayersFrom(upImpl->mCaffeTrainedModel);
-                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #ifdef USE_CUDA
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #endif
                 // Set spOutputBlob
                 upImpl->spOutputBlob = upImpl->upCaffeNet->blob_by_name(upImpl->mLastBlobName);
                 if (upImpl->spOutputBlob == nullptr)
                     error("The output blob is a nullptr. Did you use the same name than the prototxt? (Used: "
                           + upImpl->mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__);
-                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #ifdef USE_CUDA
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #endif
             #endif
         }
         catch (const std::exception& e)
diff --git a/src/openpose/core/nmsBase.cpp b/src/openpose/core/nmsBase.cpp
index db9995a32..74b4a3b0f 100644
--- a/src/openpose/core/nmsBase.cpp
+++ b/src/openpose/core/nmsBase.cpp
@@ -1,58 +1,164 @@
 #include <openpose/core/nmsBase.hpp>
+#include <opencv2/opencv.hpp>
 
 namespace op
 {
     template <typename T>
-    void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize)
+    void nmsRegisterKernelCPU(int* kernelPtr, const T* const sourcePtr, const int w, const int h,
+                              const T& threshold, const int x, const int y)
+    {
+        // Writes kernelPtr[y*w + x] = 1 if the pixel is a local maximum above `threshold`, otherwise 0.
+        // Three scenarios: (a) inner pixels (1 < x < w-2 and 1 < y < h-2) must be strictly greater than
+        // their 8 neighbours; (b) pixels with x or y on the 1st inner border (1, w-2 or h-2) use >= and
+        // replace out-of-image neighbours by `threshold`, because cv::resize adds artifacts there that give
+        // two maxima side by side (e.g. [1 1 0.8 0.8 0.5 ..] vs. CUDA [0.8 1 0.8 0.8 0.5 ..]); (c) rest = 0.
+
+        const auto index = y*w + x;
+        if (1 < x && x < (w-2) && 1 < y && y < (h-2))
+        {
+            const auto value = sourcePtr[index];
+            if (value > threshold)
+            {
+                const auto topLeft     = sourcePtr[(y-1)*w + x-1];
+                const auto top         = sourcePtr[(y-1)*w + x];
+                const auto topRight    = sourcePtr[(y-1)*w + x+1];
+                const auto left        = sourcePtr[    y*w + x-1];
+                const auto right       = sourcePtr[    y*w + x+1];
+                const auto bottomLeft  = sourcePtr[(y+1)*w + x-1];
+                const auto bottom      = sourcePtr[(y+1)*w + x];
+                const auto bottomRight = sourcePtr[(y+1)*w + x+1];
+
+                if (value > topLeft && value > top && value > topRight
+                    && value > left && value > right
+                        && value > bottomLeft && value > bottom && value > bottomRight)
+                    kernelPtr[index] = 1;
+                else
+                    kernelPtr[index] = 0;
+            }
+            else
+                kernelPtr[index] = 0;
+        }
+        else if (x == 1 || x == (w-2) || y == 1 || y == (h-2))
+        {
+            //kernelPtr[index] = 0;
+            const auto value = sourcePtr[index];
+            if (value > threshold)
+            {
+                const auto topLeft      = ((0 < x && 0 < y)         ? sourcePtr[(y-1)*w + x-1]  : threshold);
+                const auto top          = (0 < y                    ? sourcePtr[(y-1)*w + x]    : threshold);
+                const auto topRight     = ((0 < y && x < (w-1))     ? sourcePtr[(y-1)*w + x+1]  : threshold);
+                const auto left         = (0 < x                    ? sourcePtr[    y*w + x-1]  : threshold);
+                const auto right        = (x < (w-1)                ? sourcePtr[y*w + x+1]      : threshold);
+                const auto bottomLeft   = ((y < (h-1) && 0 < x)     ? sourcePtr[(y+1)*w + x-1]  : threshold);
+                const auto bottom       = (y < (h-1)                ? sourcePtr[(y+1)*w + x]    : threshold);
+                const auto bottomRight  = ((x < (w-1) && y < (h-1)) ? sourcePtr[(y+1)*w + x+1]  : threshold);
+
+                if (value >= topLeft && value >= top && value >= topRight
+                    && value >= left && value >= right
+                        && value >= bottomLeft && value >= bottom && value >= bottomRight)
+                    kernelPtr[index] = 1;
+                else
+                    kernelPtr[index] = 0;
+            }
+            else
+                kernelPtr[index] = 0;
+        }
+        else
+            kernelPtr[index] = 0;
+    }
+
+    template <typename T>
+    void nmsAccuratePeakPosition(const T* const sourcePtr, const int& peakLocX, const int& peakLocY,
+                                 const int& width, const int& height, T* output)
+    { // Refines an integer peak (peakLocX, peakLocY): writes {weighted x, weighted y, peak score} to output[0..2]
+        T xAcc = 0.f;
+        T yAcc = 0.f;
+        T scoreAcc = 0.f;
+        const auto dWidth = 3; // Horizontal half-size of the window: dx in [-3, 3]
+        const auto dHeight = 3; // Vertical half-size of the window: dy in [-3, 3]
+        for (auto dy = -dHeight ; dy <= dHeight ; dy++)
+        {
+            const auto y = peakLocY + dy;
+            if (0 <= y && y < height) // Default height = 368
+            {
+                for (auto dx = -dWidth ; dx <= dWidth ; dx++)
+                {
+                    const auto x = peakLocX + dx;
+                    if (0 <= x && x < width) // Default width = 656
+                    {
+                        const auto score = sourcePtr[y * width + x];
+                        if (score > 0) // Only positive scores contribute to the weighted average
+                        {
+                            xAcc += x*score;
+                            yAcc += y*score;
+                            scoreAcc += score;
+                        }
+                    }
+                }
+            }
+        }
+
+        output[0] = xAcc / scoreAcc; // Score-weighted mean x. NOTE(review): no guard here for scoreAcc == 0
+        output[1] = yAcc / scoreAcc; // Score-weighted mean y
+        output[2] = sourcePtr[peakLocY*width + peakLocX]; // Raw score at the integer peak location
+    }
+
+    template <typename T>
+    void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
+                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize)
     {
         try
         {
-            UNUSED(targetPtr);
-            UNUSED(kernelPtr);
-            UNUSED(sourcePtr);
-            UNUSED(threshold);
-            UNUSED(targetSize);
-            UNUSED(sourceSize);
-            error("CPU version not completely implemented.", __LINE__, __FUNCTION__, __FILE__);
+            // Security checks
+            if (sourceSize.empty())
+                error("sourceSize cannot be empty.", __LINE__, __FUNCTION__, __FILE__);
+            if (targetSize.empty())
+                error("targetSize cannot be empty.", __LINE__, __FUNCTION__, __FILE__);
+            if (threshold < 0 || threshold > 1.0)
+                error("threshold value invalid.", __LINE__, __FUNCTION__, __FILE__);
+
+            // Params
+            const auto channels = targetSize[1]; // 57
+            const auto sourceHeight = sourceSize[2]; // 368
+            const auto sourceWidth = sourceSize[3]; // 496
+            const auto targetPeaks = targetSize[2]; // 97
+            const auto targetPeakVec = targetSize[3]; // 3
+            const auto sourceChannelOffset = sourceWidth * sourceHeight;
+            const auto targetChannelOffset = targetPeaks * targetPeakVec;
+
+            // Per channel operation
+            for (auto c = 0 ; c < channels ; c++)
+            {
+                auto* currKernelPtr = &kernelPtr[c*sourceChannelOffset];
+                const T* currSourcePtr = &sourcePtr[c*sourceChannelOffset];
 
-            // TODO: THIS CODE IS WORKING, BUT IT DOES NOT CONSIDER THE MAX NUMBER OF PEAKS
-            // const int num = bottom->shape(0);
-            // //const int channel = bottom->shape(1);
-            // const int oriSpatialHeight = bottom->shape(2);
-            // const int oriSpatialWidth = bottom->shape(3);
+                for (auto y = 0; y < sourceHeight; y++)
+                    for (auto x = 0; x < sourceWidth; x++)
+                        nmsRegisterKernelCPU(currKernelPtr, currSourcePtr, sourceWidth, sourceHeight, threshold, x, y);
 
-            // T* dst_pointer = top->mutable_cpu_data();
-            // const T* const src_pointer = bottom->cpu_data();
-            // const int offset2 = oriSpatialHeight * oriSpatialWidth;
-            // const int offset2_dst = (mMaxPeaks+1)*2;
+                auto currentPeakCount = 1;
+                auto* currTargetPtr = &targetPtr[c*targetChannelOffset];
+                for (auto y = 0; y < sourceHeight; y++)
+                {
+                    for (auto x = 0; x < sourceWidth; x++)
+                    {
+                        const auto index = y*sourceWidth + x;
+                        // Find high intensity points
+                        if (currentPeakCount < targetPeaks)
+                        {
+                            if (currKernelPtr[index] == 1)
+                            {
+                                // Accurate peak position, stored in this peak's slot (stride = targetPeakVec)
+                                nmsAccuratePeakPosition(currSourcePtr, x, y, sourceWidth, sourceHeight,
+                                                        &currTargetPtr[currentPeakCount*targetPeakVec]);
+                                currentPeakCount++;
+                            }
+                        }
 
-            //stupid method
-            // for (int n = 0; n < num; n++)
-            // {
-            //     //assume only one channel
-            //     int peakCount = 0;
-            //     for (int y = 0; y < oriSpatialHeight; y++)
-            //     {
-            //         for (int x = 0; x < oriSpatialWidth; x++)
-            //         {
-            //             const T value = src_pointer[n * offset2 + y*oriSpatialWidth + x];
-            //             if (value >= mThreshold)
-            //             {
-            //                 const T top = (y == 0) ? 0 : src_pointer[n * offset2 + (y-1)*oriSpatialWidth + x];
-            //                 const T bottom = (y == oriSpatialHeight - 1) ? 0 : src_pointer[n * offset2 + (y+1)*oriSpatialWidth + x];
-            //                 const T left = (x == 0) ? 0 : src_pointer[n * offset2 + y*oriSpatialWidth + (x-1)];
-            //                 const T right = (x == oriSpatialWidth - 1) ? 0 : src_pointer[n * offset2 + y*oriSpatialWidth + (x+1)];
-            //                 if (value > top && value > bottom && value > left && value > right)
-            //                 {
-            //                     dst_pointer[n*offset2_dst + (peakCount + 1) * 2] = x;
-            //                     dst_pointer[n*offset2_dst + (peakCount + 1) * 2 + 1] = y;
-            //                     peakCount++;
-            //                 }
-            //             }
-            //         }
-            //     }
-            //     dst_pointer[n*offset2_dst] = peakCount;
-            // }
+                    }
+                }
+                currTargetPtr[0] = currentPeakCount-1;
+            }
         }
         catch (const std::exception& e)
         {
@@ -60,6 +166,8 @@ namespace op
         }
     }
 
-    template void nmsCpu(float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
-    template void nmsCpu(double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+    template void nmsCpu(float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold,
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+    template void nmsCpu(double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold,
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
 }
diff --git a/src/openpose/core/point.cpp b/src/openpose/core/point.cpp
index dcf42dbd2..a2398b638 100644
--- a/src/openpose/core/point.cpp
+++ b/src/openpose/core/point.cpp
@@ -1,6 +1,6 @@
-#include <openpose/core/macros.hpp>
 #include <openpose/utilities/errorAndLog.hpp>
 #include <openpose/core/point.hpp>
+#include <ostream>
 
 namespace op
 {
diff --git a/src/openpose/core/rectangle.cpp b/src/openpose/core/rectangle.cpp
index 48938fc63..29670f9cc 100644
--- a/src/openpose/core/rectangle.cpp
+++ b/src/openpose/core/rectangle.cpp
@@ -1,4 +1,3 @@
-#include <openpose/core/macros.hpp>
 #include <openpose/utilities/errorAndLog.hpp>
 #include <openpose/core/rectangle.hpp>
 
diff --git a/src/openpose/core/resizeAndMergeBase.cpp b/src/openpose/core/resizeAndMergeBase.cpp
index 18da77fbb..bdbfa08ec 100644
--- a/src/openpose/core/resizeAndMergeBase.cpp
+++ b/src/openpose/core/resizeAndMergeBase.cpp
@@ -1,4 +1,7 @@
-// #include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <openpose/utilities/fastMath.hpp>
+#include <openpose/utilities/openCv.hpp>
 #include <openpose/core/resizeAndMergeBase.hpp>
 
 namespace op
@@ -11,50 +14,98 @@ namespace op
     {
         try
         {
-            UNUSED(targetPtr);
-            UNUSED(sourcePtrs);
-            UNUSED(scaleInputToNetInputs);
-            UNUSED(targetSize);
-            UNUSED(sourceSizes);
-            error("CPU version not completely implemented.", __LINE__, __FUNCTION__, __FILE__);
+            // Security checks
+            if (sourceSizes.empty())
+                error("sourceSizes cannot be empty.", __LINE__, __FUNCTION__, __FILE__);
+            if (sourcePtrs.size() != sourceSizes.size() || sourceSizes.size() != scaleInputToNetInputs.size())
+                error("Size(sourcePtrs) must match size(sourceSizes) and size(scaleInputToNetInputs). Currently: "
+                      + std::to_string(sourcePtrs.size()) + " vs. " + std::to_string(sourceSizes.size()) + " vs. "
+                      + std::to_string(scaleInputToNetInputs.size()) + ".", __LINE__, __FUNCTION__, __FILE__);
 
-            // TODO: THIS CODE IS WORKING, BUT IT DOES NOT CONSIDER THE SCALES (I.E. SCALE NUMBER, START AND GAP) 
-            // const int num = bottom->shape(0);
-            // const int channel = bottom->shape(1);
-            // const int sourceHeight = bottom->shape(2);
-            // const int sourceWidth = bottom->shape(3);
-            // const int targetHeight = top->shape(2);
-            // const int targetWidth = top->shape(3);
+            // Params
+            const auto nums = (signed)sourceSizes.size();
+            const auto channels = targetSize[1]; // 57
+            const auto targetHeight = targetSize[2]; // 368
+            const auto targetWidth = targetSize[3]; // 496
+            const auto targetChannelOffset = targetWidth * targetHeight;
 
-            // //stupid method
-            // for (int n = 0; n < num; n++)
-            // {
-            //     for (int c = 0; c < channel; c++)
-            //     {
-            //         //fill source
-            //         cv::Mat source(sourceWidth, sourceHeight, CV_32FC1);
-            //         const auto sourceOffsetChannel = sourceHeight * sourceWidth;
-            //         const auto sourceOffsetNum = sourceOffsetChannel * channel;
-            //         const auto sourceOffset = n*sourceOffsetNum + c*sourceOffsetChannel;
-            //         const T* const sourcePtrs = bottom->cpu_data();
-            //         for (int y = 0; y < sourceHeight; y++)
-            //             for (int x = 0; x < sourceWidth; x++)
-            //                 source.at<T>(x,y) = sourcePtrs[sourceOffset + y*sourceWidth + x];
+            // No multi-scale merging or no merging required
+            if (sourceSizes.size() == 1)
+            {
+                // Params
+                const auto& sourceSize = sourceSizes[0];
+                const auto sourceHeight = sourceSize[2]; // 368/8 ..
+                const auto sourceWidth = sourceSize[3]; // 496/8 ..
+                const auto sourceChannelOffset = sourceHeight * sourceWidth;
+                if (sourceSize[0] != 1)
+                    error("It should never reache this point. Notify us otherwise.",
+                          __LINE__, __FUNCTION__, __FILE__);
 
-            //         // spatial resize
-            //         cv::Mat target;
-            //         cv::resize(source, target, {targetWidth, targetHeight}, 0, 0, CV_INTER_CUBIC);
+                // Per channel resize
+                const T* sourcePtr = sourcePtrs[0];
+                for (auto c = 0 ; c < channels ; c++)
+                {
+                    cv::Mat source(cv::Size(sourceWidth, sourceHeight), CV_32FC1,
+                                   const_cast<T*>(&sourcePtr[c*sourceChannelOffset]));
+                    cv::Mat target(cv::Size(targetWidth, targetHeight), CV_32FC1,
+                                   (&targetPtr[c*targetChannelOffset]));
+                    cv::resize(source, target, {targetWidth, targetHeight}, 0, 0, CV_INTER_CUBIC);
+                }
+            }
+            // Multi-scale merging
+            else
+            {
+                // Construct temp targets. We reuse targetPtr to store first scale
+                std::vector<std::unique_ptr<T[]>> tempTargetPtrs; // T[] so the buffers are released with delete[]
+                for (auto n = 1; n < nums; n++){
+                    tempTargetPtrs.emplace_back(std::unique_ptr<T[]>(new T[targetChannelOffset * channels]()));
+                }
 
-            //         //fill top
-            //         const auto targetOffsetChannel = targetHeight * targetWidth;
-            //         const auto targetOffsetNum = targetOffsetChannel * channel;
-            //         const auto targetOffset = n*targetOffsetNum + c*targetOffsetChannel;
-            //         T* targetPtr = top->mutable_cpu_data();
-            //         for (int y = 0; y < targetHeight; y++)
-            //             for (int x = 0; x < targetWidth; x++)
-            //                 targetPtr[targetOffset + y*targetWidth + x] = target.at<T>(x,y);
-            //     }
-            // }
+                // Resize and sum
+                for (auto n = 0; n < nums; n++){
+
+                    // Params
+                    const auto& sourceSize = sourceSizes[n];
+                    const auto sourceHeight = sourceSize[2]; // 368/6 ..
+                    const auto sourceWidth = sourceSize[3]; // 496/8 ..
+                    const auto sourceChannelOffset = sourceHeight * sourceWidth;
+
+                    // Access pointers
+                    const T* sourcePtr = sourcePtrs[n];
+                    T* tempTargetPtr;
+                    if(n != 0)
+                        tempTargetPtr = tempTargetPtrs[n-1].get();
+                    else
+                        tempTargetPtr = targetPtr;
+
+                    T* firstTempTargetPtr = targetPtr;
+                    for (auto c = 0 ; c < channels ; c++)
+                    {
+                        // Resize
+                        cv::Mat source(cv::Size(sourceWidth, sourceHeight), CV_32FC1,
+                                       const_cast<T*>(&sourcePtr[c*sourceChannelOffset]));
+                        cv::Mat target(cv::Size(targetWidth, targetHeight), CV_32FC1,
+                                       (&tempTargetPtr[c*targetChannelOffset]));
+                        cv::resize(source, target, {targetWidth, targetHeight}, 0, 0, CV_INTER_CUBIC);
+
+                        // Add
+                        if (n != 0)
+                        {
+                            cv::Mat addTarget(cv::Size(targetWidth, targetHeight), CV_32FC1,
+                                              (&firstTempTargetPtr[c*targetChannelOffset]));
+                            cv::add(target, addTarget, addTarget);
+                        }
+                    }
+                }
+
+                // Average
+                for (auto c = 0 ; c < channels ; c++)
+                {
+                    cv::Mat target(cv::Size(targetWidth, targetHeight), CV_32FC1, (&targetPtr[c*targetChannelOffset]));
+                    target /= (float)nums;
+                }
+
+            }
         }
         catch (const std::exception& e)
         {
diff --git a/src/openpose/face/CMakeLists.txt b/src/openpose/face/CMakeLists.txt
index 421e59df7..d8a3625fb 100644
--- a/src/openpose/face/CMakeLists.txt
+++ b/src/openpose/face/CMakeLists.txt
@@ -15,7 +15,11 @@ set(SOURCES_OP_FACE_WITH_CP ${SOURCES_OP_FACE_WITH_CP} PARENT_SCOPE)
 set(SOURCES_OPENPOSE ${SOURCES_OPENPOSE} ${SOURCES_OP_FACE_WITH_CP} PARENT_SCOPE)
 
 if (UNIX AND NOT APPLE)
-  cuda_add_library(openpose_face ${SOURCES_OP_FACE})
+  if (GPU_MODE MATCHES "CUDA") # if() dereferences the name itself; safe when GPU_MODE is empty or unset
+    cuda_add_library(openpose_face ${SOURCES_OP_FACE}) # CUDA build
+  else()
+    add_library(openpose_face ${SOURCES_OP_FACE}) # Non-CUDA build: plain library target
+  endif ()
   
   if (BUILD_CAFFE)
     add_dependencies(openpose_face openpose_caffe)
diff --git a/src/openpose/face/faceExtractorCaffe.cpp b/src/openpose/face/faceExtractorCaffe.cpp
index 372ff4e1b..236a8e501 100644
--- a/src/openpose/face/faceExtractorCaffe.cpp
+++ b/src/openpose/face/faceExtractorCaffe.cpp
@@ -1,4 +1,4 @@
-#if defined USE_CAFFE && defined USE_CUDA
+#if defined USE_CAFFE
     #include <caffe/blob.hpp>
 #endif
 #include <opencv2/opencv.hpp> // CV_WARP_INVERSE_MAP, CV_INTER_LINEAR
@@ -15,7 +15,7 @@ namespace op
 {
     struct FaceExtractorCaffe::ImplFaceExtractorCaffe
     {
-        #if defined USE_CAFFE && defined USE_CUDA
+        #if defined USE_CAFFE
             bool netInitialized;
             std::shared_ptr<NetCaffe> spNetCaffe;
             std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
@@ -36,7 +36,7 @@ namespace op
         #endif
     };
 
-    #if defined USE_CAFFE && defined USE_CUDA
+    #if defined USE_CAFFE
         void updateFaceHeatMapsForPerson(Array<float>& heatMaps, const int person, const ScaleMode heatMapScaleMode,
                                          const float* heatMapsGpuPtr)
         {
@@ -47,8 +47,13 @@ namespace op
                 const auto volumeBodyParts = FACE_NUMBER_PARTS * channelOffset;
                 auto totalOffset = 0u;
                 auto* heatMapsPtr = &heatMaps.getPtr()[person*volumeBodyParts];
-                // Copy face parts
-                cudaMemcpy(heatMapsPtr, heatMapsGpuPtr, volumeBodyParts * sizeof(float), cudaMemcpyDeviceToHost);
+                // Copy face parts
+                #ifdef USE_CUDA
+                    cudaMemcpy(heatMapsPtr, heatMapsGpuPtr, volumeBodyParts * sizeof(float), cudaMemcpyDeviceToHost);
+                #else
+                    // Without CUDA, the caller passes the blob's cpu_data(), so a host-to-host copy is enough
+                    std::copy(heatMapsGpuPtr, heatMapsGpuPtr + volumeBodyParts, heatMapsPtr);
+                #endif
                 // Change from [0,1] to [-1,1]
                 if (heatMapScaleMode == ScaleMode::PlusMinusOne)
                     for (auto i = 0u ; i < volumeBodyParts ; i++)
@@ -84,7 +89,9 @@ namespace op
                 // Pose extractor blob and layer
                 maximumCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()});
                 // Cuda check
-                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #ifdef USE_CUDA
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #endif
             }
             catch (const std::exception& e)
             {
@@ -98,13 +105,13 @@ namespace op
                                            const std::vector<HeatMapType>& heatMapTypes,
                                            const ScaleMode heatMapScale, const bool enableGoogleLogging) :
         FaceExtractor{netInputSize, netOutputSize, heatMapTypes, heatMapScale}
-        #if defined USE_CAFFE && defined USE_CUDA
+        #if defined USE_CAFFE
         , upImpl{new ImplFaceExtractorCaffe{modelFolder, gpuId, enableGoogleLogging}}
         #endif
     {
         try
         {
-            #if !defined USE_CAFFE || !defined USE_CUDA
+            #if !defined USE_CAFFE
                 UNUSED(netInputSize);
                 UNUSED(netOutputSize);
                 UNUSED(modelFolder);
@@ -129,17 +136,21 @@ namespace op
     {
         try
         {
-            #if defined USE_CAFFE && defined USE_CUDA
+            #if defined USE_CAFFE
                 // Logging
                 log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
                 // Initialize Caffe net
                 upImpl->spNetCaffe->initializationOnThread();
-                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #ifdef USE_CUDA
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #endif
                 // Initialize blobs
                 upImpl->spCaffeNetOutputBlob = upImpl->spNetCaffe->getOutputBlob();
                 upImpl->spHeatMapsBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
                 upImpl->spPeaksBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
-                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #ifdef USE_CUDA
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #endif
                 // Logging
                 log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
             #endif
@@ -156,7 +167,7 @@ namespace op
     {
         try
         {
-            #if defined USE_CAFFE && defined USE_CUDA
+            #if defined USE_CAFFE
                 if (!faceRectangles.empty())
                 {
                     // Security checks
@@ -273,9 +284,15 @@ namespace op
                                 mFaceKeypoints[baseIndex+2] = score;
                             }
                             // HeatMaps: storing
-                            if (!mHeatMapTypes.empty())
-                                updateFaceHeatMapsForPerson(mHeatMaps, person, mHeatMapScaleMode,
-                                                            upImpl->spHeatMapsBlob->gpu_data());
+                            if (!mHeatMapTypes.empty())
+                            {
+                                #ifdef USE_CUDA
+                                    const auto* dataPtr = upImpl->spHeatMapsBlob->gpu_data(); // Device memory
+                                #else
+                                    const auto* dataPtr = upImpl->spHeatMapsBlob->cpu_data(); // Host memory
+                                #endif
+                                updateFaceHeatMapsForPerson(mHeatMaps, person, mHeatMapScaleMode, dataPtr);
+                            }
                         }
                     }
                     // // Debugging
diff --git a/src/openpose/hand/CMakeLists.txt b/src/openpose/hand/CMakeLists.txt
index 52b95c2aa..59d7dd4d1 100644
--- a/src/openpose/hand/CMakeLists.txt
+++ b/src/openpose/hand/CMakeLists.txt
@@ -15,7 +15,11 @@ set(SOURCES_OP_HAND_WITH_CP ${SOURCES_OP_HAND_WITH_CP} PARENT_SCOPE)
 set(SOURCES_OPENPOSE ${SOURCES_OPENPOSE} ${SOURCES_OP_HAND_WITH_CP} PARENT_SCOPE)
 
 if (UNIX AND NOT APPLE)
-  cuda_add_library(openpose_hand ${SOURCES_OP_HAND})
+  if (GPU_MODE MATCHES "CUDA") # if() dereferences the name itself; safe when GPU_MODE is empty or unset
+    cuda_add_library(openpose_hand ${SOURCES_OP_HAND}) # CUDA build
+  else()
+    add_library(openpose_hand ${SOURCES_OP_HAND}) # Non-CUDA build: plain library target
+  endif ()
   
   if (BUILD_CAFFE)
     add_dependencies(openpose_hand openpose_caffe)
diff --git a/src/openpose/hand/handExtractorCaffe.cpp b/src/openpose/hand/handExtractorCaffe.cpp
index 26ff909a6..c0d2a466b 100644
--- a/src/openpose/hand/handExtractorCaffe.cpp
+++ b/src/openpose/hand/handExtractorCaffe.cpp
@@ -1,4 +1,4 @@
-#if defined USE_CAFFE && defined USE_CUDA
+#if defined USE_CAFFE
     #include <caffe/blob.hpp>
 #endif
 #include <opencv2/opencv.hpp> // CV_WARP_INVERSE_MAP, CV_INTER_LINEAR
@@ -16,7 +16,7 @@ namespace op
 {
     struct HandExtractorCaffe::ImplHandExtractorCaffe
     {
-        #if defined USE_CAFFE && defined USE_CUDA
+        #if defined USE_CAFFE
             bool netInitialized;
             std::shared_ptr<NetCaffe> spNetCaffe;
             std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
@@ -38,7 +38,7 @@ namespace op
         #endif
     };
 
-    #if defined USE_CAFFE && defined USE_CUDA
+    #if defined USE_CAFFE
         void cropFrame(Array<float>& handImageCrop, cv::Mat& affineMatrix, const cv::Mat& cvInputData,
                        const Rectangle<float>& handRectangle, const int netInputSide,
                        const Point<int>& netOutputSize, const bool mirrorImage)
@@ -134,7 +134,12 @@ namespace op
                 auto totalOffset = 0u;
                 auto* heatMapsPtr = &heatMaps.getPtr()[person*volumeBodyParts];
                 // Copy hand parts
-                cudaMemcpy(heatMapsPtr, heatMapsGpuPtr, volumeBodyParts * sizeof(float), cudaMemcpyDeviceToHost);
+                #ifdef USE_CUDA
+                    cudaMemcpy(heatMapsPtr, heatMapsGpuPtr, volumeBodyParts * sizeof(float), cudaMemcpyDeviceToHost);
+                #else
+                    // Without CUDA, the caller passes the blob's cpu_data(), so a host-to-host copy is enough
+                    std::copy(heatMapsGpuPtr, heatMapsGpuPtr + volumeBodyParts, heatMapsPtr);
+                #endif
                 // Change from [0,1] to [-1,1]
                 if (heatMapScaleMode == ScaleMode::PlusMinusOne)
                     for (auto i = 0u ; i < volumeBodyParts ; i++)
@@ -170,7 +175,9 @@ namespace op
                 // Pose extractor blob and layer
                 maximumCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()});
                 // Cuda check
-                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #ifdef USE_CUDA
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #endif
             }
             catch (const std::exception& e)
             {
@@ -186,13 +193,13 @@ namespace op
                                            const ScaleMode heatMapScale,
                                            const bool enableGoogleLogging) :
         HandExtractor{netInputSize, netOutputSize, numberScales, rangeScales, heatMapTypes, heatMapScale}
-        #if defined USE_CAFFE && defined USE_CUDA
+        #if defined USE_CAFFE
         , upImpl{new ImplHandExtractorCaffe{modelFolder, gpuId, enableGoogleLogging}}
         #endif
     {
         try
         {
-            #if !defined USE_CAFFE || !defined USE_CUDA
+            #if !defined USE_CAFFE
                 UNUSED(netInputSize);
                 UNUSED(netOutputSize);
                 UNUSED(modelFolder);
@@ -219,17 +226,21 @@ namespace op
     {
         try
         {
-            #if defined USE_CAFFE && defined USE_CUDA
+            #if defined USE_CAFFE
                 // Logging
                 log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
                 // Initialize Caffe net
                 upImpl->spNetCaffe->initializationOnThread();
-                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #ifdef USE_CUDA
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #endif
                 // Initialize blobs
                 upImpl->spCaffeNetOutputBlob = upImpl->spNetCaffe->getOutputBlob();
                 upImpl->spHeatMapsBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
                 upImpl->spPeaksBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
-                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #ifdef USE_CUDA
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #endif
                 // Logging
                 log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
             #endif
@@ -246,7 +257,7 @@ namespace op
     {
         try
         {
-            #if defined USE_CAFFE && defined USE_CUDA
+            #if defined USE_CAFFE
                 if (!handRectangles.empty())
                 {
                     // Security checks
@@ -353,9 +364,15 @@ namespace op
                                     }
                                 }
                                 // HeatMaps: storing
-                                if (!mHeatMapTypes.empty())
-                                    updateHandHeatMapsForPerson(mHeatMaps[hand], person, mHeatMapScaleMode,
-                                                                upImpl->spHeatMapsBlob->gpu_data());
+                                if (!mHeatMapTypes.empty())
+                                {
+                                    #ifdef USE_CUDA
+                                        const auto* dataPtr = upImpl->spHeatMapsBlob->gpu_data(); // Device memory
+                                    #else
+                                        const auto* dataPtr = upImpl->spHeatMapsBlob->cpu_data(); // Host memory
+                                    #endif
+                                    updateHandHeatMapsForPerson(mHeatMaps[hand], person, mHeatMapScaleMode, dataPtr);
+                                }
                             }
                         }
                     }
@@ -384,7 +401,7 @@ namespace op
     {
         try
         {
-            #if defined USE_CAFFE && defined USE_CUDA
+            #if defined USE_CAFFE
                 // 1. Deep net
                 upImpl->spNetCaffe->forwardPass(mHandImageCrop);
 
diff --git a/src/openpose/pose/CMakeLists.txt b/src/openpose/pose/CMakeLists.txt
index 684ecfddd..df4fcf60e 100644
--- a/src/openpose/pose/CMakeLists.txt
+++ b/src/openpose/pose/CMakeLists.txt
@@ -19,7 +19,11 @@ set(SOURCES_OP_POSE_WITH_CP ${SOURCES_OP_POSE_WITH_CP} PARENT_SCOPE)
 set(SOURCES_OPENPOSE ${SOURCES_OPENPOSE} ${SOURCES_OP_POSE_WITH_CP} PARENT_SCOPE)
 
 if (UNIX AND NOT APPLE)
-  cuda_add_library(openpose_pose ${SOURCES_OP_POSE})
+  if (GPU_MODE MATCHES "CUDA") # if() dereferences the name itself; safe when GPU_MODE is empty or unset
+    cuda_add_library(openpose_pose ${SOURCES_OP_POSE}) # CUDA build
+  else()
+    add_library(openpose_pose ${SOURCES_OP_POSE}) # Non-CUDA build: plain library target
+  endif ()
   
   if (BUILD_CAFFE)
     add_dependencies(openpose_pose openpose_caffe)
diff --git a/src/openpose/pose/poseExtractorCaffe.cpp b/src/openpose/pose/poseExtractorCaffe.cpp
index db9e2ad64..f03c4c2df 100644
--- a/src/openpose/pose/poseExtractorCaffe.cpp
+++ b/src/openpose/pose/poseExtractorCaffe.cpp
@@ -80,6 +80,7 @@ namespace op
             try
             {
                 // HeatMaps extractor blob and layer
+                // Caffe modifies bottom - Heatmap gets resized
                 const auto caffeNetOutputBlobs = caffeNetSharedToPtr(caffeNetOutputBlob);
                 resizeAndMergeCaffe->Reshape(caffeNetOutputBlobs, {heatMapsBlob.get()},
                                              getPoseNetDecreaseFactor(poseModel), 1.f/scaleInputToNetInput);
@@ -227,6 +228,7 @@ namespace op
                     // Reshape blobs if required
                     // Note: In order to resize to input size to have same results as Matlab, uncomment the commented
                     // lines
+                    // Note: Also required for dynamic sizes (e.g., a folder with images of different aspect ratios)
                     if (!vectorsAreEqual(upImpl->mNetInput4DSizes.at(i), inputNetData[i].getSize()))
                         // || !vectorsAreEqual(upImpl->mScaleInputToNetInputs, scaleInputToNetInputs))
                     {
@@ -247,11 +249,9 @@ namespace op
                 const std::vector<float> floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end());
                 upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios);
                 #ifdef USE_CUDA
-                    upImpl->spResizeAndMergeCaffe->Forward_gpu(caffeNetOutputBlobs,                             // ~5ms
-                                                               {upImpl->spHeatMapsBlob.get()});
-                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                    upImpl->spResizeAndMergeCaffe->Forward_gpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~5ms
                 #else
-                    error("ResizeAndMergeCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__);
+                    upImpl->spResizeAndMergeCaffe->Forward_cpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~20ms
                 #endif
 
                 // 3. Get peaks by Non-Maximum Suppression
@@ -260,7 +260,7 @@ namespace op
                     upImpl->spNmsCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});// ~2ms
                     cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                 #else
-                    error("NmsCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__);
+                    upImpl->spNmsCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~ 7ms
                 #endif
 
                 // Get scale net to output (i.e. image input)
diff --git a/src/openpose/utilities/CMakeLists.txt b/src/openpose/utilities/CMakeLists.txt
index 20256741c..08e8e7cc0 100644
--- a/src/openpose/utilities/CMakeLists.txt
+++ b/src/openpose/utilities/CMakeLists.txt
@@ -14,7 +14,12 @@ set(SOURCES_OP_UTILITIES_WITH_CP ${SOURCES_OP_UTILITIES_WITH_CP} PARENT_SCOPE)
 set(SOURCES_OPENPOSE ${SOURCES_OPENPOSE} ${SOURCES_OP_UTILITIES_WITH_CP} PARENT_SCOPE)
 
 if (UNIX AND NOT APPLE)
-  cuda_add_library(openpose_utilities ${SOURCES_OP_UTILITIES})
+  if (GPU_MODE MATCHES "CUDA") # if() dereferences the name itself; safe when GPU_MODE is empty or unset
+    cuda_add_library(openpose_utilities ${SOURCES_OP_UTILITIES}) # CUDA build
+  else()
+    add_library(openpose_utilities ${SOURCES_OP_UTILITIES}) # Non-CUDA build: plain library target
+  endif ()
+
   target_link_libraries(openpose_utilities openpose_producer openpose_filestream)
 
   install(TARGETS openpose_utilities