[RUNTIME] Add fp16/fp32 conversion functions (apache#1766)

yangulei · Sep 25, 2018 · bde5303 · bde5303
1 parent e8d6d9a
commit bde5303
Show file tree

Hide file tree

Showing 21 changed files with 301 additions and 48 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -1,9 +1,9 @@
 [submodule "dmlc-core"]
-	path = dmlc-core
+	path = 3rdparty/dmlc-core
 	url = https://github.com/dmlc/dmlc-core
 [submodule "HalideIR"]
-	path = HalideIR
+	path = 3rdparty/HalideIR
 	url = https://github.com/dmlc/HalideIR
 [submodule "dlpack"]
-	path = dlpack
+	path = 3rdparty/dlpack
 	url = https://github.com/dmlc/dlpack
diff --git a/HalideIR → 3rdparty/HalideIR b/HalideIR → 3rdparty/HalideIR
diff --git a/3rdparty/compiler-rt/builtin_fp16.h b/3rdparty/compiler-rt/builtin_fp16.h
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2009-2015 by llvm/compiler-rt contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+
+ * Copyright (c) 2018 by Contributors
+ * \file builtin_fp16.h
+ * \brief Functions for conversion between fp32 and fp16, adapted from compiler-rt.
+ */
+
+#include <cstdint>
+
+static inline uint32_t __clz(uint32_t x) {
+  /* Count the leading zero bits of the 32-bit value x.
+   *
+   * n starts at 32 and a binary search narrows x: whenever the upper part
+   * of the remaining window (y) is non-zero, x is replaced by it and n is
+   * reduced by the width that was shifted out.  x == 0 returns 32.
+   */
+  int n = 32;
+  uint32_t y;
+
+  y = x >>16; if (y) { n = n -16; x = y; }
+  y = x >> 8; if (y) { n = n - 8; x = y; }
+  y = x >> 4; if (y) { n = n - 4; x = y; }
+  y = x >> 2; if (y) { n = n - 2; x = y; }
+  y = x >> 1; if (y) return n - 2;  // x is 2 or 3 here: bit 1 is set
+  return n - x;  // x is 0 or 1 here
+}
+
+template <typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
+          typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
+static inline DST_T __truncXfYf2__(SRC_T a) {
+  /* Convert a from the wider format SRC_T to the narrower format DST_T by
+   * operating on the raw bit patterns, rounding to nearest with ties to even.
+   *
+   * Template parameters:
+   *   SRC_T / DST_T               source and destination value types; their
+   *                               sizes give the total bit widths.
+   *   SRC_REP_T / DST_REP_T       integer types holding the bit patterns.
+   *                               Presumably unsigned and as wide as SRC_T /
+   *                               DST_T (the code reinterprets the value
+   *                               through a union and relies on wraparound)
+   *                               -- TODO confirm at the call sites.
+   *   SRC_SIG_BITS / DST_SIG_BITS number of stored significand bits; the
+   *                               remaining bits are 1 sign bit plus the
+   *                               exponent field.
+   *
+   * Returns the DST_T whose bit pattern is the converted magnitude with the
+   * sign bit of a.  Magnitudes at or above the overflow threshold become
+   * infinity, NaN stays a (quiet) NaN with a truncated payload, and
+   * magnitudes below the smallest destination normal become a denormal or
+   * zero.
+   *
+   * Among the constants below, roundMask selects the significand bits that
+   * the narrowing shift drops and halfway is their value at an exact tie;
+   * underflow / overflow are the smallest source bit patterns whose exponent
+   * is, respectively, normal in the destination and equal to the
+   * destination's infinity exponent.  All of them follow from the type
+   * parameters.
+   */
+  // Any reasonable optimizer will fold and propagate all of these constants.
+  const int srcBits = sizeof(SRC_T) * 8;
+  const int srcExpBits = srcBits - SRC_SIG_BITS - 1;
+  const int srcInfExp = (1 << srcExpBits) - 1;
+  const int srcExpBias = srcInfExp >> 1;
+
+  const SRC_REP_T srcMinNormal = SRC_REP_T(1) << SRC_SIG_BITS;
+  const SRC_REP_T srcSignificandMask = srcMinNormal - 1;
+  const SRC_REP_T srcInfinity = (SRC_REP_T)srcInfExp << SRC_SIG_BITS;
+  const SRC_REP_T srcSignMask = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpBits);
+  const SRC_REP_T srcAbsMask = srcSignMask - 1;
+  const SRC_REP_T roundMask = (SRC_REP_T(1) << (SRC_SIG_BITS - DST_SIG_BITS)) - 1;
+  const SRC_REP_T halfway = SRC_REP_T(1) << (SRC_SIG_BITS - DST_SIG_BITS - 1);
+  const SRC_REP_T srcQNaN = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
+  const SRC_REP_T srcNaNCode = srcQNaN - 1;
+
+  const int dstBits = sizeof(DST_T) * 8;
+  const int dstExpBits = dstBits - DST_SIG_BITS - 1;
+  const int dstInfExp = (1 << dstExpBits) - 1;
+  const int dstExpBias = dstInfExp >> 1;
+
+  const int underflowExponent = srcExpBias + 1 - dstExpBias;
+  const int overflowExponent = srcExpBias + dstInfExp - dstExpBias;
+  const SRC_REP_T underflow = (SRC_REP_T)underflowExponent << SRC_SIG_BITS;
+  const SRC_REP_T overflow = (SRC_REP_T)overflowExponent << SRC_SIG_BITS;
+
+  const DST_REP_T dstQNaN = DST_REP_T(1) << (DST_SIG_BITS - 1);
+  const DST_REP_T dstNaNCode = dstQNaN - 1;
+
+  /* Break a into a sign and representation of the absolute value.
+   *
+   * NOTE(review): the union reinterprets the bits of a as SRC_REP_T, and the
+   * designated initializer {.f = a} is C99 / C++20 syntax.  Other files in
+   * this change build with -std=c++11, where this relies on a compiler
+   * extension -- confirm every target compiler accepts it.
+   */
+  const union { SRC_T f; SRC_REP_T i; } src_rep = {.f = a};
+  const SRC_REP_T aRep = src_rep.i;
+  const SRC_REP_T aAbs = aRep & srcAbsMask;
+  const SRC_REP_T sign = aRep & srcSignMask;
+  DST_REP_T absResult;
+
+  if (aAbs - underflow < aAbs - overflow) {  // wraparound: underflow <= aAbs < overflow
+    // The exponent of a is within the range of normal numbers in the
+    // destination format.  We can convert by simply right-shifting with
+    // rounding and adjusting the exponent.
+    absResult = aAbs >> (SRC_SIG_BITS - DST_SIG_BITS);
+    absResult -= (DST_REP_T)(srcExpBias - dstExpBias) << DST_SIG_BITS;
+
+    const SRC_REP_T roundBits = aAbs & roundMask;
+    // Round to nearest
+    if (roundBits > halfway)
+      absResult++;
+      // Ties to even
+    else if (roundBits == halfway)
+      absResult += absResult & 1;
+  }
+  else if (aAbs > srcInfinity) {
+    // a is NaN.
+    // Conjure the result by beginning with infinity, setting the qNaN
+    // bit and inserting the (truncated) trailing NaN field.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+    absResult |= dstQNaN;
+    absResult |= ((aAbs & srcNaNCode) >> (SRC_SIG_BITS - DST_SIG_BITS)) & dstNaNCode;
+  }
+  else if (aAbs >= overflow) {
+    // a overflows to infinity.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+  }
+  else {
+    // a underflows on conversion to the destination type or is an exact
+    // zero.  The result may be a denormal or zero.  Extract the exponent
+    // to get the shift amount for the denormalization.
+    const int aExp = aAbs >> SRC_SIG_BITS;
+    const int shift = srcExpBias - dstExpBias - aExp + 1;
+
+    const SRC_REP_T significand = (aRep & srcSignificandMask) | srcMinNormal;
+
+    /* Right shift by the denormalization amount with sticky.
+     * sticky is true when any of the low `shift` bits of the significand is
+     * set; it is ORed into bit 0 so the rounding below still sees that
+     * non-zero bits were shifted out.
+     */
+    if (shift > SRC_SIG_BITS) {  // every significand bit is shifted out
+      absResult = 0;
+    } else {
+      const bool sticky = significand << (srcBits - shift);
+      SRC_REP_T denormalizedSignificand = significand >> shift | sticky;
+      absResult = denormalizedSignificand >> (SRC_SIG_BITS - DST_SIG_BITS);
+      const SRC_REP_T roundBits = denormalizedSignificand & roundMask;
+      // Round to nearest
+      if (roundBits > halfway)
+        absResult++;
+        // Ties to even
+      else if (roundBits == halfway)
+        absResult += absResult & 1;
+    }
+  }
+
+  /* Apply the signbit to (DST_T)abs(a).
+   * >> binds tighter than |, so the source sign bit is first moved down to
+   * the destination's top bit and then merged into the magnitude.
+   */
+  const DST_REP_T result = absResult | sign >> (srcBits - dstBits);
+  const union { DST_T f; DST_REP_T i; } dst_rep = {.i = result};
+  return dst_rep.f;
+}
+
+template<typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
+         typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
+static inline DST_T __extendXfYf2__(SRC_T a) {
+  /* Convert a from the narrower format SRC_T to the wider format DST_T by
+   * operating on the raw bit patterns.  No rounding step is performed: the
+   * source significand is only shifted into the wider field.
+   *
+   * Template parameters:
+   *   SRC_T / DST_T               source and destination value types; their
+   *                               sizes give the total bit widths.
+   *   SRC_REP_T / DST_REP_T       integer types holding the bit patterns.
+   *                               Presumably unsigned and as wide as SRC_T /
+   *                               DST_T -- TODO confirm at the call sites.
+   *   SRC_SIG_BITS / DST_SIG_BITS number of stored significand bits; the
+   *                               remaining bits are 1 sign bit plus the
+   *                               exponent field.
+   *
+   * Returns the DST_T whose bit pattern is the converted magnitude with the
+   * sign bit of a.  Normal numbers are rebiased, infinity / NaN keep their
+   * quiet bit and payload, source denormals are renormalized, and zero
+   * stays zero.
+   *
+   * The constants below follow from the type parameters.
+   */
+  // Any reasonable optimizer will fold and propagate all of these constants.
+  const int srcBits = sizeof(SRC_T) * 8;
+  const int srcExpBits = srcBits - SRC_SIG_BITS - 1;
+  const int srcInfExp = (1 << srcExpBits) - 1;
+  const int srcExpBias = srcInfExp >> 1;
+
+  const SRC_REP_T srcMinNormal = SRC_REP_T(1) << SRC_SIG_BITS;
+  const SRC_REP_T srcInfinity = (SRC_REP_T)srcInfExp << SRC_SIG_BITS;
+  const SRC_REP_T srcSignMask = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpBits);
+  const SRC_REP_T srcAbsMask = srcSignMask - 1;
+  const SRC_REP_T srcQNaN = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
+  const SRC_REP_T srcNaNCode = srcQNaN - 1;
+
+  const int dstBits = sizeof(DST_T)*8;
+  const int dstExpBits = dstBits - DST_SIG_BITS - 1;
+  const int dstInfExp = (1 << dstExpBits) - 1;
+  const int dstExpBias = dstInfExp >> 1;
+
+  const DST_REP_T dstMinNormal = DST_REP_T(1) << DST_SIG_BITS;
+
+  /* Break a into a sign and representation of the absolute value.
+   *
+   * NOTE(review): the union reinterprets the bits of a as SRC_REP_T, and the
+   * designated initializer {.f = a} is C99 / C++20 syntax.  Other files in
+   * this change build with -std=c++11, where this relies on a compiler
+   * extension -- confirm every target compiler accepts it.
+   */
+  const union { SRC_T f; SRC_REP_T i; } src_rep = {.f = a};
+  const SRC_REP_T aRep = src_rep.i;
+  const SRC_REP_T aAbs = aRep & srcAbsMask;
+  const SRC_REP_T sign = aRep & srcSignMask;
+  DST_REP_T absResult;
+
+  /* Unsigned range check: with wraparound, the left-hand side is below the
+   * right-hand side exactly when srcMinNormal <= aAbs < srcInfinity.
+   * If sizeof(SRC_REP_T) < sizeof(int), the subtraction result is promoted
+   * to (signed) int and would not wrap.
+   */
+  // To avoid that, explicitly cast to SRC_REP_T.
+  if ((SRC_REP_T)(aAbs - srcMinNormal) < srcInfinity - srcMinNormal) {
+    // a is a normal number.
+    // Extend to the destination type by shifting the significand and
+    // exponent into the proper position and rebiasing the exponent.
+    absResult = (DST_REP_T)aAbs << (DST_SIG_BITS - SRC_SIG_BITS);
+    absResult += (DST_REP_T)(dstExpBias - srcExpBias) << DST_SIG_BITS;
+  }
+
+  else if (aAbs >= srcInfinity) {
+    // a is NaN or infinity.
+    // Conjure the result by beginning with infinity, then setting the qNaN
+    // bit (if needed) and shifting the rest of the trailing NaN payload
+    // field up by the same amount, keeping it adjacent to the qNaN bit.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+    absResult |= (DST_REP_T)(aAbs & srcQNaN) << (DST_SIG_BITS - SRC_SIG_BITS);
+    absResult |= (DST_REP_T)(aAbs & srcNaNCode) << (DST_SIG_BITS - SRC_SIG_BITS);
+  }
+  else if (aAbs) {
+    // a is denormal.
+    // renormalize the significand and clear the leading bit, then insert
+    // the correct adjusted exponent in the destination type.
+    const int scale = __clz(aAbs) - __clz(srcMinNormal);  // distance of the top set bit below srcMinNormal
+    absResult = (DST_REP_T)aAbs << (DST_SIG_BITS - SRC_SIG_BITS + scale);
+    absResult ^= dstMinNormal;  // the top set bit now sits at dstMinNormal; clear it
+    const int resultExponent = dstExpBias - srcExpBias - scale + 1;
+    absResult |= (DST_REP_T)resultExponent << DST_SIG_BITS;
+  }
+  else {
+    // a is zero.
+    absResult = 0;
+  }
+
+  /* Apply the signbit to (DST_T)abs(a).
+   * << binds tighter than |, so the source sign bit is first widened and
+   * moved up to the destination's top bit, then merged into the magnitude.
+   */
+  const DST_REP_T result = absResult | (DST_REP_T)sign << (dstBits - srcBits);
+  const union { DST_T f; DST_REP_T i; } dst_rep = {.i = result};
+  return dst_rep.f;
+}
diff --git a/dlpack → 3rdparty/dlpack b/dlpack → 3rdparty/dlpack
diff --git a/dmlc-core → 3rdparty/dmlc-core b/dmlc-core → 3rdparty/dmlc-core
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -50,8 +50,9 @@ tvm_option(USE_RANDOM "Build with random support" OFF)
 
 # include directories
 include_directories("include")
-include_directories("dlpack/include")
-include_directories("dmlc-core/include")
+include_directories("3rdparty/dlpack/include")
+include_directories("3rdparty/dmlc-core/include")
+include_directories("3rdparty/compiler-rt")
 
 # initial variables
 set(TVM_LINKER_LIBS "")
@@ -87,8 +88,8 @@ else(MSVC)
 endif(MSVC)
 
 # add source group
-FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "HalideIR/src/*.cpp" "nnvm/src/*.cc")
-FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h" "HalideIR/src/*.h"
+FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "3rdparty/HalideIR/src/*.cpp" "nnvm/src/*.cc")
+FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h" "3rdparty/HalideIR/src/*.h"
                                 "nnvm/src/*.h" "nnvm/include/*.h")
 assign_source_group("Source" ${GROUP_SOURCE})
 assign_source_group("Include" ${GROUP_INCLUDE})
@@ -127,7 +128,7 @@ file(GLOB_RECURSE NNVM_COMPILER_SRCS
 file(GLOB TOPI_SRCS
     topi/src/*.cc
 )
-file(GLOB_RECURSE HALIDEIR_SRCS HalideIR/src/*.cpp)
+file(GLOB_RECURSE HALIDEIR_SRCS 3rdparty/HalideIR/src/*.cpp)
 list(APPEND COMPILER_SRCS ${HALIDEIR_SRCS})
 file(GLOB RUNTIME_SRCS src/runtime/*.cc)
 
@@ -194,7 +195,7 @@ target_link_libraries(nnvm_compiler tvm)
 # Related headers
 target_include_directories(
   tvm
-  PUBLIC "HalideIR/src"
+  PUBLIC "3rdparty/HalideIR/src"
   PUBLIC "topi/include")
 target_include_directories(
   tvm_topi
@@ -244,12 +245,12 @@ if (INSTALL_DEV)
     PATTERN "*.h"
   )
   install(
-    DIRECTORY "HalideIR/src/." DESTINATION "include/HalideIR"
+    DIRECTORY "3rdparty/HalideIR/src/." DESTINATION "include/HalideIR"
     FILES_MATCHING
     PATTERN "*.h"
   )
   install(
-    DIRECTORY "dlpack/include/." DESTINATION "include"
+    DIRECTORY "3rdparty/dlpack/include/." DESTINATION "include"
     FILES_MATCHING
     PATTERN "*.h"
     )

diff --git a/Makefile b/Makefile
@@ -4,11 +4,11 @@ ROOTDIR = $(CURDIR)
 	 cython cython2 cython3 web runtime vta
 
 ifndef DMLC_CORE_PATH
-  DMLC_CORE_PATH = $(ROOTDIR)/dmlc-core
+  DMLC_CORE_PATH = $(ROOTDIR)/3rdparty/dmlc-core
 endif
 
 ifndef DLPACK_PATH
-  DLPACK_PATH = $(ROOTDIR)/dlpack
+  DLPACK_PATH = $(ROOTDIR)/3rdparty/dlpack
 endif
 
 INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include
@@ -50,10 +50,10 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc
 
 # Lint scripts
 cpplint:
-	python3 dmlc-core/scripts/lint.py vta cpp vta/include vta/src
-	python3 dmlc-core/scripts/lint.py topi cpp topi/include;
-	python3 dmlc-core/scripts/lint.py nnvm cpp nnvm/include nnvm/src;
-	python3 dmlc-core/scripts/lint.py tvm cpp include src verilog\
+	python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src
+	python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include;
+	python3 3rdparty/dmlc-core/scripts/lint.py nnvm cpp nnvm/include nnvm/src;
+	python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp include src verilog\
 	 examples/extension/src examples/graph_executor/src
 
 pylint:
@@ -63,7 +63,7 @@ pylint:
 	python3 -m pylint vta/python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc
 
 jnilint:
-	python3 dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
+	python3 3rdparty/dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
 
 lint: cpplint pylint jnilint
 

diff --git a/apps/android_deploy/app/src/main/jni/Android.mk b/apps/android_deploy/app/src/main/jni/Android.mk
@@ -20,9 +20,9 @@ LOCAL_SRC_FILES := ml_dmlc_tvm_native_c_api.cc
 LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog
 
 LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
-                    $(ROOT_PATH)/dlpack/include \
-                    $(ROOT_PATH)/dmlc-core/include \
-                    $(ROOT_PATH)/HalideIR/src \
+                    $(ROOT_PATH)/3rdparty/dlpack/include \
+                    $(ROOT_PATH)/3rdparty/dmlc-core/include \
+                    $(ROOT_PATH)/3rdparty/HalideIR/src \
                     $(ROOT_PATH)/topi/include
 
 LOCAL_MODULE = tvm4j_runtime_packed

diff --git a/apps/android_rpc/app/src/main/jni/Android.mk b/apps/android_rpc/app/src/main/jni/Android.mk
@@ -20,9 +20,9 @@ LOCAL_SRC_FILES := ml_dmlc_tvm_native_c_api.cc
 LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog
 
 LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
-                    $(ROOT_PATH)/dlpack/include \
-                    $(ROOT_PATH)/dmlc-core/include \
-                    $(ROOT_PATH)/HalideIR/src \
+                    $(ROOT_PATH)/3rdparty/dlpack/include \
+                    $(ROOT_PATH)/3rdparty/dmlc-core/include \
+                    $(ROOT_PATH)/3rdparty/HalideIR/src \
                     $(ROOT_PATH)/topi/include
 
 LOCAL_MODULE = tvm4j_runtime_packed

diff --git a/apps/extension/Makefile b/apps/extension/Makefile
@@ -2,9 +2,9 @@
 TVM_ROOT=$(shell cd ../..; pwd)
 PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
-	-I${TVM_ROOT}/dmlc-core/include\
-	-I${TVM_ROOT}/dlpack/include\
-	-I${TVM_ROOT}/HalideIR/src
+	-I${TVM_ROOT}/3rdparty/dmlc-core/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/HalideIR/src
 
 PKG_LDFLAGS =-L${TVM_ROOT}/lib
 UNAME_S := $(shell uname -s)

diff --git a/apps/howto_deploy/Makefile b/apps/howto_deploy/Makefile
@@ -1,12 +1,12 @@
 # Makefile Example to deploy TVM modules.
 TVM_ROOT=$(shell cd ../..; pwd)
 NNVM_PATH=nnvm
-DMLC_CORE=${TVM_ROOT}/dmlc-core
+DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core
 
 PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
 	-I${DMLC_CORE}/include\
-	-I${TVM_ROOT}/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
 
 PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -lpthread
 

diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc
@@ -8,8 +8,8 @@
  *  - Compile with -std=c++11
  *  - Add the following include path
  *     - /path/to/tvm/include/
- *     - /path/to/tvm/dmlc-core/include/
- *     - /path/to/tvm/dlpack/include/
+ *     - /path/to/tvm/3rdparty/dmlc-core/include/
+ *     - /path/to/tvm/3rdparty/dlpack/include/
  *   - Add -lpthread -ldl to the linked library.
  *   - You are good to go.
  *   - See the Makefile in the same folder for example.