Skip to content

Commit

Permalink
Backport several fixes into 2.7.x. (#2579)
Browse files Browse the repository at this point in the history
* Fix `common_type` specialization for extended floating point types (#2483)

* Fix `common_type` specialization for extended floating point types

The machinery we had in place was not really suited to specialize `common_type`, because it would take precedence over the actual implementation of `common_type`.

In that case, we only specialized `common_type<__half, __half>` but not `common_type<__half, __half&>` and so on.

This shows how brittle the whole thing is and that it is not extensible.

Rather than putting another bandaid over it, add a proper 5th step in the common_type detection that properly treats combinations of an extended floating point type with an arithmetic type.

Allowing arithmetic types is necessary to keep machinery like `pow(__half, 2)` working.

Fixes [BUG]: `is_common_type` trait is broken when mixing rvalue references #2419

* Work around MSVC declval bug

* Disable system header for narrowing conversion check (#2465)

There is an incredible compiler bug reported in nvbug4867473 where the use of system header changes the way some types are instantiated.

The culprit seems to be that within a system header the compiler accepts narrowing conversions that it should not accept

Work around it by moving __is_non_narrowing_convertible to its own header that is included before we define the system header machinery

* Drop 2 relative includes that snuck in (#2492)

* Fix popc.h when architecture is not x86 on MSVC. (#2524)

* Fix popc when architecture is not x86

* Update libcudacxx/include/cuda/std/__bit/popc.h

---------

Co-authored-by: Michael Schellenberger Costa <[email protected]>

* Make `bit_cast` play nice with extended floating point types (#2434)

* Move `__is_nvbf16` and `__is_nvfp16` to their own file

* Make `bit_cast` play nice with extended floating point types

---------

Co-authored-by: Michael Schellenberger Costa <[email protected]>
  • Loading branch information
wmaxey and miscco authored Oct 15, 2024
1 parent 05e019a commit 70016e1
Show file tree
Hide file tree
Showing 19 changed files with 410 additions and 128 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@

#include <cuda/std/detail/__config>

#include "cuda_ptx_generated.h"

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
Expand Down
23 changes: 18 additions & 5 deletions libcudacxx/include/cuda/std/__bit/bit_cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#endif // no system header

#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_extended_floating_point.h>
#include <cuda/std/__type_traits/is_trivially_copyable.h>
#include <cuda/std/__type_traits/is_trivially_default_constructible.h>
#include <cuda/std/detail/libcxx/include/cstring>
Expand All @@ -32,13 +33,19 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
# define _LIBCUDACXX_CONSTEXPR_BIT_CAST constexpr
#else // ^^^ _LIBCUDACXX_BIT_CAST ^^^ / vvv !_LIBCUDACXX_BIT_CAST vvv
# define _LIBCUDACXX_CONSTEXPR_BIT_CAST
# if defined(_CCCL_COMPILER_GCC) && __GNUC__ >= 8
// GCC starting with GCC8 warns about our extended floating point types having protected data members
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_GCC("-Wclass-memaccess")
# endif // _CCCL_COMPILER_GCC >= 8
#endif // !_LIBCUDACXX_BIT_CAST

template <class _To,
class _From,
__enable_if_t<(sizeof(_To) == sizeof(_From)), int> = 0,
__enable_if_t<_CCCL_TRAIT(is_trivially_copyable, _To), int> = 0,
__enable_if_t<_CCCL_TRAIT(is_trivially_copyable, _From), int> = 0>
template <
class _To,
class _From,
__enable_if_t<(sizeof(_To) == sizeof(_From)), int> = 0,
__enable_if_t<_CCCL_TRAIT(is_trivially_copyable, _To) || _CCCL_TRAIT(__is_extended_floating_point, _To), int> = 0,
__enable_if_t<_CCCL_TRAIT(is_trivially_copyable, _From) || _CCCL_TRAIT(__is_extended_floating_point, _From), int> = 0>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_CONSTEXPR_BIT_CAST _To bit_cast(const _From& __from) noexcept
{
#if defined(_LIBCUDACXX_BIT_CAST)
Expand All @@ -53,6 +60,12 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_CONSTEXPR_BIT_CAST _To bit
#endif // !_LIBCUDACXX_BIT_CAST
}

#if !defined(_LIBCUDACXX_BIT_CAST)
# if defined(_CCCL_COMPILER_GCC) && __GNUC__ >= 8
_CCCL_DIAG_POP
# endif // _CCCL_COMPILER_GCC >= 8
#endif // !_LIBCUDACXX_BIT_CAST

_LIBCUDACXX_END_NAMESPACE_STD

#endif // _LIBCUDACXX___BIT_BIT_CAST_H
13 changes: 11 additions & 2 deletions libcudacxx/include/cuda/std/__bit/popc.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@

#if defined(_CCCL_COMPILER_MSVC)
# include <intrin.h>

# if defined(_M_ARM64)
# define _LIBCUDACXX_MSVC_POPC(x) _CountOneBits(x)
# define _LIBCUDACXX_MSVC_POPC64(x) _CountOneBits64(x)
# else // ^^^ _M_ARM64 ^^^ / vvv !_M_ARM64 vvv
# define _LIBCUDACXX_MSVC_POPC(x) __popcnt(x)
# define _LIBCUDACXX_MSVC_POPC64(x) __popcnt64(x)
# endif // !_M_ARM64

#endif // _CCCL_COMPILER_MSVC

_LIBCUDACXX_BEGIN_NAMESPACE_STD
Expand Down Expand Up @@ -95,7 +104,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x)
{
if (!__libcpp_default_is_constant_evaluated())
{
NV_IF_TARGET(NV_IS_HOST, (return static_cast<int>(__popcnt(__x));))
NV_IF_TARGET(NV_IS_HOST, (return static_cast<int>(_LIBCUDACXX_MSVC_POPC(__x));))
}

return __fallback_popc64(static_cast<uint64_t>(__x));
Expand All @@ -105,7 +114,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint64_t __x)
{
if (!__libcpp_default_is_constant_evaluated())
{
NV_IF_TARGET(NV_IS_HOST, (return static_cast<int>(__popcnt64(__x));))
NV_IF_TARGET(NV_IS_HOST, (return static_cast<int>(_LIBCUDACXX_MSVC_POPC64(__x));))
}

return __fallback_popc64(static_cast<uint64_t>(__x));
Expand Down
73 changes: 73 additions & 0 deletions libcudacxx/include/cuda/std/__cccl/is_non_narrowing_convertible.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CCCL_IS_NON_NARROWING_CONVERTIBLE_H
#define __CCCL_IS_NON_NARROWING_CONVERTIBLE_H

#include <cuda/std/__cccl/compiler.h>

//! There is a compiler bug that results in incorrect results for the below `__is_non_narrowing_convertible` check.
//! This breaks some common functionality, so this *must* be included outside of a system header. See nvbug4867473.
//! If any of the system-header force macros are already defined, this header was included too late (after the
//! system-header machinery), which most likely indicates a mix of different CCCL versions — fail loudly.
#if defined(_CCCL_FORCE_SYSTEM_HEADER_GCC) || defined(_CCCL_FORCE_SYSTEM_HEADER_CLANG) \
|| defined(_CCCL_FORCE_SYSTEM_HEADER_MSVC)
#  error \
"This header must be included only within the <cuda/std/__cccl/system_header>. This most likely means a mix and match of different versions of CCCL."
#endif // system header detected

namespace __cccl_internal
{

// Hand-rolled declval: the (int)/(long) overload pair prefers returning `_Tp&&` and falls back to `_Tp`
// for types where forming the reference is ill-formed. This avoids `std::declval` — presumably to work
// around an MSVC declval bug (see the commit notes) and to stay independent of library headers.
#if defined(_CCCL_CUDA_COMPILER) && (defined(__CUDACC__) || defined(_NVHPC_CUDA) || defined(_CCCL_COMPILER_NVRTC))
// CUDA compilation: the helpers must be annotated __host__ __device__ so they are usable in device code.
template <class _Tp>
__host__ __device__ _Tp&& __cccl_declval(int);
template <class _Tp>
__host__ __device__ _Tp __cccl_declval(long);
template <class _Tp>
__host__ __device__ decltype(__cccl_internal::__cccl_declval<_Tp>(0)) __cccl_declval() noexcept;

// This requires a type to be implicitly convertible (also non-arithmetic)
template <class _Tp>
__host__ __device__ void __cccl_accepts_implicit_conversion(_Tp) noexcept;
#else // ^^^ CUDA compilation ^^^ / vvv no CUDA compilation vvv
// Plain host compilation: same helpers without the CUDA execution-space annotations.
template <class _Tp>
_Tp&& __cccl_declval(int);
template <class _Tp>
_Tp __cccl_declval(long);
template <class _Tp>
decltype(__cccl_internal::__cccl_declval<_Tp>(0)) __cccl_declval() noexcept;

// This requires a type to be implicitly convertible (also non-arithmetic)
template <class _Tp>
void __cccl_accepts_implicit_conversion(_Tp) noexcept;
#endif // no CUDA compilation

// Local void_t (avoids depending on <type_traits> in this pre-system-header context).
template <class...>
using __cccl_void_t = void;

// Primary template: `_Source` is NOT non-narrowing-convertible to `_Dest` unless the
// specialization below matches.
template <class _Dest, class _Source, class = void>
struct __is_non_narrowing_convertible
{
  static constexpr bool value = false;
};

// This also prohibits narrowing conversion in case of arithmetic types.
// Two conditions must be well-formed:
//   1. passing `_Source` to a function taking `_Dest` — requires an implicit conversion;
//   2. list-initializing `_Dest{_Source}` — braces reject narrowing conversions.
template <class _Dest, class _Source>
struct __is_non_narrowing_convertible<_Dest,
                                      _Source,
                                      __cccl_void_t<decltype(__cccl_internal::__cccl_accepts_implicit_conversion<_Dest>(
                                                      __cccl_internal::__cccl_declval<_Source>())),
                                                    decltype(_Dest{__cccl_internal::__cccl_declval<_Source>()})>>
{
  static constexpr bool value = true;
};

} // namespace __cccl_internal

#endif // __CCCL_IS_NON_NARROWING_CONVERTIBLE_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/std/__cccl/system_header.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#define __CCCL_SYSTEM_HEADER_H

#include <cuda/std/__cccl/compiler.h>
#include <cuda/std/__cccl/is_non_narrowing_convertible.h> // IWYU pragma: export

// Enforce that cccl headers are treated as system headers
#if defined(_CCCL_COMPILER_GCC) || defined(_CCCL_COMPILER_NVHPC) || defined(_CCCL_COMPILER_ICC)
Expand Down
54 changes: 31 additions & 23 deletions libcudacxx/include/cuda/std/__complex/nvbf16.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,52 +33,60 @@ _CCCL_DIAG_POP
# include <cuda/std/__type_traits/enable_if.h>
# include <cuda/std/__type_traits/integral_constant.h>
# include <cuda/std/__type_traits/is_constructible.h>
# include <cuda/std/__type_traits/is_extended_floating_point.h>
# include <cuda/std/cmath>
# include <cuda/std/complex>

# if !defined(_CCCL_COMPILER_NVRTC)
# include <sstream> // for std::basic_ostringstream
# endif // !_CCCL_COMPILER_NVRTC

_LIBCUDACXX_BEGIN_NAMESPACE_STD

// This is a workaround against the user defining macros __CUDA_NO_HALF_CONVERSIONS__ __CUDA_NO_HALF_OPERATORS__
namespace __cccl_internal
{
template <>
struct __is_nvbf16<__nv_bfloat16> : true_type
{};
struct __is_non_narrowing_convertible<__nv_bfloat16, float>
{
static constexpr bool value = true;
};

template <>
struct __complex_alignment<__nv_bfloat16> : integral_constant<size_t, alignof(__nv_bfloat162)>
{};
struct __is_non_narrowing_convertible<__nv_bfloat16, double>
{
static constexpr bool value = true;
};

template <>
struct __type_to_vector<__nv_bfloat16>
struct __is_non_narrowing_convertible<float, __nv_bfloat16>
{
using __type = __nv_bfloat162;
static constexpr bool value = true;
};

template <>
struct __libcpp_complex_overload_traits<__nv_bfloat16, false, false>
struct __is_non_narrowing_convertible<double, __nv_bfloat16>
{
typedef __nv_bfloat16 _ValueType;
typedef complex<__nv_bfloat16> _ComplexType;
static constexpr bool value = true;
};
} // namespace __cccl_internal

// This is a workaround against the user defining macros __CUDA_NO_BFLOAT16_CONVERSIONS__ __CUDA_NO_BFLOAT16_OPERATORS__
template <>
struct __complex_can_implicitly_construct<__nv_bfloat16, float> : true_type
{};
_LIBCUDACXX_BEGIN_NAMESPACE_STD

template <>
struct __complex_can_implicitly_construct<__nv_bfloat16, double> : true_type
struct __complex_alignment<__nv_bfloat16> : integral_constant<size_t, alignof(__nv_bfloat162)>
{};

template <>
struct __complex_can_implicitly_construct<float, __nv_bfloat16> : true_type
{};
struct __type_to_vector<__nv_bfloat16>
{
using __type = __nv_bfloat162;
};

template <>
struct __complex_can_implicitly_construct<double, __nv_bfloat16> : true_type
{};
struct __libcpp_complex_overload_traits<__nv_bfloat16, false, false>
{
typedef __nv_bfloat16 _ValueType;
typedef complex<__nv_bfloat16> _ComplexType;
};

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __convert_to_bfloat16(const _Tp& __value) noexcept
Expand Down Expand Up @@ -111,14 +119,14 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT _CCCL_ALIGNAS(alignof(__nv_bfloat162)) compl
: __repr_(__re, __im)
{}

template <class _Up, __enable_if_t<__complex_can_implicitly_construct<value_type, _Up>::value, int> = 0>
template <class _Up, __enable_if_t<__cccl_internal::__is_non_narrowing_convertible<value_type, _Up>::value, int> = 0>
_LIBCUDACXX_HIDE_FROM_ABI complex(const complex<_Up>& __c)
: __repr_(__convert_to_bfloat16(__c.real()), __convert_to_bfloat16(__c.imag()))
{}

template <class _Up,
__enable_if_t<!__complex_can_implicitly_construct<value_type, _Up>::value, int> = 0,
__enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0>
__enable_if_t<!__cccl_internal::__is_non_narrowing_convertible<value_type, _Up>::value, int> = 0,
__enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0>
_LIBCUDACXX_HIDE_FROM_ABI explicit complex(const complex<_Up>& __c)
: __repr_(__convert_to_bfloat16(__c.real()), __convert_to_bfloat16(__c.imag()))
{}
Expand Down
54 changes: 31 additions & 23 deletions libcudacxx/include/cuda/std/__complex/nvfp16.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,52 +30,60 @@
# include <cuda/std/__type_traits/enable_if.h>
# include <cuda/std/__type_traits/integral_constant.h>
# include <cuda/std/__type_traits/is_constructible.h>
# include <cuda/std/__type_traits/is_extended_floating_point.h>
# include <cuda/std/cmath>
# include <cuda/std/complex>

# if !defined(_CCCL_COMPILER_NVRTC)
# include <sstream> // for std::basic_ostringstream
# endif // !_CCCL_COMPILER_NVRTC

_LIBCUDACXX_BEGIN_NAMESPACE_STD

// This is a workaround against the user defining macros __CUDA_NO_HALF_CONVERSIONS__ __CUDA_NO_HALF_OPERATORS__
namespace __cccl_internal
{
template <>
struct __is_nvfp16<__half> : true_type
{};
struct __is_non_narrowing_convertible<__half, float>
{
static constexpr bool value = true;
};

template <>
struct __complex_alignment<__half> : integral_constant<size_t, alignof(__half2)>
{};
struct __is_non_narrowing_convertible<__half, double>
{
static constexpr bool value = true;
};

template <>
struct __type_to_vector<__half>
struct __is_non_narrowing_convertible<float, __half>
{
using __type = __half2;
static constexpr bool value = true;
};

template <>
struct __libcpp_complex_overload_traits<__half, false, false>
struct __is_non_narrowing_convertible<double, __half>
{
typedef __half _ValueType;
typedef complex<__half> _ComplexType;
static constexpr bool value = true;
};
} // namespace __cccl_internal

// This is a workaround against the user defining macros __CUDA_NO_HALF_CONVERSIONS__ __CUDA_NO_HALF_OPERATORS__
template <>
struct __complex_can_implicitly_construct<__half, float> : true_type
{};
_LIBCUDACXX_BEGIN_NAMESPACE_STD

template <>
struct __complex_can_implicitly_construct<__half, double> : true_type
struct __complex_alignment<__half> : integral_constant<size_t, alignof(__half2)>
{};

template <>
struct __complex_can_implicitly_construct<float, __half> : true_type
{};
struct __type_to_vector<__half>
{
using __type = __half2;
};

template <>
struct __complex_can_implicitly_construct<double, __half> : true_type
{};
struct __libcpp_complex_overload_traits<__half, false, false>
{
typedef __half _ValueType;
typedef complex<__half> _ComplexType;
};

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI __half __convert_to_half(const _Tp& __value) noexcept
Expand Down Expand Up @@ -108,14 +116,14 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT _CCCL_ALIGNAS(alignof(__half2)) complex<__ha
: __repr_(__re, __im)
{}

template <class _Up, __enable_if_t<__complex_can_implicitly_construct<value_type, _Up>::value, int> = 0>
template <class _Up, __enable_if_t<__cccl_internal::__is_non_narrowing_convertible<value_type, _Up>::value, int> = 0>
_LIBCUDACXX_HIDE_FROM_ABI complex(const complex<_Up>& __c)
: __repr_(__convert_to_half(__c.real()), __convert_to_half(__c.imag()))
{}

template <class _Up,
__enable_if_t<!__complex_can_implicitly_construct<value_type, _Up>::value, int> = 0,
__enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0>
__enable_if_t<!__cccl_internal::__is_non_narrowing_convertible<value_type, _Up>::value, int> = 0,
__enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0>
_LIBCUDACXX_HIDE_FROM_ABI explicit complex(const complex<_Up>& __c)
: __repr_(__convert_to_half(__c.real()), __convert_to_half(__c.imag()))
{}
Expand Down
Loading

0 comments on commit 70016e1

Please sign in to comment.