Skip to content

Commit

Permalink
Backport several fixes into 2.7.x. (#2579)
Browse files Browse the repository at this point in the history
* Fix `common_type` specialization for extended floating point types (#2483)

* Fix `common_type` specialization for extended floating point types

The machinery we had in place was not really suited to specialize `common_type`, because it would take precedence over the actual implementation of `common_type`.

In that case, we only specialized `common_type<__half, __half>` but not `common_type<__half, __half&>` and so on.

This shows how brittle the whole thing is and that it is not extensible.

Rather than putting another bandaid over it, add a proper 5th step in the common_type detection that properly treats combinations of an extended floating point type with an arithmetic type.

Allowing arithmetic types is necessary to keep machinery like `pow(__half, 2)` working.

Fixes [BUG]: `is_common_type` trait is broken when mixing rvalue references #2419

* Work around MSVC declval bug

* Disable system header for narrowing conversion check (#2465)

There is an incredible compiler bug reported in nvbug4867473 where the use of system header changes the way some types are instantiated.

The culprit seems to be that within a system header the compiler accepts narrowing conversions that it should not accept

Work around it by moving __is_non_narrowing_convertible to its own header that is included before we define the system header machinery

* Drop 2 relative includes that snuck in (#2492)

* Fix popc.h when architecture is not x86 on MSVC. (#2524)

* Fix popc when architecture is not x86

* Update libcudacxx/include/cuda/std/__bit/popc.h

---------

Co-authored-by: Michael Schellenberger Costa <[email protected]>

* Make `bit_cast` play nice with extended floating point types (#2434)

* Move `__is_nvbf16` and `__is_nvfp16` to their own file

* Make `bit_cast` play nice with extended floating point types

---------

Co-authored-by: Michael Schellenberger Costa <[email protected]>
  • Loading branch information
wmaxey and miscco authored Oct 15, 2024
1 parent 05e019a commit 70016e1
Show file tree
Hide file tree
Showing 19 changed files with 410 additions and 128 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@

#include <cuda/std/detail/__config>

#include "cuda_ptx_generated.h"

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
Expand Down
23 changes: 18 additions & 5 deletions libcudacxx/include/cuda/std/__bit/bit_cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#endif // no system header

#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_extended_floating_point.h>
#include <cuda/std/__type_traits/is_trivially_copyable.h>
#include <cuda/std/__type_traits/is_trivially_default_constructible.h>
#include <cuda/std/detail/libcxx/include/cstring>
Expand All @@ -32,13 +33,19 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
# define _LIBCUDACXX_CONSTEXPR_BIT_CAST constexpr
#else // ^^^ _LIBCUDACXX_BIT_CAST ^^^ / vvv !_LIBCUDACXX_BIT_CAST vvv
# define _LIBCUDACXX_CONSTEXPR_BIT_CAST
# if defined(_CCCL_COMPILER_GCC) && __GNUC__ >= 8
// GCC starting with GCC8 warns about our extended floating point types having protected data members
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_GCC("-Wclass-memaccess")
# endif // _CCCL_COMPILER_GCC >= 8
#endif // !_LIBCUDACXX_BIT_CAST

template <class _To,
class _From,
__enable_if_t<(sizeof(_To) == sizeof(_From)), int> = 0,
__enable_if_t<_CCCL_TRAIT(is_trivially_copyable, _To), int> = 0,
__enable_if_t<_CCCL_TRAIT(is_trivially_copyable, _From), int> = 0>
template <
class _To,
class _From,
__enable_if_t<(sizeof(_To) == sizeof(_From)), int> = 0,
__enable_if_t<_CCCL_TRAIT(is_trivially_copyable, _To) || _CCCL_TRAIT(__is_extended_floating_point, _To), int> = 0,
__enable_if_t<_CCCL_TRAIT(is_trivially_copyable, _From) || _CCCL_TRAIT(__is_extended_floating_point, _From), int> = 0>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_CONSTEXPR_BIT_CAST _To bit_cast(const _From& __from) noexcept
{
#if defined(_LIBCUDACXX_BIT_CAST)
Expand All @@ -53,6 +60,12 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_CONSTEXPR_BIT_CAST _To bit
#endif // !_LIBCUDACXX_BIT_CAST
}

#if !defined(_LIBCUDACXX_BIT_CAST)
# if defined(_CCCL_COMPILER_GCC) && __GNUC__ >= 8
_CCCL_DIAG_POP
# endif // _CCCL_COMPILER_GCC >= 8
#endif // !_LIBCUDACXX_BIT_CAST

_LIBCUDACXX_END_NAMESPACE_STD

#endif // _LIBCUDACXX___BIT_BIT_CAST_H
13 changes: 11 additions & 2 deletions libcudacxx/include/cuda/std/__bit/popc.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@

#if defined(_CCCL_COMPILER_MSVC)
# include <intrin.h>

# if defined(_M_ARM64)
# define _LIBCUDACXX_MSVC_POPC(x) _CountOneBits(x)
# define _LIBCUDACXX_MSVC_POPC64(x) _CountOneBits64(x)
# else // ^^^ _M_ARM64 ^^^ / vvv !_M_ARM64 vvv
# define _LIBCUDACXX_MSVC_POPC(x) __popcnt(x)
# define _LIBCUDACXX_MSVC_POPC64(x) __popcnt64(x)
# endif // !_M_ARM64

#endif // _CCCL_COMPILER_MSVC

_LIBCUDACXX_BEGIN_NAMESPACE_STD
Expand Down Expand Up @@ -95,7 +104,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x)
{
if (!__libcpp_default_is_constant_evaluated())
{
NV_IF_TARGET(NV_IS_HOST, (return static_cast<int>(__popcnt(__x));))
NV_IF_TARGET(NV_IS_HOST, (return static_cast<int>(_LIBCUDACXX_MSVC_POPC(__x));))
}

return __fallback_popc64(static_cast<uint64_t>(__x));
Expand All @@ -105,7 +114,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint64_t __x)
{
if (!__libcpp_default_is_constant_evaluated())
{
NV_IF_TARGET(NV_IS_HOST, (return static_cast<int>(__popcnt64(__x));))
NV_IF_TARGET(NV_IS_HOST, (return static_cast<int>(_LIBCUDACXX_MSVC_POPC64(__x));))
}

return __fallback_popc64(static_cast<uint64_t>(__x));
Expand Down
73 changes: 73 additions & 0 deletions libcudacxx/include/cuda/std/__cccl/is_non_narrowing_convertible.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CCCL_IS_NON_NARROWING_CONVERTIBLE_H
#define __CCCL_IS_NON_NARROWING_CONVERTIBLE_H

#include <cuda/std/__cccl/compiler.h>

//! There is a compiler bug that results in incorrect results for the below `__is_non_narrowing_convertible` check.
//! This breaks some common functionality, so this *must* be included outside of a system header. See nvbug4867473.
//! If any of the system-header force macros are already defined, this header was included too late (after the
//! system-header machinery), which most likely indicates a mix of different CCCL versions — fail loudly.
#if defined(_CCCL_FORCE_SYSTEM_HEADER_GCC) || defined(_CCCL_FORCE_SYSTEM_HEADER_CLANG) \
|| defined(_CCCL_FORCE_SYSTEM_HEADER_MSVC)
#  error \
"This header must be included only within the <cuda/std/__cccl/system_header>. This most likely means a mix and match of different versions of CCCL."
#endif // system header detected

namespace __cccl_internal
{

// Hand-rolled declval: the (int)/(long) overload pair prefers returning `_Tp&&` and falls back to `_Tp`
// for types where forming the reference is ill-formed. This avoids `std::declval` — presumably to work
// around an MSVC declval bug (see the commit notes) and to stay independent of library headers.
#if defined(_CCCL_CUDA_COMPILER) && (defined(__CUDACC__) || defined(_NVHPC_CUDA) || defined(_CCCL_COMPILER_NVRTC))
// CUDA compilation: the helpers must be annotated __host__ __device__ so they are usable in device code.
template <class _Tp>
__host__ __device__ _Tp&& __cccl_declval(int);
template <class _Tp>
__host__ __device__ _Tp __cccl_declval(long);
template <class _Tp>
__host__ __device__ decltype(__cccl_internal::__cccl_declval<_Tp>(0)) __cccl_declval() noexcept;

// This requires a type to be implicitly convertible (also non-arithmetic)
template <class _Tp>
__host__ __device__ void __cccl_accepts_implicit_conversion(_Tp) noexcept;
#else // ^^^ CUDA compilation ^^^ / vvv no CUDA compilation vvv
// Plain host compilation: same helpers without the CUDA execution-space annotations.
template <class _Tp>
_Tp&& __cccl_declval(int);
template <class _Tp>
_Tp __cccl_declval(long);
template <class _Tp>
decltype(__cccl_internal::__cccl_declval<_Tp>(0)) __cccl_declval() noexcept;

// This requires a type to be implicitly convertible (also non-arithmetic)
template <class _Tp>
void __cccl_accepts_implicit_conversion(_Tp) noexcept;
#endif // no CUDA compilation

// Local void_t (avoids depending on <type_traits> in this pre-system-header context).
template <class...>
using __cccl_void_t = void;

// Primary template: `_Source` is NOT non-narrowing-convertible to `_Dest` unless the
// specialization below matches.
template <class _Dest, class _Source, class = void>
struct __is_non_narrowing_convertible
{
  static constexpr bool value = false;
};

// This also prohibits narrowing conversion in case of arithmetic types.
// Two conditions must be well-formed:
//   1. passing `_Source` to a function taking `_Dest` — requires an implicit conversion;
//   2. list-initializing `_Dest{_Source}` — braces reject narrowing conversions.
template <class _Dest, class _Source>
struct __is_non_narrowing_convertible<_Dest,
                                      _Source,
                                      __cccl_void_t<decltype(__cccl_internal::__cccl_accepts_implicit_conversion<_Dest>(
                                                      __cccl_internal::__cccl_declval<_Source>())),
                                                    decltype(_Dest{__cccl_internal::__cccl_declval<_Source>()})>>
{
  static constexpr bool value = true;
};

} // namespace __cccl_internal

#endif // __CCCL_IS_NON_NARROWING_CONVERTIBLE_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/std/__cccl/system_header.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#define __CCCL_SYSTEM_HEADER_H

#include <cuda/std/__cccl/compiler.h>
#include <cuda/std/__cccl/is_non_narrowing_convertible.h> // IWYU pragma: export

// Enforce that cccl headers are treated as system headers
#if defined(_CCCL_COMPILER_GCC) || defined(_CCCL_COMPILER_NVHPC) || defined(_CCCL_COMPILER_ICC)
Expand Down
54 changes: 31 additions & 23 deletions libcudacxx/include/cuda/std/__complex/nvbf16.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,52 +33,60 @@ _CCCL_DIAG_POP
# include <cuda/std/__type_traits/enable_if.h>
# include <cuda/std/__type_traits/integral_constant.h>
# include <cuda/std/__type_traits/is_constructible.h>
# include <cuda/std/__type_traits/is_extended_floating_point.h>
# include <cuda/std/cmath>
# include <cuda/std/complex>

# if !defined(_CCCL_COMPILER_NVRTC)
# include <sstream> // for std::basic_ostringstream
# endif // !_CCCL_COMPILER_NVRTC

_LIBCUDACXX_BEGIN_NAMESPACE_STD

// This is a workaround against the user defining macros __CUDA_NO_HALF_CONVERSIONS__ __CUDA_NO_HALF_OPERATORS__
namespace __cccl_internal
{
template <>
struct __is_nvbf16<__nv_bfloat16> : true_type
{};
struct __is_non_narrowing_convertible<__nv_bfloat16, float>
{
static constexpr bool value = true;
};

template <>
struct __complex_alignment<__nv_bfloat16> : integral_constant<size_t, alignof(__nv_bfloat162)>
{};
struct __is_non_narrowing_convertible<__nv_bfloat16, double>
{
static constexpr bool value = true;
};

template <>
struct __type_to_vector<__nv_bfloat16>
struct __is_non_narrowing_convertible<float, __nv_bfloat16>
{
using __type = __nv_bfloat162;
static constexpr bool value = true;
};

template <>
struct __libcpp_complex_overload_traits<__nv_bfloat16, false, false>
struct __is_non_narrowing_convertible<double, __nv_bfloat16>
{
typedef __nv_bfloat16 _ValueType;
typedef complex<__nv_bfloat16> _ComplexType;
static constexpr bool value = true;
};
} // namespace __cccl_internal

// This is a workaround against the user defining macros __CUDA_NO_BFLOAT16_CONVERSIONS__ __CUDA_NO_BFLOAT16_OPERATORS__
template <>
struct __complex_can_implicitly_construct<__nv_bfloat16, float> : true_type
{};
_LIBCUDACXX_BEGIN_NAMESPACE_STD

template <>
struct __complex_can_implicitly_construct<__nv_bfloat16, double> : true_type
struct __complex_alignment<__nv_bfloat16> : integral_constant<size_t, alignof(__nv_bfloat162)>
{};

template <>
struct __complex_can_implicitly_construct<float, __nv_bfloat16> : true_type
{};
struct __type_to_vector<__nv_bfloat16>
{
using __type = __nv_bfloat162;
};

template <>
struct __complex_can_implicitly_construct<double, __nv_bfloat16> : true_type
{};
struct __libcpp_complex_overload_traits<__nv_bfloat16, false, false>
{
typedef __nv_bfloat16 _ValueType;
typedef complex<__nv_bfloat16> _ComplexType;
};

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __convert_to_bfloat16(const _Tp& __value) noexcept
Expand Down Expand Up @@ -111,14 +119,14 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT _CCCL_ALIGNAS(alignof(__nv_bfloat162)) compl
: __repr_(__re, __im)
{}

template <class _Up, __enable_if_t<__complex_can_implicitly_construct<value_type, _Up>::value, int> = 0>
template <class _Up, __enable_if_t<__cccl_internal::__is_non_narrowing_convertible<value_type, _Up>::value, int> = 0>
_LIBCUDACXX_HIDE_FROM_ABI complex(const complex<_Up>& __c)
: __repr_(__convert_to_bfloat16(__c.real()), __convert_to_bfloat16(__c.imag()))
{}

template <class _Up,
__enable_if_t<!__complex_can_implicitly_construct<value_type, _Up>::value, int> = 0,
__enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0>
__enable_if_t<!__cccl_internal::__is_non_narrowing_convertible<value_type, _Up>::value, int> = 0,
__enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0>
_LIBCUDACXX_HIDE_FROM_ABI explicit complex(const complex<_Up>& __c)
: __repr_(__convert_to_bfloat16(__c.real()), __convert_to_bfloat16(__c.imag()))
{}
Expand Down
54 changes: 31 additions & 23 deletions libcudacxx/include/cuda/std/__complex/nvfp16.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,52 +30,60 @@
# include <cuda/std/__type_traits/enable_if.h>
# include <cuda/std/__type_traits/integral_constant.h>
# include <cuda/std/__type_traits/is_constructible.h>
# include <cuda/std/__type_traits/is_extended_floating_point.h>
# include <cuda/std/cmath>
# include <cuda/std/complex>

# if !defined(_CCCL_COMPILER_NVRTC)
# include <sstream> // for std::basic_ostringstream
# endif // !_CCCL_COMPILER_NVRTC

_LIBCUDACXX_BEGIN_NAMESPACE_STD

// This is a workaround against the user defining macros __CUDA_NO_HALF_CONVERSIONS__ __CUDA_NO_HALF_OPERATORS__
namespace __cccl_internal
{
template <>
struct __is_nvfp16<__half> : true_type
{};
struct __is_non_narrowing_convertible<__half, float>
{
static constexpr bool value = true;
};

template <>
struct __complex_alignment<__half> : integral_constant<size_t, alignof(__half2)>
{};
struct __is_non_narrowing_convertible<__half, double>
{
static constexpr bool value = true;
};

template <>
struct __type_to_vector<__half>
struct __is_non_narrowing_convertible<float, __half>
{
using __type = __half2;
static constexpr bool value = true;
};

template <>
struct __libcpp_complex_overload_traits<__half, false, false>
struct __is_non_narrowing_convertible<double, __half>
{
typedef __half _ValueType;
typedef complex<__half> _ComplexType;
static constexpr bool value = true;
};
} // namespace __cccl_internal

// This is a workaround against the user defining macros __CUDA_NO_HALF_CONVERSIONS__ __CUDA_NO_HALF_OPERATORS__
template <>
struct __complex_can_implicitly_construct<__half, float> : true_type
{};
_LIBCUDACXX_BEGIN_NAMESPACE_STD

template <>
struct __complex_can_implicitly_construct<__half, double> : true_type
struct __complex_alignment<__half> : integral_constant<size_t, alignof(__half2)>
{};

template <>
struct __complex_can_implicitly_construct<float, __half> : true_type
{};
struct __type_to_vector<__half>
{
using __type = __half2;
};

template <>
struct __complex_can_implicitly_construct<double, __half> : true_type
{};
struct __libcpp_complex_overload_traits<__half, false, false>
{
typedef __half _ValueType;
typedef complex<__half> _ComplexType;
};

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI __half __convert_to_half(const _Tp& __value) noexcept
Expand Down Expand Up @@ -108,14 +116,14 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT _CCCL_ALIGNAS(alignof(__half2)) complex<__ha
: __repr_(__re, __im)
{}

template <class _Up, __enable_if_t<__complex_can_implicitly_construct<value_type, _Up>::value, int> = 0>
template <class _Up, __enable_if_t<__cccl_internal::__is_non_narrowing_convertible<value_type, _Up>::value, int> = 0>
_LIBCUDACXX_HIDE_FROM_ABI complex(const complex<_Up>& __c)
: __repr_(__convert_to_half(__c.real()), __convert_to_half(__c.imag()))
{}

template <class _Up,
__enable_if_t<!__complex_can_implicitly_construct<value_type, _Up>::value, int> = 0,
__enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0>
__enable_if_t<!__cccl_internal::__is_non_narrowing_convertible<value_type, _Up>::value, int> = 0,
__enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0>
_LIBCUDACXX_HIDE_FROM_ABI explicit complex(const complex<_Up>& __c)
: __repr_(__convert_to_half(__c.real()), __convert_to_half(__c.imag()))
{}
Expand Down
Loading

0 comments on commit 70016e1

Please sign in to comment.