Skip to content

Commit

Permalink
Add warnings for large labels when using GPU histogram algorithms (dm…
Browse files Browse the repository at this point in the history
  • Loading branch information
RAMitchell authored Oct 26, 2017
1 parent 13e7a2c commit d9d5293
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 23 deletions.
14 changes: 7 additions & 7 deletions doc/gpu/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ Specify the 'tree_method' parameter as one of the following algorithms.
### Algorithms

```eval_rst
+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
| tree_method | Description |
+==============+===============================================================================================================================================+
| gpu_exact | The standard XGBoost tree construction algorithm. Performs exact search for splits. Slower and uses considerably more memory than 'gpu_hist' |
+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
| gpu_hist | Equivalent to the XGBoost fast histogram algorithm. Faster and uses considerably less memory. Splits may be less accurate. |
+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tree_method | Description |
+==============+=================================================================================================================================================================================================================+
| gpu_exact | The standard XGBoost tree construction algorithm. Performs exact search for splits. Slower and uses considerably more memory than 'gpu_hist' |
+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| gpu_hist | Equivalent to the XGBoost fast histogram algorithm. Much faster and uses considerably less memory. NOTE: Cannot be used with labels larger in magnitude than 2^16 due to its histogram aggregation algorithm. |
+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
```

### Supported parameters
Expand Down
16 changes: 8 additions & 8 deletions include/xgboost/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,19 +171,19 @@ class bst_gpair_internal {

/*! \brief Gradient accessor for the int64_t-backed specialisation: returns the
 * stored integer grad_ multiplied by a fixed decimal scale factor, as a float. */
template<>
inline XGBOOST_DEVICE float bst_gpair_internal<int64_t>::GetGrad() const {
// NOTE(review): two return statements are listed here; they look like the
// removed/added pair of the commit diff (scale 1e-5f -> 1e-4f). Confirm which
// one is live -- as written only the first is reachable.
return grad_ * 1e-5f;
return grad_ * 1e-4f;
}
/*! \brief Hessian accessor for the int64_t-backed specialisation: returns the
 * stored integer hess_ multiplied by a fixed decimal scale factor, as a float. */
template<>
inline XGBOOST_DEVICE float bst_gpair_internal<int64_t>::GetHess() const {
// NOTE(review): two return statements are listed here; they look like the
// removed/added pair of the commit diff (scale 1e-5f -> 1e-4f). Confirm which
// one is live -- as written only the first is reachable.
return hess_ * 1e-5f;
return hess_ * 1e-4f;
}
/*! \brief Gradient setter for the int64_t-backed specialisation: scales the
 * float g by a fixed decimal factor, rounds with std::round and stores the
 * result in grad_ as an int64_t. */
template<>
inline XGBOOST_DEVICE void bst_gpair_internal<int64_t>::SetGrad(float g) {
// NOTE(review): two assignments are listed here; they look like the
// removed/added pair of the commit diff (scale 1e5 -> 1e4). Confirm which one
// is live -- as written the second overwrites the first.
grad_ = static_cast<int64_t>(std::round(g * 1e5));
grad_ = static_cast<int64_t>(std::round(g * 1e4));
}
/*! \brief Hessian setter for the int64_t-backed specialisation: scales the
 * float h by a fixed decimal factor, rounds with std::round and stores the
 * result in hess_ as an int64_t. */
template<>
inline XGBOOST_DEVICE void bst_gpair_internal<int64_t>::SetHess(float h) {
// NOTE(review): two assignments are listed here; they look like the
// removed/added pair of the commit diff (scale 1e5 -> 1e4). Confirm which one
// is live -- as written the second overwrites the first.
hess_ = static_cast<int64_t>(std::round(h * 1e5));
hess_ = static_cast<int64_t>(std::round(h * 1e4));
}

} // namespace detail
Expand All @@ -194,10 +194,10 @@ typedef detail::bst_gpair_internal<float> bst_gpair;
/*! \brief High precision gradient statistics pair */
typedef detail::bst_gpair_internal<double> bst_gpair_precise;

/*! \brief High precision gradient statistics pair with integer backed
* storage. Operators are associative where floating point versions are not
* associative. */
typedef detail::bst_gpair_internal<int64_t> bst_gpair_integer;
/*! \brief High precision gradient statistics pair with integer backed
* storage. Operators are associative where floating point versions are not
* associative. */
typedef detail::bst_gpair_internal<int64_t> bst_gpair_integer;

/*! \brief small eps gap for minimum split decision. */
const bst_float rt_eps = 1e-6f;
Expand Down
35 changes: 29 additions & 6 deletions src/tree/updater_gpu_common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,27 @@
namespace xgboost {
namespace tree {

/**
 * \fn  void CheckGradientMax(const dh::dvec<bst_gpair>& gpair)
 *
 * \brief Fail a CHECK unless the largest absolute value stored in gpair is
 *        below 2^16. This is to prevent overflow when using integer gradient
 *        summation.
 *
 * The buffer behind gpair is read as a flat sequence of floats, two per
 * element, and reduced with thrust to the maximum absolute value (starting
 * from 0, so an empty vector passes).
 *
 * \param gpair Device vector of gradient pairs to validate.
 */
inline void CheckGradientMax(const dh::dvec<bst_gpair>& gpair) {
  // NOTE(review): viewing the pairs as raw floats assumes bst_gpair is exactly
  // two packed floats -- confirm against its definition.
  const float* raw_values = reinterpret_cast<const float*>(gpair.data());
  thrust::device_ptr<const float> first(raw_values);
  thrust::device_ptr<const float> last = first + (gpair.size() * 2);

  float abs_max = thrust::reduce(
      first, last, 0.f,
      [=] __device__(float a, float b) { return max(abs(a), abs(b)); });

  // The CHECK expression is kept as-is: the macro's failure text is built
  // from the spelled-out arguments.
  CHECK_LT(abs_max, std::pow(2.0f, 16.0f))
      << "Labels are too large for this algorithm. Rescale to less than 2^16.";
}

struct GPUTrainingParam {
// minimum amount of hessian(weight) allowed in a child
float min_child_weight;
Expand Down Expand Up @@ -64,8 +85,8 @@ struct DeviceSplitCandidate {
: loss_chg(-FLT_MAX), dir(LeftDir), fvalue(0), findex(-1) {}

template <typename param_t>
__host__ __device__ void Update(const DeviceSplitCandidate &other,
const param_t& param) {
__host__ __device__ void Update(const DeviceSplitCandidate& other,
const param_t& param) {
if (other.loss_chg > loss_chg &&
other.left_sum.GetHess() >= param.min_child_weight &&
other.right_sum.GetHess() >= param.min_child_weight) {
Expand Down Expand Up @@ -170,8 +191,10 @@ struct SumCallbackOp {
};

template <typename gpair_t>
__device__ inline float device_calc_loss_chg(
const GPUTrainingParam& param, const gpair_t& left, const gpair_t& parent_sum, const float& parent_gain) {
__device__ inline float device_calc_loss_chg(const GPUTrainingParam& param,
const gpair_t& left,
const gpair_t& parent_sum,
const float& parent_gain) {
gpair_t right = parent_sum - left;
float left_gain = CalcGain(param, left.GetGrad(), left.GetHess());
float right_gain = CalcGain(param, right.GetGrad(), right.GetHess());
Expand All @@ -187,8 +210,8 @@ __device__ float inline loss_chg_missing(const gpair_t& scan,
bool& missing_left_out) { // NOLINT
float missing_left_loss =
device_calc_loss_chg(param, scan + missing, parent_sum, parent_gain);
float missing_right_loss = device_calc_loss_chg(
param, scan, parent_sum, parent_gain);
float missing_right_loss =
device_calc_loss_chg(param, scan, parent_sum, parent_gain);

if (missing_left_loss >= missing_right_loss) {
missing_left_out = true;
Expand Down
3 changes: 3 additions & 0 deletions src/tree/updater_gpu_hist.cu
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,9 @@ class GPUHistMaker : public TreeUpdater {
device_gpair[d_idx].copy(gpair.begin() + device_row_segments[d_idx],
gpair.begin() + device_row_segments[d_idx + 1]);

// Check gradients are within acceptable size range
CheckGradientMax(device_gpair[d_idx]);

subsample_gpair(&device_gpair[d_idx], param.subsample,
device_row_segments[d_idx]);

Expand Down
6 changes: 4 additions & 2 deletions src/tree/updater_gpu_hist_experimental.cu
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,8 @@ struct DeviceShard {
ridx_segments.front() = std::make_pair(0, ridx.size());
this->gpair.copy(host_gpair.begin() + row_start_idx,
host_gpair.begin() + row_end_idx);
// Check gradients are within acceptable size range
CheckGradientMax(gpair);
hist.Reset();
}

Expand Down Expand Up @@ -551,8 +553,8 @@ class GPUHistMakerExperimental : public TreeUpdater {
// Adds to *d_count the number of voting threads for which val == left_nidx.
// Each thread votes through __ballot; only threads whose threadIdx.x is a
// multiple of 32 perform the atomic add of the ballot's population count.
// NOTE(review): this presumes 32-thread warps and a launch where
// threadIdx.x % 32 == 0 selects one thread per warp -- confirm against caller.
//   d_count   - counter that receives the tally (updated atomically).
//   val       - this thread's value to compare.
//   left_nidx - value that counts as a "left" vote.
__device__ void CountLeft(int64_t* d_count, int val, int left_nidx) {
unsigned ballot = __ballot(val == left_nidx);
if (threadIdx.x % 32 == 0) {
// The counter is accessed through an unsigned long long view for atomicAdd.
// NOTE(review): the call is listed twice; this looks like the removed/added
// pair of a whitespace-only diff. Confirm only one is live -- as written the
// count would be added twice.
atomicAdd(reinterpret_cast<unsigned long long*>(d_count), // NOLINT
static_cast<unsigned long long>(__popc(ballot))); // NOLINT
atomicAdd(reinterpret_cast<unsigned long long*>(d_count), // NOLINT
static_cast<unsigned long long>(__popc(ballot))); // NOLINT
}
}

Expand Down

0 comments on commit d9d5293

Please sign in to comment.