Skip to content

Commit

Permalink
Merge pull request PaddlePaddle#123 from ShuRaymond/paddlebox
Browse files Browse the repository at this point in the history
merge /jiaoxuewu/PaddleBox
  • Loading branch information
jack603047588 authored Oct 11, 2024
2 parents 677edee + 4c076e5 commit eefcbc0
Show file tree
Hide file tree
Showing 13 changed files with 1,972 additions and 1,042 deletions.
2 changes: 1 addition & 1 deletion paddle/fluid/framework/boxps_trainer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ void BoxPSTrainer::InitDumpEnv() {
// dump_futures_.emplace_back(pool->Run([this, i]() { this->DumpWork(i); }));
// }
// VLOG(0) << "init dump write file thread num=" << dump_thread_num_;
localfs_mkdir(dump_fields_path_);
localfs_mkdir(dump_fields_path_);
}
// final dump env
void BoxPSTrainer::FinalizeDumpEnv() {
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/framework/boxps_worker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,7 @@ void BoxPSWorker::Initialize(const TrainerDesc& desc) {
<< ", dump thread num: " << dump_thread_num_;
}
VLOG(1) << "boxps_worker init device num: " << device_num_;

}

void BoxPSWorker::Finalize() {
Expand Down Expand Up @@ -1205,6 +1206,9 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) {
}
str_os << "]";
}
if (sync_points_.find(op.get()) != sync_points_.end()) {
str_os << ", sync point";
}
str_os << "\n";
}
auto box_ptr = BoxWrapper::GetInstance();
Expand Down
4 changes: 3 additions & 1 deletion paddle/fluid/framework/data_set.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@
USE_INT_STAT(STAT_total_feasign_num_in_mem);
DECLARE_bool(graph_get_neighbor_id);
DECLARE_bool(padbox_dataset_enable_unrollinstance);

// Defines the exported boolean flag `padbox_disable_ins_shuffle`; its default
// value is false.
// NOTE(review): the code that reads this flag is not visible in this hunk.
// Judging by the name and help text it presumably turns off instance
// shuffling in the padbox dataset -- confirm against the flag's use sites.
PADDLE_DEFINE_EXPORTED_bool(padbox_disable_ins_shuffle,
false,
"paddle disable ins shuffle ,default false");
namespace paddle {
namespace framework {

Expand Down
1 change: 0 additions & 1 deletion paddle/fluid/framework/device_worker.h
Original file line number Diff line number Diff line change
Expand Up @@ -916,7 +916,6 @@ class BoxPSWorker : public DeviceWorker {
virtual void DumpField(const Scope& scope,
int dump_mode,
int dump_interval = 10000);

private:
void OpenDump(const int &tid);
void WriteDump(const int &tid, const std::string& buf);
Expand Down
12 changes: 11 additions & 1 deletion paddle/fluid/framework/fleet/box_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/string/string_helper.h"

#include "paddle/phi/common/data_type.h"
#include "paddle/fluid/framework/fleet/metrics.h"
#include "paddle/fluid/framework/fleet/box_wrapper_kernel.h"

Expand Down Expand Up @@ -361,6 +361,11 @@ class MetricMsg {
true,
platform::errors::InvalidArgument(
"Error: monitor var `%s` uninitialized Tensor.", varname.c_str()));
PADDLE_ENFORCE_EQ(
paddle::experimental::SizeOf(gpu_tensor.dtype()),
sizeof(T),
platform::errors::InvalidArgument(
"Error: monitor var `%s` type error.", varname.c_str()));
*data = gpu_tensor.data<T>();
*len = gpu_tensor.numel();
}
Expand All @@ -379,6 +384,11 @@ class MetricMsg {
true,
platform::errors::InvalidArgument(
"Error: monitor var `%s` uninitialized Tensor.", varname.c_str()));
PADDLE_ENFORCE_EQ(
paddle::experimental::SizeOf(gpu_tensor.dtype()),
sizeof(T),
platform::errors::InvalidArgument(
"Error: monitor var `%s` type error.", varname.c_str()));
auto* gpu_data = gpu_tensor.data<T>();
auto len = gpu_tensor.numel();
data->resize(len);
Expand Down
55 changes: 42 additions & 13 deletions paddle/fluid/framework/fleet/metrics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
#endif

#if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE) || defined(PADDLE_WITH_BOX_PS)

// Defines the exported boolean flag `enable_debug_print_metrics_info`; its
// default value is false.
// When the flag is true, the AUC computation in this file emits one
// LOG(WARNING) line with the total and local instance counts, fp, tp, and the
// locally accumulated label, prediction, absolute-error and squared-error
// sums. It is a diagnostics switch only; no metric value depends on it.
PADDLE_DEFINE_EXPORTED_bool(enable_debug_print_metrics_info,
false,
"enable debug print metrics info, default false");

namespace paddle {
namespace framework {

Expand All @@ -39,11 +44,13 @@ void BasicAucCalculator::add_unlock_data(double pred, int label) {
platform::errors::PreconditionNotMet(
"label must be equal to 0 or 1, but its value is: %d", label));

int pos = static_cast<int>(pred * _table_size);
int pos = std::min(static_cast<int>(pred * _table_size), _table_size - 1);
_local_abserr += fabs(pred - label);
_local_sqrerr += (pred - label) * (pred - label);
_local_pred += pred;
++_table[label][pos];
_local_label += label;
_table[label][pos] += 1.0;
_local_total_num += 1.0;
}

void BasicAucCalculator::add_unlock_data(double pred, int label, float sample_scale) {
Expand All @@ -54,12 +61,14 @@ void BasicAucCalculator::add_unlock_data(double pred, int label, float sample_sc
PADDLE_ENFORCE_EQ(label * label, label,
platform::errors::PreconditionNotMet(
"label must be equal to 0 or 1, but its value is: %d", label));

int pos = static_cast<int>(pred * _table_size);
int pos = std::min(static_cast<int>(pred * _table_size), _table_size - 1);
_local_abserr += fabs(pred - label);
_local_sqrerr += (pred - label) * (pred - label);
_local_pred += pred * sample_scale;
_local_label += label;
_table[label][pos] += sample_scale;
_local_total_num += sample_scale;
}

void BasicAucCalculator::add_unlock_data_with_float_label(double pred, double label) {
Expand All @@ -68,7 +77,7 @@ void BasicAucCalculator::add_unlock_data_with_float_label(double pred, double la
PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet(
"pred should be lower than 1"));

int pos = static_cast<int>(pred * _table_size);
int pos = std::min(static_cast<int>(pred * _table_size), _table_size - 1);
PADDLE_ENFORCE_GE(
pos, 0,
platform::errors::PreconditionNotMet(
Expand All @@ -80,8 +89,10 @@ void BasicAucCalculator::add_unlock_data_with_float_label(double pred, double la
_local_abserr += fabs(pred - label);
_local_sqrerr += (pred - label) * (pred - label);
_local_pred += pred;
_local_label += label;
_table[0][pos] += 1 - label;
_table[1][pos] += label;
_local_total_num += 1.0;
}

void BasicAucCalculator::add_unlock_data_with_continue_label(double pred,
Expand All @@ -90,7 +101,7 @@ void BasicAucCalculator::add_unlock_data_with_continue_label(double pred,
_local_sqrerr += (pred - label) * (pred - label);
_local_pred += pred;
_local_label += label;
++_local_total_num;
_local_total_num += 1.0;
}

void BasicAucCalculator::add_nan_inf_unlock_data(float pred, int label){
Expand Down Expand Up @@ -324,38 +335,56 @@ void BasicAucCalculator::compute() {
fp = newfp;
tp = newtp;
}

if (fp < 1e-3 || tp < 1e-3) {
_auc = -0.5; // which means all nonclick or click
} else {
_auc = area / (fp * tp);
}

double total_ins_num = _local_total_num;
if (node_size > 1) {
#ifdef PADDLE_WITH_BOX_PS
// allreduce sum
double local_err[3] = {_local_abserr, _local_sqrerr, _local_pred};
boxps::MPICluster::Ins().allreduce_sum(local_err, 3);
double local_err[5] = {
_local_abserr, _local_sqrerr, _local_pred, _local_label, _local_total_num};
boxps::MPICluster::Ins().allreduce_sum(local_err, 5);
#elif defined(PADDLE_WITH_GLOO)
// allreduce sum
std::vector<double> local_err_temp{_local_abserr, _local_sqrerr, _local_pred};
std::vector<double> local_err_temp{
_local_abserr, _local_sqrerr, _local_pred, _local_label, _local_total_num};
auto local_err = gloo_wrapper->AllReduce(local_err_temp, "sum");
#else
// allreduce sum
double local_err[3] = {_local_abserr, _local_sqrerr, _local_pred};
double local_err[5] = {
_local_abserr, _local_sqrerr, _local_pred, _local_label, _local_total_num};
#endif
_mae = local_err[0] / (fp + tp);
_rmse = sqrt(local_err[1] / (fp + tp));
_predicted_ctr = local_err[2] / (fp + tp);
total_ins_num = local_err[4];
} else {
_mae = _local_abserr / (fp + tp);
_rmse = sqrt(_local_sqrerr / (fp + tp));
_predicted_ctr = _local_pred / (fp + tp);
}
_actual_ctr = tp / (fp + tp);

_size = fp + tp;

// add debug info print
if (FLAGS_enable_debug_print_metrics_info) {
LOG(WARNING) << "total ins num: " << total_ins_num
<< ", local ins num: " << _local_total_num
<< ", fp: " << fp
<< ", tp: " << tp
<< ", label: " << _local_label
<< ", pred: " << _local_pred
<< ", abs: " << _local_abserr
<< ", sqr: " << _local_sqrerr;
}

PADDLE_ENFORCE_LT(abs(total_ins_num - _size),
0.5,
platform::errors::InvalidArgument(
"The table ins num not equal real total num."));
calculate_bucket_error(table[0], table[1]);
}

Expand Down
3 changes: 2 additions & 1 deletion paddle/fluid/operators/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel bac

if(WITH_XPU)
xpu_library(batch_fc_kernel
SRCS batch_fc_kernel.h batch_fc_kernel.kps)
SRCS batch_fc_kernel.h batch_fc_kernel.kps
DEPS resnet_unit_op)
register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS} batch_fc_kernel)
else()
Expand Down
1 change: 0 additions & 1 deletion paddle/fluid/operators/fused/fused_seq_tensor_op.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#include <cublas.h>
#include <fstream>
#include <string>
#include "paddle/fluid/operators/fused/fused_seq_tensor_op.h" // don't remove this
Expand Down
Loading

0 comments on commit eefcbc0

Please sign in to comment.