forked from dmlc/xgboost
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxgboost_wrapper.cpp
353 lines (342 loc) · 12.8 KB
/
xgboost_wrapper.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
// implementations in ctypes
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <cstdio>
#include <vector>
#include <string>
#include <cstring>
#include <cmath>
#include <algorithm>
// include all std functions
using namespace std;
#ifdef _MSC_VER
#define isnan(x) (_isnan(x) != 0)
#endif
#include "./xgboost_wrapper.h"
#include "../src/data.h"
#include "../src/learner/learner-inl.hpp"
#include "../src/io/io.h"
#include "../src/utils/utils.h"
#include "../src/utils/group_data.h"
#include "../src/io/simple_dmatrix-inl.hpp"
using namespace xgboost;
using namespace xgboost::io;
namespace xgboost {
namespace wrapper {
// booster wrapper class
class Booster: public learner::BoostLearner {
public:
explicit Booster(const std::vector<DataMatrix*>& mats) {
this->silent = 1;
this->init_model = false;
this->SetCacheData(mats);
}
inline const float *Pred(const DataMatrix &dmat, int option_mask, unsigned ntree_limit, bst_ulong *len) {
this->CheckInitModel();
this->Predict(dmat, (option_mask&1) != 0, &this->preds_, ntree_limit, (option_mask&2) != 0);
*len = static_cast<bst_ulong>(this->preds_.size());
return BeginPtr(this->preds_);
}
inline void BoostOneIter(const DataMatrix &train,
float *grad, float *hess, bst_ulong len) {
this->gpair_.resize(len);
const bst_omp_uint ndata = static_cast<bst_omp_uint>(len);
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
gpair_[j] = bst_gpair(grad[j], hess[j]);
}
gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
}
inline void CheckInitModel(void) {
if (!init_model) {
this->InitModel(); init_model = true;
}
}
inline void LoadModel(const char *fname) {
learner::BoostLearner::LoadModel(fname);
this->init_model = true;
}
inline void LoadModelFromBuffer(const void *buf, size_t size) {
utils::MemoryFixSizeBuffer fs((void*)buf, size);
learner::BoostLearner::LoadModel(fs);
this->init_model = true;
}
inline const char *GetModelRaw(bst_ulong *out_len) {
model_str.resize(0);
utils::MemoryBufferStream fs(&model_str);
learner::BoostLearner::SaveModel(fs);
*out_len = static_cast<bst_ulong>(model_str.length());
if (*out_len == 0) {
return NULL;
} else {
return &model_str[0];
}
}
inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, bst_ulong *len) {
model_dump = this->DumpModel(fmap, with_stats);
model_dump_cptr.resize(model_dump.size());
for (size_t i = 0; i < model_dump.size(); ++i) {
model_dump_cptr[i] = model_dump[i].c_str();
}
*len = static_cast<bst_ulong>(model_dump.size());
return BeginPtr(model_dump_cptr);
}
// temporal fields
// temporal data to save evaluation dump
std::string eval_str;
// temporal data to save model dump
std::string model_str;
// temporal space to save model dump
std::vector<std::string> model_dump;
std::vector<const char*> model_dump_cptr;
private:
bool init_model;
};
#if !defined(XGBOOST_STRICT_CXX98_)
inline bool CheckNAN(float v) {
return isnan(v);
}
#else
// redirect to defs in R
bool CheckNAN(float v);
#endif
} // namespace wrapper
} // namespace xgboost
using namespace xgboost::wrapper;
extern "C"{
void* XGDMatrixCreateFromFile(const char *fname, int silent) {
return LoadDataMatrix(fname, silent != 0, false, false);
}
void* XGDMatrixCreateFromCSR(const bst_ulong *indptr,
const unsigned *indices,
const float *data,
bst_ulong nindptr,
bst_ulong nelem) {
DMatrixSimple *p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
mat.row_ptr_.resize(nindptr);
for (bst_ulong i = 0; i < nindptr; ++i) {
mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
}
mat.row_data_.resize(nelem);
for (bst_ulong i = 0; i < nelem; ++i) {
mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
mat.info.info.num_col = std::max(mat.info.info.num_col,
static_cast<size_t>(indices[i]+1));
}
mat.info.info.num_row = nindptr - 1;
return p_mat;
}
XGB_DLL void* XGDMatrixCreateFromCSC(const bst_ulong *col_ptr,
const unsigned *indices,
const float *data,
bst_ulong nindptr,
bst_ulong nelem) {
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
DMatrixSimple *p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
utils::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
builder.InitBudget(0, nthread);
long ncol = static_cast<long>(nindptr - 1);
#pragma omp parallel for schedule(static)
for (long i = 0; i < ncol; ++i) {
int tid = omp_get_thread_num();
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.AddBudget(indices[j], tid);
}
}
builder.InitStorage();
#pragma omp parallel for schedule(static)
for (long i = 0; i < ncol; ++i) {
int tid = omp_get_thread_num();
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.Push(indices[j],
RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
tid);
}
}
mat.info.info.num_row = mat.row_ptr_.size() - 1;
mat.info.info.num_col = static_cast<size_t>(ncol);
return p_mat;
}
void* XGDMatrixCreateFromMat(const float *data,
bst_ulong nrow,
bst_ulong ncol,
float missing) {
bool nan_missing = CheckNAN(missing);
DMatrixSimple *p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
mat.info.info.num_row = nrow;
mat.info.info.num_col = ncol;
for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
bst_ulong nelem = 0;
for (bst_ulong j = 0; j < ncol; ++j) {
if (CheckNAN(data[j])) {
utils::Check(nan_missing,
"There are NAN in the matrix, however, you did not set missing=NAN");
} else {
if (nan_missing || data[j] != missing) {
mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
++nelem;
}
}
}
mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem);
}
return p_mat;
}
void* XGDMatrixSliceDMatrix(void *handle,
const int *idxset,
bst_ulong len) {
DMatrixSimple tmp;
DataMatrix &dsrc = *static_cast<DataMatrix*>(handle);
if (dsrc.magic != DMatrixSimple::kMagic) {
tmp.CopyFrom(dsrc);
}
DataMatrix &src = (dsrc.magic == DMatrixSimple::kMagic ?
*static_cast<DMatrixSimple*>(handle): tmp);
DMatrixSimple *p_ret = new DMatrixSimple();
DMatrixSimple &ret = *p_ret;
utils::Check(src.info.group_ptr.size() == 0,
"slice does not support group structure");
ret.Clear();
ret.info.info.num_row = len;
ret.info.info.num_col = src.info.num_col();
utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
iter->BeforeFirst();
utils::Assert(iter->Next(), "slice");
const RowBatch &batch = iter->Value();
for (bst_ulong i = 0; i < len; ++i) {
const int ridx = idxset[i];
RowBatch::Inst inst = batch[ridx];
utils::Check(static_cast<bst_ulong>(ridx) < batch.size, "slice index exceed number of rows");
ret.row_data_.resize(ret.row_data_.size() + inst.length);
memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
sizeof(RowBatch::Entry) * inst.length);
ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length);
if (src.info.labels.size() != 0) {
ret.info.labels.push_back(src.info.labels[ridx]);
}
if (src.info.weights.size() != 0) {
ret.info.weights.push_back(src.info.weights[ridx]);
}
if (src.info.info.root_index.size() != 0) {
ret.info.info.root_index.push_back(src.info.info.root_index[ridx]);
}
if (src.info.info.fold_index.size() != 0) {
ret.info.info.fold_index.push_back(src.info.info.fold_index[ridx]);
}
}
return p_ret;
}
void XGDMatrixFree(void *handle) {
delete static_cast<DataMatrix*>(handle);
}
void XGDMatrixSaveBinary(void *handle, const char *fname, int silent) {
SaveDataMatrix(*static_cast<DataMatrix*>(handle), fname, silent != 0);
}
void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *info, bst_ulong len) {
std::vector<float> &vec =
static_cast<DataMatrix*>(handle)->info.GetFloatInfo(field);
vec.resize(len);
memcpy(BeginPtr(vec), info, sizeof(float) * len);
}
void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *info, bst_ulong len) {
std::vector<unsigned> &vec =
static_cast<DataMatrix*>(handle)->info.GetUIntInfo(field);
vec.resize(len);
memcpy(BeginPtr(vec), info, sizeof(unsigned) * len);
}
void XGDMatrixSetGroup(void *handle, const unsigned *group, bst_ulong len) {
DataMatrix *pmat = static_cast<DataMatrix*>(handle);
pmat->info.group_ptr.resize(len + 1);
pmat->info.group_ptr[0] = 0;
for (uint64_t i = 0; i < len; ++i) {
pmat->info.group_ptr[i+1] = pmat->info.group_ptr[i]+group[i];
}
}
const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, bst_ulong* len) {
const std::vector<float> &vec =
static_cast<const DataMatrix*>(handle)->info.GetFloatInfo(field);
*len = static_cast<bst_ulong>(vec.size());
return BeginPtr(vec);
}
const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* len) {
const std::vector<unsigned> &vec =
static_cast<const DataMatrix*>(handle)->info.GetUIntInfo(field);
*len = static_cast<bst_ulong>(vec.size());
return BeginPtr(vec);
}
bst_ulong XGDMatrixNumRow(const void *handle) {
return static_cast<bst_ulong>(static_cast<const DataMatrix*>(handle)->info.num_row());
}
// xgboost implementation
void *XGBoosterCreate(void *dmats[], bst_ulong len) {
std::vector<DataMatrix*> mats;
for (bst_ulong i = 0; i < len; ++i) {
DataMatrix *dtr = static_cast<DataMatrix*>(dmats[i]);
mats.push_back(dtr);
}
return new Booster(mats);
}
void XGBoosterFree(void *handle) {
delete static_cast<Booster*>(handle);
}
void XGBoosterSetParam(void *handle, const char *name, const char *value) {
static_cast<Booster*>(handle)->SetParam(name, value);
}
void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain) {
Booster *bst = static_cast<Booster*>(handle);
DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
bst->CheckInitModel();
bst->CheckInit(dtr);
bst->UpdateOneIter(iter, *dtr);
}
void XGBoosterBoostOneIter(void *handle, void *dtrain,
float *grad, float *hess, bst_ulong len) {
Booster *bst = static_cast<Booster*>(handle);
DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
bst->CheckInitModel();
bst->CheckInit(dtr);
bst->BoostOneIter(*dtr, grad, hess, len);
}
const char* XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
const char *evnames[], bst_ulong len) {
Booster *bst = static_cast<Booster*>(handle);
std::vector<std::string> names;
std::vector<const DataMatrix*> mats;
for (bst_ulong i = 0; i < len; ++i) {
mats.push_back(static_cast<DataMatrix*>(dmats[i]));
names.push_back(std::string(evnames[i]));
}
bst->CheckInitModel();
bst->eval_str = bst->EvalOneIter(iter, mats, names);
return bst->eval_str.c_str();
}
const float *XGBoosterPredict(void *handle, void *dmat, int option_mask, unsigned ntree_limit, bst_ulong *len) {
return static_cast<Booster*>(handle)->Pred(*static_cast<DataMatrix*>(dmat), option_mask, ntree_limit, len);
}
void XGBoosterLoadModel(void *handle, const char *fname) {
static_cast<Booster*>(handle)->LoadModel(fname);
}
void XGBoosterSaveModel(const void *handle, const char *fname) {
static_cast<const Booster*>(handle)->SaveModel(fname);
}
void XGBoosterLoadModelFromBuffer(void *handle, const void *buf, bst_ulong len) {
static_cast<Booster*>(handle)->LoadModelFromBuffer(buf, len);
}
const char *XGBoosterGetModelRaw(void *handle, bst_ulong *out_len) {
return static_cast<Booster*>(handle)->GetModelRaw(out_len);
}
const char** XGBoosterDumpModel(void *handle, const char *fmap, int with_stats, bst_ulong *len){
utils::FeatMap featmap;
if (strlen(fmap) != 0) {
featmap.LoadText(fmap);
}
return static_cast<Booster*>(handle)->GetModelDump(featmap, with_stats != 0, len);
}
}