Skip to content

Commit

Permalink
Write ELLPACK pages to disk (#4879)
Browse files Browse the repository at this point in the history
* add ellpack source
* add batch param
* extract function to parse cache info
* construct ellpack info separately
* push batch to ellpack page
* write ellpack page.
* make sparse page source reusable
  • Loading branch information
rongou authored and trivialfis committed Oct 23, 2019
1 parent 310fe60 commit 5b1715d
Show file tree
Hide file tree
Showing 25 changed files with 934 additions and 407 deletions.
1 change: 0 additions & 1 deletion amalgamation/xgboost-all0.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@

#if DMLC_ENABLE_STD_THREAD
#include "../src/data/sparse_page_dmatrix.cc"
#include "../src/data/sparse_page_writer.cc"
#endif

// trees
Expand Down
76 changes: 59 additions & 17 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,18 @@ struct Entry {
}
};

/*!
 * \brief Settings that control how batches are constructed.
 *
 * A plain aggregate: the members carry no default initializers, so a
 * default-initialized instance has indeterminate values, while `BatchParam{}`
 * zeroes every field.
 */
struct BatchParam {
  /*! \brief Id of the GPU device to use. */
  int gpu_id;
  /*! \brief Upper bound on the number of histogram bins per feature. */
  int max_bin;
  /*! \brief Row count of one GPU batch; used when finding quantiles on the GPU. */
  int gpu_batch_nrows;
};

/*!
* \brief In-memory storage unit of sparse batch, stored in CSR format.
*/
Expand Down Expand Up @@ -191,14 +203,17 @@ class SparsePage {
SparsePage() {
this->Clear();
}
/*! \return number of instance in the page */

/*! \return Number of instances in the page: one less than the number of stored offsets. */
inline size_t Size() const {
  const auto n_offsets = offset.Size();
  return n_offsets - 1;
}

/*! \return Estimated memory cost of this page in bytes (offset storage plus entry storage). */
inline size_t MemCostBytes() const {
  const size_t offset_bytes = offset.Size() * sizeof(size_t);
  const size_t entry_bytes = data.Size() * sizeof(Entry);
  return offset_bytes + entry_bytes;
}

/*! \brief clear the page */
inline void Clear() {
base_rowid = 0;
Expand All @@ -208,6 +223,11 @@ class SparsePage {
data.HostVector().clear();
}

/*! \brief Set the base row id for this page. */
inline void SetBaseRowId(size_t row_id) {
base_rowid = row_id;
}

SparsePage GetTranspose(int num_columns) const;

void SortRows() {
Expand Down Expand Up @@ -238,13 +258,6 @@ class SparsePage {
* \param batch The row batch to be pushed
*/
void PushCSC(const SparsePage& batch);
/*!
* \brief Push one instance into page
* \param inst an instance row
*/
void Push(const Inst &inst);

size_t Size() { return offset.Size() - 1; }
};

class CSCPage: public SparsePage {
Expand All @@ -268,9 +281,31 @@ class EllpackPageImpl;
*/
class EllpackPage {
public:
explicit EllpackPage(DMatrix* dmat);
/*!
* \brief Default constructor.
*
* This is used in the external memory case. An empty ELLPACK page is constructed with its content
* set later by the reader.
*/
EllpackPage();

/*!
* \brief Constructor from an existing DMatrix.
*
* This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
* in CSR format.
*/
explicit EllpackPage(DMatrix* dmat, const BatchParam& param);

/*! \brief Destructor. */
~EllpackPage();

/*! \return Number of instances in the page. */
size_t Size() const;

/*! \brief Set the base row id for this page. */
void SetBaseRowId(size_t row_id);

const EllpackPageImpl* Impl() const { return impl_.get(); }
EllpackPageImpl* Impl() { return impl_.get(); }

Expand Down Expand Up @@ -356,7 +391,8 @@ class DataSource : public dmlc::DataIter<T> {
* There are two ways to create a customized DMatrix that reads in a user-defined format.
*
* - Provide a dmlc::Parser and pass into the DMatrix::Create
* - Alternatively, if data can be represented by an URL, define a new dmlc::Parser and register by DMLC_REGISTER_DATA_PARSER;
* - Alternatively, if data can be represented by a URL, define a new dmlc::Parser and register by
* DMLC_REGISTER_DATA_PARSER;
* - This works best for user defined data input source, such as data-base, filesystem.
* - Provide a DataSource, that can be passed to DMatrix::Create
* This can be used to reuse an in-memory data structure as a DMatrix.
Expand All @@ -373,7 +409,7 @@ class DMatrix {
* \brief Gets batches. Use range based for loop over BatchSet to access individual batches.
*/
template<typename T>
BatchSet<T> GetBatches();
BatchSet<T> GetBatches(const BatchParam& param = {});
// the following are column meta data, should be able to answer them fast.
/*! \return Whether the data columns form a single column block. */
virtual bool SingleColBlock() const = 0;
Expand All @@ -389,6 +425,12 @@ class DMatrix {
* \return The created DMatrix.
*/
virtual void SaveToLocalFile(const std::string& fname);

/*! \brief Whether the matrix is dense. */
bool IsDense() const {
return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
}

/*!
* \brief Load DMatrix from URI.
* \param uri The URI of input.
Expand Down Expand Up @@ -438,27 +480,27 @@ class DMatrix {
virtual BatchSet<SparsePage> GetRowBatches() = 0;
virtual BatchSet<CSCPage> GetColumnBatches() = 0;
virtual BatchSet<SortedCSCPage> GetSortedColumnBatches() = 0;
virtual BatchSet<EllpackPage> GetEllpackBatches() = 0;
virtual BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) = 0;
};

template<>
inline BatchSet<SparsePage> DMatrix::GetBatches() {
inline BatchSet<SparsePage> DMatrix::GetBatches(const BatchParam&) {
return GetRowBatches();
}

template<>
inline BatchSet<CSCPage> DMatrix::GetBatches() {
inline BatchSet<CSCPage> DMatrix::GetBatches(const BatchParam&) {
return GetColumnBatches();
}

template<>
inline BatchSet<SortedCSCPage> DMatrix::GetBatches() {
inline BatchSet<SortedCSCPage> DMatrix::GetBatches(const BatchParam&) {
return GetSortedColumnBatches();
}

template<>
inline BatchSet<EllpackPage> DMatrix::GetBatches() {
return GetEllpackBatches();
inline BatchSet<EllpackPage> DMatrix::GetBatches(const BatchParam& param) {
return GetEllpackBatches(param);
}
} // namespace xgboost

Expand Down
17 changes: 13 additions & 4 deletions src/common/device_helpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -540,16 +540,21 @@ class BulkAllocator {
}

public:
BulkAllocator() = default;
BulkAllocator() = default;
// prevent accidental copying, moving or assignment of this object
BulkAllocator(const BulkAllocator&) = delete;
BulkAllocator(BulkAllocator&&) = delete;
void operator=(const BulkAllocator&) = delete;
void operator=(BulkAllocator&&) = delete;

~BulkAllocator() {
for (size_t i = 0; i < d_ptr_.size(); i++) {
if (!(d_ptr_[i] == nullptr)) {
/*!
* \brief Clear the bulk allocator.
*
* This frees the GPU memory managed by this allocator.
*/
void Clear() {
for (size_t i = 0; i < d_ptr_.size(); i++) { // NOLINT(modernize-loop-convert)
if (d_ptr_[i] != nullptr) {
safe_cuda(cudaSetDevice(device_idx_[i]));
XGBDeviceAllocator<char> allocator;
allocator.deallocate(thrust::device_ptr<char>(d_ptr_[i]), size_[i]);
Expand All @@ -558,6 +563,10 @@ class BulkAllocator {
}
}

~BulkAllocator() {
Clear();
}

// returns sum of bytes for all allocations
size_t Size() {
return std::accumulate(size_.begin(), size_.end(), static_cast<size_t>(0));
Expand Down
42 changes: 4 additions & 38 deletions src/data/data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@
#endif // DMLC_ENABLE_STD_THREAD

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::CSCPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SortedCSCPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::EllpackPage>);
} // namespace dmlc

namespace xgboost {
Expand Down Expand Up @@ -329,31 +332,6 @@ DMatrix* DMatrix::Create(std::unique_ptr<DataSource<SparsePage>>&& source,
} // namespace xgboost

namespace xgboost {
/*!
 * \brief Create a sparse page format from the name it was registered under.
 * \param name Name looked up in the SparsePageFormatReg registry.
 * \return Result of invoking the matching registry entry's body.
 *
 * A fatal error is logged when no entry with that name is registered.
 */
data::SparsePageFormat* data::SparsePageFormat::Create(const std::string& name) {
  using FormatRegistry = ::dmlc::Registry< ::xgboost::data::SparsePageFormatReg>;
  auto* entry = FormatRegistry::Get()->Find(name);
  if (entry == nullptr) {
    LOG(FATAL) << "Unknown format type " << name;
  }
  return (entry->body)();
}

/*!
 * \brief Extract a pair of format names from a cache prefix.
 *
 * The text following the last ".fmt-" marker is examined:
 *  - no marker present: both names are "raw";
 *  - marker present and the remainder contains '-': the parts before and after
 *    the last '-' are returned as (first, second);
 *  - marker present without a further '-': the remainder is used for both names.
 *
 * \param cache_prefix The cache prefix to inspect.
 * \return The pair of format names.
 */
std::pair<std::string, std::string>
data::SparsePageFormat::DecideFormat(const std::string& cache_prefix) {
  const std::string marker = ".fmt-";
  const size_t marker_pos = cache_prefix.rfind(marker);
  if (marker_pos == std::string::npos) {
    const std::string fallback = "raw";
    return std::make_pair(fallback, fallback);
  }

  const std::string spec = cache_prefix.substr(marker_pos + marker.length());
  const size_t dash_pos = spec.rfind('-');
  if (dash_pos == std::string::npos) {
    return std::make_pair(spec, spec);
  }
  return std::make_pair(spec.substr(0, dash_pos), spec.substr(dash_pos + 1));
}
SparsePage SparsePage::GetTranspose(int num_columns) const {
SparsePage transpose;
common::ParallelGroupBuilder<Entry> builder(&transpose.offset.HostVector(),
Expand Down Expand Up @@ -476,18 +454,6 @@ void SparsePage::PushCSC(const SparsePage &batch) {
self_offset = std::move(offset);
}

void SparsePage::Push(const Inst &inst) {
auto& data_vec = data.HostVector();
auto& offset_vec = offset.HostVector();
offset_vec.push_back(offset_vec.back() + inst.size());
size_t begin = data_vec.size();
data_vec.resize(begin + inst.size());
if (inst.size() != 0) {
std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(),
sizeof(Entry) * inst.size());
}
}

namespace data {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);
Expand Down
6 changes: 2 additions & 4 deletions src/data/ellpack_page.cc
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
/*!
* Copyright 2019 XGBoost contributors
*
* \file ellpack_page.cc
*/
#ifndef XGBOOST_USE_CUDA

#include <xgboost/data.h>

// dummy implementation of ELlpackPage in case CUDA is not used
// dummy implementation of EllpackPage in case CUDA is not used
namespace xgboost {

class EllpackPageImpl {};

EllpackPage::EllpackPage(DMatrix* dmat) {
EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param) {
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
}

Expand Down
Loading

0 comments on commit 5b1715d

Please sign in to comment.