
Commit 2bf8d0f

slaren and JohannesGaessler authored Mar 18, 2024
backend : offload large batches to GPU (ggml-org#6083)
* backend : offload large batches to GPU
* fix hip
* code cleanup
* fix CUDA split buffers
* Update ggml-backend-impl.h
  Co-authored-by: Johannes Gäßler <[email protected]>
* cuda : fix memset without set_device
* imatrix : remove sched affix from weight names
* sched : add a new split if the current one has too many inputs
  reduce max inputs per split
  more cleanup
* update backends
ggml-ci
---------
Co-authored-by: Johannes Gäßler <[email protected]>
1 parent 496bc79 commit 2bf8d0f

14 files changed (+349 -396 lines)
 

examples/imatrix/imatrix.cpp (+25 -7)
@@ -56,13 +56,31 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const struct ggml_tensor * src0 = t->src[0];
     const struct ggml_tensor * src1 = t->src[1];
 
+    std::string wname;
+    {
+        // remove any prefix and suffixes from the name
+        // CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
+        const char * p = strchr(src0->name, '#');
+        if (p != NULL) {
+            p = p + 1;
+            const char * q = strchr(p, '#');
+            if (q != NULL) {
+                wname = std::string(p, q - p);
+            } else {
+                wname = p;
+            }
+        } else {
+            wname = src0->name;
+        }
+    }
+
     // when ask is true, the scheduler wants to know if we are interested in data from this tensor
     // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
     if (ask) {
         if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
         if (t->op != GGML_OP_MUL_MAT) return false;
         if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
-        if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
+        if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
         return true;
     }
 
@@ -94,20 +112,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         // this is necessary to guarantee equal number of "ncall" for each tensor
         for (int ex = 0; ex < n_as; ++ex) {
             src0 = t->src[2 + ex];
-            auto& e = m_stats[src0->name];
+            auto& e = m_stats[wname];
             if (e.values.empty()) {
                 e.values.resize(src1->ne[0], 0);
             }
             else if (e.values.size() != (size_t)src1->ne[0]) {
-                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
                 exit(1); //GGML_ASSERT(false);
             }
             // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
             // using the following line, we can correct for that if needed
             //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
             ++e.ncall;
             if (m_params.verbosity > 1) {
-                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
             }
             for (int row = 0; row < (int)src1->ne[1]; ++row) {
                 const int excur = m_ids[row*n_as + idx];
@@ -129,17 +147,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             }
         }
     } else {
-        auto& e = m_stats[src0->name];
+        auto& e = m_stats[wname];
         if (e.values.empty()) {
             e.values.resize(src1->ne[0], 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
             exit(1); //GGML_ASSERT(false);
         }
         ++e.ncall;
         if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
         }
         for (int row = 0; row < (int)src1->ne[1]; ++row) {
             const float * x = data + row * src1->ne[0];
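
Note on the imatrix.cpp change: with the new scheduler, the tensor seen by collect_imatrix can carry a decorated name such as CUDA0#blk.0.attn_k.weight#0, so the stats map is now keyed by the stripped wname instead of src0->name. Below is a standalone sketch of the same stripping logic for illustration only; the helper name strip_sched_name and the strings in main are not part of the commit.

#include <cstdio>
#include <cstring>
#include <string>

// illustrative helper: recover the plain weight name from a name decorated by
// the backend scheduler, e.g. "CUDA0#blk.0.attn_k.weight#0" -> "blk.0.attn_k.weight"
static std::string strip_sched_name(const char * name) {
    const char * p = strchr(name, '#');
    if (p == NULL) {
        return name;                          // not decorated, keep as-is
    }
    p += 1;                                   // skip the "<backend>#" prefix
    const char * q = strchr(p, '#');
    return q != NULL ? std::string(p, q - p)  // drop the "#<index>" suffix
                     : std::string(p);
}

int main() {
    printf("%s\n", strip_sched_name("CUDA0#blk.0.attn_k.weight#0").c_str()); // blk.0.attn_k.weight
    printf("%s\n", strip_sched_name("output.weight").c_str());              // output.weight
    return 0;
}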

examples/llama-bench/llama-bench.cpp (+2 -2)
@@ -114,10 +114,10 @@ static std::string get_cpu_info() {
 static std::string get_gpu_info() {
     std::string id;
 #ifdef GGML_USE_CUBLAS
-    int count = ggml_cuda_get_device_count();
+    int count = ggml_backend_cuda_get_device_count();
     for (int i = 0; i < count; i++) {
         char buf[128];
-        ggml_cuda_get_device_description(i, buf, sizeof(buf));
+        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
         id += buf;
         if (i < count - 1) {
             id += "/";

ggml-alloc.c (+7 -3)
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
-        if (ggml_is_view(node)) {
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
             struct ggml_tensor * view_src = node->view_src;
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 
             ggml_gallocr_hash_get(galloc, src)->n_children += 1;
 
-            // allocate explicit inputs and leafs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                 ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
         }
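
The new comment in ggml-alloc.c refers to no-op nodes that ggml-backend inserts purely to carry lifetime dependencies: a view whose op stays GGML_OP_NONE is never computed, but its src slots keep other tensors alive until that point in the graph. A hedged sketch of how such a node might be built; ctx, input, dep_target and graph are placeholders, not identifiers from this commit.

// a no-op dependency node: ggml_view_tensor() sets view_src but leaves
// op == GGML_OP_NONE, so the node itself is never executed or read
struct ggml_tensor * dep = ggml_view_tensor(ctx, input);

// listing dep_target as a source makes the allocator treat it as still in use
// until this node is reached, delaying when its memory can be reused
dep->src[0] = dep_target;

// appended to the graph like a normal node; with the change above, ggml-alloc
// no longer counts this GGML_OP_NONE view as a real view dependency
graph->nodes[graph->n_nodes++] = dep;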

ggml-backend-impl.h (+5)
@@ -103,6 +103,11 @@ extern "C" {
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
         // (optional) event synchronization
         ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
         void (*GGML_CALL event_free) (ggml_backend_event_t event);