Skip to content

Commit

Permalink
feat: upgrade llama.cpp (TabbyML#645)
Browse files Browse the repository at this point in the history
* feat: upgrade llama.cpp

* update download files

* update changelog

* Update CHANGELOG.md

* Update CHANGELOG.md
  • Loading branch information
wsxiaoys authored Oct 27, 2023
1 parent 89d1765 commit f378405
Show file tree
Hide file tree
Showing 6 changed files with 14 additions and 18 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
# v0.5.0 [Unreleased]

## Notice
* llama.cpp backend (CPU, Metal) now requires a redownload of gguf model due to upstream format changes: https://github.com/TabbyML/tabby/pull/645 https://github.com/ggerganov/llama.cpp/pull/3252

## Features

## Fixes and Improvements

* Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
* Add `server.completion_timeout` to control the timeout of the code completion endpoint: https://github.com/TabbyML/tabby/pull/637

Expand Down
2 changes: 1 addition & 1 deletion crates/llama-cpp-bindings/llama.cpp
Submodule llama.cpp updated 139 files
18 changes: 3 additions & 15 deletions crates/llama-cpp-bindings/src/engine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
TextInferenceEngineImpl(owned<llama_model> model, owned<llama_context> ctx) :
model_(std::move(model)),
ctx_(std::move(ctx)) {
batch_ = llama_batch_init(N_BATCH, 0);
}

void start(rust::Slice<const uint32_t> input_token_ids) override {
Expand All @@ -46,14 +45,14 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
}

uint32_t eos_token() const override {
return llama_token_eos(ctx_.get());
return llama_token_eos(llama_get_model(ctx_.get()));
}

private:
uint32_t sample() const {
auto* ctx = ctx_.get();

auto logits = llama_get_logits_ith(ctx, batch_.n_tokens - 1);
auto logits = llama_get_logits_ith(ctx, 0);
auto n_vocab = llama_n_vocab(llama_get_model(ctx));

// Greedy sampling (always select the highest logit).
Expand All @@ -65,18 +64,9 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
n_past_ = 0;
}

batch_.n_tokens = size;
for (size_t i = 0; i < size; ++i) {
batch_.token[i] = data[i];
batch_.pos[i] = n_past_ + i;
batch_.seq_id[i] = 0;
batch_.logits[i] = false;
}
batch_.logits[size - 1] = true;

auto* ctx = ctx_.get();
llama_kv_cache_tokens_rm(ctx, n_past_, -1);
if (llama_decode(ctx, batch_)) {
if (llama_decode(ctx, llama_batch_get_one(data, size, n_past_, 0))) {
throw std::runtime_error("Failed to eval");
}

Expand All @@ -86,8 +76,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
size_t n_past_;
owned<llama_model> model_;
owned<llama_context> ctx_;

llama_batch batch_;
};

static int g_llama_cpp_log_level = 0;
Expand Down
4 changes: 4 additions & 0 deletions crates/tabby-common/src/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,8 @@ impl ModelDir {
/// Returns the path to the legacy q8_0 GGUF model file under this model dir.
pub fn ggml_q8_0_file(&self) -> String {
    let relative_path = "ggml/q8_0.gguf";
    self.path_string(relative_path)
}

/// Returns the path to the v2-format q8_0 GGUF model file under this model dir.
/// The `.v2` suffix distinguishes it from the pre-upgrade GGUF layout.
pub fn ggml_q8_0_v2_file(&self) -> String {
    let relative_path = "ggml/q8_0.v2.gguf";
    self.path_string(relative_path)
}
}
2 changes: 1 addition & 1 deletion crates/tabby-download/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ impl Downloader {
let files = vec![
("tabby.json", true),
("tokenizer.json", true),
("ggml/q8_0.gguf", true),
("ggml/q8_0.v2.gguf", true),
];
self.download_files(&files).await
}
Expand Down
2 changes: 1 addition & 1 deletion crates/tabby/src/serve/engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ fn create_ctranslate2_engine(

fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
.model_path(model_dir.ggml_q8_0_file())
.model_path(model_dir.ggml_q8_0_v2_file())
.tokenizer_path(model_dir.tokenizer_file())
.use_gpu(device.ggml_use_gpu())
.build()
Expand Down

0 comments on commit f378405

Please sign in to comment.