feat: Update llama.cpp
abetlen committed Mar 3, 2024
1 parent d5df431 commit 0e70984
Showing 2 changed files with 35 additions and 3 deletions.
36 changes: 34 additions & 2 deletions llama_cpp/llama_cpp.py
@@ -148,6 +148,12 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p
)

# // Abort callback
# // If not NULL, called before ggml computation
# // If it returns true, the computation is aborted
# typedef bool (*ggml_abort_callback)(void * data);
ggml_abort_callback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p)

# llama.h bindings

_lib.llama_max_devices.argtypes = []
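
A minimal usage sketch (not part of this commit) of the ggml_abort_callback type added above: a plain Python function is wrapped in the CFUNCTYPE so ggml can call back into the interpreter. The import path and names below are illustrative assumptions.

import ctypes
import llama_cpp.llama_cpp as llama_cpp  # the low-level bindings module shown in this diff

should_abort = False

def _abort(user_data):
    # user_data is the void* registered as abort_callback_data.
    # Returning True asks ggml to abort the current computation.
    return should_abort

# Keep a reference to the CFUNCTYPE object for as long as it is registered;
# otherwise ctypes may release the C trampoline while ggml still holds it.
abort_cb = llama_cpp.ggml_abort_callback(_abort)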
@@ -560,10 +566,16 @@ class llama_model_params(ctypes.Structure):
# enum ggml_type type_v; // data type for V cache

# // Keep the booleans together to avoid misalignment during copy-by-value.
# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
# bool embedding; // embedding mode only
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)

# // Abort callback
# // if it returns true, execution of llama_decode() will be aborted
# // currently works only with CPU execution
# ggml_abort_callback abort_callback;
# void * abort_callback_data;
# };
class llama_context_params(ctypes.Structure):
"""Parameters for llama_context
@@ -591,6 +603,8 @@ class llama_context_params(ctypes.Structure):
embedding (bool): embedding mode only
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
abort_callback (ggml_abort_callback): abort callback; if it returns true, execution of llama_decode() will be aborted
abort_callback_data (ctypes.c_void_p): data for abort_callback
"""

_fields_ = [
@@ -616,6 +630,8 @@ class llama_context_params(ctypes.Structure):
("embedding", ctypes.c_bool),
("offload_kqv", ctypes.c_bool),
("do_pooling", ctypes.c_bool),
("abort_callback", ggml_abort_callback),
("abort_callback_data", ctypes.c_void_p),
]
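
A hedged sketch of how the two new fields could be filled in before a context is created, assuming the abort_cb object from the earlier sketch and the existing llama_context_default_params() binding:

params = llama_cpp.llama_context_default_params()
params.abort_callback = abort_cb   # checked before each ggml computation
params.abort_callback_data = None  # opaque void* handed back to the callback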


@@ -1703,8 +1719,24 @@ def llama_set_n_threads(
"""
...

# // Set abort callback
# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
@ctypes_function(
"llama_set_abort_callback",
[llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p],
None,
)
def llama_set_abort_callback(
ctx: llama_context_p,
abort_callback: Callable[[ctypes.c_void_p], bool],
abort_callback_data: ctypes.c_void_p,
/,
):
"""Set abort callback"""
...


# // Token logits obtained from the last call to llama_eval()
# // Token logits obtained from the last call to llama_decode()
# // The logits for the last token are stored in the last row
# // Logits for which llama_batch.logits[i] == 0 are undefined
# // Rows: n_tokens provided with llama_batch
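
The new llama_set_abort_callback binding can also swap the callback on an existing context at runtime. A hedged sketch, assuming ctx is a llama_context_p returned by llama_new_context_with_model() and abort_cb is the callback from the earlier sketch; per the upstream comment, the abort check currently only takes effect for CPU execution:

llama_cpp.llama_set_abort_callback(ctx, abort_cb, None)
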
2 changes: 1 addition & 1 deletion vendor/llama.cpp
Submodule llama.cpp updated 41 files
+1 −1 .devops/nix/sif.nix
+3 −1 .github/workflows/python-check-requirements.yml
+9 −6 .github/workflows/server.yml
+21 −0 README-sycl.md
+3 −2 README.md
+4 −0 common/common.cpp
+103 −101 convert-hf-to-gguf.py
+3 −3 convert-llama-ggml-to-gguf.py
+36 −40 convert.py
+2 −2 examples/infill/infill.cpp
+6 −11 examples/llama-bench/llama-bench.cpp
+2 −15 examples/server/README.md
+0 −228 examples/server/api_like_OAI.py
+31 −15 examples/server/server.cpp
+35 −15 examples/server/tests/README.md
+4 −1 examples/server/tests/features/environment.py
+1 −0 examples/server/tests/features/issues.feature
+3 −2 examples/server/tests/features/parallel.feature
+55 −0 examples/server/tests/features/passkey.feature
+2 −1 examples/server/tests/features/security.feature
+15 −8 examples/server/tests/features/server.feature
+201 −58 examples/server/tests/features/steps/steps.py
+3 −2 examples/server/tests/features/wrong_usages.feature
+1 −0 examples/server/tests/requirements.txt
+1 −1 examples/server/tests/tests.sh
+1 −2 examples/server/utils.hpp
+1 −1 examples/sycl/ls-sycl-device.cpp
+12 −5 examples/sycl/run-llama2.sh
+7 −3 flake.nix
+71 −72 ggml-cuda.cu
+77 −75 ggml-metal.metal
+162 −118 ggml-quants.c
+1,444 −817 ggml-sycl.cpp
+5 −0 ggml-sycl.h
+21 −0 gguf-py/gguf/constants.py
+1 −1 gguf-py/gguf/gguf_writer.py
+2 −0 gguf-py/gguf/tensor_mapping.py
+315 −63 llama.cpp
+11 −2 llama.h
+1 −0 requirements/requirements-convert-hf-to-gguf.txt
+213 −0 scripts/pod-llama.sh
