forked from BeastByteAI/scikit-llm
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into feature_focused_summarizer
- Loading branch information
Showing
28 changed files
with
1,096 additions
and
119 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,9 +8,10 @@ dependencies = [ | |
"pandas>=1.5.0", | ||
"openai>=0.27.0", | ||
"tqdm>=4.60.0", | ||
"annoy>=1.17.2", | ||
] | ||
name = "scikit-llm" | ||
version = "0.1.0b3" | ||
version = "0.2.0" | ||
authors = [ | ||
{ name="Oleg Kostromin", email="[email protected]" }, | ||
{ name="Iryna Kondrashchenko", email="[email protected]" }, | ||
|
@@ -24,10 +25,9 @@ classifiers = [ | |
"License :: OSI Approved :: MIT License", | ||
"Operating System :: OS Independent", | ||
] | ||
dynamic = ["optional-dependencies"] | ||
|
||
[tool.setuptools.dynamic.optional-dependencies] | ||
dev = { file = ["requirements-dev.txt"] } | ||
[project.optional-dependencies] | ||
gpt4all = ["gpt4all>=0.2.0"] | ||
|
||
[tool.ruff] | ||
select = [ | ||
|
@@ -80,12 +80,13 @@ target-version = ['py38', 'py39', 'py310', 'py311'] | |
profile = "black" | ||
filter_files = true | ||
known_first_party = ["skllm", "skllm.*"] | ||
skip = ["__init__.py"] | ||
|
||
[tool.docformatter] | ||
close-quotes-on-newline = true # D209 | ||
|
||
[tool.interrogate] | ||
fail-under = 80 | ||
fail-under = 65 | ||
ignore-module = true | ||
ignore-nested-functions = true | ||
ignore-private = true | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,3 +4,5 @@ isort | |
ruff | ||
docformatter | ||
interrogate | ||
numpy | ||
pandas |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
from skllm.models.gpt_few_shot_clf import FewShotGPTClassifier | ||
# ordering is important here to prevent circular imports | ||
from skllm.models.gpt_zero_shot_clf import ( | ||
MultiLabelZeroShotGPTClassifier, | ||
ZeroShotGPTClassifier, | ||
) | ||
from skllm.models.gpt_few_shot_clf import FewShotGPTClassifier | ||
from skllm.models.gpt_dyn_few_shot_clf import DynamicFewShotGPTClassifier |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
from skllm.gpt4all_client import get_chat_completion as _g4a_get_chat_completion | ||
from skllm.openai.chatgpt import get_chat_completion as _oai_get_chat_completion | ||
|
||
|
||
def get_chat_completion(
    messages: dict,
    openai_key: str = None,
    openai_org: str = None,
    model: str = "gpt-3.5-turbo",
    max_retries: int = 3,
):
    """Gets a chat completion from the backend selected by ``model``.

    Model names starting with ``"gpt4all::"`` are routed to the GPT4All
    client (with the prefix stripped); every other model name is routed to
    the OpenAI client.

    Parameters
    ----------
    messages : dict
        The messages to use as a prompt for the chat completion.
    openai_key : str, optional
        Forwarded to the OpenAI backend only; not used for GPT4All models.
    openai_org : str, optional
        Forwarded to the OpenAI backend only; not used for GPT4All models.
    model : str, optional
        The model to use, by default "gpt-3.5-turbo". Prefix the name with
        ``"gpt4all::"`` to select a GPT4All model.
    max_retries : int, optional
        Forwarded to the OpenAI backend only, by default 3.

    Returns
    -------
    The value returned by the selected backend, unchanged.
    """
    gpt4all_prefix = "gpt4all::"
    if model.startswith(gpt4all_prefix):
        # Strip the routing prefix so the backend receives the bare model name.
        return _g4a_get_chat_completion(messages, model[len(gpt4all_prefix):])
    return _oai_get_chat_completion(
        messages, openai_key, openai_org, model, max_retries
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from typing import Dict | ||
|
||
try: | ||
from gpt4all import GPT4All | ||
except (ImportError, ModuleNotFoundError): | ||
GPT4All = None | ||
|
||
_loaded_models = {} | ||
|
||
|
||
def get_chat_completion(messages: Dict, model: str = "ggml-gpt4all-j-v1.3-groovy") -> Dict:
    """Gets a chat completion from GPT4All.

    Parameters
    ----------
    messages : Dict
        The messages to use as a prompt for the chat completion.
    model : str
        The model to use for the chat completion. Defaults to
        "ggml-gpt4all-j-v1.3-groovy".

    Returns
    -------
    completion : Dict
        The value returned by the model's ``chat_completion`` call.

    Raises
    ------
    ImportError
        If the optional ``gpt4all`` package is not installed.
    """
    if GPT4All is None:
        raise ImportError(
            "gpt4all is not installed, try `pip install scikit-llm[gpt4all]`"
        )
    # Instantiate each model once and reuse it on subsequent calls.
    if model not in _loaded_models:
        _loaded_models[model] = GPT4All(model)

    # NOTE(review): the near-zero temperature is presumably meant to make the
    # output as deterministic as possible - confirm against the gpt4all docs.
    return _loaded_models[model].chat_completion(
        messages, verbose=False, streaming=False, temp=1e-10
    )
|
||
|
||
def unload_models() -> None:
    """Drops every cached GPT4All model by rebinding the module-level cache
    to a fresh, empty dictionary.
    """
    global _loaded_models
    _loaded_models = dict()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from skllm.memory._annoy import AnnoyMemoryIndex |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import os | ||
import tempfile | ||
from typing import Any, List | ||
|
||
from annoy import AnnoyIndex | ||
from numpy import ndarray | ||
|
||
from skllm.memory.base import _BaseMemoryIndex | ||
|
||
|
||
class AnnoyMemoryIndex(_BaseMemoryIndex):
    """Memory index using Annoy.

    Parameters
    ----------
    dim : int
        dimensionality of the vectors
    metric : str, optional
        metric to use, by default "euclidean"
    """

    def __init__(self, dim: int, metric: str = "euclidean", **kwargs: Any) -> None:
        # Extra keyword arguments are accepted but not used by this class.
        self._index = AnnoyIndex(dim, metric)
        self.metric = metric
        self.dim = dim
        self.built = False

    def add(self, id: int, vector: ndarray) -> None:
        """Adds a vector to the index.

        Parameters
        ----------
        id : int
            identifier for the vector
        vector : ndarray
            vector to add to the index

        Raises
        ------
        RuntimeError
            If the index has already been built.
        """
        if self.built:
            raise RuntimeError("Cannot add vectors after index is built.")
        self._index.add_item(id, vector)

    def build(self) -> None:
        """Builds the index.

        No new vectors can be added after building.
        """
        self._index.build(-1)
        self.built = True

    def retrieve(self, vectors: ndarray, k: int) -> List[List[int]]:
        """Retrieves the k nearest neighbors for each vector.

        Parameters
        ----------
        vectors : ndarray
            vectors to retrieve nearest neighbors for
        k : int
            number of nearest neighbors to retrieve

        Returns
        -------
        List
            ids of retrieved nearest neighbors

        Raises
        ------
        RuntimeError
            If the index has not been built yet.
        """
        if not self.built:
            raise RuntimeError("Cannot retrieve vectors before the index is built.")
        return [
            self._index.get_nns_by_vector(v, k, search_k=-1, include_distances=False)
            for v in vectors
        ]

    def __getstate__(self) -> dict:
        """Returns the state of the object. To store the actual annoy index, it
        has to be written to a temporary file.

        Returns
        -------
        dict
            state of the object, with ``_index`` replaced by its bytes
        """
        state = self.__dict__.copy()

        # Reserve a path on disk; the handle is closed before Annoy writes to
        # it, and the file is removed even if saving or reading fails.
        tmp = tempfile.NamedTemporaryFile(delete=False)
        try:
            tmp.close()
            self._index.save(tmp.name)

            # store bytes representation in state
            with open(tmp.name, "rb") as index_file:
                state["_index"] = index_file.read()
        finally:
            # NOTE(review): Annoy may keep the saved file memory-mapped;
            # removing a mapped file can fail on Windows - confirm.
            os.remove(tmp.name)

        return state

    def __setstate__(self, state: dict) -> None:
        """Sets the state of the object. It restores the annoy index from the
        bytes representation.

        Parameters
        ----------
        state : dict
            state of the object
        """
        self.__dict__.update(state)
        # After the update above ``_index`` still holds the serialized bytes.
        index_bytes = self._index

        # The temporary file is removed even if writing or loading fails.
        tmp = tempfile.NamedTemporaryFile(delete=False)
        try:
            with tmp:
                tmp.write(index_bytes)

            self._index = AnnoyIndex(self.dim, self.metric)
            self._index.load(tmp.name)
        finally:
            # NOTE(review): Annoy may keep the loaded file memory-mapped;
            # removing a mapped file can fail on Windows - confirm.
            os.remove(tmp.name)
Oops, something went wrong.