Skip to content

Commit

Permalink
added sklearn index
Browse files Browse the repository at this point in the history
  • Loading branch information
OKUA1 committed Aug 19, 2023
1 parent 38ad5e9 commit 6f1e789
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 18 deletions.
20 changes: 14 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -202,12 +202,6 @@ Note: as the model is not being re-trained, but uses the training data during in

### Dynamic Few-Shot Text Classification

*To use this feature, you need to install `annoy` library:*

```bash
pip install scikit-llm[annoy]
```

`DynamicFewShotGPTClassifier` dynamically selects N samples per class to include in the prompt. This allows the few-shot classifier to scale to datasets that are too large for the standard context window of LLMs.

*How does it work?*
Expand All @@ -227,6 +221,20 @@ clf.fit(X, y)
labels = clf.predict(X)
```

By default, the classifier uses scikit-learn's `NearestNeighbors` (k-nearest neighbors) search, which might be slow for large datasets. In that case, it is possible to switch to the approximate nearest-neighbor index provided by [annoy](https://github.com/spotify/annoy):

```bash
pip install scikit-llm[annoy]
```

```python
from skllm.memory._annoy import AnnoyMemoryIndex
from skllm.memory.base import IndexConstructor

index = IndexConstructor(AnnoyMemoryIndex)
clf = DynamicFewShotGPTClassifier(memory_index=index)
```

### Text Classification with Google PaLM 2

At the moment, three PaLM-based models are available in test mode:
Expand Down
2 changes: 1 addition & 1 deletion skllm/memory/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from skllm.memory._annoy import AnnoyMemoryIndex
from skllm.memory._sklearn_nn import SklearnMemoryIndex
21 changes: 15 additions & 6 deletions skllm/memory/_annoy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,35 +23,43 @@ class AnnoyMemoryIndex(_BaseMemoryIndex):
metric to use, by default "euclidean"
"""

def __init__(self, dim: int, metric: str = "euclidean", **kwargs: Any) -> None:
def __init__(self, dim: int = -1, metric: str = "euclidean", **kwargs: Any) -> None:
if AnnoyIndex is None:
raise ImportError(
"Annoy is not installed. Please install annoy by running `pip install scikit-llm[annoy]`."
"Annoy is not installed. Please install annoy by running `pip install"
" scikit-llm[annoy]`."
)
self._index = AnnoyIndex(dim, metric)
self.metric = metric
self.dim = dim
self.built = False
self._index = None
self._counter = 0

def add(self, id: int, vector: ndarray) -> None:
def add(self, vector: ndarray) -> None:
"""Adds a vector to the index.
Parameters
----------
id : Any
identifier for the vector
vector : ndarray
vector to add to the index
"""
if self.built:
raise RuntimeError("Cannot add vectors after index is built.")
if self.dim < 0:
raise ValueError("Dimensionality must be positive.")
if not self._index:
self._index = AnnoyIndex(self.dim, self.metric)
id = self._counter
self._index.add_item(id, vector)
self._counter += 1

def build(self) -> None:
    """Builds the index.

    No new vectors can be added after building.

    Raises
    ------
    ValueError
        if the dimensionality is still negative (i.e. it was never set)
    RuntimeError
        if no vectors have been added, so there is nothing to build
    """
    if self.dim < 0:
        raise ValueError("Dimensionality must be positive.")
    if self._index is None:
        # The underlying Annoy index is created lazily by the first `add`
        # call; without this guard an empty index would fail with an
        # unhelpful AttributeError on `None.build`.
        raise RuntimeError(
            "Cannot build an empty index; add at least one vector first."
        )
    self._index.build(-1)
    self.built = True

Expand All @@ -70,6 +78,7 @@ def retrieve(self, vectors: ndarray, k: int) -> List[List[int]]:
List
ids of retrieved nearest neighbors
"""
print("ANNOY RETRIEVE")
if not self.built:
raise RuntimeError("Cannot retrieve vectors before the index is built.")
return [
Expand Down
66 changes: 66 additions & 0 deletions skllm/memory/_sklearn_nn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from typing import Any, List

import numpy as np
from sklearn.neighbors import NearestNeighbors

from skllm.memory.base import _BaseMemoryIndex


class SklearnMemoryIndex(_BaseMemoryIndex):
    """Memory index using Sklearn's NearestNeighbors.

    Vectors passed to ``add`` are buffered in a list and handed to the
    underlying ``NearestNeighbors`` estimator in a single ``fit`` call when
    ``build`` is invoked.

    Parameters
    ----------
    dim : int
        dimensionality of the vectors, by default -1; it is stored on the
        instance but not otherwise used by this index
    metric : str, optional
        metric to use, by default "euclidean"
    **kwargs : Any
        additional keyword arguments forwarded to ``NearestNeighbors``
    """

    def __init__(self, dim: int = -1, metric: str = "euclidean", **kwargs: Any) -> None:
        self._index = NearestNeighbors(metric=metric, **kwargs)
        self.metric = metric
        self.dim = dim
        self.built = False
        # Vectors collected by `add`, in insertion order, until `build` runs.
        self.data: List[np.ndarray] = []

    def add(self, vector: np.ndarray) -> None:
        """Adds a vector to the index.

        Parameters
        ----------
        vector : np.ndarray
            vector to add to the index

        Raises
        ------
        RuntimeError
            if the index has already been built
        """
        if self.built:
            raise RuntimeError("Cannot add vectors after index is built.")
        self.data.append(vector)

    def build(self) -> None:
        """Builds the index.

        No new vectors can be added after building.

        Raises
        ------
        ValueError
            if no vectors have been added to the index
        """
        if not self.data:
            # Fail early with a clear message instead of letting the
            # estimator reject an empty 1-D array with an unrelated-looking
            # shape error.
            raise ValueError(
                "Cannot build an empty index; add at least one vector first."
            )
        data_matrix = np.array(self.data)
        self._index.fit(data_matrix)
        self.built = True

    def retrieve(self, vectors: np.ndarray, k: int) -> List[List[int]]:
        """Retrieves the k nearest neighbors for each vector.

        Parameters
        ----------
        vectors : np.ndarray
            vectors to retrieve nearest neighbors for
        k : int
            number of nearest neighbors to retrieve

        Returns
        -------
        List
            ids of retrieved nearest neighbors; an id is the position of the
            vector in the order it was added

        Raises
        ------
        RuntimeError
            if the index has not been built yet
        """
        if not self.built:
            raise RuntimeError("Cannot retrieve vectors before the index is built.")
        # Only the neighbor indices are needed; the distances are discarded.
        _, indices = self._index.kneighbors(vectors, n_neighbors=k)
        return indices.tolist()
11 changes: 10 additions & 1 deletion skllm/memory/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Any, List
from typing import Any, List, Type

from numpy import ndarray

Expand Down Expand Up @@ -43,3 +43,12 @@ def build(self) -> None:
All build parameters should be passed to the constructor.
"""
pass


class IndexConstructor:
    """Deferred factory for memory index instances.

    Holds an index class together with the keyword arguments it should be
    created with; calling the constructor object instantiates a new index.

    Parameters
    ----------
    index : Type[_BaseMemoryIndex]
        index class to instantiate
    **kwargs : Any
        keyword arguments passed to the index class on every call
    """

    def __init__(self, index: Type[_BaseMemoryIndex], **kwargs: Any) -> None:
        self.index, self.kwargs = index, kwargs

    def __call__(self) -> _BaseMemoryIndex:
        """Creates and returns a new index from the stored class and arguments."""
        index_cls = self.index
        init_kwargs = self.kwargs
        return index_cls(**init_kwargs)
17 changes: 13 additions & 4 deletions skllm/models/gpt/gpt_dyn_few_shot_clf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import numpy as np
import pandas as pd

from skllm.memory import AnnoyMemoryIndex
from skllm.memory import SklearnMemoryIndex
from skllm.memory.base import IndexConstructor
from skllm.models._base import _BaseZeroShotGPTClassifier
from skllm.preprocessing import GPTVectorizer
from skllm.prompts.builders import build_few_shot_prompt_slc
Expand Down Expand Up @@ -35,6 +36,8 @@ class DynamicFewShotGPTClassifier(_BaseZeroShotGPTClassifier):
default_label : Optional[Union[List[str], str]] , default : 'Random'
The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random
label will be chosen based on probabilities from the training set.
memory_index : Optional[IndexConstructor], default : None
The memory index constructor to use. If None, a SklearnMemoryIndex will be used.
"""

def __init__(
Expand All @@ -44,9 +47,11 @@ def __init__(
openai_org: str | None = None,
openai_model: str = "gpt-3.5-turbo",
default_label: str | None = "Random",
memory_index: IndexConstructor | None = None,
):
super().__init__(openai_key, openai_org, openai_model, default_label)
self.n_examples = n_examples
self.memory_index = memory_index

def fit(
self,
Expand Down Expand Up @@ -79,9 +84,13 @@ def fit(
partition = X[y == cls]
self.data_[cls]["partition"] = partition
embeddings = self.embedding_model_.transform(partition)
index = AnnoyMemoryIndex(embeddings.shape[1])
for i, embedding in enumerate(embeddings):
index.add(i, embedding)
if self.memory_index is not None:
index = self.memory_index()
index.dim = embeddings.shape[1]
else:
index = SklearnMemoryIndex(embeddings.shape[1])
for embedding in embeddings:
index.add(embedding)
index.build()
self.data_[cls]["index"] = index

Expand Down

0 comments on commit 6f1e789

Please sign in to comment.