added sklearn index

email516888 · Aug 19, 2023 · 6f1e789 · 6f1e789
1 parent 38ad5e9
commit 6f1e789
Show file tree

Hide file tree

Showing 6 changed files with 119 additions and 18 deletions.
diff --git a/README.md b/README.md
@@ -202,12 +202,6 @@ Note: as the model is not being re-trained, but uses the training data during in
 
 ### Dynamic Few-Shot Text Classification
 
-*To use this feature, you need to install `annoy` library:*
-
-```bash
-pip install scikit-llm[annoy]
-```
-
 `DynamicFewShotGPTClassifier` dynamically selects N samples per class to include in the prompt. This allows the few-shot classifier to scale to datasets that are too large for the standard context window of LLMs.
 
 *How does it work?*
@@ -227,6 +221,20 @@ clf.fit(X, y)
 labels = clf.predict(X)
 ```
 
+By default, the classifier uses the k-nearest neighbors algorithm from scikit-learn, which might be slow for large datasets. In this case, it is possible to switch to [annoy](https://github.com/spotify/annoy):
+
+```bash
+pip install scikit-llm[annoy]
+```
+
+```python
+from skllm.memory._annoy import AnnoyMemoryIndex
+from skllm.memory.base import IndexConstructor
+
+index = IndexConstructor(AnnoyMemoryIndex)
+clf = DynamicFewShotGPTClassifier(memory_index=index)
+```
+
 ### Text Classification with Google PaLM 2
 
 At the moment 3 PaLM based models are available in test mode:

diff --git a/skllm/memory/__init__.py b/skllm/memory/__init__.py
@@ -1 +1 @@
-from skllm.memory._annoy import AnnoyMemoryIndex
+from skllm.memory._sklearn_nn import SklearnMemoryIndex
diff --git a/skllm/memory/_annoy.py b/skllm/memory/_annoy.py
@@ -23,35 +23,43 @@ class AnnoyMemoryIndex(_BaseMemoryIndex):
         metric to use, by default "euclidean"
     """
 
-    def __init__(self, dim: int, metric: str = "euclidean", **kwargs: Any) -> None:
+    def __init__(self, dim: int = -1, metric: str = "euclidean", **kwargs: Any) -> None:
         if AnnoyIndex is None:
             raise ImportError(
-                "Annoy is not installed. Please install annoy by running `pip install scikit-llm[annoy]`."
+                "Annoy is not installed. Please install annoy by running `pip install"
+                " scikit-llm[annoy]`."
             )
-        self._index = AnnoyIndex(dim, metric)
         self.metric = metric
         self.dim = dim
         self.built = False
+        self._index = None
+        self._counter = 0
 
-    def add(self, id: int, vector: ndarray) -> None:
+    def add(self, vector: ndarray) -> None:
         """Adds a vector to the index.
 
         Parameters
         ----------
-        id : Any
-            identifier for the vector
         vector : ndarray
             vector to add to the index
         """
         if self.built:
             raise RuntimeError("Cannot add vectors after index is built.")
+        if self.dim < 0:
+            raise ValueError("Dimensionality must be positive.")
+        if not self._index:
+            self._index = AnnoyIndex(self.dim, self.metric)
+        id = self._counter
         self._index.add_item(id, vector)
+        self._counter += 1
 
     def build(self) -> None:
         """Builds the index.
 
         No new vectors can be added after building.
+
+        Raises
+        ------
+        ValueError
+            if the configured dimensionality is negative
+        RuntimeError
+            if no vectors have been added yet
         """
+        if self.dim < 0:
+            raise ValueError("Dimensionality must be positive.")
+        if self._index is None:
+            # The underlying AnnoyIndex is created lazily by the first `add`
+            # call, so there is nothing to build until a vector was added.
+            raise RuntimeError("Cannot build the index before any vectors are added.")
         self._index.build(-1)
         self.built = True
 
@@ -70,6 +78,7 @@ def retrieve(self, vectors: ndarray, k: int) -> List[List[int]]:
         List
             ids of retrieved nearest neighbors
         """
+        print("ANNOY RETRIEVE")
         if not self.built:
             raise RuntimeError("Cannot retrieve vectors before the index is built.")
         return [

diff --git a/skllm/memory/_sklearn_nn.py b/skllm/memory/_sklearn_nn.py
@@ -0,0 +1,66 @@
+from typing import Any, List
+
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+from skllm.memory.base import _BaseMemoryIndex
+
+
+class SklearnMemoryIndex(_BaseMemoryIndex):
+    """Nearest-neighbor memory index backed by scikit-learn.
+
+    Added vectors are buffered in a list and handed to
+    ``sklearn.neighbors.NearestNeighbors`` in a single ``fit`` call when
+    ``build`` is invoked.
+
+    Parameters
+    ----------
+    dim : int, optional
+        dimensionality of the vectors, by default -1; stored on the instance
+        but not otherwise used by this class
+    metric : str, optional
+        metric to use, by default "euclidean"
+    **kwargs : Any
+        additional keyword arguments forwarded to ``NearestNeighbors``
+    """
+
+    def __init__(self, dim: int = -1, metric: str = "euclidean", **kwargs: Any) -> None:
+        self._index = NearestNeighbors(metric=metric, **kwargs)
+        # Buffer of vectors collected before the index is fitted.
+        self.data = []
+        self.built = False
+        self.dim = dim
+        self.metric = metric
+
+    def add(self, vector: np.ndarray) -> None:
+        """Queues a vector for inclusion in the index.
+
+        Parameters
+        ----------
+        vector : np.ndarray
+            vector to add to the index
+
+        Raises
+        ------
+        RuntimeError
+            if the index has already been built
+        """
+        if not self.built:
+            self.data.append(vector)
+            return
+        raise RuntimeError("Cannot add vectors after index is built.")
+
+    def build(self) -> None:
+        """Fits the underlying estimator on all buffered vectors.
+
+        No new vectors can be added after building.
+        """
+        self._index.fit(np.array(self.data))
+        self.built = True
+
+    def retrieve(self, vectors: np.ndarray, k: int) -> List[List[int]]:
+        """Looks up the k nearest neighbors of every query vector.
+
+        Parameters
+        ----------
+        vectors : np.ndarray
+            vectors to retrieve nearest neighbors for
+        k : int
+            number of nearest neighbors to retrieve
+
+        Returns
+        -------
+        List
+            ids of retrieved nearest neighbors, one list per query vector
+
+        Raises
+        ------
+        RuntimeError
+            if the index has not been built yet
+        """
+        if self.built:
+            # kneighbors returns a (distances, indices) pair; only the
+            # indices are of interest here.
+            neighbors = self._index.kneighbors(vectors, n_neighbors=k)
+            return neighbors[1].tolist()
+        raise RuntimeError("Cannot retrieve vectors before the index is built.")
diff --git a/skllm/memory/base.py b/skllm/memory/base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, List
+from typing import Any, List, Type
 
 from numpy import ndarray
 
@@ -43,3 +43,12 @@ def build(self) -> None:
         All build parameters should be passed to the constructor.
         """
         pass
+
+
+class IndexConstructor:
+    """Deferred factory for memory indices.
+
+    Stores an index class together with keyword arguments and instantiates
+    that class with those arguments every time the object is called.
+
+    Parameters
+    ----------
+    index : Type[_BaseMemoryIndex]
+        index class to instantiate
+    **kwargs : Any
+        keyword arguments passed to ``index`` on every call
+    """
+
+    def __init__(self, index: Type[_BaseMemoryIndex], **kwargs: Any) -> None:
+        self.kwargs = kwargs
+        self.index = index
+
+    def __call__(self) -> _BaseMemoryIndex:
+        """Creates and returns a new index instance."""
+        index_cls, params = self.index, self.kwargs
+        return index_cls(**params)
diff --git a/skllm/models/gpt/gpt_dyn_few_shot_clf.py b/skllm/models/gpt/gpt_dyn_few_shot_clf.py
@@ -3,7 +3,8 @@
 import numpy as np
 import pandas as pd
 
-from skllm.memory import AnnoyMemoryIndex
+from skllm.memory import SklearnMemoryIndex
+from skllm.memory.base import IndexConstructor
 from skllm.models._base import _BaseZeroShotGPTClassifier
 from skllm.preprocessing import GPTVectorizer
 from skllm.prompts.builders import build_few_shot_prompt_slc
@@ -35,6 +36,8 @@ class DynamicFewShotGPTClassifier(_BaseZeroShotGPTClassifier):
     default_label : Optional[Union[List[str], str]] , default : 'Random'
         The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random
         label will be chosen based on probabilities from the training set.
+    memory_index : Optional[IndexConstructor], default : None
+        The memory index constructor to use. If None, a SklearnMemoryIndex will be used.
     """
 
     def __init__(
@@ -44,9 +47,11 @@ def __init__(
         openai_org: str | None = None,
         openai_model: str = "gpt-3.5-turbo",
         default_label: str | None = "Random",
+        memory_index: IndexConstructor | None = None,
     ):
         super().__init__(openai_key, openai_org, openai_model, default_label)
         self.n_examples = n_examples
+        self.memory_index = memory_index
 
     def fit(
         self,
@@ -79,9 +84,13 @@ def fit(
             partition = X[y == cls]
             self.data_[cls]["partition"] = partition
             embeddings = self.embedding_model_.transform(partition)
-            index = AnnoyMemoryIndex(embeddings.shape[1])
-            for i, embedding in enumerate(embeddings):
-                index.add(i, embedding)
+            if self.memory_index is not None:
+                index = self.memory_index()
+                index.dim = embeddings.shape[1]
+            else:
+                index = SklearnMemoryIndex(embeddings.shape[1])
+            for embedding in embeddings:
+                index.add(embedding)
             index.build()
             self.data_[cls]["index"] = index
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from skllm.memory._annoy import AnnoyMemoryIndex
		from skllm.memory._sklearn_nn import SklearnMemoryIndex