v0.16 (MaartenGr#1572)
* Zero-shot Topic Modeling
* Seed (domain-specific) words
* More LLM documentation, including Zephyr example
* Add support for Cohere's Embed v3
* Added llama.cpp
* Added HUGE changelog and bumped the version for the upcoming release
MaartenGr authored Nov 27, 2023
1 parent bcb3ca2 commit 61a2cd2
Showing 30 changed files with 1,541 additions and 202 deletions.
13 changes: 7 additions & 6 deletions README.md
@@ -33,12 +33,12 @@ BERTopic supports all kinds of topic modeling techniques:
<tr>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/multimodal/multimodal.html">Multimodal</a></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html">Multi-aspect</a></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#text-generation-prompts">Text Generation/LLM</a></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/representation/llm.html">Text Generation/LLM</a></td>
</tr>
<tr>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/merge/merge.html">Merge Models</a></td>
<td></td>
<td></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html">Zero-shot <b>(new!)</b></a></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/merge/merge.html">Merge Models <b>(new!)</b></a></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/seed_words/seed_words.html">Seed Words <b>(new!)</b></a></td>
</tr>
</table>
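The Seed Words feature linked in the table above lets you nudge topic representations toward domain-specific vocabulary. A hedged sketch, assuming the feature is exposed through `ClassTfidfTransformer` (the `seed_words`/`seed_multiplier` parameter names and the example words are assumptions, not confirmed by this diff):

```python
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

# Boost domain-specific words in the c-TF-IDF topic representations
# (parameter names assumed from the v0.16 feature description)
ctfidf_model = ClassTfidfTransformer(
    seed_words=["agent", "robot", "behavior", "policies", "environment"],
    seed_multiplier=2,
)
topic_model = BERTopic(ctfidf_model=ctfidf_model)
```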

@@ -159,8 +159,8 @@ import openai
from bertopic.representation import OpenAI

# Fine-tune topic representations with GPT
openai.api_key = "sk-..."
representation_model = OpenAI(model="gpt-3.5-turbo", chat=True)
client = openai.OpenAI(api_key="sk-...")
representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True)
topic_model = BERTopic(representation_model=representation_model)
```

@@ -259,6 +259,7 @@ There are many different use cases in which topic modeling can be used. As such,
| [Dynamic Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html) | `.topics_over_time(docs, timestamps)` |
| [Hierarchical Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html) | `.hierarchical_topics(docs)` |
| [Guided Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/guided/guided.html) | `BERTopic(seed_topic_list=seed_topic_list)` |
| [Zero-shot Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html) | `BERTopic(zeroshot_topic_list=zeroshot_topic_list)` |
| [Merge Multiple Models](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html) | `BERTopic.merge_models([topic_model_1, topic_model_2])` |
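
As an example of the new zero-shot API from the table, a minimal sketch; the candidate labels, the `zeroshot_min_similarity` threshold, and `docs` are illustrative assumptions:

```python
from bertopic import BERTopic

# Candidate topics we expect to find in the corpus (illustrative labels)
zeroshot_topic_list = ["Clustering", "Topic Modeling", "Large Language Models"]

# Documents similar enough to a candidate topic are assigned to it;
# the rest are clustered into new topics as usual
topic_model = BERTopic(
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=0.85,  # assumed threshold parameter
)
topics, probs = topic_model.fit_transform(docs)  # docs: a list of strings
```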


2 changes: 1 addition & 1 deletion bertopic/__init__.py
@@ -1,6 +1,6 @@
from bertopic._bertopic import BERTopic

__version__ = "0.15.0"
__version__ = "0.16.0"

__all__ = [
"BERTopic",
435 changes: 335 additions & 100 deletions bertopic/_bertopic.py

Large diffs are not rendered by default.

36 changes: 21 additions & 15 deletions bertopic/_utils.py
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd
import logging
from collections.abc import Iterable
from scipy.sparse import csr_matrix
@@ -13,7 +14,10 @@ def __init__(self, level):
self.logger.propagate = False

def info(self, message):
self.logger.info("{}".format(message))
self.logger.info(f"{message}")

def warning(self, message):
self.logger.warning(f"WARNING: {message}")

def set_level(self, level):
levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
@@ -32,10 +36,11 @@ def _add_handler(self):

def check_documents_type(documents):
""" Check whether the input documents are indeed a list of strings """
if isinstance(documents, Iterable) and not isinstance(documents, str):
if isinstance(documents, pd.DataFrame):
raise TypeError("Make sure to supply a list of strings, not a dataframe.")
elif isinstance(documents, Iterable) and not isinstance(documents, str):
if not any([isinstance(doc, str) for doc in documents]):
raise TypeError("Make sure that the iterable only contains strings.")

else:
raise TypeError("Make sure that the documents variable is an iterable containing strings only.")

@@ -94,15 +99,16 @@ def __getattr__(self, *args, **kwargs):
def __call__(self, *args, **kwargs):
raise ModuleNotFoundError(self.msg)


def validate_distance_matrix(X, n_samples):
""" Validate the distance matrix and convert it to a condensed distance matrix
if necessary.
A valid distance matrix is either a square matrix of shape (n_samples, n_samples)
with zeros on the diagonal and non-negative values or condensed distance matrix
of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the
A valid distance matrix is either a square matrix of shape (n_samples, n_samples)
with zeros on the diagonal and non-negative values or condensed distance matrix
of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the
distance matrix.
Arguments:
X: Distance matrix to validate.
n_samples: Number of samples in the dataset.
@@ -118,26 +124,26 @@ def validate_distance_matrix(X, n_samples):
if len(s) == 1:
# check it has correct size
n = s[0]
if n != (n_samples * (n_samples -1) / 2):
if n != (n_samples * (n_samples - 1) / 2):
raise ValueError("The condensed distance matrix must have "
"shape (n*(n-1)/2,).")
"shape (n*(n-1)/2,).")
elif len(s) == 2:
# check it has correct size
if (s[0] != n_samples) or (s[1] != n_samples):
raise ValueError("The distance matrix must be of shape "
"(n, n) where n is the number of samples.")
"(n, n) where n is the number of samples.")
# force zero diagonal and convert to condensed
np.fill_diagonal(X, 0)
X = squareform(X)
else:
raise ValueError("The distance matrix must be either a 1-D condensed "
"distance matrix of shape (n*(n-1)/2,) or a "
"2-D square distance matrix of shape (n, n)."
"where n is the number of documents."
"Got a distance matrix of shape %s" % str(s))
"distance matrix of shape (n*(n-1)/2,) or a "
"2-D square distance matrix of shape (n, n)."
"where n is the number of documents."
"Got a distance matrix of shape %s" % str(s))

# Make sure its entries are non-negative
if np.any(X < 0):
raise ValueError("Distance matrix cannot contain negative values.")

return X
return X
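
A quick sketch of what the validator accepts, using a small hypothetical matrix:

```python
import numpy as np
from scipy.spatial.distance import squareform
from bertopic._utils import validate_distance_matrix

# A 3x3 symmetric distance matrix with a zero diagonal
X = np.array([[0.0, 0.2, 0.4],
              [0.2, 0.0, 0.6],
              [0.4, 0.6, 0.0]])

# Square input is converted to condensed form of shape (n*(n-1)/2,) = (3,)
condensed = validate_distance_matrix(X, n_samples=3)
print(condensed.shape)  # (3,)

# An already-condensed matrix is returned unchanged
validate_distance_matrix(squareform(X), n_samples=3)
```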
41 changes: 30 additions & 11 deletions bertopic/backend/_cohere.py
@@ -1,14 +1,13 @@
import time
import cohere
import numpy as np
from tqdm import tqdm
from typing import List
from typing import Any, List, Mapping
from bertopic.backend import BaseEmbedder


class CohereBackend(BaseEmbedder):
""" Cohere Embedding Model
Arguments:
client: A `cohere` client.
embedding_model: A Cohere model. Default is "large".
@@ -17,6 +16,9 @@ class CohereBackend(BaseEmbedder):
delay_in_seconds: If a `batch_size` is given, use this to set
the delay in seconds between batches.
batch_size: The size of each batch.
embed_kwargs: Kwargs passed to `cohere.Client.embed`.
Can be used to define additional parameters
such as `input_type`.
Examples:
@@ -27,17 +29,34 @@ class CohereBackend(BaseEmbedder):
client = cohere.Client("APIKEY")
cohere_model = CohereBackend(client)
```
If you want to specify `input_type`:
```python
cohere_model = CohereBackend(
client,
embedding_model="embed-english-v3.0",
embed_kwargs={"input_type": "clustering"}
)
```
"""
def __init__(self,
def __init__(self,
client,
embedding_model: str = "large",
delay_in_seconds: float = None,
batch_size: int = None):
batch_size: int = None,
embed_kwargs: Mapping[str, Any] = {}):
super().__init__()
self.client = client
self.embedding_model = embedding_model
self.delay_in_seconds = delay_in_seconds
self.batch_size = batch_size
self.embed_kwargs = embed_kwargs

if self.embed_kwargs.get("model"):
self.embedding_model = embed_kwargs.get("model")
else:
self.embed_kwargs["model"] = self.embedding_model

def embed(self,
documents: List[str],
@@ -57,19 +76,19 @@ def embed(self,
if self.batch_size is not None:
embeddings = []
for batch in tqdm(self._chunks(documents), disable=not verbose):
response = self.client.embed(batch, model=self.embedding_model)
response = self.client.embed(batch, **self.embed_kwargs)
embeddings.extend(response.embeddings)

# Delay subsequent calls
if self.delay_in_seconds:
time.sleep(self.delay_in_seconds)

# Extract embeddings all at once
else:
response = self.client.embed(documents, model=self.embedding_model)
response = self.client.embed(documents, **self.embed_kwargs)
embeddings = response.embeddings
return np.array(embeddings)
def _chunks(self, documents):

def _chunks(self, documents):
for i in range(0, len(documents), self.batch_size):
yield documents[i:i + self.batch_size]
yield documents[i:i + self.batch_size]
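
Putting the docstring example to work end-to-end; this sketch assumes `docs` is a list of strings, and the `batch_size`/`delay_in_seconds` values are illustrative rate-limit settings:

```python
import cohere
from bertopic import BERTopic
from bertopic.backend import CohereBackend

client = cohere.Client("APIKEY")

# Embed v3 models expect an input_type; "clustering" fits topic modeling
cohere_model = CohereBackend(
    client,
    embedding_model="embed-english-v3.0",
    embed_kwargs={"input_type": "clustering"},
    batch_size=64,
    delay_in_seconds=1,
)

topic_model = BERTopic(embedding_model=cohere_model)
topics, probs = topic_model.fit_transform(docs)
```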
33 changes: 22 additions & 11 deletions bertopic/backend/_openai.py
@@ -2,39 +2,50 @@
import openai
import numpy as np
from tqdm import tqdm
from typing import List
from typing import List, Mapping, Any
from bertopic.backend import BaseEmbedder


class OpenAIBackend(BaseEmbedder):
""" OpenAI Embedding Model
Arguments:
embedding_model: An OpenAI model. Default is "text-embedding-ada-002".
client: An `openai.OpenAI` client.
embedding_model: An OpenAI model. Default is "text-embedding-ada-002".
For an overview of models see:
https://platform.openai.com/docs/models/embeddings
delay_in_seconds: If a `batch_size` is given, use this to set
the delay in seconds between batches.
batch_size: The size of each batch.
generator_kwargs: Kwargs passed to `openai.Embedding.create`.
Can be used to define custom engines or
deployment_ids.
Examples:
```python
import openai
from bertopic.backend import OpenAIBackend
openai.api_key = MY_API_KEY
openai_embedder = OpenAIBackend("text-embedding-ada-002")
client = openai.OpenAI(api_key="sk-...")
openai_embedder = OpenAIBackend(client, "text-embedding-ada-002")
```
"""
def __init__(self,
def __init__(self,
embedding_model: str = "text-embedding-ada-002",
delay_in_seconds: float = None,
batch_size: int = None):
batch_size: int = None,
generator_kwargs: Mapping[str, Any] = {}):
super().__init__()
self.embedding_model = embedding_model
self.delay_in_seconds = delay_in_seconds
self.batch_size = batch_size
self.generator_kwargs = generator_kwargs

if self.generator_kwargs.get("model"):
self.embedding_model = generator_kwargs.get("model")
elif not self.generator_kwargs.get("engine"):
self.generator_kwargs["model"] = self.embedding_model

def embed(self,
documents: List[str],
@@ -54,7 +65,7 @@ def embed(self,
if self.batch_size is not None:
embeddings = []
for batch in tqdm(self._chunks(documents), disable=not verbose):
response = openai.Embedding.create(input=batch, model=self.embedding_model)
response = openai.Embedding.create(input=batch, **self.generator_kwargs)
embeddings.extend([r["embedding"] for r in response["data"]])

# Delay subsequent calls
Expand All @@ -63,10 +74,10 @@ def embed(self,

# Extract embeddings all at once
else:
response = openai.Embedding.create(input=documents, model=self.embedding_model)
response = openai.Embedding.create(input=documents, **self.generator_kwargs)
embeddings = [r["embedding"] for r in response["data"]]
return np.array(embeddings)
def _chunks(self, documents):

def _chunks(self, documents):
for i in range(0, len(documents), self.batch_size):
yield documents[i:i + self.batch_size]
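
A usage sketch following the docstring example above, assuming the client-first signature it shows (the diff elides part of the new `__init__`); the batching values are illustrative:

```python
import openai
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend

client = openai.OpenAI(api_key="sk-...")

# Batch the requests and pause between batches to respect rate limits
openai_embedder = OpenAIBackend(
    client,
    "text-embedding-ada-002",
    batch_size=128,
    delay_in_seconds=2,
)
topic_model = BERTopic(embedding_model=openai_embedder)
```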
12 changes: 10 additions & 2 deletions bertopic/representation/__init__.py
@@ -4,6 +4,14 @@
from bertopic.representation._keybert import KeyBERTInspired
from bertopic.representation._mmr import MaximalMarginalRelevance


# Llama CPP Generator
try:
from bertopic.representation._llamacpp import LlamaCPP
except ModuleNotFoundError:
msg = "`pip install llama-cpp-python` \n\n"
LlamaCPP = NotInstalled("llama.cpp", "llama-cpp-python", custom_msg=msg)

# Text Generation using transformers
try:
from bertopic.representation._textgeneration import TextGeneration
@@ -25,7 +33,7 @@
msg = "`pip install openai` \n\n"
OpenAI = NotInstalled("OpenAI", "openai", custom_msg=msg)

# OpenAI Generator
# LangChain Generator
try:
from bertopic.representation._langchain import LangChain
except ModuleNotFoundError:
@@ -45,7 +53,6 @@
VisualRepresentation = NotInstalled("a visual representation model", "vision")



__all__ = [
"BaseRepresentation",
"TextGeneration",
Expand All @@ -56,5 +63,6 @@
"Cohere",
"OpenAI",
"LangChain",
"LlamaCPP",
"VisualRepresentation"
]
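
A hedged sketch of the new `LlamaCPP` representation registered above; it assumes the class accepts a path to a local GGUF model, and the model filename is hypothetical:

```python
from bertopic import BERTopic
from bertopic.representation import LlamaCPP

# Path to a local GGUF model file (hypothetical filename); requires
# `pip install llama-cpp-python` per the import guard above
representation_model = LlamaCPP("zephyr-7b-alpha.Q4_K_M.gguf")

topic_model = BERTopic(representation_model=representation_model)
```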
2 changes: 1 addition & 1 deletion bertopic/representation/_cohere.py
@@ -44,7 +44,7 @@ class Cohere(BaseRepresentation):
https://docs.cohere.ai/docs
Arguments:
client: A cohere.Client
client: A `cohere.Client`
model: Model to use within Cohere, defaults to `"xlarge"`.
prompt: The prompt to be used in the model. If no prompt is given,
`self.default_prompt_` is used instead.