v0.16 (MaartenGr#1572)
* Zero-shot Topic Modeling
* Seed (domain-specific) words
* More LLM documentation, including Zephyr example
* Add support for Cohere's Embed v3
* Added llama.cpp
* Added HUGE changelog and bumped the version for the upcoming release
MaartenGr authored Nov 27, 2023
1 parent bcb3ca2 commit 61a2cd2
Showing 30 changed files with 1,541 additions and 202 deletions.
13 changes: 7 additions & 6 deletions README.md
@@ -33,12 +33,12 @@ BERTopic supports all kinds of topic modeling techniques:
<tr>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/multimodal/multimodal.html">Multimodal</a></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html">Multi-aspect</a></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#text-generation-prompts">Text Generation/LLM</a></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/representation/llm.html">Text Generation/LLM</a></td>
</tr>
<tr>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/merge/merge.html">Merge Models</a></td>
<td></td>
<td></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html">Zero-shot <b>(new!)</b></a></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/merge/merge.html">Merge Models <b>(new!)</b></a></td>
<td><a href="https://maartengr.github.io/BERTopic/getting_started/seed_words/seed_words.html">Seed Words <b>(new!)</b></a></td>
</tr>
</table>
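The Seed Words feature linked in the table above lets you nudge topic representations toward domain-specific vocabulary. A hedged sketch, assuming the feature is exposed through `ClassTfidfTransformer` (the `seed_words`/`seed_multiplier` parameter names and the example words are assumptions, not confirmed by this diff):

```python
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

# Boost domain-specific words in the c-TF-IDF topic representations
# (parameter names assumed from the v0.16 feature description)
ctfidf_model = ClassTfidfTransformer(
    seed_words=["agent", "robot", "behavior", "policies", "environment"],
    seed_multiplier=2,
)
topic_model = BERTopic(ctfidf_model=ctfidf_model)
```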

@@ -159,8 +159,8 @@ import openai
from bertopic.representation import OpenAI

# Fine-tune topic representations with GPT
openai.api_key = "sk-..."
representation_model = OpenAI(model="gpt-3.5-turbo", chat=True)
client = openai.OpenAI(api_key="sk-...")
representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True)
topic_model = BERTopic(representation_model=representation_model)
```

@@ -259,6 +259,7 @@ There are many different use cases in which topic modeling can be used. As such,
| [Dynamic Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html) | `.topics_over_time(docs, timestamps)` |
| [Hierarchical Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html) | `.hierarchical_topics(docs)` |
| [Guided Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/guided/guided.html) | `BERTopic(seed_topic_list=seed_topic_list)` |
| [Zero-shot Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html) | `BERTopic(zeroshot_topic_list=zeroshot_topic_list)` |
| [Merge Multiple Models](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html) | `BERTopic.merge_models([topic_model_1, topic_model_2])` |
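
As an example of the new zero-shot API from the table, a minimal sketch; the candidate labels, the `zeroshot_min_similarity` threshold, and `docs` are illustrative assumptions:

```python
from bertopic import BERTopic

# Candidate topics we expect to find in the corpus (illustrative labels)
zeroshot_topic_list = ["Clustering", "Topic Modeling", "Large Language Models"]

# Documents similar enough to a candidate topic are assigned to it;
# the rest are clustered into new topics as usual
topic_model = BERTopic(
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=0.85,  # assumed threshold parameter
)
topics, probs = topic_model.fit_transform(docs)  # docs: a list of strings
```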


2 changes: 1 addition & 1 deletion bertopic/__init__.py
@@ -1,6 +1,6 @@
from bertopic._bertopic import BERTopic

__version__ = "0.15.0"
__version__ = "0.16.0"

__all__ = [
"BERTopic",
435 changes: 335 additions & 100 deletions bertopic/_bertopic.py

Large diffs are not rendered by default.

36 changes: 21 additions & 15 deletions bertopic/_utils.py
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd
import logging
from collections.abc import Iterable
from scipy.sparse import csr_matrix
@@ -13,7 +14,10 @@ def __init__(self, level):
self.logger.propagate = False

def info(self, message):
self.logger.info("{}".format(message))
self.logger.info(f"{message}")

def warning(self, message):
self.logger.warning(f"WARNING: {message}")

def set_level(self, level):
levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
@@ -32,10 +36,11 @@ def _add_handler(self):

def check_documents_type(documents):
""" Check whether the input documents are indeed a list of strings """
if isinstance(documents, Iterable) and not isinstance(documents, str):
if isinstance(documents, pd.DataFrame):
raise TypeError("Make sure to supply a list of strings, not a dataframe.")
elif isinstance(documents, Iterable) and not isinstance(documents, str):
if not any([isinstance(doc, str) for doc in documents]):
raise TypeError("Make sure that the iterable only contains strings.")

else:
raise TypeError("Make sure that the documents variable is an iterable containing strings only.")

@@ -94,15 +99,16 @@ def __getattr__(self, *args, **kwargs):
def __call__(self, *args, **kwargs):
raise ModuleNotFoundError(self.msg)


def validate_distance_matrix(X, n_samples):
""" Validate the distance matrix and convert it to a condensed distance matrix
if necessary.
A valid distance matrix is either a square matrix of shape (n_samples, n_samples)
with zeros on the diagonal and non-negative values or condensed distance matrix
of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the
A valid distance matrix is either a square matrix of shape (n_samples, n_samples)
with zeros on the diagonal and non-negative values or condensed distance matrix
of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the
distance matrix.
Arguments:
X: Distance matrix to validate.
n_samples: Number of samples in the dataset.
@@ -118,26 +124,26 @@ def validate_distance_matrix(X, n_samples):
if len(s) == 1:
# check it has correct size
n = s[0]
if n != (n_samples * (n_samples -1) / 2):
if n != (n_samples * (n_samples - 1) / 2):
raise ValueError("The condensed distance matrix must have "
"shape (n*(n-1)/2,).")
"shape (n*(n-1)/2,).")
elif len(s) == 2:
# check it has correct size
if (s[0] != n_samples) or (s[1] != n_samples):
raise ValueError("The distance matrix must be of shape "
"(n, n) where n is the number of samples.")
"(n, n) where n is the number of samples.")
# force zero diagonal and convert to condensed
np.fill_diagonal(X, 0)
X = squareform(X)
else:
raise ValueError("The distance matrix must be either a 1-D condensed "
"distance matrix of shape (n*(n-1)/2,) or a "
"2-D square distance matrix of shape (n, n)."
"where n is the number of documents."
"Got a distance matrix of shape %s" % str(s))
"distance matrix of shape (n*(n-1)/2,) or a "
"2-D square distance matrix of shape (n, n)."
"where n is the number of documents."
"Got a distance matrix of shape %s" % str(s))

# Make sure its entries are non-negative
if np.any(X < 0):
raise ValueError("Distance matrix cannot contain negative values.")

return X
return X
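
A quick sketch of what the validator accepts, using a small hypothetical matrix:

```python
import numpy as np
from scipy.spatial.distance import squareform
from bertopic._utils import validate_distance_matrix

# A 3x3 symmetric distance matrix with a zero diagonal
X = np.array([[0.0, 0.2, 0.4],
              [0.2, 0.0, 0.6],
              [0.4, 0.6, 0.0]])

# Square input is converted to condensed form of shape (n*(n-1)/2,) = (3,)
condensed = validate_distance_matrix(X, n_samples=3)
print(condensed.shape)  # (3,)

# An already-condensed matrix is returned unchanged
validate_distance_matrix(squareform(X), n_samples=3)
```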
41 changes: 30 additions & 11 deletions bertopic/backend/_cohere.py
@@ -1,14 +1,13 @@
import time
import cohere
import numpy as np
from tqdm import tqdm
from typing import List
from typing import Any, List, Mapping
from bertopic.backend import BaseEmbedder


class CohereBackend(BaseEmbedder):
""" Cohere Embedding Model
Arguments:
client: A `cohere` client.
embedding_model: A Cohere model. Default is "large".
@@ -17,6 +16,9 @@ class CohereBackend(BaseEmbedder):
delay_in_seconds: If a `batch_size` is given, use this to set
the delay in seconds between batches.
batch_size: The size of each batch.
embed_kwargs: Kwargs passed to `cohere.Client.embed`.
Can be used to define additional parameters
such as `input_type`.
Examples:
@@ -27,17 +29,34 @@ class CohereBackend(BaseEmbedder):
client = cohere.Client("APIKEY")
cohere_model = CohereBackend(client)
```
If you want to specify `input_type`:
```python
cohere_model = CohereBackend(
client,
embedding_model="embed-english-v3.0",
embed_kwargs={"input_type": "clustering"}
)
```
"""
def __init__(self,
def __init__(self,
client,
embedding_model: str = "large",
delay_in_seconds: float = None,
batch_size: int = None):
batch_size: int = None,
embed_kwargs: Mapping[str, Any] = {}):
super().__init__()
self.client = client
self.embedding_model = embedding_model
self.delay_in_seconds = delay_in_seconds
self.batch_size = batch_size
self.embed_kwargs = embed_kwargs

if self.embed_kwargs.get("model"):
self.embedding_model = embed_kwargs.get("model")
else:
self.embed_kwargs["model"] = self.embedding_model

def embed(self,
documents: List[str],
@@ -57,19 +76,19 @@ def embed(self,
if self.batch_size is not None:
embeddings = []
for batch in tqdm(self._chunks(documents), disable=not verbose):
response = self.client.embed(batch, model=self.embedding_model)
response = self.client.embed(batch, **self.embed_kwargs)
embeddings.extend(response.embeddings)

# Delay subsequent calls
if self.delay_in_seconds:
time.sleep(self.delay_in_seconds)

# Extract embeddings all at once
else:
response = self.client.embed(documents, model=self.embedding_model)
response = self.client.embed(documents, **self.embed_kwargs)
embeddings = response.embeddings
return np.array(embeddings)
def _chunks(self, documents):

def _chunks(self, documents):
for i in range(0, len(documents), self.batch_size):
yield documents[i:i + self.batch_size]
yield documents[i:i + self.batch_size]
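
Putting the docstring example to work end-to-end; this sketch assumes `docs` is a list of strings, and the `batch_size`/`delay_in_seconds` values are illustrative rate-limit settings:

```python
import cohere
from bertopic import BERTopic
from bertopic.backend import CohereBackend

client = cohere.Client("APIKEY")

# Embed v3 models expect an input_type; "clustering" fits topic modeling
cohere_model = CohereBackend(
    client,
    embedding_model="embed-english-v3.0",
    embed_kwargs={"input_type": "clustering"},
    batch_size=64,
    delay_in_seconds=1,
)

topic_model = BERTopic(embedding_model=cohere_model)
topics, probs = topic_model.fit_transform(docs)
```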
33 changes: 22 additions & 11 deletions bertopic/backend/_openai.py
@@ -2,39 +2,50 @@
import openai
import numpy as np
from tqdm import tqdm
from typing import List
from typing import List, Mapping, Any
from bertopic.backend import BaseEmbedder


class OpenAIBackend(BaseEmbedder):
""" OpenAI Embedding Model
Arguments:
embedding_model: An OpenAI model. Default is "text-embedding-ada-002".
client: An `openai.OpenAI` client.
embedding_model: An OpenAI model. Default is "text-embedding-ada-002".
For an overview of models see:
https://platform.openai.com/docs/models/embeddings
delay_in_seconds: If a `batch_size` is given, use this to set
the delay in seconds between batches.
batch_size: The size of each batch.
generator_kwargs: Kwargs passed to `openai.Embedding.create`.
Can be used to define custom engines or
deployment_ids.
Examples:
```python
import openai
from bertopic.backend import OpenAIBackend
openai.api_key = MY_API_KEY
openai_embedder = OpenAIBackend("text-embedding-ada-002")
client = openai.OpenAI(api_key="sk-...")
openai_embedder = OpenAIBackend(client, "text-embedding-ada-002")
```
"""
def __init__(self,
def __init__(self,
embedding_model: str = "text-embedding-ada-002",
delay_in_seconds: float = None,
batch_size: int = None):
batch_size: int = None,
generator_kwargs: Mapping[str, Any] = {}):
super().__init__()
self.embedding_model = embedding_model
self.delay_in_seconds = delay_in_seconds
self.batch_size = batch_size
self.generator_kwargs = generator_kwargs

if self.generator_kwargs.get("model"):
self.embedding_model = generator_kwargs.get("model")
elif not self.generator_kwargs.get("engine"):
self.generator_kwargs["model"] = self.embedding_model

def embed(self,
documents: List[str],
@@ -54,7 +65,7 @@ def embed(self,
if self.batch_size is not None:
embeddings = []
for batch in tqdm(self._chunks(documents), disable=not verbose):
response = openai.Embedding.create(input=batch, model=self.embedding_model)
response = openai.Embedding.create(input=batch, **self.generator_kwargs)
embeddings.extend([r["embedding"] for r in response["data"]])

# Delay subsequent calls
Expand All @@ -63,10 +74,10 @@ def embed(self,

# Extract embeddings all at once
else:
response = openai.Embedding.create(input=documents, model=self.embedding_model)
response = openai.Embedding.create(input=documents, **self.generator_kwargs)
embeddings = [r["embedding"] for r in response["data"]]
return np.array(embeddings)
def _chunks(self, documents):

def _chunks(self, documents):
for i in range(0, len(documents), self.batch_size):
yield documents[i:i + self.batch_size]
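
A usage sketch following the docstring example above, assuming the client-first signature it shows (the diff elides part of the new `__init__`); the batching values are illustrative:

```python
import openai
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend

client = openai.OpenAI(api_key="sk-...")

# Batch the requests and pause between batches to respect rate limits
openai_embedder = OpenAIBackend(
    client,
    "text-embedding-ada-002",
    batch_size=128,
    delay_in_seconds=2,
)
topic_model = BERTopic(embedding_model=openai_embedder)
```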
12 changes: 10 additions & 2 deletions bertopic/representation/__init__.py
@@ -4,6 +4,14 @@
from bertopic.representation._keybert import KeyBERTInspired
from bertopic.representation._mmr import MaximalMarginalRelevance


# Llama CPP Generator
try:
from bertopic.representation._llamacpp import LlamaCPP
except ModuleNotFoundError:
msg = "`pip install llama-cpp-python` \n\n"
LlamaCPP = NotInstalled("llama.cpp", "llama-cpp-python", custom_msg=msg)

# Text Generation using transformers
try:
from bertopic.representation._textgeneration import TextGeneration
@@ -25,7 +33,7 @@
msg = "`pip install openai` \n\n"
OpenAI = NotInstalled("OpenAI", "openai", custom_msg=msg)

# OpenAI Generator
# LangChain Generator
try:
from bertopic.representation._langchain import LangChain
except ModuleNotFoundError:
@@ -45,7 +53,6 @@
VisualRepresentation = NotInstalled("a visual representation model", "vision")



__all__ = [
"BaseRepresentation",
"TextGeneration",
Expand All @@ -56,5 +63,6 @@
"Cohere",
"OpenAI",
"LangChain",
"LlamaCPP",
"VisualRepresentation"
]
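
A hedged sketch of the new `LlamaCPP` representation registered above; it assumes the class accepts a path to a local GGUF model, and the model filename is hypothetical:

```python
from bertopic import BERTopic
from bertopic.representation import LlamaCPP

# Path to a local GGUF model file (hypothetical filename); requires
# `pip install llama-cpp-python` per the import guard above
representation_model = LlamaCPP("zephyr-7b-alpha.Q4_K_M.gguf")

topic_model = BERTopic(representation_model=representation_model)
```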
2 changes: 1 addition & 1 deletion bertopic/representation/_cohere.py
@@ -44,7 +44,7 @@ class Cohere(BaseRepresentation):
https://docs.cohere.ai/docs
Arguments:
client: A cohere.Client
client: A `cohere.Client`
model: Model to use within Cohere, defaults to `"xlarge"`.
prompt: The prompt to be used in the model. If no prompt is given,
`self.default_prompt_` is used instead.