Skip to content

Commit

Permalink
Revert "run ruff linter"
Browse files Browse the repository at this point in the history
This reverts commit 829e573.
  • Loading branch information
CShorten committed Mar 9, 2024
1 parent 79e3187 commit 275f133
Show file tree
Hide file tree
Showing 43 changed files with 179 additions and 105 deletions.
2 changes: 1 addition & 1 deletion dsp/modules/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from .anthropic import Claude
from .azure_openai import AzureOpenAI
from .bedrock import *
from .cache_utils import *
Expand All @@ -14,3 +13,4 @@
from .pyserini import *
from .sbert import *
from .sentence_vectorizer import *
from .anthropic import Claude
12 changes: 7 additions & 5 deletions dsp/modules/anthropic.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import logging
import os
from typing import Any, Optional

import backoff
import json
from typing import Optional, Any
from anthropic import Anthropic, RateLimitError

from dsp.modules.lm import LM
import logging


logger = logging.getLogger(__name__)

Expand All @@ -22,7 +23,7 @@ def backoff_hdlr(details):


def giveup_hdlr(details):
"""Wrapper function that decides when to give up on retry"""
"""wrapper function that decides when to give up on retry"""
if "rate limits" in details.message:
return False
return True
Expand All @@ -35,7 +36,7 @@ def __init__(
model: str = "claude-instant-1.2",
api_key: Optional[str] = None,
api_base: Optional[str] = None,
**kwargs,
**kwargs
):
super().__init__(model)
self.provider = "anthropic"
Expand Down Expand Up @@ -104,6 +105,7 @@ def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs):
Returns:
list[str]: list of completion choices
"""

assert only_completed, "for now"
assert return_sorted is False, "for now"

Expand Down
11 changes: 8 additions & 3 deletions dsp/modules/aws_lm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""A generalized AWS LLM.
"""
A generalized AWS LLM.
"""

from __future__ import annotations
Expand All @@ -16,7 +17,8 @@


class AWSLM(LM):
"""This class adds support for an AWS model
"""
This class adds support for an AWS model
"""

def __init__(
Expand All @@ -32,6 +34,7 @@ def __init__(
"""_summary_
Args:
service_name (str): Used in context of invoking the boto3 API.
region_name (str, optional): The AWS region where this LM is hosted.
model (str, optional): An LM name, e.g., a bedrock name or an AWS endpoint.
Expand Down Expand Up @@ -98,6 +101,7 @@ def _simple_api_call(self, formatted_prompt: str, **kwargs) -> str | list[str]:

def basic_request(self, prompt, **kwargs) -> str | list[str]:
"""Query the endpoint."""

# Remove any texts that are too long
formatted_prompt: str
if self._truncate_long_prompt_prompts:
Expand Down Expand Up @@ -162,7 +166,8 @@ def __call__(
return_sorted: bool = False,
**kwargs,
) -> list[str]:
"""Query the AWS LLM.
"""
Query the AWS LLM.
There is only support for only_completed=True and return_sorted=False
right now.
Expand Down
1 change: 1 addition & 0 deletions dsp/modules/azure_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ def __call__(
Returns:
list[dict[str, Any]]: list of completion choices
"""

assert only_completed, "for now"
assert return_sorted is False, "for now"

Expand Down
10 changes: 6 additions & 4 deletions dsp/modules/azurecognitivesearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,18 @@ def __call__(self, query: str, k: int = 10) -> Union[list[str], list[dotdict]]:
return [dotdict(psg) for psg in topk]

def azure_search_request(key_content: str, key_score: str, client: SearchClient, query: str, top: int =1):
"""Search in Azure Cognitive Search Index
"""
'''
Search in Azure Cognitive Search Index
'''
results = client.search(search_text=query,top=top)
results = process_azure_result(results, key_content, key_content)

return results

def process_azure_result(results:SearchItemPaged, content_key:str, content_score: str):
"""Process received result from Azure Cognitive Search as dictionary array and map content and score to correct format
"""
'''
process received result from Azure Cognitive Search as dictionary array and map content and score to correct format
'''
res = []
for result in results:
tmp = {}
Expand Down
1 change: 0 additions & 1 deletion dsp/modules/clarifai.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ class ClarifaiLLM(LM):
model (str, optional): Clarifai URL of the model. Defaults to "Mistral-7B-Instruct".
api_key (Optional[str], optional): CLARIFAI_PAT token. Defaults to None.
**kwargs: Additional arguments to pass to the API provider.
Example:
import dspy
dspy.configure(lm=dspy.Clarifai(model=MODEL_URL,
Expand Down
5 changes: 3 additions & 2 deletions dsp/modules/cohere.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def backoff_hdlr(details):


def giveup_hdlr(details):
"""Wrapper function that decides when to give up on retry"""
"""wrapper function that decides when to give up on retry"""
if "rate limits" in details.message:
return False
return True
Expand All @@ -42,7 +42,8 @@ def __init__(
stop_sequences: list[str] = [],
**kwargs,
):
"""Parameters
"""
Parameters
----------
model : str
Which pre-trained model from Cohere to use?
Expand Down
6 changes: 4 additions & 2 deletions dsp/modules/finetuning/finetune_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,8 @@ def _train_seq2seq(model, tokenizer, tokenized_dataset, metric, config):


def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model):
"""Resize tokenizer and embedding.
"""
Resize tokenizer and embedding.
Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
Expand All @@ -248,7 +249,8 @@ def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model):

@dataclass
class DataCollatorForSupervisedDataset:
"""Collate examples for supervised fine-tuning.
"""
Collate examples for supervised fine-tuning.
"""
tokenizer: PreTrainedTokenizer

Expand Down
5 changes: 3 additions & 2 deletions dsp/modules/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def backoff_hdlr(details):


def giveup_hdlr(details):
"""Wrapper function that decides when to give up on retry"""
"""wrapper function that decides when to give up on retry"""
if "rate limits" in details.message:
return False
return True
Expand Down Expand Up @@ -64,7 +64,8 @@ def __init__(
safety_settings: Optional[Iterable] = BLOCK_ONLY_HIGH,
**kwargs,
):
"""Parameters
"""
Parameters
----------
model : str
Which pre-trained model from Google to use?
Expand Down
1 change: 1 addition & 0 deletions dsp/modules/gpt3.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ def __call__(
Returns:
list[dict[str, Any]]: list of completion choices
"""

assert only_completed, "for now"
assert return_sorted is False, "for now"

Expand Down
3 changes: 2 additions & 1 deletion dsp/modules/hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def openai_to_hf(**kwargs):
class HFModel(LM):
def __init__(self, model: str, checkpoint: Optional[str] = None, is_client: bool = False,
hf_device_map: Literal["auto", "balanced", "balanced_low_0", "sequential"] = "auto"):
"""Wrapper for Hugging Face models
"""wrapper for Hugging Face models
Args:
model (str): HF model identifier to load and use
Expand All @@ -37,6 +37,7 @@ def __init__(self, model: str, checkpoint: Optional[str] = None, is_client: bool
hf_device_map (str, optional): HF config strategy to load the model.
Recommended to use "auto", which will help loading large models using accelerate. Defaults to "auto".
"""

super().__init__(model)
self.provider = "hf"
self.is_client = is_client
Expand Down
1 change: 1 addition & 0 deletions dsp/modules/ollama.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ def __call__(
Returns:
list[dict[str, Any]]: list of completion choices
"""

assert only_completed, "for now"
assert return_sorted is False, "for now"

Expand Down
25 changes: 14 additions & 11 deletions dsp/modules/pyserini.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,21 @@ def __init__(self,
dataset: Dataset = None,
id_field: str = '_id',
text_fields: list[str] = ['text']) -> None:
"""Args:
query_encoder (`str`):
Huggingface model to encode queries
index (`str`):
Either a prebuilt index from pyserini or a local path to a faiss index
dataset (`Dataset`):
Only required when using a local faiss index. The dataset should be the one that has been put into the faiss index.
id_field (`str`):
The name of the id field of the dataset used for retrieval.
text_fields (`list[str]`):
A list of the names of the text fields for the dataset used for retrieval.
"""
Args:
query_encoder (`str`):
Huggingface model to encode queries
index (`str`):
Either a prebuilt index from pyserini or a local path to a faiss index
dataset (`Dataset`):
Only required when using a local faiss index. The dataset should be the one that has been put into the faiss index.
id_field (`str`):
The name of the id field of the dataset used for retrieval.
text_fields (`list[str]`):
A list of the names of the text fields for the dataset used for retrieval.
"""

# Keep pyserini as an optional dependency
from pyserini.prebuilt_index_info import FAISS_INDEX_INFO, IMPACT_INDEX_INFO, TF_INDEX_INFO
from pyserini.search import FaissSearcher
Expand Down
25 changes: 15 additions & 10 deletions dsp/modules/sentence_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@


class BaseSentenceVectorizer(abc.ABC):
"""Base Class for Vectorizers. The main purpose is to vectorize text (doc/query)
'''
Base Class for Vectorizers. The main purpose is to vectorize text (doc/query)
for ANN/KNN indexes. `__call__` method takes `List[Example]` as a single input, then extracts
`field_to_vectorize` from every Example and convert them into embeddings.
You can customize extraction logic in the `_extract_text_from_examples` method.
"""
'''
# embeddings will be computed based on the string in this attribute of Example object
field_to_vectorize = 'text_to_vectorize'

Expand All @@ -28,11 +29,12 @@ def _extract_text_from_examples(self, inp_examples: List) -> List[str]:


class SentenceTransformersVectorizer(BaseSentenceVectorizer):
"""Vectorizer based on `SentenceTransformers` models. You can pick any model from this link:
'''
Vectorizer based on `SentenceTransformers` models. You can pick any model from this link:
https://huggingface.co/models?library=sentence-transformers
More details about models:
https://www.sbert.net/docs/pretrained_models.html
"""
'''
def __init__(
self,
model_name_or_path: str = 'all-MiniLM-L6-v2',
Expand Down Expand Up @@ -91,9 +93,10 @@ def __call__(self, inp_examples: List) -> np.ndarray:


class NaiveGetFieldVectorizer(BaseSentenceVectorizer):
"""If embeddings were precomputed, then we could just extract them from the proper field
'''
If embeddings were precomputed, then we could just extract them from the proper field
(set by `field_with_embedding`) from each `Example`.
"""
'''
def __init__(self, field_with_embedding: str = 'vectorized'):
self.field_with_embedding = field_with_embedding

Expand All @@ -107,11 +110,12 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:


class CohereVectorizer(BaseSentenceVectorizer):
"""This vectorizer uses the Cohere API to convert texts to embeddings.
'''
This vectorizer uses the Cohere API to convert texts to embeddings.
More about the available models: https://docs.cohere.com/reference/embed
`api_key` should be passed as an argument and can be retrieved
from https://dashboard.cohere.com/api-keys
"""
'''
def __init__(
self,
api_key: str,
Expand Down Expand Up @@ -156,10 +160,11 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:


class OpenAIVectorizer(BaseSentenceVectorizer):
"""This vectorizer uses OpenAI API to convert texts to embeddings. Changing `model` is not
'''
This vectorizer uses OpenAI API to convert texts to embeddings. Changing `model` is not
recommended. More about the model: https://openai.com/blog/new-and-improved-embedding-model/
`api_key` should be passed as an argument or as env variable (`OPENAI_API_KEY`).
"""
'''
def __init__(
self,
model: str = 'text-embedding-ada-002',
Expand Down
10 changes: 7 additions & 3 deletions dsp/primitives/demonstrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def sample(train: list[Example], k: int):

def all_but(train: list[Example], x: Example) -> list[Example]:
"""Removes the example x from the train set by comparing the question and history."""

output = [
y
for y in train
Expand Down Expand Up @@ -126,13 +127,15 @@ def passage_has_answers(passage: str, answers: list[str]) -> bool:


def cast_naive_get_only_question_text(inp_example: Example) -> Example:
"""Extracts question as a field to vectorize with Vectorizer object. `question` field is used.
"""
Extracts question as a field to vectorize with Vectorizer object. `question` field is used.
"""
return inp_example.copy(text_to_vectorize=inp_example.question)


def cast_naive_get_question_and_answer(inp_example: Example) -> Example:
"""Extracts question and answer as fields to vectorize with Vectorizer object.
"""
Extracts question and answer as fields to vectorize with Vectorizer object.
`question` and `answer` fields are used. They will be concatenated with the word "Answer"
between.
"""
Expand All @@ -147,7 +150,8 @@ def knn(
cast: Callable[[Example], Example] = cast_naive_get_only_question_text,
**knn_args,
) -> Callable[[Example, int], list[Example]]:
"""A function that vectorizes train data using `dsp.settings.vectorizer`, then builds an ANN/KNN
"""
A function that vectorizes train data using `dsp.settings.vectorizer`, then builds an ANN/KNN
index to search similar questions among `train` samples.
Args:
Expand Down
1 change: 1 addition & 0 deletions dsp/primitives/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ def majority(

def majority_vote_(completions: Completions, normalize: bool, prediction_field: str):
"""Core logic for majority vote."""

if not dsp.settings.lm:
raise AssertionError("No LM is loaded.")

Expand Down
6 changes: 4 additions & 2 deletions dsp/utils/ann_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@


def determine_devices(max_gpu_devices: int = 0) -> Tuple[int, bool]:
"""Determine which device we should use
"""
Determine which device we should use
Args:
max_gpu_devices: an integer value, define how many GPUs we'll use.
-1 means all devices. 0 means there are no GPUs. Default is 0.
Expand Down Expand Up @@ -86,7 +87,8 @@ def create_faiss_index(
in_list_dist_type: str = 'L2',
centroid_dist_type: str = 'L2',
) -> Index:
"""Create IVF index (with IP or L2 dist), without adding data and training
"""
Create IVF index (with IP or L2 dist), without adding data and training
Args:
emb_dim: size of each embedding
n_objects: size of a trainset for index. Used to determine optimal type
Expand Down
Loading

0 comments on commit 275f133

Please sign in to comment.