Revert "run ruff linter"

This reverts commit 829e573.
bakebrain · Mar 9, 2024 · 275f133 · 275f133
1 parent 79e3187
commit 275f133
Show file tree

Hide file tree

Showing 43 changed files with 179 additions and 105 deletions.
diff --git a/dsp/modules/__init__.py b/dsp/modules/__init__.py
@@ -1,4 +1,3 @@
-from .anthropic import Claude
 from .azure_openai import AzureOpenAI
 from .bedrock import *
 from .cache_utils import *
@@ -14,3 +13,4 @@
 from .pyserini import *
 from .sbert import *
 from .sentence_vectorizer import *
+from .anthropic import Claude
diff --git a/dsp/modules/anthropic.py b/dsp/modules/anthropic.py
@@ -1,11 +1,12 @@
-import logging
 import os
-from typing import Any, Optional
-
 import backoff
+import json
+from typing import Optional, Any
 from anthropic import Anthropic, RateLimitError
 
 from dsp.modules.lm import LM
+import logging
+
 
 logger = logging.getLogger(__name__)
 
@@ -22,7 +23,7 @@ def backoff_hdlr(details):
 
 
 def giveup_hdlr(details):
-    """Wrapper function that decides when to give up on retry"""
+    """wrapper function that decides when to give up on retry"""
     if "rate limits" in details.message:
         return False
     return True
@@ -35,7 +36,7 @@ def __init__(
             model: str = "claude-instant-1.2",
             api_key: Optional[str] = None,
             api_base: Optional[str] = None,
-            **kwargs,
+            **kwargs
     ):
         super().__init__(model)
         self.provider = "anthropic"
@@ -104,6 +105,7 @@ def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs):
         Returns:
             list[str]: list of completion choices
         """
+
         assert only_completed, "for now"
         assert return_sorted is False, "for now"
 

diff --git a/dsp/modules/aws_lm.py b/dsp/modules/aws_lm.py
@@ -1,4 +1,5 @@
-"""A generalized AWS LLM.
+"""
+A generalized AWS LLM.
 """
 
 from __future__ import annotations
@@ -16,7 +17,8 @@
 
 
 class AWSLM(LM):
-    """This class adds support for an AWS model
+    """
+    This class adds support for an AWS model
     """
 
     def __init__(
@@ -32,6 +34,7 @@ def __init__(
         """_summary_
 
         Args:
+
             service_name (str): Used in context of invoking the boto3 API.
             region_name (str, optional): The AWS region where this LM is hosted.
             model (str, optional): An LM name, e.g., a bedrock name or an AWS endpoint.
@@ -98,6 +101,7 @@ def _simple_api_call(self, formatted_prompt: str, **kwargs) -> str | list[str]:
 
     def basic_request(self, prompt, **kwargs) -> str | list[str]:
         """Query the endpoint."""
+
         # Remove any texts that are too long
         formatted_prompt: str
         if self._truncate_long_prompt_prompts:
@@ -162,7 +166,8 @@ def __call__(
         return_sorted: bool = False,
         **kwargs,
     ) -> list[str]:
-        """Query the AWS LLM.
+        """
+        Query the AWS LLM.
 
         There is only support for only_completed=True and return_sorted=False
         right now.

diff --git a/dsp/modules/azure_openai.py b/dsp/modules/azure_openai.py
@@ -193,6 +193,7 @@ def __call__(
         Returns:
             list[dict[str, Any]]: list of completion choices
         """
+
         assert only_completed, "for now"
         assert return_sorted is False, "for now"
 

diff --git a/dsp/modules/azurecognitivesearch.py b/dsp/modules/azurecognitivesearch.py
@@ -44,16 +44,18 @@ def __call__(self, query: str, k: int = 10) -> Union[list[str], list[dotdict]]:
         return [dotdict(psg) for psg in topk]
 
 def azure_search_request(key_content: str, key_score: str,  client: SearchClient, query: str, top: int =1):
-    """Search in Azure Cognitive Search Index
-    """
+    '''
+    Search in Azure Cognitive Search Index
+    '''
     results = client.search(search_text=query,top=top)
     results = process_azure_result(results, key_content, key_content)
 
     return results
 
 def process_azure_result(results:SearchItemPaged, content_key:str, content_score: str):
-    """Process received result from Azure Cognitive Search as dictionary array and map content and score to correct format
-    """
+    '''
+    process received result from Azure Cognitive Search as dictionary array and map content and score to correct format
+    '''
     res = []
     for result in results:
         tmp = {}

diff --git a/dsp/modules/clarifai.py b/dsp/modules/clarifai.py
@@ -11,7 +11,6 @@ class ClarifaiLLM(LM):
         model (str, optional): Clarifai URL of the model. Defaults to "Mistral-7B-Instruct".
         api_key (Optional[str], optional): CLARIFAI_PAT token. Defaults to None.
         **kwargs: Additional arguments to pass to the API provider.
-
     Example:
         import dspy
         dspy.configure(lm=dspy.Clarifai(model=MODEL_URL,

diff --git a/dsp/modules/cohere.py b/dsp/modules/cohere.py
@@ -23,7 +23,7 @@ def backoff_hdlr(details):
 
 
 def giveup_hdlr(details):
-    """Wrapper function that decides when to give up on retry"""
+    """wrapper function that decides when to give up on retry"""
     if "rate limits" in details.message:
         return False
     return True
@@ -42,7 +42,8 @@ def __init__(
         stop_sequences: list[str] = [],
         **kwargs,
     ):
-        """Parameters
+        """
+        Parameters
         ----------
         model : str
             Which pre-trained model from Cohere to use?

diff --git a/dsp/modules/finetuning/finetune_hf.py b/dsp/modules/finetuning/finetune_hf.py
@@ -229,7 +229,8 @@ def _train_seq2seq(model, tokenizer, tokenized_dataset, metric, config):
 
 
 def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model):
-    """Resize tokenizer and embedding.
+    """
+    Resize tokenizer and embedding.
     Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
     """
     num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
@@ -248,7 +249,8 @@ def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model):
 
 @dataclass
 class DataCollatorForSupervisedDataset:
-    """Collate examples for supervised fine-tuning.
+    """
+    Collate examples for supervised fine-tuning.
     """
     tokenizer: PreTrainedTokenizer
 

diff --git a/dsp/modules/google.py b/dsp/modules/google.py
@@ -25,7 +25,7 @@ def backoff_hdlr(details):
 
 
 def giveup_hdlr(details):
-    """Wrapper function that decides when to give up on retry"""
+    """wrapper function that decides when to give up on retry"""
     if "rate limits" in details.message:
         return False
     return True
@@ -64,7 +64,8 @@ def __init__(
         safety_settings: Optional[Iterable] = BLOCK_ONLY_HIGH,
         **kwargs,
     ):
-        """Parameters
+        """
+        Parameters
         ----------
         model : str
             Which pre-trained model from Google to use?

diff --git a/dsp/modules/gpt3.py b/dsp/modules/gpt3.py
@@ -173,6 +173,7 @@ def __call__(
         Returns:
             list[dict[str, Any]]: list of completion choices
         """
+
         assert only_completed, "for now"
         assert return_sorted is False, "for now"
 

diff --git a/dsp/modules/hf.py b/dsp/modules/hf.py
@@ -28,7 +28,7 @@ def openai_to_hf(**kwargs):
 class HFModel(LM):
     def __init__(self, model: str, checkpoint: Optional[str] = None, is_client: bool = False,
                  hf_device_map: Literal["auto", "balanced", "balanced_low_0", "sequential"] = "auto"):
-        """Wrapper for Hugging Face models
+        """wrapper for Hugging Face models
 
         Args:
             model (str): HF model identifier to load and use
@@ -37,6 +37,7 @@ def __init__(self, model: str, checkpoint: Optional[str] = None, is_client: bool
             hf_device_map (str, optional): HF config strategy to load the model.
                 Recommended to use "auto", which will help loading large models using accelerate. Defaults to "auto".
         """
+
         super().__init__(model)
         self.provider = "hf"
         self.is_client = is_client

diff --git a/dsp/modules/ollama.py b/dsp/modules/ollama.py
@@ -164,6 +164,7 @@ def __call__(
         Returns:
             list[dict[str, Any]]: list of completion choices
         """
+
         assert only_completed, "for now"
         assert return_sorted is False, "for now"
 

diff --git a/dsp/modules/pyserini.py b/dsp/modules/pyserini.py
@@ -15,18 +15,21 @@ def __init__(self,
                  dataset: Dataset = None,
                  id_field: str = '_id',
                  text_fields: list[str] = ['text']) -> None:
-        """Args:
-        query_encoder (`str`):
-            Huggingface model to encode queries
-        index (`str`):
-            Either a prebuilt index from pyserini or a local path to a faiss index
-        dataset (`Dataset`):
-            Only required when using a local faiss index. The dataset should be the one that has been put into the faiss index.
-        id_field (`str`):
-            The name of the id field of the dataset used for retrieval.
-        text_fields (`list[str]`):
-            A list of the names of the text fields for the dataset used for retrieval.
         """
+        Args:
+        
+            query_encoder (`str`):
+                Huggingface model to encode queries
+            index (`str`):
+                Either a prebuilt index from pyserini or a local path to a faiss index
+            dataset (`Dataset`):
+                Only required when using a local faiss index. The dataset should be the one that has been put into the faiss index.
+            id_field (`str`):
+                The name of the id field of the dataset used for retrieval.
+            text_fields (`list[str]`):
+                A list of the names of the text fields for the dataset used for retrieval.
+        """
+
         # Keep pyserini as an optional dependency
         from pyserini.prebuilt_index_info import FAISS_INDEX_INFO, IMPACT_INDEX_INFO, TF_INDEX_INFO
         from pyserini.search import FaissSearcher

diff --git a/dsp/modules/sentence_vectorizer.py b/dsp/modules/sentence_vectorizer.py
@@ -6,11 +6,12 @@
 
 
 class BaseSentenceVectorizer(abc.ABC):
-    """Base Class for Vectorizers. The main purpose is to vectorize text (doc/query)
+    '''
+    Base Class for Vectorizers. The main purpose is to vectorize text (doc/query)
     for ANN/KNN indexes. `__call__` method takes `List[Example]` as a single input, then extracts
     `field_to_vectorize` from every Example and convert them into embeddings.
     You can customize extraction logic in the `_extract_text_from_examples` method.
-    """
+    '''
     # embeddings will be computed based on the string in this attribute of Example object
     field_to_vectorize = 'text_to_vectorize'
 
@@ -28,11 +29,12 @@ def _extract_text_from_examples(self, inp_examples: List) -> List[str]:
 
 
 class SentenceTransformersVectorizer(BaseSentenceVectorizer):
-    """Vectorizer based on `SentenceTransformers` models. You can pick any model from this link:
+    '''
+    Vectorizer based on `SentenceTransformers` models. You can pick any model from this link:
     https://huggingface.co/models?library=sentence-transformers
     More details about models:
     https://www.sbert.net/docs/pretrained_models.html
-    """
+    '''
     def __init__(
         self,
         model_name_or_path: str = 'all-MiniLM-L6-v2',
@@ -91,9 +93,10 @@ def __call__(self, inp_examples: List) -> np.ndarray:
 
 
 class NaiveGetFieldVectorizer(BaseSentenceVectorizer):
-    """If embeddings were precomputed, then we could just extract them from the proper field
+    '''
+    If embeddings were precomputed, then we could just extract them from the proper field 
     (set by `field_with_embedding`) from each `Example`.
-    """
+    '''
     def __init__(self, field_with_embedding: str = 'vectorized'):
         self.field_with_embedding = field_with_embedding
 
@@ -107,11 +110,12 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
 
 
 class CohereVectorizer(BaseSentenceVectorizer):
-    """This vectorizer uses the Cohere API to convert texts to embeddings.
+    '''
+    This vectorizer uses the Cohere API to convert texts to embeddings.
     More about the available models: https://docs.cohere.com/reference/embed
     `api_key` should be passed as an argument and can be retrieved
     from https://dashboard.cohere.com/api-keys
-    """
+    '''
     def __init__(
         self,
         api_key: str,
@@ -156,10 +160,11 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
 
 
 class OpenAIVectorizer(BaseSentenceVectorizer):
-    """This vectorizer uses OpenAI API to convert texts to embeddings. Changing `model` is not
+    '''
+    This vectorizer uses OpenAI API to convert texts to embeddings. Changing `model` is not
     recommended. More about the model: https://openai.com/blog/new-and-improved-embedding-model/
     `api_key` should be passed as an argument or as env variable (`OPENAI_API_KEY`).
-    """
+    '''
     def __init__(
         self,
         model: str = 'text-embedding-ada-002',

diff --git a/dsp/primitives/demonstrate.py b/dsp/primitives/demonstrate.py
@@ -90,6 +90,7 @@ def sample(train: list[Example], k: int):
 
 def all_but(train: list[Example], x: Example) -> list[Example]:
     """Removes the example x from the train set by comparing the question and history."""
+
     output = [
         y
         for y in train
@@ -126,13 +127,15 @@ def passage_has_answers(passage: str, answers: list[str]) -> bool:
 
 
 def cast_naive_get_only_question_text(inp_example: Example) -> Example:
-    """Extracts question as a field to vectorize with Vectorizer object. `question` field is used.
+    """
+    Extracts question as a field to vectorize with Vectorizer object. `question` field is used.
     """
     return inp_example.copy(text_to_vectorize=inp_example.question)
 
 
 def cast_naive_get_question_and_answer(inp_example: Example) -> Example:
-    """Extracts question and answer as fields to vectorize with Vectorizer object.
+    """
+    Extracts question and answer as fields to vectorize with Vectorizer object.
     `question` and `answer` fields are used. They will be concatenated with the word "Answer"
     between.
     """
@@ -147,7 +150,8 @@ def knn(
     cast: Callable[[Example], Example] = cast_naive_get_only_question_text,
     **knn_args,
 ) -> Callable[[Example, int], list[Example]]:
-    """A function that vectorizes train data using `dsp.settings.vectorizer`, then builds an ANN/KNN
+    """
+    A function that vectorizes train data using `dsp.settings.vectorizer`, then builds an ANN/KNN
     index to search similar questions among `train` samples.
 
     Args:

diff --git a/dsp/primitives/predict.py b/dsp/primitives/predict.py
@@ -199,6 +199,7 @@ def majority(
 
 def majority_vote_(completions: Completions, normalize: bool, prediction_field: str):
     """Core logic for majority vote."""
+
     if not dsp.settings.lm:
         raise AssertionError("No LM is loaded.")
 

diff --git a/dsp/utils/ann_utils.py b/dsp/utils/ann_utils.py
@@ -11,7 +11,8 @@
 
 
 def determine_devices(max_gpu_devices: int = 0) -> Tuple[int, bool]:
-    """Determine which device we should use
+    """
+    Determine which device we should use
     Args:
         max_gpu_devices: an integer value, define how many GPUs we'll use.
             -1 means all devices. 0 means there are no GPUs. Default is 0.
@@ -86,7 +87,8 @@ def create_faiss_index(
     in_list_dist_type: str = 'L2',
     centroid_dist_type: str = 'L2',
 ) -> Index:
-    """Create IVF index (with IP or L2 dist), without adding data and training
+    """
+    Create IVF index (with IP or L2 dist), without adding data and training
     Args:
         emb_dim: size of each embedding
         n_objects: size of a trainset for index. Used to determine optimal type