
Commit bfd6742: Big reformat push

thomasahle committed Mar 1, 2024
1 parent ae96ab9
Showing 72 changed files with 271 additions and 315 deletions.
2 changes: 1 addition & 1 deletion dsp/evaluation/utils.py
@@ -7,7 +7,7 @@
from IPython.display import display as ipython_display
except ImportError:
ipython_display = print
from dsp.utils import EM, F1, HotPotF1
from dsp.utils import EM


def evaluateRetrieval(fn, dev, metric=None):
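The change above drops F1 and HotPotF1 from the import because only EM is referenced in this module, which looks like an automated unused-import cleanup (a ruff/flake8 F401-style fix; the exact tool is not stated in the commit). A minimal, standard-library sketch of how such unused imports can be detected; the sample source string is illustrative, not the repository file:

import ast

# Hypothetical module source; only EM is actually referenced after the import.
source = """
from dsp.utils import EM, F1, HotPotF1

def evaluate(answer, prediction):
    return EM(answer, prediction)
"""

tree = ast.parse(source)
imported = {alias.asname or alias.name
            for node in ast.walk(tree) if isinstance(node, ast.ImportFrom)
            for alias in node.names}
used = {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)}
print(sorted(imported - used))  # ['F1', 'HotPotF1'] -> candidates for removal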
10 changes: 5 additions & 5 deletions dsp/modules/aws_lm.py
@@ -81,7 +81,7 @@ def _call_model(self, body: str) -> str | list[str]:

@abstractmethod
def _extract_input_parameters(
self, body: dict[Any, Any]
self, body: dict[Any, Any],
) -> dict[str, str | float | int]:
pass

@@ -94,7 +94,7 @@ def _simple_api_call(self, formatted_prompt: str, **kwargs) -> str | list[str]:
else:
llm_out = [generated.replace(formatted_prompt, "") for generated in llm_out]
self.history.append(
{"prompt": formatted_prompt, "response": llm_out, "kwargs": body}
{"prompt": formatted_prompt, "response": llm_out, "kwargs": body},
)
return llm_out

@@ -107,20 +107,20 @@ def basic_request(self, prompt, **kwargs) -> str | list[str]:
truncated_prompt: str = self._truncate_prompt(prompt)
formatted_prompt = self._format_prompt(truncated_prompt)
else:
formatted_prompt = self._format_prompt((prompt))
formatted_prompt = self._format_prompt(prompt)

llm_out: str | list[str]
if "n" in kwargs.keys():
if self._batch_n:
llm_out = self._simple_api_call(
formatted_prompt=formatted_prompt, **kwargs
formatted_prompt=formatted_prompt, **kwargs,
)
else:
del kwargs["n"]
llm_out = []
for _ in range(0, kwargs["n"]):
generated: str | list[str] = self._simple_api_call(
formatted_prompt=formatted_prompt, **kwargs
formatted_prompt=formatted_prompt, **kwargs,
)
if isinstance(generated, str):
llm_out.append(generated)
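Most of the edits in this file add a trailing comma after the final argument of multi-line calls, plus the removal of a redundant pair of parentheses around prompt. A minimal sketch of the trailing-comma style and why reformatters favor it, using illustrative names rather than the actual aws_lm.py API:

def simple_api_call(prompt, **kwargs):
    # Stand-in for an LM request; just echoes its inputs.
    return {"prompt": prompt, **kwargs}

# With a trailing comma after the last argument, appending another keyword
# later only adds a new line instead of also touching the previous one,
# which keeps future diffs smaller.
response = simple_api_call(
    prompt="formatted prompt",
    temperature=0.7,
)
print(response)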
6 changes: 3 additions & 3 deletions dsp/modules/azurecognitivesearch.py
@@ -1,14 +1,14 @@
from typing import Optional, Union, Any
from typing import Union, Any

from dsp.utils import dotdict
try:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents._paging import SearchItemPaged
except ImportError as e:
except ImportError:
raise ImportError(
"You need to install azure-search-documents library"
"Please use the command: pip install azure-search-documents"
"Please use the command: pip install azure-search-documents",
)

class AzureCognitiveSearch:
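Besides the narrowed typing import and trailing comma, the except ImportError as e: clause loses the as e binding because the exception object was never used. A sketch of that optional-dependency guard pattern; names follow the hunk above, and the message wording is adjusted slightly for readability, so this is not the repository code verbatim:

try:
    from azure.search.documents import SearchClient  # optional dependency
except ImportError:
    # No `as e` binding needed: the caught exception is never referenced.
    raise ImportError(
        "You need to install azure-search-documents library. "
        "Please use the command: pip install azure-search-documents",
    )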
4 changes: 2 additions & 2 deletions dsp/modules/bedrock.py
@@ -47,7 +47,7 @@ def _create_body(self, prompt: str, **kwargs) -> dict[str, str | float]:
query_args: dict[str, Any] = self._sanitize_kwargs(base_args)
query_args["prompt"] = prompt
# AWS Bedrock forbids these keys
if "max_tokens" in query_args.keys():
if "max_tokens" in query_args:
max_tokens: int = query_args["max_tokens"]
input_tokens: int = self._estimate_tokens(prompt)
max_tokens_to_sample: int = max_tokens - input_tokens
@@ -67,7 +67,7 @@ def _call_model(self, body: str) -> str:
return completion

def _extract_input_parameters(
self, body: dict[Any, Any]
self, body: dict[Any, Any],
) -> dict[str, str | float | int]:
return body

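The first hunk replaces "max_tokens" in query_args.keys() with "max_tokens" in query_args: a membership test on a dict already checks its keys, so the .keys() call is redundant (this resembles the common SIM118 lint fix, though the tool is not named in the commit). A small self-contained illustration:

query_args = {"max_tokens": 256, "temperature": 0.0}

# Equivalent membership tests; the first avoids building the keys() view.
assert ("max_tokens" in query_args) == ("max_tokens" in query_args.keys())

if "max_tokens" in query_args:
    max_tokens = query_args["max_tokens"]
    print(f"reserving {max_tokens} tokens for the completion")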
8 changes: 4 additions & 4 deletions dsp/modules/cohere.py
@@ -17,7 +17,7 @@ def backoff_hdlr(details):
print(
"Backing off {wait:0.1f} seconds after {tries} tries "
"calling function {target} with kwargs "
"{kwargs}".format(**details)
"{kwargs}".format(**details),
)


@@ -39,7 +39,7 @@ def __init__(
model: str = "command-nightly",
api_key: Optional[str] = None,
stop_sequences: list[str] = [],
**kwargs
**kwargs,
):
"""
Parameters
@@ -66,7 +66,7 @@ def __init__(
"frequency_penalty": 0,
"presence_penalty": 0,
"num_generations": 1,
**kwargs
**kwargs,
}
self.stop_sequences = stop_sequences
self.max_num_generations = 5
@@ -109,7 +109,7 @@ def __call__(
prompt: str,
only_completed: bool = True,
return_sorted: bool = False,
**kwargs
**kwargs,
):
assert only_completed, "for now"
assert return_sorted is False, "for now"
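The backoff_hdlr touched here (and in several other modules in this commit) is a standard backoff event handler: the library passes a details dict with keys such as wait, tries, target and kwargs. A minimal, self-contained sketch of how such a handler is wired up; the flaky function below is an illustrative stand-in, not the Cohere client:

import backoff  # third-party: pip install backoff


def backoff_hdlr(details):
    print(
        "Backing off {wait:0.1f} seconds after {tries} tries "
        "calling function {target} with kwargs "
        "{kwargs}".format(**details),
    )


_attempts = {"n": 0}


@backoff.on_exception(backoff.expo, ValueError, max_tries=5, on_backoff=backoff_hdlr)
def flaky_request(**kwargs):
    # Stand-in for an API call that fails twice, then succeeds.
    _attempts["n"] += 1
    if _attempts["n"] < 3:
        raise ValueError("transient error")
    return "ok"


print(flaky_request(model="command-nightly"))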
6 changes: 3 additions & 3 deletions dsp/modules/colbertv2.py
@@ -22,7 +22,7 @@ def __init__(
self.url = f"{url}:{port}" if port else url

def __call__(
self, query: str, k: int = 10, simplify: bool = False
self, query: str, k: int = 10, simplify: bool = False,
) -> Union[list[str], list[dotdict]]:
if self.post_requests:
topk: list[dict[str, Any]] = colbertv2_post_request(self.url, query, k)
@@ -49,7 +49,7 @@ def colbertv2_get_request_v2(url: str, query: str, k: int):
return topk[:k]


@functools.lru_cache(maxsize=None)
@functools.cache
@NotebookCacheMemory.cache
def colbertv2_get_request_v2_wrapped(*args, **kwargs):
return colbertv2_get_request_v2(*args, **kwargs)
@@ -67,7 +67,7 @@ def colbertv2_post_request_v2(url: str, query: str, k: int):
return res.json()["topk"][:k]


@functools.lru_cache(maxsize=None)
@functools.cache
@NotebookCacheMemory.cache
def colbertv2_post_request_v2_wrapped(*args, **kwargs):
return colbertv2_post_request_v2(*args, **kwargs)
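functools.cache, available since Python 3.9, is simply lru_cache(maxsize=None) under a clearer name, so the decorator swap above keeps the same unbounded memoization behavior. A small illustration with a toy function (fib is not part of the module):

import functools


@functools.cache  # equivalent to @functools.lru_cache(maxsize=None)
def fib(n: int) -> int:
    return n if n < 2 else fib(n - 1) + fib(n - 2)


print(fib(80))           # fast thanks to memoized recursive calls
print(fib.cache_info())  # hits/misses counters, same API as lru_cache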
11 changes: 4 additions & 7 deletions dsp/modules/databricks.py
@@ -1,21 +1,18 @@
import logging
from logging.handlers import RotatingFileHandler

# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(message)s',
handlers=[
logging.FileHandler('openai_usage.log')
]
logging.FileHandler('openai_usage.log'),
],
)

import functools
import json
from typing import Any, Literal, Optional, cast
from typing import Literal, Optional

import dsp
import backoff
import openai

from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on
@@ -35,7 +32,7 @@ def backoff_hdlr(details):
print(
"Backing off {wait:0.1f} seconds after {tries} tries "
"calling function {target} with kwargs "
"{kwargs}".format(**details)
"{kwargs}".format(**details),
)

class Databricks(GPT3):
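The first hunk trims unused imports and adds trailing commas inside the logging.basicConfig call; the configuration itself routes bare messages to openai_usage.log. A runnable sketch of that setup, where the log file name comes from the diff and the log record is illustrative:

import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(message)s',
    handlers=[
        logging.FileHandler('openai_usage.log'),
    ],
)

logging.info("prompt_tokens=12 completion_tokens=87")  # appended to openai_usage.log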
9 changes: 4 additions & 5 deletions dsp/modules/finetuning/finetune_hf.py
@@ -5,7 +5,6 @@
import copy
import glob
import torch
import random
import warnings
import evaluate
import numpy as np
@@ -247,7 +246,7 @@ def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model):


@dataclass
class DataCollatorForSupervisedDataset(object):
class DataCollatorForSupervisedDataset:
"""
Collate examples for supervised fine-tuning.
"""
@@ -316,7 +315,7 @@ def finetune_hf(data_path, target, config):
# training completed, load best model
ckpts = glob.glob(f'{output_dir}/checkpoint*')
final_ckpt = sorted(ckpts, key=lambda x: int(x.split('-')[-1]))[-1]
with open(os.path.join(final_ckpt, 'trainer_state.json'), 'r') as f:
with open(os.path.join(final_ckpt, 'trainer_state.json')) as f:
state = json.load(f)
best_model_checkpoint = state['best_model_checkpoint']

@@ -331,8 +330,8 @@ def finetune_hf(data_path, target, config):
encoder_decoder_model = ("ConditionalGeneration" in architecture) or ("T5WithLMHeadModel" in architecture)
decoder_only_model = ("CausalLM" in architecture) or ("GPT2LMHeadModel" in architecture)
assert encoder_decoder_model or decoder_only_model, f"Unknown HuggingFace model class: {target}"
assert not config['fid'] or encoder_decoder_model, f"Model must be encoder-decoder for Fusion in Decoder"
assert not config['fid'] or not config['peft'], f"FiD and PEFT can't be trained together"
assert not config['fid'] or encoder_decoder_model, "Model must be encoder-decoder for Fusion in Decoder"
assert not config['fid'] or not config['peft'], "FiD and PEFT can't be trained together"

# load model
AutoModelClass = AutoModelForSeq2SeqLM if encoder_decoder_model else AutoModelForCausalLM
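Three distinct modernizations appear in this file: class DataCollatorForSupervisedDataset(object) drops the redundant object base, open(path, 'r') drops the default 'r' mode, and f-strings without placeholders become plain strings. A self-contained sketch of all three; the class and file contents here are illustrative stand-ins, not the training code:

import json
from dataclasses import dataclass


@dataclass
class DataCollator:  # no explicit (object) base needed on Python 3
    pad_token_id: int = 0


# Write a tiny stand-in for trainer_state.json so the read below is runnable.
with open('trainer_state.json', 'w') as f:
    json.dump({'best_model_checkpoint': 'checkpoint-100'}, f)

with open('trainer_state.json') as f:  # 'r' is the default mode
    state = json.load(f)

assert 'best_model_checkpoint' in state, "missing key"  # plain string, no f-prefix
print(DataCollator(), state['best_model_checkpoint'])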
19 changes: 10 additions & 9 deletions dsp/modules/google.py
@@ -1,5 +1,6 @@
import os
from typing import Any, Iterable, Optional
from typing import Any, Optional
from collections.abc import Iterable
import backoff

from dsp.modules.lm import LM
@@ -18,7 +19,7 @@ def backoff_hdlr(details):
print(
"Backing off {wait:0.1f} seconds after {tries} tries "
"calling function {target} with kwargs "
"{kwargs}".format(**details)
"{kwargs}".format(**details),
)


@@ -32,19 +33,19 @@ def giveup_hdlr(details):
BLOCK_ONLY_HIGH = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_ONLY_HIGH"
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_ONLY_HIGH"
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_ONLY_HIGH"
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_ONLY_HIGH"
"threshold": "BLOCK_ONLY_HIGH",
},
]

@@ -60,7 +61,7 @@ def __init__(
model: str = "models/gemini-1.0-pro",
api_key: Optional[str] = None,
safety_settings: Optional[Iterable] = BLOCK_ONLY_HIGH,
**kwargs
**kwargs,
):
"""
Parameters
@@ -89,7 +90,7 @@ def __init__(
"max_output_tokens": 2048,
"top_p": 1,
"top_k": 1,
**kwargs
**kwargs,
}

self.config = genai.GenerationConfig(**kwargs)
@@ -145,7 +146,7 @@ def __call__(
prompt: str,
only_completed: bool = True,
return_sorted: bool = False,
**kwargs
**kwargs,
):
assert only_completed, "for now"
assert return_sorted is False, "for now"
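The import change at the top moves Iterable from typing to collections.abc; the typing aliases for container ABCs are deprecated since Python 3.9 and the two names are interchangeable in annotations. A small sketch reusing the safety-settings shape from the hunk above; the helper function is illustrative, not part of google.py:

from collections.abc import Iterable
from typing import Optional


def count_high_blocks(safety_settings: Optional[Iterable[dict]] = None) -> int:
    settings = list(safety_settings or [])
    return sum(s.get("threshold") == "BLOCK_ONLY_HIGH" for s in settings)


BLOCK_ONLY_HIGH = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_ONLY_HIGH"},
]
print(count_high_blocks(BLOCK_ONLY_HIGH))  # 2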
3 changes: 1 addition & 2 deletions dsp/modules/gpt3.py
@@ -1,5 +1,4 @@
import logging
from logging.handlers import RotatingFileHandler

# Configure logging
logging.basicConfig(
@@ -43,7 +42,7 @@ def backoff_hdlr(details):
print(
"Backing off {wait:0.1f} seconds after {tries} tries "
"calling function {target} with kwargs "
"{kwargs}".format(**details)
"{kwargs}".format(**details),
)


8 changes: 2 additions & 6 deletions dsp/modules/hf.py
@@ -1,13 +1,9 @@
import os
import json
# from peft import PeftConfig, PeftModel
# from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer, AutoConfig
from typing import Optional, Literal

from dsp.modules.lm import LM
# from dsp.modules.finetuning.finetune_hf import preprocess_prompt
from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on
import functools

def openai_to_hf(**kwargs):
hf_kwargs = {}
@@ -51,7 +47,7 @@ def __init__(self, model: str, checkpoint: Optional[str] = None, is_client: bool
import torch
except ImportError as exc:
raise ModuleNotFoundError(
"You need to install Hugging Face transformers library to use HF models."
"You need to install Hugging Face transformers library to use HF models.",
) from exc
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
try:
@@ -85,7 +81,7 @@ def __init__(self, model: str, checkpoint: Optional[str] = None, is_client: bool
except ValueError:
self.model = AutoModelForCausalLM.from_pretrained(
model if checkpoint is None else checkpoint,
device_map=self.device_map
device_map=self.device_map,
)
self.drop_prompt_from_output = True
self.tokenizer = AutoTokenizer.from_pretrained(model)
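Beyond deleting commented-out imports and adding trailing commas, the hunk keeps the raise ... from exc form: chaining preserves the original ImportError as __cause__ in the traceback. A sketch of that guard as it appears above; it only raises when the optional dependency is genuinely absent:

try:
    import torch  # optional heavy dependency, as in the hunk above
except ImportError as exc:
    raise ModuleNotFoundError(
        "You need to install Hugging Face transformers library to use HF models.",
    ) from exc

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)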
(Diff for the remaining changed files not shown.)
