
Commit bfd6742: Big reformat push

thomasahle committed Mar 1, 2024
1 parent ae96ab9
Showing 72 changed files with 271 additions and 315 deletions.
2 changes: 1 addition & 1 deletion dsp/evaluation/utils.py
@@ -7,7 +7,7 @@
from IPython.display import display as ipython_display
except ImportError:
ipython_display = print
from dsp.utils import EM, F1, HotPotF1
from dsp.utils import EM


def evaluateRetrieval(fn, dev, metric=None):
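The change above drops F1 and HotPotF1 from the import because only EM is referenced in this module, which looks like an automated unused-import cleanup (a ruff/flake8 F401-style fix; the exact tool is not stated in the commit). A minimal, standard-library sketch of how such unused imports can be detected; the sample source string is illustrative, not the repository file:

import ast

# Hypothetical module source; only EM is actually referenced after the import.
source = """
from dsp.utils import EM, F1, HotPotF1

def evaluate(answer, prediction):
    return EM(answer, prediction)
"""

tree = ast.parse(source)
imported = {alias.asname or alias.name
            for node in ast.walk(tree) if isinstance(node, ast.ImportFrom)
            for alias in node.names}
used = {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)}
print(sorted(imported - used))  # ['F1', 'HotPotF1'] -> candidates for removal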
10 changes: 5 additions & 5 deletions dsp/modules/aws_lm.py
@@ -81,7 +81,7 @@ def _call_model(self, body: str) -> str | list[str]:

@abstractmethod
def _extract_input_parameters(
self, body: dict[Any, Any]
self, body: dict[Any, Any],
) -> dict[str, str | float | int]:
pass

@@ -94,7 +94,7 @@ def _simple_api_call(self, formatted_prompt: str, **kwargs) -> str | list[str]:
else:
llm_out = [generated.replace(formatted_prompt, "") for generated in llm_out]
self.history.append(
{"prompt": formatted_prompt, "response": llm_out, "kwargs": body}
{"prompt": formatted_prompt, "response": llm_out, "kwargs": body},
)
return llm_out

@@ -107,20 +107,20 @@ def basic_request(self, prompt, **kwargs) -> str | list[str]:
truncated_prompt: str = self._truncate_prompt(prompt)
formatted_prompt = self._format_prompt(truncated_prompt)
else:
formatted_prompt = self._format_prompt((prompt))
formatted_prompt = self._format_prompt(prompt)

llm_out: str | list[str]
if "n" in kwargs.keys():
if self._batch_n:
llm_out = self._simple_api_call(
formatted_prompt=formatted_prompt, **kwargs
formatted_prompt=formatted_prompt, **kwargs,
)
else:
del kwargs["n"]
llm_out = []
for _ in range(0, kwargs["n"]):
generated: str | list[str] = self._simple_api_call(
formatted_prompt=formatted_prompt, **kwargs
formatted_prompt=formatted_prompt, **kwargs,
)
if isinstance(generated, str):
llm_out.append(generated)
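Most of the edits in this file add a trailing comma after the final argument of multi-line calls, plus the removal of a redundant pair of parentheses around prompt. A minimal sketch of the trailing-comma style and why reformatters favor it, using illustrative names rather than the actual aws_lm.py API:

def simple_api_call(prompt, **kwargs):
    # Stand-in for an LM request; just echoes its inputs.
    return {"prompt": prompt, **kwargs}

# With a trailing comma after the last argument, appending another keyword
# later only adds a new line instead of also touching the previous one,
# which keeps future diffs smaller.
response = simple_api_call(
    prompt="formatted prompt",
    temperature=0.7,
)
print(response)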
6 changes: 3 additions & 3 deletions dsp/modules/azurecognitivesearch.py
@@ -1,14 +1,14 @@
from typing import Optional, Union, Any
from typing import Union, Any

from dsp.utils import dotdict
try:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents._paging import SearchItemPaged
except ImportError as e:
except ImportError:
raise ImportError(
"You need to install azure-search-documents library"
"Please use the command: pip install azure-search-documents"
"Please use the command: pip install azure-search-documents",
)

class AzureCognitiveSearch:
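Besides the narrowed typing import and trailing comma, the except ImportError as e: clause loses the as e binding because the exception object was never used. A sketch of that optional-dependency guard pattern; names follow the hunk above, and the message wording is adjusted slightly for readability, so this is not the repository code verbatim:

try:
    from azure.search.documents import SearchClient  # optional dependency
except ImportError:
    # No `as e` binding needed: the caught exception is never referenced.
    raise ImportError(
        "You need to install azure-search-documents library. "
        "Please use the command: pip install azure-search-documents",
    )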
4 changes: 2 additions & 2 deletions dsp/modules/bedrock.py
@@ -47,7 +47,7 @@ def _create_body(self, prompt: str, **kwargs) -> dict[str, str | float]:
query_args: dict[str, Any] = self._sanitize_kwargs(base_args)
query_args["prompt"] = prompt
# AWS Bedrock forbids these keys
if "max_tokens" in query_args.keys():
if "max_tokens" in query_args:
max_tokens: int = query_args["max_tokens"]
input_tokens: int = self._estimate_tokens(prompt)
max_tokens_to_sample: int = max_tokens - input_tokens
@@ -67,7 +67,7 @@ def _call_model(self, body: str) -> str:
return completion

def _extract_input_parameters(
self, body: dict[Any, Any]
self, body: dict[Any, Any],
) -> dict[str, str | float | int]:
return body

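The first hunk replaces "max_tokens" in query_args.keys() with "max_tokens" in query_args: a membership test on a dict already checks its keys, so the .keys() call is redundant (this resembles the common SIM118 lint fix, though the tool is not named in the commit). A small self-contained illustration:

query_args = {"max_tokens": 256, "temperature": 0.0}

# Equivalent membership tests; the first avoids building the keys() view.
assert ("max_tokens" in query_args) == ("max_tokens" in query_args.keys())

if "max_tokens" in query_args:
    max_tokens = query_args["max_tokens"]
    print(f"reserving {max_tokens} tokens for the completion")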
8 changes: 4 additions & 4 deletions dsp/modules/cohere.py
@@ -17,7 +17,7 @@ def backoff_hdlr(details):
print(
"Backing off {wait:0.1f} seconds after {tries} tries "
"calling function {target} with kwargs "
"{kwargs}".format(**details)
"{kwargs}".format(**details),
)


@@ -39,7 +39,7 @@ def __init__(
model: str = "command-nightly",
api_key: Optional[str] = None,
stop_sequences: list[str] = [],
**kwargs
**kwargs,
):
"""
Parameters
@@ -66,7 +66,7 @@ def __init__(
"frequency_penalty": 0,
"presence_penalty": 0,
"num_generations": 1,
**kwargs
**kwargs,
}
self.stop_sequences = stop_sequences
self.max_num_generations = 5
@@ -109,7 +109,7 @@ def __call__(
prompt: str,
only_completed: bool = True,
return_sorted: bool = False,
**kwargs
**kwargs,
):
assert only_completed, "for now"
assert return_sorted is False, "for now"
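The backoff_hdlr touched here (and in several other modules in this commit) is a standard backoff event handler: the library passes a details dict with keys such as wait, tries, target and kwargs. A minimal, self-contained sketch of how such a handler is wired up; the flaky function below is an illustrative stand-in, not the Cohere client:

import backoff  # third-party: pip install backoff


def backoff_hdlr(details):
    print(
        "Backing off {wait:0.1f} seconds after {tries} tries "
        "calling function {target} with kwargs "
        "{kwargs}".format(**details),
    )


_attempts = {"n": 0}


@backoff.on_exception(backoff.expo, ValueError, max_tries=5, on_backoff=backoff_hdlr)
def flaky_request(**kwargs):
    # Stand-in for an API call that fails twice, then succeeds.
    _attempts["n"] += 1
    if _attempts["n"] < 3:
        raise ValueError("transient error")
    return "ok"


print(flaky_request(model="command-nightly"))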
6 changes: 3 additions & 3 deletions dsp/modules/colbertv2.py
@@ -22,7 +22,7 @@ def __init__(
self.url = f"{url}:{port}" if port else url

def __call__(
self, query: str, k: int = 10, simplify: bool = False
self, query: str, k: int = 10, simplify: bool = False,
) -> Union[list[str], list[dotdict]]:
if self.post_requests:
topk: list[dict[str, Any]] = colbertv2_post_request(self.url, query, k)
@@ -49,7 +49,7 @@ def colbertv2_get_request_v2(url: str, query: str, k: int):
return topk[:k]


@functools.lru_cache(maxsize=None)
@functools.cache
@NotebookCacheMemory.cache
def colbertv2_get_request_v2_wrapped(*args, **kwargs):
return colbertv2_get_request_v2(*args, **kwargs)
@@ -67,7 +67,7 @@ def colbertv2_post_request_v2(url: str, query: str, k: int):
return res.json()["topk"][:k]


@functools.lru_cache(maxsize=None)
@functools.cache
@NotebookCacheMemory.cache
def colbertv2_post_request_v2_wrapped(*args, **kwargs):
return colbertv2_post_request_v2(*args, **kwargs)
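functools.cache, available since Python 3.9, is simply lru_cache(maxsize=None) under a clearer name, so the decorator swap above keeps the same unbounded memoization behavior. A small illustration with a toy function (fib is not part of the module):

import functools


@functools.cache  # equivalent to @functools.lru_cache(maxsize=None)
def fib(n: int) -> int:
    return n if n < 2 else fib(n - 1) + fib(n - 2)


print(fib(80))           # fast thanks to memoized recursive calls
print(fib.cache_info())  # hits/misses counters, same API as lru_cache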
11 changes: 4 additions & 7 deletions dsp/modules/databricks.py
@@ -1,21 +1,18 @@
import logging
from logging.handlers import RotatingFileHandler

# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(message)s',
handlers=[
logging.FileHandler('openai_usage.log')
]
logging.FileHandler('openai_usage.log'),
],
)

import functools
import json
from typing import Any, Literal, Optional, cast
from typing import Literal, Optional

import dsp
import backoff
import openai

from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on
@@ -35,7 +32,7 @@ def backoff_hdlr(details):
print(
"Backing off {wait:0.1f} seconds after {tries} tries "
"calling function {target} with kwargs "
"{kwargs}".format(**details)
"{kwargs}".format(**details),
)

class Databricks(GPT3):
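The first hunk trims unused imports and adds trailing commas inside the logging.basicConfig call; the configuration itself routes bare messages to openai_usage.log. A runnable sketch of that setup, where the log file name comes from the diff and the log record is illustrative:

import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(message)s',
    handlers=[
        logging.FileHandler('openai_usage.log'),
    ],
)

logging.info("prompt_tokens=12 completion_tokens=87")  # appended to openai_usage.log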
9 changes: 4 additions & 5 deletions dsp/modules/finetuning/finetune_hf.py
@@ -5,7 +5,6 @@
import copy
import glob
import torch
import random
import warnings
import evaluate
import numpy as np
@@ -247,7 +246,7 @@ def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model):


@dataclass
class DataCollatorForSupervisedDataset(object):
class DataCollatorForSupervisedDataset:
"""
Collate examples for supervised fine-tuning.
"""
@@ -316,7 +315,7 @@ def finetune_hf(data_path, target, config):
# training completed, load best model
ckpts = glob.glob(f'{output_dir}/checkpoint*')
final_ckpt = sorted(ckpts, key=lambda x: int(x.split('-')[-1]))[-1]
with open(os.path.join(final_ckpt, 'trainer_state.json'), 'r') as f:
with open(os.path.join(final_ckpt, 'trainer_state.json')) as f:
state = json.load(f)
best_model_checkpoint = state['best_model_checkpoint']

@@ -331,8 +330,8 @@ def finetune_hf(data_path, target, config):
encoder_decoder_model = ("ConditionalGeneration" in architecture) or ("T5WithLMHeadModel" in architecture)
decoder_only_model = ("CausalLM" in architecture) or ("GPT2LMHeadModel" in architecture)
assert encoder_decoder_model or decoder_only_model, f"Unknown HuggingFace model class: {target}"
assert not config['fid'] or encoder_decoder_model, f"Model must be encoder-decoder for Fusion in Decoder"
assert not config['fid'] or not config['peft'], f"FiD and PEFT can't be trained together"
assert not config['fid'] or encoder_decoder_model, "Model must be encoder-decoder for Fusion in Decoder"
assert not config['fid'] or not config['peft'], "FiD and PEFT can't be trained together"

# load model
AutoModelClass = AutoModelForSeq2SeqLM if encoder_decoder_model else AutoModelForCausalLM
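Three distinct modernizations appear in this file: class DataCollatorForSupervisedDataset(object) drops the redundant object base, open(path, 'r') drops the default 'r' mode, and f-strings without placeholders become plain strings. A self-contained sketch of all three; the class and file contents here are illustrative stand-ins, not the training code:

import json
from dataclasses import dataclass


@dataclass
class DataCollator:  # no explicit (object) base needed on Python 3
    pad_token_id: int = 0


# Write a tiny stand-in for trainer_state.json so the read below is runnable.
with open('trainer_state.json', 'w') as f:
    json.dump({'best_model_checkpoint': 'checkpoint-100'}, f)

with open('trainer_state.json') as f:  # 'r' is the default mode
    state = json.load(f)

assert 'best_model_checkpoint' in state, "missing key"  # plain string, no f-prefix
print(DataCollator(), state['best_model_checkpoint'])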
19 changes: 10 additions & 9 deletions dsp/modules/google.py
@@ -1,5 +1,6 @@
import os
from typing import Any, Iterable, Optional
from typing import Any, Optional
from collections.abc import Iterable
import backoff

from dsp.modules.lm import LM
@@ -18,7 +19,7 @@ def backoff_hdlr(details):
print(
"Backing off {wait:0.1f} seconds after {tries} tries "
"calling function {target} with kwargs "
"{kwargs}".format(**details)
"{kwargs}".format(**details),
)


@@ -32,19 +33,19 @@ def giveup_hdlr(details):
BLOCK_ONLY_HIGH = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_ONLY_HIGH"
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_ONLY_HIGH"
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_ONLY_HIGH"
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_ONLY_HIGH"
"threshold": "BLOCK_ONLY_HIGH",
},
]

@@ -60,7 +61,7 @@ def __init__(
model: str = "models/gemini-1.0-pro",
api_key: Optional[str] = None,
safety_settings: Optional[Iterable] = BLOCK_ONLY_HIGH,
**kwargs
**kwargs,
):
"""
Parameters
@@ -89,7 +90,7 @@ def __init__(
"max_output_tokens": 2048,
"top_p": 1,
"top_k": 1,
**kwargs
**kwargs,
}

self.config = genai.GenerationConfig(**kwargs)
@@ -145,7 +146,7 @@ def __call__(
prompt: str,
only_completed: bool = True,
return_sorted: bool = False,
**kwargs
**kwargs,
):
assert only_completed, "for now"
assert return_sorted is False, "for now"
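The import change at the top moves Iterable from typing to collections.abc; the typing aliases for container ABCs are deprecated since Python 3.9 and the two names are interchangeable in annotations. A small sketch reusing the safety-settings shape from the hunk above; the helper function is illustrative, not part of google.py:

from collections.abc import Iterable
from typing import Optional


def count_high_blocks(safety_settings: Optional[Iterable[dict]] = None) -> int:
    settings = list(safety_settings or [])
    return sum(s.get("threshold") == "BLOCK_ONLY_HIGH" for s in settings)


BLOCK_ONLY_HIGH = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_ONLY_HIGH"},
]
print(count_high_blocks(BLOCK_ONLY_HIGH))  # 2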
3 changes: 1 addition & 2 deletions dsp/modules/gpt3.py
@@ -1,5 +1,4 @@
import logging
from logging.handlers import RotatingFileHandler

# Configure logging
logging.basicConfig(
@@ -43,7 +42,7 @@ def backoff_hdlr(details):
print(
"Backing off {wait:0.1f} seconds after {tries} tries "
"calling function {target} with kwargs "
"{kwargs}".format(**details)
"{kwargs}".format(**details),
)


8 changes: 2 additions & 6 deletions dsp/modules/hf.py
@@ -1,13 +1,9 @@
import os
import json
# from peft import PeftConfig, PeftModel
# from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer, AutoConfig
from typing import Optional, Literal

from dsp.modules.lm import LM
# from dsp.modules.finetuning.finetune_hf import preprocess_prompt
from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on
import functools

def openai_to_hf(**kwargs):
hf_kwargs = {}
@@ -51,7 +47,7 @@ def __init__(self, model: str, checkpoint: Optional[str] = None, is_client: bool
import torch
except ImportError as exc:
raise ModuleNotFoundError(
"You need to install Hugging Face transformers library to use HF models."
"You need to install Hugging Face transformers library to use HF models.",
) from exc
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
try:
@@ -85,7 +81,7 @@ def __init__(self, model: str, checkpoint: Optional[str] = None, is_client: bool
except ValueError:
self.model = AutoModelForCausalLM.from_pretrained(
model if checkpoint is None else checkpoint,
device_map=self.device_map
device_map=self.device_map,
)
self.drop_prompt_from_output = True
self.tokenizer = AutoTokenizer.from_pretrained(model)
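Beyond deleting commented-out imports and adding trailing commas, the hunk keeps the raise ... from exc form: chaining preserves the original ImportError as __cause__ in the traceback. A sketch of that guard as it appears above; it only raises when the optional dependency is genuinely absent:

try:
    import torch  # optional heavy dependency, as in the hunk above
except ImportError as exc:
    raise ModuleNotFoundError(
        "You need to install Hugging Face transformers library to use HF models.",
    ) from exc

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)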
(Diff for the remaining changed files not shown.)
