Merge pull request decodingml#13 from decodingml/module-4
Fine-tuning module using Qwak for workflow management and CometML for Experiment Tracking
Joywalker authored May 17, 2024
2 parents e317bb0 + db447ea commit 9af55d9
Showing 12 changed files with 408 additions and 1 deletion.
2 changes: 1 addition & 1 deletion course/module-3/pyproject.toml
@@ -22,7 +22,7 @@ select = [
python = ">=3.10, <3.12"
pydantic = "^2.6.3"
pydantic-settings = "^2.1.0"
-bytewax = "^0.18.2"
+bytewax = "0.18.2"
pika = "^1.3.2"
qdrant-client = "^1.8.0"
unstructured = "^0.12.6"
4 changes: 4 additions & 0 deletions course/module-4/.env.example
@@ -0,0 +1,4 @@
HUGGINGFACE_ACCESS_TOKEN = "str"
COMET_API_KEY = "str"
COMET_WORKSPACE = "str"
COMET_PROJECT = "scrabble"
10 changes: 10 additions & 0 deletions course/module-4/Makefile
@@ -0,0 +1,10 @@
help:
	@grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m:$$(echo $$l | cut -f 2- -d'#')\n"; done

qwak-start-deploy: # Build qwak model remotely
	qwak models build -f build_config.yaml .

local-test: # Test inference locally
	python test_local.py


52 changes: 52 additions & 0 deletions course/module-4/build_config.yaml
@@ -0,0 +1,52 @@
build_env:
  docker:
    assumed_iam_role_arn: null
    base_image: public.ecr.aws/qwak-us-east-1/qwak-base:0.0.13-gpu
    cache: true
    env_vars:
      - HUGGINGFACE_ACCESS_TOKEN=""
      - COMET_API_KEY=""
      - COMET_WORKSPACE=""
      - COMET_PROJECT=""
    no_cache: false
    params: []
    push: true
  python_env:
    dependency_file_path: finetuning/requirements.txt
    git_credentials: null
    git_credentials_secret: null
    poetry: null
    virtualenv: null
  remote:
    is_remote: true
    resources:
      cpus: null
      gpu_amount: null
      gpu_type: null
      instance: gpu.a10.2xl
      memory: null
build_properties:
  branch: finetuning
  build_id: null
  gpu_compatible: false
  model_id: copywriter_model
  model_uri:
    dependency_required_folders: []
    git_branch: master
    git_credentials: null
    git_credentials_secret: null
    git_secret_ssh: null
    main_dir: finetuning
    uri: .
  tags: []
deploy: false
deployment_instance: null
post_build: null
pre_build: null
purchase_option: null
step:
  tests: true
  validate_build_artifact: true
  validate_build_artifact_timeout: 120
verbose: 0
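This build config is what the Makefile's qwak-start-deploy target feeds to qwak models build -f build_config.yaml .: it requests a remote GPU build (instance gpu.a10.2xl), installs finetuning/requirements.txt, and injects the four env_vars into the build container. Below is a minimal, hypothetical pre-flight check (not part of this commit) that loads the YAML and fails fast if the placeholder env_vars were left empty.

import yaml

# Hypothetical helper, not part of this PR: fail fast if the placeholder
# env_vars in build_config.yaml were left empty before a remote build.
with open("course/module-4/build_config.yaml") as f:
    config = yaml.safe_load(f)

env_vars = config["build_env"]["docker"]["env_vars"]
missing = [v.split("=", 1)[0] for v in env_vars if v.endswith('=""')]
if missing:
    raise SystemExit(f"Fill in these env_vars before building: {missing}")

print("Instance requested:", config["build_env"]["remote"]["resources"]["instance"])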

5 changes: 5 additions & 0 deletions course/module-4/finetuning/__init__.py
@@ -0,0 +1,5 @@
from .model import CopywriterMistralModel


def load_model():
    return CopywriterMistralModel()
12 changes: 12 additions & 0 deletions course/module-4/finetuning/config.yaml
@@ -0,0 +1,12 @@
training_arguments:
  output_dir: "mistral_instruct_generation"
  max_steps: 10
  per_device_train_batch_size: 1
  logging_steps: 10
  save_strategy: "epoch"
  evaluation_strategy: "steps"
  eval_steps: 2
  learning_rate: 0.0002
  fp16: true
  remove_unused_columns: false
  lr_scheduler_type: "constant"
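These values are unpacked straight into a Hugging Face TrainingArguments object by _init_training_args in finetuning/model.py below, so the run trains for at most 10 steps, logs every 10 steps, and evaluates every 2 steps. A minimal sketch of that loading step (it mirrors the model code; the path assumes the module-4 layout):

import yaml
from transformers import TrainingArguments

# Mirrors _init_training_args in finetuning/model.py below.
with open("finetuning/config.yaml", "r") as file:
    config = yaml.safe_load(file)

training_arguments = TrainingArguments(**config["training_arguments"])
print(training_arguments.max_steps, training_arguments.eval_steps)  # 10 2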
52 changes: 52 additions & 0 deletions course/module-4/finetuning/dataset_client.py
@@ -0,0 +1,52 @@
import json
import logging
import os

from comet_ml import Experiment
from sklearn.model_selection import train_test_split

from finetuning.settings import settings


class DatasetClient:
    def __init__(self, output_dir: str = "./finetuning"):
        self.project = settings.COMET_PROJECT
        self.api_key = settings.COMET_API_KEY
        self.experiment = Experiment(
            api_key=self.api_key,
            project_name=self.project
        )
        self.output_dir = output_dir

    def get_artifact(self, artifact_name: str):
        try:
            logged_artifact = self.experiment.get_artifact(artifact_name)
            logged_artifact.download(self.output_dir)
            self.experiment.end()
            logging.info(f'Successfully downloaded {artifact_name} at location {self.output_dir}')
        except Exception as e:
            logging.error(f"Error retrieving artifact: {str(e)}")

    def split_data(self, artifact_name: str) -> tuple:
        try:
            training_file_path = os.path.join(self.output_dir, 'train.json')
            validation_file_path = os.path.join(self.output_dir, 'validation.json')
            file_name = artifact_name + ".json"
            with open(os.path.join(self.output_dir, file_name), 'r') as file:
                data = json.load(file)

            train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

            with open(training_file_path, 'w') as train_file:
                json.dump(train_data, train_file)

            with open(validation_file_path, 'w') as val_file:
                json.dump(val_data, val_file)

            logging.info("Data split into train.json and validation.json successfully.")
            return training_file_path, validation_file_path
        except Exception as e:
            logging.error(f"Error splitting data: {str(e)}")

    def download_dataset(self, file_name: str):
        self.get_artifact(file_name)
        return self.split_data(file_name)
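A short usage sketch for DatasetClient (it assumes COMET_API_KEY, COMET_WORKSPACE and COMET_PROJECT are exported, as listed in .env.example): it downloads a CometML artifact by name and writes an 80/20 split to train.json and validation.json. This is how CopywriterMistralModel.load_dataset below consumes it, with the default artifact name "cleaned_posts".

from finetuning.dataset_client import DatasetClient

# Downloads the "cleaned_posts" artifact from CometML and writes an 80/20 split
# to ./finetuning/train.json and ./finetuning/validation.json.
client = DatasetClient(output_dir="./finetuning")
train_path, val_path = client.download_dataset("cleaned_posts")
print(train_path, val_path)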


207 changes: 207 additions & 0 deletions course/module-4/finetuning/model.py
@@ -0,0 +1,207 @@
import logging
import os

import comet_ml
import pandas as pd
import qwak
import torch as th
import yaml
from comet_ml import Experiment
from datasets import DatasetDict, load_dataset
from finetuning.dataset_client import DatasetClient
from finetuning.settings import settings
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from qwak.model.adapters import DefaultOutputAdapter
from qwak.model.base import QwakModel
from qwak.model.schema import ModelSchema
from qwak.model.schema_entities import InferenceOutput, RequestInput
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    PreTrainedModel,
    Trainer,
    TrainingArguments,
)


class CopywriterMistralModel(QwakModel):
    def __init__(
        self,
        is_saved: bool = False,
        model_save_dir: str = "./model",
        model_type: str = "mistralai/Mistral-7B-Instruct-v0.1",
        comet_artifact_name: str = "cleaned_posts",
        config_file: str = "./finetuning/config.yaml",
    ):
        self._prep_environment()
        self.experiment = None
        self.model_save_dir = model_save_dir
        self.model_type = model_type
        self.comet_dataset_artifact = comet_artifact_name
        self.training_args_config_file = config_file
        if is_saved:
            self.experiment = Experiment(
                api_key=settings.COMET_API_KEY,
                project_name=settings.COMET_PROJECT,
                workspace=settings.COMET_WORKSPACE,
            )

    def _prep_environment(self):
        os.environ["TOKENIZERS_PARALLELISM"] = settings.TOKENIZERS_PARALLELISM
        th.cuda.empty_cache()
        logging.info("Emptied cuda cache. Environment prepared successfully!")

    def init_model(self):
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_type,
            token=settings.HUGGINGFACE_ACCESS_TOKEN,
            device_map=th.cuda.current_device(),
            quantization_config=self.nf4_config,
            use_cache=False,
            torchscript=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_type, token=settings.HUGGINGFACE_ACCESS_TOKEN
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"
        logging.info(f"Initialized model {self.model_type} successfully")

    def _init_4bit_config(self):
        self.nf4_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=th.bfloat16,
        )
        if self.experiment:
            self.experiment.log_parameters(self.nf4_config)
        logging.info(
            "Initialized config for param representation on 4bits successfully!"
        )

    def _initialize_qlora(self, model: PreTrainedModel) -> PeftModel:
        self.qlora_config = LoraConfig(
            lora_alpha=16, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM"
        )

        if self.experiment:
            self.experiment.log_parameters(self.qlora_config)

        model = prepare_model_for_kbit_training(model)
        model = get_peft_model(model, self.qlora_config)
        logging.info("Initialized qlora config successfully!")
        return model

    def _init_training_args(self):
        with open(self.training_args_config_file, "r") as file:
            config = yaml.safe_load(file)
        self.training_arguments = TrainingArguments(**config["training_arguments"])
        if self.experiment:
            self.experiment.log_parameters(self.training_arguments)
        logging.info("Initialized training arguments successfully!")

    def _remove_model_class_attributes(self):
        # Deletion is needed so that Qwak's default Pickle serialization
        # skips these heavyweight attributes.
        del self.model
        del self.trainer
        del self.experiment

    def generate_prompt(self, sample: dict) -> dict:
        full_prompt = f"""<s>[INST]{sample['instruction']}
[/INST] {sample['content']}</s>"""
        result = self.tokenize(full_prompt)
        return result

    def tokenize(self, prompt: str) -> dict:
        result = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=2300,
            truncation=True,
        )
        result["labels"] = result["input_ids"].copy()
        return result

    def load_dataset(self) -> DatasetDict:
        dataset_handler = DatasetClient()
        train_data_file, validation_data_file = dataset_handler.download_dataset(
            self.comet_dataset_artifact
        )
        data_files = {"train": train_data_file, "validation": validation_data_file}
        raw_datasets = load_dataset("json", data_files=data_files)
        train_dataset, val_dataset = self.preprocess_data_split(raw_datasets)
        return DatasetDict({"train": train_dataset, "validation": val_dataset})

    def preprocess_data_split(self, raw_datasets: DatasetDict):
        train_data = raw_datasets["train"]
        val_data = raw_datasets["validation"]
        generated_train_dataset = train_data.map(self.generate_prompt)
        generated_train_dataset = generated_train_dataset.remove_columns(
            ["instruction", "content"]
        )
        generated_val_dataset = val_data.map(self.generate_prompt)
        generated_val_dataset = generated_val_dataset.remove_columns(
            ["instruction", "content"]
        )
        return generated_train_dataset, generated_val_dataset

    def build(self):
        self._init_4bit_config()
        self.init_model()
        if self.experiment:
            self.experiment.log_parameters(self.nf4_config)
        self.model = self._initialize_qlora(self.model)
        self._init_training_args()
        tokenized_datasets = self.load_dataset()
        self.device = th.device("cuda" if th.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.trainer = Trainer(
            model=self.model,
            args=self.training_arguments,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            tokenizer=self.tokenizer,
        )
        logging.info("Initialized model trainer")
        self.trainer.train()
        logging.info("Finished model finetuning!")
        self.trainer.save_model(self.model_save_dir)
        logging.info(f"Finished saving model to {self.model_save_dir}")
        if self.experiment:
            self.experiment.end()
        self._remove_model_class_attributes()
        logging.info("Finished removing model class attributes!")

    def initialize_model(self):
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_save_dir,
            token=settings.HUGGINGFACE_ACCESS_TOKEN,
            quantization_config=self.nf4_config,
        )
        logging.info(f"Successfully loaded model from {self.model_save_dir}")

    def schema(self) -> ModelSchema:
        return ModelSchema(
            inputs=[RequestInput(name="instruction", type=str)],
            outputs=[InferenceOutput(name="content", type=str)],
        )

    @qwak.api(output_adapter=DefaultOutputAdapter())
    def predict(self, df):
        input_text = list(df["instruction"].values)
        input_ids = self.tokenizer(
            input_text, return_tensors="pt", add_special_tokens=True
        )
        input_ids = input_ids.to(self.device)

        generated_ids = self.model.generate(
            **input_ids,
            max_new_tokens=3000,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        decoded_output = self.tokenizer.batch_decode(generated_ids)

        return pd.DataFrame([{"content": decoded_output}])
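The Makefile's local-test target runs python test_local.py, which is among the 12 changed files but is not rendered in this view. A rough sketch of what a local smoke test could look like, using only the class above, follows; this is an assumption rather than the PR's actual script, which may rely on Qwak's local-run tooling instead, and build() downloads the CometML dataset and fine-tunes on a GPU, so it is expensive.

import pandas as pd

from finetuning import load_model

# Hedged sketch of a local smoke test; the actual test_local.py may differ.
model = load_model()        # CopywriterMistralModel with default arguments
model.build()               # downloads "cleaned_posts", fine-tunes, saves to ./model
model.initialize_model()    # reloads the fine-tuned weights from ./model

# Calling the @qwak.api-decorated predict directly is an assumption here;
# on the Qwak platform the output adapter handles (de)serialization instead.
df = pd.DataFrame([{"instruction": "Write a LinkedIn post about vector databases."}])
print(model.predict(df)["content"].iloc[0])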
11 changes: 11 additions & 0 deletions course/module-4/finetuning/requirements.txt
@@ -0,0 +1,11 @@
numpy
pandas
peft
datasets
transformers
safetensors
comet_ml
accelerate
bitsandbytes
scikit-learn
pydantic_settings
13 changes: 13 additions & 0 deletions course/module-4/finetuning/settings.py
@@ -0,0 +1,13 @@
from pydantic_settings import BaseSettings, SettingsConfigDict


class AppSettings(BaseSettings):
    model_config = SettingsConfigDict()

    TOKENIZERS_PARALLELISM: str = "false"
    HUGGINGFACE_ACCESS_TOKEN: str = ""
    COMET_API_KEY: str = ""
    COMET_WORKSPACE: str = ""
    COMET_PROJECT: str = ""


settings = AppSettings()
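Because SettingsConfigDict() is left empty (no env_file), pydantic-settings populates these fields from environment variables of the same name rather than from a .env file; the keys mirror .env.example, and in the remote build they are injected through the env_vars section of build_config.yaml. A minimal sketch:

import os

from finetuning.settings import AppSettings

# Values come from the environment; "scrabble" mirrors the .env.example default.
os.environ["COMET_PROJECT"] = "scrabble"
settings = AppSettings()
assert settings.COMET_PROJECT == "scrabble"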