forked from decodingml/llm-twin-course
Commit
Merge pull request decodingml#13 from decodingml/module-4
Fine-tuning module using Qwak for workflow management and CometML for Experiment Tracking
Showing 12 changed files with 408 additions and 1 deletion.
@@ -0,0 +1,4 @@
HUGGINGFACE_ACCESS_TOKEN = "str"
COMET_API_KEY = "str"
COMET_WORKSPACE = "str"
COMET_PROJECT = "scrabble"
@@ -0,0 +1,10 @@
help:
	@grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m:$$(echo $$l | cut -f 2- -d'#')\n"; done

qwak-start-deploy: # Build qwak model remotely
	qwak models build -f build_config.yaml .

local-test: # Test inference locally
	python test_local.py
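The local-test target calls a test_local.py script that is not included in this diff. A minimal sketch of what such a script could look like, assuming the Qwak SDK's run_local helper and an illustrative instruction payload (both are assumptions, not part of this commit):

# test_local.py -- hypothetical sketch, not part of this commit.
# Assumes the Qwak SDK exposes run_local for exercising a QwakModel
# end-to-end (build + predict) on the local machine.
import pandas as pd
from qwak.model.tools import run_local

from finetuning import CopywriterMistralModel

if __name__ == "__main__":
    model = CopywriterMistralModel()
    input_df = pd.DataFrame([{"instruction": "Write a short LinkedIn post about MLOps."}])
    prediction = run_local(model, input_df.to_json())
    print(prediction)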
@@ -0,0 +1,52 @@
build_env:
  docker:
    assumed_iam_role_arn: null
    base_image: public.ecr.aws/qwak-us-east-1/qwak-base:0.0.13-gpu
    cache: true
    env_vars:
      - HUGGINGFACE_ACCESS_TOKEN=""
      - COMET_API_KEY=""
      - COMET_WORKSPACE=""
      - COMET_PROJECT=""
    no_cache: false
    params: []
    push: true
  python_env:
    dependency_file_path: finetuning/requirements.txt
    git_credentials: null
    git_credentials_secret: null
    poetry: null
    virtualenv: null
  remote:
    is_remote: true
    resources:
      cpus: null
      gpu_amount: null
      gpu_type: null
      instance: gpu.a10.2xl
      memory: null
build_properties:
  branch: finetuning
  build_id: null
  gpu_compatible: false
  model_id: copywriter_model
  model_uri:
    dependency_required_folders: []
    git_branch: master
    git_credentials: null
    git_credentials_secret: null
    git_secret_ssh: null
    main_dir: finetuning
    uri: .
  tags: []
deploy: false
deployment_instance: null
post_build: null
pre_build: null
purchase_option: null
step:
  tests: true
  validate_build_artifact: true
  validate_build_artifact_timeout: 120
verbose: 0
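The qwak-start-deploy target in the Makefile consumes this file directly (qwak models build -f build_config.yaml .). A hypothetical pre-flight check, assuming the file sits next to the script, that verifies the placeholder env_vars are filled in before triggering a remote GPU build:

# check_build_config.py -- hypothetical helper, not part of this commit.
# Verifies the placeholder env_vars above are filled in before a remote build.
import sys

import yaml

with open("build_config.yaml") as f:
    cfg = yaml.safe_load(f)

missing = [v for v in cfg["build_env"]["docker"]["env_vars"] if v.endswith('=""')]
if missing:
    sys.exit(f"Fill in these env_vars before running `qwak models build`: {missing}")

print(
    f"Config OK: model_id={cfg['build_properties']['model_id']}, "
    f"instance={cfg['build_env']['remote']['resources']['instance']}"
)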
@@ -0,0 +1,5 @@
from .model import CopywriterMistralModel


def load_model():
    return CopywriterMistralModel()
@@ -0,0 +1,12 @@
training_arguments:
  output_dir: "mistral_instruct_generation"
  max_steps: 10
  per_device_train_batch_size: 1
  logging_steps: 10
  save_strategy: "epoch"
  evaluation_strategy: "steps"
  eval_steps: 2
  learning_rate: 0.0002
  fp16: true
  remove_unused_columns: false
  lr_scheduler_type: "constant"
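These values are unpacked straight into Hugging Face's TrainingArguments (see _init_training_args() in the model class below). A standalone sketch of that mapping, useful for sanity-checking the config outside the Qwak build; the file path is the one used by the model class:

# Hypothetical standalone sanity check of the training config; it mirrors
# what _init_training_args() in the model class below does during the build.
import yaml
from transformers import TrainingArguments

with open("./finetuning/config.yaml") as f:
    config = yaml.safe_load(f)

training_arguments = TrainingArguments(**config["training_arguments"])
print(training_arguments.learning_rate, training_arguments.max_steps)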
@@ -0,0 +1,52 @@
import json
import logging
import os

from comet_ml import Experiment
from sklearn.model_selection import train_test_split

from finetuning.settings import settings


class DatasetClient:
    def __init__(self, output_dir: str = "./finetuning"):
        self.project = settings.COMET_PROJECT
        self.api_key = settings.COMET_API_KEY
        self.experiment = Experiment(
            api_key=self.api_key,
            project_name=self.project,
        )
        self.output_dir = output_dir

    def get_artifact(self, artifact_name: str):
        try:
            logged_artifact = self.experiment.get_artifact(artifact_name)
            logged_artifact.download(self.output_dir)
            self.experiment.end()
            logging.info(
                f"Successfully downloaded {artifact_name} at location {self.output_dir}"
            )
        except Exception as e:
            logging.error(f"Error retrieving artifact: {str(e)}")

    def split_data(self, artifact_name: str) -> tuple:
        try:
            training_file_path = os.path.join(self.output_dir, "train.json")
            validation_file_path = os.path.join(self.output_dir, "validation.json")
            file_name = artifact_name + ".json"
            with open(os.path.join(self.output_dir, file_name), "r") as file:
                data = json.load(file)

            train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

            with open(training_file_path, "w") as train_file:
                json.dump(train_data, train_file)

            with open(validation_file_path, "w") as val_file:
                json.dump(val_data, val_file)

            logging.info("Data split into train.json and validation.json successfully.")
            return training_file_path, validation_file_path
        except Exception as e:
            logging.error(f"Error splitting data: {str(e)}")

    def download_dataset(self, file_name: str):
        self.get_artifact(file_name)
        return self.split_data(file_name)
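A short usage sketch of the client above; this is what the model class below does in load_dataset(), pulling the "cleaned_posts" artifact from Comet ML and splitting it 80/20 into train/validation JSON files:

# Usage sketch, assuming Comet ML credentials are configured and a
# "cleaned_posts" artifact exists in the configured workspace/project.
from finetuning.dataset_client import DatasetClient

client = DatasetClient(output_dir="./finetuning")
train_path, val_path = client.download_dataset("cleaned_posts")
print(train_path, val_path)  # ./finetuning/train.json ./finetuning/validation.json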
@@ -0,0 +1,207 @@
import logging
import os

import comet_ml
import pandas as pd
import qwak
import torch as th
import yaml
from comet_ml import Experiment
from datasets import DatasetDict, load_dataset
from finetuning.dataset_client import DatasetClient
from finetuning.settings import settings
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from qwak.model.adapters import DefaultOutputAdapter
from qwak.model.base import QwakModel
from qwak.model.schema import ModelSchema
from qwak.model.schema_entities import InferenceOutput, RequestInput
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    PreTrainedModel,
    Trainer,
    TrainingArguments,
)


class CopywriterMistralModel(QwakModel):
    def __init__(
        self,
        is_saved: bool = False,
        model_save_dir: str = "./model",
        model_type: str = "mistralai/Mistral-7B-Instruct-v0.1",
        comet_artifact_name: str = "cleaned_posts",
        config_file: str = "./finetuning/config.yaml",
    ):
        self._prep_environment()
        self.experiment = None
        self.model_save_dir = model_save_dir
        self.model_type = model_type
        self.comet_dataset_artifact = comet_artifact_name
        self.training_args_config_file = config_file
        if is_saved:
            self.experiment = Experiment(
                api_key=settings.COMET_API_KEY,
                project_name=settings.COMET_PROJECT,
                workspace=settings.COMET_WORKSPACE,
            )

    def _prep_environment(self):
        os.environ["TOKENIZERS_PARALLELISM"] = settings.TOKENIZERS_PARALLELISM
        th.cuda.empty_cache()
        logging.info("Emptied cuda cache. Environment prepared successfully!")

    def init_model(self):
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_type,
            token=settings.HUGGINGFACE_ACCESS_TOKEN,
            device_map=th.cuda.current_device(),
            quantization_config=self.nf4_config,
            use_cache=False,
            torchscript=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_type, token=settings.HUGGINGFACE_ACCESS_TOKEN
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"
        logging.info(f"Initialized model {self.model_type} successfully")

    def _init_4bit_config(self):
        self.nf4_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=th.bfloat16,
        )
        if self.experiment:
            self.experiment.log_parameters(self.nf4_config)
        logging.info(
            "Initialized config for param representation on 4 bits successfully!"
        )

    def _initialize_qlora(self, model: PreTrainedModel) -> PeftModel:
        self.qlora_config = LoraConfig(
            lora_alpha=16, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM"
        )

        if self.experiment:
            self.experiment.log_parameters(self.qlora_config)

        model = prepare_model_for_kbit_training(model)
        model = get_peft_model(model, self.qlora_config)
        logging.info("Initialized QLoRA config successfully!")
        return model

    def _init_training_args(self):
        with open(self.training_args_config_file, "r") as file:
            config = yaml.safe_load(file)
        self.training_arguments = TrainingArguments(**config["training_arguments"])
        if self.experiment:
            self.experiment.log_parameters(self.training_arguments)
        logging.info("Initialized training arguments successfully!")

    def _remove_model_class_attributes(self):
        # Removing these attributes is needed to skip the default serialization
        # with Pickle done by Qwak.
        del self.model
        del self.trainer
        del self.experiment

    def generate_prompt(self, sample: dict) -> dict:
        full_prompt = f"""<s>[INST]{sample['instruction']}
[/INST] {sample['content']}</s>"""
        result = self.tokenize(full_prompt)
        return result

    def tokenize(self, prompt: str) -> dict:
        result = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=2300,
            truncation=True,
        )
        result["labels"] = result["input_ids"].copy()
        return result

    def load_dataset(self) -> DatasetDict:
        dataset_handler = DatasetClient()
        train_data_file, validation_data_file = dataset_handler.download_dataset(
            self.comet_dataset_artifact
        )
        data_files = {"train": train_data_file, "validation": validation_data_file}
        raw_datasets = load_dataset("json", data_files=data_files)
        train_dataset, val_dataset = self.preprocess_data_split(raw_datasets)
        return DatasetDict({"train": train_dataset, "validation": val_dataset})

    def preprocess_data_split(self, raw_datasets: DatasetDict):
        train_data = raw_datasets["train"]
        val_data = raw_datasets["validation"]
        generated_train_dataset = train_data.map(self.generate_prompt)
        generated_train_dataset = generated_train_dataset.remove_columns(
            ["instruction", "content"]
        )
        generated_val_dataset = val_data.map(self.generate_prompt)
        generated_val_dataset = generated_val_dataset.remove_columns(
            ["instruction", "content"]
        )
        return generated_train_dataset, generated_val_dataset

    def build(self):
        self._init_4bit_config()
        self.init_model()
        if self.experiment:
            self.experiment.log_parameters(self.nf4_config)
        self.model = self._initialize_qlora(self.model)
        self._init_training_args()
        tokenized_datasets = self.load_dataset()
        self.device = th.device("cuda" if th.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.trainer = Trainer(
            model=self.model,
            args=self.training_arguments,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            tokenizer=self.tokenizer,
        )
        logging.info("Initialized model trainer")
        self.trainer.train()
        logging.info("Finished model finetuning!")
        self.trainer.save_model(self.model_save_dir)
        logging.info(f"Finished saving model to {self.model_save_dir}")
        self.experiment.end()
        self._remove_model_class_attributes()
        logging.info("Finished removing model class attributes!")

    def initialize_model(self):
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_save_dir,
            token=settings.HUGGINGFACE_ACCESS_TOKEN,
            quantization_config=self.nf4_config,
        )
        logging.info(f"Successfully loaded model from {self.model_save_dir}")

    def schema(self) -> ModelSchema:
        return ModelSchema(
            inputs=[RequestInput(name="instruction", type=str)],
            outputs=[InferenceOutput(name="content", type=str)],
        )

    @qwak.api(output_adapter=DefaultOutputAdapter())
    def predict(self, df):
        input_text = list(df["instruction"].values)
        input_ids = self.tokenizer(
            input_text, return_tensors="pt", add_special_tokens=True
        )
        input_ids = input_ids.to(self.device)

        generated_ids = self.model.generate(
            **input_ids,
            max_new_tokens=3000,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        decoded_output = self.tokenizer.batch_decode(generated_ids)

        return pd.DataFrame([{"content": decoded_output}])
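Once built and deployed on Qwak, the model would typically be called through Qwak's real-time client rather than instantiated directly. A sketch of that call, assuming the qwak_inference package and its RealTimeClient API (neither appears in this commit, so treat the import and signature as assumptions):

# Hypothetical call against the deployed endpoint; the qwak_inference import
# and RealTimeClient API are assumptions, not part of this commit.
import pandas as pd
from qwak_inference import RealTimeClient

client = RealTimeClient(model_id="copywriter_model")  # model_id from build_config.yaml
input_df = pd.DataFrame([{"instruction": "Write a short post about vector databases."}])
response = client.predict(input_df)
print(response)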
@@ -0,0 +1,11 @@
numpy
pandas
peft
datasets
transformers
safetensors
comet_ml
accelerate
bitsandbytes
scikit-learn
pydantic_settings
@@ -0,0 +1,13 @@
from pydantic_settings import BaseSettings, SettingsConfigDict


class AppSettings(BaseSettings):
    model_config = SettingsConfigDict()

    TOKENIZERS_PARALLELISM: str = "false"
    HUGGINGFACE_ACCESS_TOKEN: str = ""
    COMET_API_KEY: str = ""
    COMET_WORKSPACE: str = ""
    COMET_PROJECT: str = ""


settings = AppSettings()
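These fields mirror the keys in the .env template at the top of this commit and the env_vars injected through build_config.yaml. A small sketch of how they resolve at runtime, assuming the variables are exported in the environment, since pydantic-settings reads them when AppSettings is instantiated:

# Sketch, assuming the variables are exported in the environment (e.g. by the
# Qwak build's env_vars or a local `export COMET_PROJECT=scrabble`).
import os

os.environ.setdefault("COMET_PROJECT", "scrabble")  # illustrative value from the .env template

from finetuning.settings import AppSettings

settings = AppSettings()
print(settings.COMET_PROJECT)  # -> "scrabble"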