Merge pull request decodingml#13 from decodingml/module-4
Fine-tuning module using Qwak for workflow management and CometML for Experiment Tracking
Joywalker authored May 17, 2024
2 parents e317bb0 + db447ea commit 9af55d9
Showing 12 changed files with 408 additions and 1 deletion.
2 changes: 1 addition & 1 deletion course/module-3/pyproject.toml
@@ -22,7 +22,7 @@ select = [
python = ">=3.10, <3.12"
pydantic = "^2.6.3"
pydantic-settings = "^2.1.0"
-bytewax = "^0.18.2"
+bytewax = "0.18.2"
pika = "^1.3.2"
qdrant-client = "^1.8.0"
unstructured = "^0.12.6"
4 changes: 4 additions & 0 deletions course/module-4/.env.example
@@ -0,0 +1,4 @@
HUGGINGFACE_ACCESS_TOKEN = "str"
COMET_API_KEY = "str"
COMET_WORKSPACE = "str"
COMET_PROJECT = "scrabble"
10 changes: 10 additions & 0 deletions course/module-4/Makefile
@@ -0,0 +1,10 @@
help:
	@grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m:$$(echo $$l | cut -f 2- -d'#')\n"; done

qwak-start-deploy: # Build qwak model remotely
	qwak models build -f build_config.yaml .

local-test: # Test inference locally
	python test_local.py


52 changes: 52 additions & 0 deletions course/module-4/build_config.yaml
@@ -0,0 +1,52 @@
build_env:
  docker:
    assumed_iam_role_arn: null
    base_image: public.ecr.aws/qwak-us-east-1/qwak-base:0.0.13-gpu
    cache: true
    env_vars:
      - HUGGINGFACE_ACCESS_TOKEN=""
      - COMET_API_KEY=""
      - COMET_WORKSPACE=""
      - COMET_PROJECT=""
    no_cache: false
    params: []
    push: true
  python_env:
    dependency_file_path: finetuning/requirements.txt
    git_credentials: null
    git_credentials_secret: null
    poetry: null
    virtualenv: null
  remote:
    is_remote: true
    resources:
      cpus: null
      gpu_amount: null
      gpu_type: null
      instance: gpu.a10.2xl
      memory: null
build_properties:
  branch: finetuning
  build_id: null
  gpu_compatible: false
  model_id: copywriter_model
  model_uri:
    dependency_required_folders: []
    git_branch: master
    git_credentials: null
    git_credentials_secret: null
    git_secret_ssh: null
    main_dir: finetuning
    uri: .
  tags: []
deploy: false
deployment_instance: null
post_build: null
pre_build: null
purchase_option: null
step:
  tests: true
  validate_build_artifact: true
  validate_build_artifact_timeout: 120
verbose: 0
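This build config is what the Makefile's qwak-start-deploy target feeds to qwak models build -f build_config.yaml .: it requests a remote GPU build (instance gpu.a10.2xl), installs finetuning/requirements.txt, and injects the four env_vars into the build container. Below is a minimal, hypothetical pre-flight check (not part of this commit) that loads the YAML and fails fast if the placeholder env_vars were left empty.

import yaml

# Hypothetical helper, not part of this PR: fail fast if the placeholder
# env_vars in build_config.yaml were left empty before a remote build.
with open("course/module-4/build_config.yaml") as f:
    config = yaml.safe_load(f)

env_vars = config["build_env"]["docker"]["env_vars"]
missing = [v.split("=", 1)[0] for v in env_vars if v.endswith('=""')]
if missing:
    raise SystemExit(f"Fill in these env_vars before building: {missing}")

print("Instance requested:", config["build_env"]["remote"]["resources"]["instance"])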

5 changes: 5 additions & 0 deletions course/module-4/finetuning/__init__.py
@@ -0,0 +1,5 @@
from .model import CopywriterMistralModel


def load_model():
    return CopywriterMistralModel()
12 changes: 12 additions & 0 deletions course/module-4/finetuning/config.yaml
@@ -0,0 +1,12 @@
training_arguments:
  output_dir: "mistral_instruct_generation"
  max_steps: 10
  per_device_train_batch_size: 1
  logging_steps: 10
  save_strategy: "epoch"
  evaluation_strategy: "steps"
  eval_steps: 2
  learning_rate: 0.0002
  fp16: true
  remove_unused_columns: false
  lr_scheduler_type: "constant"
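These values are unpacked straight into a Hugging Face TrainingArguments object by _init_training_args in finetuning/model.py below, so the run trains for at most 10 steps, logs every 10 steps, and evaluates every 2 steps. A minimal sketch of that loading step (it mirrors the model code; the path assumes the module-4 layout):

import yaml
from transformers import TrainingArguments

# Mirrors _init_training_args in finetuning/model.py below.
with open("finetuning/config.yaml", "r") as file:
    config = yaml.safe_load(file)

training_arguments = TrainingArguments(**config["training_arguments"])
print(training_arguments.max_steps, training_arguments.eval_steps)  # 10 2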
52 changes: 52 additions & 0 deletions course/module-4/finetuning/dataset_client.py
@@ -0,0 +1,52 @@
import json
import logging
import os

from comet_ml import Experiment
from sklearn.model_selection import train_test_split

from finetuning.settings import settings


class DatasetClient:
    def __init__(self, output_dir: str = "./finetuning"):
        self.project = settings.COMET_PROJECT
        self.api_key = settings.COMET_API_KEY
        self.experiment = Experiment(
            api_key=self.api_key,
            project_name=self.project
        )
        self.output_dir = output_dir

    def get_artifact(self, artifact_name: str):
        try:
            logged_artifact = self.experiment.get_artifact(artifact_name)
            logged_artifact.download(self.output_dir)
            self.experiment.end()
            logging.info(f'Successfully downloaded {artifact_name} at location {self.output_dir}')
        except Exception as e:
            logging.error(f"Error retrieving artifact: {str(e)}")

    def split_data(self, artifact_name: str) -> tuple:
        try:
            training_file_path = os.path.join(self.output_dir, 'train.json')
            validation_file_path = os.path.join(self.output_dir, 'validation.json')
            file_name = artifact_name + ".json"
            with open(os.path.join(self.output_dir, file_name), 'r') as file:
                data = json.load(file)

            train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

            with open(training_file_path, 'w') as train_file:
                json.dump(train_data, train_file)

            with open(validation_file_path, 'w') as val_file:
                json.dump(val_data, val_file)

            logging.info("Data split into train.json and validation.json successfully.")
            return training_file_path, validation_file_path
        except Exception as e:
            logging.error(f"Error splitting data: {str(e)}")

    def download_dataset(self, file_name: str):
        self.get_artifact(file_name)
        return self.split_data(file_name)
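A short usage sketch for DatasetClient (it assumes COMET_API_KEY, COMET_WORKSPACE and COMET_PROJECT are exported, as listed in .env.example): it downloads a CometML artifact by name and writes an 80/20 split to train.json and validation.json. This is how CopywriterMistralModel.load_dataset below consumes it, with the default artifact name "cleaned_posts".

from finetuning.dataset_client import DatasetClient

# Downloads the "cleaned_posts" artifact from CometML and writes an 80/20 split
# to ./finetuning/train.json and ./finetuning/validation.json.
client = DatasetClient(output_dir="./finetuning")
train_path, val_path = client.download_dataset("cleaned_posts")
print(train_path, val_path)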


207 changes: 207 additions & 0 deletions course/module-4/finetuning/model.py
@@ -0,0 +1,207 @@
import logging
import os

import comet_ml
import pandas as pd
import qwak
import torch as th
import yaml
from comet_ml import Experiment
from datasets import DatasetDict, load_dataset
from finetuning.dataset_client import DatasetClient
from finetuning.settings import settings
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from qwak.model.adapters import DefaultOutputAdapter
from qwak.model.base import QwakModel
from qwak.model.schema import ModelSchema
from qwak.model.schema_entities import InferenceOutput, RequestInput
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    PreTrainedModel,
    Trainer,
    TrainingArguments,
)


class CopywriterMistralModel(QwakModel):
    def __init__(
        self,
        is_saved: bool = False,
        model_save_dir: str = "./model",
        model_type: str = "mistralai/Mistral-7B-Instruct-v0.1",
        comet_artifact_name: str = "cleaned_posts",
        config_file: str = "./finetuning/config.yaml",
    ):
        self._prep_environment()
        self.experiment = None
        self.model_save_dir = model_save_dir
        self.model_type = model_type
        self.comet_dataset_artifact = comet_artifact_name
        self.training_args_config_file = config_file
        if is_saved:
            self.experiment = Experiment(
                api_key=settings.COMET_API_KEY,
                project_name=settings.COMET_PROJECT,
                workspace=settings.COMET_WORKSPACE,
            )

    def _prep_environment(self):
        os.environ["TOKENIZERS_PARALLELISM"] = settings.TOKENIZERS_PARALLELISM
        th.cuda.empty_cache()
        logging.info("Emptied cuda cache. Environment prepared successfully!")

    def init_model(self):
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_type,
            token=settings.HUGGINGFACE_ACCESS_TOKEN,
            device_map=th.cuda.current_device(),
            quantization_config=self.nf4_config,
            use_cache=False,
            torchscript=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_type, token=settings.HUGGINGFACE_ACCESS_TOKEN
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"
        logging.info(f"Initialized model {self.model_type} successfully")

    def _init_4bit_config(self):
        self.nf4_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=th.bfloat16,
        )
        if self.experiment:
            self.experiment.log_parameters(self.nf4_config)
        logging.info(
            "Initialized config for param representation on 4bits successfully!"
        )

    def _initialize_qlora(self, model: PreTrainedModel) -> PeftModel:
        self.qlora_config = LoraConfig(
            lora_alpha=16, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM"
        )

        if self.experiment:
            self.experiment.log_parameters(self.qlora_config)

        model = prepare_model_for_kbit_training(model)
        model = get_peft_model(model, self.qlora_config)
        logging.info("Initialized qlora config successfully!")
        return model

    def _init_training_args(self):
        with open(self.training_args_config_file, "r") as file:
            config = yaml.safe_load(file)
        self.training_arguments = TrainingArguments(**config["training_arguments"])
        if self.experiment:
            self.experiment.log_parameters(self.training_arguments)
        logging.info("Initialized training arguments successfully!")

    def _remove_model_class_attributes(self):
        # Deletion is needed so that Qwak's default Pickle serialization
        # skips these heavyweight attributes.
        del self.model
        del self.trainer
        del self.experiment

    def generate_prompt(self, sample: dict) -> dict:
        full_prompt = f"""<s>[INST]{sample['instruction']}
[/INST] {sample['content']}</s>"""
        result = self.tokenize(full_prompt)
        return result

    def tokenize(self, prompt: str) -> dict:
        result = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=2300,
            truncation=True,
        )
        result["labels"] = result["input_ids"].copy()
        return result

    def load_dataset(self) -> DatasetDict:
        dataset_handler = DatasetClient()
        train_data_file, validation_data_file = dataset_handler.download_dataset(
            self.comet_dataset_artifact
        )
        data_files = {"train": train_data_file, "validation": validation_data_file}
        raw_datasets = load_dataset("json", data_files=data_files)
        train_dataset, val_dataset = self.preprocess_data_split(raw_datasets)
        return DatasetDict({"train": train_dataset, "validation": val_dataset})

    def preprocess_data_split(self, raw_datasets: DatasetDict):
        train_data = raw_datasets["train"]
        val_data = raw_datasets["validation"]
        generated_train_dataset = train_data.map(self.generate_prompt)
        generated_train_dataset = generated_train_dataset.remove_columns(
            ["instruction", "content"]
        )
        generated_val_dataset = val_data.map(self.generate_prompt)
        generated_val_dataset = generated_val_dataset.remove_columns(
            ["instruction", "content"]
        )
        return generated_train_dataset, generated_val_dataset

    def build(self):
        self._init_4bit_config()
        self.init_model()
        if self.experiment:
            self.experiment.log_parameters(self.nf4_config)
        self.model = self._initialize_qlora(self.model)
        self._init_training_args()
        tokenized_datasets = self.load_dataset()
        self.device = th.device("cuda" if th.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.trainer = Trainer(
            model=self.model,
            args=self.training_arguments,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            tokenizer=self.tokenizer,
        )
        logging.info("Initialized model trainer")
        self.trainer.train()
        logging.info("Finished model finetuning!")
        self.trainer.save_model(self.model_save_dir)
        logging.info(f"Finished saving model to {self.model_save_dir}")
        if self.experiment:
            self.experiment.end()
        self._remove_model_class_attributes()
        logging.info("Finished removing model class attributes!")

    def initialize_model(self):
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_save_dir,
            token=settings.HUGGINGFACE_ACCESS_TOKEN,
            quantization_config=self.nf4_config,
        )
        logging.info(f"Successfully loaded model from {self.model_save_dir}")

    def schema(self) -> ModelSchema:
        return ModelSchema(
            inputs=[RequestInput(name="instruction", type=str)],
            outputs=[InferenceOutput(name="content", type=str)],
        )

    @qwak.api(output_adapter=DefaultOutputAdapter())
    def predict(self, df):
        input_text = list(df["instruction"].values)
        input_ids = self.tokenizer(
            input_text, return_tensors="pt", add_special_tokens=True
        )
        input_ids = input_ids.to(self.device)

        generated_ids = self.model.generate(
            **input_ids,
            max_new_tokens=3000,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        decoded_output = self.tokenizer.batch_decode(generated_ids)

        return pd.DataFrame([{"content": decoded_output}])
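The Makefile's local-test target runs python test_local.py, which is among the 12 changed files but is not rendered in this view. A rough sketch of what a local smoke test could look like, using only the class above, follows; this is an assumption rather than the PR's actual script, which may rely on Qwak's local-run tooling instead, and build() downloads the CometML dataset and fine-tunes on a GPU, so it is expensive.

import pandas as pd

from finetuning import load_model

# Hedged sketch of a local smoke test; the actual test_local.py may differ.
model = load_model()        # CopywriterMistralModel with default arguments
model.build()               # downloads "cleaned_posts", fine-tunes, saves to ./model
model.initialize_model()    # reloads the fine-tuned weights from ./model

# Calling the @qwak.api-decorated predict directly is an assumption here;
# on the Qwak platform the output adapter handles (de)serialization instead.
df = pd.DataFrame([{"instruction": "Write a LinkedIn post about vector databases."}])
print(model.predict(df)["content"].iloc[0])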
11 changes: 11 additions & 0 deletions course/module-4/finetuning/requirements.txt
@@ -0,0 +1,11 @@
numpy
pandas
peft
datasets
transformers
safetensors
comet_ml
accelerate
bitsandbytes
scikit-learn
pydantic_settings
13 changes: 13 additions & 0 deletions course/module-4/finetuning/settings.py
@@ -0,0 +1,13 @@
from pydantic_settings import BaseSettings, SettingsConfigDict


class AppSettings(BaseSettings):
    model_config = SettingsConfigDict()

    TOKENIZERS_PARALLELISM: str = "false"
    HUGGINGFACE_ACCESS_TOKEN: str = ""
    COMET_API_KEY: str = ""
    COMET_WORKSPACE: str = ""
    COMET_PROJECT: str = ""


settings = AppSettings()
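Because SettingsConfigDict() is left empty (no env_file), pydantic-settings populates these fields from environment variables of the same name rather than from a .env file; the keys mirror .env.example, and in the remote build they are injected through the env_vars section of build_config.yaml. A minimal sketch:

import os

from finetuning.settings import AppSettings

# Values come from the environment; "scrabble" mirrors the .env.example default.
os.environ["COMET_PROJECT"] = "scrabble"
settings = AppSettings()
assert settings.COMET_PROJECT == "scrabble"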