forked from modelscope/ms-swift
Fix Pissa and OLoRA (modelscope#1852)
1 parent 8a1606d · commit 14f10be
Showing 5 changed files with 109 additions and 72 deletions.
@@ -0,0 +1,80 @@
import os
import types

import numpy as np
import torch
from peft import PeftModel
from transformers import TrainerCallback
from transformers.modeling_utils import unwrap_model


class TrainerAdapterCallback(TrainerCallback):

    def __init__(self, args):
        self.global_step = 0
        self.args = args

    # offload original_modules to cpu, to save memory
    def on_train_begin(self, _args, state, control, **kwargs):
        model = kwargs['model']
        if hasattr(model, 'set_active_adapters'):
            model.set_active_adapters(model.adapters.keys(), offload='cpu')
        if self.args.sft_type == 'adalora':
            model.peft_config['default'].total_step = state.max_steps

            # AdaLoRA re-allocates rank budgets during training: patch zero_grad so
            # update_and_allocate runs before each step's gradients are cleared.
            def zero_grad(_self, *args, **kwargs):
                _self.update_and_allocate(self.global_step + 1)
                _self._zero_grad(*args, **kwargs)

            model._zero_grad = model.zero_grad
            model.zero_grad = types.MethodType(zero_grad, model)

    def on_step_end(self, _args, state, control, **kwargs):
        if self.args.sft_type == 'adalora':
            self.global_step = state.global_step


class DynamicLayerActivationCallback(TrainerCallback):
    # Keeps only `n_layers` randomly chosen transformer blocks trainable at a time
    # and re-samples that subset every `step_interval` optimizer steps.

    def __init__(self, n_layers: int, step_interval: int, model: torch.nn.Module):
        super().__init__()
        self.n_layers = n_layers
        self.step_interval = step_interval
        self.model = model
        # Locate the first ModuleList in the model, assumed to hold the stack of
        # transformer blocks.
        layers_name = None
        layers = None
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.ModuleList):
                layers_name = name
                layers = module
                break
        assert layers_name is not None
        self.layers_attribute = layers_name
        self.total_layers = len(layers)

        # Freeze all layers upon initialization
        self.freeze_all_layers()
        self.active_layers_indices = []

    def freeze_all_layers(self):
        layers = self.model.get_submodule(self.layers_attribute)
        for layer in layers:
            for param in layer.parameters():
                param.requires_grad = False

    def on_step_begin(self, args, state, control, **kwargs):
        # Check if it's time to switch active layers, including at step 0
        if state.global_step % self.step_interval == 0 or state.global_step == 1:
            self.switch_active_layers()

    def switch_active_layers(self):
        # First, disable gradients for all layers
        self.freeze_all_layers()

        # Randomly select n_layers to activate
        layers = self.model.get_submodule(self.layers_attribute)
        self.active_layers_indices = np.random.choice(range(self.total_layers), self.n_layers, replace=False)
        # Enable gradients only for the selected layers
        for idx in self.active_layers_indices:
            for param in layers[idx].parameters():
                param.requires_grad = True
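Below is a minimal usage sketch (not part of the commit) of how DynamicLayerActivationCallback could be attached to a Hugging Face Trainer; the model, dataset, and hyperparameter values are hypothetical placeholders.

from transformers import Trainer, TrainingArguments

# `model` and `train_dataset` are assumed to be defined elsewhere (hypothetical).
layer_callback = DynamicLayerActivationCallback(
    n_layers=2,          # keep 2 transformer blocks trainable at any time
    step_interval=20,    # re-sample the active blocks every 20 optimizer steps
    model=model,
)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='output', max_steps=500),
    train_dataset=train_dataset,
    callbacks=[layer_callback],
)
trainer.train()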