[Minor] accelerate loading quantized model
Jiaming Tang committed Jul 26, 2023
1 parent 5f377ef commit f22c2a3
Showing 1 changed file with 9 additions and 4 deletions.
awq/quantize/quantizer.py (13 changes: 9 additions & 4 deletions)
@@ -122,6 +122,8 @@ def real_quantize_model_weight(
             if init_only:
                 q_linear = WQLinear.from_linear(
                     module, w_bit, q_config['q_group_size'], True)
+                q_linear.to(next(layer.parameters()).device)
+                set_op_by_name(layer, name, q_linear)
             else:
                 module.cuda()
                 module.weight.data, scales, zeros = pseudo_quantize_tensor(module.weight.data, n_bit=w_bit, get_scale_zp=True, **q_config)
@@ -130,7 +132,10 @@ def real_quantize_model_weight(
                 q_linear = WQLinear.from_linear(
                     module, w_bit, q_config['q_group_size'], False, scales, zeros)
                 module.cpu()
-            q_linear.to(next(layer.parameters()).device)
-            set_op_by_name(layer, name, q_linear)
-            torch.cuda.empty_cache()
-            gc.collect()
+                q_linear.to(next(layer.parameters()).device)
+                set_op_by_name(layer, name, q_linear)
+                torch.cuda.empty_cache()
+                gc.collect()
+
+    torch.cuda.empty_cache()
+    gc.collect()
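
In the old code, the device move, module swap, and GPU cache cleanup all ran after the if/else, so every linear layer paid for torch.cuda.empty_cache() and gc.collect() even on the init_only path, where nothing is quantized on the GPU. The commit duplicates the move/swap into both branches and keeps the per-module cleanup only where real quantization happens, plus one final sweep at the end, which is why loading a pre-quantized model gets faster. Below is a minimal, self-contained sketch of that pattern; QuantStub, install, and swap_linears are hypothetical stand-ins (for WQLinear, set_op_by_name, and the loop body of real_quantize_model_weight), not code from the repository. Only the torch and gc calls are real APIs.

import gc
import torch
import torch.nn as nn


class QuantStub(nn.Module):
    # Placeholder for a quantized linear module such as WQLinear.
    def __init__(self, linear: nn.Linear):
        super().__init__()
        self.register_buffer(
            "qweight", torch.zeros_like(linear.weight, dtype=torch.int8))


def install(root: nn.Module, name: str, new_module: nn.Module):
    # Stand-in for set_op_by_name: replace the submodule at dotted path `name`.
    parent_name, _, child = name.rpartition(".")
    parent = root.get_submodule(parent_name) if parent_name else root
    setattr(parent, child, new_module)


def swap_linears(layer: nn.Module, init_only: bool):
    device = next(layer.parameters()).device
    linears = {n: m for n, m in layer.named_modules()
               if n and isinstance(m, nn.Linear)}
    for name, module in linears.items():
        if init_only:
            # Fast path when loading an already-quantized checkpoint:
            # allocate empty quantized modules only; no GPU work happens,
            # so no per-module cache clearing (this is the saved cost).
            q = QuantStub(module)
            q.to(device)
            install(layer, name, q)
        else:
            module.cuda()             # quantize on the GPU ...
            q = QuantStub(module)     # (real code computes scales/zeros here)
            module.cpu()              # ... then move the fp weights back off
            q.to(device)
            install(layer, name, q)
            torch.cuda.empty_cache()  # reclaim the fp copy right away;
            gc.collect()              # pay this cost only when quantizing
    # One final sweep once all modules are swapped (the commit runs this
    # after the whole model, outside the per-layer loop).
    torch.cuda.empty_cache()
    gc.collect()

As a usage illustration under the same assumptions, a loader could call swap_linears(block, init_only=True) on each transformer block before loading a quantized state dict, and swap_linears(block, init_only=False) when quantizing weights from scratch.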
