[Minor] accelerate loading quantized model
Jiaming Tang committed Jul 26, 2023
1 parent 5f377ef commit f22c2a3
Showing 1 changed file with 9 additions and 4 deletions.
awq/quantize/quantizer.py (13 changes: 9 additions & 4 deletions)
@@ -122,6 +122,8 @@ def real_quantize_model_weight(
             if init_only:
                 q_linear = WQLinear.from_linear(
                     module, w_bit, q_config['q_group_size'], True)
+                q_linear.to(next(layer.parameters()).device)
+                set_op_by_name(layer, name, q_linear)
             else:
                 module.cuda()
                 module.weight.data, scales, zeros = pseudo_quantize_tensor(module.weight.data, n_bit=w_bit, get_scale_zp=True, **q_config)
@@ -130,7 +132,10 @@ def real_quantize_model_weight(
                 q_linear = WQLinear.from_linear(
                     module, w_bit, q_config['q_group_size'], False, scales, zeros)
                 module.cpu()
-            q_linear.to(next(layer.parameters()).device)
-            set_op_by_name(layer, name, q_linear)
-            torch.cuda.empty_cache()
-            gc.collect()
+                q_linear.to(next(layer.parameters()).device)
+                set_op_by_name(layer, name, q_linear)
+                torch.cuda.empty_cache()
+                gc.collect()
+
+    torch.cuda.empty_cache()
+    gc.collect()
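
In the old code, the device move, module swap, and GPU cache cleanup all ran after the if/else, so every linear layer paid for torch.cuda.empty_cache() and gc.collect() even on the init_only path, where nothing is quantized on the GPU. The commit duplicates the move/swap into both branches and keeps the per-module cleanup only where real quantization happens, plus one final sweep at the end, which is why loading a pre-quantized model gets faster. Below is a minimal, self-contained sketch of that pattern; QuantStub, install, and swap_linears are hypothetical stand-ins (for WQLinear, set_op_by_name, and the loop body of real_quantize_model_weight), not code from the repository. Only the torch and gc calls are real APIs.

import gc
import torch
import torch.nn as nn


class QuantStub(nn.Module):
    # Placeholder for a quantized linear module such as WQLinear.
    def __init__(self, linear: nn.Linear):
        super().__init__()
        self.register_buffer(
            "qweight", torch.zeros_like(linear.weight, dtype=torch.int8))


def install(root: nn.Module, name: str, new_module: nn.Module):
    # Stand-in for set_op_by_name: replace the submodule at dotted path `name`.
    parent_name, _, child = name.rpartition(".")
    parent = root.get_submodule(parent_name) if parent_name else root
    setattr(parent, child, new_module)


def swap_linears(layer: nn.Module, init_only: bool):
    device = next(layer.parameters()).device
    linears = {n: m for n, m in layer.named_modules()
               if n and isinstance(m, nn.Linear)}
    for name, module in linears.items():
        if init_only:
            # Fast path when loading an already-quantized checkpoint:
            # allocate empty quantized modules only; no GPU work happens,
            # so no per-module cache clearing (this is the saved cost).
            q = QuantStub(module)
            q.to(device)
            install(layer, name, q)
        else:
            module.cuda()             # quantize on the GPU ...
            q = QuantStub(module)     # (real code computes scales/zeros here)
            module.cpu()              # ... then move the fp weights back off
            q.to(device)
            install(layer, name, q)
            torch.cuda.empty_cache()  # reclaim the fp copy right away;
            gc.collect()              # pay this cost only when quantizing
    # One final sweep once all modules are swapped (the commit runs this
    # after the whole model, outside the per-layer loop).
    torch.cuda.empty_cache()
    gc.collect()

As a usage illustration under the same assumptions, a loader could call swap_linears(block, init_only=True) on each transformer block before loading a quantized state dict, and swap_linears(block, init_only=False) when quantizing weights from scratch.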
