[Minor] add quantization example scripts & update readme (mit-han-lab…
ys-2020 authored Feb 26, 2024
1 parent a8d9527 commit 9c45e01
Showing 7 changed files with 123 additions and 8 deletions.
17 changes: 10 additions & 7 deletions README.md
@@ -60,7 +60,8 @@ Check out [TinyChat](tinychat), which offers a turn-key solution for **on-device
- [AWQ Model Zoo](#awq-model-zoo)
- [Examples](#examples)
- [Usage](#usage)
- - [Evaluation](#evaluation)
+ - [Results on Vision-Language Models (VILA-7b/13B)](#results-on-vision-language-models-vila-7b13b)
+ - [Inference speed ( Token/sec )](#inference-speed--tokensec-)
- [Reference](#reference)
- [Related Projects](#related-projects)

@@ -107,13 +108,15 @@ The detailed support list:
| Models | Sizes | INT4-g128 | INT3-g128 |
| ------ | --------------------------- | --------- | --------- |
| [Llama2](/scripts/llama_example.sh) | 7B/13B/70B | ✅ | ✅ |
- | [LLaMA](/scripts/llama_example.sh) | 7B/13B/30B/65B | ✅ | ✅ |
+ | [LLaMA](/scripts/llama2_example.sh) | 7B/13B/30B/65B | ✅ | ✅ |
| [OPT](/scripts/opt_example.sh) | 125m/1.3B/2.7B/6.7B/13B/30B | ✅ | ✅ |
- | CodeLlama | 7B/13B/34B | ✅ | ✅ |
- | StarCoder | 15.5B | ✅ | ✅ |
- | Vicuna-v1.1 | 7B/13B | ✅ | |
- | LLaVA-v0 | 13B | ✅ | |
- | VILA | 7B/13B | ✅ | |
+ | [CodeLlama](/scripts/codellama_example.sh) | 7B/13B/34B | ✅ | ✅ |
+ | [StarCoder](/scripts/starcoder_example.sh) | 15.5B | ✅ | ✅ |
+ | [Vicuna-v1.1](/scripts/vicuna_example.sh) | 7B/13B | ✅ | |
+ | [LLaVA-v0](/scripts/llava_example.sh) | 13B | ✅ | |
+ | [VILA](/scripts/vila_example.sh) | 7B/13B | ✅ | |

Note: The table above only lists models for which we have prepared [AWQ search results](https://huggingface.co/datasets/mit-han-lab/awq-model-zoo/tree/main). AWQ also supports other models, such as LLaVA-v1.5 7B; you may need to run the [AWQ search](#usage) yourself to quantize them.
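A minimal search invocation for such an unlisted model might look like the sketch below. The model name and paths are placeholders mirroring the example scripts added in this commit, not pre-computed results:

```bash
# Hypothetical example: run AWQ search for a model without pre-computed
# results (e.g. LLaVA-v1.5 7B). Adjust paths to your local checkpoint layout.
MODEL=llava-v1.5-7b
python -m awq.entry --model_path /dataset/llava-hf/$MODEL \
    --w_bit 4 --q_group_size 128 \
    --run_awq --dump_awq awq_cache/$MODEL-w4-g128.pt
```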

## Examples

25 changes: 25 additions & 0 deletions scripts/codellama_example.sh
@@ -0,0 +1,25 @@
MODEL=CodeLlama-13b-Instruct

# run AWQ search (optional; we provide pre-computed results)
python -m awq.entry --model_path /dataset/codellama-hf/$MODEL \
--w_bit 4 --q_group_size 128 \
--run_awq --dump_awq awq_cache/$MODEL-w4-g128.pt

# evaluate the AWQ-quantized model (simulated pseudo-quantization)
python -m awq.entry --model_path /dataset/codellama-hf/$MODEL \
--tasks wikitext \
--w_bit 4 --q_group_size 128 \
--load_awq awq_cache/$MODEL-w4-g128.pt \
--q_backend fake

# generate real quantized weights (w4)
python -m awq.entry --model_path /dataset/codellama-hf/$MODEL \
--w_bit 4 --q_group_size 128 \
--load_awq awq_cache/$MODEL-w4-g128.pt \
--q_backend real --dump_quant quant_cache/$MODEL-w4-g128-awq.pt

# load and evaluate the real quantized model (lower GPU memory usage)
python -m awq.entry --model_path /dataset/codellama-hf/$MODEL \
--tasks wikitext \
--w_bit 4 --q_group_size 128 \
--load_quant quant_cache/$MODEL-w4-g128-awq.pt
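The `--q_backend fake` step above evaluates pseudo-quantization: weights stay in floating point but are rounded to the grid a w4-g128 kernel would use, so perplexity can be measured without custom kernels. A rough illustration of that round-trip, using a simplified symmetric scheme (not the repo's actual quantizer, which may use an asymmetric zero-point):

```bash
python - <<'EOF'
import torch
# One weight row, split into groups of 128 (the "g128" in w4-g128).
w = torch.randn(4096)
groups = w.view(-1, 128)
# Per-group scale for a symmetric 4-bit grid (integer levels -8..7).
scale = groups.abs().amax(dim=1, keepdim=True) / 7
q = (groups / scale).round().clamp(-8, 7)   # "quantize" to 4-bit levels
w_fake = (q * scale).view_as(w)             # "dequantize" back to fp
print("max abs error:", (w - w_fake).abs().max().item())
EOF
```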
25 changes: 25 additions & 0 deletions scripts/llama2_example.sh
@@ -0,0 +1,25 @@
MODEL=llama-2-7b

# run AWQ search (optional; we provide pre-computed results)
python -m awq.entry --model_path /dataset/llama2-hf/$MODEL \
--w_bit 4 --q_group_size 128 \
--run_awq --dump_awq awq_cache/$MODEL-w4-g128.pt

# evaluate the AWQ-quantized model (simulated pseudo-quantization)
python -m awq.entry --model_path /dataset/llama2-hf/$MODEL \
--tasks wikitext \
--w_bit 4 --q_group_size 128 \
--load_awq awq_cache/$MODEL-w4-g128.pt \
--q_backend fake

# generate real quantized weights (w4)
python -m awq.entry --model_path /dataset/llama2-hf/$MODEL \
--w_bit 4 --q_group_size 128 \
--load_awq awq_cache/$MODEL-w4-g128.pt \
--q_backend real --dump_quant quant_cache/$MODEL-w4-g128-awq.pt

# load and evaluate the real quantized model (lower GPU memory usage)
python -m awq.entry --model_path /dataset/llama2-hf/$MODEL \
--tasks wikitext \
--w_bit 4 --q_group_size 128 \
--load_quant quant_cache/$MODEL-w4-g128-awq.pt
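An optional sanity check on the dumped checkpoint, assuming `--dump_quant` writes an ordinary `torch.save` state dict (which may not hold for every version of the repo):

```bash
python - <<'EOF'
import torch
# Hypothetical check: inspect the real-quantized checkpoint dumped above.
sd = torch.load("quant_cache/llama-2-7b-w4-g128-awq.pt", map_location="cpu")
print(len(sd), "entries")
total = sum(t.numel() * t.element_size() for t in sd.values() if hasattr(t, "numel"))
print(f"~{total / 2**30:.2f} GiB of tensors")
EOF
```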
12 changes: 12 additions & 0 deletions scripts/llava_example.sh
@@ -0,0 +1,12 @@
MODEL=llava-13b-v0

# run AWQ search (optional; we provide pre-computed results)
python -m awq.entry --model_path /dataset/llava-hf/$MODEL \
--w_bit 4 --q_group_size 128 \
--run_awq --dump_awq awq_cache/$MODEL-w4-g128.pt

# generate real quantized weights (w4)
python -m awq.entry --model_path /dataset/llava-hf/$MODEL \
--w_bit 4 --q_group_size 128 \
--load_awq awq_cache/$MODEL-w4-g128.pt \
--q_backend real --dump_quant quant_cache/$MODEL-w4-g128-awq.pt
25 changes: 25 additions & 0 deletions scripts/starcoder_example.sh
@@ -0,0 +1,25 @@
MODEL=starcoder

# run AWQ search (optional; we provide pre-computed results)
python -m awq.entry --model_path /dataset/starcoder-hf/$MODEL \
--w_bit 4 --q_group_size 128 \
--run_awq --dump_awq awq_cache/$MODEL-w4-g128.pt

# evaluate the AWQ-quantized model (simulated pseudo-quantization)
python -m awq.entry --model_path /dataset/starcoder-hf/$MODEL \
--tasks wikitext \
--w_bit 4 --q_group_size 128 \
--load_awq awq_cache/$MODEL-w4-g128.pt \
--q_backend fake

# generate real quantized weights (w4)
python -m awq.entry --model_path /dataset/starcoder-hf/$MODEL \
--w_bit 4 --q_group_size 128 \
--load_awq awq_cache/$MODEL-w4-g128.pt \
--q_backend real --dump_quant quant_cache/$MODEL-w4-g128-awq.pt

# load and evaluate the real quantized model (lower GPU memory usage)
python -m awq.entry --model_path /dataset/starcoder-hf/$MODEL \
--tasks wikitext \
--w_bit 4 --q_group_size 128 \
--load_quant quant_cache/$MODEL-w4-g128-awq.pt
25 changes: 25 additions & 0 deletions scripts/vicuna_example.sh
@@ -0,0 +1,25 @@
MODEL=vicuna-7b

# run AWQ search (optional; we provide pre-computed results)
python -m awq.entry --model_path /dataset/vicuna-hf/$MODEL \
--w_bit 4 --q_group_size 128 \
--run_awq --dump_awq awq_cache/$MODEL-w4-g128.pt

# evaluate the AWQ-quantized model (simulated pseudo-quantization)
python -m awq.entry --model_path /dataset/vicuna-hf/$MODEL \
--tasks wikitext \
--w_bit 4 --q_group_size 128 \
--load_awq awq_cache/$MODEL-w4-g128.pt \
--q_backend fake

# generate real quantized weights (w4)
python -m awq.entry --model_path /dataset/vicuna-hf/$MODEL \
--w_bit 4 --q_group_size 128 \
--load_awq awq_cache/$MODEL-w4-g128.pt \
--q_backend real --dump_quant quant_cache/$MODEL-w4-g128-awq.pt

# load and evaluate the real quantized model (lower GPU memory usage)
python -m awq.entry --model_path /dataset/vicuna-hf/$MODEL \
--tasks wikitext \
--w_bit 4 --q_group_size 128 \
--load_quant quant_cache/$MODEL-w4-g128-awq.pt
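Each example script runs the same four-step pipeline (search, pseudo-quantized evaluation, real weight dump, quantized-model evaluation) with a hard-coded `MODEL` and dataset root; run one as committed, e.g. `bash scripts/vicuna_example.sh`. A hypothetical parameterization, not part of this commit, would let a single script cover every supported checkpoint:

```bash
# Hypothetical variant: take the model name and checkpoint root as arguments.
MODEL=${1:-vicuna-7b}
ROOT=${2:-/dataset/vicuna-hf}
python -m awq.entry --model_path $ROOT/$MODEL \
    --w_bit 4 --q_group_size 128 \
    --run_awq --dump_awq awq_cache/$MODEL-w4-g128.pt
```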
2 changes: 1 addition & 1 deletion scripts/vila_example.sh
@@ -1,4 +1,4 @@
- MODEL=vila-oss-7b
+ MODEL=vila-7b

# run AWQ search (optional; we provided the pre-computed results)
python -m awq.entry --model_path /dataset/vila-hf/$MODEL \