
Commit f06f680

Add Fake Quantization
1 parent: 8900418

11 files changed: +6240 −0 lines

fake_quant/README.md (+38)

@@ -0,0 +1,38 @@
# Fake Quantization in QuaRot

In this directory, we provide the PyTorch scripts for the experiments in QuaRot.

## Language Generation and Zero-Shot Evaluations

Currently, we only support **LLaMA-2** models. You can simply run `main.py` to reproduce the results in the paper. The most important arguments are:

- `--model`: the model name (or path to the weights)
- `--bsz`: the batch size for PPL evaluation
- `--rotate`: whether to rotate the model
- `--lm_eval`: whether to run LM-Eval on the zero-shot tasks
- `--tasks`: the tasks for LM-Eval
- `--cal_dataset`: the calibration dataset for GPTQ quantization
- `--a_bits`: the number of bits for activation quantization
- `--w_bits`: the number of bits for weight quantization
- `--v_bits`: the number of bits for value quantization
- `--k_bits`: the number of bits for key quantization
- `--w_clip`: whether to clip the weights
- `--a_clip_ratio`: the clipping ratio for activations
- `--k_clip_ratio`: the clipping ratio for keys
- `--v_clip_ratio`: the clipping ratio for values
- `--w_asym`: whether to use asymmetric quantization for weights
- `--a_asym`: whether to use asymmetric quantization for activations
- `--v_asym`: whether to use asymmetric quantization for values
- `--k_asym`: whether to use asymmetric quantization for keys
- `--a_groupsize`: the group size for activation quantization
- `--w_groupsize`: the group size for weight quantization
- `--v_groupsize`: the group size for value quantization
- `--k_groupsize`: the group size for key quantization

For example, to evaluate the perplexity of the `LLaMA2-7B` model with all weights and activations quantized, you can run the following command:

```bash
/bin/python main.py --model meta-llama/Llama-2-7b-hf --rotate --a_bits 4 --v_bits 4 --k_bits 4 --w_bits 4 --w_clip
```
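
To also collect zero-shot accuracy, the same command can enable LM-Eval via `--lm_eval` and a task list via `--tasks`. The sketch below is only an illustration: the task names and the space-separated `--tasks` syntax are assumptions about how `main.py` parses these flags, not something fixed by this README.

```bash
# Illustrative only: task names and --tasks syntax are assumptions.
/bin/python main.py --model meta-llama/Llama-2-7b-hf --rotate \
    --a_bits 4 --v_bits 4 --k_bits 4 --w_bits 4 --w_clip \
    --lm_eval --tasks piqa arc_easy hellaswag
```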

fake_quant/data_utils.py (+105)

@@ -0,0 +1,105 @@
import datasets
import random
import transformers


def get_wikitext2(nsamples, seed, seqlen, model, hf_token, eval_mode=False):

    if hf_token is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model, use_fast=False)
    else:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model, use_fast=False, use_auth_token=hf_token)

    if eval_mode:
        testdata = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
        testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')
        return testenc
    else:
        traindata = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
        trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt')
        random.seed(seed)
        trainloader = []
        for _ in range(nsamples):
            # Draw a random window of seqlen tokens as one calibration sample.
            i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
            j = i + seqlen
            inp = trainenc.input_ids[:, i:j]
            tar = inp.clone()
            tar[:, :-1] = -100  # -100 labels are ignored by the loss; only the final token remains as a target.
            trainloader.append((inp, tar))
        return trainloader


def get_c4_new(nsamples, seed, seqlen, model, hf_token=None, eval_mode=False):

    if hf_token is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model, use_fast=False)
    else:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model, use_fast=False, use_auth_token=hf_token)

    if eval_mode:
        valdata = datasets.load_dataset(
            'allenai/c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation')
        valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt')
        valenc = valenc.input_ids[:, :(256 * seqlen)]

        class TokenizerWrapper:
            def __init__(self, input_ids):
                self.input_ids = input_ids

        valenc = TokenizerWrapper(valenc)
        return valenc
    else:
        traindata = datasets.load_dataset(
            'allenai/c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train')

        random.seed(seed)
        trainloader = []
        for _ in range(nsamples):
            # Resample documents until one is at least seqlen tokens long.
            while True:
                i = random.randint(0, len(traindata) - 1)
                trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
                if trainenc.input_ids.shape[1] >= seqlen:
                    break
            i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
            j = i + seqlen
            inp = trainenc.input_ids[:, i:j]
            tar = inp.clone()
            tar[:, :-1] = -100
            trainloader.append((inp, tar))
        return trainloader


def get_ptb_new(nsamples, seed, seqlen, model, hf_token, eval_mode=False):

    if hf_token is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model, use_fast=False)
    else:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model, use_fast=False, use_auth_token=hf_token)

    if eval_mode:
        testdata = datasets.load_dataset('ptb_text_only', 'penn_treebank', split='test')
        testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt')
        return testenc
    else:
        traindata = datasets.load_dataset('ptb_text_only', 'penn_treebank', split='train')
        trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt')
        random.seed(seed)
        trainloader = []
        for _ in range(nsamples):
            i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
            j = i + seqlen
            inp = trainenc.input_ids[:, i:j]
            tar = inp.clone()
            tar[:, :-1] = -100
            trainloader.append((inp, tar))
        return trainloader


def get_loaders(
    name, nsamples=128, seed=0, seqlen=2048, model='', hf_token=None, eval_mode=False
):
    if 'wikitext2' in name:
        return get_wikitext2(nsamples, seed, seqlen, model, hf_token, eval_mode)
    if 'ptb' in name:
        return get_ptb_new(nsamples, seed, seqlen, model, hf_token, eval_mode)
    if 'c4' in name:
        return get_c4_new(nsamples, seed, seqlen, model, hf_token, eval_mode)
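
For orientation, here is a minimal sketch of how these loaders are typically consumed, assuming it runs from this directory with access to the Hugging Face datasets above; the model name and sample counts are illustrative, not prescribed by the file itself.

```python
# Usage sketch (assumptions: model name, nsamples, and seqlen are illustrative).
from data_utils import get_loaders

# 128 calibration samples of 2048 tokens each, drawn from WikiText-2.
trainloader = get_loaders('wikitext2', nsamples=128, seed=0, seqlen=2048,
                          model='meta-llama/Llama-2-7b-hf', eval_mode=False)

# Full tokenized test split for perplexity evaluation.
testenc = get_loaders('wikitext2', seqlen=2048,
                      model='meta-llama/Llama-2-7b-hf', eval_mode=True)

print(len(trainloader), testenc.input_ids.shape)  # e.g. 128, (1, total_test_tokens)
```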

fake_quant/eval_utils.py (+150)

@@ -0,0 +1,150 @@
import utils
import model_utils
import quant_utils
import torch
import os
import logging
from tqdm import tqdm


@torch.no_grad()
def evaluator(model, testenc, dev, args):

    model.eval()

    if 'opt' in args.model:
        opt_type = True
        llama_type = False
    elif 'meta' in args.model:
        llama_type = True
        opt_type = False
    else:
        raise ValueError(f'Unknown model {args.model}')

    use_cache = model.config.use_cache
    model.config.use_cache = False

    if opt_type:
        layers = model.model.decoder.layers
        model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
        model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
        if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
            model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
        if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
            model.model.decoder.project_in = model.model.decoder.project_in.to(dev)

    elif llama_type:
        layers = model.model.layers
        model.model.embed_tokens = model.model.embed_tokens.to(dev)

    layers[0] = layers[0].to(dev)

    # Convert the whole text of the evaluation dataset into batches of sequences.
    input_ids = testenc.input_ids  # (1, text_len)
    nsamples = input_ids.numel() // model.seqlen  # The tail is truncated.
    input_ids = input_ids[:, :nsamples * model.seqlen].view(nsamples, model.seqlen).to(dev)  # (nsamples, seqlen)

    batch_size = args.bsz
    input_ids = [input_ids[i:i + batch_size] for i in range(0, nsamples, batch_size)]
    nbatches = len(input_ids)

    dtype = next(iter(model.parameters())).dtype
    # The input of the first decoder layer.
    inps = torch.zeros(
        (nbatches, batch_size, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
    )
    # The preallocated tensor above is immediately replaced by a per-batch Python list.
    inps = [0] * nbatches
    cache = {'i': 0, 'attention_mask': None}

    class Catcher(torch.nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs['attention_mask']
            if llama_type:
                cache['position_ids'] = kwargs['position_ids']
            # Abort the forward pass once the first layer's input has been captured.
            raise ValueError

    layers[0] = Catcher(layers[0])

    for i in range(nbatches):
        batch = input_ids[i]
        try:
            model(batch)
        except ValueError:
            pass
    layers[0] = layers[0].module
    layers[0] = layers[0].cpu()

    if opt_type:
        model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
        model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
        if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
            model.model.decoder.project_out = model.model.decoder.project_out.cpu()
        if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
            model.model.decoder.project_in = model.model.decoder.project_in.cpu()
    elif llama_type:
        model.model.embed_tokens = model.model.embed_tokens.cpu()
        position_ids = cache['position_ids']

    torch.cuda.empty_cache()
    outs = [0] * nbatches
    attention_mask = cache['attention_mask']

    for i in tqdm(range(len(layers)), desc="(Eval) Layers"):
        layer = layers[i].to(dev)

        # Dump the layer input and output
        if args.capture_layer_io and args.layer_idx == i:
            captured_io = model_utils.capture_layer_io(model_utils.get_model_type(model), layer, inps)
            save_path = model_utils.get_layer_io_save_path(args)
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            torch.save(captured_io, save_path)
            logging.info(f'Dumped layer input and output to: {save_path}')

        for j in range(nbatches):
            if opt_type:
                outs[j] = layer(inps[j], attention_mask=attention_mask)[0]
            elif llama_type:
                outs[j] = layer(inps[j], attention_mask=attention_mask, position_ids=position_ids)[0]
        layers[i] = layer.cpu()
        del layer
        torch.cuda.empty_cache()
        # The outputs of this layer become the inputs of the next one.
        inps, outs = outs, inps

    if opt_type:
        if model.model.decoder.final_layer_norm is not None:
            model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
        if model.model.decoder.project_out is not None:
            model.model.decoder.project_out = model.model.decoder.project_out.to(dev)

    elif llama_type:
        if model.model.norm is not None:
            model.model.norm = model.model.norm.to(dev)

    model.lm_head = model.lm_head.to(dev)
    nlls = []
    loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
    for i in range(nbatches):
        hidden_states = inps[i]
        if opt_type:
            if model.model.decoder.final_layer_norm is not None:
                hidden_states = model.model.decoder.final_layer_norm(hidden_states)
            if model.model.decoder.project_out is not None:
                hidden_states = model.model.decoder.project_out(hidden_states)
        elif llama_type:
            if model.model.norm is not None:
                hidden_states = model.model.norm(hidden_states)
        lm_logits = model.lm_head(hidden_states)
        shift_logits = lm_logits[:, :-1, :]
        shift_labels = input_ids[i][:, 1:]
        loss = loss_fct(shift_logits.permute(0, 2, 1), shift_labels)
        neg_log_likelihood = loss.float().mean(dim=1)
        nlls.append(neg_log_likelihood)
    nlls_tensor = torch.cat(nlls)
    # Perplexity is the exponential of the mean per-token negative log-likelihood.
    ppl = torch.exp(nlls_tensor.mean())
    model.config.use_cache = use_cache
    logging.info(f'\n{args.eval_dataset.upper()} PPL: {ppl.item():.3f}')
    return ppl.item()
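
A minimal sketch of how this evaluator might be driven end to end, assuming the other files in this commit are importable from this directory and a CUDA device is available. The argument object only mimics the attributes `evaluator` actually reads (`model`, `bsz`, `capture_layer_io`, `layer_idx`, `eval_dataset`); the model choice, batch size, and manual `model.seqlen` assignment are illustrative assumptions rather than the project's `main.py`.

```python
# Illustrative driver (assumptions: argument values and model setup are examples).
import types
import torch
import transformers

from data_utils import get_loaders
from eval_utils import evaluator

model_name = 'meta-llama/Llama-2-7b-hf'
model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model.seqlen = 2048  # evaluator() reads the sequence length from this attribute

args = types.SimpleNamespace(
    model=model_name,            # 'meta' in the name selects the LLaMA code path
    bsz=8,                       # PPL evaluation batch size
    eval_dataset='wikitext2',    # only used for the log message
    capture_layer_io=False,      # skip the layer-I/O dumping branch
    layer_idx=0,
)

testenc = get_loaders('wikitext2', seqlen=model.seqlen, model=model_name, eval_mode=True)
ppl = evaluator(model, testenc, torch.device('cuda:0'), args)
print(f'wikitext2 ppl: {ppl:.3f}')
```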
