From a3afc5b241844f359aaece5706e5cef3c2be34b8 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Tue, 18 Jul 2023 16:00:55 +0000
Subject: [PATCH] llama_v1 move

---
 FAQ.md     |  51 -----------------------
 example.py | 119 -----------------------------------------------------
 2 files changed, 170 deletions(-)
 delete mode 100644 FAQ.md
 delete mode 100755 example.py

diff --git a/FAQ.md b/FAQ.md
deleted file mode 100644
index 87ac67e13..000000000
--- a/FAQ.md
+++ /dev/null
@@ -1,51 +0,0 @@
-# FAQ
-## 1. The download.sh script doesn't work on default bash in MacOS X:
-
-Please see answers from these issues:
- - https://github.com/facebookresearch/llama/issues/41#issuecomment-1451290160
- - https://github.com/facebookresearch/llama/issues/53#issue-1606582963
-
-
-## 2. Generations are bad!
-
-Keep in mind these models are not finetuned for question answering. As such, they should be prompted so that the expected answer is the natural continuation of the prompt.
-
-Here are a few examples of prompts (from [issue#69](https://github.com/facebookresearch/llama/issues/69)) geared towards finetuned models, and how to modify them to get the expected results:
- - Do not prompt with "What is the meaning of life? Be concise and do not repeat yourself." but with "I believe the meaning of life is"
- - Do not prompt with "Explain the theory of relativity." but with "Simply put, the theory of relativity states that"
- - Do not prompt with "Ten easy steps to build a website..." but with "Building a website can be done in 10 simple steps:\n"
-
-To be able to directly prompt the models with questions / instructions, you can either:
- - Prompt it with few-shot examples so that the model understands the task you have in mind.
- - Finetune the models on datasets of instructions to make them more robust to input prompts.
-
-We've updated `example.py` with more sample prompts. Overall, always keep in mind that models are very sensitive to prompts (particularly when they have not been finetuned).
-
-## 3. CUDA Out of memory errors
-
-The `example.py` file pre-allocates a cache according to these settings:
-```python
-model_args: ModelArgs = ModelArgs(max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params)
-```
-
-Accounting for 14GB of memory for the model weights (7B model), this leaves 16GB available for the decoding cache which stores 2 * 2 * n_layers * max_batch_size * max_seq_len * n_heads * head_dim bytes.
-
-With default parameters, this cache was about 17GB (2 * 2 * 32 * 32 * 1024 * 32 * 128) for the 7B model.
-
-We've added command line options to `example.py` and changed the default `max_seq_len` to 512 which should allow decoding on 30GB GPUs.
-
-Feel free to lower these settings according to your hardware.
-
-## 4. Other languages
-The model was trained primarily on English, but also on a few other languages with Latin or Cyrillic alphabets.
-
-For instance, LLaMA was trained on Wikipedia for the 20 following languages: bg, ca, cs, da, de, en, es, fr, hr, hu, it, nl, pl, pt, ro, ru, sl, sr, sv, uk.
-
-LLaMA's tokenizer splits unseen characters into UTF-8 bytes, as a result, it might also be able to process other languages like Chinese or Japanese, even though they use different characters.
-
-Although the fraction of these languages in the training was negligible, LLaMA still showcases some abilities in Chinese-English translation:
-
-```
-Prompt = "J'aime le chocolat = I like chocolate\n祝你一天过得愉快 ="
-Output = "I wish you a nice day"
-```
\ No newline at end of file
diff --git a/example.py b/example.py
deleted file mode 100755
index fba9a54a5..000000000
--- a/example.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the GNU General Public License version 3.
-
-from typing import Tuple
-import os
-import sys
-import torch
-import fire
-import time
-import json
-
-from pathlib import Path
-
-from fairscale.nn.model_parallel.initialize import initialize_model_parallel
-
-from llama import ModelArgs, Transformer, Tokenizer, LLaMA
-
-
-def setup_model_parallel() -> Tuple[int, int]:
-    local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    world_size = int(os.environ.get("WORLD_SIZE", -1))
-
-    torch.distributed.init_process_group("nccl")
-    initialize_model_parallel(world_size)
-    torch.cuda.set_device(local_rank)
-
-    # seed must be the same in all processes
-    torch.manual_seed(1)
-    return local_rank, world_size
-
-
-def load(
-    ckpt_dir: str,
-    tokenizer_path: str,
-    local_rank: int,
-    world_size: int,
-    max_seq_len: int,
-    max_batch_size: int,
-) -> LLaMA:
-    start_time = time.time()
-    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
-    assert world_size == len(
-        checkpoints
-    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
-    ckpt_path = checkpoints[local_rank]
-    print("Loading")
-    checkpoint = torch.load(ckpt_path, map_location="cpu")
-    with open(Path(ckpt_dir) / "params.json", "r") as f:
-        params = json.loads(f.read())
-
-    model_args: ModelArgs = ModelArgs(
-        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
-    )
-    tokenizer = Tokenizer(model_path=tokenizer_path)
-    model_args.vocab_size = tokenizer.n_words
-    torch.set_default_tensor_type(torch.cuda.HalfTensor)
-    model = Transformer(model_args)
-    torch.set_default_tensor_type(torch.FloatTensor)
-    model.load_state_dict(checkpoint, strict=False)
-
-    generator = LLaMA(model, tokenizer)
-    print(f"Loaded in {time.time() - start_time:.2f} seconds")
-    return generator
-
-
-def main(
-    ckpt_dir: str,
-    tokenizer_path: str,
-    temperature: float = 0.8,
-    top_p: float = 0.95,
-    max_seq_len: int = 512,
-    max_batch_size: int = 32,
-):
-    local_rank, world_size = setup_model_parallel()
-    if local_rank > 0:
-        sys.stdout = open(os.devnull, "w")
-
-    generator = load(
-        ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size
-    )
-
-    prompts = [
-        # For these prompts, the expected answer is the natural continuation of the prompt
-        "I believe the meaning of life is",
-        "Simply put, the theory of relativity states that ",
-        "Building a website can be done in 10 simple steps:\n",
-        # Few shot prompts: https://huggingface.co/blog/few-shot-learning-gpt-neo-and-inference-api
-        """Tweet: "I hate it when my phone battery dies."
-Sentiment: Negative
-###
-Tweet: "My day has been 👍"
-Sentiment: Positive
-###
-Tweet: "This is the link to the article"
-Sentiment: Neutral
-###
-Tweet: "This new music video was incredibile"
-Sentiment:""",
-        """Translate English to French:
-
-sea otter => loutre de mer
-
-peppermint => menthe poivrée
-
-plush girafe => girafe peluche
-
-cheese =>""",
-    ]
-    results = generator.generate(
-        prompts, max_gen_len=256, temperature=temperature, top_p=top_p
-    )
-
-    for result in results:
-        print(result)
-        print("\n==================================\n")
-
-
-if __name__ == "__main__":
-    fire.Fire(main)
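
The removed FAQ (item 3) sizes the pre-allocated decoding cache as 2 * 2 * n_layers * max_batch_size * max_seq_len * n_heads * head_dim bytes. A minimal sketch of that arithmetic, plugging in the 7B dimensions from the FAQ's own worked example (32 layers, batch size 32, 32 heads, head_dim 128), reproduces the "about 17GB" figure for the old max_seq_len=1024 default and shows what the new default of 512 saves; the helper name below is ours, not part of the deleted code:

```python
# Sketch of the cache-size estimate described in the removed FAQ, item 3.
# The 7B dimensions are the ones used in the FAQ's own example.

def kv_cache_bytes(n_layers: int, max_batch_size: int, max_seq_len: int,
                   n_heads: int, head_dim: int) -> int:
    # 2 tensors (keys and values) * 2 bytes per float16 element
    return 2 * 2 * n_layers * max_batch_size * max_seq_len * n_heads * head_dim


if __name__ == "__main__":
    # Old default (max_seq_len=1024): the FAQ's "about 17GB".
    print(kv_cache_bytes(32, 32, 1024, 32, 128))  # 17179869184
    # New default (max_seq_len=512) halves the cache, leaving more headroom
    # next to the ~14GB of fp16 weights for the 7B model.
    print(kv_cache_bytes(32, 32, 512, 32, 128))   # 8589934592
```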
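The removed example.py reads LOCAL_RANK and WORLD_SIZE from the environment in setup_model_parallel() and asserts that the world size matches the number of *.pth shards in ckpt_dir, so it is meant to be started through a distributed launcher rather than plain python. A hedged launch sketch, assuming torchrun is available, that the single-shard 7B checkpoint is used, and with placeholder paths; the lowered max_batch_size just illustrates the FAQ's advice to shrink these settings:

```python
# Hypothetical launcher for the removed example.py: torchrun sets the
# LOCAL_RANK and WORLD_SIZE environment variables that setup_model_parallel()
# expects, and --nproc_per_node must equal the number of checkpoint shards
# (one for the 7B model). All paths below are placeholders.
import subprocess

ckpt_dir = "/path/to/7B"                     # placeholder checkpoint directory
tokenizer_path = "/path/to/tokenizer.model"  # placeholder tokenizer path

subprocess.run(
    [
        "torchrun", "--nproc_per_node", "1",  # one process per model-parallel shard
        "example.py",
        "--ckpt_dir", ckpt_dir,
        "--tokenizer_path", tokenizer_path,
        "--max_seq_len", "512",       # default after the FAQ's change, for ~30GB GPUs
        "--max_batch_size", "8",      # illustrative: lower than the default of 32
    ],
    check=True,
)
```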