forked from alexellis/actions-batch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathllama.sh
executable file
·84 lines (57 loc) · 1.88 KB
/
llama.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash
## Example by Alex Ellis

# Download a model from HuggingFace and run an inference against
# a list of questions in a text file, generating at most $TOKENS
# tokens per answer.
#
# Outputs (all under ./uploads):
#   main.py            - the generated inference script
#   output.txt         - the answers (stdout of main.py)
#   output-stderr.txt  - the stderr of main.py

## Adapted from
# https://swharden.com/blog/2023-07-29-ai-chat-locally-with-python/
# Model: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF

# Stop at the first failing step (install, download, ...) instead of
# carrying on and failing later with a less obvious error.
set -euo pipefail

# Maximum number of tokens to generate per question.
readonly TOKENS=150

# Quoted delimiter: the questions are written literally, so a "$" or a
# backtick in a question is never expanded by the shell.
cat > questions.txt <<'EOF'
Q: What are the names of the days of the week? A:
Q: Summarise your training data in one sentence. A:
Q: What is the best way to learn? A:
Q: Who was known as the Stoic Emperor? A:
Q: What would Marcus Aurelius say was the key to peace of mind? A:
EOF

python -m venv .venv

# The activate script has to be sourced: executing it as a child process
# cannot modify PATH of this shell, which would leave the venv unused.
# nounset is relaxed while sourcing as a precaution, in case the activate
# script of the installed Python version reads variables that are unset.
set +u
# shellcheck disable=SC1091 -- file is created by "python -m venv" above
source .venv/bin/activate
set -u

pip install llama-cpp-python
pip install --upgrade huggingface_hub

# This is the longest part of the job:
huggingface-cli download \
  TheBloke/Llama-2-7B-Chat-GGUF \
  config.json llama-2-7b-chat.Q5_K_M.gguf --local-dir .

# Unquoted delimiter on purpose: $TOKENS below is substituted by the shell
# while main.py is being written. The "env" shebang picks up the python of
# the activated venv, which is where llama_cpp was installed.
cat > main.py <<EOF
#!/usr/bin/env python

import time

# load the large language model file
from llama_cpp import Llama
LLM = Llama(model_path="./llama-2-7b-chat.Q5_K_M.gguf")

questions = []
with open("questions.txt") as f:
    questions = f.readlines()

print("Using {} tokens".format($TOKENS))

for question in questions:
    # create a text prompt
    prompt = question.strip()
    if prompt == "":
        continue

    startTime = time.time()

    # generate a response (takes several seconds)
    output = LLM(prompt, max_tokens=$TOKENS, stop=[])
    duration = time.time() - startTime

    print("")
    print("[{}] {}".format(duration, prompt))
    print(output["choices"][0]["text"])
    print("")
EOF

mkdir -p uploads
chmod +x main.py
cp ./main.py uploads/

# fd1 is stdout: the answers, written to uploads/output.txt
# fd2 is stderr, which has a lot of noise, so it is kept out of the job log
# and written to uploads/output-stderr.txt instead.
if ! ./main.py 1> uploads/output.txt 2> uploads/output-stderr.txt; then
  # Surface the end of the otherwise hidden stderr so the failure can be
  # diagnosed from the job log, then fail the job.
  echo "main.py failed, last lines of uploads/output-stderr.txt:" >&2
  tail -n 20 uploads/output-stderr.txt >&2
  exit 1
fi

# Also display the results in the job log
cat uploads/output.txt