forked from AbanteAI/archive-old-cli-mentat
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Generate synthetic transcripts from commits (AbanteAI#210)
A script that: * Iterates over GitHub commits * Uses GPT to write a user request and a step-by-step plan for each commit * Converts the diff into a replacement-format LLM response * Packages them into training examples
- Loading branch information
1 parent
91ea894
commit 0a4e87d
Showing
1 changed file
with
213 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
#!/usr/bin/env python | ||
|
||
import argparse | ||
import asyncio | ||
import json | ||
import os | ||
import subprocess | ||
from pathlib import Path | ||
from textwrap import dedent | ||
from unittest.mock import AsyncMock | ||
|
||
import openai | ||
from git import Repo | ||
|
||
from mentat.code_context import CodeContext, CodeContextSettings | ||
from mentat.code_file_manager import CodeFileManager | ||
from mentat.config_manager import ConfigManager | ||
from mentat.llm_api import count_tokens | ||
from mentat.parsers.git_parser import GitParser | ||
from mentat.parsers.replacement_parser import ReplacementParser | ||
from mentat.session_context import SESSION_CONTEXT, SessionContext | ||
from tests.benchmarks.utils import clone_repo | ||
|
||
# System prompt sent to the model together with the `git show` output of a commit.
# It asks for two things separated by the literal token END: a plausible user
# request that could have led to the commit, then a numbered step-by-step plan.
# ask_gpt_for_prompt_and_plan() splits the reply on that END token.
system_prompt = dedent("""\ | ||
You are part of an automated system for making synthetic data. You will be given the \ | ||
output of `git show` for a commit. Your job is to write down what could have been a \ | ||
user request that would lead to this commit. Focus more on the feature added or bug \ | ||
fixed or the why of the commit than on the exact code changes. End that message with \ | ||
END. Then write a step by step plan which if followed would lead to this commit. \ | ||
Please respond with only those two things separated by END. Do not prepend either \ | ||
one with additional labels such as "User Request:" or "Plan:". Don't surround either \ | ||
with quotes or other delimiters. Don't mention mechanical details like what tools you \ | ||
might use or the need to open files in your step by step guide. Focus on the changes \ | ||
themselves. Number your steps 1,2,3... Put each step on its own line.""") | ||
|
||
|
||
def ask_gpt_for_prompt_and_plan(hexsha, diff, cache_path="gpt-output-cache.json"):
    """Ask GPT for a plausible user request and step-by-step plan for a commit.

    Answers are stored per commit SHA in a JSON file at ``cache_path`` (by
    default relative to the current working directory), so a commit that has
    already been answered is returned from the file without calling the API.

    Args:
        hexsha: Commit SHA, used as the cache key.
        diff: Output of ``git show`` for that commit.
        cache_path: Path of the JSON cache file.

    Returns:
        A dict with the keys ``"request"`` and ``"plan"``.

    Raises:
        ValueError: If the model's reply does not contain the ``END``
            separator requested by ``system_prompt``.
    """
    # TODO: keep the cache in memory instead of re-reading the file on each call.
    if os.path.exists(cache_path):
        with open(cache_path, "r") as f:
            cache = json.load(f)
    else:
        cache = {}

    if hexsha in cache:
        return cache[hexsha]

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "system", "content": diff},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4-0314",
        messages=messages,
    )

    message = response["choices"][0]["message"]["content"]
    # Split on the FIRST "END" only: everything after it is the plan, even if
    # the plan itself happens to contain the text "END" again.
    request, separator, plan = message.partition("END")
    if not separator:
        raise ValueError(
            f"GPT response for commit {hexsha} has no END separator: {message!r}"
        )
    ans = {
        "request": request.strip(),
        "plan": plan.strip(),
    }

    cache[hexsha] = ans
    with open(cache_path, "w") as f:
        json.dump(cache, f)

    return ans
|
||
|
||
def bound_files(file_edits, padding=5):
    """Build the list of paths (with line ranges) touched by a set of file edits.

    Edits that create a file are skipped. An edit without replacements
    contributes its bare ``file_path``. An edit with replacements contributes
    ``Path("<file_path>:<start>-<end>")``, where the range spans from the
    smallest ``starting_line`` to the largest ``ending_line`` of its
    replacements, widened by ``padding`` lines on each side.

    Args:
        file_edits: Iterable of objects exposing ``is_creation``,
            ``replacements`` and ``file_path``; each replacement exposes
            ``starting_line`` and ``ending_line``.
        padding: Number of extra lines to include before and after the
            edited region.

    Returns:
        A list of paths, one per non-creation edit, in input order.
    """
    files = []
    for file_edit in file_edits:
        if file_edit.is_creation:
            continue

        replacements = file_edit.replacements
        if not replacements:
            files.append(file_edit.file_path)
            continue

        # Use the real extremes rather than a fixed sentinel, so edits deep
        # inside very long files still get a correct lower bound.
        first_line = min(r.starting_line for r in replacements)
        # Never report an end line below 1.
        last_line = max(1, max(r.ending_line for r in replacements))

        start = max(1, first_line - padding)
        end = last_line + padding
        files.append(Path(f"{file_edit.file_path}:{start}-{end}"))
    return files
|
||
|
||
async def translate_commits_to_transcripts(repo, count=10, skip=None):
    """Turn recent commits of ``repo`` into JSON-encoded training transcripts.

    Walks up to ``count`` commits starting at ``HEAD``. For each usable commit
    the parent is checked out, the ``git show`` output is parsed into file
    edits, the surrounding code is rendered as a code message, GPT is asked for
    a request and plan, and the three pieces are packed into a
    user/system/assistant conversation.

    Commits are skipped when they are listed in ``skip``, have no parent, have
    a ``git show`` output longer than 6000 tokens, or contain no file edits.
    Any exception raised while processing one commit is reported and that
    commit is skipped, so a single bad commit does not abort the run.

    Note: this leaves the repository checked out at the parent of the last
    processed commit; the caller is responsible for restoring it.

    Args:
        repo: Repository object providing ``iter_commits`` and ``git.checkout``.
        count: Maximum number of commits to walk.
        skip: Optional iterable of commit SHAs to ignore.

    Returns:
        A dict mapping commit SHA to the JSON string of its conversation.
    """
    skipped_shas = set(skip) if skip is not None else set()
    transcripts = {}

    for commit in repo.iter_commits("HEAD", max_count=count):
        sha = commit.hexsha
        print("SHA:", sha)
        # Decide whether to skip before paying for a checkout.
        if sha in skipped_shas:
            continue
        if not commit.parents:
            print("Skipping because commit has no parent")
            continue

        try:
            # Necessary for CodeContext to work
            repo.git.checkout(commit.parents[0].hexsha)
            shown = subprocess.check_output(["git", "show", sha]).decode("utf-8")
            if count_tokens(shown, "gpt-4") > 6000:
                print("Skipping because too long")
                continue

            parsed_llm_response = GitParser().parse_string(shown)
            # There are a lot of empty commits because they are created when another
            # author merges a PR without squashing.
            if len(parsed_llm_response.file_edits) == 0:
                continue

            code_context_settings = CodeContextSettings(False, False, False, False, 0)
            code_context = CodeContext(AsyncMock(), os.getcwd(), code_context_settings)
            code_context.set_paths(bound_files(parsed_llm_response.file_edits), [])

            code_message = await code_context.get_code_message("", "gpt-4-0314", 0)
            prompt_and_plan = ask_gpt_for_prompt_and_plan(sha, shown)
            parsed_llm_response.conversation = prompt_and_plan["plan"]

            llm_response = ReplacementParser().file_edits_to_llm_message(
                parsed_llm_response
            )
            conversation = {
                "messages": [
                    {"role": "user", "content": prompt_and_plan["request"]},
                    {"role": "system", "content": code_message},
                    {"role": "assistant", "content": llm_response},
                ]
            }
            transcripts[sha] = json.dumps(conversation)
        except Exception as e:
            # Deliberately best-effort: report which commit failed and move on.
            print(f"Skipping {sha}: {type(e).__name__}: {e}")
            continue
    return transcripts
|
||
|
||
# Script entry point: clone the target repo, generate transcripts for its most
# recent commits, and write them next to the clone.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert git commits into transcripts")
    parser.add_argument(
        "--repo",
        type=str,
        default="http://github.com/AbanteAI/mentat",
        help="The repo to convert to transcripts",
    )
    parser.add_argument(
        "--commit",
        type=str,
        default="HEAD",
        help="The commit to convert to a transcript",
    )
    parser.add_argument(
        "--count",
        type=int,
        default=10,
        help="The number of commits to convert to transcripts",
    )
    args = parser.parse_args()

    clone_repo(args.repo, "for_transcripts")
    # Everything below (cache file, transcript files) is read and written
    # relative to the cloned repository directory.
    os.chdir("tests/benchmarks/repos/for_transcripts")

    # transcripts.jsonl holds a single JSON object mapping SHA -> transcript;
    # commits already present there are not regenerated.
    old_transcripts = {}
    if os.path.exists("transcripts.jsonl"):
        with open("transcripts.jsonl", "r") as f:
            old_transcripts = json.loads(f.read())
    skip = list(old_transcripts)

    stream = AsyncMock()
    config = ConfigManager(os.getcwd(), stream)
    code_context_settings = CodeContextSettings(False, False, False, False, 0)
    code_context = CodeContext(stream, os.getcwd(), code_context_settings)
    session_context = SessionContext(
        stream,
        None,
        os.getcwd(),
        config,
        ReplacementParser(),
        code_context,
        CodeFileManager(),
        None,
    )
    SESSION_CONTEXT.set(session_context)

    repo = Repo(".")
    repo.git.checkout(args.commit)
    # Remember a concrete ref to return to. Checking out "HEAD" again later
    # would be a no-op, because translate_commits_to_transcripts moves HEAD to
    # the parents of the commits it processes.
    if repo.head.is_detached:
        restore_ref = repo.head.commit.hexsha
    else:
        restore_ref = repo.active_branch.name

    try:
        transcripts = asyncio.run(
            translate_commits_to_transcripts(repo, count=args.count, skip=skip)
        )

        # Only newly generated transcripts are appended to the per-model files,
        # and only if they fit that model's context window.
        gpt_3_examples = []
        gpt_4_examples = []
        for transcript in transcripts.values():
            length3 = count_tokens(transcript, "gpt-3.5-turbo-0613")
            length4 = count_tokens(transcript, "gpt-4-0613")
            if length3 < 4097:
                gpt_3_examples.append(transcript)
            if length4 < 8192:
                gpt_4_examples.append(transcript)

        transcripts.update(old_transcripts)
        with open("transcripts.jsonl", "w") as f:
            json.dump(transcripts, f)
        with open("transcripts_gpt3.jsonl", "a") as f:
            for transcript in gpt_3_examples:
                f.write(transcript + "\n")
        with open("transcripts_gpt4.jsonl", "a") as f:
            for transcript in gpt_4_examples:
                f.write(transcript + "\n")
    finally:
        # Put the clone back where it started, even if generation failed.
        repo.git.checkout(restore_ref)