Skip to content

Commit

Permalink
Generate synthetic transcripts from commits (AbanteAI#210)
Browse files Browse the repository at this point in the history
A script that:
* Iterates over GitHub commits
* Uses GPT to write a plausible user request and a step-by-step plan for each commit
* Converts the diff to a replacement-format LLM response
* Packages them into training examples
  • Loading branch information
jakethekoenig authored Oct 24, 2023
1 parent 91ea894 commit 0a4e87d
Showing 1 changed file with 213 additions and 0 deletions.
213 changes: 213 additions & 0 deletions scripts/git_log_to_transcripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
#!/usr/bin/env python

import argparse
import asyncio
import json
import os
import subprocess
from pathlib import Path
from textwrap import dedent
from unittest.mock import AsyncMock

import openai
from git import Repo

from mentat.code_context import CodeContext, CodeContextSettings
from mentat.code_file_manager import CodeFileManager
from mentat.config_manager import ConfigManager
from mentat.llm_api import count_tokens
from mentat.parsers.git_parser import GitParser
from mentat.parsers.replacement_parser import ReplacementParser
from mentat.session_context import SESSION_CONTEXT, SessionContext
from tests.benchmarks.utils import clone_repo

# System prompt sent to the model together with the `git show` output of a commit.
# It asks for two things separated by the literal token END: a plausible user
# request, then a numbered step-by-step plan. ask_gpt_for_prompt_and_plan splits
# the reply on that same token, so the two must stay in sync.
system_prompt = dedent("""\
You are part of an automated system for making synthetic data. You will be given the \
output of `git show` for a commit. Your job is to write down what could have been a \
user request that would lead to this commit. Focus more on the feature added or bug \
fixed or the why of the commit than on the exact code changes. End that message with \
END. Then write a step by step plan which if followed would lead to this commit. \
Please respond with only those two things separated by END. Do not prepend either \
one with additional labels such as "User Request:" or "Plan:". Don't surround either \
with quotes or other delimiters. Don't mention mechanical details like what tools you \
might use or the need to open files in your step by step guide. Focus on the changes \
themselves. Number your steps 1,2,3... Put each step on its own line.""")


def ask_gpt_for_prompt_and_plan(hexsha, diff):
    """Ask GPT for a plausible user request and step-by-step plan for a commit.

    Answers are memoized in ``gpt-output-cache.json`` in the current working
    directory, keyed by commit sha, so a commit that was already answered is
    returned without calling the API again.

    Args:
        hexsha: Commit sha, used as the cache key.
        diff: Text describing the commit (the caller passes ``git show``
            output); it is sent to the model verbatim.

    Returns:
        A dict with the keys ``"request"`` and ``"plan"``, each a stripped
        string taken from either side of the first ``END`` in the reply.

    Raises:
        ValueError: If the model's reply does not contain the ``END``
            delimiter requested by the system prompt.
    """
    # TODO: cache the cache (the file is re-read from disk on every call).
    if os.path.exists("gpt-output-cache.json"):
        with open("gpt-output-cache.json", "r") as f:
            cache = json.load(f)
    else:
        cache = {}

    if hexsha in cache:
        return cache[hexsha]

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "system", "content": diff},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4-0314",
        messages=messages,
    )

    message = response["choices"][0]["message"]["content"]
    # Split once and validate: without the delimiter there is no plan, and an
    # explicit error is far easier to diagnose than "list index out of range".
    parts = message.split("END")
    if len(parts) < 2:
        raise ValueError(
            f"GPT response for commit {hexsha} is missing the END delimiter"
        )
    ans = {
        "request": parts[0].strip(),
        "plan": parts[1].strip(),
    }

    # Persist immediately so an interrupted run keeps the answers it paid for.
    cache[hexsha] = ans
    with open("gpt-output-cache.json", "w") as f:
        json.dump(cache, f)

    return ans


def bound_files(file_edits, padding=5):
    """Build the list of paths (with line ranges) touched by a set of file edits.

    Newly created files are skipped. An edit that has replacements yields a
    ``Path`` of the form ``<file_path>:<start>-<end>``, where the range spans
    every replacement in that file widened by ``padding`` lines on each side
    (the start is clamped to line 1). An edit without replacements yields its
    ``file_path`` unchanged.

    Args:
        file_edits: Iterable of objects exposing ``is_creation``,
            ``file_path`` and ``replacements``; each replacement exposes
            ``starting_line`` and ``ending_line``.
        padding: Number of extra lines to include before and after the edits.

    Returns:
        A list with one entry per non-creation edit, in input order.
    """
    files = []
    for file_edit in file_edits:
        if file_edit.is_creation:
            continue
        replacements = file_edit.replacements
        if not replacements:
            files.append(file_edit.file_path)
            continue
        # Take the real extremes rather than seeding with a magic sentinel:
        # a fixed start value of 10000 gave a wrong lower bound for edits
        # located past line 10000.
        min_line = min(replacement.starting_line for replacement in replacements)
        # The upper bound never drops below line 1 before padding is added.
        max_line = max(
            1, max(replacement.ending_line for replacement in replacements)
        )
        first_line = max(1, min_line - padding)
        last_line = max_line + padding
        files.append(Path(f"{file_edit.file_path}:{first_line}-{last_line}"))
    return files


async def translate_commits_to_transcripts(repo, count=10, skip=None):
    """Turn recent commits of ``repo`` into JSON-encoded training conversations.

    For each commit reachable from ``HEAD`` (at most ``count`` of them) the
    parent commit is checked out, the commit's ``git show`` output is parsed
    into file edits, GPT is asked for a matching user request and plan, and
    the three pieces are assembled into a user/system/assistant conversation.

    Commits are skipped when they are listed in ``skip``, have no parent,
    exceed 6000 tokens of ``git show`` output, or contain no file edits. Any
    exception raised while processing a commit is printed and that commit is
    dropped, so one bad commit does not abort the run.

    Args:
        repo: Repository object providing ``iter_commits`` and
            ``git.checkout``. ``git show`` itself is run via ``subprocess``
            in the current working directory.
        count: Maximum number of commits to walk.
        skip: Optional iterable of commit shas to ignore.

    Returns:
        A dict mapping commit sha to the JSON string of its conversation.
    """
    # None instead of a shared mutable default; a set gives O(1) lookups.
    skipped_shas = set(skip) if skip is not None else set()
    transcripts = {}

    for commit in repo.iter_commits("HEAD", max_count=count):
        try:
            sha = commit.hexsha
            print("SHA:", sha)
            # Decide whether to skip before touching the working tree, so
            # already-processed commits do not cost a checkout.
            if sha in skipped_shas:
                continue
            if not commit.parents:
                # A root commit has no "before" state to build context from.
                print("Skipping because commit has no parent")
                continue
            # Necessary for CodeContext to work
            repo.git.checkout(commit.parents[0].hexsha)
            shown = subprocess.check_output(["git", "show", sha]).decode("utf-8")
            if count_tokens(shown, "gpt-4") > 6000:
                print("Skipping because too long")
                continue

            parsed_llm_response = GitParser().parse_string(shown)
            # There are a lot of empty commits because they are created when another
            # author merges a PR without squashing.
            if not parsed_llm_response.file_edits:
                continue

            code_context_settings = CodeContextSettings(False, False, False, False, 0)
            code_context = CodeContext(AsyncMock(), os.getcwd(), code_context_settings)
            code_context.set_paths(bound_files(parsed_llm_response.file_edits), [])

            code_message = await code_context.get_code_message("", "gpt-4-0314", 0)
            prompt_and_plan = ask_gpt_for_prompt_and_plan(sha, shown)
            # The plan is stored as the conversation text of the parsed edits
            # before they are rendered into the assistant message.
            parsed_llm_response.conversation = prompt_and_plan["plan"]

            llm_response = ReplacementParser().file_edits_to_llm_message(
                parsed_llm_response
            )
            conversation = {
                "messages": [
                    {"role": "user", "content": prompt_and_plan["request"]},
                    {"role": "system", "content": code_message},
                    {"role": "assistant", "content": llm_response},
                ]
            }
            transcripts[sha] = json.dumps(conversation)
        except Exception as e:
            # Deliberately best-effort: report the failure and move on.
            print(e)
    return transcripts


if __name__ == "__main__":
    # Command-line options: which repo to clone, which commit to start from,
    # and how many commits to walk.
    arg_parser = argparse.ArgumentParser(
        description="Convert git commits into transcripts"
    )
    arg_parser.add_argument(
        "--repo",
        type=str,
        default="http://github.com/AbanteAI/mentat",
        help="The repo to convert to transcripts",
    )
    arg_parser.add_argument(
        "--commit",
        type=str,
        default="HEAD",
        help="The commit to convert to a transcript",
    )
    arg_parser.add_argument(
        "--count",
        type=int,
        default=10,
        help="The number of commits to convert to transcripts",
    )
    args = arg_parser.parse_args()

    # Everything below runs from inside the cloned repository, so the
    # transcript files are read from and written to that directory.
    clone_repo(args.repo, "for_transcripts")
    os.chdir("tests/benchmarks/repos/for_transcripts")

    # Commits already present in transcripts.jsonl are not regenerated.
    # Despite the extension, that file holds one JSON object (sha -> transcript).
    old_transcripts = {}
    skip = []
    if os.path.exists("transcripts.jsonl"):
        with open("transcripts.jsonl", "r") as f:
            old_transcripts = json.load(f)
        skip = list(old_transcripts)

    # Install a session context before any transcript work starts.
    # NOTE(review): the meaning of the two None arguments depends on
    # SessionContext's signature - confirm against mentat.session_context.
    stream = AsyncMock()
    working_dir = os.getcwd()
    config = ConfigManager(working_dir, stream)
    code_context_settings = CodeContextSettings(False, False, False, False, 0)
    code_context = CodeContext(stream, working_dir, code_context_settings)
    session_context = SessionContext(
        stream,
        None,
        working_dir,
        config,
        ReplacementParser(),
        code_context,
        CodeFileManager(),
        None,
    )
    SESSION_CONTEXT.set(session_context)

    repo = Repo(".")
    repo.git.checkout(args.commit)
    transcripts = asyncio.run(
        translate_commits_to_transcripts(repo, count=args.count, skip=skip)
    )

    # Sort the new transcripts into per-model files by token length.
    gpt_3_examples = []
    gpt_4_examples = []
    for transcript in transcripts.values():
        length3 = count_tokens(transcript, "gpt-3.5-turbo-0613")
        length4 = count_tokens(transcript, "gpt-4-0613")
        if length3 < 4097:
            gpt_3_examples.append(transcript)
        if length4 < 8192:
            gpt_4_examples.append(transcript)

    # Merge in the previous run's entries and rewrite the sha-keyed file;
    # the per-model files are append-only and receive only the new examples.
    transcripts.update(old_transcripts)
    with open("transcripts.jsonl", "w") as f:
        json.dump(transcripts, f)
    for out_path, examples in (
        ("transcripts_gpt3.jsonl", gpt_3_examples),
        ("transcripts_gpt4.jsonl", gpt_4_examples),
    ):
        with open(out_path, "a") as f:
            for transcript in examples:
                f.write(transcript + "\n")

    # Put the working tree back on the requested commit; the translation loop
    # leaves it on a parent commit.
    repo.git.checkout(args.commit)

0 comments on commit 0a4e87d

Please sign in to comment.