forked from AbanteAI/archive-old-cli-mentat
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
A parser for the git diff format is introduced. And a test for the translation script between block, replacement and git_diff formats is introduced. A bug in the parser was detected and fixed. Co-authored-by: Paul Swingle <[email protected]>
- Loading branch information
1 parent
9ce6d78
commit da81771
Showing
10 changed files
with
336 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
from pathlib import Path | ||
from textwrap import dedent | ||
from typing import Any, AsyncGenerator, List | ||
|
||
from mentat.git_handler import GIT_ROOT | ||
from mentat.llm_api import chunk_to_lines | ||
from mentat.parsers.file_edit import FileEdit, Replacement | ||
from mentat.parsers.parser import ParsedLLMResponse | ||
|
||
# The git diff format should not be used with an LLM because it contains information like SHAs | ||
# which it would not know about. It also involves certain arithmetic that it couldn't reliably do | ||
# and typically prints out extra lines which aid human readability but would cost tokens. | ||
# It is implemented to create training data from git diffs. Therefore there's no need for it to | ||
# work asynchronously or stream partial results. In theory one could work this into the existing | ||
# Parser class but it is simpler to assume one has the whole string up front and make heavy use of | ||
# split. | ||
|
||
|
||
class GitParser:
    """Parse the text of a commit with its patch into a ``ParsedLLMResponse``.

    The expected input is a commit header and message followed by one
    ``diff --git`` section per changed file (the layout printed by ``git show``).
    The commit message becomes the conversation text and every changed file
    becomes a ``FileEdit`` whose replacements cover only the lines that actually
    changed; the surrounding context lines git prints are trimmed away.

    File names containing whitespace are not supported because the
    ``diff --git a/<old> b/<new>`` line is split on whitespace.
    """

    # This doesn't actually "stream and parse" but it is named this way to match the interface of
    # the production parsers for use in the translation script.
    async def stream_and_parse_llm_response(
        self,
        response: AsyncGenerator[Any, None],
    ) -> ParsedLLMResponse:
        """Drain ``response`` completely, then parse the collected text as a git diff."""
        pieces: List[str] = []
        async for chunk in response:
            pieces.extend(chunk_to_lines(chunk))
        return self.parse_string("".join(pieces))

    def parse_string(self, git_diff: str) -> ParsedLLMResponse:
        """Parse a complete commit/diff string.

        Args:
            git_diff: Commit header, commit message and the per-file diffs.

        Returns:
            A ``ParsedLLMResponse`` built from the unmodified input text, the
            commit message and one ``FileEdit`` per ``diff --git`` section.
        """
        git_root = GIT_ROOT.get()

        # Input that starts directly with a file diff has no preceding newline for the
        # split below to match, so give it an (empty) preamble.
        text = "\n" + git_diff if git_diff.startswith("diff --git ") else git_diff
        # This is safe because actual code is prepended with ' ', + or -.
        preamble, *file_diffs = text.split("\ndiff --git ")

        # Use commit message for conversation
        commit_message = self._extract_commit_message(preamble)

        file_edits: List[FileEdit] = []
        for diff in file_diffs:
            # Everything before the first hunk is git's per-file header. Code lines are
            # prefixed, so "\n@@" can only ever be the start of a hunk header.
            header, *hunks = diff.split("\n@@")
            header_lines = header.split("\n")

            # Only inspect the header: the code itself may contain these phrases.
            is_creation = any(
                line.startswith("new file mode") for line in header_lines
            )
            is_deletion = any(
                line.startswith("deleted file mode") for line in header_lines
            )

            # First header line is "a/<old path> b/<new path>"; drop the "a/" / "b/" prefix.
            names = header_lines[0].split()
            # Anchor at the git root before resolving; resolving first would make the
            # path absolute relative to the current directory and discard git_root.
            start_file_name = (git_root / Path(names[0][2:])).resolve()
            end_file_name = (git_root / Path(names[1][2:])).resolve()
            new_name = end_file_name if start_file_name != end_file_name else None

            file_edit = FileEdit(
                start_file_name,
                [],
                is_creation=is_creation,
                is_deletion=is_deletion,
                rename_file_path=new_name,
            )
            # A deleted file needs no line-level replacements.
            if not is_deletion:
                for replacement in self._parse_hunks(hunks):
                    file_edit.replacements.append(replacement)

            file_edits.append(file_edit)

        return ParsedLLMResponse(git_diff, commit_message, file_edits)

    @staticmethod
    def _extract_commit_message(preamble: str) -> str:
        """Return the dedented commit message found after the first blank line.

        Returns an empty string when ``preamble`` contains no blank line.
        """
        _, _, message = preamble.partition("\n\n")
        # Dedent before stripping so every line loses git's indentation, not just the first.
        return dedent(message).strip()

    @staticmethod
    def _parse_hunks(hunks: List[str]) -> List[Replacement]:
        """Convert hunks (text following each leading ``@@``) into replacements.

        Each hunk looks like ``" -1,11 +1,9 @@ optional heading\\n 1\\n-2\\n..."``.
        Line numbers in the result are zero-based with an exclusive end and refer
        to the old file. Hunks without any changed line are skipped.
        """
        replacements: List[Replacement] = []
        for hunk in hunks:
            # Split once only: the code in the hunk body may itself contain "@@".
            line_info, _, remainder = hunk.partition("@@")

            # Old-file range is "-<start>[,<count>]"; git omits the count when it is 1.
            first_old_line, _, old_count = line_info.split()[0][1:].partition(",")
            start_line = int(first_old_line) - 1
            end_line = start_line + (int(old_count) if old_count else 1)
            # NOTE(review): for a pure insertion (old count 0, e.g. "-0,0" on a new
            # file) this yields start == end == <start> - 1; confirm consumers expect it.

            # The rest of the header line is either empty or git's optional section
            # heading, never code, so it is always dropped. Lines starting with a
            # backslash ("\ No newline at end of file") are markers, not content.
            code_lines = [
                line
                for line in remainder.split("\n")[1:]
                if not line.startswith("\\")
            ]
            if code_lines and code_lines[-1] == "":
                code_lines.pop()

            # Git diff gives context for human readability we don't want to train the llm
            # to produce.
            leading_context = 0
            for line in code_lines:
                if not line.startswith(" "):
                    break
                leading_context += 1
            if leading_context == len(code_lines):
                # Nothing but context (or nothing at all): no change to record.
                continue
            trailing_context = 0
            for line in reversed(code_lines):
                if not line.startswith(" "):
                    break
                trailing_context += 1

            changed = code_lines[leading_context : len(code_lines) - trailing_context]
            # Keep added and interior context lines (minus their one-character prefix);
            # removed lines do not appear in the new content.
            new_lines = [line[1:] for line in changed if not line.startswith("-")]

            replacements.append(
                Replacement(
                    start_line + leading_context,
                    end_line - trailing_context,
                    new_lines,
                )
            )
        return replacements
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
Fifth commit | ||
|
||
@@start | ||
{ | ||
"file": "a", | ||
"action": "replace", | ||
"start-line": 2, | ||
"end-line": 11 | ||
} | ||
@@code | ||
3 | ||
4 | ||
5 | ||
6 | ||
0 | ||
10 | ||
11 | ||
12 | ||
@@end | ||
@@start | ||
{ | ||
"file": "d", | ||
"action": "delete-file" | ||
} | ||
@@end | ||
@@start | ||
{ | ||
"file": "c", | ||
"action": "rename-file", | ||
"name": "e" | ||
} | ||
@@end | ||
@@start | ||
{ | ||
"file": "c", | ||
"action": "delete", | ||
"start-line": 8, | ||
"end-line": 8 | ||
} | ||
@@end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
commit 07aa8bae2d4170767b427a56e044f393e86b462f | ||
Author: Jake Koenig <[email protected]> | ||
Date: Thu Oct 19 07:28:08 2023 -0700 | ||
|
||
Fifth commit | ||
|
||
diff --git a/a b/a | ||
index ec588a0..248c392 100644 | ||
--- a/a | ||
+++ b/a | ||
@@ -1,11 +1,9 @@ | ||
1 | ||
-2 | ||
3 | ||
4 | ||
5 | ||
6 | ||
-7 | ||
-8 | ||
-9 | ||
0 | ||
- | ||
+10 | ||
+11 | ||
+12 | ||
diff --git a/d b/d | ||
deleted file mode 100644 | ||
index ec588a0..0000000 | ||
--- a/d | ||
+++ /dev/null | ||
@@ -1,11 +0,0 @@ | ||
-1 | ||
-2 | ||
-3 | ||
-4 | ||
-5 | ||
-6 | ||
-7 | ||
-8 | ||
-9 | ||
-0 | ||
- | ||
diff --git a/c b/e | ||
similarity index 90% | ||
rename from c | ||
rename to e | ||
index ec588a0..720a0a7 100644 | ||
--- a/c | ||
+++ b/e | ||
@@ -5,7 +5,6 @@ | ||
5 | ||
6 | ||
7 | ||
-8 | ||
9 | ||
0 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
Fifth commit | ||
|
||
@ a starting_line=2 ending_line=11 | ||
3 | ||
4 | ||
5 | ||
6 | ||
0 | ||
10 | ||
11 | ||
12 | ||
@ | ||
@ d - | ||
@ c e | ||
@ c starting_line=8 ending_line=8 | ||
@ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.