Skip to content

Commit

Permalink
Merge pull request ishepard#263 from ishepard/improve_modified_files
Browse files: browse the repository at this point in the history
Improve modified files
  • Loading branch information
ishepard authored Apr 21, 2023
2 parents cbd8355 + 778c99f commit 241edcb
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 116 deletions.
115 changes: 55 additions & 60 deletions pydriller/domain/commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,22 +154,14 @@ class ModifiedFile:

def __init__(
self,
old_path: Optional[str],
new_path: Optional[str],
change_type: ModificationType,
diff_and_content: Dict[str, Any],
diff: Diff,
):
"""
Initialize a modified file. A modified file carries on information
regarding the changed file. Normally, you shouldn't initialize a new
one.
"""
self._old_path = Path(old_path) if old_path is not None else None
self._new_path = Path(new_path) if new_path is not None else None
self.change_type = change_type
self.diff: str = diff_and_content["diff"]
self.content: Optional[bytes] = diff_and_content["content"]
self.content_before: Optional[bytes] = diff_and_content["content_before"]
self._c_diff = diff

self._nloc = None
self._complexity = None
Expand All @@ -187,6 +179,49 @@ def __hash__(self) -> int:
string = f"{self.change_type.name} {self.new_path} {self.content!r}"
return hash(hashlib.sha256(string.encode("utf-8")).hexdigest())

@property
def change_type(self) -> ModificationType:
    """Modification type computed on demand from the wrapped diff object."""
    wrapped_diff = self._c_diff
    return self._from_change_to_modification_type(wrapped_diff)

@staticmethod
def _from_change_to_modification_type(diff: Diff) -> ModificationType:
    """Translate the flags carried by ``diff`` into a ``ModificationType``.

    The checks run in a fixed order and the first match wins: new file,
    deleted file, renamed file, then a content change (both blobs are
    truthy and compare unequal). Anything else maps to ``UNKNOWN``.

    :param diff: the diff object to classify
    :return: the matching ``ModificationType`` member
    """
    if diff.new_file:
        kind = ModificationType.ADD
    elif diff.deleted_file:
        kind = ModificationType.DELETE
    elif diff.renamed_file:
        kind = ModificationType.RENAME
    elif diff.a_blob and diff.b_blob and diff.a_blob != diff.b_blob:
        kind = ModificationType.MODIFY
    else:
        kind = ModificationType.UNKNOWN
    return kind

@property
def diff(self) -> str:
    """Diff text as ``str``; ``''`` when the decoding helper yields nothing."""
    decoded = self._get_decoded_str(self._c_diff.diff)
    return decoded if decoded else ''

def _get_decoded_str(self, diff: Union[str, bytes, None]) -> Optional[str]:
    """Return ``diff`` as text.

    ``str`` input is returned unchanged. ``bytes`` input is decoded as
    UTF-8, silently dropping undecodable bytes. Any other value
    (including ``None``) yields ``None``.

    ``isinstance`` is used instead of an exact ``type(...) ==`` comparison
    so that subclasses of ``str``/``bytes`` are handled as well.

    :param diff: raw diff payload, as text, bytes or ``None``
    :return: the diff as ``str``, or ``None`` when it cannot be produced
    """
    if isinstance(diff, str):
        return diff
    if not isinstance(diff, bytes):
        return None
    try:
        # "ignore" drops invalid byte sequences instead of raising.
        return diff.decode("utf-8", "ignore")
    except (AttributeError, ValueError):
        # Defensive: kept from the original so a decoding problem is
        # logged rather than propagated to the caller.
        logger.debug("Could not load the diff of file %s", self.filename)
        return None

@property
def content(self) -> Optional[bytes]:
    """Undecoded bytes read from the diff's ``b_blob``, or ``None`` without one."""
    blob_b = self._c_diff.b_blob
    return self._get_undecoded_content(blob_b)

@property
def content_before(self) -> Optional[bytes]:
    """Undecoded bytes read from the diff's ``a_blob``, or ``None`` without one."""
    blob_a = self._c_diff.a_blob
    return self._get_undecoded_content(blob_a)

def _get_undecoded_content(self, blob: Optional[IndexObject]) -> Optional[bytes]:
    """Return the result of ``blob.data_stream.read()``, or ``None`` if ``blob`` is ``None``."""
    if blob is None:
        return None
    return blob.data_stream.read()

@property
def source_code(self) -> Optional[str]:
if self.content and type(self.content) == bytes:
Expand Down Expand Up @@ -234,8 +269,8 @@ def old_path(self) -> Optional[str]:
:return: str old_path
"""
if self._old_path is not None:
return str(self._old_path)
if self._c_diff.a_path:
return str(Path(self._c_diff.a_path))
return None

@property
Expand All @@ -245,8 +280,8 @@ def new_path(self) -> Optional[str]:
:return: str new_path
"""
if self._new_path is not None:
return str(self._new_path)
if self._c_diff.b_path:
return str(Path(self._c_diff.b_path))
return None

@property
Expand All @@ -258,13 +293,13 @@ def filename(self) -> str:
:return: str filename
"""
if self._new_path is not None and str(self._new_path) != "/dev/null":
path = self._new_path
if self.new_path is not None and self.new_path != "/dev/null":
path = self.new_path
else:
assert self._old_path
path = self._old_path
assert self.old_path
path = self.old_path

return path.name
return Path(path).name

@property
def language_supported(self) -> bool:
Expand Down Expand Up @@ -718,39 +753,12 @@ def modified_files(self) -> List[ModifiedFile]:
def _parse_diff(self, diff_index: List[Diff]) -> List[ModifiedFile]:
modified_files_list = []
for diff in diff_index:
old_path = diff.a_path
new_path = diff.b_path
change_type = self._from_change_to_modification_type(diff)

diff_and_content = {
"diff": self._get_decoded_str(diff.diff),
"content_before": self._get_undecoded_content(diff.a_blob),
"content": self._get_undecoded_content(diff.b_blob),
}

modified_files_list.append(
ModifiedFile(old_path, new_path, change_type, diff_and_content)
ModifiedFile(diff=diff)
)

return modified_files_list

def _get_decoded_str(self, diff: Union[str, bytes, None]) -> Optional[str]:
    """Return ``diff`` as text.

    ``str`` input is returned unchanged. ``bytes`` input is decoded as
    UTF-8, silently dropping undecodable bytes. Any other value
    (including ``None``) yields ``None``.

    ``isinstance`` is used instead of an exact ``type(...) ==`` comparison
    so that subclasses of ``str``/``bytes`` are handled as well.

    :param diff: raw diff payload, as text, bytes or ``None``
    :return: the diff as ``str``, or ``None`` when it cannot be produced
    """
    if isinstance(diff, str):
        return diff
    if not isinstance(diff, bytes):
        return None
    try:
        # "ignore" drops invalid byte sequences instead of raising.
        return diff.decode("utf-8", "ignore")
    except (AttributeError, ValueError):
        # Defensive: kept from the original so a decoding problem is
        # logged rather than propagated to the caller.
        logger.debug(
            "Could not load the diff of a file in commit %s",
            self._c_object.hexsha,
        )
        return None

def _get_undecoded_content(self, blob: Optional[IndexObject]) -> Optional[bytes]:
    """Read and return the blob's data stream; ``None`` is passed through unchanged."""
    if blob is None:
        return None
    stream = blob.data_stream
    return stream.read()

@property
def in_main_branch(self) -> bool:
"""
Expand Down Expand Up @@ -905,19 +913,6 @@ def _good_change_proportion(

return proportion

@staticmethod
def _from_change_to_modification_type(diff: Diff) -> ModificationType:
    """Classify ``diff`` as a ``ModificationType``.

    First match wins, in this order: ``new_file`` -> ADD, ``deleted_file``
    -> DELETE, ``renamed_file`` -> RENAME, both blobs truthy and unequal
    -> MODIFY. When nothing matches the result is UNKNOWN.

    :param diff: the diff object to classify
    :return: the matching ``ModificationType`` member
    """
    if diff.new_file:
        result = ModificationType.ADD
    elif diff.deleted_file:
        result = ModificationType.DELETE
    elif diff.renamed_file:
        result = ModificationType.RENAME
    elif diff.a_blob and diff.b_blob and diff.a_blob != diff.b_blob:
        result = ModificationType.MODIFY
    else:
        result = ModificationType.UNKNOWN
    return result

def __eq__(self, other: object) -> bool:
if not isinstance(other, Commit):
return NotImplemented
Expand Down
1 change: 1 addition & 0 deletions pydriller/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ def _calculate_last_commits(self, commit: Commit,
path = mod.new_path
if mod.change_type == ModificationType.RENAME or mod.change_type == ModificationType.DELETE:
path = mod.old_path

deleted_lines = mod.diff_parsed['deleted']

assert path is not None, "We could not find the path to the file"
Expand Down
2 changes: 2 additions & 0 deletions test-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
mock
types-mock
pytest
psutil
2 changes: 1 addition & 1 deletion tests/integration/test_commit_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def test_filepath_with_since():


def test_since_as_filter():
since_as_filter = datetime(2018, 6, 6, 0, 0, 0, tzinfo=timezone.utc)
since_as_filter = datetime(2018, 6, 6, tzinfo=timezone.utc)

assert len(list(Repository(
path_to_repo='test-repos/since_as_filter',
Expand Down
88 changes: 33 additions & 55 deletions tests/test_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@
# limitations under the License.
from pydriller.git import Git
from pathlib import Path
from mock import patch
import pytest
import logging

from pydriller.domain.commit import ModifiedFile, ModificationType
from pydriller.domain.commit import ModifiedFile

logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
Expand All @@ -40,41 +41,29 @@ def test_equal(repo: Git):
assert c1 != c3


def test_filename():
diff_and_sc = {
'diff': '',
'content': b'',
'content_before': b''
}
m1 = ModifiedFile('dspadini/pydriller/myfile.py',
'dspadini/pydriller/mynewfile.py',
ModificationType.ADD, diff_and_sc)
m3 = ModifiedFile('dspadini/pydriller/myfile.py',
'dspadini/pydriller/mynewfile.py',
ModificationType.ADD, diff_and_sc)
m2 = ModifiedFile('dspadini/pydriller/myfile.py',
None,
ModificationType.ADD, diff_and_sc)
@patch('git.diff.Diff')
def test_filename(mocked_diff):
mocked_diff.a_path = 'dspadini/pydriller/myfile.py'
mocked_diff.b_path = 'dspadini/pydriller/mynewfile.py'

m1 = ModifiedFile(mocked_diff)

assert m1.filename == 'mynewfile.py'
assert m2.filename == 'myfile.py'
assert m1 != m2
assert m3 == m1

assert m1.new_path == str(Path('dspadini/pydriller/mynewfile.py'))
assert m1.old_path == str(Path('dspadini/pydriller/myfile.py'))


def test_metrics_python():
@patch('git.diff.Diff')
def test_metrics_python(mocked_diff):
with open('test-repos/lizard/git_repository.py', 'rb') as f:
content = f.read()

diff_and_sc = {
'diff': '',
'content': content,
'content_before': content
}
mocked_diff.a_path = 'test-repos/lizard/git_repository.py'
mocked_diff.b_path = "test-repos/lizard/git_repository.py"
mocked_diff.b_blob.data_stream.read.return_value = content

m1 = ModifiedFile('test-repos/lizard/git_repository.py',
"test-repos/lizard/git_repository.py",
ModificationType.MODIFY, diff_and_sc)
m1 = ModifiedFile(mocked_diff)

assert m1.nloc == 196
assert m1.token_count == 1009
Expand Down Expand Up @@ -136,19 +125,16 @@ def test_changed_methods():
assert len(mod.changed_methods) == 3


def test_metrics_cpp():
@patch('git.diff.Diff')
def test_metrics_cpp(mocked_diff):
with open('test-repos/lizard/FileCPP.cpp', 'rb') as f:
content = f.read()

diff_and_sc = {
'diff': '',
'content': content,
'content_before': content
}
mocked_diff.a_path = 'test-repos/lizard/FileCPP.cpp'
mocked_diff.b_path = "test-repos/lizard/FileCPP.cpp"
mocked_diff.b_blob.data_stream.read.return_value = content

m1 = ModifiedFile('test-repos/lizard/FileCPP.cpp',
"test-repos/lizard/FileCPP.cpp",
ModificationType.MODIFY, diff_and_sc)
m1 = ModifiedFile(mocked_diff)

assert m1.nloc == 793
assert m1.token_count == 5564
Expand All @@ -157,19 +143,16 @@ def test_metrics_cpp():
assert len(m1.methods) == 16


def test_metrics_java():
@patch('git.diff.Diff')
def test_metrics_java(mocked_diff):
with open('test-repos/lizard/FileJava.java', 'rb') as f:
content = f.read()

diff_and_sc = {
'diff': '',
'content': content,
'content_before': content
}
mocked_diff.a_path = 'test-repos/lizard/FileJava.java'
mocked_diff.b_path = "test-repos/lizard/FileJava.java"
mocked_diff.b_blob.data_stream.read.return_value = content

m1 = ModifiedFile('test-repos/lizard/FileJava.java',
"test-repos/lizard/FileJava.java",
ModificationType.MODIFY, diff_and_sc)
m1 = ModifiedFile(mocked_diff)

assert m1.nloc == 466
assert m1.token_count == 3809
Expand All @@ -178,18 +161,13 @@ def test_metrics_java():
assert len(m1.methods) == 46


def test_metrics_not_supported_file():
@patch('git.diff.Diff')
def test_metrics_not_supported_file(mocked_diff):
content = b'asd !&%@*&^@\n jjdkj'

diff_and_sc = {
'diff': '',
'content': content,
'content_before': content
}
mocked_diff.b_blob.data_stream.read.return_value = content

m1 = ModifiedFile('test-repos/lizard/NotSupported.pdf',
"test-repos/lizard/NotSupported.pdf",
ModificationType.MODIFY, diff_and_sc)
m1 = ModifiedFile(mocked_diff)

assert m1.nloc is None

Expand Down
4 changes: 4 additions & 0 deletions tests/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def modification(request):
[("test-repos/diff", "9a985d4a12a3a12f009ef39750fd9b2187b766d1")],
indirect=True)
def test_extract_line_number_and_content(modification: ModifiedFile):
assert modification.diff_parsed
added = modification.diff_parsed['added']
deleted = modification.diff_parsed['deleted']

Expand All @@ -42,6 +43,7 @@ def test_extract_line_number_and_content(modification: ModifiedFile):
[("test-repos/diff", "f45ee2f8976d5f018a1e4ec83eb4556a3df8b0a5")],
indirect=True)
def test_additions(modification: ModifiedFile):
assert modification.diff_parsed
added = modification.diff_parsed['added']
deleted = modification.diff_parsed['deleted']

Expand All @@ -58,6 +60,7 @@ def test_additions(modification: ModifiedFile):
[("test-repos/diff", "147c7ce9f725a0e259d63f0bf4e6c8ac085ff8c8")],
indirect=True)
def test_deletions(modification: ModifiedFile):
assert modification.diff_parsed
added = modification.diff_parsed['added']
deleted = modification.diff_parsed['deleted']

Expand Down Expand Up @@ -86,6 +89,7 @@ def test_diff_no_newline(modification: ModifiedFile):
\\ No newline at end of file
in diffs. This test asserts these additional lines are parsed correctly.
"""
assert modification.diff_parsed
added = modification.diff_parsed['added']
deleted = modification.diff_parsed['deleted']

Expand Down
1 change: 1 addition & 0 deletions tests/test_git.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ def test_should_detail_a_commit(repo: Git):
assert len(commit.modified_files) == 1

assert commit.modified_files[0].new_path == "Matricula.java"
assert commit.modified_files[0].diff
assert commit.modified_files[0].diff.startswith("@@ -0,0 +1,62 @@\n+package model;") is True
assert commit.modified_files[0].content is not None
assert commit.modified_files[0].content.decode().startswith("package model;") is True
Expand Down

0 comments on commit 241edcb

Please sign in to comment.