Skip to content

Commit

Permalink
Fix #89, improve a corner case check in which stray quotes will lead …
Browse files Browse the repository at this point in the history
…the parser astray
  • Loading branch information
mangiucugna committed Dec 26, 2024
1 parent 9a2005c commit 465e568
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 29 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "json_repair"
version = "0.33.0"
version = "0.34.0"
license = {file = "LICENSE"}
authors = [
{ name="Stefano Baccianella", email="[email protected]" },
Expand Down
91 changes: 65 additions & 26 deletions src/json_repair/json_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@


class JSONParser:
# Constants
STRING_DELIMITERS = ['"', "'", "“", "”"]

def __init__(
self,
json_str: Union[str, StringFileWrapper],
Expand Down Expand Up @@ -89,7 +92,9 @@ def parse_json(
)
return ""
# <string> starts with a quote
elif not self.context.empty and (char in ['"', "'", "“"] or char.isalpha()):
elif not self.context.empty and (
char in self.STRING_DELIMITERS or char.isalpha()
):
return self.parse_string()
# <number> starts with [0-9] or minus
elif not self.context.empty and (
Expand Down Expand Up @@ -130,6 +135,8 @@ def parse_object(self) -> Dict[str, JSONReturnType]:
# <member> starts with a <string>
key = ""
while self.get_char_at():
# The rollback index needs to be updated here in case the key is empty
rollback_index = self.index
key = str(self.parse_string())

if key != "" or (key == "" and self.get_char_at() == ":"):
Expand All @@ -140,6 +147,12 @@ def parse_object(self) -> Dict[str, JSONReturnType]:
"While parsing an object we found a duplicate key, closing the object here and rolling back the index",
)
self.index = rollback_index - 1
# add an opening curly brace to make this work
self.json_str = (
self.json_str[: self.index + 1]
+ "{"
+ self.json_str[self.index + 1 :]
)
break

# Skip filler whitespaces
Expand Down Expand Up @@ -227,7 +240,7 @@ def parse_string(self) -> Union[str, bool, None]:

char = self.get_char_at()
# A valid string can only start with a valid quote or, in our case, with a literal
while char and char not in ['"', "'", "“"] and not char.isalnum():
while char and char not in self.STRING_DELIMITERS and not char.isalnum():
self.index += 1
char = self.get_char_at()

Expand Down Expand Up @@ -262,35 +275,61 @@ def parse_string(self) -> Union[str, bool, None]:
if not missing_quotes:
self.index += 1

self.skip_whitespaces_at()
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
if self.get_char_at() == lstring_delimiter:
# If it's an empty key, this was easy
if (
self.context.current == ContextValues.OBJECT_KEY
and self.get_char_at(1) == ":"
):
self.index += 1
return ""
# Find the next delimiter
i = self.skip_to_character(character=rstring_delimiter, idx=1)
next_c = self.get_char_at(i)
# Now check that the next character is also a delimiter to ensure that we have "".....""
# In that case we ignore this rstring delimiter
if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
self.log(
"While parsing a string, we found a valid starting doubled quote, ignoring it",
)
doubled_quotes = True
self.index += 1
else:
# Ok this is not a doubled quote, check if this is an empty string or not
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
if self.get_char_at() in self.STRING_DELIMITERS:
# If the next character is the same type of quote, then we manage it as double quotes
if self.get_char_at() == lstring_delimiter:
# If it's an empty key, this was easy
if (
self.context.current == ContextValues.OBJECT_KEY
and self.get_char_at(1) == ":"
):
self.index += 1
return ""
if self.get_char_at(1) == lstring_delimiter:
# There's something fishy about this, we found doubled quotes and then again quotes
self.log(
"While parsing a string, we found a doubled quote and then a quote again, ignoring it",
)
return ""
# Find the next delimiter
i = self.skip_to_character(character=rstring_delimiter, idx=1)
next_c = self.get_char_at(i)
if next_c not in [",", "]", "}"]:
# Now check that the next character is also a delimiter to ensure that we have "".....""
# In that case we ignore this rstring delimiter
if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
self.log(
"While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
"While parsing a string, we found a valid starting doubled quote",
)
doubled_quotes = True
self.index += 1
else:
# Ok this is not a doubled quote, check if this is an empty string or not
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
next_c = self.get_char_at(i)
if next_c in self.STRING_DELIMITERS + ["{", "["]:
# something fishy is going on here
self.log(
"While parsing a string, we found a doubled quote but also another quote afterwards, ignoring it",
)
self.index += 1
return ""
elif next_c not in [",", "]", "}"]:
self.log(
"While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
)
self.index += 1
else:
# Otherwise we need to do another check before continuing
i = self.skip_to_character(character=rstring_delimiter, idx=1)
next_c = self.get_char_at(i)
if not next_c:
# mmmm that delimiter never appears again, this is a mistake
self.log(
"While parsing a string, we found a quote but it was a mistake, ignoring it",
)
return ""

# Initialize our return value
string_acc = ""
Expand Down
21 changes: 21 additions & 0 deletions src/json_repair/string_file_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,24 @@ def __len__(self) -> int:
self.length = self.fd.tell()
self.fd.seek(current_position)
return self.length

def __setitem__(self, index: Union[int, slice], value: str) -> None:
    """
    Write `value` into the underlying file starting at `index`.

    The text is written with a single `fd.write` call at the resolved
    offset; it is not an insertion and the length of `value` is not checked
    against the width of the slice. The file position that was current
    before the call is restored afterwards, even if seeking or writing
    fails.

    Args:
        index (int | slice): Offset to start writing at. For a slice only
            `start` is used (`stop` and `step` are ignored); a missing
            start means 0. A negative offset is counted from the end of
            the file using `len(self)`.
        value (str): The text to write at the resolved offset.
    """
    # Both index flavours collapse to one start offset; None (open-ended
    # slice) is treated as the beginning of the file.
    start = index.start if isinstance(index, slice) else index
    start = start or 0

    # Negative offsets are relative to the end, like regular sequences.
    if start < 0:
        start += len(self)

    # NOTE(review): for text-mode files Python only guarantees seek() offsets
    # that came from tell() (or 0) -- confirm arbitrary character offsets are
    # valid for the files this wrapper is used with.
    current_position = self.fd.tell()
    try:
        self.fd.seek(start)
        self.fd.write(value)
    finally:
        # Always put the cursor back so a failed write cannot derail
        # whatever was reading from the file before this call.
        self.fd.seek(current_position)
4 changes: 2 additions & 2 deletions tests/test_json_repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,14 @@ def test_object_edge_cases():
assert repair_json('{"lorem": ipsum, sic, datum.",}') == '{"lorem": "ipsum, sic, datum."}'
assert repair_json('{"lorem": sic tamet. "ipsum": sic tamet, quick brown fox. "sic": ipsum}') == '{"lorem": "sic tamet.", "ipsum": "sic tamet", "sic": "ipsum"}'
assert repair_json('{"lorem_ipsum": "sic tamet, quick brown fox. }') == '{"lorem_ipsum": "sic tamet, quick brown fox."}'
assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", " key2": "value2"}'
assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", "key2": "value2"}'
assert repair_json('{"key":value "key2":"value2" }') == '{"key": "value", "key2": "value2"}'
assert repair_json("{'text': 'words{words in brackets}more words'}") == '{"text": "words{words in brackets}more words"}'
assert repair_json('{text:words{words in brackets}}') == '{"text": "words{words in brackets}"}'
assert repair_json('{text:words{words in brackets}m}') == '{"text": "words{words in brackets}m"}'
assert repair_json('{"key": "value, value2"```') == '{"key": "value, value2"}'
assert repair_json('{key:value,key2:value2}') == '{"key": "value", "key2": "value2"}'
assert repair_json('[{"lorem": {"ipsum": "sic"}, "lorem": {"ipsum": "sic"}]') == '[{"lorem": {"ipsum": "sic"}}, "lorem", {"ipsum": "sic"}]'
assert repair_json('[{"lorem": {"ipsum": "sic"}, """" "lorem": {"ipsum": "sic"}]') == '[{"lorem": {"ipsum": "sic"}}, {"lorem": {"ipsum": "sic"}}]'

def test_number_edge_cases():
assert repair_json(' - { "test_key": ["test_value", "test_value2"] }') == '{"test_key": ["test_value", "test_value2"]}'
Expand Down

0 comments on commit 465e568

Please sign in to comment.