Skip to content

Commit

Permalink
[lazylex/html] Tag lexer doesn't ignore trailing data before > or />
Browse files Browse the repository at this point in the history
Now we need to lex quoted values.

This also exposed other errors

- need to allow single quoted strings
- unquoted value regex is too strict

There are some errors in Soil too:

- unclosed tags at the end
  • Loading branch information
Andy C committed Jan 12, 2025
1 parent 06efda8 commit 8af7e9c
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 14 deletions.
16 changes: 15 additions & 1 deletion doctools/cmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ demo-quirks() {

export PYTHONPATH=.

cmark-py <<'EOF'
cmark-py --common-mark <<'EOF'
1. what `<table>`
EOF

Expand Down Expand Up @@ -162,7 +162,21 @@ EOF
1. The Markdown translator produces a `<table> <ul> <li> ... </li> </ul>
</table>` structure.
EOF
}

demo-htm8() {
### Cases that came from developing HTM8

export PYTHONPATH=.

cmark-py --common-mark <<'EOF'
[bash]($xref:bash)
[other][]
[other]: $xref
EOF
}

"$@"
4 changes: 2 additions & 2 deletions doctools/oils_doc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ def testTopicCssClass(self):

def testExpandLinks(self):
"""
<a href=$xref:bash>bash</a>
<a href="$xref:bash">bash</a>
->
<a href=/cross-ref?tag=bash#bash>
<a href="/cross-ref?tag=bash#bash">
NOTE: This could really be done with a regex like <a.*href="(.*)">
But we're testing it
Expand Down
17 changes: 15 additions & 2 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,16 +443,19 @@ def ValidTokenList(s, no_special_tags=False):
# Allow - for td-attrs

# allow underscore/hyphen. what about colons, like _NAME?
# what about href=$foo ?
_UNQUOTED_VALUE = r'[a-zA-Z0-9_\-]+'

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

_ATTR_RE = re.compile(
r'''
\s+ # Leading whitespace is required
Expand Down Expand Up @@ -577,13 +580,15 @@ def Tokens(self):
yield TagName, m.start(1), m.end(1)

pos = m.end(0)
#log('POS %d', pos)

while True:
# don't search past the end
m = _ATTR_RE.match(self.s, pos, self.end_pos)
if not m:
# A validating parser would check that > or /> is next -- there's no junk
#log('BREAK pos %d', pos)
break
#log('AttrName %r', m.group(1))

yield AttrName, m.start(1), m.end(1)

Expand All @@ -596,6 +601,14 @@ def Tokens(self):
# Skip past the "
pos = m.end(0)

#log('TOK %r', self.s)

m = _TAG_LAST_RE.match(self.s, pos)
#log('_TAG_LAST_RE match %r', self.s[pos:])
if not m:
# Extra data at end of tag. TODO: add messages for all these.
raise LexError(self.s, pos)


def ReadUntilStartTag(it, tag_lexer, tag_name):
"""Find the next <foo>, returning its (start, end) positions
Expand Down
26 changes: 20 additions & 6 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,17 @@ def testAllAttrs(self):
lex = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
self.assertEqual([('href', '?foo=1&amp;bar=2')], lex.AllAttrsRaw())

def testAttrWithoutValue(self):
# equivalent to <button disabled="">
lex = _MakeTagLexer('<button disabled>')
all_attrs = lex.AllAttrsRaw()
log('all %s', all_attrs)

return
lex = _MakeTagLexer('<a foo=bar !></a>')
all_attrs = lex.AllAttrsRaw()
log('all %s', all_attrs)


def Lex(h, no_special_tags=False):
print(repr(h))
Expand Down Expand Up @@ -285,7 +296,7 @@ def testInvalid(self):
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')
self.fail('Expected LexError %r' % s)


INVALID_LEX = [
Expand All @@ -299,9 +310,6 @@ def testInvalid(self):
'<!-- unfinished comment',
'<? unfinished processing',
'</div bad=attr> <a> <b>',
# TODO: we should match up to > or />
#'<a foo=bar !></a>', # bad attr
#'<a zz></a>', # this is not invalid?

# TODO: should be escaped, invalid in XML
#'<a href="&"></a>',
Expand All @@ -317,13 +325,15 @@ def testInvalid(self):
VALID_PARSE = [
'<b><a href="foo">link</a></b>',
'<meta><a></a>',
# no attribute
'<button disabled></button>',

# TODO: capitalization should be allowed
#'<META><a></a>',

# TODO:
#'<a foo="&"></a>', # bad attr
#'<a foo=bar !></a>', # bad attr
#'<a zz></a>', # bad attr

# TODO: Test <svg> and <math> ?
]
Expand All @@ -332,12 +342,16 @@ def testInvalid(self):
'<meta></meta>',
]

INVALID_TAG_LEX = [
'<a foo=bar !></a>', # bad attr
]


class ValidateTest(unittest.TestCase):

def testInvalid(self):
counters = html.Counters()
for s in INVALID_LEX:
for s in INVALID_LEX + INVALID_TAG_LEX:
try:
html.Validate(s, html.BALANCED_TAGS, counters)
except html.LexError as e:
Expand Down
3 changes: 0 additions & 3 deletions lazylex/testdata.html
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@

<a href="$xref:bash">Link</a>

<!-- HTML is pretty liberal about unquoted strings -->
<a href=$xref:bash>Link</a>

<a href="$blog-tag:oil-release">Link</a>

</p>
Expand Down

0 comments on commit 8af7e9c

Please sign in to comment.