[lazylex/html] Tag lexer doesn't ignore trailing data before > or />

Now we need to lex quoted values.

This also exposed other errors:

- need to allow single quoted strings
- unquoted value regex is too strict

There are some errors in Soil too:

- unclosed tags at the end
oils-for-unix · Jan 12, 2025 · 8af7e9c · 8af7e9c
1 parent 06efda8
commit 8af7e9c
Show file tree

Hide file tree

Showing 5 changed files with 52 additions and 14 deletions.
diff --git a/doctools/cmark.sh b/doctools/cmark.sh
@@ -102,7 +102,7 @@ demo-quirks() {
 
   export PYTHONPATH=.
 
-  cmark-py <<'EOF'
+  cmark-py --common-mark <<'EOF'
 1. what `<table>`
 EOF
 
@@ -162,7 +162,21 @@ EOF
 1. The Markdown translator produces a `<table> <ul> <li> ... </li> </ul>
    </table>` structure.
 EOF
+}
+
+demo-htm8() {
+  ### Cases that came from developing HTM8
+
+  export PYTHONPATH=.
 
+  cmark-py --common-mark <<'EOF'
+[bash]($xref:bash)
+
+[other][]
+
+[other]: $xref
+
+EOF
 }
 
 "$@"
diff --git a/doctools/oils_doc_test.py b/doctools/oils_doc_test.py
@@ -26,9 +26,9 @@ def testTopicCssClass(self):
 
     def testExpandLinks(self):
         """
-        <a href=$xref:bash>bash</a>
+        <a href="$xref:bash">bash</a>
         ->
-        <a href=/cross-ref?tag=bash#bash>
+        <a href="/cross-ref?tag=bash#bash">
 
         NOTE: THIs could really be done with a ref like <a.*href="(.*)">
         But we're testing it

diff --git a/lazylex/html.py b/lazylex/html.py
@@ -443,16 +443,19 @@ def ValidTokenList(s, no_special_tags=False):
 # Allow - for td-attrs
 
 # allow underscore/hyphen.  what about colons, like _NAME?
-# what about href=$foo ?
 _UNQUOTED_VALUE = r'[a-zA-Z0-9_\-]+'
 
 # TODO: we don't need to capture the tag name here?  That's done at the top
 # level
 _TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)
 
+_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)
+
 # To match href="foo"
 # Note: in HTML5 and XML, single quoted attributes are also valid
 
+# <button disabled> is standard usage
+
 _ATTR_RE = re.compile(
     r'''
 \s+                     # Leading whitespace is required
@@ -577,13 +580,15 @@ def Tokens(self):
         yield TagName, m.start(1), m.end(1)
 
         pos = m.end(0)
+        #log('POS %d', pos)
 
         while True:
             # don't search past the end
             m = _ATTR_RE.match(self.s, pos, self.end_pos)
             if not m:
-                # A validating parser would check that > or /> is next -- there's no junk
+                #log('BREAK pos %d', pos)
                 break
+            #log('AttrName %r', m.group(1))
 
             yield AttrName, m.start(1), m.end(1)
 
@@ -596,6 +601,14 @@ def Tokens(self):
             # Skip past the "
             pos = m.end(0)
 
+        #log('TOK %r', self.s)
+
+        m = _TAG_LAST_RE.match(self.s, pos)
+        #log('_TAG_LAST_RE match %r', self.s[pos:])
+        if not m:
+            # Extra data at end of tag.  TODO: add messages for all these.
+            raise LexError(self.s, pos)
+
 
 def ReadUntilStartTag(it, tag_lexer, tag_name):
     """Find the next <foo>, returning its (start, end) positions

diff --git a/lazylex/html_test.py b/lazylex/html_test.py
@@ -104,6 +104,17 @@ def testAllAttrs(self):
         lex = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
         self.assertEqual([('href', '?foo=1&amp;bar=2')], lex.AllAttrsRaw())
 
+    def testAttrWithoutValue(self):
+        # equivalent to <button disabled="">
+        lex = _MakeTagLexer('<button disabled>')
+        all_attrs = lex.AllAttrsRaw()
+        log('all %s', all_attrs)
+
+        return
+        lex = _MakeTagLexer('<a foo=bar !></a>')
+        all_attrs = lex.AllAttrsRaw()
+        log('all %s', all_attrs)
+
 
 def Lex(h, no_special_tags=False):
     print(repr(h))
@@ -285,7 +296,7 @@ def testInvalid(self):
             except html.LexError as e:
                 print(e)
             else:
-                self.fail('Expected LexError')
+                self.fail('Expected LexError %r' % s)
 
 
 INVALID_LEX = [
@@ -299,9 +310,6 @@ def testInvalid(self):
     '<!-- unfinished comment',
     '<? unfinished processing',
     '</div bad=attr> <a> <b>',
-    # TODO: we should match up to > or />
-    #'<a foo=bar !></a>',  # bad attr
-    #'<a zz></a>',  # this is not invalid?
 
     # TODO: should be escaped, invalid in XML
     #'<a href="&"></a>',
@@ -317,13 +325,15 @@ def testInvalid(self):
 VALID_PARSE = [
     '<b><a href="foo">link</a></b>',
     '<meta><a></a>',
+    # attribute without a value
+    '<button disabled></button>',
+
     # TODO: capitalization should be allowed
     #'<META><a></a>',
 
     # TODO:
     #'<a foo="&"></a>',  # bad attr
     #'<a foo=bar !></a>',  # bad attr
-    #'<a zz></a>',  # bad attr
 
     # TODO: Test <svg> and <math> ?
 ]
@@ -332,12 +342,16 @@ def testInvalid(self):
     '<meta></meta>',
 ]
 
+INVALID_TAG_LEX = [
+    '<a foo=bar !></a>',  # bad attr
+]
+
 
 class ValidateTest(unittest.TestCase):
 
     def testInvalid(self):
         counters = html.Counters()
-        for s in INVALID_LEX:
+        for s in INVALID_LEX + INVALID_TAG_LEX:
             try:
                 html.Validate(s, html.BALANCED_TAGS, counters)
             except html.LexError as e:

diff --git a/lazylex/testdata.html b/lazylex/testdata.html
@@ -26,9 +26,6 @@
 
       <a href="$xref:bash">Link</a>
 
-      <!-- HTML is pretty liberal about unquoted strings -->
-      <a href=$xref:bash>Link</a>
-
       <a href="$blog-tag:oil-release">Link</a>
 
     </p>