Sync for validator/cpp/htmlparser (ampproject#35136)

* Do not record num_terms for <script>, <textarea> and comment nodes. PiperOrigin-RevId: 382380916 * - Move ParseAccounting from Parser to Document, so that parser can be freed after document is parsed. - Add Validate(Document*) method so clients can supply Document object instead of html string. This will help eliminate dual parsing. - Remove unused flag, it is not used anywhere outside validator-internal.cc PiperOrigin-RevId: 382613643 * - Preprocess document url attributes during parsing and attach it as Document metadata. - Renames ParseAccounting to DocumentMetadata as we will use this structure to populate more document metadata information readily available to clients. PiperOrigin-RevId: 382811824
PHCCorso · Jul 7, 2021 · 4da7669 · 4da7669
1 parent f3b1875
commit 4da7669
Show file tree

Hide file tree

Showing 6 changed files with 212 additions and 116 deletions.
diff --git a/validator/cpp/engine/validator-internal.cc b/validator/cpp/engine/validator-internal.cc
@@ -104,10 +104,6 @@ ABSL_FLAG(int, max_node_recursion_depth, 200,
           "Maximum recursion depth of nodes, if stack of nodes grow beyond this"
           "validator will stop parsing with FAIL result");
 
-ABSL_FLAG(bool, duplicate_html_body_elements_is_error, false,
-          "If true, duplicate <html>,<body> elements is considered error for "
-          "validation purposes. (Default is to allow, leaving it to HTML5 "
-          "user-agent to treat them as per the spec).");
 ABSL_FLAG(bool, allow_module_nomodule, true,
           "If true, then script versions for module and nomdule are allowed "
           "in AMP documents. This gating should be temporary and removed "
@@ -5652,6 +5648,23 @@ class Validator {
         max_errors_(max_errors),
         context_(rules_, max_errors_) {}
 
+  ValidationResult Validate(const htmlparser::Document& doc) {
+    doc_metadata_ = doc.Metadata();
+    UpdateLineColumnIndex(doc.RootNode());
+    // The validation check for document size can't be done here since
+    // the Type Identifiers on the html tag have not been parsed yet and
+    // we wouldn't know which rule to apply. It's set to the context
+    // so that when those things are known it can be checked.
+    context_.SetDocByteSize(doc_metadata_.html_src_bytes);
+    ValidateNode(doc.RootNode());
+    auto [current_line_no, current_col_no] =
+        doc_metadata_.document_end_location;
+    context_.SetLineCol(current_line_no, current_col_no > 0 ? current_col_no - 1
+                                                            : current_col_no);
+    EndDocument();
+    return result_;
+  }
+
   ValidationResult Validate(std::string_view html) {
     Clear();
     htmlparser::ParseOptions options{
@@ -5660,11 +5673,6 @@ class Validator {
         .record_node_offsets = true,
         .record_attribute_offsets = true,
     };
-    // The validation check for document size can't be done here since
-    // the Type Identifiers on the html tag have not been parsed yet and
-    // we wouldn't know which rule to apply. It's set to the context
-    // so that when those things are known it can be checked.
-    context_.SetDocByteSize(html.length());
     auto parser = std::make_unique<htmlparser::Parser>(html, options);
     auto doc = parser->Parse();
     // Currently parser returns nullptr only if document is too complex.
@@ -5676,14 +5684,7 @@ class Validator {
       return result_;
     }
 
-    parse_accounting_ = parser->Accounting();
-    UpdateLineColumnIndex(doc->RootNode());
-    ValidateNode(doc->RootNode());
-    auto [current_line_no, current_col_no] = parser->CurrentTokenizerPosition();
-    context_.SetLineCol(current_line_no, current_col_no > 0 ? current_col_no - 1
-                                                            : current_col_no);
-    EndDocument();
-    return result_;
+    return Validate(*doc);
   }
 
   // Updates context's line column index using the current node's position.
@@ -5732,7 +5733,7 @@ class Validator {
         }
         return true;
       case htmlparser::NodeType::DOCTYPE_NODE:
-        if (parse_accounting_.quirks_mode) {
+        if (doc_metadata_.quirks_mode) {
           LineCol linecol(1, 0);
           auto lc = node->LineColInHtmlSrc();
           if (lc.has_value()) {
@@ -5974,7 +5975,7 @@ class Validator {
   const ParsedValidatorRules* rules_;
   int max_errors_ = -1;
   Context context_;
-  htmlparser::ParseAccounting parse_accounting_;
+  htmlparser::DocumentMetadata doc_metadata_;
   ValidationResult result_;
   Validator(const Validator&) = delete;
   Validator& operator=(const Validator&) = delete;
@@ -5989,4 +5990,11 @@ ValidationResult Validate(std::string_view html, HtmlFormat_Code html_format,
   return validator.Validate(html);
 }
 
+ValidationResult Validate(const htmlparser::Document& doc,
+                          HtmlFormat_Code html_format, int max_errors) {
+  Validator validator(ParsedValidatorRulesProvider::Get(html_format),
+                      max_errors);
+  return validator.Validate(doc);
+}
+
 }  // namespace amp::validator
diff --git a/validator/cpp/engine/validator.h b/validator/cpp/engine/validator.h
@@ -38,10 +38,15 @@
 #ifndef AMPVALIDATOR__VALIDATOR_H_
 #define AMPVALIDATOR__VALIDATOR_H_
 
+#include "document.h"
 #include "validator.pb.h"
 
 namespace amp::validator {
 
+ValidationResult Validate(const htmlparser::Document& document,
+                          HtmlFormat_Code html_format = HtmlFormat::AMP,
+                          int max_errors = -1);
+
 ValidationResult Validate(std::string_view html,
                           HtmlFormat_Code html_format = HtmlFormat::AMP,
                           int max_errors = -1);

diff --git a/validator/cpp/htmlparser/document.h b/validator/cpp/htmlparser/document.h
@@ -29,18 +29,68 @@ namespace htmlparser {
 class Parser;
 struct ParseOptions;
 
+// Contains pieces of information about a particular HTML parse operation.
+// Clients are expected to treat all the fields as constants, but are given the
+// flexibility of modifying for upstream error reporting.
+struct DocumentMetadata {
+ public:
+  // Tells if any of the HTML, HEAD, and/or BODY elements are generated by
+  // the Parser because they were missing or implicitly created before they
+  // are parsed.
+  // Example:
+  // Original document: <html><div>foo</div></html>
+  // Parsed document: <html><head></head><body><div>foo</div></body></html>
+  //
+  // The has_manufactured_* accounting applies only to missing
+  // TokenType::START_TAG_TOKEN.
+  // If any of the </html>, </head>, or </body> end tags are missing, parser
+  // auto closes the elements but they are not treated as manufactured from
+  // the clients perspective.
+  bool has_manufactured_html = false;
+  bool has_manufactured_head = false;
+  bool has_manufactured_body = false;
+
+  // HTML5 algorithm handles duplication of unique tags by merging them and
+  // producing a valid HTML. However, if clients are interested in knowing if
+  // the original HTML source contains duplicate elements, following bits are
+  // set.
+  bool duplicate_html_elements = false;
+  bool duplicate_body_elements = false;
+  // Set only if above duplicate bits are true.
+  std::optional<LineCol> duplicate_html_element_location = std::nullopt;
+  std::optional<LineCol> duplicate_body_element_location = std::nullopt;
+
+  // If true, parsed src is missing required <!doctype html> declaration or is
+  // invalid syntax or is XHTML 4 or legacy doctype.
+  bool quirks_mode = false;
+
+  // The line column position of the last element in the document. Useful for
+  // error reporting at the end of the document.
+  LineCol document_end_location {0, 0};
+
+  // The actual size of html src in bytes.
+  std::size_t html_src_bytes = 0;
+
+  // The document's <base> url and target.
+  // See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
+  std::pair<std::string, std::string> base_url;
+
+  // The link rel=canonical url found in the html src. If multiple link
+  // rel=canonical found the last one is recorded.
+  std::string canonical_url;
+};
+
+
 // The document class is a wrapper for the DOM tree exposed with RootNode().
 // All the nodes inside the document are owned by document. The nodes are
 // destroyed when Document objects goes out of scope or deleted.
 class Document {
  public:
   Document();
-
   ~Document() = default;
 
-  // Creates a new node. The node is owned by Document and is destroyed when
-  // document is destructed.
-  Node* NewNode(NodeType node_type, Atom atom = Atom::UNKNOWN);
+
+  const DocumentMetadata& Metadata() const { return metadata_; }
 
   // Returns the root node of a DOM tree. Node* owned by document.
   Node* RootNode() const { return root_node_; }
@@ -50,6 +100,10 @@ class Document {
   const std::vector<Node*> FragmentNodes() const { return fragment_nodes_; }
 
  private:
+  // Creates a new node. The node is owned by Document and is destroyed when
+  // document is destructed.
+  Node* NewNode(NodeType node_type, Atom atom = Atom::UNKNOWN);
+
   // Returns a new node with the same type, data and attributes.
   // The clone has no parent, no siblings and no children.
   // The node is owned by the document and is destroyed when document is
@@ -61,6 +115,8 @@ class Document {
 
   Node* root_node_;
   std::vector<Node*> fragment_nodes_{};
+  std::size_t html_src_bytes_;
+  DocumentMetadata metadata_;
 
   friend class Parser;
   friend std::unique_ptr<Document> Parse(std::string_view html);

diff --git a/validator/cpp/htmlparser/parser.cc b/validator/cpp/htmlparser/parser.cc
@@ -143,6 +143,7 @@ Parser::Parser(std::string_view html, const ParseOptions& options,
       count_num_terms_in_text_node_(options.count_num_terms_in_text_node),
       fragment_(fragment_parent != nullptr),
       context_node_(fragment_parent) {
+  document_->metadata_.html_src_bytes = html.size();
   insertion_mode_ = std::bind(&Parser::InitialIM, this);
 }
 
@@ -179,6 +180,7 @@ std::unique_ptr<Document> Parser::Parse() {
   DumpDocument(document_.get());
 #endif
 
+  document_->metadata_.document_end_location = tokenizer_->CurrentPosition();
   return std::move(document_);
 }  // End Parser::Parse.
 
@@ -423,10 +425,15 @@ void Parser::AddText(const std::string& text) {
   }
 
   text_node->data_.assign(text, 0, text.size());
-  if (count_num_terms_in_text_node_) {
+  AddChild(text_node);
+  // Count number of terms in ths text node, except if this is <script>,
+  // <textarea> or a comment node.
+  if (count_num_terms_in_text_node_ && text_node->Parent() &&
+      text_node->Parent()->DataAtom() != Atom::SCRIPT &&
+      text_node->Parent()->Type() != NodeType::COMMENT_NODE &&
+      text_node->Parent()->DataAtom() != Atom::TEXTAREA) {
     text_node->num_terms_ = Strings::CountTerms(text);
   }
-  AddChild(text_node);
 }  // Parser::AddText.
 
 void Parser::AddElement() {
@@ -441,15 +448,15 @@ void Parser::AddElement() {
 
   switch (token_.atom) {
     case Atom::HTML: {
-      element_node->SetManufactured(accounting_.has_manufactured_html);
+      element_node->SetManufactured(document_->metadata_.has_manufactured_html);
       break;
     }
     case Atom::HEAD: {
-      element_node->SetManufactured(accounting_.has_manufactured_head);
+      element_node->SetManufactured(document_->metadata_.has_manufactured_head);
       break;
     }
     case Atom::BODY: {
-      element_node->SetManufactured(accounting_.has_manufactured_body);
+      element_node->SetManufactured(document_->metadata_.has_manufactured_body);
       break;
     }
     default:
@@ -682,7 +689,7 @@ bool Parser::InitialIM() {
         doctype_node->line_col_in_html_src_ = token_.line_col_in_html_src;
       }
       document_->root_node_->AppendChild(doctype_node);
-      accounting_.quirks_mode = quirks_mode;
+      document_->metadata_.quirks_mode = quirks_mode;
       insertion_mode_ = std::bind(&Parser::BeforeHTMLIM, this);
 
       if (on_node_callback_) {
@@ -695,7 +702,7 @@ bool Parser::InitialIM() {
       break;
   }
 
-  accounting_.quirks_mode = true;
+  document_->metadata_.quirks_mode = true;
   insertion_mode_ = std::bind(&Parser::BeforeHTMLIM, this);
   return false;
 }  // Parser::InitialIM.
@@ -850,6 +857,15 @@ bool Parser::InHeadIM() {
           AddElement();
           open_elements_stack_.Pop();
           AcknowledgeSelfClosingTag();
+          if (!top() || !top()->LastChild()) return true;
+          // Record some extra document url related info.
+          if (token_.atom == Atom::BASE) {
+            auto base_node = top()->LastChild();
+            RecordBaseURLMetadata(base_node);
+          } else if (token_.atom == Atom::LINK) {
+            auto link_node = top()->LastChild();
+            RecordLinkRelCanonical(link_node);
+          }
           return true;
         }
         case Atom::NOSCRIPT: {
@@ -1167,9 +1183,10 @@ bool Parser::InBodyIM() {
             return true;
           }
           CopyAttributes(open_elements_stack_.at(0), token_);
-          if (!accounting_.has_manufactured_html || num_html_tags_ > 1) {
-            accounting_.duplicate_html_elements = true;
-            accounting_.duplicate_html_element_location =
+          if (!document_->metadata_.has_manufactured_html ||
+              num_html_tags_ > 1) {
+            document_->metadata_.duplicate_html_elements = true;
+            document_->metadata_.duplicate_html_element_location =
                 token_.line_col_in_html_src;
           }
           break;
@@ -1197,9 +1214,10 @@ bool Parser::InBodyIM() {
                 body->atom_ == Atom::BODY) {
               frameset_ok_ = false;
               CopyAttributes(body, token_);
-              if (!accounting_.has_manufactured_body || num_body_tags_ > 1) {
-                accounting_.duplicate_body_elements = true;
-                accounting_.duplicate_body_element_location =
+              if (!document_->metadata_.has_manufactured_body ||
+                  num_body_tags_ > 1) {
+                document_->metadata_.duplicate_body_elements = true;
+                document_->metadata_.duplicate_body_element_location =
                     token_.line_col_in_html_src;
               }
             }
@@ -1406,7 +1424,7 @@ bool Parser::InBodyIM() {
           break;
         }
         case Atom::TABLE: {
-          if (!accounting_.quirks_mode) {
+          if (!document_->metadata_.quirks_mode) {
             PopUntil(Scope::ButtonScope, Atom::P);
           }
           AddElement();
@@ -3249,13 +3267,13 @@ void Parser::ParseImpliedToken(TokenType token_type, Atom atom,
   if (token_type == TokenType::START_TAG_TOKEN) {
     switch (atom) {
       case Atom::HTML:
-        accounting_.has_manufactured_html = true;
+        document_->metadata_.has_manufactured_html = true;
         break;
       case Atom::HEAD:
-        accounting_.has_manufactured_head = true;
+        document_->metadata_.has_manufactured_head = true;
         break;
       case Atom::BODY:
-        accounting_.has_manufactured_body = true;
+        document_->metadata_.has_manufactured_body = true;
         break;
       default:
         break;
@@ -3308,6 +3326,38 @@ void Parser::CopyAttributes(Node* node, Token token) const {
   }
 }  // Parser::CopyAttributes.
 
+void Parser::RecordBaseURLMetadata(Node* base_node) {
+  if (base_node->Type() != NodeType::ELEMENT_NODE ||
+      base_node->DataAtom() != Atom::BASE) return;
+
+  for (auto& attr : base_node->Attributes()) {
+    if (Strings::EqualFold(attr.key, "href")) {
+      document_->metadata_.base_url.first = attr.value;
+    } else if (Strings::EqualFold(attr.key, "target")) {
+      document_->metadata_.base_url.second = attr.value;
+    }
+  }
+}
+
+void Parser::RecordLinkRelCanonical(Node* link_node) {
+  if (link_node->Type() != NodeType::ELEMENT_NODE ||
+      link_node->DataAtom() != Atom::LINK) return;
+
+  bool canonical;
+  std::string canonical_url;
+  for (auto& attr : link_node->Attributes()) {
+    if (Strings::EqualFold(attr.key, "rel") &&
+        Strings::EqualFold(attr.value, "canonical")) {
+      canonical = true;
+    } else if (Strings::EqualFold(attr.key, "href")) {
+      canonical_url = attr.value;
+    }
+  }
+  if (canonical && !canonical_url.empty()) {
+    document_->metadata_.canonical_url = canonical_url;
+  }
+}
+
 namespace {
 // Returns only whitespace characters in s.
 // <space><space>foo<space>bar<space> returns 4 spaces.