Skip to content

Commit

Permalink
Sync for validator/cpp/htmlparser (ampproject#35136)
Browse files Browse the repository at this point in the history
* Do not record num_terms for <script>, <textarea> and comment nodes.

PiperOrigin-RevId: 382380916

* - Move ParseAccounting from Parser to Document, so that parser can be freed
  after document is parsed.

- Add Validate(Document*) method so clients can supply Document object instead
  of html string. This will help eliminate dual parsing.

- Remove unused flag, it is not used anywhere outside validator-internal.cc

PiperOrigin-RevId: 382613643

* - Preprocess document url attributes during parsing and attach it as Document
metadata.
- Renames ParseAccounting to DocumentMetadata as we will use this structure to
populate more document metadata information readily available to clients.

PiperOrigin-RevId: 382811824
  • Loading branch information
amaltas authored Jul 7, 2021
1 parent f3b1875 commit 4da7669
Show file tree
Hide file tree
Showing 6 changed files with 212 additions and 116 deletions.
46 changes: 27 additions & 19 deletions validator/cpp/engine/validator-internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,6 @@ ABSL_FLAG(int, max_node_recursion_depth, 200,
"Maximum recursion depth of nodes, if stack of nodes grow beyond this"
"validator will stop parsing with FAIL result");

ABSL_FLAG(bool, duplicate_html_body_elements_is_error, false,
"If true, duplicate <html>,<body> elements is considered error for "
"validation purposes. (Default is to allow, leaving it to HTML5 "
"user-agent to treat them as per the spec).");
ABSL_FLAG(bool, allow_module_nomodule, true,
"If true, then script versions for module and nomdule are allowed "
"in AMP documents. This gating should be temporary and removed "
Expand Down Expand Up @@ -5652,6 +5648,23 @@ class Validator {
max_errors_(max_errors),
context_(rules_, max_errors_) {}

ValidationResult Validate(const htmlparser::Document& doc) {
doc_metadata_ = doc.Metadata();
UpdateLineColumnIndex(doc.RootNode());
// The validation check for document size can't be done here since
// the Type Identifiers on the html tag have not been parsed yet and
// we wouldn't know which rule to apply. It's set to the context
// so that when those things are known it can be checked.
context_.SetDocByteSize(doc_metadata_.html_src_bytes);
ValidateNode(doc.RootNode());
auto [current_line_no, current_col_no] =
doc_metadata_.document_end_location;
context_.SetLineCol(current_line_no, current_col_no > 0 ? current_col_no - 1
: current_col_no);
EndDocument();
return result_;
}

ValidationResult Validate(std::string_view html) {
Clear();
htmlparser::ParseOptions options{
Expand All @@ -5660,11 +5673,6 @@ class Validator {
.record_node_offsets = true,
.record_attribute_offsets = true,
};
// The validation check for document size can't be done here since
// the Type Identifiers on the html tag have not been parsed yet and
// we wouldn't know which rule to apply. It's set to the context
// so that when those things are known it can be checked.
context_.SetDocByteSize(html.length());
auto parser = std::make_unique<htmlparser::Parser>(html, options);
auto doc = parser->Parse();
// Currently parser returns nullptr only if document is too complex.
Expand All @@ -5676,14 +5684,7 @@ class Validator {
return result_;
}

parse_accounting_ = parser->Accounting();
UpdateLineColumnIndex(doc->RootNode());
ValidateNode(doc->RootNode());
auto [current_line_no, current_col_no] = parser->CurrentTokenizerPosition();
context_.SetLineCol(current_line_no, current_col_no > 0 ? current_col_no - 1
: current_col_no);
EndDocument();
return result_;
return Validate(*doc);
}

// Updates context's line column index using the current node's position.
Expand Down Expand Up @@ -5732,7 +5733,7 @@ class Validator {
}
return true;
case htmlparser::NodeType::DOCTYPE_NODE:
if (parse_accounting_.quirks_mode) {
if (doc_metadata_.quirks_mode) {
LineCol linecol(1, 0);
auto lc = node->LineColInHtmlSrc();
if (lc.has_value()) {
Expand Down Expand Up @@ -5974,7 +5975,7 @@ class Validator {
const ParsedValidatorRules* rules_;
int max_errors_ = -1;
Context context_;
htmlparser::ParseAccounting parse_accounting_;
htmlparser::DocumentMetadata doc_metadata_;
ValidationResult result_;
Validator(const Validator&) = delete;
Validator& operator=(const Validator&) = delete;
Expand All @@ -5989,4 +5990,11 @@ ValidationResult Validate(std::string_view html, HtmlFormat_Code html_format,
return validator.Validate(html);
}

ValidationResult Validate(const htmlparser::Document& doc,
HtmlFormat_Code html_format, int max_errors) {
Validator validator(ParsedValidatorRulesProvider::Get(html_format),
max_errors);
return validator.Validate(doc);
}

} // namespace amp::validator
5 changes: 5 additions & 0 deletions validator/cpp/engine/validator.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,15 @@
#ifndef AMPVALIDATOR__VALIDATOR_H_
#define AMPVALIDATOR__VALIDATOR_H_

#include "document.h"
#include "validator.pb.h"

namespace amp::validator {

ValidationResult Validate(const htmlparser::Document& document,
HtmlFormat_Code html_format = HtmlFormat::AMP,
int max_errors = -1);

ValidationResult Validate(std::string_view html,
HtmlFormat_Code html_format = HtmlFormat::AMP,
int max_errors = -1);
Expand Down
64 changes: 60 additions & 4 deletions validator/cpp/htmlparser/document.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,68 @@ namespace htmlparser {
class Parser;
struct ParseOptions;

// Contains pieces of information about a particular HTML parse operation.
// Clients are expected to treat all the fields as constants, but are given the
// flexibility of modifying for upstream error reporting.
struct DocumentMetadata {
public:
// Tells if any of the HTML, HEAD, and/or BODY elements are generated by
// the Parser because they were missing or implicitly created before they
// are parsed.
// Example:
// Original document: <html><div>foo</div></html>
// Parsed document: <html><head></head><body><div>foo</div></body></html>
//
// The has_manufactured_* accounting applies only to missing
// TokenType::START_TAG_TOKEN.
// If any of the </html>, </head>, or </body> end tags are missing, parser
// auto closes the elements but they are not treated as manufactured from
// the clients perspective.
bool has_manufactured_html = false;
bool has_manufactured_head = false;
bool has_manufactured_body = false;

// HTML5 algorithm handles duplication of unique tags by merging them and
// producing a valid HTML. However, if clients are interested in knowing if
// the original HTML source contains duplicate elements, following bits are
// set.
bool duplicate_html_elements = false;
bool duplicate_body_elements = false;
// Set only if above duplicate bits are true.
std::optional<LineCol> duplicate_html_element_location = std::nullopt;
std::optional<LineCol> duplicate_body_element_location = std::nullopt;

// If true, parsed src is missing required <!doctype html> declaration or is
// invalid syntax or is XHTML 4 or legacy doctype.
bool quirks_mode = false;

// The line column position of the last element in the document. Useful for
// error reporting at the end of the document.
LineCol document_end_location {0, 0};

// The actual size of html src in bytes.
std::size_t html_src_bytes = 0;

// The document's <base> url and target.
// See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
std::pair<std::string, std::string> base_url;

// The link rel=canonical url found in the html src. If multiple link
// rel=canonical found the last one is recorded.
std::string canonical_url;
};


// The document class is a wrapper for the DOM tree exposed with RootNode().
// All the nodes inside the document are owned by document. The nodes are
// destroyed when Document objects goes out of scope or deleted.
class Document {
public:
Document();

~Document() = default;

// Creates a new node. The node is owned by Document and is destroyed when
// document is destructed.
Node* NewNode(NodeType node_type, Atom atom = Atom::UNKNOWN);

const DocumentMetadata& Metadata() const { return metadata_; }

// Returns the root node of a DOM tree. Node* owned by document.
Node* RootNode() const { return root_node_; }
Expand All @@ -50,6 +100,10 @@ class Document {
const std::vector<Node*> FragmentNodes() const { return fragment_nodes_; }

private:
// Creates a new node. The node is owned by Document and is destroyed when
// document is destructed.
Node* NewNode(NodeType node_type, Atom atom = Atom::UNKNOWN);

// Returns a new node with the same type, data and attributes.
// The clone has no parent, no siblings and no children.
// The node is owned by the document and is destroyed when document is
Expand All @@ -61,6 +115,8 @@ class Document {

Node* root_node_;
std::vector<Node*> fragment_nodes_{};
std::size_t html_src_bytes_;
DocumentMetadata metadata_;

friend class Parser;
friend std::unique_ptr<Document> Parse(std::string_view html);
Expand Down
84 changes: 67 additions & 17 deletions validator/cpp/htmlparser/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ Parser::Parser(std::string_view html, const ParseOptions& options,
count_num_terms_in_text_node_(options.count_num_terms_in_text_node),
fragment_(fragment_parent != nullptr),
context_node_(fragment_parent) {
document_->metadata_.html_src_bytes = html.size();
insertion_mode_ = std::bind(&Parser::InitialIM, this);
}

Expand Down Expand Up @@ -179,6 +180,7 @@ std::unique_ptr<Document> Parser::Parse() {
DumpDocument(document_.get());
#endif

document_->metadata_.document_end_location = tokenizer_->CurrentPosition();
return std::move(document_);
} // End Parser::Parse.

Expand Down Expand Up @@ -423,10 +425,15 @@ void Parser::AddText(const std::string& text) {
}

text_node->data_.assign(text, 0, text.size());
if (count_num_terms_in_text_node_) {
AddChild(text_node);
// Count number of terms in ths text node, except if this is <script>,
// <textarea> or a comment node.
if (count_num_terms_in_text_node_ && text_node->Parent() &&
text_node->Parent()->DataAtom() != Atom::SCRIPT &&
text_node->Parent()->Type() != NodeType::COMMENT_NODE &&
text_node->Parent()->DataAtom() != Atom::TEXTAREA) {
text_node->num_terms_ = Strings::CountTerms(text);
}
AddChild(text_node);
} // Parser::AddText.

void Parser::AddElement() {
Expand All @@ -441,15 +448,15 @@ void Parser::AddElement() {

switch (token_.atom) {
case Atom::HTML: {
element_node->SetManufactured(accounting_.has_manufactured_html);
element_node->SetManufactured(document_->metadata_.has_manufactured_html);
break;
}
case Atom::HEAD: {
element_node->SetManufactured(accounting_.has_manufactured_head);
element_node->SetManufactured(document_->metadata_.has_manufactured_head);
break;
}
case Atom::BODY: {
element_node->SetManufactured(accounting_.has_manufactured_body);
element_node->SetManufactured(document_->metadata_.has_manufactured_body);
break;
}
default:
Expand Down Expand Up @@ -682,7 +689,7 @@ bool Parser::InitialIM() {
doctype_node->line_col_in_html_src_ = token_.line_col_in_html_src;
}
document_->root_node_->AppendChild(doctype_node);
accounting_.quirks_mode = quirks_mode;
document_->metadata_.quirks_mode = quirks_mode;
insertion_mode_ = std::bind(&Parser::BeforeHTMLIM, this);

if (on_node_callback_) {
Expand All @@ -695,7 +702,7 @@ bool Parser::InitialIM() {
break;
}

accounting_.quirks_mode = true;
document_->metadata_.quirks_mode = true;
insertion_mode_ = std::bind(&Parser::BeforeHTMLIM, this);
return false;
} // Parser::InitialIM.
Expand Down Expand Up @@ -850,6 +857,15 @@ bool Parser::InHeadIM() {
AddElement();
open_elements_stack_.Pop();
AcknowledgeSelfClosingTag();
if (!top() || !top()->LastChild()) return true;
// Record some extra document url related info.
if (token_.atom == Atom::BASE) {
auto base_node = top()->LastChild();
RecordBaseURLMetadata(base_node);
} else if (token_.atom == Atom::LINK) {
auto link_node = top()->LastChild();
RecordLinkRelCanonical(link_node);
}
return true;
}
case Atom::NOSCRIPT: {
Expand Down Expand Up @@ -1167,9 +1183,10 @@ bool Parser::InBodyIM() {
return true;
}
CopyAttributes(open_elements_stack_.at(0), token_);
if (!accounting_.has_manufactured_html || num_html_tags_ > 1) {
accounting_.duplicate_html_elements = true;
accounting_.duplicate_html_element_location =
if (!document_->metadata_.has_manufactured_html ||
num_html_tags_ > 1) {
document_->metadata_.duplicate_html_elements = true;
document_->metadata_.duplicate_html_element_location =
token_.line_col_in_html_src;
}
break;
Expand Down Expand Up @@ -1197,9 +1214,10 @@ bool Parser::InBodyIM() {
body->atom_ == Atom::BODY) {
frameset_ok_ = false;
CopyAttributes(body, token_);
if (!accounting_.has_manufactured_body || num_body_tags_ > 1) {
accounting_.duplicate_body_elements = true;
accounting_.duplicate_body_element_location =
if (!document_->metadata_.has_manufactured_body ||
num_body_tags_ > 1) {
document_->metadata_.duplicate_body_elements = true;
document_->metadata_.duplicate_body_element_location =
token_.line_col_in_html_src;
}
}
Expand Down Expand Up @@ -1406,7 +1424,7 @@ bool Parser::InBodyIM() {
break;
}
case Atom::TABLE: {
if (!accounting_.quirks_mode) {
if (!document_->metadata_.quirks_mode) {
PopUntil(Scope::ButtonScope, Atom::P);
}
AddElement();
Expand Down Expand Up @@ -3249,13 +3267,13 @@ void Parser::ParseImpliedToken(TokenType token_type, Atom atom,
if (token_type == TokenType::START_TAG_TOKEN) {
switch (atom) {
case Atom::HTML:
accounting_.has_manufactured_html = true;
document_->metadata_.has_manufactured_html = true;
break;
case Atom::HEAD:
accounting_.has_manufactured_head = true;
document_->metadata_.has_manufactured_head = true;
break;
case Atom::BODY:
accounting_.has_manufactured_body = true;
document_->metadata_.has_manufactured_body = true;
break;
default:
break;
Expand Down Expand Up @@ -3308,6 +3326,38 @@ void Parser::CopyAttributes(Node* node, Token token) const {
}
} // Parser::CopyAttributes.

void Parser::RecordBaseURLMetadata(Node* base_node) {
if (base_node->Type() != NodeType::ELEMENT_NODE ||
base_node->DataAtom() != Atom::BASE) return;

for (auto& attr : base_node->Attributes()) {
if (Strings::EqualFold(attr.key, "href")) {
document_->metadata_.base_url.first = attr.value;
} else if (Strings::EqualFold(attr.key, "target")) {
document_->metadata_.base_url.second = attr.value;
}
}
}

void Parser::RecordLinkRelCanonical(Node* link_node) {
if (link_node->Type() != NodeType::ELEMENT_NODE ||
link_node->DataAtom() != Atom::LINK) return;

bool canonical;
std::string canonical_url;
for (auto& attr : link_node->Attributes()) {
if (Strings::EqualFold(attr.key, "rel") &&
Strings::EqualFold(attr.value, "canonical")) {
canonical = true;
} else if (Strings::EqualFold(attr.key, "href")) {
canonical_url = attr.value;
}
}
if (canonical && !canonical_url.empty()) {
document_->metadata_.canonical_url = canonical_url;
}
}

namespace {
// Returns only whitespace characters in s.
// <space><space>foo<space>bar<space> returns 4 spaces.
Expand Down
Loading

0 comments on commit 4da7669

Please sign in to comment.