Skip to content

Commit

Permalink
Patch by Markus Jelsma for TIKA-992 to allow OpenGraph meta tags to have multiple values.
Browse files Browse the repository at this point in the history

git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1481990 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
dameikle committed May 13, 2013
1 parent 5ffe387 commit aa18a79
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ public void startElement(
atts.getValue("content"));
} else if (atts.getValue("property") != null) {
// TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
addHtmlMetadata(
metadata.add(
atts.getValue("property"),
atts.getValue("content"));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ stream, new TeeContentHandler(body, link),
"Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Tika Developers", metadata.get("Author"));
assertEquals("5", metadata.get("refresh"));

assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));

Expand Down Expand Up @@ -408,21 +408,20 @@ public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
*/
public void testBoilerplateRemoval() throws Exception {
    // Plain-text sink wrapped by the boilerpipe handler, so only the text
    // that the boilerpipe handler passes on ends up in it.
    BodyContentHandler textSink = new BodyContentHandler();
    BoilerpipeContentHandler boilerpipe = new BoilerpipeContentHandler(textSink);

    HtmlParser parser = new HtmlParser();
    parser.parse(
            HtmlParserTest.class.getResourceAsStream("/test-documents/boilerplate.html"),
            boilerpipe,
            new Metadata(),
            new ParseContext());

    String extracted = textSink.toString();

    // The main article text must survive intact from its first to its last sentence...
    assertTrue(extracted.startsWith("This is the real meat"));
    assertTrue(extracted.endsWith("This is the end of the text.\n"));

    // ...while the surrounding page furniture must be gone.
    assertFalse(extracted.contains("boilerplate"));
    assertFalse(extracted.contains("footer"));
}



/**
* Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>.
* @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
Expand All @@ -439,22 +438,22 @@ public void testElementOrdering() throws Exception {
makeHtmlTransformer(sw), new Metadata(), new ParseContext());

String result = sw.toString();

// Title element in <head> section
assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));

// No meta elements in body
assertFalse(Pattern.matches("(?s).*<body>.*<meta. *</body>.*$", result));

// meta elements should show up in <head> section
assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));

// No link elements in body
assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));

// link element should be in <head> section
assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));

// There should be ending elements.
assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));

Expand All @@ -475,7 +474,7 @@ public void testImgUrlExtraction() throws Exception {
makeHtmlTransformer(sw), new Metadata(), new ParseContext());

String result = sw.toString();

// <img> tag should exist, with fully resolved URL
assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
}
Expand All @@ -495,7 +494,7 @@ public void testFrameSrcExtraction() throws Exception {
makeHtmlTransformer(sw), new Metadata(), new ParseContext());

String result = sw.toString();

// <frame> tag should exist, with fully resolved URL
assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
}
Expand All @@ -516,7 +515,7 @@ public void testIFrameSrcExtraction() throws Exception {
makeHtmlTransformer(sw), new Metadata(), new ParseContext());

String result = sw.toString();

// <iframe> tag should exist, with fully resolved URL
assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", result));
}
Expand All @@ -538,7 +537,7 @@ public void testAreaExtraction() throws Exception {
makeHtmlTransformer(sw), new Metadata(), new ParseContext());

String result = sw.toString();

// <map> tag should exist, with <area> tag with fully resolved URL
assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result));
}
Expand All @@ -560,7 +559,7 @@ public void testObjectExtraction() throws Exception {
makeHtmlTransformer(sw), new Metadata(), new ParseContext());

String result = sw.toString();

// <object> tag should exist with fully resolved URLs
assertTrue(
"<object> tag not correctly found in:\n" + result,
Expand All @@ -578,7 +577,7 @@ public void testMetaTagHandling() throws Exception {
Metadata metadata = new Metadata();
metadata.add("Content-Type", "text/html; charset=utf-8");
metadata.add("Language", null);

StringWriter sw = new StringWriter();
new HtmlParser().parse(
new ByteArrayInputStream(test.getBytes("UTF-8")),
Expand Down Expand Up @@ -606,10 +605,10 @@ public void testBrokenFrameset() throws Exception {
makeHtmlTransformer(sw1), new Metadata(), new ParseContext());

String result = sw1.toString();

// <frame> tag should exist, with fully resolved URL
assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));

// <body> tag should not exist.
assertFalse(Pattern.matches("(?s).*<body>.*$", result));

Expand All @@ -627,7 +626,7 @@ public void testBrokenFrameset() throws Exception {
makeHtmlTransformer(sw2), new Metadata(), new ParseContext());

result = sw2.toString();

// <frame> tags should exist, with relative URL (no base element specified)
assertTrue(Pattern.matches("(?s).*<frame .* src=\"top.html\"/>.*$", result));
assertTrue(Pattern.matches("(?s).*<frame .* src=\"left.html\"/>.*$", result));
Expand All @@ -645,22 +644,22 @@ public void testBrokenFrameset() throws Exception {
*/
public void testBoilerplateDelegation() throws Exception {
    // Regular expressions describing the structural elements the serialized
    // output is expected to contain.
    final String htmlElement =
            "(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$";
    final String headElement = "(?s).*<head>.*</head>.*$";
    final String titleElement = "(?s).*<title>Title</title>.*$";
    final String bodyElement = "(?s).*<body>.*</body>.*$";

    // Serialize the SAX events produced by the parser back into HTML text.
    StringWriter serialized = new StringWriter();
    HtmlParser parser = new HtmlParser();
    parser.parse(
            HtmlParserTest.class.getResourceAsStream("/test-documents/boilerplate.html"),
            makeHtmlTransformer(serialized),
            new Metadata(),
            new ParseContext());

    String markup = serialized.toString();

    // Should have <html>, <head>, <title>, <body> elements
    assertTrue(Pattern.matches(htmlElement, markup));
    assertTrue(Pattern.matches(headElement, markup));
    assertTrue(Pattern.matches(titleElement, markup));
    assertTrue(Pattern.matches(bodyElement, markup));
}

/**
* Test case for TIKA-481. Verify href in <link> is resolved.
* @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a>
Expand All @@ -677,16 +676,16 @@ public void testLinkHrefResolution() throws Exception {
makeHtmlTransformer(sw), new Metadata(), new ParseContext());

String result = sw.toString();

// <link> tag should exist in <head>, with fully resolved URL
assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result));
}


/**
* Create ContentHandler that transforms SAX events into textual HTML output,
* and writes it out to <writer> - typically this is a StringWriter.
*
*
* @param writer Where to write resulting HTML text.
* @return ContentHandler suitable for passing to parse() methods.
* @throws Exception
Expand All @@ -700,24 +699,24 @@ private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
handler.setResult(new StreamResult(writer));
return handler;
}

/**
* Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
* @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
*/
public void testBoilerplateWithMarkup() throws Exception {
String path = "/test-documents/boilerplate.html";

Metadata metadata = new Metadata();
StringWriter sw = new StringWriter();
ContentHandler ch = makeHtmlTransformer(sw);
BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
bpch.setIncludeMarkup(true);

new HtmlParser().parse(
HtmlParserTest.class.getResourceAsStream(path),
bpch, metadata, new ParseContext());

String content = sw.toString();
assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
Expand All @@ -741,7 +740,7 @@ public void testPushback() throws IOException, TikaException {
/**
* Test case for TIKA-869
* IdentityHtmlMapper needs to lower-case tag names.
*
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-869">TIKA-869</a>
*/
public void testIdentityMapper() throws Exception {
Expand All @@ -756,16 +755,16 @@ public void testIdentityMapper() throws Exception {
new HtmlParser().parse (
new ByteArrayInputStream(html.getBytes("UTF-8")),
makeHtmlTransformer(sw), metadata, parseContext);

String result = sw.toString();
// Make sure we don't get <body><BODY/></body>
assertTrue(Pattern.matches("(?s).*<body/>.*$", result));
}

/**
* Test case for TIKA-889
* XHTMLContentHandler wont emit newline when html element matches ENDLINE set.
*
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-889">TIKA-889</a>
*/
public void testNewlineAndIndent() throws Exception {
Expand All @@ -776,30 +775,32 @@ public void testNewlineAndIndent() throws Exception {
new HtmlParser().parse(
new ByteArrayInputStream(html.getBytes("UTF-8")),
handler, new Metadata(), new ParseContext());

// Make sure we get <tab>, "one", newline, newline
String result = handler.toString();

assertTrue(Pattern.matches("\tone\n\n", result));
}

/**
 * Test case for TIKA-983: HTML parser should add Open Graph meta tag data to Metadata returned by parser.
 * <p>
 * Also covers repeated properties: the document below declares
 * <code>og:image</code> twice, and both values must be kept in the
 * Metadata instead of only one of them.
 *
 * @throws Exception if the document cannot be parsed
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>
 */
public void testOpenGraphMetadata() throws Exception {
    String test1 =
        "<html><head><meta property=\"og:description\""
        + " content=\"some description\" />"
        + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />"
        + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />"
        + "<title>hello</title>"
        + "</head><body></body></html>";
    Metadata metadata = new Metadata();
    new HtmlParser().parse (
            new ByteArrayInputStream(test1.getBytes("ISO-8859-1")),
            new BodyContentHandler(), metadata, new ParseContext());

    // Single-valued property is returned as-is.
    assertEquals("some description", metadata.get("og:description"));

    // A repeated property must be reported as multi-valued...
    assertTrue(metadata.isMultiValued("og:image"));

    // ...and isMultiValued alone would not notice a lost, duplicated or
    // mangled value, so pin the exact values too. The order check assumes
    // values come back in the order the meta tags appear in the document.
    String[] images = metadata.getValues("og:image");
    assertEquals(2, images.length);
    assertEquals("http://example.com/image1.jpg", images[0]);
    assertEquals("http://example.com/image2.jpg", images[1]);
}

// TIKA-1011
Expand Down

0 comments on commit aa18a79

Please sign in to comment.