Skip to content

Commit

Permalink
Disable jsoup entity escape by Default. Set Html.DISABLE_HTML_ENTITY_…
Browse files Browse the repository at this point in the history
…ESCAPE to false to enable it. code4craft#149
  • Loading branch information
code4craft committed Aug 14, 2014
1 parent 4e6e946 commit 9866297
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -19,13 +20,32 @@ public class Html extends HtmlNode {

private Logger logger = LoggerFactory.getLogger(getClass());

// Set to true after jsoup's escape maps have been cleared, so that the clearing is not repeated.
private static volatile boolean INITED = false;

/**
* Disable jsoup html entity escape (enabled by default, i.e. escaping is off).
* <p>
* The flag is consulted when an Html instance is constructed. To keep jsoup's entity escaping,
* set it to {@code false} before the first Html instance is created.
* <p>
* NOTE(review): once the escape maps have been cleared, nothing visible in this class restores
* them, so switching the flag back to {@code false} afterwards presumably has no effect.
*/
public static boolean DISABLE_HTML_ENTITY_ESCAPE = true;

/**
 * Disables jsoup's HTML entity escaping by emptying the {@code base} and {@code extended}
 * escape maps, if {@link #DISABLE_HTML_ENTITY_ESCAPE} is set and this has not been done yet.
 * <p>
 * This is a hack that relies on jsoup internals and is only known to work with jsoup 1.7.2.
 * The maps are global jsoup state, so the effect applies to every document serialized
 * afterwards, not just to this instance.
 */
private void disableJsoupHtmlEntityEscape() {
    // Lock-free fast path: after the first successful run INITED is true and no lock is taken.
    if (DISABLE_HTML_ENTITY_ESCAPE && !INITED) {
        clearJsoupEscapeMapsOnce();
    }
}

/**
 * Clears jsoup's escape maps at most once. The method is {@code static synchronized} and
 * re-checks the flag under the lock, so concurrent Html constructions cannot both mutate
 * the shared maps at the same time.
 */
private static synchronized void clearJsoupEscapeMapsOnce() {
    if (INITED) {
        // Another thread finished the clearing while this one was waiting for the lock.
        return;
    }
    Entities.EscapeMode.base.getMap().clear();
    Entities.EscapeMode.extended.getMap().clear();
    // Publish only after both maps are empty; INITED is volatile.
    INITED = true;
}

/**
* Store parsed document for better performance when only one text exist.
*/
private Document document;

public Html(String text) {
try {
disableJsoupHtmlEntityEscape();
this.document = Jsoup.parse(text);
} catch (Exception e) {
this.document = null;
Expand Down
18 changes: 15 additions & 3 deletions webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import org.junit.Test;
import us.codecraft.webmagic.selector.Html;

import static org.assertj.core.api.Assertions.assertThat;

/**
* @author [email protected] <br>
* Date: 13-4-21
Expand All @@ -13,9 +15,19 @@ public class HtmlTest {
/**
 * A regex selection on the parsed text can be post-processed with {@code replace}:
 * every {@code aa(a)} match in {@code "aaaaaaab"} is rewritten to {@code "$1bb"}.
 */
@Test
public void testRegexSelector() {
    Html selectable = new Html("aaaaaaab");
    // The leftover commented-out assertion and debug println were dropped; this assertion covers the behavior.
    assertThat(selectable.regex("(a+b)").replace("aa(a)", "$1bb").toString()).isEqualTo("abbabbab");
}

/**
 * With the default setting, an ampersand in the input text is expected to come back
 * unescaped from a regex selection.
 */
@Test
public void testDisableJsoupHtmlEntityEscape() throws Exception {
    final String raw = "aaaaaaa&b";
    String matched = new Html(raw).regex("(aaaaaaa&b)").toString();
    assertThat(matched).isEqualTo(raw);
}

/**
 * With {@code Html.DISABLE_HTML_ENTITY_ESCAPE} switched off, an ampersand in the input text
 * is expected to come back escaped as {@code &amp;}.
 * <p>
 * NOTE(review): this presumably only holds if no Html instance was created earlier in the
 * same JVM while the flag was still {@code true} - confirm against the test execution order.
 */
@Test
public void testEnableJsoupHtmlEntityEscape() throws Exception {
    // The flag is a public static; remember it so this test does not leak state into others.
    boolean previous = Html.DISABLE_HTML_ENTITY_ESCAPE;
    Html.DISABLE_HTML_ENTITY_ESCAPE = false;
    try {
        Html html = new Html("aaaaaaa&b");
        assertThat(html.regex("(aaaaaaa&amp;b)").toString()).isEqualTo("aaaaaaa&amp;b");
    } finally {
        Html.DISABLE_HTML_ENTITY_ESCAPE = previous;
    }
}
}

0 comments on commit 9866297

Please sign in to comment.