Skip to content
This repository has been archived by the owner on Feb 6, 2022. It is now read-only.

Commit

Permalink
tuned leTemps scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
simcasmse committed May 18, 2018
1 parent 3d9a7fb commit bbcf1cb
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 3 deletions.
2 changes: 1 addition & 1 deletion Projet/Crawlers/CrawlerLeTemps/letemps_scrapper.iml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,8 @@ public void visit(Page page) {
String html = htmlParseData.getHtml();
Document doc = Jsoup.parse(html);
Element title = doc.selectFirst("#block-letemps-content > article > div.container > div > div.col-sm-9.col-md-6 > div.article-content.article-content-inset.gallery.main-content > h1 > span");
Element article = doc.selectFirst("#block-letemps-content > article > div.container > div > div.col-sm-9.col-md-6 > div.article-content.article-content-inset.gallery.main-content > div.article_body");
Element article = doc.selectFirst("#block-letemps-content > article > div.container > div > div.col-sm-9.col-md-6 > div.article-content.article-content-inset.gallery.main-content > div.article_body > div.body_content");
List<String> tags = doc.select("#block-letemps-content > article > div.container > div > div.col-sm-3.col-md-2 > section > p.tags").select("b").eachText();

Element date = doc.select("meta[itemprop='datePublished']").first();
Long dateUnix = date != null ? parseDate(date.attr("content")) : 0l;

Expand Down

0 comments on commit bbcf1cb

Please sign in to comment.