Skip to content

Commit

Permalink
部分爬虫代码优化
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaoyang committed Jul 24, 2021
1 parent b5df86d commit 4fe36a8
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 30 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package com.java2nb.novel.core.crawl;

import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex;
import lombok.Data;

import java.util.List;

/**
 * Bean that wraps the chapter data of a book: the chapter index entries
 * together with the chapter content entries.
 *
 * <p>Accessors are generated by Lombok's {@code @Data}; the fields themselves
 * are private so that all reads and writes go through those accessors.
 *
 * @author Administrator
 */
@Data
public class ChapterBean {

    /**
     * Collection of chapter index entries.
     */
    private List<BookIndex> bookIndexList;

    /**
     * Collection of chapter content entries.
     */
    private List<BookContent> bookContentList;
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,11 @@
@Slf4j
public class CrawlParser {

private static IdWorker idWorker = new IdWorker();
private static final IdWorker idWorker = new IdWorker();

public static final Integer BOOK_INDEX_LIST_KEY = 1;
private static final RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");

public static final Integer BOOK_CONTENT_LIST_KEY = 2;

private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");

private static ThreadLocal<Integer> retryCount = new ThreadLocal<>();
private static final ThreadLocal<Integer> retryCount = new ThreadLocal<>();

@SneakyThrows
public static Book parseBook(RuleBean ruleBean, String bookId) {
Expand Down Expand Up @@ -113,14 +109,14 @@ public static Book parseBook(RuleBean ruleBean, String bookId) {
}
}

if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpadateTimeFormatPatten())) {
Pattern updateTimePatten = compile(ruleBean.getUpadateTimePatten());
if (StringUtils.isNotBlank(ruleBean.getUpdateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpdateTimeFormatPatten())) {
Pattern updateTimePatten = compile(ruleBean.getUpdateTimePatten());
Matcher updateTimeMatch = updateTimePatten.matcher(bookDetailHtml);
boolean isFindUpdateTime = updateTimeMatch.find();
if (isFindUpdateTime) {
String updateTime = updateTimeMatch.group(1);
//设置更新时间
book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpdateTimeFormatPatten()).parse(updateTime));

}
}
Expand All @@ -142,10 +138,7 @@ public static Book parseBook(RuleBean ruleBean, String bookId) {
return book;
}

public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> hasIndexs) {
Map<Integer, List> result = new HashMap<>(2);
result.put(BOOK_INDEX_LIST_KEY, new ArrayList(0));
result.put(BOOK_CONTENT_LIST_KEY, new ArrayList(0));
public static ChapterBean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap) {

Date currentDate = new Date();

Expand All @@ -171,11 +164,11 @@ public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, B
int indexNum = 0;

//总字数
Integer totalWordCount = book.getWordCount() == null ? 0 : book.getWordCount();
int totalWordCount = book.getWordCount() == null ? 0 : book.getWordCount();

while (isFindIndex) {

BookIndex hasIndex = hasIndexs.get(indexNum);
BookIndex hasIndex = existBookIndexMap.get(indexNum);
String indexName = indexNameMatch.group(1);

if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName()).equals(StringUtils.deleteWhitespace(indexName))) {
Expand Down Expand Up @@ -221,7 +214,7 @@ public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, B
BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
Integer wordCount = StringUtil.getStrValidWordCount(content);
int wordCount = StringUtil.getStrValidWordCount(content);
bookIndex.setWordCount(wordCount);
indexList.add(bookIndex);

Expand Down Expand Up @@ -277,15 +270,20 @@ public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, B

if (indexList.size() == contentList.size() && indexList.size() > 0) {

result.put(BOOK_INDEX_LIST_KEY, indexList);
result.put(BOOK_CONTENT_LIST_KEY, contentList);
return new ChapterBean(){{
setBookIndexList(indexList);
setBookContentList(contentList);
}};

}

}


return result;
return new ChapterBean(){{
setBookIndexList(new ArrayList<>(0));
setBookContentList(new ArrayList<>(0));
}};
}


Expand All @@ -294,6 +292,7 @@ private static String getByHttpClient(String url) {
ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) {
String body = forEntity.getBody();
assert body != null;
if (body.length() < Constants.INVALID_HTML_LENGTH) {
return processErrorHttpResult(url);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ public class RuleBean {
private String visitCountPatten;
private String descStart;;
private String descEnd;
private String upadateTimePatten;
private String upadateTimeFormatPatten;
private String updateTimePatten;
private String updateTimeFormatPatten;
private String bookIndexUrl;
private String indexIdPatten;
private String indexNamePatten;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.java2nb.novel.core.listener;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.java2nb.novel.core.crawl.ChapterBean;
import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.entity.*;
Expand All @@ -16,9 +17,9 @@
import javax.servlet.ServletContextListener;
import javax.servlet.annotation.WebListener;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
* @author Administrator
Expand Down Expand Up @@ -66,15 +67,15 @@ public void contextInitialized(ServletContextEvent sce) {
//查询已存在的章节
Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
//解析章节目录
Map<Integer, List> indexAndContentList = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap);
bookService.updateBookAndIndexAndContent(book, (List<BookIndex>) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List<BookContent>) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY), existBookIndexMap);
ChapterBean chapter = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap);
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap);
} catch (Exception e) {
log.error(e.getMessage(), e);
}

}

Thread.sleep(1000 * 60 * 10);
// 休眠10分钟
TimeUnit.MINUTES.sleep(10);
} catch (Exception e) {
log.error(e.getMessage(), e);
}
Expand Down Expand Up @@ -107,7 +108,8 @@ public void contextInitialized(ServletContextEvent sce) {

}

Thread.sleep(1000 * 60);
//休眠1分钟
TimeUnit.MINUTES.sleep(1);

} catch (Exception e) {
log.error(e.getMessage(), e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import com.java2nb.novel.core.bean.PageBean;
import com.java2nb.novel.core.cache.CacheKey;
import com.java2nb.novel.core.cache.CacheService;
import com.java2nb.novel.core.crawl.ChapterBean;
import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.core.enums.ResponseStatus;
Expand Down Expand Up @@ -303,9 +304,9 @@ public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId,
book.setCrawlLastTime(new Date());
book.setId(new IdWorker().nextId());
//解析章节目录
Map<Integer, List> indexAndContentList = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0));
ChapterBean chapter = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0));

bookService.saveBookAndIndexAndContent(book, (List<BookIndex>) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List<BookContent>) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY));
bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList());

} else {
//只更新书籍的爬虫相关字段
Expand Down

0 comments on commit 4fe36a8

Please sign in to comment.