feat(crawl): add crawl rule editing and rule testing
Merge pull request 201206030#71 from the master branch
xiaoyang committed Dec 24, 2021
1 parent fecf03b commit 96662fc
Showing 11 changed files with 826 additions and 7 deletions.
2 changes: 1 addition & 1 deletion novel-admin/pom.xml
@@ -106,7 +106,7 @@
<dependency>
<groupId>org.apache.shiro</groupId>
<artifactId>shiro-spring</artifactId>
<version>1.3.2</version>
<version>1.7.0</version>
</dependency>
<!-- shiro ehcache -->
<dependency>
@@ -65,4 +65,8 @@ public interface CacheKey {
 * Accumulated novel visit count
* */
String BOOK_ADD_VISIT_COUNT = "bookAddVisitCount";
}
/**
 * Cache for crawl rule testing
*/
String BOOK_TEST_PARSE = "testParse";
}
@@ -1,5 +1,8 @@
package com.java2nb.novel.controller;

import com.java2nb.novel.core.cache.CacheKey;
import com.java2nb.novel.core.cache.CacheService;
import com.java2nb.novel.core.utils.HttpUtil;
import io.github.xxyopen.model.page.PageBean;

import com.java2nb.novel.entity.CrawlSingleTask;
@@ -9,6 +12,11 @@
import lombok.RequiredArgsConstructor;
import org.springframework.web.bind.annotation.*;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* @author Administrator
*/
@@ -19,7 +27,7 @@ public class CrawlController {

private final CrawlService crawlService;


private final CacheService cacheService;
/**
 * Add a crawl source
* */
@@ -39,7 +47,70 @@ public RestResult<PageBean<CrawlSource>> listCrawlByPage(@RequestParam(value = "

return RestResult.ok(crawlService.listCrawlByPage(page,pageSize));
}
/**
 * Get a crawl source
* */
@GetMapping("getCrawlSource/{id}")
public RestResult<CrawlSource> getCrawlSource(@PathVariable("id") Integer id) {
    CrawlSource crawlSource = crawlService.getCrawlSource(id);
    return RestResult.ok(crawlSource);
}

/**
 * Test a crawl rule against a page
 * @param rule      regular expression rule to test (group 1 is returned on a match)
 * @param url       page URL to fetch and match against
 * @param isRefresh "1" to bypass the page cache and re-fetch the URL
 * @return match result
 */
@PostMapping("testParse")
public RestResult<Object> testParse(String rule, String url, String isRefresh) {
    Map<String, Object> resultMap = new HashMap<>();
    String html = null;
    if (url.startsWith("https://") || url.startsWith("http://")) {
        String refreshCache = "1";
        // Try the cache first unless an explicit refresh was requested
        if (!refreshCache.equals(isRefresh)) {
            Object cache = cacheService.getObject(CacheKey.BOOK_TEST_PARSE + url);
            if (cache == null) {
                isRefresh = "1";
            } else {
                html = (String) cache;
            }
        }
        // Fetch the page and cache it for 10 minutes
        if (refreshCache.equals(isRefresh)) {
            html = HttpUtil.getByHttpClientWithChrome(url);
            if (html != null) {
                cacheService.setObject(CacheKey.BOOK_TEST_PARSE + url, html, 60 * 10);
            } else {
                resultMap.put("msg", "html is null");
                return RestResult.ok(resultMap);
            }
        }
    } else {
        resultMap.put("msg", "url must start with http:// or https://");
        return RestResult.ok(resultMap);
    }
    // Apply the rule and report whether it matches, plus the first captured group
    Pattern pattern = Pattern.compile(rule);
    Matcher matcher = pattern.matcher(html);
    boolean isFind = matcher.find();
    resultMap.put("是否匹配", isFind); // key: "matched"
    if (isFind && matcher.groupCount() > 0) {
        resultMap.put("匹配结果", matcher.group(1)); // key: "match result"
    }
    return RestResult.ok(resultMap);
}
/**
 * Update a crawl source
* */
@PostMapping("updateCrawlSource")
public RestResult<Void> updateCrawlSource(CrawlSource source) {
    crawlService.updateCrawlSource(source);
    return RestResult.ok();
}
/**
 * Start or stop the crawler
* */
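For reference, the new testParse endpoint above simply compiles the submitted rule as a Java regex, runs it against the fetched page, and returns capture group 1 on a match. A minimal standalone sketch of that matching step; the sample HTML fragment and rule below are invented for illustration and are not part of the project:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RuleTestSketch {
    public static void main(String[] args) {
        // Hypothetical page fragment and rule; a real rule would come from a crawl source's config
        String html = "<div class=\"book-name\">大道朝天</div>";
        String rule = "<div class=\"book-name\">([^<]+)</div>";

        Matcher matcher = Pattern.compile(rule).matcher(html);
        if (matcher.find()) {
            // Group 1 is what testParse reports as the match result
            System.out.println("matched: " + matcher.group(1));
        } else {
            System.out.println("no match");
        }
    }
}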
@@ -18,7 +18,11 @@ public interface CrawlService {
* */
void addCrawlSource(CrawlSource source);


/**
 * Update a crawl source
 * @param source the crawl source to update
 */
void updateCrawlSource(CrawlSource source);
/**
 * Paged list of crawl sources
 * @param page current page number
@@ -106,4 +110,11 @@ public interface CrawlService {
 * @param status crawl status
* */
void updateCrawlSingleTask(CrawlSingleTask task, Byte status);

/**
 * Get crawl rule details for a crawl source
 * @param id crawl source ID
 * @return the crawl source, or null if it does not exist
*/
CrawlSource getCrawlSource(Integer id);
}
@@ -70,7 +70,24 @@ public void addCrawlSource(CrawlSource source) {
crawlSourceMapper.insertSelective(source);

}

@Override
public void updateCrawlSource(CrawlSource source) {
    if (source.getId() != null) {
        Optional<CrawlSource> opt = crawlSourceMapper.selectByPrimaryKey(source.getId());
        if (opt.isPresent()) {
            CrawlSource crawlSource = opt.get();
            if (crawlSource.getSourceStatus() == (byte) 1) {
                // Stop the running crawler before changing its rules
                openOrCloseCrawl(crawlSource.getId(), (byte) 0);
            }
            Date currentDate = new Date();
            crawlSource.setUpdateTime(currentDate);
            crawlSource.setCrawlRule(source.getCrawlRule());
            crawlSource.setSourceName(source.getSourceName());
            crawlSourceMapper.updateByPrimaryKey(crawlSource);
        }
    }
}
@Override
public PageBean<CrawlSource> listCrawlByPage(int page, int pageSize) {
PageHelper.startPage(page, pageSize);
@@ -206,6 +223,16 @@ public void updateCrawlSingleTask(CrawlSingleTask task, Byte status) {

}

@Override
public CrawlSource getCrawlSource(Integer id) {
    return crawlSourceMapper.selectByPrimaryKey(id).orElse(null);
}

/**
 * Parse the category list
*/
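Taken together, the service changes above support a load, modify, save flow for crawl sources, with updateCrawlSource stopping a running crawler before the rule is replaced. A rough caller-side sketch only, not code from this commit; package imports are omitted, and the class, method, and argument names below are hypothetical (the CrawlSource setters are the ones used in the implementation above):

public class CrawlSourceEditSketch {

    private final CrawlService crawlService;

    public CrawlSourceEditSketch(CrawlService crawlService) {
        this.crawlService = crawlService;
    }

    public void renameAndUpdateRule(Integer id, String newName, String newRuleJson) {
        CrawlSource source = crawlService.getCrawlSource(id); // null if the id does not exist
        if (source != null) {
            source.setSourceName(newName);
            source.setCrawlRule(newRuleJson);       // rule format is project-specific JSON
            crawlService.updateCrawlSource(source); // stops a running crawler for this source, then saves
        }
    }
}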
@@ -30,6 +30,7 @@
<ul class="log_list">
<li><a class="link_1" href="/">爬虫源管理</a></li>
<li><a class="link_1 on" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!--<li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>
@@ -30,6 +30,7 @@
<ul class="log_list">
<li><a class="link_1" href="/">爬虫源管理</a></li>
<li><a class="link_1 on" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!-- <li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>
@@ -30,6 +30,7 @@
<ul class="log_list">
<li><a class="link_1 on" href="/">爬虫源管理</a></li>
<li><a class="link_1" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!--<li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>
@@ -29,6 +29,7 @@
<ul class="log_list">
<li><a class="link_1 on" href="/">爬虫源管理</a></li>
<li><a class="link_1" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!-- <li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>
@@ -38,7 +39,7 @@
<div class="my_bookshelf">
<div class="title cf">
<h2 class="fl">爬虫源列表</h2>
<div class="fr"><a href="/crawl/crawlSource_add.html" class="btn_red">增加爬虫源</a></div>
<div class="fr"><a href="/crawl/crawlSource_add.html" class="btn_red">增加爬虫源</a>
</div>

<div id="divData" class="updateTable">
@@ -119,6 +120,7 @@ <h2 class="fl">爬虫源列表</h2>
<script language="javascript" type="text/javascript">
search(1, 10);

var pageCrawlSourceList=null;
function search(curr, limit) {

$.ajax({
@@ -129,6 +131,7 @@ <h2 class="fl">爬虫源列表</h2>
success: function (data) {
if (data.code == 200) {
var crawlSourceList = data.data.list;
pageCrawlSourceList=data.data.list;
if (crawlSourceList.length > 0) {
var crawlSourceListHtml = "";
for(var i=0;i<crawlSourceList.length;i++){
@@ -147,7 +150,9 @@ <h2 class="fl">爬虫源列表</h2>
" <td class=\"goread\" id='sourceStatus"+crawlSource.id+"'>"+(crawlSource.sourceStatus==0?'停止运行':'正在运行')+
" </td>\n" +

" <td class=\"goread\" id='opt"+crawlSource.id+"'><a href='javascript:openOrStopCrawl("+crawlSource.id+","+crawlSource.sourceStatus+")'>"+(crawlSource.sourceStatus==0?'开启':'关闭')+" </a></td> </tr>");
" <td class=\"goread\" id='opt"+crawlSource.id+"'><a href='javascript:openOrStopCrawl("+crawlSource.id+","+crawlSource.sourceStatus+")'>"+(crawlSource.sourceStatus==0?'开启':'关闭')+" </a>" +
"<a href='javascript:updateCrawlSource("+crawlSource.id+")'>修改 </a>" +
"</td> </tr>");
}
$("#crawlSourceList").html(crawlSourceListHtml);

@@ -196,7 +201,12 @@ <h2 class="fl">爬虫源列表</h2>
})

}
// Store the selected source id for the edit page, then navigate to it
function updateCrawlSource(crawlSourceId) {
    localStorage.setItem("crawlSourceId", crawlSourceId);
    window.location.href = "/crawl/crawlSource_update.html";
}

function openOrStopCrawl(sourceId,status) {
