feat(crawl): add crawl rule editing and rule testing
Merge pull request 201206030#71 from the master branch
xiaoyang committed Dec 24, 2021
1 parent fecf03b commit 96662fc
Showing 11 changed files with 826 additions and 7 deletions.
2 changes: 1 addition & 1 deletion novel-admin/pom.xml
@@ -106,7 +106,7 @@
<dependency>
<groupId>org.apache.shiro</groupId>
<artifactId>shiro-spring</artifactId>
<version>1.3.2</version>
<version>1.7.0</version>
</dependency>
<!-- shiro ehcache -->
<dependency>
@@ -65,4 +65,8 @@ public interface CacheKey {
 * Accumulated novel visit count
* */
String BOOK_ADD_VISIT_COUNT = "bookAddVisitCount";
}
/**
 * Cache for crawl rule testing
*/
String BOOK_TEST_PARSE = "testParse";
}
@@ -1,5 +1,8 @@
package com.java2nb.novel.controller;

import com.java2nb.novel.core.cache.CacheKey;
import com.java2nb.novel.core.cache.CacheService;
import com.java2nb.novel.core.utils.HttpUtil;
import io.github.xxyopen.model.page.PageBean;

import com.java2nb.novel.entity.CrawlSingleTask;
@@ -9,6 +12,11 @@
import lombok.RequiredArgsConstructor;
import org.springframework.web.bind.annotation.*;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* @author Administrator
*/
@@ -19,7 +27,7 @@ public class CrawlController {

private final CrawlService crawlService;


private final CacheService cacheService;
/**
 * Add a crawl source
* */
@@ -39,7 +47,70 @@ public RestResult<PageBean<CrawlSource>> listCrawlByPage(@RequestParam(value = "

return RestResult.ok(crawlService.listCrawlByPage(page,pageSize));
}
/**
 * Get a crawl source
* */
@GetMapping("getCrawlSource/{id}")
public RestResult<CrawlSource> getCrawlSource(@PathVariable("id") Integer id) {
    CrawlSource crawlSource = crawlService.getCrawlSource(id);
    return RestResult.ok(crawlSource);
}

/**
 * Test a crawl rule against a page
 * @param rule      regular expression rule to test (group 1 is returned on a match)
 * @param url       page URL to fetch and match against
 * @param isRefresh "1" to bypass the page cache and re-fetch the URL
 * @return match result
 */
@PostMapping("testParse")
public RestResult<Object> testParse(String rule, String url, String isRefresh) {
    Map<String, Object> resultMap = new HashMap<>();
    String html = null;
    if (url.startsWith("https://") || url.startsWith("http://")) {
        String refreshCache = "1";
        // Try the cache first unless an explicit refresh was requested
        if (!refreshCache.equals(isRefresh)) {
            Object cache = cacheService.getObject(CacheKey.BOOK_TEST_PARSE + url);
            if (cache == null) {
                isRefresh = "1";
            } else {
                html = (String) cache;
            }
        }
        // Fetch the page and cache it for 10 minutes
        if (refreshCache.equals(isRefresh)) {
            html = HttpUtil.getByHttpClientWithChrome(url);
            if (html != null) {
                cacheService.setObject(CacheKey.BOOK_TEST_PARSE + url, html, 60 * 10);
            } else {
                resultMap.put("msg", "html is null");
                return RestResult.ok(resultMap);
            }
        }
    } else {
        resultMap.put("msg", "url must start with http:// or https://");
        return RestResult.ok(resultMap);
    }
    // Apply the rule and report whether it matches, plus the first captured group
    Pattern pattern = Pattern.compile(rule);
    Matcher matcher = pattern.matcher(html);
    boolean isFind = matcher.find();
    resultMap.put("是否匹配", isFind); // key: "matched"
    if (isFind && matcher.groupCount() > 0) {
        resultMap.put("匹配结果", matcher.group(1)); // key: "match result"
    }
    return RestResult.ok(resultMap);
}
/**
 * Update a crawl source
* */
@PostMapping("updateCrawlSource")
public RestResult<Void> updateCrawlSource(CrawlSource source) {
    crawlService.updateCrawlSource(source);
    return RestResult.ok();
}
/**
 * Start or stop the crawler
* */
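For reference, the new testParse endpoint above simply compiles the submitted rule as a Java regex, runs it against the fetched page, and returns capture group 1 on a match. A minimal standalone sketch of that matching step; the sample HTML fragment and rule below are invented for illustration and are not part of the project:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RuleTestSketch {
    public static void main(String[] args) {
        // Hypothetical page fragment and rule; a real rule would come from a crawl source's config
        String html = "<div class=\"book-name\">大道朝天</div>";
        String rule = "<div class=\"book-name\">([^<]+)</div>";

        Matcher matcher = Pattern.compile(rule).matcher(html);
        if (matcher.find()) {
            // Group 1 is what testParse reports as the match result
            System.out.println("matched: " + matcher.group(1));
        } else {
            System.out.println("no match");
        }
    }
}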
@@ -18,7 +18,11 @@ public interface CrawlService {
* */
void addCrawlSource(CrawlSource source);


/**
 * Update a crawl source
 * @param source the crawl source to update
 */
void updateCrawlSource(CrawlSource source);
/**
 * Paged list of crawl sources
 * @param page current page number
@@ -106,4 +110,11 @@ public interface CrawlService {
 * @param status crawl status
* */
void updateCrawlSingleTask(CrawlSingleTask task, Byte status);

/**
 * Get crawl rule details for a crawl source
 * @param id crawl source ID
 * @return the crawl source, or null if it does not exist
*/
CrawlSource getCrawlSource(Integer id);
}
@@ -70,7 +70,24 @@ public void addCrawlSource(CrawlSource source) {
crawlSourceMapper.insertSelective(source);

}

@Override
public void updateCrawlSource(CrawlSource source) {
    if (source.getId() != null) {
        Optional<CrawlSource> opt = crawlSourceMapper.selectByPrimaryKey(source.getId());
        if (opt.isPresent()) {
            CrawlSource crawlSource = opt.get();
            if (crawlSource.getSourceStatus() == (byte) 1) {
                // Stop the running crawler before changing its rules
                openOrCloseCrawl(crawlSource.getId(), (byte) 0);
            }
            Date currentDate = new Date();
            crawlSource.setUpdateTime(currentDate);
            crawlSource.setCrawlRule(source.getCrawlRule());
            crawlSource.setSourceName(source.getSourceName());
            crawlSourceMapper.updateByPrimaryKey(crawlSource);
        }
    }
}
@Override
public PageBean<CrawlSource> listCrawlByPage(int page, int pageSize) {
PageHelper.startPage(page, pageSize);
@@ -206,6 +223,16 @@ public void updateCrawlSingleTask(CrawlSingleTask task, Byte status) {

}

@Override
public CrawlSource getCrawlSource(Integer id) {
    return crawlSourceMapper.selectByPrimaryKey(id).orElse(null);
}

/**
 * Parse the category list
*/
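Taken together, the service changes above support a load, modify, save flow for crawl sources, with updateCrawlSource stopping a running crawler before the rule is replaced. A rough caller-side sketch only, not code from this commit; package imports are omitted, and the class, method, and argument names below are hypothetical (the CrawlSource setters are the ones used in the implementation above):

public class CrawlSourceEditSketch {

    private final CrawlService crawlService;

    public CrawlSourceEditSketch(CrawlService crawlService) {
        this.crawlService = crawlService;
    }

    public void renameAndUpdateRule(Integer id, String newName, String newRuleJson) {
        CrawlSource source = crawlService.getCrawlSource(id); // null if the id does not exist
        if (source != null) {
            source.setSourceName(newName);
            source.setCrawlRule(newRuleJson);       // rule format is project-specific JSON
            crawlService.updateCrawlSource(source); // stops a running crawler for this source, then saves
        }
    }
}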
@@ -30,6 +30,7 @@
<ul class="log_list">
<li><a class="link_1" href="/">爬虫源管理</a></li>
<li><a class="link_1 on" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!--<li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>
@@ -30,6 +30,7 @@
<ul class="log_list">
<li><a class="link_1" href="/">爬虫源管理</a></li>
<li><a class="link_1 on" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!-- <li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>
@@ -30,6 +30,7 @@
<ul class="log_list">
<li><a class="link_1 on" href="/">爬虫源管理</a></li>
<li><a class="link_1" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!--<li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>
@@ -29,6 +29,7 @@
<ul class="log_list">
<li><a class="link_1 on" href="/">爬虫源管理</a></li>
<li><a class="link_1" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!-- <li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>
@@ -38,7 +39,7 @@
<div class="my_bookshelf">
<div class="title cf">
<h2 class="fl">爬虫源列表</h2>
<div class="fr"><a href="/crawl/crawlSource_add.html" class="btn_red">增加爬虫源</a></div>
<div class="fr"><a href="/crawl/crawlSource_add.html" class="btn_red">增加爬虫源</a>
</div>

<div id="divData" class="updateTable">
@@ -119,6 +120,7 @@ <h2 class="fl">爬虫源列表</h2>
<script language="javascript" type="text/javascript">
search(1, 10);

var pageCrawlSourceList=null;
function search(curr, limit) {

$.ajax({
@@ -129,6 +131,7 @@ <h2 class="fl">爬虫源列表</h2>
success: function (data) {
if (data.code == 200) {
var crawlSourceList = data.data.list;
pageCrawlSourceList=data.data.list;
if (crawlSourceList.length > 0) {
var crawlSourceListHtml = "";
for(var i=0;i<crawlSourceList.length;i++){
@@ -147,7 +150,9 @@ <h2 class="fl">爬虫源列表</h2>
" <td class=\"goread\" id='sourceStatus"+crawlSource.id+"'>"+(crawlSource.sourceStatus==0?'停止运行':'正在运行')+
" </td>\n" +

" <td class=\"goread\" id='opt"+crawlSource.id+"'><a href='javascript:openOrStopCrawl("+crawlSource.id+","+crawlSource.sourceStatus+")'>"+(crawlSource.sourceStatus==0?'开启':'关闭')+" </a></td> </tr>");
" <td class=\"goread\" id='opt"+crawlSource.id+"'><a href='javascript:openOrStopCrawl("+crawlSource.id+","+crawlSource.sourceStatus+")'>"+(crawlSource.sourceStatus==0?'开启':'关闭')+" </a>" +
"<a href='javascript:updateCrawlSource("+crawlSource.id+")'>修改 </a>" +
"</td> </tr>");
}
$("#crawlSourceList").html(crawlSourceListHtml);

@@ -196,7 +201,12 @@ <h2 class="fl">爬虫源列表</h2>
})

}
// Store the selected source id for the edit page, then navigate to it
function updateCrawlSource(crawlSourceId) {
    localStorage.setItem("crawlSourceId", crawlSourceId);
    window.location.href = "/crawl/crawlSource_update.html";
}

function openOrStopCrawl(sourceId,status) {
