Skip to content

Commit 27e75c1

Browse files
author
lucifer
committed
feat: 重构爬虫相关代码
1 parent f21ec82 commit 27e75c1

15 files changed

+8382
-7085
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,5 @@
2222
npm-debug.log*
2323
yarn-debug.log*
2424
yarn-error.log*
25+
26+
!/spider/**/.gitkeep

README.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
|-spider 爬虫抓取和转化后的相关文件
1414
|---raw-markdown 由github上抓到的题解markdown原文件
1515
|---yield-db-json 从markdown中提取标题、tag、公司、各语言题解生产的json
16-
|-src 源代码
16+
|-src 源代码
1717
|-scripts
1818
|--- constants.js 脚本常量
1919
|--- curl-leetcode.js 爬虫请求逻辑
@@ -24,19 +24,15 @@
2424
|---App.js 主逻辑都在这里
2525
```
2626

27-
## 爬虫
28-
- npm run crawl 此命令会先从github上拉取问题列表,将文件名解析成数组,然后根据问题名称循 环拉取与之对应的markdown文件(此过程会先查找本地是否存在,如果存在则跳过)
29-
问题拉取完成后,根据markdown匹配正则,转化成所需的json文件
30-
31-
27+
- [爬虫相关](./scripts/README.md)
3228

3329
## 构建
30+
3431
- npm run build
3532
- 然后将 build 文件夹的内容添加到扩展中即可,具体方式见上面的`功能介绍`
3633

3734
> 以后每次执行 npm run build, 插件会自动刷新,无需手动加载。
3835
39-
4036
## 计划
4137

4238
- [ ] 完善题目,优先级比较高的是 91 的这些题目,按照现有的两个题目的标准进行完善。

scripts/LeetCodeProvider.js

Lines changed: 41 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,43 @@
1-
2-
3-
const request = require('request')
4-
const Iconv = require('iconv-lite')
5-
const cheerio = require('cheerio')
6-
7-
const Logger = require('./logger')
8-
const Utils = require('./utils')
9-
const { PROBLEMS_URL, QUESTION_DOM_SELECTOR, BASE_MARKDWON_DOWNLOAD_URL, ENGLISH_MARKDOWN_SIGN } = require('./constants')
1+
const request = require("request");
2+
const Iconv = require("iconv-lite");
3+
const cheerio = require("cheerio");
4+
5+
const Logger = require("./logger");
6+
const Utils = require("./utils");
7+
const {
8+
PROBLEMS_URL,
9+
QUESTION_DOM_SELECTOR,
10+
BASE_MARKDWON_DOWNLOAD_URL,
11+
ENGLISH_MARKDOWN_SIGN,
12+
} = require("./constants");
1013

1114
module.exports = LeetCodeProvider = {
12-
13-
14-
15-
getProblemsTitle() {
16-
17-
return Utils.httpGet(PROBLEMS_URL)
18-
.then((body)=> {
19-
let titles = []
20-
let sHtml = Iconv.decode(body, 'utf-8').toString()
21-
cheerio.load(sHtml)(QUESTION_DOM_SELECTOR).each((idx, ele) => titles.push(ele.attribs['title']))
22-
Logger.success('获取问题列表成功')
23-
24-
return titles.filter(name => !name.endsWith(ENGLISH_MARKDOWN_SIGN))
25-
})
26-
.catch(error => {
27-
Logger.error('获取问题列表失败', error)
28-
})
29-
},
30-
31-
32-
33-
34-
getProblemDetail(problemNameWithExt) {
35-
36-
return Utils.httpGet(`${BASE_MARKDWON_DOWNLOAD_URL}${problemNameWithExt}`)
37-
.then(body => {
38-
39-
let markdown = Iconv.decode(body, 'utf-8').toString()
40-
Logger.success(`抓取问题 "${problemNameWithExt}" 成功!`)
41-
return markdown
42-
})
43-
.catch(error => {
44-
Logger.error(`抓取问题 "${problemNameWithExt}" 失败`, error)
45-
})
46-
}
47-
48-
}
49-
15+
getProblemsTitle() {
16+
return Utils.httpGet(PROBLEMS_URL)
17+
.then((body) => {
18+
let titles = [];
19+
let sHtml = Iconv.decode(body, "utf-8").toString();
20+
cheerio
21+
.load(sHtml)(QUESTION_DOM_SELECTOR)
22+
.each((idx, ele) => titles.push(ele.attribs["title"]));
23+
Logger.success("获取问题列表成功");
24+
25+
return titles.filter((name) => !name.endsWith(ENGLISH_MARKDOWN_SIGN));
26+
})
27+
.catch((error) => {
28+
Logger.error("获取问题列表失败", error);
29+
});
30+
},
31+
32+
getProblemDetail(problemNameWithExt) {
33+
return Utils.httpGet(`${BASE_MARKDWON_DOWNLOAD_URL}${problemNameWithExt}`)
34+
.then((body) => {
35+
let markdown = Iconv.decode(body, "utf-8").toString();
36+
Logger.success(`抓取问题 "${problemNameWithExt}" 成功!`);
37+
return markdown;
38+
})
39+
.catch((error) => {
40+
Logger.error(`抓取问题 "${problemNameWithExt}" 失败`, error);
41+
});
42+
},
43+
};

scripts/Logger.js

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,18 @@
1-
21
const log4js = require("log4js");
32

4-
const logger = log4js.getLogger()
3+
const logger = log4js.getLogger();
54

6-
logger.level = 'debug'
5+
logger.level = "debug";
76

8-
logger.category = 'LeetCode'
7+
logger.category = "LeetCode";
98

109
const Logger = {
11-
12-
success(...args) {
13-
logger.info(...args)
14-
},
15-
error(...args) {
16-
logger.error(...args)
17-
}
18-
19-
}
20-
21-
module.exports = Logger
10+
success(...args) {
11+
logger.info(...args);
12+
},
13+
error(...args) {
14+
logger.error(...args);
15+
},
16+
};
17+
18+
module.exports = Logger;

scripts/README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
## 爬虫
2+
3+
### 使用
4+
5+
```
6+
npm run crawl
7+
```
8+
9+
### 原理
10+
11+
此命令会:
12+
13+
- 先从 GitHub 上拉取题解列表,将文件名解析成数组
14+
- 然后根据题解名称循环拉取与之对应的 markdown 文件(此过程会先查找本地是否存在,如果存在则跳过)
15+
- 题解拉取完成后,用正则从 markdown 中提取标题、tag、公司和各语言题解,转化成所需的 json 文件
16+
17+
### 架构图
18+
19+
![爬虫架构图](https://tva1.sinaimg.cn/large/007S8ZIlly1gfpjc8au5fj30np0okjuc.jpg)

scripts/constants.js

Lines changed: 47 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,48 @@
1-
2-
3-
41
module.exports = {
5-
6-
/**
7-
* 需解析的语言类型
8-
*/
9-
SUPPORT_LANGUAGE: [
10-
'java',
11-
'js',
12-
'cpp',
13-
'py'
14-
],
15-
16-
/**
17-
* 是否启用强制更新
18-
* 如开启,会跳过读取本地缓存,拉取最新文件
19-
*/
20-
IS_FORCE_UPDATE_MODE: true,
21-
22-
/**
23-
* 请求处理频率 ms
24-
*/
25-
REQUEST_RATE: 300,
26-
27-
/**
28-
* markdown输出目录
29-
*/
30-
RAW_MARKDOWN_OUTPUT_DIR: 'spider/raw-markdown',
31-
32-
/**
33-
* 转化后的json输出目录
34-
*/
35-
DB_JSON_OUTPUT_DIR: 'spider/yield-db-json',
36-
37-
/**
38-
* 获取问题列表地址
39-
*/
40-
PROBLEMS_URL: 'https://github.com/azl397985856/leetcode/tree/master/problems',
41-
42-
/**
43-
* 抓取页面问题内容的dom元素选择器
44-
*/
45-
QUESTION_DOM_SELECTOR: '.js-navigation-item .content .js-navigation-open',
46-
47-
/**
48-
* markdwon下载地址
49-
*/
50-
BASE_MARKDWON_DOWNLOAD_URL: 'https://raw.githubusercontent.com/azl397985856/leetcode/master/problems/',
51-
52-
/**
53-
* 过滤英文文档末尾标识
54-
*/
55-
ENGLISH_MARKDOWN_SIGN: '.en.md'
56-
57-
}
2+
/**
3+
* 需解析的语言类型
4+
*/
5+
SUPPORT_LANGUAGE: ["java", "js", "cpp", "py"],
6+
7+
/**
8+
* 是否启用强制更新
9+
* 如开启,会跳过读取本地缓存,拉取最新文件
10+
*/
11+
IS_FORCE_UPDATE_MODE: true,
12+
13+
/**
14+
* 请求处理频率 ms
15+
*/
16+
REQUEST_RATE: 300,
17+
18+
/**
19+
* markdown输出目录
20+
*/
21+
RAW_MARKDOWN_OUTPUT_DIR: "spider/raw-markdown",
22+
23+
/**
24+
* 转化后的json输出目录
25+
*/
26+
DB_JSON_OUTPUT_DIR: "spider/yield-db-json",
27+
28+
/**
29+
* 获取问题列表地址
30+
*/
31+
PROBLEMS_URL: "https://github.com/azl397985856/leetcode/tree/master/problems",
32+
33+
/**
34+
* 抓取页面问题内容的dom元素选择器
35+
*/
36+
QUESTION_DOM_SELECTOR: ".js-navigation-item .content .js-navigation-open",
37+
38+
/**
39+
* markdown下载地址
40+
*/
41+
BASE_MARKDWON_DOWNLOAD_URL:
42+
"https://raw.githubusercontent.com/azl397985856/leetcode/master/problems/",
43+
44+
/**
45+
* 过滤英文文档末尾标识
46+
*/
47+
ENGLISH_MARKDOWN_SIGN: ".en.md",
48+
};

0 commit comments

Comments
 (0)