Skip to content

Commit f21ec82

Browse files
authored
feat: 爬虫
2 parents 33401c8 + 37aeabc commit f21ec82

File tree

13 files changed

+22039
-25
lines changed

13 files changed

+22039
-25
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
# production
1212
/build
13+
/spider/**
1314

1415
# misc
1516
.DS_Store

README.md

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,33 @@
1010

1111
```
1212
|-public 插件的 manifest,html 等静态资源
13-
|-src 源代码
13+
|-spider 爬虫抓取和转化后的相关文件
14+
|---raw-markdown 由github上抓到的题解markdown原文件
15+
|---yield-db-json 从markdown中提取标题、tag、公司、各语言题解生产的json
16+
|-src 源代码
17+
|-scripts
18+
|--- constants.js 脚本常量
19+
|--- curlleetcode.js 爬虫请求逻辑
20+
|--- LeetCodeProvider.js 爬虫基类
21+
|--- Logger.js 日志辅助类
22+
|--- utils.js 正则、文件操作等辅助类
1423
|---db 所有的题目信息, 标签信息, 公司信息
1524
|---App.js 主逻辑都在这里
1625
```
1726

18-
## 构建
27+
## 爬虫
28+
- npm run crawl 此命令会先从github上拉取问题列表,将文件名解析成数组,然后根据问题名称循环拉取与之对应的markdown文件(此过程会先查找本地是否存在,如果存在则跳过)
29+
问题拉取完成后,根据markdown匹配正则,转化成所需的json文件
30+
31+
1932

33+
## 构建
2034
- npm run build
2135
- 然后将 build 文件夹的内容添加到扩展中即可,具体方式见上面的`功能介绍`
2236

2337
> 以后每次执行 npm run build, 插件会自动刷新,无需手动加载。
2438
39+
2540
## 计划
2641

2742
- [ ] 完善题目,优先级比较高的是 91 的这些题目,按照现有的两个题目的标准进行完善。

package-lock.json

Lines changed: 15099 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,20 @@
77
"@testing-library/react": "^9.3.2",
88
"@testing-library/user-event": "^7.1.2",
99
"antd": "^4.3.1",
10+
"highlight.js": "^10.0.3",
11+
"marked": "^1.1.0",
1012
"react": "^16.13.1",
1113
"react-dom": "^16.13.1",
14+
"react-markdown": "^4.3.1",
1215
"react-scripts": "3.4.1"
1316
},
1417
"scripts": {
1518
"lint": "eslint src",
1619
"start": "react-scripts start",
1720
"build": "react-scripts build",
1821
"test": "react-scripts test",
19-
"eject": "react-scripts eject"
22+
"eject": "react-scripts eject",
23+
"crawl": "node scripts/curlLeetcode.js && node scripts/generateLeetcode.js"
2024
},
2125
"browserslist": {
2226
"production": [
@@ -29,5 +33,12 @@
2933
"last 1 firefox version",
3034
"last 1 safari version"
3135
]
36+
},
37+
"devDependencies": {
38+
"axios": "^0.19.2",
39+
"cheerio": "^1.0.0-rc.3",
40+
"iconv-lite": "^0.5.1",
41+
"log4js": "^6.3.0",
42+
"mkdirp": "^1.0.4"
3243
}
3344
}

scripts/LeetCodeProvider.js

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
2+
3+
const request = require('request')
4+
const Iconv = require('iconv-lite')
5+
const cheerio = require('cheerio')
6+
7+
const Logger = require('./logger')
8+
const Utils = require('./utils')
9+
const { PROBLEMS_URL, QUESTION_DOM_SELECTOR, BASE_MARKDWON_DOWNLOAD_URL, ENGLISH_MARKDOWN_SIGN } = require('./constants')
10+
11+
/**
 * Data source for the crawler: lists the problem markdown files and downloads
 * the raw markdown of a single problem.
 *
 * Both methods log failures through Logger and then resolve to `undefined`
 * instead of rejecting; callers must check the resolved value.
 */
const LeetCodeProvider = {

    /**
     * Requests PROBLEMS_URL and collects the `title` attribute of every element
     * matched by QUESTION_DOM_SELECTOR, leaving out names that end with
     * ENGLISH_MARKDOWN_SIGN.
     *
     * @returns {Promise<string[]|undefined>} The collected names, or `undefined`
     *   when the request or the parsing failed (the error is logged).
     */
    getProblemsTitle() {
        return Utils.httpGet(PROBLEMS_URL)
            .then(body => {
                // NOTE(review): body is presumably a Buffer returned by Utils.httpGet - confirm.
                const html = Iconv.decode(body, 'utf-8').toString()
                const $ = cheerio.load(html)
                const titles = []

                $(QUESTION_DOM_SELECTOR).each((idx, ele) => {
                    const title = ele.attribs && ele.attribs['title']
                    // An element without a title attribute used to make `endsWith`
                    // throw, which discarded the whole list; skip it instead.
                    if (typeof title === 'string' && !title.endsWith(ENGLISH_MARKDOWN_SIGN)) {
                        titles.push(title)
                    }
                })

                Logger.success('获取问题列表成功')
                return titles
            })
            .catch(error => {
                Logger.error('获取问题列表失败', error)
            })
    },

    /**
     * Downloads one markdown file from BASE_MARKDWON_DOWNLOAD_URL.
     *
     * @param {string} problemNameWithExt File name including its extension; it is
     *   appended to the base URL as-is.
     * @returns {Promise<string|undefined>} The decoded markdown text, or
     *   `undefined` when the download failed (the error is logged).
     */
    getProblemDetail(problemNameWithExt) {
        return Utils.httpGet(`${BASE_MARKDWON_DOWNLOAD_URL}${problemNameWithExt}`)
            .then(body => {
                const markdown = Iconv.decode(body, 'utf-8').toString()
                Logger.success(`抓取问题 "${problemNameWithExt}" 成功!`)
                return markdown
            })
            .catch(error => {
                Logger.error(`抓取问题 "${problemNameWithExt}" 失败`, error)
            })
    }

}

// Export the declared constant; the previous `module.exports = LeetCodeProvider = {...}`
// assigned to an undeclared identifier and leaked an implicit global.
module.exports = LeetCodeProvider
49+

scripts/Logger.js

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
2+
const log4js = require("log4js");
3+
4+
// Request a logger bound to the "LeetCode" category directly, instead of
// creating the default logger and overwriting its `category` field afterwards.
const logger = log4js.getLogger('LeetCode')

// Emit everything from DEBUG upwards for this category.
logger.level = 'debug'
9+
10+
/**
 * Small facade over the module-level log4js logger used by the crawler scripts.
 * Every argument is forwarded unchanged to the underlying logger.
 */
const Logger = {

    /** Reports a successful step through `logger.info`. */
    success: (...messages) => {
        logger.info(...messages)
    },

    /** Reports a failure through `logger.error`. */
    error: (...messages) => {
        logger.error(...messages)
    }

}
20+
21+
module.exports = Logger

scripts/constants.js

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
2+
3+
4+
module.exports = {
5+
6+
/**
7+
* 需解析的语言类型
8+
*/
9+
SUPPORT_LANGUAGE: [
10+
'java',
11+
'js',
12+
'cpp',
13+
'py'
14+
],
15+
16+
/**
17+
* 是否启用强制更新
18+
* 如开启,会跳过读取本地缓存,拉取最新文件
19+
*/
20+
IS_FORCE_UPDATE_MODE: true,
21+
22+
/**
23+
* 请求处理频率 ms
24+
*/
25+
REQUEST_RATE: 300,
26+
27+
/**
28+
* markdown输出目录
29+
*/
30+
RAW_MARKDOWN_OUTPUT_DIR: 'spider/raw-markdown',
31+
32+
/**
33+
* 转化后的json输出目录
34+
*/
35+
DB_JSON_OUTPUT_DIR: 'spider/yield-db-json',
36+
37+
/**
38+
* 获取问题列表地址
39+
*/
40+
PROBLEMS_URL: 'https://github.com/azl397985856/leetcode/tree/master/problems',
41+
42+
/**
43+
* 抓取页面问题内容的dom元素选择器
44+
*/
45+
QUESTION_DOM_SELECTOR: '.js-navigation-item .content .js-navigation-open',
46+
47+
/**
48+
* markdwon下载地址
49+
*/
50+
BASE_MARKDWON_DOWNLOAD_URL: 'https://raw.githubusercontent.com/azl397985856/leetcode/master/problems/',
51+
52+
/**
53+
* 过滤英文文档末尾标识
54+
*/
55+
ENGLISH_MARKDOWN_SIGN: '.en.md'
56+
57+
}

scripts/curlleetcode.js

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
const LeetCodeProvider = require('./leetcodeprovider')
2+
3+
const Logger = require('./logger')
4+
5+
const Utils = require('./utils')
6+
7+
8+
const { RAW_MARKDOWN_OUTPUT_DIR, REQUEST_RATE, IS_FORCE_UPDATE_MODE } = require('./constants')
9+
10+
11+
// Set up the raw-markdown output directory before any request is made.
// NOTE(review): presumably creates the directory when it is missing - confirm in utils.js.
Utils.mkdirSync(RAW_MARKDOWN_OUTPUT_DIR)

// Index of the problem currently being requested; the crawl starts at the first one.
let requsetNumber = 0
18+
19+
/**
 * Maximum number of attempts for one problem before it is skipped.
 */
const MAX_FETCH_ATTEMPTS = 3

/**
 * Downloads the markdown of `questionsName[requsetNumber]`, stores it in
 * RAW_MARKDOWN_OUTPUT_DIR and then schedules the next problem after
 * REQUEST_RATE milliseconds, until the end of the list is reached.
 *
 * Unless IS_FORCE_UPDATE_MODE is on, a problem whose file name is already
 * listed in the output directory is skipped without a request.
 *
 * A failed download (empty result, rejected request or write error) is retried
 * up to MAX_FETCH_ATTEMPTS times and then skipped. Previously the index only
 * advanced on success, so one permanently failing problem was retried forever.
 *
 * @param {string[]} questionsName File names of all problems to download.
 * @param {number} requsetNumber Index of the problem to process.
 * @param {number} [attempt=1] Attempt counter for the current problem (internal).
 */
const getProblemDetail = (questionsName, requsetNumber, attempt = 1) => {

    // Nothing to crawl: no list was provided (for example the title request
    // failed upstream) or the end of the list has been reached.
    if (!Array.isArray(questionsName) || requsetNumber >= questionsName.length) {
        return
    }

    const problemName = questionsName[requsetNumber]
    const fetchNext = () => getProblemDetail(questionsName, requsetNumber + 1)

    // A blank entry cannot be requested; move on instead of stopping the crawl.
    if (!problemName) {
        fetchNext()
        return
    }

    if (!IS_FORCE_UPDATE_MODE && Utils.getDirsFileNameSync(RAW_MARKDOWN_OUTPUT_DIR).includes(problemName)) {
        Logger.success(`${problemName}命中缓存, 跳过。。。`)
        fetchNext()
        return
    }

    LeetCodeProvider.getProblemDetail(problemName)
        .then(markDown => {
            // The provider resolves to a falsy value when the download failed.
            if (!markDown) {
                Logger.error(`获取${problemName} markdown 失败!`)
                return false
            }

            Logger.success(`问题: "${problemName}" | 结果: ${JSON.stringify(markDown)}`)
            Utils.writeFileSync(RAW_MARKDOWN_OUTPUT_DIR, problemName, markDown)
            return true
        })
        .catch(error => {
            Logger.error(error)
            return false
        })
        .then(succeeded => {
            const shouldRetry = !succeeded && attempt < MAX_FETCH_ATTEMPTS

            if (!succeeded && !shouldRetry) {
                Logger.error(`"${problemName}" 连续失败 ${MAX_FETCH_ATTEMPTS} 次, 跳过`)
            }

            // Throttle: wait REQUEST_RATE ms before the retry or the next problem.
            setTimeout(() => {
                if (shouldRetry) {
                    getProblemDetail(questionsName, requsetNumber, attempt + 1)
                } else {
                    fetchNext()
                }
            }, REQUEST_RATE)
        })
}
58+
59+
60+
// Entry point: fetch the list of problem file names, then download them one by one.
LeetCodeProvider.getProblemsTitle()
    .then(questionsName => {
        // getProblemsTitle resolves to undefined when the list request failed;
        // without this guard the crawl loop would throw on `questionsName[0]`.
        if (!Array.isArray(questionsName) || questionsName.length === 0) {
            Logger.error('问题列表为空, 终止抓取')
            return
        }

        getProblemDetail(questionsName, requsetNumber)
    })
    // Keep unexpected errors from surfacing as an unhandled rejection.
    .catch(Logger.error)
65+
66+
67+
68+

0 commit comments

Comments
 (0)