Skip to content

Commit

Permalink
添加“Oschina开源中国博客”功能
Browse files Browse the repository at this point in the history
  • Loading branch information
kangvcar committed Jul 19, 2020
1 parent 1c07f25 commit 58ba309
Show file tree
Hide file tree
Showing 12 changed files with 140 additions and 4 deletions.
69 changes: 69 additions & 0 deletions Spiders/oschina/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import re
import os
import sys
import json
import requests
from bs4 import BeautifulSoup
from tkinter.filedialog import askdirectory

class Oschina(object):
    """Scrape the article list of an OSChina blog and save it as JSON.

    Typical use::

        oschina = Oschina('https://my.oschina.net/kangvcar')
        oschina.save_as_json(oschina.get_element_of_article())
    """

    # Patterns for the publication date and time found in a post's "extra" text.
    # Compiled once here instead of once per fetched page.
    _DATE_PATTERN = re.compile(r"\d+/\d{1,2}/\d{1,2}")
    _TIME_PATTERN = re.compile(r"\d{2}:\d{2}")

    def __init__(self, blogurl):
        """Store the blog URL and ask the user where to save the output.

        Args:
            blogurl: Home-page URL of the blog, e.g.
                ``https://my.oschina.net/kangvcar``.

        Raises:
            SystemExit: With exit code 1 if the folder chooser is cancelled.
        """
        # The widget path is appended as '/widgets/...', so drop surrounding
        # whitespace and any trailing '/' to avoid building a '//' URL.
        self.blogurl = blogurl.strip().rstrip('/')
        self.path = askdirectory(title='选择信息保存文件夹')
        # A cancelled dialog yields an empty value; a plain truthiness test
        # covers both an empty string and an empty tuple.
        if not self.path:
            sys.exit(1)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        }

    def get_element_of_article(self):
        """Collect title, summary, publish date/time and view count of every post.

        Pages of the blog's "newest blog" widget are requested one after
        another (``p=1, 2, ...``) until a page contains no post elements.

        Returns:
            A JSON string encoding a list of dicts with the keys ``title``,
            ``sumary``, ``postdate``, ``posttime`` and ``views``.
        """
        # Function-scope imports: neither name is imported at file level.
        import time
        from tqdm import tqdm

        # Full form of the request:
        # <blogurl>/widgets/_space_index_newest_blog?catalogId=0&q=&p={}&type=ajax
        url = self.blogurl + '/widgets/_space_index_newest_blog'
        article_list = []
        pos = 1
        while True:
            key_dict = {
                'catalogId': '0',
                'q': '',
                'p': str(pos),
                'type': 'ajax'
            }
            reps = requests.get(url, headers=self.headers, params=key_dict, timeout=10)
            soup = BeautifulSoup(reps.text, "html.parser")
            posts = soup.find_all("div", class_="content")
            if not posts:
                # An empty page means we are past the last page.
                break
            pbar = tqdm(posts)
            for each_post in pbar:
                item = self._parse_post(each_post)
                if item is None:
                    # Best effort: skip elements that are not regular posts.
                    continue
                article_list.append(item)
                pbar.set_description("正在爬取文章:%s" % item['title'])
            # Brief pause before requesting the next page.
            time.sleep(0.1)
            pos += 1
        return json.dumps(article_list)

    def _parse_post(self, post):
        """Return the article dict for one post element, or None if it does not parse."""
        try:
            extra = post.find("div", class_="extra")
            # Date and time are read from this post's own "extra" block.
            extra_text = extra.text
            return {
                'title': post.find("a", class_="header").text.replace(" ", "").split('\n')[-2],
                # NOTE: the misspelled key 'sumary' is kept on purpose; it is
                # part of the emitted JSON format.
                'sumary': post.find("div", class_="description").text.strip().replace('\n', ''),
                'postdate': self._DATE_PATTERN.findall(extra_text)[0],
                'posttime': self._TIME_PATTERN.findall(extra_text)[0],
                'views': extra.find_all('div', class_='item')[-2].text.strip(),
            }
        except (AttributeError, IndexError):
            # AttributeError: an expected tag is missing (find() returned None).
            # IndexError: the tag exists but lacks the expected pieces.
            return None

    def save_as_json(self, content_json):
        """Write ``content_json`` to ``oschina_article.json`` in the chosen folder.

        Args:
            content_json: The JSON text to write, as returned by
                ``get_element_of_article``.
        """
        target = os.path.join(self.path, 'oschina_article.json')
        with open(target, 'w', encoding='utf-8') as f:
            f.write(content_json)


if __name__ == '__main__':
    # get_element_of_article and save_as_json are methods of Oschina, not
    # module-level functions, so build the scraper first and call them on it.
    oschina = Oschina('https://my.oschina.net/kangvcar')
    article = oschina.get_element_of_article()
    oschina.save_as_json(article)
55 changes: 53 additions & 2 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
- [x] 12306
- [x] 博客园
- [x] CSDN博客
- [x] 开源中国博客
- [ ] 学信网
- [ ] 携程
- [ ] 微信好友
Expand Down Expand Up @@ -3800,7 +3801,7 @@

1. 点击**博客园**数据源按钮

![cnblog1.png](https://i.loli.net/2020/07/19/mlhiFJ4M2GxaR9D.png ':size=10%')
![cnblog1.png](https://i.loli.net/2020/07/19/TyIhNxdX5wFEtYH.png ':size=10%')

2. 输入博客园用户名

Expand Down Expand Up @@ -3845,7 +3846,7 @@

1. 点击**CSDN**数据源按钮

![csdn1.png](https://i.loli.net/2020/07/19/Q9PhVIpSHxiWFR7.png ':size=10%')
![csdn1.png](https://i.loli.net/2020/07/19/3cnra4DZIsGEpvk.png ':size=10%')

2. 输入CSDN博客用户名

Expand Down Expand Up @@ -3880,6 +3881,56 @@
```

</details>

***
## Oschina开源中国博客

!> **说明**:无需登录账号, 输入开源中国博客个人主页链接 (如 [https://my.oschina.net/kangvcar](https://my.oschina.net/kangvcar) ) .

### 使用步骤

1. 点击**开源中国博客**数据源按钮

![oschina1.png](https://i.loli.net/2020/07/19/IlyC7ahoAsOH8Tm.png ':size=10%')

2. 输入开源中国博客个人主页链接, 如 [https://my.oschina.net/kangvcar](https://my.oschina.net/kangvcar)

![oschina2.png](https://i.loli.net/2020/07/19/4i7xDYXZArjqdOm.png ':size=50%')

!> **注意**:个人主页链接最后不含 `/` (斜杠)

3. 选择数据保存路径

![oschina3.png](https://i.loli.net/2020/07/19/8vMerkOSI7XoAm4.png ':size=50%')

4. 查看爬取的数据 (json格式)

![oschina4.png](https://i.loli.net/2020/07/19/BLZfkFYMXEPNjpa.png ':size=50%')

### 数据说明

?> 👍 由于数据信息过长, 这里只作主要数据项说明, **点击展开查看示例**

<details>
<summary>oschina_article.json 👉 你的开源中国博客文章信息</summary>

```json
[
{
"title": "PXE/KickStart\u65e0\u4eba\u503c\u5b88\u5b89\u88c5",
"sumary": "\u5bfc\u8a00 \u4f5c\u4e3a\u4e2d\u5c0f\u516c\u53f8\u7684\u8fd0\u7ef4\uff0c\u7ecf\u5e38\u4f1a\u9047\u5230\u4e00\u4e9b\u673a\u68b0\u5f0f\u7684\u91cd\u590d\u5de5\u4f5c\uff0c\u4f8b\u5982\uff1a\u6709\u65f6\u516c\u53f8\u540c\u65f6\u4e0a\u7ebf\u51e0\u5341\u751a\u81f3\u4e0a\u767e\u53f0\u670d\u52a1\u5668\uff0c\u800c\u4e14\u9700\u8981\u6211\u4eec\u5728\u77ed\u65f6\u95f4\u5185\u5b8c\u6210\u7cfb\u7edf\u5b89\u88c5\u3002 \u5e38\u89c4\u7684\u529e\u6cd5\u6709\u4ec0\u4e48\uff1f _\u5149\u76d8\u5b89\u88c5\u7cfb\u7edf ===> \u4e00...",
"postdate": "2018/05/07",
"posttime": "21:17",
"views": "132"
},
...
]
```

</details>



***
# License
GPL-3.0
Expand Down
Binary file modified docs/_media/cnblog1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/_media/csdn1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/_media/oschina1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/_media/oschina2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/_media/oschina3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/_media/oschina4.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
20 changes: 18 additions & 2 deletions tools/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,22 @@ def OnClick(self, event):
self.updateStatus(self.frame, 2)
pass


class OschinaButton(Button):
    """Data-source button that exports the article list of an OSChina blog."""

    def OnClick(self, event):
        """Ask for the blog home-page URL, then scrape and save its articles.

        Nothing happens if the dialog is dismissed without OK. Otherwise
        updateStatus is called with 0 before the work starts, with 1 once the
        JSON has been saved, and with 2 if anything goes wrong.
        """
        dlg = wx.TextEntryDialog(None, u"请输入开源中国个人博客主页链接:", u"获取开源中国博客用户文章信息")
        try:
            if dlg.ShowModal() != wx.ID_OK:
                return
            blogurl = dlg.GetValue()  # value typed into the text box
        finally:
            # Destroy the dialog on every path, including cancel, so the
            # native window is not leaked.
            dlg.Destroy()
        from oschina.main import Oschina
        try:
            self.updateStatus(self.frame, 0)
            oschina = Oschina(blogurl)
            article = oschina.get_element_of_article()
            oschina.save_as_json(article)
            self.updateStatus(self.frame, 1)
        except (Exception, SystemExit):
            # SystemExit is caught deliberately: Oschina.__init__ calls
            # sys.exit(1) when the folder chooser is cancelled, and that must
            # be reported as a failure here rather than terminate the GUI.
            self.updateStatus(self.frame, 2)
class Item:
x = 0
y = 0
Expand Down Expand Up @@ -855,9 +870,10 @@ def __init__(self, *args, **kw):
A12306Button(self, self.pnl, Item(start_x+xstep, start_y+ystep*3, '12306', 'resource/icon/12306.png'))
CnblogButton(self, self.pnl, Item(start_x+xstep*2, start_y+ystep*3, '博客园', 'resource/icon/cnblog.png'))
CsdnButton(self, self.pnl, Item(start_x+xstep*3, start_y+ystep*3, 'CSDN博客', 'resource/icon/csdn.png'))
OschinaButton(self, self.pnl, Item(start_x+xstep*4, start_y+ystep*3, '开源中国博客', 'resource/icon/oschina.png'))
# CtripButton(self, self.pnl, Item(start_x+xstep*2, start_y+ystep*3, '携程', 'resource/icon/ctrip.png'))
# ChisButton(self, self.pnl, Item(start_x+xstep*3, start_y+ystep*3, '学信网', 'resource/icon/xuexin.png'))
WechatButton(self, self.pnl, Item(start_x+xstep*4, start_y+ystep*3, '微信好友', 'resource/icon/wechat.png'))
# WechatButton(self, self.pnl, Item(start_x+xstep*4, start_y+ystep*3, '微信好友', 'resource/icon/wechat.png'))
WechatmomentButton(self, self.pnl, Item(start_x+xstep*5, start_y+ystep*3, '微信朋友圈', 'resource/icon/wechat-moments.png'))
# GjjButton(self, self.pnl, Item(start_x +xstep*4, start_y+ystep*2, '公积金', 'resource/icon/gjj.png'))

Expand Down
Binary file modified tools/resource/icon/cnblog.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tools/resource/icon/csdn.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tools/resource/icon/oschina.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 58ba309

Please sign in to comment.