Skip to content

Commit

Permalink
修复了一堆bug,适配fluid(1.19)
Browse files Browse the repository at this point in the history
  • Loading branch information
2X-ercha committed Jun 9, 2021
1 parent bd89e15 commit c50d0ac
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 23 deletions.
16 changes: 11 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,20 @@
![](https://cdn.nlark.com/yuque/0/2021/png/8391485/1612877553087-3087b091-93ce-40fd-a49f-8baf0f0f49c4.png#align=left&display=inline&height=521&margin=%5Bobject%20Object%5D&name=image.png&originHeight=521&originWidth=386&size=161076&status=done&style=none&width=386)

```
目前 release 1.18 版本:
① 支持butterfly、volantis、matery主题的友链获取
目前 release 1.19 版本:
① 支持butterfly、volantis、matery、sakura、fluid主题的友链获取
② 支持小康友链及 volantis 主题友链,即部署于 gitee 上的 issues 友链获取
③ 支持 butterfly、volantis、matery 主题的最新文章获取
③ 支持 butterfly、volantis、matery、sakura、fluid主题的最新文章获取
④ 支持大部分拥有 sitemap 网站的文章获取
⑤ 拥有友链屏蔽、关键词屏蔽、等自定义 yaml 的配置项
⑥ 代码重构并规范化,便于二次开发
bug修复
① 重复爬取同一文章问题
② 非bf主题爬取报错
③ sitemap重新置后(不通用)
④ hexo-theme-sakura主题两种时间格式引发错误
⑤ 屏蔽链接修复
```
预览链接:https://noionion.top/friendcircle/

Expand Down Expand Up @@ -86,13 +90,15 @@ bug修复
```PY
# component
from theme import butterfly,matery,volantis
from theme import butterfly,matery,volantis,sakura,fluid
# theme fit message
themes = [
butterfly,
matery,
volantis
volantis,
sakura,
fluid
]
```
Expand Down
2 changes: 1 addition & 1 deletion _config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@ setting:
- '绒布球'

block_site: # 屏蔽站点
- https://example.com/
- 'https://example.com/'
7 changes: 4 additions & 3 deletions handlers/coreLink.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,17 @@ def delete_same_link(orign_friend_poordic):
friend_poordic.append(item)
else:
print('-----------------')
print('重复1条友链链接,已删除')
print('重复1条友链链接,已删除!链接为:', item[1])
print('-----------------')
return friend_poordic


# 链接屏蔽
def block_link(orign_friend_poordic):
def block_link(orign_friend_poordic, config = config.yml):
friend_poordic = []
block_site = config['setting']['block_site']
for item in orign_friend_poordic:
if item[1] not in config.BLOCK_SITE:
if item[1] not in block_site:
friend_poordic.append(item)
else:
print('-----------------')
Expand Down
16 changes: 11 additions & 5 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import sys

# component
from theme import butterfly,matery,volantis
from theme import butterfly,matery,volantis,sakura,fluid

# handlers
from handlers.coreSettings import configs
Expand All @@ -36,7 +36,9 @@
themes = [
butterfly,
matery,
volantis
volantis,
sakura,
fluid
]

# ---------- #
Expand Down Expand Up @@ -74,7 +76,10 @@ def get_link(friendpage_link, config):

# get theme_link
for themelinkfun in themes:
themelinkfun.get_friendlink(friendpage_link, friend_poor)
try:
themelinkfun.get_friendlink(friendpage_link, friend_poor)
except:
pass
friend_poor = delete_same_link(friend_poor)
friend_poor = block_link(friend_poor)

Expand All @@ -94,13 +99,14 @@ def spider(item):
error = True
try:
total_count += 1
error, post_poor = sitmap_get(item, post_poor)
if error:
print("-----------获取sitemap信息失败,采取主页爬虫策略----------")
for themelinkfun in themes:
if not error:
break
error = themelinkfun.get_last_post(item, post_poor)
if error:
print("-----------获取主页信息失败,采取sitemap策略----------")
error, post_poor = sitmap_get(item, post_poor)

except Exception as e:
print('\n')
Expand Down
86 changes: 86 additions & 0 deletions theme/fluid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import datetime
from request_data import request

# fluid friend-link rule
def get_friendlink(friendpage_link, friend_poor):
    """Collect friend links from a fluid-theme links page.

    Fetches ``friendpage_link`` through ``request.get_data``, parses the
    result with BeautifulSoup and, for every ``div.card-content`` card,
    appends ``[name, link, img]`` to ``friend_poor``.

    Args:
        friendpage_link: URL of the friend-link page to fetch.
        friend_poor: list that receives one ``[name, link, img]`` entry per
            accepted card; it is mutated in place.

    Returns:
        None. Results are delivered through ``friend_poor``.

    Cards whose link text contains ``#`` are ignored. Cards that lack the
    avatar ``img``, the ``link-intro`` div or the ``link-title`` div are
    skipped, so a single foreign or malformed card no longer aborts the
    whole page with an ``AttributeError``.
    """
    result = request.get_data(friendpage_link)
    soup = BeautifulSoup(result, 'html.parser')
    for card in soup.find_all('div', {"class": "card-content"}):
        avatar = card.find('img')
        intro = card.find('div', {"class": "link-intro"})
        title = card.find('div', {"class": "link-title"})
        if avatar is None or intro is None or title is None:
            # Not a complete friend card; skip it and keep going.
            continue

        img = avatar.get('src')
        # NOTE(review): the homepage link is read from the text of the
        # "link-intro" div, exactly as before — confirm that this div really
        # carries the URL on the target sites.
        link = intro.text
        name = title.text
        if "#" in link:
            continue

        print('----------------------')
        try:
            print('好友名%r' % name)
        except UnicodeError:
            # Some consoles cannot encode every character of the name.
            print('非法用户名')
        print('头像链接%r' % img)
        print('主页链接%r' % link)
        friend_poor.append([name, link, img])

# Date parsing shared by both passes of get_last_post
def _parse_fluid_date(text):
    """Return ``text`` parsed as a ``%Y-%m-%d`` datetime, or None if it is not a date.

    "|" characters and all whitespace are removed before parsing.
    """
    cleaned = "".join(text.replace("|", "").split())
    try:
        return datetime.datetime.strptime(cleaned, "%Y-%m-%d")
    except ValueError:
        return None


# Fetch the newest post(s) from a fluid home page
def get_last_post(user_info, post_poor):
    """Append the newest post(s) found on a fluid-theme home page to ``post_poor``.

    Args:
        user_info: sequence where index 0 is the friend name, index 1 the
            home-page URL and index 2 the avatar URL (these are the only
            indices read here).
        post_poor: list that receives one dict per matching post with the
            keys ``title``, ``time``, ``link``, ``name`` and ``img``; it is
            mutated in place.

    Returns:
        bool: False when at least one post was appended, True otherwise
        (the page does not look like fluid, or no post carries the newest
        date).

    The page is scanned twice: first to find the newest date among the
    ``div.post-meta.mr-3`` elements inside ``#board``, then to collect every
    index card whose date equals it. Both passes parse dates with the same
    helper and compare parsed values, so a non-zero-padded date such as
    ``2021-6-9`` is matched instead of being silently dropped.
    """
    # Local import keeps this block self-contained within the file.
    from urllib.parse import urljoin

    error_sitmap = False
    link = user_info[1]
    print('\n')
    print('-------执行fluid主页规则----------')
    print('执行链接:', link)
    result = request.get_data(link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all(id='board')
    time_excit = soup.find_all('div', {"class": "post-meta mr-3"})
    if main_content and time_excit:
        error_sitmap = True
        board = main_content[0]

        # Pass 1: newest date on the page. Meta divs whose text is not a
        # date are ignored.
        lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
        for meta in board.find_all('div', {"class": "post-meta mr-3"}):
            parsed = _parse_fluid_date(meta.text)
            if parsed is not None and lasttime < parsed:
                lasttime = parsed
        lasttime_text = lasttime.strftime('%Y-%m-%d')
        print('最新时间是', lasttime_text)

        # A trailing slash makes relative hrefs resolve below the site root.
        base = link if link.endswith('/') else link + '/'

        # Pass 2: collect every card published on that newest date.
        for item in board.find_all('div', {"class": "row mx-auto index-card"}):
            meta = item.find('div', {"class": "post-meta mr-3"})
            anchor = item.find('a')
            header = item.find('h1', {"class": "index-header"})
            if meta is None or anchor is None or header is None:
                continue
            if _parse_fluid_date(meta.text) != lasttime:
                continue
            href = anchor.get('href')
            if not href:
                continue

            title = header.text.strip()
            # urljoin handles root-relative and absolute hrefs; plain
            # concatenation produced "//" or a duplicated path for them.
            post_link = urljoin(base, href)
            error_sitmap = False
            # Drop characters a GBK console cannot display before printing.
            print(title.encode("gbk", 'ignore').decode('gbk', 'ignore'))
            print(post_link)
            print("-----------获取到匹配结果----------")
            post_poor.append({
                'title': title,
                'time': lasttime_text,
                'link': post_link,
                'name': user_info[0],
                'img': user_info[2],
            })
    else:
        error_sitmap = True
        print('貌似不是类似fluid主题!')
    print("-----------结束fluid主页规则----------")
    print('\n')
    return error_sitmap
13 changes: 4 additions & 9 deletions theme/sakura.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from bs4 import BeautifulSoup
import datetime
from request_data import request
import re

# sakura 友链规则
def get_friendlink(friendpage_link, friend_poor):
Expand Down Expand Up @@ -46,22 +47,16 @@ def get_last_post(user_info,post_poor):
lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
for index, item in enumerate(link_list):
time = item.text
time = time.replace("|","")
time = time.replace(" ", "")
time = time.replace("\n", "")
time = time.replace("发布于", "")
time = time.replace("\t", "")

time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", time).group(0)
if lasttime < datetime.datetime.strptime(time, "%Y-%m-%d"):
lasttime = datetime.datetime.strptime(time, "%Y-%m-%d")
lasttime = lasttime.strftime('%Y-%m-%d')
print('最新时间是', lasttime)
last_post_list = main_content[0].find_all('article', {"class": "post"})
for item in last_post_list:
time_created = item.find('div', {"class": "post-date"}).text.strip()
time_created = time_created.replace(" ", "")
time_created = time_created.replace("发布于", "")
time_created = time_created.replace("\t", "")
time_created = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", time_created).group(0)
time_created = datetime.datetime.strptime(time_created, "%Y-%m-%d").strftime("%Y-%m-%d")
if time_created == lasttime:
error_sitmap = False
print(lasttime)
Expand Down

0 comments on commit c50d0ac

Please sign in to comment.