forked from lzjun567/python_scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
33 changed files
with
940 additions
and
16,751 deletions.
There are no files selected for viewing
Binary file not shown.
Empty file.
File renamed without changes.
File renamed without changes.
Empty file.
File renamed without changes
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# -*- coding:utf-8 -*- | ||
import codecs | ||
import csv | ||
import re | ||
|
||
import jieba.analyse | ||
import matplotlib.pyplot as plt | ||
import requests | ||
from scipy.misc import imread | ||
from wordcloud import WordCloud | ||
|
||
__author__ = 'liuzhijun'


# Session cookies sent with every request made through `requests`.
# The values here are placeholders and have to be replaced with the cookies
# of a logged-in session before the crawler is run.
cookies = dict([
    ("ALF", "xxxx"),
    ("SCF", "xxxxxx."),
    ("SUBP", "xxxxx"),
    ("SUB", "xxxx"),
    ("SUHB", "xxx-"),
    ("xx", "xx"),
    ("_T_WM", "xxx"),
    ("gsScrollPos", ""),
    ("H5_INDEX", "0_my"),
    ("H5_INDEX_TITLE", "xxx"),
    ("M_WEIBOCN_PARAMS", "xxxx"),
])
|
||
|
||
# Markup and boilerplate phrases stripped from every post before analysis.
# Compiled once at import time rather than once per post.
# The question mark sits inside a character class on purpose: a bare "?"
# directly after "|" is a quantifier with nothing to repeat, which makes the
# pattern fail to compile. The full-width forms of "," and "?" are matched
# alongside the ASCII ones.
_NOISE_PATTERN = re.compile(
    "<a .*?/a>|<i .*?/i>|转发微博|//:|Repost|[,,]|[??]|。|、|分享图片"
)


def cleanring(content):
    """Remove useless characters from a post.

    Strips ``<a ...>``/``<i ...>`` elements, the "repost" / "share picture"
    boilerplate phrases and a handful of punctuation marks.

    :param content: post text as ``str``.
    :return: ``content`` with every match of the noise pattern removed.
    """
    return _NOISE_PATTERN.sub("", content)


def fetch_weibo(max_pages=101):
    """Yield the cleaned text of each post found on the card pages.

    Pages ``1 .. max_pages`` of the cards API are requested one after the
    other using the module-level ``cookies``. The function is a generator, so
    no request is made until iteration starts.

    :param max_pages: number of pages to request (default 101, i.e. pages
        1 to 101 inclusive).
    :return: generator of non-empty ``str`` posts, cleaned with
        :func:`cleanring` and stripped of surrounding whitespace.
    :raises requests.exceptions.RequestException: when a request fails or
        does not answer within the timeout.
    """
    api = "http://m.weibo.cn/index/my?format=cards&page=%s"
    for page in range(1, max_pages + 1):
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url=api % page, cookies=cookies, timeout=10)
        data = response.json()[0]
        for group in data.get("card_group") or []:
            # Some cards may carry no "mblog" entry; skip them instead of
            # failing on None.
            mblog = group.get("mblog") or {}
            text = mblog.get("text")
            if not text:
                continue
            # Stay in `str` end to end: encoding to bytes here would make the
            # str regex below raise TypeError on Python 3.
            text = cleanring(text).strip()
            if text:
                yield text
|
||
|
||
def write_csv(texts, path='./weibo.csv'):
    """Write the posts to a one-column CSV file with a ``text`` header.

    The file is written as UTF-8 regardless of the platform's default
    encoding, and opened with ``newline=''`` so that the csv module controls
    line endings itself (otherwise embedded newlines are mangled and Windows
    gets blank rows between records).

    :param texts: iterable of ``str`` posts; consumed exactly once.
    :param path: destination file, overwritten if it exists.
    """
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=["text"])
        writer.writeheader()
        writer.writerows({"text": text} for text in texts)
|
||
|
||
def read_csv(path='./weibo.csv'):
    """Yield the ``text`` column of every row in the CSV file.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding and opened with ``newline=''`` so that newlines embedded in
    quoted fields are returned unchanged.

    :param path: CSV file with a ``text`` header column.
    :return: generator of ``str`` values, one per data row. The file is
        opened when iteration starts and closed when it finishes.
    :raises KeyError: if the file has no ``text`` column.
    """
    with open(path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            yield row['text']
|
||
|
||
def word_segment(texts):
    """Yield, for each input text, its extracted keywords joined by spaces.

    The stop-word list is loaded from ``./stopwords.txt`` before the first
    text is processed; at most 20 tags are requested per text.

    :param texts: iterable of texts to analyse.
    :return: generator of space-separated keyword strings, one per text.
    """
    analyser = jieba.analyse
    analyser.set_stop_words("./stopwords.txt")
    for sentence in texts:
        keywords = analyser.extract_tags(sentence, topK=20)
        yield " ".join(keywords)
|
||
|
||
def generate_img(texts):
    """Render a word cloud from the texts and save it as ``./heart.jpg``.

    All texts are joined with single spaces into one corpus, which is laid
    out inside the mask read from ``./heart-mask.jpg`` and saved through
    matplotlib at 600 dpi with the axes hidden.

    :param texts: iterable of strings making up the corpus.
    """
    corpus = " ".join(texts)

    # NOTE(review): `scipy.misc.imread` no longer exists in current SciPy
    # releases -- confirm the SciPy version this script is pinned to.
    heart_mask = imread('./heart-mask.jpg', flatten=True)
    cloud = WordCloud(
        font_path='msyh.ttc',
        background_color='white',
        mask=heart_mask,
    )
    rendered = cloud.generate(corpus)

    plt.imshow(rendered)
    plt.axis('off')
    plt.savefig('./heart.jpg', dpi=600)
|
||
|
||
if __name__ == '__main__':
    # Pipeline: crawl the posts, persist them to CSV, then reload the CSV,
    # extract keywords and render the word cloud.
    write_csv(fetch_weibo())
    keywords = word_segment(read_csv())
    generate_img(keywords)
File renamed without changes.
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
File renamed without changes
File renamed without changes.
File renamed without changes.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Binary file not shown.
Oops, something went wrong.