Skip to content

Commit

Permalink
add weibo word cloud
Browse files Browse the repository at this point in the history
  • Loading branch information
lzjun567 committed Feb 14, 2017
1 parent a142df6 commit e95494b
Show file tree
Hide file tree
Showing 19 changed files with 2,208 additions and 0 deletions.
Binary file added HeatherT-heart-vine-mask.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added agone-Heart.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified crawer-pdf.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added my_twitter_wordcloud_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added my_twitter_wordcloud_10.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added my_twitter_wordcloud_11.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added my_twitter_wordcloud_12.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added my_twitter_wordcloud_13.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added my_twitter_wordcloud_2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added my_twitter_wordcloud_3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added my_twitter_wordcloud_4.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added my_twitter_wordcloud_5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
97 changes: 97 additions & 0 deletions stop_word.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that
什么
还是
就是
还要
可以
没有
看看
怎么
那么
不能
分享
出来
已经
下载
有点
今天
很多
因为
你们
完全
一次
quot
不是
这样
这么
觉得
知道
只有
不过
需要
还有
一个
这个
回复
现在
不错
大家
应该
我刚
不会
如果
时候
开始
正在
为啥
各种
一個
沒有
我們
你們
妳們
他們
她們
是否
Binary file added twitter_mask.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
936 changes: 936 additions & 0 deletions weibo.csv

Large diffs are not rendered by default.

179 changes: 179 additions & 0 deletions weibo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
# -*- coding:utf-8 -*-
import re
import string
import sys
import os
import urllib
import urllib2
from bs4 import BeautifulSoup
import requests
from lxml import etree
import traceback


class weibo:
    """Crawler for one user's profile figures and posts on weibo.cn.

    The file targets Python 2; the syntax used here is also valid on
    Python 3.  All text scraped from the pages is stored as the
    interpreter's native ``str`` type (UTF-8 encoded bytes on Python 2),
    i.e. the same encoding as the string literals of this UTF-8 source
    file, so that labels and scraped text can be concatenated and written
    to one consistently encoded file.

    Typical use::

        wb = weibo(user_id, filter)
        wb.start()      # fills the attributes below
        wb.writeTxt()   # dumps them to weibo/<user_id>.txt
    """

    # NOTE(security): a session cookie is hard-coded here.  Replace the value
    # with your own cookie and avoid committing real credentials.
    cookie = {
        "Cookie": "ALF=1489643259; SCF=Ag8LC_GT2fOCJNynfN8dInmpUfTl6DpksRB4oaFKVODaVF0rGajo_z3eNm_NLXsA9ox83Rd6iv5lwPbUdHolj4E.; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWbbPzPaDijADNgfIppECPn5JpX5o2p5NHD95QpSo.NS05EeoBpWs4DqcjGPNDQIgDfdntt; SUB=_2A251pupBDeRxGedI4lUW8CzOzz2IHXVXaPYJrDV6PUJbkdBeLW3akW0edqnbXfU91bwxKadYl-uuVAx5zg..; SUHB=0IIUL7Mz4i6Jh-; SSOLoginState=1487051281"}

    # Seconds passed to requests as ``timeout`` so a stalled connection
    # cannot hang the crawl forever.
    timeout = 30

    # Matches the first run of digits in a piece of text.
    _number = re.compile(r"\d+")

    def __init__(self, user_id, filter=0):
        """Create an empty crawler for *user_id*.

        user_id -- numeric id of the account to crawl.
        filter  -- 0 (default) crawls all posts, 1 crawls original posts only.
        """
        self.user_id = user_id  # numeric id of the account to crawl
        self.filter = filter  # 0: all posts, 1: original posts only
        self.userName = ''  # nickname of the account
        self.weiboNum = 0  # total number of posts reported by the profile
        self.weiboNum2 = 0  # number of posts actually crawled
        self.following = 0  # number of accounts the user follows
        self.followers = 0  # number of followers
        self.weibos = []  # text of every crawled post
        self.num_zan = []  # like count of each crawled post
        self.num_forwarding = []  # repost count of each crawled post
        self.num_comment = []  # comment count of each crawled post

    def _native(self, text):
        """Return *text* as native ``str`` (UTF-8 bytes for Python 2 unicode)."""
        if isinstance(text, str):
            return text
        return text.encode('utf-8', 'ignore')

    def _fetch(self, url):
        """Download *url* with the configured cookie and return the parsed tree."""
        html = requests.get(url, cookies=self.cookie, timeout=self.timeout).content
        return etree.HTML(html)

    def _first_int(self, text):
        """Return the first run of digits in *text* as an int.

        Raises IndexError when *text* contains no digit.
        """
        return int(self._number.findall(text)[0])

    def _report(self, error):
        """Print *error* followed by the traceback of the active exception."""
        # Two spaces on purpose: identical to the output of the former
        # Python 2 statement ``print "Error: ", e``.
        print("Error:  %s" % (error,))
        traceback.print_exc()

    def _parse_post(self, node):
        """Return ``(text, likes, reposts, comments)`` for one post node.

        Raises IndexError when the node lacks the expected span or links,
        so the caller can skip it without recording partial data.
        """
        text = node.xpath("div/span[@class='ctt']")[0].xpath('string(.)')
        links = node.xpath("div/a/text()")
        # The counters are read from the 4th-, 3rd- and 2nd-to-last links.
        return (self._native(text),
                self._first_int(links[-4]),
                self._first_int(links[-3]),
                self._first_int(links[-2]))

    def getUserName(self):
        """Fetch the user's info page and store the nickname in ``userName``.

        Errors are printed, not raised; ``userName`` then keeps its old value.
        """
        try:
            selector = self._fetch('http://weibo.cn/%d/info' % (self.user_id))
            title = selector.xpath("//title/text()")[0]
            # The last three characters of the page title are not part of
            # the nickname and are dropped.
            self.userName = self._native(title[:-3])
        except Exception as e:
            self._report(e)

    def getUserInfo(self):
        """Fetch the post, following and follower counts of the user.

        Fills ``weiboNum``, ``following`` and ``followers`` in that order.
        Errors are printed, not raised; counters parsed before the failure
        keep their new values.
        """
        try:
            url = 'http://weibo.cn/u/%d?filter=%d&page=1' % (self.user_id, self.filter)
            print(url)
            selector = self._fetch(url)

            # number of posts
            str_wb = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")[0]
            self.weiboNum = self._first_int(str_wb)

            # following (first link) and followers (second link)
            links = selector.xpath("//div[@class='tip2']/a/text()")
            self.following = self._first_int(links[0])
            self.followers = self._first_int(links[1])
        except Exception as e:
            self._report(e)

    def getWeiboInfo(self):
        """Crawl every page of posts and record text plus like/repost/comment counts.

        Appends to ``weibos``, ``num_zan``, ``num_forwarding`` and
        ``num_comment`` and increments ``weiboNum2`` once per recorded post.
        A post is recorded only after all four values were parsed, so the
        lists always stay the same length; a post that cannot be parsed is
        reported and skipped.  Other errors (e.g. network failures) are
        printed and end the crawl.
        """
        try:
            page_url = 'http://weibo.cn/u/%d?filter=%d&page=%d'
            first_page = self._fetch(page_url % (self.user_id, self.filter, 1))
            # The page count is read from the "mp" input; without it there
            # is a single page.
            pager = first_page.xpath('//input[@name="mp"]')
            pageNum = int(pager[0].attrib['value']) if pager else 1
            for page in range(1, pageNum + 1):
                if page == 1:
                    # Already downloaded above; do not fetch it twice.
                    selector2 = first_page
                else:
                    selector2 = self._fetch(page_url % (self.user_id, self.filter, page))
                info = selector2.xpath("//div[@class='c']")
                if len(info) <= 3:
                    continue
                # The last two matches are never treated as posts
                # (presumably page chrome -- confirm against live markup).
                for node in info[:-2]:
                    try:
                        text, num_zan, num_forwarding, num_comment = self._parse_post(node)
                    except (IndexError, ValueError) as e:
                        self._report(e)
                        continue
                    self.weibos.append(text)
                    self.num_zan.append(num_zan)
                    self.num_forwarding.append(num_forwarding)
                    self.num_comment.append(num_comment)
                    self.weiboNum2 = self.weiboNum2 + 1
            if self.filter == 0:
                print('共' + str(self.weiboNum2) + '条微博')
            else:
                print('共' + str(self.weiboNum) + '条微博,其中' + str(self.weiboNum2) + '条为原创微博')
        except Exception as e:
            self._report(e)

    def start(self):
        """Run the whole crawl: nickname, profile counters, then posts."""
        try:
            self.getUserName()
            self.getUserInfo()
            self.getWeiboInfo()
            print('信息抓取完毕')
            print('===========================================================================')
        except Exception as e:
            print("Error:  %s" % (e,))

    def writeTxt(self):
        """Write the crawled data to ``weibo/<user_id>.txt`` under the current directory.

        The ``weibo`` directory is created when missing.  The file is
        written as UTF-8.  Errors are printed, not raised.
        """
        try:
            if self.filter == 1:
                resultHeader = '\n\n原创微博内容:\n'
            else:
                resultHeader = '\n\n微博内容:\n'
            parts = ['用户信息\n用户昵称:' + self.userName + '\n用户id:' + str(self.user_id)
                     + '\n微博数:' + str(self.weiboNum) + '\n关注数:' + str(self.following)
                     + '\n粉丝数:' + str(self.followers) + resultHeader]
            posts = zip(self.weibos, self.num_zan, self.num_forwarding, self.num_comment)
            for i, (text, num_zan, num_forwarding, num_comment) in enumerate(posts, 1):
                parts.append(str(i) + ':' + text + '\n' + '点赞数:' + str(num_zan)
                             + ' 转发数:' + str(num_forwarding)
                             + ' 评论数:' + str(num_comment) + '\n\n')
            result = ''.join(parts)
            if not isinstance(result, bytes):
                # Python 3 text; on Python 2 the native str is already bytes.
                result = result.encode('utf-8')
            directory = 'weibo'
            if not os.path.isdir(directory):
                os.mkdir(directory)
            relative_path = os.path.join(directory, '%s.txt' % self.user_id)
            with open(relative_path, "wb") as f:
                f.write(result)
            # Report the path that was really written, on any platform.
            file_path = os.path.join(os.getcwd(), relative_path)
            print('微博写入文件完毕,保存路径%s' % (file_path))
        except Exception as e:
            self._report(e)


# Usage example: give a user id; everything crawled is stored on the wb instance.
user_id = 1697702241  # any valid user id (other than the crawler account's own id)
filter = 1  # 0 crawls every post (original + reposts); 1 crawls original posts only
wb = weibo(user_id, filter)  # create the crawler instance wb
wb.start()  # crawl the profile and the posts
print('用户名:' + wb.userName)
print('全部微博数:' + str(wb.weiboNum))
print('关注数:' + str(wb.following))
print('粉丝数:' + str(wb.followers))
# start() swallows crawl errors, so the lists may be empty (failed request,
# or a user without posts); only show the latest post when there is one.
# With filter=1 the latest post is the latest original post.
if wb.weibos and wb.num_zan and wb.num_forwarding and wb.num_comment:
    print('最新一条微博为:' + wb.weibos[0])
    print('最新一条微博获得的点赞数:' + str(wb.num_zan[0]))
    print('最新一条微博获得的转发数:' + str(wb.num_forwarding[0]))
    print('最新一条微博获得的评论数:' + str(wb.num_comment[0]))
wb.writeTxt()  # only dumps the data to a file; rewrite writeTxt() to suit your needs
Loading

0 comments on commit e95494b

Please sign in to comment.