Skip to content

Commit

Permalink
refactor crawler
Browse files: browse the repository at this point in the history
  • Loading branch information
lzjun567 committed Feb 20, 2017
1 parent ed39ec3 commit ac103b2
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 3 deletions.
4 changes: 4 additions & 0 deletions heart/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# -*- coding:utf-8 -*-
import jieba

# Segment a sample title with jieba and print the tokens separated by spaces.
# The parenthesised call form behaves the same on Python 2 (a parenthesised
# single expression) and is also valid syntax on Python 3.
print(" ".join(jieba.cut(u":: [AVI/329M][C0930-hitozuma1007] 坂井雪恵 Yukie Sakai (308123)")))
24 changes: 21 additions & 3 deletions pdf/crawler.py → pdf/crawler1.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# coding=utf-8
import logging
import os
import re
import time
import logging

import pdfkit
import requests
from bs4 import BeautifulSoup
Expand All @@ -21,6 +22,22 @@
"""


class Crawler(object):
    """Base class for crawlers; every crawler should inherit from it.

    A crawler is identified by ``name``, supplied either as a class
    attribute on the subclass or as a constructor argument. Subclasses
    must override ``parse``.
    """

    # Crawler identifier; subclasses normally override this at class level.
    name = None

    def __init__(self, name=None, **kwargs):
        """Set the crawler name, insisting that one is available.

        :param name: optional name that overrides the class-level ``name``.
        :param kwargs: accepted but not used by this base class.
        :raises ValueError: if ``name`` is not passed and the class does not
            define a non-empty ``name`` attribute.
        """
        if name is not None:
            self.name = name
        elif not getattr(self, 'name', None):
            # Neither the caller nor the class supplied a usable name.
            raise ValueError("%s must have a name" % type(self).__name__)

    def parse(self):
        """Crawler entry point; must be overridden by subclasses.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError


def parse_url_to_html(url, name):
"""
解析URL,返回HTML内容
Expand Down Expand Up @@ -51,7 +68,8 @@ def func(m):
rtn = m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3)
return rtn
else:
return m.group(1)+m.group(2)+m.group(3)
return m.group(1) + m.group(2) + m.group(3)

html = re.compile(pattern).sub(func, html)
html = html_template.format(content=html)
html = html.encode("utf-8")
Expand Down Expand Up @@ -120,4 +138,4 @@ def main():


if __name__ == '__main__':
    # Run the script's working entry point. The base Crawler cannot be run
    # directly: constructing it without a name raises ValueError, and its
    # parse() only raises NotImplementedError.
    main()
3 changes: 3 additions & 0 deletions pdf/requirement.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
requests==2.12.4
beautifulsoup4==4.5.3
pdfkit==0.6.1

0 comments on commit ac103b2

Please sign in to comment.