Skip to content

Commit

Permalink
Merge pull request binux#56 from binux/six
Browse files Browse the repository at this point in the history
add python 3 support
  • Loading branch information
binux committed Dec 15, 2014
2 parents dd0fc55 + 7aca567 commit 8741644
Show file tree
Hide file tree
Showing 58 changed files with 641 additions and 676 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ language: python
python:
- "2.6"
- "2.7"
- "3.3"
- "3.4"
services:
- mongodb
- rabbitmq
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ A Powerful Spider(Web Crawler) System in Python. [Try It Now!](http://demo.pyspi
- JavaScript pages supported!
- Task priority, retry, periodic crawling, recrawl by age, and more
- Distributed architecture
- Python 3 supported!


Sample Code:
Expand Down Expand Up @@ -42,7 +43,7 @@ class Handler(BaseHandler):
Installation
============

* python2.6/7 (windows is not supported currently)
* python 2.6, 2.7, 3.3, 3.4
* `pip install --allow-all-external -r requirements.txt`
* `./run.py` , visit [http://localhost:5000/](http://localhost:5000/)

Expand Down
5 changes: 4 additions & 1 deletion pyspider/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
# http://binux.me
# Created on 2014-10-08 15:04:08

import urlparse
try:
from urllib import parse as urlparse
except ImportError:
import urlparse


def connect_database(url):
Expand Down
17 changes: 11 additions & 6 deletions pyspider/database/basedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,13 @@
# http://binux.me
# Created on 2012-08-30 17:43:49

from __future__ import unicode_literals, division, absolute_import

import logging
logger = logging.getLogger('database.basedb')

from six import itervalues


class BaseDB:

Expand All @@ -16,6 +20,7 @@ class BaseDB:
dbcur should be overwritten
'''
__tablename__ = None
placeholder = '%s'

@staticmethod
Expand Down Expand Up @@ -68,44 +73,44 @@ def _select2dic(self, tablename=None, what="*", where="", where_values=[],
def _replace(self, tablename=None, **values):
tablename = self.escape(tablename or self.__tablename__)
if values:
_keys = ", ".join(self.escape(k) for k in values.iterkeys())
_keys = ", ".join(self.escape(k) for k in values)
_values = ", ".join([self.placeholder, ] * len(values))
sql_query = "REPLACE INTO %s (%s) VALUES (%s)" % (tablename, _keys, _values)
else:
sql_query = "REPLACE INTO %s DEFAULT VALUES" % tablename
logger.debug("<sql: %s>", sql_query)

if values:
dbcur = self._execute(sql_query, values.values())
dbcur = self._execute(sql_query, list(itervalues(values)))
else:
dbcur = self._execute(sql_query)
return dbcur.lastrowid

def _insert(self, tablename=None, **values):
tablename = self.escape(tablename or self.__tablename__)
if values:
_keys = ", ".join((self.escape(k) for k in values.iterkeys()))
_keys = ", ".join((self.escape(k) for k in values))
_values = ", ".join([self.placeholder, ] * len(values))
sql_query = "INSERT INTO %s (%s) VALUES (%s)" % (tablename, _keys, _values)
else:
sql_query = "INSERT INTO %s DEFAULT VALUES" % tablename
logger.debug("<sql: %s>", sql_query)

if values:
dbcur = self._execute(sql_query, values.values())
dbcur = self._execute(sql_query, list(itervalues(values)))
else:
dbcur = self._execute(sql_query)
return dbcur.lastrowid

def _update(self, tablename=None, where="1=0", where_values=[], **values):
tablename = self.escape(tablename or self.__tablename__)
_key_values = ", ".join([
"%s = %s" % (self.escape(k), self.placeholder) for k in values.iterkeys()
"%s = %s" % (self.escape(k), self.placeholder) for k in values
])
sql_query = "UPDATE %s SET %s WHERE %s" % (tablename, _key_values, where)
logger.debug("<sql: %s>", sql_query)

return self._execute(sql_query, values.values() + list(where_values))
return self._execute(sql_query, list(itervalues(values)) + list(where_values))

def _delete(self, tablename=None, where="1=0", where_values=[]):
tablename = self.escape(tablename or self.__tablename__)
Expand Down
2 changes: 1 addition & 1 deletion pyspider/database/mongodb/resultdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
import json
import time
from pymongo import MongoClient
from mongodbbase import SplitTableMixin
from pyspider.database.base.resultdb import ResultDB as BaseResultDB
from .mongodbbase import SplitTableMixin


class ResultDB(SplitTableMixin, BaseResultDB):
Expand Down
2 changes: 1 addition & 1 deletion pyspider/database/mongodb/taskdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
import time
from pymongo import MongoClient

from mongodbbase import SplitTableMixin
from pyspider.database.base.taskdb import TaskDB as BaseTaskDB
from .mongodbbase import SplitTableMixin


class TaskDB(SplitTableMixin, BaseTaskDB):
Expand Down
2 changes: 1 addition & 1 deletion pyspider/database/mysql/projectdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB
from pyspider.database.basedb import BaseDB
from mysqlbase import MySQLMixin
from .mysqlbase import MySQLMixin


class ProjectDB(MySQLMixin, BaseProjectDB, BaseDB):
Expand Down
9 changes: 6 additions & 3 deletions pyspider/database/mysql/resultdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
# Created on 2014-10-13 22:02:57

import re
import six
import time
import json
import mysql.connector

from pyspider.libs import utils
from pyspider.database.base.resultdb import ResultDB as BaseResultDB
from pyspider.database.basedb import BaseDB
from mysqlbase import MySQLMixin, SplitTableMixin
from .mysqlbase import MySQLMixin, SplitTableMixin


class ResultDB(MySQLMixin, SplitTableMixin, BaseResultDB, BaseDB):
Expand Down Expand Up @@ -41,9 +43,10 @@ def _create_project(self, project):
) ENGINE=MyISAM CHARSET=utf8''' % self.escape(tablename))

def _parse(self, data):
for key, value in list(six.iteritems(data)):
if isinstance(value, (bytearray, six.binary_type)):
data[key] = utils.text(value)
if 'result' in data:
if isinstance(data['result'], bytearray):
data['result'] = str(data['result'])
data['result'] = json.loads(data['result'])
return data

Expand Down
11 changes: 7 additions & 4 deletions pyspider/database/mysql/taskdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@


import re
import six
import time
import json
import mysql.connector

from pyspider.libs import utils
from pyspider.database.base.taskdb import TaskDB as BaseTaskDB
from pyspider.database.basedb import BaseDB
from mysqlbase import MySQLMixin, SplitTableMixin
from .mysqlbase import MySQLMixin, SplitTableMixin


class TaskDB(MySQLMixin, SplitTableMixin, BaseTaskDB, BaseDB):
Expand Down Expand Up @@ -49,12 +51,13 @@ def _create_project(self, project):
) ENGINE=MyISAM CHARSET=utf8''' % self.escape(tablename))

def _parse(self, data):
for key, value in list(six.iteritems(data)):
if isinstance(value, (bytearray, six.binary_type)):
data[key] = utils.text(value)
for each in ('schedule', 'fetch', 'process', 'track'):
if each in data:
if data[each]:
if isinstance(data[each], bytearray):
data[each] = str(data[each])
data[each] = json.loads(unicode(data[each], 'utf8'))
data[each] = json.loads(data[each])
else:
data[each] = {}
return data
Expand Down
43 changes: 32 additions & 11 deletions pyspider/database/sqlalchemy/projectdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,21 @@
# http://binux.me
# Created on 2014-12-04 23:25:10

import re
import six
import time
import json

from sqlalchemy import (create_engine, MetaData, Table, Column, Index,
Integer, String, Float, Text, sql, func)
from pyspider.libs import utils
from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB
from .sqlalchemybase import result2dict

if six.PY3:
where_type = utils.utf8
else:
where_type = utils.text


class ProjectDB(BaseProjectDB):
__tablename__ = 'projectdb'

Expand All @@ -28,45 +34,60 @@ def __init__(self, url):
Column('burst', Float(11)),
Column('updatetime', Float(16))
)
self.engine = create_engine(url)
self.engine = create_engine(url, convert_unicode=True)
self.table.create(self.engine, checkfirst=True)

@staticmethod
def _parse(data):
for key, value in list(six.iteritems(data)):
if isinstance(value, six.binary_type):
data[key] = utils.text(value)
return data

@staticmethod
def _stringify(data):
if six.PY3:
for key, value in list(six.iteritems(data)):
if isinstance(value, six.string_types):
data[key] = utils.utf8(value)
return data

def insert(self, name, obj={}):
obj = dict(obj)
obj['name'] = name
obj['updatetime'] = time.time()
return self.engine.execute(self.table.insert()
.values(**obj))
.values(**self._stringify(obj)))

def update(self, name, obj={}, **kwargs):
obj = dict(obj)
obj.update(kwargs)
obj['updatetime'] = time.time()
return self.engine.execute(self.table.update()
.where(self.table.c.name == name)
.values(**obj))
.where(self.table.c.name == where_type(name))
.values(**self._stringify(obj)))

def get_all(self, fields=None):
columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
for task in self.engine.execute(self.table.select()
.with_only_columns(columns)):
yield result2dict(columns, task)
yield self._parse(result2dict(columns, task))

def get(self, name, fields=None):
columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
for task in self.engine.execute(self.table.select()
.where(self.table.c.name == name)
.where(self.table.c.name == where_type(name))
.limit(1)
.with_only_columns(columns)):
return result2dict(columns, task)
return self._parse(result2dict(columns, task))

def drop(self, name):
return self.engine.execute(self.table.delete()
.where(self.table.c.name == name))
.where(self.table.c.name == where_type(name)))

def check_update(self, timestamp, fields=None):
columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
for task in self.engine.execute(self.table.select()
.with_only_columns(columns)
.where(self.table.c.updatetime >= timestamp)):
yield result2dict(columns, task)
yield self._parse(result2dict(columns, task))
28 changes: 22 additions & 6 deletions pyspider/database/sqlalchemy/resultdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,21 @@
# Created on 2014-12-04 18:48:15

import re
import six
import time
import json

from sqlalchemy import (create_engine, MetaData, Table, Column,
Integer, String, Float, LargeBinary, sql)
String, Float, LargeBinary)
from pyspider.database.base.resultdb import ResultDB as BaseResultDB
from pyspider.libs import utils
from .sqlalchemybase import SplitTableMixin, result2dict

if six.PY3:
where_type = utils.utf8
else:
where_type = utils.text


class ResultDB(SplitTableMixin, BaseResultDB):
__tablename__ = ''
Expand All @@ -25,7 +32,7 @@ def __init__(self, url):
Column('result', LargeBinary),
Column('updatetime', Float(16))
)
self.engine = create_engine(url)
self.engine = create_engine(url, convert_unicode=True)

self._list_project()

Expand All @@ -36,16 +43,25 @@ def _create_project(self, project):
self.table.name = self._tablename(project)
self.table.create(self.engine)

def _parse(self, data):
@staticmethod
def _parse(data):
for key, value in list(six.iteritems(data)):
if isinstance(value, six.binary_type):
data[key] = utils.text(value)
if 'result' in data:
if isinstance(data['result'], bytearray):
data['result'] = str(data['result'])
data['result'] = json.loads(data['result'])
return data

def _stringify(self, data):
@staticmethod
def _stringify(data):
if 'result' in data:
data['result'] = json.dumps(data['result'])
if six.PY3:
for key, value in list(six.iteritems(data)):
if isinstance(value, six.string_types):
data[key] = utils.utf8(value)
return data

def save(self, project, taskid, url, result):
Expand All @@ -62,7 +78,7 @@ def save(self, project, taskid, url, result):
if self.get(project, taskid, ('taskid', )):
del obj['taskid']
return self.engine.execute(self.table.update()
.where(self.table.c.taskid==taskid)
.where(self.table.c.taskid == where_type(taskid))
.values(**self._stringify(obj)))
else:
return self.engine.execute(self.table.insert()
Expand Down Expand Up @@ -102,6 +118,6 @@ def get(self, project, taskid, fields=None):
columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
for task in self.engine.execute(self.table.select()
.with_only_columns(columns=columns)
.where(self.table.c.taskid == taskid)
.where(self.table.c.taskid == where_type(taskid))
.limit(1)):
return self._parse(result2dict(columns, task))
3 changes: 2 additions & 1 deletion pyspider/database/sqlalchemy/sqlalchemybase.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
# http://binux.me
# Created on 2014-12-04 18:48:47

import six
import time
from sqlalchemy.engine import reflection


def result2dict(columns, task):
r = {}
for c, t in zip(columns, task):
if isinstance(c, basestring):
if isinstance(c, six.string_types):
r[c] = t
else:
r[c.name] = t
Expand Down
Loading

0 comments on commit 8741644

Please sign in to comment.