Skip to content

Commit

Permalink
Merge branch 'master' of github.com:geekoala/whatsnews
Browse files Browse the repository at this point in the history
  • Loading branch information
datamut committed Sep 23, 2016
2 parents 437e4fa + abcaa13 commit 12e0d56
Show file tree
Hide file tree
Showing 41 changed files with 542 additions and 136 deletions.
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@ do not contain any thing related with this test.

### Sub modules/systems/services
This project consists of several different sub systems as below:
+ index_crawler - to crawl article urls from news websites
+ article_crawler - to crawl article content using urls fetched above
+ index_builder - to build index to mongodb/search-engine for full text search
+ auth_api - API used to grant privilege of using other APIs like search_api for external users
+ auth_service - centralized authorization service, called by auth_api and other APIs
+ search_api - search interface for users
+ search_service - service in charge of full text search on news content in search engine mentioned above
+ query_service - service used to process user's input queries, e.g. extend relevant queries, change queries, etc.
+ rank_service - service used for sorting search result, e.g. sort according user's profile/preferences
+ index-crawler - to crawl article urls from news websites
+ article-crawler - to crawl article content using urls fetched above
+ index-builder - to build index to mongodb/search-engine for full text search
+ auth-api - API used to grant privilege of using other APIs like search_api for external users
+ auth-service - centralized authorization service, called by auth_api and other APIs
+ search-api - search interface for users
+ search-service - service in charge of full text search on news content in search engine mentioned above
+ query-service - service used to process user's input queries, e.g. extend relevant queries, change queries, etc.
+ rank-service - service used for sorting search results, e.g. sorting according to the user's profile/preferences

A flowchart will be provided for better understanding of this project.

Expand Down
26 changes: 7 additions & 19 deletions article-crawler/crawler/kafka_consume_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@
import kafka
import scrapy

KAFKA_TOPIC_ID = 'KAFKA_TOPIC_ID'
KAFKA_GROUP_ID = 'KAFKA_GROUP_ID'
KAFKA_BOOTSTRAP_SERVERS = 'KAFKA_BOOTSTRAP_SERVERS'


class KafkaConsumeSpider(scrapy.Spider):
"""This abstract spider integrate Kafka consumer with scrapy.Spider.
Expand All @@ -33,22 +29,13 @@ def set_kafka(self, settings):
Raise ValueError when topic, group, or bootstrap_servers is not
specified.
"""
topic = settings.get(KAFKA_TOPIC_ID, None)
if not topic:
raise ValueError('{} setting is required'.format(KAFKA_TOPIC_ID))

group = settings.get(KAFKA_GROUP_ID, None)
if not group:
raise ValueError('{} setting is required'.format(KAFKA_GROUP_ID))

servers = settings.get(KAFKA_BOOTSTRAP_SERVERS, None)
if not servers:
raise ValueError(
'{} setting is required'.format(KAFKA_BOOTSTRAP_SERVERS))
bootstrap_servers = servers.split(',')
topic = settings.get('KAFKA_TOPIC_ID')
group = settings.get('KAFKA_GROUP_ID')
servers = settings.get('KAFKA_BOOTSTRAP_SERVERS')
kafka_servers = servers.split(',')

self.consumer = kafka.KafkaConsumer(topic, group_id=group,
bootstrap_servers=bootstrap_servers)
bootstrap_servers=kafka_servers)
self.crawler.signals.connect(self.spider_idle,
scrapy.signals.spider_idle)
self.crawler.signals.connect(self.item_scraped,
Expand All @@ -64,7 +51,8 @@ def process_request(self):
for record in msg_list:
url = record.value
if url:
url = url.decode()[1:-1] # TODO: remove " from kafka?
url = url.decode()[
1:-1] # TODO: remove " from kafka-conf?
requests = self.make_requests_from_url(url)
if requests:
self.crawler.engine.crawl(requests, spider=self)
Expand Down
1 change: 1 addition & 0 deletions article-crawler/crawler/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
from scrapy.exceptions import DropItem


class ContentWriterPipeline(object):
Expand Down
20 changes: 16 additions & 4 deletions article-crawler/crawler/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

import os


BOT_NAME = 'crawler'

SPIDER_MODULES = ['crawler.spiders']
Expand Down Expand Up @@ -90,8 +93,17 @@
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

KAFKA_TOPIC_ID = 'whatsnews_topic_index'
KAFKA_GROUP_ID = 'whatsnews_group_index'
KAFKA_BOOTSTRAP_SERVERS = 'localhost:9092'
KAFKA_TOPIC_ID = os.environ.get('KAFKA_TOPIC_ID', None)
assert KAFKA_TOPIC_ID is not None, \
'Environment variable KAFKA_TOPIC_ID not found'

KAFKA_GROUP_ID = os.environ.get('KAFKA_GROUP_ID', None)
assert KAFKA_GROUP_ID is not None, \
'Environment variable KAFKA_GROUP_ID not found'

KAFKA_BOOTSTRAP_SERVERS = os.environ.get('KAFKA_BOOTSTRAP_SERVERS', None)
assert KAFKA_BOOTSTRAP_SERVERS is not None, \
'Environment variable KAFKA_BOOTSTRAP_SERVERS not found'

ARTICLE_OUT_FILE = '/data/project/whatsnews/article_out/crawl_articles.txt'
ARTICLE_OUT_FILE = os.environ.get('OUT_FILE', None)
assert ARTICLE_OUT_FILE is not None, 'Environment variable OUT_FILE not found'
10 changes: 10 additions & 0 deletions auth-api/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
## Authorization API

----

### Run test
```shell
AUTH_SERVICE_URL=http://example.com:8000 nosetests
```
where AUTH_SERVICE_URL is only a URL for the mock auth service, so it can be an
arbitrary URL.
1 change: 1 addition & 0 deletions auth-api/authapi/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from authapi.service import application
11 changes: 4 additions & 7 deletions auth-api/authapi/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,12 @@ class AuthService(object):
"""auth_service provider.
"""

def __init__(self, server):
def __init__(self, application):
# TODO: service discovery rather than designate server
self.prefix = server
self.application = application

def get_token(self, client_id, secret):
url = '{}/token/{}/{}'.format(self.prefix, client_id, secret)
url = '{}/token/{}/{}'.format(
self.application.config['AUTH_SERVICE_URL'], client_id, secret)
response = requests.get(url)
return response.json()


auth_service = AuthService(
'http://authsvr.kc7ctmpd2z.us-west-2.elasticbeanstalk.com')
21 changes: 15 additions & 6 deletions auth-api/authapi/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,23 @@
a token generated by this auth_api.
"""

from flask import Flask, Response
import json
from flask import Flask, jsonify
import os

from authapi.provider import auth_service
from authapi.provider import AuthService

application = Flask(__name__)

auth_service_url = os.environ.get('AUTH_SERVICE_URL', None)
assert auth_service_url is not None, \
'Environment variable AUTH_SERVICE_URL not found'

application.config.update(dict(
AUTH_SERVICE_URL=auth_service_url
))

auth_service = AuthService(application)


@application.route("/token/<client_id>/<secret>")
def get_token(client_id, secret):
Expand All @@ -31,6 +41,5 @@ def get_token(client_id, secret):
return a json.
"""

resp = auth_service.get_token(client_id, secret)
result = json.dumps(resp)
return Response(result, mimetype='application/json')
result = auth_service.get_token(client_id, secret)
return jsonify(result)
64 changes: 64 additions & 0 deletions auth-api/authapi/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
Author: Wenhua Yang
Date: 9/23/16
"""

from flask import Flask, jsonify
from flask_loopback.flask_loopback import FlaskLoopback
from urllib.parse import urlparse

from authapi import application

# URL the application under test is configured to use for the auth service.
auth_service_url = application.config['AUTH_SERVICE_URL']

# Separate Flask app that hosts the mock auth-service routes defined below.
auth_app = Flask(__name__)


@auth_app.route("/token/<client_id>/<secret>")
def get_token(client_id, secret):
    """Mock token endpoint served by ``auth_app``.

    Returns a JSON token payload for the single hard-coded credential pair
    and a JSON error payload (code 3001) for anything else.
    """
    credentials_ok = (client_id, secret) == ('ID123456', '123456')
    if not credentials_ok:
        return jsonify(
            {'error_code': 3001, 'error_msg': 'invalid client_id or secret'})
    return jsonify({'token': 'TK123456', 'expires_in': 86400})


# TODO: try to reuse this MockServer, it is duplicated with code in search_api
class MockServer(object):
    """Pair a Flask app with the host/port it should be mocked on.

    The app is wrapped in a ``FlaskLoopback`` when the instance is created.
    """

    def __init__(self, app, host, port):
        self.app, self.host, self.port = app, host, port
        self.mock_server = FlaskLoopback(app)

    def get_server(self):
        """Return the loopback activated on ``(host, port)``.

        Callers in this package use the returned object as a context
        manager (``with mock.get_server(): ...``).
        """
        address = (self.host, self.port)
        return self.mock_server.on(address)


# TODO: try to reuse this code, it is duplicated with code in search_api
def parse_host_port(url):
    """Return the ``(host, port)`` pair addressed by *url*.

    The host and port are taken from ``urlparse``'s ``hostname`` and
    ``port`` attributes, so URLs carrying user info
    (``http://user:pw@host:8000``) or a bracketed IPv6 literal
    (``http://[::1]:8000``) are handled instead of failing on a naive
    ``split(':')``. Note that ``hostname`` is lower-cased by ``urlparse``.

    When the URL has no explicit port, the scheme's default is used:
    443 for ``https`` and 80 otherwise.

    Raises ValueError if the port component is not a valid port number.
    """
    parsed = urlparse(url)
    # hostname is None when the URL has no network location; keep returning
    # an empty host in that case rather than introducing a new failure mode.
    host = parsed.hostname or ''
    port = parsed.port
    if port is None:
        port = 443 if parsed.scheme == 'https' else 80
    return host, port


# TODO: try to reuse this code, it is duplicated with code in search_api
# NOTE(review): this decorator was reported not to work under nosetests,
# presumably because the undecorated wrapper hid the test's name from
# name-based collection; functools.wraps below should address that -- confirm.
def use_mock(mock_server):
    """Build a decorator that runs a callable with *mock_server* active.

    *mock_server* must expose ``get_server()`` returning a context manager.
    The decorated callable is executed inside that context, its return
    value is passed through, and its metadata (``__name__``, ``__doc__``,
    ...) is preserved on the wrapper.
    """
    import functools

    def mock_decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with mock_server.get_server():
                return func(*args, **kwargs)

        return wrapper

    return mock_decorator


# Mock auth service bound to the host/port parsed from the configured URL;
# test modules import ``auth_mock`` and enter ``auth_mock.get_server()``.
auth_host, auth_port = parse_host_port(auth_service_url)
auth_mock = MockServer(auth_app, auth_host, auth_port)

36 changes: 36 additions & 0 deletions auth-api/authapi/tests/test_authapi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Author: Wenhua Yang
Date: 9/23/16
Test cases for auth API.
"""

import json
import unittest

from authapi import application
from authapi.tests import auth_mock


class TestAuthAPI(unittest.TestCase):
    """Exercise the auth API's token route against the mock auth service."""

    @classmethod
    def setUpClass(cls):
        application.testing = True
        cls.app_client = application.test_client()

    def _fetch_json(self, path):
        """GET *path* through the test client and decode the JSON body."""
        reply = self.app_client.get(path)
        return json.loads(reply.get_data(as_text=True))

    def test_get_token(self):
        # TODO: the use_mock decorator does not work under nosetests, so the
        # mock server context is entered explicitly here.
        with auth_mock.get_server():
            granted = self._fetch_json('/token/ID123456/123456')
            self.assertEqual(granted.get('token'), 'TK123456')

            # A wrong secret and a wrong client id must both be rejected.
            for path in ('/token/ID123456/xxx', '/token/xxx/123456'):
                rejected = self._fetch_json(path)
                self.assertEqual(rejected.get('error_code'), 3001)

8 changes: 8 additions & 0 deletions auth-service/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
## Authorization Service

----

### Run test
```shell
nosetests
```
2 changes: 1 addition & 1 deletion auth-service/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Date: 09/18/2016
"""

from authsvr.service import application
from authsvr import application

if __name__ == '__main__':
application.run()
1 change: 1 addition & 0 deletions auth-service/authsvr/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from authsvr.service import application
18 changes: 8 additions & 10 deletions auth-service/authsvr/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
function of this system.
"""

from flask import Flask, Response
import json
from flask import Flask, jsonify

application = Flask(__name__)

Expand All @@ -22,18 +21,17 @@ def get_token(client_id, secret):
# ip = request.remote_addr

if client_id == TEST_CLIENT_ID and secret == TEST_SECRET:
ret = json.dumps({'token': TEST_TOKEN, 'expires_in': 86400})
ret = {'token': TEST_TOKEN, 'expires_in': 86400}
else:
ret = json.dumps(
{'error_code': 3001, 'error_msg': 'invalid client_id or secret'})
return Response(ret, mimetype='application/json')
ret = {'error_code': 3001, 'error_msg': 'invalid client_id or secret'}
return jsonify(ret)


@application.route("/verify/<client_id>/<token>")
def verify_token(client_id, token):
if client_id == TEST_CLIENT_ID and token == TEST_TOKEN:
ret = json.dumps({'valid': True})
ret = {'valid': True}
else:
ret = json.dumps(
{'error_code': 3002, 'error_msg': 'client_id/token verify failed'})
return Response(ret, mimetype='application/json')
ret = {'error_code': 3002,
'error_msg': 'client_id/token verify failed'}
return jsonify(ret)
Empty file.
45 changes: 45 additions & 0 deletions auth-service/authsvr/tests/test_authsvr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
Author: Wenhua Yang
Date: 9/23/16
Test cases for auth service.
"""

import json
import unittest

from authsvr import application


class TestAuthService(unittest.TestCase):
    """Exercise the auth service's ``/token`` and ``/verify`` routes."""

    @classmethod
    def setUpClass(cls):
        application.testing = True
        cls.app_client = application.test_client()

    def test_fetch_token(self):
        """Valid credentials yield the token; invalid ones yield 3001."""
        resp = self.app_client.get('/token/ID123456/123456')
        result = json.loads(resp.get_data(as_text=True))
        self.assertEqual(result.get('token'), 'TK123456')

        resp = self.app_client.get('/token/ID-Invalid/123456')
        result = json.loads(resp.get_data(as_text=True))
        self.assertEqual(result.get('error_code'), 3001)

        resp = self.app_client.get('/token/ID123456/xxx')
        result = json.loads(resp.get_data(as_text=True))
        self.assertEqual(result.get('error_code'), 3001)

    def test_verify_token(self):
        """A matching id/token pair is valid; mismatches yield 3002."""
        resp = self.app_client.get('/verify/ID123456/TK123456')
        result = json.loads(resp.get_data(as_text=True))
        self.assertEqual(result.get('valid'), True)

        # assertEqual, not assertTrue: assertTrue(x, 3002) would treat 3002
        # as the failure message and pass for any truthy error code.
        resp = self.app_client.get('/verify/ID123456/xxx')
        result = json.loads(resp.get_data(as_text=True))
        self.assertEqual(result.get('error_code'), 3002)

        resp = self.app_client.get('/verify/xxx/TK123456')
        result = json.loads(resp.get_data(as_text=True))
        self.assertEqual(result.get('error_code'), 3002)
Loading

0 comments on commit 12e0d56

Please sign in to comment.