Skip to content
This repository has been archived by the owner on Jul 29, 2023. It is now read-only.

Commit

Permalink
升级接口及签名
Browse files Browse the repository at this point in the history
  • Loading branch information
never615 committed Jan 30, 2020
1 parent 2888550 commit 98b50e0
Show file tree
Hide file tree
Showing 28 changed files with 784 additions and 522 deletions.
Empty file modified .gitignore
100644 → 100755
Empty file.
Empty file modified .images/dashboard.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified DuTracker/__init__.py
100644 → 100755
Empty file.
Empty file modified DuTracker/db.py
100644 → 100755
Empty file.
Empty file modified DuTracker/items.py
100644 → 100755
Empty file.
Empty file modified DuTracker/middlewares.py
100644 → 100755
Empty file.
Empty file modified DuTracker/pipelines.py
100644 → 100755
Empty file.
Empty file modified DuTracker/settings.py
100644 → 100755
Empty file.
Empty file modified DuTracker/sign/__init__.py
100644 → 100755
Empty file.
Empty file modified DuTracker/sign/sign.js
100644 → 100755
Empty file.
Empty file modified DuTracker/sign/sign.py
100644 → 100755
Empty file.
Empty file modified DuTracker/spiders/__init__.py
100644 → 100755
Empty file.
135 changes: 68 additions & 67 deletions DuTracker/spiders/brand.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -10,82 +10,83 @@
from DuTracker.utils.log import log, handle_parse_exception
from DuTracker.items import ProductInfo
from DuTracker.utils.urls import get_brand_page_url as page_url
from DuTracker.utils.urls import get_headers as headers


class BrandSpider(scrapy.Spider):
    """Discover brands from the poizon.com category API, then enumerate
    every product id belonging to the brands the user selects.

    Flow: ``start_requests`` -> ``parse_brandList`` (collect brand ids,
    prompt for a selection unless ``auto``) -> ``parse_brandInfo`` (read the
    product total, fan out one request per 20-item page) ->
    ``parse_productId`` (yield one ``ProductInfo`` per product).
    """

    name = 'brand'
    allowed_domains = ['app.poizon.com']
    start_urls = [
        'https://app.poizon.com/api/v1/h5/product/fire/search/getCategoryDetail?catId=0&sign=4ff93b98af1253fe192ff1328ed09081'
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'DuTracker.pipelines.SaveProductId': 300,
        }

    }
    # NOTE(review): class-level mutable containers are shared across all
    # instances of this spider; fine for a one-shot CLI crawl, but worth
    # confirming if spiders are ever instantiated more than once.
    brandIds = {}   # brand unionId (int) -> brand display name
    Ids = []        # preselected brand ids, consumed when auto is True
    auto = False    # True skips the interactive prompt and uses self.Ids

    def start_requests(self):
        # Kick off one request per start URL; signing headers come from
        # the shared get_headers() helper.
        log.info('获取品牌列表')
        for url in self.start_urls:
            yield Request(url, dont_filter=True, callback=self.parse_brandList, meta={
                'dont_retry': True
            }, headers=headers())

    @handle_parse_exception
    def parse_brandList(self, response):
        """Record every brand id/name pair, then request the first page of
        each brand the user (or ``self.Ids``) selected."""
        brandList = json.loads(response.body_as_unicode())['data']['list']
        for brand in brandList:
            unionId = brand['brand']['goodsBrandId']
            name = brand['brand']['brandName']
            self.brandIds[unionId] = name
            log.success(f'品牌:{name} 编号:{unionId}')

        if not self.auto:
            ids = prompt('输入需要爬取的品牌编号', default='').strip().split(' ')
            # Empty input means "crawl nothing": just end the generator.
            # (The old code did `return IgnoreRequest()`; inside a generator
            # that value is attached to StopIteration and silently discarded,
            # so a bare return is equivalent and clearer.)
            if ids == ['']:
                return
        else:
            ids = self.Ids
            if not ids:
                return

        for unionId in ids:
            log.info(f'unionId: {unionId}')
            # Prompt input arrives as strings, but brandIds is keyed by the
            # integer ids taken from the JSON payload.
            unionId = int(unionId)
            yield Request(page_url(unionId), callback=self.parse_brandInfo, meta={
                'unionId': unionId,
                'name': self.brandIds[unionId]
            }, headers=headers())

    @handle_parse_exception
    def parse_brandInfo(self, response):
        """Read the brand's product total and fan out one paged request per
        20-item page to ``parse_productId``."""
        data = json.loads(response.body_as_unicode())['data']
        unionId = response.meta.get('unionId')
        name = response.meta.get('name')

        num = data['total']
        page = math.ceil(num / 20)  # API pages hold 20 products each
        log.success(f'品牌:{name} 编号:{unionId} 商品总数:{num} 页面数:{page}')

        # Loop variable renamed so it no longer shadows the page *count*.
        for page_no in range(1, page + 1):
            yield Request(page_url(unionId, page_no), callback=self.parse_productId, meta={
                'unionId': unionId,
                'name': self.brandIds[unionId]
            }, headers=headers())

    @handle_parse_exception
    def parse_productId(self, response):
        """Yield one ProductInfo item per product on this result page."""
        productList = json.loads(response.body_as_unicode())['data']['productList']
        name = response.meta.get('name')  # same for every product on the page
        for product in productList:
            yield ProductInfo(
                id=product['productId'],
                title=product['title'],
                name=name,
            )
134 changes: 81 additions & 53 deletions DuTracker/spiders/product.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,71 +3,99 @@
from scrapy.http import Request
import json
import math
import execjs

from DuTracker.items import ProductItem
from DuTracker.sign.sign import sign
from DuTracker.utils.log import log, handle_parse_exception
from DuTracker.db import *
from DuTracker.utils.urls import get_headers as headers


def get_product_info_url(productId):
    """Build the signed product-detail API URL for *productId*.

    The request signature is produced by the ``getSign`` function inside
    ``DuTracker/sign/sign.js``: the signed string is the query parameters
    concatenated with an app-secret suffix, matching what the poizon.com
    endpoint expects.

    Returns the fully-qualified ``app.poizon.com`` detail URL as a str.
    """
    # Compile sign.js once and memoize the JS context on the function
    # object: reading and recompiling the script for every product URL
    # is pure per-call overhead.
    ctx = getattr(get_product_info_url, '_js_ctx', None)
    if ctx is None:
        with open('DuTracker/sign/sign.js', 'r', encoding='utf-8') as f:
            ctx = execjs.compile(f.read())
        get_product_info_url._js_ctx = ctx
    # Named `signature` (not `sign`) so it does not shadow the `sign`
    # module imported at the top of this file.
    signature = ctx.call(
        'getSign',
        'productId{}productSourceNamewx19bc545a393a25177083d4a748807cc0'.format(productId))

    # Example:
    # https://app.poizon.com/api/v1/h5/index/fire/flow/product/detail?productId=26850&productSourceName=wx&sign=0e145c5543d9751497a2e700bbea1e4c
    return ('https://app.poizon.com/api/v1/h5/index/fire/flow/product/detail?'
            'productId={}&productSourceName=wx&sign={}'.format(productId, signature))

class ProductSpider(scrapy.Spider):
    """Fetch the signed product-detail endpoint for a list of product ids
    and yield one ``ProductItem`` per product.

    Ids come either from ``self.productIds`` (set by the caller) or, when
    ``fromDB`` is true, from every ``Product`` row already in the database.
    """

    name = 'product'
    # allowed_domains intentionally unset: the detail API lives on
    # app.poizon.com, and the old m.poizon.com restriction would filter
    # every request out.
    custom_settings = {
        'ITEM_PIPELINES': {
            'DuTracker.pipelines.SaveProductItem': 300,
        }

    }
    # NOTE(review): class-level mutable list is shared across instances;
    # acceptable for a one-shot crawl, but confirm if reused.
    productIds = []
    fromDB = False

    @db_session
    def start_requests(self):
        log.info('获取商品详情')
        if self.fromDB:
            # Seed the id list from every Product already stored in the DB.
            # (Was a side-effecting list comprehension; extend() says what
            # it means and builds no throwaway list.)
            self.productIds.extend(p.id for p in Product.select())
        for pid in self.productIds:
            log.info(f'获取商品详情request {pid}')
            url = get_product_info_url(pid)
            log.info(f'商品详情request url:{url}')
            log.info("headers ---> {0}".format(headers()))
            yield Request(url, headers=headers())

    @handle_parse_exception
    def parse(self, response):
        """Flatten the detail payload into a single ProductItem."""
        data = json.loads(response.body_as_unicode())['data']
        detail = data['detail']

        yield ProductItem(
            id=detail['productId'],
            url=response.url,
            title=detail['title'],
            soldNum=detail['soldNum'],
            logo=detail['logoUrl'],
            categoryId=detail['categoryId'],
            images=[image['url'] for image in detail['images']],
            sellDate=detail['sellDate'],
            articleNumber=detail['articleNumber'],
            authPrice=detail['authPrice'],
            goodsId=detail['goodsId'],
            sizeList=detail['sizeList'],
            imageAndText=data['imageAndText'],
            # Keep the raw detail dict too, for fields not lifted above.
            detailJson=detail,
        )
Loading

0 comments on commit 98b50e0

Please sign in to comment.