Merge pull request hcjohn463#35 from RayOct18/download_bug
[fix] get random url and missed files during downloading
hcjohn463 authored Aug 29, 2021
2 parents f81d4db + a8f2e9a commit 15ff162
Showing 3 changed files with 29 additions and 27 deletions.
README.md (6 additions, 0 deletions)

````diff
@@ -6,6 +6,12 @@
 
 Download directly to your computer and watch, no hassle
 
+### virtual env
+```
+python3 -m venv jable
+source jable/bin/activate  # macOS
+```
+
 ### requirements
 `pip install -r requirements.txt`
````
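The activation line applies to macOS and Linux shells; on Windows (not covered by this diff) the equivalent would be `jable\Scripts\activate`.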
args.py (8 additions, 16 deletions)

```diff
@@ -1,8 +1,9 @@
 import argparse
-import requests
 from bs4 import BeautifulSoup
-import cloudscraper
 import random
+from urllib.request import Request, urlopen
+from config import headers
+import re
 
 
 def get_parser():
@@ -16,23 +17,14 @@ def get_parser():
 
 
 def av_recommand():
+    headers = {'User-Agent': 'Mozilla/5.0'}
     url = 'https://jable.tv/'
-    r = requests.get(url)
-    # use cloudscraper here instead of requests; this package helps us
-    new_response = cloudscraper.create_scraper().get(url)
+    request = Request(url, headers=headers)
+    web_content = urlopen(request).read()
-    # get the html with the redirect already bypassed
-    soup = BeautifulSoup(new_response.text, 'html.parser')
-    # print(soup.prettify())
+    soup = BeautifulSoup(web_content, 'html.parser')
     h6_tags = soup.find_all('h6', class_='title')
-    # print(h6_tags)
     av_list = []
-    for tag in h6_tags:
-        # print(tag)
-        # print(tag.text.split(' ')[0][0])
-        if((tag.text.split(' ')[0][0] >= 'a' and tag.text.split(' ')[0][0] <= 'z') or (tag.text.split(' ')[0][0] >= 'A' and tag.text.split(' ')[0][0] <= 'Z')):
-            # print(tag.a.get('href'))
-            av_list.append(tag.a.get('href'))
-    # print(av_list)
+    av_list = re.findall(r'https[^"]+', str(h6_tags))
     return random.choice(av_list)
```
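The patched `av_recommand` drops `cloudscraper` in favor of a plain `urllib` request with a browser User-Agent, then pulls every `https` URL out of the stringified `<h6 class="title">` tags with a regex, instead of filtering titles character by character. A minimal standalone sketch of the same flow (the function name and the empty-list guard are mine, not the project's):

```python
import random
import re
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # as set inside the patched function

def random_video_url(home='https://jable.tv/'):
    request = Request(home, headers=headers)       # plain urllib fetch, no cloudscraper
    web_content = urlopen(request).read()
    soup = BeautifulSoup(web_content, 'html.parser')
    h6_tags = soup.find_all('h6', class_='title')  # each title tag wraps an <a href="https://...">
    # Stringify the matched tags and grab every https URL up to its closing quote.
    av_list = re.findall(r'https[^"]+', str(h6_tags))
    return random.choice(av_list) if av_list else None
```

This is presumably the "get random url" half of the fix: the old loop only kept titles whose first character was an ASCII letter, so a page of non-matching titles left `av_list` empty and made `random.choice` raise an IndexError.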
crawler.py (15 additions, 11 deletions)

```diff
@@ -15,18 +15,19 @@ def scrape(ci, folderPath, downloadList, urls):
         # skip segments that were already downloaded
         print('Current target: {0} already downloaded, skipping... {1} left'.format(
             urls.split('/')[-1], len(downloadList)))
+        downloadList.remove(urls)
     else:
         response = requests.get(urls, headers=headers, timeout=10)
-        content_ts = response.content
-        if ci:
-            content_ts = ci.decrypt(content_ts)  # decrypt
-        with open(saveName, 'ab') as f:
-            f.write(content_ts)
-        # print progress
-        print('\rNow downloading: {0} , {1} left'.format(
-            urls.split('/')[-1], len(downloadList)), end='', flush=True)
-
-    downloadList.remove(urls)
+        if response.status_code == 200:
+            content_ts = response.content
+            if ci:
+                content_ts = ci.decrypt(content_ts)  # decrypt
+            with open(saveName, 'ab') as f:
+                f.write(content_ts)
+            # print progress
+            downloadList.remove(urls)
+            print('\rNow downloading: {0} , {1} left, status code: {2}'.format(
+                urls.split('/')[-1], len(downloadList), response.status_code), end='', flush=True)
 
 
 def prepareCrawl(ci, folderPath, tsList):
```
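The hunk above is the "missed files" half of the fix: a segment now leaves `downloadList` only after a 200 response has been written to disk (or when it was already on disk), so a failed or non-200 request keeps its URL in the list for a later retry. A stripped-down sketch of that bookkeeping, with a hypothetical `fetch` standing in for `requests.get`:

```python
import random

def fetch(url):
    """Hypothetical stand-in for requests.get; fails about a third of the time."""
    class Response:
        status_code = 200 if random.random() > 0.3 else 503
        content = b'\x00' * 188  # fake MPEG-TS bytes
    return Response()

def scrape_once(downloadList, url):
    response = fetch(url)
    if response.status_code == 200:
        # the real code decrypts and appends response.content to the .ts file here
        downloadList.remove(url)   # removed only on success
    # on failure the url stays in downloadList and is retried next round

downloadList = ['seg{0}.ts'.format(i) for i in range(5)]
while downloadList:
    for url in list(downloadList):  # iterate over a snapshot; the list shrinks
        scrape_once(downloadList, url)
print('all segments fetched')
```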
```diff
@@ -45,7 +46,10 @@ def prepareCrawl(ci, folderPath, tsList):
 
 def startCrawl(ci, folderPath, downloadList):
     # create and start 20 threads at the same time
+    round = 0
     while(downloadList != []):
-        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
             executor.map(partial(scrape, ci, folderPath,
                          downloadList), downloadList)
+        round += 1
+        print(f', round {round}')
```
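`startCrawl` turns the success-only removal into a retry loop: each pass maps `scrape` over the remaining segments with 8 worker threads (the comment above still says 20, from the old setting), and the `while` exits only once every URL has been removed. A sketch of the same driver; renaming `round`, which shadows a Python built-in, and snapshotting the list are my adjustments, not part of the patch:

```python
import concurrent.futures
from functools import partial

def start_crawl(ci, folderPath, downloadList, scrape):
    """Round-based driver: keep re-running scrape until no segment is left."""
    round_no = 0
    while downloadList:
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            # Map over a snapshot: workers remove items from downloadList while
            # executor.map is still consuming it, which can otherwise skip elements.
            executor.map(partial(scrape, ci, folderPath, downloadList),
                         list(downloadList))
        round_no += 1
        print(f', round {round_no}')
```

Skipped or failed segments simply surface again in the next round, so the loop terminates once every `.ts` segment has been fetched and written.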
