Merge pull request hcjohn463#35 from RayOct18/download_bug
[fix] get random url and missed files during downloading
hcjohn463 authored Aug 29, 2021
2 parents f81d4db + a8f2e9a commit 15ff162
Showing 3 changed files with 29 additions and 27 deletions.
README.md (6 additions, 0 deletions)

````diff
@@ -6,6 +6,12 @@
 
 Download directly to your computer and watch, no hassle
 
+### virtual env
+```
+python3 -m venv jable
+source jable/bin/activate  # macOS
+```
+
 ### requirements
 `pip install -r requirements.txt`
````
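The activation line applies to macOS and Linux shells; on Windows (not covered by this diff) the equivalent would be `jable\Scripts\activate`.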
args.py (8 additions, 16 deletions)

```diff
@@ -1,8 +1,9 @@
 import argparse
-import requests
 from bs4 import BeautifulSoup
-import cloudscraper
 import random
+from urllib.request import Request, urlopen
+from config import headers
+import re
 
 
 def get_parser():
@@ -16,23 +17,14 @@ def get_parser():
 
 
 def av_recommand():
+    headers = {'User-Agent': 'Mozilla/5.0'}
     url = 'https://jable.tv/'
-    r = requests.get(url)
-    # use cloudscraper here instead of requests; this package helps us
-    new_response = cloudscraper.create_scraper().get(url)
+    request = Request(url, headers=headers)
+    web_content = urlopen(request).read()
-    # get the html with the redirect already bypassed
-    soup = BeautifulSoup(new_response.text, 'html.parser')
-    # print(soup.prettify())
+    soup = BeautifulSoup(web_content, 'html.parser')
     h6_tags = soup.find_all('h6', class_='title')
-    # print(h6_tags)
     av_list = []
-    for tag in h6_tags:
-        # print(tag)
-        # print(tag.text.split(' ')[0][0])
-        if((tag.text.split(' ')[0][0] >= 'a' and tag.text.split(' ')[0][0] <= 'z') or (tag.text.split(' ')[0][0] >= 'A' and tag.text.split(' ')[0][0] <= 'Z')):
-            # print(tag.a.get('href'))
-            av_list.append(tag.a.get('href'))
-    # print(av_list)
+    av_list = re.findall(r'https[^"]+', str(h6_tags))
     return random.choice(av_list)
```
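The patched `av_recommand` drops `cloudscraper` in favor of a plain `urllib` request with a browser User-Agent, then pulls every `https` URL out of the stringified `<h6 class="title">` tags with a regex, instead of filtering titles character by character. A minimal standalone sketch of the same flow (the function name and the empty-list guard are mine, not the project's):

```python
import random
import re
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # as set inside the patched function

def random_video_url(home='https://jable.tv/'):
    request = Request(home, headers=headers)       # plain urllib fetch, no cloudscraper
    web_content = urlopen(request).read()
    soup = BeautifulSoup(web_content, 'html.parser')
    h6_tags = soup.find_all('h6', class_='title')  # each title tag wraps an <a href="https://...">
    # Stringify the matched tags and grab every https URL up to its closing quote.
    av_list = re.findall(r'https[^"]+', str(h6_tags))
    return random.choice(av_list) if av_list else None
```

This is presumably the "get random url" half of the fix: the old loop only kept titles whose first character was an ASCII letter, so a page of non-matching titles left `av_list` empty and made `random.choice` raise an IndexError.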
crawler.py (15 additions, 11 deletions)

```diff
@@ -15,18 +15,19 @@ def scrape(ci, folderPath, downloadList, urls):
         # skip segments that were already downloaded
         print('Current target: {0} already downloaded, skipping... {1} left'.format(
             urls.split('/')[-1], len(downloadList)))
+        downloadList.remove(urls)
     else:
         response = requests.get(urls, headers=headers, timeout=10)
-        content_ts = response.content
-        if ci:
-            content_ts = ci.decrypt(content_ts)  # decrypt
-        with open(saveName, 'ab') as f:
-            f.write(content_ts)
-        # print progress
-        print('\rNow downloading: {0} , {1} left'.format(
-            urls.split('/')[-1], len(downloadList)), end='', flush=True)
-
-    downloadList.remove(urls)
+        if response.status_code == 200:
+            content_ts = response.content
+            if ci:
+                content_ts = ci.decrypt(content_ts)  # decrypt
+            with open(saveName, 'ab') as f:
+                f.write(content_ts)
+            # print progress
+            downloadList.remove(urls)
+            print('\rNow downloading: {0} , {1} left, status code: {2}'.format(
+                urls.split('/')[-1], len(downloadList), response.status_code), end='', flush=True)
 
 
 def prepareCrawl(ci, folderPath, tsList):
```
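The hunk above is the "missed files" half of the fix: a segment now leaves `downloadList` only after a 200 response has been written to disk (or when it was already on disk), so a failed or non-200 request keeps its URL in the list for a later retry. A stripped-down sketch of that bookkeeping, with a hypothetical `fetch` standing in for `requests.get`:

```python
import random

def fetch(url):
    """Hypothetical stand-in for requests.get; fails about a third of the time."""
    class Response:
        status_code = 200 if random.random() > 0.3 else 503
        content = b'\x00' * 188  # fake MPEG-TS bytes
    return Response()

def scrape_once(downloadList, url):
    response = fetch(url)
    if response.status_code == 200:
        # the real code decrypts and appends response.content to the .ts file here
        downloadList.remove(url)   # removed only on success
    # on failure the url stays in downloadList and is retried next round

downloadList = ['seg{0}.ts'.format(i) for i in range(5)]
while downloadList:
    for url in list(downloadList):  # iterate over a snapshot; the list shrinks
        scrape_once(downloadList, url)
print('all segments fetched')
```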
```diff
@@ -45,7 +46,10 @@ def prepareCrawl(ci, folderPath, tsList):
 
 def startCrawl(ci, folderPath, downloadList):
     # create and start 20 threads at the same time
+    round = 0
     while(downloadList != []):
-        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
             executor.map(partial(scrape, ci, folderPath,
                          downloadList), downloadList)
+        round += 1
+        print(f', round {round}')
```
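`startCrawl` turns the success-only removal into a retry loop: each pass maps `scrape` over the remaining segments with 8 worker threads (the comment above still says 20, from the old setting), and the `while` exits only once every URL has been removed. A sketch of the same driver; renaming `round`, which shadows a Python built-in, and snapshotting the list are my adjustments, not part of the patch:

```python
import concurrent.futures
from functools import partial

def start_crawl(ci, folderPath, downloadList, scrape):
    """Round-based driver: keep re-running scrape until no segment is left."""
    round_no = 0
    while downloadList:
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            # Map over a snapshot: workers remove items from downloadList while
            # executor.map is still consuming it, which can otherwise skip elements.
            executor.map(partial(scrape, ci, folderPath, downloadList),
                         list(downloadList))
        round_no += 1
        print(f', round {round_no}')
```

Skipped or failed segments simply surface again in the next round, so the loop terminates once every `.ts` segment has been fetched and written.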
