Skip to content

Commit

Permalink
新增图片下载格式设置
Browse files Browse the repository at this point in the history
  • Loading branch information
JoeanAmier committed Jan 6, 2024
1 parent 8ed4e81 commit 48cfa60
Show file tree
Hide file tree
Showing 18 changed files with 510 additions and 413 deletions.
39 changes: 17 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,20 @@
<img alt="GitHub code size in bytes" src="https://img.shields.io/github/languages/code-size/JoeanAmier/XHS-Downloader?style=for-the-badge&color=73d13d">
<img alt="GitHub release (with filter)" src="https://img.shields.io/github/v/release/JoeanAmier/XHS-Downloader?style=for-the-badge&color=40a9ff">
<img alt="GitHub all releases" src="https://img.shields.io/github/downloads/JoeanAmier/XHS-Downloader/total?style=for-the-badge&color=f759ab">
<br>
<p>🔥 <b>小红书作品采集工具</b>:采集小红书作品信息;提取小红书作品下载地址;下载小红书无水印作品文件!</p>
<p>❤️ 作者仅在 GitHub 发布 XHS-Downloader,未与任何个人或网站合作,且没有任何收费计划!</p>
</div>
<h1>📑 功能清单</h1>
<h1>📑 项目功能</h1>
<ul>
<li>✅ 采集小红书图文 / 视频作品信息</li>
<li>✅ 提取小红书图文 / 视频作品下载地址</li>
<li>✅ 下载小红书无水印图文 / 视频作品文件</li>
<li>✅ 支持 Tampermonkey 用户脚本</li>
<li>✅ 批量下载账号作品(搭配用户脚本)</li>
<li>✅ 自动跳过已下载的作品文件</li>
<li>✅ 作品文件完整性处理机制</li>
<li>✅ 自定义图文作品文件下载格式</li>
<li>✅ 持久化储存作品信息至文件</li>
<li>✅ 作品文件储存至单独文件夹</li>
<li>☑️ 后台监听剪贴板下载作品</li>
Expand Down Expand Up @@ -54,13 +57,12 @@
<h1>💻 二次开发</h1>
<p>如果有其他需求,可以根据 <code>main.py</code> 的注释提示进行代码调用或修改!</p>
<pre>
# 测试链接
error_demo = "https://github.com/JoeanAmier/XHS_Downloader"
image_demo = "https://www.xiaohongshu.com/explore/63b275a30000000019020185"
video_demo = "https://www.xiaohongshu.com/explore/64edb460000000001f03cadc"
multiple_demo = f"{image_demo} {video_demo}"
# 示例链接
error_link = "https://github.com/JoeanAmier/XHS_Downloader"
demo_link = "https://www.xiaohongshu.com/explore/xxxxxxxxxx"
multiple_links = f"{demo_link} {demo_link} {demo_link}"
# 实例对象
path = "" # 作品数据/文件保存根路径,默认值:项目根路径
work_path = "D:\\" # 作品数据/文件保存根路径,默认值:项目根路径
folder_name = "Download" # 作品文件储存文件夹名称(自动创建),默认值:Download
user_agent = "" # 请求头 User-Agent
cookie = "" # 小红书网页版 Cookie,无需登录
Expand All @@ -69,11 +71,11 @@ timeout = 5 # 请求数据超时限制,单位:秒,默认值:10
chunk = 1024 * 1024 * 10 # 下载文件时,每次从服务器获取的数据块大小,单位:字节
max_retry = 2 # 请求数据失败时,重试的最大次数,单位:次,默认值:5
record_data = False # 是否记录作品数据至文件
image_format = "jpg" # 图文作品文件名称后缀
image_format = "WEBP" # 图文作品文件下载格式,支持:PNG、WEBP
folder_mode = False # 是否将每个作品的文件储存至单独的文件夹
async with XHS() as xhs:
pass # 使用默认参数
async with XHS(path=path,
async with XHS(work_path=work_path,
folder_name=folder_name,
user_agent=user_agent,
cookie=cookie,
Expand All @@ -87,10 +89,9 @@ async with XHS(path=path,
) as xhs: # 使用自定义参数
download = True # 是否下载作品文件,默认值:False
# 返回作品详细信息,包括下载地址
print(await xhs.extract(error_demo, download)) # 获取数据失败时返回空字典
print(await xhs.extract(image_demo, download))
print(await xhs.extract(video_demo, download))
print(await xhs.extract(multiple_demo, download)) # 支持传入多个作品链接
print(await xhs.extract(error_link, download)) # 获取数据失败时返回空字典
print(await xhs.extract(demo_link, download))
print(await xhs.extract(multiple_links, download)) # 支持传入多个作品链接
</pre>
<h1>⚙️ 配置文件</h1>
<p>项目根目录下的 <code>settings.json</code> 文件,首次运行自动生成,可以自定义部分运行参数。</p>
Expand All @@ -106,7 +107,7 @@ async with XHS(path=path,
</thead>
<tbody>
<tr>
<td align="center">path</td>
<td align="center">work_path</td>
<td align="center">str</td>
<td align="center">作品数据 / 文件保存根路径</td>
<td align="center">项目根路径</td>
Expand Down Expand Up @@ -162,14 +163,8 @@ async with XHS(path=path,
<tr>
<td align="center">image_format</td>
<td align="center">str</td>
<td align="center">图文作品文件名称后缀,不影响实际文件格式,仅在无法判断文件类型时生效</td>
<td align="center">webp</td>
</tr>
<tr>
<td align="center">video_format</td>
<td align="center">str</td>
<td align="center">视频作品文件名称后缀,不影响实际文件格式,仅在无法判断文件类型时生效</td>
<td align="center">mp4</td>
<td align="center">图文作品文件下载格式,支持:<code>PNG</code>、<code>WEBP</code></td>
<td align="center">PNG</td>
</tr>
<tr>
<td align="center">folder_mode</td>
Expand Down
22 changes: 10 additions & 12 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@

async def example():
"""通过代码设置参数,适合二次开发"""
# 测试链接
error_demo = "https://github.com/JoeanAmier/XHS_Downloader"
image_demo = "https://www.xiaohongshu.com/explore/63b275a30000000019020185"
video_demo = "https://www.xiaohongshu.com/explore/64edb460000000001f03cadc"
multiple_demo = f"{image_demo} {video_demo}"
# 示例链接
error_link = "https://github.com/JoeanAmier/XHS_Downloader"
demo_link = "https://www.xiaohongshu.com/explore/xxxxxxxxxx"
multiple_links = f"{demo_link} {demo_link} {demo_link}"
# 实例对象
path = "" # 作品数据/文件保存根路径,默认值:项目根路径
work_path = "D:\\" # 作品数据/文件保存根路径,默认值:项目根路径
folder_name = "Download" # 作品文件储存文件夹名称(自动创建),默认值:Download
user_agent = "" # 请求头 User-Agent
cookie = "" # 小红书网页版 Cookie,无需登录
Expand All @@ -21,11 +20,11 @@ async def example():
chunk = 1024 * 1024 * 10 # 下载文件时,每次从服务器获取的数据块大小,单位:字节
max_retry = 2 # 请求数据失败时,重试的最大次数,单位:次,默认值:5
record_data = False # 是否记录作品数据至文件
image_format = "jpg" # 图文作品文件名称后缀
image_format = "WEBP" # 图文作品文件下载格式,支持:PNG、WEBP
folder_mode = False # 是否将每个作品的文件储存至单独的文件夹
async with XHS() as xhs:
pass # 使用默认参数
async with XHS(path=path,
async with XHS(work_path=work_path,
folder_name=folder_name,
user_agent=user_agent,
cookie=cookie,
Expand All @@ -39,10 +38,9 @@ async def example():
) as xhs: # 使用自定义参数
download = True # 是否下载作品文件,默认值:False
# 返回作品详细信息,包括下载地址
print(await xhs.extract(error_demo, download)) # 获取数据失败时返回空字典
print(await xhs.extract(image_demo, download))
print(await xhs.extract(video_demo, download))
print(await xhs.extract(multiple_demo, download)) # 支持传入多个作品链接
print(await xhs.extract(error_link, download)) # 获取数据失败时返回空字典
print(await xhs.extract(demo_link, download))
print(await xhs.extract(multiple_links, download)) # 支持传入多个作品链接


async def main():
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
aiohttp>=3.9.0
textual>=0.40.0
pyperclip>=1.8.2
lxml>=4.9.3
PyYAML>=6.0.1
76 changes: 40 additions & 36 deletions source/App.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from re import compile

from .Converter import Converter
from .Converter import Namespace
from .Downloader import Download
from .Explore import Explore
from .Html import Html
Expand All @@ -10,18 +12,17 @@
ERROR,
WARNING,
)
from .Tools import logging
from .Video import Video

__all__ = ["XHS"]


class XHS:
LINK = compile(r"https?://www\.xiaohongshu\.com/explore/[a-z0-9]+")
SHARE = compile(r"https?://www\.xiaohongshu\.com/discovery/item/[a-z0-9]+")
SHORT = compile(r"https?://xhslink\.com/[A-Za-z0-9]+")
__INSTANCE = None
TYPE = {
"视频": "v",
"图文": "n",
}

def __new__(cls, *args, **kwargs):
if not cls.__INSTANCE:
Expand All @@ -30,7 +31,7 @@ def __new__(cls, *args, **kwargs):

def __init__(
self,
path="",
work_path="",
folder_name="Download",
user_agent: str = None,
cookie: str = None,
Expand All @@ -39,13 +40,12 @@ def __init__(
chunk=1024 * 1024,
max_retry=5,
record_data=False,
image_format="webp",
video_format="mp4",
image_format="PNG",
folder_mode=False,
):
self.manager = Manager(
ROOT,
path,
work_path,
folder_name,
user_agent,
chunk,
Expand All @@ -55,75 +55,81 @@ def __init__(
max_retry,
record_data,
image_format,
video_format,
folder_mode,
)
self.html = Html(self.manager)
self.image = Image()
self.video = Video()
self.explore = Explore()
self.download = Download(self.manager, )
self.rich_log = self.download.rich_log
self.convert = Converter()
self.download = Download(self.manager)

def __extract_image(self, container: dict, html: str):
container["下载地址"] = self.image.get_image_link(html)
def __extract_image(self, container: dict, data: Namespace):
container["下载地址"] = self.image.get_image_link(
data, self.manager.image_format)

def __extract_video(self, container: dict, html: str):
container["下载地址"] = self.video.get_video_link(html)
def __extract_video(self, container: dict, data: Namespace):
container["下载地址"] = self.video.get_video_link(data)

async def __download_files(self, container: dict, download: bool, log, bar):
name = self.__naming_rules(container)
path = self.manager.folder
if (u := container["下载地址"]) and download:
await self.download.run(u, name, self.TYPE[container["作品类型"]], log, bar)
path = await self.download.run(u, name, container["作品类型"], log, bar)
elif not u:
self.rich_log(log, "提取作品文件下载地址失败!", ERROR)
self.manager.save_data(name, container)
logging(log, "提取作品文件下载地址失败!", ERROR)
self.manager.save_data(path, name, container)

async def extract(self, url: str, download=False, log=None, bar=None) -> list[dict]:
# return # 调试代码
urls = await self.__extract_links(url)
urls = await self.__extract_links(url, log)
if not urls:
self.rich_log(log, "提取小红书作品链接失败!", WARNING)
logging(log, "提取小红书作品链接失败!", WARNING)
else:
self.rich_log(log, f"共 {len(urls)} 个小红书作品待处理...")
logging(log, f"共 {len(urls)} 个小红书作品待处理...")
# return urls # 调试代码
return [await self.__deal_extract(i, download, log, bar) for i in urls]

async def __extract_links(self, url: str) -> list:
async def __extract_links(self, url: str, log) -> list:
urls = []
for i in url.split():
if u := self.SHORT.search(i):
i = await self.html.request_url(
u.group(), False)
u.group(), False, log)
if u := self.SHARE.search(i):
urls.append(u.group())
elif u := self.LINK.search(i):
urls.append(u.group())
return urls

async def __deal_extract(self, url: str, download: bool, log, bar):
self.rich_log(log, f"开始处理作品:{url}")
html = await self.html.request_url(url)
# self.rich_log(log, html) # 调试代码
logging(log, f"开始处理作品:{url}")
html = await self.html.request_url(url, log=log)
# logging(log, html) # 调试代码
if not html:
self.rich_log(log, f"{url} 获取数据失败!", ERROR)
logging(log, f"{url} 获取数据失败!", ERROR)
return {}
data = self.explore.run(html)
# self.rich_log(log, data) # 调试代码
namespace = self.__generate_data_object(html)
data = self.explore.run(namespace)
# logging(log, data) # 调试代码
if not data:
self.rich_log(log, f"{url} 提取数据失败!", ERROR)
logging(log, f"{url} 提取数据失败!", ERROR)
return {}
match data["作品类型"]:
case "视频":
self.__extract_video(data, html)
self.__extract_video(data, namespace)
case "图文":
self.__extract_image(data, html)
self.__extract_image(data, namespace)
case _:
data["下载地址"] = []
await self.__download_files(data, download, log, bar)
self.rich_log(log, f"作品处理完成:{url}")
logging(log, f"作品处理完成:{url}")
return data

def __generate_data_object(self, html: str) -> Namespace:
data = self.convert.run(html)
return Namespace(data)

def __naming_rules(self, data: dict) -> str:
"""下载文件默认使用 作品标题 或 作品 ID 作为文件名称,可修改此方法自定义文件名称格式"""
return self.manager.filter_name(data["作品标题"]) or data["作品ID"]
Expand All @@ -135,6 +141,4 @@ async def __aexit__(self, exc_type, exc_value, traceback):
await self.close()

async def close(self):
self.manager.clean()
await self.html.session.close()
await self.download.session.close()
await self.manager.close()
Loading

0 comments on commit 48cfa60

Please sign in to comment.