# main.py
# -*- coding: utf-8 -*-
import os
import json
import random
import threading
import time
from typing import Union

import requests
import schedule
import uvicorn
import hydra
from bs4 import BeautifulSoup
from omegaconf import DictConfig
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware

from utils.spider import AVSpider, HacgSpider
from utils.logger import setup_logger
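
# A minimal sketch of the Hydra config this app expects (data/config.yaml).
# The keys are exactly the ones referenced below; the values are illustrative
# assumptions, not the project's actual settings:
#
#   app:
#     cors_origins: ["*"]
#     cors_credentials: true
#     cors_methods: ["*"]
#     cors_headers: ["*"]
#   files:
#     hacg_json_path: data/hacg.json        # hypothetical path
#     video_urls_txt_path: data/videos.txt  # hypothetical path
#   av_spider:
#     source_url: ...
#     proxy_url: ...
#     use_proxy: true
#   hacg_spider:
#     source_url: ...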
@hydra.main(config_path='data/', config_name='config', version_base=None)
def main(cfg: DictConfig):
    # Initialize the logger
    logger = setup_logger(cfg)

    app = FastAPI()

    @app.on_event("startup")
    async def startup_event():
        # Re-initialize the logger once the app has started; nonlocal makes the
        # handlers below see the refreshed instance
        nonlocal logger
        logger = setup_logger(cfg)

    app.add_middleware(
        CORSMiddleware,
        allow_origins=cfg.app.cors_origins,
        allow_credentials=cfg.app.cors_credentials,
        allow_methods=cfg.app.cors_methods,
        allow_headers=cfg.app.cors_headers,
    )

    def get_image_url(video_url: str) -> Union[str, None]:
        """Derive an image directory from a video URL and return one image link."""
        try:
            # Build the image directory URL from the video URL
            image_dir_url = video_url.replace('index.m3u8', 'image/')
            # Fetch the directory listing
            response = requests.get(image_dir_url, timeout=20)  # Timeout guards against hanging requests
            response.raise_for_status()  # Raise HTTPError for non-200 responses
            # Parse the HTML and extract the links
            soup = BeautifulSoup(response.text, 'html.parser')
            a_tags = soup.find_all('a', href=True)  # Only <a> tags with an href attribute
            # Collect links, excluding the parent-directory entry, and split out .webp files
            links = [image_dir_url + tag['href'] for tag in a_tags if tag['href'] != '../']
            webp_links = [link for link in links if link.endswith('.webp')]
            # Prefer .webp links; otherwise pick randomly from the remaining ones
            if not links:
                logger.warning("No image links found.")
                return None
            return random.choice(webp_links or links)
        except Exception as e:
            logger.error(f"Failed to obtain the image URL: {str(e)}")
            return None
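
    # Example of the URL rewrite performed above (host is hypothetical):
    #   https://example.com/video/123/index.m3u8 -> https://example.com/video/123/image/
    # get_image_url then picks one image link out of that directory listing,
    # preferring .webp files.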

    def read_random_line(file_path: str) -> tuple[str, str]:
        """Reads a random line from a given file and returns video URL and image URL."""
        if not os.path.isfile(file_path):
            logger.error("File not found")
            raise HTTPException(status_code=404, detail="File not found")
        with open(file_path, 'r') as file:
            lines = file.readlines()
        if not lines:
            logger.error("File is empty")
            raise HTTPException(status_code=400, detail="File is empty")
        random_line = random.choice(lines).strip()
        img_url = get_image_url(random_line)
        return random_line, img_url
@app.get("/v1/hacg")
async def read_hacg():
try:
with open(cfg.files.hacg_json_path, 'r', encoding='utf-8') as file:
data = json.load(file)
logger.info("HACG data fetched successfully")
return JSONResponse({"data": data}, headers={'content-type': 'application/json;charset=utf-8'})
except Exception as e:
logger.error(f"Failed to fetch HACG data: {str(e)}")
raise HTTPException(status_code=500, detail="Internal Server Error")
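
    # Usage sketch (host/port are illustrative; the server binds 0.0.0.0:8000 below):
    #   curl http://localhost:8000/v1/hacg
    # responds with {"data": <contents of hacg_json_path>}.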
@app.get("/v1/avcode/{code_str}")
async def crawl_av(code_str: str):
crawler = AVSpider(av_code=code_str,
source_url=cfg.av_spider.source_url,
proxy_url=cfg.av_spider.proxy_url,
use_proxy=cfg.av_spider.use_proxy,
cfg=cfg)
video_links = crawler.get_video_url()
all_magnet_links = []
for link in video_links:
magnet_links = crawler.get_magnet_links(link)
all_magnet_links.extend(magnet_links)
if not all_magnet_links:
logger.error("No magnet links found for AV code: %s", code_str)
raise HTTPException(status_code=404, detail="No magnet links found")
logger.info("Magnet links found for AV code: %s", code_str)
return {"status": "succeed", "data": [str(item) for item in all_magnet_links]}
@app.get("/v1/get_video")
async def get_random_video_url():
"""Returns a random video URL and its corresponding image URL."""
try:
file_path = cfg.files.video_urls_txt_path
video_url, img_url = read_random_line(file_path)
logger.info("Random video URL and image URL fetched successfully")
return {
"url": video_url,
"img_url": img_url or ""
}
except Exception as e:
logger.error(f"Failed to fetch random video URL: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
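
    # Usage sketch:
    #   curl http://localhost:8000/v1/get_video
    # returns {"url": <video URL>, "img_url": <image URL or "">}.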

    def run_hacg_spider():
        hacg_spider = HacgSpider(url=cfg.hacg_spider.source_url, filepath=cfg.files.hacg_json_path, cfg=cfg)
        hacg_spider.update_json_file()
        logger.info("HacgSpider task completed.")

    # Schedule the HacgSpider task to run daily at 1 AM
    schedule.every().day.at("01:00").do(run_hacg_spider)

    # Keep the scheduler running in the background
    def run_scheduler():
        while True:
            schedule.run_pending()
            time.sleep(60)  # Check every minute

    # Run the scheduler in a daemon thread so it does not block shutdown
    scheduler_thread = threading.Thread(target=run_scheduler)
    scheduler_thread.daemon = True
    scheduler_thread.start()

    uvicorn.run(app, host="0.0.0.0", port=8000)

if __name__ == "__main__":
    main()
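
# Run with `python main.py`. Hydra loads data/config.yaml and lets any key be
# overridden from the command line, e.g. (the value shown is illustrative):
#   python main.py av_spider.use_proxy=false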