""" CloudSearch TMDB Enricher v1.0.0 自动匹配影视元数据:海报、评分、简介、年份、类型 """ import time import logging from typing import Optional, Dict, Any, List from dataclasses import dataclass, field import requests logger = logging.getLogger(__name__) TMDB_API_BASE = "https://api.themoviedb.org/3" TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w500" @dataclass class MediaInfo: """影视元数据""" title: str = "" original_title: str = "" year: str = "" poster_url: str = "" backdrop_url: str = "" rating: str = "" rating_count: int = 0 description: str = "" genres: List[str] = field(default_factory=list) media_type: str = "" # movie / tv tmdb_id: int = 0 directors: List[str] = field(default_factory=list) actors: List[str] = field(default_factory=list) region: str = "" duration: str = "" seasons: int = 0 episodes: int = 0 source: str = "tmdb" tmdb_url: str = "" class TMDBEnricher: """TMDB 影视信息增强器""" # 常见网盘文件名模式 → 影视标题提取 TITLE_PATTERNS = [ # [4K] 流浪地球2 (2023) (r'\[.*?\]\s*(.+?)\s*[\((](\d{4})[\))]', 2), # 流浪地球2.2023.4K (r'(.+?)\.(\d{4})\.(?:4K|1080[Pp]|2160[Pp]|HD)', 2), # 流浪地球2 2023 (r'(.+?)\s+(\d{4})\s', 2), # S01E01 格式 (r'(.+?)[\.\s][Ss](\d{2})[Ee](\d{2})', 1), ] def __init__(self, api_key: str, language: str = "zh-CN", cache_ttl: int = 86400): self.api_key = api_key self.language = language self.cache_ttl = cache_ttl self._cache: Dict[str, tuple] = {} # key → (data, timestamp) def enrich(self, title: str, media_type: str = None) -> Optional[MediaInfo]: """根据标题查询 TMDB 元数据""" clean_title, year = self._extract_title_year(title) cache_key = f"{clean_title}:{year}:{media_type}" if cache_key in self._cache: data, ts = self._cache[cache_key] if time.time() - ts < self.cache_ttl: return data # 智能判断类型 if not media_type: media_type = self._guess_type(clean_title) info = self._search(clean_title, year, media_type) if info: self._cache[cache_key] = (info, time.time()) return info def enrich_batch(self, titles: List[str], max_concurrent: int = 5) -> Dict[str, MediaInfo]: """批量查询""" from concurrent.futures import ThreadPoolExecutor, as_completed results = {} with ThreadPoolExecutor(max_workers=max_concurrent) as ex: futures = {ex.submit(self.enrich, t): t for t in titles} for f in as_completed(futures): try: results[futures[f]] = f.result() except Exception as e: logger.warning(f"TMDB enrich failed: {futures[f]} - {e}") return results def _extract_title_year(self, title: str) -> tuple: """从文件名提取标题和年份""" import re for pattern, year_group in self.TITLE_PATTERNS: m = re.search(pattern, title, re.IGNORECASE) if m: name = m.group(1).strip() year = m.group(year_group) if year_group <= len(m.groups()) else "" # 去掉常见的后缀 name = re.sub(r'\s*[\[((].*?(?:完结|全\d+集|更新).*?[\]))]', '', name) return name.strip(), year return title.strip(), "" def _guess_type(self, title: str) -> str: """根据标题特征判断电影/电视剧""" import re tv_patterns = [ r'[Ss]\d{2}[Ee]\d{2}', r'第[一二三四五六七八九十\d]+季', r'[Ss]eason\s*\d+', r'全\d+集', r'更新至\d+', ] for p in tv_patterns: if re.search(p, title): return "tv" return "movie" def _search(self, title: str, year: str = "", media_type: str = "movie") -> Optional[MediaInfo]: """搜索 TMDB""" try: # 搜索 search_type = "tv" if media_type == "tv" else "movie" params = { "api_key": self.api_key, "query": title, "language": self.language, "page": 1, } if year: params["year" if search_type == "movie" else "first_air_date_year"] = year resp = requests.get( f"{TMDB_API_BASE}/search/{search_type}", params=params, timeout=10 ) data = resp.json() results = data.get("results", []) if not results and search_type == "movie": # 电视剧也试一下 resp2 = requests.get( f"{TMDB_API_BASE}/search/tv", params=params, timeout=10 ) data2 = resp2.json() results = data2.get("results", []) if not results: return None item = results[0] return self._parse_result(item, media_type) except Exception as e: logger.error(f"TMDB search error: {title} - {e}") return None def _parse_result(self, item: dict, media_type: str) -> MediaInfo: """解析 TMDB 返回""" mid = item.get("id", 0) is_tv = media_type == "tv" or item.get("media_type") == "tv" return MediaInfo( title=item.get("title") or item.get("name", ""), original_title=item.get("original_title") or item.get("original_name", ""), year=str(item.get("release_date", item.get("first_air_date", ""))[:4]), poster_url=f"{TMDB_IMAGE_BASE}{item['poster_path']}" if item.get("poster_path") else "", backdrop_url=f"{TMDB_IMAGE_BASE}{item['backdrop_path']}" if item.get("backdrop_path") else "", rating=str(round(item.get("vote_average", 0), 1)), rating_count=item.get("vote_count", 0), description=(item.get("overview") or "")[:500], genres=[g.get("name", "") for g in item.get("genre_ids", [])], media_type="tv" if is_tv else "movie", tmdb_id=mid, tmdb_url=f"https://www.themoviedb.org/{'tv' if is_tv else 'movie'}/{mid}", )