Files
CloudSearch/cloudsearch_enrich/tmdb_enricher.py
admin 83cbfaf03f v0.2.7: 修复Redis连接 + 启动管理后台
- 修复Redis认证 (配置密码)
- 启动Python管理后台 (端口9531, 15个功能开关)
- 统一版本号 0.2.7
- 更新docker-compose.yml (镜像版本/Redis URL/Admin服务)
2026-05-17 02:22:18 +08:00

180 lines
6.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
CloudSearch TMDB Enricher v1.0.0
自动匹配影视元数据:海报、评分、简介、年份、类型
"""
import time
import logging
from typing import Optional, Dict, Any, List
from dataclasses import dataclass, field
import requests
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Root URL of the TMDB v3 REST API; endpoint paths are appended to it.
TMDB_API_BASE = "https://api.themoviedb.org/3"
# Prefix for poster/backdrop image URLs; the image path returned by the API is appended.
# NOTE(review): "w500" presumably selects the 500px-wide rendition - confirm against the TMDB image docs.
TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w500"
@dataclass
class MediaInfo:
    """Metadata record describing one movie or TV show.

    Every field has a default, so ``MediaInfo()`` yields an empty record
    and callers may fill in only what they know.  Field order is part of
    the positional constructor signature and must not be rearranged.
    """

    # --- identity -------------------------------------------------------
    title: str = ""                 # display title
    original_title: str = ""        # title in the original language
    year: str = ""                  # year kept as text; "" when unknown

    # --- artwork --------------------------------------------------------
    poster_url: str = ""            # poster image URL, "" when absent
    backdrop_url: str = ""          # backdrop image URL, "" when absent

    # --- rating and synopsis --------------------------------------------
    rating: str = ""                # rating kept as text
    rating_count: int = 0           # number of votes behind the rating
    description: str = ""           # synopsis / overview text

    # --- classification -------------------------------------------------
    genres: List[str] = field(default_factory=list)     # genre names
    media_type: str = ""            # "movie" or "tv"
    tmdb_id: int = 0                # TMDB numeric id, 0 when unset

    # --- credits and extra details --------------------------------------
    directors: List[str] = field(default_factory=list)  # director names
    actors: List[str] = field(default_factory=list)     # cast names
    region: str = ""                # production region
    duration: str = ""              # runtime kept as text
    seasons: int = 0                # season count
    episodes: int = 0               # episode count

    # --- provenance -----------------------------------------------------
    source: str = "tmdb"            # which provider produced the record
    tmdb_url: str = ""              # link to the entry's page on the TMDB site
class TMDBEnricher:
    """Enrich file-name-like titles with movie / TV metadata from TMDB.

    A raw title is reduced to a clean name plus an optional year, looked
    up through the TMDB search API, and the first hit is returned as a
    ``MediaInfo``.  Successful lookups are kept in an in-memory,
    per-instance cache for ``cache_ttl`` seconds.
    """

    # Common cloud-drive file-name shapes: (regex, index of the year group).
    # Group 1 is always the title.  An index of 0 marks a pattern that
    # carries no year at all.
    TITLE_PATTERNS = [
        # [4K] 流浪地球2 (2023)
        (r'\[.*?\]\s*(.+?)\s*[\(](\d{4})[\)]', 2),
        # 流浪地球2.2023.4K
        (r'(.+?)\.(\d{4})\.(?:4K|1080[Pp]|2160[Pp]|HD)', 2),
        # 流浪地球2 2023 (the year must be followed by whitespace)
        (r'(.+?)\s+(\d{4})\s', 2),
        # Title.S01E01 - season/episode numbers only, no year
        (r'(.+?)[\.\s][Ss](\d{2})[Ee](\d{2})', 0),
    ]

    # Markers that identify a title as a TV series rather than a movie.
    _TV_PATTERNS = (
        r'[Ss]\d{2}[Ee]\d{2}', r'第[一二三四五六七八九十\d]+季',
        r'[Ss]eason\s*\d+', r'\d+集', r'更新至\d+',
    )

    def __init__(self, api_key: str, language: str = "zh-CN",
                 cache_ttl: int = 86400):
        """Store the API credentials and create an empty result cache.

        Args:
            api_key: TMDB API key sent with every request.
            language: Value of the ``language`` request parameter.
            cache_ttl: Seconds a cached result stays valid.
        """
        self.api_key = api_key
        self.language = language
        self.cache_ttl = cache_ttl
        self._cache: Dict[str, tuple] = {}  # key -> (data, timestamp)

    def enrich(self, title: str, media_type: Optional[str] = None) -> Optional["MediaInfo"]:
        """Return TMDB metadata for ``title``, or ``None`` when nothing matches.

        Args:
            title: Raw title or file name.
            media_type: ``"movie"`` or ``"tv"``; guessed from ``title`` when
                omitted or empty.
        """
        clean_title, year = self._extract_title_year(title)
        if not clean_title:
            # Nothing to search for; avoid a pointless request.
            return None

        if not media_type:
            # Guess from the RAW title: the cleaned name has already lost
            # the season/episode markers the guess relies on.
            media_type = self._guess_type(title)

        # The key is built after the guess so that explicit and guessed
        # lookups of the same title share one cache entry.
        cache_key = f"{clean_title}:{year}:{media_type}"
        cached = self._cache.get(cache_key)
        if cached is not None:
            data, ts = cached
            if time.time() - ts < self.cache_ttl:
                return data

        info = self._search(clean_title, year, media_type)
        if info:
            # Only hits are cached; misses are retried on the next call.
            self._cache[cache_key] = (info, time.time())
        return info

    def enrich_batch(self, titles: List[str], max_concurrent: int = 5) -> Dict[str, Optional["MediaInfo"]]:
        """Look up many titles concurrently.

        Args:
            titles: Raw titles to enrich.
            max_concurrent: Size of the worker thread pool.

        Returns:
            Mapping of each input title to its ``enrich`` result (``None``
            when no match was found).  Titles whose lookup raised are
            logged and left out of the mapping.
        """
        from concurrent.futures import ThreadPoolExecutor, as_completed

        results = {}
        with ThreadPoolExecutor(max_workers=max_concurrent) as ex:
            futures = {ex.submit(self.enrich, t): t for t in titles}
            for f in as_completed(futures):
                try:
                    results[futures[f]] = f.result()
                except Exception as e:
                    logger.warning("TMDB enrich failed: %s - %s", futures[f], e)
        return results

    def _extract_title_year(self, title: str) -> tuple:
        """Split a file-name-like ``title`` into ``(clean_name, year)``.

        The first entry of ``TITLE_PATTERNS`` that matches wins.  ``year``
        is ``""`` when the pattern has no year group; when no pattern
        matches at all, the stripped input is returned with an empty year.
        """
        import re

        for pattern, year_group in self.TITLE_PATTERNS:
            m = re.search(pattern, title, re.IGNORECASE)
            if not m:
                continue
            name = m.group(1).strip()
            # Group 1 is the title itself, so only a higher index can
            # point at a year; anything else means "no year".
            has_year = 1 < year_group <= m.re.groups
            year = (m.group(year_group) or "") if has_year else ""
            # Drop bracketed status suffixes (completed / N episodes / updating).
            name = re.sub(r'\s*[\[(].*?(?:完结|全\d+集|更新).*?[\])]', '', name)
            return name.strip(), year
        return title.strip(), ""

    def _guess_type(self, title: str) -> str:
        """Return ``"tv"`` if ``title`` carries a series marker, else ``"movie"``."""
        import re

        if any(re.search(p, title) for p in self._TV_PATTERNS):
            return "tv"
        return "movie"

    def _fetch_results(self, search_type: str, title: str, year: str) -> list:
        """Run one ``/search/{search_type}`` request and return its result list."""
        params = {
            "api_key": self.api_key,
            "query": title,
            "language": self.language,
            "page": 1,
        }
        if year:
            # Movie and TV searches use differently named year filters.
            year_param = "year" if search_type == "movie" else "first_air_date_year"
            params[year_param] = year
        resp = requests.get(
            f"{TMDB_API_BASE}/search/{search_type}",
            params=params, timeout=10,
        )
        # Surface HTTP errors (bad key, rate limit, ...) instead of reading
        # the error body as an empty result set.
        resp.raise_for_status()
        return resp.json().get("results") or []

    def _search(self, title: str, year: str = "", media_type: str = "movie") -> Optional["MediaInfo"]:
        """Search TMDB and return the first hit as ``MediaInfo``.

        A movie search that finds nothing is retried as a TV search.  Any
        failure (network, HTTP status, malformed payload) is logged and
        reported as ``None``; this method does not raise.
        """
        primary = "tv" if media_type == "tv" else "movie"
        attempts = ("tv",) if primary == "tv" else ("movie", "tv")
        try:
            for search_type in attempts:
                results = self._fetch_results(search_type, title, year)
                if results:
                    # Parse with the type that actually produced the hit so
                    # a TV fallback is not labelled as a movie.
                    return self._parse_result(results[0], search_type)
            return None
        except Exception as e:  # best-effort boundary: never break the caller
            logger.error("TMDB search error: %s - %s", title, e)
            return None

    def _parse_result(self, item: dict, media_type: str) -> "MediaInfo":
        """Convert one TMDB result dict into a ``MediaInfo``.

        Args:
            item: A single entry of a TMDB ``results`` list.
            media_type: Type of the search that returned ``item``; an
                ``item["media_type"] == "tv"`` marker also selects TV.
        """
        mid = item.get("id", 0)
        is_tv = media_type == "tv" or item.get("media_type") == "tv"
        kind = "tv" if is_tv else "movie"

        # Either date may be missing or null; fall through to "".
        date = item.get("release_date") or item.get("first_air_date") or ""
        poster = item.get("poster_path")
        backdrop = item.get("backdrop_path")

        # TMDB search results list genres as numeric ``genre_ids``; only
        # dict entries carry a name.  Bare ids are skipped because turning
        # them into names needs a separate genre-list lookup.
        raw_genres = item.get("genres") or item.get("genre_ids") or []
        genres = [
            g["name"] for g in raw_genres
            if isinstance(g, dict) and g.get("name")
        ]

        return MediaInfo(
            title=item.get("title") or item.get("name") or "",
            original_title=item.get("original_title") or item.get("original_name") or "",
            year=str(date)[:4],
            poster_url=f"{TMDB_IMAGE_BASE}{poster}" if poster else "",
            backdrop_url=f"{TMDB_IMAGE_BASE}{backdrop}" if backdrop else "",
            rating=str(round(item.get("vote_average") or 0, 1)),
            rating_count=item.get("vote_count") or 0,
            description=(item.get("overview") or "")[:500],
            genres=genres,
            media_type=kind,
            tmdb_id=mid,
            tmdb_url=f"https://www.themoviedb.org/{kind}/{mid}",
        )