Skip to content

Commit a8f5ec7

Browse files
committed
Support bilibili URL ingestion with automatic titles and pin direct dependencies to a verified compatible set
1 parent 28179ca commit a8f5ec7

3 files changed

Lines changed: 172 additions & 25 deletions

File tree

pyproject.toml

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,16 @@ description = "Intelligent video summarization and RSS feed generation using Qwe
55
requires-python = ">=3.11"
66
license = {text = "Apache-2.0"}
77
dependencies = [
8-
"qwen-asr[vllm]",
9-
"vllm>=0.8.5",
10-
"torch",
11-
"fastapi>=0.115",
12-
"uvicorn[standard]>=0.34",
13-
"asyncpg>=0.30",
14-
"httpx>=0.28",
15-
"feedparser>=6.0",
16-
"click>=8.1",
8+
"qwen-asr[vllm]==0.0.6",
9+
"vllm==0.14.0",
10+
"torch==2.9.1",
11+
"fastapi==0.134.0",
12+
"uvicorn[standard]==0.41.0",
13+
"asyncpg==0.31.0",
14+
"httpx==0.28.1",
15+
"feedparser==6.0.12",
16+
"click==8.3.1",
17+
"yt-dlp==2026.2.21",
1718
]
1819

1920
[project.scripts]

vra/media.py

Lines changed: 139 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,35 @@
11
from __future__ import annotations
22

33
import asyncio
4+
from dataclasses import dataclass
5+
import sys
46
import uuid
57
from pathlib import Path
8+
from urllib.parse import unquote, urlparse
69

710
import httpx
811

912

13+
# Lower-case path suffixes (leading dot included) that mark a URL as pointing
# straight at a media file.  URLs whose path ends in one of these are fetched
# directly over HTTP first; everything else is handed to yt-dlp.
_DIRECT_MEDIA_EXTENSIONS = {
    ".aac", ".flac", ".m4a", ".mka", ".mp3",
    ".mp4", ".ogg", ".opus", ".wav", ".webm",
}
25+
26+
27+
@dataclass(slots=True)
28+
class PreparedAudio:
29+
audio_path: Path
30+
title: str | None = None
31+
32+
1033
async def ensure_storage_dirs(storage_dir: str) -> None:
1134
base = Path(storage_dir)
1235
(base / "raw").mkdir(parents=True, exist_ok=True)
@@ -24,12 +47,17 @@ async def download_to_file(client: httpx.AsyncClient, url: str, dest: Path) -> N
2447

2548
async def extract_audio_ffmpeg(input_path: Path, output_path: Path) -> None:
2649
proc = await asyncio.create_subprocess_exec(
27-
"ffmpeg", "-y",
28-
"-i", str(input_path),
50+
"ffmpeg",
51+
"-y",
52+
"-i",
53+
str(input_path),
2954
"-vn",
30-
"-acodec", "pcm_s16le",
31-
"-ar", "16000",
32-
"-ac", "1",
55+
"-acodec",
56+
"pcm_s16le",
57+
"-ar",
58+
"16000",
59+
"-ac",
60+
"1",
3361
str(output_path),
3462
stdout=asyncio.subprocess.DEVNULL,
3563
stderr=asyncio.subprocess.PIPE,
@@ -43,24 +71,125 @@ def _is_url(value: str) -> bool:
4371
return value.startswith("http://") or value.startswith("https://")
4472

4573

74+
def _url_suffix(url: str) -> str:
    """Return the lower-cased file extension of the URL's path component.

    Only the path is inspected, so query strings and fragments never
    contribute.  The result includes the leading dot, or is ``""`` when the
    last path segment has no extension.
    """
    url_path = urlparse(url).path
    extension = Path(url_path).suffix
    return extension.lower()
76+
77+
78+
def _looks_like_direct_media_url(url: str) -> bool:
    """Tell whether the URL's path ends in one of the known media extensions."""
    extension = _url_suffix(url)
    return extension in _DIRECT_MEDIA_EXTENSIONS
80+
81+
82+
def _title_from_url(url: str) -> str | None:
    """Derive a fallback title from the last segment of the URL's path.

    The path is percent-decoded, its final segment is stripped of its
    extension and surrounding whitespace, and ``None`` is returned when
    nothing is left.
    """
    decoded_path = unquote(urlparse(url).path)
    candidate = Path(decoded_path).stem.strip()
    if not candidate:
        return None
    return candidate
85+
86+
87+
def _title_from_path(path: Path) -> str | None:
    """Derive a fallback title from a local file's name.

    Uses the file name without its final extension, trimmed of surrounding
    whitespace; returns ``None`` when that leaves an empty string.
    """
    candidate = path.stem.strip()
    if not candidate:
        return None
    return candidate
90+
91+
92+
def _parse_ytdlp_output(out_text: str) -> tuple[Path | None, str | None]:
    """Extract the downloaded file path and title from yt-dlp ``--print`` output."""
    downloaded_path: Path | None = None
    title: str | None = None

    for raw_line in out_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith("title:"):
            # Keep the last non-empty title that was printed.
            title = line.removeprefix("title:").strip() or title
            continue

        # Either an explicit "filepath:<path>" line or a bare path.
        value = line.removeprefix("filepath:").strip()
        if not value:
            # An empty value would become Path("."), which always exists.
            continue
        candidate = Path(value)
        try:
            # Require a regular file so a directory is never returned.
            if candidate.is_file():
                downloaded_path = candidate
        except OSError:
            # Arbitrary output lines are not always valid paths (e.g. too long).
            continue

    return downloaded_path, title


async def download_with_ytdlp(
    url: str,
    raw_dir: Path,
    file_id: uuid.UUID,
) -> tuple[Path, str | None]:
    """Download the best audio stream of *url* with yt-dlp.

    yt-dlp is run as ``python -m yt_dlp`` using the current interpreter and
    writes to ``raw_dir/<file_id>.<ext>``.

    Args:
        url: Page or media URL to hand to yt-dlp.
        raw_dir: Directory that receives the download; created if missing.
        file_id: Identifier used as the output file's stem.

    Returns:
        ``(path, title)`` where *path* is the downloaded file and *title* is
        the title reported by yt-dlp, or ``None`` if it reported none.

    Raises:
        RuntimeError: If yt-dlp exits non-zero, or exits successfully but no
            downloaded file can be located.
    """
    raw_dir.mkdir(parents=True, exist_ok=True)
    output_template = str(raw_dir / f"{file_id}.%(ext)s")

    proc = await asyncio.create_subprocess_exec(
        sys.executable,
        "-m",
        "yt_dlp",
        "--no-playlist",
        "--no-warnings",
        "-f",
        "bestaudio/best",
        "--output",
        output_template,
        "--print",
        # "|" supplies an empty default; without it a missing title is
        # printed as the "NA" placeholder and would be stored as the title.
        "title:%(title|)s",
        "--print",
        "after_move:filepath:%(filepath)s",
        # End of options: the URL must never be parsed as a flag.
        "--",
        url,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        err = stderr.decode(errors="replace").strip()
        raise RuntimeError(f"yt-dlp failed: {err}")

    downloaded_path, title = _parse_ytdlp_output(stdout.decode(errors="replace"))
    if downloaded_path is not None:
        return downloaded_path, title

    # Fallback: the path was not printed (or no longer exists), so look for
    # the newest finished file carrying our id, skipping yt-dlp temp files.
    temp_suffixes = {".part", ".ytdl"}
    finished = [
        p
        for p in raw_dir.glob(f"{file_id}.*")
        if p.is_file() and p.suffix not in temp_suffixes
    ]
    if finished:
        return max(finished, key=lambda p: p.stat().st_mtime), title

    raise RuntimeError("yt-dlp completed but no media file was found")
156+
157+
46158
async def prepare_audio(
    client: httpx.AsyncClient,
    source: str,
    storage_dir: str,
) -> PreparedAudio:
    """Fetch *source* if needed and extract its audio track with ffmpeg.

    Args:
        client: HTTP client used for direct media downloads.
        source: An ``http(s)`` URL or a path to a local file.
        storage_dir: Base directory; downloads go to ``raw/`` and the
            extracted audio to ``audio/<uuid>.wav`` beneath it.

    Returns:
        A ``PreparedAudio`` with the extracted audio path and a best-effort
        title (from yt-dlp, the URL path, or the local file name).

    Raises:
        FileNotFoundError: If *source* is a local path that does not exist.
        RuntimeError: Propagated from the yt-dlp download when it fails.
    """
    await ensure_storage_dirs(storage_dir)
    base = Path(storage_dir)
    file_id = uuid.uuid4()
    inferred_title: str | None = None

    if _is_url(source):
        input_path: Path | None = None

        if _looks_like_direct_media_url(source):
            raw_path = base / "raw" / f"{file_id}{_url_suffix(source)}"
            try:
                await download_to_file(client, source, raw_path)
            except Exception:
                # Deliberate best-effort: any failure falls through to yt-dlp.
                # Remove the partial file first. It shares the stem yt-dlp
                # writes to, so yt-dlp could treat it as already downloaded.
                try:
                    raw_path.unlink(missing_ok=True)
                except OSError:
                    pass
            else:
                input_path = raw_path
                inferred_title = _title_from_url(source)

        if input_path is None:
            input_path, inferred_title = await download_with_ytdlp(
                source, base / "raw", file_id
            )

        if not inferred_title:
            # yt-dlp reported no title; fall back to the URL's last segment.
            inferred_title = _title_from_url(source)
    else:
        input_path = Path(source)
        if not input_path.exists():
            raise FileNotFoundError(f"Source file not found: {source}")
        inferred_title = _title_from_path(input_path)

    output_path = base / "audio" / f"{file_id}.wav"
    await extract_audio_ffmpeg(input_path, output_path)
    return PreparedAudio(audio_path=output_path, title=inferred_title)

vra/pipeline.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,9 @@ async def ingest_feed(
9191
if not source_url:
9292
continue
9393

94-
video_id = await self._db.upsert_video(feed_id, guid, title, source_url, pub_dt)
94+
video_id = await self._db.upsert_video(
95+
feed_id, guid, title, source_url, pub_dt
96+
)
9597
report.item_count += 1
9698

9799
if process:
@@ -101,11 +103,15 @@ async def ingest_feed(
101103

102104
return report
103105

104-
async def process_source(self, source_url: str, title: str | None = None) -> ProcessReport:
106+
async def process_source(
    self, source_url: str, title: str | None = None
) -> ProcessReport:
    """Register a standalone source and run it through processing.

    The video is upserted with no feed id, GUID or publication date, and
    the id returned by the upsert is passed to ``_process_with_video``.
    """
    standalone_id = await self._db.upsert_video(
        None, None, title, source_url, None
    )
    report = await self._process_with_video(standalone_id, source_url, title)
    return report
107111

108-
async def rss_feed(self, title: str, link: str, description: str, limit: int = 20) -> str:
112+
async def rss_feed(
    self, title: str, link: str, description: str, limit: int = 20
) -> str:
    """Render the latest summaries as a feed string.

    Fetches up to *limit* records from the database's ``latest_summaries``
    and passes them, with the feed metadata, to ``render_feed``.
    """
    latest = await self._db.latest_summaries(limit)
    feed_text = render_feed(title, link, description, latest)
    return feed_text
111117

@@ -119,22 +125,32 @@ async def _process_with_video(
119125
source_url: str,
120126
title: str | None,
121127
) -> ProcessReport:
122-
audio_path = await prepare_audio(self._client, source_url, self._config.storage_dir)
123-
audio_str = str(audio_path)
128+
prepared = await prepare_audio(
129+
self._client, source_url, self._config.storage_dir
130+
)
131+
resolved_title = title or prepared.title
132+
133+
if resolved_title and resolved_title != title:
134+
await self._db.upsert_video(None, None, resolved_title, source_url, None)
135+
136+
audio_str = str(prepared.audio_path)
124137

125138
tr = await asyncio.to_thread(self._transcriber.transcribe, audio_str)
126139
sr = await asyncio.to_thread(self._summarizer.summarize, tr.text)
127140

128141
await self._db.insert_transcript(video_id, tr)
129142
await self._db.insert_summary(video_id, sr)
130143

131-
return ProcessReport(source_url=source_url, title=title, transcription=tr, summary=sr)
144+
return ProcessReport(
145+
source_url=source_url, title=resolved_title, transcription=tr, summary=sr
146+
)
132147

133148

134149
# ------------------------------------------------------------------
135150
# Helpers
136151
# ------------------------------------------------------------------
137152

153+
138154
def _pick_source_url(entry) -> str | None:
139155
enclosures = entry.get("enclosures", [])
140156
if enclosures:
@@ -147,6 +163,7 @@ def _pick_source_url(entry) -> str | None:
147163

148164
def _struct_to_dt(st):
149165
from datetime import datetime, timezone
166+
150167
try:
151168
return datetime(*st[:6], tzinfo=timezone.utc)
152169
except Exception:

0 commit comments

Comments
 (0)