diff --git a/README.md b/README.md index ac5f045..e19f0d8 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The definitive single-repository guide for parsing and preparing documents for R ### [Unstructured Documents](unstructured_documents/) -9 document types, 26+ extraction methods, all with working Python code and sample documents: +10 document types, 28+ extraction methods, all with working Python code and sample documents: | Type | Methods | Key Libraries | |------|---------|---------------| @@ -19,6 +19,7 @@ The definitive single-repository guide for parsing and preparing documents for R | Email (EML) | stdlib email parsing, structured extraction | email (stdlib) | | Markdown / Text | Chunking strategies, AST parsing, semantic chunking | mistune | | EPUB | ebooklib extraction, full text pipeline | ebooklib | +| Video | Whisper transcription, keyframe extraction | openai-whisper, opencv | ### [Advanced Methods](advanced_methods/) @@ -42,6 +43,9 @@ uv sync # Install optional OCR dependencies uv sync --extra ocr +# Install optional video dependencies (Whisper + OpenCV) +uv sync --extra video + # Install advanced parsing libraries (pick what you need) uv sync --extra docling # IBM Docling uv sync --extra unstructured # Unstructured.io diff --git a/pyproject.toml b/pyproject.toml index 6858d9f..ea1d565 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,3 +58,7 @@ llamaparse = [ marker = [ "marker-pdf>=1.0", ] +video = [ + "openai-whisper>=20231117", + "opencv-python-headless>=4.8", +] diff --git a/unstructured_documents/10_video/01_whisper_transcription.py b/unstructured_documents/10_video/01_whisper_transcription.py new file mode 100644 index 0000000..8569f1c --- /dev/null +++ b/unstructured_documents/10_video/01_whisper_transcription.py @@ -0,0 +1,337 @@ +""" +Video transcription using OpenAI Whisper. + +Whisper is a general-purpose speech recognition model from OpenAI. +It can transcribe audio in multiple languages with word-level timestamps. +For video files, we first extract the audio track, then run Whisper on it. + +Requirements: + - System: ffmpeg must be installed on the OS + macOS: brew install ffmpeg + Ubuntu: sudo apt-get install ffmpeg + Windows: https://www.gyan.dev/ffmpeg/builds/ + - Python: pip install openai-whisper + +Strengths: High accuracy, multilingual, word-level timestamps, fully local. +Weaknesses: Requires ffmpeg, GPU recommended for speed, large model downloads. + +Usage: + uv run python unstructured_documents/10_video/01_whisper_transcription.py +""" + +import subprocess +import sys +import tempfile +from pathlib import Path + +# Allow imports from project root +sys.path.insert(0, str(Path(__file__).resolve().parents[2])) + +from unstructured_documents.shared.chunking import ( + chunk_by_characters, + chunk_by_sentences, + preview_chunks, +) + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- +SAMPLE_DIR = Path(__file__).resolve().parent / "sample_docs" +LECTURE_VIDEO = SAMPLE_DIR / "lecture.mp4" + + +def check_whisper_available() -> bool: + """Check if openai-whisper and ffmpeg are available.""" + try: + import whisper # noqa: F401 + + return True + except ImportError: + print("=" * 60) + print("openai-whisper is NOT installed.") + print("=" * 60) + print() + print("Install it with:") + print(" uv sync --extra video") + print(" # or: uv pip install openai-whisper") + print() + print("You also need ffmpeg on your system:") + print(" macOS: brew install ffmpeg") + print(" Ubuntu: sudo apt-get install ffmpeg") + print(" Windows: https://www.gyan.dev/ffmpeg/builds/") + print() + return False + + +def check_ffmpeg_available() -> bool: + """Check if ffmpeg is available on the system.""" + try: + result = subprocess.run( + ["ffmpeg", "-version"], + capture_output=True, + text=True, + ) + return result.returncode == 0 + except FileNotFoundError: + print("=" * 60) + print("ffmpeg is NOT installed.") + print("=" * 60) + print() + print("Install it:") + print(" macOS: brew install ffmpeg") + print(" Ubuntu: sudo apt-get install ffmpeg") + print(" Windows: https://www.gyan.dev/ffmpeg/builds/") + print() + return False + + +def extract_audio(video_path: Path, output_path: Path | None = None) -> Path: + """ + Extract audio track from a video file using ffmpeg. + + Converts the audio to 16kHz mono WAV, which is the format Whisper expects. + If output_path is not specified, creates a temporary file. + """ + if output_path is None: + tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + output_path = Path(tmp.name) + tmp.close() + + subprocess.run( + [ + "ffmpeg", + "-i", + str(video_path), + "-vn", # No video + "-acodec", + "pcm_s16le", # 16-bit PCM + "-ar", + "16000", # 16kHz sample rate + "-ac", + "1", # Mono + "-y", # Overwrite + str(output_path), + ], + capture_output=True, + check=True, + ) + return output_path + + +def transcribe_video(video_path: Path, model_size: str = "base") -> dict: + """ + Transcribe a video file using Whisper. + + Extracts audio from the video, then runs Whisper speech recognition. + Returns a dict with the full transcript text, language, and segments. + + Model sizes (accuracy vs speed tradeoff): + - tiny: ~1GB RAM, fastest, lower accuracy + - base: ~1GB RAM, good balance (recommended to start) + - small: ~2GB RAM, better accuracy + - medium: ~5GB RAM, high accuracy + - large: ~10GB RAM, highest accuracy + """ + import whisper + + model = whisper.load_model(model_size) + + # Whisper can handle video files directly if ffmpeg is available + result = model.transcribe(str(video_path)) + + return { + "text": result["text"].strip(), + "language": result.get("language", "unknown"), + "segments": [ + { + "start": seg["start"], + "end": seg["end"], + "text": seg["text"].strip(), + } + for seg in result.get("segments", []) + ], + } + + +def transcribe_with_timestamps( + video_path: Path, model_size: str = "base" +) -> list[dict]: + """ + Transcribe video and return timestamped segments. + + Each segment includes start time, end time, and text. + Useful for creating time-indexed chunks for RAG so users + can be directed to the exact moment in the video. + """ + result = transcribe_video(video_path, model_size) + return result["segments"] + + +def extract_metadata(video_path: Path) -> dict: + """ + Extract video metadata using ffprobe (part of ffmpeg). + + Returns duration, resolution, frame rate, codec, and file size. + """ + result = subprocess.run( + [ + "ffprobe", + "-v", + "quiet", + "-print_format", + "json", + "-show_format", + "-show_streams", + str(video_path), + ], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return {"error": "ffprobe failed", "file": str(video_path)} + + import json + + probe = json.loads(result.stdout) + + metadata = { + "filename": video_path.name, + "file_size_bytes": int(probe.get("format", {}).get("size", 0)), + "duration_seconds": float(probe.get("format", {}).get("duration", 0)), + "format": probe.get("format", {}).get("format_long_name", "unknown"), + } + + # Find video stream info + for stream in probe.get("streams", []): + if stream.get("codec_type") == "video": + metadata["width"] = stream.get("width") + metadata["height"] = stream.get("height") + metadata["fps"] = stream.get("r_frame_rate", "unknown") + metadata["video_codec"] = stream.get("codec_name", "unknown") + break + + # Find audio stream info + for stream in probe.get("streams", []): + if stream.get("codec_type") == "audio": + metadata["audio_codec"] = stream.get("codec_name", "unknown") + metadata["sample_rate"] = stream.get("sample_rate", "unknown") + metadata["channels"] = stream.get("channels", 0) + break + + return metadata + + +def format_timestamp(seconds: float) -> str: + """Convert seconds to HH:MM:SS format.""" + h = int(seconds // 3600) + m = int((seconds % 3600) // 60) + s = int(seconds % 60) + return f"{h:02d}:{m:02d}:{s:02d}" + + +def demo_metadata(): + """Show video metadata extraction.""" + print("=" * 70) + print("1. VIDEO METADATA EXTRACTION (ffprobe)") + print("=" * 70) + + metadata = extract_metadata(LECTURE_VIDEO) + for key, value in metadata.items(): + print(f" {key:20s}: {value}") + + +def demo_transcription(): + """Show full video transcription using Whisper.""" + print("\n" + "=" * 70) + print("2. FULL TRANSCRIPTION (Whisper)") + print("=" * 70) + + result = transcribe_video(LECTURE_VIDEO, model_size="base") + print(f"\nDetected language: {result['language']}") + print(f"Transcript length: {len(result['text']):,} characters") + print(f"\nFull transcript:") + print(result["text"][:500]) + if len(result["text"]) > 500: + print("...") + + +def demo_timestamped_segments(): + """Show timestamped transcription segments.""" + print("\n" + "=" * 70) + print("3. TIMESTAMPED SEGMENTS (Whisper)") + print("=" * 70) + + segments = transcribe_with_timestamps(LECTURE_VIDEO, model_size="base") + print(f"\nTotal segments: {len(segments)}") + for seg in segments[:10]: + start = format_timestamp(seg["start"]) + end = format_timestamp(seg["end"]) + print(f" [{start} -> {end}] {seg['text']}") + + if len(segments) > 10: + print(f" ... and {len(segments) - 10} more segments") + + +def demo_chunking(): + """Show how to chunk transcribed text for RAG.""" + print("\n" + "=" * 70) + print("4. CHUNKING TRANSCRIPT FOR RAG") + print("=" * 70) + + result = transcribe_video(LECTURE_VIDEO, model_size="base") + text = result["text"] + print(f"\nTranscript length: {len(text):,} characters") + + # Strategy 1: Character chunks + print("\n--- Strategy: Character Chunking (500 chars, 50 overlap) ---") + char_chunks = chunk_by_characters(text, chunk_size=500, overlap=50) + preview_chunks(char_chunks, max_preview=2, max_chars=150) + + # Strategy 2: Sentence-based chunks + print("\n--- Strategy: Sentence Chunking (5 sentences per chunk) ---") + sent_chunks = chunk_by_sentences(text, sentences_per_chunk=5, overlap_sentences=1) + preview_chunks(sent_chunks, max_preview=2, max_chars=150) + + # Summary comparison + print("\n--- Chunking Strategy Comparison ---") + print(f" {'Strategy':<25s} {'Chunks':>8s} {'Avg Size':>10s}") + print(f" {'-'*25} {'-'*8} {'-'*10}") + for name, chunks in [ + ("Character (500)", char_chunks), + ("Sentence (5/chunk)", sent_chunks), + ]: + sizes = [len(c) for c in chunks] + avg = sum(sizes) / len(sizes) if sizes else 0 + print(f" {name:<25s} {len(chunks):>8d} {avg:>10.1f}") + + +if __name__ == "__main__": + if not LECTURE_VIDEO.exists(): + print(f"ERROR: {LECTURE_VIDEO} not found.") + print("Run generate_samples.py first:") + print( + " uv run python unstructured_documents/10_video/sample_docs/generate_samples.py" + ) + sys.exit(1) + + # Always show metadata (requires only ffprobe) + if check_ffmpeg_available(): + demo_metadata() + else: + print("Skipping metadata demo (ffmpeg/ffprobe not available).") + + # Transcription demos require Whisper + if check_whisper_available() and check_ffmpeg_available(): + demo_transcription() + demo_timestamped_segments() + demo_chunking() + + print("\n" + "=" * 70) + print("Done. Whisper provides accurate speech-to-text for video RAG.") + print("For visual content extraction, see 02_frame_extraction.py") + print("=" * 70) + else: + print("\nSkipping transcription demos (Whisper or ffmpeg not available).") + print("Install with: uv sync --extra video") + print("Also install ffmpeg on your system (see docs above).") diff --git a/unstructured_documents/10_video/02_frame_extraction.py b/unstructured_documents/10_video/02_frame_extraction.py new file mode 100644 index 0000000..6133c89 --- /dev/null +++ b/unstructured_documents/10_video/02_frame_extraction.py @@ -0,0 +1,366 @@ +""" +Video frame extraction using OpenCV. + +Extract frames from video files for visual analysis in RAG systems. +Keyframes capture visual content (slides, diagrams, scenes) that text +transcription alone would miss. Combine with Whisper transcripts for +comprehensive video understanding. + +Requirements: + - Python: opencv-python-headless (or opencv-python) + +Strengths: No external services needed, fast, precise frame control. +Weaknesses: Frames are images (need vision model for text description), + keyframe detection is heuristic-based. + +Usage: + uv run python unstructured_documents/10_video/02_frame_extraction.py +""" + +import sys +from pathlib import Path + +# Allow imports from project root +sys.path.insert(0, str(Path(__file__).resolve().parents[2])) + +from unstructured_documents.shared.chunking import ( + chunk_by_characters, + preview_chunks, +) + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- +SAMPLE_DIR = Path(__file__).resolve().parent / "sample_docs" +LECTURE_VIDEO = SAMPLE_DIR / "lecture.mp4" + + +def check_opencv_available() -> bool: + """Check if OpenCV is available.""" + try: + import cv2 # noqa: F401 + + return True + except ImportError: + print("=" * 60) + print("opencv-python-headless is NOT installed.") + print("=" * 60) + print() + print("Install it with:") + print(" uv sync --extra video") + print(" # or: uv pip install opencv-python-headless") + print() + return False + + +def extract_metadata(video_path: Path) -> dict: + """ + Extract video metadata using OpenCV. + + Returns basic video properties: duration, frame count, resolution, FPS. + For more detailed metadata (codecs, audio info), use ffprobe + (see 01_whisper_transcription.py). + """ + import cv2 + + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + return {"error": f"Cannot open video: {video_path}"} + + metadata = { + "filename": video_path.name, + "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), + "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)), + "fps": cap.get(cv2.CAP_PROP_FPS), + "frame_count": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), + "duration_seconds": ( + cap.get(cv2.CAP_PROP_FRAME_COUNT) / cap.get(cv2.CAP_PROP_FPS) + if cap.get(cv2.CAP_PROP_FPS) > 0 + else 0 + ), + "codec": int(cap.get(cv2.CAP_PROP_FOURCC)), + } + + cap.release() + return metadata + + +def extract_frames_at_interval( + video_path: Path, + interval_sec: float = 1.0, + output_dir: Path | None = None, +) -> list[dict]: + """ + Extract frames at regular time intervals. + + Good for: slide-based videos, lectures, presentations where content + changes at predictable intervals. + + Returns list of dicts with frame index, timestamp, and file path. + If output_dir is None, frames are not saved to disk (metadata only). + """ + import cv2 + + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + return [] + + fps = cap.get(cv2.CAP_PROP_FPS) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + frame_interval = max(1, int(fps * interval_sec)) + + frames = [] + frame_idx = 0 + + while True: + ret, frame = cap.read() + if not ret: + break + + if frame_idx % frame_interval == 0: + timestamp = frame_idx / fps if fps > 0 else 0 + frame_info = { + "frame_index": frame_idx, + "timestamp_sec": round(timestamp, 2), + "width": frame.shape[1], + "height": frame.shape[0], + } + + if output_dir is not None: + output_dir.mkdir(parents=True, exist_ok=True) + filename = f"frame_{frame_idx:06d}.png" + filepath = output_dir / filename + cv2.imwrite(str(filepath), frame) + frame_info["file_path"] = str(filepath) + + frames.append(frame_info) + + frame_idx += 1 + + cap.release() + return frames + + +def extract_keyframes( + video_path: Path, + threshold: float = 30.0, + output_dir: Path | None = None, +) -> list[dict]: + """ + Extract keyframes based on scene change detection. + + Compares consecutive frames using mean absolute difference. + When the difference exceeds the threshold, the frame is considered + a scene change / keyframe. + + Good for: videos with distinct scenes, slide transitions, topic changes. + The threshold controls sensitivity (lower = more keyframes detected). + + Returns list of dicts with frame index, timestamp, change score, and + optionally the saved file path. + """ + import cv2 + import numpy as np + + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + return [] + + fps = cap.get(cv2.CAP_PROP_FPS) + keyframes = [] + prev_gray = None + frame_idx = 0 + + while True: + ret, frame = cap.read() + if not ret: + break + + gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + + if prev_gray is not None: + diff = cv2.absdiff(prev_gray, gray) + change_score = float(np.mean(diff)) + + if change_score > threshold: + timestamp = frame_idx / fps if fps > 0 else 0 + frame_info = { + "frame_index": frame_idx, + "timestamp_sec": round(timestamp, 2), + "change_score": round(change_score, 2), + "width": frame.shape[1], + "height": frame.shape[0], + } + + if output_dir is not None: + output_dir.mkdir(parents=True, exist_ok=True) + filename = f"keyframe_{frame_idx:06d}.png" + filepath = output_dir / filename + cv2.imwrite(str(filepath), frame) + frame_info["file_path"] = str(filepath) + + keyframes.append(frame_info) + + # Always capture first frame as a keyframe + elif frame_idx == 0: + timestamp = 0.0 + frame_info = { + "frame_index": 0, + "timestamp_sec": 0.0, + "change_score": 0.0, + "width": frame.shape[1], + "height": frame.shape[0], + } + + if output_dir is not None: + output_dir.mkdir(parents=True, exist_ok=True) + filepath = output_dir / "keyframe_000000.png" + cv2.imwrite(str(filepath), frame) + frame_info["file_path"] = str(filepath) + + keyframes.append(frame_info) + + prev_gray = gray + frame_idx += 1 + + cap.release() + return keyframes + + +def build_frame_descriptions(keyframes: list[dict]) -> str: + """ + Build a text representation of extracted keyframes. + + Creates a structured text from keyframe metadata that can be + chunked and embedded for RAG. In production, you would use a + vision model (GPT-4o, Claude, etc.) to describe each frame's + visual content. + + This function creates placeholder descriptions from metadata. + Replace with vision model calls for real applications. + """ + lines = [] + for i, kf in enumerate(keyframes): + timestamp = kf["timestamp_sec"] + mins = int(timestamp // 60) + secs = int(timestamp % 60) + lines.append( + f"[{mins:02d}:{secs:02d}] Keyframe {i + 1}: " + f"Scene at {kf['timestamp_sec']}s " + f"(change score: {kf.get('change_score', 'N/A')}, " + f"resolution: {kf['width']}x{kf['height']})" + ) + + return "\n".join(lines) + + +def demo_metadata(): + """Show video metadata extraction.""" + print("=" * 70) + print("1. VIDEO METADATA EXTRACTION (OpenCV)") + print("=" * 70) + + metadata = extract_metadata(LECTURE_VIDEO) + for key, value in metadata.items(): + print(f" {key:20s}: {value}") + + +def demo_interval_extraction(): + """Show frame extraction at regular intervals.""" + print("\n" + "=" * 70) + print("2. FRAME EXTRACTION AT INTERVALS (every 1 second)") + print("=" * 70) + + frames = extract_frames_at_interval(LECTURE_VIDEO, interval_sec=1.0) + print(f"\nExtracted {len(frames)} frames at 1-second intervals") + for f in frames[:5]: + print( + f" Frame {f['frame_index']:>6d} | " + f"t={f['timestamp_sec']:>6.2f}s | " + f"{f['width']}x{f['height']}" + ) + if len(frames) > 5: + print(f" ... and {len(frames) - 5} more frames") + + +def demo_keyframe_detection(): + """Show keyframe extraction based on scene changes.""" + print("\n" + "=" * 70) + print("3. KEYFRAME DETECTION (scene change)") + print("=" * 70) + + keyframes = extract_keyframes(LECTURE_VIDEO, threshold=30.0) + print(f"\nDetected {len(keyframes)} keyframes (threshold=30.0)") + for kf in keyframes[:10]: + print( + f" Frame {kf['frame_index']:>6d} | " + f"t={kf['timestamp_sec']:>6.2f}s | " + f"change={kf['change_score']:>6.2f} | " + f"{kf['width']}x{kf['height']}" + ) + if len(keyframes) > 10: + print(f" ... and {len(keyframes) - 10} more keyframes") + + +def demo_frame_descriptions(): + """Show how to build text from keyframes for RAG chunking.""" + print("\n" + "=" * 70) + print("4. KEYFRAME DESCRIPTIONS FOR RAG") + print("=" * 70) + + keyframes = extract_keyframes(LECTURE_VIDEO, threshold=30.0) + descriptions = build_frame_descriptions(keyframes) + print(f"\nGenerated descriptions for {len(keyframes)} keyframes:") + print(descriptions) + + # Chunk the descriptions for embedding + print("\n--- Chunking Frame Descriptions ---") + chunks = chunk_by_characters(descriptions, chunk_size=300, overlap=30) + preview_chunks(chunks, max_preview=3, max_chars=200) + + +def demo_threshold_comparison(): + """Compare different keyframe detection thresholds.""" + print("\n" + "=" * 70) + print("5. THRESHOLD COMPARISON") + print("=" * 70) + + print(f"\n {'Threshold':>10s} {'Keyframes':>10s} Notes") + print(f" {'-'*10} {'-'*10} {'-'*30}") + + for threshold in [10.0, 20.0, 30.0, 50.0, 80.0]: + keyframes = extract_keyframes(LECTURE_VIDEO, threshold=threshold) + note = "" + if threshold <= 10: + note = "(very sensitive, many frames)" + elif threshold <= 30: + note = "(balanced)" + else: + note = "(less sensitive, fewer frames)" + print(f" {threshold:>10.1f} {len(keyframes):>10d} {note}") + + +if __name__ == "__main__": + if not check_opencv_available(): + print("Skipping frame extraction demos (OpenCV not available).") + sys.exit(0) + + if not LECTURE_VIDEO.exists(): + print(f"ERROR: {LECTURE_VIDEO} not found.") + print("Run generate_samples.py first:") + print( + " uv run python unstructured_documents/10_video/sample_docs/generate_samples.py" + ) + sys.exit(1) + + demo_metadata() + demo_interval_extraction() + demo_keyframe_detection() + demo_frame_descriptions() + demo_threshold_comparison() + + print("\n" + "=" * 70) + print("Done. OpenCV provides fast, local frame extraction for video RAG.") + print("Combine with Whisper transcripts (01_whisper_transcription.py)") + print("for comprehensive video understanding.") + print("=" * 70) diff --git a/unstructured_documents/10_video/__init__.py b/unstructured_documents/10_video/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/unstructured_documents/10_video/sample_docs/generate_samples.py b/unstructured_documents/10_video/sample_docs/generate_samples.py new file mode 100644 index 0000000..0468e3c --- /dev/null +++ b/unstructured_documents/10_video/sample_docs/generate_samples.py @@ -0,0 +1,214 @@ +""" +Generate sample video files for testing video extraction methods. + +Creates short videos with text overlays and spoken-word style content +that can be used to demonstrate transcription and frame extraction. + +Requirements: + - Python: opencv-python-headless (or opencv-python) + +Usage: + uv run python unstructured_documents/10_video/sample_docs/generate_samples.py +""" + +import sys +from pathlib import Path + +SAMPLE_DIR = Path(__file__).resolve().parent + + +def check_opencv_available() -> bool: + """Check if OpenCV is available.""" + try: + import cv2 # noqa: F401 + + return True + except ImportError: + print("=" * 60) + print("opencv-python-headless is NOT installed.") + print("=" * 60) + print() + print("Install it with:") + print(" uv sync --extra video") + print(" # or: uv pip install opencv-python-headless") + print() + return False + + +def generate_lecture_video(): + """ + Generate a sample 'lecture' video with text slides. + + Creates a short video (10 seconds, 1 fps) with text frames simulating + a lecture about machine learning. Each frame displays different content, + providing material for both frame extraction and (simulated) transcription. + """ + import cv2 + import numpy as np + + output_path = SAMPLE_DIR / "lecture.mp4" + + # Video settings + width, height = 640, 480 + fps = 1 + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + writer = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height)) + + slides = [ + { + "title": "Introduction to Machine Learning", + "body": "Machine learning is a subset of AI that enables\n" + "systems to learn from data without being\n" + "explicitly programmed.", + }, + { + "title": "Supervised Learning", + "body": "The model learns from labeled training data.\n" + "Common algorithms: Linear Regression,\n" + "Decision Trees, Neural Networks.", + }, + { + "title": "Unsupervised Learning", + "body": "The model finds patterns in unlabeled data.\n" + "Common algorithms: K-Means Clustering,\n" + "PCA, Autoencoders.", + }, + { + "title": "Deep Learning", + "body": "Neural networks with many layers can learn\n" + "complex representations. Used in image\n" + "recognition and NLP.", + }, + { + "title": "Training Process", + "body": "1. Prepare data\n" + "2. Choose architecture\n" + "3. Train with backpropagation\n" + "4. Evaluate on test set", + }, + { + "title": "Applications", + "body": "Image recognition, natural language processing,\n" + "recommendation systems, autonomous vehicles,\n" + "medical diagnosis.", + }, + { + "title": "RAG Systems", + "body": "Retrieval-Augmented Generation combines\n" + "document retrieval with language models\n" + "for accurate, grounded answers.", + }, + { + "title": "Document Parsing for RAG", + "body": "Extract text from PDFs, DOCX, HTML, images,\n" + "and video. Chunk the text and embed it\n" + "for vector search.", + }, + { + "title": "Video in RAG Pipelines", + "body": "Transcribe audio with Whisper.\n" + "Extract keyframes for visual context.\n" + "Chunk transcript for retrieval.", + }, + { + "title": "Summary", + "body": "Machine learning enables intelligent systems.\n" + "RAG makes LLMs more accurate.\n" + "Video is a rich source for RAG.", + }, + ] + + for slide in slides: + # Create a dark blue background + frame = np.zeros((height, width, 3), dtype=np.uint8) + frame[:] = (80, 50, 20) # Dark blue-gray (BGR) + + # Draw title + cv2.putText( + frame, + slide["title"], + (30, 60), + cv2.FONT_HERSHEY_SIMPLEX, + 0.8, + (255, 255, 255), + 2, + ) + + # Draw separator line + cv2.line(frame, (30, 80), (width - 30, 80), (200, 200, 200), 1) + + # Draw body text (handle newlines) + y_offset = 130 + for line in slide["body"].split("\n"): + cv2.putText( + frame, + line.strip(), + (40, y_offset), + cv2.FONT_HERSHEY_SIMPLEX, + 0.55, + (220, 220, 220), + 1, + ) + y_offset += 35 + + writer.write(frame) + + writer.release() + print(f" Created: {output_path.name} ({len(slides)} frames, {width}x{height})") + return output_path + + +def generate_short_clip(): + """ + Generate a very short video clip (3 seconds) for quick testing. + + Creates a simple video with colored frames and text, useful for + verifying that extraction pipelines work before running on longer videos. + """ + import cv2 + import numpy as np + + output_path = SAMPLE_DIR / "short_clip.mp4" + + width, height = 320, 240 + fps = 1 + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + writer = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height)) + + colors_and_text = [ + ((40, 40, 120), "Frame 1: Hello World"), + ((40, 120, 40), "Frame 2: RAG Pipeline"), + ((120, 40, 40), "Frame 3: Video Parsing"), + ] + + for color, text in colors_and_text: + frame = np.zeros((height, width, 3), dtype=np.uint8) + frame[:] = color + cv2.putText( + frame, + text, + (15, height // 2), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + ) + writer.write(frame) + + writer.release() + print(f" Created: {output_path.name} (3 frames, {width}x{height})") + return output_path + + +if __name__ == "__main__": + if not check_opencv_available(): + sys.exit(1) + + print("Generating sample video files...") + print() + + generate_lecture_video() + generate_short_clip() + + print() + print("Done. Sample videos are ready for extraction demos.") diff --git a/unstructured_documents/10_video/sample_docs/lecture.mp4 b/unstructured_documents/10_video/sample_docs/lecture.mp4 new file mode 100644 index 0000000..54ce50e Binary files /dev/null and b/unstructured_documents/10_video/sample_docs/lecture.mp4 differ diff --git a/unstructured_documents/10_video/sample_docs/short_clip.mp4 b/unstructured_documents/10_video/sample_docs/short_clip.mp4 new file mode 100644 index 0000000..2bad79a Binary files /dev/null and b/unstructured_documents/10_video/sample_docs/short_clip.mp4 differ diff --git a/unstructured_documents/README.md b/unstructured_documents/README.md index 2a2a6c4..cbc5025 100644 --- a/unstructured_documents/README.md +++ b/unstructured_documents/README.md @@ -18,6 +18,7 @@ uv run python unstructured_documents/06_images_ocr/sample_docs/generate_samples. uv run python unstructured_documents/07_email/sample_docs/generate_samples.py uv run python unstructured_documents/08_markdown_txt/sample_docs/generate_samples.py uv run python unstructured_documents/09_epub/sample_docs/generate_samples.py +uv run python unstructured_documents/10_video/sample_docs/generate_samples.py # Run any extraction script uv run python unstructured_documents/01_pdf/01_pypdf_extraction.py @@ -36,6 +37,7 @@ uv run python unstructured_documents/01_pdf/01_pypdf_extraction.py | 7 | **Email (EML)** | [07_email](07_email/) | 2 | 2 | email (stdlib) | | 8 | **Markdown / Text** | [08_markdown_txt](08_markdown_txt/) | 3 | 3 | mistune, csv (stdlib) | | 9 | **EPUB (Ebooks)** | [09_epub](09_epub/) | 2 | 2 | ebooklib, BeautifulSoup | +| 10 | **Video** | [10_video](10_video/) | 2 | 2 | openai-whisper, OpenCV | ## Decision Matrix: Which Parser for Which Situation? @@ -78,6 +80,7 @@ uv run python unstructured_documents/01_pdf/01_pypdf_extraction.py | Markdown | mistune AST parsing + heading-aware chunking | | Plain text | Recursive character splitting | | EPUB | ebooklib + BeautifulSoup | +| Video | Whisper transcription + OpenCV frame extraction | ## Shared Utilities @@ -130,7 +133,8 @@ unstructured_documents/ ├── 06_images_ocr/ # OCR for images (2 methods) ├── 07_email/ # Email parsing (2 methods) ├── 08_markdown_txt/ # Markdown/text parsing (3 methods) -└── 09_epub/ # EPUB ebook parsing (2 methods) +├── 09_epub/ # EPUB ebook parsing (2 methods) +└── 10_video/ # Video parsing (2 methods) Each folder contains: ├── README.md # Comprehensive guide for that document type