-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathrun_pdf_search.py
More file actions
executable file
·186 lines (157 loc) · 6.06 KB
/
run_pdf_search.py
File metadata and controls
executable file
·186 lines (157 loc) · 6.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python3
"""
Entry point script for the PDF Search Plus application.
This script provides a command-line interface to the PDF Search Plus application,
allowing users to extract text from PDF files and search through them.
"""
import argparse
import sys
import os
import logging
import sqlite3
from pathlib import Path
from pdf_search_plus.main import main
from pdf_search_plus.utils.db import PDFDatabase, PDFMetadata
from pdf_search_plus.core import PDFProcessor
from pdf_search_plus.core.ocr import TesseractOCRProcessor
# Console logging: attach an INFO-level stream handler to the root logger so
# that records logged through the root logger reach the console.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)

# Get the root logger and add the console handler
root_logger = logging.getLogger()
root_logger.addHandler(console_handler)

# A logger discards records below its own level before any handler sees them,
# and the root logger starts out at WARNING. Lower it to INFO so the handler
# above actually receives the INFO records this script emits. The level is
# only ever lowered here, never raised, so a more verbose level configured
# elsewhere is left alone.
if root_logger.level > logging.INFO:
    root_logger.setLevel(logging.INFO)
def setup_database():
    """Return a ``PDFDatabase`` whose backing file exists and has the ``pdf_files`` table.

    Three situations are handled:

    * ``pdf_data.db`` does not exist: the database is created.
    * It exists and contains a ``pdf_files`` table: it is used as-is.
    * It exists but the table is missing, or querying it raises
      ``sqlite3.Error``: the file is deleted and the database is recreated.

    Returns:
        The ``PDFDatabase`` instance.

    Raises:
        OSError: If the stale database file cannot be removed.
    """
    # NOTE(review): assumes PDFDatabase() stores its data in this same
    # relative path; confirm against PDFDatabase's default.
    db_path = 'pdf_data.db'
    db = PDFDatabase()

    if not os.path.exists(db_path):
        # Database doesn't exist, create it
        db.create_database()
        logging.info("Database created successfully")
        return db

    needs_rebuild = False
    try:
        # Test if the database has the required tables
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='pdf_files'")
            if cursor.fetchone() is None:
                logging.warning("Database exists but is missing required tables. Recreating database.")
                needs_rebuild = True
    except sqlite3.Error as e:
        # Database exists but is corrupted or has other issues
        logging.error(f"Database error: {e}. Recreating database.")
        needs_rebuild = True

    if not needs_rebuild:
        logging.info("Using existing database")
        return db

    # Only delete the file after the ``with`` block has exited, so it is not
    # removed while the probing connection is still in use (removing an open
    # file fails on Windows). This presumes get_connection() releases the
    # connection when the block exits.
    os.remove(db_path)
    db.create_database()
    logging.info("Database recreated successfully")
    return db
def process_file(file_path, db):
    """Process a single PDF file and store its contents via *db*.

    Args:
        file_path: Location of the PDF, as a ``str`` or ``os.PathLike``.
        db: Database object handed to ``PDFProcessor``.

    Returns:
        True if processing completed. False if the path is not an existing
        regular file, does not end in ``.pdf`` (case-insensitive), or
        processing raised an exception. Failures are logged, not raised.
    """
    # Normalise to str so pathlib.Path arguments work with the suffix check.
    file_path = os.fspath(file_path)

    # isfile() rather than exists(): a directory that happens to be named
    # "something.pdf" is not something the processor can open.
    if not os.path.isfile(file_path):
        logging.error(f"File not found: {file_path}")
        return False

    if not file_path.lower().endswith('.pdf'):
        logging.error(f"Not a PDF file: {file_path}")
        return False

    try:
        ocr_processor = TesseractOCRProcessor()
        pdf_processor = PDFProcessor(ocr_processor, db)

        # The metadata name is the file name without directory or extension.
        file_name = Path(file_path).stem
        metadata = PDFMetadata(file_name=file_name, file_path=file_path)
        pdf_processor.process_pdf(metadata)

        logging.info(f"Successfully processed PDF: {file_path}")
        return True
    except Exception as e:
        # Top-level CLI boundary: report the failure instead of crashing.
        logging.error(f"Error processing PDF {file_path}: {e}")
        return False
def process_folder(folder_path, db, max_workers=5):
    """Run the PDF processor over *folder_path*.

    Args:
        folder_path: Directory to process.
        db: Database object handed to ``PDFProcessor``.
        max_workers: Forwarded to ``PDFProcessor.process_folder``.

    Returns:
        True if the folder was processed without an exception; False if
        *folder_path* is not a directory or processing raised. Failures are
        logged, not raised.
    """
    if os.path.isdir(folder_path):
        try:
            batch_runner = PDFProcessor(TesseractOCRProcessor(), db)
            batch_runner.process_folder(folder_path, max_workers=max_workers)
            logging.info(f"Successfully processed folder: {folder_path}")
        except Exception as exc:
            logging.error(f"Error processing folder {folder_path}: {exc}")
        else:
            return True
    else:
        logging.error(f"Folder not found: {folder_path}")

    # Reached for both "not a directory" and "processing failed".
    return False
def search_database(search_term, db):
    """Search *db* for *search_term* and print the matches to stdout.

    Args:
        search_term: Text to look for.
        db: Object providing ``search_text(term, use_fts=..., limit=...,
            offset=...)``. Each returned row is unpacked as
            ``(pdf_id, file_name, page_number, text, source)``.

    Returns:
        None. At most 100 rows are requested. Each match is printed as a
        numbered header line followed by a text preview of at most 100
        characters. Any exception is logged rather than raised.
    """
    max_results = 100
    preview_chars = 100

    try:
        # Execute the search
        results = db.search_text(search_term, use_fts=True, limit=max_results, offset=0)

        if not results:
            print(f"No results found for '{search_term}'")
            return

        print(f"Found {len(results)} results for '{search_term}':")
        for i, (_pdf_id, file_name, page_number, text, source) in enumerate(results, 1):
            # A row without text (None) must not abort the whole listing:
            # len(None) would raise and the handler below would stop the loop.
            text = "" if text is None else text

            # Truncate text if too long
            if len(text) > preview_chars:
                text = text[:preview_chars] + "..."

            print(f"{i}. {file_name} (Page {page_number}) - {source}")
            print(f" {text}")
            print()
    except Exception as e:
        # Top-level CLI boundary: report the failure instead of crashing.
        logging.error(f"Error searching database: {e}")
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``verbose``,
        ``process_file``, ``process_folder``, ``search`` and ``max_workers``.
    """
    cli = argparse.ArgumentParser(
        description="PDF Search Plus - PDF text extraction and search with OCR",
        epilog="Example: python run_pdf_search.py --verbose",
    )

    # (flags, add_argument keyword arguments), registered in this order so
    # the generated --help output keeps the same option ordering.
    option_table = (
        (("--verbose", "-v"), {
            "action": "store_true",
            "help": "Enable verbose logging",
        }),
        (("--process-file",), {
            "metavar": "FILE",
            "help": "Process a single PDF file without launching the GUI",
        }),
        (("--process-folder",), {
            "metavar": "FOLDER",
            "help": "Process all PDF files in a folder without launching the GUI",
        }),
        (("--search",), {
            "metavar": "TERM",
            "help": "Search for a term in the database without launching the GUI",
        }),
        (("--max-workers",), {
            "type": int,
            "default": 5,
            "help": "Maximum number of worker threads for batch processing (default: 5)",
        }),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
if __name__ == "__main__":
    args = parse_args()

    # Set log level based on verbose flag
    if args.verbose:
        root_logger.setLevel(logging.DEBUG)
        # The console handler filters independently of the logger, so it must
        # be lowered as well or DEBUG records never reach the console.
        console_handler.setLevel(logging.DEBUG)
        logging.info("Verbose logging enabled")

    # Set up the database
    db = setup_database()

    # Handle command-line operations. The first option given wins, in the
    # order: --process-file, --process-folder, --search, otherwise the GUI.
    succeeded = True
    if args.process_file:
        succeeded = process_file(args.process_file, db)
    elif args.process_folder:
        succeeded = process_folder(args.process_folder, db, args.max_workers)
    elif args.search:
        search_database(args.search, db)
    else:
        # Run the GUI application
        main()

    # Report a failed processing run to the calling shell instead of
    # exiting with status 0.
    if not succeeded:
        sys.exit(1)