-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf-diff2.py
More file actions
95 lines (78 loc) · 3.91 KB
/
pdf-diff2.py
File metadata and controls
95 lines (78 loc) · 3.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import fitz # PyMuPDF
import difflib
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, in page order.

    Args:
        file_path: Path of the PDF to read; passed straight to ``fitz.open``.

    Returns:
        A list with one string per page, each being the result of
        ``page.get_text()`` for that page.
    """
    # Open the document as a context manager so the file handle is released
    # as soon as the text has been collected, even if extraction fails.
    with fitz.open(file_path) as doc:
        return [page.get_text() for page in doc]
def _classify_line_changes(text1, text2):
    """Describe how the lines of *text1* differ from the lines of *text2*.

    The two texts are split into lines and compared with ``difflib.ndiff``.
    Only ``"- "`` (present in *text1* only) and ``"+ "`` (present in *text2*
    only) entries are reported; unchanged lines and ndiff's ``"? "`` hint
    lines are ignored, as are lines that are empty after stripping.

    Args:
        text1: Text of the original page.
        text2: Text of the page it is compared against.

    Returns:
        A list of ``(text, color, note)`` tuples in diff order, where *text*
        is the stripped line, *color* is an RGB tuple (blue for a replaced
        line, green for an added line, red for a removed line) and *note* is
        the human-readable description of the change.
    """
    replaced_color = (0, 0, 1)
    added_color = (0, 1, 0)
    removed_color = (1, 0, 0)

    diff = list(difflib.ndiff(text1.splitlines(), text2.splitlines()))
    changes = []
    for index, line in enumerate(diff):
        if not line.startswith(("- ", "+ ")):
            continue

        text = line[2:].strip()
        if not text:
            # A blank line gives nothing to search for on the page.
            continue

        if line.startswith("+ "):
            changes.append((text, added_color, f"Added line: '{text}'"))
            continue

        # A removed line counts as "replaced" when the next real diff entry
        # is an addition. ndiff may put a "? " hint line between the two, so
        # step over those before looking at the follower.
        follower = index + 1
        while follower < len(diff) and diff[follower].startswith("? "):
            follower += 1

        if follower < len(diff) and diff[follower].startswith("+ "):
            new_text = diff[follower][2:].strip()
            note = f"Replaced: '{text}' with '{new_text}'"
            changes.append((text, replaced_color, note))
        else:
            changes.append((text, removed_color, f"Removed line: '{text}'"))
    return changes


def highlight_differences(pdf1_path, pdf2_path, output_path):
    """Annotate a copy of the first PDF with its line differences to the second.

    Pages are compared pairwise by position. For every page whose text
    differs, each changed line is searched for on the page of the first PDF
    and every hit receives a coloured highlight annotation carrying a note
    that describes the change. Progress is reported with ``print``.

    Pages beyond the length of the shorter document are not compared, because
    the pages are paired with ``zip``.

    Args:
        pdf1_path: Path of the PDF that is annotated.
        pdf2_path: Path of the PDF it is compared against.
        output_path: Path the annotated copy of the first PDF is saved to.
    """
    pdf2_pages = extract_text_from_pdf(pdf2_path)

    # Open the first document once and close it reliably after saving.
    with fitz.open(pdf1_path) as doc1:
        for page_num, (page, text2) in enumerate(zip(doc1, pdf2_pages)):
            text1 = page.get_text()
            if text1 == text2:
                continue

            # Rectangles already highlighted on THIS page, keyed by their
            # coordinates. The set is per page so that a hit at the same
            # position on another page is not mistaken for a duplicate.
            highlighted = set()

            for text, color, note in _classify_line_changes(text1, text2):
                print(f"Searching for: '{text}' on page {page_num + 1}")
                hits = page.search_for(text)
                if not hits:
                    print(f"No match found for: '{text}' on page {page_num + 1}")
                    continue

                for rect in hits:
                    key = tuple(rect)
                    if key in highlighted:
                        continue
                    highlighted.add(key)

                    annot = page.add_highlight_annot(rect)
                    annot.set_colors(stroke=color)
                    annot.set_info(title="Note", content=note)
                    # Update last so colour and note are both applied.
                    annot.update()

            print(f"Differences highlighted on page {page_num + 1}.")

        # Save the annotated PDF
        doc1.save(output_path)

    print(f"Annotated PDF saved as '{output_path}'.")
# Destination of the annotated copy and the two PDFs being compared
output_path = "output_annotated.pdf"
pdf2_path = "4.pdf"
pdf1_path = "1.pdf"

# Annotate the first PDF with its differences to the second
highlight_differences(
    pdf1_path=pdf1_path,
    pdf2_path=pdf2_path,
    output_path=output_path,
)