Skip to content

Commit 1867a62

Browse files
authored
Merge pull request #6 from flycatch/dev
[ Dev ] : Extract Constants for Pattern Matching
2 parents 268e4c0 + 164decf commit 1867a62

3 files changed

Lines changed: 42 additions & 13 deletions

File tree

app/pdf_processor.py

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
import pytesseract
55
from pdf2image import convert_from_path
66
from PIL import Image
7+
from rapidfuzz import fuzz, process # For fuzzy matching
78

8-
9-
def extract_text_from_pdf(file_path, pattern):
9+
def extract_text_from_pdf(file_path, patterns):
1010
try:
1111
pdf_file = fitz.open(file_path)
1212
trans_numbers = []
@@ -15,17 +15,45 @@ def extract_text_from_pdf(file_path, pattern):
1515
# Convert each PDF page to an image
1616
images = convert_from_path(file_path, dpi=300) # High DPI for better OCR
1717

18+
# for number, image in enumerate(images):
19+
# # Convert image to text using Tesseract OCR
20+
# text = pytesseract.image_to_string(image)
21+
22+
# for pattern in patterns:
23+
# regex = rf'{re.escape(pattern)}\s+(\d+)'
24+
# matches = re.findall(regex, text)
25+
26+
# if matches:
27+
# trans_numbers.append(matches[0])
28+
# page_numbers.append(number)
29+
# break # Stop checking once a pattern matches on a page
30+
1831
for number, image in enumerate(images):
1932
# Convert image to text using Tesseract OCR
2033
text = pytesseract.image_to_string(image)
2134

22-
# Search for pattern in extracted text
23-
regex = rf'{re.escape(pattern)}\s+(\d+)'
24-
matches = re.findall(regex, text)
35+
for pattern in patterns:
36+
# Use fuzzy matching to find a close match
37+
best_match, score, _ = process.extractOne(pattern, text.split("\n"), scorer=fuzz.partial_ratio)
38+
39+
if score > 80: # If similarity score is high
40+
print(1)
41+
print(2)
42+
print(best_match)
43+
match = re.findall('\d{3,}', best_match) # Match 3 or more digits
44+
print(match)
45+
if match:
46+
trans_numbers.append(match[0])
47+
page_numbers.append(number)
48+
break # Stop checking once a match is found
49+
50+
# # Search for pattern in extracted text
51+
# regex = rf'{re.escape(pattern)}\s+(\d+)'
52+
# matches = re.findall(regex, text)
2553

26-
if matches:
27-
trans_numbers.append(matches[0])
28-
page_numbers.append(number)
54+
# if matches:
55+
# trans_numbers.append(matches[0])
56+
# page_numbers.append(number)
2957

3058
if page_numbers and trans_numbers:
3159
saved_files = []
@@ -40,7 +68,7 @@ def extract_text_from_pdf(file_path, pattern):
4068

4169
return {"message": "Files saved successfully", "files": saved_files}
4270
else:
43-
return {"error": f"No '{pattern}' number found in the document"}
71+
return {"error": f"No matches found in the document"}
4472

4573
except Exception as e:
4674
return {"error": f"An error occurred during PDF processing: {str(e)}"}

app/routes.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ def index():
2020
def upload_file():
2121
try:
2222
file = request.files["file"]
23-
pattern = request.form.get("pattern", r'Trans\s+(\d+)')
23+
patterns = request.form.getlist("patterns") or ["Outbound delivery", "ABC", "AMS Outb Delivery", "ReplDLv HC w Trans", "Cash Sale"] # Default patterns
24+
# pattern = request.form.get("pattern", r'Trans\s+(\d+)')
2425
download_format = request.form.get("download_format", "pdf")
2526

2627
if not file or file.filename == "":
@@ -33,7 +34,7 @@ def upload_file():
3334
file_path = os.path.join(UPLOAD_FOLDER, f"{timestamp}_{file.filename}")
3435
file.save(file_path)
3536

36-
result = extract_text_from_pdf(file_path, pattern)
37+
result = extract_text_from_pdf(file_path, patterns)
3738

3839
if "error" in result:
3940
return jsonify({"error": result["error"]}), 400

templates/upload_file.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,10 @@ <h2 class="text-center mb-4">PDF text Extraction</h2>
3636
<input type="file" class="form-control" name="file" multiple required>
3737
</div>
3838

39-
<div class="mb-4">
39+
<!-- <div class="mb-4">
4040
<label for="pattern" class="form-label">Search Pattern:</label>
4141
<input type="text" class="form-control" name="pattern" id="pattern" placeholder="Type your pattern" value="Trans" required>
42-
</div>
42+
</div> -->
4343

4444
<!-- dropdown for download format -->
4545
<div class="mb-4">

0 commit comments

Comments
 (0)