44import pytesseract
55from pdf2image import convert_from_path
66from PIL import Image
7+ from rapidfuzz import fuzz , process # For fuzzy matching
78
8-
9- def extract_text_from_pdf (file_path , pattern ):
9+ def extract_text_from_pdf (file_path , patterns ):
1010 try :
1111 pdf_file = fitz .open (file_path )
1212 trans_numbers = []
@@ -15,17 +15,45 @@ def extract_text_from_pdf(file_path, pattern):
1515 # Convert each PDF page to an image
1616 images = convert_from_path (file_path , dpi = 300 ) # High DPI for better OCR
1717
18+ # for number, image in enumerate(images):
19+ # # Convert image to text using Tesseract OCR
20+ # text = pytesseract.image_to_string(image)
21+
22+ # for pattern in patterns:
23+ # regex = rf'{re.escape(pattern)}\s+(\d+)'
24+ # matches = re.findall(regex, text)
25+
26+ # if matches:
27+ # trans_numbers.append(matches[0])
28+ # page_numbers.append(number)
29+ # break # Stop checking once a pattern matches on a page
30+
1831 for number , image in enumerate (images ):
1932 # Convert image to text using Tesseract OCR
2033 text = pytesseract .image_to_string (image )
2134
22- # Search for pattern in extracted text
23- regex = rf'{ re .escape (pattern )} \s+(\d+)'
24- matches = re .findall (regex , text )
35+ for pattern in patterns :
36+ # Use fuzzy matching to find a close match
37+ best_match , score , _ = process .extractOne (pattern , text .split ("\n " ), scorer = fuzz .partial_ratio )
38+
39+ if score > 80 : # If similarity score is high
40+ print (1 )
41+ print (2 )
42+ print (best_match )
43+ match = re .findall ('\d{3,}' , best_match ) # Match 3 or more digits
44+ print (match )
45+ if match :
46+ trans_numbers .append (match [0 ])
47+ page_numbers .append (number )
48+ break # Stop checking once a match is found
49+
50+ # # Search for pattern in extracted text
51+ # regex = rf'{re.escape(pattern)}\s+(\d+)'
52+ # matches = re.findall(regex, text)
2553
26- if matches :
27- trans_numbers .append (matches [0 ])
28- page_numbers .append (number )
54+ # if matches:
55+ # trans_numbers.append(matches[0])
56+ # page_numbers.append(number)
2957
3058 if page_numbers and trans_numbers :
3159 saved_files = []
@@ -40,7 +68,7 @@ def extract_text_from_pdf(file_path, pattern):
4068
4169 return {"message" : "Files saved successfully" , "files" : saved_files }
4270 else :
43- return {"error" : f"No ' { pattern } ' number found in the document" }
71+ return {"error" : f"No matches found in the document" }
4472
4573 except Exception as e :
4674 return {"error" : f"An error occurred during PDF processing: { str (e )} " }
0 commit comments