Merge pull request #6 from flycatch/dev

jithinvv4 · web-flow · commit 1867a6249c24 · 2025-05-26T10:58:38.000+05:30
[ Dev ] : Extract Constants for Pattern Matching
diff --git a/app/pdf_processor.py b/app/pdf_processor.py
@@ -4,9 +4,9 @@
 import pytesseract
 from pdf2image import convert_from_path
 from PIL import Image
+from rapidfuzz import fuzz, process  # For fuzzy matching
 
-
-def extract_text_from_pdf(file_path, pattern):
+def extract_text_from_pdf(file_path, patterns):
     try:
         pdf_file = fitz.open(file_path)
         trans_numbers = []
@@ -15,17 +15,45 @@ def extract_text_from_pdf(file_path, pattern):
         # Convert each PDF page to an image
         images = convert_from_path(file_path, dpi=300)  # High DPI for better OCR
 
+        # for number, image in enumerate(images):
+        #     # Convert image to text using Tesseract OCR
+        #     text = pytesseract.image_to_string(image)
+
+        #     for pattern in patterns:
+        #         regex = rf'{re.escape(pattern)}\s+(\d+)'
+        #         matches = re.findall(regex, text)
+
+        #         if matches:
+        #             trans_numbers.append(matches[0])
+        #             page_numbers.append(number)
+        #             break  # Stop checking once a pattern matches on a page
+
         for number, image in enumerate(images):
             # Convert image to text using Tesseract OCR
             text = pytesseract.image_to_string(image)
 
-            # Search for pattern in extracted text
-            regex = rf'{re.escape(pattern)}\s+(\d+)'
-            matches = re.findall(regex, text)
+            for pattern in patterns:
+                # Use fuzzy matching to find a close match
+                best_match, score, _ = process.extractOne(pattern, text.split("\n"), scorer=fuzz.partial_ratio)
+
+                if score > 80:  # If similarity score is high
+                    print(1)
+                    print(2)
+                    print(best_match)
+                    match = re.findall('\d{3,}', best_match)  # Match 3 or more digits
+                    print(match)
+                    if match:
+                        trans_numbers.append(match[0])
+                        page_numbers.append(number)
+                        break  # Stop checking once a match is found
+
+            # # Search for pattern in extracted text
+            # regex = rf'{re.escape(pattern)}\s+(\d+)'
+            # matches = re.findall(regex, text)
 
-            if matches:
-                trans_numbers.append(matches[0])
-                page_numbers.append(number)
+            # if matches:
+            #     trans_numbers.append(matches[0])
+            #     page_numbers.append(number)
 
         if page_numbers and trans_numbers:
             saved_files = []
@@ -40,7 +68,7 @@ def extract_text_from_pdf(file_path, pattern):
 
             return {"message": "Files saved successfully", "files": saved_files}
         else:
-            return {"error": f"No '{pattern}' number found in the document"}
+            return {"error": f"No matches found in the document"}
 
     except Exception as e:
         return {"error": f"An error occurred during PDF processing: {str(e)}"}
diff --git a/app/routes.py b/app/routes.py
@@ -20,7 +20,8 @@ def index():
 def upload_file():
     try:
         file = request.files["file"]
-        pattern = request.form.get("pattern", r'Trans\s+(\d+)')
+        patterns = request.form.getlist("patterns") or ["Outbound delivery", "ABC", "AMS Outb Delivery", "ReplDLv HC w Trans", "Cash Sale"]  # Default patterns
+        # pattern = request.form.get("pattern", r'Trans\s+(\d+)')
         download_format = request.form.get("download_format", "pdf")
 
         if not file or file.filename == "":
@@ -33,7 +34,7 @@ def upload_file():
         file_path = os.path.join(UPLOAD_FOLDER, f"{timestamp}_{file.filename}")
         file.save(file_path)
 
-        result = extract_text_from_pdf(file_path, pattern)
+        result = extract_text_from_pdf(file_path, patterns)
 
         if "error" in result:
             return jsonify({"error": result["error"]}), 400
diff --git a/templates/upload_file.html b/templates/upload_file.html
@@ -36,10 +36,10 @@ <h2 class="text-center mb-4">PDF text Extraction</h2>
                 <input type="file" class="form-control" name="file" multiple required>
             </div>
 
-            <div class="mb-4">
+            <!-- <div class="mb-4">
                 <label for="pattern" class="form-label">Search Pattern:</label>
                 <input type="text" class="form-control" name="pattern" id="pattern" placeholder="Type your pattern" value="Trans" required>
-            </div>
+            </div> -->
 
             <!-- dropdown for download format -->
             <div class="mb-4">