app.py
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
import pdfplumber, re, io, json, os
from datetime import datetime

app = Flask(__name__)
CORS(app)
app.config["MAX_CONTENT_LENGTH"] = 15 * 1024 * 1024  # 15 MB

@app.route("/", methods=["GET"])
def index():
    return render_template("index.html")

# Regex patterns
HEADER_PATTERN = re.compile(
    r"^\s*Weekly\s+Terminal\s+Transactions", re.IGNORECASE | re.MULTILINE
)

ROW_PATTERN = re.compile(r"""
    ^\s*
    (?P<sn>\d+)\s+                 # S/N
    \S+\s+                         # Terminal ID
    (?P<serial>\S+)\s+             # Terminal Serial
    .*?                            # Business Name (variable width)
    (?P<payment>[\d,]+\.\d{2})\s+  # Payment Value
    \d+\s+                         # Payment Volume
    (?:[\d,]+\.\d{2}|0\.00)\s+     # Transfer Value (ignored)
    \d+\s+                         # Transfer Volume
    [\d,]+\.\d{2}\s+               # Target Payment Value (ignored)
    (?:True|False)\s+              # Target Met (ignored)
    (?P<days>\d+)\s*$              # Days Since Last Transaction
""", re.VERBOSE)

def extract_weekly_rows(pdf_bytes):
    """
    Parse the entire PDF, starting from the first "Weekly Terminal Transactions"
    header, and keep extracting rows across multiple pages until we see S/N reset
    to 1 (indicating a new table). Ignores footers and unrelated text.
    """
    weekly_rows = []
    in_weekly_section = False

    with pdfplumber.open(pdf_bytes) as pdf:
        for idx, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

            for i, ln in enumerate(lines):
                # Start parsing when we see the header
                if not in_weekly_section and HEADER_PATTERN.search(ln):
                    in_weekly_section = True
                    continue

                if in_weekly_section:
                    m = ROW_PATTERN.match(ln)
                    if m:
                        sn_val = int(m.group("sn"))
                        # If S/N resets to 1 and we already have rows, stop completely
                        if sn_val == 1 and weekly_rows:
                            return weekly_rows
                        # Add valid row
                        weekly_rows.append({
                            "S/N": sn_val,
                            "Terminal Serial": m.group("serial"),
                            "Payment Value": m.group("payment"),
                            "Page": idx + 1
                        })
    return weekly_rows
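
# For illustration only: a hypothetical report line that ROW_PATTERN is intended
# to match (the values are made up, not taken from a real statement):
#
#   "1 2070AB12 9876543210 ACME STORES LTD 125,000.00 42 0.00 0 150,000.00 False 3"
#
# would be recorded as S/N 1, Terminal Serial "9876543210", Payment Value
# "125,000.00", plus the page number it was found on.
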
@app.route("/parse", methods=["POST"])
def parse_pdf():
    file = request.files.get("pdf")
    if not file:
        return jsonify({"error": "No file uploaded. Use form field 'pdf'."}), 400
    if not file.filename.lower().endswith(".pdf"):
        return jsonify({"error": "Only PDF files are supported."}), 400

    pdf_bytes = io.BytesIO(file.read())
    try:
        rows = extract_weekly_rows(pdf_bytes)

        # Save to logs
        log_dir = os.path.join(os.path.dirname(__file__), "logs")
        os.makedirs(log_dir, exist_ok=True)
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_path = os.path.join(log_dir, f"weekly_rows_{ts}.json")

        output_json = {
            "message": "Weekly section extracted successfully.",
            "count": len(rows),
            "rows": rows
        }
        with open(log_path, "w", encoding="utf-8") as f:
            json.dump(output_json, f, ensure_ascii=False, indent=2)

        return jsonify(output_json), 200
    except Exception as e:
        return jsonify({"error": f"Failed to parse PDF: {str(e)}"}), 500

if __name__ == "__main__":
    app.run(host="127.0.0.1", port=5000, debug=True)
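
# Example request (a minimal sketch): the multipart field name "pdf" must match
# the request.files.get("pdf") lookup in parse_pdf; "report.pdf" below is a
# hypothetical file name.
#
#   curl -X POST http://127.0.0.1:5000/parse -F "pdf=@report.pdf"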