# Method added to src/invoice2data/extract/invoice_template.py by this patch;
# indent to class level when placing it next to `parse_date` / `coerce_type`.
def extract_number_from_text(self, value: str) -> str:
    """Extract the first numeric value from a string that may contain text.

    Handles inputs such as "4 Stück", "12123 Stk." or "€25.50" and returns
    only the numeric part, leaving separators untouched so the caller can
    still interpret them (e.g. via ``parse_number``).

    Two number shapes are recognised, tried in this order:

    * grouped thousands: 1-3 digits followed by one or more groups of a
      separator (``.``, ``,``, whitespace or ``'``) and exactly three
      digits, e.g. ``1.234`` / ``1,234,567`` / ``1 234``;
    * a plain digit run of any length, e.g. ``12123``.

    Either shape may carry a leading sign and a trailing decimal part
    introduced by ``.`` or ``,``.

    Args:
        value (str): The string containing numbers and possibly text.

    Returns:
        str: The extracted numeric value as a string, or an empty string
        if the input is empty or contains no number.
    """
    if not value:
        return ""

    pattern = (
        r"[-+]?"
        # Grouped form needs at least one thousands group, and must not stop
        # in the middle of a longer digit run ("1.2345" is not "1.234" + "5").
        r"(?:\d{1,3}(?:[.,\s']\d{3})+(?!\d)"
        # Otherwise take the whole digit run; capping it at three digits
        # would truncate "1234.56" to "123".
        r"|\d+)"
        r"(?:[.,]\d+)?"
    )

    match = re.search(pattern, value)
    if match:
        return match.group().strip()
    return ""
@@ -221,11 +242,19 @@ def coerce_type(self, value: str, target_type: str) -> Any: if target_type == "int": if not value: return 0 - return int(self.parse_number(value)) + # Extract numeric value from text that might contain units like "4 Stück" + numeric_value = self.extract_number_from_text(value) + if not numeric_value: + return 0 + return int(self.parse_number(numeric_value)) elif target_type == "float": if not value: return 0.0 - return float(self.parse_number(value)) + # Extract numeric value from text that might contain units like "12123 Stk." + numeric_value = self.extract_number_from_text(value) + if not numeric_value: + return 0.0 + return float(self.parse_number(numeric_value)) elif target_type == "date": return self.parse_date(value) elif target_type == "datetime": diff --git a/src/invoice2data/extract/parsers/lines.py b/src/invoice2data/extract/parsers/lines.py index d3d2798d..20cf5b5c 100644 --- a/src/invoice2data/extract/parsers/lines.py +++ b/src/invoice2data/extract/parsers/lines.py @@ -97,6 +97,21 @@ def parse_block( # noqa: RUF100 C901 # If the line has empty lines in it , skip them if not line.strip("").strip("\n").strip("\r") or not line: continue + + # Check if this is a line that should be skipped (available for all lines) + if "skip_line" in settings: + # If skip_line was provided, check for a match now + if isinstance(settings["skip_line"], list): + # Accepts a list + skip_line_results = [re.search(x, line) for x in settings["skip_line"]] + else: + # Or a simple string + skip_line_results = [re.search(settings["skip_line"], line)] + if any(skip_line_results): + # There was at least one match to a skip_line + logger.debug("skip_line match on \ns*%s*", line) + continue + if "first_line" in settings: # Check if the current lines the first_line pattern match = parse_line(settings["first_line"], line)