From d96fbc14330a595f1303eed5db3c2d1fea808df5 Mon Sep 17 00:00:00 2001
From: Adrian-adric-gmbh <adrian@adric.gmbh>
Date: Fri, 19 Sep 2025 10:18:17 +0200
Subject: [PATCH] Add number extraction method and line skipping functionality

- Implemented `extract_number_from_text` to extract numeric values from strings with text.
- Updated `coerce_type` to utilize the new extraction method for converting strings to int/float.
- Added support for skipping lines based on provided patterns in the `parse_block` function.
---
 src/invoice2data/extract/invoice_template.py | 33 ++++++++++++++++++--
 src/invoice2data/extract/parsers/lines.py    | 15 +++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/src/invoice2data/extract/invoice_template.py b/src/invoice2data/extract/invoice_template.py
index c5d729f0..7e1f1730 100644
--- a/src/invoice2data/extract/invoice_template.py
+++ b/src/invoice2data/extract/invoice_template.py
@@ -204,6 +204,27 @@ def parse_date(self, value: str) -> Any:
         logger.debug("result of date parsing=%s", res)
         return res
 
+    def extract_number_from_text(self, value: str) -> str:
+        """Extract the first numeric value from a string containing text.
+
+        Handles strings such as "4 Stück", "12123 Stk." or "€25.50". Both
+        grouped ("1.234,56") and ungrouped ("12123") digit runs are matched.
+
+        Args:
+            value (str): The string containing numbers and possibly text.
+
+        Returns:
+            str: The first number found, or an empty string if there is none.
+        """
+        # Try thousands-grouped digits first ("1.234,56", "1 234"); the
+        # lookahead rejects a partial group such as the "1.234" in "1.2345".
+        # Otherwise fall back to a plain digit run, so an ungrouped value
+        # like "12123" is no longer truncated to its first three digits.
+        pattern = r"[-+]?(?:\d{1,3}(?:[.,\s']\d{3})+(?!\d)|\d+)(?:[.,]\d+)?"
+
+        match = re.search(pattern, value)
+        return match.group().strip() if match else ""
+
     def coerce_type(self, value: str, target_type: str) -> Any:
         """Coerces a value to the specified target type.
 
@@ -221,11 +242,19 @@ def coerce_type(self, value: str, target_type: str) -> Any:
         if target_type == "int":
             if not value:
                 return 0
-            return int(self.parse_number(value))
+            # Extract numeric value from text that might contain units like "4 Stück"
+            numeric_value = self.extract_number_from_text(value)
+            if not numeric_value:
+                return 0
+            return int(self.parse_number(numeric_value))
         elif target_type == "float":
             if not value:
                 return 0.0
-            return float(self.parse_number(value))
+            # Extract numeric value from text that might contain units like "12123 Stk."
+            numeric_value = self.extract_number_from_text(value)
+            if not numeric_value:
+                return 0.0
+            return float(self.parse_number(numeric_value))
         elif target_type == "date":
             return self.parse_date(value)
         elif target_type == "datetime":
diff --git a/src/invoice2data/extract/parsers/lines.py b/src/invoice2data/extract/parsers/lines.py
index d3d2798d..20cf5b5c 100644
--- a/src/invoice2data/extract/parsers/lines.py
+++ b/src/invoice2data/extract/parsers/lines.py
@@ -97,6 +97,21 @@ def parse_block(  # noqa: RUF100 C901
         # If the line has empty lines in it , skip them
         if not line.strip("").strip("\n").strip("\r") or not line:
             continue
+
+        # Check if this is a line that should be skipped (available for all lines)
+        if "skip_line" in settings:
+            # If skip_line was provided, check for a match now
+            if isinstance(settings["skip_line"], list):
+                # Accepts a list
+                skip_line_results = [re.search(x, line) for x in settings["skip_line"]]
+            else:
+                # Or a simple string
+                skip_line_results = [re.search(settings["skip_line"], line)]
+            if any(skip_line_results):
+                # There was at least one match to a skip_line
+                logger.debug("skip_line match on \ns*%s*", line)
+                continue
+
         if "first_line" in settings:
             # Check if the current lines the first_line pattern
             match = parse_line(settings["first_line"], line)