Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 31 additions & 2 deletions src/invoice2data/extract/invoice_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,27 @@ def parse_date(self, value: str) -> Any:
logger.debug("result of date parsing=%s", res)
return res

def extract_number_from_text(self, value: str) -> str:
    """Extract the first numeric value from a string that may contain text.

    Finds the first number in strings such as "4 Stück", "12123 Stk." or
    "€25.50" and returns it unchanged (separators are kept as written).

    Two number shapes are recognised, each with an optional sign and an
    optional decimal part introduced by "." or ",":

    * digits grouped in thousands by ".", ",", whitespace or "'"
      (e.g. "1.234,56", "1,234.56", "1 234");
    * a plain run of digits of any length (e.g. "12123", "1234.56").

    Args:
        value (str): The string containing numbers and possibly text.

    Returns:
        str: The extracted numeric value as a string, or an empty string
        if no number is found.
    """
    if not value:
        return ""

    # The grouped form is tried first so "1.234,56" is kept whole. The
    # ``(?!\d)`` guard rejects a "group" that is really the start of a
    # longer digit run (e.g. "1.2345"), which then falls through to the
    # plain ``\d+`` alternative. Without that alternative, numbers with
    # more than three ungrouped digits ("12123") were cut to "121".
    pattern = (
        r"[-+]?"
        r"(?:\d{1,3}(?:[.,\s']\d{3})+(?!\d)|\d+)"
        r"(?:[.,]\d+)?"
    )

    match = re.search(pattern, value)
    return match.group() if match else ""

def coerce_type(self, value: str, target_type: str) -> Any:
"""Coerces a value to the specified target type.

Expand All @@ -221,11 +242,19 @@ def coerce_type(self, value: str, target_type: str) -> Any:
if target_type == "int":
if not value:
return 0
return int(self.parse_number(value))
# Extract numeric value from text that might contain units like "4 Stück"
numeric_value = self.extract_number_from_text(value)
if not numeric_value:
return 0
return int(self.parse_number(numeric_value))
elif target_type == "float":
if not value:
return 0.0
return float(self.parse_number(value))
# Extract numeric value from text that might contain units like "12123 Stk."
numeric_value = self.extract_number_from_text(value)
if not numeric_value:
return 0.0
return float(self.parse_number(numeric_value))
elif target_type == "date":
return self.parse_date(value)
elif target_type == "datetime":
Expand Down
15 changes: 15 additions & 0 deletions src/invoice2data/extract/parsers/lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,21 @@ def parse_block( # noqa: RUF100 C901
# Skip lines that are empty or contain only line-break characters
if not line.strip("").strip("\n").strip("\r") or not line:
continue

# Check if this is a line that should be skipped (available for all lines)
if "skip_line" in settings:
# If skip_line was provided, check for a match now
if isinstance(settings["skip_line"], list):
# Accepts a list
skip_line_results = [re.search(x, line) for x in settings["skip_line"]]
else:
# Or a simple string
skip_line_results = [re.search(settings["skip_line"], line)]
if any(skip_line_results):
# There was at least one match to a skip_line
logger.debug("skip_line match on \ns*%s*", line)
continue

if "first_line" in settings:
# Check if the current line matches the first_line pattern
match = parse_line(settings["first_line"], line)
Expand Down