Skip to content

Commit 7c6aae7

Browse files
hf-kklein, Konstantin, claude
authored
fix: strip erroneous '#kv#' and '#nv#' prefixes from AHB XML attributes during reading (#255)
* chore: bump test data submodule to 2026-03-27 data including fun like

  ```xml
  <AWF Pruefidentifikator="#kv# 55038" Beschreibung="#kv# Info-Meldung zur Aufhebung einer zuk. Zuordnung" Kommunikation_von="NB an LF" >
  ```

* fix: strip erroneous '#kv# ' prefix from AHB XML attributes during reading

  The XML authors introduced random "#kv# " prefixes to several attribute values in the test data submodule. This commit strips these prefixes at read time in the AhbReader so they never leak into the data model.

  Affected XML files and lines (50 total occurrences across 3 files):

  UTILMD_AHB_Strom_2_1_Fehlerkorrektur_20260327.xml (FV2604, 24 occurrences):
  - Pruefidentifikator: lines 7791, 8189, 8532
  - Beschreibung: lines 7792, 8190, 8533, 9803, 10117, 10993, 11856, 12770, 13678, 16282, 16572, 19584, 20069, 20593, 21154, 21760
  - Bedingung text: lines 41302, 41369, 41413, 41705, 41724

  UTILMD_AHB_Strom_2_1_Fehlerkorrektur_20260327.xml (FV2510, 24 occurrences):
  - Pruefidentifikator: lines 7791, 8189, 8532
  - Beschreibung: lines 7792, 8190, 8533, 9803, 10117, 10993, 11856, 12770, 13678, 16282, 16572, 19584, 20069, 20593, 21154, 21760
  - Bedingung text: lines 41302, 41369, 41413, 41705, 41724

  UTILMD_AHB_Strom_2_1_Fehlerkorrektur_20250623.xml (FV2504, 2 occurrences):
  - Bedingung text: lines 131688, 131694

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: apply black formatting and extend kv prefix stripping to ub_bedingungen

  Apply remove_kv_prefix defensively to _to_ub_bedingung as well, and extend the integration test to also check ub_bedingungen for leaking prefixes. Fix black formatting in test file.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: generalize hashtag prefix stripping to handle #nv# in addition to #kv#

  Rename remove_kv_prefix to remove_hashtag_prefix and use a regex (^#\w+# ) to strip any leading '#xx# ' prefix, not just '#kv# '.

  Additional '#nv# ' occurrences found in Beschreibung attributes:

  UTILMD_AHB_Gas_1_0a_außerordentliche_20240726.xml (FV2504, 2 occurrences):
  - Beschreibung: line 22762: `Beschreibung="#nv# Anfrage an MSB mit Abhängig-keiten "`
  - Beschreibung: line 43668: `Beschreibung="#nv# Nicht bila.rel. Änderung vom LF "`

  UTILMD_AHB_Gas_1_0a_außerordentliche_20240726.xml (FV2410, 2 occurrences):
  - Beschreibung: line 22762: `Beschreibung="#nv# Anfrage an MSB mit Abhängig-keiten "`
  - Beschreibung: line 43668: `Beschreibung="#nv# Nicht bila.rel. Änderung vom LF "`

  UTILMD_AHB_Gas_1_0a_außerordentliche_20240726.xml (FV2510, 2 occurrences):
  - Beschreibung: line 22762: `Beschreibung="#nv# Anfrage an MSB mit Abhängig-keiten "`
  - Beschreibung: line 43668: `Beschreibung="#nv# Nicht bila.rel. Änderung vom LF "`

  UTILMD_AHB_Gas_1_1_Fehlerkorrektur_20260327.xml (FV2604, 2 occurrences):
  - Beschreibung: line 22854: `Beschreibung="#nv# Anfrage an MSB mit Abhängig-keiten "`
  - Beschreibung: line 43760: `Beschreibung="#nv# Nicht bila.rel. Änderung vom LF "`

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* docs: clarify that remove_hashtag_prefix does not strip ##alt## markers

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* test: add cases ensuring ##alt## and ##veraltet## markers are not stripped

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Konstantin <konstantin.klein+github@hochfrequenz.de>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 23090ef commit 7c6aae7

5 files changed

Lines changed: 84 additions & 12 deletions

File tree

src/fundamend/reader/ahbreader.py

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,13 @@
3131
_is_segment_group,
3232
_is_uebertragungsdatei,
3333
)
34-
from fundamend.utils import lstrip, remove_linebreaks_and_hyphens, remove_unnecessary_hyphens, strip
34+
from fundamend.utils import (
35+
lstrip,
36+
remove_hashtag_prefix,
37+
remove_linebreaks_and_hyphens,
38+
remove_unnecessary_hyphens,
39+
strip,
40+
)
3541

3642
# pylint:disable=duplicate-code
3743
# yes, it's very similar to the MigReader
@@ -53,14 +59,14 @@ def _to_code(element: ET.Element) -> Code:
5359
def _to_bedingung(element: ET.Element) -> Bedingung:
5460
return Bedingung(
5561
nummer=strip("[", element.attrib["Nummer"], "]"),
56-
text=(element.text or "").strip(),
62+
text=remove_hashtag_prefix((element.text or "").strip()),
5763
)
5864

5965

6066
def _to_ub_bedingung(element: ET.Element) -> UbBedingung:
6167
return UbBedingung(
6268
nummer=strip("[", element.attrib["Nummer"], "]"),
63-
text=(element.text or "").strip(),
69+
text=remove_hashtag_prefix((element.text or "").strip()),
6470
)
6571

6672

@@ -208,10 +214,10 @@ def get_anwendungsfall(self, pruefidentifikator: str) -> Anwendungsfall | None:
208214
for element in self._element_tree.getroot():
209215
if element.tag != "AWF":
210216
continue
211-
if element.attrib["Pruefidentifikator"] != pruefidentifikator:
217+
raw_pruefi = remove_hashtag_prefix(element.attrib["Pruefidentifikator"]).strip()
218+
if raw_pruefi != pruefidentifikator:
212219
continue
213-
if element.tag == "AWF" and element.attrib["Pruefidentifikator"] == pruefidentifikator:
214-
return self._read_anwendungsfall(element)
220+
return self._read_anwendungsfall(element)
215221
return None
216222

217223
def get_anwendungsfaelle(self) -> list[Anwendungsfall]:
@@ -251,9 +257,9 @@ def _read_anwendungsfall(self, original_element: ET.Element) -> Anwendungsfall:
251257
if not format_element.tag.startswith("M_"):
252258
format_element = next((child for child in original_element[0] if child.tag.startswith("M_")))
253259
return Anwendungsfall(
254-
pruefidentifikator=original_element.attrib["Pruefidentifikator"],
260+
pruefidentifikator=remove_hashtag_prefix(original_element.attrib["Pruefidentifikator"]).strip(),
255261
beschreibung=remove_unnecessary_hyphens(
256-
remove_linebreaks_and_hyphens(original_element.attrib["Beschreibung"])
262+
remove_linebreaks_and_hyphens(remove_hashtag_prefix(original_element.attrib["Beschreibung"]))
257263
),
258264
kommunikation_von=original_element.attrib["Kommunikation_von"].strip(),
259265
format=EdifactFormat(lstrip("M_", format_element.tag)),

src/fundamend/utils.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -162,11 +162,25 @@ def remove_unnecessary_hyphens(candidate: Optional[str]) -> Optional[str]:
162162
return _unnecessary_hyphen_pattern.sub("", candidate)
163163

164164

165+
_HASHTAG_PREFIX_PATTERN = re.compile(r"^#\w+# ")
166+
"""matches leading prefixes like '#kv# ', '#nv# ' etc. that XML authors erroneously add to attribute values"""
167+
168+
169+
def remove_hashtag_prefix(text: str) -> str:
170+
"""
171+
Removes leading '#xx# ' prefixes (e.g. '#kv# ', '#nv# ') that some XML authors erroneously add to attribute values.
172+
The pattern (^#\\w+# ) intentionally does NOT match double-hash markers like '##alt##' or '##veraltet##'
173+
because those are used by Anwendungsfall.is_outdated to flag outdated entries.
174+
"""
175+
return _HASHTAG_PREFIX_PATTERN.sub("", text)
176+
177+
165178
__all__ = [
166179
"lstrip",
167180
"rstrip",
168181
"strip",
169182
"parse_kommunikation_von",
183+
"remove_hashtag_prefix",
170184
"remove_linebreaks_and_hyphens",
171185
"remove_unnecessary_hyphens",
172186
]

unittests/__snapshots__/test_ahbreader.ambr

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,6 @@
1111
# name: test_sanitizing_all_awf_beschreibungen
1212
list([
1313
'',
14-
'#nv# Anfrage an MSB mit Abhängigkeiten',
15-
'#nv# Nicht bila.rel. Änderung vom LF',
1614
'(Ankündigung) Zuordnung des LF zur erz. MaLo/ Tranche',
1715
'Ab-/Bestellung BK-SZR auf Aggregationsebene RZ',
1816
'Abbestellung von Werten',
@@ -274,6 +272,9 @@
274272
'Gerätestatus',
275273
'Grundlage POG-Ermittlung',
276274
'Info Entsperrauftrag',
275+
'Info-Meldung zur Aufhebung einer zuk. Zuordnung',
276+
'Info-Meldung zur Beendigung der Zuordnung',
277+
'Info-Meldung über existierende Zuordnung',
277278
'Informationsmeldung',
278279
'Informationsmeldung zur Aufhebung einer zuk. Zuordnung',
279280
'Informationsmeldung zur Beendigung der Zuordnung',

unittests/test_utils.py

Lines changed: 52 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import re
12
from pathlib import Path
23
from typing import Generator
34

@@ -6,7 +7,7 @@
67
from fundamend import AhbReader
78
from fundamend.models.anwendungshandbuch import Anwendungsfall
89
from fundamend.models.kommunikationsrichtung import Kommunikationsrichtung
9-
from fundamend.utils import parse_kommunikation_von, remove_linebreaks_and_hyphens
10+
from fundamend.utils import parse_kommunikation_von, remove_hashtag_prefix, remove_linebreaks_and_hyphens
1011

1112
from .conftest import is_private_submodule_checked_out
1213

@@ -128,6 +129,56 @@ def test_parsing_all_kommunikation_von_there_is() -> None:
128129
_ = parse_kommunikation_von(kommunikation_von) # must not crash
129130

130131

132+
_hashtag_prefix_pattern = re.compile(r"^#\w+# ")
133+
134+
135+
@pytest.mark.parametrize(
136+
"original, expected",
137+
[
138+
pytest.param("hello", "hello", id="no prefix"),
139+
pytest.param("#kv# 55038", "55038", id="pruefidentifikator with #kv# prefix"),
140+
pytest.param(
141+
"#kv# Info-Meldung zur Aufhebung", "Info-Meldung zur Aufhebung", id="beschreibung with #kv# prefix"
142+
),
143+
pytest.param("#kv# Wenn die Messlokation", "Wenn die Messlokation", id="bedingung text with #kv# prefix"),
144+
pytest.param("#nv# Anfrage an MSB mit Abhängigkeiten", "Anfrage an MSB mit Abhängigkeiten", id="#nv# prefix"),
145+
pytest.param("#nv# Nicht bila.rel. Änderung vom LF", "Nicht bila.rel. Änderung vom LF", id="#nv# prefix 2"),
146+
pytest.param("#kv#no space", "#kv#no space", id="hashtag prefix without trailing space is not stripped"),
147+
pytest.param("##alt##", "##alt##", id="##alt## marker must not be stripped (used by is_outdated)"),
148+
pytest.param("##alt## 55001", "##alt## 55001", id="##alt## with pruefidentifikator must not be stripped"),
149+
pytest.param("##veraltet##", "##veraltet##", id="##veraltet## marker must not be stripped"),
150+
pytest.param("", "", id="empty string"),
151+
],
152+
)
153+
def test_remove_hashtag_prefix(original: str, expected: str) -> None:
154+
actual = remove_hashtag_prefix(original)
155+
assert actual == expected
156+
157+
158+
def test_no_hashtag_prefix_leaks_through_ahb_reader() -> None:
159+
"""Ensures that '#xx# ' prefixes introduced by XML authors are stripped during reading."""
160+
if not is_private_submodule_checked_out():
161+
pytest.skip("Skipping test because of missing private submodule")
162+
private_submodule_root = Path(__file__).parent.parent / "xml-migs-and-ahbs"
163+
for ahb_file_path in private_submodule_root.rglob("**/*AHB*.xml"):
164+
ahb = AhbReader(ahb_file_path).read()
165+
for awf in ahb.anwendungsfaelle:
166+
assert not _hashtag_prefix_pattern.match(
167+
awf.pruefidentifikator
168+
), f"pruefidentifikator '{awf.pruefidentifikator}' in {ahb_file_path} still has a hashtag prefix"
169+
assert not _hashtag_prefix_pattern.match(
170+
awf.beschreibung
171+
), f"beschreibung '{awf.beschreibung}' in {ahb_file_path} still has a hashtag prefix"
172+
for bedingung in ahb.bedingungen:
173+
assert not _hashtag_prefix_pattern.match(
174+
bedingung.text
175+
), f"bedingung '{bedingung.nummer}' text in {ahb_file_path} still has a hashtag prefix"
176+
for ub_bedingung in ahb.ub_bedingungen:
177+
assert not _hashtag_prefix_pattern.match(
178+
ub_bedingung.text
179+
), f"ub_bedingung '{ub_bedingung.nummer}' text in {ahb_file_path} still has a hashtag prefix"
180+
181+
131182
@pytest.mark.parametrize(
132183
"original, expected",
133184
[

xml-migs-and-ahbs

0 commit comments

Comments
 (0)