Use xmltodict to read faster without validation

gitandy · gitandy · commit 66855bfc2e5c · 2024-02-27T16:06:32.000+01:00
Added test cases
diff --git a/README.md b/README.md
@@ -33,6 +33,7 @@ For ADI a user definition is a dictionary of
 The library also supports ADX import/export as compatible as possible to the ADI part. 
 Though it will differ in handling application and user definitions.
 It relies on the [ADX schemas](https://adif.org/314/ADIF_314.htm#ADX_Schemas) from adif.org.
+By default, the ADX import skips schema validation so that files can be read quickly.
 
 Installation
 ------------
@@ -80,4 +81,7 @@ Copyright
 ---------
 PyADIF-File &copy; 2024 by Andreas Schawo is licensed under [CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/) 
 
-PyADIF-File uses xmlschema Copyright (c), 2016-2022, SISSA (Scuola Internazionale Superiore di Studi Avanzati).
+PyADIF-File uses
+* xmlschema Copyright (c), 2016-2022, SISSA (Scuola Internazionale Superiore di Studi Avanzati)
+* xmltodict Copyright (c), 2012 Martin Blech and individual contributors
+
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ authors = [
 description = "Convert ADIF ADI/ADX content to dictionary and vice versa"
 readme = "README.md"
 requires-python = ">=3.9"
-dependencies = ["xmlschema"]
+dependencies = ["xmlschema", "xmltodict"]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Intended Audience :: Other Audience",
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
-xmlschema
+xmlschema~=2.5.0
+xmltodict~=0.13.0
diff --git a/src/adif_file/adx.py b/src/adif_file/adx.py
@@ -3,9 +3,11 @@
 
 import datetime
 import os.path
+import xml
 from xml.etree.ElementTree import ElementTree, ParseError
 
 import xmlschema
+import xmltodict
 
 from adif_file.__version__ import __version__ as __version_str__
 
@@ -32,40 +34,51 @@ class XmlSyntaxError(SyntaxError):
     pass
 
 
-def load(file_name: str) -> dict:
+def loads(adx_data: str, validate: bool = False) -> dict:
+    """Load ADX content to dictionary
+       By default the ADX is not validated against the standard (see validate)
+
+       :param adx_data: the ADX content
+       :param validate: validate the ADX against the generic XSD (very slow)
+       :return: the ADX as a dict
+       """
+
+    if validate:
+        try:
+            ADX_IMPORT_SCHEMA.validate(adx_data)
+        except ParseError as exc:
+            raise XmlSyntaxError(str(exc)) from None
+        except xmlschema.validators.exceptions.XMLSchemaChildrenValidationError as exc:
+            raise UndefinedElementException(f'in {exc.elem.tag}') from None
+        except xmlschema.validators.exceptions.XMLSchemaValidationError as exc:
+            raise MalformedValueException(f'Field "{exc.elem.tag}": {exc.reason}') from None
+
+    try:
+        data_dict = xmltodict.parse(adx_data, cdata_key='$')
+        data_dict = data_dict['ADX']
+        if ('RECORDS' in data_dict and data_dict['RECORDS'] and
+                'RECORD' in data_dict['RECORDS'] and data_dict['RECORDS']['RECORD']):
+            data_dict['RECORDS'] = data_dict['RECORDS']['RECORD']
+        else:
+            data_dict['RECORDS'] = []
+        return data_dict
+    except xml.parsers.expat.ExpatError as exc:
+        raise XmlSyntaxError(str(exc)) from None
+
+
+def load(file_name: str, validate: bool = False) -> dict:
     """Load ADX file to dictionary
        The XML is only validated against the generic XSD if validate is True
 
        :param file_name: the file name where the ADX data is stored
+       :param validate: validate the ADX against the generic XSD (very slow)
        :return: the ADX as a dict
        """
 
-    try:
-        data_dict = ADX_IMPORT_SCHEMA.to_dict(file_name, decimal_type=str)
-
-        # Flatten records
-        records = []
-        for rec in data_dict['RECORDS']['RECORD']:
-            for elem in rec:
-                if type(rec[elem][0]) is str:  # Only for str to save APP data
-                    rec[elem] = rec[elem][0]
-            records.append(rec)
-        data_dict['RECORDS'] = records
-
-        # Flatten header
-        header = {}
-        for elem in data_dict['HEADER']:
-            if type(data_dict['HEADER'][elem][0]) is str:  # Only for str to save USERDEF
-                header[elem] = data_dict['HEADER'][elem][0]
-        data_dict['HEADER'] = header
-    except ParseError as exc:
-        raise XmlSyntaxError(str(exc)) from None
-    except xmlschema.validators.exceptions.XMLSchemaChildrenValidationError as exc:
-        raise UndefinedElementException(f'in {exc.elem.tag}') from None
-    except xmlschema.validators.exceptions.XMLSchemaValidationError as exc:
-        raise MalformedValueException(f'Field "{exc.elem.tag}": {exc.reason}') from None
+    with open(file_name, encoding='utf-8') as xf:
+        adx_data = xf.read()
 
-    return data_dict
+    return loads(adx_data, validate)
 
 
 def dump(file_name: str, data_dict: dict):
@@ -106,5 +119,5 @@ def dump(file_name: str, data_dict: dict):
         raise MalformedValueException(f'Field "{exc.elem.tag}": {exc.reason}') from None
 
 
-__all__ = ['load', 'dump', 'MissingRecordsException', 'UndefinedElementException',
+__all__ = ['load', 'loads', 'dump', 'MissingRecordsException', 'UndefinedElementException',
            'MalformedValueException', 'XmlSyntaxError']
diff --git a/test/test_dumpadx.py b/test/test_dumpadx.py
@@ -48,7 +48,14 @@ def test_20_dump(self):
                         {'CALL': 'YY1YYY',
                          'QSO_DATE': '20231204',
                          'TIME_ON': '1200',
-                         'QTH_INTL': 'Töst'
+                         'QTH_INTL': 'Töst',
+                         'APP':
+                             {
+                                 '@PROGRAMID': 'TESTAPP',
+                                 '@FIELDNAME': 'TESTFIELD',
+                                 '@TYPE': 'I',
+                                 '$': 'Test',
+                             },
                          }]
         }
 
@@ -72,6 +79,7 @@ def test_20_dump(self):
             <QSO_DATE>20231204</QSO_DATE>
             <TIME_ON>1200</TIME_ON>
             <QTH_INTL>Töst</QTH_INTL>
+            <APP PROGRAMID="TESTAPP" FIELDNAME="TESTFIELD" TYPE="I">Test</APP>
         </RECORD>
     </RECORDS>
 </ADX>
diff --git a/test/test_loadadx.py b/test/test_loadadx.py
@@ -23,10 +23,18 @@ def test_10_goodfile(self):
                         {'CALL': 'YY1YYY',
                          'QSO_DATE': '20231204',
                          'TIME_ON': '1200',
-                         'QTH_INTL': 'Töst'
+                         'QTH_INTL': 'Töst',
+                         'APP':
+                             {
+                                 '@PROGRAMID': 'TESTAPP',
+                                 '@FIELDNAME': 'TESTFIELD',
+                                 '@TYPE': 'I',
+                                 '$': 'Test',
+                             },
                          }]
         }
 
+        self.maxDiff = None
         self.assertDictEqual(adx_exp_dict, adif_file.adx.load(get_file_path('testdata/goodfile.adx')))
 
     def test_20_badfile(self):
@@ -35,9 +43,9 @@ def test_20_badfile(self):
 
     def test_20_badxml(self):
         self.assertRaises(adif_file.adx.MalformedValueException, adif_file.adx.load,
-                          get_file_path('testdata/badfile1.adx'))
+                          get_file_path('testdata/badfile1.adx'), True)
         self.assertRaises(adif_file.adx.UndefinedElementException, adif_file.adx.load,
-                          get_file_path('testdata/badfile2.adx'))
+                          get_file_path('testdata/badfile2.adx'), True)
 
 
 if __name__ == '__main__':
diff --git a/test/testdata/goodfile.adx b/test/testdata/goodfile.adx
@@ -18,6 +18,7 @@
             <QSO_DATE>20231204</QSO_DATE>
             <TIME_ON>1200</TIME_ON>
             <QTH_INTL>Töst</QTH_INTL>
+            <APP PROGRAMID="TESTAPP" FIELDNAME="TESTFIELD" TYPE="I">Test</APP>
         </RECORD>
     </RECORDS>
 </ADX>

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-xmlschema`
	`1`	`+xmlschema~=2.5.0`
	`2`	`+xmltodict~=0.13.0`