refactor comparison with expected file in timeseries_products unittests

mhidas · mhidas · commit c669d38e8089 · 2022-06-08T17:20:28.000+10:00
diff --git a/test_aodntools/base_test.py b/test_aodntools/base_test.py
@@ -2,8 +2,12 @@
 import tempfile
 import unittest
 
+import numpy as np
+from netCDF4 import Dataset
+
 
 class BaseTestCase(unittest.TestCase):
+    EXPECTED_OUTPUT_FILE = None
 
     @property
     def temp_dir(self):
@@ -22,3 +26,49 @@ def temp_nc_file(self):
     def tearDown(self):
         if hasattr(self, '_temp_dir'):
             shutil.rmtree(self._temp_dir)
+
+    def compare_global_attributes(self, dataset,
+                                  attrs=('geospatial_lat_max', 'geospatial_lat_min',
+                                         'geospatial_lon_max', 'geospatial_lon_min',
+                                         'geospatial_vertical_max', 'geospatial_vertical_min',
+                                         'time_coverage_start', 'time_coverage_end'
+                                         )
+                                  ):
+        """Assert that each global attribute in attrs matches self.EXPECTED_OUTPUT_FILE, reporting all mismatches."""
+
+        not_matching = []
+        with Dataset(self.EXPECTED_OUTPUT_FILE) as expected:
+            for attr in attrs:
+                found_value = dataset.getncattr(attr)
+                expected_value = expected.getncattr(attr)
+                if found_value != expected_value:
+                    message = "expected: {exp}; found: {found}".format(exp=expected_value, found=found_value)
+                    not_matching.append((attr, message))
+
+        self.assertEqual([], not_matching)
+
+    def check_nan_values(self, dataset):
+        """Assert that no float32/float64 variable in dataset contains NaN (they should be fill values instead)."""
+        nan_vars = [(name, "contains NaN values")
+                    for name, var in dataset.variables.items()
+                    # array-level .any() (not the builtin) so scalar and multi-dimensional variables also work
+                    if var.dtype in (np.dtype('float32'), np.dtype('float64')) and np.isnan(var[:]).any()]
+        self.assertEqual([], nan_vars)
+
+    def compare_variables(self, dataset, skip_vars=('source_file', 'instrument_id')):
+        """Compare dimensions, shapes and values of all variables in self.EXPECTED_OUTPUT_FILE with those in dataset,
+        except for variables listed in skip_vars. All differences found are reported together in one assertion.
+        """
+        differences = []
+        with Dataset(self.EXPECTED_OUTPUT_FILE) as expected:
+            for var in sorted(set(expected.variables.keys()) - set(skip_vars)):
+                if not dataset[var].dimensions == expected[var].dimensions:
+                    differences.append((var, "dimensions differ"))
+                if not dataset[var].shape == expected[var].shape:
+                    differences.append((var, "shapes differ"))
+
+                # compare the raw data arrays (not the masked_array); array_equal copes with any rank or shape mismatch
+                if not np.array_equal(np.ma.getdata(dataset[var][:]), np.ma.getdata(expected[var][:])):
+                    differences.append((var, "variable values differ"))
+
+        self.assertEqual([], differences)
diff --git a/test_aodntools/timeseries_products/test_aggregated_timeseries.py b/test_aodntools/timeseries_products/test_aggregated_timeseries.py
@@ -3,7 +3,6 @@
 import os
 import unittest
 
-import numpy as np
 from netCDF4 import Dataset, chartostring
 
 from aodntools import __version__
@@ -19,12 +18,13 @@
     'IMOS_ANMN-NRS_BCKOSTUZ_20181213T080038Z_NRSROT_FV01_NRSROT-1812-WQM-55_END-20181215T013118Z_C-20190828T000000Z.nc',
     BAD_FILE
 ]
-EXPECTED_OUTPUT_FILE = os.path.join(
-    TEST_ROOT, 'IMOS_ANMN-NRS_TZ_20181213_NRSROT_FV01_TEMP-aggregated-timeseries_END-20190523_C-20220607.nc'
-)
 
 
 class TestAggregatedTimeseries(BaseTestCase):
+    EXPECTED_OUTPUT_FILE = os.path.join(
+        TEST_ROOT, 'IMOS_ANMN-NRS_TZ_20181213_NRSROT_FV01_TEMP-aggregated-timeseries_END-20190523_C-20220607.nc'
+    )
+
     def test_main_aggregator(self):
         output_file, bad_files = main_aggregator(INPUT_FILES, 'TEMP', 'NRSROT', input_dir=TEST_ROOT,
                                                  output_dir='/tmp')
@@ -70,29 +70,11 @@ def test_main_aggregator(self):
         self.assertIn(__version__, dataset.lineage)
         self.assertIn(BAD_FILE, dataset.rejected_files)
 
-        compare_attrs = ('Conventions', 'feature_type',  'author', 'author_email', 'file_version',
-                         'geospatial_lat_max', 'geospatial_lat_min', 'geospatial_lon_max', 'geospatial_lon_min',
-                         'geospatial_vertical_max', 'geospatial_vertical_min', 'naming_authority', 'project',
-                         'time_coverage_start', 'time_coverage_end'
-                         )
-        expected = Dataset(EXPECTED_OUTPUT_FILE)
-        for attr in compare_attrs:
-            self.assertEqual(dataset.getncattr(attr), expected.getncattr(attr))
-
-        # check that there are no NaN values in any variable (they should be fill values instead)
-        nan_vars = [name
-                    for name, var in dataset.variables.items()
-                    if var.dtype in (np.dtype('float32'), np.dtype('float64')) and any(np.isnan(var[:]))
-                    ]
-        self.assertEqual([], nan_vars)
-
-        # check aggregated variable values
-        non_match_vars = []
-        for var in set(expected.variables.keys()) - string_vars:
-            # compare the raw data arrays (not the masked_array)
-            if not all(dataset[var][:].data == expected[var][:].data):
-                non_match_vars.append(var)
-        self.assertEqual([], non_match_vars)
+        self.compare_global_attributes(dataset)
+
+        self.check_nan_values(dataset)
+
+        self.compare_variables(dataset)
 
     def test_source_file_attributes(self):
         output_file, bad_files = main_aggregator(INPUT_FILES, 'PSAL', 'NRSROT', input_dir=TEST_ROOT,
diff --git a/test_aodntools/timeseries_products/test_hourly_timeseries.py b/test_aodntools/timeseries_products/test_hourly_timeseries.py
@@ -20,9 +20,6 @@
     BAD_FILE
 ]
 INPUT_PATHS = [os.path.join(TEST_ROOT, f) for f in INPUT_FILES]
-EXPECTED_OUTPUT_FILE = os.path.join(
-    TEST_ROOT, 'IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-20220428.nc'
-)
 
 INST_VARIABLES = {'instrument_id', 'source_file', 'LONGITUDE', 'LATITUDE', 'NOMINAL_DEPTH'}
 OBS_VARIABLES = {'instrument_index', 'TIME'}
@@ -49,6 +46,10 @@
 
 
 class TestHourlyTimeseries(BaseTestCase):
+    EXPECTED_OUTPUT_FILE = os.path.join(
+        TEST_ROOT, 'IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-20220428.nc'
+    )
+
     def test_hourly_aggregator(self):
         output_file, bad_files = hourly_aggregator(files_to_aggregate=INPUT_PATHS,
                                                    site_code='NRSROT',
@@ -87,15 +88,11 @@ def test_hourly_aggregator(self):
         self.assertIn('hourly_timeseries.py', dataset.lineage)
         self.assertIn(BAD_FILE, dataset.rejected_files)
 
-        # check variable values
-        expected = Dataset(EXPECTED_OUTPUT_FILE)
-        self.assertEqual(len(expected['TIME']), len(dataset['TIME']))
-        compare_vars = ('TIME', 'NOMINAL_DEPTH', 'instrument_index',
-                        'TEMP', 'TEMP_count', 'TEMP_min', 'TEMP_max')
-        non_match_vars = [var for var in compare_vars
-                          if not all(dataset[var][:] == expected[var][:])
-                          ]
-        self.assertEqual(non_match_vars, [])
+        self.compare_global_attributes(dataset)
+
+        self.check_nan_values(dataset)
+
+        self.compare_variables(dataset)
 
     def test_hourly_aggregator_with_nonqc(self):
         output_file, bad_files = hourly_aggregator(files_to_aggregate=INPUT_FILES,
diff --git a/test_aodntools/timeseries_products/test_velocity_aggregated_timeseries.py b/test_aodntools/timeseries_products/test_velocity_aggregated_timeseries.py
@@ -18,9 +18,6 @@
     'IMOS_ANMN-NRS_AETVZ_20191016T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1910-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20191018T100000Z_C-20200430T000000Z.nc',
     BAD_FILE
 ]
-EXPECTED_OUTPUT_FILE = os.path.join(
-    TEST_ROOT, 'IMOS_ANMN-NRS_VZ_20180816_NRSROT_FV01_velocity-aggregated-timeseries_END-20191018_C-20200623.nc'
-)
 
 OBS_VARS = {'TIME', 'DEPTH', 'DEPTH_quality_control', 'UCUR', 'UCUR_quality_control',
             'VCUR', 'VCUR_quality_control', 'WCUR', 'WCUR_quality_control', 'instrument_index', 'CELL_INDEX'}
@@ -29,6 +26,10 @@
 
 
 class TestVelocityAggregatedTimeseries(BaseTestCase):
+    EXPECTED_OUTPUT_FILE = os.path.join(
+        TEST_ROOT, 'IMOS_ANMN-NRS_VZ_20180816_NRSROT_FV01_velocity-aggregated-timeseries_END-20191018_C-20200623.nc'
+    )
+
     def test_velocity_aggregated(self):
         output_file, bad_files = velocity_aggregated(INPUT_FILES, 'NRSROT', input_dir=TEST_ROOT, output_dir='/tmp')
 
@@ -56,13 +57,11 @@ def test_velocity_aggregated(self):
         self.assertEqual(__version__, dataset.generating_code_version)
         self.assertIn(__version__, dataset.lineage)
 
-        # check aggregated variable values
-        expected = Dataset(EXPECTED_OUTPUT_FILE)
-        compare_vars = set(expected.variables.keys()) - STR_VARS
-        non_match_vars = [var for var in compare_vars
-                          if not all(dataset[var][:] == expected[var][:])
-                          ]
-        self.assertEqual(non_match_vars, [])
+        self.compare_global_attributes(dataset)
+
+        self.check_nan_values(dataset)
+
+        self.compare_variables(dataset)
 
     def test_all_rejected(self):
         self.assertRaises(NoInputFilesError, velocity_aggregated, [BAD_FILE], 'NRSROT',
diff --git a/test_aodntools/timeseries_products/test_velocity_hourly_timeseries.py b/test_aodntools/timeseries_products/test_velocity_hourly_timeseries.py
@@ -19,9 +19,6 @@
     'IMOS_ANMN-NRS_AETVZ_20191016T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1910-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20191018T100000Z_C-20200430T000000Z.nc',
     BAD_FILE
 ]
-EXPECTED_OUTPUT_FILE = os.path.join(
-    TEST_ROOT, 'IMOS_ANMN-NRS_VZ_20180816_NRSROT_FV02_velocity-hourly-timeseries_END-20191018_C-20220502.nc'
-)
 
 OBS_VARS = {'TIME', 'instrument_index', 'CELL_INDEX'}
 INST_VARS = {'LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH', 'SECONDS_TO_MIDDLE'}
@@ -33,6 +30,10 @@
 
 
 class TestVelocityHourlyTimeseries(BaseTestCase):
+    EXPECTED_OUTPUT_FILE = os.path.join(
+        TEST_ROOT, 'IMOS_ANMN-NRS_VZ_20180816_NRSROT_FV02_velocity-hourly-timeseries_END-20191018_C-20220502.nc'
+    )
+
     def test_velocity_hourly(self):
         output_file, bad_files = velocity_hourly_aggregated(INPUT_FILES, 'NRSROT',
                                                             input_dir=TEST_ROOT, output_dir='/tmp')
@@ -61,15 +62,11 @@ def test_velocity_hourly(self):
         self.assertEqual(__version__, dataset.generating_code_version)
         self.assertIn(__version__, dataset.lineage)
 
-        # check aggregated variable values
-        expected = Dataset(EXPECTED_OUTPUT_FILE)
-        self.assertEqual(len(expected['TIME']), len(dataset['TIME']))
+        self.compare_global_attributes(dataset)
+
+        self.check_nan_values(dataset)
 
-        non_match_vars = []
-        for var in set(expected.variables.keys()) - STR_VARS:
-            if not all(np.isclose(dataset[var], expected[var], equal_nan=True)):
-                non_match_vars.append(var)
-        self.assertEqual(non_match_vars, [])
+        self.compare_variables(dataset)
 
     def test_all_rejected(self):
         self.assertRaises(NoInputFilesError, velocity_hourly_aggregated, [BAD_FILE], 'NRSROT',