Skip to content

Commit ea2f11c

Browse files
Merge pull request #177 from aodn/gridded_timeseries_unittest
Add unittest for gridded timeseries
2 parents e032a76 + e713ce5 commit ea2f11c

4 files changed

Lines changed: 84 additions & 34 deletions

File tree

aodntools/timeseries_products/gridded_timeseries.py

Lines changed: 24 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os.path
55
import json
66
from datetime import datetime, timezone
7+
from collections import defaultdict
78

89
import xarray as xr
910
import pandas as pd
@@ -91,24 +92,28 @@ def write_netCDF_aggfile(agg_dataset, output_path, encoding):
9192
return output_path
9293

9394

94-
def set_variableattr(varlist, variable_attribute_dictionary, add_variable_attribute):
95+
def set_variableattr(varlist, variable_attribute_dictionary):
9596
"""
96-
set variables variables atributes
97+
Set variable atributes, separate attributes that should be passed to xarray separately as encoding
98+
parameters
9799
98-
:param varlist: list of variable names
100+
:param varlist: list of variable names to pick out
99101
:param variable_attribute_dictionary: dictionary of the variable attributes
100-
:param add_variable_attribute: additional attributes to add
101-
:return: dictionary of attributes
102+
:return: tuple (dictionary of attributes, dictionary of encoding attributes)
102103
"""
103104

104-
# with open(templatefile) as json_file:
105-
# variable_metadata = json.load(json_file)['_variables']
106-
variable_attributes = {key: variable_attribute_dictionary[key] for key in varlist}
107-
if len(add_variable_attribute)>0:
108-
for key in add_variable_attribute.keys():
109-
variable_attributes[key].update(add_variable_attribute[key])
105+
encoding_attributes = {'_FillValue'}
106+
time_encoding_attributes = {'units', 'calendar'}
107+
variable_attributes = defaultdict(dict)
108+
variable_encodings = defaultdict(dict)
109+
for var in varlist:
110+
for name, value in variable_attribute_dictionary[var].items():
111+
if name in encoding_attributes or (var == 'TIME' and name in time_encoding_attributes):
112+
variable_encodings[var][name] = value
113+
else:
114+
variable_attributes[var][name] = value
110115

111-
return variable_attributes
116+
return variable_attributes, variable_encodings
112117

113118
def generate_netcdf_output_filename(nc, facility_code, data_code, VoI, site_code, product_type, file_version):
114119
"""
@@ -237,10 +242,7 @@ def grid_variable(input_file, VoI, depth_bins=None, max_separation=50, depth_bin
237242

238243
## set variable attributes
239244
varlist = list(VoI_interpolated.variables)
240-
add_variable_attribute = {}
241-
variable_attributes = set_variableattr(varlist, variable_attribute_dictionary, add_variable_attribute)
242-
time_units = variable_attributes['TIME'].pop('units')
243-
time_calendar = variable_attributes['TIME'].pop('calendar')
245+
variable_attributes, encoding = set_variableattr(varlist, variable_attribute_dictionary)
244246
for variable in varlist:
245247
VoI_interpolated[variable].attrs = variable_attributes[variable]
246248

@@ -293,22 +295,12 @@ def grid_variable(input_file, VoI, depth_bins=None, max_separation=50, depth_bin
293295
file_version=file_version)
294296
ncout_path = os.path.join(output_dir, ncout_filename)
295297

296-
encoding = {'TIME': {'_FillValue': None,
297-
'units': time_units,
298-
'calendar': time_calendar,
299-
'zlib': True,
300-
'complevel': 5},
301-
VoI: {'zlib': True,
302-
'complevel': 5,
303-
'dtype': np.dtype('float32')},
304-
VoI+'_count': {'dtype': np.dtype('int16'),
305-
'zlib': True,
306-
'complevel': 5},
307-
'DEPTH': {'dtype': np.dtype('float32'),
308-
'zlib': True,
309-
'complevel': 5},
310-
'LONGITUDE': {'_FillValue': False},
311-
'LATITUDE': {'_FillValue': False}}
298+
# data types and compression for encoding
299+
for var in {'TIME', VoI, VoI+'_count', 'DEPTH'}:
300+
encoding[var].update({'zlib': True, 'complevel': 5})
301+
encoding[VoI].update({'dtype': np.dtype('float32')})
302+
encoding[VoI+'_count'].update({'dtype': np.dtype('int16')})
303+
encoding['DEPTH'].update({'dtype': np.dtype('float32')})
312304

313305
write_netCDF_aggfile(VoI_interpolated, ncout_path, encoding)
314306

test_aodntools/base_test.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,24 @@ def check_nan_values(self, dataset):
5151
"check that there are no NaN values in any variable (they should be fill values instead)"
5252
nan_vars = [(name, "contains NaN values")
5353
for name, var in dataset.variables.items()
54-
if var.dtype in (np.dtype('float32'), np.dtype('float64')) and any(np.isnan(var[:]))
54+
if var.dtype in (np.dtype('float32'), np.dtype('float64')) and np.isnan(var[:]).any()
5555
]
5656
self.assertEqual([], nan_vars)
5757

5858
def compare_variables(self, dataset, skip_vars=('source_file', 'instrument_id')):
5959
"""Compare dimensions and values of all variables in dataset with those in self.EXPECTED_OUTPUT_FILE,
6060
except for variables listed in skip_vars.
6161
"""
62+
63+
def _arrays_equal(testvar, expected):
64+
"""compare two numpy arrays, handling the case of scalar variables"""
65+
if expected.shape == ():
66+
if np.isclose(testvar, expected):
67+
return True
68+
elif (np.isclose(testvar, expected)).all():
69+
return True
70+
return False
71+
6272
differences = []
6373
with Dataset(self.EXPECTED_OUTPUT_FILE) as expected:
6474
for var in set(expected.variables.keys()) - set(skip_vars):
@@ -68,7 +78,7 @@ def compare_variables(self, dataset, skip_vars=('source_file', 'instrument_id'))
6878
differences.append((var, "shapes differ"))
6979

7080
# compare the raw data arrays (not the masked_array)
71-
if not all(np.isclose(dataset[var][:].data, expected[var][:].data)):
81+
if not _arrays_equal(dataset[var][:].data, expected[var][:].data):
7282
differences.append((var, "variable values differ"))
7383

7484
self.assertEqual([], differences)
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/usr/bin/env python3
2+
3+
import os
4+
import unittest
5+
6+
from netCDF4 import Dataset
7+
8+
from test_aodntools.base_test import BaseTestCase
9+
from aodntools import __version__
10+
from aodntools.timeseries_products.gridded_timeseries import grid_variable
11+
12+
13+
TEST_ROOT = os.path.dirname(__file__)
14+
INPUT_FILE = 'IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-20220428.nc'
15+
16+
17+
class TestGriddedTimeseries(BaseTestCase):
18+
EXPECTED_OUTPUT_FILE = os.path.join(
19+
TEST_ROOT, 'IMOS_ANMN-NRS_TZ_20181213_NRSROT_FV02_TEMP-gridded-timeseries_END-20190523_C-20230110.nc'
20+
)
21+
22+
def test_grid_variable(self):
23+
output_file = grid_variable(INPUT_FILE, 'TEMP', input_dir=TEST_ROOT, output_dir='/tmp')
24+
25+
self.assertRegex(output_file,
26+
r'IMOS_ANMN-NRS_TZ_20181213_NRSROT_FV02_TEMP-gridded-timeseries_END-20190523_C-\d{8}\.nc'
27+
)
28+
29+
dataset = Dataset(output_file)
30+
self.assertSetEqual(set(dataset.dimensions), {'TIME', 'DEPTH'})
31+
self.assertSetEqual(set(dataset.variables.keys()),
32+
{'TIME', 'DEPTH', 'LATITUDE', 'LONGITUDE', 'TEMP', 'TEMP_count'})
33+
34+
# check metadata
35+
self.assertEqual(__version__, dataset.generating_code_version)
36+
self.assertIn(__version__, dataset.lineage)
37+
self.assertIn('gridded_timeseries.py', dataset.lineage)
38+
self.assertIn(INPUT_FILE, dataset.source_file)
39+
40+
self.compare_global_attributes(dataset)
41+
42+
self.check_nan_values(dataset)
43+
44+
self.compare_variables(dataset)
45+
46+
47+
if __name__ == '__main__':
48+
unittest.main()

0 commit comments

Comments (0)