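Update preprocessing: simplify device-aware dataset loading, pass device from config into DataProcessor, tidy clean_up and y handling, and normalize quotes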

shuyijia · shuyijia · commit 35ea15fac86f · 2022-12-16T14:57:21.000-05:00
diff --git a/data/test_data/processed/data.pt b/data/test_data/processed/data.pt
diff --git a/matdeeplearn/preprocessor/datasets.py b/matdeeplearn/preprocessor/datasets.py
@@ -16,29 +16,27 @@ def __init__(
         self.processed_data_path = processed_data_path
         super(StructureDataset, self).__init__(root, transform, pre_transform, pre_filter)
 
-        if device is None:
-            try:
-                self.data, self.slices = torch.load(self.processed_paths[0])
-            except:
-                self.data, self.slices = torch.load(self.processed_paths[0], map_location=torch.device('cpu'))
+        if not torch.cuda.is_available() or device == "cpu":
+            self.data, self.slices = torch.load(
+                self.processed_paths[0],
+                map_location=torch.device('cpu')
+            )
         else:
-            if device == 'cpu':
-                self.data, self.slices = torch.load(self.processed_paths[0], map_location=torch.device(device))
-            else:
-                self.data, self.slices = torch.load(self.processed_paths[0])
+            self.data, self.slices = torch.load(self.processed_paths[0])
+            
     
     @property
     def raw_file_names(self):
-        '''
+        """
         The name of the files in the self.raw_dir folder 
         that must be present in order to skip downloading.
-        '''
+        """
         return []
 
     def download(self):
-        '''
+        """
         Download required data files; to be implemented
-        '''
+        """
         pass
 
     @property
@@ -47,11 +45,11 @@ def processed_dir(self):
 
     @property
     def processed_file_names(self):
-        '''
+        """
         The name of the files in the self.processed_dir 
         folder that must be present in order to skip processing.
-        '''
+        """
         return ["data.pt"]
 
 class LargeStructureDataset(InMemoryDataset):
-    pass
+    pass
diff --git a/matdeeplearn/preprocessor/helpers.py b/matdeeplearn/preprocessor/helpers.py
@@ -46,10 +46,10 @@ def one_hot_degree(data, max_degree, in_degree=False, cat=True):
 
 
 class GaussianSmearing(torch.nn.Module):
-    '''
+    """
     slightly edited version from pytorch geometric to create edge from gaussian basis
-    '''
-    def __init__(self, start=0.0, stop=5.0, resolution=50, width=0.05, device='cpu', **kwargs):
+    """
+    def __init__(self, start=0.0, stop=5.0, resolution=50, width=0.05, device="cpu", **kwargs):
         super(GaussianSmearing, self).__init__()
         offset = torch.linspace(start, stop, resolution, device=device)
         # self.coeff = -0.5 / (offset[1] - offset[0]).item() ** 2
@@ -93,22 +93,21 @@ def get_ranges(dataset, descriptor_label):
 
 def clean_up(data_list, attr_list):
     if not attr_list:
-        return data_list
+        return
     
+    # check which attributes in the list are removable
+    removable_attrs = [t for t in attr_list if t in data_list[0].to_dict()]
     for data in data_list:
-        for attr in attr_list:
-            try:
-                delattr(data, attr)
-            except:
-                continue
+        for attr in removable_attrs:
+            delattr(data, attr)
 
 def get_distances(
     positions: torch.Tensor,
     offsets: torch.Tensor,
-    device: str = 'cpu',
+    device: str = "cpu",
     mic: bool = True
 ):
-    '''
+    """
     Get pairwise atomic distances
 
     Parameters
@@ -123,7 +122,7 @@ def get_distances(
         
         mic:        bool
                     minimum image convention
-    '''
+    """
     
     # convert numpy array to torch tensors
     n_atoms = len(positions)
@@ -141,7 +140,7 @@ def get_distances(
     # this allows us to get the minimum self-loop distance
     # of an atom to itself in all other images
     origin_unit_cell_idx = 13
-    # atomic_distances[:,:,origin_unit_cell_idx].fill_diagonal_(float('inf'))
+    # atomic_distances[:,:,origin_unit_cell_idx].fill_diagonal_(float("inf"))
 
     # get minimum
     min_atomic_distances, min_indices = torch.min(atomic_distances, dim=-1)
@@ -154,8 +153,8 @@ def get_distances(
     return min_atomic_distances, min_indices
 
 
-def get_pbc_cells(cell: torch.Tensor, offset_number: int, device: str = 'cpu'):
-    '''
+def get_pbc_cells(cell: torch.Tensor, offset_number: int, device: str = "cpu"):
+    """
     Get the periodic boundary condition (PBC) offsets for a unit cell
     
     Parameters
@@ -166,15 +165,15 @@ def get_pbc_cells(cell: torch.Tensor, offset_number: int, device: str = 'cpu'):
                     the number of offsets for the unit cell
                     if == 0: no PBC
                     if == 1: 27-cell offsets (3x3x3)
-    '''
+    """
 
     _range = np.arange(-offset_number, offset_number+1)
     offsets = [list(x) for x in itertools.product(_range, _range, _range)]
     offsets = torch.tensor(offsets, device=device, dtype=torch.float)
     return offsets @ cell, offsets
 
 def get_cutoff_distance_matrix(pos, cell, r, n_neighbors, device, image_selfloop, offset_number=1):
-    '''
+    """
     get the distance matrix
     TODO: need to tune this for elongated structures
 
@@ -192,7 +191,7 @@ def get_cutoff_distance_matrix(pos, cell, r, n_neighbors, device, image_selfloop
 
         n_neighbors: int
             max number of neighbors to be considered
-    '''
+    """
 
     cells, cell_coors = get_pbc_cells(cell, offset_number, device=device)
     distance_matrix, min_indices = get_distances(pos, cells, device=device)
@@ -221,14 +220,14 @@ def get_cutoff_distance_matrix(pos, cell, r, n_neighbors, device, image_selfloop
     return cutoff_distance_matrix, cell_offsets
 
 def add_selfloop(num_nodes, edge_indices, edge_weights, cutoff_distance_matrix, self_loop=True):
-    '''
+    """
     add self loop (i, i) to graph structure
 
     Parameters
     ----------
         n_nodes: int
             number of nodes
-    '''
+    """
 
     if not self_loop:
         return edge_indices, edge_weights, (cutoff_distance_matrix != 0).int()
@@ -240,25 +239,25 @@ def add_selfloop(num_nodes, edge_indices, edge_weights, cutoff_distance_matrix,
     distance_matrix_masked = (cutoff_distance_matrix.fill_diagonal_(1) != 0).int()
     return edge_indices, edge_weights, distance_matrix_masked
 
-def load_node_representation(node_representation='onehot'):
+def load_node_representation(node_representation="onehot"):
     node_rep_path = Path(__file__).parent
     default_reps = {
-        'onehot': str(node_rep_path / './node_representations/onehot.csv')
+        "onehot": str(node_rep_path / "./node_representations/onehot.csv")
     }
 
     rep_file_path = node_representation
     if node_representation in default_reps:
         rep_file_path = default_reps[node_representation]
     
-    file_type = rep_file_path.split('.')[-1]
+    file_type = rep_file_path.split(".")[-1]
     loaded_rep = None
 
-    if file_type == 'csv':
-        loaded_rep = np.genfromtxt(rep_file_path, delimiter=',')
+    if file_type == "csv":
+        loaded_rep = np.genfromtxt(rep_file_path, delimiter=",")
         # TODO: need to check if typecasting to integer is needed
         loaded_rep = loaded_rep.astype(int)
 
-    elif file_type == 'json':
+    elif file_type == "json":
         # TODO
         pass
 
@@ -286,6 +285,6 @@ def generate_edge_features(input_data, edge_steps, r, device):
     if isinstance(input_data, Data):
         input_data = [input_data]
 
-    normalize_edge_cutoff(input_data, 'distance', r)
+    normalize_edge_cutoff(input_data, "distance", r)
     for i, data in enumerate(input_data):
-        input_data[i].edge_attr = distance_gaussian(input_data[i].edge_descriptor['distance'])
+        input_data[i].edge_attr = distance_gaussian(input_data[i].edge_descriptor["distance"])
diff --git a/matdeeplearn/preprocessor/processor.py b/matdeeplearn/preprocessor/processor.py
@@ -32,6 +32,7 @@ def process_data(dataset_config):
     node_representation = dataset_config.get("node_representation", "onehot")
     additional_attributes = dataset_config.get("additional_attributes", [])
     verbose: bool = dataset_config.get("verbose", True)
+    device: str = dataset_config.get("device", "cpu")
 
     processor = DataProcessor(
         root_path=root_path,
@@ -46,6 +47,7 @@ def process_data(dataset_config):
         node_representation=node_representation,
         additional_attributes=additional_attributes,
         verbose=verbose,
+        device=device
     )
     processor.process()
 
@@ -65,6 +67,7 @@ def __init__(
         node_representation: str = "onehot",
         additional_attributes: list = [],
         verbose: bool = True,
+        device: str = "cpu",
     ) -> None:
         """
         create a DataProcessor that processes the raw data and save into data.pt file.
@@ -77,6 +80,9 @@ def __init__(
             target_file_path: str
                 a path to a CSV file containing target y values
 
+            pt_path: str
+                a path to the directory to which data.pt should be saved
+
             r: float
                 cutoff radius
 
@@ -124,12 +130,9 @@ def __init__(
         self.node_representation = node_representation
         self.additional_attributes = additional_attributes
         self.verbose = verbose
+        self.device = device
 
         self.disable_tqdm = logging.root.level > logging.INFO
-        self.device = "cpu"
-    
-    def set_device(self, device):
-        self.device = device
 
     def src_check(self):
         if self.target_file_path:
@@ -195,7 +198,7 @@ def get_csv_additional_attributes(self, structure_id):
 
     def json_wrap(self):
         """
-        all structures are saved to a single json file
+        all structures are saved in a single json file
         """
         logging.info("Reading one JSON file for multiple structures.")
 
@@ -209,7 +212,7 @@ def json_wrap(self):
 
         dict_structures = []
         y = []
-        y_dim = 1
+        y_dim = len(original_structures[0]["y"]) if isinstance(original_structures[0]["y"], list) else 1
 
         logging.info("Converting data to standardized form for downstream processing.")
         for i, s in enumerate(tqdm(original_structures, disable=self.disable_tqdm)):
@@ -232,14 +235,13 @@ def json_wrap(self):
 
             dict_structures.append(d)
 
-            if isinstance(s["y"], str):
-                y.append(float(s["y"]))
-            elif isinstance(s["y"], list):
-                _y = [float(each) for each in s["y"]]
-                y.append(_y)
-                y_dim = len(_y)
-            else:
-                y.append(s["y"])
+            # check y types
+            _y = s["y"]
+            if isinstance(_y, str):
+                _y = float(_y)
+            elif isinstance(_y, list):
+                _y = [float(each) for each in _y]
+            y.append(_y)
 
         y = np.array(y).reshape(-1, y_dim)
         return dict_structures, y
@@ -296,7 +298,7 @@ def get_data_list(self, dict_structures, y):
             data.cell_offsets = cell_offsets
 
             data.edge_descriptor = {}
-            # data.edge_descriptor['mask'] = cd_matrix_masked
+            # data.edge_descriptor["mask"] = cd_matrix_masked
             data.edge_descriptor["distance"] = edge_weights
             data.distances = edge_weights
             data.structure_id = [[structure_id] * len(data.y)]
@@ -314,4 +316,4 @@ def get_data_list(self, dict_structures, y):
 
         clean_up(data_list, ["edge_descriptor"])
 
-        return data_list
+        return data_list