openMINDS_Python/pipeline/src/collection.py at 1c6cd5a04e17d51fcf292c0719d15b4b6a6866d3 · apdavison/openMINDS_Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
"""
This module provides the Collection class, which can be used to
create a collection of openMINDS metadata nodes.

The collection can be saved to and loaded from disk, in JSON-LD format.
"""

from collections import Counter
from glob import glob
from importlib import import_module
import json
import os
from .registry import lookup_type
from .base import Link, LinkedNodeEmbedding


# Name of the openMINDS version sub-package ("openminds.<version>") that
# `Collection.load()` imports and uses for type lookup when the caller
# does not pass an explicit `version` argument.
DEFAULT_VERSION = "v5"


class Collection:
    """
    A collection of metadata nodes that can be saved to
    and loaded from disk.

    Nodes are held in the `nodes` dict, keyed by their `id`. Nodes that are
    added without an id are given a blank node identifier ("_:" followed by
    a zero-padded number).

    Args
    ----

    *nodes (LinkedMetadata):
        Nodes to store in the collection when creating it.
        Child nodes that are referenced from the explicitly
        listed nodes will also be added.
    """

    def __init__(self, *nodes):
        # maps node id (IRI or blank node identifier) -> node object
        self.nodes = {}
        self.add(*nodes)

    def __len__(self):
        """Return the number of nodes in the collection."""
        return len(self.nodes)

    def __iter__(self):
        """Iterate over the nodes (not their ids), in storage order."""
        return iter(self.nodes.values())

    def add(self, *nodes):
        """
        Add one or more metadata nodes to the collection.

        Child nodes that are referenced from the explicitly
        listed nodes will also be added.
        """
        for node in nodes:
            self._add_node(node)

    def _add_node(self, node, _visited=None):
        """
        Add `node` and, recursively, every node reachable through `node.links`.

        A node whose `id` is None is given a blank node identifier first.

        `_visited` is internal bookkeeping for the recursion and should not be
        passed by callers. It records the nodes already handled during the
        current traversal, so that link graphs containing cycles terminate
        instead of recursing without end.
        """
        if _visited is None:
            _visited = {}
        if id(node) in _visited:
            return
        # Store the node itself (not just its id()) so that the object stays
        # alive for the whole traversal and its id() cannot be reused.
        _visited[id(node)] = node
        if node.id is None:
            node.id = self._get_blank_node_identifier()
        self.nodes[node.id] = node
        for linked_node in node.links:
            self._add_node(linked_node, _visited)

    def _get_blank_node_identifier(self):
        """Return a blank node identifier that is not yet used in this collection."""
        # see https://www.w3.org/TR/json-ld11/#identifying-blank-nodes

        # here we're choosing to use a zero-padded identifier to make
        # testing and debugging easier.
        # It might be easier just to use uuids, however
        identifier = len(self.nodes)
        candidate = f"_:{identifier:06d}"
        # The node count alone is not guaranteed to be unused (for example if
        # nodes with explicit blank node identifiers were added or loaded),
        # so step forward until a free identifier is found rather than
        # silently overwriting an existing node.
        while candidate in self.nodes:
            identifier += 1
            candidate = f"_:{identifier:06d}"
        return candidate

    def _sort_nodes_by_id(self):
        """Rebuild `self.nodes` so that iteration follows ascending id order."""
        self.nodes = dict(sorted(self.nodes.items()))

    def generate_ids(self, id_generator):
        """
        Generate an IRI id for all nodes in the graph that do not possess one.

        Nodes whose key starts with "_:" (blank nodes) are re-keyed under the
        id returned by `id_generator`, and their `id` attribute is updated.

        Args
        ----

        id_generator (function):
            a function that takes the node as an argument, and returns a unique IRI
        """
        # iterate over a copy of the keys, since the dict is modified in the loop
        for node_id in list(self.nodes.keys()):
            if node_id.startswith("_:"):
                node = self.nodes.pop(node_id)
                node.id = id_generator(node)
                self.nodes[node.id] = node

    @property
    def complete(self):
        """Do all nodes have an IRI?"""
        return not any(node_id.startswith("_:") for node_id in self.nodes)

    @staticmethod
    def _get_data_context(nodes):
        """
        Return the JSON-LD "@context" used when writing `nodes` to a single file.

        A single context is written for the whole file. It is chosen from the
        type namespace of the last node in `nodes`. For an empty sequence the
        "https://openminds.om-i.org/props/" vocabulary is used.
        """
        if nodes and nodes[-1].type_.startswith("https://openminds.ebrains.eu/"):
            return {"@vocab": "https://openminds.ebrains.eu/vocab/"}
        return {"@vocab": "https://openminds.om-i.org/props/"}

    def save(self, path, individual_files=False, include_empty_properties=False, group_by_schema=False):
        """
        Save the node collection to disk in JSON-LD format.

        Args
        ----

        path (str):
            either a file or a directory into which the metadata will be written.
            It is recommended to use the extension ".jsonld".
        individual_files (bool):
            if False (default), save the entire collection into a single file.
            if True, `path` must be a directory, and each node is saved into a
            separate file within that directory.
        include_empty_properties (bool):
            if False (default), do not include properties with value None.
            if True, include all properties.
        group_by_schema (bool):
            Only applies if `individual_files` is True.
            If False (default), save all files in a single directory.
            If True, save into subdirectories according to the schema name.

        Returns
        -------

        A list of the file paths created.

        Raises
        ------

        OSError:
            if `path` is an existing directory when saving to a single file,
            or is not a directory when saving to individual files.
        """
        # in case a user has added additional child nodes _after_ adding the parent node to the collection
        # we first re-add all child nodes to the collection.
        # This is probably not the most elegant or fast way to do this, but it is simple and robust.
        existing_nodes = tuple(self.nodes.values())
        for node in existing_nodes:
            for linked_node in node.links:
                self._add_node(linked_node)
        # Now we can actually save the nodes
        if not individual_files:
            if os.path.exists(path):
                if not os.path.isfile(path):
                    raise OSError(f"Cannot create file {path} because a directory with that name already exists.")
            else:
                parent_dir = os.path.dirname(path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)
            self._sort_nodes_by_id()
            data = {
                # computed here (not inside the loop above) so that an empty
                # collection can also be saved
                "@context": self._get_data_context(existing_nodes),
                "@graph": [
                    node.to_jsonld(
                        embed_linked_nodes=LinkedNodeEmbedding.NEVER,
                        include_empty_properties=include_empty_properties,
                        with_context=False
                    )
                    for node in self
                ],
            }
            with open(path, "w", encoding="utf-8") as fp:
                json.dump(data, fp, indent=2)
            output_paths = [path]
        else:
            if not os.path.exists(path):
                os.makedirs(path, exist_ok=True)
            if not os.path.isdir(path):
                raise OSError(
                    f"If saving to multiple files, `path` must be a directory. path={path}, pwd={os.getcwd()}"
                )
            self._sort_nodes_by_id()
            output_paths = []
            for node in self:
                if node.id.startswith("http"):
                    file_identifier = node.uuid
                else:
                    assert node.id.startswith("_:")
                    # drop the "_:" prefix of the blank node identifier
                    file_identifier = node.id[2:]
                if group_by_schema:
                    dir_path = os.path.join(path, node.__class__.__name__)
                    os.makedirs(dir_path, exist_ok=True)
                else:
                    dir_path = path
                file_path = os.path.join(dir_path, f"{file_identifier}.jsonld")
                # serialize before opening the file, so that a failure in
                # `to_jsonld` does not leave an empty file behind
                data = node.to_jsonld(
                    embed_linked_nodes=LinkedNodeEmbedding.NEVER,
                    include_empty_properties=include_empty_properties
                )
                with open(file_path, "w", encoding="utf-8") as fp:
                    json.dump(data, fp, indent=2)
                output_paths.append(file_path)
        return output_paths

    @staticmethod
    def _node_from_jsonld(item, version):
        """
        Build a node from a single JSON-LD dict.

        Returns an instance of the class registered for `item["@type"]` if a
        type is given, otherwise a `Link` to `item["@id"]`.

        Raises ValueError if there is no "@type" and the "@id" does not start
        with "http".
        """
        if "@type" in item:
            cls = lookup_type(item["@type"], version=version)
            return cls.from_jsonld(item)
        # allow links to metadata instances outside this collection
        if not item["@id"].startswith("http"):
            raise ValueError("Local nodes must have @type specified")
        return Link(item["@id"])

    def load(self, *paths, version=DEFAULT_VERSION):
        """
        Load openMINDS metadata from one or more JSON-LD files.

        `*paths` may contain either:

        1) a single directory, in which case all JSON-LD files in this directory
        and any non-hidden subdirectories will be loaded
        (where hidden subdirectories are those whose name starts with ".").

        2) one or more JSON-LD files, which will all be loaded.

        By default, the openMINDS version named by the module-level
        `DEFAULT_VERSION` will be used.
        If the JSON-LD files use a different openMINDS version, specify it
        with the `version` argument, e.g.::

            import openminds.latest

            c = Collection()
            c.load("/path/to/my/metadata.jsonld", version="latest")

        Each file may contain either a single node or a "@graph" list of nodes.
        """

        # make sure the classes for the requested version have been imported
        import_module(f"openminds.{version}")

        if len(paths) == 1 and os.path.isdir(paths[0]):
            data_dir = paths[0]
            json_paths = (
                glob(f"{data_dir}/**/*.jsonld", recursive=True)
                + glob(f"{data_dir}/**/*.json", recursive=True)
            )
        else:
            json_paths = paths

        for path in json_paths:
            assert os.path.isfile(path), f"{path} is not a file"
            # JSON text is read as UTF-8 rather than the platform default encoding
            with open(path, "r", encoding="utf-8") as fp:
                data = json.load(fp)
            # a file holds either a "@graph" list of nodes or one single node
            items = data["@graph"] if "@graph" in data else [data]
            for item in items:
                self.add(self._node_from_jsonld(item, version))
        self._resolve_links()

    def _resolve_links(self):
        """Replace `Link` attributes with typed Nodes where possible"""
        for node in self.nodes.values():
            node._resolve_links(self.nodes)

    def validate(self, ignore=None):
        """
        Check whether all constraints are satisfied.

        Arguments:
            ignore: an optional list of check types that should be ignored
                    ("required", "type", "multiplicity")

        Returns a dict containing information about any validation failures.
        The dict is keyed by node id, and only contains nodes with failures.
        """
        all_failures = {}
        for node in self:
            failures = node.validate(ignore=ignore)
            if failures:
                all_failures[node.id] = failures
        return all_failures

    @property
    def is_valid(self):
        """True if `validate()` reports no failures for any node."""
        return not self.validate()

    def sort_nodes_for_upload(self):
        """
        Return a list of nodes, sorted so that they can be uploaded to a graph database safely,
        i.e., child nodes will be saved before their parents.

        The upload code is assumed to generate @ids and update the Python instances accordingly.

        Raises ValueError if no such ordering exists, i.e. if some nodes are
        part of a circular reference or link to nodes whose id is not in the
        collection.
        """
        # ids of the children of each node, computed once up front
        child_ids = {
            node_id: {child.id for child in node.links}
            for node_id, node in self.nodes.items()
        }
        ordered_ids = []
        placed = set()  # same content as `ordered_ids`, for fast membership tests
        waiting = []
        # initial step: move nodes with no children (downstream links) directly to `ordered_ids`
        for node_id, children in child_ids.items():
            if children:
                waiting.append(node_id)
            else:
                ordered_ids.append(node_id)
                placed.add(node_id)
        # now iteratively add nodes to `ordered_ids` if all their children are already placed
        while waiting:
            still_waiting = []
            for node_id in waiting:
                if child_ids[node_id] <= placed:
                    ordered_ids.append(node_id)
                    placed.add(node_id)
                else:
                    still_waiting.append(node_id)
            if len(still_waiting) == len(waiting):
                # no progress in a full pass: further passes would never terminate
                raise ValueError(
                    "Cannot sort nodes for upload: the following nodes are part of a circular "
                    f"reference or link to nodes that are not in the collection: {still_waiting}"
                )
            waiting = still_waiting
        return [self.nodes[node_id] for node_id in ordered_ids]

    def statistics(self):
        """
        Return a counter containing the number of nodes of each type.

        Types are identified by the class name of each node.
        """
        return Counter(node.__class__.__name__ for node in self.nodes.values())