hed-python/hed/models/hed_string.py at 3a59047397f15ce3aabc28955f73ca40d6a6d11c · IanCa/hed-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
"""
This module is used to split tags in a HED string.
"""
import copy
from hed.models.hed_group import HedGroup
from hed.models.hed_tag import HedTag
from hed.models.model_constants import DefTagNames


class HedString(HedGroup):
    """ A HED string. """

    OPENING_GROUP_CHARACTER = '('
    CLOSING_GROUP_CHARACTER = ')'

    def __init__(self, hed_string, hed_schema, def_dict=None, _contents=None):
        """ Build a HedString from raw text or from an existing list of children.

        Parameters:
            hed_string (str): The HED text, made up of tags and tag groups.
            hed_schema (HedSchema): The schema passed on to the tags created while parsing.
            def_dict (DefinitionDict or None): The definitions passed on to the tags created while parsing.
            _contents ([HedGroup and/or HedTag] or None): If given, used directly as the children
                                                          (no copy is made) and hed_string is not parsed.

        Notes:
            - The text is parsed into a tree of tags and groups by split_into_groups.
            - If split_into_groups raises ValueError (e.g. mismatched parentheses), the
              HedString is created with no children instead of raising here.

        """
        if _contents is None:
            try:
                children = self.split_into_groups(hed_string, hed_schema, def_dict)
            except ValueError:
                # Malformed text: keep the raw string, but start from an empty tree.
                children = []
        else:
            children = _contents

        # The string as a whole spans the entire text.
        super().__init__(hed_string, contents=children, startpos=0, endpos=len(hed_string))
        self._schema = hed_schema
        self._def_dict = def_dict
        # Only set by from_hed_strings, when this string is a combination of others.
        self._from_strings = None

    @classmethod
    def from_hed_strings(cls, hed_strings):
        """ Factory for creating HedStrings via combination.

        Parameters:
            hed_strings (list or None): A list of HedString objects to combine.
                                        This takes ownership of their children.

        Returns:
            new_string(HedString): The newly combined HedString
        """
        if not hed_strings:
            raise TypeError("Passed an empty list to from_hed_strings")
        new_string = HedString.__new__(HedString)
        hed_string = ",".join([group._hed_string for group in hed_strings])
        contents = [child for sub_string in hed_strings for child in sub_string.children]
        first_schema = hed_strings[0]._schema
        first_dict = hed_strings[0]._def_dict
        new_string.__init__(hed_string=hed_string, _contents=contents, hed_schema=first_schema, def_dict=first_dict)
        new_string._from_strings = hed_strings
        return new_string

    @property
    def is_group(self):
        """ Always False since the underlying string is not a group with parentheses. """
        return False

    def _calculate_to_canonical_forms(self, hed_schema):
        """ Identify all tags using the given schema.

        Parameters:
            hed_schema (HedSchema, HedSchemaGroup): The schema to use to validate/convert tags.

        Returns:
            list: A list of issues found while converting the string. Each issue is a dictionary.

        """
        validation_issues = []
        for tag in self.get_all_tags():
            validation_issues += tag._calculate_to_canonical_forms(hed_schema)

        return validation_issues

    def __deepcopy__(self, memo):
        # check if the object has already been copied
        if id(self) in memo:
            return memo[id(self)]

        # create a new instance of HedString class, and direct copy all parameters
        new_string = self.__class__.__new__(self.__class__)
        new_string.__dict__.update(self.__dict__)

        # add the new object to the memo dictionary
        memo[id(self)] = new_string

        # Deep copy the attributes that need it(most notably, we don't copy schema/schema entry)
        new_string._original_children = copy.deepcopy(self._original_children, memo)
        new_string._from_strings = copy.deepcopy(self._from_strings, memo)
        new_string._children = copy.deepcopy(self._children, memo)

        return new_string

    def copy(self):
        """ Return a deep copy of this string.

        Returns:
            HedString: The copied group.

        """
        return_copy = copy.deepcopy(self)
        return return_copy

    def remove_definitions(self):
        """ Remove the top level groups that contain a definition tag.

            Definitions are not validated, so invalid ones are removed as well.
        """
        definition_groups = self.find_top_level_tags({DefTagNames.DEFINITION_KEY}, include_groups=1)
        if not definition_groups:
            return
        self.remove(definition_groups)

    def shrink_defs(self):
        """ Replace each def-expand group with its def tag.

            The groups are not validated, so invalid ones are shrunk as well.
            A def-expand group that has no parent is left untouched.

        Returns:
            self
        """
        expand_keys = {DefTagNames.DEF_EXPAND_KEY}
        for expand_tag, expand_group in self.find_tags(expand_keys, recursive=True):
            parent = expand_group._parent
            if not parent:
                continue
            # Turn the tag into a plain def tag, then put it where its group used to be.
            expand_tag.short_base_tag = DefTagNames.DEF_ORG_KEY
            expand_tag._parent = parent
            parent.replace(expand_group, expand_tag)

        return self

    def expand_defs(self):
        """ Replace def tags with def-expand tags

            This does very minimal validation

        Returns:
            self
        """
        def_tags = self.find_def_tags(recursive=True, include_groups=0)

        replacements = []
        for tag in def_tags:
            if tag.expandable and not tag.expanded:
                replacements.append((tag, tag.expandable))

        for tag, group in replacements:
            tag_parent = tag._parent
            tag_parent.replace(tag, group)
            tag._parent = group
            tag.short_base_tag = DefTagNames.DEF_EXPAND_KEY

        return self

    def get_as_original(self):
        """ Return the original form of this string.

        Returns:
            str: The string with all the tags in their original form.

        Notes:
            Potentially with some extraneous spaces removed on returned string.
        """
        return self.get_as_form("org_tag")

    @staticmethod
    def split_into_groups(hed_string, hed_schema, def_dict=None):
        """ Split the HED string into a parse tree.

        Parameters:
            hed_string (str): A hed string consisting of tags and tag groups to be processed.
            hed_schema (HedSchema): HED schema to use to identify tags.
            def_dict(DefinitionDict): The definitions to identify
        Returns:
            list:  A list of HedTag and/or HedGroup.

        :raises ValueError:
            - The string is significantly malformed, such as mismatched parentheses.

        Notes:
            - The parse tree consists of tag groups, tags, and delimiters.
        """
        current_tag_group = [[]]

        input_tags = HedString.split_hed_string(hed_string)
        for is_hed_tag, (startpos, endpos) in input_tags:
            if is_hed_tag:
                new_tag = HedTag(hed_string, hed_schema, (startpos, endpos), def_dict)
                current_tag_group[-1].append(new_tag)
            else:
                string_portion = hed_string[startpos:endpos]
                delimiter_index = 0
                for i, char in enumerate(string_portion):
                    if not char.isspace():
                        delimiter_index = i
                        break

                delimiter_char = string_portion[delimiter_index]

                if delimiter_char is HedString.OPENING_GROUP_CHARACTER:
                    current_tag_group.append(HedGroup(hed_string, startpos + delimiter_index))

                if delimiter_char is HedString.CLOSING_GROUP_CHARACTER:
                    # if prev_delimiter == ",":
                    #     raise ValueError(f"Closing parentheses in hed string {hed_string}")
                    # Terminate existing group, and save it off.
                    paren_end = startpos + delimiter_index + 1

                    if len(current_tag_group) > 1:
                        new_group = current_tag_group.pop()
                        new_group._endpos = paren_end

                        current_tag_group[-1].append(new_group)
                    else:
                        raise ValueError(f"Closing parentheses in hed string {hed_string}")

        # Comma delimiter issues are ignored and assumed already validated currently.
        if len(current_tag_group) != 1:
            raise ValueError(f"Unmatched opening parentheses in hed string {hed_string}")

        return current_tag_group[0]

    def _get_org_span(self, tag_or_group):
        """ If this tag or group was in the original hed string, find its original span.

        Parameters:
            tag_or_group (HedTag or HedGroup): The hed tag to locate in this string.

        Returns:
            int or None:   Starting position of the given item in the original string.
            int or None:   Ending position of the given item in the original string.

        Notes:
            - If the hed tag or group was not in the original string, returns (None, None).

        """
        if self._from_strings:
            return self._get_org_span_from_strings(tag_or_group)

        if self.check_if_in_original(tag_or_group):
            return tag_or_group.span

        return None, None

    def _get_org_span_from_strings(self, tag_or_group):
        """A different case of the above, to handle if this was created from hed string objects."""
        found_string = None
        string_start_index = 0
        for string in self._from_strings:
            if string.check_if_in_original(tag_or_group):
                found_string = string
                break
            # Add 1 for comma
            string_start_index += string.span[1] + 1

        if not found_string:
            return None, None

        return tag_or_group.span[0] + string_start_index, tag_or_group.span[1] + string_start_index

    @staticmethod
    def split_hed_string(hed_string):
        """ Split a HED string into delimiters and tags.

        Parameters:
            hed_string (str): The HED string to split.

        Returns:
            list:  A list of tuples where each tuple is (is_hed_tag, (start_pos, end_pos)).

        Notes:
            - The tuple format is as follows
                - is_hed_tag (bool): A (possible) hed tag if true, delimiter if not.
                - start_pos (int):   Index of start of string in hed_string.
                - end_pos (int):     Index of end of string in hed_string

            - This function does not validate tags or delimiters in any form.

        """
        tag_delimiters = ",()"
        current_spacing = 0
        found_symbol = True
        result_positions = []
        tag_start_pos = None
        last_end_pos = 0
        for i, char in enumerate(hed_string):
            if char == " ":
                current_spacing += 1
                continue

            if char in tag_delimiters:
                if found_symbol:
                    # view_string = hed_string[last_end_pos: i]
                    if last_end_pos != i:
                        result_positions.append((False, (last_end_pos, i)))
                    last_end_pos = i
                elif not found_symbol:
                    found_symbol = True
                    last_end_pos = i - current_spacing
                    # view_string = hed_string[tag_start_pos: last_end_pos]
                    result_positions.append((True, (tag_start_pos, last_end_pos)))
                    current_spacing = 0
                    tag_start_pos = None
                continue

            # If we have a current delimiter, end it here.
            if found_symbol and last_end_pos is not None:
                # view_string = hed_string[last_end_pos: i]
                if last_end_pos != i:
                    result_positions.append((False, (last_end_pos, i)))
                last_end_pos = None

            found_symbol = False
            current_spacing = 0
            if tag_start_pos is None:
                tag_start_pos = i

        if last_end_pos is not None and len(hed_string) != last_end_pos:
            # view_string = hed_string[last_end_pos: len(hed_string)]
            result_positions.append((False, (last_end_pos, len(hed_string))))
        if tag_start_pos is not None:
            # view_string = hed_string[tag_start_pos: len(hed_string)]
            result_positions.append((True, (tag_start_pos, len(hed_string) - current_spacing)))
            if current_spacing:
                result_positions.append((False, (len(hed_string) - current_spacing, len(hed_string))))

        return result_positions

    def validate(self, allow_placeholders=True, error_handler=None):
        """ Validate this string with a HedValidator built from its schema and definitions.

        Parameters:
            allow_placeholders(bool): Passed through to the validator; allow placeholders in the string.
            error_handler(ErrorHandler or None): Passed through to the validator; the error handler to use.

        Returns:
            issues (list of dict): A list of issues for this hed string.
        """
        # Imported here rather than at module level (presumably to avoid a circular import).
        from hed.validator import HedValidator

        return HedValidator(self._schema, def_dicts=self._def_dict).validate(
            self, allow_placeholders=allow_placeholders, error_handler=error_handler)

    def find_top_level_tags(self, anchor_tags, include_groups=2):
        """ Find top level groups with an anchor tag.

            A max of 1 tag located per top level group.

        Parameters:
            anchor_tags (container):     A list/set/etc of short_base_tags to find groups by.
            include_groups (0, 1 or 2):  Parameter indicating what return values to include.
                If 0: return only tags.
                If 1: return only groups.
                If 2 or any other value: return both.
        Returns:
            list or tuple: The returned result depends on include_groups:
        """
        top_level_tags = []
        for group in self.groups():
            for tag in group.tags():
                if tag.short_base_tag.lower() in anchor_tags:
                    top_level_tags.append((tag, group))
                    # Only capture a max of 1 per group.  These are implicitly unique.
                    break

        if include_groups == 0 or include_groups == 1:
            return [tag[include_groups] for tag in top_level_tags]
        return top_level_tags

    def remove_refs(self):
        """ This removes any refs(tags contained entirely inside curly braces) from the string.

            This does NOT validate the contents of the curly braces.  This is only relevant when directly
            editing sidecar strings.  Tools will naturally ignore these.
        """
        ref_tags = [tag for tag in self.get_all_tags() if tag.is_column_ref()]
        if ref_tags:
            self.remove(ref_tags)