Skip to content

Commit 126086c

Browse files
committed
Modified entity view creation of string and list columns
1 parent 9502f5e commit 126086c

2 files changed

Lines changed: 176 additions & 26 deletions

File tree

synapseclient/extensions/curator/file_based_metadata_task.py

Lines changed: 59 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424
from synapseclient.operations import FileOptions, get
2525

2626
TYPE_DICT = {
27-
"string": ColumnType.STRING,
27+
"string": ColumnType.MEDIUMTEXT,
2828
"number": ColumnType.DOUBLE,
2929
"integer": ColumnType.INTEGER,
3030
"boolean": ColumnType.BOOLEAN,
@@ -36,6 +36,9 @@
3636
"boolean": ColumnType.BOOLEAN_LIST,
3737
}
3838

39+
MAX_LIST_STRING_ITEM_SIZE = 100
40+
MAX_LIST_LENGTH = 50
41+
3942

4043
def create_json_schema_entity_view(
4144
syn: Synapse,
@@ -199,48 +202,78 @@ def _create_columns_from_json_schema(json_schema: dict[str, Any]) -> list[Column
199202
raise ValueError(
200203
"The 'properties' field in the JSON Schema must be a dictionary."
201204
)
202-
columns = []
203-
for name, prop_schema in properties.items():
204-
column_type = _get_column_type_from_js_property(prop_schema)
205-
maximum_size = None
206-
if column_type == "STRING":
207-
maximum_size = 100
208-
if column_type in LIST_TYPE_DICT.values():
209-
maximum_size = 5
210-
211-
column = Column(
212-
name=name,
213-
column_type=column_type,
214-
maximum_size=maximum_size,
215-
default_value=None,
216-
)
217-
columns.append(column)
205+
columns = [
206+
_create_synapse_column_from_js_property(prop_schema, name)
207+
for name, prop_schema in properties.items()
208+
]
218209
return columns
219210

220211

212+
def _create_synapse_column_from_js_property(
213+
js_property: dict[str, Any], name: str
214+
) -> Column:
215+
"""
216+
Creates a Synapse Column based on a JSON Schema property.
217+
218+
Args:
219+
js_property: A JSON Schema property in dict form.
220+
name: The name of the column.
221+
222+
Returns:
223+
A Synapse Column based on the JSON Schema property.
224+
"""
225+
column_type = _get_column_type_from_js_property(js_property)
226+
maximum_size = None
227+
maximum_list_length = None
228+
if column_type in LIST_TYPE_DICT.values():
229+
maximum_list_length = MAX_LIST_LENGTH
230+
if column_type == ColumnType.STRING_LIST:
231+
maximum_size = MAX_LIST_STRING_ITEM_SIZE
232+
233+
return Column(
234+
name=name,
235+
column_type=column_type,
236+
maximum_size=maximum_size,
237+
maximum_list_length=maximum_list_length,
238+
)
239+
240+
221241
def _get_column_type_from_js_property(js_property: dict[str, Any]) -> ColumnType:
222242
"""
223243
Gets the Synapse column type from a JSON Schema property.
224244
The JSON Schema should be valid but that should not be assumed.
225-
If the type can not be determined ColumnType.STRING will be returned.
245+
If the type can not be determined ColumnType.MEDIUMTEXT will be returned.
226246
227247
Args:
228248
js_property: A JSON Schema property in dict form.
229249
230250
Returns:
231251
A Synapse ColumnType based on the JSON Schema type
232252
"""
233-
# Enums are always strings in Synapse tables
253+
# Enums are set as MediumText columns
234254
if "enum" in js_property:
235-
return ColumnType.STRING
255+
return ColumnType.MEDIUMTEXT
236256
if "type" in js_property:
237-
if js_property["type"] == "array":
257+
js_type = js_property["type"]
258+
# Synapse columns cannot be more than one type
259+
# If the JSONSchema type is a list of types, check if it's a nullable single type
260+
if isinstance(js_type, list):
261+
types = [t for t in js_type if t != "null"]
262+
if len(types) == 1:
263+
js_type = types[0]
264+
else:
265+
return ColumnType.MEDIUMTEXT
266+
if js_type == "array":
238267
return _get_list_column_type_from_js_property(js_property)
239-
return TYPE_DICT.get(js_property["type"], ColumnType.STRING)
268+
# If there is only one JSONSchema type, return the corresponding Synapse column type,
269+
# defaulting to MediumText if there is no match
270+
return TYPE_DICT.get(js_type, ColumnType.MEDIUMTEXT)
240271
# A oneOf list usually indicates that the type could be one or more different things
272+
# Curator extension does not create the types of JSON Schemas where this is the case
273+
# but if it is present we will attempt to determine the type based on the items in the oneOf list.
241274
if "oneOf" in js_property and isinstance(js_property["oneOf"], list):
242275
return _get_column_type_from_js_one_of_list(js_property["oneOf"])
243-
return ColumnType.STRING
276+
return ColumnType.MEDIUMTEXT
244277

245278

246279
def _get_column_type_from_js_one_of_list(js_one_of_list: list[Any]) -> ColumnType:
@@ -258,15 +291,15 @@ def _get_column_type_from_js_one_of_list(js_one_of_list: list[Any]) -> ColumnTyp
258291
items = [item for item in js_one_of_list if isinstance(item, dict)]
259292
# Enums are set as MediumText columns
260293
if [item for item in items if "enum" in item]:
261-
return ColumnType.STRING
294+
return ColumnType.MEDIUMTEXT
262295
# For Synapse ColumnType we can ignore null types in JSON Schemas
263296
type_items = [item for item in items if "type" in item if item["type"] != "null"]
264297
if len(type_items) == 1:
265298
type_item = type_items[0]
266299
if type_item["type"] == "array":
267300
return _get_list_column_type_from_js_property(type_item)
268-
return TYPE_DICT.get(type_item["type"], ColumnType.STRING)
269-
return ColumnType.STRING
301+
return TYPE_DICT.get(type_item["type"], ColumnType.MEDIUMTEXT)
302+
return ColumnType.MEDIUMTEXT
270303

271304

272305
def _get_list_column_type_from_js_property(js_property: dict[str, Any]) -> ColumnType:
Lines changed: 117 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,117 @@
1+
"""Unit tests for file_based_metadata_task.py"""
2+
from typing import Any
3+
4+
import pytest
5+
6+
from synapseclient.extensions.curator.file_based_metadata_task import (
7+
MAX_LIST_LENGTH,
8+
MAX_LIST_STRING_ITEM_SIZE,
9+
_create_columns_from_json_schema,
10+
_create_synapse_column_from_js_property,
11+
_get_column_type_from_js_property,
12+
)
13+
from synapseclient.models import Column, ColumnType
14+
15+
16+
def test_create_columns_from_json_schema():
17+
schema = {
18+
"properties": {
19+
"string_col": {"type": "string"},
20+
"int_col": {"type": "integer"},
21+
"bool_col": {"type": "boolean"},
22+
}
23+
}
24+
expected = [
25+
Column(name="string_col", column_type=ColumnType.MEDIUMTEXT),
26+
Column(name="int_col", column_type=ColumnType.INTEGER),
27+
Column(name="bool_col", column_type=ColumnType.BOOLEAN),
28+
]
29+
assert _create_columns_from_json_schema(schema) == expected
30+
31+
32+
@pytest.mark.parametrize(
33+
"schema",
34+
[{}, {"properties": []}],
35+
ids=["empty schema", "properties is not a d ict"],
36+
)
37+
def test_create_columns_from_json_schema_exceptions(schema: dict[str, Any]):
38+
with pytest.raises(ValueError):
39+
_create_columns_from_json_schema(schema)
40+
41+
42+
@pytest.mark.parametrize(
43+
"prop, name, expected_type, expected_max_size, expected_max_list_length",
44+
[
45+
(
46+
{"type": "array", "items": {"type": "string"}},
47+
"string_list_col",
48+
ColumnType.STRING_LIST,
49+
MAX_LIST_STRING_ITEM_SIZE,
50+
MAX_LIST_LENGTH,
51+
),
52+
(
53+
{"type": "array", "items": {"type": "integer"}},
54+
"int_list_col",
55+
ColumnType.INTEGER_LIST,
56+
None,
57+
MAX_LIST_LENGTH,
58+
),
59+
(
60+
{"type": "array", "items": {"type": "boolean"}},
61+
"bool_list_col",
62+
ColumnType.BOOLEAN_LIST,
63+
None,
64+
MAX_LIST_LENGTH,
65+
),
66+
(
67+
{"type": "string"},
68+
"string_col",
69+
ColumnType.MEDIUMTEXT,
70+
None,
71+
None,
72+
),
73+
],
74+
ids=["string_list", "integer_list", "boolean_list", "string"],
75+
)
76+
def test_create_synapse_column_from_js_property(
77+
prop, name, expected_type, expected_max_size, expected_max_list_length
78+
):
79+
result = _create_synapse_column_from_js_property(prop, name)
80+
assert isinstance(result, Column)
81+
assert result.name == name
82+
assert result.column_type == expected_type
83+
assert result.maximum_size == expected_max_size
84+
assert result.maximum_list_length == expected_max_list_length
85+
86+
87+
@pytest.mark.parametrize(
88+
"prop, expected",
89+
[
90+
({"enum": ["a", "b", "c"]}, ColumnType.MEDIUMTEXT),
91+
({"type": "string"}, ColumnType.MEDIUMTEXT),
92+
({"type": "integer"}, ColumnType.INTEGER),
93+
({"type": "number"}, ColumnType.DOUBLE),
94+
({"type": "boolean"}, ColumnType.BOOLEAN),
95+
({"type": ["integer", "null"]}, ColumnType.INTEGER),
96+
({"type": ["integer", "string"]}, ColumnType.MEDIUMTEXT),
97+
({"type": "array", "items": {"type": "integer"}}, ColumnType.INTEGER_LIST),
98+
({"oneOf": [{"type": "integer"}, {"type": "null"}]}, ColumnType.INTEGER),
99+
({"type": "unknown"}, ColumnType.MEDIUMTEXT),
100+
({}, ColumnType.MEDIUMTEXT),
101+
],
102+
ids=[
103+
"enum_property",
104+
"type_string",
105+
"type_integer",
106+
"type_number",
107+
"type_boolean",
108+
"type_list_nullable",
109+
"type_list_multiple_types",
110+
"type_array",
111+
"one_of_list",
112+
"unknown_type",
113+
"empty_property",
114+
],
115+
)
116+
def test_get_column_type_from_js_property(prop, expected):
117+
assert _get_column_type_from_js_property(prop) == expected

0 commit comments

Comments
 (0)