-
Notifications
You must be signed in to change notification settings - Fork 74
Expand file tree
/
Copy pathactivity.py
More file actions
356 lines (280 loc) · 13 KB
/
activity.py
File metadata and controls
356 lines (280 loc) · 13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
"""
# Provenance
The Activity object represents the source of a data set or the data processing steps used to produce it. Using
[W3C provenance ontology](http://www.w3.org/2011/prov/wiki/Main_Page) terms, a result is **generated by** a
combination of data and code which are either **used** or **executed**.
## Imports
from synapseclient import Activity
## Creating an activity object
act = Activity(name='clustering',
description='whizzy clustering',
used=['syn1234','syn1235'],
executed='syn4567')
Here, syn1234 and syn1235 might be two types of measurements on a common set of samples. Some whizzy clustering code
might be referred to by syn4567. The used and executed can reference entities in Synapse or URLs.
Alternatively, you can build an activity up piecemeal:
act = Activity(name='clustering', description='whizzy clustering')
act.used(['syn12345', 'syn12346'])
act.executed(
'https://raw.githubusercontent.com/Sage-Bionetworks/synapsePythonClient/develop/tests/unit/unit_test_client.py')
## Storing entities with provenance
The activity can be passed in when storing an Entity to set the Entity's provenance:
clustered_samples = syn.store(clustered_samples, activity=act)
We've now recorded that `clustered_samples` is the output of our whizzy clustering algorithm applied to the data stored
in syn1234 and syn1235.
## Recording data source
The [synapseclient.Synapse.store][] method has shortcuts for specifying the used and executed lists directly.
For example, when storing a data entity, it's a good idea to record its source:
excellent_data = syn.store(excellent_data,
activityName='data-r-us',
activityDescription='downloaded from data-r-us',
used='http://data-r-us.com/excellent/data.xyz')
"""
import collections.abc
from synapseclient.core.exceptions import SynapseError, SynapseMalformedEntityError
from synapseclient.core.utils import get_synid_and_version, is_synapse_id_str, is_url
from synapseclient.entity import is_synapse_entity
def is_used_entity(x) -> bool:
    """
    Check whether an object has the shape of a UsedEntity dictionary.

    A UsedEntity is a mapping with a 'reference' field that is itself a mapping
    containing a 'targetId' field. The only keys permitted at the top level are
    'reference', 'wasExecuted' and 'concreteType'; the only keys permitted inside
    'reference' are 'targetId' and 'targetVersionNumber'.

    Parameters:
        x: Any object.

    Returns:
        True if the given object represents a UsedEntity.
    """
    if not isinstance(x, collections.abc.Mapping) or "reference" not in x:
        return False

    reference = x["reference"]
    # A malformed 'reference' (None, a string, a number, ...) simply is not a
    # UsedEntity. Without this guard the membership test below would raise on
    # None or silently do a substring match on a string.
    if (
        not isinstance(reference, collections.abc.Mapping)
        or "targetId" not in reference
    ):
        return False

    # Must only have three keys
    if any(key not in ("reference", "wasExecuted", "concreteType") for key in x):
        return False

    # 'reference' field can only have two keys
    return all(key in ("targetId", "targetVersionNumber") for key in reference)
def is_used_url(x) -> bool:
    """
    Check whether an object has the shape of a UsedURL dictionary.

    A UsedURL is a mapping that contains a 'url' key and no keys other than
    'url', 'name', 'wasExecuted' and 'concreteType'.

    Returns:
        True if the given object represents a UsedURL.
    """
    permitted = ("url", "name", "wasExecuted", "concreteType")

    # Anything that is not a mapping, or lacks 'url', cannot be a UsedURL
    if not isinstance(x, collections.abc.Mapping):
        return False
    if "url" not in x:
        return False

    # Reject the object as soon as one unexpected key shows up
    for field in x.keys():
        if field not in permitted:
            return False
    return True
def _get_any_bad_args(badargs, dictionary):
"""Returns the intersection of 'badargs' and the non-Null keys of 'dictionary'."""
return list(
illegal
for illegal in badargs
if illegal in dictionary and dictionary[illegal] is not None
)
def _raise_incorrect_used_usage(badargs, message):
"""Raises an informative exception about Activity.used()."""
if any(badargs):
raise SynapseMalformedEntityError(
"The parameter%s '%s' %s not allowed in combination with a %s."
% (
"s" if len(badargs) > 1 else "",
badargs,
"are" if len(badargs) > 1 else "is",
message,
)
)
class Activity(dict):
    """
    Represents the provenance of a Synapse Entity.

    The activity is a dictionary whose 'used' entry is a list of UsedEntity and
    UsedURL dictionaries; each of them carries a 'wasExecuted' flag that separates
    executed code from data that was merely used.

    Parameters:
        name: Name of the Activity
        description: A short text description of the Activity
        used: Either a list of:

            - [reference objects](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/Reference.html) (e.g. [{'targetId':'syn123456', 'targetVersionNumber':1}])
            - a list of Synapse Entities or Entity IDs
            - a list of URLs
        executed: A code resource that was executed to generate the Entity.
        data: A dictionary representation of an Activity, with fields 'name', 'description' and 'used' (a list of reference objects)

    See also: The [W3C's provenance ontology](http://www.w3.org/TR/prov-o/)
    """

    # TODO: make constructors from JSON consistent across objects
    def __init__(
        self, name=None, description=None, used=None, executed=None, data=None
    ):
        """
        Build the activity from 'data', then apply the explicit arguments on top.

        'name' and 'description' override the corresponding entries of 'data';
        'used' and 'executed' are appended to the 'used' list through
        [synapseclient.activity.Activity.used][] and
        [synapseclient.activity.Activity.executed][].
        """
        # None instead of a mutable `{}` default: no object is shared between calls
        super(Activity, self).__init__({} if data is None else data)

        recorded = self.get("used")
        if recorded is None:
            # Missing (or explicitly null) list of resources
            self["used"] = []
        elif isinstance(recorded, list):
            # Own a copy of the list so that later calls to used() do not
            # modify the dictionary the caller passed in as 'data'
            self["used"] = list(recorded)

        if name is not None:
            self["name"] = name
        if description is not None:
            self["description"] = description
        if used is not None:
            self.used(used)
        if executed is not None:
            self.executed(executed)

    def used(
        self, target=None, targetVersion=None, wasExecuted=None, url=None, name=None
    ):
        """
        Add a resource used by the activity.

        This method tries to be as permissive as possible. It accepts a string which might be a synapse ID or a URL,
        a synapse entity, a UsedEntity or UsedURL dictionary or a list containing any combination of these.

        In addition, named parameters can be used to specify the fields of either a UsedEntity or a UsedURL.
        If target and optionally targetVersion are specified, create a UsedEntity.
        If url and optionally name are specified, create a UsedURL.

        It is an error to specify both target/targetVersion parameters and url/name parameters in the same call.
        To add multiple UsedEntities and UsedURLs, make a separate call for each or pass in a list.

        In case of conflicting settings for wasExecuted both inside an object and with a parameter, the parameter wins.
        For example, this UsedURL will have wasExecuted set to False:

            activity.used({'url':'http://google.com', 'name':'Goog', 'wasExecuted':True}, wasExecuted=False)

        A UsedEntity or UsedURL dictionary passed as target is stored as is (not copied) and gets its
        'concreteType' and 'wasExecuted' fields filled in.

        Entity examples:

            activity.used('syn12345')
            activity.used(entity)
            activity.used(target=entity, targetVersion=2)
            activity.used(codeEntity, wasExecuted=True)
            activity.used({'reference':{'targetId':'syn12345', 'targetVersionNumber':1}, 'wasExecuted':False})

        URL examples:

            activity.used('http://mydomain.com/my/awesome/data.RData')
            activity.used(url='http://mydomain.com/my/awesome/data.RData', name='Awesome Data')
            activity.used(url='https://github.com/joe_hacker/code_repo', name='Gnarly hacks', wasExecuted=True)
            activity.used({'url':'https://github.com/joe_hacker/code_repo', 'name':'Gnarly hacks'}, wasExecuted=True)

        List example:

            activity.used(['syn12345', 'syn23456', entity,
                           {'reference':{'targetId':'syn100009', 'targetVersionNumber':2}, 'wasExecuted':True},
                           'http://mydomain.com/my/awesome/data.RData'])

        Raises:
            SynapseMalformedEntityError: If parameters that do not belong together are combined.
            ValueError: If a string target is neither a URL nor a valid Synapse ID, or if two
                different versions are given for the same Synapse ID.
            SynapseError: If the parameters do not describe any known kind of resource.
        """
        # Explicit snapshot of the mutually exclusive arguments (instead of
        # locals()), taken before any of them is reassigned below
        supplied = {
            "target": target,
            "targetVersion": targetVersion,
            "url": url,
            "name": name,
        }

        # -- A list of targets
        if isinstance(target, list):
            badargs = _get_any_bad_args(["targetVersion", "url", "name"], supplied)
            _raise_incorrect_used_usage(badargs, "list of used resources")

            for item in target:
                self.used(item, wasExecuted=wasExecuted)
            return

        # -- UsedEntity
        elif is_used_entity(target):
            badargs = _get_any_bad_args(["targetVersion", "url", "name"], supplied)
            _raise_incorrect_used_usage(
                badargs, "dictionary representing a used resource"
            )

            resource = target
            if "concreteType" not in resource:
                resource["concreteType"] = (
                    "org.sagebionetworks.repo.model.provenance.UsedEntity"
                )

        # -- Used URL
        elif is_used_url(target):
            badargs = _get_any_bad_args(["targetVersion", "url", "name"], supplied)
            _raise_incorrect_used_usage(badargs, "URL")

            resource = target
            if "concreteType" not in resource:
                resource["concreteType"] = (
                    "org.sagebionetworks.repo.model.provenance.UsedURL"
                )

        # -- Synapse Entity
        elif is_synapse_entity(target):
            badargs = _get_any_bad_args(["url", "name"], supplied)
            _raise_incorrect_used_usage(badargs, "Synapse entity")

            reference = {"targetId": target["id"]}
            if "versionNumber" in target:
                reference["targetVersionNumber"] = target["versionNumber"]
            # An explicit targetVersion wins over the entity's own version
            if targetVersion:
                reference["targetVersionNumber"] = int(targetVersion)
            resource = {
                "reference": reference,
                "concreteType": "org.sagebionetworks.repo.model.provenance.UsedEntity",
            }

        # -- URL parameter
        elif url:
            badargs = _get_any_bad_args(["target", "targetVersion"], supplied)
            _raise_incorrect_used_usage(badargs, "URL")

            resource = {
                "url": url,
                # target is None once the check above has passed, so the name
                # stays None when no name was given
                "name": name if name else target,
                "concreteType": "org.sagebionetworks.repo.model.provenance.UsedURL",
            }

        # -- URL as a string
        elif is_url(target):
            badargs = _get_any_bad_args(["targetVersion"], supplied)
            _raise_incorrect_used_usage(badargs, "URL")

            resource = {
                "url": target,
                "name": name if name else target,
                "concreteType": "org.sagebionetworks.repo.model.provenance.UsedURL",
            }

        # -- Synapse Entity ID (assuming the string is an ID)
        elif isinstance(target, str):
            badargs = _get_any_bad_args(["url", "name"], supplied)
            _raise_incorrect_used_usage(badargs, "Synapse entity")

            if not is_synapse_id_str(target):
                raise ValueError("%s is not a valid Synapse id" % target)

            # Handle Synapse IDs of the form syn234.4
            synid, version = get_synid_and_version(target)
            if version:
                if targetVersion and int(targetVersion) != int(version):
                    raise ValueError(
                        "Two conflicting versions for %s were specified" % target
                    )
                targetVersion = int(version)

            reference = {"targetId": synid}
            if targetVersion:
                reference["targetVersionNumber"] = int(targetVersion)
            resource = {
                "reference": reference,
                "concreteType": "org.sagebionetworks.repo.model.provenance.UsedEntity",
            }

        else:
            raise SynapseError("Unexpected parameters in call to Activity.used().")

        # Set wasExecuted
        if wasExecuted is None:
            # Default to False
            if "wasExecuted" not in resource:
                resource["wasExecuted"] = False
        else:
            # wasExecuted parameter overrides setting in an object
            resource["wasExecuted"] = wasExecuted

        # Add the used resource to the activity
        self["used"].append(resource)

    def executed(self, target=None, targetVersion=None, url=None, name=None):
        """
        Add a code resource that was executed during the activity.

        Forwards all arguments to used() with wasExecuted=True.
        See [synapseclient.activity.Activity.used][]
        """
        self.used(
            target=target,
            targetVersion=targetVersion,
            url=url,
            name=name,
            wasExecuted=True,
        )

    def _getStringList(self, wasExecuted=True):
        """
        List, as display strings, the resources whose 'wasExecuted' flag equals the given value.

        A URL resource is shown by its name, or by its URL when it has no name.
        Any other resource is shown as its 'targetId', followed by '.<version>'
        when the reference has a 'targetVersionNumber'.
        """
        usedList = []
        for source in self["used"]:
            # A resource without a 'wasExecuted' flag counts as not executed
            if source.get("wasExecuted", False) != wasExecuted:
                continue

            if source["concreteType"].endswith("UsedURL"):
                usedList.append(source.get("name") or source.get("url"))
            else:  # It is an entity for now
                reference = source["reference"]
                label = reference["targetId"]
                if "targetVersionNumber" in reference:
                    label += ".%i" % reference["targetVersionNumber"]
                usedList.append(label)
        return usedList

    def _getExecutedStringList(self):
        """Display strings of the resources flagged as executed."""
        return self._getStringList(wasExecuted=True)

    def _getUsedStringList(self):
        """Display strings of the resources that were used but not executed."""
        return self._getStringList(wasExecuted=False)

    def __str__(self):
        """
        Summarize the activity: its name, then the executed and the used resources, one per line.
        """
        executed_lines = self._getExecutedStringList()

        text = "%s\n Executed:\n" % self.get("name", "")
        if executed_lines:
            # Terminate the last executed entry so it does not run into the
            # ' Used:' header on the same line
            text += "\n".join(executed_lines) + "\n"
        text += " Used:\n"
        text += "\n".join(self._getUsedStringList())
        return text