Skip to content

Commit 18903af

Browse files
Add basic indexing for collections. (#2080)
* Add basic indexing for collections. * Add script to reload collections db. * Update collections db reload script with options. * Add pagination to dynamodb results.
1 parent 8417a46 commit 18903af

13 files changed

Lines changed: 466 additions & 150 deletions

File tree

dss-api.yml

Lines changed: 14 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -997,7 +997,7 @@ paths:
997997
- code
998998
/collections:
999999
get:
1000-
operationId: dss.api.collections.listcollections
1000+
operationId: dss.api.collections.list_collections
10011001
security:
10021002
- dcpAuth: []
10031003
summary: Retrieve a user's collections.
@@ -1013,21 +1013,15 @@ paths:
10131013
10141014
Collections are replicated across storage replicas similarly to files and bundles.
10151015
parameters:
1016-
- name: replica
1017-
in: query
1018-
description: Replica to fetch from.
1019-
required: true
1020-
type: string
1021-
enum: [aws, gcp]
10221016
- name: per_page
10231017
in: query
10241018
description: Max number of results to return per page.
10251019
required: false
10261020
type: integer
10271021
format: int32
1028-
minimum: 50
1029-
maximum: 100
1030-
default: 100
1022+
minimum: 10
1023+
maximum: 500
1024+
default: 500
10311025
- name: start_at
10321026
in: query
10331027
description: >
@@ -1043,7 +1037,7 @@ paths:
10431037
type: object
10441038
properties:
10451039
collections:
1046-
description: A user's collections.
1040+
description: A user's collection UUIDs and versions.
10471041
type: array
10481042
items:
10491043
$ref: '#/definitions/CollectionOfCollectionsItem'
@@ -1053,7 +1047,7 @@ paths:
10531047
type: object
10541048
properties:
10551049
collections:
1056-
description: A user's collections.
1050+
description: A user's collection UUIDs and versions.
10571051
type: array
10581052
items:
10591053
$ref: '#/definitions/CollectionOfCollectionsItem'
@@ -1115,9 +1109,7 @@ paths:
11151109
pattern: "[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}"
11161110
- name: version
11171111
in: query
1118-
description: >
1119-
Timestamp of collection creation in DSS_VERSION format format.
1120-
generated.
1112+
description: Timestamp of collection creation in DSS_VERSION format.
11211113
required: true
11221114
type: string
11231115
format: DSS_VERSION
@@ -1171,8 +1163,7 @@ paths:
11711163
security:
11721164
- dcpAuth: []
11731165
summary: Retrieve a collection given a UUID.
1174-
description: >
1175-
Given a collection UUID, return the associated collection object.
1166+
description: Given a collection UUID, return the associated collection object.
11761167
parameters:
11771168
- name: uuid
11781169
in: path
@@ -1748,7 +1739,7 @@ paths:
17481739
Add or remove files from a bundle. A specific version of the bundle to update must be provided, and a
17491740
new version will be written.
17501741
1751-
Bundles manifests exceeding 20,000 files will not be included in the Elasticsearch index document.
1742+
Bundle manifests exceeding 20,000 files will not be included in the Elasticsearch index document.
17521743
parameters:
17531744
- name: uuid
17541745
in: path
@@ -2456,19 +2447,16 @@ definitions:
24562447
CollectionOfCollectionsItem:
24572448
type: object
24582449
properties:
2459-
collection_uuid:
2450+
uuid:
24602451
type: string
24612452
description: A UUID identifying the collection.
24622453
pattern: "[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12}"
2463-
collection_version:
2464-
type: string
2454+
version:
24652455
description: The version of the UUID identifying the collection.
2466-
collection:
2467-
$ref: '#/definitions/Collection'
2456+
type: string
24682457
required:
2469-
- collection_uuid
2470-
- collection_version
2471-
- collection
2458+
- uuid
2459+
- version
24722460
Collection:
24732461
type: object
24742462
properties:

dss/api/collections.py

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,18 @@
1313
from dss.error import DSSException, dss_handler
1414
from dss.storage.blobstore import test_object_exists
1515
from dss.storage.hcablobstore import BlobStore, compose_blob_key
16-
from dss.storage.identifiers import CollectionFQID, CollectionTombstoneID
16+
from dss.storage.identifiers import CollectionFQID, CollectionTombstoneID, COLLECTION_PREFIX
1717
from dss.util import security, hashabledict, UrlBuilder
1818
from dss.util.version import datetime_to_version_format
1919
from dss.storage.blobstore import idempotent_save
20-
20+
from dss.collections import owner_lookup
2121
from cloud_blobstore import BlobNotFoundError
2222

2323
MAX_METADATA_SIZE = 1024 * 1024
2424

2525
logger = logging.getLogger(__name__)
2626

27+
2728
def get_impl(uuid: str, replica: str, version: str = None):
2829
uuid = uuid.lower()
2930
bucket = Replica[replica].bucket
@@ -46,43 +47,41 @@ def get_impl(uuid: str, replica: str, version: str = None):
4647
raise DSSException(404, "not_found", "Could not find collection for UUID {}".format(uuid))
4748
return json.loads(collection_blob)
4849

49-
def fetch_collections(handle, bucket, collection_keys):
50-
authenticated_user_email = security.get_token_email(request.token_info)
51-
52-
all_collections = []
53-
for key in collection_keys:
54-
uuid, version = key[len('collections/'):].split('.', 1)
55-
assert version != 'dead'
56-
collection = json.loads(handle.get(bucket, key))
57-
if collection['owner'] == authenticated_user_email:
58-
all_collections.append({'collection_uuid': uuid,
59-
'collection_version': version,
60-
'collection': collection})
61-
return all_collections
6250

6351
@dss_handler
6452
@security.authorized_group_required(['hca'])
65-
def listcollections(replica: str, per_page: int, start_at: int = 0):
66-
bucket = Replica[replica].bucket
67-
handle = Config.get_blobstore_handle(Replica[replica])
53+
def list_collections(per_page: int, start_at: int = 0):
54+
"""
55+
Return a list of a user's collections.
56+
57+
Collection FQIDs (uuid and version) are indexed by the owner's email in a DynamoDB table and looked up from there.
58+
59+
:param int per_page: # of collections returned per paged response.
60+
:param int start_at: Where the next chunk of paged response should start at.
61+
:return: A dictionary containing a list of dictionaries looking like:
62+
{'collections': [{'uuid': uuid, 'version': version}, {'uuid': uuid, 'version': version}, ... , ...]}
63+
"""
64+
# TODO: This lookup takes no replica; results come from the DynamoDB owner index regardless of replica. Appropriate?
65+
owner = security.get_token_email(request.token_info)
6866

69-
# expensively list every collection file in the bucket, even those not belonging to the user (possibly 1000's... )
70-
collection_keys = [i for i in handle.list(bucket, prefix='collections') if not i.endswith('dead')]
67+
collections = []
68+
for collection in owner_lookup.get_collection_fqids_for_owner(owner):
69+
fqid = CollectionFQID.from_key(f'{COLLECTION_PREFIX}/{collection}')
70+
collections.append({'uuid': fqid.uuid, 'version': fqid.version})
7171

7272
# paged response
73-
if len(collection_keys) - start_at > per_page:
73+
if len(collections) - start_at > per_page:
7474
next_url = UrlBuilder(request.url)
7575
next_url.replace_query("start_at", str(start_at + per_page))
76-
# each chunk will be searched for collections belonging to that user (even more expensive; per bucket file)
77-
# hits returned will vary between zero and the "per_page" size of the chunk
78-
collections = fetch_collections(handle, bucket, collection_keys[start_at:start_at + per_page])
79-
response = make_response(jsonify({'collections': collections}), requests.codes.partial)
76+
collection_page = collections[start_at:start_at + per_page]
77+
response = make_response(jsonify({'collections': collection_page}), requests.codes.partial)
8078
response.headers['Link'] = f"<{next_url}>; rel='next'"
8179
return response
8280
# single response returning all collections (or those remaining)
8381
else:
84-
collections = fetch_collections(handle, bucket, collection_keys[start_at:])
85-
return jsonify({'collections': collections}), requests.codes.ok
82+
collection_page = collections[start_at:]
83+
return jsonify({'collections': collection_page}), requests.codes.ok
84+
8685

8786
@dss_handler
8887
@security.authorized_group_required(['hca'])
@@ -93,6 +92,7 @@ def get(uuid: str, replica: str, version: str = None):
9392
raise DSSException(requests.codes.forbidden, "forbidden", f"Collection access denied")
9493
return collection_body
9594

95+
9696
@dss_handler
9797
@security.authorized_group_required(['hca'])
9898
def put(json_request_body: dict, replica: str, uuid: str, version: str):
@@ -107,11 +107,16 @@ def put(json_request_body: dict, replica: str, uuid: str, version: str):
107107
timestamp = datetime.datetime.utcnow()
108108
version = datetime_to_version_format(timestamp)
109109
collection_version = version
110+
# update DynamoDB (used to speed up owner lookups); does not overwrite if the owner already has an entry for this collection FQID
111+
owner_lookup.put_collection(owner=authenticated_user_email,
112+
collection_fqid=str(CollectionFQID(collection_uuid, collection_version)))
113+
# add the collection file to the bucket
110114
handle.upload_file_handle(Replica[replica].bucket,
111115
CollectionFQID(collection_uuid, collection_version).to_key(),
112116
io.BytesIO(json.dumps(collection_body).encode("utf-8")))
113117
return jsonify(dict(uuid=collection_uuid, version=collection_version)), requests.codes.created
114118

119+
115120
@dss_handler
116121
@security.authorized_group_required(['hca'])
117122
def patch(uuid: str, json_request_body: dict, replica: str, version: str):
@@ -143,12 +148,14 @@ def patch(uuid: str, json_request_body: dict, replica: str, version: str):
143148
io.BytesIO(json.dumps(collection).encode("utf-8")))
144149
return jsonify(dict(uuid=uuid, version=new_collection_version)), requests.codes.ok
145150

151+
146152
def _dedpuplicate_contents(contents: List) -> List:
147153
dedup_collection: OrderedDict[int, dict] = OrderedDict()
148154
for item in contents:
149155
dedup_collection[hash(tuple(sorted(item.items())))] = item
150156
return list(dedup_collection.values())
151157

158+
152159
@dss_handler
153160
@security.authorized_group_required(['hca'])
154161
def delete(uuid: str, replica: str):
@@ -175,9 +182,11 @@ def delete(uuid: str, replica: str):
175182
f"collection tombstone with UUID {uuid} already exists")
176183
status_code = requests.codes.ok
177184
response_body = dict() # type: dict
178-
185+
# update dynamoDB
186+
owner_lookup.delete_collection_uuid(owner=authenticated_user_email, uuid=uuid)
179187
return jsonify(response_body), status_code
180188

189+
181190
@functools.lru_cache(maxsize=64)
182191
def get_json_metadata(entity_type: str, uuid: str, version: str, replica: Replica, blobstore_handle: BlobStore):
183192
try:
@@ -198,6 +207,7 @@ def get_json_metadata(entity_type: str, uuid: str, version: str, replica: Replic
198207
"invalid_link",
199208
"Could not find file for UUID {}".format(uuid))
200209

210+
201211
def resolve_content_item(replica: Replica, blobstore_handle: BlobStore, item: dict):
202212
try:
203213
if item["type"] in {"file", "bundle", "collection"}:
@@ -221,6 +231,7 @@ def resolve_content_item(replica: Replica, blobstore_handle: BlobStore, item: di
221231
'Error while parsing the link "{}": {}: {}'.format(item, type(e).__name__, e)
222232
)
223233

234+
224235
def verify_collection(contents: List[dict], replica: Replica, blobstore_handle: BlobStore, batch_size=64):
225236
"""
226237
Given user-supplied collection contents that pass schema validation, resolve all entities in the collection and

dss/collections/__init__.py

Whitespace-only changes.

dss/collections/owner_lookup.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import os
2+
from botocore.exceptions import ClientError
3+
4+
from dss import dynamodb # type: ignore
5+
6+
7+
collection_db_table = f"dss-collections-db-{os.environ['DSS_DEPLOYMENT_STAGE']}"
8+
9+
10+
def put_collection(owner: str, collection_fqid: str, permission_level: str = 'owner'):
11+
try:
12+
dynamodb.put_item(table=collection_db_table,
13+
hash_key=owner,
14+
sort_key=collection_fqid,
15+
value=permission_level,
16+
dont_overwrite='sort_key')
17+
except ClientError as e:
18+
if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
19+
raise
20+
21+
22+
def get_collection(owner: str, collection_fqid: str):
23+
return dynamodb.get_item(table=collection_db_table,
24+
hash_key=owner,
25+
sort_key=collection_fqid,
26+
return_key='sort_key')
27+
28+
29+
def get_collection_fqids_for_owner(owner: str):
30+
"""Returns an Iterator of collection FQID strings (the table's sort keys) for the given owner."""
31+
return dynamodb.get_primary_key_items(table=collection_db_table,
32+
key=owner,
33+
return_key='sort_key')
34+
35+
36+
def get_all_collection_keys():
37+
"""Returns an Iterator of (owner, collection FQID) for all items in the collections db table."""
38+
return dynamodb.get_all_table_items(table=collection_db_table, both_keys=True)
39+
40+
41+
def delete_collection(owner: str, collection_fqid: str):
42+
"""Deletes one collection item from a database."""
43+
dynamodb.delete_item(table=collection_db_table,
44+
hash_key=owner,
45+
sort_key=collection_fqid)
46+
47+
48+
def delete_collection_uuid(owner: str, uuid: str):
49+
"""Deletes every entry for the given owner whose collection FQID starts with the given uuid (i.e. all versions)."""
50+
for collection_fqid in get_collection_fqids_for_owner(owner):
51+
if collection_fqid.startswith(uuid):
52+
dynamodb.delete_item(table=collection_db_table,
53+
hash_key=owner,
54+
sort_key=collection_fqid)

0 commit comments

Comments
 (0)