|
6 | 6 | import logging |
7 | 7 |
|
8 | 8 | from django.apps import apps |
9 | | -from django.db.models import Q |
| 9 | +from django.db.models import Q, Exists, OuterRef |
10 | 10 | from django.contrib.contenttypes.models import ContentType |
11 | 11 | from website.settings import CeleryConfig |
12 | 12 | from celery.utils.time import get_exponential_backoff_interval |
@@ -150,11 +150,25 @@ def get_not_indexed_guids_for_resource(resource_type: str): |
150 | 150 | } |
151 | 151 | resource_model, query = resource_mapper.get(resource_type, 'projects') |
152 | 152 | node_type = ContentType.objects.get_for_model(resource_model) |
153 | | - public_node_ids = resource_model.objects.filter(query).values_list('id', flat=True) |
154 | | - return Guid.objects.filter( |
155 | | - Q(has_been_indexed=False) | Q(has_been_indexed__isnull=True), |
| 153 | + # Check if the guid belongs to a public resource |
| 154 | + is_public_resource = resource_model.objects.filter( |
| 155 | + query, |
| 156 | + id=OuterRef('object_id'), |
| 157 | + ) |
| 158 | + # Check if specific resource has any indexed guids |
| 159 | + has_indexed_guid = Guid.objects.filter( |
156 | 160 | content_type=node_type, |
157 | | - object_id__in=public_node_ids, |
| 161 | + object_id=OuterRef('object_id'), |
| 162 | + has_been_indexed=True, |
| 163 | + ) |
| 164 | + return ( |
| 165 | + Guid.objects |
| 166 | + .exclude(Exists(has_indexed_guid)) # exclude guid if its resource has any indexed guid |
| 167 | + .exclude(has_been_indexed=True) # exclude already-indexed guids that belong to other resource types |
| 168 | + .filter(content_type=node_type) |
| 169 | + .filter(Exists(is_public_resource)) # keep guid if the resource is public for specific content_type |
| 170 | + .order_by('object_id', 'id') |
| 171 | + .distinct('object_id') # when a resource has several guids, return only the oldest-created one |
158 | 172 | ) |
159 | 173 |
|
160 | 174 | def pls_send_trove_record(osf_item, *, is_backfill: bool, osfmap_partition: OsfmapPartition): |
|
0 commit comments