66import logging
77
88from django .apps import apps
9- from django .db .models import Q
9+ from django .db .models import Q , Exists , OuterRef
1010from django .contrib .contenttypes .models import ContentType
1111from website .settings import CeleryConfig
1212from celery .utils .time import get_exponential_backoff_interval
@@ -133,14 +133,14 @@ def task__update_share(self, guid: str, is_backfill=False, osfmap_partition_name
133133
134134@celery_app .task
135135def task__reindex_resource_into_share (resource_type : str , limit : int ):
136- guids = get_not_indexed_guids_for_resource (resource_type ).values_list ('_id' , flat = True )[:limit ].iterator ()
136+ guids = get_not_indexed_guids_for_resource_with_no_indexed_guid (resource_type ).values_list ('_id' , flat = True )[:limit ].iterator ()
137137 for guid in guids :
138138 task__update_share .apply_async (
139139 kwargs = {'guid' : guid , 'is_backfill' : True },
140140 queue = CeleryConfig .task_low_queue ,
141141 )
142142
143- def get_not_indexed_guids_for_resource (resource_type : str ):
143+ def get_not_indexed_guids_for_resource_with_no_indexed_guid (resource_type : str ):
144144 from osf .models import Guid , Registration , Preprint , Node , OSFUser
145145 resource_mapper = {
146146 'projects' : (Node , Q (is_public = True ) & Q (deleted__isnull = True )),
@@ -150,11 +150,25 @@ def get_not_indexed_guids_for_resource(resource_type: str):
150150 }
151151 resource_model , query = resource_mapper .get (resource_type , 'projects' )
152152 node_type = ContentType .objects .get_for_model (resource_model )
153- public_node_ids = resource_model .objects .filter (query ).values_list ('id' , flat = True )
154- return Guid .objects .filter (
155- Q (has_been_indexed = False ) | Q (has_been_indexed__isnull = True ),
153+ # Check if the guid belongs to a public resource
154+ is_public_resource = resource_model .objects .filter (
155+ query ,
156+ id = OuterRef ('object_id' ),
157+ )
158+ # Check if the specific resource has any indexed guids
159+ has_indexed_guid = Guid .objects .filter (
156160 content_type = node_type ,
157- object_id__in = public_node_ids ,
161+ object_id = OuterRef ('object_id' ),
162+ has_been_indexed = True ,
163+ )
164+ return (
165+ Guid .objects
166+ .exclude (Exists (has_indexed_guid )) # exclude guid if its resource has any indexed guid
167+ .exclude (has_been_indexed = True ) # exclude already-indexed guids that belong to other resource types
168+ .filter (content_type = node_type )
169+ .filter (Exists (is_public_resource )) # keep guid if the resource is public for specific content_type
170+ .order_by ('object_id' , 'id' )
171+ .distinct ('object_id' ) # return the oldest created guid from several
158172 )
159173
160174def pls_send_trove_record (osf_item , * , is_backfill : bool , osfmap_partition : OsfmapPartition ):
0 commit comments