
Commit abbd62b

paynejd and claude authored
Speed up source version creation pipeline (#833)
* Speed up source version creation pipeline

- Fix N+1 queries in ES indexing by adding prefetch_related to batch_index() for source/expansion concept and mapping indexing tasks
- Replace Paginator with manual slicing in batch_index() to avoid a COUNT(*) query
- Optimize export serialization: increase batch size 100->1000, use a single file handle, eliminate redundant .exists() checks, add prefetch on mappings
- Replace M2M .set() with bulk_create on through tables in seed_concepts() and seed_mappings() for faster version seeding
- Move snapshot serialization and checksum computation from the synchronous persist_new_version() to the async seed_children_to_new_version task

Benchmarked with PIH (8,427 concepts + 45,089 mappings):
- Seeding: ~2s (was minutes with .set())
- Export: ~4.5min with 10x larger batches
- API response: instant (snapshot/checksum now async)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Use prefetch cache in ES document prepare methods

Change values_list() calls on prefetched relations (names, descriptions, sources) to iterate .all() instead. values_list() bypasses Django's prefetch cache and hits the DB, negating the prefetch_related added in the previous commit. This eliminates thousands of redundant queries during concept and mapping indexing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d8e540c commit abbd62b

6 files changed

Lines changed: 119 additions & 95 deletions
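For readers outside the codebase, the N+1 pattern the first commit removes looks roughly like this. This is a minimal Django ORM sketch: the model and relation names mirror the real ones, but the filter (parent_id) and the loop are illustrative, not the actual indexing code.

# Without prefetching, building one ES document per concept issues extra
# queries for every related table it touches:
for concept in Concept.objects.filter(parent_id=source_id):          # 1 query
    locales = [n.locale for n in concept.names.all()]                # +1 query per concept
    versions = [s.version for s in concept.sources.all()]            # +1 query per concept

# With prefetch_related, the related rows arrive in a few IN (...) queries
# and are cached on each instance, so the loop body no longer hits the DB:
queryset = Concept.objects.filter(parent_id=source_id).prefetch_related('names', 'sources')
for concept in queryset:
    locales = [n.locale for n in concept.names.all()]                # served from the prefetch cache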


core/common/models.py

Lines changed: 11 additions & 12 deletions
@@ -5,7 +5,6 @@
 from django.contrib.postgres.fields import ArrayField
 from django.core.cache import cache
 from django.core.exceptions import ValidationError
-from django.core.paginator import Paginator
 from django.core.validators import RegexValidator
 from django.db import models, IntegrityError, transaction
 from django.db.models import Value, Q, Count
@@ -200,16 +199,22 @@ def get_exact_or_criteria(attr, values, decode=False):
         return criteria

     @staticmethod
-    def batch_index(queryset, document, single_batch=False):
+    def batch_index(queryset, document, single_batch=False, prefetch=None):
         if not get(settings, 'TEST_MODE'):
             doc = document()
+            if prefetch:
+                queryset = queryset.prefetch_related(*prefetch)
             if single_batch:
                 doc.update(queryset.all(), parallel=True)
             else:
-                paginator = Paginator(queryset.order_by('-id'), 500)
-                for page_number in paginator.page_range:
-                    page = paginator.page(page_number)
-                    doc.update(page.object_list, parallel=True)
+                batch_size = 500
+                start = 0
+                while True:
+                    batch = list(queryset.order_by('-id')[start:start + batch_size])
+                    if not batch:
+                        break
+                    doc.update(batch, parallel=True)
+                    start += batch_size

     @staticmethod
     @transaction.atomic
@@ -720,9 +725,6 @@ def persist_new(cls, obj, created_by, **kwargs):

     @classmethod
     def persist_new_version(cls, obj, user=None, **kwargs):
-        from core.collections.serializers import CollectionDetailSerializer
-        from core.sources.serializers import SourceDetailSerializer
-
         errors = {}

         obj.is_active = True
@@ -731,17 +733,14 @@ def persist_new_version(cls, obj, user=None, **kwargs):
         obj.created_by = user
         obj.updated_by = user
         repo_resource_name = obj.__class__.__name__
-        serializer = SourceDetailSerializer if repo_resource_name == 'Source' else CollectionDetailSerializer
         head = obj.head
         if not head:
             errors[repo_resource_name.lower()] = 'Version Head not found.'
             return errors
-        obj.snapshot = serializer(head).data
         obj.update_version_data(head)
         obj.save(**kwargs)

         if obj.id:
-            obj.get_checksums(recalculate=True)
             obj.sibling_versions.update(is_latest_version=False)

         is_test_mode = get(settings, 'TEST_MODE', False)
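Dropping Paginator matters because Django's Paginator runs a COUNT(*) over the whole queryset up front just to compute page_range; the replacement loop simply slices with LIMIT/OFFSET until a slice comes back empty. A standalone sketch of the new batching shape (generic Django, not the exact model method above):

def iter_batches(queryset, batch_size=500):
    """Yield batches ordered by -id using LIMIT/OFFSET slices, with no COUNT(*) query."""
    start = 0
    while True:
        batch = list(queryset.order_by('-id')[start:start + batch_size])  # one LIMIT/OFFSET query
        if not batch:
            break
        yield batch
        start += batch_size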

core/common/tasks.py

Lines changed: 29 additions & 4 deletions
@@ -393,6 +393,17 @@ def seed_children_to_new_version(self, resource, obj_id, export=True, sync=False
     task_id = self.request.id
     try:
         instance.add_processing(task_id)
+        # Compute snapshot and checksums async (moved from persist_new_version for faster HTTP response)
+        head = instance.head
+        if head:
+            if is_source:
+                from core.sources.serializers import SourceDetailSerializer
+                instance.snapshot = SourceDetailSerializer(head).data
+            elif is_collection:
+                from core.collections.serializers import CollectionDetailSerializer
+                instance.snapshot = CollectionDetailSerializer(head).data
+            instance.save(update_fields=['snapshot'])
+            instance.get_checksums(recalculate=True)
         instance.seed_references()
         if is_source:
             instance.seed_concepts(index=False)
@@ -569,7 +580,11 @@ def index_expansion_concepts(expansion_id, count=None, concept_versioned_ids=None
             queryset = Concept.objects.filter(versioned_object_id__in=concept_versioned_ids)
         else:
             queryset = expansion.concepts
-        expansion.batch_index(queryset, ConceptDocument)
+        expansion.batch_index(
+            queryset, ConceptDocument,
+            prefetch=['sources', 'names', 'descriptions',
+                      'expansion_set', 'expansion_set__collection_version']
+        )


 @app.task(
@@ -586,7 +601,10 @@ def index_expansion_mappings(expansion_id, count=None, mapping_versioned_ids=None
             queryset = Mapping.objects.filter(versioned_object_id__in=mapping_versioned_ids)
         else:
             queryset = expansion.mappings
-        expansion.batch_index(queryset, MappingDocument)
+        expansion.batch_index(
+            queryset, MappingDocument,
+            prefetch=['sources', 'expansion_set', 'expansion_set__collection_version']
+        )


 @app.task
@@ -621,7 +639,11 @@ def index_source_concepts(source_id):
     source = Source.objects.filter(id=source_id).first()
     if source:
         from core.concepts.documents import ConceptDocument
-        source.batch_index(source.concepts, ConceptDocument)
+        source.batch_index(
+            source.concepts, ConceptDocument,
+            prefetch=['sources', 'names', 'descriptions',
+                      'expansion_set', 'expansion_set__collection_version']
+        )


 @app.task(
@@ -633,7 +655,10 @@ def index_source_mappings(source_id):
     source = Source.objects.filter(id=source_id).first()
     if source:
         from core.mappings.documents import MappingDocument
-        source.batch_index(source.mappings, MappingDocument)
+        source.batch_index(
+            source.mappings, MappingDocument,
+            prefetch=['sources', 'expansion_set', 'expansion_set__collection_version']
+        )


 @app.task(base=QueueOnceCustomTask)
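The seed_children_to_new_version change is what makes the version-creation API call return quickly: persist_new_version() now only saves the version row, and the head snapshot plus checksum recalculation run inside the Celery task. Condensed from the two hunks above (source case shown; the collection branch is analogous, and surrounding plumbing is omitted):

# Request path (persist_new_version): no snapshot or checksum work inline.
obj.update_version_data(head)
obj.save(**kwargs)

# Background path (seed_children_to_new_version task): build the snapshot from
# the head version and recalculate checksums before seeding concepts/mappings.
head = instance.head
if head:
    from core.sources.serializers import SourceDetailSerializer
    instance.snapshot = SourceDetailSerializer(head).data
    instance.save(update_fields=['snapshot'])
    instance.get_checksums(recalculate=True)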

core/common/utils.py

Lines changed: 59 additions & 68 deletions
@@ -213,7 +213,7 @@ def write_export_file(
     resource_string = json.dumps(data, cls=encoders.JSONEncoder)
     logger.info('Done serializing attributes.')

-    batch_size = 100
+    batch_size = 1000
     is_collection = resource_type == 'collection'

     concepts_qs = Concept.objects.none()
@@ -236,97 +236,88 @@ def write_export_file(
     if version.is_head:
         filters['is_latest_version'] = True

+    resource_name = resource_type.title()
+
     with open('export.json', 'w') as out:
         out.write(f'{resource_string[:-1]}, "concepts": [')

-    resource_name = resource_type.title()
-
-    if concepts_qs.exists():
-        logger.info(f'{resource_name} has concepts. Getting them in batches of {batch_size:d}...')
         concept_serializer_class = get_class('core.concepts.serializers.ConceptVersionExportSerializer')
+        written_concepts = False
         start = 0
-        end = batch_size
-        batch_queryset = concepts_qs.order_by('-concept_id')[start:end]
-
-        while batch_queryset.exists():
-            logger.info(f'Serializing concepts {start + 1:d} - {end:d}...')
+        while True:
+            batch_ids = list(
+                concepts_qs.order_by('-concept_id')[start:start + batch_size].values_list('concept_id', flat=True)
+            )
+            if not batch_ids:
+                break
+            logger.info(f'Serializing concepts {start + 1:d} - {start + len(batch_ids):d}...')
             queryset = Concept.objects.filter(
-                id__in=batch_queryset.values_list('concept_id')).filter(**filters).order_by('-id')
-            if queryset.exists():
-                if start > 0:
-                    with open('export.json', 'a') as out:
-                        out.write(', ')
-                concept_versions = queryset.prefetch_related('names', 'descriptions')
+                id__in=batch_ids).filter(**filters).prefetch_related('names', 'descriptions').order_by('-id')
+            concept_versions = list(queryset)
+            if concept_versions:
+                if written_concepts:
+                    out.write(', ')
                 data = concept_serializer_class(concept_versions, many=True).data
                 concept_string = json.dumps(data, cls=encoders.JSONEncoder)
-                concept_string = concept_string[1:-1]
-
-                with open('export.json', 'a') as out:
-                    out.write(concept_string)
-
+                out.write(concept_string[1:-1])
+                written_concepts = True
             start += batch_size
-            end += batch_size
-            batch_queryset = concepts_qs.order_by('-concept_id')[start:end]

-        logger.info('Done serializing concepts.')
+        if written_concepts:
+            logger.info('Done serializing concepts.')

-    if is_collection:
-        references_qs = version.references
-        total_references = references_qs.count()
+        if is_collection:
+            references_qs = version.references
+            total_references = references_qs.count()

-        with open('export.json', 'a') as out:
             out.write('], "references": [')
-            if total_references:
-                logger.info(
-                    f'{resource_name} has {total_references:d} references. Getting them in batches of {batch_size:d}...'
-                )
-                reference_serializer_class = get_class('core.collections.serializers.CollectionReferenceDetailSerializer')
-                for start in range(0, total_references, batch_size):
-                    end = min(start + batch_size, total_references)
-                    logger.info(f'Serializing references {start + 1:d} - {end:d}...')
-                    references = references_qs.order_by('-id').filter()[start:end]
-                    reference_serializer = reference_serializer_class(references, many=True)
-                    reference_string = json.dumps(reference_serializer.data, cls=encoders.JSONEncoder)
-                    reference_string = reference_string[1:-1]
-                    with open('export.json', 'a') as out:
-                        out.write(reference_string)
-                    if end != total_references:
+            if total_references:
+                logger.info(
+                    f'{resource_name} has {total_references:d} references. '
+                    f'Getting them in batches of {batch_size:d}...'
+                )
+                reference_serializer_class = get_class(
+                    'core.collections.serializers.CollectionReferenceDetailSerializer')
+                for ref_start in range(0, total_references, batch_size):
+                    ref_end = min(ref_start + batch_size, total_references)
+                    logger.info(f'Serializing references {ref_start + 1:d} - {ref_end:d}...')
+                    references = references_qs.order_by('-id').filter()[ref_start:ref_end]
+                    reference_serializer = reference_serializer_class(references, many=True)
+                    reference_string = json.dumps(reference_serializer.data, cls=encoders.JSONEncoder)
+                    out.write(reference_string[1:-1])
+                    if ref_end != total_references:
                         out.write(', ')
-                logger.info('Done serializing references.')
+                logger.info('Done serializing references.')

-    with open('export.json', 'a') as out:
         out.write('], "mappings": [')

-    if mappings_qs.exists():
-        logger.info(f'{resource_name} has mappings. Getting them in batches of {batch_size:d}...')
         mapping_serializer_class = get_class('core.mappings.serializers.MappingDetailSerializer')
+        written_mappings = False
         start = 0
-        end = batch_size
-        batch_queryset = mappings_qs.order_by('-mapping_id')[start:end]
-
-        while batch_queryset.exists():
-            logger.info(f'Serializing mappings {start + 1:d} - {start + batch_size:d}...')
+        while True:
+            batch_ids = list(
+                mappings_qs.order_by('-mapping_id')[start:start + batch_size].values_list('mapping_id', flat=True)
+            )
+            if not batch_ids:
+                break
+            logger.info(f'Serializing mappings {start + 1:d} - {start + len(batch_ids):d}...')
             queryset = Mapping.objects.filter(
-                id__in=batch_queryset.values_list('mapping_id')).filter(**filters).order_by('-id')
-            if queryset.exists():
-                if start > 0:
-                    with open('export.json', 'a') as out:
-                        out.write(', ')
-
-                data = mapping_serializer_class(queryset, many=True).data
+                id__in=batch_ids).filter(**filters).prefetch_related(
+                'from_concept', 'to_concept', 'from_source', 'to_source').order_by('-id')
+            mapping_versions = list(queryset)
+            if mapping_versions:
+                if written_mappings:
+                    out.write(', ')
+                data = mapping_serializer_class(mapping_versions, many=True).data
                 mapping_string = json.dumps(data, cls=encoders.JSONEncoder)
-                mapping_string = mapping_string[1:-1]
-                with open('export.json', 'a') as out:
-                    out.write(mapping_string)
-
+                out.write(mapping_string[1:-1])
+                written_mappings = True
             start += batch_size
-            end += batch_size
-            batch_queryset = mappings_qs.order_by('-mapping_id')[start:end]

-        logger.info('Done serializing mappings.')
+        if written_mappings:
+            logger.info('Done serializing mappings.')

-    end_time = str(round((time.time() - start_time) + 2, 2))
-    with open('export.json', 'a') as out:
+        end_time = str(round((time.time() - start_time) + 2, 2))
         out.write('], "export_time": ' + json.dumps(f"{end_time}secs", cls=encoders.JSONEncoder) + '}')

     version.update_extras('__export_time', end_time)
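The export rewrite keeps a single file handle open for the whole document and tracks whether a batch has been written yet, instead of reopening export.json in append mode for every chunk; it also pages by ID lists rather than re-evaluating .exists() on sliced querysets. The skeleton of the new concept loop, simplified: concepts_qs, batch_size, serializer_class, and the **filters step are assumed to be set up as in the function above.

import json

with open('export.json', 'w') as out:
    out.write('{"concepts": [')
    written = False
    start = 0
    while True:
        # Grab just the IDs for this slice, then load full rows with prefetching.
        batch_ids = list(
            concepts_qs.order_by('-concept_id')[start:start + batch_size]
            .values_list('concept_id', flat=True)
        )
        if not batch_ids:
            break
        rows = list(Concept.objects.filter(id__in=batch_ids).prefetch_related('names', 'descriptions'))
        if rows:
            if written:
                out.write(', ')                    # comma only between non-empty batches
            chunk = json.dumps(serializer_class(rows, many=True).data)
            out.write(chunk[1:-1])                 # strip the surrounding [ ]
            written = True
        start += batch_size
    out.write(']}')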

core/concepts/documents.py

Lines changed: 7 additions & 7 deletions
@@ -165,11 +165,11 @@ def prepare_numeric_id(instance):

     @staticmethod
     def prepare_locale(instance):
-        return compact(set(instance.names.values_list('locale', flat=True)))
+        return compact(set(n.locale for n in instance.names.all()))

     @staticmethod
     def prepare_source_version(instance):
-        return list(instance.sources.values_list('version', flat=True))
+        return [s.version for s in instance.sources.all()]

     @staticmethod
     def prepare_collection_version(instance):
@@ -218,15 +218,15 @@ def prepare_properties(instance):

     @staticmethod
     def prepare_name_types(instance):
-        return compact(set(instance.names.values_list('type', flat=True)))
+        return compact(set(n.type for n in instance.names.all()))

     @staticmethod
     def prepare_description_types(instance):
-        return compact(set(instance.descriptions.values_list('type', flat=True)))
+        return compact(set(d.type for d in instance.descriptions.all()))

     @staticmethod
     def prepare_description(instance):
-        return '. '.join(compact(set(instance.descriptions.values_list('name', flat=True))))
+        return '. '.join(compact(set(d.name for d in instance.descriptions.all())))

     def prepare(self, instance):
         data = super().prepare(instance)
@@ -239,8 +239,8 @@ def prepare(self, instance):
         name = get(preferred_locale, 'name') or ''
         data['_name'] = name.lower()
         data['name'] = name.replace('-', '_')
-        synonyms = instance.names.exclude(name=name).exclude(name='')
-        data['synonyms'] = compact(set(synonyms.values_list('name', flat=True)))
+        synonyms = [n for n in instance.names.all() if n.name and n.name != name]
+        data['synonyms'] = compact(set(n.name for n in synonyms))
         data['_synonyms'] = data['synonyms']

         if instance.parent.has_semantic_match_algorithm:
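These prepare_* changes are the second commit's point: values_list() always compiles a fresh query, even on a relation that prefetch_related() has already loaded, whereas iterating .all() on the related manager is answered from the prefetch cache. A minimal illustration (concept_id is a placeholder):

concept = Concept.objects.prefetch_related('names').get(id=concept_id)

# Hits the database again, ignoring the prefetched rows:
locales = list(concept.names.values_list('locale', flat=True))

# Reuses the rows loaded by prefetch_related(); no additional query:
locales = [n.locale for n in concept.names.all()]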

core/mappings/documents.py

Lines changed: 1 addition & 1 deletion
@@ -106,7 +106,7 @@ def prepare_to_concept(instance):

     @staticmethod
     def prepare_source_version(instance):
-        return list(instance.sources.values_list('version', flat=True))
+        return [s.version for s in instance.sources.all()]

     @staticmethod
     def prepare_collection_version(instance):

core/sources/models.py

Lines changed: 12 additions & 3 deletions
@@ -449,18 +449,27 @@ def index_resources_for_self_as_unreleased(self):
         latest_released.index_children_async(user)

     def seed_concepts(self, index=True):
+        from core.concepts.models import Concept
         head = self.head
         if head:
-            concepts = head.concepts.filter(is_latest_version=True)
-            self.concepts.set(concepts)
+            through_model = Concept.sources.through
+            through_model.objects.filter(source_id=self.id).delete()
+            concept_ids = list(head.concepts.filter(is_latest_version=True).values_list('id', flat=True))
+            through_objects = [through_model(source_id=self.id, concept_id=cid) for cid in concept_ids]
+            through_model.objects.bulk_create(through_objects, batch_size=5000)
         if index:
             from core.concepts.documents import ConceptDocument
             self.batch_index(self.concepts, ConceptDocument)

     def seed_mappings(self, index=True):
+        from core.mappings.models import Mapping
         head = self.head
         if head:
-            self.mappings.set(head.mappings.filter(is_latest_version=True))
+            through_model = Mapping.sources.through
+            through_model.objects.filter(source_id=self.id).delete()
+            mapping_ids = list(head.mappings.filter(is_latest_version=True).values_list('id', flat=True))
+            through_objects = [through_model(source_id=self.id, mapping_id=mid) for mid in mapping_ids]
+            through_model.objects.bulk_create(through_objects, batch_size=5000)
         if index:
             from core.mappings.documents import MappingDocument
             self.batch_index(self.mappings, MappingDocument)
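The reason .set() was slow here is that it has to evaluate the existing relation, diff it against the new ID list, and route every change through the ORM's m2m add/remove machinery and m2m_changed signals; writing the join rows directly needs only a DELETE plus a few multi-row INSERTs. The core of the new seeding, isolated from the method above (Concept.sources.through is Django's auto-generated join model for the M2M field):

through_model = Concept.sources.through                        # join model between Concept and Source
through_model.objects.filter(source_id=self.id).delete()       # clear this version's existing links
concept_ids = head.concepts.filter(is_latest_version=True).values_list('id', flat=True)
through_model.objects.bulk_create(
    [through_model(source_id=self.id, concept_id=cid) for cid in concept_ids],
    batch_size=5000,                                            # one INSERT per 5,000 rows
)

One trade-off of bypassing .set() is that m2m_changed signals are not emitted for these rows, which appears acceptable here since seeding is followed by explicit batch indexing.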
