Merge pull request #5711 from bjester/incompletely-synced

bjester · web-flow · commit bf03d5f79e05 · 2026-02-18T09:55:44.000-08:00
Add management command for auditing missing sources
diff --git a/.gitignore b/.gitignore
@@ -25,10 +25,14 @@ var/
 
 # Ignore editor / IDE related data
 .vscode/
+.gemini/
 
 # IntelliJ IDE, except project config
 .idea/
 /*.iml
+.junie/
+.aiassistant/
+.aiignore
 # ignore future updates to run configuration
 .run/devserver.run.xml
 
diff --git a/contentcuration/contentcuration/management/commands/fix_missing_import_sources.py b/contentcuration/contentcuration/management/commands/fix_missing_import_sources.py
@@ -0,0 +1,164 @@
+import csv
+import logging
+import time
+
+from django.core.management.base import BaseCommand
+from django.db.models import Exists
+from django.db.models import FilteredRelation
+from django.db.models import OuterRef
+from django.db.models import Q
+from django.db.models.expressions import F
+from django_cte import With
+
+from contentcuration.models import Channel
+from contentcuration.models import ContentNode
+
+
+logger = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+    """
+    Audits nodes that have imported content from public channels and whether the imported content
+    has a missing source node.
+
+    Results are written to ``fix_missing_import_sources.csv``, a path relative to the current
+    working directory, with one row per affected node.
+
+    TODO: this does not yet FIX them
+    """
+
+    help = (
+        "Audits private channels for nodes imported from public channels whose source node "
+        "is missing, and writes the findings to fix_missing_import_sources.csv"
+    )
+
+    def handle(self, *args, **options):
+        """
+        Write a CSV report of nodes in private channels whose public import source is missing.
+
+        Each candidate private channel is processed by `handle_channel`. The total number of
+        affected nodes and channels, along with the elapsed time, is logged at the end.
+        """
+        # monotonic clock, so the elapsed time is unaffected by system clock adjustments
+        start = time.monotonic()
+
+        public_cte = self.get_public_cte()
+
+        # preliminary filter on channels to those private and non-deleted, which have content
+        # lft=1 is always true for root nodes, so rght>2 means it actually has children
+        private_channels_cte = With(
+            Channel.objects.filter(
+                public=False,
+                deleted=False,
+            )
+            .annotate(
+                non_empty_main_tree=FilteredRelation(
+                    "main_tree", condition=Q(main_tree__rght__gt=2)
+                ),
+            )
+            .annotate(
+                tree_id=F("non_empty_main_tree__tree_id"),
+            )
+            .values("id", "name", "tree_id"),
+            name="dest_channel_cte",
+        )
+
+        # reduce the list of private channels to those that have an imported node
+        # from a public channel
+        destination_channels = (
+            private_channels_cte.queryset()
+            .with_cte(public_cte)
+            .with_cte(private_channels_cte)
+            .filter(
+                Exists(
+                    public_cte.join(
+                        ContentNode.objects.filter(
+                            tree_id=OuterRef("tree_id"),
+                        ),
+                        original_channel_id=public_cte.col.id,
+                    )
+                )
+            )
+            .values("id", "name", "tree_id")
+            .order_by("id")
+        )
+
+        logger.info("=== Iterating over private destination channels. ===")
+        channel_count = 0
+        total_node_count = 0
+
+        with open("fix_missing_import_sources.csv", "w", newline="") as csv_file:
+            csv_writer = csv.DictWriter(
+                csv_file,
+                fieldnames=[
+                    "channel_id",
+                    "channel_name",
+                    "contentnode_id",
+                    "contentnode_title",
+                    "public_channel_id",
+                    "public_channel_name",
+                    "public_channel_deleted",
+                ],
+            )
+            csv_writer.writeheader()
+
+            for channel in destination_channels.iterator():
+                node_count = self.handle_channel(csv_writer, channel)
+
+                # only channels that actually had findings count towards the totals
+                if node_count > 0:
+                    total_node_count += node_count
+                    channel_count += 1
+
+        logger.info("=== Done iterating over private destination channels. ===")
+        logger.info(f"Found {total_node_count} nodes across {channel_count} channels.")
+        logger.info(f"Finished in {time.monotonic() - start}")
+
+    def get_public_cte(self) -> With:
+        """
+        Build the CTE of public channels, including deleted ones.
+
+        :return: A `With` named "public_cte" exposing each public channel's `id`, `name`,
+            `deleted` and the `tree_id` of its main tree
+        """
+        # This CTE gets all public channels with their main tree info
+        return With(
+            Channel.objects.filter(public=True)
+            .annotate(
+                tree_id=F("main_tree__tree_id"),
+            )
+            .values("id", "name", "deleted", "tree_id"),
+            name="public_cte",
+        )
+
+    def handle_channel(self, csv_writer: csv.DictWriter, channel: dict) -> int:
+        """
+        Write one CSV row for each node in the channel whose import source is missing.
+
+        A node is reported when the public channel it was imported from is marked as deleted,
+        or when no node in that public channel's tree has a `node_id` matching the node's
+        `original_source_node_id`.
+
+        :param csv_writer: The writer that receives one row per reported node
+        :param channel: A dict with the `id`, `name` and `tree_id` of the destination channel
+        :return: The number of nodes reported for the channel
+        """
+        public_cte = self.get_public_cte()
+        channel_id = channel["id"]
+        channel_name = channel["name"]
+        tree_id = channel["tree_id"]
+
+        missing_source_nodes = (
+            public_cte.join(
+                ContentNode.objects.filter(tree_id=tree_id),
+                original_channel_id=public_cte.col.id,
+            )
+            .with_cte(public_cte)
+            .annotate(
+                public_channel_id=public_cte.col.id,
+                public_channel_name=public_cte.col.name,
+                public_channel_deleted=public_cte.col.deleted,
+            )
+            .filter(
+                Q(public_channel_deleted=True)
+                | ~Exists(
+                    ContentNode.objects.filter(
+                        tree_id=public_cte.col.tree_id,
+                        node_id=OuterRef("original_source_node_id"),
+                    )
+                )
+            )
+            .values(
+                "public_channel_id",
+                "public_channel_name",
+                "public_channel_deleted",
+                contentnode_id=F("id"),
+                contentnode_title=F("title"),
+            )
+        )
+
+        # columns shared by every row written for this channel
+        channel_columns = {
+            "channel_id": channel_id,
+            "channel_name": channel_name,
+        }
+
+        # TODO: this will be replaced with logic to correct the missing source nodes
+        # Stream the results in a single pass, counting as we go, instead of issuing a
+        # separate COUNT query before running the same query again for its rows
+        node_count = 0
+        for node_dict in missing_source_nodes.iterator():
+            # build a new dict per row so that rows never share mutable state
+            csv_writer.writerow({**channel_columns, **node_dict})
+            node_count += 1
+
+        if node_count > 0:
+            logger.info(
+                f"{channel_id}:{channel_name}\t{node_count} node(s) with missing source nodes."
+            )
+
+        return node_count
diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py
@@ -47,6 +47,7 @@
 from django.utils import timezone
 from django.utils.translation import gettext as _
 from django_cte import CTEManager
+from django_cte import CTEQuerySet
 from django_cte import With
 from le_utils import proquint
 from le_utils.constants import content_kinds
@@ -837,7 +838,7 @@ def exists(self, *filters):
         return Exists(self.queryset().filter(*filters).values("user_id"))
 
 
-class ChannelModelQuerySet(models.QuerySet):
+class ChannelModelQuerySet(CTEQuerySet):
     def create(self, **kwargs):
         """
         Create a new object with the given kwargs, saving it to the database
@@ -863,6 +864,12 @@ def update_or_create(self, defaults=None, **kwargs):
         return super().update_or_create(defaults, **kwargs)
 
 
+class ChannelModelManager(models.Manager.from_queryset(ChannelModelQuerySet)):
+    """
+    Custom Channel models manager with CTE support
+
+    Built from `ChannelModelQuerySet`, so that queryset's methods are available
+    directly on the manager.
+    """
+
+
 class Channel(models.Model):
     """ Permissions come from association with organizations """
 
@@ -994,7 +1001,7 @@ class Channel(models.Model):
         ]
     )
 
-    objects = ChannelModelQuerySet.as_manager()
+    objects = ChannelModelManager()
 
     @classmethod
     def get_editable(cls, user, channel_id):
diff --git a/contentcuration/contentcuration/tests/management/__init__.py b/contentcuration/contentcuration/tests/management/__init__.py
diff --git a/contentcuration/contentcuration/tests/management/commands/__init__.py b/contentcuration/contentcuration/tests/management/commands/__init__.py
diff --git a/contentcuration/contentcuration/tests/management/commands/test_fix_missing_import_sources.py b/contentcuration/contentcuration/tests/management/commands/test_fix_missing_import_sources.py
@@ -0,0 +1,100 @@
+from unittest.mock import mock_open
+from unittest.mock import patch
+
+from django.core.management import call_command
+
+from contentcuration.tests import testdata
+from contentcuration.tests.base import StudioTestCase
+
+
+class CommandTestCase(StudioTestCase):
+    """Test suite for the fix_missing_import_sources management command"""
+
+    def setUp(self):
+        # Replace `open` within the command module, so no file is written during the tests
+        patched_open = patch(
+            "contentcuration.management.commands.fix_missing_import_sources.open",
+            mock_open(),
+        )
+        self.mock_open = patched_open.start()
+        self.addCleanup(patched_open.stop)
+        self.mock_file = self.mock_open.return_value
+        self.mock_file.__enter__.return_value = self.mock_file
+
+        # Replace the CSV writer, so the rows passed to it can be inspected
+        patched_writer = patch(
+            "contentcuration.management.commands.fix_missing_import_sources.csv.DictWriter"
+        )
+        self.mock_csv_writer = patched_writer.start()
+        self.addCleanup(patched_writer.stop)
+        self.mock_csv_writer_instance = self.mock_csv_writer.return_value
+
+        public_channel = testdata.channel("Public Channel")
+        public_channel.public = True
+        public_channel.save()
+        self.public_channel = public_channel
+
+        self.private_channel = testdata.channel("Private Channel")
+
+        # see tree.json for this file
+        public_descendants = self.public_channel.main_tree.get_descendants()
+        self.original_node = public_descendants.filter(
+            node_id="00000000000000000000000000000003"
+        ).first()
+        # import the public node into the private channel
+        self.copied_node = self.original_node.copy_to(
+            target=self.private_channel.main_tree
+        )
+
+    def _expected_row(self, public_channel_deleted):
+        """Build the CSV row expected for the copied node in the private channel"""
+        return {
+            "channel_id": self.private_channel.id,
+            "channel_name": self.private_channel.name,
+            "contentnode_id": self.copied_node.id,
+            "contentnode_title": self.copied_node.title,
+            "public_channel_id": self.public_channel.id,
+            "public_channel_name": self.public_channel.name,
+            "public_channel_deleted": public_channel_deleted,
+        }
+
+    def test_handle__opens_csv_file(self):
+        call_command("fix_missing_import_sources")
+
+        self.mock_open.assert_called_once_with(
+            "fix_missing_import_sources.csv", "w", newline=""
+        )
+
+        expected_fieldnames = [
+            "channel_id",
+            "channel_name",
+            "contentnode_id",
+            "contentnode_title",
+            "public_channel_id",
+            "public_channel_name",
+            "public_channel_deleted",
+        ]
+        self.mock_csv_writer.assert_called_once_with(
+            self.mock_file,
+            fieldnames=expected_fieldnames,
+        )
+
+        # the source node still exists, so only the header should be written
+        writer = self.mock_csv_writer_instance
+        writer.writeheader.assert_called_once()
+        writer.writerow.assert_not_called()
+
+    def test_handle__finds_missing(self):
+        self.original_node.delete()
+        call_command("fix_missing_import_sources")
+
+        self.mock_csv_writer_instance.writerow.assert_called_once_with(
+            self._expected_row(False)
+        )
+
+    def test_handle__finds_for_deleted_channel(self):
+        self.public_channel.deleted = True
+        self.public_channel.save(actor_id=testdata.user().id)
+        call_command("fix_missing_import_sources")
+
+        self.mock_csv_writer_instance.writerow.assert_called_once_with(
+            self._expected_row(True)
+        )