Skip to content

Commit bf03d5f

Browse files
authored
Merge pull request #5711 from bjester/incompletely-synced
Add management command for auditing missing sources
2 parents 9cc5466 + 9b79638 commit bf03d5f

6 files changed

Lines changed: 277 additions & 2 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,14 @@ var/
2525

2626
# Ignore editor / IDE related data
2727
.vscode/
28+
.gemini/
2829

2930
# IntelliJ IDE, except project config
3031
.idea/
3132
/*.iml
33+
.junie/
34+
.aiassistant/
35+
.aiignore
3236
# ignore future updates to run configuration
3337
.run/devserver.run.xml
3438

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
import csv
2+
import logging
3+
import time
4+
5+
from django.core.management.base import BaseCommand
6+
from django.db.models import Exists
7+
from django.db.models import FilteredRelation
8+
from django.db.models import OuterRef
9+
from django.db.models import Q
10+
from django.db.models.expressions import F
11+
from django_cte import With
12+
13+
from contentcuration.models import Channel
14+
from contentcuration.models import ContentNode
15+
16+
17+
logger = logging.getLogger(__name__)
18+
19+
20+
class Command(BaseCommand):
    """
    Audits nodes that have imported content from public channels and whether the imported content
    has a missing source node.

    For every private, non-deleted channel containing nodes imported from a public channel,
    each node whose public source channel is marked deleted, or whose source node cannot be
    found in that public channel's main tree, is written as a row to
    `fix_missing_import_sources.csv` (a path relative to the current working directory).

    TODO: this does not yet FIX them
    """

    help = "Audits nodes imported from public channels whose source node is missing."

    def handle(self, *args, **options):
        """
        Finds the private channels that import from public channels, audits each one with
        `handle_channel`, and logs the totals along with the elapsed time.

        :param args: Positional arguments passed to the command, unused
        :param options: Options passed to the command, unused
        """
        start = time.time()

        public_cte = self.get_public_cte()

        # preliminary filter on channels to those private and non-deleted, which have content
        # lft=1 is always true for root nodes, so rght>2 means it actually has children
        private_channels_cte = With(
            Channel.objects.filter(
                public=False,
                deleted=False,
            )
            .annotate(
                non_empty_main_tree=FilteredRelation(
                    "main_tree", condition=Q(main_tree__rght__gt=2)
                ),
            )
            .annotate(
                tree_id=F("non_empty_main_tree__tree_id"),
            )
            .values("id", "name", "tree_id"),
            name="dest_channel_cte",
        )

        # reduce the list of private channels to those that have an imported node
        # from a public channel
        destination_channels = (
            private_channels_cte.queryset()
            .with_cte(public_cte)
            .with_cte(private_channels_cte)
            .filter(
                Exists(
                    public_cte.join(
                        ContentNode.objects.filter(
                            tree_id=OuterRef("tree_id"),
                        ),
                        original_channel_id=public_cte.col.id,
                    )
                )
            )
            .values("id", "name", "tree_id")
            .order_by("id")
        )

        logger.info("=== Iterating over private destination channels. ===")
        channel_count = 0
        total_node_count = 0

        with open("fix_missing_import_sources.csv", "w", newline="") as csv_file:
            csv_writer = csv.DictWriter(
                csv_file,
                fieldnames=[
                    "channel_id",
                    "channel_name",
                    "contentnode_id",
                    "contentnode_title",
                    "public_channel_id",
                    "public_channel_name",
                    "public_channel_deleted",
                ],
            )
            csv_writer.writeheader()

            for channel in destination_channels.iterator():
                node_count = self.handle_channel(csv_writer, channel)

                # only channels that actually produced rows count towards the totals
                if node_count > 0:
                    total_node_count += node_count
                    channel_count += 1

        logger.info("=== Done iterating over private destination channels. ===")
        logger.info(f"Found {total_node_count} nodes across {channel_count} channels.")
        logger.info(f"Finished in {time.time() - start}")

    def get_public_cte(self) -> With:
        """
        Builds the CTE of public channels used to look up import sources.

        :return: A `With` named `public_cte` exposing `id`, `name`, `deleted` and the main
            tree's `tree_id` for every public channel
        """
        # This CTE gets all public channels with their main tree info
        return With(
            Channel.objects.filter(public=True)
            .annotate(
                tree_id=F("main_tree__tree_id"),
            )
            .values("id", "name", "deleted", "tree_id"),
            name="public_cte",
        )

    def handle_channel(self, csv_writer: csv.DictWriter, channel: dict) -> int:
        """
        Writes one CSV row for every node in the channel's tree that was imported from a
        public channel and whose source is missing.

        A node is reported when the public channel it originated from is marked deleted, or
        when no node with its `original_source_node_id` exists in that channel's main tree.

        :param csv_writer: The writer that receives one row per reported node
        :param channel: A dict with the destination channel's `id`, `name` and `tree_id`
        :return: The number of rows written for this channel
        """
        public_cte = self.get_public_cte()
        channel_id = channel["id"]
        channel_name = channel["name"]
        tree_id = channel["tree_id"]

        missing_source_nodes = (
            public_cte.join(
                ContentNode.objects.filter(tree_id=tree_id),
                original_channel_id=public_cte.col.id,
            )
            .with_cte(public_cte)
            .annotate(
                public_channel_id=public_cte.col.id,
                public_channel_name=public_cte.col.name,
                public_channel_deleted=public_cte.col.deleted,
            )
            .filter(
                Q(public_channel_deleted=True)
                | ~Exists(
                    ContentNode.objects.filter(
                        tree_id=public_cte.col.tree_id,
                        node_id=OuterRef("original_source_node_id"),
                    )
                )
            )
            .values(
                "public_channel_id",
                "public_channel_name",
                "public_channel_deleted",
                contentnode_id=F("id"),
                contentnode_title=F("title"),
            )
        )

        # TODO: this will be replaced with logic to correct the missing source nodes
        # Run the query a single time, counting rows as they are streamed, instead of issuing
        # a separate COUNT query and then executing the same join again to fetch the rows
        node_count = 0
        for node_dict in missing_source_nodes.iterator():
            # a new dict per row, so no state is shared between successive rows
            csv_writer.writerow(
                {
                    "channel_id": channel_id,
                    "channel_name": channel_name,
                    **node_dict,
                }
            )
            node_count += 1

        if node_count > 0:
            logger.info(
                f"{channel_id}:{channel_name}\t{node_count} node(s) with missing source nodes."
            )

        return node_count

contentcuration/contentcuration/models.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from django.utils import timezone
4848
from django.utils.translation import gettext as _
4949
from django_cte import CTEManager
50+
from django_cte import CTEQuerySet
5051
from django_cte import With
5152
from le_utils import proquint
5253
from le_utils.constants import content_kinds
@@ -837,7 +838,7 @@ def exists(self, *filters):
837838
return Exists(self.queryset().filter(*filters).values("user_id"))
838839

839840

840-
class ChannelModelQuerySet(models.QuerySet):
841+
class ChannelModelQuerySet(CTEQuerySet):
841842
def create(self, **kwargs):
842843
"""
843844
Create a new object with the given kwargs, saving it to the database
@@ -863,6 +864,12 @@ def update_or_create(self, defaults=None, **kwargs):
863864
return super().update_or_create(defaults, **kwargs)
864865

865866

867+
class ChannelModelManager(models.Manager.from_queryset(ChannelModelQuerySet)):
    """
    Manager for Channel models, generated from `ChannelModelQuerySet` so that the queryset's
    methods, including its CTE support, are available directly on the manager
    """
871+
872+
866873
class Channel(models.Model):
867874
""" Permissions come from association with organizations """
868875

@@ -994,7 +1001,7 @@ class Channel(models.Model):
9941001
]
9951002
)
9961003

997-
objects = ChannelModelQuerySet.as_manager()
1004+
objects = ChannelModelManager()
9981005

9991006
@classmethod
10001007
def get_editable(cls, user, channel_id):

contentcuration/contentcuration/tests/management/__init__.py

Whitespace-only changes.

contentcuration/contentcuration/tests/management/commands/__init__.py

Whitespace-only changes.
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
from unittest.mock import mock_open
2+
from unittest.mock import patch
3+
4+
from django.core.management import call_command
5+
6+
from contentcuration.tests import testdata
7+
from contentcuration.tests.base import StudioTestCase
8+
9+
10+
class CommandTestCase(StudioTestCase):
    """Tests covering the `fix_missing_import_sources` management command"""

    def setUp(self):
        # swap out `open` as seen by the command module, so nothing is written to disk
        patched_open = patch(
            "contentcuration.management.commands.fix_missing_import_sources.open",
            mock_open(),
        )
        self.mock_open = patched_open.start()
        self.addCleanup(patched_open.stop)
        # entering the `with` block should hand back the same mock file object
        self.mock_file = self.mock_open.return_value
        self.mock_file.__enter__.return_value = self.mock_file

        # swap out the CSV writer class, so header and row writes can be inspected
        patched_writer = patch(
            "contentcuration.management.commands.fix_missing_import_sources.csv.DictWriter"
        )
        self.mock_csv_writer = patched_writer.start()
        self.addCleanup(patched_writer.stop)
        self.mock_csv_writer_instance = self.mock_csv_writer.return_value

        public_channel = testdata.channel("Public Channel")
        public_channel.public = True
        public_channel.save()
        self.public_channel = public_channel

        self.private_channel = testdata.channel("Private Channel")

        # see tree.json for this file
        public_descendants = self.public_channel.main_tree.get_descendants()
        self.original_node = public_descendants.filter(
            node_id="00000000000000000000000000000003"
        ).first()
        # import the public node into the private channel's tree
        self.copied_node = self.original_node.copy_to(
            target=self.private_channel.main_tree
        )

    def _expected_row(self, public_channel_deleted):
        """
        :param public_channel_deleted: The expected value of the row's deleted flag
        :return: The row dict expected to be written for the copied node
        """
        return {
            "channel_id": self.private_channel.id,
            "channel_name": self.private_channel.name,
            "contentnode_id": self.copied_node.id,
            "contentnode_title": self.copied_node.title,
            "public_channel_id": self.public_channel.id,
            "public_channel_name": self.public_channel.name,
            "public_channel_deleted": public_channel_deleted,
        }

    def test_handle__opens_csv_file(self):
        call_command("fix_missing_import_sources")

        self.mock_open.assert_called_once_with(
            "fix_missing_import_sources.csv", "w", newline=""
        )

        expected_fieldnames = [
            "channel_id",
            "channel_name",
            "contentnode_id",
            "contentnode_title",
            "public_channel_id",
            "public_channel_name",
            "public_channel_deleted",
        ]
        self.mock_csv_writer.assert_called_once_with(
            self.mock_file,
            fieldnames=expected_fieldnames,
        )

        # the source node still exists, so only the header should have been written
        writer = self.mock_csv_writer_instance
        writer.writeheader.assert_called_once()
        writer.writerow.assert_not_called()

    def test_handle__finds_missing(self):
        self.original_node.delete()
        call_command("fix_missing_import_sources")

        expected = self._expected_row(False)
        self.mock_csv_writer_instance.writerow.assert_called_once_with(expected)

    def test_handle__finds_for_deleted_channel(self):
        self.public_channel.deleted = True
        self.public_channel.save(actor_id=testdata.user().id)
        call_command("fix_missing_import_sources")

        expected = self._expected_row(True)
        self.mock_csv_writer_instance.writerow.assert_called_once_with(expected)

0 commit comments

Comments
 (0)