Skip to content

Commit a69141f

Browse files
Improve bulk import flow for hierarchy content regardless of concept import order (#852)
* fix(core/importers): build hierarchy after inline bulk import * Track parent_concept_urls for non-delete Concept imports to record child-to-parent relationships before finalization * Invert child-to-parent map and call make_hierarchy to attach newly imported concepts under their parents * fix(importers): rebuild concept hierarchy after bulk import * Capture child-to-parent relationships before clearing input list to preserve hierarchy data during import lifecycle * Invert saved map after processing and call make_hierarchy to restore parent links for imported concepts * fix(importers): normalize bulk import hierarchy URIs and guard reconciliation by edit access * Encode concept_id when building child URIs to match stored concept URI format * Skip hierarchy reconciliation for concepts without edit permissions to prevent unintended updates * fix(importers): reconcile bulk import concept hierarchies after import * Collect concept parent links during parallel bulk import execution for deferred reconciliation * Apply hierarchy mapping after all import tasks complete to ensure correct parent-child ordering * Remove inline hierarchy reconciliation from BulkImportInline to avoid partial updates * Add regression tests for child-before-parent ordering, inaccessible concepts, non-concept records, and URI encoding
1 parent 034f224 commit a69141f

3 files changed

Lines changed: 284 additions & 1 deletion

File tree

core/common/utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,11 @@ def queue_bulk_import( # pylint: disable=too-many-arguments
439439
task_func = bulk_import_parallel_inline
440440
args = (to_import, username, update_if_exists, threads)
441441
else:
442+
# TODO: bulk_import_inline is unreachable from the current view layer —
443+
# import_response always passes `parallel_threads = request.data.get('parallel') or 5`
444+
# which is always truthy, so the `elif threads` branch above is always taken.
445+
# Consider removing bulk_import_inline and this branch once confirmed no external
446+
# callers rely on it (check API consumers and the test suite first).
442447
task_func = bulk_import_inline
443448
else:
444449
task_func = bulk_import

core/importers/models.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from core.collections.models import Collection
1616
from core.common.constants import HEAD
1717
from core.common.tasks import bulk_import_parts_inline, delete_organization, batch_index_resources, \
18-
post_import_update_resource_counts
18+
post_import_update_resource_counts, make_hierarchy
1919
from core.common.utils import drop_version, is_url_encoded_string, encode_string, to_parent_uri, chunks
2020
from core.concepts.models import Concept
2121
from core.mappings.models import Mapping
@@ -968,11 +968,13 @@ def __init__(
968968
self.parts = deque([])
969969
self.result = None
970970
self._json_result = None
971+
self.concept_hierarchy_map = {} # child_uri -> [parent_uris], built before input_list is cleared
971972
if self.content:
972973
self.input_list = self.content if isinstance(self.content, list) else self.content.splitlines()
973974
self.total = len(self.input_list)
974975
self.make_resource_distribution()
975976
self.make_parts()
977+
self.collect_concept_hierarchy_map()
976978
self.content = None # memory optimization
977979
self.input_list = [] # memory optimization
978980

@@ -1022,6 +1024,25 @@ def make_parts(self):
10221024
self.parts[-1].append(line)
10231025
prev_line = line
10241026

1027+
def collect_concept_hierarchy_map(self):
1028+
for data in self.input_list:
1029+
line = data if isinstance(data, dict) else json.loads(data)
1030+
if line.get('type', '').lower() != 'concept':
1031+
continue
1032+
parent_urls = line.get('parent_concept_urls') or []
1033+
concept_id = line.get('id')
1034+
owner = line.get('owner')
1035+
source = line.get('source')
1036+
if parent_urls and concept_id and owner and source:
1037+
owner_type = line.get('owner_type', '').lower()
1038+
owner_prefix = 'users' if owner_type in ['user', 'users'] else 'orgs'
1039+
# P2: normalize concept_id the same way ConceptImporter.parse() does,
1040+
# so the URI matches what was actually persisted in the database.
1041+
if not is_url_encoded_string(concept_id):
1042+
concept_id = encode_string(concept_id)
1043+
child_uri = f'/{owner_prefix}/{owner}/sources/{source}/concepts/{concept_id}/'
1044+
self.concept_hierarchy_map[child_uri] = parent_urls
1045+
10251046
@staticmethod
10261047
def chunker_list(seq, size, is_child): # pylint: disable=too-many-locals
10271048
"""
@@ -1127,6 +1148,30 @@ def run(self):
11271148
self.resource_wise_time[part_type] = 0
11281149
self.resource_wise_time[part_type] += round(time.time() - start_time, 4)
11291150

1151+
if self.concept_hierarchy_map:
1152+
# P1: restrict reconciliation to concepts the importing user can actually edit,
1153+
# mirroring the has_edit_access guard in ConceptImporter.process(). This excludes
1154+
# PERMISSION_DENIED rows and prevents hierarchy changes on sources the user does
1155+
# not own, even when those concepts already exist in the database.
1156+
user = UserProfile.objects.filter(username=self.username).first()
1157+
accessible_uris = set(
1158+
concept.uri
1159+
for concept in Concept.objects.filter(
1160+
uri__in=self.concept_hierarchy_map.keys(), id=F('versioned_object_id')
1161+
).select_related('parent')
1162+
if concept.parent.has_edit_access(user)
1163+
)
1164+
inverted = {}
1165+
for child_uri, parent_uris in self.concept_hierarchy_map.items():
1166+
if child_uri not in accessible_uris:
1167+
continue
1168+
for parent_uri in parent_uris:
1169+
if parent_uri not in inverted:
1170+
inverted[parent_uri] = []
1171+
inverted[parent_uri].append(child_uri)
1172+
if inverted:
1173+
make_hierarchy(inverted)
1174+
11301175
post_import_update_resource_counts.apply_async(queue='default', permanent=False)
11311176

11321177
self.update_elapsed_seconds()

core/importers/tests.py

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1264,6 +1264,239 @@ def test_import_subtask_multiple_resource_per_file(self):
12641264
'OCL', 'Concept', [{'start_index': 5, 'end_index': 10}]).run()
12651265
self.assertEqual(concept_result, [1, 1, 1, 1, 1])
12661266

1267+
# ── collect_concept_hierarchy_map ────────────────────────────────────────
1268+
1269+
def test_collect_concept_hierarchy_map_basic(self):
1270+
"""Concepts with parent_concept_urls are collected; those without are ignored."""
1271+
content = '\n'.join([
1272+
json.dumps({
1273+
"type": "Concept", "id": "ChildConcept",
1274+
"owner": "TestOrg", "owner_type": "Organization", "source": "TestSource",
1275+
"parent_concept_urls": ["/orgs/TestOrg/sources/TestSource/concepts/ParentConcept/"]
1276+
}),
1277+
json.dumps({
1278+
"type": "Concept", "id": "ParentConcept",
1279+
"owner": "TestOrg", "owner_type": "Organization", "source": "TestSource",
1280+
}),
1281+
])
1282+
importer = BulkImportParallelRunner(content, 'ocladmin', True)
1283+
1284+
self.assertEqual(
1285+
importer.concept_hierarchy_map,
1286+
{
1287+
'/orgs/TestOrg/sources/TestSource/concepts/ChildConcept/':
1288+
['/orgs/TestOrg/sources/TestSource/concepts/ParentConcept/']
1289+
}
1290+
)
1291+
1292+
def test_collect_concept_hierarchy_map_encodes_special_chars(self):
1293+
"""Concept IDs with special characters (e.g. &) are URL-encoded to match persisted URIs (P2)."""
1294+
content = json.dumps({
1295+
"type": "Concept", "id": "1A40.0&XA8UM1",
1296+
"owner": "OpenMRS-OCL-Squad", "owner_type": "Organization", "source": "ICD-11-CIEL-Bridge",
1297+
"parent_concept_urls": ["/orgs/OpenMRS-OCL-Squad/sources/ICD-11-CIEL-Bridge/concepts/BlockL1-1A0/"]
1298+
})
1299+
importer = BulkImportParallelRunner(content, 'ocladmin', True)
1300+
1301+
encoded_uri = '/orgs/OpenMRS-OCL-Squad/sources/ICD-11-CIEL-Bridge/concepts/1A40.0%26XA8UM1/'
1302+
raw_uri = '/orgs/OpenMRS-OCL-Squad/sources/ICD-11-CIEL-Bridge/concepts/1A40.0&XA8UM1/'
1303+
1304+
self.assertIn(encoded_uri, importer.concept_hierarchy_map)
1305+
self.assertNotIn(raw_uri, importer.concept_hierarchy_map)
1306+
1307+
def test_collect_concept_hierarchy_map_user_owner_type(self):
1308+
"""Concepts owned by a User (not an Org) use /users/ prefix in their URI."""
1309+
content = json.dumps({
1310+
"type": "Concept", "id": "MyConcept",
1311+
"owner": "johndoe", "owner_type": "User", "source": "MySource",
1312+
"parent_concept_urls": ["/users/johndoe/sources/MySource/concepts/RootConcept/"]
1313+
})
1314+
importer = BulkImportParallelRunner(content, 'ocladmin', True)
1315+
1316+
self.assertIn('/users/johndoe/sources/MySource/concepts/MyConcept/', importer.concept_hierarchy_map)
1317+
1318+
def test_collect_concept_hierarchy_map_ignores_non_concepts(self):
1319+
"""Only Concept lines are indexed; Source, Mapping, and Reference lines are ignored."""
1320+
content = '\n'.join([
1321+
json.dumps({"type": "Source", "id": "S1", "owner": "O", "owner_type": "Organization",
1322+
"parent_concept_urls": ["/orgs/O/sources/S1/concepts/Root/"]}),
1323+
json.dumps({"type": "Mapping", "id": "M1", "owner": "O", "owner_type": "Organization",
1324+
"source": "S1",
1325+
"parent_concept_urls": ["/orgs/O/sources/S1/concepts/Root/"]}),
1326+
json.dumps({"type": "Concept", "id": "C1", "owner": "O", "owner_type": "Organization",
1327+
"source": "S1"}),
1328+
])
1329+
importer = BulkImportParallelRunner(content, 'ocladmin', True)
1330+
1331+
self.assertEqual(importer.concept_hierarchy_map, {})
1332+
1333+
# ── run() hierarchy reconciliation ───────────────────────────────────────
1334+
1335+
@patch('core.importers.models.make_hierarchy')
1336+
@patch('core.importers.models.BulkImportParallelRunner.wait_till_tasks_alive')
1337+
@patch('core.importers.models.BulkImportParallelRunner.queue_tasks')
1338+
def test_run_calls_make_hierarchy_with_inverted_map(self, queue_tasks_mock, wait_mock, make_hierarchy_mock):
1339+
"""After all chunks complete, make_hierarchy receives the inverted {parent_uri: [child_uris]} map."""
1340+
org = OrganizationFactory(mnemonic='TestOrg')
1341+
source = OrganizationSourceFactory(organization=org, mnemonic='TestSource', version='HEAD')
1342+
parent = ConceptFactory(parent=source, mnemonic='ParentConcept')
1343+
child = ConceptFactory(parent=source, mnemonic='ChildConcept')
1344+
1345+
# File order: child before parent (the exact scenario the fix targets)
1346+
content = '\n'.join([
1347+
json.dumps({
1348+
"type": "Concept", "id": child.mnemonic,
1349+
"owner": "TestOrg", "owner_type": "Organization", "source": "TestSource",
1350+
"parent_concept_urls": [parent.uri]
1351+
}),
1352+
json.dumps({
1353+
"type": "Concept", "id": parent.mnemonic,
1354+
"owner": "TestOrg", "owner_type": "Organization", "source": "TestSource",
1355+
}),
1356+
])
1357+
1358+
importer = BulkImportParallelRunner(content, 'ocladmin', True)
1359+
importer.run()
1360+
1361+
make_hierarchy_mock.assert_called_once()
1362+
inverted = make_hierarchy_mock.call_args[0][0]
1363+
self.assertIn(parent.uri, inverted)
1364+
self.assertIn(child.uri, inverted[parent.uri])
1365+
1366+
@patch('core.importers.models.make_hierarchy')
1367+
@patch('core.importers.models.BulkImportParallelRunner.wait_till_tasks_alive')
1368+
@patch('core.importers.models.BulkImportParallelRunner.queue_tasks')
1369+
def test_run_skips_make_hierarchy_when_no_hierarchy(self, queue_tasks_mock, wait_mock, make_hierarchy_mock):
1370+
"""make_hierarchy is not called when no concept in the import has parent_concept_urls."""
1371+
content = '\n'.join([
1372+
json.dumps({
1373+
"type": "Concept", "id": "StandaloneConcept",
1374+
"owner": "TestOrg", "owner_type": "Organization", "source": "TestSource",
1375+
}),
1376+
])
1377+
importer = BulkImportParallelRunner(content, 'ocladmin', True)
1378+
importer.run()
1379+
1380+
make_hierarchy_mock.assert_not_called()
1381+
1382+
@patch('core.importers.models.BulkImportParallelRunner.wait_till_tasks_alive')
1383+
@patch('core.importers.models.BulkImportParallelRunner.queue_tasks')
1384+
def test_run_hierarchy_child_before_parent(self, queue_tasks_mock, wait_mock):
1385+
"""
1386+
End-to-end: when child appears before parent in the import file, the reconciliation
1387+
step must correctly establish the parent_concepts M2M link in the database.
1388+
This is the primary bug scenario fixed by this PR.
1389+
"""
1390+
org = OrganizationFactory(mnemonic='TestOrg')
1391+
source = OrganizationSourceFactory(organization=org, mnemonic='TestSource', version='HEAD')
1392+
parent = ConceptFactory(parent=source, mnemonic='ParentConcept')
1393+
child = ConceptFactory(parent=source, mnemonic='ChildConcept')
1394+
1395+
# child listed before parent — the exact ordering that broke hierarchy before the fix
1396+
content = '\n'.join([
1397+
json.dumps({
1398+
"type": "Concept", "id": child.mnemonic,
1399+
"owner": "TestOrg", "owner_type": "Organization", "source": "TestSource",
1400+
"parent_concept_urls": [parent.uri]
1401+
}),
1402+
json.dumps({
1403+
"type": "Concept", "id": parent.mnemonic,
1404+
"owner": "TestOrg", "owner_type": "Organization", "source": "TestSource",
1405+
}),
1406+
])
1407+
1408+
importer = BulkImportParallelRunner(content, 'ocladmin', True)
1409+
importer.run()
1410+
1411+
child.refresh_from_db()
1412+
self.assertIn(parent.get_latest_version(), child.parent_concepts.all())
1413+
1414+
@patch('core.importers.models.BulkImportParallelRunner.wait_till_tasks_alive')
1415+
@patch('core.importers.models.BulkImportParallelRunner.queue_tasks')
1416+
def test_run_hierarchy_parent_before_child(self, queue_tasks_mock, wait_mock):
1417+
"""
1418+
End-to-end: when parent appears before child (natural order), the reconciliation
1419+
must also establish the link correctly — confirming the fix is order-agnostic.
1420+
"""
1421+
org = OrganizationFactory(mnemonic='TestOrg')
1422+
source = OrganizationSourceFactory(organization=org, mnemonic='TestSource', version='HEAD')
1423+
parent = ConceptFactory(parent=source, mnemonic='ParentConcept')
1424+
child = ConceptFactory(parent=source, mnemonic='ChildConcept')
1425+
1426+
# parent listed before child — normal/expected order
1427+
content = '\n'.join([
1428+
json.dumps({
1429+
"type": "Concept", "id": parent.mnemonic,
1430+
"owner": "TestOrg", "owner_type": "Organization", "source": "TestSource",
1431+
}),
1432+
json.dumps({
1433+
"type": "Concept", "id": child.mnemonic,
1434+
"owner": "TestOrg", "owner_type": "Organization", "source": "TestSource",
1435+
"parent_concept_urls": [parent.uri]
1436+
}),
1437+
])
1438+
1439+
importer = BulkImportParallelRunner(content, 'ocladmin', True)
1440+
importer.run()
1441+
1442+
child.refresh_from_db()
1443+
self.assertIn(parent.get_latest_version(), child.parent_concepts.all())
1444+
1445+
@patch('core.importers.models.make_hierarchy')
1446+
@patch('core.importers.models.BulkImportParallelRunner.wait_till_tasks_alive')
1447+
@patch('core.importers.models.BulkImportParallelRunner.queue_tasks')
1448+
def test_run_excludes_inaccessible_concepts(self, queue_tasks_mock, wait_mock, make_hierarchy_mock):
1449+
"""
1450+
Concepts from sources the importing user cannot edit must be excluded from make_hierarchy,
1451+
even when they exist in the database. This prevents hierarchy changes on foreign sources
1452+
that were denied during import (P1 security fix).
1453+
"""
1454+
# OrgA — importing user is a member → has edit access
1455+
org_a = OrganizationFactory(mnemonic='OrgA')
1456+
source_a = OrganizationSourceFactory(organization=org_a, mnemonic='SourceA', version='HEAD')
1457+
parent_a = ConceptFactory(parent=source_a, mnemonic='ParentA')
1458+
child_a = ConceptFactory(parent=source_a, mnemonic='ChildA')
1459+
1460+
# OrgB — importing user is NOT a member and source is not publicly editable → no edit access
1461+
org_b = OrganizationFactory(mnemonic='OrgB')
1462+
source_b = OrganizationSourceFactory(organization=org_b, mnemonic='SourceB', version='HEAD', public_access='None')
1463+
parent_b = ConceptFactory(parent=source_b, mnemonic='ParentB')
1464+
child_b = ConceptFactory(parent=source_b, mnemonic='ChildB')
1465+
1466+
importing_user = UserProfileFactory(username='importer-user')
1467+
org_a.members.add(importing_user)
1468+
self.assertFalse(org_b.is_member(importing_user))
1469+
1470+
content = '\n'.join([
1471+
# accessible concept (OrgA)
1472+
json.dumps({
1473+
"type": "Concept", "id": child_a.mnemonic,
1474+
"owner": "OrgA", "owner_type": "Organization", "source": "SourceA",
1475+
"parent_concept_urls": [parent_a.uri]
1476+
}),
1477+
# inaccessible concept (OrgB — user has no permission)
1478+
json.dumps({
1479+
"type": "Concept", "id": child_b.mnemonic,
1480+
"owner": "OrgB", "owner_type": "Organization", "source": "SourceB",
1481+
"parent_concept_urls": [parent_b.uri]
1482+
}),
1483+
])
1484+
1485+
importer = BulkImportParallelRunner(content, importing_user.username, True)
1486+
importer.run()
1487+
1488+
make_hierarchy_mock.assert_called_once()
1489+
inverted = make_hierarchy_mock.call_args[0][0]
1490+
1491+
# OrgA child must be linked to its parent
1492+
self.assertIn(parent_a.uri, inverted)
1493+
self.assertIn(child_a.uri, inverted[parent_a.uri])
1494+
1495+
# OrgB child must NOT appear — user has no access to SourceB
1496+
self.assertNotIn(parent_b.uri, inverted)
1497+
all_children = [uri for uris in inverted.values() for uri in uris]
1498+
self.assertNotIn(child_b.uri, all_children)
1499+
12671500

12681501
class BulkImportViewTest(OCLAPITestCase):
12691502
def setUp(self):

0 commit comments

Comments
 (0)