Skip to content

Commit 4cf7090

Browse files
authored
Merge pull request #21 from TogetherCrew/fix/mediawiki-activities-wrong-arg
fix: mediawiki data transformation and loading to use mediawiki_platf…
2 parents c03a3e2 + bd71452 commit 4cf7090

2 files changed

Lines changed: 13 additions & 6 deletions

File tree

hivemind_etl/mediawiki/activities.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,11 +72,15 @@ async def extract_mediawiki(mediawiki_platform: dict[str, Any]) -> None:
7272

7373

7474
@activity.defn
75-
async def transform_mediawiki_data(community_id: str) -> list[Document]:
75+
async def transform_mediawiki_data(mediawiki_platform: dict[str, Any]) -> list[Document]:
7676
"""Transform the extracted MediaWiki data."""
77+
78+
community_id = mediawiki_platform["community_id"]
7779
try:
80+
namespaces = mediawiki_platform["namespaces"]
81+
7882
logging.info(f"Starting transformation for community {community_id}")
79-
mediawiki_etl = MediawikiETL(community_id=community_id)
83+
mediawiki_etl = MediawikiETL(community_id=community_id, namespaces=namespaces)
8084
result = mediawiki_etl.transform()
8185
logging.info(f"Completed transformation for community {community_id}")
8286
return result
@@ -86,9 +90,12 @@ async def transform_mediawiki_data(community_id: str) -> list[Document]:
8690

8791

8892
@activity.defn
89-
async def load_mediawiki_data(documents: list[Document], community_id: str) -> None:
93+
async def load_mediawiki_data(mediawiki_platform: dict[str, Any]) -> None:
9094
"""Load the transformed MediaWiki data into the database."""
95+
community_id = mediawiki_platform["community_id"]
9196
try:
97+
documents = mediawiki_platform["documents"]
98+
9299
logging.info(f"Starting data load for community {community_id}")
93100
mediawiki_etl = MediawikiETL(community_id=community_id)
94101
mediawiki_etl.load(documents=documents)

hivemind_etl/mediawiki/workflows.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,19 +59,19 @@ async def run(self, platform_id: str | None = None) -> None:
5959
# Transform the extracted data
6060
documents = await workflow.execute_activity(
6161
transform_mediawiki_data,
62-
platform["community_id"],
62+
mediawiki_platform,
6363
start_to_close_timeout=timedelta(minutes=30),
6464
retry_policy=RetryPolicy(
6565
initial_interval=timedelta(minutes=1),
6666
maximum_attempts=3,
6767
),
6868
)
6969

70+
mediawiki_platform["documents"] = documents
7071
# Load the transformed data
7172
await workflow.execute_activity(
7273
load_mediawiki_data,
73-
documents,
74-
platform["community_id"],
74+
mediawiki_platform,
7575
start_to_close_timeout=timedelta(minutes=30),
7676
retry_policy=RetryPolicy(
7777
initial_interval=timedelta(minutes=1),

0 commit comments

Comments
 (0)