Skip to content

Commit f4bdeb4

Browse files
committed
feat: disable caching in CustomIngestionPipeline across multiple modules!
- Updated the CustomIngestionPipeline instantiation in etl.py, pipeline.py, website_etl.py, and activities.py to set use_cache=False, ensuring that caching is disabled during document ingestion.
1 parent d61e8d4 commit f4bdeb4

4 files changed

Lines changed: 6 additions & 2 deletions

File tree

hivemind_etl/mediawiki/etl.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@ def transform(self) -> list[Document]:
9999
def load(self, documents: list[Document]) -> None:
100100
logging.info(f"Loading {len(documents)} documents into Qdrant!")
101101
ingestion_pipeline = CustomIngestionPipeline(
102-
self.community_id, collection_name=self.platform_id
102+
self.community_id, collection_name=self.platform_id, use_cache=False
103103
)
104104

105105
# Process batches in parallel using ThreadPoolExecutor

hivemind_etl/simple_ingestion/pipeline.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -147,6 +147,7 @@ async def process_document(
147147
pipeline = CustomIngestionPipeline(
148148
community_id=ingestion_request.communityId,
149149
collection_name=collection_name,
150+
use_cache=False,
150151
)
151152

152153
document = Document(
@@ -188,6 +189,7 @@ async def process_documents_batch(
188189
pipeline = CustomIngestionPipeline(
189190
community_id=batch_chunk.communityId,
190191
collection_name=collection_name,
192+
use_cache=False,
191193
)
192194

193195
# Convert all documents in this chunk to Document objects

hivemind_etl/website/website_etl.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def __init__(
3030

3131
# preparing the ingestion pipeline
3232
self.ingestion_pipeline = CustomIngestionPipeline(
33-
self.community_id, collection_name=self.platform_id
33+
self.community_id, collection_name=self.platform_id, use_cache=False,
3434
)
3535

3636
async def extract(

hivemind_summarizer/activities.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,7 @@ async def fetch_platform_summaries_by_date(
9797
pipeline = CustomIngestionPipeline(
9898
community_id=community_id,
9999
collection_name=f"{input.platform_id}_summary",
100+
use_cache=False,
100101
)
101102
# get the latest date from the collection
102103
latest_date = pipeline.get_latest_document_date(
@@ -211,6 +212,7 @@ async def fetch_platform_summaries_by_date_range(
211212
extract_text_only=extract_text_only,
212213
platform_id=input.platform_id,
213214
community_id=community_id,
215+
use_cache=False,
214216
)
215217
summaries = await fetch_platform_summaries_by_date(date_input)
216218
result[date] = summaries

0 commit comments

Comments
 (0)