Merge pull request #3 from DataLabTechTV/dev

DataLabTechTV · web-flow · commit 6f1b9b3f05a2 · 2025-07-08T18:05:11.000+01:00
feat: graph rag cli options for interactive and direct querying
diff --git a/graph/cli.py b/graph/cli.py
@@ -1,11 +1,12 @@
 import os
+from typing import Optional
 
 import click
 from loguru import logger as log
 
 from graph.embedding import NodeEmbedding, NodeEmbeddingAlgo
 from graph.ops import KuzuOps
-from graph.rag import GraphRAG
+from graph.rag import ContextAssemblerException, GraphRAG, GraphRetrievalException
 from shared.lakehouse import Lakehouse
 from shared.settings import env
 
@@ -96,7 +97,7 @@ def embeddings(schema: str, dimension: int, batch_size: int, epochs: int, algo:
         log.exception(e)
 
 
-@graph.command()
+@graph.command(help="Reindex embedding property")
 @click.argument("schema", type=click.STRING)
 def reindex(schema: str):
     try:
@@ -106,11 +107,36 @@ def reindex(schema: str):
         log.error(e)
 
 
-@graph.command()
+@graph.command(help="Run GraphRAG pipeline")
 @click.argument("schema", type=click.STRING)
-def rag(schema: str):
+@click.option(
+    "--interactive",
+    "-i",
+    is_flag=True,
+    help="Run in interactive mode using a REPL",
+)
+@click.option(
+    "--query",
+    "-q",
+    type=click.STRING,
+    help="User query prompt",
+)
+def rag(schema: str, interactive: bool, query: Optional[str]):
+    # Reject conflicting flags before paying for GraphRAG construction.
+    if interactive and query is not None:
+        raise click.UsageError("--interactive and --query cannot be used together")
+
     gr = GraphRAG(schema)
-    gr.interactive()
+    if query is not None:
+        try:
+            response = gr.invoke(dict(user_query=query))
+            log.info("Final response:\n{}", response.content)
+        except GraphRetrievalException as e:
+            log.error("{}\n{}", e, e.query)
+        except ContextAssemblerException as e:
+            log.error(e)
+    else:
+        gr.interactive()
 
 
 if __name__ == "__main__":
diff --git a/graph/ops.py b/graph/ops.py
@@ -1,7 +1,6 @@
 import os
 import shutil
 import tempfile
-import textwrap
 from enum import Enum
 from string import Template
 from typing import Any, Optional
diff --git a/graph/rag.py b/graph/rag.py
@@ -6,6 +6,7 @@
 
 import ollama
 import pandas as pd
+from colorama import Fore
 from langchain.prompts import ChatPromptTemplate
 from langchain.schema import AIMessage
 from langchain.schema.runnable import Runnable
@@ -56,6 +57,16 @@ def get_line(lineno):
         return get_line
 
 
+class GraphRetrievalException(Exception):
+    def __init__(self, message: str, query: str) -> None:
+        self.query = query  # the query that failed, kept for error reporting
+        super().__init__(message)
+
+
+class ContextAssemblerException(Exception):
+    """Raised when required context or nearest neighbors are missing."""
+
+
 class GraphRAG(Runnable):
     def __init__(
         self,
@@ -170,18 +181,27 @@ def query_graph(
         limit: Optional[int] = None,
     ) -> RunnableFn:
         def run(inputs: dict[str, Any]) -> dict[str, Any]:
+            log.info(
+                "Querying graph for matching entities (shuffle={}, limit={})",
+                shuffle,
+                limit,
+            )
+
             query = inputs["query"]
             params = inputs.get("params")
 
-            context_df = pd.DataFrame(self.graph.query(query, params))
+            try:
+                context_df = pd.DataFrame(self.graph.query(query, params))
 
-            if shuffle:
-                context_df = context_df.sample(frac=1)
+                if shuffle:
+                    context_df = context_df.sample(frac=1)
 
-            if limit is not None:
-                context_df = context_df.head(limit)
+                if limit is not None:
+                    context_df = context_df.head(limit)
 
-            return dict(context=context_df)
+                return dict(context=context_df)
+            except Exception as e:  # bare except would also trap Ctrl-C
+                raise GraphRetrievalException("Graph query failed", query=query) from e
 
         return run
 
@@ -214,6 +234,10 @@ def combined_knn(self, k: int) -> RunnableFn:
 
         def run(inputs: dict[str, Any]) -> dict[str, Any]:
             context = inputs["context"]
+
+            if context is None or len(context) == 0:
+                raise ContextAssemblerException("Context not found")
+
             node_ids = context.node_id.to_list()
 
             for node_id in node_ids:
@@ -247,9 +271,17 @@ def nn_sample_shortest_paths(
         max_length: int,
     ) -> RunnableFn:
         def run(inputs: dict[str, Any]) -> dict[str, Any]:
-            source_node_ids = inputs["graph_retrieval"]["context"].node_id.to_list()
+            context = inputs["graph_retrieval"]["context"]
+
+            if context is None or len(context) == 0:
+                raise ContextAssemblerException("Context not found")
+
+            source_node_ids = context.node_id.to_list()
             target_node_ids = inputs["combined_knn"]["knn"]
 
+            if target_node_ids is None or len(target_node_ids) == 0:
+                raise ContextAssemblerException("Nearest neighbors not found")
+
             paths_df = self.ops.sample_shortest_paths(
                 source_node_ids,
                 target_node_ids,
@@ -271,6 +303,9 @@ def nn_random_walks(
         def run(inputs: dict[str, Any]) -> dict[str, Any]:
             source_node_ids = inputs["combined_knn"]["knn"]
 
+            if source_node_ids is None or len(source_node_ids) == 0:
+                raise ContextAssemblerException("Nearest neighbors not found")
+
             paths_dfs = []
 
             for source_node_id in source_node_ids:
@@ -412,7 +447,7 @@ def loader(self, stop_event: threading.Event):
 
                 time.sleep(0.1)
 
-        print(f"\r⏱ {elapsed}\n")
+        print("\b\b\b   ", end="\n\n", flush=True)
 
     def interactive(self):
         config_path = user_config_path("datalab", "DataLabTechTV")
@@ -461,9 +496,17 @@ def interactive(self):
                         )
                         loader_thread.start()
 
-                        output = self.invoke(dict(user_query=user_query))
-
-                        stop_event.set()
-                        loader_thread.join()
-
-                        print(output["context"])
+                        try:
+                            response = self.invoke(dict(user_query=user_query))
+                            stop_event.set()
+                            loader_thread.join()
+                            print(response.content)
+                        except GraphRetrievalException as e:
+                            stop_event.set()
+                            loader_thread.join()
+                            print(Fore.RED + "Error: " + str(e))
+                            print(Fore.MAGENTA + e.query + Fore.RESET)
+                        except ContextAssemblerException as e:
+                            stop_event.set()
+                            loader_thread.join()
+                            print(Fore.RED + "Error: " + str(e) + Fore.RESET)
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
   "boto3>=1.38.29",
   "boto3-stubs[s3]>=1.38.29",
   "click>=8.2.1",
+  "colorama>=0.4.6",
   "dbt-core>=1.9.6",
   "dbt-duckdb",
   "environs>=14.2.0",
diff --git a/tests/test_graph_rag.py b/tests/test_graph_rag.py
@@ -4,6 +4,7 @@
 
 PROMPTS = (
     "If I like metal artists like Metallica or Iron Maiden, but also listen to IDM, what other artists and genres could I listen to?",
+    "What other bands like Anthrax are there?",
 )
 
 
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,7 @@`
`4`	`4`
`5`	`5`	`PROMPTS = (`
`6`	`6`	`"If I like metal artists like Metallica or Iron Maiden, but also listen to IDM, what other artists and genres could I listen to?",`
	`7`	`+ "What other bands like Anthrax are there?",`
`7`	`8`	`)`
`8`	`9`
`9`	`10`