@@ -15,7 +15,6 @@ def __init__(
1515 graph_backend : str = "kuzu" ,
1616 kv_backend : str = "rocksdb" ,
1717 quiz_samples : int = 1 ,
18- concurrency_limit : int = 200 ,
1918 ):
2019 super ().__init__ (working_dir = working_dir , op_name = "quiz_service" )
2120 self .quiz_samples = quiz_samples
@@ -28,21 +27,16 @@ def __init__(
2827 backend = kv_backend , working_dir = working_dir , namespace = "quiz"
2928 )
3029 self .generator = QuizGenerator (self .llm_client )
31- self .concurrency_limit = concurrency_limit
3230
3331 def process (self , batch : pd .DataFrame ) -> Iterable [pd .DataFrame ]:
34- # this operator does not consume any batch data
35- # but for compatibility we keep the interface
36- _ = batch .to_dict (orient = "records" )
32+ data = batch .to_dict (orient = "records" )
3733 self .graph_storage .reload ()
38- yield from self .quiz ()
34+ return self .quiz (data )
3935
4036 async def _process_single_quiz (self , item : tuple ) -> dict | None :
4137 # if quiz in quiz_storage exists already, directly get it
4238 index , desc = item
4339 _quiz_id = compute_dict_hash ({"index" : index , "description" : desc })
44- if self .quiz_storage .get_by_id (_quiz_id ):
45- return None
4640
4741 tasks = []
4842 for i in range (self .quiz_samples ):
@@ -68,47 +62,43 @@ async def _process_single_quiz(self, item: tuple) -> dict | None:
6862 logger .error ("Error when quizzing description %s: %s" , item , e )
6963 return None
7064
71- def quiz (self ) -> Iterable [pd .DataFrame ]:
65+ def quiz (self , batch ) -> Iterable [pd .DataFrame ]:
7266 """
7367 Get all nodes and edges and quiz their descriptions using QuizGenerator.
7468 """
75- edges = self .graph_storage .get_all_edges ()
76- nodes = self .graph_storage .get_all_nodes ()
77-
7869 items = []
7970
80- for edge in edges :
81- edge_data = edge [2 ]
82- desc = edge_data ["description" ]
83- items .append (((edge [0 ], edge [1 ]), desc ))
71+ for item in batch :
72+ nodes = item .get ("nodes" , [])
73+ edges = item .get ("edges" , [])
8474
85- for node in nodes :
86- node_data = node [1 ]
87- desc = node_data ["description" ]
88- items .append ((node [0 ], desc ))
75+ for node_id , node_data in nodes .items ():
76+ node_data = node_data [0 ]
77+ desc = node_data ["description" ]
78+ items .append ((node_id , desc ))
79+ for edge_key , edge_data in edges .items ():
80+ edge_data = edge_data [0 ]
81+ desc = edge_data ["description" ]
82+ items .append ((edge_key , desc ))
8983
9084 logger .info ("Total descriptions to quiz: %d" , len (items ))
9185
92- for i in range (0 , len (items ), self .concurrency_limit ):
93- batch_items = items [i : i + self .concurrency_limit ]
94- batch_results = run_concurrent (
95- self ._process_single_quiz ,
96- batch_items ,
97- desc = f"Quizzing descriptions ({ i } / { i + len (batch_items )} )" ,
98- unit = "description" ,
99- )
86+ results = run_concurrent (
87+ self ._process_single_quiz ,
88+ items ,
89+ desc = f"Quizzing batch of { len (items )} descriptions" ,
90+ unit = "description" ,
91+ )
92+ valid_results = [res for res in results if res ]
10093
101- final_results = []
102- for new_result in batch_results :
103- if new_result :
104- self .quiz_storage .upsert (
105- {
106- new_result ["_quiz_id" ]: {
107- "description" : new_result ["description" ],
108- "quizzes" : new_result ["quizzes" ],
109- }
110- }
111- )
112- final_results .append (new_result )
113- self .quiz_storage .index_done_callback ()
114- yield pd .DataFrame (final_results )
94+ for res in valid_results :
95+ self .quiz_storage .upsert (
96+ {
97+ res ["_quiz_id" ]: {
98+ "description" : res ["description" ],
99+ "quizzes" : res ["quizzes" ],
100+ }
101+ }
102+ )
103+ self .quiz_storage .index_done_callback ()
104+ return pd .DataFrame (valid_results )