1414from dotenv import load_dotenv
1515from typing import Optional , Dict , Any , List
1616from wfcommons .wfbench .translator .utils .llm_client import LLMClient
17+ from wfcommons .wfbench .translator .skills .loader import SkillLoader
1718
1819load_dotenv () # loads .env from cwd (project root)
1920
@@ -25,7 +26,7 @@ class LLMTranslator():
2526 • Uses existing WfFormat as examples.
2627 • Accepts a trace from a NEW workflow system.
2728 • Sends all of this as grounding context to an LLM.
28- • Produces a new recipe automatically.
29+ • Produces a new trace in WfFormat automatically.
2930
3031 The user does not implement translation logic.
3132 """
@@ -37,13 +38,14 @@ def __init__(self,
3738 examples_instances : Optional [List [str ]] = None ,
3839 num_examples : int = 3 ,
3940 system_prompt : Optional [str ] = None ,
41+ skill_name : Optional [str ] = None ,
4042 ** kwargs
4143 ):
4244 """
4345 Parameters
4446 ----------
4547 llm_client : LLMClient, optional
46- A pre-configured LLMClient instance. Either this or
48+ A pre-configured LLMClient instance. Either this or
4749 ``model_name`` must be provided.
4850 model_name : str, optional
4951 Key from models.yaml (e.g. "qwen3", "ollama/llama3").
@@ -60,6 +62,10 @@ def __init__(self,
6062 Number of example instances to include in the prompt.
6163 system_prompt : str, optional
6264 Override the default system instructions for the LLM.
65+ When provided, the skills system is bypassed entirely.
66+ skill_name : str, optional
67+ Explicit skill to use (e.g. "nextflow", "cwl").
68+ Auto-detected from trace content if not specified.
6369 kwargs : dict
6470 Additional parameters passed to the parent Translator if needed.
6571 """
@@ -76,7 +82,9 @@ def __init__(self,
7682 self .llm = llm_client
7783 self .examples_instances = examples_instances
7884 self .num_examples = num_examples
79- self .system_prompt = system_prompt or DEFAULT_SYSTEM_PROMPT
85+ self .skill_name = skill_name
86+ self ._skill_loader = SkillLoader ()
87+ self ._system_prompt_override = system_prompt
8088
8189 # ------------------------------------------------------------------ #
8290 # YAML helpers #
@@ -88,6 +96,11 @@ def available_models(models_file: str | Path | None = None) -> list[str]:
8896 cfg = LLMTranslator ._load_models_yaml (models_file )
8997 return list (cfg .keys ())
9098
@staticmethod
def available_skills() -> list[str]:
    """Names of all skills that the SkillLoader can provide."""
    loader = SkillLoader()
    return loader.available_skills()
103+
91104 @staticmethod
92105 def _load_models_yaml (models_file : str | Path | None = None ) -> dict :
93106 path = Path (models_file ) if models_file else MODELS_YAML
@@ -160,23 +173,46 @@ def _fetch_examples_from_path(self,
160173 path : str ,
161174 ref : str = "main" ) -> List [Dict [str , Any ]]:
162175 """
163- Fetch Python files from a specific path in a GitHub repository.
164-
176+ Fetch JSON files from a specific path in the WfInstances GitHub repository.
177+
165178 Parameters
166179 ----------
167180 path : str
168181 Path within the repository to fetch files from.
169182 ref : str, optional
170183 Git reference (branch, tag, or commit SHA). Defaults to "main".
171184
172- Returns
185+ Returns
173186 -------
174187 List[Dict[str, Any]]
175188 List of dictionaries with 'url', 'filename', and 'content' keys.
189+ Returns an empty list if the path does not exist in the repository.
176190 """
177191 print (f"Fetching examples from path: { path } at ref: { ref } " )
178192 url = f"https://api.github.com/repos/wfcommons/WfInstances/contents/{ path } ?ref={ ref } "
179- listing = requests .get (url ).json ()
193+ resp = requests .get (url )
194+
195+ if resp .status_code == 404 :
196+ print (f"Warning: path '{ path } ' not found in WfInstances repository. "
197+ "Skipping examples from this path." )
198+ return []
199+
200+ if resp .status_code != 200 :
201+ print (f"Warning: GitHub API returned status { resp .status_code } for "
202+ f"path '{ path } '. Skipping examples from this path." )
203+ return []
204+
205+ listing = resp .json ()
206+
207+ if isinstance (listing , dict ) and "message" in listing :
208+ print (f"Warning: GitHub API error for path '{ path } ': "
209+ f"{ listing ['message' ]} . Skipping examples from this path." )
210+ return []
211+
212+ if not isinstance (listing , list ):
213+ print (f"Warning: unexpected response for path '{ path } '. "
214+ "Skipping examples from this path." )
215+ return []
180216
181217 examples = []
182218 for item in listing :
@@ -201,8 +237,11 @@ def translate(self, trace, metadata=None, json_schema: dict | None = None, **kwa
201237 trace_text = str (trace )
202238
203239
240+ examples = self ._retrieve_examples (trace_text )
241+
204242 prompt = self ._build_prompt (
205243 trace = trace_text ,
244+ examples = examples ,
206245 metadata = metadata ,
207246 )
208247
@@ -219,11 +258,16 @@ def translate(self, trace, metadata=None, json_schema: dict | None = None, **kwa
219258 output = self .llm .complete (prompt , response_format = response_format )
220259 return output
221260
222- def _retrieve_examples (self , trace_text : str ):
261+ def _retrieve_examples (self , trace_text : str ) -> List [ Dict [ str , Any ]] :
223262 """
224- Simple scoring method to choose top-k (num_examples) examples
225- Replace with embeddings if desired.
263+ Simple scoring method to choose top-k (num_examples) examples.
264+
265+ Returns an empty list if no examples_instances were provided or
266+ none of the paths exist in the WfInstances repository.
226267 """
268+ if not self .examples_instances :
269+ return []
270+
227271 examples = self ._load_examples (self .examples_instances )
228272 flat_examples = []
229273 if isinstance (examples , dict ):
@@ -232,6 +276,11 @@ def _retrieve_examples(self, trace_text: str):
232276 else :
233277 flat_examples = list (examples )
234278
279+ if not flat_examples :
280+ print ("Warning: no valid examples found from any of the provided "
281+ "instance paths. Proceeding without examples." )
282+ return []
283+
235284 results = []
236285 for example in flat_examples :
237286 score = self ._similarity (trace_text , example ["content" ])
@@ -243,7 +292,7 @@ def _retrieve_examples(self, trace_text: str):
243292 @staticmethod
244293 def _similarity (a : str , b : str ) -> float :
245294 """
246- Very naive similarity based on word overlap. Replace with embeddings if desired.
295+ Very naive similarity based on word overlap.
247296 """
248297 return len (set (a .split ()) & set (b .split ()))
249298
@@ -254,9 +303,22 @@ def _build_prompt(
254303 metadata : Optional [Dict [str , Any ]] = None ,
255304 ) -> str :
256305
257- prompt = self .system_prompt .strip () + "\n \n "
306+ if self ._system_prompt_override is not None :
307+ # Explicit override: use as-is (backward compat)
308+ system_prompt = self ._system_prompt_override
309+ else :
310+ # Compose from skill files
311+ skill_hint = self .skill_name
312+ if not skill_hint and metadata and "source_system" in metadata :
313+ skill_hint = metadata ["source_system" ].lower ()
314+ system_prompt = self ._skill_loader .compose_prompt (
315+ trace_text = trace ,
316+ skill_name = skill_hint ,
317+ )
258318
259- prompt += "=== EXAMPLE TRANSLATORS (FROM URLS) ===\n "
319+ prompt = system_prompt .strip () + "\n \n "
320+
321+ prompt += "=== EXAMPLE WORKFLOW INSTANCES (WFFORMAT) ===\n "
260322 for i , ex in enumerate (examples , 1 ):
261323 prompt += f"\n --- Example { i } ---\n "
262324 prompt += f"Source URL: { ex ['url' ]} \n "
@@ -308,70 +370,4 @@ def _parse_llm_output(self, output: str) -> Dict[str, Any]:
308370 except json .JSONDecodeError :
309371 pass
310372
311- raise ValueError ("Could not extract valid JSON from LLM output." )
312-
313- DEFAULT_SYSTEM_PROMPT = """
314- You are an expert software engineer specializing in workflow systems.
315- Translate workflow definitions/traces into WfCommons WfFormat 1.5 JSON.
316-
317- OUTPUT THIS EXACT STRUCTURE:
318- {
319- "name": "<workflow_name - REQUIRED>",
320- "schemaVersion": "1.5",
321- "workflow": {
322- "specification": {
323- "tasks": [
324- {
325- "name": "<task_name>",
326- "id": "<task_id>",
327- "parents": [],
328- "children": [],
329- "inputFiles": [],
330- "outputFiles": []
331- }
332- ],
333- "files": []
334- },
335- "execution": {
336- "makespanInSeconds": <number or 0 if unknown>,
337- "executedAt": "<timestamp or "1970-01-01T00:00:00Z" if unknown>",
338- "tasks": [
339- {
340- "id": "<task_id matching specification>",
341- "runtimeInSeconds": <number or 0 if unknown>,
342- "executedAt": "<timestamp or "1970-01-01T00:00:00Z" if unknown>",
343- "command": {
344- "program": "<program name>",
345- "arguments": []
346- },
347- "coreCount": <number or 1>,
348- "avgCPU": <percentage or 0>,
349- "readBytes": <number or 0>,
350- "writtenBytes": <number or 0>,
351- "memoryInBytes": <number or 0>,
352- "machines": ["unknown"]
353- }
354- ],
355- "machines": [
356- {
357- "nodeName": "unknown"
358- }
359- ]
360- }
361- }
362- }
363-
364- RULES:
365- 1. Use EXACTLY this structure - do not add or rename fields
366- 2. "name" is REQUIRED - use the workflow name from the file, or the filename from metadata
367- 3. "schemaVersion" is always "1.5"
368- 4. Do NOT include optional top-level fields like "description", "createdAt", "author", "runtimeSystem" unless explicitly provided in source
369- 5. For arrays not found, use empty array []
370- 6. For numbers not found, use 0
371- 7. For timestamp strings not found, use "1970-01-01T00:00:00Z"
372- 8. Each task in specification MUST have: name, id, parents, children
373- 9. Each task in execution MUST have: id (matching specification), runtimeInSeconds
374- 10. Infer task dependencies from data flow (channels, inputs/outputs)
375- 11. Only populate execution fields if runtime data exists in the source - otherwise use 0 or placeholder values
376- 12. Output ONLY valid JSON - no explanations or markdown
377- """
373+ raise ValueError ("Could not extract valid JSON from LLM output." )
0 commit comments