Skip to content

Commit 6e5c387

Browse files
blaiszik and Claude authored
Restore missing mdf_client.py from design-renaissance branch (#470)
* Restore missing mdf_client.py from design-renaissance branch This file was part of PR #469 but was not included in the merge, causing ModuleNotFoundError when importing foundry. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * Fix DOI search to return correct dataset The forge DOI search can return multiple results where only one actually has the matching DOI. Previously, get_metadata_by_doi() blindly returned the first result, which often didn't have the requested DOI. Now it iterates through results to find the one with the exact DOI match, fixing test_dataframe_search_by_doi and test_dataframe_download_by_doi tests. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent af1ffe6 commit 6e5c387

2 files changed

Lines changed: 291 additions & 2 deletions

File tree

foundry/foundry.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -342,8 +342,16 @@ def get_metadata_by_doi(self, doi: str) -> dict:
342342
results = forge.match_dois(doi).search()
343343
if len(results) < 1:
344344
return None
345-
else:
346-
return results[0]
345+
# Filter to find the result that actually has the matching DOI
346+
for result in results:
347+
result_doi = result.get('dc', {}).get('identifier', {})
348+
if isinstance(result_doi, dict):
349+
result_doi = result_doi.get('identifier', '')
350+
if result_doi == doi:
351+
return result
352+
# If no exact match found, return None
353+
logger.warning(f"DOI search returned {len(results)} results but none matched DOI {doi}")
354+
return None
347355

348356
def get_metadata_by_query(self, q: str, limit: int) -> dict:
349357
"""Submit query to forge client and return results

foundry/mdf_client.py

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
"""Minimal MDF client replacing mdf_forge dependency.
2+
3+
This provides the essential functionality needed by Foundry without
4+
requiring the full mdf_forge package.
5+
6+
Also includes staging upload functionality for publishing local data to MDF
7+
without requiring Globus Connect Personal.
8+
"""
9+
10+
import uuid
11+
from pathlib import Path
12+
from typing import Any, Dict, List, Optional
13+
14+
import requests
15+
16+
# MDF Public Staging Endpoint (NCSA)
STAGING_ENDPOINT_ID = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec"
STAGING_BASE_PATH = "/tmp"
TRANSFER_API_BASE = "https://transfer.api.globus.org/v0.10"


class StagingUploader:
    """Handles uploading files to MDF staging endpoint.

    This allows users to publish local data to MDF without needing
    Globus Connect Personal running. Files are uploaded via HTTPS
    to a temporary staging location on the MDF public endpoint.

    Usage:
        uploader = StagingUploader(transfer_token)
        unique_id, remote_dir = uploader.create_staging_directory()
        uploader.upload_file(Path("data.csv"), remote_dir)
        # Then use globus://{STAGING_ENDPOINT_ID}{remote_dir}/ as data source
    """

    def __init__(self, transfer_token: str, https_token: Optional[str] = None):
        """Initialize uploader with Globus tokens.

        Args:
            transfer_token: Globus OAuth2 access token with transfer scope
            https_token: Globus OAuth2 access token with HTTPS scope for NCSA
                (if None, uses transfer_token)
        """
        self.transfer_token = transfer_token
        self.https_token = https_token or transfer_token
        self.endpoint_id = STAGING_ENDPOINT_ID
        # Headers for Transfer API (JSON) calls; HTTPS PUT uploads use
        # self.https_token separately in upload_file().
        self.headers = {
            "Authorization": f"Bearer {transfer_token}",
            "Content-Type": "application/json",
        }
        # Cached HTTPS server URL for the endpoint, resolved lazily.
        self._https_server = None

    def _get_https_server(self) -> str:
        """Get the HTTPS server URL for the endpoint (cached after first call)."""
        if self._https_server:
            return self._https_server

        # Query the endpoint to get its HTTPS server
        response = requests.get(
            f"{TRANSFER_API_BASE}/endpoint/{self.endpoint_id}",
            headers=self.headers,
        )

        if response.ok:
            try:
                data = response.json()
            except ValueError:
                # Non-JSON body; fall through to the hard-coded fallback below.
                data = {}
            self._https_server = data.get("https_server")
            if self._https_server:
                return self._https_server

        # Fallback: Use the standard NCSA HTTPS endpoint
        self._https_server = "https://g-b0c3f4.dd271.03c0.data.globus.org"
        return self._https_server

    def create_staging_directory(self) -> tuple[str, str]:
        """Create a unique directory on the staging endpoint.

        Returns:
            Tuple of (unique_id, full_path) for the created directory

        Raises:
            RuntimeError: If the Transfer API mkdir call fails.
        """
        unique_id = str(uuid.uuid4())
        dir_path = f"{STAGING_BASE_PATH}/{unique_id}"

        response = requests.post(
            f"{TRANSFER_API_BASE}/endpoint/{self.endpoint_id}/mkdir",
            headers=self.headers,
            json={
                "path": dir_path,
                "DATA_TYPE": "mkdir",
            },
        )

        if not response.ok:
            # The Transfer API normally returns a JSON error document, but
            # guard against non-JSON bodies (e.g. proxy/HTML errors).
            try:
                message = response.json().get("message", response.text)
            except ValueError:
                message = response.text
            raise RuntimeError(
                f"Failed to create staging directory: {message}"
            )

        return unique_id, dir_path

    def upload_file(self, local_path: Path, remote_dir: str, filename: Optional[str] = None) -> str:
        """Upload a single file to the staging endpoint via HTTPS.

        Args:
            local_path: Path to local file
            remote_dir: Remote directory path (e.g., /tmp/uuid)
            filename: Optional remote filename (defaults to local filename)

        Returns:
            Remote path to uploaded file

        Raises:
            RuntimeError: If the HTTPS upload fails.
        """
        if filename is None:
            filename = local_path.name

        # BUGFIX: previously the remote path hard-coded "(unknown)" instead of
        # using the computed filename, so every upload landed at the same path.
        remote_path = f"{remote_dir}/{filename}"

        https_server = self._get_https_server()
        upload_url = f"{https_server}{remote_path}"

        # Stream the file object directly so large files are not read into memory.
        with open(local_path, "rb") as f:
            response = requests.put(
                upload_url,
                headers={"Authorization": f"Bearer {self.https_token}"},
                data=f,
            )

        if not response.ok:
            raise RuntimeError(
                f"Failed to upload {local_path.name}: {response.status_code} {response.text}"
            )

        return remote_path

    def upload_directory(
        self,
        local_dir: Path,
        remote_dir: str,
        progress_callback=None,
    ) -> List[str]:
        """Upload all files from a local directory.

        Only regular files directly inside ``local_dir`` are uploaded;
        subdirectories are skipped (not recursed into).

        Args:
            local_dir: Local directory containing files to upload
            remote_dir: Remote directory path
            progress_callback: Optional callback(filename, current, total)

        Returns:
            List of remote paths to uploaded files
        """
        files = [f for f in local_dir.iterdir() if f.is_file()]
        uploaded = []

        for i, file_path in enumerate(files):
            if progress_callback:
                progress_callback(file_path.name, i + 1, len(files))

            remote_path = self.upload_file(file_path, remote_dir)
            uploaded.append(remote_path)

        return uploaded

    def get_globus_url(self, remote_dir: str) -> str:
        """Get the Globus file manager URL for a staged directory.

        This is the format expected by MDF Connect for data sources.

        Args:
            remote_dir: Remote directory path (e.g., /tmp/uuid)

        Returns:
            Globus file manager URL for use with MDF
        """
        from urllib.parse import quote
        # Percent-encode everything (safe="") so slashes become %2F in the query.
        encoded_path = quote(f"{remote_dir}/", safe="")
        return f"https://app.globus.org/file-manager?origin_id={self.endpoint_id}&origin_path={encoded_path}"
176+
177+
178+
class MDFClient:
    """Minimal MDF client for dataset search and download."""

    def __init__(
        self,
        index: str = "mdf",
        services: Optional[Any] = None,
        search_client: Optional[Any] = None,
        transfer_client: Optional[Any] = None,
        data_mdf_authorizer: Optional[Any] = None,
        petrel_authorizer: Optional[Any] = None,
    ):
        """Initialize the MDF client."""
        self.index = index
        self.search_client = search_client
        self.transfer_client = transfer_client
        self.data_mdf_authorizer = data_mdf_authorizer
        self.petrel_authorizer = petrel_authorizer
        # One-shot query filters; consumed and cleared by search().
        self._resource_types: List[str] = []
        self._organizations: List[str] = []
        self._dois: List[str] = []

    def match_resource_types(self, resource_type: str) -> "MDFClient":
        """Restrict the next search to a single resource type (fluent)."""
        self._resource_types = [resource_type]
        return self

    def match_organizations(self, organization: str) -> "MDFClient":
        """Restrict the next search to a single organization (fluent)."""
        self._organizations = [organization]
        return self

    def match_dois(self, doi: str) -> "MDFClient":
        """Restrict the next search to a single DOI (fluent)."""
        self._dois = [doi]
        return self

    def search(
        self,
        q: Optional[str] = None,
        advanced: bool = False,
        limit: int = 10,
        **kwargs,
    ) -> List[Dict]:
        """Run the accumulated query against Globus Search and return hit contents.

        Raises:
            RuntimeError: If no search client was configured.
        """
        if self.search_client is None:
            raise RuntimeError("Search client not configured")

        clauses: List[str] = []
        if q:
            clauses.append(q)
        clauses.extend(f'mdf.resource_type:"{rt}"' for rt in self._resource_types)
        clauses.extend(f'mdf.organizations:"{org}"' for org in self._organizations)
        clauses.extend(f'dc.identifier.identifier:"{doi}"' for doi in self._dois)

        full_query = " AND ".join(clauses) if clauses else "*"

        # Globus Search index UUIDs: production "mdf" vs. the test index.
        if self.index == "mdf":
            index_id = "1a57bbe5-5272-477f-9d31-343b8258b7a5"
        else:
            index_id = "aeccc263-f083-45f5-ab1d-08ee702b3384"

        raw = self.search_client.search(index_id, full_query, limit=limit, advanced=advanced)

        # Filters are one-shot: clear them so the next search starts fresh.
        self._resource_types, self._organizations, self._dois = [], [], []

        # Extract content from the Globus Search response structure:
        # gmeta[].entries[].content
        return [
            entry["content"]
            for gm in raw.get("gmeta", [])
            for entry in gm.get("entries", [])
            if "content" in entry
        ]

    def globus_download(
        self,
        results: List[Dict],
        dest: str = ".",
        dest_ep: Optional[str] = None,
        download_datasets: bool = True,
        **kwargs,
    ) -> Dict:
        """Download data using Globus Transfer.

        Raises:
            RuntimeError: If no transfer client was configured.
        """
        if self.transfer_client is None:
            raise RuntimeError("Transfer client not configured")

        items: List[Dict] = []
        for rec in results:
            if "data" not in rec:
                continue
            data = rec["data"]
            if "endpoint_path" not in data:
                continue
            items.append({
                "source_endpoint": data.get("endpoint_id"),
                "source_path": data["endpoint_path"],
                "destination_path": dest,
            })

        if not items:
            return {"status": "no_data", "message": "No transferable data found"}

        return {"status": "pending", "items": items}

0 commit comments

Comments
 (0)