Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@
# dkany
Python package for accessing open data websites powered by dkan

# `uv` and package management
We're using [`uv`](https://docs.astral.sh/uv/), since pipenv was having a hard time building. Install it with `pip install uv`, then it should work almost identically to `pipenv`
# How to use

## Installation
Add the library to your project with your preferred package manager, e.g.
- `pip install dkany`
- `uv add dkany`
- `pipenv install dkany`

## Usage
See `./scratch/basic_run.py` for example code using the library to create, update, and delete datasets.

# Local Development
## `uv` and package management
We're using [`uv`](https://docs.astral.sh/uv/), since pipenv was having a hard time building. Install it with `pip install uv`; after that it should work almost identically to `pipenv`.


To install the package and its dependencies for development, run
```
Expand All @@ -28,8 +39,8 @@ There are many ways to run the tests associated with this app.


# Ideas for Improvement
TODO: Validate dataset file (All columns have column names)
- Validate dataset file (All columns have column names)

# Deploying
# Deploying to PyPI

See [our confluence doc on deploying to AWS CodeArtifact](https://mathematicampr.atlassian.net/wiki/spaces/WEB/pages/2514354711/Deploying+to+AWS+CodeArtifact)
See `.github/workflows/build-and-publish.yml` for the workflow that publishes this library. New versions are automatically published to test.pypi.org when a pre-release is made, and to pypi.org when a release is published.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dependencies = [
"pyyaml>=6.0.3",
"requests>=2.32.5",
"requests-toolbelt>=1.0.0",
"typing-extensions>=4.0.0",
]

[dependency-groups]
Expand Down
62 changes: 56 additions & 6 deletions scratch/basic_run.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,65 @@
from dkany.client import DKANClient as DkanyClient

def main():
client = DkanyClient(
base_url = "https://edit.data.medicaid.gov"

def create_client() -> DkanyClient:
    """Build a client for the demo DKAN instance.

    The user name and password below are placeholders; replace them with
    real credentials before running the script.
    """
    demo_client = DkanyClient(
        base_url="https://edit.data.medicaid.gov",
        user_name="DEMO",
        password="your_api_key",
    )
    return demo_client

test_dataset_id = "9e407144-9ed9-5cee-937a-17d65b07a9a7"

exists = client.check_dataset_exists(test_dataset_id)
def create_dataset() -> str:
    """Create a demo dataset and return the identifier from the response."""
    # Metadata sent to DKAN for the new dataset.
    metadata = {
        "title": "Test Dataset from DKAN Client",
        "type": ["dataset"],
        "license": "http://opendatacommons.org/licenses/odc-by/1.0/",
        "accessLevel": "published",
    }

    created = create_client().create_dataset(metadata)
    new_id = created["identifier"]

    print(f"Created dataset with ID: {new_id}")
    return new_id


def dataset_exists(dataset_id: str) -> None:
    """Print whether the dataset with ``dataset_id`` exists.

    Arguments:
        dataset_id: Identifier of the dataset to look up.
    """
    client = create_client()

    exists = client.check_dataset_exists(dataset_id)

    # Report the identifier alongside the boolean result; the previous
    # message interpolated the boolean where the id belonged and
    # misspelled "exists".
    print(f"dataset {dataset_id} exists: {exists}")


def update_dataset(dataset_id: str):
    """Replace the demo dataset's metadata, switching its access level to hidden."""
    # Replacement metadata sent for the existing dataset.
    replacement = {
        "title": "Updated Test Dataset from DKAN Client",
        "type": ["dataset"],
        "license": "http://opendatacommons.org/licenses/odc-by/1.0/",
        "accessLevel": "hidden",
    }

    updated = create_client().update_dataset(dataset_id, replacement)

    print(f"Updated dataset with ID: {updated['identifier']}")


def remove_dataset(dataset_id: str):
    """Delete the dataset with ``dataset_id`` and print a confirmation."""
    create_client().delete_dataset(dataset_id)

    print(f"Deleted dataset with ID: {dataset_id}")


def main():
    """Run the demo: create a dataset, check it, update it, then delete it.

    The delete step runs in a ``finally`` clause so that a failure in the
    existence check or the update does not leave the freshly created test
    dataset behind on the remote instance.
    """
    dataset_id = create_dataset()
    try:
        dataset_exists(dataset_id)
        update_dataset(dataset_id)
    finally:
        remove_dataset(dataset_id)


if __name__ == "__main__":
main()
main()
2 changes: 1 addition & 1 deletion src/dkany/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.3"
__version__ = "0.1.4"
84 changes: 60 additions & 24 deletions src/dkany/client/client.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
import logging
from copy import deepcopy as copy
from datetime import datetime as dt
from typing import List, Optional
from typing import List, Optional, Dict, Any

import requests
from requests.cookies import RequestsCookieJar
from requests_toolbelt import sessions # type: ignore
from requests.models import Response
from requests_toolbelt import sessions # type: ignore

from dkany.client.errors import BadResponse
from dkany.client.types import (
DkanSearchResponse,
DkanSearchParams,
DkanDatasetMetadataResponse,
DkanCreateDatasetResponse,
DkanUpdateDatasetResponse,
DkanDeleteDatasetResponse,
DkanGetDatasetResponse,
DkanMetadataFilterParams,
)

logger = logging.getLogger(__name__)

Expand All @@ -18,13 +29,19 @@ def url_join(url_part_list):

class DKANClient:
"""
docstring
The main interface with the DKAN API.

Arguments:
base_url: The base URL of the DKAN instance.
cookie_dict: A dictionary of cookies to attach to requests
user_name: The CMS Username (Four characters)
password: The DKAN API key associated with the user_name
"""

def __init__(
self,
base_url: Optional[str] = None,
cookie_dict: Optional[dict] = None,
cookie_dict: Optional[Dict[str, str]] = None,
user_name: Optional[str] = None,
password: Optional[str] = None,
):
Expand All @@ -51,7 +68,9 @@ def __init__(
self.existing_dataset_url = (
"api/1/metastore/schemas/dataset/items/{dataset_identifier}?_format=json"
)
self.revise_dataset_url = "api/1/metastore/schemas/dataset/items/{dataset_identifier}/revisions?_format=json"
self.revise_dataset_url = (
"api/1/metastore/schemas/dataset/items/{dataset_identifier}/revisions?_format=json"
)
self.query_datastore_url = (
"api/1/datastore/query/{dataset_identifier}/{datastore_idx}?_format=json"
)
Expand All @@ -71,22 +90,29 @@ def __str__(self) -> str:
return f"DKAN client for {self.base_url} with user {self.user_name}"

def _process_response(
    self, response: Response, acceptable_responses: Optional[List[int]] = None
):
    """Return the decoded JSON body of ``response`` if its status is acceptable.

    Arguments:
        response: The HTTP response to inspect.
        acceptable_responses: Status codes treated as success. Falls back
            to ``[200, 201]`` when omitted or empty.

    Raises:
        BadResponse: If the status code is not in the acceptable list.
    """
    allowed = acceptable_responses if acceptable_responses else [200, 201]
    if response.status_code in allowed:
        return response.json()
    raise BadResponse(response, allowed)

def _paged_search(self, params: DkanSearchParams, page) -> DkanSearchResponse:
    """Fetch one page of search results and validate the payload's shape.

    Arguments:
        params: Search query parameters. The ``"page"`` key is set on this
            mapping in place before the request is sent.
        page: The page value to request.

    Returns:
        The decoded search payload, which is guaranteed to contain the
        keys ``total``, ``results`` and ``facets``.

    Raises:
        BadResponse: If the HTTP status code is not acceptable.
        SystemError: If the payload is not a JSON object or lacks one of
            the required keys. The exception type is kept as-is so that
            existing callers catching it keep working.
    """
    params["page"] = page

    response = self.session.get(self.search_url, params=params)
    payload = self._process_response(response)

    # A non-object payload (e.g. a JSON list) has no .keys(); report it as
    # a malformed response instead of failing with AttributeError below.
    if not isinstance(payload, dict):
        raise SystemError(
            "Malformed search response received from DKAN instance. "
            + "Expected a JSON object with keys 'total', 'results', and 'facets' got: "
            + type(payload).__name__
        )

    required_keys = ("total", "results", "facets")
    if any(key not in payload for key in required_keys):
        err = (
            "Malformed search response received from DKAN instance. "
            + "Expected keys 'total', 'results', and 'facets' got: "
            + ", ".join(payload.keys())
        )
        raise SystemError(err)
    return payload

def _search_all_pages(self, params):
def _search_all_pages(self, params: DkanSearchParams) -> Dict[str, DkanDatasetMetadataResponse]:
page = 1
out = self._paged_search(params, page)
total = int(out["total"])
Expand All @@ -105,10 +131,10 @@ def _search_all_pages(self, params):

def search(
self, title: Optional[str] = None, tags=None, categories=None, page="ALL"
):
params = {}
) -> Dict[str, DkanDatasetMetadataResponse]:
params = DkanSearchParams()
if title is not None:
params["title"] = title
params["fulltext"] = title # todo: is this what's intended by this param?
if tags is not None:
params["keyword"] = tags
if categories is not None:
Expand All @@ -128,7 +154,11 @@ def search(

return out

def filter_search_results(self, search_results, filter_params):
def filter_search_results(
self,
search_results: Dict[str, DkanDatasetMetadataResponse],
filter_params: Optional[DkanMetadataFilterParams],
) -> Dict[str, DkanDatasetMetadataResponse]:
if filter_params is None:
return search_results
if len(filter_params.keys()) == 0:
Expand All @@ -138,30 +168,34 @@ def filter_search_results(self, search_results, filter_params):

for search_key, search_result_value in inital_search_results:
for filter_key, filter_value in filter_params.items():
if search_result_value[filter_key] != filter_value:
if search_result_value[filter_key] != filter_value: # type: ignore # mypy issue with TypedDict optional keys
search_results.pop(search_key)
break

return search_results

def create_dataset(self, body: Dict[str, Any]) -> DkanCreateDatasetResponse:
    """POST ``body`` as JSON to the new-dataset endpoint and return the decoded reply.

    Raises:
        BadResponse: If the status code is not acceptable.
    """
    reply = self.session.post(self.post_new_dataset_url, json=body)
    created = self._process_response(reply)
    return created

def delete_dataset(self, dataset_identifier: str) -> DkanDeleteDatasetResponse:
    """Send a DELETE for the given dataset and return the decoded reply.

    Raises:
        BadResponse: If the status code is not acceptable.
    """
    target_url = self.existing_dataset_url.format(dataset_identifier=dataset_identifier)
    reply = self.session.delete(target_url)
    return self._process_response(reply)

def update_dataset(
    self, dataset_identifier: str, body: DkanDatasetMetadataResponse
) -> DkanUpdateDatasetResponse:
    """PUT ``body`` as JSON to the given dataset's URL and return the decoded reply.

    Raises:
        BadResponse: If the status code is not acceptable.
    """
    target_url = self.existing_dataset_url.format(dataset_identifier=dataset_identifier)
    reply = self.session.put(target_url, json=body)
    return self._process_response(reply)

def mark_dataset_hidden(self, dataset_identifier, message=""):
def mark_dataset_hidden(
self, dataset_identifier: str, message: str = ""
) -> DkanUpdateDatasetResponse:
"""
Sets dataset accesslevel to "hidden"
Hides dataset from searches made on data.medicare.gov user interface
Expand All @@ -174,7 +208,7 @@ def mark_dataset_hidden(self, dataset_identifier, message=""):
)
return self._process_response(response)

def mark_dataset_public(self, dataset_identifier, message=""):
def mark_dataset_public(self, dataset_identifier: str, message="") -> DkanCreateDatasetResponse:
"""
Sets dataset accesslevel to "published"
Makes a dataset searchable through data.medicare.gov user interface
Expand All @@ -187,21 +221,21 @@ def mark_dataset_public(self, dataset_identifier, message=""):
)
return self._process_response(response)

def get_dataset_metadata(self, dataset_identifier: str) -> DkanDatasetMetadataResponse:
    """Fetch the metadata record for a dataset.

    Arguments:
        dataset_identifier: Identifier of the dataset to fetch.

    Returns:
        The decoded JSON body of the response.

    Raises:
        BadResponse: If the status code is not acceptable.
    """
    # existing_dataset_url already ends in "?_format=json", so no extra
    # query parameters are passed; adding params={"_format": "json"} here
    # would send the parameter twice.
    response = self.session.get(
        self.existing_dataset_url.format(dataset_identifier=dataset_identifier)
    )
    return self._process_response(response)

def check_dataset_exists(self, dataset_identifier) -> bool:
    """Return True if the dataset's metadata can be fetched, False otherwise."""
    # NOTE(review): every BadResponse is reported as "does not exist",
    # whatever the status code was -- confirm that is intended.
    try:
        self.get_dataset_metadata(dataset_identifier)
    except BadResponse:
        return False
    return True

def trigger_dataset_reimport(self, dataset_identifier) -> DkanUpdateDatasetResponse:
    """Re-submit a dataset's metadata with its ``modified`` field set to now.

    The current metadata is fetched, ``modified`` is overwritten with the
    current local time formatted with ``self.dkan_time_format``, and the
    result is sent back through ``update_dataset``.
    """
    # NOTE(review): presumably the changed timestamp is what prompts DKAN
    # to re-import the data -- confirm against the DKAN documentation.
    metadata = self.get_dataset_metadata(dataset_identifier)
    timestamp = dt.now().strftime(self.dkan_time_format)
    metadata["modified"] = timestamp
    return self.update_dataset(dataset_identifier, metadata)
Expand All @@ -216,7 +250,9 @@ def get_full_query_url(self, dataset_identifier, datastore_idx=0):
]
)

def get_data_by_dataset_identifier(self, dataset_identifier, datastore_idx=0):
def get_data_by_dataset_identifier(
self, dataset_identifier, datastore_idx=0
) -> DkanGetDatasetResponse:
response = self.session.get(
self.query_datastore_url.format(
dataset_identifier=dataset_identifier, datastore_idx=datastore_idx
Expand Down
11 changes: 7 additions & 4 deletions src/dkany/client/errors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from typing import List
from requests.models import Response


class Error(Exception):
"""Base class for exceptions in this module."""

message: str

def __str__(self):
Expand All @@ -10,12 +15,10 @@ def __repr__(self):


class BadResponse(Error):
def __init__(self, response, acceptable_status_codes):
def __init__(self, response: Response, acceptable_status_codes: List[int]):
status_code = response.status_code
message = []
message.append(
"Status code returned not in acceptable status codes for this response"
)
message.append("Status code returned not in acceptable status codes for this response")
message.append(
f"Returned: {status_code}:{response.reason}, Acceptable Codes {acceptable_status_codes}"
)
Expand Down
Loading