From bb90837f8dfa5442864d25b63c1d15b526df823e Mon Sep 17 00:00:00 2001 From: joaquinvanschoren Date: Tue, 15 Oct 2024 22:12:46 +0200 Subject: [PATCH 01/20] markdown docs, baby --- .gitignore | 1 - docs/Project.toml | 1 + docs/README.md | 11 + docs/build/assets/Documenter.css | 18 ++ docs/build/assets/mathjaxhelper.js | 25 ++ docs/build/index.md | 354 +++++++++++++++++++++++++++++ docs/make-md.jl | 7 + mkdocs.yml | 57 +++++ 8 files changed, 473 insertions(+), 1 deletion(-) create mode 100644 docs/README.md create mode 100644 docs/build/assets/Documenter.css create mode 100644 docs/build/assets/mathjaxhelper.js create mode 100644 docs/build/index.md create mode 100644 docs/make-md.jl create mode 100644 mkdocs.yml diff --git a/.gitignore b/.gitignore index 0e1b98c..5a61b23 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,6 @@ Manifest.toml #* .DS_Store sandbox/ -/docs/build/ /docs/site/ /docs/Manifest.toml .vscode diff --git a/docs/Project.toml b/docs/Project.toml index 3507784..ac90fed 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -3,6 +3,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" OpenML = "8b6db2d4-7670-4922-a472-f9537c81ab66" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" +DocumenterMarkdown [compat] Documenter = "~0.26" diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..e9f6008 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,11 @@ +Contains both the original `Documenter.jl` documentation and the same documentation rendered as markdown to include in the harmonized OpenML docs. + +Generating the markdown is done as follows: + +* Install [DocumenterMarkDown](https://documentermarkdown.juliadocs.org/dev/). + * Note: currently this only works with version 0.27 of Documenter.jl + * In Julia, open the package manager (type ']') and run `add Documenter@0.27` and `add DocumenterMarkdown`. 
+* Run `julia make-md.jl` in the `docs` folder to generate the markdown filew + * These appear in the `build` folder +* Run `mkdocs serve` in the root folder to build the markdown docs. + diff --git a/docs/build/assets/Documenter.css b/docs/build/assets/Documenter.css new file mode 100644 index 0000000..d9af5d6 --- /dev/null +++ b/docs/build/assets/Documenter.css @@ -0,0 +1,18 @@ +div.wy-menu-vertical ul.current li.toctree-l3 a { + font-weight: bold; +} + +a.documenter-source { + float: right; +} + +.documenter-methodtable pre { + margin-left: 0; + margin-right: 0; + margin-top: 0; + padding: 0; +} + +.documenter-methodtable pre.documenter-inline { + display: inline; +} diff --git a/docs/build/assets/mathjaxhelper.js b/docs/build/assets/mathjaxhelper.js new file mode 100644 index 0000000..3561b10 --- /dev/null +++ b/docs/build/assets/mathjaxhelper.js @@ -0,0 +1,25 @@ +MathJax.Hub.Config({ + "tex2jax": { + inlineMath: [['$','$'], ['\\(','\\)']], + processEscapes: true + } +}); +MathJax.Hub.Config({ + config: ["MMLorHTML.js"], + jax: [ + "input/TeX", + "output/HTML-CSS", + "output/NativeMML" + ], + extensions: [ + "MathMenu.js", + "MathZoom.js", + "TeX/AMSmath.js", + "TeX/AMSsymbols.js", + "TeX/autobold.js", + "TeX/autoload-all.js" + ] +}); +MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } +}); diff --git a/docs/build/index.md b/docs/build/index.md new file mode 100644 index 0000000..f49ca5b --- /dev/null +++ b/docs/build/index.md @@ -0,0 +1,354 @@ + + + + + +# OpenML.jl Documentation + + +This is the reference documentation of [`OpenML.jl`](https://github.com/JuliaAI/OpenML.jl). + + +The [OpenML platform](https://www.openml.org) provides an integration platform for carrying out and comparing machine learning solutions across a broad collection of public datasets and software platforms. 
+ + +Summary of OpenML.jl functionality: + + + * [`OpenML.list_tags`](index.md#OpenML.list_tags)`()`: for listing all dataset tags + * [`OpenML.list_datasets`](index.md#OpenML.list_datasets)`(; tag=nothing, filter=nothing, output_format=...)`: for listing available datasets + * [`OpenML.describe_dataset`](index.md#OpenML.describe_dataset)`(id)`: to describe a particular dataset + * [`OpenML.load`](index.md#OpenML.load)`(id; parser=:arff)`: to download a dataset + + + + + + +## Installation + + +```julia +using Pkg +Pkg.add("OpenML") +``` + + +If running the demonstration below: + + +```julia +Pkg.add("DataFrames") +Pkg.add("ScientificTypes") +``` + + + + + + +## Sample usage + + +```julia-repl +julia> using OpenML # or using MLJ + + +julia> using DataFrames + + +julia> OpenML.list_tags() +300-element Vector{Any}: + "study_41" + "uci" + "study_34" + "study_37" + "mythbusting_1" + "OpenML-CC18" + "study_99" + "artificial" + "BNG" + "study_16" + ⋮ + "Earth Science" + "Social Media" + "Meteorology" + "Geography" + "Language" + "Computational Universe" + "History" + "Culture" + "Sociology" +``` + + +Listing all datasets with the "OpenML100" tag which also have `n` instances and `p` features, where `100 < n < 1000` and `1 < p < 10`: + + +```julia-repl +julia> ds = OpenML.list_datasets( + tag = "OpenML100", + filter = "number_instances/100..1000/number_features/1..10", + output_format = DataFrame) +12×13 DataFrame + Row │ id name status MajorityClassSize Max ⋯ + │ Int64 String String Int64? 
Int ⋯ +─────┼────────────────────────────────────────────────────────────────────────── + 1 │ 11 balance-scale active 288 ⋯ + 2 │ 15 breast-w active 458 + 3 │ 37 diabetes active 500 + 4 │ 50 tic-tac-toe active 626 + 5 │ 333 monks-problems-1 active 278 ⋯ + 6 │ 334 monks-problems-2 active 395 + 7 │ 335 monks-problems-3 active 288 + 8 │ 451 irish active 278 + 9 │ 469 analcatdata_dmft active 155 ⋯ + 10 │ 470 profb active 448 + 11 │ 1464 blood-transfusion-service-center active 570 + 12 │ 40496 LED-display-domain-7digit active 57 + 9 columns omitted +``` + + +Describing and loading one of these datasets: + + +```julia-repl +julia> OpenML.describe_dataset(15) + Author: Dr. William H. Wolberg, University of Wisconsin Source: UCI + (https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)), + University of Wisconsin (http://pages.cs.wisc.edu/~olvi/uwmp/cancer.html) - + 1995 Please cite: See below, plus UCI + (https://archive.ics.uci.edu/ml/citation_policy.html) + + Breast Cancer Wisconsin (Original) Data Set. Features are computed from a + digitized image of a fine needle aspirate (FNA) of a breast mass. They + describe characteristics of the cell nuclei present in the image. The target + feature records the prognosis (malignant or benign). Original data available + here (ftp://ftp.cs.wisc.edu/math-prog/cpo-dataset/machine-learn/cancer/) + + Current dataset was adapted to ARFF format from the UCI version. Sample code + ID's were removed. + + ! Note that there is also a related Breast Cancer Wisconsin (Diagnosis) Data + Set with a different set of features, better known as wdbc + (https://www.openml.org/d/1510). + + Relevant Papers + ––––––––––––––– + + W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction + for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on + Electronic Imaging: Science and Technology, volume 1905, pages 861-870, San + Jose, CA, 1993. + + O.L. Mangasarian, W.N. Street and W.H. Wolberg. 
Breast cancer diagnosis and + prognosis via linear programming. Operations Research, 43(4), pages 570-577, + July-August 1995. + + Citation request + –––––––––––––––– + + This breast cancer database was obtained from the University of Wisconsin + Hospitals, Madison from Dr. William H. Wolberg. If you publish results when + using this database, then please include this information in your + acknowledgments. Also, please cite one or more of: + + 1. O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear + programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 + & 18. + + 2. William H. Wolberg and O.L. Mangasarian: "Multisurface method of + pattern separation for medical diagnosis applied to breast + cytology", Proceedings of the National Academy of Sciences, + U.S.A., Volume 87, December 1990, pp 9193-9196. + + 3. O. L. Mangasarian, R. Setiono, and W.H. Wolberg: "Pattern + recognition via linear programming: Theory and application to + medical diagnosis", in: "Large-scale numerical optimization", + Thomas F. Coleman and Yuying Li, editors, SIAM Publications, + Philadelphia 1990, pp 22-30. + + 4. K. P. Bennett & O. L. Mangasarian: "Robust linear programming + discrimination of two linearly inseparable sets", Optimization + Methods and Software 1, 1992, 23-34 (Gordon & Breach Science + Publishers). 
+ +julia> table = OpenML.load(15) +Tables.DictColumnTable with 699 rows, 10 columns, and schema: + :Clump_Thickness Float64 + :Cell_Size_Uniformity Float64 + :Cell_Shape_Uniformity Float64 + :Marginal_Adhesion Float64 + :Single_Epi_Cell_Size Float64 + :Bare_Nuclei Union{Missing, Float64} + :Bland_Chromatin Float64 + :Normal_Nucleoli Float64 + :Mitoses Float64 + :Class CategoricalArrays.CategoricalValue{String, UInt32} +``` + + +Converting to a data frame: + + +```julia-repl +julia> df = DataFrame(table) +699×10 DataFrame + Row │ Clump_Thickness Cell_Size_Uniformity Cell_Shape_Uniformity Marginal_ ⋯ + │ Float64 Float64 Float64 Float64 ⋯ +─────┼────────────────────────────────────────────────────────────────────────── + 1 │ 5.0 1.0 1.0 ⋯ + 2 │ 5.0 4.0 4.0 + 3 │ 3.0 1.0 1.0 + 4 │ 6.0 8.0 8.0 + 5 │ 4.0 1.0 1.0 ⋯ + 6 │ 8.0 10.0 10.0 + 7 │ 1.0 1.0 1.0 + 8 │ 2.0 1.0 2.0 + ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋱ + 693 │ 3.0 1.0 1.0 ⋯ + 694 │ 3.0 1.0 1.0 + 695 │ 3.0 1.0 1.0 + 696 │ 2.0 1.0 1.0 + 697 │ 5.0 10.0 10.0 ⋯ + 698 │ 4.0 8.0 6.0 + 699 │ 4.0 8.0 8.0 + 7 columns and 684 rows omitted +``` + + +Inspecting it's schema: + + +```julia-repl +julia> using ScientificTypes + + +julia> schema(table) +┌───────────────────────┬────────────────────────────┬────────────────────────── +│ names │ scitypes │ types ⋯ +├───────────────────────┼────────────────────────────┼────────────────────────── +│ Clump_Thickness │ Continuous │ Float64 ⋯ +│ Cell_Size_Uniformity │ Continuous │ Float64 ⋯ +│ Cell_Shape_Uniformity │ Continuous │ Float64 ⋯ +│ Marginal_Adhesion │ Continuous │ Float64 ⋯ +│ Single_Epi_Cell_Size │ Continuous │ Float64 ⋯ +│ Bare_Nuclei │ Union{Missing, Continuous} │ Union{Missing, Float64} ⋯ +│ Bland_Chromatin │ Continuous │ Float64 ⋯ +│ Normal_Nucleoli │ Continuous │ Float64 ⋯ +│ Mitoses │ Continuous │ Float64 ⋯ +│ Class │ Multiclass{2} │ CategoricalValue{String ⋯ +└───────────────────────┴────────────────────────────┴────────────────────────── + 1 column omitted +``` + + + + + + +## Public API + 
+### **`OpenML.list_tags`** + + + + +```julia +list_tags() +``` + +List all available tags. + +### **`OpenML.list_datasets`** + +```julia +list_datasets(; tag = nothing, filters = "", output_format = NamedTuple) +``` + +Lists all active OpenML datasets, if `tag = nothing` (default). To list only datasets with a given tag, choose one of the tags in [`list_tags()`](index.md#OpenML.list_tags). An alternative `output_format` can be chosen, e.g. `DataFrame`, if the `DataFrames` package is loaded. + +A filter is a string of `/` or `/` pairs, concatenated using `/`, such as + +```julia + filter = "number_features/10/number_instances/500..10000" +``` + +The allowed data qualities include `tag`, `status`, `limit`, `offset`, `data_id`, `data_name`, `data_version`, `uploader`, `number_instances`, `number_features`, `number_classes`, `number_missing_values`. + +For more on the format and effect of `filters` refer to the [openml API](https://www.openml.org/api_docs#!/data/get_data_list_filters). + +**Examples** + +``` +julia> using DataFrames + +julia> ds = OpenML.list_datasets( + tag = "OpenML100", + filter = "number_instances/100..1000/number_features/1..10", + output_format = DataFrame +) + +julia> sort!(ds, :NumberOfFeatures) +``` + +### **`OpenML.describe_dataset`** + +```julia +describe_dataset(id) +``` + +Load and show the OpenML description of the data set `id`. Use [`list_datasets`](index.md#OpenML.list_datasets) to browse available data sets. + +**Examples** + +``` +julia> OpenML.describe_dataset(6) + Author: David J. Slate Source: UCI + (https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991 Please cite: P. + W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers". + Machine Learning 6(2), 1991 + + 1. TITLE: + + Letter Image Recognition Data + + The objective is to identify each of a large number of black-and-white + rectangular pixel displays as one of the 26 capital letters in the English + alphabet. 
The character images were based on 20 different fonts and each + letter within these 20 fonts was randomly distorted to produce a file of + 20,000 unique stimuli. Each stimulus was converted into 16 primitive + numerical attributes (statistical moments and edge counts) which were then + scaled to fit into a range of integer values from 0 through 15. We + typically train on the first 16000 items and then use the resulting model + to predict the letter category for the remaining 4000. See the article + cited above for more details. +``` + +### **`OpenML.load`** + + + +```julia +OpenML.load(id; maxbytes = nothing) +``` + +Load the OpenML dataset with specified `id`, from those listed by [`list_datasets`](index.md#OpenML.list_datasets) or on the [OpenML site](https://www.openml.org/search?type=data). + +Datasets are saved as julia artifacts so that they persist locally once loaded. + +Returns a table. + +**Examples** + +```julia +using DataFrames +table = OpenML.load(61) +df = DataFrame(table) # transform to a DataFrame +using ScientificTypes +df2 = coerce(df, autotype(df)) # corce to automatically detected scientific types + +peek_table = OpenML.load(61, maxbytes = 1024) # load only the first 1024 bytes of the table +``` + diff --git a/docs/make-md.jl b/docs/make-md.jl new file mode 100644 index 0000000..d33f1c0 --- /dev/null +++ b/docs/make-md.jl @@ -0,0 +1,7 @@ +using OpenML, DataFrames, ScientificTypes, DocumenterMarkdown, Documenter + +makedocs( + format = Markdown(), + modules = [OpenML,], + sitename = "OpenML.jl", +) diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..e78e9b4 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,57 @@ +site_name: OpenML.jl +repo_url: https://github.com/openml/OpenML.jl +site_description: "This is the reference documentation of OpenML.jl" + +theme: + name: "material" + language: "en" + palette: + # Light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: indigo + accent: indigo + toggle: + icon: 
material/toggle-switch-off-outline + name: Switch to dark mode + # Dark mode + - media: "(prefers-color-scheme: dark)" + primary: indigo + accent: indigo + scheme: slate + toggle: + icon: material/toggle-switch + name: Switch to light mode + font: + text: "Roboto" + code: "Roboto Mono" + icon: + edit: material/pencil + view: material/eye + +extra_css: + - assets/Documenter.css + +extra_javascript: + - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS_HTML + - assets/mathjaxhelper.js + +markdown_extensions: + - extra + - tables + - fenced_code + - admonition + - codehilite + - attr_list + - pymdownx.details + - pymdownx.superfences + - pymdownx.highlight: + linenums: true + - pymdownx.inlinehilite + - toc: + permalink: true + +docs_dir: 'docs/build' + +nav: + - Home: index.md From c25e2716f7d4dc530191d1f5aff8f8f7954d6e65 Mon Sep 17 00:00:00 2001 From: joaquinvanschoren Date: Tue, 15 Oct 2024 23:25:51 +0200 Subject: [PATCH 02/20] markdown docs, baby --- docs/build/index.md | 14 +++++++------- mkdocs.yml | 10 ++++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/build/index.md b/docs/build/index.md index f49ca5b..086546f 100644 --- a/docs/build/index.md +++ b/docs/build/index.md @@ -50,7 +50,7 @@ Pkg.add("ScientificTypes") ## Sample usage -```julia-repl +```julia julia> using OpenML # or using MLJ @@ -85,7 +85,7 @@ julia> OpenML.list_tags() Listing all datasets with the "OpenML100" tag which also have `n` instances and `p` features, where `100 < n < 1000` and `1 < p < 10`: -```julia-repl +```julia julia> ds = OpenML.list_datasets( tag = "OpenML100", filter = "number_instances/100..1000/number_features/1..10", @@ -113,7 +113,7 @@ julia> ds = OpenML.list_datasets( Describing and loading one of these datasets: -```julia-repl +```julia julia> OpenML.describe_dataset(15) Author: Dr. William H. 
Wolberg, University of Wisconsin Source: UCI (https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)), @@ -192,7 +192,7 @@ Tables.DictColumnTable with 699 rows, 10 columns, and schema: Converting to a data frame: -```julia-repl +```julia julia> df = DataFrame(table) 699×10 DataFrame Row │ Clump_Thickness Cell_Size_Uniformity Cell_Shape_Uniformity Marginal_ ⋯ @@ -221,7 +221,7 @@ julia> df = DataFrame(table) Inspecting it's schema: -```julia-repl +```julia julia> using ScientificTypes @@ -281,7 +281,7 @@ For more on the format and effect of `filters` refer to the [openml API](https:/ **Examples** -``` +```julia julia> using DataFrames julia> ds = OpenML.list_datasets( @@ -303,7 +303,7 @@ Load and show the OpenML description of the data set `id`. Use [`list_datasets`] **Examples** -``` +```julia julia> OpenML.describe_dataset(6) Author: David J. Slate Source: UCI (https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991 Please cite: P. diff --git a/mkdocs.yml b/mkdocs.yml index e78e9b4..5117d01 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -44,12 +44,14 @@ markdown_extensions: - codehilite - attr_list - pymdownx.details - - pymdownx.superfences - pymdownx.highlight: - linenums: true + anchor_linenums: true + line_spans: __span + pygments_lang_class: true - pymdownx.inlinehilite - - toc: - permalink: true + - pymdownx.snippets + - pymdownx.superfences + docs_dir: 'docs/build' From 72bf008bc7954009b019363fece4a80c2c5233e5 Mon Sep 17 00:00:00 2001 From: Christoph Date: Thu, 31 Oct 2024 11:00:29 +0100 Subject: [PATCH 03/20] Update load function --- src/data.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/data.jl b/src/data.jl index e3bf2ef..d91e873 100644 --- a/src/data.jl +++ b/src/data.jl @@ -72,7 +72,10 @@ function load(id::Int; maxbytes = nothing) @info "Downloading dataset $id." 
download(load_Dataset_Description(id)["data_set_description"]["url"], fname) end - ARFFFiles.load(x -> ARFFFiles.readcolumns(x; maxbytes = maxbytes), fname) + open(fname) do io + reader = ARFFFiles.loadstreaming(io) + return ARFFFiles.readcolumns(reader; maxbytes=maxbytes) + end end From 199bcd33a5ecc57aeac4d79da6f9841586b26271 Mon Sep 17 00:00:00 2001 From: jbrea Date: Fri, 1 Nov 2024 15:43:33 +0100 Subject: [PATCH 04/20] test on new LTS --- .github/workflows/CI.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index d0f36cb..5805770 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -16,6 +16,7 @@ jobs: matrix: julia-version: - "1.6" + - "1.10" - "1" - "nightly" os: From d1f2f4d707440554d6ab066be10a437458966b73 Mon Sep 17 00:00:00 2001 From: jbrea Date: Fri, 1 Nov 2024 15:47:58 +0100 Subject: [PATCH 05/20] bump version to 0.3.2 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 4189b00..fc1c896 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "OpenML" uuid = "8b6db2d4-7670-4922-a472-f9537c81ab66" authors = ["Diego Arenas ", "Anthony D. 
Blaom "] -version = "0.3.1" +version = "0.3.2" [deps] ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8" From 0bf16882881350842498f8f7c52cc0fe85b8597f Mon Sep 17 00:00:00 2001 From: jbrea Date: Sat, 2 Nov 2024 09:28:13 +0100 Subject: [PATCH 06/20] Update TagBot.yml --- .github/workflows/TagBot.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index f49313b..4bad0ec 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -4,6 +4,22 @@ on: types: - created workflow_dispatch: + inputs: + lookback: + default: "3" +permissions: + actions: read + checks: read + contents: write + deployments: read + issues: read + discussions: read + packages: read + pages: read + pull-requests: read + repository-projects: read + security-events: read + statuses: read jobs: TagBot: if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' @@ -12,4 +28,6 @@ jobs: - uses: JuliaRegistries/TagBot@v1 with: token: ${{ secrets.GITHUB_TOKEN }} + # Edit the following line to reflect the actual name of the GitHub Secret containing your private key ssh: ${{ secrets.DOCUMENTER_KEY }} + # ssh: ${{ secrets.NAME_OF_MY_SSH_PRIVATE_KEY_SECRET }} From 6721dd1edb5dd226b25f47499e29ab8cd4267521 Mon Sep 17 00:00:00 2001 From: jbrea Date: Sat, 2 Nov 2024 09:57:24 +0100 Subject: [PATCH 07/20] Update Project.toml --- docs/Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Project.toml b/docs/Project.toml index ac90fed..ea93df0 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -3,7 +3,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" OpenML = "8b6db2d4-7670-4922-a472-f9537c81ab66" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" -DocumenterMarkdown +DocumenterMarkdown = "997ab1e6-3595-5248-9280-8efb232c3433" [compat] Documenter = "~0.26" From 502599b069e5f8e6a0a86df24640e653ac8a01e6 
Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 19 Dec 2025 08:48:12 +1300 Subject: [PATCH 08/20] extend JSON = "0.21, 1" --- .gitignore | 1 + Project.toml | 2 +- docs/build/assets/Documenter.css | 18 -- docs/build/assets/mathjaxhelper.js | 25 -- docs/build/index.md | 354 ----------------------------- 5 files changed, 2 insertions(+), 398 deletions(-) delete mode 100644 docs/build/assets/Documenter.css delete mode 100644 docs/build/assets/mathjaxhelper.js delete mode 100644 docs/build/index.md diff --git a/.gitignore b/.gitignore index 5a61b23..0e1b98c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ Manifest.toml #* .DS_Store sandbox/ +/docs/build/ /docs/site/ /docs/Manifest.toml .vscode diff --git a/Project.toml b/Project.toml index fc1c896..78984aa 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,7 @@ Scratch = "6c6a2e73-6563-6170-7368-637461726353" [compat] ARFFFiles = "1.4.1" HTTP = "0.8, 0.9, 1" -JSON = "0.21" +JSON = "0.21, 1" Scratch = "1.1" julia = "1.6" diff --git a/docs/build/assets/Documenter.css b/docs/build/assets/Documenter.css deleted file mode 100644 index d9af5d6..0000000 --- a/docs/build/assets/Documenter.css +++ /dev/null @@ -1,18 +0,0 @@ -div.wy-menu-vertical ul.current li.toctree-l3 a { - font-weight: bold; -} - -a.documenter-source { - float: right; -} - -.documenter-methodtable pre { - margin-left: 0; - margin-right: 0; - margin-top: 0; - padding: 0; -} - -.documenter-methodtable pre.documenter-inline { - display: inline; -} diff --git a/docs/build/assets/mathjaxhelper.js b/docs/build/assets/mathjaxhelper.js deleted file mode 100644 index 3561b10..0000000 --- a/docs/build/assets/mathjaxhelper.js +++ /dev/null @@ -1,25 +0,0 @@ -MathJax.Hub.Config({ - "tex2jax": { - inlineMath: [['$','$'], ['\\(','\\)']], - processEscapes: true - } -}); -MathJax.Hub.Config({ - config: ["MMLorHTML.js"], - jax: [ - "input/TeX", - "output/HTML-CSS", - "output/NativeMML" - ], - extensions: [ - "MathMenu.js", - "MathZoom.js", - 
"TeX/AMSmath.js", - "TeX/AMSsymbols.js", - "TeX/autobold.js", - "TeX/autoload-all.js" - ] -}); -MathJax.Hub.Config({ - TeX: { equationNumbers: { autoNumber: "AMS" } } -}); diff --git a/docs/build/index.md b/docs/build/index.md deleted file mode 100644 index 086546f..0000000 --- a/docs/build/index.md +++ /dev/null @@ -1,354 +0,0 @@ - - - - - -# OpenML.jl Documentation - - -This is the reference documentation of [`OpenML.jl`](https://github.com/JuliaAI/OpenML.jl). - - -The [OpenML platform](https://www.openml.org) provides an integration platform for carrying out and comparing machine learning solutions across a broad collection of public datasets and software platforms. - - -Summary of OpenML.jl functionality: - - - * [`OpenML.list_tags`](index.md#OpenML.list_tags)`()`: for listing all dataset tags - * [`OpenML.list_datasets`](index.md#OpenML.list_datasets)`(; tag=nothing, filter=nothing, output_format=...)`: for listing available datasets - * [`OpenML.describe_dataset`](index.md#OpenML.describe_dataset)`(id)`: to describe a particular dataset - * [`OpenML.load`](index.md#OpenML.load)`(id; parser=:arff)`: to download a dataset - - - - - - -## Installation - - -```julia -using Pkg -Pkg.add("OpenML") -``` - - -If running the demonstration below: - - -```julia -Pkg.add("DataFrames") -Pkg.add("ScientificTypes") -``` - - - - - - -## Sample usage - - -```julia -julia> using OpenML # or using MLJ - - -julia> using DataFrames - - -julia> OpenML.list_tags() -300-element Vector{Any}: - "study_41" - "uci" - "study_34" - "study_37" - "mythbusting_1" - "OpenML-CC18" - "study_99" - "artificial" - "BNG" - "study_16" - ⋮ - "Earth Science" - "Social Media" - "Meteorology" - "Geography" - "Language" - "Computational Universe" - "History" - "Culture" - "Sociology" -``` - - -Listing all datasets with the "OpenML100" tag which also have `n` instances and `p` features, where `100 < n < 1000` and `1 < p < 10`: - - -```julia -julia> ds = OpenML.list_datasets( - tag = "OpenML100", - filter 
= "number_instances/100..1000/number_features/1..10", - output_format = DataFrame) -12×13 DataFrame - Row │ id name status MajorityClassSize Max ⋯ - │ Int64 String String Int64? Int ⋯ -─────┼────────────────────────────────────────────────────────────────────────── - 1 │ 11 balance-scale active 288 ⋯ - 2 │ 15 breast-w active 458 - 3 │ 37 diabetes active 500 - 4 │ 50 tic-tac-toe active 626 - 5 │ 333 monks-problems-1 active 278 ⋯ - 6 │ 334 monks-problems-2 active 395 - 7 │ 335 monks-problems-3 active 288 - 8 │ 451 irish active 278 - 9 │ 469 analcatdata_dmft active 155 ⋯ - 10 │ 470 profb active 448 - 11 │ 1464 blood-transfusion-service-center active 570 - 12 │ 40496 LED-display-domain-7digit active 57 - 9 columns omitted -``` - - -Describing and loading one of these datasets: - - -```julia -julia> OpenML.describe_dataset(15) - Author: Dr. William H. Wolberg, University of Wisconsin Source: UCI - (https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)), - University of Wisconsin (http://pages.cs.wisc.edu/~olvi/uwmp/cancer.html) - - 1995 Please cite: See below, plus UCI - (https://archive.ics.uci.edu/ml/citation_policy.html) - - Breast Cancer Wisconsin (Original) Data Set. Features are computed from a - digitized image of a fine needle aspirate (FNA) of a breast mass. They - describe characteristics of the cell nuclei present in the image. The target - feature records the prognosis (malignant or benign). Original data available - here (ftp://ftp.cs.wisc.edu/math-prog/cpo-dataset/machine-learn/cancer/) - - Current dataset was adapted to ARFF format from the UCI version. Sample code - ID's were removed. - - ! Note that there is also a related Breast Cancer Wisconsin (Diagnosis) Data - Set with a different set of features, better known as wdbc - (https://www.openml.org/d/1510). - - Relevant Papers - ––––––––––––––– - - W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction - for breast tumor diagnosis. 
IS&T/SPIE 1993 International Symposium on - Electronic Imaging: Science and Technology, volume 1905, pages 861-870, San - Jose, CA, 1993. - - O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and - prognosis via linear programming. Operations Research, 43(4), pages 570-577, - July-August 1995. - - Citation request - –––––––––––––––– - - This breast cancer database was obtained from the University of Wisconsin - Hospitals, Madison from Dr. William H. Wolberg. If you publish results when - using this database, then please include this information in your - acknowledgments. Also, please cite one or more of: - - 1. O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear - programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 - & 18. - - 2. William H. Wolberg and O.L. Mangasarian: "Multisurface method of - pattern separation for medical diagnosis applied to breast - cytology", Proceedings of the National Academy of Sciences, - U.S.A., Volume 87, December 1990, pp 9193-9196. - - 3. O. L. Mangasarian, R. Setiono, and W.H. Wolberg: "Pattern - recognition via linear programming: Theory and application to - medical diagnosis", in: "Large-scale numerical optimization", - Thomas F. Coleman and Yuying Li, editors, SIAM Publications, - Philadelphia 1990, pp 22-30. - - 4. K. P. Bennett & O. L. Mangasarian: "Robust linear programming - discrimination of two linearly inseparable sets", Optimization - Methods and Software 1, 1992, 23-34 (Gordon & Breach Science - Publishers). 
- -julia> table = OpenML.load(15) -Tables.DictColumnTable with 699 rows, 10 columns, and schema: - :Clump_Thickness Float64 - :Cell_Size_Uniformity Float64 - :Cell_Shape_Uniformity Float64 - :Marginal_Adhesion Float64 - :Single_Epi_Cell_Size Float64 - :Bare_Nuclei Union{Missing, Float64} - :Bland_Chromatin Float64 - :Normal_Nucleoli Float64 - :Mitoses Float64 - :Class CategoricalArrays.CategoricalValue{String, UInt32} -``` - - -Converting to a data frame: - - -```julia -julia> df = DataFrame(table) -699×10 DataFrame - Row │ Clump_Thickness Cell_Size_Uniformity Cell_Shape_Uniformity Marginal_ ⋯ - │ Float64 Float64 Float64 Float64 ⋯ -─────┼────────────────────────────────────────────────────────────────────────── - 1 │ 5.0 1.0 1.0 ⋯ - 2 │ 5.0 4.0 4.0 - 3 │ 3.0 1.0 1.0 - 4 │ 6.0 8.0 8.0 - 5 │ 4.0 1.0 1.0 ⋯ - 6 │ 8.0 10.0 10.0 - 7 │ 1.0 1.0 1.0 - 8 │ 2.0 1.0 2.0 - ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋱ - 693 │ 3.0 1.0 1.0 ⋯ - 694 │ 3.0 1.0 1.0 - 695 │ 3.0 1.0 1.0 - 696 │ 2.0 1.0 1.0 - 697 │ 5.0 10.0 10.0 ⋯ - 698 │ 4.0 8.0 6.0 - 699 │ 4.0 8.0 8.0 - 7 columns and 684 rows omitted -``` - - -Inspecting it's schema: - - -```julia -julia> using ScientificTypes - - -julia> schema(table) -┌───────────────────────┬────────────────────────────┬────────────────────────── -│ names │ scitypes │ types ⋯ -├───────────────────────┼────────────────────────────┼────────────────────────── -│ Clump_Thickness │ Continuous │ Float64 ⋯ -│ Cell_Size_Uniformity │ Continuous │ Float64 ⋯ -│ Cell_Shape_Uniformity │ Continuous │ Float64 ⋯ -│ Marginal_Adhesion │ Continuous │ Float64 ⋯ -│ Single_Epi_Cell_Size │ Continuous │ Float64 ⋯ -│ Bare_Nuclei │ Union{Missing, Continuous} │ Union{Missing, Float64} ⋯ -│ Bland_Chromatin │ Continuous │ Float64 ⋯ -│ Normal_Nucleoli │ Continuous │ Float64 ⋯ -│ Mitoses │ Continuous │ Float64 ⋯ -│ Class │ Multiclass{2} │ CategoricalValue{String ⋯ -└───────────────────────┴────────────────────────────┴────────────────────────── - 1 column omitted -``` - - - - - - -## Public API - -### 
**`OpenML.list_tags`** - - - - -```julia -list_tags() -``` - -List all available tags. - -### **`OpenML.list_datasets`** - -```julia -list_datasets(; tag = nothing, filters = "", output_format = NamedTuple) -``` - -Lists all active OpenML datasets, if `tag = nothing` (default). To list only datasets with a given tag, choose one of the tags in [`list_tags()`](index.md#OpenML.list_tags). An alternative `output_format` can be chosen, e.g. `DataFrame`, if the `DataFrames` package is loaded. - -A filter is a string of `/` or `/` pairs, concatenated using `/`, such as - -```julia - filter = "number_features/10/number_instances/500..10000" -``` - -The allowed data qualities include `tag`, `status`, `limit`, `offset`, `data_id`, `data_name`, `data_version`, `uploader`, `number_instances`, `number_features`, `number_classes`, `number_missing_values`. - -For more on the format and effect of `filters` refer to the [openml API](https://www.openml.org/api_docs#!/data/get_data_list_filters). - -**Examples** - -```julia -julia> using DataFrames - -julia> ds = OpenML.list_datasets( - tag = "OpenML100", - filter = "number_instances/100..1000/number_features/1..10", - output_format = DataFrame -) - -julia> sort!(ds, :NumberOfFeatures) -``` - -### **`OpenML.describe_dataset`** - -```julia -describe_dataset(id) -``` - -Load and show the OpenML description of the data set `id`. Use [`list_datasets`](index.md#OpenML.list_datasets) to browse available data sets. - -**Examples** - -```julia -julia> OpenML.describe_dataset(6) - Author: David J. Slate Source: UCI - (https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991 Please cite: P. - W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers". - Machine Learning 6(2), 1991 - - 1. TITLE: - - Letter Image Recognition Data - - The objective is to identify each of a large number of black-and-white - rectangular pixel displays as one of the 26 capital letters in the English - alphabet. 
The character images were based on 20 different fonts and each - letter within these 20 fonts was randomly distorted to produce a file of - 20,000 unique stimuli. Each stimulus was converted into 16 primitive - numerical attributes (statistical moments and edge counts) which were then - scaled to fit into a range of integer values from 0 through 15. We - typically train on the first 16000 items and then use the resulting model - to predict the letter category for the remaining 4000. See the article - cited above for more details. -``` - -### **`OpenML.load`** - - - -```julia -OpenML.load(id; maxbytes = nothing) -``` - -Load the OpenML dataset with specified `id`, from those listed by [`list_datasets`](index.md#OpenML.list_datasets) or on the [OpenML site](https://www.openml.org/search?type=data). - -Datasets are saved as julia artifacts so that they persist locally once loaded. - -Returns a table. - -**Examples** - -```julia -using DataFrames -table = OpenML.load(61) -df = DataFrame(table) # transform to a DataFrame -using ScientificTypes -df2 = coerce(df, autotype(df)) # corce to automatically detected scientific types - -peek_table = OpenML.load(61, maxbytes = 1024) # load only the first 1024 bytes of the table -``` - From 226a563fee034e9afb927c9125a55d9cfe054687 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 19 Dec 2025 08:58:56 +1300 Subject: [PATCH 09/20] update a GH action --- .github/workflows/CI.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 5805770..5e44c6d 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -32,7 +32,7 @@ jobs: version: ${{ matrix.julia-version }} arch: ${{ matrix.julia-arch }} - name: Cache artifacts - uses: actions/cache@v2 + uses: julia-actions/cache@v2 env: cache-name: cache-artifacts with: From f37468193043d8b774eeb74279e7d354b2c8d589 Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Fri, 19 Dec 2025 09:00:02 +1300 Subject: [PATCH 10/20] use Downloads.download instead of Base.download --- Project.toml | 4 +++- src/OpenML.jl | 1 + src/data.jl | 4 +++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 78984aa..c5a65b9 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.3.2" [deps] ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" @@ -13,15 +14,16 @@ Scratch = "6c6a2e73-6563-6170-7368-637461726353" [compat] ARFFFiles = "1.4.1" +Downloads = "1.6.0" HTTP = "0.8, 0.9, 1" JSON = "0.21, 1" Scratch = "1.1" julia = "1.6" [extras] +Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" [targets] test = ["Tables", "Test", "Logging"] diff --git a/src/OpenML.jl b/src/OpenML.jl index c66396e..2faaff4 100644 --- a/src/OpenML.jl +++ b/src/OpenML.jl @@ -5,6 +5,7 @@ using JSON import ARFFFiles using Markdown using Scratch +import Downloads export OpenML diff --git a/src/data.jl b/src/data.jl index d91e873..45a8fb3 100644 --- a/src/data.jl +++ b/src/data.jl @@ -70,7 +70,9 @@ function load(id::Int; maxbytes = nothing) fname = joinpath(download_cache, "$id.arff") if !isfile(fname) @info "Downloading dataset $id." - download(load_Dataset_Description(id)["data_set_description"]["url"], fname) + Downloads.download( + load_Dataset_Description(id)["data_set_description"]["url"], + fname,) end open(fname) do io reader = ARFFFiles.loadstreaming(io) From 37526212d86895b918b46859bd5074bd733e1fae Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Fri, 19 Dec 2025 09:07:35 +1300 Subject: [PATCH 11/20] stop testing old version of julia --- .github/workflows/CI.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 5e44c6d..fc93ed4 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -15,10 +15,8 @@ jobs: fail-fast: false matrix: julia-version: - - "1.6" - - "1.10" + - "lts" - "1" - - "nightly" os: - ubuntu-latest - macos-latest @@ -26,8 +24,8 @@ jobs: julia-arch: - x64 steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v6 + - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.julia-version }} arch: ${{ matrix.julia-arch }} From 0416577e51c76568e255332b899db68366be997b Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 19 Dec 2025 09:09:40 +1300 Subject: [PATCH 12/20] rm what appears to be a now invalid test --- test/data.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data.jl b/test/data.jl index 32df589..9c98395 100644 --- a/test/data.jl +++ b/test/data.jl @@ -16,7 +16,7 @@ offset = 8 filters_test = OpenML.load_List_And_Filter("limit/$limit/offset/$offset") @testset "HTTP connection" begin - @test typeof(response_test) <: Dict +# @test typeof(response_test) <: Dict @test response_test["data_set_description"]["name"] == "iris" @test response_test["data_set_description"]["format"] == "ARFF" end From b545e1a1394d01459a08b213aee1a94422980f3f Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Fri, 19 Dec 2025 09:19:36 +1300 Subject: [PATCH 13/20] rm more of the same --- test/data.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/data.jl b/test/data.jl index 9c98395..7596406 100644 --- a/test/data.jl +++ b/test/data.jl @@ -30,11 +30,11 @@ end @testset "data api functions" begin @test typeof(dqlist_test["data_qualities_list"]) <: Dict - @test typeof(data_features_test) <: Dict +# @test typeof(data_features_test) <: Dict @test length(data_features_test["data_features"]["feature"]) == 5 @test data_features_test["data_features"]["feature"][1]["name"] == "sepallength" - @test typeof(data_qualities_test) <: Dict +# @test typeof(data_qualities_test) <: Dict @test length(filters_test["data"]["dataset"]) == limit @test length(filters_test["data"]["dataset"][1]) == offset From cd9507696d0c267b79795e5ab775213c2e002901 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 19 Dec 2025 09:28:41 +1300 Subject: [PATCH 14/20] rm more of the same --- test/data.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/data.jl b/test/data.jl index 7596406..4da08c9 100644 --- a/test/data.jl +++ b/test/data.jl @@ -28,8 +28,7 @@ end end @testset "data api functions" begin - @test typeof(dqlist_test["data_qualities_list"]) <: Dict - +# @test typeof(dqlist_test["data_qualities_list"]) <: Dict # @test typeof(data_features_test) <: Dict @test length(data_features_test["data_features"]["feature"]) == 5 @test data_features_test["data_features"]["feature"][1]["name"] == "sepallength" From 21e3537b1c8a2d073aaaf814c88ad2d4e800c8db Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Fri, 19 Dec 2025 09:44:47 +1300 Subject: [PATCH 15/20] rm all traces of these redundant tests --- test/data.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/data.jl b/test/data.jl index 4da08c9..38880e3 100644 --- a/test/data.jl +++ b/test/data.jl @@ -28,13 +28,8 @@ end end @testset "data api functions" begin -# @test typeof(dqlist_test["data_qualities_list"]) <: Dict -# @test typeof(data_features_test) <: Dict @test length(data_features_test["data_features"]["feature"]) == 5 @test data_features_test["data_features"]["feature"][1]["name"] == "sepallength" - -# @test typeof(data_qualities_test) <: Dict - @test length(filters_test["data"]["dataset"]) == limit @test length(filters_test["data"]["dataset"][1]) == offset end From fa687a0ac14fcb42498e72c02118f34d911f4ffe Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 19 Dec 2025 09:46:23 +1300 Subject: [PATCH 16/20] bump 0.3.3 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index c5a65b9..d149f1a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "OpenML" uuid = "8b6db2d4-7670-4922-a472-f9537c81ab66" authors = ["Diego Arenas ", "Anthony D. Blaom "] -version = "0.3.2" +version = "0.3.3" [deps] ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8" From cd2d6d3bb12e59c0f7696b0b52535291485cd44e Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Fri, 19 Dec 2025 10:13:21 +1300 Subject: [PATCH 17/20] bump Documenter version, dump unsupported DocumenterMarkdown --- README.md | 21 --------------------- docs/Project.toml | 3 +-- docs/make.jl | 8 ++++++-- 3 files changed, 7 insertions(+), 25 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 59ba3c6..0000000 --- a/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# OpenML.jl - -| Linux | Coverage | Documentation | -| :-----------: | :------: | :-------: | -| [![Build status](https://github.com/JuliaAI/OpenML.jl/workflows/CI/badge.svg)](https://github.com/JuliaAI/OpenML.jl/actions)| [![codecov.io](http://codecov.io/github/JuliaAI/OpenML.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaAI/OpenML.jl?branch=master) | [![](https://img.shields.io/badge/docs-stable-blue.svg)](https://JuliaAI.github.io/OpenML.jl/stable) | - -Partial implementation of the [OpenML](https://www.openml.org) API for -Julia. At present this package allows querying and -downloading of OpenML datasets. - -For further integration with the -[MLJ](https://JuliaAI.github.io/MLJ.jl/dev/) machine -learning framework (such as uploading MLJ runs) see -[MLJOpenML.jl](https://github.com/JuliaAI/MLJOpenML.jl). - - -**Acknowledgements.** The code in this repository is based on contributions of Diego Arenas -to [MLJBase.jl](https://github.com/JuliaAI/MLJBase.jl) which do not -appear in the commit history of this repository. - -Package documentation is [here](https://JuliaAI.github.io/OpenML.jl/stable). 
diff --git a/docs/Project.toml b/docs/Project.toml index ea93df0..196dc6f 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -3,7 +3,6 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" OpenML = "8b6db2d4-7670-4922-a472-f9537c81ab66" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" -DocumenterMarkdown = "997ab1e6-3595-5248-9280-8efb232c3433" [compat] -Documenter = "~0.26" +Documenter = "1" diff --git a/docs/make.jl b/docs/make.jl index de9753c..ed60d6f 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,11 +1,15 @@ using Documenter, OpenML, DataFrames +const REPO = Remotes.GitHub("JuliaAI", "OpenML.jl") + makedocs( modules = [OpenML,], sitename = "OpenML.jl", + warnonly = [:cross_references, :missing_docs], + repo = REPO, ) deploydocs( - repo = "github.com/JuliaAI/OpenML.jl", - push_preview = true + repo = "github.com/JuliaAI/OpenML.jl.git", + devbranch="dev", ) From f8a3e8eb68ddf828d194d408b06ea6ab3774df8c Mon Sep 17 00:00:00 2001 From: "Anthony D.
Blaom" Date: Fri, 19 Dec 2025 10:19:18 +1300 Subject: [PATCH 18/20] simplify doc generation logic --- .github/workflows/CI.yml | 68 ++++++---------------------------------- 1 file changed, 9 insertions(+), 59 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index fc93ed4..88b62c3 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -53,67 +53,17 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: julia-actions/setup-julia@v2 with: version: '1' - - run: | - julia -e ' - function set_environment_variable(name::AbstractString, value::AbstractString) - github_env = ENV["GITHUB_ENV"] - touch(github_env) - open(github_env, "a") do io - println(io, "$(name)=$(value)") - end - end - event_name = "${{ github.event_name }}" - if event_name == "pull_request" - base_ref = "${{ github.base_ref }}" - head_ref = "${{ github.head_ref }}" - base_repository = "${{ github.repository }}" - head_repository = "${{ github.event.pull_request.head.repo.full_name }}" - build_docs = (base_ref == "master") && (head_ref == "dev") && (base_repository == head_repository) - elseif event_name == "push" - ref = "${{ github.ref }}" - build_docs = (ref == "refs/heads/master") || (startswith(ref, "refs/tags/")) - elseif event_name == "schedule" - build_docs = ref == "refs/heads/master" - elseif event_name == "workflow_dispatch" - build_docs = ref == "refs/heads/master" - else - build_docs = false - end - if build_docs - @info("We will build the docs") - set_environment_variable("BUILD_DOCS", "true") - else - @info("We will NOT build the docs") - set_environment_variable("BUILD_DOCS", "false") - end' - - run: | - julia --project=docs -e ' - if ENV["BUILD_DOCS"] == "true" - using Pkg - Pkg.develop(PackageSpec(path=pwd())) - Pkg.instantiate() - end' - - run: | - julia --project=docs -e ' - if ENV["BUILD_DOCS"] == "true" - using Documenter: doctest - using OpenML - @info "attempting to run the 
doctests" - doctest(OpenML) - else - @info "skipping the doctests" - end' - - run: julia --project=docs -e ' - if ENV["BUILD_DOCS"] == "true" - @info "attempting to build the docs" - run(`julia --project=docs docs/make.jl`) - @info "successfully built the docs" - else - @info "skipping the docs build" - end' + - uses: julia-actions/julia-buildpkg@latest + - uses: julia-actions/julia-docdeploy@latest env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} + - run: | + julia --project=docs -e ' + using Documenter: DocMeta, doctest + using LearnAPI + DocMeta.setdocmeta!(OpenML, :DocTestSetup, :(using OpenML); recursive=true) + doctest(OpenML)' From d7653338d44eca1ede9a9aa257541a9772c527bd Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 19 Dec 2025 10:32:16 +1300 Subject: [PATCH 19/20] typo --- .github/workflows/CI.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 88b62c3..6fd4975 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -64,6 +64,6 @@ jobs: - run: | julia --project=docs -e ' using Documenter: DocMeta, doctest - using LearnAPI + using OpenML DocMeta.setdocmeta!(OpenML, :DocTestSetup, :(using OpenML); recursive=true) doctest(OpenML)' From 070bfa305294ab3230fef90a64f8a788b2b2bf39 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 19 Dec 2025 11:00:27 +1300 Subject: [PATCH 20/20] dummy commit --- src/OpenML.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/OpenML.jl b/src/OpenML.jl index 2faaff4..2b947e0 100644 --- a/src/OpenML.jl +++ b/src/OpenML.jl @@ -18,3 +18,4 @@ function __init__() end end # module +