Standardize rworkflows CI, update code, and add new functions

bschilder · claude · bschilder · commit bf268902ae1b · 2026-03-13T16:07:18.000-04:00
- Replace old rworkflows.yml with canonical template (docker on ghcr.io,
  GITHUB_TOKEN, write-all permissions, devel/RELEASE branches)
- Add github_dependents_scrape.R and sourcegraph_code.R
- Remove deprecated get_github_url.R
- Refactor github_dependents, update github_files_download
- Sync documentation with devtools::document()

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/.github/workflows/rworkflows.yml b/.github/workflows/rworkflows.yml
@@ -4,12 +4,17 @@ name: rworkflows
     branches:
     - master
     - main
+    - devel
+    - RELEASE_**
   pull_request:
     branches:
     - master
     - main
+    - devel
+    - RELEASE_**
 jobs:
   rworkflows:
+    permissions: write-all
     runs-on: ${{ matrix.config.os }}
     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
     container: ${{ matrix.config.cont }}
@@ -18,30 +23,32 @@ jobs:
       matrix:
         config:
         - os: ubuntu-latest
-          r: devel
           bioc: devel
-          cont: bioconductor/bioconductor_docker:devel
-          rspm: https://packagemanager.rstudio.com/cran/__linux__/focal/release
+          r: auto
+          cont: ghcr.io/bioconductor/bioconductor_docker:devel
+          rspm: ~
         - os: macOS-latest
-          r: latest
           bioc: release
+          r: auto
+          cont: ~
+          rspm: ~
         - os: windows-latest
-          r: latest
           bioc: release
+          r: auto
+          cont: ~
+          rspm: ~
     steps:
     - uses: neurogenomics/rworkflows@master
       with:
         run_bioccheck: ${{ false }}
-        run_crancheck: ${{ true }}
+        run_rcmdcheck: ${{ true }}
+        as_cran: ${{ true }}
         run_vignettes: ${{ true }}
         has_testthat: ${{ true }}
         run_covr: ${{ true }}
         run_pkgdown: ${{ true }}
         has_runit: ${{ false }}
-        GITHUB_TOKEN: ${{ secrets.PAT_GITHUB }}
-        run_docker: ${{ false }}
-        DOCKER_USERNAME: bschilder
-        DOCKER_ORG: bschilder
-        DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-        runner_os: ${{ runner.os }}
-        cache_version: cache-v1
+        has_latex: ${{ false }}
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run_docker: ${{ true }}
+        docker_registry: ghcr.io
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -61,7 +61,7 @@ Suggests:
 Remotes:
     github::neurogenomics/cranlogs,
     github::neurogenomics/rworkflows
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
 VignetteBuilder: knitr
 License: GPL-3
 Config/testthat/edition: 3
diff --git a/NEWS.md b/NEWS.md
@@ -4,7 +4,8 @@
 
 * `github_branches`
     - Handle repos with more than 100 branches.
-
+* `github_files_download`
+    - Can now download files in private repos.
 
 # echogithub 0.99.2
 
diff --git a/R/description_extract_i.R b/R/description_extract_i.R
@@ -9,6 +9,9 @@ description_extract_i <- function(desc_file = NULL,
                                   verbose = TRUE){
     #### Find or read DESCRIPTION file ####
     
+    get_github_url_desc <- utils::getFromNamespace("get_github_url_desc",
+                                                   ns = "rworkflows")
+    
     if(is.null(desc_file)) {
         messager("desc_file is required for description_extract.",
                  "Returning NULL.",v=verbose)
@@ -45,10 +48,10 @@ description_extract_i <- function(desc_file = NULL,
       } else if(desc_file$has_fields(f)){
           return(desc_file$get_field(f))
       } else if(f=="github_url"){
-          gh_url <- get_github_url(desc_file = desc_file)
+          gh_url <- get_github_url_desc(desc_file = desc_file)
           return(gh_url)
       } else if(f=="owner"){
-          gh_url <- get_github_url(desc_file = desc_file)
+          gh_url <- get_github_url_desc(desc_file = desc_file)
           if(is.null(gh_url)) {
               return(NULL)
           } else {
@@ -57,7 +60,7 @@ description_extract_i <- function(desc_file = NULL,
               )
           }  
       } else if(f=="repo"){
-          gh_url <- get_github_url(desc_file = desc_file)
+          gh_url <- get_github_url_desc(desc_file = desc_file)
           if(is.null(gh_url)) {
               return(NULL)
           } else {
diff --git a/R/get_github_url.R b/R/get_github_url.R
diff --git a/R/github_code.R b/R/github_code.R
@@ -14,21 +14,22 @@
 #' @examples
 #' \dontrun{
 #' ## easily exceeds API limit
-#' repos <- github_code(query="Package path:DESCRIPTION", .limit=5)
+#' repos <- github_code(query="Package: path:DESCRIPTION", .limit=5)
 #' }
 github_code <- function(query,
                         token = gh::gh_token(),
                         .limit = Inf,
                         verbose = TRUE){
+    # devoptera::args2vars(github_code) 
     owner_repo <- repo <- NULL;
     
-    endpoint <- "https://api.github.com/search/code"
+    endpoint <- "https://api.github.com/search/code" 
     res <-  gh::gh(endpoint,
                    .token = token,
                    .limit = .limit,
                    q = query,
                    #### only beta version supports full-on regex ####
-                   # q = "/(?-i)Package/ path:/(?-i)^DESCRIPTION$/", 
+                   # q = "/(?-i)Package:/ path:/(?-i)^DESCRIPTION$/",
                    per_page = 100)  
     dt <- gh_to_dt(gh_response = res$items,
                    verbose = verbose)
diff --git a/R/github_dependents.R b/R/github_dependents.R
@@ -28,47 +28,22 @@ github_dependents <- function(owner,
                               max_pages = 1000,
                               verbose = TRUE) {
     
-    #devoptera::args2vars(github_insights)  
+    # devoptera::args2vars(github_dependents)  
     
-    requireNamespace("rvest")
-    owner_repo <- NULL;
     messager("Searching for dependents of:",paste(owner,repo,sep="/"),
              v=verbose) 
-    url <- paste0("https://github.com/", owner,"/",repo, "/network/dependents")
-    #### Loop over the specified number of pages ####
-    all_dat <- list() 
-    for (i in seq_len(max_pages)) {
-        # Print a message indicating the URL being scraped
-        messager(paste0("+ Scraping page ",i,"."),v=verbose) 
-        # Retrieve the HTML content of the page
-        page <- rvest::read_html(url) 
-        box_rows <- rvest::html_elements(page,".Box-row")
-        dt <- (rvest::html_text2(box_rows)) |> 
-            stringr::str_split(" / |\n|[ ]", simplify = TRUE) |> 
-            data.table::data.table() |> 
-            `colnames<-`(c("owner","repo","stargazers_count","forks_count"))
-        dt[,owner_repo:=paste(owner,repo,sep="/")]
-        dt <- cbind(target=paste(owner,repo,sep="/"),dt) 
-        all_dat[[i]] <- dt 
-        #### Find the button for the next page ####
-        buttons <- page |> rvest::html_nodes(".paginate-container .btn")
-        next_buttons <- buttons[rvest::html_text(buttons)=="Next"]
-        #### Check if the button is disabled ####
-        is_disabled <- any(sapply(next_buttons, function(btn) {
-            btn_attr <- rvest::html_attr(btn, "disabled")
-            !is.na(btn_attr) && btn_attr == "disabled"
-        }))
-        #### If the button isn't disable, update the URL to scrape ####
-        if (isFALSE(is_disabled)) { 
-            url <- next_buttons |> rvest::html_attr("href") 
-        #### Otherwise, break the loop ####
-        } else {
-            break 
-        }
-    }
-    #### Bind data from all pages ####
-    all_dat <- data.table::rbindlist(all_dat, 
-                                     use.names = TRUE, idcol = "page")
+    
+    #### Method 1: JSON file ####
+    # URL <- paste0("https://github.com/", owner,"/",repo, "/dependency-graph/sbom")
+    # j <- jsonlite::fromJSON("~/Downloads/rworkflows_neurogenomics_b017b7a1aeda0026dda330b01cb798ddb5f1d264.json")
+    # j$packages
+    
+    #### Method 2: Webscraping ####
+    all_dat <- github_dependents_scrape(owner = owner,
+                                        repo = repo, 
+                                        token = token,
+                                        max_pages = max_pages,
+                                        verbose = verbose)
     #### Report ####
     messager("Found",formatC(nrow(all_dat),big.mark = ","),
              "dependents.",v=verbose)
diff --git a/R/github_dependents_scrape.R b/R/github_dependents_scrape.R
@@ -0,0 +1,68 @@
+#' Scrape the dependents of a GitHub repository
+#'
+#' Reads the "network/dependents" pages of a repository with \pkg{rvest},
+#' following the "Next" pagination button until it is disabled or absent,
+#' or until \code{max_pages} pages have been read.
+#'
+#' @param owner Owner of the target GitHub repository.
+#' @param repo Name of the target GitHub repository.
+#' @param token GitHub token. Not used by the scraper itself; accepted
+#'  because \code{github_dependents} forwards it.
+#' @param max_pages Maximum number of pages to scrape.
+#' @param verbose Print messages.
+#' @return \link[data.table]{data.table} with one row per scraped
+#'  dependent, plus a \code{page} index column.
+#' @keywords internal
+#' @noRd
+github_dependents_scrape <- function(owner,
+                                     repo, 
+                                     token = gh::gh_token(),
+                                     max_pages = 1000,
+                                     verbose = TRUE){
+    
+    if(!requireNamespace("rvest", quietly = TRUE)){
+        stop("Package 'rvest' is required to scrape GitHub dependents.",
+             call. = FALSE)
+    }
+    owner_repo <- NULL;
+    target <- paste(owner,repo,sep="/")
+    url <- paste0("https://github.com/", owner,"/",repo, "/network/dependents")
+    #### Loop over the specified number of pages ####
+    all_dat <- list()
+    for (i in seq_len(max_pages)) {
+        messager(paste0("+ Scraping page ",i,"."),v=verbose)
+        #### Retrieve the HTML content of the page ####
+        page <- rvest::read_html(url)
+        box_rows <- rvest::html_elements(page,".Box-row")
+        #### Parse rows (a page without rows would break the renaming) ####
+        if(length(box_rows)>0){
+            dt <- (rvest::html_text2(box_rows)) |>
+                stringr::str_split(" / |\n|[ ]", simplify = TRUE) |>
+                data.table::data.table() |>
+                `colnames<-`(c("owner","repo",
+                               "stargazers_count","forks_count"))
+            ## Inside the data.table, owner/repo are the dependent's columns
+            dt[,owner_repo:=paste(owner,repo,sep="/")]
+            dt <- cbind(target=target,dt)
+            all_dat[[i]] <- dt
+        }
+        #### Find the button for the next page ####
+        buttons <- page |> rvest::html_nodes(".paginate-container .btn")
+        next_buttons <- buttons[rvest::html_text(buttons)=="Next"]
+        #### Check if the button is disabled ####
+        disabled_attr <- rvest::html_attr(next_buttons, "disabled")
+        is_disabled <- any(!is.na(disabled_attr) & disabled_attr=="disabled")
+        #### Collect the link(s) to the next page ####
+        next_url <- rvest::html_attr(next_buttons, "href")
+        next_url <- next_url[!is.na(next_url)]
+        #### Stop when there is no enabled "Next" link to follow ####
+        if (is_disabled || length(next_url)==0) {
+            break
+        }
+        url <- next_url[1]
+    }
+    #### Bind data from all pages ####
+    all_dat <- data.table::rbindlist(all_dat,
+                                     use.names = TRUE, idcol = "page")
+    return(all_dat)
+}
diff --git a/R/github_files.R b/R/github_files.R
@@ -86,9 +86,9 @@ github_files <- function(owner,
     if(is.null(dt)) return(NULL)
     #### Add download link ####
     dt[,link_raw:=paste(
-        "https://github.com", owner, repo, "raw",
-        branch, path, sep="/"
-    )] 
+        "https://raw.githubusercontent.com",
+        owner,repo,branch,path, sep="/")
+       ] 
     #### Unlist cols ####
     unlist_dt(dt = dt, 
               exclude = "size",
diff --git a/R/github_files_download.R b/R/github_files_download.R
@@ -15,12 +15,15 @@
 #'                    query = ".md$")
 #' filelist_local <- github_files_download(filelist = dt$link_raw)
 github_files_download <- function(filelist,
+                                  token = gh::gh_token(),
                                   download_dir = tempdir(),
                                   overwrite = FALSE,
                                   timeout = 5*60,
                                   nThread = 1,
                                   verbose = TRUE) {
+    # devoptera::args2vars(github_files_download)
     
+    options(timeout = timeout)
     messager("+ Downloading", length(filelist), "files.", v = verbose)
     local_files <- parallel::mclapply(stats::setNames(filelist,
                                                       filelist), 
@@ -40,9 +43,21 @@ github_files_download <- function(filelist,
         )
         if (!file.exists(destfile) &
             isFALSE(overwrite)) {
-            messager(paste("Downloading:", x),v=verbose)
-            options(timeout = timeout)
-            utils::download.file(url = x, 
+            messager("Downloading:",x,v=verbose)
+            #### Add token to header ####
+            extra <- getOption("download.file.extra")
+            if(!is.null(token)) { 
+                extra <- c(extra, "--fail", "-L")
+                headers <- c(Authorization = paste("token", token))
+                qh <- shQuote(paste0(names(headers), ": ", headers))
+                extra <- c(extra, paste("-H", qh))
+            }  
+            #### Download ####
+            utils::download.file(url = x,
+                                 method = "curl",
+                                 quiet = verbose<2,
+                                 mode = "wb",
+                                 extra = extra,
                                  destfile = destfile)
         } else {
             messager("Returning pre-existing file:",x,v=verbose)
diff --git a/R/sourcegraph_code.R b/R/sourcegraph_code.R
@@ -0,0 +1,59 @@
+#' Parse Sourcegraph code search results
+#'
+#' Reads a JSON file of Sourcegraph search results and returns one row per
+#' result, keeping only the first line match of each result and deriving a
+#' \code{package} column from that match's \code{preview} text.
+#'
+#' @param path Path to the JSON results file. Defaults to the location that
+#'  was previously hard-coded in the function body.
+#' @return A table with the repository fields, \code{file_name},
+#'  \code{file_url}, the first line match of each result, and
+#'  \code{package}.
+#' @keywords internal
+#' @noRd
+sourcegraph_code <- function(path = "~/Desktop/tmp.json"){
+    # curl \
+    # -H 'Authorization: token <access token>' \
+    # -d '{"query":"query { currentUser { username } }"}' \
+    # https://sourcegraph.com/.api/graphql
+    
+    # context:global case:yes count:all fork:yes archived:yes content:"Package:" file:(?-i)^DESCRIPTION$
+    
+    # repo_api <-  "https://sourcegraph.com/.api/graphql"
+    # #### Search ####
+    # q <- list(
+    #     Accept="text/event-stream",
+    #     header="Authorization: token <access token>",
+    # 
+    #     --url="https://sourcegraph.com/.api/graphql/search/stream",
+    #     "data-urlencode"="q=<query>"
+    # )
+    # httr::timeout(seconds = seconds)
+    # req <- httr::GET(repo_api)
+    # httr::message_for_status(req)
+    # filelist <- unlist(lapply(httr::content(req)$tree, "[", "path"),
+    #                    use.names = FALSE
+    # ) 
+    
+    # src search -json 'repogroup:sample error' > tmp.json
+    #### Read the exported search results ####
+    path <- path.expand(path)
+    if(!file.exists(path)){
+        stop("Sourcegraph results file not found: ", path, call. = FALSE)
+    }
+    j <- jsonlite::read_json(path,
+                             simplifyDataFrame = TRUE)
+    #### Keep the first line match of each result ####
+    first_matches <- lapply(j$Results$lineMatches,function(x)x[1,]) |>
+        data.table::rbindlist()
+    d <- cbind(j$Results$repository,
+               file_name=j$Results$file$name,
+               file_url=j$Results$file$url,
+               first_matches)
+    #### Strip the "Package:" prefix, spaces and line breaks ####
+    d$package <- gsub("^Package:| |\n|\r","",d$preview)
+    # length(unique(d$package))
+    # sum(d$file_name!="DESCRIPTION")
+    # sum(!grepl("^Package:*",d$preview))
+    return(d)
+}
diff --git a/man/github_code.Rd b/man/github_code.Rd
diff --git a/man/github_files.Rd b/man/github_files.Rd
diff --git a/man/github_files_download.Rd b/man/github_files_download.Rd
diff --git a/man/r_repos.Rd b/man/r_repos.Rd