diff --git a/.bazelproject b/.bazelproject new file mode 100644 index 0000000..36bbf05 --- /dev/null +++ b/.bazelproject @@ -0,0 +1,6 @@ +directories: + . + +targets: + //java/com/revolutionanalytics/hadoop/hdfs:rhdfs_lib + //pkg:rhdfs diff --git a/.gitignore b/.gitignore index 5c2ca1a..1d93898 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,35 @@ +# Ignore backup files. +*~ + +# Ignore Vim swap files. +.*.swp + +# Ignore files generated by IDEs. +/.classpath +/.factorypath +/.idea/ +/.project +/.settings +/.vscode/ +/bazel.iml + +# Ignore all bazel-* symlinks. There is no full list since this can change +# based on the name of the directory bazel is cloned into. +/bazel-* + +# Ignore outputs generated during Bazel bootstrapping. +/output/ + +# Dependency Analysis +graph.in +graph.png + +# R Artifacts .RData .Rhistory .Rproj.user *.Rproj - -*.orig +pkg/rhdfs.bin.Rcheck +*.log +pkg/..Rcheck +pkg.Rcheck \ No newline at end of file diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..d0449ab --- /dev/null +++ b/.travis.yml @@ -0,0 +1,27 @@ +dist: trusty + +addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - wget + - pkg-config + +before_install: + - wget https://github.com/bazelbuild/bazel/releases/download/0.11.0/bazel_0.11.0-linux-x86_64.deb + - sudo dpkg -i bazel_0.11.0-linux-x86_64.deb + - sudo sh -c 'echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" >> /etc/apt/sources.list' + - gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9 + - gpg -a --export E084DAB9 | sudo apt-key add - + - wget http://archive.cloudera.com/cdh5/cdh/5/hadoop-2.6.0-cdh5.13.0.tar.gz + - sudo tar xvf hadoop-2.6.0-cdh5.13.0.tar.gz -C /opt + - sudo apt-get update + - sudo apt-get -y install r-base + - sudo R CMD javareconf + - sudo R --quiet -e 'install.packages("rJava", type="source", repos="http://cran.us.r-project.org")' + - sudo apt-get install -y texlive-base texlive-extra-utils texlive-latex-base texlive-latex-recommended 
texlive-generic-recommended +env: + - HADOOP_CMD="/opt/hadoop-2.6.0-cdh5.13.0/bin/hadoop" +script: + - bazel build //... \ No newline at end of file diff --git a/README.md b/README.md index af849f2..5311863 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,21 @@ -rhdfs -===== +# rhdfs -A package that allows R developers to use Hadoop HDFS, developed as part of the RHadoop project. Please see the [RHadoop wiki](https://github.com/RevolutionAnalytics/RHadoop/wiki) for information. +This repository provides an R package for interacting with Hadoop Distributed File System (HDFS). + +[![Build Status](https://travis-ci.org/bowlofstew/rhdfs.svg?branch=master)](https://github.com/bowlofstew/rhdfs) + +## Documentation + +[Requirements](docs/REQUIREMENTS.md) + +[Compilation](docs/COMPILATION.md) + +[CRAN](docs/CRAN.md) + +[Dependency Analysis](docs/DEPENDENCIES.md) + +[IntelliJ](docs/INTELLIJ.md) + +## Author(s) + +Stewart Henderson \ No newline at end of file diff --git a/WORKSPACE b/WORKSPACE new file mode 100644 index 0000000..4c4d1af --- /dev/null +++ b/WORKSPACE @@ -0,0 +1,50 @@ +workspace(name = "rhdfs") + +# Bazel Rules +RULES_R_COMMIT="6c6a0536c829e59c1ddcb3a96df0376342f66bf9" + +git_repository( + name = "com_grail_rules_r", + remote = "https://github.com/grailbio/rules_r.git", + commit = RULES_R_COMMIT, +) + +# Java, Maven Dependencies +maven_jar( + name = "org_apache_hadoop_hadoop_core", + artifact = "org.apache.hadoop:hadoop-core:1.2.1", +) + +# R Dependencies +CRAN_BASE_URL="https://cloud.r-project.org/src/contrib" +CRAN_PACKAGE="tar.gz" +RJAVA_VERSION="0.9-9" +DBI_VERSION="0.8" +BIGLM_VERSION="0.9-1" + +new_http_archive( + name = "R_rJava", + build_file = "cran/BUILD.rJava", + strip_prefix = "rJava", + urls = [ + "{}/rJava_{}.{}".format(CRAN_BASE_URL, RJAVA_VERSION, CRAN_PACKAGE) + ], +) + +new_http_archive( + name = "R_DBI", + build_file = "cran/BUILD.DBI", + strip_prefix = "DBI", + urls = [ + "{}/DBI_{}.{}".format(CRAN_BASE_URL, DBI_VERSION, CRAN_PACKAGE) + ], 
+) + +new_http_archive( + name = "R_biglm", + build_file = "cran/BUILD.biglm", + strip_prefix = "biglm", + urls = [ + "{}/biglm_{}.{}".format(CRAN_BASE_URL, BIGLM_VERSION, CRAN_PACKAGE) + ], +) \ No newline at end of file diff --git a/build/rhdfs_1.0.5.tar.gz b/build/rhdfs_1.0.5.tar.gz deleted file mode 100644 index a4a7114..0000000 Binary files a/build/rhdfs_1.0.5.tar.gz and /dev/null differ diff --git a/build/rhdfs_1.0.6.tar.gz b/build/rhdfs_1.0.6.tar.gz deleted file mode 100644 index 12ff074..0000000 Binary files a/build/rhdfs_1.0.6.tar.gz and /dev/null differ diff --git a/build/rhdfs_1.0.6.zip b/build/rhdfs_1.0.6.zip deleted file mode 100644 index 557f712..0000000 Binary files a/build/rhdfs_1.0.6.zip and /dev/null differ diff --git a/build/rhdfs_1.0.7.tar.gz b/build/rhdfs_1.0.7.tar.gz deleted file mode 100644 index 0059720..0000000 Binary files a/build/rhdfs_1.0.7.tar.gz and /dev/null differ diff --git a/build/rhdfs_1.0.7.zip b/build/rhdfs_1.0.7.zip deleted file mode 100644 index 935e557..0000000 Binary files a/build/rhdfs_1.0.7.zip and /dev/null differ diff --git a/build/rhdfs_1.0.8.tar.gz b/build/rhdfs_1.0.8.tar.gz deleted file mode 100644 index 9d0a5d2..0000000 Binary files a/build/rhdfs_1.0.8.tar.gz and /dev/null differ diff --git a/build/rhdfs_1.0.8.zip b/build/rhdfs_1.0.8.zip deleted file mode 100644 index 88e32ee..0000000 Binary files a/build/rhdfs_1.0.8.zip and /dev/null differ diff --git a/cran/BUILD.DBI b/cran/BUILD.DBI new file mode 100644 index 0000000..8c3f880 --- /dev/null +++ b/cran/BUILD.DBI @@ -0,0 +1,15 @@ +# https://cran.r-project.org/web/packages/DBI/index.html + +load("@com_grail_rules_r//R:defs.bzl", "r_pkg") + +package(default_visibility = ["//visibility:public"]) + +r_pkg( + name = "DBI", + srcs = glob( + ["**"], + exclude = [], + ), + lazy_data = True, + deps = [], +) diff --git a/cran/BUILD.biglm b/cran/BUILD.biglm new file mode 100644 index 0000000..0da1924 --- /dev/null +++ b/cran/BUILD.biglm @@ -0,0 +1,17 @@ +# 
https://cran.r-project.org/web/packages/biglm/index.html + +load("@com_grail_rules_r//R:defs.bzl", "r_pkg") + +package(default_visibility = ["//visibility:public"]) + +r_pkg( + name = "biglm", + srcs = glob( + ["**"], + exclude = [], + ), + lazy_data = True, + deps = [ + "@R_DBI//:DBI", + ], +) diff --git a/cran/BUILD.rJava b/cran/BUILD.rJava new file mode 100644 index 0000000..d2c5d45 --- /dev/null +++ b/cran/BUILD.rJava @@ -0,0 +1,15 @@ +# https://cran.r-project.org/web/packages/rJava/index.html + +load("@com_grail_rules_r//R:defs.bzl", "r_pkg") + +package(default_visibility = ["//visibility:public"]) + +r_pkg( + name = "rJava", + srcs = glob( + ["**"], + exclude = [], + ), + lazy_data = True, + deps = [], +) diff --git a/docs/COMPILATION.md b/docs/COMPILATION.md new file mode 100644 index 0000000..fecf0b6 --- /dev/null +++ b/docs/COMPILATION.md @@ -0,0 +1,8 @@ +# Compilation + +In order to compile the package, execute the command, `bazel build //...` and this +will build the R package and its Java dependency. + +## Author(s) + +Stewart Henderson \ No newline at end of file diff --git a/docs/CRAN.md b/docs/CRAN.md new file mode 100644 index 0000000..ce830cb --- /dev/null +++ b/docs/CRAN.md @@ -0,0 +1,11 @@ +# Publishing to CRAN + +1. Execute the command, `bazel build //...`. + +2. From the top level project directory, execute the command, `R CMD check --as-cran pkg/` + +3. In a web browser, navigate to `http://cran.r-project.org/submit.html` and create a submission for your package.
+ +## Author(s) + +Stewart Henderson \ No newline at end of file diff --git a/docs/DEPENDENCIES.md b/docs/DEPENDENCIES.md new file mode 100644 index 0000000..405652b --- /dev/null +++ b/docs/DEPENDENCIES.md @@ -0,0 +1,19 @@ +# Dependency Analysis + +In order to perform a dependency analysis of the project, you will need to meet an extra requirement of: + +* [GraphViz Dot Compiler](https://www.graphviz.org/) + +Once the dependency is satisfied, you can execute the following commands to create a dependency graph: + + bazel query 'deps(//pkg:rhdfs)' --output graph > graph.in + + dot -Tpng < graph.in > graph.png + +## Dependency Graph + +![Dependency graph of //pkg:rhdfs](resources/dependencies.png) + +## Author(s) + +Stewart Henderson \ No newline at end of file diff --git a/docs/INTELLIJ.md b/docs/INTELLIJ.md new file mode 100644 index 0000000..a53ed74 --- /dev/null +++ b/docs/INTELLIJ.md @@ -0,0 +1,7 @@ +# IntelliJ + +[JetBrains IntelliJ IDEA](https://www.jetbrains.com/idea/) is supported for this project with the [Bazel plugin](https://github.com/bazelbuild/intellij). + +## Author(s) + +Stewart Henderson \ No newline at end of file diff --git a/docs/REQUIREMENTS.md b/docs/REQUIREMENTS.md new file mode 100644 index 0000000..e465062 --- /dev/null +++ b/docs/REQUIREMENTS.md @@ -0,0 +1,17 @@ +# Requirements + +## Compilation Requirements + +The basic project requirements are as follows: + +* [JDK](https://java.com/en/download/) + +* [Bazel](https://bazel.build/) + +* [R](https://www.r-project.org/) + +* [TeX](https://www.tug.org/texlive/) + +## Author(s) + +Stewart Henderson \ No newline at end of file diff --git a/docs/resources/dependencies.png b/docs/resources/dependencies.png new file mode 100644 index 0000000..9bf6380 Binary files /dev/null and b/docs/resources/dependencies.png differ diff --git a/java/build.num b/java/build.num deleted file mode 100644 index 4a4eb78..0000000 --- a/java/build.num +++ /dev/null @@ -1,3 +0,0 @@ -#Build Number for ANT. Do not edit!
-#Thu Jul 14 01:34:39 EDT 2011 -build.number=91 diff --git a/java/build.xml b/java/build.xml deleted file mode 100644 index 4c39269..0000000 --- a/java/build.xml +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/java/com/revolutionanalytics/hadoop/hdfs/BUILD b/java/com/revolutionanalytics/hadoop/hdfs/BUILD new file mode 100644 index 0000000..ce31b8b --- /dev/null +++ b/java/com/revolutionanalytics/hadoop/hdfs/BUILD @@ -0,0 +1,16 @@ +package(default_visibility = ["//visibility:public"]) + +SOURCES = glob( + ["**"], + exclude = [ + "BUILD", + ], +) + +java_library( + name = "rhdfs_lib", + srcs = SOURCES, + deps = [ + "@org_apache_hadoop_hadoop_core//jar" + ], +) \ No newline at end of file diff --git a/java/rhdfs.jar b/java/rhdfs.jar deleted file mode 100644 index c55e4fb..0000000 Binary files a/java/rhdfs.jar and /dev/null differ diff --git a/pkg/BUILD b/pkg/BUILD new file mode 100644 index 0000000..b2427af --- /dev/null +++ b/pkg/BUILD @@ -0,0 +1,25 @@ +package(default_visibility = ["//visibility:public"]) + +load("@com_grail_rules_r//R:defs.bzl", "r_package") + +r_package( + pkg_name = "rhdfs", + pkg_srcs = glob( + ["**"], + exclude = [ + "BUILD", + "README.md", + ".gitignore", + ".Rbuildignore", + "rplatform.Rproj", + ".Rproject.user" + ], + ), + pkg_deps = [ + "@R_rJava//:rJava", + "@R_biglm//:biglm", + ], + pkg_suggested_deps = [ + "//java/com/revolutionanalytics/hadoop/hdfs:rhdfs_lib" + ] +) \ No newline at end of file diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION index 1f7f129..5f57474 100644 --- a/pkg/DESCRIPTION +++ b/pkg/DESCRIPTION @@ -1,10 +1,16 @@ Package: rhdfs Type: Package Title: R and Hadoop Distributed Filesystem -Version: 1.0.8 -Date: 2013-06-20 +Version: 1.0.10 +Date: 2018-03-06 Author: Revolution Analytics -Depends: R (>= 2.6.0), methods, rJava (>= 0.8) -Maintainer: Revolution Analytics +Depends: + R (>= 3.3.3), +Imports: + 
rJava (>= 0.9), + methods +SystemRequirements: Java (>= 1.6) +Maintainer: Stewart Henderson Description: Functions to browse, open, write files to the HDFS License: Apache License (== 2.0) +RoxygenNote: 6.0.1.9000 diff --git a/pkg/NAMESPACE b/pkg/NAMESPACE index a702f26..42b573a 100644 --- a/pkg/NAMESPACE +++ b/pkg/NAMESPACE @@ -1,9 +1,14 @@ importFrom("utils", "packageDescription") importFrom("rJava", ".jinit") importFrom("rJava", ".jnew") +importFrom("rJava", ".jarray") +importFrom("rJava", ".jshort") +importFrom("rJava", ".jlong") +importFrom("rJava", ".jevalArray") +importFrom("rJava", ".jcall") +importFrom("rJava", ".jclass") importFrom("rJava", ".jclassLoader") importFrom("rJava", "J") exportPattern("^hdfs") S3method("print", "hdfsFH") -S3method("as.character", "hdfsFH") - +S3method("as.character", "hdfsFH") \ No newline at end of file diff --git a/pkg/R/zzz.R b/pkg/R/zzz.R index d442d72..782fb5b 100644 --- a/pkg/R/zzz.R +++ b/pkg/R/zzz.R @@ -15,11 +15,18 @@ library(utils) .hdfsEnv <- new.env() -.onLoad <- function(libname,pkgname){ +.onLoad <- function(libname,pkgname) { vrs <- packageDescription(pkgname, lib.loc = libname, fields = "Version", drop = TRUE) - if (Sys.getenv("HADOOP_CMD") == "") stop(sprintf("Environment variable HADOOP_CMD must be set before loading package %s", pkgname)) + if (Sys.getenv("HADOOP_CMD") == "") { + # HADOOP_CMD is unset: fall back to the default Cloudera install, but only when that directory is actually present. + HADOOP_VERSION <- "2.6.0" + CLOUDERA_VERSION <- "5.13.0" + CLOUDERA_HADOOP_INSTALLATION_DIR <- sprintf("/opt/hadoop-%s-cdh%s", HADOOP_VERSION, CLOUDERA_VERSION) + if (dir.exists(CLOUDERA_HADOOP_INSTALLATION_DIR)) { + Sys.setenv(HADOOP_CMD = file.path(CLOUDERA_HADOOP_INSTALLATION_DIR, "bin", "hadoop")) + } + } packageStartupMessage("\nHADOOP_CMD=", Sys.getenv("HADOOP_CMD")) packageStartupMessage("\nBe sure to run hdfs.init()") - #hdfs.init() } diff --git a/pkg/README.md b/pkg/README.md new file mode 100644 index 0000000..67d0fb6 --- /dev/null +++ b/pkg/README.md @@ -0,0 +1,31 @@ +# R Package + +This directory contains
the code to build the R package. The build is described +in the BUILD file. + +## Java Issues + +You may run into rJava issues. The following guide may be of help: + +* Execute the command, `R CMD javareconf` + +* Install `rJava` and compile it, `R --quiet -e 'install.packages("rJava", type="source", repos="http://cran.us.r-project.org")' +` + +* Test to see if rJava can be loaded now, `R --quiet -e 'library("rJava"); .jinit(); .jcall("java/lang/System", "S", "getProperty", "java.runtime.version")' +` + +# Installation + +You must ensure that the environment variable, `HADOOP_CMD` has been set in your environment. If it is not, you will encounter issues installing. The error will be exhibited by a message of the form: + +``` +Error : .onLoad failed in loadNamespace() for 'rhdfs', details: + call: fun(libname, pkgname) + error: Environment variable HADOOP_CMD must be set before loading package rhdfs +Error: loading failed +``` + +## Author(s) + +Stewart Henderson \ No newline at end of file diff --git a/pkg/man/hdfs-file-access.Rd b/pkg/man/hdfs-file-access.Rd index 2ca5650..cf55712 100644 --- a/pkg/man/hdfs-file-access.Rd +++ b/pkg/man/hdfs-file-access.Rd @@ -77,6 +77,7 @@ hdfs.tell(con) \examples{ ## Following example describes a way to lazy load a character of vector of ## variables + library('rJava') # for J() function save.objects <- function(ob.names=ls(theenv), save.name, theenv=.GlobalEnv){ hdfs.dircreate(save.name) data.file <- hdfs.file(path=sprintf("\%s/data",save.name),mode="w") diff --git a/pkg/man/hdfs.file-level.Rd b/pkg/man/hdfs.file-level.Rd index f36845e..0911333 100644 --- a/pkg/man/hdfs.file-level.Rd +++ b/pkg/man/hdfs.file-level.Rd @@ -66,7 +66,7 @@ \code{hdfs.chown} and \code{hdfs.chmod} change the file permissions and owner and group membership of the file/directory.
See - \url{http://hadoop.apache.org/common/docs/r0.20.2/hdfs_permissions_guide.html} + \url{https://hadoop.apache.org/docs/r1.2.1/hdfs_permissions_guide.html} for further details \code{hdfs.dircreate} creates a directory. \code{hdfs.mkdir} is a synonym