Skip to content
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ val scalatest = "3.0.1"
val shapeless = "2.3.2"
val scalacheck = "1.13.4"

// spark has scalatest and scalactic as a runtime dependency
// which can mess things up if you use a different version in your project
val exclusions = Seq(ExclusionRule("org.scalatest"), ExclusionRule("org.scalactic"))
Copy link
Copy Markdown
Contributor

@imarios imarios Feb 20, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this fix the IntelliJ issue with ScalaTest?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep!


lazy val root = Project("frameless", file("." + "frameless")).in(file("."))
.aggregate(core, cats, dataset, docs)
.settings(framelessSettings: _*)
Expand All @@ -22,7 +26,7 @@ lazy val cats = project
.settings(publishSettings: _*)
.settings(libraryDependencies ++= Seq(
"org.typelevel" %% "cats" % catsv,
"org.apache.spark" %% "spark-core" % sparkVersion % "provided"))
"org.apache.spark" %% "spark-core" % sparkVersion % "provided" excludeAll(exclusions: _*)))

lazy val dataset = project
.settings(name := "frameless-dataset")
Expand All @@ -31,8 +35,8 @@ lazy val dataset = project
.settings(framelessTypedDatasetREPL: _*)
.settings(publishSettings: _*)
.settings(libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % sparkVersion % "provided",
"org.apache.spark" %% "spark-sql" % sparkVersion % "provided"
"org.apache.spark" %% "spark-core" % sparkVersion % "provided" excludeAll(exclusions: _*),
"org.apache.spark" %% "spark-sql" % sparkVersion % "provided" excludeAll(exclusions: _*)
))
.dependsOn(core % "test->test;compile->compile")

Expand Down
3 changes: 2 additions & 1 deletion dataset/src/main/scala/frameless/TypedColumn.scala
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ object TypedColumn {
lgen: LabelledGeneric.Aux[T, H],
selector: Selector.Aux[H, K, V]
): Exists[T, K, V] = new Exists[T, K, V] {}

}

implicit class OrderedTypedColumnSyntax[T, U: CatalystOrdered](col: TypedColumn[T, U]) {
Expand All @@ -279,4 +280,4 @@ object TypedColumn {
def >(other: TypedColumn[T, U]): TypedColumn[T, Boolean] = (col.untyped > other.untyped).typed
def >=(other: TypedColumn[T, U]): TypedColumn[T, Boolean] = (col.untyped >= other.untyped).typed
}
}
}
9 changes: 5 additions & 4 deletions dataset/src/main/scala/frameless/TypedDataset.scala
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,9 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val
*
* It is statically checked that column with such name exists and has type `A`.
*/
def apply[A](column: Witness.Lt[Symbol])(
implicit
exists: TypedColumn.Exists[T, column.T, A],
def apply[A](selector: T => A)(implicit
encoder: TypedEncoder[A]
): TypedColumn[T, A] = col(column)
): TypedColumn[T, A] = macro frameless.macros.ColumnMacros.fromFunction[T, A]

/** Returns `TypedColumn` of type `A` given its name.
*
Expand Down Expand Up @@ -319,6 +317,9 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val
}
}

def selectExpr[B](expr: T => B)(implicit encoder: TypedEncoder[B]): TypedDataset[B] =
macro frameless.macros.ColumnMacros.fromExpr[T, B]

/** Type-safe projection from type T to Tuple2[A,B]
* {{{
* d.select( d('a), d('a)+d('b), ... )
Expand Down
88 changes: 78 additions & 10 deletions dataset/src/main/scala/frameless/TypedEncoder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
import shapeless._

import scala.reflect.ClassTag

abstract class TypedEncoder[T](implicit val classTag: ClassTag[T]) extends Serializable {
Expand Down Expand Up @@ -264,15 +265,30 @@ object TypedEncoder {
def targetDataType: DataType = DataTypes.createArrayType(underlying.targetDataType)

def constructorFor(path: Expression): Expression = {
val arrayData = Invoke(
MapObjects(
underlying.constructorFor,
path,
underlying.targetDataType
),
"array",
ScalaReflection.dataTypeFor[Array[AnyRef]]
)
val arrayData = Option(underlying.sourceDataType)
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoids boxing the primitives in the array

.filter(ScalaReflection.isNativeType)
.filter(_ == underlying.targetDataType)
.collect {
case BooleanType => "toBooleanArray" -> ScalaReflection.dataTypeFor[Array[Boolean]]
case ByteType => "toByteArray" -> ScalaReflection.dataTypeFor[Array[Byte]]
case ShortType => "toShortArray" -> ScalaReflection.dataTypeFor[Array[Short]]
case IntegerType => "toIntArray" -> ScalaReflection.dataTypeFor[Array[Int]]
case LongType => "toLongArray" -> ScalaReflection.dataTypeFor[Array[Long]]
case FloatType => "toFloatArray" -> ScalaReflection.dataTypeFor[Array[Float]]
case DoubleType => "toDoubleArray" -> ScalaReflection.dataTypeFor[Array[Double]]
}.map {
case (method, typ) => Invoke(path, method, typ)
}.getOrElse {
Invoke(
MapObjects(
underlying.constructorFor,
path,
underlying.targetDataType
),
"array",
ScalaReflection.dataTypeFor[Array[AnyRef]]
)
}

StaticInvoke(
TypedEncoderUtils.getClass,
Expand All @@ -296,6 +312,58 @@ object TypedEncoder {
}
}

implicit def arrayEncoder[A](
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New encoder for Array, could be a separate PR but I needed it for this branch

implicit
underlying: TypedEncoder[A],
classTag: ClassTag[Array[A]]
): TypedEncoder[Array[A]] = new TypedEncoder[Array[A]]() {
def nullable: Boolean = false

def sourceDataType: DataType = FramelessInternals.objectTypeFor[Array[A]](classTag)

def targetDataType: DataType = DataTypes.createArrayType(underlying.targetDataType)

def constructorFor(path: Expression): Expression = {
Option(underlying.sourceDataType)
.filter(ScalaReflection.isNativeType)
.filter(_ == underlying.targetDataType)
.collect {
case BooleanType => "toBooleanArray" -> ScalaReflection.dataTypeFor[Array[Boolean]]
case ByteType => "toByteArray" -> ScalaReflection.dataTypeFor[Array[Byte]]
case ShortType => "toShortArray" -> ScalaReflection.dataTypeFor[Array[Short]]
case IntegerType => "toIntArray" -> ScalaReflection.dataTypeFor[Array[Int]]
case LongType => "toLongArray" -> ScalaReflection.dataTypeFor[Array[Long]]
case FloatType => "toFloatArray" -> ScalaReflection.dataTypeFor[Array[Float]]
case DoubleType => "toDoubleArray" -> ScalaReflection.dataTypeFor[Array[Double]]
}.map {
case (method, typ) => Invoke(path, method, typ)
}.getOrElse {
Invoke(
MapObjects(
underlying.constructorFor,
path,
underlying.targetDataType
),
"array",
ScalaReflection.dataTypeFor[Array[AnyRef]]
)
}
}

def extractorFor(path: Expression): Expression = {
// if source `path` is already native for Spark, no need to `map`
if (ScalaReflection.isNativeType(underlying.sourceDataType)) {
NewInstance(
classOf[GenericArrayData],
path :: Nil,
dataType = ArrayType(underlying.targetDataType, underlying.nullable)
)
} else {
MapObjects(underlying.extractorFor, path, underlying.sourceDataType)
}
}
}

/** Encodes things using injection if there is one defined */
implicit def usingInjection[A: ClassTag, B]
(implicit inj: Injection[A, B], trb: TypedEncoder[B]): TypedEncoder[A] =
Expand All @@ -322,4 +390,4 @@ object TypedEncoder {
recordEncoder: Lazy[RecordEncoderFields[G]],
classTag: ClassTag[F]
): TypedEncoder[F] = new RecordEncoder[F, G]
}
}
2 changes: 2 additions & 0 deletions dataset/src/main/scala/frameless/functions/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ package frameless

package object functions extends Udf {
object aggregate extends AggregateFunctions


}
Loading