From caafee5f60be67522e1e8d5bf224152bd32f6383 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 18 Mar 2016 00:16:57 -0700 Subject: [PATCH 1/2] started to work on mllib types --- build.sbt | 2 + .../tensorframes/BasicOperationsSuite.scala | 41 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/build.sbt b/build.sbt index d6938cf..148fd1a 100644 --- a/build.sbt +++ b/build.sbt @@ -56,6 +56,8 @@ libraryDependencies += "org.apache.spark" %% "spark-core" % targetSparkVersion % libraryDependencies += "org.apache.spark" %% "spark-sql" % targetSparkVersion % "provided" +libraryDependencies += "org.apache.spark" %% "spark-mllib" % targetSparkVersion % "provided" + libraryDependencies += "org.scalatest" %% "scalatest" % "2.1.3" % "test" // Compilation of proto files diff --git a/src/test/scala/org/tensorframes/BasicOperationsSuite.scala b/src/test/scala/org/tensorframes/BasicOperationsSuite.scala index a613007..251b790 100644 --- a/src/test/scala/org/tensorframes/BasicOperationsSuite.scala +++ b/src/test/scala/org/tensorframes/BasicOperationsSuite.scala @@ -5,6 +5,7 @@ import org.tensorframes.impl.DebugRowOps import org.tensorframes.test.dsl._ import org.apache.spark.Logging +import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, IntegerType} @@ -202,8 +203,48 @@ class BasicOperationsSuite class CurrentOperationsSuite extends FunSuite with TensorFramesTestSparkContext with Logging { + import Shape.Unknown lazy val sql = sqlContext val ops = new DebugRowOps + test("dense vector udts") { + val df = sql.createDataFrame(Seq( + (1, Vectors.dense(1.0)), + (2, Vectors.dense(2.0)))).toDF("col1", "col2") + val exp = ops.explainDetailed(df) + logDebug(s"exp=$exp") + val stf = exp.cols.find(_.columnName=="col2").get.stf.get + println(ops.explain(df)) + assert(stf.dataType === DoubleType) + assert(stf.shape === Shape(2, 1)) + } + + test("dense vector udts with different shapes") { + val df = sql.createDataFrame(Seq( + (1, Vectors.dense(1.0)), + (2, Vectors.dense(2.0, 2.1)))).toDF("col1", "col2") + val exp = ops.explainDetailed(df) + logDebug(s"exp=$exp") + val stf = exp.cols.find(_.columnName=="col2").get.stf.get + println(ops.explain(df)) + assert(stf.dataType === DoubleType) + assert(stf.shape === Shape(2, Unknown)) + } + + test("dense matrix udts") { + + } + + test("dense matrix udts with different row sizes") { + + } + + test("dense matrix udts with different column sizes") { + + } + + test("dense matrix udts with different column and row sizes") { + + } } \ No newline at end of file From 2460b977e314801b66af1505fdee731dd1fc9617 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 18 Mar 2016 00:34:19 -0700 Subject: [PATCH 2/2] working vector analysis --- .../org/tensorframes/ColumnInformation.scala | 6 ++++ .../tensorframes/ExperimentalOperations.scala | 6 +++- .../tensorframes/BasicOperationsSuite.scala | 36 ++++++++++++++++--- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/main/scala/org/tensorframes/ColumnInformation.scala b/src/main/scala/org/tensorframes/ColumnInformation.scala index 3e0dac5..e216484 100644 --- a/src/main/scala/org/tensorframes/ColumnInformation.scala +++ b/src/main/scala/org/tensorframes/ColumnInformation.scala @@ -1,6 +1,7 @@ package org.tensorframes import org.apache.spark.Logging +import org.apache.spark.mllib.linalg.VectorUDT import org.apache.spark.sql.types._ @@ -126,6 +127,11 @@ object ColumnInformation extends Logging { extractFromRow(x.elementType).map { info => SparkTFColInfo(info.shape.prepend(Unknown), info.dataType) } + case v: VectorUDT => + // We do not have extra shape information at this point. + // The dtype is generic double + val dtype = DoubleType + Some(SparkTFColInfo(Shape(Unknown, Unknown), DoubleType)) case _ => logDebug("not understood: " + dt) // Not understood. diff --git a/src/main/scala/org/tensorframes/ExperimentalOperations.scala b/src/main/scala/org/tensorframes/ExperimentalOperations.scala index 05d4548..6e8c691 100644 --- a/src/main/scala/org/tensorframes/ExperimentalOperations.scala +++ b/src/main/scala/org/tensorframes/ExperimentalOperations.scala @@ -1,9 +1,10 @@ package org.tensorframes import org.apache.spark.Logging +import org.apache.spark.mllib.linalg.{VectorUDT, DenseVector, Vector} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col -import org.apache.spark.sql.types.{ArrayType, DataType, NumericType} +import org.apache.spark.sql.types.{DoubleType, ArrayType, DataType, NumericType} import org.tensorframes.impl.SupportedOperations /** @@ -113,11 +114,14 @@ private[tensorframes] object ExtraOperations extends ExperimentalOperations with private def extractBasicType(dt: DataType): Option[NumericType] = dt match { case x: NumericType => Some(x) case x: ArrayType => extractBasicType(x.elementType) + case _: VectorUDT => Some(DoubleType) case _ => None } def analyzeData(x: Any): Option[Shape] = x match { case null => None + case x: DenseVector => + Some(Shape(x.size)) case u: Array[_] => val shapes = u.map(analyzeData) mergeStructs(shapes).map(_.prepend(u.length)) diff --git a/src/test/scala/org/tensorframes/BasicOperationsSuite.scala b/src/test/scala/org/tensorframes/BasicOperationsSuite.scala index 251b790..749ed7f 100644 --- a/src/test/scala/org/tensorframes/BasicOperationsSuite.scala +++ b/src/test/scala/org/tensorframes/BasicOperationsSuite.scala @@ -217,7 +217,7 @@ class CurrentOperationsSuite val stf = exp.cols.find(_.columnName=="col2").get.stf.get println(ops.explain(df)) assert(stf.dataType === DoubleType) - assert(stf.shape === Shape(2, 1)) + assert(stf.shape === Shape(Unknown, Unknown)) } test("dense vector udts with different shapes") { @@ -229,22 +229,48 @@ class CurrentOperationsSuite val stf = exp.cols.find(_.columnName=="col2").get.stf.get println(ops.explain(df)) assert(stf.dataType === DoubleType) + assert(stf.shape === Shape(Unknown, Unknown)) + } + + test("analyze dense vector udts") { + val df = sql.createDataFrame(Seq( + (1, Vectors.dense(1.0)), + (2, Vectors.dense(2.0)))).toDF("col1", "col2") + val df2 = ops.analyze(df) + val exp = ops.explainDetailed(df2) + logDebug(s"exp=$exp") + val stf = exp.cols.find(_.columnName=="col2").get.stf.get + println(ops.explain(df)) + assert(stf.dataType === DoubleType) + assert(stf.shape === Shape(2, 1)) + } + + test("analyze dense vector udts with different shapes") { + val df = sql.createDataFrame(Seq( + (1, Vectors.dense(1.0)), + (2, Vectors.dense(2.0, 2.1)))).toDF("col1", "col2") + val df2 = ops.analyze(df) + val exp = ops.explainDetailed(df2) + logDebug(s"exp=$exp") + val stf = exp.cols.find(_.columnName=="col2").get.stf.get + println(ops.explain(df)) + assert(stf.dataType === DoubleType) assert(stf.shape === Shape(2, Unknown)) } - test("dense matrix udts") { + ignore("dense matrix udts") { } - test("dense matrix udts with different row sizes") { + ignore("dense matrix udts with different row sizes") { } - test("dense matrix udts with different column sizes") { + ignore("dense matrix udts with different column sizes") { } - test("dense matrix udts with different column and row sizes") { + ignore("dense matrix udts with different column and row sizes") { } } \ No newline at end of file