From caafee5f60be67522e1e8d5bf224152bd32f6383 Mon Sep 17 00:00:00 2001
From: Timothy Hunter <timhunter@databricks.com>
Date: Fri, 18 Mar 2016 00:16:57 -0700
Subject: [PATCH 1/2] started to work on mllib types

---
 build.sbt                                     |  2 +
 .../tensorframes/BasicOperationsSuite.scala   | 41 +++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/build.sbt b/build.sbt
index d6938cf..148fd1a 100644
--- a/build.sbt
+++ b/build.sbt
@@ -56,6 +56,8 @@ libraryDependencies += "org.apache.spark" %% "spark-core" % targetSparkVersion %
 
 libraryDependencies += "org.apache.spark" %% "spark-sql" % targetSparkVersion % "provided"
 
+libraryDependencies += "org.apache.spark" %% "spark-mllib" % targetSparkVersion % "provided"
+
 libraryDependencies += "org.scalatest" %% "scalatest" % "2.1.3" % "test"
 
 // Compilation of proto files
diff --git a/src/test/scala/org/tensorframes/BasicOperationsSuite.scala b/src/test/scala/org/tensorframes/BasicOperationsSuite.scala
index a613007..251b790 100644
--- a/src/test/scala/org/tensorframes/BasicOperationsSuite.scala
+++ b/src/test/scala/org/tensorframes/BasicOperationsSuite.scala
@@ -5,6 +5,7 @@ import org.tensorframes.impl.DebugRowOps
 import org.tensorframes.test.dsl._
 
 import org.apache.spark.Logging
+import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.types.{DoubleType, IntegerType}
 
@@ -202,8 +203,48 @@ class BasicOperationsSuite
 
 class CurrentOperationsSuite
   extends FunSuite with TensorFramesTestSparkContext with Logging {
+  import Shape.Unknown
   lazy val sql = sqlContext
 
   val ops = new DebugRowOps
 
+  test("dense vector udts") {
+    val df = sql.createDataFrame(Seq(
+      (1, Vectors.dense(1.0)),
+      (2, Vectors.dense(2.0)))).toDF("col1", "col2")
+    val exp = ops.explainDetailed(df)
+    logDebug(s"exp=$exp")
+    val stf = exp.cols.find(_.columnName=="col2").get.stf.get
+    println(ops.explain(df))
+    assert(stf.dataType === DoubleType)
+    assert(stf.shape === Shape(2, 1))
+  }
+
+  test("dense vector udts with different shapes") {
+    val df = sql.createDataFrame(Seq(
+      (1, Vectors.dense(1.0)),
+      (2, Vectors.dense(2.0, 2.1)))).toDF("col1", "col2")
+    val exp = ops.explainDetailed(df)
+    logDebug(s"exp=$exp")
+    val stf = exp.cols.find(_.columnName=="col2").get.stf.get
+    println(ops.explain(df))
+    assert(stf.dataType === DoubleType)
+    assert(stf.shape === Shape(2, Unknown))
+  }
+
+  test("dense matrix udts") {
+
+  }
+
+  test("dense matrix udts with different row sizes") {
+
+  }
+
+  test("dense matrix udts with different column sizes") {
+
+  }
+
+  test("dense matrix udts with different column and row sizes") {
+
+  }
 }
\ No newline at end of file

From 2460b977e314801b66af1505fdee731dd1fc9617 Mon Sep 17 00:00:00 2001
From: Timothy Hunter <timhunter@databricks.com>
Date: Fri, 18 Mar 2016 00:34:19 -0700
Subject: [PATCH 2/2] working vector analysis

---
 .../org/tensorframes/ColumnInformation.scala  |  6 ++++
 .../tensorframes/ExperimentalOperations.scala |  6 +++-
 .../tensorframes/BasicOperationsSuite.scala   | 36 ++++++++++++++++---
 3 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/src/main/scala/org/tensorframes/ColumnInformation.scala b/src/main/scala/org/tensorframes/ColumnInformation.scala
index 3e0dac5..e216484 100644
--- a/src/main/scala/org/tensorframes/ColumnInformation.scala
+++ b/src/main/scala/org/tensorframes/ColumnInformation.scala
@@ -1,6 +1,7 @@
 package org.tensorframes
 
 import org.apache.spark.Logging
+import org.apache.spark.mllib.linalg.VectorUDT
 import org.apache.spark.sql.types._
 
 
@@ -126,6 +127,11 @@ object ColumnInformation extends Logging {
       extractFromRow(x.elementType).map { info =>
         SparkTFColInfo(info.shape.prepend(Unknown), info.dataType)
       }
+    case v: VectorUDT =>
+      // We do not have extra shape information at this point.
+      // The dtype is generic double
+      val dtype = DoubleType
+      Some(SparkTFColInfo(Shape(Unknown, Unknown), DoubleType))
     case _ =>
       logDebug("not understood: " + dt)
       // Not understood.
diff --git a/src/main/scala/org/tensorframes/ExperimentalOperations.scala b/src/main/scala/org/tensorframes/ExperimentalOperations.scala
index 05d4548..6e8c691 100644
--- a/src/main/scala/org/tensorframes/ExperimentalOperations.scala
+++ b/src/main/scala/org/tensorframes/ExperimentalOperations.scala
@@ -1,9 +1,10 @@
 package org.tensorframes
 
 import org.apache.spark.Logging
+import org.apache.spark.mllib.linalg.{VectorUDT, DenseVector, Vector}
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions.col
-import org.apache.spark.sql.types.{ArrayType, DataType, NumericType}
+import org.apache.spark.sql.types.{DoubleType, ArrayType, DataType, NumericType}
 import org.tensorframes.impl.SupportedOperations
 
 /**
@@ -113,11 +114,14 @@ private[tensorframes] object ExtraOperations extends ExperimentalOperations with
   private def extractBasicType(dt: DataType): Option[NumericType] = dt match {
     case x: NumericType => Some(x)
     case x: ArrayType => extractBasicType(x.elementType)
+    case _: VectorUDT => Some(DoubleType)
     case _ => None
   }
 
   def analyzeData(x: Any): Option[Shape] = x match {
     case null => None
+    case x: DenseVector =>
+      Some(Shape(x.size))
     case u: Array[_] =>
       val shapes = u.map(analyzeData)
       mergeStructs(shapes).map(_.prepend(u.length))
diff --git a/src/test/scala/org/tensorframes/BasicOperationsSuite.scala b/src/test/scala/org/tensorframes/BasicOperationsSuite.scala
index 251b790..749ed7f 100644
--- a/src/test/scala/org/tensorframes/BasicOperationsSuite.scala
+++ b/src/test/scala/org/tensorframes/BasicOperationsSuite.scala
@@ -217,7 +217,7 @@ class CurrentOperationsSuite
     val stf = exp.cols.find(_.columnName=="col2").get.stf.get
     println(ops.explain(df))
     assert(stf.dataType === DoubleType)
-    assert(stf.shape === Shape(2, 1))
+    assert(stf.shape === Shape(Unknown, Unknown))
   }
 
   test("dense vector udts with different shapes") {
@@ -229,22 +229,48 @@ class CurrentOperationsSuite
     val stf = exp.cols.find(_.columnName=="col2").get.stf.get
     println(ops.explain(df))
     assert(stf.dataType === DoubleType)
+    assert(stf.shape === Shape(Unknown, Unknown))
+  }
+
+  test("analyze dense vector udts") {
+    val df = sql.createDataFrame(Seq(
+      (1, Vectors.dense(1.0)),
+      (2, Vectors.dense(2.0)))).toDF("col1", "col2")
+    val df2 = ops.analyze(df)
+    val exp = ops.explainDetailed(df2)
+    logDebug(s"exp=$exp")
+    val stf = exp.cols.find(_.columnName=="col2").get.stf.get
+    println(ops.explain(df))
+    assert(stf.dataType === DoubleType)
+    assert(stf.shape === Shape(2, 1))
+  }
+
+  test("analyze dense vector udts with different shapes") {
+    val df = sql.createDataFrame(Seq(
+      (1, Vectors.dense(1.0)),
+      (2, Vectors.dense(2.0, 2.1)))).toDF("col1", "col2")
+    val df2 = ops.analyze(df)
+    val exp = ops.explainDetailed(df2)
+    logDebug(s"exp=$exp")
+    val stf = exp.cols.find(_.columnName=="col2").get.stf.get
+    println(ops.explain(df))
+    assert(stf.dataType === DoubleType)
     assert(stf.shape === Shape(2, Unknown))
   }
 
-  test("dense matrix udts") {
+  ignore("dense matrix udts") {
 
   }
 
-  test("dense matrix udts with different row sizes") {
+  ignore("dense matrix udts with different row sizes") {
 
   }
 
-  test("dense matrix udts with different column sizes") {
+  ignore("dense matrix udts with different column sizes") {
 
   }
 
-  test("dense matrix udts with different column and row sizes") {
+  ignore("dense matrix udts with different column and row sizes") {
 
   }
 }
\ No newline at end of file