Spark SQL based Shark CLI #341
@@ -59,9 +59,19 @@
```
resultSet.next()
println(resultSet.getInt(1))
```
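(Not part of the diff.) The two `resultSet` lines above are the tail of the README's JDBC example. For context, a self-contained client doing the same thing might look like the sketch below; the driver class, URL, port, and query are assumptions, not anything this PR specifies:

```scala
// Hypothetical JDBC client around the README snippet above. Assumes a Shark
// server listening on localhost:10000 (HiveServer1 protocol) and the Hive
// JDBC driver on the classpath; adjust URL, credentials, and query as needed.
import java.sql.DriverManager

object SharkJdbcClient {
  def main(args: Array[String]): Unit = {
    Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver")
    val connection = DriverManager.getConnection("jdbc:hive://localhost:10000/default", "", "")
    val statement = connection.createStatement()
    val resultSet = statement.executeQuery("SELECT count(*) FROM src")
    resultSet.next()                 // same two calls as the snippet above
    println(resultSet.getInt(1))
    connection.close()
  }
}
```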
## Running Shark CLI
* Configure shark_home/conf/shark-env.sh (a hypothetical example appears below, after this section)
* Configure shark_home/conf/hive-site.xml
* Start the Shark CLI
```
$ bin/shark
catalyst> show tables;
catalyst> set shark.exec.mode=hive;
hive> show tables;
```
There is currently a bug that requires running `show tables` before doing anything else.
> Review comment: This should not be a bug anymore. :)
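The shark-env.sh referenced in the first step above is not shown in this diff. A hypothetical minimal version, limited to variables the code in this PR actually reads (MASTER and SPARK_HOME directly, SPARK_MEM and HADOOP_HOME via the executor environment), might look like:

```sh
# Hypothetical conf/shark-env.sh -- variable names come from the environment
# lookups in CatalystEnv.scala below; paths and sizes are placeholders.
export MASTER=local                  # CatalystEnv falls back to "local" when unset
export SPARK_HOME=/path/to/spark
export SPARK_MEM=2g                  # forwarded to executors
export HADOOP_HOME=/path/to/hadoop   # forwarded to executors
```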
## Known Missing Features
* Shark CLI
* Restoring cached tables upon restart
* Invalidation of cached tables when data is INSERTed
* Off-heap storage using Tachyon
* TGFs
New file: CatalystContext.scala

@@ -0,0 +1,32 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.hive

import org.apache.hadoop.hive.ql.session.SessionState
import org.apache.spark.SparkContext

import shark.LogHelper

class CatalystContext(sc: SparkContext) extends HiveContext(sc) with LogHelper {

  type QueryExecution = HiveContext#QueryExecution

  // Reuse the Hive session and configuration already established by the CLI
  // instead of creating fresh ones.
  @transient protected[hive] override lazy val sessionState = SessionState.get()
  @transient protected[hive] override lazy val hiveconf = sessionState.getConf

  // Compile a HiveQL string into an executable query plan.
  def executeHiveQL(statement: String): this.QueryExecution = executePlan(hql(statement).logicalPlan)
}

Review thread on the class name:
> SparkSQLContext?
> or JDBCContext?
> Hmm... I'd rather just call it […]
> SGTM
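(Not part of the diff.) A hypothetical usage sketch for the class above. Since `CatalystContext` reuses `SessionState.get()`, it expects a Hive session to have been started already, as it is in the Shark CLI; the query and app name here are assumptions:

```scala
// Hypothetical sketch: run one HiveQL statement through CatalystContext and
// print the rows. stringResult() is the same call CatalystDriver makes below.
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.ql.session.SessionState
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.CatalystContext

object CatalystContextDemo {
  def main(args: Array[String]): Unit = {
    SessionState.start(new HiveConf(classOf[SessionState]))  // CatalystContext reads SessionState.get()
    val sc = new SparkContext("local", "CatalystContextDemo")
    val cc = new CatalystContext(sc)
    cc.executeHiveQL("SHOW TABLES").stringResult().foreach(println)
    sc.stop()
  }
}
```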
New file: CatalystDriver.scala

@@ -0,0 +1,89 @@
/*
 * Copyright (C) 2012 The Regents of The University California.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package shark

import scala.collection.JavaConversions._

import java.util.{ArrayList => JArrayList}

import org.apache.commons.lang.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse
import org.apache.spark.sql.hive.{CatalystContext, HiveMetastoreTypes}

class CatalystDriver(val context: CatalystContext = CatalystEnv.catalystContext) extends Driver with LogHelper {
  private var tableSchema: Schema = _
  private var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  // Derive a Hive metastore Schema from the analyzed logical plan so the CLI
  // can render column headers.
  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logger.debug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.size == 0) {
      new Schema(new FieldSchema("Response code", "string", "") :: Nil, null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }

      new Schema(fieldSchemas, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    val execution = context.executeHiveQL(command)

    // TODO unify the error code
    try {
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
      case cause: Throwable =>
        logError(s"Failed in [$command]", cause)
        new CommandProcessorResponse(-3, ExceptionUtils.getFullStackTrace(cause), null)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getSchema: Schema = tableSchema

  // Results are handed over exactly once: the buffered response is appended
  // to `res`, then cleared so the next call returns false.
  override def getResults(res: JArrayList[String]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.addAll(hiveResponse)
      hiveResponse = null
      true
    }
  }

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}
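(Not part of the diff.) A hypothetical sketch of the driver life cycle the CLI goes through with this class: run a command, drain the buffered rows, then close. The query text is an assumption:

```scala
// Hypothetical sketch: exercise CatalystDriver the way the Hive CLI does.
import java.util.{ArrayList => JArrayList}
import scala.collection.JavaConversions._

import shark.{CatalystDriver, CatalystEnv}

object CatalystDriverDemo {
  def main(args: Array[String]): Unit = {
    val driver = new CatalystDriver(CatalystEnv.init())
    val response = driver.run("SHOW TABLES")
    if (response.getResponseCode == 0) {
      val rows = new JArrayList[String]()
      // getResults hands the buffered response over once, then returns false.
      while (driver.getResults(rows)) {
        rows.foreach(println)
        rows.clear()
      }
    }
    driver.close()
  }
}
```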
New file: CatalystEnv.scala

@@ -0,0 +1,145 @@
/*
 * Copyright (C) 2012 The Regents of The University California.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package shark

import scala.collection.mutable
import scala.collection.mutable.{HashMap, HashSet}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.shims.ShimLoader

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SplitInfo, StatsReportListener}
import org.apache.spark.sql.hive.CatalystContext

/** A singleton object for the master program. The slaves should not access this. */
// TODO add tachyon / memory store based (Copied from SharkEnv.scala)
object CatalystEnv extends LogHelper {

  def init(): CatalystContext = {
    if (catalystContext == null) {
      initWithCatalystContext()
    }

    catalystContext
  }

  // Hive's Hadoop shims may decide we are in local mode even when Spark is
  // running on a cluster; rewrite the job-tracker settings so the two agree.
  def fixIncompatibleConf(conf: Configuration) {
    if (sparkContext == null) {
      init()
    }

    val hiveIsLocal = ShimLoader.getHadoopShims.isLocalMode(conf)
    if (!sparkContext.isLocal && hiveIsLocal) {
      val warnMessage = "Hive Hadoop shims detected local mode, but Shark is not running locally."
      logWarning(warnMessage)

      // Try to fix this without bothering the user
      val newValue = "Spark_%s".format(System.currentTimeMillis())
      for (k <- Seq("mapred.job.tracker", "mapreduce.framework.name")) {
        val v = conf.get(k)
        if (v == null || v == "" || v == "local") {
          conf.set(k, newValue)
          logWarning("Setting %s to '%s' (was '%s')".format(k, newValue, v))
        }
      }

      // If still not fixed, bail out
      if (ShimLoader.getHadoopShims.isLocalMode(conf)) {
        throw new Exception(warnMessage)
      }
    }
  }

  def initWithCatalystContext(
      jobName: String = "Shark::" + java.net.InetAddress.getLocalHost.getHostName,
      master: String = System.getenv("MASTER")): CatalystContext = {

    sparkContext = initSparkContext(jobName, master)
    sparkContext.addSparkListener(new StatsReportListener())

    catalystContext = new CatalystContext(sparkContext)
    catalystContext
  }

  private def initSparkContext(
      jobName: String = "Shark::" + java.net.InetAddress.getLocalHost.getHostName,
      master: String = System.getenv("MASTER")): SparkContext = {

    if (sparkContext != null) {
      sparkContext.stop()
    }

    sparkContext = new SparkContext(
      createSparkConf(if (master == null) "local" else master,
        jobName,
        System.getenv("SPARK_HOME"),
        Nil,
        executorEnvVars), Map[String, Set[SplitInfo]]())

    sparkContext
  }

  private def createSparkConf(
      master: String,
      jobName: String,
      sparkHome: String,
      jars: Seq[String],
      environment: HashMap[String, String]): SparkConf = {

    val newConf = new SparkConf()
      .setMaster(master)
      .setAppName(jobName)
      .setJars(jars)
      .setExecutorEnv(environment.toSeq)

    Option(sparkHome).foreach(newConf.setSparkHome)
    newConf
  }

  logDebug("Initializing SharkEnv")

  // Environment variables forwarded to executors; unset variables default to "".
  val executorEnvVars = {
    val envVars = Set(
      "SPARK_MEM",
      "SPARK_CLASSPATH",
      "HADOOP_HOME",
      "JAVA_HOME",
      "MESOS_NATIVE_LIBRARY",
      "TACHYON_MASTER",
      "TACHYON_WAREHOUSE_PATH")
    HashMap.empty ++= envVars.map { key =>
      key -> Option(System.getenv(key)).getOrElse("")
    }.toMap
  }

  var catalystContext: CatalystContext = _

  var sparkContext: SparkContext = _

  /** Cleans up and shuts down the Shark environments. */
  def stop() {
    logDebug("Shutting down Shark Environment")
    // Stop the SparkContext
    if (CatalystEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      catalystContext = null
    }
  }
}
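(Not part of the diff.) A hypothetical bring-up/tear-down sketch around the singleton above; the MASTER fallback matches initSparkContext, and the query is an assumption:

```scala
// Hypothetical sketch: initialize the Shark environment, run a statement
// through the resulting CatalystContext, and shut everything down.
import shark.CatalystEnv

object CatalystEnvDemo {
  def main(args: Array[String]): Unit = {
    val context = CatalystEnv.init()  // builds a SparkContext (MASTER env var or "local") + CatalystContext
    println(context.executeHiveQL("SHOW TABLES").stringResult().mkString("\n"))
    CatalystEnv.stop()                // stops the SparkContext and clears the singletons
  }
}
```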
Review comments:
> This is a missing feature.

> Thanks, the Spark SQL branch is changing somewhat rapidly, and I'd like to update the README file when it's somewhat stabilized :)