From 403f460c644308126b6f3ab5dda66fa6b1872ce9 Mon Sep 17 00:00:00 2001
From: Zongheng Yang
Date: Sat, 12 Jul 2014 15:52:47 -0700
Subject: [PATCH 1/2] Incorporate shark/pull/329 into Spark SQL.

Credit to @chiragaggarwal.
---
 .../apache/spark/sql/hive/TableReader.scala | 64 +++++++++++++++----
 1 file changed, 53 insertions(+), 11 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index 8cfde46186ca4..e0e15b9958f97 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -21,9 +21,14 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{Path, PathFilter}
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants._
 import org.apache.hadoop.hive.ql.exec.Utilities
+import org.apache.hadoop.hive.ql.io.orc.OrcSerde
 import org.apache.hadoop.hive.ql.metadata.{Partition => HivePartition, Table => HiveTable}
 import org.apache.hadoop.hive.ql.plan.TableDesc
-import org.apache.hadoop.hive.serde2.Deserializer
+import org.apache.hadoop.hive.serde2.{Serializer, Deserializer}
+import org.apache.hadoop.hive.serde2.columnar.{ColumnarStruct => HiveColumnarStruct}
+import org.apache.hadoop.hive.serde2.`lazy`.LazyStruct
+import org.apache.hadoop.hive.serde2.objectinspector.{StructObjectInspector, ObjectInspectorConverters}
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.IdentityConverter
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf}
 
@@ -40,7 +45,6 @@ private[hive] sealed trait TableReader {
   def makeRDDForPartitionedTable(partitions: Seq[HivePartition]): RDD[_]
 }
 
-
 /**
  * Helper class for scanning tables stored in Hadoop - e.g., to read Hive tables that reside in the
  * data warehouse directory.
@@ -157,21 +161,59 @@ class HadoopTableReader(@transient _tableDesc: TableDesc, @transient sc: HiveCon
 
       // Create local references so that the outer object isn't serialized.
       val tableDesc = _tableDesc
+      val tableSerDeClass = tableDesc.getDeserializerClass
+
       val broadcastedHiveConf = _broadcastedHiveConf
       val localDeserializer = partDeserializer
 
       val hivePartitionRDD = createHadoopRdd(tableDesc, inputPathStr, ifc)
-      hivePartitionRDD.mapPartitions { iter =>
+      hivePartitionRDD.mapPartitions { case iter =>
         val hconf = broadcastedHiveConf.value.value
         val rowWithPartArr = new Array[Object](2)
-        // Map each tuple to a row object
-        iter.map { value =>
-          val deserializer = localDeserializer.newInstance()
-          deserializer.initialize(hconf, partProps)
-          val deserializedRow = deserializer.deserialize(value)
-          rowWithPartArr.update(0, deserializedRow)
-          rowWithPartArr.update(1, partValues)
-          rowWithPartArr.asInstanceOf[Object]
+
+        val partSerDe = localDeserializer.newInstance()
+        val tableSerDe = tableSerDeClass.newInstance()
+        partSerDe.initialize(hconf, partProps)
+        tableSerDe.initialize(hconf, tableDesc.getProperties)
+
+        val tblConvertedOI = ObjectInspectorConverters.getConvertedOI(
+          partSerDe.getObjectInspector, tableSerDe.getObjectInspector, true)
+          .asInstanceOf[StructObjectInspector]
+        val partTblObjectInspectorConverter = ObjectInspectorConverters.getConverter(
+          partSerDe.getObjectInspector, tblConvertedOI)
+
+        // This is done per partition, and unnecessary to put it in the iterations (in iter.map).
+        rowWithPartArr.update(1, partValues)
+
+        // Map each tuple to a row object.
+        if (partTblObjectInspectorConverter.isInstanceOf[IdentityConverter]) {
+          iter.map { case value =>
+            rowWithPartArr.update(0, partSerDe.deserialize(value))
+            rowWithPartArr.asInstanceOf[Object]
+          }
+        } else {
+          iter.map { case value =>
+            val deserializedRow = {
+              // If partition schema does not match table schema, update the row to match.
+              val convertedRow = partTblObjectInspectorConverter.convert(partSerDe.deserialize(value))
+
+              // If conversion was performed, convertedRow will be a standard Object, but if
+              // conversion wasn't necessary, it will still be lazy. We can't have both across
+              // partitions, so we serialize and deserialize again to make it lazy.
+              if (tableSerDe.isInstanceOf[OrcSerde]) {
+                convertedRow
+              } else {
+                convertedRow match {
+                  case _: LazyStruct => convertedRow
+                  case _: HiveColumnarStruct => convertedRow
+                  case _ => tableSerDe.deserialize(
+                    tableSerDe.asInstanceOf[Serializer].serialize(convertedRow, tblConvertedOI))
+                }
+              }
+            }
+            rowWithPartArr.update(0, deserializedRow)
+            rowWithPartArr.asInstanceOf[Object]
+          }
+        }
       }
     }.toSeq

From db332f20695763f79d9ed6732944ad610db7acb6 Mon Sep 17 00:00:00 2001
From: Zongheng Yang
Date: Sat, 12 Jul 2014 19:41:57 -0700
Subject: [PATCH 2/2] Fix scalastyle.

---
 .../src/main/scala/org/apache/spark/sql/hive/TableReader.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index e0e15b9958f97..bb2240b335d4f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -195,7 +195,8 @@ class HadoopTableReader(@transient _tableDesc: TableDesc, @transient sc: HiveCon
           iter.map { case value =>
             val deserializedRow = {
               // If partition schema does not match table schema, update the row to match.
-              val convertedRow = partTblObjectInspectorConverter.convert(partSerDe.deserialize(value))
+              val convertedRow =
+                partTblObjectInspectorConverter.convert(partSerDe.deserialize(value))
 
               // If conversion was performed, convertedRow will be a standard Object, but if
               // conversion wasn't necessary, it will still be lazy. We can't have both across
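
For context on the conversion step these patches add: a converter is built once per partition from the partition SerDe's ObjectInspector to the table SerDe's, and per-row conversion is skipped entirely when that converter is an IdentityConverter (the partition and table schemas already agree). The sketch below exercises the same getConvertedOI/getConverter path in isolation against the Hive 0.12-era serde2 API this code compiles against. The LazySimpleSerDe setup, the column names and types (an int column widened to bigint at the table level), the sample row, and the object name are illustrative assumptions, not taken from the patch.

```scala
import java.util.Properties

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorConverters, StructObjectInspector}
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.IdentityConverter
import org.apache.hadoop.io.Text

// Hypothetical, standalone illustration of partition-to-table schema conversion;
// not part of the patch above.
object PartitionToTableConversionSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()

    // Hypothetical partition schema: "id" was written as an int in this partition.
    val partProps = new Properties()
    partProps.setProperty("columns", "id,name")
    partProps.setProperty("columns.types", "int,string")
    val partSerDe = new LazySimpleSerDe()
    partSerDe.initialize(conf, partProps)

    // Hypothetical table schema: "id" has since been widened to bigint.
    val tableProps = new Properties()
    tableProps.setProperty("columns", "id,name")
    tableProps.setProperty("columns.types", "bigint,string")
    val tableSerDe = new LazySimpleSerDe()
    tableSerDe.initialize(conf, tableProps)

    // Build a converter from the partition's object inspector to the table's,
    // mirroring the getConvertedOI/getConverter calls the patch performs per partition.
    val convertedOI = ObjectInspectorConverters.getConvertedOI(
      partSerDe.getObjectInspector, tableSerDe.getObjectInspector, true)
      .asInstanceOf[StructObjectInspector]
    val converter = ObjectInspectorConverters.getConverter(
      partSerDe.getObjectInspector, convertedOI)

    // If the two schemas were identical, Hive would return an IdentityConverter
    // and the per-row convert() call could be skipped; here it prints false.
    println(s"identity converter: ${converter.isInstanceOf[IdentityConverter]}")

    // Deserialize one ctrl-A-delimited row with the partition SerDe, then
    // convert it so it can be inspected with the table's schema.
    val rawRow = partSerDe.deserialize(new Text("42\u0001alice"))
    val convertedRow = converter.convert(rawRow)

    val fields = convertedOI.getAllStructFieldRefs
    // "id" now carries the table's bigint type; "name" is unchanged.
    println(convertedOI.getStructFieldData(convertedRow, fields.get(0)))
    println(convertedOI.getStructFieldData(convertedRow, fields.get(1)))
  }
}
```

When the partition's declared types differ from the table's, the identity check prints false and the row comes back re-typed to the table schema, which is the situation the non-identity branch in the patch handles; when the schemas match, the IdentityConverter branch avoids both the convert() call and the extra serialize/deserialize round trip on every row.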