spark.range(1).createOrReplaceTempView("demo")
// DESC on a temporary view
scala> sql("DESC EXTENDED demo").show
+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
| id| bigint| null|
+--------+---------+-------+
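// The view's column metadata is also available programmatically via the Catalog API.
// A minimal sketch (assumes the demo view above is still registered):
spark.catalog.listColumns("demo").select("name", "dataType").show
assert(spark.catalog.listColumns("demo").count == 1)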
// DESC on a table
// Make the demo reproducible (drop the table if it already exists)
spark.sharedState.externalCatalog.dropTable(
  db = "default",
  table = "bucketed",
  ignoreIfNotExists = true,
  purge = true)
spark.range(10).write.bucketBy(5, "id").saveAsTable("bucketed")
assert(spark.catalog.tableExists("bucketed"))
// Use EXTENDED to include the Detailed Table Information section
// Note that the table is bucketed, but not partitioned
// FORMATTED could be used instead of EXTENDED (they are equivalent here)
scala> sql("DESC EXTENDED bucketed").show(numRows = 50, truncate = false)
+----------------------------+-----------------------------------------------------------------------------+-------+
|col_name |data_type |comment|
+----------------------------+-----------------------------------------------------------------------------+-------+
|id |bigint |null |
| | | |
|# Detailed Table Information| | |
|Database |default | |
|Table |bucketed | |
|Owner |jacek | |
|Created Time |Sun Sep 30 20:57:22 CEST 2018 | |
|Last Access |Thu Jan 01 01:00:00 CET 1970 | |
|Created By |Spark 2.3.1 | |
|Type |MANAGED | |
|Provider |parquet | |
|Num Buckets |5 | |
|Bucket Columns |[`id`] | |
|Sort Columns |[] | |
|Table Properties |[transient_lastDdlTime=1538333842] | |
|Statistics |3740 bytes | |
|Location |file:/Users/jacek/dev/apps/spark-2.3.1-bin-hadoop2.7/spark-warehouse/bucketed| |
|Serde Library |org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | |
|InputFormat |org.apache.hadoop.mapred.SequenceFileInputFormat | |
|OutputFormat |org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat | |
|Storage Properties |[serialization.format=1] | |
+----------------------------+-----------------------------------------------------------------------------+-------+
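// The Detailed Table Information above can also be fetched as a CatalogTable.
// A minimal sketch using the session catalog (a TableIdentifier without a
// database defaults to the current one):
import org.apache.spark.sql.catalyst.TableIdentifier
val metadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bucketed"))
// bucketSpec carries Num Buckets, Bucket Columns and Sort Columns
assert(metadata.bucketSpec.exists(_.numBuckets == 5))
assert(metadata.provider.contains("parquet"))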
// Make the demo reproducible (drop the table if it already exists)
val tableName = "partitioned_bucketed_sorted"
val partCol = "part"
spark.sharedState.externalCatalog.dropTable(
  db = "default",
  table = tableName,
  ignoreIfNotExists = true,
  purge = true)
spark.range(10)
  .withColumn("part", $"id" % 2) // extra column to partition by
  .write
  .partitionBy(partCol)
  .bucketBy(5, "id")
  .sortBy("id")
  .saveAsTable(tableName)
assert(spark.catalog.tableExists(tableName))
scala> sql(s"DESC EXTENDED $tableName").show(numRows = 50, truncate = false)
+----------------------------+------------------------------------------------------------------------------------------------+-------+
|col_name |data_type |comment|
+----------------------------+------------------------------------------------------------------------------------------------+-------+
|id |bigint |null |
|part |bigint |null |
|# Partition Information | | |
|# col_name |data_type |comment|
|part |bigint |null |
| | | |
|# Detailed Table Information| | |
|Database |default | |
|Table |partitioned_bucketed_sorted | |
|Owner |jacek | |
|Created Time |Mon Oct 01 10:05:32 CEST 2018 | |
|Last Access |Thu Jan 01 01:00:00 CET 1970 | |
|Created By |Spark 2.3.1 | |
|Type |MANAGED | |
|Provider |parquet | |
|Num Buckets |5 | |
|Bucket Columns |[`id`] | |
|Sort Columns |[`id`] | |
|Table Properties |[transient_lastDdlTime=1538381132] | |
|Location |file:/Users/jacek/dev/apps/spark-2.3.1-bin-hadoop2.7/spark-warehouse/partitioned_bucketed_sorted| |
|Serde Library |org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | |
|InputFormat |org.apache.hadoop.mapred.SequenceFileInputFormat | |
|OutputFormat |org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat | |
|Storage Properties |[serialization.format=1] | |
|Partition Provider |Catalog | |
+----------------------------+------------------------------------------------------------------------------------------------+-------+
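// Partition Provider: Catalog means the partitions are tracked in the metastore,
// so they can be listed without scanning the file system.
// A minimal sketch using the same externalCatalog as in the setup above:
val partitions = spark.sharedState.externalCatalog.listPartitions(
  db = "default",
  table = tableName)
// id % 2 gives exactly two partitions: part=0 and part=1
assert(partitions.map(_.spec).toSet == Set(Map(partCol -> "0"), Map(partCol -> "1")))
// In SQL: sql(s"SHOW PARTITIONS $tableName").show gives the same listing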
scala> sql(s"DESCRIBE EXTENDED $tableName PARTITION ($partCol=1)").show(numRows = 50, truncate = false)
+--------------------------------+-------------------------------------------------------------------------------------------------------------------------------+-------+
|col_name |data_type |comment|
+--------------------------------+-------------------------------------------------------------------------------------------------------------------------------+-------+
|id |bigint |null |
|part |bigint |null |
|# Partition Information | | |
|# col_name |data_type |comment|
|part |bigint |null |
| | | |
|# Detailed Partition Information| | |
|Database |default | |
|Table |partitioned_bucketed_sorted | |
|Partition Values |[part=1] | |
|Location |file:/Users/jacek/dev/apps/spark-2.3.1-bin-hadoop2.7/spark-warehouse/partitioned_bucketed_sorted/part=1 | |
|Serde Library |org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | |
|InputFormat |org.apache.hadoop.mapred.SequenceFileInputFormat | |
|OutputFormat |org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat | |
|Storage Properties |[path=file:/Users/jacek/dev/apps/spark-2.3.1-bin-hadoop2.7/spark-warehouse/partitioned_bucketed_sorted, serialization.format=1]| |
|Partition Parameters |{totalSize=1870, numFiles=5, transient_lastDdlTime=1538381329} | |
|Partition Statistics |1870 bytes | |
| | | |
|# Storage Information | | |
|Num Buckets |5 | |
|Bucket Columns |[`id`] | |
|Sort Columns |[`id`] | |
|Location |file:/Users/jacek/dev/apps/spark-2.3.1-bin-hadoop2.7/spark-warehouse/partitioned_bucketed_sorted | |
|Serde Library |org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | |
|InputFormat |org.apache.hadoop.mapred.SequenceFileInputFormat | |
|OutputFormat |org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat | |
|Storage Properties |[serialization.format=1] | |
+--------------------------------+-------------------------------------------------------------------------------------------------------------------------------+-------+
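// The Detailed Partition Information above can be fetched directly as well.
// A minimal sketch (spec maps partition column names to their string values):
val partition = spark.sharedState.externalCatalog.getPartition(
  db = "default",
  table = tableName,
  spec = Map(partCol -> "1"))
// storage.locationUri corresponds to the Location row above
assert(partition.spec == Map(partCol -> "1"))
assert(partition.storage.locationUri.isDefined)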