Code example: defining a Spark SQL schema with JSON

This post shows a complete example: build a StructType with the Column DSL, print it as JSON with prettyJson, rebuild the same schema with DataType.fromJson, and use the rebuilt schema with from_json to parse raw JSON strings into a flat DataFrame.
package streaming.core

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.{functions => F}
import org.scalatest.concurrent.Eventually
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}
import streaming.util.Logging

/**
  * Created by sunbiaobiao on 2019/3/27.
  */
class SchemaTest extends FunSuite with BeforeAndAfter with BeforeAndAfterAll with Eventually with Logging {

  override def beforeAll(): Unit = {}

  test("spark sql schema") {
    // local[*] so the test runs without an external cluster
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    // Build the schema with the Column-based DSL: $"city".string returns
    // StructField("city", StringType) via the implicits imported above.
    val addressesSchema = new StructType()
      .add($"city".string)
      .add($"state".string)
      .add($"zip".string)

    val schema = new StructType()
      .add($"firstName".string)
      .add($"lastName".string)
      .add($"email".string)
      .add($"addresses".array(addressesSchema))
      .add("sons", ArrayType(MapType(StringType, new StructType().add($"firstName".string))))
      .add("keys", MapType(StringType, StringType))

    println(schema.prettyJson)

    // val schemaAsJson = schema.prettyJson
    // The JSON printed above looks like this:
    val schemaAsJson =
      """
        |{
        |  "type" : "struct",
        |  "fields" : [ {
        |    "name" : "firstName",
        |    "type" : "string",
        |    "nullable" : true,
        |    "metadata" : { }
        |  }, {
        |    "name" : "lastName",
        |    "type" : "string",
        |    "nullable" : true,
        |    "metadata" : { }
        |  }, {
        |    "name" : "email",
        |    "type" : "string",
        |    "nullable" : true,
        |    "metadata" : { }
        |  }, {
        |    "name" : "addresses",
        |    "type" : {
        |      "type" : "array",
        |      "elementType" : {
        |        "type" : "struct",
        |        "fields" : [ {
        |          "name" : "city",
        |          "type" : "string",
        |          "nullable" : true,
        |          "metadata" : { }
        |        }, {
        |          "name" : "state",
        |          "type" : "string",
        |          "nullable" : true,
        |          "metadata" : { }
        |        }, {
        |          "name" : "zip",
        |          "type" : "string",
        |          "nullable" : true,
        |          "metadata" : { }
        |        } ]
        |      },
        |      "containsNull" : true
        |    },
        |    "nullable" : true,
        |    "metadata" : { }
        |  }, {
        |    "name" : "sons",
        |    "type" : {
        |      "type" : "array",
        |      "elementType" : {
        |        "type" : "map",
        |        "keyType" : "string",
        |        "valueType" : {
        |          "type" : "struct",
        |          "fields" : [ {
        |            "name" : "firstName",
        |            "type" : "string",
        |            "nullable" : true,
        |            "metadata" : { }
        |          } ]
        |        },
        |        "valueContainsNull" : true
        |      },
        |      "containsNull" : true
        |    },
        |    "nullable" : true,
        |    "metadata" : { }
        |  }, {
        |    "name" : "keys",
        |    "type" : {
        |      "type" : "map",
        |      "keyType" : "string",
        |      "valueType" : "string",
        |      "valueContainsNull" : true
        |    },
        |    "nullable" : true,
        |    "metadata" : { }
        |  } ]
        |}
      """.stripMargin

    // Rebuild the schema from its JSON representation
    val dt = DataType.fromJson(schemaAsJson)

    val rawJsons = Seq("""
      {
        "firstName" : "Jacek",
        "lastName" : "Laskowski",
        "email" : "jacek@japila.pl",
        "addresses" : [ {
          "city" : "Warsaw",
          "state" : "N/A",
          "zip" : "02-791"
        } ]
      }
    """).toDF("rawjson")

    val people = rawJsons
      .select(F.from_json($"rawjson", dt, Map.empty[String, String]) as "json")
      .select("json.*")                               // <-- flatten the struct field
      .withColumn("address", F.explode($"addresses")) // <-- explode the array field
      .drop("addresses")                              // <-- no longer needed
      .select("firstName", "lastName", "email", "address.*") // <-- flatten the struct field

    // One input row with one address yields a single flattened row:
    // Jacek | Laskowski | jacek@japila.pl | Warsaw | N/A | 02-791
    people.show(truncate = false)
  }
}
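The test relies on schema JSON round-tripping: a StructType serializes with .json or .prettyJson, and DataType.fromJson parses the result back into an equal schema, so the JSON literal above could just as well be generated once and stored in a config file. Here is a minimal standalone sketch of that round trip; it needs no SparkSession, and the object name SchemaRoundTrip is just for illustration:

import org.apache.spark.sql.types._

object SchemaRoundTrip extends App {
  // A small schema built with the plain StructType API
  val schema = new StructType()
    .add("firstName", StringType)
    .add("addresses", ArrayType(new StructType()
      .add("city", StringType)
      .add("zip", StringType)))

  // Serialize to JSON, then parse back
  val asJson   = schema.json
  val restored = DataType.fromJson(asJson)

  // The round trip preserves the schema exactly (DataType equality is structural)
  assert(restored == schema)
  println(restored.prettyJson)
}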
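As an aside, since Spark 2.3 a schema can also be written as a compact DDL string and parsed with StructType.fromDDL, an alternative to the JSON form the post uses. A hedged sketch (the object name DdlSchemaSketch is hypothetical) expressing the same top-level fields:

import org.apache.spark.sql.types.StructType

object DdlSchemaSketch extends App {
  // Same shape as the firstName/lastName/email/addresses part of the schema
  // above, expressed as a DDL string and parsed by StructType.fromDDL
  val ddlSchema = StructType.fromDDL(
    "firstName STRING, lastName STRING, email STRING, " +
    "addresses ARRAY<STRUCT<city: STRING, state: STRING, zip: STRING>>")

  println(ddlSchema.prettyJson)
}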