Lightning Talk for Centro Tech Team on 3/10/17
# Parse JSON data with this one weird trick!

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import Row

# Set up a basic Spark session
conf = (SparkConf()
        .setAppName('My App')
        .set('spark.executor.memory', '10g'))
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)
spark_session = sql_context.sparkSession
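# Aside: on Spark 2.x you can also build the session directly instead of
# going through SQLContext (an equivalent sketch; left commented out here,
# since only one SparkContext can be active at a time):
# from pyspark.sql import SparkSession
# spark_session = (SparkSession.builder
#                  .appName('My App')
#                  .config('spark.executor.memory', '10g')
#                  .getOrCreate())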
# Sample data
jstr1 = u'{"header":{"id":12345,"foo":"bar"},"body":{"id":111000,"name":"foobar","sub_json":{"id":54321,"sub_sub_json":{"col1":20,"col2":"somethong"}}}}'
jstr2 = u'{"header":{"id":12346,"foo":"baz"},"body":{"id":111002,"name":"barfoo","sub_json":{"id":23456,"sub_sub_json":{"col1":30,"col2":"something else"}}}}'
jstr3 = u'{"header":{"id":43256,"foo":"foobaz"},"body":{"id":20192,"name":"bazbar","sub_json":{"id":39283,"sub_sub_json":{"col1":50,"col2":"another thing"}}}}'

df = sql_context.createDataFrame([Row(json=jstr1), Row(json=jstr2), Row(json=jstr3)])
df.show()
# >>> df.show()
# +--------------------+
# |                json|
# +--------------------+
# |{"header":{"id":1...|
# |{"header":{"id":1...|
# |{"header":{"id":4...|
# +--------------------+
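# Aside: if all you need is the nested columns, read.json can infer the
# full schema directly from an RDD of JSON strings (an alternative sketch,
# separate from the json_tuple walkthrough below):
auto_df = spark_session.read.json(sc.parallelize([jstr1, jstr2, jstr3]))
auto_df.printSchema()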
# Create a SQL view of the data
df.createOrReplaceTempView('df')
# Parse it, step by step:

# 1. Just the top-level JSON
q = """
select
    json
from df
"""
result = spark_session.sql(q)
result.show()
# >>> result.show()
# +--------------------+
# |                json|
# +--------------------+
# |{"header":{"id":1...|
# |{"header":{"id":1...|
# |{"header":{"id":4...|
# +--------------------+
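# Aside: a single nested field can also be reached in one call with
# get_json_object and a JSONPath-style expression (a sketch; json_tuple,
# used below, extracts several keys per call instead):
q_alt = """
select get_json_object(json, '$.header.id') as header_id
from df
"""
spark_session.sql(q_alt).show()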
# 2. Add the next level of nested data
q = """
select
    a.json
    , b.header
    , b.body
from df a
lateral view json_tuple(a.json, 'header', 'body') b
    as header, body
"""
result = spark_session.sql(q)
result.show()
# >>> result.show()
# +--------------------+--------------------+--------------------+
# |                json|              header|                body|
# +--------------------+--------------------+--------------------+
# |{"header":{"id":1...|{"id":12345,"foo"...|{"id":111000,"nam...|
# |{"header":{"id":1...|{"id":12346,"foo"...|{"id":111002,"nam...|
# |{"header":{"id":4...|{"id":43256,"foo"...|{"id":20192,"name...|
# +--------------------+--------------------+--------------------+
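# How this works: each "lateral view json_tuple(...)" takes a JSON string
# column and emits the requested keys as new string columns, joined back
# onto every source row; that is why the views can be chained, with each
# view feeding on a column produced by the previous one.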
# 3. Go deeper into the nesting
q = """
select
    a.json
    , b.header
    , c.id
    , c.foo
from df a
lateral view json_tuple(a.json, 'header', 'body') b
    as header, body
lateral view json_tuple(b.header, 'id', 'foo') c
    as id, foo
"""
result = spark_session.sql(q)
result.show()
# >>> result.show()
# +--------------------+--------------------+-----+------+
# |                json|              header|   id|   foo|
# +--------------------+--------------------+-----+------+
# |{"header":{"id":1...|{"id":12345,"foo"...|12345|   bar|
# |{"header":{"id":1...|{"id":12346,"foo"...|12346|   baz|
# |{"header":{"id":4...|{"id":43256,"foo"...|43256|foobaz|
# +--------------------+--------------------+-----+------+
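# Note: json_tuple always returns strings, even for numeric-looking fields
# like id above. Cast in the select if you need typed columns (a sketch of
# step 3 with a cast):
q_typed = """
select
    cast(c.id as bigint) as header_id
    , c.foo
from df a
lateral view json_tuple(a.json, 'header', 'body') b
    as header, body
lateral view json_tuple(b.header, 'id', 'foo') c
    as id, foo
"""
spark_session.sql(q_typed).show()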
# ...

# Finally: everything fully parsed out
q = """
select
    c.header_id
    , c.foo
    , d.body_id
    , d.name
    , e.id as sub_json_id
    , f.col1
    , f.col2
from df a
lateral view json_tuple(a.json, 'header', 'body') b
    as header, body
lateral view json_tuple(b.header, 'id', 'foo') c
    as header_id, foo
lateral view json_tuple(b.body, 'id', 'name', 'sub_json') d
    as body_id, name, sub_json
lateral view json_tuple(d.sub_json, 'id', 'sub_sub_json') e
    as id, sub_sub_json
lateral view json_tuple(e.sub_sub_json, 'col1', 'col2') f
    as col1, col2
"""
result = spark_session.sql(q)
result.show()
# >>> result.show()
# +---------+------+-------+------+-----------+----+--------------+
# |header_id|   foo|body_id|  name|sub_json_id|col1|          col2|
# +---------+------+-------+------+-----------+----+--------------+
# |    12345|   bar| 111000|foobar|      54321|  20|     somethong|
# |    12346|   baz| 111002|barfoo|      23456|  30|something else|
# |    43256|foobaz|  20192|bazbar|      39283|  50| another thing|
# +---------+------+-------+------+-----------+----+--------------+

# Now you can parse anything!
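# The same extraction also works from the DataFrame API, without SQL, via
# pyspark.sql.functions.get_json_object (a sketch; paths match the sample data):
from pyspark.sql.functions import get_json_object
parsed = df.select(
    get_json_object(df.json, '$.header.id').alias('header_id'),
    get_json_object(df.json, '$.body.name').alias('name'),
    get_json_object(df.json, '$.body.sub_json.sub_sub_json.col1').alias('col1'),
)
parsed.show()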
I couldn't stop myself:
SELECT
    header_id,
    foo,
    body_id,
    name,
    sub_json_id,
    col1,
    col2
FROM (
    VALUES
        ($${"header":{"id":12345,"foo":"bar"},"body":{"id":111000,"name":"foobar","sub_json":{"id":54321,"sub_sub_json":{"col1":20,"col2":"somethong"}}}}$$),
        ($${"header":{"id":12346,"foo":"baz"},"body":{"id":111002,"name":"barfoo","sub_json":{"id":23456,"sub_sub_json":{"col1":30,"col2":"something else"}}}}$$),
        ($${"header":{"id":43256,"foo":"foobaz"},"body":{"id":20192,"name":"bazbar","sub_json":{"id":39283,"sub_sub_json":{"col1":50,"col2":"another thing"}}}}$$)
) AS strings(raw)
JOIN LATERAL (
    -- Convert the raw string to JSON
    SELECT strings.raw::json AS json
) jsons ON true
JOIN LATERAL (
    -- Extract the first level
    SELECT
        jsons.json->'body'   AS body,
        jsons.json->'header' AS header
) t1 ON true
JOIN LATERAL (
    -- Extract the next level
    SELECT
        header->>'id'    AS header_id,
        header->>'foo'   AS foo,
        body->>'id'      AS body_id,
        body->>'name'    AS name,
        body->'sub_json' AS sub_json
) t2 ON true
JOIN LATERAL (
    -- Extract the deepest level
    SELECT
        sub_json->>'id'                  AS sub_json_id,
        sub_json#>>'{sub_sub_json,col1}' AS col1,
        sub_json#>>'{sub_sub_json,col2}' AS col2
) t3 ON true;
header_id | foo | body_id | name | sub_json_id | col1 | col2
-----------+--------+---------+--------+-------------+------+----------------
12345 | bar | 111000 | foobar | 54321 | 20 | somethong
12346 | baz | 111002 | barfoo | 23456 | 30 | something else
43256 | foobaz | 20192 | bazbar | 39283 | 50 | another thing
(3 rows)
Pure PostgreSQL...

awesome!
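For comparison, the step-by-step LATERAL chain can be collapsed into one flat SELECT with the #>> path operator (a sketch, assuming PostgreSQL 9.3+; one sample row shown for brevity):

SELECT raw::json#>>'{header,id}'                       AS header_id,
       raw::json#>>'{body,name}'                       AS name,
       raw::json#>>'{body,sub_json,sub_sub_json,col1}' AS col1
FROM (VALUES
    ($${"header":{"id":12345,"foo":"bar"},"body":{"id":111000,"name":"foobar","sub_json":{"id":54321,"sub_sub_json":{"col1":20,"col2":"somethong"}}}}$$)
) AS strings(raw);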