# Read a JSON object from Swift object storage and persist it back as Parquet.
# BUG FIX: the original lines ended with typographic closing quotes (”),
# which are a SyntaxError in Python — replaced with straight quotes.
df = sqlContext.read.json("swift2d://vault.spark/data.json")
df.write.parquet("swift2d://vault.spark/data.parquet")
"""Create a small DataFrame, write it to Swift object storage as Parquet,
then read it back and display it, to verify the round trip.

Usage: spark-submit this_script.py <objectName>
"""
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *

import sys

sc = SparkContext()
sqlContext = SQLContext(sc)

if len(sys.argv) != 2:
    print("ERROR: This program takes object name as input")
    # BUG FIX: exit non-zero on a usage error (the original exited 0,
    # which signals success to the calling shell).
    sys.exit(1)

objectName = sys.argv[1]

# Sample rows matching the schema below.
myList = [[1, 'a'], [2, 'b'], [3, 'c'], [4, 'd'], [5, 'e'], [6, 'f']]

schema = StructType([
    StructField('column1', IntegerType(), False),
    StructField('column2', StringType(), False),
])

# BUG FIX: the original did sc.parallelize(myList).collect(), which ships
# the list to the cluster only to collect it straight back to the driver.
# createDataFrame accepts the local list directly.
df = sqlContext.createDataFrame(myList, schema)
df.printSchema()
df.show()

# Coalesce to a single partition so the output is one Parquet part file.
dfTarget = df.coalesce(1)
dfTarget.write.parquet("swift2d://vault.spark/" + objectName)

# Read the object back and display it to confirm the write succeeded.
dfRead = sqlContext.read.parquet("swift2d://vault.spark/" + objectName)
dfRead.show()

print("Done!")
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | root |-- column1: integer (nullable = false) |-- column2: string (nullable = false) +-------+-------+ |column1|column2| +-------+-------+ | 1| a| | 2| b| | 3| c| | 4| d| | 5| e| | 6| f| +-------+-------+ +-------+-------+ |column1|column2| +-------+-------+ | 1| a| | 2| b| | 3| c| | 4| d| | 5| e| | 6| f| +-------+-------+ Done! |
欢迎光临 电子技术论坛_中国专业的电子工程师学习交流社区-中电网技术论坛 (http://bbs.eccn.com/) | Powered by Discuz! 7.0.0 |