using_row_object.py
from pyspark.sql import Row, SparkSession

# Create a Row object with positional values
row = Row("James", 40)
print(row[0] + "," + str(row[1]))

# Create a Row using named arguments
row = Row(name="Alice", age=11)
print(row.name)
print(row.age)
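# For illustration (not in the original script): a Row built with named
# arguments can also be converted to a plain Python dict with Row.asDict().
print(row.asDict())  # {'name': 'Alice', 'age': 11}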
# Create a custom Row class (a named-tuple-like factory)
Person = Row("name", "age")
p1 = Person("James", 40)
p2 = Person("Alice", 11)
print(p1.name + "," + p2.name)
# Start a local SparkSession
spark = (SparkSession
         .builder
         .master("local")
         .appName("using_row_object")
         .getOrCreate())
# Sample data as a list of Row objects
data = [
    Row(name="James,,Smith", lang=["Java", "Scala", "C++"], state="CA"),
    Row(name="Michael,Rose,", lang=["Spark", "Java", "C++"], state="NJ"),
    Row(name="Robert,,Williams", lang=["CSharp", "VB"], state="NV")
]
# Create an RDD from the Row objects
rdd = spark.sparkContext.parallelize(data)
print(rdd.collect())

# collect() returns a list of Row objects
collData = rdd.collect()
for row in collData:
    print(row.name + " , " + str(row.lang))
# Create a DataFrame from the Row objects
df = spark.createDataFrame(data)
df.printSchema()
df.show(truncate=False)

# Change the column names with toDF()
columns = ["name", "languages", "currentstate"]
df2 = spark.createDataFrame(data).toDF(*columns)
df2.printSchema()
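# For illustration (not in the original script): a single column can also be
# renamed on an existing DataFrame with withColumnRenamed().
df.withColumnRenamed("lang", "languages").printSchema()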
# Create a nested struct using Row objects
data = [
    Row(name="James", prop=Row(hair="black", eye="blue")),
    Row(name="Annie", prop=Row(hair="grey", eye="black"))
]
df3 = spark.createDataFrame(data)
df3.printSchema()
df3.show()
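# For illustration (not in the original script): fields of the nested Row
# can be selected with dot notation on the struct column.
df3.select("name", "prop.hair", "prop.eye").show()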
spark.stop()