read_file.py
from subprocess import check_output

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

spark_conf = SparkConf()

# Resolve this container's own IP address so the Spark master can call back to the driver.
SPARK_DRIVER_HOST = check_output(["hostname", "-i"]).decode("utf-8").strip()

spark_conf.setAll(
    [
        # The master URL must be resolvable from wherever the driver (this PySpark
        # process) runs, whether on the same server or a remote one; here it is the
        # hostname of the Spark master container.
        ("spark.master", "spark://spark:7077"),
        ("spark.app.name", "myApp"),
        ("spark.submit.deployMode", "client"),
        ("spark.ui.showConsoleProgress", "true"),
        ("spark.eventLog.enabled", "false"),
        ("spark.logConf", "false"),
        # The address the driver service binds to locally (normally 0.0.0.0).
        ("spark.driver.bindAddress", "0.0.0.0"),
        # The address the driver advertises to the cluster; it must be resolvable
        # and reachable by the master, in this case the IP of the container
        # running PySpark.
        ("spark.driver.host", SPARK_DRIVER_HOST),
    ]
)

spark_sess = SparkSession.builder.config(conf=spark_conf).getOrCreate()

# Read all Markdown files in the working directory and print the first line found.
spark_reader = spark_sess.read
textFile = spark_reader.text("*.md")
print(textFile.first())

spark_sess.stop()
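
For reference, the same client-mode connection can be expressed directly on the SparkSession builder instead of going through a separate SparkConf object. This is only a minimal sketch under the same assumptions as the script above: a standalone master reachable at spark://spark:7077, and hostname -i returning a single IP address that the master can route back to.

from subprocess import check_output

from pyspark.sql import SparkSession

# Same driver-host resolution as in read_file.py.
driver_host = check_output(["hostname", "-i"]).decode("utf-8").strip()

spark = (
    SparkSession.builder
    .master("spark://spark:7077")            # must be resolvable from this container
    .appName("myApp")
    .config("spark.submit.deployMode", "client")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.driver.host", driver_host)
    .getOrCreate()
)

print(spark.read.text("*.md").first())
spark.stop()

Either form produces an equivalent session; the SparkConf version is convenient when the same settings list is reused across several scripts.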