update_spark_params.py
"""
Utility used to regenerate the json files used for the lists of spark properties
"""
import json
import numpy
import pandas as pd
from os.path import join, dirname
import pdb
import sys
# Next three imports are indirectly required by the code in this function
import html5lib
import bs4
import lxml


def _save_documentation(version, base_url="https://spark.apache.org/docs"):
    """
    Write the Spark property documentation for the given version to a JSON file.
    """
    target_dir = join(dirname(__file__), 'spylon', 'spark')
    with open(join(target_dir, "spark_properties_{}.json".format(version)), 'w') as fp:
        all_props = _fetch_documentation(version=version, base_url=base_url)
        all_props = sorted(all_props, key=lambda x: x[0])
        all_props_d = [{"property": p, "default": d, "description": desc}
                       for p, d, desc in all_props]
        json.dump(all_props_d, fp, indent=2)
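
# Illustrative sketch of a single entry in a generated file. This shows the
# shape of the output only; the description text is paraphrased, not verbatim:
#
#   {
#     "property": "spark.driver.memory",
#     "default": "1g",
#     "description": "Amount of memory to use for the driver process."
#   }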


def _fetch_documentation(version, base_url="https://spark.apache.org/docs"):
    """
    Scrape the property tables from the Spark documentation pages and yield
    (property name, default value, description) tuples.
    """
    doc_urls = [
        "{base_url}/{version}/configuration.html",
        "{base_url}/{version}/sql-programming-guide.html",
        "{base_url}/{version}/monitoring.html",
        "{base_url}/{version}/spark-standalone.html",
        "{base_url}/{version}/running-on-mesos.html",
        "{base_url}/{version}/running-on-yarn.html",
    ]
    for url in doc_urls:
        doc_url = url.format(version=version, base_url=base_url)
        print("Loading spark properties from %s" % doc_url)
        dfs = pd.read_html(doc_url, header=0)
        desired_cols = ["Property Name", "Default", "Meaning"]
        for df in dfs:
            if ("Property Name" in df) and ('Default' in df):
                for pn, default, desc in df[desired_cols].itertuples(index=False):
                    # numpy booleans are not JSON serializable, so coerce them
                    # to plain Python bools before yielding.
                    if isinstance(default, numpy.bool_):
                        default = bool(default)
                    yield pn, default, desc
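
# A minimal sketch of calling the generator directly (hypothetical usage; this
# script only reaches it through _save_documentation):
#
#   for name, default, meaning in _fetch_documentation("2.1.0"):
#       print(name, default)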


if __name__ == "__main__":
    # Install pdb_hook as sys.excepthook so that any uncaught exception drops
    # into the post-mortem debugger, which makes failures easier to diagnose.
    def pdb_hook(exctype, value, traceback):
        pdb.post_mortem(traceback)
    sys.excepthook = pdb_hook

    spark_versions = ["1.6.0", "1.6.1", "2.0.2", "2.1.0", "latest"]
    for sv in spark_versions:
        _save_documentation(version=sv)
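
# A minimal sketch of how downstream code might read one of the generated
# files (hypothetical helper, not part of this script or spylon's API):
#
#   def load_spark_properties(version):
#       path = join(dirname(__file__), 'spylon', 'spark',
#                   "spark_properties_{}.json".format(version))
#       with open(path) as fp:
#           return {d["property"]: d for d in json.load(fp)}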