-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathproblem22.py
More file actions
18 lines (16 loc) · 750 Bytes
/
problem22.py
File metadata and controls
18 lines (16 loc) · 750 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from pyspark import SparkContext, SparkConf
# Problem 22: count how often each word occurs in Content.txt, ignoring every
# word listed in Remove.txt, and write the (word, count) pairs to output/result.

# Configure
conf = SparkConf().setAppName('Problem 22')
sc = SparkContext(conf=conf)
try:
    # Loading data
    content = sc.textFile("/user/smartlin1/data/Content.txt")
    remove = sc.textFile("/user/smartlin1/data/Remove.txt")

    # Remove.txt is split on commas; surrounding whitespace is stripped from
    # each entry.
    removeRDD = remove.flatMap(lambda line: line.split(",")).map(lambda x: x.strip())

    # Broadcast a read-only copy of the remove-list to the executors. It is
    # collected into a frozenset so that the per-word membership test below is
    # a hash lookup instead of a linear scan of a list.
    bc = sc.broadcast(frozenset(removeRDD.collect()))

    words = content.flatMap(lambda line: line.split(" "))
    # Splitting on a single space produces "" for blank lines and for runs of
    # consecutive spaces; those are not words, so drop them together with the
    # words from the remove-list.
    filteredWords = words.filter(lambda x: x != "" and x not in bc.value)

    wordPairs = filteredWords.map(lambda word: (word, 1))
    wordCount = wordPairs.reduceByKey(lambda x, y: x + y)

    # Collapse to one partition so the result is written as a single part file.
    wordCount.repartition(1).saveAsTextFile("output/result")

    # The broadcast value is no longer needed once the job has finished.
    bc.unpersist()
finally:
    # Always release the SparkContext, even if the job above failed.
    sc.stop()