# runHashing.py
# Runs the Hadoop streaming hashing job on every trace listed in hashdoop.conf.
import os
import time
import sys
import ConfigParser
import json
# Hadoop streaming invocation used for every trace.  The placeholders are
# filled in by build_hashing_command(); the pieces are joined with single
# spaces so the result is a single shell command line.
HASHING_COMMAND_TEMPLATE = " ".join((
    "hadoop jar {streamingLib}",
    "-files hashing",
    "-D map.output.key.field.separator=,",
    "-D mapred.text.key.partitioner.options=-k1,2",
    "-D mapred.reduce.tasks={nbReducer}",
    "-D dfs.blocksize={hadoopBlockSize}",
    "-libjars hashing/customMultiOutput.jar",
    "-outputformat com.custom.CustomMultiOutputFormat",
    '-mapper "hashing/sketch_ipsum_mapper.py {nbHash} {hashSize}"',
    "-reducer hashing/sketch_ipsum_reducer.py",
    "-input {tracesHdfsPath}{traceName}",
    "-output {sketchesHdfsPath}{outputPath}",
    "-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner",
))


def trace_name(year, month, day):
    """Return the trace file name for a date, e.g. ``201401021400.ipsum``.

    ``month`` and ``day`` must be integers: they are zero-padded to two
    digits with the ``02d`` format code, which rejects strings.
    """
    return "{0}{1:02d}{2:02d}1400.ipsum".format(year, month, day)


def build_hashing_command(traceName, nbHash, hashSize, hadoopBlockSize,
                          streamingLib, tracesHdfsPath, sketchesHdfsPath):
    """Return the shell command line that sketches one trace.

    Arguments:
        traceName: name of the trace file, appended to ``tracesHdfsPath``.
        nbHash: number of hash functions, passed to the mapper.
        hashSize: hash size, passed to the mapper; also used as the number
            of reduce tasks.
        hadoopBlockSize: value given to ``dfs.blocksize``.
        streamingLib: path of the Hadoop streaming jar.
        tracesHdfsPath: prefix prepended as-is to the trace name.
        sketchesHdfsPath: prefix prepended as-is to the output path.

    The two path prefixes are concatenated without any separator, so they
    are expected to already end with "/".
    """
    # Sketches for a trace go to <traceName>/<nbHash>hash_<hashSize>sketch.
    outputPath = "{0}/{1}hash_{2}sketch".format(traceName, nbHash, hashSize)
    return HASHING_COMMAND_TEMPLATE.format(
        traceName=traceName, outputPath=outputPath,
        nbHash=nbHash, hashSize=hashSize,
        # The job is run with as many reduce tasks as the hash size.
        nbReducer=hashSize,
        hadoopBlockSize=hadoopBlockSize, streamingLib=streamingLib,
        sketchesHdfsPath=sketchesHdfsPath, tracesHdfsPath=tracesHdfsPath)


def main():
    """Sketch every configured trace and return a process exit status.

    Settings are read from ``hashdoop.conf`` in the current directory.
    Every (year, month, day) combination is processed even if an earlier
    job fails; the return value is 0 when all jobs succeeded and 1 when
    the configuration could not be read or at least one job failed.
    """
    config = ConfigParser.ConfigParser()
    # read() silently skips missing files and returns the list of files it
    # parsed; fail early instead of hitting NoSectionError further down.
    if not config.read("hashdoop.conf"):
        sys.stderr.write("Cannot read configuration file hashdoop.conf\n")
        return 1

    # Traces to sketch
    years = json.loads(config.get("Traces", "years"))
    months = json.loads(config.get("Traces", "months"))
    days = json.loads(config.get("Traces", "days"))

    # Parameters for the Hadoop cluster
    hadoopBlockSize = config.get("Hadoop", "blockSize")
    streamingLib = config.get("Hadoop", "streamingLib")
    tracesHdfsPath = config.get("Hadoop", "tracesHdfsPath")
    sketchesHdfsPath = config.get("Hadoop", "sketchesHdfsPath")

    # Parameters for hashing
    nbHash = config.get("Hashing", "nbHash")
    hashSize = config.get("Hashing", "hashSize")

    failures = 0

    # Go through all traces
    for ye in years:
        for mo in months:
            for da in days:
                traceName = trace_name(ye, mo, da)
                cmd = build_hashing_command(
                    traceName, nbHash, hashSize, hadoopBlockSize,
                    streamingLib, tracesHdfsPath, sketchesHdfsPath)

                start = time.time()
                # The command still goes through the shell so that any
                # shell expansion used in the configured paths keeps
                # working; only the status handling is new.
                status = os.system(cmd)
                elapsed = time.time() - start

                if status != 0:
                    failures += 1
                    sys.stderr.write(
                        "Failed to hash %s (os.system returned %s) "
                        "after %s sec.\n" % (traceName, status, elapsed))
                    continue

                print("Hashed %s in %s sec." % (traceName, elapsed))

    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main())