-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsort.py
executable file
·114 lines (89 loc) · 2.76 KB
/
sort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/python
import sys
import os
from subprocess import call
from time import time
def line_count(filename):
count = 0
fd = open(filename)
for line in fd:
count += 1
fd.close()
return count
def split(input_file, part_size):
count = 0
outfile = "sort.tmp1"
outfd = open(outfile, 'w')
fd = open(input_file)
for line in fd:
count += 1
if count % part_size == 0:
print count
outfd.close()
outfile = "sort.tmp" + str(count/part_size + 1)
outfd = open(outfile, 'w')
outfd.write(line)
fd.close()
outfd.close()
def merge_file_impl(file1, file2):
filename = "sorted.tmp" + str(time())
fd1 = open(file1, 'r')
fd2 = open(file2, 'r')
outfd = open(filename, 'w')
line1 = fd1.readline()
line2 = fd2.readline()
while len(line1) > 0 and len(line2) > 0:
if line1 <= line2:
outfd.write(line1)
line1 = fd1.readline()
else:
outfd.write(line2)
line2 = fd2.readline()
if len(line1)==0:
while len(line2) > 0:
outfd.write(line2)
line2 = fd2.readline()
elif len(line2)==0:
while len(line1) > 0:
outfd.write(line1)
line1 = fd1.readline()
fd1.close()
fd2.close()
outfd.close()
#remove the file1, file2
call("rm -rf " + file1 + " " + file2, shell=True)
return filename
def merge_file(file_list):
print len(file_list)
if len(file_list) == 1:
return file_list[0]
new_file_list = []
i = 0
while (i+1) < len(file_list):
name = merge_file_impl(file_list[i], file_list[i+1])
new_file_list.append(name)
i+=2
if i < len(file_list):
new_file_list.append(file_list[i])
return merge_file(new_file_list)
def sort_file(input_file, num_lines, partitions):
part_size = num_lines / partitions
split(input_file, part_size)
if num_lines % partitions > 0:
partitions += 1
for i in range(1, partitions + 1):
command = "export LC_ALL=C;sort sort.tmp"+str(i) + " > sorted.tmp" + str(i)
call(command, shell=True)
call("rm -rf sort.tmp" + str(i), shell=True)
file_list = ["sorted.tmp" + str(x) for x in range(1, partitions+1)]
final = merge_file(file_list)
#rename the file file
call("mv " + final + " " + input_file + ".sorted", shell=True)
#clean up
command = "rm -rf sort.tmp* sorted.tmp*"
call(command, shell=True)
if __name__ == "__main__":
""" argv[1] is the file to be sorted
argv[2] is the number of lines in the file
argv[3] is the number of partitions we are going to use for the sort"""
print sort_file(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))