-
Notifications
You must be signed in to change notification settings - Fork 0
/
partitionKmers.py
executable file
·115 lines (80 loc) · 4.37 KB
/
partitionKmers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/python3
import sys
import bisect as bisect
import argparse as argparse
import logging as logging
def getPartitionKmersArgs( override = None ):
    """Build the command-line parser for this script and parse the arguments.

    Args:
        override: Optional list of argument strings to parse in place of
            the process command line. When None, argparse reads sys.argv.

    Returns:
        argparse.Namespace with the attributes:
            partition -- path of the partition file, or None if not given
            suffix    -- text placed before the part number (default '.part')
            col       -- kmer column as an int (default 1)
            kmers     -- list of one or more input file names
            debug     -- True when --debug was given
    """
    description = (
        'Take input files containing lists of kmers and distribute their '
        'contents line-by-line to different sub-files\n'
        '(buckets) based on the kmer value, such that like kmers are in '
        'the same bucket.'
    )
    example = (
        'To partition the file fsplit0.kmer into fsplit0.kmer.part0 '
        'fsplit0.kmer.part1, given the file partition.txt that consists of '
        "a single line with the value 'CACCGTACAT':\n"
        '%(prog)s --partition partition.txt --suffix .part fsplit0.kmer'
    )

    # RawDescriptionHelpFormatter keeps the explicit line breaks in the
    # description and epilog instead of re-wrapping them.
    parser = argparse.ArgumentParser(
        description=description,
        epilog=example,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--partition', help='The file containing (one-per-line) the kmer prefix to use as the divider between files', default=None)
    parser.add_argument('--suffix', help='The suffix to use before the number indicating which part is being output.', default='.part')
    # type=int makes user-supplied values the same type as the default and
    # lets argparse reject non-numeric input with a normal usage error.
    parser.add_argument('--col', type=int, help='The (whitespace separated) column where kmers are to be found. Data will be partitioned based on the value of this column.', default=1)
    parser.add_argument('kmers', metavar='kmerFile', nargs='+', help='A list of kmers to be filtered into multiple partitions (probably for multiprocessing)')
    parser.add_argument('--debug', action='store_true', help='Output diagnostic data to the screen using STDERR')

    # parse_args(None) falls back to sys.argv, so a single call covers both
    # the normal and the overridden case.
    return parser.parse_args(override)
def partitionKmers(filename, fileSuffix, partitionsList, sortColumn=1, bufferSize=32*1024):
    """Distribute the lines of one kmer file into per-partition bucket files.

    One output file named filename + fileSuffix + <index> is created for
    every entry of partitionsList. Each non-blank input line is copied,
    unchanged, to the bucket selected by comparing its key column against
    the partition values with bisect.bisect_right.

    Args:
        filename: Path of the input file to split.
        fileSuffix: Text inserted between filename and the bucket number.
        partitionsList: Partition boundary strings in ascending order
            (bisect requires sorted input). The last entry serves as the
            catch-all bucket.
        sortColumn: Zero-based index of the whitespace-separated column
            holding the key. If the index is negative or the line has too
            few columns, the last column of the line is used.
        bufferSize: Buffer size, in bytes, passed to open() for the input
            and output files.

    Returns:
        List of the output file names, in partition order.

    Raises:
        IndexError: if partitionsList is empty and the input contains a
            non-blank line (there is no bucket to write to).
    """
    outFiles = {}         # partition value -> open output handle
    outFilesList = []     # output file names, in partition order
    openedHandles = []    # every handle opened here, so all get closed

    # The input is opened first so that a missing input file fails before
    # any output file is created.
    with open(filename, 'r', buffering=bufferSize) as kmerFile:
        try:
            for count, partition in enumerate(partitionsList):
                outName = filename + fileSuffix + str(count)
                # Block buffering: line buffering would cause far too many
                # small writes.
                handle = open(outName, 'w', buffering=bufferSize)
                openedHandles.append(handle)
                outFiles[partition] = handle
                outFilesList.append(outName)
                logging.debug("Opened partition file: %s", outName)

            lastBucket = len(partitionsList) - 1
            for line in kmerFile:
                fields = line.split()
                if not fields:
                    # Blank or whitespace-only line: nothing to route.
                    continue
                if 0 <= sortColumn < len(fields):
                    kmer = fields[sortColumn]
                else:
                    kmer = fields[-1]
                # bisect_right yields the index of the first partition value
                # strictly greater than the key. Keys that sort at or after
                # the final partition value are clamped into the last bucket
                # instead of indexing past the end of the list.
                bucket = min(bisect.bisect_right(partitionsList, kmer), lastBucket)
                outFiles[partitionsList[bucket]].write(line)
        finally:
            # Close (and thereby flush) every output, even on error.
            for handle in openedHandles:
                handle.close()
    return outFilesList
def parsePartitions(file=None):
    """Read the partition boundary values, one per line, and return them sorted.

    Args:
        file: Path of the partition file. When None, the values are read
            from standard input.

    Returns:
        Ascending list of the stripped, non-blank lines, followed by the
        sentinel 'ZZZZ' as the final entry.
    """
    if file is None:
        # Standard input belongs to the process, so it is read but left open.
        rawLines = sys.stdin.readlines()
    else:
        with open(file, 'r') as partitionsFile:
            rawLines = partitionsFile.readlines()

    # Blank lines are dropped: an empty-string boundary can never be selected
    # by bisect_right, so it would only add a permanently empty bucket.
    # Sorting is likely redundant when the input is already ordered, but
    # bisect depends on it.
    stripped = (line.strip() for line in rawLines)
    partitions = sorted(value for value in stripped if value)

    # Terminal bucket for keys larger than the last real partition value.
    # NOTE(review): assumes kmers are upper-case letters that sort below
    # 'ZZZZ' -- confirm against the input data.
    partitions.append('ZZZZ')
    return partitions
if __name__ == '__main__':
    # Script entry point: parse the command line, set up logging, load the
    # partition boundaries, then split every input file in turn.
    options = getPartitionKmersArgs()

    # The message format is always applied; --debug additionally lowers the
    # logging threshold to DEBUG.
    logSettings = {'format': '%(levelname)s:%(message)s'}
    if options.debug:
        logSettings['level'] = logging.DEBUG
    logging.basicConfig(**logSettings)
    logging.debug('Commandline options, as parsed: %s', str(options))

    # Buffer size handed to partitionKmers to speed up large files. Not yet
    # tested on gigabyte-sized inputs; there may be room to tune it there.
    bufferSize = 32 * 1024

    fileSuffix = options.suffix

    # The command line counts columns from 1; Python indexes from 0.
    searchColumn = int(options.col) - 1

    partitions = parsePartitions(options.partition)

    for file in options.kmers:
        partitionKmers(file, fileSuffix, partitions, searchColumn, bufferSize)