Skip to content

Commit 4867f52

Browse files
committed
actually adding the local file now.lol
1 parent d8c5eb8 commit 4867f52

File tree

1 file changed

+214
-0
lines changed

1 file changed

+214
-0
lines changed

src/readpaf_local.py

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
# We acknowledge the authors of the readpaf library. It is used here directly as a local copy because readpaf is not available as a conda package.
2+
from __future__ import division
3+
from collections import namedtuple
4+
5+
6+
__all__ = ["parse_paf"]
7+
8+
__version__ = "0.0.11a2"
9+
10+
# pandas is an optional dependency used only by the DataFrame code paths.
# `pandas` records whether the import worked; on failure `e` keeps the
# exception so it can be reported when a DataFrame is actually requested.
pandas = True
try:
    import pandas as pd
except Exception as import_failure:
    pandas = False
    e = import_failure
17+
18+
19+
class _PAF:
    """Base PAF methods, can't guarantee field names here so use indices

    Meant to be mixed into a namedtuple whose last item is a dict of tags
    (each value a ``(name, type, value)`` tuple); all access is positional.
    """

    def __str__(self):
        """Formats a record as a PAF line for writing to a file

        Returns
        -------
        str
            The fixed columns joined by tabs, followed by the tags when there
            are any. A record without tags has no trailing tab.
        """
        columns = [str(value) for value in self[:-1]]
        tags = self._fmt_tags()
        # Only append the tag section when it is non-empty, otherwise the
        # line would end in a dangling tab.
        if tags:
            columns.append(tags)
        return "\t".join(columns)

    def _fmt_tags(self):
        """Format tag dict as SAM style

        Returns
        -------
        str
            Tab separated ``name:type:value`` entries, or an empty string when
            the record has no tags.
        """
        return "\t".join("{}:{}:{}".format(*t) for t in self[-1].values())

    def blast_identity(self):
        """BLAST identity, see:
        https://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity

        Returns
        -------
        float
            Item 9 divided by item 10 of the record (residue matches over
            alignment block length with the default field names).

        Raises
        ------
        ZeroDivisionError
            If item 10 of the record is zero.
        """
        return self[9] / self[10]
35+
36+
37+
# One parsed SAM-style optional field, i.e. the three parts of `name:type:value`.
SAM_TAG = namedtuple("tag", ("name", "type", "value"))

# Default record field names: the 12 fixed PAF columns in file order, then the
# field that collects the optional tags.
FIELDS = [
    # query
    "query_name", "query_length", "query_start", "query_end",
    "strand",
    # target
    "target_name", "target_length", "target_start", "target_end",
    # alignment
    "residue_matches", "alignment_block_length", "mapping_quality",
    # optional SAM-style tags
    "tags",
]

# Strings that always count as missing values in the numeric columns.
NA_VALUES = ["*"]

# Converter for a tag value, keyed by the tag's one-letter type code.
SAM_TYPES = dict(i=int, A=str, f=float, Z=str)
55+
56+
57+
def _expand_dict_in_series(df, field):
    """Convert a Series of dict to Series and add to the original DataFrame

    Parameters
    ----------
    df : pd.DataFrame
        A DataFrame with a Series of dict. Each dict value must unpack as
        ``(name, type, value)``. The `field` column is removed from `df`
        in place.
    field : str
        The Series of dicts to expand

    Returns
    -------
    pd.DataFrame
        The original DataFrame with extra Series from the dicts, one column
        per tag name. A tag whose name clashes with an existing column gets
        the suffix `_tag`.
    """
    expanded = pd.DataFrame(
        [{k: v for k, _, v in r.values()} for r in df.pop(field).tolist()],
        # Reuse the caller's index: join() aligns on index labels, so a fresh
        # 0..n-1 index would misplace (or drop) tags for any other index.
        index=df.index,
    )
    return df.join(expanded, rsuffix="_tag")
78+
79+
80+
def _parse_tags(tags):
    """Build a dict of typed tags from SAM style `name:type:value` strings

    https://samtools.github.io/hts-specs/SAMv1.pdf section 1.5

    Parameters
    ----------
    tags : list
        SAM style tag strings, as found after the fixed PAF columns

    Returns
    -------
    dict of str: namedtuple
        Maps each tag name to a namedtuple with the fields `name`, `type`,
        and `value`. The value is converted with the matching entry of
        SAM_TYPES; for any other type code it is kept as the raw string.
    """
    parsed = {}
    for raw in tags:
        # Split on the first two colons only, the value may contain colons.
        name, code, text = raw.split(":", 2)
        caster = SAM_TYPES.get(code)
        value = text if caster is None else caster(text)
        parsed[name] = SAM_TAG(name, code, value)
    return parsed
101+
102+
103+
def _paf_generator(file_like, fields=None, na_values=None, na_rep=None):
    """Generator that returns namedtuples from a PAF file

    Parameters
    ----------
    file_like : file-like object
        Iterable of text lines; blank lines are skipped.
    fields : list, optional
        List of field names to use for records, must have 13 entries.
        Defaults to FIELDS.
    na_values : collection of str, optional
        Strings that mark a missing value in the numeric columns.
        Defaults to NA_VALUES.
    na_rep : object, optional
        Value stored in place of a missing numeric value. Default: None.

    Yields
    ------
    namedtuple
        Correctly formatted PAF record and a dict of extra tags

    Raises
    ------
    ValueError
        If `fields` does not have 13 entries, or a numeric column holds text
        that is neither an integer nor one of `na_values`.
    IndexError
        If a line has fewer than 12 tab separated columns.
    """
    # The signature advertises None defaults, so resolve them here instead of
    # failing on len(None) / `x in None`.
    fields = FIELDS if fields is None else fields
    na_values = NA_VALUES if na_values is None else na_values
    if len(fields) != 13:
        raise ValueError("{} fields provided, expected 13".format(len(fields)))
    _PAF_nt = namedtuple("PAF", fields)
    # Record type: namedtuple storage plus the formatting helpers of _PAF.
    PAF = type("PAF", (_PAF, _PAF_nt), dict())

    def _to_int(text):
        """Convert one numeric column, mapping NA markers to `na_rep`."""
        return na_rep if text in na_values else int(text)

    for record in file_like:
        record = record.strip()
        if not record:
            continue
        record = record.split("\t")
        yield PAF(
            str(record[0]),
            _to_int(record[1]),
            _to_int(record[2]),
            _to_int(record[3]),
            str(record[4]),
            str(record[5]),
            _to_int(record[6]),
            _to_int(record[7]),
            _to_int(record[8]),
            _to_int(record[9]),
            _to_int(record[10]),
            _to_int(record[11]),
            # Everything after the 12 fixed columns is an optional tag.
            _parse_tags(record[12:]),
        )
146+
147+
148+
def parse_paf(file_like, fields=None, na_values=None, na_rep=0, dataframe=False):
    """Read a minimap2 PAF file as either an iterator or a pandas.DataFrame

    When using as an iterator the `tags` field is a dict that maps each tag
    name to a namedtuple. Each namedtuple has the fields `name`, `type`,
    `value` that correspond to the parts (delimited by `:`) of the SAM-style
    tag.

    Parameters
    ----------
    file_like : file-like object
        Object that yields lines when iterated, such as sys.stdin, a file
        handle or io.StringIO.
    fields : list, optional
        List of field names to use for records, must have 13 entries. These should
        be in the order of the fields in the PAF file and the last field will be
        used for tags. Default:
        ["query_name", "query_length", "query_start", "query_end", "strand",
        "target_name", "target_length", "target_start", "target_end",
        "residue_matches", "alignment_block_length", "mapping_quality", "tags"]
    na_values : iterable of str or str, optional
        Additional strings to interpret as NaN values in the numeric columns
        (1-based columns 2, 3, 4, 7, 8, 9, 10, 11, 12). A single string is
        treated as one value.
        Default: ["*"]
    na_rep : int or float, optional
        Value to use when a NaN value specified in `na_values` is found. Default: `0`.
    dataframe : bool, optional
        Default is False. When True a pandas.DataFrame is returned with Series
        named as the `fields` parameter. SAM tags are expanded into Series as
        well and given their specified types, if any of the field names overlap
        with tags the tag column will be given the suffix `_tag`.

    Returns
    -------
    iterator or pandas.DataFrame when dataframe is True

    Raises
    ------
    ValueError
        If `na_rep` is not an int or float.
    ImportError
        If `dataframe` is True but pandas could not be imported.
    """
    fields = FIELDS if fields is None else fields
    if na_values is None:
        na_values = set(NA_VALUES)
    else:
        if isinstance(na_values, str):
            # A bare string is one NA marker, not a sequence of characters.
            na_values = [na_values]
        na_values = set(NA_VALUES).union(na_values)
    if not isinstance(na_rep, (int, float)):
        raise ValueError("na_rep must be int or float")

    records = _paf_generator(file_like, fields, na_values, na_rep)
    if not dataframe:
        return records
    if not pandas:
        raise ImportError(e)

    # Build the frame from the parsed records so both return types share one
    # parser: numeric columns are converted, NA markers are replaced in
    # exactly the numeric columns, and tag values are kept whole and typed.
    rows = list(records)
    if not rows:
        return pd.DataFrame(columns=fields)
    return _expand_dict_in_series(pd.DataFrame(rows, columns=fields), fields[-1])

0 commit comments

Comments
 (0)