Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions gffutils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,12 @@
# vs
# ID=001; Name=gene1
"field separator": ";",
# Sometimes a quoted value itself contains semicolons, which breaks naive
# splitting on the field separator, e.g.,
#
# note "Evidence 1a: Function1, Function2"
# vs
# note "Evidence 1a: Function; PubMedId: 123, 456"
"semicolon_in_quotes": False,
# Usually "=" for GFF3; " " for GTF, e.g.,
#
# gene_id "GENE1"
Expand Down
2 changes: 1 addition & 1 deletion gffutils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def infer_dialect(attributes):
-------
Dictionary representing the inferred dialect
"""
attributes, dialect = parser._split_keyvals(attributes)
attributes, dialect = parser._split_keyvals(attributes, infer_dialect_call=True)
return dialect


Expand Down
18 changes: 16 additions & 2 deletions gffutils/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def sort_key(x):
# TODO:
# Cythonize -- profiling shows that the bulk of the time is spent on this
# function...
def _split_keyvals(keyval_str, dialect=None):
def _split_keyvals(keyval_str, dialect=None, infer_dialect_call=False):
"""
Given the string attributes field of a GFF-like line, split it into an
attributes dictionary and a "dialect" dictionary which contains information
Expand All @@ -186,6 +186,11 @@ def _split_keyvals(keyval_str, dialect=None):
attribute string.

Otherwise, use the provided dialect (and return it at the end).

The `infer_dialect_call` argument indicates whether this function is being
called as part of regular parsing or only to obtain the dialect via
helpers.infer_dialect(). While inferring the dialect, the quote-aware regex
split (from PR #215), which detects field separators inside quoted values,
is only run when this flag is set; this avoids slowing down every other
case.
"""

def _unquote_quals(quals, dialect):
Expand Down Expand Up @@ -216,7 +221,10 @@ def _unquote_quals(quals, dialect):
if dialect["trailing semicolon"]:
keyval_str = keyval_str.rstrip(";")

parts = keyval_str.split(dialect["field separator"])
if dialect["semicolon_in_quotes"]:
parts = re.split(f'''{dialect["field separator"]}(?=(?:[^"]|"[^"]*")*$)''', keyval_str)
else:
parts = keyval_str.split(dialect["field separator"])

kvsep = dialect["keyval separator"]
if dialect["leading semicolon"]:
Expand Down Expand Up @@ -288,9 +296,15 @@ def _unquote_quals(quals, dialect):
# GFF3 works with no spaces.
# So split on the first one we can recognize...
for sep in (" ; ", "; ", ";"):
# Always do the plain split; the quote-aware regex split is only run
# when called from helpers.infer_dialect().
parts = keyval_str.split(sep)
parts_regex = parts
if infer_dialect_call:
parts_regex = re.split(f'''{sep}(?=(?:[^"]|"[^"]*")*$)''', keyval_str)
if len(parts) > 1:
dialect["field separator"] = sep
if parts != parts_regex:
dialect["semicolon_in_quotes"] = True
break

# Is it GFF3? They have key-vals separated by "="
Expand Down