diff --git a/gffutils/constants.py b/gffutils/constants.py index 901e714..3cc2077 100644 --- a/gffutils/constants.py +++ b/gffutils/constants.py @@ -127,6 +127,12 @@ # vs # ID=001; Name=gene1 "field separator": ";", + # Semicolons inside quoted values break splitting on the field separator, e.g., + # + # note "Evidence 1a: Function1, Function2" + # vs + # note "Evidence 1a: Function; PubMedId: 123, 456" + "semicolon_in_quotes": False, # Usually "=" for GFF3; " " for GTF, e.g., # # gene_id "GENE1" diff --git a/gffutils/helpers.py b/gffutils/helpers.py index 0e2f430..b68414c 100644 --- a/gffutils/helpers.py +++ b/gffutils/helpers.py @@ -35,7 +35,7 @@ def infer_dialect(attributes): ------- Dictionary representing the inferred dialect """ - attributes, dialect = parser._split_keyvals(attributes) + attributes, dialect = parser._split_keyvals(attributes, infer_dialect_call=True) return dialect diff --git a/gffutils/parser.py b/gffutils/parser.py index 0d82889..e64aa42 100644 --- a/gffutils/parser.py +++ b/gffutils/parser.py @@ -174,7 +174,7 @@ def sort_key(x): # TODO: # Cythonize -- profiling shows that the bulk of the time is spent on this # function... -def _split_keyvals(keyval_str, dialect=None): +def _split_keyvals(keyval_str, dialect=None, infer_dialect_call=False): """ Given the string attributes field of a GFF-like line, split it into an attributes dictionary and a "dialect" dictionary which contains information @@ -186,6 +186,11 @@ def _split_keyvals(keyval_str, dialect=None): attribute string. Otherwise, use the provided dialect (and return it at the end). + + The `infer_dialect_call` argument denotes whether the call to this function + has been made as part of the regular parsing or only to obtain the dialect + using helpers.infer_dialect(); this lets us run the quote-aware regex split (PR #215) + only when absolutely required so as to avoid slowing down every other case. 
""" def _unquote_quals(quals, dialect): @@ -216,7 +221,10 @@ def _unquote_quals(quals, dialect): if dialect["trailing semicolon"]: keyval_str = keyval_str.rstrip(";") - parts = keyval_str.split(dialect["field separator"]) + if dialect["semicolon_in_quotes"]: + parts = re.split(f'''{dialect["field separator"]}(?=(?:[^"]|"[^"]*")*$)''', keyval_str) + else: + parts = keyval_str.split(dialect["field separator"]) kvsep = dialect["keyval separator"] if dialect["leading semicolon"]: @@ -288,9 +296,15 @@ def _unquote_quals(quals, dialect): # GFF3 works with no spaces. # So split on the first one we can recognize... for sep in (" ; ", "; ", ";"): + # Run the quote-aware regex split only when called via helpers.infer_dialect() parts = keyval_str.split(sep) + parts_regex = parts + if infer_dialect_call: + parts_regex = re.split(f'''{sep}(?=(?:[^"]|"[^"]*")*$)''', keyval_str) if len(parts) > 1: dialect["field separator"] = sep + if parts != parts_regex: + dialect["semicolon_in_quotes"] = True break # Is it GFF3? They have key-vals separated by "="