daler · DevangThakkar · May 14, 2023 · Jul 10, 2023 · Jul 10, 2023 · Jul 10, 2023
diff --git a/gffutils/constants.py b/gffutils/constants.py
@@ -127,6 +127,12 @@
     # vs
     #   ID=001; Name=gene1
     "field separator": ";",
+    # Sometimes there are semicolons inside quotes that break things, e.g.,
+    #
+    #   note "Evidence 1a: Function1, Function2"
+    # vs
+    #   note "Evidence 1a: Function; PubMedId: 123, 456"
+    "semicolon_in_quotes": False,
     # Usually "=" for GFF3; " " for GTF, e.g.,
     #
     #   gene_id "GENE1"

diff --git a/gffutils/helpers.py b/gffutils/helpers.py
@@ -35,7 +35,7 @@ def infer_dialect(attributes):
     -------
     Dictionary representing the inferred dialect
     """
-    attributes, dialect = parser._split_keyvals(attributes)
+    attributes, dialect = parser._split_keyvals(attributes, infer_dialect_call=True)
     return dialect
 
 

diff --git a/gffutils/parser.py b/gffutils/parser.py
@@ -174,7 +174,7 @@ def sort_key(x):
 # TODO:
 # Cythonize -- profiling shows that the bulk of the time is spent on this
 # function...
-def _split_keyvals(keyval_str, dialect=None):
+def _split_keyvals(keyval_str, dialect=None, infer_dialect_call=False):
     """
     Given the string attributes field of a GFF-like line, split it into an
     attributes dictionary and a "dialect" dictionary which contains information
@@ -186,6 +186,11 @@ def _split_keyvals(keyval_str, dialect=None):
     attribute string.
 
     Otherwise, use the provided dialect (and return it at the end).
+
+    The `infer_dialect_call` argument denotes whether the call to this function
+    has been made as part of the regular parsing or only to obtain the dialect
+    using helpers.infer_dialect(); this helps us to call the regex from PR #215
+    only when absolutely required so as to avoid slowing down every other case.
     """
 
     def _unquote_quals(quals, dialect):
@@ -216,7 +221,10 @@ def _unquote_quals(quals, dialect):
         if dialect["trailing semicolon"]:
             keyval_str = keyval_str.rstrip(";")
 
-        parts = keyval_str.split(dialect["field separator"])
+        if dialect["semicolon_in_quotes"]:
+            parts = re.split(f'''{dialect["field separator"]}(?=(?:[^"]|"[^"]*")*$)''', keyval_str)
+        else:
+            parts = keyval_str.split(dialect["field separator"])
 
         kvsep = dialect["keyval separator"]
         if dialect["leading semicolon"]:
@@ -288,9 +296,15 @@ def _unquote_quals(quals, dialect):
     # GFF3 works with no spaces.
     # So split on the first one we can recognize...
     for sep in (" ; ", "; ", ";"):
+        # We want to run regex only when calling helpers.infer_dialect()
         parts = keyval_str.split(sep)
+        parts_regex = parts
+        if infer_dialect_call:
+            parts_regex = re.split(f'''{sep}(?=(?:[^"]|"[^"]*")*$)''', keyval_str)
         if len(parts) > 1:
             dialect["field separator"] = sep
+            if parts != parts_regex:
+                dialect["semicolon_in_quotes"] = True
             break
 
     # Is it GFF3?  They have key-vals separated by "="