Skip to content

Commit 60f9535

Browse files
committed
lexers: Integrate new lexers with the rest of Elixir
1 parent 9b0ca9a commit 60f9535

File tree

7 files changed

+186
-60
lines changed

7 files changed

+186
-60
lines changed

elixir/filters/__init__.py

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,51 @@
1-
from typing import List
2-
3-
from .utils import Filter, FilterContext
4-
from .projects import project_filters, default_filters
5-
6-
# Returns a list of applicable filters for project_name under provided filter context
7-
# Returns a list of applicable filters for project_name under provided filter context
def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]:
    """Instantiate and return the filters that apply to a project.

    Looks up project_name in project_filters (falling back to
    default_filters), instantiates each entry, and keeps only the
    filters whose check_if_applies(ctx) is true.

    Each entry in the filter list may be either:
      * a (class, kwargs) two-element tuple — instantiated as cls(**kwargs)
      * a plain class — instantiated with no arguments

    Raises ValueError for any other entry shape.
    """
    filter_classes = project_filters.get(project_name, default_filters)
    filters = []

    for filter_cls in filter_classes:
        # isinstance instead of `type(x) == tuple` / `type(x) == type`:
        # accepts tuple subclasses (e.g. NamedTuple) and classes that use
        # a custom metaclass (e.g. ABCs), which exact-type comparison rejects.
        if isinstance(filter_cls, tuple) and len(filter_cls) == 2:
            cls, kwargs = filter_cls
            filters.append(cls(**kwargs))
        elif isinstance(filter_cls, type):
            filters.append(filter_cls())
        else:
            raise ValueError(f"Invalid filter: {filter_cls}, " \
                "should be either a two element tuple or a type. " \
                "Make sure project_filters in project.py is valid.")

    return [f for f in filters if f.check_if_applies(ctx)]
1+
from .ident import IdentFilter

from .cppinc import CppIncFilter
from .cpppathinc import CppPathIncFilter

from .defconfig import DefConfigIdentsFilter
from .configin import ConfigInFilter

from .kconfig import KconfigFilter
from .kconfigidents import KconfigIdentsFilter

from .dtsi import DtsiFilter
from .dtscompdocs import DtsCompDocsFilter
from .dtscompcode import DtsCompCodeFilter
from .dtscompdts import DtsCompDtsFilter

from .makefileo import MakefileOFilter
from .makefiledtb import MakefileDtbFilter
from .makefiledir import MakefileDirFilter
from .makefilesubdir import MakefileSubdirFilter
from .makefilefile import MakefileFileFilter
from .makefilesrctree import MakefileSrcTreeFilter
# NOTE: duplicate `from .makefilesubdir import MakefileSubdirFilter` removed —
# it appeared twice in the original import list.


# List of filters applied to all projects
default_filters = [
    DtsCompCodeFilter,
    DtsCompDtsFilter,
    DtsCompDocsFilter,
    IdentFilter,
    CppIncFilter,
]

# List of filters for Kconfig files
common_kconfig_filters = [
    KconfigFilter,
    KconfigIdentsFilter,
    DefConfigIdentsFilter,
]

# List of filters for Makefiles
common_makefile_filters = [
    MakefileOFilter,
    MakefileDtbFilter,
    MakefileDirFilter,
    MakefileFileFilter,
    MakefileSubdirFilter,
    MakefileSrcTreeFilter,
]
2351

elixir/lexers/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from .lexers import *

# Default mapping from filename pattern to lexer class.
# Keys are regexes matched (with re.match) against the lower-cased file
# path; the first matching entry wins, so order matters.  Projects may
# override this via a 'lexers' entry in projects.py.
# NOTE(review): re.match only anchors at the start — patterns like
# r'.*\.s' also prefix-match e.g. '.sh' paths; confirm whether
# re.fullmatch semantics were intended.
default_lexers = {
    r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer,
    r'makefile\..*': MakefileLexer,
    r'.*\.dts(i)?': DTSLexer,
    r'.*\.s': GasLexer,
    r'kconfig.*': KconfigLexer,  # TODO negative lookahead for .rst
}
10+

elixir/project_utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from .filters.utils import Filter, FilterContext
55
from .filters import default_filters
66
from .projects import projects
7+
from .lexers import default_lexers
78

89
# Returns a list of applicable filters for project_name under provided filter context
910
def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]:
@@ -28,3 +29,19 @@ def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]:
2829

2930
return [f for f in filters if f.check_if_applies(ctx)]
3031

32+
def get_lexer(path: str, project_name: str):
    """Pick a lexer factory for a file of a project.

    Uses the project's 'lexers' mapping from projects.py when present,
    otherwise default_lexers.  Patterns are tried in mapping order
    against the lower-cased path; the first re.match wins.

    Returns a single-argument callable (code -> lexer instance), or
    None when no pattern matches — callers are expected to check for
    None and fall back to plain, untokenized output.
    """
    project_config = projects.get(project_name)
    if project_config is None or 'lexers' not in project_config:
        lexers = default_lexers
    else:
        lexers = project_config['lexers']

    path = path.lower()
    # NOTE(review): re.match anchors only at the start, so e.g. r'.*\.s'
    # also matches '.sh' paths by prefix — confirm whether fullmatch
    # semantics were intended before tightening.
    for regex, lexer in lexers.items():
        if re.match(regex, path):
            # isinstance instead of `type(lexer) == tuple` so tuple
            # subclasses are accepted too.
            if isinstance(lexer, tuple):
                # (class, kwargs) entry: bind the configuration kwargs.
                lexer_cls, kwargs = lexer
                return lambda code: lexer_cls(code, **kwargs)
            else:
                return lambda code: lexer(code)

    # Explicit None for "no lexer configured for this path".
    return None
47+

elixir/projects.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
from .filters import *
2+
from collections import OrderedDict
3+
from .filters import *
4+
from .lexers import *
25

36
# Dictionary of custom per-projects settings.
47
# filters:
@@ -48,6 +51,29 @@
4851
# Our solution is to ignore all includes in such paths
4952
(CppPathIncFilter, {"path_exceptions": {'^/include/uapi/.*'}}),
5053
],
54+
'lexers': OrderedDict({
55+
r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer,
56+
r'makefile\..*': MakefileLexer,
57+
r'.*\.dts(i)?': DTSLexer,
58+
r'kconfig.*': KconfigLexer, #TODO negative lookahead for .rst
59+
60+
r'/arch/alpha/.*\.s': (GasLexer, {"arch": "alpha"}),
61+
r'/arch/arc/.*\.s': (GasLexer, {"arch": "arc"}),
62+
r'/arch/arm/.*\.s': (GasLexer, {"arch": "arm32"}),
63+
r'/arch/csky/.*\.s': (GasLexer, {"arch": "csky"}),
64+
r'/arch/m68k/.*\.s': (GasLexer, {"arch": "m68k"}),
65+
r'/arch/microblaze/.*\.s': (GasLexer, {"arch": "microblaze"}),
66+
r'/arch/mips/.*\.s': (GasLexer, {"arch": "mips"}),
67+
r'/arch/openrisc/.*\.s': (GasLexer, {"arch": "openrisc"}),
68+
r'/arch/parisc/.*\.s': (GasLexer, {"arch": "parisc"}),
69+
r'/arch/s390/.*\.s': (GasLexer, {"arch": "s390"}),
70+
r'/arch/sh/.*\.s': (GasLexer, {"arch": "sh"}),
71+
r'/arch/sparc/.*\.s': (GasLexer, {"arch": "sparc"}),
72+
r'/arch/um/.*\.s': (GasLexer, {"arch": "x86"}),
73+
r'/arch/x86/.*\.s': (GasLexer, {"arch": "x86"}),
74+
r'/arch/xtensa/.*\.s': (GasLexer, {"arch": "xtensa"}),
75+
r'.*\.s': GasLexer,
76+
}),
5177
},
5278
'qemu': {
5379
'filters': [
@@ -63,6 +89,24 @@
6389
CppPathIncFilter,
6490
*common_makefile_filters,
6591
],
92+
'lexers': OrderedDict({
93+
r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer,
94+
r'makefile\..*': MakefileLexer,
95+
r'.*\.dts(i)?': DTSLexer,
96+
r'kconfig.*': KconfigLexer, #TODO negative lookahead for .rst
97+
98+
r'/arch/arc/.*\.s': (GasLexer, {"arch": "arc"}),
99+
r'/arch/arm/.*\.s': (GasLexer, {"arch": "arm32"}),
100+
r'/arch/m68k/.*\.s': (GasLexer, {"arch": "m68k"}),
101+
r'/arch/microblaze/.*\.s': (GasLexer, {"arch": "microblaze"}),
102+
r'/arch/mips/.*\.s': (GasLexer, {"arch": "mips"}),
103+
r'/arch/riscv/.*\.s': (GasLexer, {"arch": "riscv"}),
104+
r'/arch/sh/.*\.s': (GasLexer, {"arch": "sh"}),
105+
r'/arch/x86/.*\.s': (GasLexer, {"arch": "x86"}),
106+
r'/arch/sandbox/.*\.s': (GasLexer, {"arch": "x86"}),
107+
r'/arch/xtensa/.*\.s': (GasLexer, {"arch": "xtensa"}),
108+
r'.*\.s': GasLexer,
109+
}),
66110
},
67111
'uclibc-ng': {
68112
'filters': [

elixir/query.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
from .lib import script, scriptLines, decode
2222
from . import lib
2323
from . import data
24-
import os
24+
from .lexers import TokenType
25+
import os, sys
2526
from collections import OrderedDict
2627
from urllib import parse
2728

@@ -172,29 +173,38 @@ def query(self, cmd, *args):
172173

173174
version = args[0]
174175
path = args[1]
176+
lexer = args[2]
175177

176178
filename = os.path.basename(path)
177179
family = lib.getFileFamily(filename)
178180

179-
if family != None:
181+
if family is not None and lexer is not None:
180182
buffer = BytesIO()
181-
tokens = self.scriptLines('tokenize-file', version, path, family)
182-
even = True
183+
code = self.get_file_raw(version, path)
183184

184185
prefix = b''
185186
if family == 'K':
186187
prefix = b'CONFIG_'
187188

188-
for tok in tokens:
189-
even = not even
190-
tok2 = prefix + tok
191-
if (even and self.db.defs.exists(tok2) and
192-
(lib.compatibleFamily(self.db.defs.get(tok2).get_families(), family) or
193-
lib.compatibleMacro(self.db.defs.get(tok2).get_macros(), family))):
194-
tok = b'\033[31m' + tok2 + b'\033[0m'
195-
else:
196-
tok = lib.unescape(tok)
197-
buffer.write(tok)
189+
for token_type, token, _, line in lexer(code).lex():
190+
token = token.encode()
191+
192+
if token_type == TokenType.ERROR:
193+
print("error token: ", token, token_type, filename, line, file=sys.stderr)
194+
elif token_type == TokenType.IDENTIFIER:
195+
token_with_prefix = prefix + token
196+
token_in_db = self.db.defs.exists(token_with_prefix)
197+
if token_in_db:
198+
compatible = \
199+
lib.compatibleFamily(self.db.defs.get(token_with_prefix).get_families(), family) or \
200+
lib.compatibleMacro(self.db.defs.get(token_with_prefix).get_macros(), family)
201+
202+
if compatible:
203+
buffer.write(b'\033[31m' + token_with_prefix + b'\033[0m')
204+
continue
205+
206+
buffer.write(token)
207+
198208
return decode(buffer.getvalue())
199209
else:
200210
return decode(self.script('get-file', version, path))

elixir/web.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333

3434
from .lib import validFamily
3535
from .query import Query, SymbolInstance
36-
from .project_utils import get_filters
36+
from .project_utils import get_filters, get_lexer
3737
from .filters.utils import FilterContext
3838
from .autocomplete import AutocompleteResource
3939
from .api import ApiIdentGetterResource
@@ -485,7 +485,8 @@ def format_code(filename, code):
485485
# version: requested version of the project
486486
# path: path to the file in the repository
487487
def generate_source(q, project, version, path):
488-
code = q.query('file', version, path)
488+
lexer = get_lexer(path, project)
489+
code = q.query('file', version, path, lexer)
489490

490491
_, fname = os.path.split(path)
491492
_, extension = os.path.splitext(fname)

update.py

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,16 @@
2222
# Throughout, an "idx" is the sequential number associated with a blob.
2323
# This is different from that blob's Git hash.
2424

25+
import sys
2526
from sys import argv
2627
from threading import Thread, Lock, Event, Condition
2728

29+
from elixir.lexers import TokenType
2830
import elixir.lib as lib
2931
from elixir.lib import script, scriptLines
3032
import elixir.data as data
3133
from elixir.data import PathList
34+
from elixir.project_utils import get_lexer
3235
from find_compatible_dts import FindCompatibleDTS
3336

3437
verbose = False
@@ -56,6 +59,7 @@
5659
bindings_idxes = [] # DT bindings documentation files
5760
idx_key_mod = 1000000
5861
defs_idxes = {} # Idents definitions stored with (idx*idx_key_mod + line) as the key.
62+
file_paths = {}
5963

6064
tags_done = False # True if all tags have been added to new_idxes
6165

@@ -163,7 +167,7 @@ def run(self):
163167
progress('vers: Thread finished', index)
164168

165169
def update_versions(self, tag):
166-
global blobs_lock
170+
global blobs_lock, file_paths
167171

168172
# Get blob hashes and associated file paths
169173
blobs = scriptLines('list-blobs', '-p', tag)
@@ -174,12 +178,14 @@ def update_versions(self, tag):
174178
with blobs_lock:
175179
idx = db.blob.get(hash)
176180
buf.append((idx, path))
181+
file_paths[idx] = path
177182

178183
buf = sorted(buf)
179184
obj = PathList()
180185
for idx, path in buf:
181186
obj.append(idx, path)
182187

188+
183189
# Store DT bindings documentation files to parse them later
184190
if path[:33] == b'Documentation/devicetree/bindings':
185191
bindings_idxes.append(idx)
@@ -275,6 +281,7 @@ def run(self):
275281

276282
new_idxes[self.index][1].wait() # Make sure the tag is ready
277283
new_idxes[self.index][2].wait() # Make sure UpdateDefs processed the tag
284+
new_idxes[self.index][4].wait() # Tell that UpdateVersions processed the tag
278285

279286
with tags_refs_lock:
280287
tags_refs[0] += 1
@@ -288,45 +295,53 @@ def run(self):
288295
progress('refs: Thread ' + str(tags_refs[1]) + '/' + str(self.inc) + ' finished', tags_refs[0])
289296

290297
def update_references(self, idxes):
291-
global hash_file_lock, defs_lock, refs_lock, tags_refs
298+
global hash_file_lock, defs_lock, refs_lock, tags_refs, file_paths
292299

293300
for idx in idxes:
294301
if idx % 1000 == 0: progress('refs: ' + str(idx), tags_refs[0])
295302

296303
with hash_file_lock:
297304
hash = db.hash.get(idx)
298-
filename = db.file.get(idx)
305+
filename = file_paths[idx].decode()
299306

300307
family = lib.getFileFamily(filename)
301308
if family == None: continue
302309

310+
lexer = get_lexer(filename, project)
311+
if lexer is None:
312+
continue
313+
314+
try:
315+
code = script('get-blob', hash).decode()
316+
except UnicodeDecodeError:
317+
code = script('get-blob', hash).decode('raw_unicode_escape')
318+
303319
prefix = b''
304320
# Kconfig values are saved as CONFIG_<value>
305321
if family == 'K':
306322
prefix = b'CONFIG_'
307323

308-
tokens = scriptLines('tokenize-file', '-b', hash, family)
309-
even = True
310-
line_num = 1
311324
idents = {}
312325
with defs_lock:
313-
for tok in tokens:
314-
even = not even
315-
if even:
316-
tok = prefix + tok
317-
318-
if (db.defs.exists(tok) and
319-
not ( (idx*idx_key_mod + line_num) in defs_idxes and
320-
defs_idxes[idx*idx_key_mod + line_num] == tok ) and
321-
(family != 'M' or tok.startswith(b'CONFIG_'))):
322-
# We only index CONFIG_??? in makefiles
323-
if tok in idents:
324-
idents[tok] += ',' + str(line_num)
325-
else:
326-
idents[tok] = str(line_num)
326+
for token_type, token, _, line in lexer(code).lex():
327+
if token_type == TokenType.ERROR:
328+
print("error token: ", token, token_type, filename, line, file=sys.stderr)
329+
continue
327330

328-
else:
329-
line_num += tok.count(b'\1')
331+
token = prefix + token.encode()
332+
333+
if token_type != TokenType.IDENTIFIER:
334+
continue
335+
336+
if (db.defs.exists(token) and
337+
not ( (idx*idx_key_mod + line) in defs_idxes and
338+
defs_idxes[idx*idx_key_mod + line] == token ) and
339+
(family != 'M' or token.startswith(b'CONFIG_'))):
340+
# We only index CONFIG_??? in makefiles
341+
if token in idents:
342+
idents[token] += ',' + str(line)
343+
else:
344+
idents[token] = str(line)
330345

331346
with refs_lock:
332347
for ident, lines in idents.items():
@@ -579,6 +594,7 @@ def progress(msg, current):
579594
for tag in scriptLines('list-tags'):
580595
if not db.vers.exists(tag):
581596
tag_buf.append(tag)
597+
break
582598

583599
num_tags = len(tag_buf)
584600
project = lib.currentProject()

0 commit comments

Comments
 (0)