fix: fixed bugs in display and ingestion of contents
mundanevision20 authored Sep 15, 2024
1 parent 423315a commit b807e60
Showing 8 changed files with 139 additions and 102 deletions.
27 changes: 14 additions & 13 deletions explainshell/algo/classifier.py
@@ -14,7 +14,8 @@

def get_features(paragraph):
features = {}
- p_text = paragraph.cleantext()
+ p_text = paragraph.clean_text()
+ logger.debug(f"length of p_text: {len(p_text)}")
assert p_text

features["starts_with_hyphen"] = algo.features.starts_with_hyphen(p_text)
@@ -46,7 +47,7 @@ def train(self):
if self.classifier:
return

- man_pages = self.store.trainingset()
+ man_pages = self.store.training_set()

# flatten the manpages so we get a list of (manpage-name, paragraph)
def flatten_manpages(manpage):
@@ -58,19 +59,19 @@ def flatten_manpages(manpage):
paragraphs = itertools.chain(*[flatten_manpages(m) for m in man_pages])
training = list(paragraphs)

- negids = [p for p in training if not p.is_option]
- posids = [p for p in training if p.is_option]
+ neg_ids = [p for p in training if not p.is_option]
+ pos_ids = [p for p in training if p.is_option]

- negfeats = [(get_features(p), False) for p in negids]
- posfeats = [(get_features(p), True) for p in posids]
+ neg_feats = [(get_features(p), False) for p in neg_ids]
+ pos_feats = [(get_features(p), True) for p in pos_ids]

- negcutoff = len(negfeats) * 3 / 4
- poscutoff = len(posfeats) * 3 / 4
+ neg_cutoff = int(len(neg_feats) * 3 / 4)
+ pos_cutoff = int(len(pos_feats) * 3 / 4)

- trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
- self.testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
+ train_feats = neg_feats[:neg_cutoff] + pos_feats[:pos_cutoff]
+ self.test_feats = neg_feats[neg_cutoff:] + pos_feats[pos_cutoff:]

logger.info("train on %d instances", len(trainfeats))
logger.info("train on %d instances", len(train_feats))

if self.algo == "maxent":
c = nltk.classify.maxent.MaxentClassifier
@@ -79,14 +80,14 @@ def flatten_manpages(manpage):
else:
raise ValueError("unknown classifier")

- self.classifier = c.train(trainfeats, **self.classifier_args)
+ self.classifier = c.train(train_feats, **self.classifier_args)

def evaluate(self):
self.train()
ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

- for i, (feats, label) in enumerate(self.testfeats):
+ for i, (feats, label) in enumerate(self.test_feats):
ref_sets[label].add(i)
guess = self.classifier.prob_classify(feats)
observed = guess.max()
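The substantive fix in this file is the `int()` around the train/test cutoffs: under Python 3, `/` is true division and returns a float, and floats are not valid slice indices. A minimal sketch of the failure and the fix, using made-up feature data:

```python
# Made-up stand-in for the (features, label) pairs built in train().
feats = [({"starts_with_hyphen": True}, True)] * 8

cutoff = len(feats) * 3 / 4           # 6.0 -- a float under Python 3
try:
    feats[:cutoff]
except TypeError as exc:
    print(exc)                        # slice indices must be integers ...

cutoff = int(len(feats) * 3 / 4)      # 6 -- what the commit does
train_feats, test_feats = feats[:cutoff], feats[cutoff:]
print(len(train_feats), len(test_feats))   # 6 2
```

Floor division, `len(feats) * 3 // 4`, would be an equivalent and slightly more idiomatic spelling.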
6 changes: 3 additions & 3 deletions explainshell/manager.py
@@ -49,7 +49,7 @@ def _read(self, ctx, f_runner):
assert len(ctx.manpage.paragraphs) > 1

ctx.manpage = store.ManPage(
- ctx.manpage.shortpath,
+ ctx.manpage.short_path,
ctx.manpage.name,
ctx.manpage.synopsis,
ctx.manpage.paragraphs,
@@ -73,7 +73,7 @@ def _extract(self, ctx, f_runner):

def _write(self, ctx, f_runner):
f_runner.pre_add_manpage()
- return ctx.store.addmanpage(ctx.manpage)
+ return ctx.store.add_manpage(ctx.manpage)

def _update(self, ctx, f_runner):
f_runner.pre_add_manpage()
@@ -171,7 +171,7 @@ def findmulti_cmds(self):
logger.info("inserting mapping (multi_cmd) %s -> %s", src, dst)

for multi_cmd, _id in multi_cmds.items():
- self.store.setmulti_cmd(_id)
+ self.store.set_multi_cmd(_id)
logger.info("making %r a multi_cmd", multi_cmd)

return mappings_to_a, multi_cmds
39 changes: 28 additions & 11 deletions explainshell/manpage.py
@@ -100,7 +100,7 @@ def bold(ln_in):
x = list(rp_with)
x.insert(1, "</u>")
x = "".join(x)
_replacements.append((x, "%s</u>" % rp_with))
_replacements.append((x, f"{rp_with}</u>"))

_replacements_no_prefix = [
"\xc2\xb7", # bullet
@@ -121,7 +121,7 @@ def bold(ln_in):
x = list(s)
x.insert(1, "</u>")
x = "".join(x)
_replacements.append((x, "%s</u>" % s))
_replacements.append((x, f"{s}</u>"))

_href = re.compile(r'<a href="file:///[^\?]*\?([^\(]*)\(([^\)]*)\)">')
_section = re.compile(r"<b>([^<]+)</b>")
@@ -139,8 +139,9 @@ def _parse_text(lines):
)
for look_for, rp_with in _replacements:
ln = re.sub(look_for, rp_with, ln)

# confirm the line is valid utf8
- l_replaced = ln.decode("utf8", "ignore").encode("utf8")
+ l_replaced = ln  # .decode("utf8", "ignore").encode("utf8")
if l_replaced != ln:
logger.error("line %r contains invalid utf8", ln)
ln = l_replaced
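With the move to text-mode subprocess output, the lines reaching `_parse_text` are already `str`, so the old bytes round-trip no longer applies; commenting it out leaves the `if l_replaced != ln` check permanently false. If raw bytes ever needed validating again, a hypothetical equivalent would detect dropped bytes at decode time:

```python
# Hypothetical sketch, not part of the commit: lenient UTF-8 decoding
# that still reports invalid input.
raw = b"SYNOPSIS \xc3\x28 broken"          # \xc3 starts a sequence \x28 can't finish
ln = raw.decode("utf8", "ignore")           # silently drops the bad byte
if ln != raw.decode("utf8", "replace"):     # 'replace' keeps a U+FFFD marker instead
    print("line %r contains invalid utf8" % raw)
```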
@@ -177,14 +178,14 @@ def _parse_synopsis(base, synopsis):


class ManPage:
"""read the man page at path by executing w3mman2html.cgi and find its
synopsis with lexgrog
"""read the man page at path by executing `w3mman2html.cgi` and find it's
synopsis with `lexgrog`
since some man pages share the same name (different versions), each
alias of a man page has a score that's determined in this simple fashion:
- name of man page source file is given a score of 10
- all other names found for a particular man page are given a score of 1
- (other names are found by scanning the output of lexgrog)
+ (other names are found by scanning the output of `lexgrog`)
"""

def __init__(self, path):
@@ -201,11 +202,27 @@ def read(self):
on the class instance."""
cmd = [config.MAN2HTML, urllib.parse.urlencode({"local": os.path.abspath(self.path)})]
logger.info("executing %r", " ".join(cmd))
- self._text = subprocess.check_output(cmd, stderr=devnull, env=ENV)
+ self._text = ""
+
+ try:
+     t_proc = subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=300, env=ENV)
+
+     if t_proc.stdout:
+         self._text = t_proc.stdout
+     if t_proc.stderr:
+         logger.error(f"failed to extract text for {self.name} -> w3mman2html.cgi returned: {t_proc.stderr}")
+ except Exception as error_msg:
+     logger.error(f"failed to extract text for {self.name} -> error: {error_msg}")

try:
-     self.synopsis = subprocess.check_output(
-         ["lexgrog", self.path], stderr=devnull
-     ).rstrip()
+     self.synopsis = ""
+     s_proc = subprocess.run(
+         ["lexgrog", self.path], capture_output=True, text=True, timeout=300
+     )
+     if s_proc.stdout:
+         self.synopsis = s_proc.stdout.rstrip()
+     if s_proc.stderr:
+         logger.error(f"failed to extract synopsis for {self.name} -> lexgrog returned: {s_proc.stderr}")
except (subprocess.SubprocessError, OSError):
logger.error("failed to extract synopsis for %s", self.name)

@@ -222,7 +239,7 @@ def parse(self):
d = collections.OrderedDict()
for prog, text in self.synopsis:
d.setdefault(text, []).append(prog)
- text, progs = d.items()[0]
+ text, progs = list(dict(d).items())[0]
self.synopsis = text
self.aliases.update(progs)
self.aliases.remove(self.name)
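The `parse()` change works around a Python 3 behavior: dict views are no longer sequences, so the Python 2 idiom `d.items()[0]` raises `TypeError`. A small sketch with a made-up synopsis table (the `dict(d)` wrapper in the committed line is redundant, since `d` is already an `OrderedDict`):

```python
import collections

d = collections.OrderedDict()
d.setdefault("prints its arguments", []).append("echo")

try:
    d.items()[0]                       # Python 2 idiom
except TypeError as exc:
    print(exc)                         # 'odict_items' object is not subscriptable

text, progs = list(d.items())[0]       # the commit's approach
text, progs = next(iter(d.items()))    # equivalent, without building a list
print(text, progs)                     # prints its arguments ['echo']
```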
18 changes: 10 additions & 8 deletions explainshell/matcher.py
@@ -39,12 +39,12 @@ class Matcher(bashlex.ast.nodevisitor):
"""

def __init__(self, s, store):
self.s = s.encode("latin1", "replace")
self.s = s
self.store = store
self._prev_option = self._current_option = None
self.groups = [MatchGroup("shell")]

- # a list of matchwordexpansions where expansions happened during word
+ # a list of `match_word_exp` where expansions happened during word
# expansion
self.expansions = []

@@ -150,6 +150,8 @@ def visitredirect(self, node, input, r_type, output, heredoc):
if r_type in help_constants.REDIRECTION_KIND:
helptext.append(help_constants.REDIRECTION_KIND[r_type])

+ logger.debug(helptext)

self.groups[0].results.append(
MatchResult(node.pos[0], node.pos[1], "\n\n".join(helptext), None)
)
@@ -393,18 +395,18 @@ def attemptfuzzy(chars):
m = []
if chars[0] == "-":
tokens = [chars[0:2]] + list(chars[2:])
- considerarg = True
+ consider_arg = True
else:
tokens = list(chars)
- considerarg = False
+ consider_arg = False

pos = node.pos[0]
- prevoption = None
+ prev_option = None
for i, t in enumerate(tokens):
op = t if t[0] == "-" else "-" + t
option = self.find_option(op)
if option:
- if considerarg and not m and option.expects_arg:
+ if consider_arg and not m and option.expects_arg:
logger.info(
"option %r expected an arg, taking the rest too", option
)
@@ -420,7 +422,7 @@ def attemptfuzzy(chars):
# match the current token, take the rest as its argument, this
# covers a series of short options where the last one has an argument
# with no space between it, such as 'xargs -r0n1'
- elif considerarg and prevoption and prevoption.expects_arg:
+ elif consider_arg and prev_option and prev_option.expects_arg:
pmr = m[-1]
mr = MatchResult(
pmr.start, pmr.end + (len(tokens) - i), pmr.text, None
@@ -434,7 +436,7 @@
else:
m.append(self.unknown(t, pos, pos + len(t)))
pos += len(t)
- prevoption = option
+ prev_option = option
return m

def _visitword(node, word):
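For context on the `consider_arg`/`prev_option` renames: `attemptfuzzy()` splits a bundled short-option token into a leading flag plus single characters, then walks them, attaching trailing characters to the last option that expects an argument (the `xargs -r0n1` case in the comment above). A simplified stand-in for the decomposition step:

```python
# Simplified sketch of attemptfuzzy()'s token split; the real matcher
# then looks each piece up as '-<char>' among the man page's options.
def split_short_options(chars):
    if chars[0] == "-":
        return [chars[0:2]] + list(chars[2:]), True   # may carry an argument
    return list(chars), False

tokens, consider_arg = split_short_options("-r0n1")
print(tokens)          # ['-r', '0', 'n', '1']
print(consider_arg)    # True: trailing chars can bind to an option's argument
```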
2 changes: 1 addition & 1 deletion explainshell/options.py
@@ -14,7 +14,7 @@ def extract(manpage):
options"""
for i, p in enumerate(manpage.paragraphs):
if p.is_option:
- s, ln = extract_option(p.cleantext())
+ s, ln = extract_option(p.clean_text())
if s or ln:
expects_arg = any(x.expects_arg for x in s + ln)
s = [x.flag for x in s]
