fix: fixed bugs in display and ingestion of contents
mundanevision20 authored Sep 15, 2024
1 parent 423315a commit b807e60
Showing 8 changed files with 139 additions and 102 deletions.
27 changes: 14 additions & 13 deletions explainshell/algo/classifier.py
@@ -14,7 +14,8 @@

def get_features(paragraph):
features = {}
- p_text = paragraph.cleantext()
+ p_text = paragraph.clean_text()
+ logger.debug(f"length of p_text: {len(p_text)}")
assert p_text

features["starts_with_hyphen"] = algo.features.starts_with_hyphen(p_text)
@@ -46,7 +47,7 @@ def train(self):
if self.classifier:
return

- man_pages = self.store.trainingset()
+ man_pages = self.store.training_set()

# flatten the manpages so we get a list of (manpage-name, paragraph)
def flatten_manpages(manpage):
@@ -58,19 +59,19 @@ def flatten_manpages(manpage):
paragraphs = itertools.chain(*[flatten_manpages(m) for m in man_pages])
training = list(paragraphs)

- negids = [p for p in training if not p.is_option]
- posids = [p for p in training if p.is_option]
+ neg_ids = [p for p in training if not p.is_option]
+ pos_ids = [p for p in training if p.is_option]

- negfeats = [(get_features(p), False) for p in negids]
- posfeats = [(get_features(p), True) for p in posids]
+ neg_feats = [(get_features(p), False) for p in neg_ids]
+ pos_feats = [(get_features(p), True) for p in pos_ids]

- negcutoff = len(negfeats) * 3 / 4
- poscutoff = len(posfeats) * 3 / 4
+ neg_cutoff = int(len(neg_feats) * 3 / 4)
+ pos_cutoff = int(len(pos_feats) * 3 / 4)

- trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
- self.testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
+ train_feats = neg_feats[:neg_cutoff] + pos_feats[:pos_cutoff]
+ self.test_feats = neg_feats[neg_cutoff:] + pos_feats[pos_cutoff:]

logger.info("train on %d instances", len(trainfeats))
logger.info("train on %d instances", len(train_feats))

if self.algo == "maxent":
c = nltk.classify.maxent.MaxentClassifier
@@ -79,14 +80,14 @@ def flatten_manpages(manpage):
else:
raise ValueError("unknown classifier")

- self.classifier = c.train(trainfeats, **self.classifier_args)
+ self.classifier = c.train(train_feats, **self.classifier_args)

def evaluate(self):
self.train()
ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

- for i, (feats, label) in enumerate(self.testfeats):
+ for i, (feats, label) in enumerate(self.test_feats):
ref_sets[label].add(i)
guess = self.classifier.prob_classify(feats)
observed = guess.max()
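The substantive fix in this file is the `int()` around the train/test cutoffs: under Python 3, `/` is true division and returns a float, and floats are not valid slice indices. A minimal sketch of the failure and the fix, using made-up feature data:

```python
# Made-up stand-in for the (features, label) pairs built in train().
feats = [({"starts_with_hyphen": True}, True)] * 8

cutoff = len(feats) * 3 / 4           # 6.0 -- a float under Python 3
try:
    feats[:cutoff]
except TypeError as exc:
    print(exc)                        # slice indices must be integers ...

cutoff = int(len(feats) * 3 / 4)      # 6 -- what the commit does
train_feats, test_feats = feats[:cutoff], feats[cutoff:]
print(len(train_feats), len(test_feats))   # 6 2
```

Floor division, `len(feats) * 3 // 4`, would be an equivalent and slightly more idiomatic spelling.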
6 changes: 3 additions & 3 deletions explainshell/manager.py
@@ -49,7 +49,7 @@ def _read(self, ctx, f_runner):
assert len(ctx.manpage.paragraphs) > 1

ctx.manpage = store.ManPage(
- ctx.manpage.shortpath,
+ ctx.manpage.short_path,
ctx.manpage.name,
ctx.manpage.synopsis,
ctx.manpage.paragraphs,
@@ -73,7 +73,7 @@ def _extract(self, ctx, f_runner):

def _write(self, ctx, f_runner):
f_runner.pre_add_manpage()
- return ctx.store.addmanpage(ctx.manpage)
+ return ctx.store.add_manpage(ctx.manpage)

def _update(self, ctx, f_runner):
f_runner.pre_add_manpage()
@@ -171,7 +171,7 @@ def findmulti_cmds(self):
logger.info("inserting mapping (multi_cmd) %s -> %s", src, dst)

for multi_cmd, _id in multi_cmds.items():
- self.store.setmulti_cmd(_id)
+ self.store.set_multi_cmd(_id)
logger.info("making %r a multi_cmd", multi_cmd)

return mappings_to_a, multi_cmds
39 changes: 28 additions & 11 deletions explainshell/manpage.py
@@ -100,7 +100,7 @@ def bold(ln_in):
x = list(rp_with)
x.insert(1, "</u>")
x = "".join(x)
_replacements.append((x, "%s</u>" % rp_with))
_replacements.append((x, f"{rp_with}</u>"))

_replacements_no_prefix = [
"\xc2\xb7", # bullet
@@ -121,7 +121,7 @@ def bold(ln_in):
x = list(s)
x.insert(1, "</u>")
x = "".join(x)
_replacements.append((x, "%s</u>" % s))
_replacements.append((x, f"{s}</u>"))

_href = re.compile(r'<a href="file:///[^\?]*\?([^\(]*)\(([^\)]*)\)">')
_section = re.compile(r"<b>([^<]+)</b>")
@@ -139,8 +139,9 @@ def _parse_text(lines):
)
for look_for, rp_with in _replacements:
ln = re.sub(look_for, rp_with, ln)

# confirm the line is valid utf8
- l_replaced = ln.decode("utf8", "ignore").encode("utf8")
+ l_replaced = ln  # .decode("utf8", "ignore").encode("utf8")
if l_replaced != ln:
logger.error("line %r contains invalid utf8", ln)
ln = l_replaced
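With the move to text-mode subprocess output, the lines reaching `_parse_text` are already `str`, so the old bytes round-trip no longer applies; commenting it out leaves the `if l_replaced != ln` check permanently false. If raw bytes ever needed validating again, a hypothetical equivalent would detect dropped bytes at decode time:

```python
# Hypothetical sketch, not part of the commit: lenient UTF-8 decoding
# that still reports invalid input.
raw = b"SYNOPSIS \xc3\x28 broken"          # \xc3 starts a sequence \x28 can't finish
ln = raw.decode("utf8", "ignore")           # silently drops the bad byte
if ln != raw.decode("utf8", "replace"):     # 'replace' keeps a U+FFFD marker instead
    print("line %r contains invalid utf8" % raw)
```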
@@ -177,14 +178,14 @@ def _parse_synopsis(base, synopsis):


class ManPage:
"""read the man page at path by executing w3mman2html.cgi and find its
synopsis with lexgrog
"""read the man page at path by executing `w3mman2html.cgi` and find it's
synopsis with `lexgrog`
since some man pages share the same name (different versions), each
alias of a man page has a score that's determined in this simple fashion:
- name of man page source file is given a score of 10
- all other names found for a particular man page are given a score of 1
- (other names are found by scanning the output of lexgrog)
+ (other names are found by scanning the output of `lexgrog`)
"""

def __init__(self, path):
@@ -201,11 +202,27 @@ def read(self):
on the class instance."""
cmd = [config.MAN2HTML, urllib.parse.urlencode({"local": os.path.abspath(self.path)})]
logger.info("executing %r", " ".join(cmd))
- self._text = subprocess.check_output(cmd, stderr=devnull, env=ENV)
+ self._text = ""
+
+ try:
+     t_proc = subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=300, env=ENV)
+
+     if t_proc.stdout:
+         self._text = t_proc.stdout
+     if t_proc.stderr:
+         logger.error(f"failed to extract text for {self.name} -> w3mman2html.cgi returned: {t_proc.stderr}")
+ except Exception as error_msg:
+     logger.error(f"failed to extract text for {self.name} -> error: {error_msg}")

try:
-     self.synopsis = subprocess.check_output(
-         ["lexgrog", self.path], stderr=devnull
-     ).rstrip()
+     self.synopsis = ""
+     s_proc = subprocess.run(
+         ["lexgrog", self.path], capture_output=True, text=True, timeout=300
+     )
+     if s_proc.stdout:
+         self.synopsis = s_proc.stdout.rstrip()
+     if s_proc.stderr:
+         logger.error(f"failed to extract synopsis for {self.name} -> lexgrog returned: {s_proc.stderr}")
except (subprocess.SubprocessError, OSError):
logger.error("failed to extract synopsis for %s", self.name)

@@ -222,7 +239,7 @@ def parse(self):
d = collections.OrderedDict()
for prog, text in self.synopsis:
d.setdefault(text, []).append(prog)
- text, progs = d.items()[0]
+ text, progs = list(dict(d).items())[0]
self.synopsis = text
self.aliases.update(progs)
self.aliases.remove(self.name)
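The `parse()` change works around a Python 3 behavior: dict views are no longer sequences, so the Python 2 idiom `d.items()[0]` raises `TypeError`. A small sketch with a made-up synopsis table (the `dict(d)` wrapper in the committed line is redundant, since `d` is already an `OrderedDict`):

```python
import collections

d = collections.OrderedDict()
d.setdefault("prints its arguments", []).append("echo")

try:
    d.items()[0]                       # Python 2 idiom
except TypeError as exc:
    print(exc)                         # 'odict_items' object is not subscriptable

text, progs = list(d.items())[0]       # the commit's approach
text, progs = next(iter(d.items()))    # equivalent, without building a list
print(text, progs)                     # prints its arguments ['echo']
```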
18 changes: 10 additions & 8 deletions explainshell/matcher.py
@@ -39,12 +39,12 @@ class Matcher(bashlex.ast.nodevisitor):
"""

def __init__(self, s, store):
self.s = s.encode("latin1", "replace")
self.s = s
self.store = store
self._prev_option = self._current_option = None
self.groups = [MatchGroup("shell")]

- # a list of matchwordexpansions where expansions happened during word
+ # a list of `match_word_exp` where expansions happened during word
# expansion
self.expansions = []

@@ -150,6 +150,8 @@ def visitredirect(self, node, input, r_type, output, heredoc):
if r_type in help_constants.REDIRECTION_KIND:
helptext.append(help_constants.REDIRECTION_KIND[r_type])

+ logger.debug(helptext)

self.groups[0].results.append(
MatchResult(node.pos[0], node.pos[1], "\n\n".join(helptext), None)
)
@@ -393,18 +395,18 @@ def attemptfuzzy(chars):
m = []
if chars[0] == "-":
tokens = [chars[0:2]] + list(chars[2:])
- considerarg = True
+ consider_arg = True
else:
tokens = list(chars)
- considerarg = False
+ consider_arg = False

pos = node.pos[0]
- prevoption = None
+ prev_option = None
for i, t in enumerate(tokens):
op = t if t[0] == "-" else "-" + t
option = self.find_option(op)
if option:
- if considerarg and not m and option.expects_arg:
+ if consider_arg and not m and option.expects_arg:
logger.info(
"option %r expected an arg, taking the rest too", option
)
@@ -420,7 +422,7 @@ def attemptfuzzy(chars):
# match the current token, take the rest as its argument, this
# covers a series of short options where the last one has an argument
# with no space between it, such as 'xargs -r0n1'
- elif considerarg and prevoption and prevoption.expects_arg:
+ elif consider_arg and prev_option and prev_option.expects_arg:
pmr = m[-1]
mr = MatchResult(
pmr.start, pmr.end + (len(tokens) - i), pmr.text, None
@@ -434,7 +436,7 @@
else:
m.append(self.unknown(t, pos, pos + len(t)))
pos += len(t)
- prevoption = option
+ prev_option = option
return m

def _visitword(node, word):
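For context on the `consider_arg`/`prev_option` renames: `attemptfuzzy()` splits a bundled short-option token into a leading flag plus single characters, then walks them, attaching trailing characters to the last option that expects an argument (the `xargs -r0n1` case in the comment above). A simplified stand-in for the decomposition step:

```python
# Simplified sketch of attemptfuzzy()'s token split; the real matcher
# then looks each piece up as '-<char>' among the man page's options.
def split_short_options(chars):
    if chars[0] == "-":
        return [chars[0:2]] + list(chars[2:]), True   # may carry an argument
    return list(chars), False

tokens, consider_arg = split_short_options("-r0n1")
print(tokens)          # ['-r', '0', 'n', '1']
print(consider_arg)    # True: trailing chars can bind to an option's argument
```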
2 changes: 1 addition & 1 deletion explainshell/options.py
@@ -14,7 +14,7 @@ def extract(manpage):
options"""
for i, p in enumerate(manpage.paragraphs):
if p.is_option:
- s, ln = extract_option(p.cleantext())
+ s, ln = extract_option(p.clean_text())
if s or ln:
expects_arg = any(x.expects_arg for x in s + ln)
s = [x.flag for x in s]
