SPELLCHECK.py
#!/usr/bin/env python
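"""Spell-check the Markdown posts of the chirpy2 site.

Collects .md files (optionally recursively), strips front matter, Markdown,
HTML, URLs and Liquid tags, then runs the remaining words through
pyspellchecker and prints each suspect word with its suggested correction.
"""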
import sys,os
import getopt
from glob import glob
import re
import itertools
from pprint import pprint
import frontmatter
from spellchecker import SpellChecker
from markdown import markdown
from bs4 import BeautifulSoup
#! for ANSI color output
from colorama import init, Fore, Back
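#! init() makes the ANSI color escapes work on Windows terminals as well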
init()
os.chdir("/home/jw/sites/tholonia/chirpy2")
def showhelp():
    print("help")
    rs = """
    -h, --help        show help
    -f, --filespec    default: "*.md"
    -r, --recursive   default OFF
    -y, --yaml        include YAML FrontMatter. Default OFF
    -d, --dump        dump bad words only
    -v, --verbose     default: OFF
    """
    print(rs)
    sys.exit()
def split_path(pstr):
    dirname = os.path.dirname(pstr)
    if dirname == "" or dirname == ".":
        dirname = os.getcwd()
    basename = os.path.basename(pstr)
    ns = basename.split(".")
    ext = ns[-1]
    nameonly = "".join(ns[:-1])
    fullpath = f"{dirname}/{basename}"
    return {
        "dirname": dirname,
        "basename": basename,
        "ext": ext,
        "nameonly": nameonly,
        "fullpath": fullpath,
    }
def findintree(filespec):
    allfiles = glob(filespec, recursive=True)  #! get list of all files
    found_ary = []
    for f in allfiles:  #! skip ZPROJECTS and _site, directories that need to be ignored
        if f.find("ZPROJECTS") != -1 or f.find("_site") != -1:
            continue
        found_ary.append(f)
    return found_ary
def load_fm(fn):
    with open(fn, encoding="utf-8") as f:
        fm = frontmatter.load(f)
    return fm
def markdown_to_text(markdown_string):
    """Converts a markdown string to plaintext"""
    # md -> html -> text, since BeautifulSoup can extract text cleanly from HTML
    content = markdown(markdown_string.replace("\n", " ").replace("_", " "))
    content = re.sub(r'<pre>(.*?)</pre>', ' ', content)
    content = re.sub(r'<code>(.*?)</code>', ' ', content)
    # extract text
    soup = BeautifulSoup(content, "html.parser")
    text = ''.join(soup.find_all(string=True))
    return text
#! ---------------------------------------------------------------------------
#! Set default values
filespec = "*.md"
recursive = ''
yaml = False
dump = False
verbose = False
argv = sys.argv[1:]
try:
    opts, args = getopt.getopt(argv, "hf:rydv", ["help", "filespec=", "recursive", "yaml", "dump", "verbose"])
except Exception as e:
    print(str(e))
    sys.exit(1)  #! opts would be undefined below if parsing failed
#! parse the command-line options
for opt, arg in opts:
    if opt in ("-h", "--help"): showhelp()
    if opt in ("-f", "--filespec"): filespec = arg
    if opt in ("-r", "--recursive"): recursive = "**/"
    if opt in ("-y", "--yaml"): yaml = True
    if opt in ("-d", "--dump"): dump = True
    if opt in ("-v", "--verbose"): verbose = True
#^ ---------------------------------------------------------------------------
#! load the words to ignore
with open('src/skipwords.txt') as f:
    skipwords = f.read().splitlines()
spell = SpellChecker()
spell.word_frequency.load_words(skipwords)
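#! load_words() adds the skip words to the checker's frequency dictionary, so
#! they count as known/correct and are never reported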
#! get all files as determined by the arguments
mdfiles = findintree(recursive+filespec)
for file in mdfiles:
    bad_words = {}  #! holds words the spellchecker does not know or has flagged as incorrect
    #! load FM and content
    post = load_fm(file)
    content = post.content
    #! this REALLY slows things down
    if yaml:
        import numpy as np
        lst_1d = np.array(post.metadata).flat
        yaml_str = " ".join(map(str, lst_1d))
        content = content + " " + yaml_str
    #! remove any markdown tags and other chars, and create a list of words
    content = markdown_to_text(content)
    #! Remove URLs; bare URLs are not HTML and can confuse BeautifulSoup
    content = re.sub(r'http\S+', '', content)
    content = re.sub("<[^>]*>", "", content)  #! remove HTML tags
    content = content.replace("%", "~")  #! swap % so it doesn't look like a Liquid command to the next line
    content = re.sub(r'{~[^}]*~}', '', content)  #! remove Liquid tags
    words = spell.split_words(content)
    #! first make a dict keyed by the bad word itself, to eliminate dupe entries
    for word in words:
        if word not in spell.known([word]):
            probable_word = spell.correction(word)  #! correction() can return None when it has no suggestion
            if probable_word and word != probable_word:
                bad_words[word] = probable_word
    #! test and print
    if len(bad_words) > 0 or verbose:
        if not dump:
            print(f">>> {Fore.YELLOW}{file}{Fore.RESET}")
        for word in bad_words:
            if word != bad_words[word]:
                if not dump:
                    print(f"\t{Fore.GREEN}[{word:20s}]\t{Fore.CYAN}[{bad_words[word]}]{Fore.RESET}")
    #! print out a simple list of bad words for easy cut/paste into skipwords.txt
    if len(bad_words) > 0:
        for w in bad_words:
            print(w)