-
Notifications
You must be signed in to change notification settings - Fork 0
/
DocumentScan.Rmd
235 lines (199 loc) · 5.69 KB
/
DocumentScan.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
---
jupyter:
  jupytext:
    text_representation:
      extension: .Rmd
      format_name: rmarkdown
      format_version: '1.2'
      jupytext_version: 1.11.2
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---
```{python}
# import the libraries
from utils_files import scan_files, make_hash, get_filename
from utils import load_pickle, save_pickle
from utils_word import get_doc_info, get_doc_text
from utils_excel import get_excel_info
from utils_pptx import get_pptx_info, get_pptx_text
from utils_pdf import get_pdf_info, get_pdf_text
import os
```
```{python}
# Notebook settings. The "locations" -> "pickle" entry describes the file that
# holds the scan results; it is passed to get_filename() to obtain the name
# used by load_pickle() / save_pickle().
config = {
    "locations": {
        "pickle": {
            "root": ".",
            "folders": ["data"],
            "name": "docscan",
            "ext": ".pickle",
        },
    },
}
```
```{python}
# set up the paths
""" {
"name": "Downloads",
"folders": ["users", "David", "Downloads"],
"root":"D:\\"
},
{
"name": "BCFSA",
"folders":["users", "David", "Google", "thoughtswin", "BCFSA"],
"root":"D:\\"
},"""
# Folders to scan. scanFolder() joins "root" and "folders" into one directory.
locations = [
    {
        "name": "ManitobaHydro",
        "folders": ["users", "David", "Google", "thoughtswin", "Manitoba Hydro", "EA Role"],
        "root": "D:\\",
    },
]

# Supported document types. scan() pivots this list by extension and, for a
# matching file, calls "infoscanner" and "textreader" with the file path
# whenever the entry is not None.
filetypes = [
    {
        "type": "Word",
        "ext": [".docx"],
        "infoscanner": get_doc_info,
        "textreader": get_doc_text,
    },
    {
        "type": "Excel",
        "ext": [".xlsx"],
        "infoscanner": get_excel_info,
        "textreader": None,  # no text reader is configured for Excel files
    },
    {
        "type": "PowerPoint",
        "ext": [".pptx"],
        "infoscanner": get_pptx_info,
        "textreader": get_pptx_text,
    },
    {
        "type": "PDF",
        "ext": [".pdf"],
        "infoscanner": get_pdf_info,
        "textreader": get_pdf_text,
    },
]
```
```{python}
# Hand the pickle entry of the config to get_filename() and echo the result,
# which is the name later given to load_pickle() and save_pickle().
pickle_location = config.get("locations", {}).get("pickle", {})
picklename = get_filename(f=pickle_location)
print(picklename)
```
```{python}
def scanFolder(location):
    """Yield the files under a location that still need to be processed.

    The location's ``root`` and ``folders`` are joined into a single directory
    (with ``~`` expanded), the directory is printed, and it is then walked with
    ``scan_files``. A file is skipped when the module-level ``files`` index
    already holds a record for its full path and that record contains
    ``doctext``; every other file is yielded.

    Args:
        location: dict with optional keys ``root`` (default ``"."``) and
            ``folders`` (default ``[]``, a list of path components).

    Yields:
        The records produced by ``scan_files``; each must provide ``folder``
        and ``file`` entries, which are joined to form the lookup path.

    Note:
        Relies on the global ``files`` mapping being defined before the
        generator is consumed.
    """
    root = location.get("root", ".")
    folders = location.get("folders", [])
    folder = os.path.expanduser(os.path.join(root, *folders))
    print(folder)
    for f in scan_files(folder, options={"stats": True, "split": True}):
        filepath = os.path.join(f.get("folder"), f.get("file"))
        known = files.get(filepath)
        # Only a dict record that already carries extracted text counts as
        # done; a missing or malformed record is simply scanned again.
        if isinstance(known, dict) and 'doctext' in known:
            continue
        yield f
```
```{python}
def scan(locations, filetypes=None):
    """Scan every location and yield one enriched record per file.

    Each record coming from ``scanFolder`` gets a ``hashes`` entry from
    ``make_hash``. When the record's ``ext`` matches one of the configured
    filetypes, the filetype's ``type`` is copied onto the record and its
    ``infoscanner`` / ``textreader`` callables (when set) are invoked with the
    file path, storing their results under ``docinfo`` / ``doctext``.

    A failing scanner or reader does not stop the scan: the error is printed
    and the record is yielded without that entry.

    Args:
        locations: iterable of location dicts accepted by ``scanFolder``.
        filetypes: optional list of dicts with ``type``, ``ext`` (list of
            extensions), ``infoscanner`` and ``textreader`` entries.

    Yields:
        The record dict of every scanned file, matched or not.
    """
    # Fields copied verbatim from the matching filetype onto the record.
    copied_fields = ("type",)
    # (filetype key holding the callable, record key receiving its result)
    extractors = (("infoscanner", "docinfo"), ("textreader", "doctext"))

    # Pivot the filetype list into an extension -> filetype lookup.
    lookups = {}
    for filetype in filetypes or []:
        for ext in filetype.get("ext", []):
            lookups[ext] = filetype

    for location in locations:
        for f in scanFolder(location):
            path = os.path.join(f.get("folder"), f.get("file"))
            f["hashes"] = make_hash(path)
            matched = lookups.get(f.get("ext"))
            if matched is not None:
                for field in copied_fields:
                    f[field] = matched.get(field)
                for source_key, target_key in extractors:
                    extractor = matched.get(source_key)
                    if not extractor:
                        continue
                    try:
                        f[target_key] = extractor(path)
                    except Exception as ex:
                        # Best effort: report the failure and keep scanning.
                        print("ERROR {}: {}\n\t{}".format(source_key, path, ex))
            yield f
```
```{python}
# Load the indexes saved under picklename and split them into the three
# lookups filled by the scan loop:
#   docs  - hash value -> list of file records
#   names - file name  -> list of file records
#   files - full path  -> file record
# NOTE(review): `or {}` assumes load_pickle may return None/empty when no
# pickle exists yet - confirm against utils.load_pickle.
data = load_pickle(picklename=picklename) or {}
docs = data.get("docs", {})
names = data.get("names", {})
files = data.get("files", {})
```
```{python}
def _entry_path(entry):
    """Return the full path stored on an index record, or None for non-dict entries."""
    if not isinstance(entry, dict):
        return None
    return os.path.join(entry.get("folder") or "", entry.get("file") or "")


def _index_record(index, key, record, path):
    """File `record` under index[key], replacing any earlier record for the same path."""
    bucket = index.setdefault(key, [])
    # Records that are yielded again on a later run must not pile up as
    # duplicates of themselves, so drop the stale entry for this path first.
    bucket[:] = [entry for entry in bucket if _entry_path(entry) != path]
    bucket.append(record)


# Merge every scanned file into the path, hash and name indexes.
for f in scan(locations=locations, filetypes=filetypes):
    filepath = os.path.join(f.get("folder", ""), f.get("file", ""))
    if filepath in files:
        # The record is replaced below, not skipped, so say so.
        print("rescanning file {}".format(filepath))
    files[filepath] = f
    try:
        # build a hash index
        hashvalue = f.get('hashes', {}).get('SHA1')
        _index_record(docs, hashvalue, f, filepath)
    except Exception as ex:
        print("ERROR HASH: {}\n\t{}".format(filepath, ex))
    try:
        # build a name index
        name = f.get("file")
        _index_record(names, name, f, filepath)
    except Exception as ex:
        print("ERROR Name: {}\n\t{}".format(filepath, ex))
```
```{python}
# Print the extracted text of every file record that has any.
# NOTE(review): `phrase` is defined but not used to filter the output -
# confirm whether only documents containing it should be printed.
phrase = "Enterprise Architecture"
for k, f in files.items():
    if isinstance(f, dict):
        # Read the text before testing it; testing first raised NameError on
        # a fresh kernel (or reused a stale value from an earlier cell run).
        doctext = f.get("doctext")
        if doctext:
            print("{}\n\t{}".format(k, doctext))
```
```{python}
# Write the three indexes back under the same name they were loaded from.
payload = {"docs": docs, "names": names, "files": files}
save_pickle(picklename=picklename, data=payload)
```
```{python}
# Look up a single document record.
filename = "ARC - Potential Transition.docx"
f = files.get(filename)
if f is None:
    # `files` is keyed by full path, so a bare file name normally misses;
    # fall back to the name index and take its first record, if any.
    matches = names.get(filename) or []
    f = matches[0] if matches else None
print(f)
```
```{python}
# docs has one key per hash value, so this is the number of distinct hashes indexed
print(len(docs))
```
```{python}
# Report every hash that is shared by more than one file record, followed by
# the full path of each record filed under it.
for idx, (k, v) in enumerate(docs.items()):
    if not isinstance(v, list) or len(v) <= 1:
        continue
    print("{} => {} {}".format(idx, k, len(v)))
    for f in v:
        filepath = os.path.join(f.get("folder",""), f.get("file",""))
        print("{}".format(filepath))
```
```{python}
```