-
Notifications
You must be signed in to change notification settings - Fork 0
/
pyfoca.py
585 lines (559 loc) · 27.3 KB
/
pyfoca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
#!/usr/bin/python
#
#############################################################################################################
# #
# The reason behind creating this script is the issues #
# I've experienced in the FOCA application such as with downloading files. #
# Additionally, taking screen captures for reports can turn into three and #
# four captures depending on what metadata was gathered. Therefore, this script #
# can perform some of the same functions as FOCA, and outputs the data into a nicely #
# formatted table. #
# #
# For those of you who have already used FOCA in the past to perform metadata #
# extraction, you can export the data you've received (e.g., user, software, printers, folders, etc.) #
# into a directory, and have this script #
# parse those files and print them to a table. This will make the screenshot look much better. #
# #
# Author: Alton Johnson <alton@vonahi.io> #
# Updated: 06/06/2013 #
# Version: 1.6 #
# #
#############################################################################################################
import commands
import getopt
import httplib
import os
import re
import shutil
import socket
import subprocess
import sys
import time
import urllib2
from sys import argv
# Timestamp taken at import; the script entry point subtracts it from the
# current time to report how long the run took.
curr_time = time.time()
# Module-level counters that metaparser updates through `global` statements.
totalFiles = extractedFrom = 0
class colors:
    """ANSI escape sequences used to colour terminal output."""

    # Bold foreground colours.
    white = "\033[1;37m"
    red = "\033[1;31m"
    green = "\033[1;32m"
    blue = "\033[1;34m"
    # Resets the terminal back to its default attributes.
    normal = "\033[0;00m"
# pyPdf is an optional dependency: without it the script still runs, but PDF
# documents cannot be parsed.
try:
    import pyPdf
    from pyPdf import PdfFileReader
except ImportError:
    # Keep the name bound so later code fails in a controlled way instead of
    # raising NameError the first time a PDF is encountered.
    PdfFileReader = None
    print(colors.red + " Warning: To obtain maximum data from PDF documents, it's highly recommended that you install the pyPDF python module.")
    print(" pyPDF can be downloaded from http://pybrary.net/pyPdf/" + colors.normal)
# Title block printed by help() and metaparser.grabMeta(): the program
# name/version line in bold white, framed by two 79-character rules.
banner = "".join([
    '\n ',
    "-" * 79,
    colors.white,
    '\n pyfoca v1.6 - Document Metadata Extractor, Alton Johnson (alton@vonahi.io)\n ',
    colors.normal,
    "-" * 79,
    "\n",
])
class metaparser:
    """Gather document metadata and print it as a text table.

    One input source is used per run. grabMeta() checks them in this order:
    a directory of FOCA export files (report_dir), a directory of documents
    (workingDir), a single document (fileName), or documents found through
    Google searches for a domain (domainName).

    Rows are accumulated in self.container as lists of cell strings, where
    "-" means "no value"; printMeta() renders them.
    """

    # Directory (relative to the current directory) that receives downloads.
    download_dir = 'pyfoca-downloads'

    def __init__(self, fileName, workingDir, domainName, pageResults,exts,report_dir,del_files,verbose):
        """Store the run options and prepare the per-column bookkeeping.

        fileName, workingDir, domainName, report_dir -- input sources; an
            empty string means "not requested".
        pageResults -- number of Google result pages to request per extension.
        exts        -- list of extensions (without the dot) to look for.
        del_files   -- delete the download directory after printing.
        verbose     -- print per-file status messages.
        """
        self.fileName = fileName
        self.container = []
        # Column headers for document mode and for FOCA report mode.
        self.top_row = [' | File Name','Creation Date','Author','Produced By','Modification Date','Last Saved By']
        self.top_rowf = ['Folders','Operating System(s)','Printers','Software','Users','Emails']
        self.domainName = domainName
        self.workingDir = workingDir
        self.pageResults = pageResults
        self.totalSuccess = 0
        self.exts = exts
        self.report_dir = report_dir
        self.del_files = del_files
        self.verbose = verbose
        # Filled by grabMeta() in report mode and read by printMeta();
        # initialised here so printMeta() never hits a missing attribute.
        self.foca_filetypes = []
        self.foca_files = None
        if self.report_dir == "":
            columns = len(self.top_row)
        else:
            columns = len(self.top_rowf)
        # offset[i]: printed width of column i.
        # data_exists[i]: 1 once any row has a value other than "-" there.
        self.offset = [0] * columns
        self.data_exists = [0] * columns

    # ------------------------------------------------------------------
    # small pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _shorten(name, head, tail):
        """Abbreviate name to its first head and last tail characters.

        The name is returned unchanged when abbreviating would not make it
        shorter, so mid-length names are never padded with repeated text.
        """
        if len(name) <= head + 3 + tail:
            return name
        return name[:head] + "..." + name[-tail:]

    @staticmethod
    def _collapse_spaces(text):
        """Replace every run of two or more spaces with a single space."""
        return re.sub(r' {2,}', ' ', text)

    @staticmethod
    def _human_size(num_bytes):
        """Format a byte count with a K/M/G/T suffix, similar to `ls -lh`."""
        size = float(num_bytes)
        if size < 1024.0:
            return str(int(size))
        for unit in ("K", "M", "G", "T"):
            size /= 1024.0
            if size < 1024.0 or unit == "T":
                return "%.1f%s" % (size, unit)

    @staticmethod
    def _format_pdf_date(raw):
        """Turn a PDF date such as D:20130606143000 into MM/DD/YYYY hh:mm AM/PM.

        Raises ValueError when the hour/minute digits are missing or invalid.
        """
        data = raw.strip("D:|'")
        clock = time.strptime(data[8:10] + ":" + data[10:12], "%H:%M")
        return data[4:6] + "/" + data[6:8] + "/" + data[0:4] + " " + time.strftime("%I:%M %p", clock)

    @staticmethod
    def _format_extract_date(line):
        """Format the date found on one line of `extract -V` output.

        The slicing is kept exactly as in the original implementation.
        NOTE(review): the minute slice ends at rfind(":") - 1, which looks
        like it may cut a digit for HH:MM:SS values -- confirm against real
        `extract` output before changing it.
        Raises ValueError when no HH:MM value can be parsed.
        """
        start = line.find('-') + 2
        year = line[start:start + 4]
        month_day_at = line.find(year) + 5
        month_day = line[month_day_at:month_day_at + 5].replace("-", "/")
        clock = line[line.find(":") - 2:line.rfind(":") - 1]
        clock = time.strftime("%I:%M %p", time.strptime(clock, "%H:%M"))
        return month_day + "/" + year + " " + clock

    @staticmethod
    def _resolve_path(path):
        """Return a path usable with open(), undoing shell escaping if needed.

        Callers historically passed paths with spaces and parentheses
        backslash-escaped. If the path does not exist as given but its
        unescaped form does, the unescaped form is returned.
        """
        if os.path.exists(path):
            return path
        plain = path.replace("\\ ", " ").replace("\\(", "(").replace("\\)", ")")
        if os.path.exists(plain):
            return plain
        return path

    def _wanted(self, name):
        """Tell whether name's extension starts with a requested extension.

        Prefix matching keeps the old behaviour where "doc" also selects
        .docx and "ppt" selects .pptx, without matching unrelated names such
        as "docs_list.txt".
        """
        suffix = os.path.splitext(name)[1][1:].lower()
        if not suffix:
            return False
        return any(suffix.startswith(ext.lower()) for ext in self.exts)

    # ------------------------------------------------------------------
    # per-document extraction
    # ------------------------------------------------------------------
    def _read_pdf(self, path):
        """Return a table row for a PDF, or None when it has no info dictionary."""
        created = author = producer = modded = last_saved = '-'
        # The reader resolves objects lazily, so keep the file open while
        # reading and let the with-block close it afterwards.
        with open(path, 'rb') as stream:
            pdfFile = PdfFileReader(stream)
            if pdfFile.getIsEncrypted():
                pdfFile.decrypt('')
            docInfo = pdfFile.getDocumentInfo()
            if not docInfo:
                return None
            if "/CreationDate" in docInfo:
                created = self._format_pdf_date(docInfo["/CreationDate"])
            if "/Author" in docInfo:
                author = docInfo["/Author"] + " "
                if len(author) <= 1:
                    author = "-"
            if "/Producer" in docInfo:
                # Remove the literal "(Windows)" marker; str.strip() would
                # treat it as a set of characters and eat real letters.
                producer = docInfo["/Producer"].replace("(Windows)", "")
                producer = re.sub(r'[^\w]', ' ', producer)
                producer = self._collapse_spaces(producer).strip() or "-"
            if "/ModDate" in docInfo:
                modded = self._format_pdf_date(docInfo["/ModDate"])
        name = self._shorten(os.path.basename(path), 15, 13)
        # Trim long values so the table stays readable.
        if len(producer) > 30:
            producer = producer[:20] + " [snipped] "
        if len(author) > 20:
            author = author[:20] + " [snipped] "
        return [" | " + name, created, author, producer, modded, last_saved]

    def _read_with_extract(self, path):
        """Return a table row built from `extract -V`, or None if nothing was found."""
        created = author = producer = modded = last_saved = '-'
        try:
            # An argument list (no shell) keeps odd or hostile file names
            # from being interpreted as shell syntax.
            proc = subprocess.Popen(['extract', '-V', path],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
        except OSError:
            print(colors.red + " Error: This script requires the extract command.")
            print(" Please install extract by typing 'apt-get install extract' in terminal.\n" + colors.normal)
            sys.exit()
        output = proc.communicate()[0].split('\n')
        for line in output:
            if "creator" in line:
                author = re.sub(r'\W', ' ', line[line.find("-") + 2:])
                author = self._collapse_spaces(author).lstrip(" ") or "-"
            elif "date" in line and "creation" not in line:
                modded = self._format_extract_date(line)
            elif 'generator' in line:
                producer = line[line.find('-') + 2:]
            elif 'creation' in line:
                created = self._format_extract_date(line)
            elif 'last saved' in line:
                last_saved = line[line.find('-') + 2:]
        values = [created, author, producer, modded, last_saved]
        if all(value == "-" for value in values):
            return None
        name = self._shorten(os.path.basename(path), 9, 13)
        return [" | " + name] + values

    def processFile(self, curr_file):
        """Extract metadata from one document and append a row to the table.

        PDF files are read with pyPdf; everything else goes through the
        external `extract` tool. A document that cannot be parsed is skipped
        (best effort); the reason is printed only in verbose mode.
        """
        global extractedFrom
        try:
            if curr_file.lower().endswith(".pdf"):
                row = self._read_pdf(curr_file)
            else:
                row = self._read_with_extract(curr_file)
        except Exception as err:
            if self.verbose:
                print(colors.red + " [-] Skipping " + curr_file + ": " + str(err) + colors.normal)
            return
        if row is None:
            return
        self.container.append(row)
        extractedFrom = len(self.container)

    # ------------------------------------------------------------------
    # FOCA report parsing
    # ------------------------------------------------------------------
    def parseReport(self, folders_file, OS_file, printers_file, software_file, users_file, emails_file):
        """Merge the six FOCA value lists into table rows.

        Row n holds the n-th entry of every list; lists that are shorter
        than the longest one are padded with "-". "%20" in folder names is
        turned back into a space.
        """
        columns = [folders_file, OS_file, printers_file, software_file, users_file, emails_file]
        max_lines = max(len(column) for column in columns)
        for ln in range(max_lines):
            row = [column[ln] if ln < len(column) else "-" for column in columns]
            row[0] = row[0].replace("%20", " ")
            self.container.append(row)

    def _load_foca_reports(self):
        """Read every file below report_dir and build the report table."""
        folders_file = []
        OS_file = []
        printers_file = []
        software_file = []
        users_file = []
        emails_file = []
        # Checked in this order against the first line of each file.
        categories = [
            ("folders", folders_file),
            ("operating systems", OS_file),
            ("printers", printers_file),
            ("software", software_file),
            ("users", users_file),
            ("emails", emails_file),
        ]
        print(" Reading files...")
        for dirname, dirnames, filenames in os.walk(self.report_dir):
            for z in filenames:
                try:
                    with open(os.path.join(dirname, z)) as handle:
                        text = handle.read()
                except (IOError, OSError):
                    # Unreadable files are simply not part of the report.
                    continue
                lines = text.replace("\r", "").replace("\t", "").split('\n')
                file_contents = [line for line in lines if line != '']
                if not file_contents:
                    continue
                header = file_contents[0]
                if "Metadata" in header:
                    if len(file_contents) < 2:
                        continue
                    summary = file_contents[1]
                    self.foca_files = summary[summary.find("(") + 1:summary.find("/")]
                    # Lines starting with "." are per-extension statistics.
                    for entry in file_contents:
                        if entry[0] == ".":
                            self.foca_filetypes.append(entry)
                    continue
                for keyword, bucket in categories:
                    if keyword in header:
                        for entry in file_contents[1:]:
                            if entry not in bucket:
                                bucket.append(entry)
                        bucket.sort()
                        break
        if not any(bucket for keyword, bucket in categories):
            print(colors.red + " Error: There are no supported files within the specified directory.\n" + colors.normal)
            sys.exit()
        self.parseReport(folders_file, OS_file, printers_file, software_file, users_file, emails_file)

    # ------------------------------------------------------------------
    # local directory scanning
    # ------------------------------------------------------------------
    def _scan_directory(self):
        """Process every wanted file below workingDir; return how many were tried."""
        walked = False
        seen_any = False
        processed = 0
        for dirname, dirnames, filenames in os.walk(self.workingDir):
            walked = True
            for name in filenames:
                seen_any = True
                # Each file is handled once, even if several requested
                # extensions match it (e.g. "doc" and "docx").
                if self._wanted(name):
                    self.processFile(os.path.join(dirname, name))
                    processed += 1
        if walked and not seen_any:
            print(colors.red + " Error: There are no files within the specified directory.\n" + colors.normal)
            sys.exit()
        if processed == 0:
            print(colors.red + "\n Error: Sorry, no supported files were located within the specified directory. Please try another file or directory.\n" + colors.normal)
            sys.exit()
        return processed

    # ------------------------------------------------------------------
    # domain harvesting
    # ------------------------------------------------------------------
    def _search_google(self):
        """Return (urls, search_count) for documents Google lists for the domain."""
        conn = httplib.HTTPConnection('www.google.com')
        files = []
        total_count = 0
        for e in self.exts:
            url_pattern = re.compile(r"(?P<url>https?://[^:]+\.%s)" % e)
            for page in range(self.pageResults):
                conn.request("GET", "/search?q=site:" + self.domainName + "+ext:" + e + "&start=%s0" % str(page))
                contents = conn.getresponse().read()
                for url in url_pattern.findall(contents):
                    # URLs containing "..." were abbreviated by the results
                    # page and cannot be downloaded.
                    if "..." in url or url in files:
                        continue
                    files.append(url)
                total_count += 1
        return files, total_count

    def _harvest_domain(self):
        """Search for, then download, the domain's documents.

        Returns the local paths of the files that were downloaded.
        """
        print(" Domain: %s" % self.domainName)
        print(" Attempting to gather links from google searches...")
        files, total_count = self._search_google()
        if len(files) == 0:
            print(" No files were located within Google based on the extension(s) and domain you provided.\n")
            sys.exit()
        print(" Discovered " + str(len(files)) + " files from " + str(total_count) + " total google searches...")
        if not os.path.exists(self.download_dir):
            print(" Creating pyfoca-downloads folder...")
            os.makedirs(self.download_dir)
        # Width of the name column in the verbose status lines.
        spaces = 0
        for url in files:
            short_file = self._shorten(url[url.rfind("/") + 1:], 10, 10)
            if len(short_file) > spaces:
                spaces = len(short_file) + 3
        print(" Attempting to download files...")
        if not self.verbose:
            print(" Please wait...")
        print(" -------------------------------")
        downloaded = []
        total = len(files)
        for position, url in enumerate(files, 1):
            pdf_name = url[url.rfind("/") + 1:]
            # Computed before the download so a failure reports this file.
            short_file = self._shorten(pdf_name, 10, 10)
            target = os.path.join(self.download_dir, pdf_name)
            print(url)
            try:
                response = urllib2.urlopen(url)
                try:
                    source = response.read()
                finally:
                    response.close()
                # Documents are binary data.
                with open(target, 'wb') as write_file:
                    write_file.write(source)
                filesize = self._human_size(os.path.getsize(target))
            except Exception as err:
                if self.verbose:
                    print(" ".join([
                        colors.red + " [-] " + short_file,
                        "-" * (spaces - len(short_file)),
                        "fail",
                        "[%s of %s]" % (position, total) + colors.normal,
                    ]))
                continue
            downloaded.append(target)
            if self.verbose:
                print("")
                print(" ".join([
                    colors.blue + " [+] " + short_file,
                    "-" * (spaces - len(short_file)),
                    "success",
                    "[%s of %s] [size: %s]" % (position, total, filesize) + colors.normal,
                ]))
        print("")
        return downloaded

    def grabMeta(self):
        """Collect metadata from whichever input source was configured."""
        global totalFiles
        print(banner)
        self.foca_filetypes = []
        if self.report_dir:
            self._load_foca_reports()
        elif self.workingDir != "":
            totalFiles += self._scan_directory()
        elif self.fileName != "":
            self.processFile(self._resolve_path(self.fileName))
            totalFiles += 1
        elif self.domainName != "":
            downloaded = self._harvest_domain()
            totalFiles = len(downloaded)
            for path in downloaded:
                self.processFile(path)

    # ------------------------------------------------------------------
    # output
    # ------------------------------------------------------------------
    def printMeta(self):
        """Print the collected rows as a table, dropping columns without data."""
        report_mode = self.report_dir != ""
        headers = self.top_rowf if report_mode else self.top_row
        if report_mode:
            print(" User specified option for \"FOCA\" text file parsing. Printing details...")
            print(" ----------------------------------------------------------------------")
            for i in self.foca_filetypes:
                print(" Total " + i[:i.find("(")-1] + " files: " + i[i.find("(")+1:i.find(")")])
            print("")
        # Never print a header row above an empty table.
        if len(self.container) == 0:
            print(colors.red + " Either no data was found on Google, or there were issues opening the documents.")
            print(colors.red + " Ensure that the 'extract' tool is installed by running 'sudo apt-get install extract'\n" + colors.normal)
            sys.exit()
        # Mark every column that holds at least one real value ...
        for row in self.container:
            for num, cell in enumerate(row):
                if cell != "-":
                    self.data_exists[num] = 1
        # ... and drop the others from headers, widths and rows (in place).
        keep = [num for num, flag in enumerate(self.data_exists) if flag]
        headers[:] = [headers[num] for num in keep]
        self.offset = [self.offset[num] for num in keep]
        for row in self.container:
            row[:] = [row[num] for num in keep]
        self.data_exists = [1] * len(keep)
        # The first visible column carries the left table border.
        for row in self.container:
            if row and "|" not in row[0]:
                row[0] = " | " + row[0]
        if headers and "|" not in headers[0]:
            headers[0] = " | " + headers[0]
        # Column widths: widened to len + 1 whenever a cell or header is
        # longer than the current width.
        for row in self.container:
            for num, cell in enumerate(row):
                if len(cell) > self.offset[num]:
                    self.offset[num] = len(cell) + 1
        for num, title in enumerate(headers):
            if len(title) > self.offset[num]:
                self.offset[num] = len(title) + 1
        top_bottom_lines = " " + "-" * (sum(self.offset) + len(headers) + len(headers) - 2)
        print(top_bottom_lines)
        print(" ".join(title + " " * (self.offset[num] - len(title)) + "|"
                       for num, title in enumerate(headers)))
        print(top_bottom_lines)
        for row in self.container:
            print(" ".join(cell + " " * (self.offset[num] - len(cell)) + "|"
                           for num, cell in enumerate(row)))
        # The closing rule is printed once, after the last row, even when
        # an earlier row has identical content.
        print(top_bottom_lines + "\n")
        print(" " + "--" * 5)
        if not report_mode:
            if self.del_files:
                print(" Deleting pyfoca-downloads folder...")
                shutil.rmtree(self.download_dir, ignore_errors=True)
            print(" Extracted data from %s file(s)." % str(len(self.container)))
        else:
            # foca_files stays None when no FOCA "Metadata" summary was read.
            count = self.foca_files if self.foca_files is not None else "an unknown number of"
            print(" Extracted data from %s file(s)." % count)
def help():
    """Print the banner and the usage text, then terminate the program."""
    usage_lines = [
        banner,
        " Usage: ./pyfoca.py <OPTIONS> \n",
        colors.green + " Domain options:\n" + colors.normal,
        "\t -d <domain>\t\tHarvests all documents from a domain (saves to pyfoca-downloads/). \n\t\t\t\tAfterwards, extract metadata.",
        colors.green + "\n Parse file/dir:\n" + colors.normal,
        "\t -f <file>\t\tExtracts metadata specifically from one file. (Cannot use with '-d')",
        "\t -w <dir>\t\tExtracts metadata from files within specified directory. (Cannot use with '-d')",
        colors.green + "\n Foca Export Parsing:\n" + colors.normal,
        "\t -r <directory>\t\tParses data exported from FOCA. Provide directory containing exported files.",
        colors.green + "\n Misc:\n" + colors.normal,
        "\t -x\t\t\tAfter parsing metadata, delete files downloaded from the domain.",
        "\t -e <pdf|doc|xls|all>\tSearch based on provided extension(s). Separate with comma. (Default is all.) ",
        "\t -p <number>\t\tSearches x amount of google pages (per extension). (Default is 2.)",
        "\t -t <secs>\t\tSets timeout value. (Default is 5.)",
        "\t -v\t\t\tPrints status messages for files that are downloaded.",
        "\n Supported extensions are: .pdf, .doc, .docx, .xls, .xlsx, and .ppt",
        " Example: ./pyfoca.py -d www.domain.com -e pdf,doc -p 3\n",
    ]
    # One print per entry, so every entry ends with its own newline.
    for text in usage_lines:
        print(text)
    exit()
def main(argv):
    """Parse the command line, validate it and run the extraction.

    argv is the list of command-line arguments without the program name.
    Any usage problem prints an error followed by the help text, and
    help() terminates the program.
    """
    def usage_error(*lines):
        # Print the message in red, then the usage text (help() exits).
        print(colors.red + "\n Error: " + "\n".join(lines) + colors.normal)
        help()

    if len(argv) < 2:
        help()
    try:
        opts, _ = getopt.getopt(argv, 'vxf:d:r:w:p:t:e:')
    except getopt.GetoptError:
        help()
    fileName = ''
    workingDir = ''
    domainName = ''
    pageResults = 2
    verbose = False
    socket.setdefaulttimeout(5)
    exts = ['all']
    supported_exts = ['all','pdf','doc','docx','xls','xlsx','ppt']
    report_dir = ''
    del_files = False
    for opt, arg in opts:
        if opt == "-f":
            fileName = arg
        elif opt == "-w":
            workingDir = arg
        elif opt == "-d":
            domainName = arg
        elif opt == "-p":
            try:
                pageResults = int(arg)
            except ValueError:
                usage_error("The '-p' option requires a whole number.")
        elif opt == "-t":
            try:
                socket.setdefaulttimeout(float(arg))
            except ValueError:
                usage_error("The '-t' option requires a number of seconds.")
        elif opt == "-e":
            # Normalise so the values validated below are the ones used later.
            exts = [ext.strip().lower() for ext in arg.split(',')]
        elif opt == "-r":
            report_dir = arg
        elif opt == "-x":
            del_files = True
        elif opt == "-v":
            verbose = True
    # The input sources are mutually exclusive. A working directory combined
    # with a domain or a file is already caught by the first two checks.
    if domainName != "" and (fileName != "" or workingDir != ""):
        usage_error("You have provided a domain name, yet you also have provided a file and/or working directory.",
                    " You can only use the domain name option by itself.")
    if fileName != "" and workingDir != "":
        usage_error("You have provided a file name, yet you also have provided a working directory and/or domain name.",
                    " You can only use the file name option by itself.")
    if report_dir and (workingDir != "" or fileName != "" or domainName != ""):
        usage_error("You've enabled report mode. Therefore, you can only provide a directory with files exported from FOCA.",
                    " It appears that you've enabled report mode, along with some other options (e.g., directory, file name, domain name, etc.).",
                    " Please check your options and try again.")
    if del_files and domainName == "":
        usage_error("You've provided the '-x' option when you have no domain name specified. Please check your options.")
    for ext in exts:
        if ext not in supported_exts:
            usage_error("You've provided an unsupported extension. Please try again.")
    if fileName != "":
        # Fail early, with the OS error text, when the file cannot be opened.
        try:
            with open(fileName):
                pass
        except IOError as err:
            print(colors.red + "\n Error: " + str(err) + "\n" + colors.normal)
            sys.exit()
        # The name is passed on unmodified: shell-escaping it here would
        # make it differ from the path that was just validated.
    if "all" in exts:
        exts = supported_exts[1:]
    startparse = metaparser(fileName, workingDir, domainName, pageResults,exts,report_dir,del_files,verbose)
    startparse.grabMeta()
    startparse.printMeta()
if __name__ == "__main__":
    try:
        main(argv[1:])
    except KeyboardInterrupt:
        print("\n Exiting. Interrupted by user (ctrl-c).")
        if os.path.exists('pyfoca-downloads'):
            try:
                del_folder = raw_input(" Remove pyfoca-downloads folder? [Y/n] ")
            except (EOFError, KeyboardInterrupt):
                # No answer could be read: keep the downloads.
                del_folder = "n"
            # Default is "yes"; only an answer starting with n/N keeps the folder.
            if not del_folder.strip().lower().startswith("n"):
                shutil.rmtree('pyfoca-downloads', ignore_errors=True)
        print("")
        sys.exit()
    # Elapsed time since the module was loaded.
    print(" Completed in: %.1fs\n" % (time.time() - curr_time))