-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfiletype.py
80 lines (57 loc) · 1.89 KB
/
filetype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import magic
import csv
import xml.sax
# Maps a substring of the description returned by ``m.from_file`` to the
# short type label that get_type() reports. get_type() walks the entries in
# this order and returns the label of the first key found in the description.
TYPE_MAPPING = {
    "gzip": "gz",
    "bzip2": "bz2",
    "Zip": "zip",
    "RAR": "rar",
    "POSIX tar": "tar",
}

# Type labels that is_compression_by_type() reports as compression formats.
COMPRESSION = ["gz", "bz2"]

# Type labels that is_archived_by_type() reports as archive formats.
ARCHIVED = ["zip", "rar", "tar"]

# Module-wide detector instance that get_type() uses to describe files.
m = magic.Magic()
def get_type(fname):
    """Return a short type label for the file at *fname*.

    The label is decided in this order:

    1. ``'xlsx'`` when the name ends in ``.xlsx``/``.xls`` (case-insensitive)
       or the magic description contains ``'Excel'``.
    2. The ``TYPE_MAPPING`` value of the first key that occurs in the magic
       description.
    3. For descriptions containing ``'text'``: ``'xml'`` if the file parses
       as XML, ``'csv'`` if it has more than two lines and the csv sniffer
       recognises a dialect, otherwise ``'txt'``.
    4. The raw magic description for everything else.

    Errors raised while describing or opening the file propagate to the
    caller.
    """
    ftype = m.from_file(fname)
    if fname.lower().endswith(('.xlsx', '.xls')) or 'Excel' in ftype:
        return 'xlsx'
    for key in TYPE_MAPPING:
        if key in ftype:
            return TYPE_MAPPING[key]
    if 'text' not in ftype:
        return ftype
    # solutions here from http://stackoverflow.com/questions/9084228/python-to-check-if-a-gzipped-file-is-xml-or-csv
    # and http://stackoverflow.com/questions/2984888/check-if-file-has-a-csv-format-with-python
    if _is_xml(fname):
        return 'xml'
    if _looks_like_csv(fname):
        return 'csv'
    return 'txt'


def _is_xml(fname):
    """Return True when the whole file at *fname* parses as XML."""
    # A dedicated handle is used for this probe: the SAX parser may close the
    # stream it is handed, so the handle must not be reused afterwards.
    with open(fname, 'rb') as fh:
        try:
            xml.sax.parse(fh, xml.sax.ContentHandler())
        except Exception:
            # Best-effort probe: any parse failure just means "not XML".
            # Exception (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            return False
    return True


def _looks_like_csv(fname):
    """Return True when *fname* has more than two lines and sniffs as CSV."""
    with open(fname, 'rb') as fh:
        # if line count is less than 2, csv type check will not be accurate
        # so the file is not reported as csv
        linecount = 0
        for _ in fh:
            linecount += 1
            if linecount > 2:
                break
        if linecount <= 2:
            return False
        fh.seek(0)
        sample = fh.read(1024)
    # csv.Sniffer works on text. Where the binary read yields bytes that are
    # not str, decode them first; latin-1 maps every byte to a code point, so
    # decoding cannot fail and ASCII delimiters are preserved.
    if not isinstance(sample, str):
        sample = sample.decode('latin-1')
    try:
        csv.Sniffer().sniff(sample)
    except csv.Error:
        return False
    return True
def is_compression(fname):
    """Return True when the detected type of the file *fname* is a compression format."""
    return is_compression_by_type(get_type(fname))
def is_compression_by_type(ftype):
    """Return True when the type label *ftype* is listed in COMPRESSION."""
    return ftype in COMPRESSION
def is_archived(fname):
    """Return True when the detected type of the file *fname* is an archive format."""
    return is_archived_by_type(get_type(fname))
def is_archived_by_type(ftype):
    """Return True when the type label *ftype* is listed in ARCHIVED."""
    return ftype in ARCHIVED