#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""diskover - Elasticsearch file system crawler
diskover is a file system crawler that indexes
your file metadata into Elasticsearch.
See README.md or https://github.com/shirosaidev/diskover
for more information.
Copyright (C) Chris Park 2017-2019
diskover is released under the Apache 2.0 license. See
LICENSE for the full license text.
"""
from diskover import config
from datetime import datetime
try:
    from urllib.parse import quote
except ImportError:
    from urllib import quote
import sys
import requests
from requests.exceptions import HTTPError
import json
import dateutil.parser as dp


def _url(path):
    return config['api_url'] + path


def api_connection():
    """Connect to the file system storage api and return a requests session,
    or None if no api_url is configured. Exits if the api can't be reached.
    """
    api_url = config['api_url']
    if api_url == "":
        return None
    api_user = config['api_user']
    api_password = config['api_password']
    ses = requests.Session()
    adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=3)
    if "https" in api_url:
        p = "https"
    else:
        p = "http"
    ses.mount(p, adapter)
    if api_user != "" and api_password != "":
        ses.auth = (api_user, api_password)
    # check connection to api
    try:
        resp = ses.get(api_url, verify=False)
        resp.raise_for_status()
    except HTTPError as http_err:
        print("Error connecting to storage api, exiting (%s)" % http_err)
        sys.exit(1)
    except Exception as err:
        print("Error connecting to storage api, exiting (%s)" % err)
        sys.exit(1)
    else:
        return ses


def api_stat(path, ses):
    """Stat path using the storage api and return os.stat style fields."""
    url = _url('/files/' + quote(path.encode('utf-8'), safe='/'))
    resp = ses.get(url, verify=False)
    d = json.loads(resp.text)
    uid = d['uid']
    gid = d['gid']
    ctime = dp.parse(d['creationTime']).timestamp()
    atime = dp.parse(d['lastAccessTime']).timestamp()
    mtime = dp.parse(d['lastModifiedTime']).timestamp()
    nlink = d['numLinks']
    ino = d['inode']
    size = d['size']
    mode = 0
    dev = 0
    return mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime
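
# Illustrative note (not in the original source): the tuple returned by
# api_stat() mirrors the field order of os.stat_result, so a caller could
# unpack it like this hypothetical snippet:
#
#   mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime = api_stat(path, ses)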


def api_listdir(path, ses):
    dirs = []
    nondirs = []
    page = 1
    # get path metadata
    path_metadata = api_stat(path, ses)
    root = (path, path_metadata)
    # get all pages of file lists
    while True:
        url = _url('/files/' + quote(path.encode('utf-8'), safe='/') + '/_children?page=%s&pageSize=%s' % (page, config['api_pagesize']))
        resp = ses.get(url, verify=False)
        if resp.status_code == 200:
            try:
                items = json.loads(resp.text)['_embedded']['children']
                for d in items:
                    if d['isDirectory'] and not d['isSymbolicLink']:
                        dirs.append(
                            (
                                d['fullPath'],
                                (
                                    0,  # mode
                                    d['inode'],
                                    0,  # dev
                                    d['numLinks'],
                                    d['uid'],
                                    d['gid'],
                                    d['size'],
                                    dp.parse(d['lastAccessTime']).timestamp(),
                                    dp.parse(d['lastModifiedTime']).timestamp(),
                                    dp.parse(d['creationTime']).timestamp()
                                )
                            )
                        )
                    elif d['isRegularFile'] and not d['isSymbolicLink']:
                        nondirs.append(
                            (
                                d['fullPath'],
                                (
                                    0,  # mode
                                    d['inode'],
                                    0,  # dev
                                    d['numLinks'],
                                    d['uid'],
                                    d['gid'],
                                    d['size'],
                                    dp.parse(d['lastAccessTime']).timestamp(),
                                    dp.parse(d['lastModifiedTime']).timestamp(),
                                    dp.parse(d['creationTime']).timestamp(),
                                    0  # blocks
                                )
                            )
                        )
            except KeyError:
                # no items
                break
            finally:
                # check if there is a next page or if on last page
                try:
                    url_next = json.loads(resp.text)['_links']['next']['href']
                except KeyError:
                    # no more pages
                    break
                finally:
                    page = page + 1
        else:
            break
    return root, dirs, nondirs
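
# Illustrative note (not in the original source): api_listdir() returns a
# 3-tuple where each entry pairs a full path with an api_stat style tuple,
# e.g. for a hypothetical '/some/path':
#
#   root, dirs, nondirs = api_listdir('/some/path', ses)
#   # root    -> ('/some/path', (mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime))
#   # dirs    -> list of (fullPath, stat tuple) for subdirectories
#   # nondirs -> list of (fullPath, stat tuple plus trailing blocks) for regular files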


def api_add_diskspace(es, index, path, ses, logger):
    """Get disk space info from the storage api and add it to the es index."""
    url = _url('/metadata')
    resp = ses.get(url, verify=False)
    d = json.loads(resp.text)
    total = int(d['totalSpace'])
    free = int(d['unallocatedSpace'])
    available = int(d['usableSpace'])
    used = total - free
    #fstype = d['type']
    indextime_utc = datetime.utcnow().isoformat()
    data = {
        "path": path,
        "total": total,
        "used": used,
        "free": free,
        "available": available,
        "indexing_date": indextime_utc
    }
    logger.info('Adding disk space info to es index')
    es.index(index=index, doc_type='diskspace', body=data)
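

# Minimal usage sketch (an assumption, not part of the original module): how
# diskover's crawler might drive these helpers together. The es, index and
# logger objects are placeholders supplied by the caller, and '/some/path'
# is a hypothetical path on the storage api.
#
#   ses = api_connection()
#   if ses:
#       root, dirs, nondirs = api_listdir('/some/path', ses)
#       api_add_diskspace(es, index, '/some/path', ses, logger)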