p4p-cachegroom.py
#!/usr/bin/env python
"""
This program scans a Perforce versioned file repository as
maintained by a Perforce proxy, in order to keep its size
under control. It deletes the files with the least recent
access times to bring the number of files or the total disk
space used down to a given limit.

It can be run once a day from a cron job.
"""
from __future__ import print_function

import argparse
import datetime
import logging
import os
import os.path
import re
from bisect import bisect_left

logger = logging.getLogger(__name__)


def find_versioned_files(root):
    """
    Look for versioned files in the folder, those ending with known
    Perforce endings for such files. Return them as a list of
    (access time, filesize, pathname) tuples.
    """
    entries = []
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not is_versioned(filename):
                continue
            filepath = os.path.join(dirpath, filename)
            s = os.stat(filepath)
            entries.append((s.st_atime, s.st_size, filepath))
    return entries


# text files end with ,v.
# individual binary files are in folders ending with ,d and
# have names like 1.n where n is the version number.
# all can have a .gz tacked on.
BIN = re.compile(r"1\.\d+")


def is_versioned(filename):
    if filename.endswith(",v") or filename.endswith(".gz"):
        return True
    if BIN.match(filename):
        return True
    return False
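
# Illustrative examples (hypothetical filenames, not from the repository):
#   is_versioned("foo.c,v")    -> True   (RCS-style text revision file)
#   is_versioned("1.42")       -> True   (binary revision inside a ,d folder)
#   is_versioned("1.42.gz")    -> True   (compressed revision)
#   is_versioned("README.txt") -> False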


def sum_size(files):
    """convenience function to add up the sizes of files"""
    return sum(t[1] for t in files)


def cumulative_sum(values):
    """create a cumulative sum, e.g. of file sizes"""
    # a plain running total; this also yields nothing for an empty input
    # instead of raising StopIteration inside the generator
    total = 0
    for v in values:
        total += v
        yield total
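
# Illustrative example:
#   list(cumulative_sum([3, 1, 4]))  ->  [3, 4, 8]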


def find_size_limit_old(files, limit_size):
    """
    Find the place in the list where the size limit is exceeded.
    (Legacy linear scan, superseded by find_size_limit below; unused.)
    """
    size = sum_size(files)
    for i, f in enumerate(files):
        if size <= limit_size:
            return i  # we found the cut-off point
        size -= f[1]
    return len(files)  # everything must go (also handles an empty list)


def find_size_limit(cumulative_sizes, limit_size):
    """
    Using cumulative sizes, find the range of old files to throw away.
    Return the first index that we keep.
    """
    # what is the cumulative amount we need to throw away?
    target = cumulative_sizes[-1] - limit_size
    if target <= 0:
        return 0  # already within the limit, keep everything
    # find the first index whose cumulative size covers the excess
    return find_ge(cumulative_sizes, target) + 1
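
# Illustrative example: sizes [5, 3, 2] (oldest first) give cumulative
# sums [5, 8, 10]; with limit_size=4 the excess is 6, find_ge returns
# index 1 (8 >= 6), so index 2 is the first file kept.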


def find_atime_limit(files, atime):
    """
    Find the first index in the (sorted) list whose access time is
    greater than or equal to the given atime.
    If everything is older (has a lower access time), return len(files).
    """
    target = (atime, 0, "")
    return find_ge(files, target)
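
# Illustrative example (hypothetical timestamps):
#   files = [(100.0, 1, "a"), (200.0, 2, "b"), (300.0, 3, "c")]
#   find_atime_limit(files, 200.0)  ->  1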


# this comes from the bisect module documentation
def find_ge(a, x):
    "Find leftmost item greater than or equal to x"
    i = bisect_left(a, x)
    return i


def unlink_files(files):
    """
    Unlink every file in the given list and return the number removed.
    Callers pass in the slice of files to delete; to delete the oldest
    files, the list should be sorted by access time.
    """
    n = 0
    for f in files:
        try:
            os.unlink(f[2])
            n += 1
        except OSError:
            logger.warning("could not remove file %r", f[2])
    return n


def format_size(s):
    """
    helper to format large numbers as powers of 1024
    """
    pre = ["Ki", "Mi", "Gi", "Ti"]
    prefix = ""
    ss = float(s)
    while ss >= 1024 and pre:
        prefix = pre[0]
        del pre[0]
        ss /= 1024
    return ss, prefix


def format_size2(size):
    return "%.2f %sB" % format_size(size)


def timestamp_from_datetime(dt):
    """convert a naive UTC datetime to seconds since the Unix epoch"""
    timestamp = (dt - datetime.datetime(1970, 1, 1)).total_seconds()
    return timestamp
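
# Illustrative example:
#   timestamp_from_datetime(datetime.datetime(1970, 1, 2))  ->  86400.0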


def main():
    def unitsize(a):
        """
        helper to parse sizes such as 100G or 1T
        """
        size = a
        for i, unit in enumerate("KMGT"):
            if a.endswith(unit):
                size = a[:-1]
                break
        else:
            i = -1
        try:
            size = float(size)
        except ValueError:
            raise argparse.ArgumentTypeError(
                "%s must be a number, optionally followed by K, M, G or T" % a
            )
        return size * 1024 ** (i + 1)
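
    # Illustrative examples:
    #   unitsize("100G") -> 100 * 1024**3
    #   unitsize("2T")   -> 2 * 1024**4
    #   unitsize("512")  -> 512.0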

    parser = argparse.ArgumentParser(
        description="clean a perforce proxy cache directory"
    )
    parser.add_argument("root", help="root of versioned files")
    parser.add_argument("--dry-run", action="store_true", help="do not delete anything")
    parser.add_argument("--max-size", type=unitsize, help="maximum total size of files")
    parser.add_argument(
        "--max-size-hard",
        type=unitsize,
        help="maximum total size of files, overriding --min-age",
    )
    parser.add_argument("--max-count", type=int, help="maximum total number of files")
    parser.add_argument(
        "--min-age", type=int, help="don't throw away anything younger than this (days)"
    )
    parser.add_argument(
        "--max-age", type=int, help="don't keep anything older than this (days)"
    )
    args = parser.parse_args()

    if not (args.max_size or args.max_size_hard or args.max_count or args.max_age):
        parser.error(
            "At least one of --max-size, --max-size-hard, --max-count, --max-age required"
        )
print("Trimming P4 versioned file folder %r:" % args.root)
files = find_versioned_files(args.root)
size = sum_size(files)
print("Found %d files, %s" % (len(files), format_size2(size)))
if not files:
return # without any files, just quit
# sort files according to access time, oldest first (lowest timestamp)
files.sort()
# and create the cumulative sum
cumulative_sizes = list(cumulative_sum([f[1] for f in files]))
# now apply the criteria
i_keep = 0 # the index of the oldest (lowest timestamp) file we keep
now = datetime.datetime.utcnow()
# max-size, max-count and min age, all work to create a lower bound on number of
# files to store. That is, we take the largest index (youngest) file that these produce.
    if args.max_count is not None:
        i = i_keep
        i_keep = max(i_keep, len(files) - args.max_count)
        if i_keep != i:
            print(
                "--max-count=%r limiting kept files to %d"
                % (args.max_count, len(files) - i_keep)
            )
        else:
            print("--max-count=%r not limiting kept files" % (args.max_count,))

    if args.max_size is not None:
        i = i_keep
        i_keep = max(i_keep, find_size_limit(cumulative_sizes, args.max_size))
        if i_keep != i:
            print(
                "--max-size=%s limiting kept files to %d"
                % (format_size2(args.max_size), len(files) - i_keep)
            )
        else:
            print(
                "--max-size=%s not limiting kept files" % (format_size2(args.max_size),)
            )

    if args.max_age is not None:
        i = i_keep
        limit = now - datetime.timedelta(days=args.max_age)
        i_keep = max(i_keep, find_atime_limit(files, timestamp_from_datetime(limit)))
        if i_keep != i:
            print(
                "--max-age=%r limiting kept files to %d"
                % (args.max_age, len(files) - i_keep)
            )
        else:
            print("--max-age=%r not limiting kept files" % (args.max_age,))

    # --min-age, however, overrides all of the above: we never throw away
    # anything accessed more recently than this.
    if args.min_age:
        i = i_keep
        limit = now - datetime.timedelta(days=args.min_age)
        i_keep = min(i_keep, find_atime_limit(files, timestamp_from_datetime(limit)))
        if i_keep != i:
            print(
                "--min-age=%r forcing kept files to %d"
                % (args.min_age, len(files) - i_keep)
            )
        else:
            print("--min-age=%r not forcing kept files" % (args.min_age,))

    if args.max_size_hard is not None:
        # still, we provide a hard cap that overrides even --min-age,
        # for disks of limited size
        i = i_keep
        i_keep = max(i_keep, find_size_limit(cumulative_sizes, args.max_size_hard))
        if i_keep != i:
            print(
                "--max-size-hard=%s limiting kept files to %d"
                % (format_size2(args.max_size_hard), len(files) - i_keep)
            )
        else:
            print(
                "--max-size-hard=%s not limiting kept files"
                % (format_size2(args.max_size_hard),)
            )

    # perform the action
    if not args.dry_run:
        print("deleting %d files" % i_keep)
        n_removed = unlink_files(files[:i_keep])
    else:
        print("not deleting %d files (dry run)" % i_keep)
        n_removed = i_keep

    # output some statistics
    for what, part in ("Unlinked", files[:n_removed]), ("Remaining", files[n_removed:]):
        size = sum_size(part)
        count = len(part)
        print("%s:" % what)
        print(" %6d files, %s" % (count, format_size2(size)))
        if count:
            oldest = datetime.datetime.utcfromtimestamp(part[0][0])
            youngest = datetime.datetime.utcfromtimestamp(part[-1][0])
            print(" atime: %s to %s" % (oldest.isoformat(), youngest.isoformat()))


if __name__ == "__main__":
    logging.basicConfig()
    main()