-
Notifications
You must be signed in to change notification settings - Fork 0
/
proxy.py
243 lines (212 loc) · 8.26 KB
/
proxy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import argparse
import queue
import subprocess
import signal
import os
import sys
import glob
import logging
import socket
import time
import tempfile
from flask import Flask, request
from flask_restful import reqparse, abort, Api, Resource, inputs
import proteopt.client
from proteopt.common import serialize, deserialize
# Flask application object and the Flask-RESTful Api wrapper around it.
# Resource classes in this module are attached to `api`; `app` is what the
# script entry point runs.
app = Flask(__name__)
api = Api(app)
class Proxy(Resource):
    """Administrative resource for managing the proxy's backend endpoints.

    The endpoint set, retry count, and cached client are stored on the class
    so they are shared by every request and by the ``Tool`` resource.
    """

    # Base URLs of the backend API servers; "/tool" is appended to each one
    # when the client is built.
    endpoints = set()
    # Passed through to proteopt.client.Client as `max_retries`.
    max_retries = None
    # Lazily constructed client; set back to None whenever `endpoints` changes.
    client = None

    @classmethod
    def get_client(cls):
        """Return the shared client, creating it from the current endpoints.

        Raises:
            ValueError: if no endpoints have been registered.
        """
        if cls.client is None:
            if not cls.endpoints:
                raise ValueError("No endpoints")
            cls.client = proteopt.client.Client(
                endpoints=[e + "/tool" for e in cls.endpoints],
                max_retries=cls.max_retries,
                extra_parallelism_factor=1)
        return cls.client

    @classmethod
    def _invalidate_client(cls):
        """Drop the cached client so the next get_client() call rebuilds it.

        The reset has to happen on the class: assigning ``self.client = None``
        would only create an instance attribute and leave the shared client,
        built from the old endpoint set, in place.
        """
        # NOTE(review): the previous client object is simply dropped here;
        # confirm whether proteopt.client.Client needs an explicit shutdown.
        cls.client = None

    def get(self, action, name=None):
        """Handle ``GET /proxy/<action>``.

        Supported actions:
            add-endpoint: add the ``endpoint`` query parameter to the set.
            remove-endpoint: remove the ``endpoint`` query parameter.
            status: return the sorted endpoints, one per line.
            clear: remove all endpoints.

        Args:
            action: the action name taken from the URL.
            name: unused. The registered route only supplies ``action``, so
                this parameter must be optional for the handler to be
                callable at all.

        Returns:
            A message string, or a ``(message, status_code)`` tuple for a
            missing ``endpoint`` parameter (400) or an unknown action (404).
        """
        if action == "add-endpoint":
            endpoint = request.args.get('endpoint')
            if not endpoint:
                # A None/empty entry would later break `e + "/tool"` in
                # get_client().
                return "Missing required query parameter: endpoint", 400
            self.endpoints.add(endpoint)
            self._invalidate_client()
            return f"Added endpoint {endpoint}"

        if action == "remove-endpoint":
            endpoint = request.args.get('endpoint')
            if endpoint in self.endpoints:
                self.endpoints.remove(endpoint)
                self._invalidate_client()
                return f"Removed endpoint {endpoint}"
            return f"No such endpoint {endpoint}"

        if action == "status":
            return "\n".join(sorted(self.endpoints))

        if action == "clear":
            self.endpoints.clear()
            self._invalidate_client()
            return "Cleared endpoints"

        return f"Unknown action {action}", 404
class Tool(Resource):
    """Resource that forwards tool requests to the backends via Proxy's client."""

    def get(self, tool_name):
        """Describe the proxy: its endpoints and the client's max parallelism.

        Args:
            tool_name: the tool name from the URL; not used in the response.

        Raises:
            ValueError: propagated from Proxy.get_client() when no endpoints
                are registered.
        """
        result = {
            'description': 'proxy',
            'endpoints': sorted(Proxy.endpoints),
            'max_parallelism': Proxy.get_client().max_parallelism,
        }
        return result, 200

    def post(self, tool_name):
        """Submit the JSON request body as a job for ``tool_name`` and wait for it.

        The body is tagged with ``tool_name``, placed on the client's work
        queue together with a private result queue, and the handler blocks
        until the client puts the result on that queue.

        Returns:
            ``(result_payload, 200)`` on success, or a 400 response when the
            body is not a JSON object.

        Raises:
            RuntimeError: if the result carries an unexpected payload id.
        """
        payload = request.get_json()
        if not isinstance(payload, dict):
            # The item assignment below needs a mapping; reject anything else
            # as a client error instead of failing with a TypeError.
            return {'message': 'Request body must be a JSON object'}, 400
        payload['tool_name'] = tool_name

        client = Proxy.get_client()
        result_queue = queue.Queue()
        # Only one item is ever put on this private queue, so a constant id
        # is enough to match request and response.
        client.work_queue.put((0, payload, result_queue))
        payload_id, return_payload = result_queue.get()
        if payload_id != 0:
            # Explicit check rather than `assert`, which is removed under -O.
            raise RuntimeError(f"Unexpected payload id {payload_id}")
        return return_payload, 200
# URL routing: /proxy/<action> is handled by Proxy and /tool/<tool_name>
# by Tool. The angle-bracket segment is passed to the handler as a keyword
# argument of the same name.
api.add_resource(Proxy, '/proxy/<action>')
api.add_resource(Tool, '/tool/<tool_name>')
# Command-line options used when this module is run as a script.
def _build_arg_parser():
    """Create the argument parser for the proxy's command-line options."""
    parser = argparse.ArgumentParser()

    # Proxy behaviour and where it listens.
    parser.add_argument("--no-cleanup", default=False, action="store_true")
    parser.add_argument("--max-retries", type=int, default=2)
    parser.add_argument("--endpoints", nargs="+")
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int)
    parser.add_argument("--write-endpoint-to-file")
    parser.add_argument("--write-launched-endpoints-to-file")
    parser.add_argument("--debug", action="store_true", default=False)

    # Optional launching of backend API servers.
    parser.add_argument(
        "--launch-servers",
        type=int,
        metavar="N",
        help=(
            "Launch N API servers. If N=-K, then K servers are launched per GPU and "
            "the CUDA_VISIBLE_DEVICES parameter is set accordingly for each server."
        ),
    )
    parser.add_argument(
        "--cuda-devices",
        type=int,
        nargs="+",
        help="CUDA device numbers to use for launched API servers.",
    )
    # Must come last on the command line: it consumes every remaining argument.
    parser.add_argument(
        "--launch-args",
        nargs=argparse.REMAINDER,
        help="All following args are args for launched API servers.",
    )
    return parser


arg_parser = _build_arg_parser()
def _count_visible_gpus():
    """Return the number of lines starting with "GPU " in ``nvidia-smi -L``."""
    listing = subprocess.check_output(["nvidia-smi", "-L"]).decode()
    return sum(
        1 for line in listing.split("\n") if line.strip().startswith("GPU "))


def _wait_for_endpoint(process, endpoint_file, poll_seconds=0.1):
    """Wait for a launched server to write its endpoint and return it.

    Polls until ``endpoint_file`` has non-empty content. Waiting for content
    rather than mere existence avoids reading the file in the window between
    its creation and the write of its contents.

    Raises:
        IOError: if the process exits without having written an endpoint.
    """
    while True:
        # Check for exit *before* reading: if the process is already gone,
        # whatever it wrote is complete, so this read is final.
        exited = process.poll() is not None
        try:
            with open(endpoint_file) as fd:
                endpoint = fd.read().strip()
        except FileNotFoundError:
            endpoint = ""
        if endpoint:
            return endpoint
        if exited:
            raise IOError(
                f"API server exited with code {process.returncode} "
                f"before writing {endpoint_file}")
        time.sleep(poll_seconds)


def _stop_processes(processes, grace_seconds=5.0):
    """Terminate all processes, killing any still alive after a shared grace period."""
    processes = list(processes)
    for process in processes:
        process.terminate()
    deadline = time.monotonic() + grace_seconds
    for process in processes:
        try:
            process.wait(timeout=max(0.0, deadline - time.monotonic()))
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()


if __name__ == '__main__':
    # Script entry point: optionally launch backend API servers, register
    # their endpoints (plus any given via --endpoints) with Proxy, then serve
    # the Flask app until interrupted with SIGINT.
    args = arg_parser.parse_args(sys.argv[1:])
    logging.basicConfig(level=logging.INFO)

    endpoint_to_process = {}
    work_dir = None        # TemporaryDirectory, only when it should be removed
    work_dir_path = None   # directory holding endpoint and log files, if any

    if args.launch_servers:
        print(args)
        num_to_launch = args.launch_servers
        num_per_gpu = None
        cuda_devices = None  # device list; set only in per-GPU mode
        if args.launch_servers < 0:
            # Negative N means "-N servers per GPU".
            num_per_gpu = -args.launch_servers
            if args.cuda_devices:
                print("Using specified cuda devices")
                cuda_devices = list(args.cuda_devices)
            else:
                num_gpus = _count_visible_gpus()
                print(f"Detected {num_gpus} GPUs.")
                cuda_devices = list(range(num_gpus))
            num_to_launch = len(cuda_devices) * num_per_gpu
            print(f"Will launch {num_to_launch} processes on GPUs.")

        # --launch-args is None when omitted.
        launch_args = args.launch_args or []

        if args.no_cleanup:
            # A TemporaryDirectory removes itself when the object is
            # finalized (including at interpreter exit), so it cannot be used
            # when the files are supposed to be kept.
            work_dir_path = tempfile.mkdtemp(prefix="proteopt_proxy_")
        else:
            work_dir = tempfile.TemporaryDirectory(prefix="proteopt_proxy_")
            work_dir_path = work_dir.name

        for i in range(num_to_launch):
            endpoint_file = os.path.join(work_dir_path, f"endpoint.{i}.txt")
            sub_args = [
                # Run the servers with the same interpreter as the proxy.
                sys.executable,
                os.path.join(os.path.dirname(__file__), "api.py"),
                *launch_args,
                "--write-endpoint-to-file",
                endpoint_file,
            ]
            environ = os.environ.copy()
            if cuda_devices is not None:
                # Consecutive groups of num_per_gpu servers share one device.
                environ["CUDA_VISIBLE_DEVICES"] = str(
                    cuda_devices[i // num_per_gpu])

            print(f"Launching API server {i} / {num_to_launch} with args:")
            print(sub_args)
            logfile = os.path.join(work_dir_path, f"log.{i}.txt")
            # The child gets its own descriptor for the log file, so the
            # proxy's handle can be closed once the server is up.
            with open(logfile, "w+b") as logfile_fd:
                process = subprocess.Popen(
                    sub_args,
                    stderr=logfile_fd,
                    stdout=logfile_fd,
                    env=environ)
                try:
                    endpoint = _wait_for_endpoint(process, endpoint_file)
                except IOError:
                    print("Failed to load endpoint file. Process log:")
                    logfile_fd.seek(0)
                    for line in logfile_fd:
                        print(line.decode(errors="replace").rstrip())
                    # Do not leave the servers started so far running.
                    _stop_processes([process, *endpoint_to_process.values()])
                    raise
            print(f"API server {i} at endpoint {endpoint} will log to {logfile}")
            endpoint_to_process[endpoint] = process

    Proxy.endpoints.update(endpoint_to_process)
    if args.write_launched_endpoints_to_file:
        with open(args.write_launched_endpoints_to_file, "w") as fd:
            fd.writelines(f"{endpoint}\n" for endpoint in Proxy.endpoints)
        print("Wrote", args.write_launched_endpoints_to_file)

    Proxy.max_retries = args.max_retries
    if args.endpoints:
        Proxy.endpoints.update(args.endpoints)
    print("Initialized proxy with endpoints: ", Proxy.endpoints)

    port = args.port
    if not port:
        # Identify an available port
        # Based on https://stackoverflow.com/questions/5085656/how-to-select-random-port-number-in-flask
        # The port is released again before Flask binds it, so another
        # process could in principle take it in between.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.bind((args.host, 0))
            port = sock.getsockname()[1]

    endpoint = f"http://{args.host}:{port}"
    print("Endpoint will be", endpoint)
    if args.write_endpoint_to_file:
        with open(args.write_endpoint_to_file, "w") as fd:
            fd.write(endpoint)
            fd.write("\n")
        print("Wrote", args.write_endpoint_to_file)

    def cleanup(sig, frame):
        """SIGINT handler: stop launched servers, dump/remove logs, exit."""
        # Stop the servers first so that nothing is still writing to the log
        # files while they are dumped or deleted.
        for launched_endpoint in endpoint_to_process:
            print(f"Terminating process with endpoint {launched_endpoint}")
        _stop_processes(endpoint_to_process.values())
        endpoint_to_process.clear()

        # There is no work directory unless servers were launched.
        if args.debug and work_dir_path is not None:
            print("Dumping logs.")
            for path in sorted(glob.glob(os.path.join(work_dir_path, "*.txt"))):
                print("*" * 40)
                print(path)
                print("*" * 40)
                with open(path, errors="replace") as fd:
                    for line in fd:
                        print("---", line.rstrip())

        if work_dir is not None:
            print(f"Cleaning up {work_dir}")
            work_dir.cleanup()
        elif work_dir_path is not None:
            print(f"Leaving files in {work_dir_path}")

        print("Done.")
        sys.exit(0)

    signal.signal(signal.SIGINT, cleanup)

    app.run(
        host=args.host,
        port=port,
        debug=args.debug,
        use_reloader=False,
        threaded=True)