Skip to content

Commit 2f3769f

Browse files
gustavocuozzo and Luís Gustavo Cuozzo authored
New api-v3 (#1)
* API v3
* Corrige inicialização da API
* APIv3 improvements

Co-authored-by: Luís Gustavo Cuozzo <lcuozzo@cpqd.com.br>
1 parent b8ba0bf commit 2f3769f

File tree

3 files changed

+186
-197
lines changed

3 files changed

+186
-197
lines changed

README.md

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -48,16 +48,16 @@ client = TranscriptionClient(
4848
#### Operação de transcrição simples:
4949

5050
```python
51-
audio_id, result = client.transcribe("/caminho/para/audio.wav")
51+
job_id, result = client.transcribe("/caminho/para/audio.wav")
5252
```
5353

5454
Alternativamente, o usuário pode escolher apenas iniciar a transcrição
5555
e esperar pelo resultado posteriormente usando um valor negativo para o
5656
parâmetro de timeout:
5757

5858
```python
59-
audio_id = client.transcribe("/caminho/para/audio.wav", timeout=-1)
60-
result = client.wait_result(audio_id)
59+
job_id = client.transcribe("/caminho/para/audio.wav", timeout=-1)
60+
result = client.wait_result(job_id)
6161
```
6262

6363
As operações `transcribe` com `timeout>=0` e `wait_result` por padrão deletam o
@@ -66,25 +66,34 @@ arquivo após o término da transcrição (`delete_after=True`).
6666
#### Impressão de resultado via _callback_:
6767

6868
```python
69-
def callback(audio_id, response):
70-
print(audio_id, response)
69+
def callback(job_id, response):
70+
print(job_id, response)
7171

7272
client.register_callback(callback)
73-
audio_id, result = client.transcribe("/caminho/para/audio.wav")
73+
job_id, result = client.transcribe("/caminho/para/audio.wav")
7474
```
7575

7676
É possível melhorar o controle de resultado usando uma classe de contexto para
7777
armazenar os resultados para uso fora da _callback_.
7878

7979
```python
8080
class Context():
81-
def callback(self, audio_id, response):
82-
if response["event"] == "finished":
83-
self.result = response["result"]
81+
def __init__(self):
82+
self.results = {}
83+
84+
def callback(self, job_id, response):
85+
job = response["job"]
86+
if job["status"] == "COMPLETED":
87+
job_id = job["id"]
88+
segments = response["segments"]
89+
self.results[job_id] = {
90+
"job": job,
91+
"segments": segments}
92+
8493

8594
c = Context()
8695
client.register_callback(c.callback)
87-
audio_id, result = client.transcribe("example.wav")
96+
job_id, result = client.transcribe("example.wav")
8897
print(c.results)
8998
```
9099

@@ -112,23 +121,34 @@ class Context:
112121
self.lock = RLock()
113122
self.pbar = tqdm.tqdm(total=len(to_transcribe))
114123

115-
def callback(self, audio_id, response):
116-
if response["event"] == "finished":
117-
result = response["result"]
118-
self.results[result["filename"]] = result
124+
def callback(self, job_id, response):
125+
job = response["job"]
126+
if job["status"] == "COMPLETED":
127+
job_id = job["id"]
128+
segments = response["segments"]
129+
self.results[job_id] = {
130+
"job": job,
131+
"segments": segments}
119132
with self.lock:
120133
self.pbar.update(1)
121134

122135
c = Context()
123136
client.register_callback(c.callback)
124137

125-
# Armazena todos os audio_ids para esperar os resultados.
126-
audio_ids = []
138+
# Armazena todos os job_ids para esperar os resultados.
139+
job_ids = []
127140
for path in to_transcribe:
128-
audio_ids.append(client.transcribe(path, timeout=-1))
129-
for audio_id in audio_ids:
130-
client.wait_result(audio_id)
131-
print(c.results)
141+
job_ids.append(client.transcribe(path, timeout=-1))
142+
for job_id in job_ids:
143+
client.wait_result(job_id)
144+
for id in c.results:
145+
print("id: {}\n\tstatus:{}\n\tfilename:{}\n\tsegments:{}\n".format(
146+
id,
147+
c.results[id]["job"]["status"],
148+
c.results[id]["job"]["filename"],
149+
c.results[id]["segments"],
150+
)
151+
)
132152
```
133153

134154
## Segurança

cpqdtrd/api.py

Lines changed: 91 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
import time
1111
import logging
1212
import urllib
13-
from typing import List
13+
import json
14+
from datetime import datetime
15+
from typing import List, Dict, Union
1416
from contextlib import closing
1517

1618

@@ -40,7 +42,8 @@ def __init__(
4042
self._log = logging.getLogger("cpqdtrd.api")
4143
while not ok:
4244
try:
43-
self.audiofile_list()
45+
for r in self.query():
46+
self._log.debug("response: {}".format(r))
4447
ok = True
4548
except Exception as e:
4649
self._log.warning("Exception on API list request: {}".format(e))
@@ -51,127 +54,80 @@ def __init__(
5154
msg = "API call retries exceeded"
5255
raise self.TimeoutException(msg)
5356

54-
def audiofile_list(self, batch: str = ""):
55-
if batch:
56-
return requests.get(
57-
"{}/audiofile/list/batch/{}".format(self._url, batch), auth=self._auth
58-
)
59-
else:
60-
return requests.get("{}/audiofile/list/".format(self._url), auth=self._auth)
57+
def create(self, file_path: str, tag: str = None, config: List[str] = None, callbacks_url: List = []):
58+
upload_request = "{}/job/create/".format(self._url)
59+
if tag:
60+
upload_request += "?tag={}".format(tag)
6161

62-
def audiofile_get(self, uid: str):
63-
return requests.get("{}/audiofile/{}".format(self._url, uid), auth=self._auth)
62+
data = {}
63+
if config:
64+
data["config"] = config
6465

65-
def audiofile_create(self, file_name: str):
66-
return requests.get(
67-
"{}/audiofile/create/{}".format(self._url, file_name), auth=self._auth
68-
)
66+
if len(callbacks_url) > 0:
67+
data["callback_urls"] = ','.join(callbacks_url)
6968

70-
def audiofile_create_batch(self, batch: str):
71-
return requests.get(
72-
"{}/audiofile/create/batch/{}".format(self._url, batch), auth=self._auth
73-
)
74-
75-
def audiofile_upload(self, file_path: str, batch: str = ""):
76-
upload_request = "{}/audiofile/upload/".format(self._url)
77-
if batch:
78-
data = {"batch": batch}
79-
else:
80-
data = {}
8169
with open(file_path, "rb") as f:
82-
files = [("files", f)]
70+
upload_file = [("upload_file", f)]
8371
return requests.post(
84-
upload_request, data=data, files=files, auth=self._auth
72+
upload_request, data=data, files=upload_file, auth=self._auth
8573
)
8674

87-
def audiofile_delete(self, uid: str, delete_on_disk: bool = False):
88-
delete_request = "{}/audiofile/delete/{}"
89-
if delete_on_disk:
90-
delete_request += "?deleteOnDisk=true"
91-
return requests.delete(delete_request.format(self._url, uid), auth=self._auth)
92-
93-
def audiofile_delete_batch(self, batch: str, delete_on_disk: bool = False):
94-
delete_request = "{}/audiofile/delete/batch/{}"
95-
if delete_on_disk:
96-
delete_request += "?deleteOnDisk=true"
97-
return requests.delete(delete_request.format(self._url, batch), auth=self._auth)
98-
99-
def transcription_start(self, audio_id: str, request_args: dict = {}):
100-
start_request = "{}/transcription/start/audiofile/{}"
101-
sep = "?"
102-
for arg, val in request_args.items():
103-
if arg == "webhook" and type(val) is list:
104-
for w in val:
105-
start_request += sep
106-
sep = "&"
107-
start_request += "webhook={}".format(w)
108-
else:
109-
start_request += sep
110-
sep = "&"
111-
start_request += "{}={}".format(arg, val)
112-
return requests.get(start_request.format(self._url, audio_id), auth=self._auth)
113-
114-
def transcription_start_batch(
115-
self, batch: str, word_hints: str = "", lm_url: str = ""
75+
def list_jobs(self, page: int = 1, limit: int = 100, tag: str = None):
76+
params = {"page": page, "limit": limit}
77+
if tag:
78+
params["tag"] = tag
79+
return requests.get("{}/job".format(self._url), params=params, auth=self._auth)
80+
81+
def status(self, job_id: str):
82+
return requests.get("{}/job/status/{}".format(self._url, job_id), auth=self._auth)
83+
84+
def result(self, job_id: str):
85+
return requests.get("{}/job/result/{}".format(self._url, job_id), auth=self._auth)
86+
87+
def stop(self, job_id: str):
88+
return requests.post("{}/job/stop/{}".format(self._url, job_id), auth=self._auth)
89+
90+
def retry(self, job_id: str):
91+
return requests.post("{}/job/retry/{}".format(self._url, job_id), auth=self._auth)
92+
93+
def delete(self, job_id: str):
94+
return requests.delete("{}/job/{}".format(self._url, job_id), auth=self._auth)
95+
96+
def query(
97+
self,
98+
tags: List[str] = [],
99+
filenames: List[str] = [],
100+
statuses: List[str] = [],
101+
projection: List[str] = [],
102+
get_result: bool = False,
103+
page: int = 1,
104+
limit: int = 100,
105+
start_date: datetime = None,
106+
end_date: datetime = None,
116107
):
117-
start_request = "{}/transcription/start/batch/{}"
118-
sep = "?"
119-
if lm_url:
120-
start_request += sep
121-
sep = "&"
122-
start_request += "lm.uri={}".format(lm_url)
123-
if word_hints:
124-
start_request += sep
125-
start_request += "hints.words={}".format(word_hints)
126-
return requests.get(start_request.format(self._url, batch), auth=self._auth)
127-
128-
def transcription_status(self, audio_id: str):
129-
return requests.get(
130-
"{}/transcription/status/audiofile/{}".format(self._url, audio_id),
131-
auth=self._auth,
132-
)
133-
134-
def transcription_status_batch(self, batch: str):
135-
return requests.get(
136-
"{}/transcription/status/batch/{}".format(self._url, batch), auth=self._auth
137-
)
138-
139-
def transcription_reset(self, audio_id: str, hard: bool = False):
140-
reset_request = "{}/transcription/reset/audiofile/{}"
141-
if hard:
142-
reset_request += "?hard=true"
143-
return requests.get(reset_request.format(self._url, audio_id), auth=self._auth)
144-
145-
def transcription_reset_batch(self, batch: str, hard: bool = False):
146-
reset_request = "{}/transcription/reset/batch/{}"
147-
if hard:
148-
reset_request += "?hard=true"
149-
return requests.get(reset_request.format(self._url, batch), auth=self._auth)
150-
151-
def transcription_result(self, audio_id: str, is_csv: bool = False):
152-
result_request = "{}/transcription/result/audiofile/{}".format(
153-
self._url, audio_id
154-
)
155-
if is_csv:
156-
result_request += "?format=csv"
157-
return requests.get(result_request, auth=self._auth)
158-
159-
def transcription_result_batch(self, batch: str, format: str = ""):
160-
result_request = "{}/transcription/result/batch/{}".format(self._url, batch)
161-
if format != "":
162-
result_request += "?format=" + format
163-
return requests.get(result_request, auth=self._auth)
164-
165-
def query_collection(self, collection: str, query: dict, project: list = []):
166-
query_request = "{}/query/collection/{}".format(self._url, collection)
167-
sep = "?"
168-
for k in query:
169-
query_request += sep + "{}={}".format(k, query[k])
170-
sep = "&"
171-
for p in project:
172-
query_request += sep + "project={}".format(p)
173-
sep = "&"
174-
with closing(requests.get(query_request, stream=True, auth=self._auth)) as r:
108+
request = "{}/query/job".format(self._url)
109+
110+
params = {}
111+
if tags:
112+
params["tag"] = tags
113+
if filenames:
114+
params["filenames"] = filenames
115+
if statuses:
116+
params["status"] = statuses
117+
if projection:
118+
params["projection"] = projection
119+
if get_result:
120+
params["result"] = "true"
121+
122+
params["page"] = page
123+
params["limit"] = limit
124+
125+
if start_date:
126+
params["start_date"] = start_date.isoformat()
127+
if end_date:
128+
params["end_date"] = end_date.isoformat()
129+
130+
with closing(requests.get(request, params=params, stream=True, auth=self._auth)) as r:
175131
for line in r.iter_lines():
176132
yield line
177133

@@ -180,19 +136,27 @@ def webhook_whoami(self):
180136
return requests.get(whoami_request, auth=self._auth)
181137

182138
def webhook_validate(
183-
self, host, port, timeout=None, retries=None, token="", crt=""
139+
self,
140+
host: str,
141+
port: int,
142+
timeout: Union[None, int] = None,
143+
retries: Union[None, int] = None,
144+
token: str = "",
145+
crt: str = "",
184146
):
185-
test_request = "{}/webhook/validate/{}/{}".format(self._url, host, port)
186-
sep = "?"
187-
if timeout is not None:
188-
test_request += sep + "timeout={}".format(timeout)
189-
sep = "&"
190-
if retries is not None:
191-
test_request += sep + "retries={}".format(retries)
192-
sep = "&"
147+
test_request = "{}/webhook/validate".format(self._url)
148+
payload = {
149+
"url": "{}:{}".format(host, port)
150+
}
151+
if timeout:
152+
payload["timeout"] = int(timeout)
153+
if retries:
154+
payload["retries"] = int(retries)
155+
193156
if crt is not None:
194-
return requests.post(
195-
test_request, auth=self._auth, json={"crt": crt, "token": token}
157+
r = requests.post(
158+
test_request, params=payload, auth=self._auth, json={"crt": crt, "token": token}
196159
)
197160
else:
198-
return requests.get(test_request, auth=self._auth)
161+
r = requests.get(test_request, params=payload, auth=self._auth)
162+
return r

0 commit comments

Comments
 (0)