profile_stanza.py
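"""
Profile the GPU memory usage of stanza_batch.

Runs the Jane Austen "Emma" test data through a stanza English pipeline
(tokenize, POS, sentiment) via stanza_batch.batch, prints the time taken,
and optionally plots GPU memory used against the number of documents
processed (--save-fp). The --clear-cache flag is passed straight through
to stanza_batch.batch.
"""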
import argparse
from pathlib import Path
from time import time
from typing import List

import stanza
from stanza.models.common.doc import Document
import stanza_batch
import GPUtil
import matplotlib.pyplot as plt


def path_type(fp: str) -> Path:
    # Resolve a command line argument to an absolute path.
    return Path(fp).resolve()


if __name__ == "__main__":
    save_fp_help = ('File path to save the plot of GPU memory used as a '
                    'function of the number of documents processed. '
                    'Getting the GPU memory usage for each document '
                    'slows the program down a lot.')
    parser = argparse.ArgumentParser()
    parser.add_argument('--save-fp', help=save_fp_help, type=path_type)
    parser.add_argument('--clear-cache', action='store_true')
    args = parser.parse_args()

    # Download and build the English pipeline with the tokenize, POS,
    # and sentiment processors, running on the GPU.
    stanza.download("en", processors="tokenize,pos,sentiment")
    nlp = stanza.Pipeline(
        lang="en", processors="tokenize,pos,sentiment", use_gpu=True
    )

    # Load the test data: one document per line.
    book_data: List[str] = []
    test_data_dir = Path(__file__, "..", "tests", "data").resolve()
    with Path(test_data_dir, "jane_austin_emma_data.txt").open(
        "r"
    ) as emma_file:
        book_data = [line for line in emma_file]
    assert len(book_data) == 490

    # Process the documents through stanza_batch, recording GPU memory
    # usage after each document if a plot has been requested.
    t = time()
    gpu_memory_used: List[float] = []
    processed_book_data: List[Document] = []
    for document in stanza_batch.batch(book_data, nlp,
                                       clear_cache=args.clear_cache):
        processed_book_data.append(document)
        if args.save_fp:
            # Assuming the first GPU is the one being used.
            gpu_memory_used.append(GPUtil.getGPUs()[0].memoryUsed)
    print(f'Time taken: {time() - t}')

    if args.save_fp:
        number_documents_processed = range(len(processed_book_data))
        plt.plot(number_documents_processed, gpu_memory_used)
        plt.xlabel('Number of documents processed')
        plt.ylabel('GPU Memory used (MB)')
        plt.grid(True)
        plt.savefig(str(args.save_fp))
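
A minimal example invocation, assuming the imported packages (stanza, stanza_batch, GPUtil, matplotlib) are installed and a GPU is visible to the process; the output file name here is only illustrative:

    python profile_stanza.py --clear-cache --save-fp gpu_memory.png

Omitting --save-fp skips the per-document GPU memory polling, which the help text notes slows the program down a lot, so the printed time is then closer to a pure throughput measurement.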