forked from the-crypt-keeper/tldw
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerger.py
29 lines (23 loc) · 797 Bytes
/
merger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import json
import sys
# The input path is the first command-line argument; fail with a usage
# message instead of a bare IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit(f'usage: {sys.argv[0]} <input.ndjson>')
in_file = sys.argv[1]

# Parse one JSON document per line. Whitespace-only lines (for example a
# trailing blank line at the end of the file) are skipped because json.loads
# would raise on them. JSON text is UTF-8, so do not rely on the platform
# default encoding.
with open(in_file, encoding='utf-8') as infile:
    chunks = [json.loads(line) for line in infile if line.strip()]
def part_to_time(part, minutes_per_part=5):
    """Convert a chunk count into an elapsed-time label.

    Args:
        part: Number of chunks elapsed (0 maps to ``00:00``).
        minutes_per_part: Length of one chunk in whole minutes. Defaults
            to 5, the value that used to be hard-coded.

    Returns:
        ``part * minutes_per_part`` minutes formatted as ``HH:MM`` with
        zero padding. Hours are not wrapped at 24.
    """
    hours, minutes = divmod(part * minutes_per_part, 60)
    return f'{hours:02}:{minutes:02}'
import os

# Build the merged summary: one numbered section per chunk, in file order.
# Joining once avoids repeated string concatenation in a loop.
# An alternative timestamp header is left disabled for reference:
#   f'\n\n[{part_to_time(idx)} - {part_to_time(idx+1)}] '
text = ''.join(
    f'\nSection {number}: {chunk["answer"]}\n'
    for number, chunk in enumerate(chunks, start=1)
)

# Derive the output path from the file extension only. A plain
# str.replace('ndjson', 'txt') would also rewrite directory names containing
# "ndjson", and would leave the path unchanged (overwriting the input file)
# when the input does not contain "ndjson" at all. Any other extension gets
# ".txt" appended, so out_file can never equal in_file.
root, ext = os.path.splitext(in_file)
out_file = (root if ext.lower() == '.ndjson' else in_file) + '.txt'
with open(out_file, 'w', encoding='utf-8') as outfile:
    outfile.write(text)
# Report size statistics for the merged summary, including its length in
# tokens under the Llama tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('hf-internal-testing/llama-tokenizer', use_fast=True)
# encode() returns a list of token ids (not logits).
# NOTE(review): encode() adds special tokens by default, so the count below
# presumably includes a BOS token - confirm whether that is intended.
token_ids = tokenizer.encode(text)

print('chunks:', len(chunks))
# len(text) counts characters; encode to get the real byte size so the label
# is accurate for non-ASCII summaries.
print('summary bytes:', len(text.encode('utf-8')))
print('summary tokens:', len(token_ids))