-
Notifications
You must be signed in to change notification settings - Fork 1
/
encode_data.py
52 lines (38 loc) · 1.81 KB
/
encode_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from scratch_datasets import TextualDataset, EncodedFiles2Dataset
from json import load, loads
from os import listdir
from config import tokenizer, encoded_file_keyword
from tqdm import tqdm
def jsonl2dataset(path, file, tokenizer, save=False, attr="text", save_path=None):
    """Build a TextualDataset from one .jsonl file (one JSON object per line).

    Args:
        path: directory prefix; concatenated directly with *file*, so it
            must already end with a path separator.
        file: name of the .jsonl file inside *path*.
        tokenizer: tokenizer handed to TextualDataset for encoding.
        save: when True, dump the dataset next to the input (return None);
            when False, return the dataset object instead.
        attr: JSON key whose value is the text of each record.
        save_path: explicit output path; defaults to the input path with the
            encoded-file keyword prefixed to the file name.

    Returns:
        TextualDataset when save is False, otherwise None.
    """
    with open(path + file, "r", encoding="utf-8") as jf:
        # Stream the file line by line instead of readlines(): avoids holding
        # the raw text and the parsed records in memory simultaneously.
        # Blank lines are skipped — json.loads("") would raise on them.
        texts = [loads(line)[attr] for line in jf if line.strip()]
    dataset = TextualDataset(texts, tokenizer)
    if save:
        if not save_path:
            save_path = path + encoded_file_keyword + file
        dataset.__jdump__(save_path)
    else:
        return dataset
def json2sents(path, attr="sents"):
    """Load a JSON file and return the value stored under *attr*."""
    with open(path, "r", encoding="utf-8") as handle:
        payload = load(handle)
    return payload[attr]
def multipleJsonl2dataset(path):
    """Encode every raw .jsonl file in *path*, saving each result to disk.

    Files whose name already contains the encoded-file keyword are skipped,
    so previously produced outputs are never re-encoded.
    """
    files = [x for x in listdir(path) if encoded_file_keyword not in x and ".jsonl" in x]
    # tqdm infers the total from the list itself; passing total=len(files)
    # was redundant.
    for file in tqdm(files):
        jsonl2dataset(path, file, tokenizer, save=True)
def encoded2datasets(path, files, trim=None, block=None, dev_ratio=0.01,
        shfl=False, save=False, save_path=None, name="", eos=2):
    """Merge already-encoded files into one dataset.

    When *save* is False the merged dataset is returned. When *save* is True
    it is dumped to disk with a train/dev split governed by *dev_ratio*
    (written under *save_path*, defaulting to *path*) and nothing is returned.
    """
    merged = EncodedFiles2Dataset(path, files, shfl, trim=trim, block=block, eos=eos)
    if not save:
        return merged
    target = path if save_path is None else save_path
    merged.__jdumpwsplit__(target, dev_ratio, name=name)
def multipleEncoded2datasets(path, trim=None, block=None, shfl=False, name="", eos=2):
    """Gather every encoded file in *path* and merge them into one saved dataset."""
    encoded_files = [f for f in listdir(path) if encoded_file_keyword in f]
    encoded2datasets(path, encoded_files, save=True, trim=trim, block=block,
        shfl=shfl, name=name, eos=eos, dev_ratio=0.01)
#path_to_files = "C:/gpt2/korpusi/tajno/"
#multipleJsonl2dataset(path_to_files)
#multipleEncoded2datasets(path_to_files, trim=512, name="_bert-p")
#multipleEncoded2datasets(path_to_files, block=128, name="_gpt-p")