BPE.py
import collections
import pickle
import re

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm

from preprocess import *
from utils import *
def make_vocab(corpus):
    """Build the initial BPE vocabulary: each word becomes a string of
    space-separated characters ending in the end-of-word marker, keyed
    by its corpus frequency."""
    vocab = dict()
    # Reserve ids for the special tokens. "</w>" marks the end of a word
    # so merges cannot cross word boundaries; it needs an id of its own,
    # otherwise an unmerged final marker has no entry in the token table.
    tokens = {"<PAD>": 0, "<BOS>": 1, "<EOS>": 2, "</w>": 3}
    for word, freq in corpus:
        temp = ""
        # word = word.lower()
        for c in word:
            temp += c + " "
            if c not in tokens:
                tokens[c] = len(tokens)
        temp += "</w>"
        vocab[temp] = freq
    return vocab, tokens
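
# A minimal sketch of what make_vocab produces (toy values, not from the
# repo): for corpus = [("low", 5)] it returns
#   vocab  = {"l o w </w>": 5}
#   tokens = {"<PAD>": 0, "<BOS>": 1, "<EOS>": 2, "</w>": 3,
#             "l": 4, "o": 5, "w": 6}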
def get_stats(vocab):
    """Count the frequency of every adjacent symbol pair in the vocabulary."""
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i + 1]] += freq
    return pairs
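
# Illustrative example (toy numbers): get_stats({"l o w </w>": 5,
# "l o w e r </w>": 2}) yields counts such as
#   {("l", "o"): 7, ("o", "w"): 7, ("w", "</w>"): 5, ("w", "e"): 2, ...}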
def merge_vocab(pair, v_in):
    """Merge every occurrence of `pair` into a single symbol.

    The lookaround pattern only matches the pair when both symbols are
    whole tokens, i.e. bounded by whitespace or the string edges."""
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out
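
# Illustrative example (toy input): merge_vocab(("l", "o"), {"l o w </w>": 5})
# returns {"lo w </w>": 5}; "lo" is now a single symbol available to
# later merges.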
def bpe_corpus(corpus, iteration):
    """Learn `iteration` BPE merges and return token <-> id mappings."""
    vocabs, tokens = make_vocab(corpus)
    for _ in tqdm(range(iteration), desc="byte-pair-encoding"):
        pairs = get_stats(vocabs)
        if not pairs:  # every word is already a single symbol; nothing to merge
            break
        best = max(pairs, key=pairs.get)
        tokens["".join(best)] = len(tokens)
        # print(tokens)
        vocabs = merge_vocab(best, vocabs)
        # Progress log: vocabulary size, merged pair, and its frequency.
        print(len(tokens), best, pairs[best], "".join(best))
    # Build the reverse mapping from id to token string.
    idx2word = {idx: word for word, idx in tokens.items()}
    return tokens, idx2word
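
# A minimal usage sketch: the (word, frequency) corpus below is a made-up
# toy example, not the project's real preprocessed data.
if __name__ == "__main__":
    toy_corpus = [("low", 5), ("lower", 2), ("newest", 6), ("widest", 3)]
    tokens, idx2word = bpe_corpus(toy_corpus, iteration=10)
    print("learned tokens:", list(tokens)[-10:])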