tokenizer.py
import numpy as np
import pandas as pd
import keras
import tensorflow as tf
import string
import re
# load the parallel corpus and normalize column names
df = pd.read_csv('dataset/english_to_german.csv')
df.columns = df.columns.str.strip()
# keep German as the source and wrap the English target in [start]/[end] markers
df['source'] = df['German']
df['target'] = df['English'].apply(lambda x: '[start] ' + x + ' [end]')
df = df.drop(['English', 'German'], axis=1)
print(df.sample(8))
# shuffle the data
df = df.sample(frac=1).reset_index(drop=True)
# split the data into train (70%), validation (20%), and test (remaining ~10%) sets
train_size = int(len(df) * 0.7)
val_size = int(len(df) * 0.2)
train_df = df[:train_size]
val_df = df[train_size:train_size+val_size]
test_df = df[train_size+val_size:]
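# Quick sanity check (illustrative, not part of the original pipeline): the three
# slices should cover the whole shuffled frame with no overlap or gaps.
assert len(train_df) + len(val_df) + len(test_df) == len(df)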
# Standardizing, tokenizing and indexing the data
max_tokens = 25000
sequence_length = 30
# strip all punctuation except "[" and "]" so the [start]/[end] markers survive
strip_chars = string.punctuation
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")
def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", "")
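# Example (illustrative): the custom standardization lowercases the string and
# strips punctuation but keeps square brackets, so the marker tokens survive,
# e.g. '[start] Hello, world! [end]' -> '[start] hello world [end]'
print(custom_standardization(tf.constant("[start] Hello, world! [end]")))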
# vectorize the data: the source layer uses the default standardization, while
# the target layer uses custom_standardization to preserve the [start]/[end] markers
source_vectorization = keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=sequence_length,
)
target_vectorization = keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    # one extra token because the target sentences are shifted right by one step during training
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
# build the source and target vocabularies (adapt) from the training split only
train_source_texts = train_df['source'].values
train_target_texts = train_df['target'].values
source_vectorization.adapt(train_source_texts)
target_vectorization.adapt(train_target_texts)
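# Quick check (the sample sentences below are illustrative, not from the dataset):
# the adapted layers map raw strings to fixed-length integer sequences.
sample_src = source_vectorization(["Ich habe einen Hund."])
sample_tgt = target_vectorization(["[start] I have a dog. [end]"])
print(sample_src.shape)  # (1, 30): sources are padded/truncated to sequence_length
print(sample_tgt.shape)  # (1, 31): targets get one extra position for the right shift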
# wrap each adapted layer in a small Sequential model so it can be saved and reloaded
source_vectorization_model = keras.Sequential([source_vectorization])
target_vectorization_model = keras.Sequential([target_vectorization])
source_vectorization_model.save("tokens_de.h5")
target_vectorization_model.save("tokens_en.h5")
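# A possible reload sketch (an assumption, not part of the original script: it
# presumes the H5 files round-trip the adapted vocabulary; custom_standardization
# has to be supplied via custom_objects because it is a user-defined function):
# restored_target = keras.models.load_model(
#     "tokens_en.h5",
#     custom_objects={"custom_standardization": custom_standardization},
# )
# print(restored_target(tf.constant(["[start] how are you [end]"])))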