import numpy as np
from keras_transformer import get_model, decode
with open('../Dataset/English_Hindi_Hinglish.txt', mode='r') as f:
    data = f.readlines()
data = data[0:195]  # 195 because the file contains that many labeled pairs for Hinglish-to-English translation.
source_tokens = [i.split(',')[1].strip().split(' ') for i in data]  # second comma-separated field: the Hinglish source
target_tokens = [i.split(',')[0].strip().split(' ') for i in data]  # first comma-separated field: the English target
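For reference, here is what the splitting above does to a single row. The line below is purely illustrative and assumes a comma-separated layout with the English sentence in the first field; the real file may differ in detail (for example, an extra Hindi column).
# Illustrative row only (not taken from the dataset).
line = 'Follow him.,Uska peecha karo.\n'
print(line.split(',')[1].strip().split(' '))  # ['Uska', 'peecha', 'karo.']
print(line.split(',')[0].strip().split(' '))  # ['Follow', 'him.']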
# Generate dictionaries
def build_token_dict(token_list):
    token_dict = {
        '<PAD>': 0,
        '<START>': 1,
        '<END>': 2,
    }
    for tokens in token_list:
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = len(token_dict)
    return token_dict
source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
target_token_dict_inv = {v: k for k, v in target_token_dict.items()}
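As a quick illustration of what build_token_dict produces, here is a made-up two-sentence input: the three special tokens always occupy ids 0 to 2, and every new token gets the next free id.
toy_dict = build_token_dict([['kaise', 'ho?'], ['kaise', 'ho', 'aap?']])
# {'<PAD>': 0, '<START>': 1, '<END>': 2, 'kaise': 3, 'ho?': 4, 'ho': 5, 'aap?': 6}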
# Add special tokens
encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]
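To make the three sequences concrete, here is how one made-up target sentence is laid out: the decoder input is the target preceded by '<START>', while the expected output is shifted by one position and ends with '<END>' so the model learns when to stop.
tokens = ['Follow', 'him.']                # hypothetical target sentence
dec_in = ['<START>'] + tokens + ['<END>']  # ['<START>', 'Follow', 'him.', '<END>']
dec_out = tokens + ['<END>', '<PAD>']      # ['Follow', 'him.', '<END>', '<PAD>']
# dec_out[t] is the token the decoder should predict after seeing dec_in[:t+1].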
# Padding
source_max_len = max(map(len, encode_tokens))
target_max_len = max(map(len, decode_tokens))
encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]
encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]
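A quick shape check on the resulting arrays; the shapes follow directly from the padding above, and the trailing axis of size 1 on the targets matches how the integer labels are fed to sparse_categorical_crossentropy below.
print(np.array(encode_input).shape)   # (195, source_max_len)
print(np.array(decode_input).shape)   # (195, target_max_len)
print(np.array(decode_output).shape)  # (195, target_max_len, 1)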
# Build & fit model
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # Use different embeddings for different languages
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()
Number of trainable parameters for different numbers of input lines

Lines in input | Total params | Trainable params | Non-trainable params
            55 |       65,827 |           65,827 |                    0
           115 |       72,002 |           72,002 |                    0
           165 |       77,787 |           77,787 |                    0
           195 |       80,777 |           80,777 |                    0

(Output of model.summary(), truncated; Keras names the model "model_1".)
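The growth in parameters is driven almost entirely by vocabulary size: reading more lines adds more distinct tokens, and each token owns an embedding row of size embed_dim = 32 plus its share of the decoder's output projection, while the transformer blocks themselves (2 encoders, 2 decoders, 4 heads, hidden_dim 128) contribute a fixed count. A quick way to see how large the vocabularies got; exact parameter accounting depends on keras-transformer's internals.
print('source vocab:', len(source_token_dict))
print('target vocab:', len(target_token_dict))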
%%time
# The * 1024 replicates the small training set so that each epoch runs over many batches.
model.fit(
    x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=10,
    batch_size=32,
)
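The 6240 steps per epoch visible in the logs below follow directly from this replication: 195 pairs copied 1024 times and split into batches of 32.
steps_per_epoch = (195 * 1024) // 32  # 199,680 samples / 32 = 6,240 batches per epoch
print(steps_per_epoch)                # 6240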
Training logs and wall-clock times for different numbers of input lines
# Number of Lines in Input is 55
Number of Epochs: 10
CPU times: user 11min 31s, sys: 56 s, total: 12min 27s
Wall time: 5min 48s
<keras.callbacks.History at 0x7f8f347f69d0>
# Number of Lines in Input is 115
Number of Epochs: 10
CPU times: user 26min 55s, sys: 2min 7s, total: 29min 2s
Wall time: 13min 33s
# Number of Lines in Input is 150
Number of Epochs: 10
CPU times: user 41min 26s, sys: 3min 12s, total: 44min 39s
Wall time: 21min 1s
# Number of Lines in Input is 195
Number of Epochs: 10
Epoch 1/10
6240/6240 [==============================] - 165s 25ms/step - loss: 0.1641
Epoch 2/10
6240/6240 [==============================] - 163s 26ms/step - loss: 0.0049
Epoch 3/10
6240/6240 [==============================] - 151s 24ms/step - loss: 0.0043
Epoch 4/10
6240/6240 [==============================] - 150s 24ms/step - loss: 0.0038
Epoch 5/10
6240/6240 [==============================] - 150s 24ms/step - loss: 0.0043
Epoch 6/10
6240/6240 [==============================] - 153s 24ms/step - loss: 0.0036
Epoch 7/10
6240/6240 [==============================] - 153s 24ms/step - loss: 0.0036
Epoch 8/10
6240/6240 [==============================] - 151s 24ms/step - loss: 0.0036
Epoch 9/10
6240/6240 [==============================] - 150s 24ms/step - loss: 0.0038
Epoch 10/10
6240/6240 [==============================] - 152s 24ms/step - loss: 0.0037
CPU times: user 51min 23s, sys: 3min 52s, total: 55min 16s
Wall time: 25min 39s
# Validation
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
for i in decoded:
    # i[1:-1] drops the leading <START> and trailing <END> tokens before printing.
    print(' '.join(map(lambda x: target_token_dict_inv[x], i[1:-1])))
...
Follow him.
I am tired.
I can swim.
I can swim.
I love you.
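Since this decode pass runs over the training inputs themselves, a simple way to quantify how much the model has memorized is to compare the decoded sentences against the reference targets. A minimal sketch, assuming exact string match as the (strict) metric:
# Exact-match accuracy of decoded sentences against the training targets.
# This measures memorization only, since these inputs were seen during training.
references = [' '.join(tokens) for tokens in target_tokens]
predictions = [' '.join(target_token_dict_inv[x] for x in seq[1:-1]) for seq in decoded]
matches = sum(p == r for p, r in zip(predictions, references))
print(f'{matches}/{len(references)} exact matches')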
# Testing
test_sents = [
    'kaise ho?',
    'kya tum mujhse pyar karte ho?',
    'kya tum mujhe pyar karte ho?'
]
test_tokens = [i.split() for i in test_sents]
# Encode the test sentences with the dictionary the model was trained on;
# a freshly built dictionary would assign ids the model has never seen.
# Words missing from the training vocabulary fall back to <PAD>.
test_enc_tokens = [['<START>'] + tokens + ['<END>'] for tokens in test_tokens]
test_enc_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in test_enc_tokens]
test_input = [[source_token_dict.get(x, source_token_dict['<PAD>']) for x in tokens] for tokens in test_enc_tokens]
decoded = decode(
    model,
    test_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
for i in decoded:
    print(' '.join(map(lambda x: target_token_dict_inv[x], i[1:-1])))
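The same steps can be wrapped into a small helper for ad-hoc sentences. This is a sketch built only from the objects defined above; the <PAD> fallback for words the model has never seen is an assumption of this example, not something keras_transformer provides.
def translate(sentence):
    # Tokenize, wrap with <START>/<END>, pad, and map through the source dictionary.
    tokens = ['<START>'] + sentence.split() + ['<END>']
    tokens += ['<PAD>'] * (source_max_len - len(tokens))
    ids = [source_token_dict.get(t, source_token_dict['<PAD>']) for t in tokens]
    out = decode(
        model,
        [ids],
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>'],
    )
    return ' '.join(target_token_dict_inv[x] for x in out[0][1:-1])

print(translate('kaise ho?'))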