Hinglish to English Machine Translation Using Transformers

Monday, October 17, 2022

import numpy as np
from keras_transformer import get_model, decode

# Read the labeled pairs. Only the first 195 lines are used because that is
# how many labeled data points we have for Hinglish to English translation.
with open('../Dataset/English_Hindi_Hinglish.txt', mode='r') as f:
    data = f.readlines()
data = data[0:195]

# Each line is comma-separated: column 0 holds the English sentence (target),
# and column 1 is used as the Hinglish source.
source_tokens = [i.split(',')[1].strip().split(' ') for i in data]
target_tokens = [i.split(',')[0].strip().split(' ') for i in data]

# Generate dictionaries
def build_token_dict(token_list):
    token_dict = {
        '<PAD>': 0,
        '<START>': 1,
        '<END>': 2,
    }
    for tokens in token_list:
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = len(token_dict)
    return token_dict

source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

# Add special tokens
encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]

# Padding
source_max_len = max(map(len, encode_tokens))
target_max_len = max(map(len, decode_tokens))

encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]

encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]

# Build & fit model
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # Use different embeddings for different languages
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()
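As a quick sanity check, this is what build_token_dict produces for a small made-up pair of token lists (the words here are hypothetical and not taken from the dataset): the special tokens always occupy ids 0-2, and each new word gets the next free id in order of first appearance.

toy_tokens = [['kaise', 'ho'], ['kaise', 'ho', 'aap']]
print(build_token_dict(toy_tokens))
# {'<PAD>': 0, '<START>': 1, '<END>': 2, 'kaise': 3, 'ho': 4, 'aap': 5}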
Number of Parameters to Train When There Are a Certain Number of Lines in Input

Number of Lines    Total params    Trainable params    Non-trainable params
55                 65,827          65,827              0
115                72,002          72,002              0
165                77,787          77,787              0
195                80,777          80,777              0
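The growth in the table is easy to sanity-check. Assuming keras-transformer ties the decoder's output projection to the target embedding, each extra vocabulary entry costs one source embedding row, one target embedding row, and one output bias, i.e. roughly 2 × embed_dim + 1 = 65 trainable parameters. The deltas between the rows above are consistent with that:

# Rough sanity check of the table above (assumes a weight-tied output layer).
embed_dim = 32
params_per_token = 2 * embed_dim + 1          # 65
reported = {55: 65827, 115: 72002, 165: 77787, 195: 80777}
rows = sorted(reported.items())
for (lines_a, p_a), (lines_b, p_b) in zip(rows, rows[1:]):
    print(f'{lines_a} -> {lines_b} lines: ~{(p_b - p_a) / params_per_token:.0f} new vocabulary tokens')
# 55 -> 115 lines: ~95 new vocabulary tokens
# 115 -> 165 lines: ~89 new vocabulary tokens
# 165 -> 195 lines: ~46 new vocabulary tokens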
%%time
model.fit(
    x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=10,
    batch_size=32,
)

Training Logs When There Is a Certain Number of Lines in Input

# Number of Lines in Input is 55
Number of Epochs: 10
CPU times: user 11min 31s, sys: 56 s, total: 12min 27s
Wall time: 5min 48s
<keras.callbacks.History at 0x7f8f347f69d0>

# Number of Lines in Input is 115
Number of Epochs: 10
CPU times: user 26min 55s, sys: 2min 7s, total: 29min 2s
Wall time: 13min 33s

# Number of Lines in Input is 150
Number of Epochs: 10
CPU times: user 41min 26s, sys: 3min 12s, total: 44min 39s
Wall time: 21min 1s

# Number of Lines in Input is 195
Number of Epochs: 10
Epoch 1/10
6240/6240 [==============================] - 165s 25ms/step - loss: 0.1641
Epoch 2/10
6240/6240 [==============================] - 163s 26ms/step - loss: 0.0049
Epoch 3/10
6240/6240 [==============================] - 151s 24ms/step - loss: 0.0043
Epoch 4/10
6240/6240 [==============================] - 150s 24ms/step - loss: 0.0038
Epoch 5/10
6240/6240 [==============================] - 150s 24ms/step - loss: 0.0043
Epoch 6/10
6240/6240 [==============================] - 153s 24ms/step - loss: 0.0036
Epoch 7/10
6240/6240 [==============================] - 153s 24ms/step - loss: 0.0036
Epoch 8/10
6240/6240 [==============================] - 151s 24ms/step - loss: 0.0036
Epoch 9/10
6240/6240 [==============================] - 150s 24ms/step - loss: 0.0038
Epoch 10/10
6240/6240 [==============================] - 152s 24ms/step - loss: 0.0037
CPU times: user 51min 23s, sys: 3min 52s, total: 55min 16s
Wall time: 25min 39s
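With every pair replicated 1024 times, the loss flattens out around 0.004 after the first couple of epochs, which suggests the network is largely memorizing the 195 pairs. If you want the run to stop itself once nothing more is being learned, a minimal sketch using a standard Keras EarlyStopping callback could look like the following; this is my addition (not part of the original runs), and it holds out the last 20 pairs before replication so the validation loss reflects unseen sentences.

from keras.callbacks import EarlyStopping

# Hypothetical variant of the fit call above: keep the last 20 sentence pairs
# out of the replicated training set and stop once the held-out loss stops improving.
n_val = 20
model.fit(
    x=[np.array(encode_input[:-n_val] * 1024), np.array(decode_input[:-n_val] * 1024)],
    y=np.array(decode_output[:-n_val] * 1024),
    validation_data=(
        [np.array(encode_input[-n_val:]), np.array(decode_input[-n_val:])],
        np.array(decode_output[-n_val:]),
    ),
    epochs=10,
    batch_size=32,
    callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)],
)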
# Validation
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
for i in decoded:
    print(' '.join(map(lambda x: target_token_dict_inv[x], i[1:-1])))

...
Follow him.
I am tired.
I can swim.
I can swim.
I love you.
# Testing
test_sents = [
    'kaise ho?',
    'kya tum mujhse pyar karte ho?',
    'kya tum mujhe pyar karte ho?'
]
test_tokens = [i.split() for i in test_sents]

# Note: this builds a brand-new dictionary from the test sentences, so the ids
# only line up with the training vocabulary for the three special tokens.
test_token_dict = build_token_dict(test_tokens)
test_token_dict_inv = {v: k for k, v in test_token_dict.items()}

test_enc_tokens = [['<START>'] + tokens + ['<END>'] for tokens in test_tokens]
test_enc_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in test_enc_tokens]
test_input = [list(map(lambda x: test_token_dict[x], tokens)) for tokens in test_enc_tokens]

decoded = decode(
    model,
    test_input,
    start_token=test_token_dict['<START>'],
    end_token=test_token_dict['<END>'],
    pad_token=test_token_dict['<PAD>'],
)
for i in decoded:
    print(' '.join(map(lambda x: target_token_dict_inv[x], i[1:-1])))
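Because build_token_dict assigns ids by order of first appearance, the test ids above do not correspond to the embeddings the model was trained with. A minimal sketch of the lookup I would use instead, reusing the training-time source_token_dict and falling back to <PAD> for words the 195-pair vocabulary has never seen (the fallback choice is mine, not part of the original post):

test_enc_tokens = [['<START>'] + sent.split() + ['<END>'] for sent in test_sents]
test_enc_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens))
                   for tokens in test_enc_tokens]
# Reuse the training vocabulary; unseen words fall back to <PAD>.
test_input = [[source_token_dict.get(token, source_token_dict['<PAD>'])
               for token in tokens] for tokens in test_enc_tokens]

decoded = decode(
    model,
    test_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
for i in decoded:
    print(' '.join(map(lambda x: target_token_dict_inv[x], i[1:-1])))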
Download Code and Data