Language translation is a key service for people who are traveling as well as for people settling in a new country. AI and machine learning-based neural machine translation powers Google Translate, which is used by over one billion people. In this project, a Long Short-Term Memory (LSTM) network will be trained to perform English-to-French translation. In practice, such a model could serve as a communication tool that helps reduce the language barrier between people.
Problem: Build and train a sequence-to-sequence LSTM model that translates English sentences into French.
Dataset: A parallel corpus of English sentences (small_vocab_en.csv) and their French translations (small_vocab_fr.csv), one sentence per line.
Source:
# Import libraries
from collections import Counter
import operator
import plotly.express as px
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, TimeDistributed, RepeatVector, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model
# load the data
df_english = pd.read_csv('small_vocab_en.csv', sep = '\t', names = ['english'], engine = 'python')
df_french = pd.read_csv('small_vocab_fr.csv', sep = '\t', names = ['french'], engine = 'python')
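# Sanity check (illustrative): the two files are line-aligned, i.e. row i of the English frame
# is the translation pair of row i of the French frame, so the row counts should match.
print(len(df_english) == len(df_french))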
# Check english data
df_english.head()
# Check english data info
df_english.info()
# Check french data
df_french.head()
# Check french data info
df_french.info()
# Concatenate the English and French datasets
df = pd.concat([df_english, df_french], axis = 1)
# Check the dataset
df.head()
# Show english records
print('Total English Records = {}'.format(len(df['english'])))
# Show french records
print('Total French Records = {}'.format(len(df['french'])))
# Download nltk packages
nltk.download('punkt')
# Download stopwords
nltk.download("stopwords")
# Function to remove punctuation
def remove_punc(x):
    return re.sub('[!#?,.:";]', '', x)
# Apply function
df['french'] = df['french'].apply(remove_punc)
df['english'] = df['english'].apply(remove_punc)
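# Quick sanity check (the sample string below is illustrative, in the style of the corpus): the
# regex strips the listed punctuation marks but keeps apostrophes, which French contractions
# such as "l'automne" rely on.
print(remove_punc("new jersey is sometimes quiet during autumn , and it is snowy in april ."))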
# Create empty lists
english_words = []
french_words = []
# Create a function that collects the unique words in a dataset
def get_unique_words(x, word_list):
    for word in x.split():
        if word not in word_list:
            word_list.append(word)
# Apply the function
df['english'].apply(lambda x: get_unique_words(x, english_words));
df['french'].apply(lambda x: get_unique_words(x, french_words));
# Check unique words in english dataset
english_words[:10]
# Check unique words in french dataset
french_words[:10]
# Check total unique words in english dataset
total_english_words = len(english_words)
total_english_words
# Check total unique words in french dataset
total_french_words = len(french_words)
total_french_words
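# Equivalent, faster sketch using sets (the variable names below are illustrative): collecting
# the words of every sentence into a set avoids the repeated list-membership checks above and
# should produce the same unique-word totals.
english_vocab_set = set(word for sentence in df['english'] for word in sentence.split())
french_vocab_set = set(word for sentence in df['french'] for word in sentence.split())
print(len(english_vocab_set), len(french_vocab_set))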
# Obtain list of all words in the dataset
words = []
for i in df['english']:
    for word in i.split():
        words.append(word)
# Check data
words[:10]
# Obtain the total count of words
english_words_counts = Counter(words)
# Check data
english_words_counts;
# Output:
# Counter({'new': 12197,
# 'jersey': 11225,
# 'is': 205858,
# 'sometimes': 37746,
# 'quiet': 8693,
# 'during': 74933,
# 'autumn': 9004,
# 'and': 59850,
# 'it': 75137,
# 'snowy': 8898,
# Sort the dictionary by values
english_words_counts = sorted(english_words_counts.items(), key = operator.itemgetter(1), reverse = True)
# Check data counts
english_words_counts;
# Output:
# [('is', 205858),
# ('in', 75525),
# ('it', 75137),
# ('during', 74933),
# ('the', 67628),
# ('but', 63987),
# ('and', 59850),
# ('sometimes', 37746),
# ('usually', 37507),
# ('never', 37500),
# ('favorite', 28332),
# ('least', 27564),
# ('fruit', 27192),
# ('most', 14934),
# ('loved', 14166),
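# Equivalent sketch: collections.Counter can produce the same ranking directly via most_common(),
# which returns (word, count) pairs sorted by descending count, e.g. the ten most frequent words:
print(Counter(words).most_common(10))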
# Append the values to lists for visualization purposes
# Create empty lists
english_words = []
english_counts = []
# Separate the English words and their counts
for i in range(len(english_words_counts)):
    english_words.append(english_words_counts[i][0])
    english_counts.append(english_words_counts[i][1])
# Check data
english_words[:10]
# Check data
english_counts[:10]
# Plot barplot using plotly
fig = px.bar(x = english_words, y = english_counts)
fig.show()
# Plot the word cloud for the English text
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000, width = 1600, height = 800 ).generate(" ".join(df.english))
plt.imshow(wc, interpolation = 'bilinear')
plt.show()
# Check sample data
df.english[0]
# Tokenized sample data
nltk.word_tokenize(df.english[0])
# Maximum length (number of words) per document. We will need it later for embeddings
maxlen_english = -1
for doc in df.english:
    tokens = nltk.word_tokenize(doc)
    if maxlen_english < len(tokens):
        maxlen_english = len(tokens)
print("The maximum number of words in any English document = ", maxlen_english)
# Create empty list
words = []
# Obtain the count of French words
for i in df['french']:
    for word in i.split():
        words.append(word)
words
# Obtain the total count of words
french_words_counts = Counter(words)
# Check data
french_words_counts;
# Output:
# Counter({'new': 11047,
# 'jersey': 11052,
# 'est': 196809,
# 'parfois': 37746,
# 'calme': 7256,
# 'pendant': 10741,
# "l'": 32917,
# 'automne': 14727,
# 'et': 59851,
# 'il': 84079,
# 'neigeux': 1867,
# 'en': 105768,
# 'avril': 8954,
# 'les': 65255,
# Sort the dictionary by values
french_words_counts = sorted(french_words_counts.items(), key = operator.itemgetter(1), reverse = True)
# Check data
french_words_counts;
# Output:
# [('est', 196809),
# ('en', 105768),
# ('il', 84079),
# ('les', 65255),
# ('mais', 63987),
# ('et', 59851),
# ('la', 49861),
# ('parfois', 37746),
# ('jamais', 37215),
# ('le', 35306),
# ("l'", 32917),
# Append the values to lists for visualization purposes
# Create empty lists
french_words = []
french_counts = []
# Separate the French words and their counts
for i in range(len(french_words_counts)):
    french_words.append(french_words_counts[i][0])
    french_counts.append(french_words_counts[i][1])
# Plot barplot using plotly
fig = px.bar(x = french_words, y = french_counts)
fig.show()
# Plot the word cloud for French
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800).generate(" ".join(df.french))
plt.imshow(wc, interpolation = 'bilinear')
plt.show()
# Maximum length (number of words) per document. We will need it later for embeddings
maxlen_french = -1
for doc in df.french:
    tokens = nltk.word_tokenize(doc)
    if maxlen_french < len(tokens):
        maxlen_french = len(tokens)
print("The maximum number of words in any French document = ", maxlen_french)
# Create function
def tokenize_and_pad(x, maxlen):
    # A tokenizer to tokenize the words and create sequences of tokenized words
    tokenizer = Tokenizer(char_level = False)
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    padded = pad_sequences(sequences, maxlen = maxlen, padding = 'post')
    return tokenizer, sequences, padded
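# Tiny illustration of what tokenize_and_pad returns, using two toy sentences (not from the
# dataset): texts_to_sequences maps every word to an integer id from the fitted tokenizer, and
# pad_sequences appends zeros at the end of each sequence (padding = 'post') up to maxlen.
toy_tokenizer, toy_sequences, toy_padded = tokenize_and_pad(["he saw a small truck", "she saw a big yellow truck"], 8)
print(toy_sequences)   # each word replaced by its id from toy_tokenizer.word_index
print(toy_padded)      # the same sequences, padded with trailing zeros to length 8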
# Tokenize and pad the data (both English and French are padded to the French maximum length)
x_tokenizer, x_sequences, x_padded = tokenize_and_pad(df.english, maxlen_french)
y_tokenizer, y_sequences, y_padded = tokenize_and_pad(df.french, maxlen_french)
# Total vocab size: since we added padding, we add 1 to the total word count
english_vocab_size = total_english_words + 1
print("Complete English Vocab Size:", english_vocab_size)
# Total vocab size: since we added padding, we add 1 to the total word count
french_vocab_size = total_french_words + 1
print("Complete French Vocab Size:", french_vocab_size)
# Show tokenized english words
print("The tokenized version for document\n", df.english[-1:].item(),"\n is : ", x_padded[-1:])
# Show tokenized french words
print("The tokenized version for document\n", df.french[-1:].item(),"\n is : ", y_padded[-1:])
# Function to obtain the text from padded variables
def pad_to_text(padded, tokenizer):
    id_to_word = {id: word for word, id in tokenizer.word_index.items()}
    id_to_word[0] = ''
    return ' '.join([id_to_word[j] for j in padded])
# Check texts
pad_to_text(y_padded[0], y_tokenizer)
# Import library
from sklearn.model_selection import train_test_split
# Train test split
x_train, x_test, y_train, y_test = train_test_split(x_padded, y_padded, test_size = 0.1)
# Sequential Model
model = Sequential()
# Embedding layer (input_length matches the maxlen_french padding applied to x_padded above)
model.add(Embedding(english_vocab_size, 256, input_length = maxlen_french, mask_zero = True))
# Encoder
model.add(LSTM(256))
# Decoder
# Repeat vector repeats the input for the desired number of times to change
# 2D-array to 3D array. For example: (1,256) to (1,23,256)
model.add(RepeatVector(maxlen_french))
# LSTM layer
model.add(LSTM(256, return_sequences = True ))
# Dense layer, time distributed
model.add(TimeDistributed(Dense(french_vocab_size, activation = 'softmax')))
# Compile the model
model.compile(optimizer = 'adam', loss ='sparse_categorical_crossentropy', metrics = ['accuracy'])
# Check model summary
model.summary()
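# Sanity check (illustrative): the decoder emits one softmax over the French vocabulary per
# output time step, so the model's output shape should be (None, maxlen_french, french_vocab_size).
print(model.output_shape)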
# Change the shape of the target from 2D to 3D: sparse_categorical_crossentropy expects one
# integer word id per decoder time step, matching the 3D model output
y_train = np.expand_dims(y_train, axis = 2)
# Check data dimension
y_train.shape
# Train the model
model.fit(x_train, y_train, batch_size = 1024, validation_split = 0.1, epochs = 3)
# Save the model
model.save("weights.h5")
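# The saved model can be reloaded later without retraining (a minimal sketch using the standard
# Keras API; the variable name reloaded_model is illustrative).
from tensorflow.keras.models import load_model
reloaded_model = load_model("weights.h5")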
# Function to make prediction
def prediction(x, x_tokenizer = x_tokenizer, y_tokenizer = y_tokenizer):
    predictions = model.predict(x)[0]
    id_to_word = {id: word for word, id in y_tokenizer.word_index.items()}
    id_to_word[0] = ''
    return ' '.join([id_to_word[j] for j in np.argmax(predictions, 1)])
# Show the original English and French sentences, followed by the predicted French translation
for i in range(5):
    print('Original English sentence - {}\n'.format(pad_to_text(x_test[i], x_tokenizer)))
    print('Original French sentence - {}\n'.format(pad_to_text(y_test[i], y_tokenizer)))
    print('Predicted French sentence - {}\n\n\n\n'.format(prediction(x_test[i:i+1])))
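# End-to-end illustration with a new input (the sentence below is drawn from the training
# vocabulary; the variable names are illustrative): tokenize with x_tokenizer, pad the same way
# as the training data, and pass the result through prediction().
new_sentence = ["new jersey is sometimes quiet during autumn"]
new_padded = pad_sequences(x_tokenizer.texts_to_sequences(new_sentence), maxlen = maxlen_french, padding = 'post')
print(prediction(new_padded))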
The model performed poorly because it was trained for only a few epochs, but it still predicted some words correctly. Performance could be further improved by increasing the number of epochs, experimenting with the hyperparameters, and adding more data to the model.