In an increasingly connected world, it is easier than ever for misinformation to spread. Given a dataset of news articles labeled as reliable or unreliable, it is possible to train a model to detect fake news automatically. An AI/machine-learning-based fake news detector is valuable for companies and media outlets that need to predict automatically whether circulating news is fake. In this project, a recurrent neural network (LSTM) is trained to classify news articles as true or fake.
Problem:
Dataset
Source:
Ahmed H., Traore I., Saad S. (2018). “Detecting Opinion Spams and Fake News Using Text Classification”, Journal of Security and Privacy, Volume 1, Issue 1, Wiley, January/February 2018.
Ahmed H., Traore I., Saad S. (2017). “Detection of Online Fake News Using N-Gram Analysis and Machine Learning Techniques”. In: Traore I., Woungang I., Awad A. (eds) Intelligent, Secure, and Dependable Systems in Distributed and Cloud Environments, ISDDC 2017, Lecture Notes in Computer Science, vol 10618, Springer, Cham, pp. 127-138.
# Import libraries
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
# Import Keras components from TensorFlow
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model
# Import theme
# from jupyterthemes import jtplot
# jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)
# load the data
df_true = pd.read_csv("True.csv")
df_fake = pd.read_csv("Fake.csv")
# Check true data
df_true.head()
# Check true data info
df_true.info()
# Check true data for missing values
df_true.isnull().sum()
# Check fake data
df_fake.head()
# Check fake data info
df_fake.info()
# Check fake data for missing values
df_fake.isnull().sum()
# Add a target class column to indicate whether the news is real or fake; with this labeling, 1 denotes real news
df_true['isfake'] = 1
# Check data
df_true.head()
# Add the target class column to the fake news data; 0 denotes fake news
df_fake['isfake'] = 0
# Check data
df_fake.head()
# Concatenate Real and Fake News
df = pd.concat([df_true, df_fake]).reset_index(drop = True)
# Check data
df.head()
# Check data
df.tail()
# Drop column
df.drop(columns = ['date'], inplace = True)
# Combine title and text together
df['original'] = df['title'] + ' ' + df['text']
df.head()
# Check data sample
df['original'][0]
# Check data sample
df['original'][1]
# Check data sample
df['original'][3]
# Download stopwords and the punkt tokenizer models (needed later for nltk.word_tokenize)
nltk.download("stopwords")
nltk.download("punkt")
# Obtain additional stopwords from nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# Check stopwords
stop_words
# Remove stopwords and words with 3 or fewer characters
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3 and token not in stop_words:
            result.append(token)
    return result
# Apply the function to the dataframe
df['clean'] = df['original'].apply(preprocess)
# Check data
df.head()
# Show original news
df['original'][0]
# Show cleaned up news after removing stopwords
print(df['clean'][0])
# Obtain the total words present in the dataset
list_of_words = []
for i in df.clean:
    for j in i:
        list_of_words.append(j)
# Check data
list_of_words[:10]
# Check data length
len(list_of_words)
# Obtain the total number of unique words
total_words = len(list(set(list_of_words)))
# Check data
total_words
# Join the words into a string
df['clean_joined'] = df['clean'].apply(lambda x: " ".join(x))
# Check data
df.head()
# Check data
df['clean_joined'][0]
# Check data
df['clean_joined'][1]
# Check data
df['clean_joined'][2]
# Plot the number of samples in 'subject'
plt.figure(figsize = (8, 8))
sns.countplot(y = "subject", data = df)
plt.show()
# Plot the count plot for fake vs. true news
plt.figure(figsize = (8, 8))
sns.countplot(y = "isfake", data = df)
plt.show()
# plot the word cloud for text that is Real
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800 , stopwords = stop_words).generate(" ".join(df[df.isfake == 1].clean_joined))
plt.imshow(wc, interpolation = 'bilinear')
plt.show()
# plot the word cloud for text that is Fake
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800 , stopwords = stop_words).generate(" ".join(df[df.isfake == 0].clean_joined))
plt.imshow(wc, interpolation = 'bilinear')
plt.show()
# length of maximum document will be needed to create word embeddings
maxlen = -1
for doc in df.clean_joined:
    tokens = nltk.word_tokenize(doc)
    if maxlen < len(tokens):
        maxlen = len(tokens)
print("The maximum number of words in any document is =", maxlen)
# visualize the distribution of number of words in a text
import plotly.express as px
fig = px.histogram(x = [len(nltk.word_tokenize(x)) for x in df.clean_joined], nbins = 100)
fig.show()
# split data into test and train
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.clean_joined, df.isfake, test_size = 0.2)
# Import library
from nltk import word_tokenize
# Create a tokenizer to tokenize the words and create sequences of tokenized words
tokenizer = Tokenizer(num_words = total_words)
tokenizer.fit_on_texts(x_train)
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)
# Check the tokenized format (use the first training document, since the split shuffles the data)
print("The encoding for document\n", x_train.iloc[0], "\n is : ", train_sequences[0])
# Pad the sequences; maxlen could be as large as 4406 (the longest document), but a smaller value of 40 works well based on results
padded_train = pad_sequences(train_sequences,maxlen = 40, padding = 'post', truncating = 'post')
padded_test = pad_sequences(test_sequences,maxlen = 40, truncating = 'post')
for i, doc in enumerate(padded_train[:2]):
    print("The padded encoding for document", i + 1, " is : ", doc)
# Sequential Model
model = Sequential()
# Embedding layer
model.add(Embedding(total_words, output_dim = 128))
# model.add(Embedding(total_words, output_dim = 240))
# Bi-Directional RNN and LSTM
model.add(Bidirectional(LSTM(128)))
# Dense layers
model.add(Dense(128, activation = 'relu'))
model.add(Dense(1,activation= 'sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()
# Check total number of words
total_words
# Transform into array
y_train = np.asarray(y_train)
# Train the model
model.fit(padded_train, y_train, batch_size = 64, validation_split = 0.1, epochs = 2)
# Make prediction
pred = model.predict(padded_test)
# If the predicted value is > 0.5 the article is classified as real (1), otherwise as fake (0)
prediction = []
for i in range(len(pred)):
    if pred[i].item() > 0.5:
        prediction.append(1)
    else:
        prediction.append(0)
# Getting the accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(list(y_test), prediction)
print("Model Accuracy : ", accuracy)
# Get the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(list(y_test), prediction)
plt.figure(figsize = (15, 15))
sns.heatmap(cm, annot = True)
plt.show()
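Beyond overall accuracy, a per-class breakdown of precision and recall can be informative. Below is a minimal sketch using scikit-learn's classification_report, assuming the prediction and y_test variables computed above.
# Per-class precision, recall, and F1 score (0 = fake news, 1 = real news)
from sklearn.metrics import classification_report
print(classification_report(list(y_test), prediction, target_names = ['Fake News', 'Real News']))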
# Category dictionary
category = { 0: 'Fake News', 1 : "Real News"}
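As a final illustration, here is a minimal sketch of how the trained tokenizer, model, and category dictionary could be combined to classify a single unseen article; the sample headline below is purely hypothetical.
# Classify a single, previously unseen article (hypothetical sample text)
sample_text = "Breaking: officials confirm new policy announcement at press conference"
# Apply the same preprocessing, tokenization, and padding used for training
sample_clean = " ".join(preprocess(sample_text))
sample_seq = tokenizer.texts_to_sequences([sample_clean])
sample_pad = pad_sequences(sample_seq, maxlen = 40, padding = 'post', truncating = 'post')
# Predict the probability that the article is real and map it to a label
prob = model.predict(sample_pad)[0][0]
print(category[int(prob > 0.5)], "(probability of being real:", prob, ")")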
It turns out this architecture works well for fake news detection. An advantage of this recurrent neural network is that training and prediction are relatively fast while classification performance remains strong. The same architecture is useful for other natural language classification tasks as well; the model gives good results for toxic comment classification and sentiment analysis.