# Importing modules
import pandas as pd
import os
os.chdir('..')
import numpy as np
import re, nltk, spacy, gensim
import logging
import warnings
warnings.filterwarnings('ignore')
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array
# Store dataset in a Pandas Dataframe
df = pd.read_csv('/content/social.csv')
# Show the Title and Content column
df[['Title', 'Content']]
# Check data dimension
df.shape
# Check for missing values
df.isnull().mean()
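# Optional (a minimal sketch, not in the original workflow): if the check above reports
# missing values, drop rows with an empty 'Title' or 'Content' so the string cleaning
# below does not fail on NaN entries.
if df[['Title', 'Content']].isnull().values.any():
    df = df.dropna(subset=['Title', 'Content']).reset_index(drop=True)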
# Import regular expression library
import re
# Remove punctuation (strip any character that is not a word character or whitespace)
df['Content'] = df['Content'].map(lambda x: re.sub(r'[^\w\s]', '', x))
# Convert the contents to lowercase
df['Content'] = df['Content'].map(lambda x: x.lower())
# Print out the first rows of papers
df['Content'].head()
# Import the wordcloud library
from wordcloud import WordCloud
# Join the different titles together.
long_string = ','.join(list(df['Title'].values))
# Generate the word cloud
wordcloud = WordCloud(background_color = 'white',
                      max_words = 500,
                      contour_width = 3,
                      contour_color = 'steelblue',
                      collocations = False, width=1000, height=400).generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
# Join the different processed Content together.
long_string = ','.join(list(df['Content'].values))
# Generate the word cloud
wordcloud2 = WordCloud(background_color = 'white',
                       max_words = 200,
                       contour_width = 3,
                       contour_color = 'steelblue',
                       collocations = False, width=1000, height=400).generate(long_string)
# Visualize the word cloud
wordcloud2.to_image()
%%time
# Time the preprocessing in this cell (the %%time magic must be the first line of the cell)
# Import libraries
import gensim
from gensim.utils import simple_preprocess
# Split sentences into words
def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc = True)
data = df.Content.values.tolist()
data_words = list(sent_to_words(data))
# Check the data
print(data_words[:1])
# Build the bigram and trigram models (a higher threshold yields fewer phrases)
bigram = gensim.models.Phrases(data_words, min_count = 5, threshold = 100)
trigram = gensim.models.Phrases(bigram[data_words], threshold = 100)
# Faster way to get a sentence formatted as a bigram or trigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
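# Quick check (illustrative only): run the phrase models over the first document to see
# which tokens, if any, get merged into underscore-joined bigrams or trigrams.
print(trigram_mod[bigram_mod[data_words[0]]])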
# NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# stop_words.extend(['data', 'development', 'result', 'analysis', 'model'])
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
# Lemmatization function
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
# Import spacy library
import spacy
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en_core_web_sm', disable = ['parser', 'ner'])
# Lemmatize keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV'])
# Check data
print(data_lemmatized[:1])
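# Illustrative check: compare the token count of the first post before and after
# stop-word removal, bigram detection and lemmatization.
print(len(data_words[0]), 'raw tokens ->', len(data_lemmatized[0]), 'lemmatized tokens')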
# Import library
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View data
print(corpus[:2])
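# Human-readable view of the bag-of-words for the first document (token, count).
# This is only an inspection aid, not required for training the model.
print([[(id2word[word_id], freq) for word_id, freq in doc] for doc in corpus[:1]])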
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                            id2word = id2word,
                                            num_topics = 5,
                                            random_state = 123, # For reproducibility
                                            chunksize = 100,
                                            passes = 10,
                                            alpha = 0.01,
                                            eta = 'auto',
                                            iterations = 400,
                                            per_word_topics = True)
# Import library
from pprint import pprint
# Print the keywords for each of the 5 topics
pprint(lda_model.print_topics())
# Transform corpus
doc_lda = lda_model[corpus]
# Import library
from gensim.models import CoherenceModel
# Compute the per-word log perplexity bound (lower is better)
print('\nLog perplexity: ', lda_model.log_perplexity(corpus))
# Compute Coherence Score (c_v)
coherence_model_lda = CoherenceModel(model = lda_model, texts = data_lemmatized, dictionary = id2word, coherence = 'c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score c_v: ', coherence_lda)
# Compute Coherence Score using UMass
coherence_model_lda = CoherenceModel(model = lda_model, texts = data_lemmatized, dictionary = id2word, coherence = 'u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score u_mass: ', coherence_lda)
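# A minimal sketch (not part of the original run) of how the number of topics could be tuned:
# fit a smaller LDA model for a few candidate values of k and compare their c_v coherence.
# The candidate range and passes=5 are arbitrary example choices.
for k in range(2, 11, 2):
    candidate = gensim.models.ldamodel.LdaModel(corpus = corpus, id2word = id2word,
                                                num_topics = k, random_state = 123, passes = 5)
    cm = CoherenceModel(model = candidate, texts = data_lemmatized,
                        dictionary = id2word, coherence = 'c_v')
    print('num_topics =', k, ' c_v coherence =', round(cm.get_coherence(), 4))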
# Install library
!pip install -U pyLDAvis
# Import library (in pyLDAvis >= 3.0 the gensim helper moved to pyLDAvis.gensim_models)
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Enable notebook rendering
pyLDAvis.enable_notebook()
# Prepare the interactive topic visualization
vis = gensimvis.prepare(lda_model, corpus, id2word)
# Show visualization
vis
# Store content column
tc = df[['Content']]
# Get the probability of each document belonging to each topic
all_topics = lda_model.get_document_topics(corpus, minimum_probability = 0.0)
all_topics_csr = gensim.matutils.corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()
all_topics_df = pd.DataFrame(all_topics_numpy)
# Content and topic probabilities
result = pd.concat([tc, all_topics_df], axis = 1, sort = False)
# Check data columns
result.columns
# Rename columns
result.columns = ['Content', 'Food', 'Health', 'Influence', 'Negative Sentiment', 'Location']
# Check data
result.head(10)
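# Cross-check (illustrative): the dominant topic per post can also be read straight from the
# probability columns with idxmax; the label names are the ones assigned above.
result['Dominant_Label'] = result[['Food', 'Health', 'Influence',
                                   'Negative Sentiment', 'Location']].idxmax(axis=1)
result['Dominant_Label'].value_counts()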
# The function below aggregates the dominant topic per document into a presentable table.
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Collect one row per document
    rows = []
    # Get the main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        # With per_word_topics=True the model returns (doc_topics, word_topics, phi_values)
        row = row_list[0] if ldamodel.per_word_topics else row_list
        row = sorted(row, key=lambda x: x[1], reverse=True)
        # Get the dominant topic, its contribution and its keywords for this document
        topic_num, prop_topic = row[0]
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        rows.append([int(topic_num), round(prop_topic, 4), topic_keywords])
    sent_topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    # Add the original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# Show
df_dominant_topic.head(10)
# Install library
!pip install heatmapz
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show the correlation between the topic probability columns using a heatmap
from heatmap import corrplot
plt.figure(figsize=(12, 8))
corrplot(result.corr(numeric_only=True), size_scale=300)
# Plot the number of Topic in dataset
plt.figure(figsize = (8, 8))
sns.countplot(y = "Dominant_Topic", data = df_dominant_topic)
plt.show()
# Channel column
channel_name = df['Channel Name']
# Get the channel value counts
unique, counts = np.unique(channel_name, return_counts = True)
# Channel counts
channel_counts = dict(zip(unique, counts))
# Transform into dataframe
channel_freq = pd.DataFrame(list(channel_counts.items()),columns = ['Channel_Name','Channel_Count'])
# Sort dataframe according to frequency
channel_freq = channel_freq.sort_values(by=['Channel_Count'], ascending = False )
# Add Post Type according to their respective indices
channel_freq['Post_type'] = df['Post Type']
# Add channel Type according to their respective indices
channel_freq['Channel_Type'] = df['Channel Site Type']
# Re-arrange columns
channel_freq = channel_freq[['Channel_Count', 'Channel_Name', 'Channel_Type', 'Post_type']]
# Check the top influential channels
channel_freq.head(10)
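# Alternative mapping (a hedged sketch): the index-based assignment above pairs the sorted
# channel counts with 'Post Type' and 'Channel Site Type' rows positionally; a per-channel
# lookup of the most frequent value is usually safer. Column names assume the same dataset.
post_type_by_channel = df.groupby('Channel Name')['Post Type'].agg(
    lambda s: s.mode().iloc[0] if not s.mode().empty else None)
channel_freq['Post_type'] = channel_freq['Channel_Name'].map(post_type_by_channel)
channel_freq.head(10)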
# Dataframe containing content and topic
DF = df_dominant_topic[['Text', 'Dominant_Topic']].copy()
# Check dataframe
DF.head()
# Download stopwords
nltk.download("stopwords")
# Obtain additional stopwords from nltk
from nltk.corpus import stopwords
# Stop words for english
stop_words = stopwords.words('english')
# Add stop words
stop_words.extend(['http', 'www', 'com'])
# Remove stopwords and drop words with three or fewer characters
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3 and token not in stop_words:
            result.append(token)
    return result
# Apply the function to the dataframe
DF['Clean'] = DF['Text'].apply(preprocess)
# Check data
DF.head()
# Obtain the total words present in the dataset
list_of_words = []
for i in DF.Clean:
    for j in i:
        list_of_words.append(j)
# Check data
list_of_words[:10]
# Check data length
len(list_of_words)
# Obtain the total number of unique words
total_words = len(list(set(list_of_words)))
# Check data
total_words
# Join the words into a string
DF['Clean_Joined'] = DF['Clean'].apply(lambda x: " ".join(x))
# Check data
DF.head()
# Import library
import nltk
nltk.download('punkt')
# The length of the longest content will be needed to create the word embeddings
maxlen = -1
for doc in DF.Clean_Joined:
    tokens = nltk.word_tokenize(doc)
    if maxlen < len(tokens):
        maxlen = len(tokens)
print("The maximum number of words in any document is =", maxlen)
# Visualize the distribution of the number of words per content
sns.histplot([len(nltk.word_tokenize(x)) for x in DF.Clean_Joined], bins = 100)
plt.show()
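# Illustrative complement to the histogram above: summary statistics of document lengths.
doc_lengths = [len(nltk.word_tokenize(x)) for x in DF.Clean_Joined]
print('mean:', np.mean(doc_lengths), ' median:', np.median(doc_lengths), ' max:', max(doc_lengths))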
# Import library
from sklearn.model_selection import train_test_split
# Split data into test and train
x_train, x_test, y_train, y_test = train_test_split(DF.Clean_Joined, DF.Dominant_Topic, test_size = 0.2)
# Import libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk import word_tokenize
# Create a tokenizer to tokenize the words and create sequences of tokenized words
tokenizer = Tokenizer(num_words = total_words)
tokenizer.fit_on_texts(x_train)
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)
# Check the tokenized format (the first training text and its encoded sequence)
print("The encoding for content\n", x_train.iloc[0], "\n is : ", train_sequences[0])
# Import the padding helper
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad/truncate all sequences to the maximum document length found above
padded_train = pad_sequences(train_sequences, maxlen = maxlen, padding = 'post', truncating = 'post')
padded_test = pad_sequences(test_sequences, maxlen = maxlen, truncating = 'post')
for i, doc in enumerate(padded_train[:2]):
    print("The padded encoding for content", i + 1, " is : ", doc)
# Transform into array
y_train = np.asarray(y_train)
# Import the Keras model and layer classes
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
# Sequential model
model = Sequential()
# Embedding layer
model.add(Embedding(total_words, output_dim = 128))
# Bi-directional LSTM layer
model.add(Bidirectional(LSTM(128)))
# Dense layers
model.add(Dense(128, activation = 'relu'))
model.add(Dense(5, activation = 'softmax'))
# Compile the model; the labels are integer topic ids, so use sparse categorical cross-entropy
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['acc'])
# Model summary
model.summary()
# Train the model
model.fit(padded_train, y_train, batch_size = 64, validation_split = 0.2, epochs = 1)
# Make predictions on the test set (softmax probabilities per topic)
prediction = model.predict(padded_test)
# Convert the probabilities to predicted topic labels
predicted_labels = np.argmax(prediction, axis = 1)
# Import library
from sklearn.metrics import accuracy_score
# Check accuracy
accuracy = accuracy_score(list(y_test), predicted_labels)
# Show accuracy
print("Model Accuracy : ", accuracy)