Over the last few decades, with the rise of YouTube, Amazon, Netflix and many other such web services, recommender systems have taken an increasingly important place in our lives. From e-commerce (suggesting articles that might interest a buyer) to online advertising (showing users content that matches their preferences), recommender systems are today unavoidable in our daily online journeys.
In very general terms, recommender systems are algorithms that suggest relevant items to users, where an item may be a movie to watch, a text to read, a product to buy or anything else depending on the industry.
Recommender systems are critical in some industries: an effective one can generate a huge amount of income and can be a way to stand out significantly from competitors. As proof of their importance, Netflix organised a challenge a few years ago (the "Netflix Prize") in which the goal was to produce a recommender system that performed better than its own algorithm, with a prize of one million dollars to win.
In this project, different types of recommender systems are implemented to produce good movie and book recommendations for users. For each of them, we describe the theoretical idea behind it and observe how it works in practice.
Dataset:
# Import libraries
import pandas as pd
import numpy as np
# Import datasets
movies_df = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")
# Check movies data
movies_df.head()
# Check credits data
credits.head()
# Check dataset dimensions
print("Credits:", credits.shape)
print("Movies:", movies_df.shape)
# Rename column
credits_column_renamed = credits.rename(index = str, columns = {"movie_id": "id"})
# Merge the credits data into the movies dataframe on id
movies_df_merge = movies_df.merge(credits_column_renamed, on = 'id')
# Check data
movies_df_merge.head()
# Drop unnecessary columns
movies_cleaned_df = movies_df_merge.drop(columns = ['homepage', 'title_x', 'title_y', 'status','production_countries'])
# Check data
movies_cleaned_df.head()
# Check data info
movies_cleaned_df.info()
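# The ranking below uses the IMDB weighted-rating formula:
#   Weighted Rating (WR) = (v / (v + m)) * R + (m / (v + m)) * C
# where v is the movie's vote count, m is the minimum vote count required to be listed
# (taken here as the 70th percentile of vote counts), R is the movie's average rating
# and C is the mean rating across all movies.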
# Calculate all the components based on the above formula
v = movies_cleaned_df['vote_count']
R = movies_cleaned_df['vote_average']
C = movies_cleaned_df['vote_average'].mean()
m = movies_cleaned_df['vote_count'].quantile(0.70)
# Add column of weighted average
movies_cleaned_df['weighted_average'] = ((R * v)+ (C * m)) / (v + m)
# Check data
movies_cleaned_df.head()
# Sort data according to weighted averages
movie_sorted_ranking = movies_cleaned_df.sort_values('weighted_average',ascending = False)
# Get only the necessary columns
movie_sorted_ranking[['original_title', 'vote_count', 'vote_average', 'weighted_average', 'popularity']].head(20)
# Import library
import matplotlib.pyplot as plt
import seaborn as sns
# Sort the dataset by weighted average, from highest to lowest
weight_average = movie_sorted_ranking.sort_values('weighted_average', ascending = False)
# Create barplot
plt.figure(figsize = (12,6))
axis1 = sns.barplot(x = weight_average['weighted_average'].head(10), y = weight_average['original_title'].head(10), data = weight_average)
plt.title('Best Movies by average votes', weight = 'bold')
plt.xlabel('Weighted Average Score', weight = 'bold')
plt.ylabel('Movie Title', weight = 'bold')
plt.xlim(4, 10)
plt.show()
# Sorted dataset according to popularity
popularity = movie_sorted_ranking.sort_values('popularity',ascending = False)
# Check data
popularity.head()
# Create barplot
plt.figure(figsize = (12,6))
ax = sns.barplot(x = popularity['popularity'].head(10), y = popularity['original_title'].head(10), data = popularity)
plt.title('Most Popular by Votes', weight = 'bold')
plt.xlabel('Score of Popularity', weight = 'bold')
plt.ylabel('Movie Title', weight = 'bold')
plt.show()
# Recommendation based on the scaled weighted average and popularity score (each given 50% weight)
# Import library
from sklearn.preprocessing import MinMaxScaler
# Scale the data
scaling = MinMaxScaler()
movie_scaled_df = scaling.fit_transform(movies_cleaned_df[['weighted_average','popularity']])
# Create dataframe
movie_normalized_df = pd.DataFrame(movie_scaled_df,columns = ['weighted_average','popularity'])
# Check data
movie_normalized_df.head()
# Insert new columns
movies_cleaned_df[['normalized_weight_average','normalized_popularity']]= movie_normalized_df
# Check data
movies_cleaned_df.head()
# Create new column
movies_cleaned_df['score'] = (movies_cleaned_df['normalized_weight_average'] * 0.5) + (movies_cleaned_df['normalized_popularity'] * 0.5)
# Sort data according to score
movies_scored_df = movies_cleaned_df.sort_values(['score'], ascending = False)
# Check data
movies_scored_df[['original_title', 'normalized_weight_average', 'normalized_popularity', 'score']].head(20)
# Sorted data
scored_df = movies_cleaned_df.sort_values('score', ascending = False)
# Create barplot
plt.figure(figsize=(16,6))
ax = sns.barplot(x=scored_df['score'].head(10), y = scored_df['original_title'].head(10), data=scored_df, palette='deep')
plt.title('Best Rated & Most Popular Blend', weight = 'bold')
plt.xlabel('Score', weight = 'bold')
plt.ylabel('Movie Title', weight = 'bold')
plt.show()
# Content-based recommender: reuse the cleaned TMDB dataset (movies_cleaned_df) prepared above
# Check data
movies_cleaned_df.head(1)['overview']
# Import library
from sklearn.feature_extraction.text import TfidfVectorizer
# Apply vectorizer
tfv = TfidfVectorizer(min_df = 3, max_features = None,
                      strip_accents = 'unicode', analyzer = 'word',
                      token_pattern = r'\w{1,}', # Tokenize on word characters, dropping punctuation such as !@#$%^&*()
                      ngram_range = (1, 3),      # Use combinations of 1 to 3 words
                      stop_words = 'english')    # Remove common English stop words
# Fill NaNs with empty string
movies_cleaned_df['overview'] = movies_cleaned_df['overview'].fillna('')
# Fit the TF-IDF on the 'overview' text
tfv_matrix = tfv.fit_transform(movies_cleaned_df['overview'])
# Sparse matrix
tfv_matrix
# Check data dimension
tfv_matrix.shape
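# A quick toy illustration (made-up example sentences, not from the dataset) of what the
# TF-IDF matrix holds: each text becomes a sparse row of term weights over the learned
# vocabulary. min_df is lowered to 1 here only so the two toy texts produce features.
toy_tfv = TfidfVectorizer(min_df = 1, stop_words = 'english')
toy_matrix = toy_tfv.fit_transform(["a dark knight rises", "the dark knight returns to the city"])
print(toy_tfv.get_feature_names_out())  # vocabulary learned from the toy texts (scikit-learn >= 1.0)
print(toy_matrix.toarray())             # TF-IDF weight of each term in each text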
# Import library
from sklearn.metrics.pairwise import sigmoid_kernel
# Compute the sigmoid kernel between all pairs of overview vectors: sig[i, j] = tanh(gamma * <x_i, x_j> + coef0), used here as a similarity score between movies i and j
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
# Check data relation score
sig[0]
# Reverse mapping of indices and movie titles
indices = pd.Series(movies_cleaned_df.index, index = movies_cleaned_df['original_title']).drop_duplicates()
# Check data
indices
# Check data
indices['Newlyweds']
# Check data relation score
sig[4799]
# Show the first 10 indices and their relation scores for 'Newlyweds'
list(enumerate(sig[indices['Newlyweds']][:10]))
# Sort those 10 scores in descending order
sorted(list(enumerate(sig[indices['Newlyweds']][:10])), key = lambda x: x[1], reverse = True)
# Recommender function
def give_rec(title, sig = sig):
    # Get the index corresponding to original_title
    idx = indices[title]
    # Get the pairwise similarity scores
    sig_scores = list(enumerate(sig[idx]))
    # Sort the movies by similarity score
    sig_scores = sorted(sig_scores, key = lambda x: x[1], reverse = True)
    # Scores of the 10 most similar movies (position 0 is the movie itself, so it is skipped)
    sig_scores = sig_scores[1:11]
    # Movie indices
    movie_indices = [i[0] for i in sig_scores]
    # Top 10 most similar movies
    return movies_cleaned_df['original_title'].iloc[movie_indices]
# Test content-based recommendation system with the seminal film The Dark Knight Rises
give_rec('The Dark Knight Rises')
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Import data (error_bad_lines was removed in pandas 2.0; on newer versions use on_bad_lines = 'skip' instead)
books = pd.read_csv('BX-Books.csv', sep = ';', error_bad_lines = False, encoding = "latin-1")
users = pd.read_csv('BX-Users.csv', sep = ';', error_bad_lines = False, encoding = "latin-1")
ratings = pd.read_csv('BX-Book-Ratings.csv', sep = ';', error_bad_lines = False, encoding = "latin-1")
# Books columns
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
# Users columns
users.columns = ['userID', 'Location', 'Age']
# Ratings columns
ratings.columns = ['userID', 'ISBN', 'bookRating']
# Check dataset dimensions
books.shape, users.shape, ratings.shape
# Create bar plot for ratings distribution
plt.rc("font", size=15)
ratings['bookRating'].value_counts(sort = False).plot(kind ='bar')
plt.title('Rating Distribution\n')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.savefig('system1.png', bbox_inches='tight')
plt.show()
# Create bar plot for age distribution
users['Age'].hist(bins=[0, 10, 20, 30, 40, 50, 100])
plt.title('Age Distribution\n')
plt.xlabel('Age')
plt.ylabel('Count')
plt.savefig('system2.png', bbox_inches='tight')
plt.show()
# Count ratings per user
counts1 = ratings['userID'].value_counts()
# Count ratings per book
counts = ratings['ISBN'].value_counts()
# To keep the analysis statistically meaningful, users with fewer than 200 ratings and books with fewer than 100 ratings are excluded.
ratings = ratings[ratings['userID'].isin(counts1[counts1 >= 200].index)]
ratings = ratings[ratings['ISBN'].isin(counts[counts >= 100].index)]
# kNN is used here to find books that are similar to one another based on common user ratings.
# Ratings are arranged in a matrix with one row per item (book) and one column per user,
# and a book's nearest neighbours are the books whose rating vectors are closest under the chosen metric.
# A rating prediction could be made from the average rating of the top-k neighbours; below, the neighbours themselves are shown as recommendations.
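# A minimal sketch (toy numbers, not from the dataset) of the item-based kNN idea used below:
# each book is represented by its vector of user ratings, and books whose rating vectors
# point in similar directions have a small cosine distance.
from sklearn.neighbors import NearestNeighbors
toy_ratings = np.array([[5, 0, 3, 0],   # book A, rated by users 1 and 3
                        [4, 0, 3, 1],   # book B, rated much like book A
                        [0, 5, 0, 4]])  # book C, rated by a different set of users
toy_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute').fit(toy_ratings)
toy_dist, toy_idx = toy_knn.kneighbors(toy_ratings[[0]], n_neighbors = 2)
print(toy_idx, toy_dist)  # book A's nearest neighbour (after itself) is book B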
# Merge data based on ISBN
combine_book_rating = pd.merge(ratings, books, on = 'ISBN')
# Data columns to be dropped
columns = ['yearOfPublication', 'publisher', 'bookAuthor', 'imageUrlS', 'imageUrlM', 'imageUrlL']
# Drop columns
combine_book_rating = combine_book_rating.drop(columns, axis = 1)
# Check data
combine_book_rating.head()
# Drop missing values
combine_book_rating = combine_book_rating.dropna(axis = 0, subset = ['bookTitle'])
# Group by book titles and create a new column for total rating count.
book_ratingCount = (combine_book_rating.
groupby(by = ['bookTitle'])['bookRating'].
count().
reset_index().
rename(columns = {'bookRating': 'totalRatingCount'})
[['bookTitle', 'totalRatingCount']]
)
# Check data
book_ratingCount.head()
# Combine the rating data with the total rating count data
# This gives exactly what we need to find out which books are popular and filter out lesser-known books
rating_with_totalRatingCount = combine_book_rating.merge(book_ratingCount, left_on = 'bookTitle', right_on = 'bookTitle', how = 'left')
# Check data
rating_with_totalRatingCount.head()
# Display floats with 3 decimal places
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Check data statistics
print(book_ratingCount['totalRatingCount'].describe())
# The median book has been rated only once. Let’s look at the top of the distribution
print(book_ratingCount['totalRatingCount'].quantile(np.arange(.9, 1, .01)))
# Set threshold
popularity_threshold = 50
# Satisfy threshold
rating_popular_book = rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
# Check data
rating_popular_book.head()
# Check data dimension
rating_popular_book.shape
# Merge with the users data on userID
combined = rating_popular_book.merge(users, left_on = 'userID', right_on = 'userID', how = 'left')
# Filter to users in US and Canada only
us_canada_user_rating = combined[combined['Location'].str.contains("usa|canada")]
# Drop column
us_canada_user_rating = us_canada_user_rating.drop('Age', axis=1)
# Check data
us_canada_user_rating.head()
# Import library
from scipy.sparse import csr_matrix
# Drop duplicates
us_canada_user_rating = us_canada_user_rating.drop_duplicates(['userID', 'bookTitle'])
# Pivot dataframe
us_canada_user_rating_pivot = us_canada_user_rating.pivot(index = 'bookTitle', columns = 'userID', values = 'bookRating').fillna(0)
# Create sparse matrix
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.values)
# Check dataframe
us_canada_user_rating_pivot
# Import library
from sklearn.neighbors import NearestNeighbors
# Use cosine distance as the neighbour metric
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
# Fit the model
model_knn.fit(us_canada_user_rating_matrix)
# Get a random sample
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])
# Show random sample
print('Index:', query_index, '=> Book:', us_canada_user_rating_pivot.index[query_index])
# Set distances and indices
distances, indices = model_knn.kneighbors(us_canada_user_rating_pivot.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)
# Print the recommendations (nearest neighbours) for the sampled book
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(us_canada_user_rating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}'.format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))