Almost all bank marketing teams are leveraging artificial intelligence or machine learning to launch marketing ad campaigns that are tailored to specific groups of customers. In order for a campaign to be successful, they need to know their target market and take advantage of that information. This process is known as "Marketing Segmentation", and it is crucial for maximizing the marketing campaign conversion rate. In this project, an unsupervised machine learning algorithm will be implemented to distinguish the different customer classes.
Problem:
Dataset:
Source: Kaggle Competition
# Import library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# from jupyterthemes import jtplot
# jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)
# Load the raw customer data from the CSV file
creditcard_df = pd.read_csv('Marketing_data.csv')
# Peek at the first rows
creditcard_df.head()
# Summary statistics for every column
creditcard_df.describe(include='all')
# Column dtypes and non-null counts
creditcard_df.info()
# Boolean mask marking every missing cell
missing_mask = creditcard_df.isnull()
# Fraction of missing values per column
missing_mask.mean()
# Heatmap of the mask to see where the missing values sit
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap="Blues")
plt.show()
# Fill the missing entries of 'MINIMUM_PAYMENTS' and 'CREDIT_LIMIT' with the
# mean of their own column (the mean is computed before filling, so it is
# based on the observed values only)
for column in ('MINIMUM_PAYMENTS', 'CREDIT_LIMIT'):
    creditcard_df[column] = creditcard_df[column].fillna(creditcard_df[column].mean())
# Count the remaining missing values per column
creditcard_df.isnull().sum()
# Check for duplicated entries in the data
creditcard_df.duplicated().sum()
# Remove the customer identifier column in place
creditcard_df.drop(columns='CUST_ID', inplace=True)
# Peek at the data again
creditcard_df.head()
# Number of remaining columns
n = creditcard_df.shape[1]
n
# Names of the remaining columns
creditcard_df.columns
# Distplot combines the matplotlib.hist function with seaborn kdeplot()
# KDE Plot represents the Kernel Density Estimate
# KDE is used for visualizing the Probability Density of a continuous variable.
# KDE demonstrates the probability density at different values in a continuous variable.
# Mean of balance is $1500
# 'Balance_Frequency' for most customers is updated frequently ~1
# For 'PURCHASES_FREQUENCY', there are two distinct groups of customers
# For 'ONEOFF_PURCHASES_FREQUENCY' and 'PURCHASES_INSTALLMENTS_FREQUENCY', most users don't make one-off purchases or installment purchases frequently
# Very small number of customers pay their balance in full 'PRC_FULL_PAYMENT'~0
# Credit limit average is around $4500
# Most customers are ~11 years tenure
# Create KDE plot: one histogram + KDE subplot per column, stacked vertically
feature_names = creditcard_df.columns
plt.figure(figsize=(10, 50))
for position, feature in enumerate(feature_names, start=1):
    # Size the subplot grid from the data instead of hard-coding 17 rows
    plt.subplot(len(feature_names), 1, position)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here so the figure stays the same - consider migrating to histplot
    sns.distplot(creditcard_df[feature], kde_kws={"color": "b", "lw": 3, "label": "KDE"}, hist_kws={"color": "g"})
    plt.title(feature)
# Adjust the spacing once, after every subplot has been drawn
plt.tight_layout()
plt.show()
# Import library
from heatmap import corrplot
# Pairwise correlation between the columns, drawn as a heatmap
correlation_matrix = creditcard_df.corr()
plt.figure(figsize=(12, 8))
corrplot(correlation_matrix, size_scale=300)
# Standardize the columns: learn the scaling parameters, then apply them
scaler = StandardScaler().fit(creditcard_df)
creditcard_df_scaled = scaler.transform(creditcard_df)
# Dimensions of the scaled array
creditcard_df_scaled.shape
# Scaled values (the trailing semicolon suppresses the notebook output)
creditcard_df_scaled;
# Index(['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES', 'ONEOFF_PURCHASES',
# 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'PURCHASES_FREQUENCY',
# 'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
# 'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
# 'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
# 'TENURE'], dtype='object')
# Elbow method: fit KMeans for each candidate number of clusters and record
# the inertia_ of every fitted model
scores_1 = []
range_values = range(1, 20)
for i in range_values:
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(creditcard_df_scaled)
    scores_1.append(kmeans.inertia_)
# Plot against the actual cluster counts; plotting scores_1 alone would put
# the 0-based list index on the x-axis and shift the elbow by one
plt.plot(range_values, scores_1, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()
# From this we can observe that the elbow of the curve seems to form at around 4 clusters.
# However, the values do not reduce linearly until the 8th cluster,
# so 7 or 8 clusters would also be reasonable choices. The analysis below uses 4 clusters.
# Apply KMeans with 4 clusters; the fixed random_state makes the cluster
# numbering reproducible between runs
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(creditcard_df_scaled)
# Labels (cluster) associated to each data point
labels = kmeans.labels_
# Check the shape of the centroid array
kmeans.cluster_centers_.shape
# Centroids in standardized units; pass the column Index directly (wrapping
# it in a list would create MultiIndex column labels)
cluster_centers = pd.DataFrame(data=kmeans.cluster_centers_, columns=creditcard_df.columns)
# Check data
cluster_centers
# In order to understand what these numbers mean, perform inverse transformation
# back to the original units of each column
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_centers = pd.DataFrame(data=cluster_centers, columns=creditcard_df.columns)
cluster_centers
# First Customers cluster (Transactors):
# Those are customers who pay the least amount of interest charges and are careful with their money,
# Cluster with lowest balance ($104) and cash advance ($303), Percentage of full payment = 23%.
# Second customers cluster (revolvers):
# People who use credit card as a loan (most lucrative sector)
# with highest balance ($5000) and cash advance (~$5000), low purchase frequency, high cash advance frequency (0.5),
# high cash advance transactions (16) and low percentage of full payment (3%).
# Third customer cluster (VIP/Prime):
# People who have high credit limit $16K and highest percentage of full payment,
# target for increase credit limit and increase spending habits.
# Fourth customer cluster (low tenure):
# These are customers with low tenure (7 years), low balance.
# Labels associated to each data point
labels.shape
# Check minimum and maximum label
labels.min(), labels.max()
# Assign every row to its cluster with the already-fitted model; calling
# fit_predict here would refit KMeans and could produce a different
# clustering than the one behind `labels` and `cluster_centers`
y_kmeans = kmeans.predict(creditcard_df_scaled)
y_kmeans
# Add the cluster labels to a copy of the original dataframe; assign() places
# them row by row, without depending on index alignment
creditcard_df_cluster = creditcard_df.assign(cluster=labels)
creditcard_df_cluster.head()
# Plot the histogram of every column, one subplot per cluster
# Take the cluster ids from the data so the grid follows the number of
# clusters actually present instead of a hard-coded 4
cluster_ids = sorted(creditcard_df_cluster['cluster'].unique())
for i in creditcard_df.columns:
    plt.figure(figsize=(35, 5))
    for position, j in enumerate(cluster_ids, start=1):
        plt.subplot(1, len(cluster_ids), position)
        # Rows that belong to cluster j
        cluster = creditcard_df_cluster[creditcard_df_cluster['cluster'] == j]
        cluster[i].hist(bins=20)
        plt.title('{} \nCluster {} '.format(i, j))
    # Render the figure for this column once all of its subplots are drawn
    plt.show()
# Reduce the scaled data to its first two principal components
pca = PCA(n_components=2)
principal_comp = pca.fit_transform(creditcard_df_scaled)
# Inspect the projected values
principal_comp
# Wrap the two components in a dataframe
pca_df = pd.DataFrame(principal_comp, columns=['pca1', 'pca2'])
pca_df.head()
# Add the cluster label of every row as a third column
pca_df = pca_df.assign(cluster=labels)
pca_df.head()
# Scatter plot of the two components, coloured by cluster
plt.figure(figsize=(20, 10))
cluster_palette = ['orange', 'gray', 'yellow', 'blue']
ax = sns.scatterplot(data=pca_df, x="pca1", y="pca2", hue="cluster", palette=cluster_palette, alpha=0.4)
plt.show()
First Customers cluster (Transactors):
Second customers cluster (revolvers):
Third customer cluster (VIP/Prime):
Fourth customer cluster (low tenure):