Customer churn occurs when subscribers or customers stop doing business with a company or service. A business typically treats a customer as churned once a specific amount of time has passed since the customer's last interaction with the business or service.
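For example, if a business considers a customer churned after 90 days of inactivity, the label could be derived roughly like the sketch below. The table, column names, and the 90-day cutoff are hypothetical; the dataset used in this project already ships with a ready-made Churn column.
# Minimal sketch (hypothetical data): derive a churn flag from days since last activity
import pandas as pd
activity = pd.DataFrame({
    'customer_id': [1, 2, 3],
    'last_activity_date': pd.to_datetime(['2020-01-05', '2020-03-20', '2020-04-01'])
})
cutoff = pd.Timestamp('2020-04-30')
days_inactive = (cutoff - activity['last_activity_date']).dt.days
# Treat anyone inactive for more than 90 days (example threshold) as churned
activity['churned'] = days_inactive > 90
print(activity)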
Retaining customers is obviously important for companies, because it boosts the company's revenue and helps the company build a meaningful relationship with its customers. What might be less obvious is that customer retention is actually more valuable than customer acquisition, and there is a lot of data to back this claim.
Problem: predict which customers are likely to churn, based on their account and service information, so the company can act before they leave.
Dataset: project_data/churn.csv
Source: IBM Sample Data Sets
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Import the dataset
df = pd.read_csv('project_data/churn.csv')
# Check data
df.head()
# Show the numbers of rows and columns
df.shape
# Show all the columns
df.columns.values
# Check for missing data
df.isnull().mean()
# Check data statistics
df.describe(include = 'all')
# Check customer churn count
df['Churn'].value_counts()
# Visualize the count of customer churn
sns.countplot(x = 'Churn', data = df)
plt.show()
# Check the percentage of customers that are leaving
num_retained = df[df['Churn'] == 'No'].shape[0]
num_churned = df[df['Churn'] == 'Yes'].shape[0]
# Print the percentage of customers that stayed
print(num_retained / (num_retained + num_churned) * 100, '% of customers stayed with the company')
# Print the percentage of customers that left
print(num_churned / (num_retained + num_churned) * 100, '% of customers left the company')
# Visualize the churn count by gender
sns.countplot(x = 'gender', hue = 'Churn', data = df)
plt.show()
# Visualize the churn count for internet service
sns.countplot(x = 'InternetService', hue = 'Churn', data = df)
plt.show()
# Select the numerical features to compare for churned vs. retained customers
numerical_features = ['tenure', 'MonthlyCharges']
# Create subplots
fig, ax = plt.subplots(1, 2, figsize = (28, 8))
df[df['Churn'] == 'No'][numerical_features].hist(bins = 20, color = 'blue', alpha = 0.5, ax= ax)
df[df['Churn'] == 'Yes'][numerical_features].hist(bins = 20, color = 'orange', alpha = 0.5, ax= ax)
plt.show()
# Remove the unnecessary columns
cleaned_df = df.drop('customerID', axis = 1)
# Check the numbers of rows and columns in the dataset
cleaned_df.shape
# Convert all the non-numeric columns to numeric
# Import library
from feature_engine import categorical_encoders as ce
# Set up the encoder
encoder = ce.OrdinalCategoricalEncoder(encoding_method='arbitrary', variables = None)
# fit the encoder
encoder.fit(cleaned_df)
# transform the data
cleaned_df = encoder.transform(cleaned_df)
# Check data type
cleaned_df.dtypes
cleaned_df
# Check correlation in the features by using heatmap
from heatmap import corrplot
plt.figure(figsize=(8, 8))
corrplot(cleaned_df.corr(), size_scale=300)
# Label the independent and dependent variables
x = cleaned_df.drop('Churn', axis = 1)
y = cleaned_df['Churn']
# Scale the data
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(x)
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Create Machine Learning models
def models(X_train, y_train):
    # Use Logistic Regression
    from sklearn.linear_model import LogisticRegression
    log = LogisticRegression(random_state = 0)
    log.fit(X_train, y_train)
    # Use KNeighbors
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    knn.fit(X_train, y_train)
    # Use SVM (Linear Kernel)
    from sklearn.svm import SVC
    svc_lin = SVC(kernel = 'linear', random_state = 0)
    svc_lin.fit(X_train, y_train)
    # Use SVM (RBF Kernel)
    svc_rbf = SVC(kernel = 'rbf', random_state = 0)
    svc_rbf.fit(X_train, y_train)
    # Use GaussianNB
    from sklearn.naive_bayes import GaussianNB
    gauss = GaussianNB()
    gauss.fit(X_train, y_train)
    # Use Decision Tree
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(max_depth = 100, criterion = 'entropy', random_state = 0)
    tree.fit(X_train, y_train)
    # Use RandomForestClassifier
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators = 20, max_depth = 100, criterion = 'entropy', random_state = 0)
    forest.fit(X_train, y_train)
    # Use XGBoost Classifier
    from xgboost import XGBClassifier
    xgb = XGBClassifier(n_jobs = 20, max_depth = 100)
    xgb.fit(X_train, y_train)
    # Use LightGBM Classifier
    import lightgbm
    lgbm = lightgbm.LGBMClassifier()
    lgbm.fit(X_train, y_train)
    # Use CatBoost Classifier (the eval_set uses X_test/y_test from the notebook scope for evaluation only)
    from catboost import CatBoostClassifier
    cat = CatBoostClassifier(max_depth = 10, iterations = 5)
    cat.fit(X_train, y_train, eval_set = (X_test, y_test))
    # Print the training accuracy for each model
    print('model[0] Logistic Regression Training Accuracy: ', log.score(X_train, y_train))
    print('model[1] KNeighbors Training Accuracy: ', knn.score(X_train, y_train))
    print('model[2] SVC Linear Training Accuracy: ', svc_lin.score(X_train, y_train))
    print('model[3] SVC RBF Training Accuracy: ', svc_rbf.score(X_train, y_train))
    print('model[4] GaussianNB Training Accuracy: ', gauss.score(X_train, y_train))
    print('model[5] Decision Tree Training Accuracy: ', tree.score(X_train, y_train))
    print('model[6] Random Forest Training Accuracy: ', forest.score(X_train, y_train))
    print('model[7] XGBoost Training Accuracy: ', xgb.score(X_train, y_train))
    print('model[8] LightGBM Training Accuracy: ', lgbm.score(X_train, y_train))
    print('model[9] CatBoost Training Accuracy: ', cat.score(X_train, y_train))
    return log, knn, svc_lin, svc_rbf, gauss, tree, forest, xgb, lgbm, cat
# Show Training accuracy score
model = models(X_train, y_train)
# Show the confusion matrix and accuracy for all the models on the test data
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
for i in range( len(model) ):
    # Extract TN, FP, FN, TP from the confusion matrix on the test data
    TN, FP, FN, TP = confusion_matrix(y_test, model[i].predict(X_test)).ravel()
    test_score = (TP + TN) / (TN + FP + FN + TP)
    print('model[{}] Testing Accuracy: "{}"'.format(i, test_score))
# Check classification report
from sklearn.metrics import classification_report
for i in range( len(model) ):
    # Create predictions
    predictions = model[i].predict(X_test)
    # Check precision, recall, f1-score
    print('model[{}]:'.format(i))
    print( classification_report(y_test, predictions) )
    print('---------------------------------------------------')
# Implement K-Fold Cross Validation (3 folds)
from sklearn.model_selection import cross_val_score
for i in range( len(model) ):
    accuracies = cross_val_score(estimator = model[i], X = X_train, y = y_train, cv = 3)
    print('model[{}] Mean Accuracy:'.format(i), accuracies.mean())
    print('model[{}] Standard Deviation:'.format(i), accuracies.std())
    print('---------------------------------------------------')
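A quick way to spot overfitting is to put the training and cross-validation accuracies side by side: a large gap suggests the model is memorizing the training data rather than generalizing. A minimal sketch of that comparison, re-using the model tuple from above (the model_names list is only added here for readability):
# Compare training accuracy against cross-validation accuracy for each model
model_names = ['LogReg', 'KNN', 'SVC linear', 'SVC rbf', 'GaussianNB',
               'Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM', 'CatBoost']
comparison = pd.DataFrame({
    'model': model_names,
    'train_accuracy': [m.score(X_train, y_train) for m in model],
    'cv_accuracy': [cross_val_score(m, X_train, y_train, cv = 3).mean() for m in model]
})
# A large positive gap points to overfitting
comparison['gap'] = comparison['train_accuracy'] - comparison['cv_accuracy']
print(comparison.sort_values('gap', ascending = False))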
# Select the model to inspect further (model[7] is the XGBoost classifier)
UsedFitter = model[7]
# Create confusion matrix
y_predict = UsedFitter.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
cm
# Visualize confusion matrix
sns.heatmap(cm, annot = True, fmt = 'd')
plt.show()
# Get feature importance
importances = pd.DataFrame({'feature': x.columns, 'importance': np.round(UsedFitter.feature_importances_, 3)})
importances = importances.sort_values('importance', ascending = False).set_index('feature')
print(importances.head(10))
# Visualize the importance
importances.head(10).plot.bar()
plt.show()
# Check training dataset dimensions
X_train.shape, X_test.shape
# Import libraries
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
# Build model
model = Sequential([
    Dense(units = 16, input_dim = 19, activation = 'relu'),
    Dense(units = 24, activation = 'relu'),
    Dropout(0.5),
    Dense(units = 24, activation = 'relu'),
    Dense(units = 24, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])
# Check model summary
model.summary()
# Compile the model
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# Fit the model and initiate training
history = model.fit(X_train, y_train, batch_size = 15, epochs = 50)
# Check model score in test dataset
score = model.evaluate(X_test, y_test)
print(score)
# Import libraries
import itertools
from sklearn.metrics import confusion_matrix
# Create a plot function
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
# Get the predictions
y_pred = model.predict(X_test)
# Transform into dataframe
y_test = pd.DataFrame(y_test)
# Check confusion matrix for test dataset
cm = confusion_matrix(y_test, y_pred.round())
# Visualize confusion matrix for test dataset
plot_confusion_matrix(cm, classes = [0, 1])
plt.show()
# Get the model predictions for the entire dataset
y_pred = model.predict(X)
y_expected = pd.DataFrame(y)
# Create confusion matrix for entire dataset
cm = confusion_matrix(y_expected, y_pred.round())
# Visualize confusion matrix for entire dataset
plot_confusion_matrix(cm, classes = [0, 1])
plt.show()
Several machine learning models were tested. The XGBoost model obtained the highest training accuracy, about 95%, but only about 78% in cross-validation, so it is possible that overfitting occurred. The CatBoost model achieved about 80% accuracy in both training and cross-validation. A deep neural network was also implemented and reached about 84% accuracy on the test set.
The neural network's accuracy of about 84% is better than the 73.46% that could be achieved just by always guessing that a customer will stay with the company. The company may want to lower its monthly charges, at least for new customers during their first two years, and stop providing fiber optic internet service; this may be a good strategy to help retain customers and reduce churn. Maybe with some more analysis of the data and tweaking of the program, I can improve the models' performance and accuracy scores.
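For reference, the 73.46% baseline corresponds to the majority-class strategy of predicting "no churn" for every customer; a short sketch of how that number falls out of the class counts computed earlier:
# Majority-class baseline: always predict that the customer stays
baseline_accuracy = num_retained / (num_retained + num_churned)
print('Baseline accuracy (always predict "No"): {:.2f}%'.format(baseline_accuracy * 100))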