Employee attrition is the gradual reduction in staff numbers that occurs as employees retire or resign and are not replaced. Attrition can be costly for a business: the company loses both productivity and institutional knowledge. This project shows how to predict whether an employee is likely to leave the company.
Problem: Predict whether an employee is likely to leave the company.
Dataset:
Source: This is a fictional data set created by IBM data scientists.
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Import data
df = pd.read_csv('project_data/employee.csv')
# Check dataframe
df.head()
# View data statistics
df.describe()
# Check more info
df.info()
# Check dataframe dimension
df.shape
# Check for any missing or null values in the data
df.isnull().values.any()
# Print each non-numeric (object) column with its unique values and value counts
for column in df.columns:
    if df[column].dtype == object:
        print(str(column) + ': ' + str(df[column].unique()))
        print(df[column].value_counts())
        print('----------------------------------------------------------')
# Get a count of the employees that stayed and left the company
df['Attrition'].value_counts()
# Visualize the number of employees that stayed and left the company
sns.countplot(x = 'Attrition', data = df)
plt.show()
# Baseline accuracy if we always guessed 'No' (the majority class)
print('Baseline accuracy if we always guess No: {:.2f}%'.format(
    df['Attrition'].value_counts()['No'] / len(df) * 100))
# Show the number of employees that left and stayed by their age
plt.subplots(figsize = (12, 4))
sns.countplot(x = 'Age', hue = 'Attrition', data = df, palette = 'colorblind')
plt.show()
# Remove columns that carry no useful information (constant values or unique identifiers)
df = df.drop('Over18', axis = 1)
df = df.drop('EmployeeNumber', axis = 1)
df = df.drop('StandardHours', axis = 1)
df = df.drop('EmployeeCount', axis = 1)
# Convert all the non-numeric columns to numeric
# Import library
# (this uses the feature_engine < 1.0 API; in newer versions the equivalent is
#  `from feature_engine.encoding import OrdinalEncoder`)
from feature_engine import categorical_encoders as ce
# Set up the encoder: encoding_method='arbitrary' assigns an arbitrary integer
# to each category, and variables=None selects all categorical columns automatically
encoder = ce.OrdinalCategoricalEncoder(encoding_method='arbitrary', variables = None)
# fit the encoder
encoder.fit(df)
# transform the data
cleaned_df = encoder.transform(df)
# Check the dataframe
cleaned_df.head()
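If feature_engine is not installed, a roughly equivalent arbitrary ordinal encoding can be sketched with pandas alone (this assumes the same df as above; pd.factorize simply maps each category to an integer in order of appearance):
# Alternative sketch: arbitrary ordinal encoding using pandas.factorize
cleaned_df = df.copy()
for column in cleaned_df.columns:
    if cleaned_df[column].dtype == object:
        cleaned_df[column], _ = pd.factorize(cleaned_df[column])
cleaned_df.head()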
# Check correlation between the features by using a heatmap
# (corrplot comes from the third-party heatmapz package: pip install heatmapz)
from heatmap import corrplot
plt.figure(figsize=(15, 10))
corrplot(cleaned_df.corr(), size_scale=300)
plt.show()
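The corrplot function above comes from the third-party heatmapz package. If it is not available, a plain seaborn heatmap of the same correlation matrix gives a comparable, if less polished, view:
# Fallback sketch: correlation heatmap with seaborn only
plt.figure(figsize=(15, 10))
sns.heatmap(cleaned_df.corr(), cmap='coolwarm')
plt.show()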
# Separate the features (x) from the target (y)
x = cleaned_df.drop('Attrition', axis = 1)
y = cleaned_df['Attrition']
# Scale the features
# (note: fitting the scaler on the full dataset is a simplification; strictly,
#  it should be fit on the training split only to avoid data leakage)
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(x)
# Split the scaled data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Create Machine Learning models
def models(X_train, y_train):
    # Use Decision Tree
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(max_depth = 100, criterion = 'entropy', random_state = 0)
    tree.fit(X_train, y_train)
    # Use Random Forest
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators = 20, max_depth = 100, criterion = 'entropy', random_state = 0)
    forest.fit(X_train, y_train)
    # Use XGBoost Classifier
    from xgboost import XGBClassifier
    xgb = XGBClassifier(n_jobs = 20, max_depth = 100)
    xgb.fit(X_train, y_train)
    # Print the training accuracy for each model
    print('model[0] Decision Tree Training Accuracy: ', tree.score(X_train, y_train))
    print('model[1] Random Forest Training Accuracy: ', forest.score(X_train, y_train))
    print('model[2] XGBoost Training Accuracy: ', xgb.score(X_train, y_train))
    return tree, forest, xgb
# Show Training accuracy score
model = models(X_train, y_train)
# Show the confusion matrix and accuracy for all the models on the test data
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
for i in range(len(model)):
    cm = confusion_matrix(y_test, model[i].predict(X_test))
    # Extract TN, FP, FN, TP from the confusion matrix
    TN, FP, FN, TP = cm.ravel()
    test_score = (TP + TN) / (TN + FP + FN + TP)
    print('model[{}] Testing Accuracy: "{}"'.format(i, test_score))
# Import library
from sklearn.metrics import classification_report
# Check classification report
for i in range(len(model)):
    # Create predictions
    predictions = model[i].predict(X_test)
    # Check precision, recall, f1-score
    print('model[{}]:'.format(i))
    print(classification_report(y_test, predictions))
    print('---------------------------------------------------')
# Apply k-fold cross-validation (cv = 3)
from sklearn.model_selection import cross_val_score
for i in range(len(model)):
    accuracies = cross_val_score(estimator = model[i], X = X_train, y = y_train, cv = 3)
    print('model[{}] Mean Accuracy:'.format(i), accuracies.mean())
    print('model[{}] Standard Deviation:'.format(i), accuracies.std())
    print('---------------------------------------------------')
# Select the best-performing model (XGBoost)
UsedFitter = model[2]
# Create the confusion matrix for its test-set predictions
y_predict = UsedFitter.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
# Visualize the confusion matrix (fmt='d' shows the raw counts)
sns.heatmap(cm, annot = True, fmt = 'd')
plt.show()
# Get feature importance
importances = pd.DataFrame({'feature': x.columns, 'importance': np.round(UsedFitter.feature_importances_, 3)})
importances = importances.sort_values('importance', ascending = False).set_index('feature')
print(importances.head(10))
# Visualize the importance
importances.head(10).plot.bar()
plt.show()
Job level appears to be the most important feature, followed by overtime, marital status, and stock option level. Given this result, it would be worth checking whether employees at a higher job level are less likely to leave than those at a lower one. The best model (XGBoost) achieved 86% accuracy on the test data. With further analysis of the data and some tuning of the models, the performance and accuracy score could likely be improved.
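As a quick follow-up, that job-level question can be checked directly. The sketch below assumes it is run on the un-encoded dataframe df (which still holds the original 'Yes'/'No' labels in the 'Attrition' column) and simply computes the attrition rate per job level:
# Sketch: attrition rate by job level (run on the un-encoded dataframe)
attrition_by_level = df.groupby('JobLevel')['Attrition'].apply(lambda s: (s == 'Yes').mean())
print(attrition_by_level)
attrition_by_level.plot.bar()
plt.ylabel('Attrition rate')
plt.show()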