Fraud detection, one of many applications of anomaly detection, is an important problem in financial markets. Can a transaction be predicted as fraudulent or not based on the history of past transactions? In this project, a neural network architecture is implemented to classify transactions as fraudulent or legitimate.
Problem: Predict whether a given credit card transaction is fraudulent based on its transaction features.
Dataset: Credit Card Fraud Detection dataset (creditcard.csv), with anonymized features V1-V28, the transaction Amount, the elapsed Time, and a binary Class label (1 = fraud, 0 = normal).
Source: ULB Machine Learning Group (Université Libre de Bruxelles)
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
np.random.seed(0)
# Import data
data = pd.read_csv('project_data/creditcard.csv')
data.head()
# Check data dimension
data.shape
# Check data info
data.info()
# Check for missing values
data.isnull().mean()
# Implement feature scaling
from sklearn.preprocessing import StandardScaler
data['NormalizedAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))
# Drop the original Amount column
data = data.drop(['Amount'], axis = 1)
# drop the Time column
data = data.drop(['Time'], axis = 1)
# Check data
data.head()
# Separate the independent variables (features) and the dependent variable (label)
X = data.iloc[:, data.columns != 'Class']
y = data.iloc[:, data.columns == 'Class']
# Check independent variables
X.head()
# Check dependent variable
y.head()
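# The dataset is heavily imbalanced: frauds are only a small fraction of all
# transactions, which is what motivates the undersampling and SMOTE steps later on.
# A quick check of the class balance:
y['Class'].value_counts()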
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
# Check train dataset dimensions
X_train.shape, y_train.shape
# Check test dataset dimensions
X_test.shape, y_test.shape
# Transform train dataset into array
X_train = np.array(X_train)
y_train = np.array(y_train)
# Transform test dataset into array
X_test = np.array(X_test)
y_test = np.array(y_test)
# Import libraries
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
# Build model
model = Sequential([
    Dense(units = 16, input_dim = 29, activation = 'relu'),
    Dense(units = 24, activation = 'relu'),
    Dropout(0.5),
    Dense(units = 24, activation = 'relu'),
    Dense(units = 24, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])
# Check model summary
model.summary()
# Compile the model
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# Fit the model and initiate training
model.fit(X_train, y_train, batch_size = 15, epochs = 5)
# Check model score in test dataset
score = model.evaluate(X_test, y_test)
print(score)
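# Note: `score` is [test loss, test accuracy]. With so few fraud cases, accuracy
# alone says little, which is why confusion matrices are examined below.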
# Import libraries
import itertools
from sklearn.metrics import confusion_matrix
# Create a plot function
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
# Get the predictions
y_pred = model.predict(X_test)
# Transform into dataframe
y_test = pd.DataFrame(y_test)
# Check confusion matrix for test dataset
cm = confusion_matrix(y_test, y_pred.round())
print(cm)
# Visualize confusion matrix for test dataset
plot_confusion_matrix(cm, classes = [0, 1])
plt.show()
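# Beyond raw counts, per-class precision and recall give a clearer picture on
# imbalanced data. A minimal check using scikit-learn's classification_report,
# applied to the same test predictions as above:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred.round(), target_names = ['Normal', 'Fraud']))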
# Evaluate the neural network on the entire (imbalanced) dataset
y_pred = model.predict(X)
y_expected = pd.DataFrame(y)
# Create confusion matrix for entire dataset
cm = confusion_matrix(y_expected, y_pred.round())
# Visualize confusion matrix for entire dataset
plot_confusion_matrix(cm, classes = [0, 1])
plt.show()
# Apply random forest
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators = 10)
random_forest.fit(X_train, y_train.values.ravel())
# Get predictions
y_pred = random_forest.predict(X_test)
# Check test score
random_forest.score(X_test, y_test)
# Check confusion matrix for test dataset
cm = confusion_matrix(y_test, y_pred)
# Print confusion matrix
plot_confusion_matrix(cm, classes = [0, 1])
plt.show()
# Evaluate the random forest on the entire dataset
y_pred = random_forest.predict(X)
# Confusion matrix for entire dataset
cm = confusion_matrix(y, y_pred.round())
# Print confusion matrix
plot_confusion_matrix(cm, classes = [0, 1])
plt.show()
# Import library
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier()
# Fit the model
decision_tree.fit(X_train, y_train.values.ravel())
# Get predictions
y_pred = decision_tree.predict(X_test)
# Check test score
decision_tree.score(X_test, y_test)
# Evaluate the decision tree on the entire dataset
y_pred = decision_tree.predict(X)
# Confusion matrix for entire dataset
cm = confusion_matrix(y, y_pred.round())
# Print confusion matrix
plot_confusion_matrix(cm, classes = [0, 1])
plt.show()
# Undersampling technique
# Normal indices
normal_indices = data[data['Class'] == 0].index
# Fraud Indices
fraud_indices = np.array(data[data['Class'] == 1].index)
# Get the number of fraud data
number_records_fraud = len(fraud_indices)
print(number_records_fraud)
# Generates a random sample from a given 1-D array
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace = False)
# Create an array
random_normal_indices = np.array(random_normal_indices)
# Check the size
print(len(random_normal_indices))
# Concatenate the fraud indices with the sampled normal indices
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
print(len(under_sample_indices))
# Select the undersampled rows
under_sample_data = data.iloc[under_sample_indices, :]
under_sample_indices
# Features
X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
# Label
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_undersample, y_undersample, test_size = 0.3)
# Transform into array
X_train = np.array(X_train)
X_test = np.array(X_test)
# Transform into array
y_train = np.array(y_train)
y_test = np.array(y_test)
# Check the model
model.summary()
# Fit the model and initiate training
model.fit(X_train, y_train, batch_size = 15, epochs = 5)
# Predict on the undersampled test dataset
y_pred = model.predict(X_test)
y_expected = pd.DataFrame(y_test)
# Visualize confusion matrix
cm = confusion_matrix(y_expected, y_pred.round())
plot_confusion_matrix(cm, classes = [0,1])
plt.show()
# Predict using the entire dataset
y_pred = model.predict(X)
y_expected = pd.DataFrame(y)
# Visualize confusion matrix
cm = confusion_matrix(y_expected, y_pred.round())
plot_confusion_matrix(cm, classes = [0,1])
plt.show()
# Install library
!pip install -U imbalanced-learn
# Import library
from imblearn.over_sampling import SMOTE
# Fit SMOTE
X_resample, y_resample = SMOTE().fit_resample(X, y.values.ravel())
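# Sanity check: after SMOTE both classes should be roughly the same size
pd.Series(y_resample).value_counts()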
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resample, y_resample, test_size = 0.3)
# Transform into array
X_train = np.array(X_train)
X_test = np.array(X_test)
# Transform into array
y_train = np.array(y_train)
y_test = np.array(y_test)
# Fit the model and initiate training
model.fit(X_train, y_train, batch_size = 15, epochs = 5)
# Predict on the resampled test dataset
y_pred = model.predict(X_test)
y_expected = pd.DataFrame(y_test)
# Visualize confusion matrix
cm = confusion_matrix(y_expected, y_pred.round())
plot_confusion_matrix(cm, classes = [0,1])
plt.show()
# Predict using the entire dataset
y_pred = model.predict(X)
y_expected = pd.DataFrame(y)
# Visualize confusion matrix
cm = confusion_matrix(y_expected, y_pred.round())
plot_confusion_matrix(cm, classes = [0,1])
plt.show()
Several models were tested: a deep neural network, a random forest, and a decision tree. Because the dataset is highly imbalanced, undersampling was applied first, followed by the most widely used oversampling technique, SMOTE (Synthetic Minority Over-sampling Technique).
The final model chosen was the deep neural network, which produced the fewest prediction errors.
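As a rough, illustrative way to make that comparison concrete, the number of misclassifications for any of the models above can be read off its confusion matrix as the sum of the off-diagonal entries; a minimal sketch (the helper name below is illustrative, not part of the code above):
# Illustrative helper: total misclassifications = false positives + false negatives
def misclassifications(cm):
    return cm.sum() - np.trace(cm)
# Example: count the errors in the most recently computed confusion matrix
print(misclassifications(cm))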