"""Bank customer churn prediction with a small Keras ANN.

Loads ``Churn_Modelling.csv``, explores it, encodes/scales the features and
then compares several ways of handling the class imbalance in ``Exited``:
plain training, random under-sampling, random over-sampling, SMOTE, and an
ensemble of models each trained on a different slice of the majority class.
"""

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras

# One seed for every split / resampling step so runs are comparable.
RANDOM_STATE = 15


def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of *y_pred* against *y_true* as a heatmap.

    Returns the confusion-matrix tensor so callers can inspect it further.
    """
    cm = tf.math.confusion_matrix(labels=y_true, predictions=y_pred)
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d')
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
    return cm


def ANN(X_train, y_train, X_test, y_test, loss, weights):
    """Train a 12-8-1 dense network and return rounded test predictions.

    Args:
        X_train, y_train: training features and binary labels.
        X_test, y_test: held-out features and labels used for evaluation.
        loss: Keras loss identifier passed to ``model.compile``.
        weights: ``class_weight`` mapping for ``model.fit``; pass ``-1``
            (or ``None``) to train without class weighting.

    Returns:
        Array of shape ``(len(X_test), 1)`` holding 0.0 / 1.0 predictions.
    """
    # Cast explicitly: a DataFrame with mixed bool/float columns would
    # otherwise become an object array that TensorFlow cannot convert.
    train_features = np.asarray(X_train, dtype="float32")
    test_features = np.asarray(X_test, dtype="float32")
    train_labels = np.asarray(y_train)
    test_labels = np.asarray(y_test)

    model = keras.Sequential([
        # Input width follows the data instead of a hard-coded 12.
        keras.Input(shape=(train_features.shape[1],)),
        keras.layers.Dense(12, activation='relu'),
        keras.layers.Dense(8, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    # -1 is the historical "no class weights" sentinel used by callers.
    if weights is None or weights == -1:
        model.fit(train_features, train_labels, epochs=100)
    else:
        model.fit(train_features, train_labels, epochs=100,
                  class_weight=weights)

    print(model.evaluate(test_features, test_labels))

    y_preds = np.round(model.predict(test_features))
    print("Classification Report: \n",
          classification_report(test_labels, y_preds))
    return y_preds


def get_train_batch(df_majority, df_minority, start, end):
    """Combine majority rows ``[start:end]`` with all minority rows.

    Returns the ``(X_train, y_train)`` pair for that combined frame, where
    ``y_train`` is the ``Exited`` column and ``X_train`` is everything else.
    """
    df_train = pd.concat([df_majority[start:end], df_minority], axis=0)
    X_train = df_train.drop('Exited', axis='columns')
    y_train = df_train.Exited
    return X_train, y_train


# ---------------------------------------------------------------------------
# Load and explore
# ---------------------------------------------------------------------------
df = pd.read_csv("/content/Churn_Modelling.csv")

# Bare expressions below only display something in a notebook cell.
df.sample(5)
df.isnull()
plt.figure()
sns.heatmap(df.isnull(), yticklabels=False, cbar=True, cmap='cividis')
df.dtypes
df.describe()

sns.set_style('darkgrid')

# Each plot gets its own figure so they do not overdraw when run as a script.
plt.figure()
sns.countplot(x="Exited", data=df)
plt.figure()
sns.countplot(x='NumOfProducts', hue='Exited', data=df, palette='rainbow')
plt.figure()
sns.countplot(x='HasCrCard', hue='Exited', data=df, palette='Set1')
plt.figure()
sns.countplot(x='Gender', hue='Exited', data=df)
plt.figure()
sns.countplot(x='Geography', hue='Exited', data=df, palette='pastel')
plt.figure()
sns.countplot(x='IsActiveMember', hue='Exited', data=df, palette='Set2')
sns.displot(df['Age'].dropna(), kde=True, color='skyblue')

# ---------------------------------------------------------------------------
# Encode and scale
# ---------------------------------------------------------------------------
df.Gender.unique()
# Assign back instead of chained inplace replace, which pandas no longer
# guarantees to modify the original frame.
df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1})
df.Gender.unique()

# dtype=int keeps the one-hot columns numeric (pandas >= 2 defaults to bool).
df1 = pd.get_dummies(data=df, columns=['Geography'], dtype=int)
df1.columns

cols_to_scale = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary',
                 'Tenure', 'NumOfProducts']

# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics leak into training.
scaler = MinMaxScaler()
df1[cols_to_scale] = scaler.fit_transform(df1[cols_to_scale])
df1

df1.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)
df1
df1.describe()

x = df1.drop(['Exited'], axis=1)
x.head()
y = df1['Exited']
y.head()

# ---------------------------------------------------------------------------
# Baseline: train on the imbalanced data as-is
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train.shape
X_test.shape
X_train[0:10]

y_pred = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)
cm = plot_confusion_matrix(y_test, y_pred)

# Flatten the (n, 1) prediction array into a plain list of 0/1 labels.
y_pred = [1 if element > 0.5 else 0 for element in y_pred.ravel()]
y_pred[:10]
y_test[:10]

"""# Method 1: Undersampling"""

# Class count, looked up by label rather than by value_counts() ordering.
class_counts = df1.Exited.value_counts()
count_class_0 = class_counts.loc[0]
count_class_1 = class_counts.loc[1]

# Divide by class
df_class_0 = df1[df1['Exited'] == 0]
df_class_1 = df1[df1['Exited'] == 1]

# Undersample 0-class and concat the DataFrames of both class
df_class_0_under = df_class_0.sample(count_class_1, random_state=RANDOM_STATE)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under.Exited.value_counts())

X = df_test_under.drop('Exited', axis='columns')
y = df_test_under['Exited']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)
y_train.value_counts()

y_pred1 = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)
cm = plot_confusion_matrix(y_test, y_pred1)

"""# Method2: Oversampling"""

# NOTE(review): over-sampling happens before the split, so copies of the
# same minority row can land in both train and test and inflate the scores.
df_class_1_over = df_class_1.sample(count_class_0, replace=True,
                                    random_state=RANDOM_STATE)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.Exited.value_counts())

X = df_test_over.drop('Exited', axis='columns')
y = df_test_over['Exited']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

y_pred2 = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)
cm = plot_confusion_matrix(y_test, y_pred2)

"""#Method3: SMOTE"""

X = df1.drop('Exited', axis='columns')
y = df1['Exited']

# NOTE(review): as with over-sampling, SMOTE is applied before the split,
# so synthetic points derived from test rows can end up in training.
smote = SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)
# fit_resample replaces the removed fit_sample API.
X_sm, y_sm = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_sm, y_sm, test_size=0.2, random_state=RANDOM_STATE, stratify=y_sm)

y_pred3 = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)
cm = plot_confusion_matrix(y_test, y_pred3)

"""# Method4: Use of Ensemble with undersampling"""

df1.Exited.value_counts()

X = df1.drop('Exited', axis='columns')
y = df1['Exited']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)
y_train.value_counts()

df2 = X_train.copy()
df2['Exited'] = y_train
df2.head()

df2_class0 = df2[df2.Exited == 0]
df2_class1 = df2[df2.Exited == 1]

# Slice the majority class into minority-sized batches and train one model
# per batch; the last batch may be shorter than the others.
batch_size = len(df2_class1)
batch_predictions = []
for start in range(0, len(df2_class0), batch_size):
    X_batch, y_batch = get_train_batch(
        df2_class0, df2_class1, start, start + batch_size)
    batch_predictions.append(
        ANN(X_batch, y_batch, X_test, y_test, 'binary_crossentropy', -1))

# Vote across the models: predict churn when at least half of them do
# (with four models that is the same as "more than one vote").
votes = np.sum(batch_predictions, axis=0)
y_pred_final = (votes >= len(batch_predictions) / 2).astype(
    batch_predictions[0].dtype)

cl_rep = classification_report(y_test, y_pred_final)
print(cl_rep)

cm = plot_confusion_matrix(y_test, y_pred_final)