Predicting Employee Attrition (A Classification Problem Where Recall Matters)


Background and Objective:


This project uses a fictional employee dataset to help determine an employee's likelihood of attrition. The dataset was prepared by a group of IBM data scientists and can be found here. I will explore how various factors may contribute to employee attrition, including an employee's distance from home, relationship satisfaction, and working overtime, and I will create two classification models to predict employee attrition: logistic regression and support vector machine (SVM). This project will also explore precision-recall curves to help determine the probability threshold for each algorithm that provides the best balance between precision and recall, keeping in mind that recall is a particularly useful metric for a company trying to predict attrition among its employees.

Import the necessary libraries
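A typical set of imports for this kind of analysis, plus the data load (the CSV file name is assumed; adjust the path as needed):

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (classification_report, confusion_matrix,
                             precision_recall_curve, recall_score)

# Load the dataset and check for missing values (file name assumed).
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.info()
```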

There are 1470 entries, and all columns have non-null values, indicating no missing data.

Dataset:

The dataset contains the following 35 columns of information regarding our employees:

EmployeeCount, Over18, and StandardHours all have the same value for every employee and therefore will serve little purpose for prediction. We will therefore drop these columns.

EmployeeNumber appears to contain unique identifiers (there are 1470 unique numbers for 1470 entries). Let's use it as our index.

For columns with only two categorical values (Attrition, Gender, OverTime), let's use a label encoder to convert them into 0s and 1s, as sketched below.
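A sketch of these three preparation steps together (assuming the dataframe is named df):

```python
# Drop the constant columns, index by EmployeeNumber, and label-encode
# the binary categorical columns.
df = df.drop(columns=["EmployeeCount", "Over18", "StandardHours"])
df = df.set_index("EmployeeNumber")

le = LabelEncoder()
for col in ["Attrition", "Gender", "OverTime"]:
    df[col] = le.fit_transform(df[col])
```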

Statistical observations on our numerical data:
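These observations are based on a summary along the following lines:

```python
# Summary statistics for the numerical columns, transposed for readability.
df.describe().T
```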

The data appears plausible: the max and min values do not reflect any impossibilities or extreme outliers. The mean and median age are nearly the same, and the company does not appear to have many older employees; the maximum age is 60, while the 75th-percentile age is only 43. The median distance traveled for work is 7 km and the largest is 29 km, indicating that the employees don't traverse particularly large distances. The salary range runs from what might be part-time employees at the lower end (1009/month) to an executive salary (19999/month).

Here is a visualization of the distribution of our numerical values:
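A minimal sketch of such a plot (figure size and bin count are arbitrary choices):

```python
# Histograms of every numerical feature.
df.hist(figsize=(20, 16), bins=20)
plt.tight_layout()
plt.show()
```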

Age, education, and job involvement appear to be approximately normally distributed. Some of our features are right-skewed, including distance from home (indicating that a majority of workers live nearer to work), monthly income, and years at the company. Our unexplained "monthly rate" and "hourly rate" values appear roughly uniform across the bins of their respective histograms, and these distributions do not match the distribution of any of our other variables (e.g., monthly income, which is right-skewed). It is therefore not clear whether these 'rates' are really related to anything else (e.g., income).

Visualizing the distribution of some of our categorical variables:
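One way to produce these plots (the column selection here is illustrative):

```python
# Count plots for a few categorical features.
cat_cols = ["Department", "MaritalStatus", "OverTime", "BusinessTravel"]
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, col in zip(axes.ravel(), cat_cols):
    sns.countplot(x=col, data=df, ax=ax)
    ax.tick_params(axis="x", rotation=30)
plt.tight_layout()
plt.show()
```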

We see that the largest number of employees work in research and development, followed by sales. Most are married, which is consistent with the median age observed earlier. Most employees do not work overtime, but a sizeable number do. We also see that most employees rarely travel.

Let's now see a correlation matrix to see how attrition is connected to other variables.
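A sketch of the heatmap:

```python
# Correlation matrix of the numerical columns, visualized as a heatmap.
plt.figure(figsize=(16, 12))
sns.heatmap(df.corr(numeric_only=True), cmap="coolwarm", center=0)
plt.show()
```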

Some variables do have noteworthy correlations with attrition. Working overtime, for example, has some positive correlation with attrition. There are negative correlations between total working years and attrition, monthly income and attrition, job level and attrition, and age and attrition. The positive correlation between working overtime and attrition, along with the negative correlations noted for the other variables, makes sense. For example, having a lower salary would make one less committed to remaining with a company, as would being a relatively newer employee (i.e., having fewer total working years). Let's see how attrition manifests in some of these variables:

Visualizing how attrition plays out with select variables:
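These plots can be produced along the following lines (the feature list is illustrative):

```python
# Stack attrition (the hue) within the distribution of selected features.
for col in ["OverTime", "JobLevel", "MonthlyIncome", "Age"]:
    plt.figure(figsize=(10, 4))
    sns.histplot(data=df, x=col, hue="Attrition", multiple="stack")
    plt.show()
```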

Our visualizations confirm our previous comments about the positive correlation between attrition and working overtime, and the negative correlation between attrition and the other variables noted. Note that the x-axis labels for monthly income are not visible for the obvious reason that this continuous variable has many unique values, but the visualization shows how the orange coloring (representing attrition) is more concentrated in the lower monthly incomes.

The average values for those who attrite (1) and those who do not (0), separated out by variable:
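These averages come from a groupby along these lines:

```python
# Mean of every numerical feature, grouped by attrition status (0 / 1).
df.groupby("Attrition").mean(numeric_only=True).T
```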

It is clear from these averaged values that the two sets of employees differ on many of these variables. For example, those who attrite have, ON AVERAGE, lower stock option levels, are more likely to work overtime, have a lower monthly income, are younger, live slightly farther from work, and are slightly less satisfied with their job, environment, relationships, and work-life balance. They have also worked at the company for less time, on average.

Building Our Models and Tuning for Improved Recall

Preparing data for modeling

Some of our categorical variables, where we do not want to imply ordinality, should be expanded into dummy (one-hot) columns with get_dummies, including BusinessTravel, Department, Education, EducationField, JobInvolvement, JobLevel, JobRole, and MaritalStatus.
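A sketch of the encoding step:

```python
# One-hot encode the nominal (and treated-as-nominal) categorical columns.
dummy_cols = ["BusinessTravel", "Department", "Education", "EducationField",
              "JobInvolvement", "JobLevel", "JobRole", "MaritalStatus"]
df = pd.get_dummies(df, columns=dummy_cols)
```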

Separating our independent (X) and dependent (Y) variables
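In code:

```python
# The target is Attrition; everything else is a predictor.
Y = df["Attrition"]
X = df.drop(columns=["Attrition"])
```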

Because our independent variables are on different scales (e.g., the range of income values is much larger than our gender values, which are only 0 and 1), our algorithm may incorrectly give weight to variables simply because of their larger magnitude. So we will scale all of our non-categorical values with the StandardScaler class, which transforms each feature to have mean 0 and standard deviation 1.
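A sketch of the scaling step (the list of continuous columns is an assumption; adjust it to the columns kept above):

```python
# Scale only the continuous columns to mean 0 and standard deviation 1.
num_cols = ["Age", "DailyRate", "DistanceFromHome", "HourlyRate",
            "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
            "PercentSalaryHike", "TotalWorkingYears", "TrainingTimesLastYear",
            "YearsAtCompany", "YearsInCurrentRole",
            "YearsSinceLastPromotion", "YearsWithCurrManager"]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
```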

Splitting into Train/Test Data [80%/20%]

In classification problems like this one, our train and test data need to be appropriately sampled or we may end up with an imbalance in the distribution of classes between them. One way of addressing this is stratified sampling, which will be passed as an argument to the train_test_split function below. For more information, see here. Note that while stratified sampling ensures similar class proportions in the train and test data, it doesn't solve the problem of major imbalances in the representation of classes in the dataset as a whole.

While roughly 20% attrition is clearly a minority class, it is not an extreme case of imbalance, so for the purposes of this study we will not rebalance the "Attrite" and "Not Attrite" classes with the available resampling methods.
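The split itself (random_state is an arbitrary choice for reproducibility):

```python
# 80/20 split, stratified on Y so both sets keep the same attrition proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, stratify=Y, random_state=1)
```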

Moving on to the data at hand, it's important that our metrics align with our objectives. In this classification case, we especially want to reduce false negatives, i.e., the times the model says an employee won't attrite when in fact they do. In a real-life situation, the opposite mistake (predicting someone will attrite when they actually do not) is less costly. Because we want to reduce false negatives, we have to focus on improving our recall metric. Recall measures the proportion of employees who actually attrited that the model correctly flagged as attriting. The more actual attriters the model misses, the more false negatives and the lower the recall; the more it catches, the fewer false negatives and the higher the recall, which is what we want. Our evaluation metrics will therefore take recall into account.
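In confusion-matrix terms, recall = TP / (TP + FN). A toy example with made-up labels:

```python
# Four employees actually attrite (1); the model catches three of them,
# so recall = 3 / (3 + 1) = 0.75.
y_true = [1, 1, 1, 1, 0, 0]
y_pred = [1, 1, 1, 0, 0, 1]
print(recall_score(y_true, y_pred))  # 0.75
```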

Building the models

I will build and compare two different models:

Logistic Regression Model
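A sketch of the fit (max_iter is a reasonable default, not necessarily the exact configuration used here):

```python
# Fit the logistic regression and print classification reports for both sets.
lg = LogisticRegression(max_iter=1000)
lg.fit(X_train, y_train)

print(classification_report(y_train, lg.predict(X_train)))
print(classification_report(y_test, lg.predict(X_test)))
```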

Here is an explanation of the values in the printed classification report:

Regarding our confusion matrix:
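The matrix can be drawn as an annotated heatmap, e.g.:

```python
# Confusion matrix for the test set; rows are actual, columns are predicted.
cm = confusion_matrix(y_test, lg.predict(X_test))
sns.heatmap(cm, annot=True, fmt="d",
            xticklabels=["Not Attrite", "Attrite"],
            yticklabels=["Not Attrite", "Attrite"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
```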

Observations based on our results:

Based on the above metrics, we see that our train and test predictions have a similar accuracy of close to 90%. However, the recall for class 1 (will attrite) is only about 50% on our train data, and less than 40% on our test data. This means we have a sizeable number of false negatives, and thus the model is not satisfactory: there are many employees who will attrite that this model cannot catch, even though the overall accuracy appears to be good. As one can see from the heat map, there is a sizeable number of false negatives that needs to be reduced (predicted: "Not Attrite"; actual: "Attrite"). This can be done by responsibly improving recall.

We now refer to the coef_ attribute, which gives us the variables most significant for classifying our y variable of attrition.
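One way to rank the features by coefficient:

```python
# Pair each coefficient with its feature name and sort.
coefs = pd.Series(lg.coef_[0], index=X.columns).sort_values(ascending=False)
print(coefs)
```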

Based on the above, we find that the features that most positively affect the likelihood of an employee leaving the company are those listed at the top (e.g., overtime, having a job involvement score of 1, a job level of 5, highly frequent travel for the company, etc.). Those with a negative effect are listed at the bottom (e.g., having a job level of 2, a stock option level of 2, not traveling for the business, having a job level of 4, etc.). Overtime is the most important feature driving attrition in our model. We also see that having a stock option level of 0 appears to have a significant impact. The company may want to explore its overtime and stock option policies for employees. Being a newer employee (job level of 1) would also be a flag, as would having worked at many companies before. These values are consistent with the correlation matrix we observed earlier, as well as with our own intuition.

The coefficients obtained above for our logistic regression model are log-odds. Exponentiating them converts them to odds ratios:
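In code:

```python
# Exponentiate the log-odds coefficients to obtain odds ratios.
odds = np.exp(coefs).sort_values(ascending=False)
print(odds)
```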

We can read the values above as telling us how many times more likely someone is to attrite if they have a given feature. E.g., an employee working overtime is 6.67 times more likely to attrite than someone who does not. Someone whose relationship satisfaction is scored as a 1 is 1.87 times more likely to attrite than someone with a different score, etc.

Balancing Precision and Recall for the Best Predictions (Tuning Our Predictive Threshold)

The logistic regression model determines the probability of each instance being labeled 'attrite' or not. By default the threshold is 0.5, meaning that whenever the probability of an instance being 'attrite' is 0.5 or higher, the model labels it as such. In cases of class imbalance (as we noticed in our data earlier), this default threshold may be a poor choice, and adjusting it may improve our metric scores.

One of the tools to help us achieve this is the precision-recall curve, which lets us see the tradeoff between precision and recall at different probability thresholds for a single class (in our case, "attrite"/1).

This is different from an ROC curve (ROC = "receiver operating characteristic"), which instead considers the false positive and true positive rate of the model.
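A sketch of the curve construction with scikit-learn, plotted against the thresholds to make the trade-off easier to read:

```python
# Precision-recall trade-off for the positive class (attrite = 1) on train data.
y_scores = lg.predict_proba(X_train)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

# The precision/recall arrays have one more entry than thresholds; drop the last.
plt.plot(thresholds, precisions[:-1], label="precision")
plt.plot(thresholds, recalls[:-1], label="recall")
plt.xlabel("Probability threshold")
plt.legend()
plt.show()
```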

In the curve, a threshold of 0 would mean every instance is classified as attrition by the algorithm (since anything with over 0% probability of being a case of attrition would be labeled as such). This would of course result in poor precision (a little under 0.2 in the chart above), but not exactly 0 precision, since the predictions would still be correct for the minority of employees who actually attrite. It would also mean the lowest possible number of false negatives, since every instance would be labeled 'attrite', so our recall would be perfect, 1.0. But note that even though recall would be perfect here, the predictions would still be poor overall because the precision is so bad.

On the other hand, a very high threshold approaching 1.0 would mean that only instances meeting a stringent probability bar get mapped to the attrition class. Our recall would head toward 0 as the threshold rises, since the majority of instances would be labeled 'no attrite' given the higher bar to be deemed 'attrite'. This would result in very high numbers of false negatives, and thus very low recall. Conversely, precision would go up as the threshold rises, because precision is concerned with reducing false positives. However, beyond a certain point, raising the threshold no longer improves precision.

We ultimately want to set our threshold at a point where precision and recall are balanced, and according to the above chart, that is around 0.38. Let's see how it impacts the model:
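Applying the tuned threshold is just a comparison against the predicted probabilities:

```python
# Re-label the training predictions using the tuned threshold of 0.38.
threshold = 0.38
y_train_pred = (lg.predict_proba(X_train)[:, 1] >= threshold).astype(int)
print(classification_report(y_train, y_train_pred))
```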

The recall has improved for class 1, as can be seen in the heatmap. As expected, the precision has gone down (i.e., more false positives).

Now let's see the results on our test data:
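The same comparison on the held-out data:

```python
# Apply the same 0.38 threshold to the test set.
y_test_pred = (lg.predict_proba(X_test)[:, 1] >= threshold).astype(int)
print(classification_report(y_test, y_test_pred))
```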

Similarly, on our test data we see an improvement in recall and a reduction in precision with our new threshold of 0.38.

SVM Model
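A sketch of the SVM fit (the RBF kernel is an assumption; probability=True enables the predict_proba calls needed for threshold tuning later):

```python
# Fit an SVM classifier and report its out-of-the-box performance.
svm = SVC(kernel="rbf", probability=True, random_state=1)
svm.fit(X_train, y_train)

print(classification_report(y_train, svm.predict(X_train)))
print(classification_report(y_test, svm.predict(X_test)))
```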

For our class of 1, the SVM produced slightly better training results than our logistic regression model with the threshold left unchanged.

Our SVM model has similar accuracy on the train and test data, and thus there is not a big overfitting problem. Its recall on the test data is better than the logistic regression model's with the original threshold, so we are moving in the direction of our original objective of reducing false negatives that would be costly for the company. Let us determine an optimal threshold for this model as we did before:
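The procedure mirrors the one above: draw the precision-recall curve from svm.predict_proba, pick the crossover threshold, and re-label. With the 0.25 threshold referenced in the conclusion below:

```python
# Apply the tuned SVM threshold to the test set.
y_test_pred_svm = (svm.predict_proba(X_test)[:, 1] >= 0.25).astype(int)
print(classification_report(y_test, y_test_pred_svm))
```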

Conclusion

Compared to our baseline-threshold model, the new threshold of 0.25 on our SVM model has improved our recall substantially (though with some reduction in precision, as expected). Based on a discussion with company stakeholders, the SVM model may save the company money by identifying those who will likely attrite (to whom further resources can be allocated to keep them as employees), and it reduces false negatives the most of all the models we looked at. Therefore, we recommend the SVM with the given threshold as a starting point for the company to identify those at risk of attrition. Perhaps just as importantly, the discussion above of features and their respective coefficients derived from our logistic regression model has helped us identify several key factors that appear correlated with an employee leaving the company, including working overtime, traveling frequently for work, how new an employee is to the company, an employee's monthly income, and which department they work in, among others. These can all be addressed in different ways through new policies, including:

Reducing the need for employees to work overtime; figuring out how employees can accomplish their tasks locally without having to travel extensively; developing programs to help newer employees integrate into a positive work environment at the company; ensuring salaries are competitive where possible; and investigating the culture of departments with greater employee attrition.