This analysis focuses on predicting patients' Length of Stay (LOS) during the COVID-19 pandemic. The pandemic overloaded hospital care resources, and efficient resource management and allocation are vital to stabilizing the healthcare system. The objective of this analysis is to develop machine learning models that predict LOS and thereby help improve resource allocation, since LOS is an important indicator for monitoring health management processes.
Dataset: The dataset comes from Kaggle. A version of the dataset was originally used for the Analytics Vidhya Healthcare Analytics Hackathon. The target variable is Stay_Days, a categorical variable with 11 categories, each representing a range of days spent in the hospital, from 0-10 days up to more than 100 days.
This analysis is both an extension of and a response to an analysis by Jianing Pei et al., which used the Analytics Vidhya Healthcare Analytics Hackathon dataset to predict patients' length of stay. Their best model, an optimized Random Forest algorithm, achieved an accuracy score of 0.3541. The researchers suggested that future work could improve model performance by condensing the number of classes for the target variable, since the classes are imbalanced. Accordingly, in this analysis the target variable will be reduced from 11 classes to 5 classes by combining all stays of 41 days or more into one class, 41+ days. In addition, this analysis will focus on data preprocessing and cleaning so that better-performing models can be built.
Goals of Analysis:
- Perform a thorough Exploratory Data Analysis (EDA), clean, and transform the data <br>
- Split data into training, validation, and testing <br>
- Build baseline models for classification of the target variable, Stay_Days (hospital length of stay) <br>
- Optimize classifiers that performed well from baseline models <br>
- Perform additional model optimization <br>
- Implement multiclass learning models <br>
- Implement voting ensemble models <br>
- Evaluate the best performing models on the test set <br>
- Review final results and future analysis <br>
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
import warnings
warnings.filterwarnings('ignore') #ignore warnings
# set display max columns
pd.set_option('display.max_columns', 100)
# View Data Dictionary:
hos_data_dic = pd.read_csv('hospital_los_data_dic.csv')
hos_data_dic
Column | Description | |
---|---|---|
0 | case_id | Case_ID registered in Hospital |
1 | Hospital_code | Unique code for the Hospital |
2 | Hospital_type_code | Unique code for the type of Hospital |
3 | City_Code_Hospital | City Code of the Hospital |
4 | Hospital_region_code | Region Code of the Hospital |
5 | Available Extra Rooms in Hospital | Number of Extra rooms available in the Hospital |
6 | Department | Department overlooking the case |
7 | Ward_Type | Code for the Ward type |
8 | Ward_Facility_Code | Code for the Ward Facility |
9 | Bed Grade | Condition of Bed in the Ward |
10 | patientid | Unique Patient Id |
11 | City_Code_Patient | City Code for the patient |
12 | Type of Admission | Admission Type registered by the Hospital |
13 | Severity of Illness | Severity of the illness recorded at the time o... |
14 | Visitors with Patient | Number of Visitors with the patient |
15 | Age | Age of the patient |
16 | Admission_Deposit | Deposit at the Admission Time |
17 | Stay | Stay Days by the patient |
The dataset contains 18 columns with the description of each column indicated above.
# Read and Display Dataset:
hos = pd.read_csv('hospital_los.csv')
hos
case_id | Hospital | Hospital_type | Hospital_city | Hospital_region | Available_Extra_Rooms_in_Hospital | Department | Ward_Type | Ward_Facility | Bed_Grade | patientid | City_Code_Patient | Type of Admission | Illness_Severity | Patient_Visitors | Age | Admission_Deposit | Stay_Days | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 8 | 2 | 3 | 2 | 3 | radiotherapy | R | F | 2.0 | 31397 | 7.0 | Emergency | Extreme | 2 | 51-60 | 4911.0 | 0-10 |
1 | 2 | 2 | 2 | 5 | 2 | 2 | radiotherapy | S | F | 2.0 | 31397 | 7.0 | Trauma | Extreme | 2 | 51-60 | 5954.0 | 41-50 |
2 | 3 | 10 | 4 | 1 | 0 | 2 | anesthesia | S | E | 2.0 | 31397 | 7.0 | Trauma | Extreme | 2 | 51-60 | 4745.0 | 31-40 |
3 | 4 | 26 | 1 | 2 | 1 | 2 | radiotherapy | R | D | 2.0 | 31397 | 7.0 | Trauma | Extreme | 2 | 51-60 | 7272.0 | 41-50 |
4 | 5 | 26 | 1 | 2 | 1 | 2 | radiotherapy | S | D | 2.0 | 31397 | 7.0 | Trauma | Extreme | 2 | 51-60 | 5558.0 | 41-50 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
318433 | 318434 | 6 | 0 | 6 | 0 | 3 | radiotherapy | Q | F | 4.0 | 86499 | 23.0 | Emergency | Moderate | 3 | 41-50 | 4144.0 | 11-20 |
318434 | 318435 | 24 | 0 | 1 | 0 | 2 | anesthesia | Q | E | 4.0 | 325 | 8.0 | Urgent | Moderate | 4 | 81-90 | 6699.0 | 31-40 |
318435 | 318436 | 7 | 0 | 4 | 0 | 3 | gynecology | R | F | 4.0 | 125235 | 10.0 | Emergency | Minor | 3 | 71-80 | 4235.0 | 11-20 |
318436 | 318437 | 11 | 1 | 2 | 1 | 3 | anesthesia | Q | D | 3.0 | 91081 | 8.0 | Trauma | Minor | 5 | 11-20 | 3761.0 | 11-20 |
318437 | 318438 | 19 | 0 | 7 | 1 | 5 | gynecology | Q | C | 2.0 | 21641 | 8.0 | Emergency | Minor | 2 | 11-20 | 4752.0 | 0-10 |
318438 rows × 18 columns
# Check data types and data quality:
hos.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 318438 entries, 0 to 318437 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 case_id 318438 non-null int64 1 Hospital 318438 non-null int64 2 Hospital_type 318438 non-null int64 3 Hospital_city 318438 non-null int64 4 Hospital_region 318438 non-null int64 5 Available_Extra_Rooms_in_Hospital 318438 non-null int64 6 Department 318438 non-null object 7 Ward_Type 318438 non-null object 8 Ward_Facility 318438 non-null object 9 Bed_Grade 318325 non-null float64 10 patientid 318438 non-null int64 11 City_Code_Patient 313906 non-null float64 12 Type of Admission 318438 non-null object 13 Illness_Severity 318438 non-null object 14 Patient_Visitors 318438 non-null int64 15 Age 318438 non-null object 16 Admission_Deposit 318438 non-null float64 17 Stay_Days 318438 non-null object dtypes: float64(3), int64(8), object(7) memory usage: 43.7+ MB
The original dataset contains 318,438 instances. Of the 18 features, 11 are numeric and 7 are of object (categorical) datatype, as shown above. case_id and patientid are identifier columns with no predictive value; these two features will be dropped and not used in the analysis.
# Find Number of Missing Values for Each Variable:
hos.isnull().sum()
case_id 0 Hospital 0 Hospital_type 0 Hospital_city 0 Hospital_region 0 Available_Extra_Rooms_in_Hospital 0 Department 0 Ward_Type 0 Ward_Facility 0 Bed_Grade 113 patientid 0 City_Code_Patient 4532 Type of Admission 0 Illness_Severity 0 Patient_Visitors 0 Age 0 Admission_Deposit 0 Stay_Days 0 dtype: int64
The dataset has missing values for Bed_Grade and City_Code_Patient. Since the number of missing values is small relative to the size of the dataset (over 300,000 entries), the rows containing missing values will be removed for this analysis.
# Remove All Missing Values:
hos_df = hos.dropna()
hos_df.isnull().sum() #recheck missing values
case_id 0 Hospital 0 Hospital_type 0 Hospital_city 0 Hospital_region 0 Available_Extra_Rooms_in_Hospital 0 Department 0 Ward_Type 0 Ward_Facility 0 Bed_Grade 0 patientid 0 City_Code_Patient 0 Type of Admission 0 Illness_Severity 0 Patient_Visitors 0 Age 0 Admission_Deposit 0 Stay_Days 0 dtype: int64
# Recheck Data Types and Quality:
hos_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 313793 entries, 0 to 318437 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 case_id 313793 non-null int64 1 Hospital 313793 non-null int64 2 Hospital_type 313793 non-null int64 3 Hospital_city 313793 non-null int64 4 Hospital_region 313793 non-null int64 5 Available_Extra_Rooms_in_Hospital 313793 non-null int64 6 Department 313793 non-null object 7 Ward_Type 313793 non-null object 8 Ward_Facility 313793 non-null object 9 Bed_Grade 313793 non-null float64 10 patientid 313793 non-null int64 11 City_Code_Patient 313793 non-null float64 12 Type of Admission 313793 non-null object 13 Illness_Severity 313793 non-null object 14 Patient_Visitors 313793 non-null int64 15 Age 313793 non-null object 16 Admission_Deposit 313793 non-null float64 17 Stay_Days 313793 non-null object dtypes: float64(3), int64(8), object(7) memory usage: 45.5+ MB
# Summary Statistics:
hos_df.describe()
case_id | Hospital | Hospital_type | Hospital_city | Hospital_region | Available_Extra_Rooms_in_Hospital | Bed_Grade | patientid | City_Code_Patient | Patient_Visitors | Admission_Deposit | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 313793.000000 | 313793.000000 | 313793.000000 | 313793.000000 | 313793.000000 | 313793.000000 | 313793.000000 | 313793.000000 | 313793.000000 | 313793.000000 | 313793.000000 |
mean | 158937.911120 | 18.326419 | 1.257781 | 4.778169 | 0.780358 | 3.196419 | 2.622952 | 65743.029382 | 7.252447 | 3.280857 | 4881.893165 |
std | 91939.506976 | 8.633613 | 1.535338 | 3.103237 | 0.752329 | 1.167924 | 0.872618 | 37978.083237 | 4.745243 | 1.761758 | 1086.243945 |
min | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1800.000000 |
25% | 79271.000000 | 11.000000 | 0.000000 | 2.000000 | 0.000000 | 2.000000 | 2.000000 | 32833.000000 | 4.000000 | 2.000000 | 4188.000000 |
50% | 158950.000000 | 19.000000 | 1.000000 | 5.000000 | 1.000000 | 3.000000 | 3.000000 | 65735.000000 | 8.000000 | 3.000000 | 4742.000000 |
75% | 238399.000000 | 26.000000 | 2.000000 | 7.000000 | 1.000000 | 4.000000 | 3.000000 | 98472.000000 | 8.000000 | 4.000000 | 5410.000000 |
max | 318438.000000 | 32.000000 | 6.000000 | 13.000000 | 2.000000 | 24.000000 | 4.000000 | 131624.000000 | 38.000000 | 32.000000 | 11008.000000 |
From the summary statistics, there appear to be outliers for Available_Extra_Rooms_in_Hospital, City_Code_Patient, and Patient_Visitors, since the maximum for each of these features lies far above the usual upper fence of Q3 + 1.5 × IQR. We will confirm the presence of outliers by plotting the distribution of each feature.
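As a quick numeric check (a sketch assuming the common Q3 + 1.5 × IQR rule of thumb; the cutoffs used later are chosen from the plots), the upper fences can be computed directly:
# Sketch: compute the Q3 + 1.5*IQR upper fence for the suspected outlier features
for col in ['Available_Extra_Rooms_in_Hospital', 'City_Code_Patient', 'Patient_Visitors']:
    q1, q3 = hos_df[col].quantile([0.25, 0.75])
    upper_fence = q3 + 1.5 * (q3 - q1)
    print(f"{col}: upper fence = {upper_fence:.1f}, max = {hos_df[col].max()}")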
First, we will examine the distribution and relationship of the numeric features through histograms and boxplots. The following numeric features will be examined: Hospital, Hospital_type, Hospital_city, Hospital_region, Available_Extra_Rooms_in_Hospital, Bed_Grade, City_Code_Patient, Patient_Visitors, and Admission_Deposit. There are a total of 9 numeric features.
# Check distributions for numeric features:
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
sns.histplot(hos_df, x='Hospital', kde=True, color='olive', stat='frequency', ax=axes[0,0])
sns.histplot(hos_df, x='Hospital_type', kde=True, color='gold', stat='frequency', ax=axes[0,1])
sns.histplot(hos_df, x='Hospital_city', kde=True, color='limegreen', stat='frequency', ax=axes[1,0])
sns.histplot(hos_df, x='Hospital_region', kde=True, color='mediumaquamarine', stat='frequency', ax=axes[1,1])
plt.show()
# Check distribution for numeric features:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
sns.histplot(hos_df, x='Available_Extra_Rooms_in_Hospital', kde=True, color='teal', ax=axes[0,0])
sns.histplot(hos_df, x='Bed_Grade', kde=True, color='aqua', ax=axes[0,1])
sns.histplot(hos_df, x='City_Code_Patient', kde=True, color='royalblue', ax=axes[1,0])
sns.histplot(hos_df, x='Patient_Visitors', kde=True, color='blue', ax=axes[1,1])
plt.show()
Above, features such as Available_Extra_Rooms_in_Hospital, City_Code_Patient, and Patient_Visitors have outliers. For Available_Extra_Rooms_in_Hospital the outliers appear to occur above 7, for City_Code_Patient above roughly 15, and for Patient_Visitors above 7. The boxplots below confirm the presence of outliers.
# Boxplots showing outliers for Available_Extra_Rooms_in_Hospital, City_Code_Patient, & Patient_Visitors:
sns.boxplot(data=hos_df[['Available_Extra_Rooms_in_Hospital', 'City_Code_Patient', 'Patient_Visitors']],
orient="h", palette='winter')
plt.show()
# Number of observations suspected as outliers for Available_Extra_Rooms_in_Hospital:
count1 = (hos_df['Available_Extra_Rooms_in_Hospital'] > 7).sum()
count1
1336
# Number of observations suspected as outliers for City_Code_Patient:
count2 = (hos_df['City_Code_Patient'] > 15).sum()
count2
15827
# Number of observations suspected as outliers for Patient_Visitors:
count2 = (hos_df['Patient_Visitors'] > 7).sum()
count2
9612
# Check distribution for Admission_Deposit:
sns.histplot(hos_df, x='Admission_Deposit', kde=True, color='greenyellow')
plt.show()
Admission_Deposit is the only numeric variable with a fairly normal distribution. Hospital and Hospital_city appear to have roughly uniform distributions.
Next, we will examine the distribution and relationship of the categorical features through countplots. The following categorical features will be examined: Department, Ward_Type, Ward_Facility, Type of Admission, Illness_Severity, Age, and Stay_Days. There are a total of 7 categorical features including the target variable, Stay_Days.
# Check distributions for categorical features:
ward_order = ['P', 'Q', 'R', 'S', 'T', 'U'] #order for Ward_Type
wfac_order = ['A', 'B', 'C', 'D', 'E', 'F'] #order for Ward_Facility
fig, axes = plt.subplots(2, 2, figsize=(16, 12)) # set fig size
sns.countplot(x=hos_df['Department'], palette='YlGnBu', ax=axes[0,0]) #plot Department
sns.countplot(x=hos_df['Ward_Type'], order=ward_order, palette='PuBuGn_r', ax=axes[0,1]) #plot by order
sns.countplot(x=hos_df['Ward_Facility'], order=wfac_order, palette='PuBuGn', ax=axes[1,0]) #plot by order
sns.countplot(x=hos_df['Type of Admission'], palette='YlGnBu_r', ax=axes[1,1]) #plot Type of Admission
plt.show()
# Value Counts for Department:
hos_df['Department'].value_counts()
gynecology 245850 anesthesia 29187 radiotherapy 28153 TB & Chest disease 9460 surgery 1143 Name: Department, dtype: int64
# Proportion of each value for Department:
round(hos_df['Department'].value_counts(normalize=True) *100,2)
gynecology 78.35 anesthesia 9.30 radiotherapy 8.97 TB & Chest disease 3.01 surgery 0.36 Name: Department, dtype: float64
Above, the distributions for Ward_Type, Ward_Facility, and Type of Admission do not exhibit any abnormalities. However, the distribution for Department is highly skewed: over 78% of the instances belong to the gynecology department. This imbalance may affect the models later on. For this analysis, all Department instances will remain in the dataset.
# Check distributions for Illness_Severity and Age:
# Order for Age
age_order = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100']
fig, axes = plt.subplots(1, 2, figsize=(14, 5)) # set fig size
sns.countplot(x=hos_df['Illness_Severity'], palette='PuBuGn_r', ax=axes[0]) #plot for Illness_Severity
sns.countplot(x=hos_df['Age'], order=age_order, palette='YlGnBu_r', ax=axes[1]) #plot for Age by Order
plt.tight_layout()
plt.show()
# Check distribution for Stay_Days:
stay_order = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100', 'More than 100 Days']
fig = plt.figure(figsize=(14, 8)) #above, order for stay_days, set fig size
sns.countplot(x=hos_df['Stay_Days'], order=stay_order, palette='YlGnBu') #plot based on order
plt.tight_layout()
plt.show()
# Value Counts for Stay_Days:
hos_df['Stay_Days'].value_counts()
21-30 86316 11-20 77095 31-40 54312 51-60 34463 0-10 23250 41-50 11541 71-80 10096 More than 100 Days 6548 81-90 4761 91-100 2713 61-70 2698 Name: Stay_Days, dtype: int64
# Proportion of each class for Stay_Days:
round(hos_df['Stay_Days'].value_counts(normalize=True) *100,2)
21-30 27.51 11-20 24.57 31-40 17.31 51-60 10.98 0-10 7.41 41-50 3.68 71-80 3.22 More than 100 Days 2.09 81-90 1.52 91-100 0.86 61-70 0.86 Name: Stay_Days, dtype: float64
Above, there is a class imbalance for Stay_Days: roughly 90% of the instances correspond to stays of 60 days or less. Since the dataset has many instances and the majority of patients do not stay more than 60 days, all classes of 41 days or more will be combined into a new class labeled '41+ Days' in order to reduce the imbalance in the target variable.
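The 60-days-or-less figure can be verified directly from the ordered class proportions (a quick sketch reusing the stay_order list defined above):
# Sketch: cumulative share of instances by ordered Stay_Days class (~91% are 60 days or less)
stay_share = hos_df['Stay_Days'].value_counts(normalize=True).reindex(stay_order)
print(round(stay_share.cumsum() * 100, 2))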
Instances with Available_Extra_Rooms_in_Hospital or Patient_Visitors of 7 or more will be removed, as will instances with City_Code_Patient of 15 or more.
# Remove outliers for Available_Extra_Rooms_in_Hospital (keep values below 7):
cond1 = hos_df['Available_Extra_Rooms_in_Hospital'] < 7
hos_df2 = hos_df[cond1]
# Remove outliers for City_Code_Patient (keep values below 15):
cond2 = hos_df2['City_Code_Patient'] < 15
hos_df3 = hos_df2[cond2]
# Remove outliers for Patient_Visitors (keep values below 7):
cond3 = hos_df3['Patient_Visitors'] < 7
hos_df4 = hos_df3[cond3]
# Recheck Data:
hos_df4.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 274202 entries, 0 to 318437 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 case_id 274202 non-null int64 1 Hospital 274202 non-null int64 2 Hospital_type 274202 non-null int64 3 Hospital_city 274202 non-null int64 4 Hospital_region 274202 non-null int64 5 Available_Extra_Rooms_in_Hospital 274202 non-null int64 6 Department 274202 non-null object 7 Ward_Type 274202 non-null object 8 Ward_Facility 274202 non-null object 9 Bed_Grade 274202 non-null float64 10 patientid 274202 non-null int64 11 City_Code_Patient 274202 non-null float64 12 Type of Admission 274202 non-null object 13 Illness_Severity 274202 non-null object 14 Patient_Visitors 274202 non-null int64 15 Age 274202 non-null object 16 Admission_Deposit 274202 non-null float64 17 Stay_Days 274202 non-null object dtypes: float64(3), int64(8), object(7) memory usage: 39.7+ MB
# Recalculate Summary Statistics:
hos_df4.describe()
case_id | Hospital | Hospital_type | Hospital_city | Hospital_region | Available_Extra_Rooms_in_Hospital | Bed_Grade | patientid | City_Code_Patient | Patient_Visitors | Admission_Deposit | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 274202.000000 | 274202.000000 | 274202.000000 | 274202.000000 | 274202.000000 | 274202.000000 | 274202.000000 | 274202.000000 | 274202.000000 | 274202.000000 | 274202.000000 |
mean | 156856.854476 | 18.395227 | 1.259258 | 4.798853 | 0.778164 | 3.134904 | 2.602085 | 65680.640889 | 6.225389 | 3.033169 | 4905.324392 |
std | 91653.023209 | 8.644682 | 1.548230 | 3.094707 | 0.748784 | 1.057848 | 0.871178 | 37980.128423 | 3.011789 | 1.179913 | 1065.103524 |
min | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1801.000000 |
25% | 77249.250000 | 11.000000 | 0.000000 | 2.000000 | 0.000000 | 2.000000 | 2.000000 | 32671.000000 | 4.000000 | 2.000000 | 4222.000000 |
50% | 156369.500000 | 19.000000 | 1.000000 | 5.000000 | 1.000000 | 3.000000 | 3.000000 | 65659.500000 | 8.000000 | 3.000000 | 4763.000000 |
75% | 235446.750000 | 26.000000 | 2.000000 | 7.000000 | 1.000000 | 4.000000 | 3.000000 | 98348.000000 | 8.000000 | 4.000000 | 5421.000000 |
max | 318438.000000 | 32.000000 | 6.000000 | 13.000000 | 2.000000 | 6.000000 | 4.000000 | 131624.000000 | 14.000000 | 6.000000 | 11008.000000 |
# Recheck distribution for Hospital, Available_Extra_Rooms_in_Hospital, City_Code_Patient, & Patient_Visitors
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
sns.histplot(hos_df4, x='Hospital', kde=True, color='olive', stat='frequency', ax=axes[0,0])
sns.histplot(hos_df4, x='Available_Extra_Rooms_in_Hospital', kde=True, color='teal', ax=axes[0,1])
sns.histplot(hos_df4, x='City_Code_Patient', kde=True, color='royalblue', ax=axes[1,0])
sns.histplot(hos_df4, x='Patient_Visitors', kde=True, color='blue', ax=axes[1,1])
plt.show()
After removing outliers for Available_Extra_Rooms_in_Hospital, City_Code_Patient, and Patient_Visitors, the dataset has 274,202 instances. The distributions of these features are now less skewed, although they are still not normal.
Age: Since there are very few instances in the 91-100 age group, it will be combined with 81-90 into a single group labeled 81+.
# Value Counts for Age:
hos_df4['Age'].value_counts()
31-40 54916 41-50 54682 51-60 41746 21-30 34951 71-80 31008 61-70 29187 11-20 14336 81-90 6899 0-10 5348 91-100 1129 Name: Age, dtype: int64
# Proportion of each class for Age:
round(hos_df4['Age'].value_counts(normalize=True) *100,2)
31-40 20.03 41-50 19.94 51-60 15.22 21-30 12.75 71-80 11.31 61-70 10.64 11-20 5.23 81-90 2.52 0-10 1.95 91-100 0.41 Name: Age, dtype: float64
# Replace 81-90 and 91-100, as 81+ for Age:
hos_df4['Age'] = hos_df4['Age'].replace(['81-90', '91-100'], '81+')
# Recheck Value Counts for Age:
hos_df4['Age'].value_counts()
31-40 54916 41-50 54682 51-60 41746 21-30 34951 71-80 31008 61-70 29187 11-20 14336 81+ 8028 0-10 5348 Name: Age, dtype: int64
# Recheck distribution for Age:
fig = plt.figure(figsize=(8, 5))
age_order = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81+']
sns.countplot(x=hos_df4['Age'], order=age_order, palette='YlGnBu_r')
plt.show()
Stay_Days: Next, we will combine the Stay_Days classes 41-50, 51-60, 61-70, 71-80, 81-90, 91-100, and More than 100 Days into a single class, 41+.
# Value Counts for Stay_Days:
hos_df4['Stay_Days'].value_counts()
21-30 77902 11-20 69498 31-40 48098 51-60 29599 0-10 20703 41-50 10137 71-80 7849 81-90 3221 More than 100 Days 2936 61-70 2271 91-100 1988 Name: Stay_Days, dtype: int64
# Proportion of each class for Stay_Days:
round(hos_df4['Stay_Days'].value_counts(normalize=True) *100,2)
21-30 28.41 11-20 25.35 31-40 17.54 51-60 10.79 0-10 7.55 41-50 3.70 71-80 2.86 81-90 1.17 More than 100 Days 1.07 61-70 0.83 91-100 0.73 Name: Stay_Days, dtype: float64
# Replace 41-50, 51-60, 61-70, 71-80, 81-90, 91-100, and more than 100 days as 41+ for Stay_Days:
hos_df4['Stay_Days'] = hos_df4['Stay_Days'].replace(['41-50', '51-60', '61-70', '71-80', '81-90', '91-100', 'More than 100 Days'], '41+')
# Recheck Value Counts for Stay_Days:
hos_df4['Stay_Days'].value_counts()
21-30 77902 11-20 69498 41+ 58001 31-40 48098 0-10 20703 Name: Stay_Days, dtype: int64
# Recheck distribution for Stay_Days:
stay_order = ['0-10', '11-20', '21-30', '31-40', '41+'] #order for Stay_Days
fig = plt.figure(figsize=(8, 5)) #set fig size
sns.countplot(x=hos_df4['Stay_Days'], order=stay_order, palette='YlGnBu') #plot by order
plt.show()
# Reset Index in the DF:
hos_df4 = hos_df4.reset_index(drop=True) #reset index without adding index as a column
hos_df4 #check DF
case_id | Hospital | Hospital_type | Hospital_city | Hospital_region | Available_Extra_Rooms_in_Hospital | Department | Ward_Type | Ward_Facility | Bed_Grade | patientid | City_Code_Patient | Type of Admission | Illness_Severity | Patient_Visitors | Age | Admission_Deposit | Stay_Days | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 8 | 2 | 3 | 2 | 3 | radiotherapy | R | F | 2.0 | 31397 | 7.0 | Emergency | Extreme | 2 | 51-60 | 4911.0 | 0-10 |
1 | 2 | 2 | 2 | 5 | 2 | 2 | radiotherapy | S | F | 2.0 | 31397 | 7.0 | Trauma | Extreme | 2 | 51-60 | 5954.0 | 41+ |
2 | 3 | 10 | 4 | 1 | 0 | 2 | anesthesia | S | E | 2.0 | 31397 | 7.0 | Trauma | Extreme | 2 | 51-60 | 4745.0 | 31-40 |
3 | 4 | 26 | 1 | 2 | 1 | 2 | radiotherapy | R | D | 2.0 | 31397 | 7.0 | Trauma | Extreme | 2 | 51-60 | 7272.0 | 41+ |
4 | 5 | 26 | 1 | 2 | 1 | 2 | radiotherapy | S | D | 2.0 | 31397 | 7.0 | Trauma | Extreme | 2 | 51-60 | 5558.0 | 41+ |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
274197 | 318433 | 26 | 1 | 2 | 1 | 2 | anesthesia | R | D | 3.0 | 36018 | 6.0 | Emergency | Minor | 3 | 11-20 | 8308.0 | 21-30 |
274198 | 318435 | 24 | 0 | 1 | 0 | 2 | anesthesia | Q | E | 4.0 | 325 | 8.0 | Urgent | Moderate | 4 | 81+ | 6699.0 | 31-40 |
274199 | 318436 | 7 | 0 | 4 | 0 | 3 | gynecology | R | F | 4.0 | 125235 | 10.0 | Emergency | Minor | 3 | 71-80 | 4235.0 | 11-20 |
274200 | 318437 | 11 | 1 | 2 | 1 | 3 | anesthesia | Q | D | 3.0 | 91081 | 8.0 | Trauma | Minor | 5 | 11-20 | 3761.0 | 11-20 |
274201 | 318438 | 19 | 0 | 7 | 1 | 5 | gynecology | Q | C | 2.0 | 21641 | 8.0 | Emergency | Minor | 2 | 11-20 | 4752.0 | 0-10 |
274202 rows × 18 columns
X = hos_df4.drop(['case_id','Stay_Days', 'patientid'], axis=1) # Create DF for Features (remove case_id and Stay_Days)
y = hos_df4['Stay_Days'] # Create DF for Target
X # View Features
Hospital | Hospital_type | Hospital_city | Hospital_region | Available_Extra_Rooms_in_Hospital | Department | Ward_Type | Ward_Facility | Bed_Grade | City_Code_Patient | Type of Admission | Illness_Severity | Patient_Visitors | Age | Admission_Deposit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 2 | 3 | 2 | 3 | radiotherapy | R | F | 2.0 | 7.0 | Emergency | Extreme | 2 | 51-60 | 4911.0 |
1 | 2 | 2 | 5 | 2 | 2 | radiotherapy | S | F | 2.0 | 7.0 | Trauma | Extreme | 2 | 51-60 | 5954.0 |
2 | 10 | 4 | 1 | 0 | 2 | anesthesia | S | E | 2.0 | 7.0 | Trauma | Extreme | 2 | 51-60 | 4745.0 |
3 | 26 | 1 | 2 | 1 | 2 | radiotherapy | R | D | 2.0 | 7.0 | Trauma | Extreme | 2 | 51-60 | 7272.0 |
4 | 26 | 1 | 2 | 1 | 2 | radiotherapy | S | D | 2.0 | 7.0 | Trauma | Extreme | 2 | 51-60 | 5558.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
274197 | 26 | 1 | 2 | 1 | 2 | anesthesia | R | D | 3.0 | 6.0 | Emergency | Minor | 3 | 11-20 | 8308.0 |
274198 | 24 | 0 | 1 | 0 | 2 | anesthesia | Q | E | 4.0 | 8.0 | Urgent | Moderate | 4 | 81+ | 6699.0 |
274199 | 7 | 0 | 4 | 0 | 3 | gynecology | R | F | 4.0 | 10.0 | Emergency | Minor | 3 | 71-80 | 4235.0 |
274200 | 11 | 1 | 2 | 1 | 3 | anesthesia | Q | D | 3.0 | 8.0 | Trauma | Minor | 5 | 11-20 | 3761.0 |
274201 | 19 | 0 | 7 | 1 | 5 | gynecology | Q | C | 2.0 | 8.0 | Emergency | Minor | 2 | 11-20 | 4752.0 |
274202 rows × 15 columns
y # View Target
0 0-10 1 41+ 2 31-40 3 41+ 4 41+ ... 274197 21-30 274198 31-40 274199 11-20 274200 11-20 274201 0-10 Name: Stay_Days, Length: 274202, dtype: object
Since many of the features are categorical, dummy variables will be created for each categorical feature. The following categorical features will be transformed: Department, Ward_Type, Ward_Facility, Type of Admission, Illness_Severity, and Age. Several numeric features are likely categorical in nature, including Hospital_type, Hospital_city, Hospital_region, Bed_Grade, and City_Code_Patient. Bed_Grade, the condition of the bed, is likely ordinal since it represents a ranking of quality. For this analysis, these features will remain numeric and will be treated as numeric by the models; turning them into dummies would increase the dimensionality of the data without significantly increasing model performance.
# Specify Categorical Columns, Transform into dummy variables
cat_feats = ['Department', 'Ward_Type', 'Ward_Facility', 'Type of Admission', 'Illness_Severity', 'Age']
X_dummies = pd.get_dummies(X, columns=cat_feats, drop_first=False)
# Check Info for Dummies DF:
X_dummies.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 274202 entries, 0 to 274201 Data columns (total 41 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Hospital 274202 non-null int64 1 Hospital_type 274202 non-null int64 2 Hospital_city 274202 non-null int64 3 Hospital_region 274202 non-null int64 4 Available_Extra_Rooms_in_Hospital 274202 non-null int64 5 Bed_Grade 274202 non-null float64 6 City_Code_Patient 274202 non-null float64 7 Patient_Visitors 274202 non-null int64 8 Admission_Deposit 274202 non-null float64 9 Department_TB & Chest disease 274202 non-null uint8 10 Department_anesthesia 274202 non-null uint8 11 Department_gynecology 274202 non-null uint8 12 Department_radiotherapy 274202 non-null uint8 13 Department_surgery 274202 non-null uint8 14 Ward_Type_P 274202 non-null uint8 15 Ward_Type_Q 274202 non-null uint8 16 Ward_Type_R 274202 non-null uint8 17 Ward_Type_S 274202 non-null uint8 18 Ward_Type_T 274202 non-null uint8 19 Ward_Type_U 274202 non-null uint8 20 Ward_Facility_A 274202 non-null uint8 21 Ward_Facility_B 274202 non-null uint8 22 Ward_Facility_C 274202 non-null uint8 23 Ward_Facility_D 274202 non-null uint8 24 Ward_Facility_E 274202 non-null uint8 25 Ward_Facility_F 274202 non-null uint8 26 Type of Admission_Emergency 274202 non-null uint8 27 Type of Admission_Trauma 274202 non-null uint8 28 Type of Admission_Urgent 274202 non-null uint8 29 Illness_Severity_Extreme 274202 non-null uint8 30 Illness_Severity_Minor 274202 non-null uint8 31 Illness_Severity_Moderate 274202 non-null uint8 32 Age_0-10 274202 non-null uint8 33 Age_11-20 274202 non-null uint8 34 Age_21-30 274202 non-null uint8 35 Age_31-40 274202 non-null uint8 36 Age_41-50 274202 non-null uint8 37 Age_51-60 274202 non-null uint8 38 Age_61-70 274202 non-null uint8 39 Age_71-80 274202 non-null uint8 40 Age_81+ 274202 non-null uint8 dtypes: float64(3), int64(6), uint8(32) memory usage: 27.2 MB
Above, the dummies have been created for the categorical features and the dimensionality of the data has increased to 41 variables. The total number of rows is 274,202 and the total number of columns is 41.
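If the redundant reference level per categorical feature were a concern, drop_first=True could be used instead (a sketch, not applied in this analysis), reducing the 41 columns to 35:
# Sketch: dropping the first level of each of the 6 categorical features (41 - 6 = 35 columns)
X_dummies_alt = pd.get_dummies(X, columns=cat_feats, drop_first=True)  # hypothetical alternative
print(X_dummies_alt.shape)  # expected (274202, 35)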
The target variable, Stay_Days, can be considered ordinal, since each value represents a progression in the number of days spent in the hospital, and shorter stays may have different implications than longer ones. The target will therefore be encoded with LabelEncoder, whose alphabetical ordering of the class labels happens to coincide with the ordinal order here. The target variable has 5 categories, so this analysis is a multiclass classification problem.
# Instantiate the label encoder
label_encoder = LabelEncoder()
# Assign encoded values to y
y_T = label_encoder.fit_transform(y)
# Review Econding for y_T
yunique = np.unique(y)
y_Tunique = np.unique(y_T)
y_encode = dict(zip(yunique, y_Tunique))
print(y_encode)
{'0-10': 0, '11-20': 1, '21-30': 2, '31-40': 3, '41+': 4}
# View encoded y_T
y_T
array([0, 4, 3, ..., 1, 1, 0])
Above, the target has been transformed in the correct ordinal order: 0-10 days → 0, 11-20 days → 1, 21-30 days → 2, 31-40 days → 3, and 41+ days → 4.
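Note that LabelEncoder simply sorts the class labels alphabetically; here that sorting happens to coincide with the ordinal order. A more explicit alternative (a sketch, not used further in this analysis) would be to map the classes directly:
# Sketch: explicit ordinal mapping, guaranteeing the intended order regardless of alphabetical sorting
stay_mapping = {'0-10': 0, '11-20': 1, '21-30': 2, '31-40': 3, '41+': 4}
y_T_alt = y.map(stay_mapping).to_numpy()  # hypothetical alternative; identical to y_T in this case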
The data will be split into train, validation, and test sets at approximately a 70-20-10 ratio: 70% train, 20% validation, and 10% test. Random state will be set to 0 for all algorithms.
# Split Data into Train, Validation, and Test Sets at 70-20-10 ratio:
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_T, test_size=0.10, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=0)
X_train.shape # Check Training size
(197424, 41)
X_val.shape # Check Validation size
(49357, 41)
X_test.shape # Check Test size
(27421, 41)
# Print Train, Validation, and Test Split Ratios
print(f"Train Ratio: {X_train.shape[0]/X.shape[0]:.2f}")
print(f"Validation Ratio: {X_val.shape[0]/X.shape[0]:.2f}")
print(f"Test Ratio: {X_test.shape[0]/X.shape[0]:.2f}")
Train Ratio: 0.72 Validation Ratio: 0.18 Test Ratio: 0.10
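Note that splitting off 10% first and then 20% of the remainder yields roughly a 72-18-10 split. If an exact 70-20-10 split were desired, the second split would use test_size = 2/9 (since 20% of the full data is 2/9 of the remaining 90%), as in this sketch with hypothetical variable names:
# Sketch: an exact 70-20-10 split (illustration only)
X_tr, X_te, y_tr, y_te = train_test_split(X_dummies, y_T, test_size=0.10, random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X_tr, y_tr, test_size=2/9, random_state=0)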
Since the features vary widely in scale, they will be scaled. Because most features do not follow a normal distribution, Min-Max normalization will be used, which rescales each feature to the range 0 to 1. The normalized features will be used for all of the models, including the Support Vector Classifier models, which require scaled inputs.
# Normalize Features using Min-Max Scaler
mm = MinMaxScaler()
mm.fit(X_train)
X2n_train = mm.transform(X_train)
X2n_val = mm.transform(X_val)
X2n_test = mm.transform(X_test)
X2n_train.shape # Check Normalized Training
(197424, 41)
X2n_val.shape # Check Normalized Validation
(49357, 41)
X2n_test.shape # Check Normalized Testing
(27421, 41)
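Min-Max normalization maps each value x to (x − min) / (max − min), using the minimum and maximum of each feature in the training set. A quick sketch verifying the transform by hand:
# Sketch: manual min-max normalization, equivalent to the scaler fit on X_train
X2n_train_manual = (X_train - X_train.min()) / (X_train.max() - X_train.min())
print(np.allclose(X2n_train_manual.values, X2n_train))  # expected True (assuming no constant columns)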
# Function to Plot Validation Curve:
def plot_validation_curve(scores_params, param_name):
plt.plot(scores_params, label='cv') #plot the scores
plt.xticks(np.arange(len(param_name)), param_name, rotation=45) #set the x-axis ticks
plt.xlabel(str(param_name)) #label x-axis
plt.ylabel('Score') #label y-axis
plt.title('Validation Curve') # set title
plt.legend() #show legend
plt.show() #show plot
# Function to Fit Model, Make Prediction, and Get Scores saved as a DF:
def fit_predict_score(model, X_train, y_train, X_val, y_val): #takes as input model, X, and Y
model.fit(X_train, y_train) #fit model with training data
y_pred = model.predict(X_val) #predict on validation data
acc = np.round(accuracy_score(y_val, y_pred), 4) #accuracy rounded to 4 decimals
bal_acc = np.round(balanced_accuracy_score(y_val, y_pred), 4) #balanced accuracy rounded to 4 decimals
scores = {'Model': [model.__class__.__name__], 'Accuracy':[acc], 'Balanced Accuracy': [bal_acc]} #place model name & scores in dict
scores_df = pd.DataFrame(scores) #create df for scores
return scores_df #return df
# Function to Fit Model, Make Prediction, Display Confusion Matrix with Accuracy Score & Classification Report
def fit_predict_display_scores(model, X_train, y_train, X_val, y_val): #takes as input model, X, and Y
model.fit(X_train, y_train) #fit model with training data
y_pred = model.predict(X_val) #predict on validation data
acc = np.round(accuracy_score(y_val, y_pred), 4) #accuracy rounded to 4 decimals
metrics.ConfusionMatrixDisplay.from_predictions(y_val, y_pred) #display confusion matrix
plt.title(str(model.__class__.__name__) + ' | ' + 'Accuracy: ' + str(acc), fontsize=12) #title confusion matrix with name of clf and acc
plt.show() #show plot
    print(classification_report(y_val, y_pred)) #print classification report
# Function that Plots Feature Importance
def plot_feature_importances(model, df):
n_features = df.shape[1] #capture features from DF
df_colnames= df.columns #capture feature names
plt.barh(np.arange(n_features), model.feature_importances_, align='center', color='teal') #bar plot
plt.yticks(np.arange(n_features), df_colnames) #set the y-ticks
plt.xlabel("Feature Importance") #label x-axis
plt.ylabel("Features") #label y-axis
plt.title(str(model)) #set title
plt.ylim(-1, n_features) #set the y-axis limits
# Function that Predicts on a best_estimator and Displays Confusion Matrix with Accuracy Score
def fit_predict_display_bestscores(best_est, X_val, y_val): #takes as input model, X, and Y
y_pred = best_est.predict(X_val) #predict on validation data
acc = np.round(accuracy_score(y_val, y_pred), 4) #accuracy rounded to 4 decimals
metrics.ConfusionMatrixDisplay.from_predictions(y_val, y_pred) #display confusion matrix
plt.title(str(best_est.__class__.__name__) + ' | ' + 'Accuracy: ' + str(acc), fontsize=12) #title confusion matrix with name of clf and acc
plt.show() #show plot
    print(classification_report(y_val, y_pred)) #print classification report
# Function that Predicts on a best_estimator and Gets Scores saved as a DF
def fit_predict_bestscore(best_est, X_val, y_val): #takes as input model, X, and Y
y_pred = best_est.predict(X_val) #predict on validation data
acc = np.round(accuracy_score(y_val, y_pred), 4) #accuracy rounded to 4 decimals
bal_acc = np.round(balanced_accuracy_score(y_val, y_pred), 4) #balanced accuracy rounded to 4 decimals
scores = {'Model': [best_est.__class__.__name__], 'Accuracy':[acc], 'Balanced Accuracy': [bal_acc]} #place model name & scores in dict
scores_df = pd.DataFrame(scores) #create df for scores
return scores_df #return df
First, baseline classifier models will be built, fit to the training data, and evaluated on the validation data. Since there is an imbalance in the target class distribution, balanced accuracy will be reported alongside accuracy for all classifiers, and per-class F1 scores will be examined via the classification report. Confusion matrices and classification reports will be generated for every model, and all models will be evaluated on the validation data.
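For reference, balanced accuracy is the unweighted mean of per-class recall, so it is not inflated by the majority classes the way plain accuracy can be. A minimal illustration with hypothetical toy labels:
# Sketch: balanced accuracy equals the macro-average of per-class recall (toy example)
from sklearn.metrics import recall_score
y_true_demo = [0, 0, 0, 0, 1, 2]
y_pred_demo = [0, 0, 0, 0, 2, 2]
print(accuracy_score(y_true_demo, y_pred_demo))                 # 0.83 - dominated by class 0
print(balanced_accuracy_score(y_true_demo, y_pred_demo))        # 0.67
print(recall_score(y_true_demo, y_pred_demo, average='macro'))  # 0.67 - same value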
Tree-Based Classifiers: Random Forest, AdaBoost, and Gradient Boosting classifiers will be fitted as models. These classifiers are robust to outliers, support non-linear and multiclass prediction, and can handle categorical features.
Other Classifiers: The following classifiers support multiclass prediction and will be fitted as models for comparison: K-Nearest Neighbors (KNN), Linear Discriminant Analysis (LDA), Linear Support Vector Classifier (SVC), and a Stochastic Gradient Descent (SGD) classifier with its default linear SVM (hinge) loss.
# Random Forest: Initialize the Random Forest Classifier w/ Default Parameters
RF = RandomForestClassifier(random_state=0, class_weight='balanced') # balanced class_weight
# GradientBoost: Initialize the Gradient Boosting Classifier w/ Default Parameters
gbst = GradientBoostingClassifier(random_state=0)
# Adaboost: Initialize the Adaboost Classifier w/ Default Parameters
AdaB = AdaBoostClassifier(random_state=0)
# KNN Classifier: Initialize the KNN Classifier w/ Default Parameters
knn = KNeighborsClassifier(n_jobs = -1)
# LDA Classifier: Initialize the LDA Classifier w/ Default Parameters
lda = LinearDiscriminantAnalysis()
# Linear SVC: Initialize the Linear SVC Classifier w/ Default Parameters
linSVC = LinearSVC(random_state=0, class_weight='balanced') # balanced class_weight
# SGD Classifier: Initialize the SGD Classifier w/ Default Parameters
SGD = SGDClassifier(random_state=0, n_jobs=-1, class_weight='balanced')
# Random Forest:
RF_val = fit_predict_score(RF, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(RF, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
precision recall f1-score support 0 0.28 0.18 0.22 3653 1 0.39 0.41 0.40 12665 2 0.40 0.51 0.45 13916 3 0.32 0.20 0.25 8701 4 0.57 0.58 0.57 10422 accuracy 0.42 49357 macro avg 0.39 0.38 0.38 49357 weighted avg 0.41 0.42 0.41 49357
# GradientBoost:
gbst_val = fit_predict_score(gbst, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(gbst, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
precision recall f1-score support 0 0.42 0.12 0.18 3653 1 0.43 0.50 0.46 12665 2 0.42 0.67 0.51 13916 3 0.49 0.07 0.12 8701 4 0.62 0.60 0.61 10422 accuracy 0.46 49357 macro avg 0.47 0.39 0.38 49357 weighted avg 0.48 0.46 0.43 49357
# AdaBoost:
AdaB_val = fit_predict_score(AdaB, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(AdaB, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
precision recall f1-score support 0 0.28 0.08 0.13 3653 1 0.39 0.44 0.41 12665 2 0.41 0.63 0.50 13916 3 0.29 0.02 0.04 8701 4 0.53 0.63 0.58 10422 accuracy 0.43 49357 macro avg 0.38 0.36 0.33 49357 weighted avg 0.40 0.43 0.38 49357
# KNN:
knn_val = fit_predict_score(knn, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(knn, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
precision recall f1-score support 0 0.18 0.17 0.17 3653 1 0.34 0.44 0.38 12665 2 0.37 0.45 0.41 13916 3 0.28 0.18 0.22 8701 4 0.54 0.37 0.44 10422 accuracy 0.36 49357 macro avg 0.34 0.32 0.33 49357 weighted avg 0.37 0.36 0.36 49357
# LDA:
lda_val = fit_predict_score(lda, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(lda, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
precision recall f1-score support 0 0.55 0.02 0.04 3653 1 0.39 0.45 0.42 12665 2 0.40 0.65 0.50 13916 3 0.29 0.01 0.03 8701 4 0.57 0.64 0.60 10422 accuracy 0.44 49357 macro avg 0.44 0.35 0.32 49357 weighted avg 0.42 0.44 0.38 49357
# Linear SVC
linSVC_val = fit_predict_score(linSVC, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(linSVC, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
precision recall f1-score support 0 0.20 0.43 0.27 3653 1 0.42 0.34 0.37 12665 2 0.43 0.50 0.46 13916 3 0.14 0.00 0.00 8701 4 0.48 0.70 0.57 10422 accuracy 0.41 49357 macro avg 0.33 0.39 0.34 49357 weighted avg 0.37 0.41 0.37 49357
# SGD
sgd_val = fit_predict_score(SGD, X2n_train, y_train, X2n_val, y_val)
fit_predict_display_scores(SGD, X2n_train, y_train, X2n_val, y_val)
precision recall f1-score support 0 0.17 0.42 0.24 3653 1 0.45 0.03 0.06 12665 2 0.38 0.66 0.49 13916 3 0.10 0.00 0.00 8701 4 0.48 0.70 0.57 10422 accuracy 0.37 49357 macro avg 0.32 0.36 0.27 49357 weighted avg 0.35 0.37 0.29 49357
# Display Accuracy Results for Baseline Models
base_df = [knn_val, sgd_val, linSVC_val, RF_val, AdaB_val, lda_val, gbst_val]
base_results = pd.concat(base_df, ignore_index=True) #combine all model results
base_results
Model | Accuracy | Balanced Accuracy | |
---|---|---|---|
0 | KNeighborsClassifier | 0.3623 | 0.3223 |
1 | SGDClassifier | 0.3734 | 0.3626 |
2 | LinearSVC | 0.4056 | 0.3921 |
3 | RandomForestClassifier | 0.4199 | 0.3758 |
4 | AdaBoostClassifier | 0.4328 | 0.3602 |
5 | LinearDiscriminantAnalysis | 0.4371 | 0.3543 |
6 | GradientBoostingClassifier | 0.4636 | 0.3903 |
Above, the Gradient Boosting Classifier performed the best with an accuracy score of 0.4636 on the validation set. The LDA Classifier achieved an accuracy score of 0.4371 and the AdaBoost Classifier achieved 0.4328. Random Forest achieved 0.4199 and the Linear SVC achieved 0.4056. The SGD and KNN classifiers did not perform as well, which is expected since these classifiers tend to perform better with purely numeric features. Next, hyperparameter tuning will be implemented to optimize and improve model performance.
Random Forest, AdaBoost, and GradientBoost will be tuned using GridSearchCV, as these classifiers performed best among the baseline models. Due to the large size of the dataset and the processing power required by grid search, only one hyperparameter will be tuned per model. For Random Forest, max_depth will be optimized; for AdaBoost and GradientBoost, the learning rate will be optimized.
Cross-Validation: Stratified k-fold cross-validation will be used with 3 folds (cv=3). Since the dataset is quite large, limiting the search to 3 folds keeps the processing time and resources manageable, which is generally acceptable for large datasets.
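For classifiers, cross_val_score and GridSearchCV with an integer cv already default to stratified folds; passing an explicit StratifiedKFold object would be equivalent, as in this sketch:
# Sketch: explicit StratifiedKFold, equivalent to cv=3 for classification estimators
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)  # preserves the Stay_Days class proportions in each fold
# e.g. GridSearchCV(estimator=RF, param_grid=RF_params, cv=skf, n_jobs=-1)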
# Check values for max_depth
max_depth = [1, 5, 10, 15, 20, 30]
# Search parameter and get scores for each value
scores_params = [] #append scores
for m in max_depth: #for each value in max_depth
RF = RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=m) #initiate RF Classifier
scores = cross_val_score(RF, X=X2n_train, y=y_train, cv=3) #get cv score using classifer
scores_params.append(scores.mean()) #append mean score
# Plot Validation Curve
plot_validation_curve(scores_params, max_depth)
Above, the validation curve shows that the best scores occurred when max_depth is between 15 and 25. Values within this range will be checked in the GridSearch.
# Create parameter dictionary for max_depth
RF_params = dict(
max_depth = [15, 17, 19, 20, 21])
# Initiate Grid Search for max_depth
gs_RF = GridSearchCV(estimator=RF, param_grid=RF_params, cv=3, n_jobs=-1)
# Fit GridSearchCV to Training Set
gs_RF.fit(X2n_train, y_train)
GridSearchCV(cv=3, estimator=RandomForestClassifier(class_weight='balanced', max_depth=30, random_state=0), n_jobs=-1, param_grid={'max_depth': [15, 17, 19, 20, 21]})
# Capture Best Estimator Model for RF
best_RF = gs_RF.best_estimator_ #get best estimator
best_RF_val = fit_predict_bestscore(best_RF, X2n_val, y_val) #get scores to df
best_RF #show best estimator params
RandomForestClassifier(class_weight='balanced', max_depth=19, random_state=0)
# View Accuracy, Confusion Matrix, and Performance
fit_predict_display_bestscores(best_RF, X2n_val, y_val)
precision recall f1-score support 0 0.25 0.36 0.29 3653 1 0.42 0.43 0.42 12665 2 0.43 0.54 0.48 13916 3 0.39 0.18 0.25 8701 4 0.62 0.57 0.60 10422 accuracy 0.44 49357 macro avg 0.42 0.42 0.41 49357 weighted avg 0.45 0.44 0.44 49357
# Check Feature Importance from the model:
plot_feature_importances(best_RF, X_dummies)
With GridSearchCV, the best Random Forest estimator has max_depth=19 and achieved a validation accuracy of 0.4419, an improvement over the baseline model. From the F1 scores, the model predicts classes 1, 2, and 4 better than classes 0 and 3. The feature importance plot above shows that the most salient features are Admission_Deposit, Patient_Visitors, City_Code_Patient, Bed_Grade, Available_Extra_Rooms_in_Hospital, and Hospital, which are the numeric features in the dataset.
# Check values for learning_rate
learning_rate = [0.01, 0.1, 0.5, 1.0, 1.5]
# Search parameter and get scores for each value
scores_params = [] #append scores
for m in learning_rate: #for each value in learning_rate
gbst = GradientBoostingClassifier(random_state=0, learning_rate=m)
scores = cross_val_score(gbst, X=X2n_train, y=y_train, cv=3) #get cv score using classifer
scores_params.append(scores.mean()) #append mean score
# Plot Validation Curve
plot_validation_curve(scores_params, learning_rate)
Above, the validation curve shows that the best scores occurred when the learning rate is between 0.1 and 1.0. Values within this range will be checked in the GridSearch.
# Create parameter dictionary for learning_rate
gbst_params = dict(
learning_rate = [0.4, 0.5, 0.6])
# Initiate Grid Search for learning_rate
gs_gbst = GridSearchCV(estimator=gbst, param_grid=gbst_params, cv=3, n_jobs=-1)
# Fit GridSearchCV to Training Set
gs_gbst.fit(X2n_train, y_train)
GridSearchCV(cv=3, estimator=GradientBoostingClassifier(learning_rate=1.5, random_state=0), n_jobs=-1, param_grid={'learning_rate': [0.4, 0.5, 0.6]})
# Capture Best Estimator Model for gbst
best_gbst = gs_gbst.best_estimator_ #get best estimator
best_gbst_val = fit_predict_bestscore(best_gbst, X2n_val, y_val) #get scores to df
best_gbst #show best estimator params
GradientBoostingClassifier(learning_rate=0.5, random_state=0)
# Fit, Predict Score, and Display Confusion Matrix
fit_predict_display_bestscores(best_gbst, X2n_val, y_val)
precision recall f1-score support 0 0.40 0.14 0.21 3653 1 0.44 0.50 0.46 12665 2 0.42 0.66 0.52 13916 3 0.49 0.12 0.19 8701 4 0.63 0.59 0.61 10422 accuracy 0.47 49357 macro avg 0.47 0.40 0.40 49357 weighted avg 0.48 0.47 0.44 49357
# Check Feature Importance from the model:
plot_feature_importances(best_gbst, X_dummies)
With the GridSearchCV, the best estimator for GradientBoost has a learning rate=0.5 and achieved a validation accuracy of 0.4699, which is an improvement from the baseline model. From the F1 scores, similar to the Random Forest model, this model also appears to be able to predict class 1, 2, and 4 better than class 0 and 3. Above, the feature importance plot shows that the most salient features are Patient Visitors and Ward_Type. Compared to the Random Forest model, this model has very distinct salient features and is more interpretable.
# Check values for learning_rate
learning_rate = [0.1, 0.5, 0.7, 1.0, 1.5, 2.0]
# Search parameter and get scores for each value
scores_params = [] #append scores
for m in learning_rate: #for each value in learning_rate
AdaB = AdaBoostClassifier(random_state=0, learning_rate=m)
scores = cross_val_score(AdaB, X=X2n_train, y=y_train, cv=3) #get cv score using classifer
scores_params.append(scores.mean()) #append mean score
# Plot Validation Curve
plot_validation_curve(scores_params, learning_rate)
Above, the validation curve shows that the best scores occurred when the learning rate is between 0.5 and 1.0. Values within this range will be checked in the GridSearch.
# Create parameter dictionary for learning_rate
ada_params = dict(
learning_rate = [0.5, 0.6, 0.7, 0.8, 1.0])
# Initiate Grid Search for learning_rate
gs_ada = GridSearchCV(estimator=AdaB, param_grid=ada_params, cv=3, n_jobs=-1)
# Fit GridSearchCV to Training Set
gs_ada.fit(X2n_train, y_train)
GridSearchCV(cv=3, estimator=AdaBoostClassifier(learning_rate=2.0, random_state=0), n_jobs=-1, param_grid={'learning_rate': [0.5, 0.6, 0.7, 0.8, 1.0]})
# Capture Best Estimator Model for AdaBoost
best_ada = gs_ada.best_estimator_
best_ada_val = fit_predict_bestscore(best_ada, X2n_val, y_val)
best_ada
AdaBoostClassifier(learning_rate=0.7, random_state=0)
# View Accuracy, Confusion Matrix, and Performance
fit_predict_display_bestscores(best_ada, X2n_val, y_val)
precision recall f1-score support 0 0.30 0.06 0.09 3653 1 0.39 0.43 0.41 12665 2 0.41 0.65 0.50 13916 3 0.32 0.02 0.04 8701 4 0.54 0.63 0.58 10422 accuracy 0.43 49357 macro avg 0.39 0.36 0.32 49357 weighted avg 0.41 0.43 0.38 49357
# Check Feature Importance from the model:
plot_feature_importances(best_ada, X_dummies)
With GridSearchCV, the best AdaBoost estimator has learning_rate=0.7 and achieved a validation accuracy of 0.439, which is not much of an improvement over the baseline model. From the F1 scores, similar to the Random Forest and Gradient Boosting models, this model also predicts classes 1, 2, and 4 better than classes 0 and 3; however, it has much lower F1 scores for classes 0 and 3 than the other two models. The feature importance plot above shows that the most salient features are Patient_Visitors, Ward_Type, Admission_Deposit, City_Code_Patient, Bed_Grade, and Hospital. Compared to the Random Forest and Gradient Boosting models, this model is significantly faster to train; Gradient Boosting took the longest to compute, around 5 times slower than AdaBoost.
Since the GradientBoost and LDA classifiers performed well, enhanced versions of these models will be explored. An extended version of the Gradient Boosting Classifier is the Histogram-Based Gradient Boosting Classifier, an ensemble tree method in which training of the trees is accelerated by discretizing (binning) the continuous input features into a limited number of unique values. This method works well with large datasets and can handle a high number of categorical features. An extended version of the LDA Classifier is Quadratic Discriminant Analysis (QDA), which uses a quadratic decision surface to separate measurements of two or more classes instead of a linear one. QDA assumes that each class has its own covariance matrix and is generally more flexible than LDA.
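In scikit-learn, the binning granularity of HistGradientBoostingClassifier is controlled by the max_bins parameter (255 bins by default); a minimal sketch of setting it explicitly:
# Sketch: max_bins sets how many bins are used to discretize continuous features (default 255)
hgbst_demo = HistGradientBoostingClassifier(random_state=0, max_bins=255)  # illustration only, not used below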
# Check values for learning_rate
learning_rate = [0.01, 0.1, 0.5, 1.0, 1.5]
# Search parameter and get scores for each value
scores_params = [] #append scores
for m in learning_rate: #for each value in learning_rate
hgbst = HistGradientBoostingClassifier(random_state=0, learning_rate=m)
scores = cross_val_score(hgbst, X=X2n_train, y=y_train, cv=3) #get cv score using classifer
scores_params.append(scores.mean()) #append mean score
# Plot Validation Curve
plot_validation_curve(scores_params, learning_rate)
Above, the validation curve shows that the best scores occurred when the learning rate is between 0.01 and 0.1. Values within this range will be checked in the GridSearch.
# Create parameter dictionary for learning_rate
hgbst_params = dict(
learning_rate = [0.03, 0.05, 0.09, 0.1, 0.15])
# Initiate Grid Search for learning_rate
hgbst = HistGradientBoostingClassifier(random_state=0)
gs_hgbst = GridSearchCV(estimator=hgbst, param_grid=hgbst_params, cv=3, n_jobs=-1)
# Fit GridSearchCV to Training Set
gs_hgbst.fit(X2n_train, y_train)
GridSearchCV(cv=3, estimator=HistGradientBoostingClassifier(random_state=0), n_jobs=-1, param_grid={'learning_rate': [0.03, 0.05, 0.09, 0.1, 0.15]})
# Capture Best Estimator Model for HistGboost
best_hgbst = gs_hgbst.best_estimator_
best_hgbst_val = fit_predict_bestscore(best_hgbst, X2n_val, y_val)
best_hgbst
HistGradientBoostingClassifier(learning_rate=0.09, random_state=0)
# View Accuracy, Confusion Matrix, and Performance
fit_predict_display_bestscores(best_hgbst, X2n_val, y_val)
precision recall f1-score support 0 0.41 0.15 0.22 3653 1 0.44 0.50 0.47 12665 2 0.42 0.67 0.52 13916 3 0.49 0.13 0.20 8701 4 0.63 0.58 0.61 10422 accuracy 0.47 49357 macro avg 0.48 0.41 0.40 49357 weighted avg 0.48 0.47 0.45 49357
With GridSearchCV, the best Histogram-Based Gradient Boosting estimator has learning_rate=0.09 and achieved a validation accuracy of 0.4728, the highest accuracy of all models so far. This model is also significantly faster and more efficient than the optimized Gradient Boosting model. Its F1 scores for classes 0 and 3 improved compared to the other models, although it still predicts classes 1, 2, and 4 considerably better.
# Check values for reg_param
reg_param = [0.0, 0.2, 0.5, 0.7, 1.0, 1.2, 1.5, 1.7, 2.0]
# Search parameter and get scores for each value
scores_params = [] #append scores
for m in reg_param: #for each value in learning_rate
QDA = QuadraticDiscriminantAnalysis(reg_param=m)
scores = cross_val_score(QDA, X=X2n_train, y=y_train, cv=3) #get cv score using classifer
scores_params.append(scores.mean()) #append mean score
# Plot Validation Curve
plot_validation_curve(scores_params, reg_param)
Above, the validation curve shows that the best scores occurred when reg_param is between 0.1 and 0.5. Values within this range will be checked in the GridSearch.
# Create parameter dictionary for reg_param
qda_params = dict(
reg_param = [0.1, 0.15, 0.2, 0.25, 0.3])
# Initiate Grid Search for reg_param
QDA = QuadraticDiscriminantAnalysis()
gs_qda = GridSearchCV(estimator=QDA, param_grid=qda_params, cv=3, n_jobs=-1)
# Fit GridSearchCV to Training Set
gs_qda.fit(X2n_train, y_train)
/opt/miniconda3/envs/dsc_env/lib/python3.10/site-packages/sklearn/discriminant_analysis.py:887: UserWarning: Variables are collinear
  warnings.warn("Variables are collinear")
GridSearchCV(cv=3, estimator=QuadraticDiscriminantAnalysis(), n_jobs=-1, param_grid={'reg_param': [0.1, 0.15, 0.2, 0.25, 0.3]})
# Capture Best Estimator Model for QDA
best_qda = gs_qda.best_estimator_
best_qda_val = fit_predict_bestscore(best_qda, X2n_val, y_val)
best_qda
QuadraticDiscriminantAnalysis(reg_param=0.1)
# View Accuracy, Confusion Matrix, and Performance
fit_predict_display_bestscores(best_qda, X2n_val, y_val)
              precision    recall  f1-score   support
           0       0.31      0.00      0.01      3653
           1       0.38      0.41      0.39     12665
           2       0.37      0.63      0.47     13916
           3       0.31      0.00      0.01      8701
           4       0.45      0.51      0.48     10422
    accuracy                           0.39     49357
   macro avg       0.36      0.31      0.27     49357
weighted avg       0.37      0.39      0.34     49357
With GridSearchCV, the best estimator for QDA has reg_param=0.1 and achieved a validation accuracy of 0.3914, significantly lower than the accuracy achieved by the LDA model. Similar to the LDA model, this model is essentially unable to predict classes 0 and 3, with extremely low f1 scores of 0.01 for both classes.
# Display Accuracy Results for Optimized Models
optimized_df = [best_qda_val, best_ada_val, best_RF_val, best_gbst_val, best_hgbst_val]
optimized_results = pd.concat(optimized_df, ignore_index=True) #combine all model results
optimized_results
|   | Model                          | Accuracy | Balanced Accuracy |
|---|--------------------------------|----------|-------------------|
| 0 | QuadraticDiscriminantAnalysis  | 0.3914   | 0.3116            |
| 1 | AdaBoostClassifier             | 0.4329   | 0.3561            |
| 2 | RandomForestClassifier         | 0.4419   | 0.4171            |
| 3 | GradientBoostingClassifier     | 0.4699   | 0.4018            |
| 4 | HistGradientBoostingClassifier | 0.4728   | 0.4059            |
Above, when comparing the optimized models, the Histogram-Based Gradient Boosting model performed the best with a validation accuracy of 0.4728, while Quadratic Discriminant Analysis (QDA) performed the worst at 0.3914. The model with the highest balanced accuracy is Random Forest, which improved over its baseline and performed slightly better than AdaBoost. All of the models predict classes 1, 2, and 4 better than classes 0 and 3, as shown in their respective f1 scores. In terms of speed and efficiency, the Histogram-Based Gradient Boosting model was the most efficient, whereas the Gradient Boosting model was the slowest and required the most computation time.
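Since both accuracy and balanced accuracy are reported throughout, the small toy example below (made-up labels, not the hospital data) illustrates why the two metrics diverge on imbalanced classes: balanced accuracy is the macro-average of per-class recall, so a model that ignores a minority class is penalized even when plain accuracy looks acceptable.
# Toy illustration (synthetic labels): accuracy vs balanced accuracy on imbalanced classes
y_true_toy = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1]) #2 minority, 8 majority instances
y_pred_toy = np.ones(10, dtype=int) #always predicts the majority class
print(accuracy_score(y_true_toy, y_pred_toy)) #0.8
print(balanced_accuracy_score(y_true_toy, y_pred_toy)) #0.5 = mean of per-class recall (0.0 and 1.0)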
To further optimize model performance, multiclass learning meta-estimators will be implemented. These methods provide support for multiclass prediction and allow custom multiclass strategies.
One-vs-Rest (OvR) Approach: This multiclass strategy fits one classifier per class, with each class fitted against all of the other classes.
One-vs-One (OvO) Approach: This multiclass strategy fits one classifier per pair of classes; at prediction time, the class that receives the most votes is selected.
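As a quick sanity check of how these strategies decompose the 5-class problem, the sketch below fits both meta-estimators with a lightweight LogisticRegression base estimator on a small slice of the training data (an assumption for illustration only; the slice is assumed to contain all 5 classes): OvR fits one binary classifier per class, OvO one per pair of classes.
# Hedged sketch: count the binary classifiers fitted by OvR vs OvO on a small sample
from sklearn.linear_model import LogisticRegression
X_small = np.asarray(X2n_train)[:5000] #small slice for speed
y_small = np.asarray(y_train)[:5000]   #assumed to contain all 5 classes
ovr_demo = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X_small, y_small)
ovo_demo = OneVsOneClassifier(LogisticRegression(max_iter=1000)).fit(X_small, y_small)
print(len(ovr_demo.estimators_)) #5  -> one classifier per class
print(len(ovo_demo.estimators_)) #10 -> n_classes * (n_classes - 1) / 2 class pairs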
# Random Forest One-vs-Rest: Initialize Classifier
OvR_RF = OneVsRestClassifier(best_RF, n_jobs=-1)
# View Accuracy, Confusion Matrix, and Performance
OvR_RF_val = fit_predict_score(OvR_RF, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(OvR_RF, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.27      0.30      0.28      3653
           1       0.43      0.46      0.44     12665
           2       0.43      0.58      0.50     13916
           3       0.41      0.17      0.24      8701
           4       0.63      0.57      0.60     10422
    accuracy                           0.45     49357
   macro avg       0.43      0.42      0.41     49357
weighted avg       0.46      0.45      0.44     49357
Above, the OvR Random Forest model achieved a validation accuracy of 0.4537, which is an improvement from the previous optimized Random Forest model. This model slightly improved predictions for class 1 and 2, although it did not improve predictions for class 0 or 3.
# Adaboost One-vs-Rest: Initialize Classifier
OvR_ada = OneVsRestClassifier(best_ada, n_jobs=-1)
# View Accuracy, Confusion Matrix, and Performance
OvR_ada_val = fit_predict_score(OvR_ada, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(OvR_ada, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.46      0.05      0.10      3653
           1       0.40      0.46      0.43     12665
           2       0.41      0.64      0.50     13916
           3       0.35      0.01      0.03      8701
           4       0.54      0.64      0.59     10422
    accuracy                           0.44     49357
   macro avg       0.43      0.36      0.33     49357
weighted avg       0.43      0.44      0.39     49357
Above, the OvR AdaBoost model achieved a validation accuracy of 0.4424, which is an improvement over the previous optimized AdaBoost model. This model slightly improved its predictions and f1 score for class 0.
# Histogram-Based Gradient Boost One-vs-Rest: Initialize Classifier
OvR_hgbst = OneVsRestClassifier(best_hgbst, n_jobs=-1)
# View Accuracy, Confusion Matrix, and Performance
OvR_hgbst_val = fit_predict_score(OvR_hgbst, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(OvR_hgbst, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.42      0.14      0.21      3653
           1       0.44      0.50      0.47     12665
           2       0.42      0.67      0.52     13916
           3       0.50      0.11      0.18      8701
           4       0.63      0.59      0.61     10422
    accuracy                           0.47     49357
   macro avg       0.48      0.40      0.40     49357
weighted avg       0.48      0.47      0.44     49357
Above, the OvR Histogram-Based Gradient Boosting model achieved a validation accuracy of 0.4714, which is lower than the previous optimized Histogram-Based Gradient Boosting model. Thus, the One-vs-Rest strategy did not improve performance for this classifier.
# LDA One-vs-Rest: Initialize Classifier
OvR_lda = OneVsRestClassifier(lda, n_jobs=-1)
# View Accuracy, Confusion Matrix, and Performance
OvR_lda_val = fit_predict_score(OvR_lda, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(OvR_lda, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.67      0.01      0.03      3653
           1       0.39      0.44      0.41     12665
           2       0.40      0.67      0.50     13916
           3       0.09      0.00      0.00      8701
           4       0.56      0.64      0.60     10422
    accuracy                           0.44     49357
   macro avg       0.42      0.35      0.31     49357
weighted avg       0.40      0.44      0.38     49357
Above, the OvR LDA model achieved a validation accuracy of 0.4369, which is lower than the baseline LDA model. Thus, the One-vs-Rest strategy did not improve performance for the LDA classifier either.
# Random Forest One-vs-One: Initialize Classifier
OvO_RF = OneVsOneClassifier(best_RF, n_jobs=-1)
# View Accuracy, Confusion Matrix, and Performance
OvO_RF_val = fit_predict_score(OvO_RF, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(OvO_RF, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.29      0.30      0.29      3653
           1       0.42      0.45      0.43     12665
           2       0.43      0.58      0.49     13916
           3       0.41      0.18      0.25      8701
           4       0.63      0.58      0.60     10422
    accuracy                           0.45     49357
   macro avg       0.44      0.42      0.41     49357
weighted avg       0.46      0.45      0.44     49357
Above, the OvO Random Forest model achieved a validation accuracy of 0.4533, very similar to but slightly lower than the OvR Random Forest model. However, it slightly improved the f1 scores for classes 0 and 3 compared to the OvR Random Forest model, at the cost of slightly lower f1 scores for classes 1, 2, and 4.
# Adaboost One-vs-One: Initialize Classifier
OvO_ada = OneVsOneClassifier(best_ada, n_jobs=-1)
# View Accuracy, Confusion Matrix, and Performance
OvO_ada_val = fit_predict_score(OvO_ada, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(OvO_ada, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.40      0.08      0.13      3653
           1       0.41      0.47      0.44     12665
           2       0.42      0.63      0.50     13916
           3       0.36      0.04      0.07      8701
           4       0.54      0.63      0.58     10422
    accuracy                           0.44     49357
   macro avg       0.43      0.37      0.34     49357
weighted avg       0.43      0.44      0.40     49357
Above, the OvO AdaBoost model achieved a validation accuracy of 0.4432, a slight improvement over the OvR AdaBoost model. It also improved its predictions and f1 score for class 0 compared to the OvR AdaBoost model.
# Histogram-Based Gradient Boost One-vs-One: Initialize Classifier
OvO_hgbst = OneVsOneClassifier(best_hgbst, n_jobs=-1)
# View Accuracy, Confusion Matrix, and Performance
OvO_hgbst_val = fit_predict_score(OvO_hgbst, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(OvO_hgbst, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.41      0.15      0.22      3653
           1       0.44      0.50      0.47     12665
           2       0.42      0.66      0.52     13916
           3       0.51      0.13      0.21      8701
           4       0.64      0.58      0.61     10422
    accuracy                           0.47     49357
   macro avg       0.48      0.41      0.41     49357
weighted avg       0.49      0.47      0.45     49357
Above, the OvO Histogram-Based Gradient Boosting model achieved a validation accuracy of 0.4733, which is a slight improvement over the previous optimized Histogram-Based Gradient Boosting model and better than the OvR model. This model slightly improved in predictions and f1 score for class 3.
# LDA One-vs-One: Initialize Classifier
OvO_lda = OneVsOneClassifier(lda, n_jobs=-1)
# View Accuracy, Confusion Matrix, and Performance
OvO_lda_val = fit_predict_score(OvO_lda, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(OvO_lda, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.49      0.03      0.06      3653
           1       0.39      0.45      0.42     12665
           2       0.40      0.66      0.50     13916
           3       0.36      0.03      0.05      8701
           4       0.57      0.62      0.60     10422
    accuracy                           0.44     49357
   macro avg       0.44      0.36      0.32     49357
weighted avg       0.44      0.44      0.39     49357
Above, the OvO LDA model achieved a validation accuracy of 0.4383, which is a slight improvement over the previous baseline LDA model and better than the OvR LDA model. This model slightly improved in predictions and f1 score for class 0 and 1 compared to the baseline LDA model.
# Display Accuracy Results for OvR & OvO Models
OvROvO_df = [OvR_lda_val, OvO_lda_val, OvO_ada_val, OvR_ada_val, OvO_RF_val, OvR_RF_val,
OvR_hgbst_val, OvO_hgbst_val]
OvROvO_results = pd.concat(OvROvO_df, ignore_index=True) #combine all model results
OvROvO_results.loc[[0], ['Model']] = 'One-vs-Rest(OvR)-LDA' #rename model
OvROvO_results.loc[[1], ['Model']] = 'One-vs-One(OvO)-LDA' #rename model
OvROvO_results.loc[[2], ['Model']] = 'One-vs-One(OvO)-AdaBoost' #rename model
OvROvO_results.loc[[3], ['Model']] = 'One-vs-Rest(OvR)-AdaBoost' #rename model
OvROvO_results.loc[[4], ['Model']] = 'One-vs-One(OvO)-RandomForest' #rename model
OvROvO_results.loc[[5], ['Model']] = 'One-vs-Rest(OvR)-RandomForest' #rename model
OvROvO_results.loc[[6], ['Model']] = 'One-vs-Rest(OvR)-HistGradientBoost' #rename model
OvROvO_results.loc[[7], ['Model']] = 'One-vs-One(OvO)-HistGradientBoost' #rename model
OvROvO_results
|   | Model                              | Accuracy | Balanced Accuracy |
|---|------------------------------------|----------|-------------------|
| 0 | One-vs-Rest(OvR)-LDA               | 0.4369   | 0.3520            |
| 1 | One-vs-One(OvO)-LDA                | 0.4383   | 0.3568            |
| 2 | One-vs-One(OvO)-AdaBoost           | 0.4432   | 0.3691            |
| 3 | One-vs-Rest(OvR)-AdaBoost          | 0.4424   | 0.3633            |
| 4 | One-vs-One(OvO)-RandomForest       | 0.4533   | 0.4162            |
| 5 | One-vs-Rest(OvR)-RandomForest      | 0.4537   | 0.4164            |
| 6 | One-vs-Rest(OvR)-HistGradientBoost | 0.4714   | 0.4014            |
| 7 | One-vs-One(OvO)-HistGradientBoost  | 0.4733   | 0.4065            |
Above, when comparing the One-vs-Rest (OvR) and One-vs-One (OvO) models, the Histogram-Based Gradient Boost, LDA, and AdaBoost classifiers performed slightly better with the One-vs-One strategy, whereas Random Forest performed slightly better with One-vs-Rest. Implementing an OvR or OvO strategy improved the f1 scores for classes 0 and 3, the most difficult classes to predict in this data, for several of the classifiers. Overall, the best performing model is the Histogram-Based Gradient Boosting model with the One-vs-One strategy, which achieved a validation accuracy of 0.4733 and a balanced accuracy of 0.4065.
Voting ensemble models train several different classifiers and combine their predictions: with hard voting, the class receiving the majority of votes is chosen, while with soft voting the class with the highest average predicted probability is chosen. By aggregating the previously optimized models, a single voting model can be fitted that predicts the output based on the combined votes of its members.
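The toy example below (made-up probabilities, not model output) shows how the two voting modes used next can disagree: hard voting counts each classifier's predicted class, while soft voting averages the predicted class probabilities and takes the argmax.
# Toy illustration (synthetic probabilities): hard vs soft voting for one observation, 3 classifiers, 5 classes
probas = np.array([[0.10, 0.45, 0.35, 0.05, 0.05],  #classifier A -> class 1
                   [0.10, 0.35, 0.40, 0.10, 0.05],  #classifier B -> class 2
                   [0.05, 0.34, 0.36, 0.15, 0.10]]) #classifier C -> class 2
hard_winner = np.bincount(probas.argmax(axis=1)).argmax() #majority vote -> class 2
soft_winner = probas.mean(axis=0).argmax() #highest mean probability -> class 1
print(hard_winner, soft_winner)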
# Initialize VotingClassifier with best estimators for RF, HistGradientBoost, AdaBoost
vote_clf1 = VotingClassifier(estimators=[('RF', best_RF), ('HistGBoost', best_hgbst),
('AdaBoost', best_ada)], voting='hard') #vote = hard
# View Accuracy, Confusion Matrix, and Performance
vote1_val = fit_predict_score(vote_clf1, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(vote_clf1, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.31      0.19      0.24      3653
           1       0.43      0.50      0.46     12665
           2       0.43      0.64      0.51     13916
           3       0.51      0.10      0.17      8701
           4       0.62      0.59      0.61     10422
    accuracy                           0.46     49357
   macro avg       0.46      0.40      0.40     49357
weighted avg       0.48      0.46      0.44     49357
# Initialize VotingClassifier with best estimators for RF, HistGradientBoost, & AdaBoost
vote_clf2 = VotingClassifier(estimators=[('RF', best_RF), ('HistGBoost', best_hgbst),
('AdaBoost', best_ada)], voting='soft') #vote = soft
# View Accuracy, Confusion Matrix, and Performance
vote2_val = fit_predict_score(vote_clf2, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(vote_clf2, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.34      0.23      0.27      3653
           1       0.44      0.49      0.46     12665
           2       0.43      0.64      0.51     13916
           3       0.49      0.14      0.22      8701
           4       0.64      0.58      0.61     10422
    accuracy                           0.47     49357
   macro avg       0.47      0.42      0.42     49357
weighted avg       0.48      0.47      0.45     49357
Above, the Voting Classifier with soft voting achieved better performance than with hard voting using the optimized Random Forest, HistGradient Boost, and AdaBoost models. The validation accuracy for the hard voting model is 0.4645, whereas the soft voting model reaches 0.4706. In particular, the soft voting model has higher f1 scores and predicts classes 0 and 3 better than the hard voting model. Prior to this soft voting model, only the Histogram-Based Gradient Boost models had achieved an accuracy above 0.47. The Voting Classifier with soft voting is therefore an improvement over the Random Forest and AdaBoost models.
# Initialize VotingClassifier with OvR Models
vote_clf3 = VotingClassifier(estimators=[('OvR-RF', OvR_RF), ('OvR-HistGBoost', OvR_hgbst),
('OvR-AdaBoost', OvR_ada)], voting='hard') #vote = hard
# View Accuracy, Confusion Matrix, and Performance
vote3_val = fit_predict_score(vote_clf3, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(vote_clf3, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.32      0.16      0.21      3653
           1       0.43      0.50      0.46     12665
           2       0.42      0.65      0.51     13916
           3       0.52      0.09      0.16      8701
           4       0.62      0.60      0.61     10422
    accuracy                           0.47     49357
   macro avg       0.46      0.40      0.39     49357
weighted avg       0.48      0.47      0.44     49357
# Initialize VotingClassifier with OvR Models
vote_clf4 = VotingClassifier(estimators=[('OvR-RF', OvR_RF), ('OvR-HistGBoost', OvR_hgbst),
('OvR-AdaBoost', OvR_ada)], voting='soft') #vote = soft
# View Accuracy, Confusion Matrix, and Performance
vote4_val = fit_predict_score(vote_clf4, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(vote_clf4, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.38      0.18      0.24      3653
           1       0.44      0.50      0.47     12665
           2       0.42      0.66      0.52     13916
           3       0.51      0.12      0.20      8701
           4       0.63      0.59      0.61     10422
    accuracy                           0.47     49357
   macro avg       0.48      0.41      0.41     49357
weighted avg       0.49      0.47      0.45     49357
# Initialize VotingClassifier with OvO Models
vote_clf5 = VotingClassifier(estimators=[('OvO-RF', OvO_RF), ('OvO-HistGBoost', OvO_hgbst),
('OvO-AdaBoost', OvO_ada)], voting='hard') #vote = hard
# View Accuracy, Confusion Matrix, and Performance
vote5_val = fit_predict_score(vote_clf5, X2n_train, y_train, X2n_val, y_val) # get scores as DF
fit_predict_display_scores(vote_clf5, X2n_train, y_train, X2n_val, y_val) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.36      0.17      0.23      3653
           1       0.43      0.51      0.46     12665
           2       0.43      0.64      0.51     13916
           3       0.53      0.11      0.19      8701
           4       0.63      0.59      0.61     10422
    accuracy                           0.47     49357
   macro avg       0.47      0.41      0.40     49357
weighted avg       0.48      0.47      0.44     49357
Above, the Voting Classifier performed best with the One-vs-Rest (OvR) models and soft voting, achieving a validation accuracy of 0.4728. This is higher than the scores achieved individually by the OvR models, including the OvR HistGradient Boost model. The model also has higher f1 scores for classes 0 and 3 than the other models. The Voting Classifier with the OvR models and hard voting achieved a validation accuracy of 0.4660, whereas the OvO models with hard voting achieved 0.4691. Soft voting requires each estimator to provide class probabilities via predict_proba, which the One-vs-One classifier does not support, so a soft voting ensemble could not be built from the OvO models.
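This limitation can be verified directly: sklearn's VotingClassifier with voting='soft' averages each estimator's predict_proba output, and OneVsOneClassifier exposes only decision_function. The short check below is a sketch using the fitted meta-estimators from the earlier cells.
# Sketch: soft voting needs predict_proba, which the OvO meta-estimator does not provide
for name, clf in [('OvR-HistGBoost', OvR_hgbst), ('OvO-HistGBoost', OvO_hgbst)]:
    print(name, 'has predict_proba:', hasattr(clf, 'predict_proba'))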
# Display Accuracy Results for Voting Ensembles
vote_df = [vote1_val, vote3_val, vote5_val, vote2_val, vote4_val]
vote_results = pd.concat(vote_df, ignore_index=True) #combine all model results
vote_results.loc[[0], ['Model']] = 'VotingClassifier-Optimized Models w/ Hard Vote' #rename model
vote_results.loc[[1], ['Model']] = 'VotingClassifier-OvR w/ Hard Vote' #rename model
vote_results.loc[[2], ['Model']] = 'VotingClassifier-OvO w/ Hard Vote' #rename model
vote_results.loc[[3], ['Model']] = 'VotingClassifier-Optimized Models w/ Soft Vote' #rename model
vote_results.loc[[4], ['Model']] = 'VotingClassifier-OvR w/ Soft Vote' #rename model
vote_results
|   | Model                                          | Accuracy | Balanced Accuracy |
|---|------------------------------------------------|----------|-------------------|
| 0 | VotingClassifier-Optimized Models w/ Hard Vote | 0.4645   | 0.4043            |
| 1 | VotingClassifier-OvR w/ Hard Vote              | 0.4660   | 0.3996            |
| 2 | VotingClassifier-OvO w/ Hard Vote              | 0.4691   | 0.4057            |
| 3 | VotingClassifier-Optimized Models w/ Soft Vote | 0.4706   | 0.4165            |
| 4 | VotingClassifier-OvR w/ Soft Vote              | 0.4728   | 0.4097            |
With the voting ensemble models, soft voting performed better than hard voting. The Voting Classifier with the One-vs-Rest models (Random Forest, HistGradient Boost, and AdaBoost) and soft voting achieved the highest validation accuracy at 0.4728. The second highest performing model is the voting ensemble of the optimized models with soft voting, which achieved a similar but slightly lower accuracy of 0.4706; however, this model achieved the highest balanced accuracy of all models at 0.4165.
With optimization, multiclass learning models, and voting ensemble models, the best performing models achieved accuracy scores of around 0.47. These best models will now be evaluated on the test set for final evaluation. The best optimized model is the Histogram-Based Gradient Boosting Classifier, which is also used as the base estimator for the One-vs-Rest (OvR) and One-vs-One (OvO) classifiers. The best voting classifiers include one ensemble of the optimized Random Forest, Histogram-Based Gradient Boost, and AdaBoost with soft voting, and another ensemble of the OvR Random Forest, Histogram-Based Gradient Boost, and AdaBoost with soft voting.
# Display Classifier Models with Highest Accuracy
best_df = [vote2_val, OvR_hgbst_val, vote4_val, best_hgbst_val, OvO_hgbst_val]
best_results = pd.concat(best_df, ignore_index=True) #combine all model results
best_results.loc[[0], ['Model']] = 'VotingClassifier-Optimized Models w/ Soft Vote' #rename model
best_results.loc[[1], ['Model']] = 'One-vs-Rest(OvR)-HistGradientBoost' #rename model
best_results.loc[[2], ['Model']] = 'VotingClassifier-OvR w/ Soft Vote' #rename model
best_results.loc[[3], ['Model']] = 'HistGradientBoostingClassifier' #rename model
best_results.loc[[4], ['Model']] = 'One-vs-One(OvO)-HistGradientBoost' #rename model
best_results
|   | Model                                          | Accuracy | Balanced Accuracy |
|---|------------------------------------------------|----------|-------------------|
| 0 | VotingClassifier-Optimized Models w/ Soft Vote | 0.4706   | 0.4165            |
| 1 | One-vs-Rest(OvR)-HistGradientBoost             | 0.4714   | 0.4014            |
| 2 | VotingClassifier-OvR w/ Soft Vote              | 0.4728   | 0.4097            |
| 3 | HistGradientBoostingClassifier                 | 0.4728   | 0.4059            |
| 4 | One-vs-One(OvO)-HistGradientBoost              | 0.4733   | 0.4065            |
# View Classifier w/ Models & Soft Voting
vote_clf2
VotingClassifier(estimators=[('RF', RandomForestClassifier(class_weight='balanced', max_depth=19, random_state=0)),
                             ('HistGBoost', HistGradientBoostingClassifier(learning_rate=0.09, random_state=0)),
                             ('AdaBoost', AdaBoostClassifier(learning_rate=0.7, random_state=0))],
                 voting='soft')
# Fit, Predict on Test Set and View Performance
vote_test = fit_predict_score(vote_clf2, X2n_train, y_train, X2n_test, y_test) # get scores as DF
fit_predict_display_scores(vote_clf2, X2n_train, y_train, X2n_test, y_test) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.33      0.23      0.27      2010
           1       0.44      0.49      0.46      6998
           2       0.44      0.64      0.52      7788
           3       0.50      0.15      0.23      4792
           4       0.65      0.59      0.62      5833
    accuracy                           0.48     27421
   macro avg       0.47      0.42      0.42     27421
weighted avg       0.48      0.48      0.46     27421
The Voting Classifier with optimized models and soft voting achieved an accuracy score of 0.4763 on the testing set, which is better than the results from the validation set. The model in particular has a higher f1 score for class 0 on the testing set compared to the validation set.
# View Classifier
OvR_hgbst
OneVsRestClassifier(estimator=HistGradientBoostingClassifier(learning_rate=0.09, random_state=0), n_jobs=-1)
# Fit, Predict on Test Set and View Performance
OvR_hgbst_test = fit_predict_score(OvR_hgbst, X2n_train, y_train, X2n_test, y_test) # get scores as DF
fit_predict_display_scores(OvR_hgbst, X2n_train, y_train, X2n_test, y_test) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.41      0.14      0.20      2010
           1       0.44      0.49      0.47      6998
           2       0.43      0.68      0.52      7788
           3       0.51      0.12      0.19      4792
           4       0.64      0.60      0.62      5833
    accuracy                           0.48     27421
   macro avg       0.49      0.41      0.40     27421
weighted avg       0.49      0.48      0.45     27421
The One-vs-Rest Classifier with Histogram-Based Gradient Boost as the base estimator achieved an accuracy score of 0.4771 on the testing set, which is better than the results from the validation set. The model in particular has a slightly higher f1 score for class 3 and 4 on the testing set compared to the validation set.
# View Classifier w/ Models & Soft Voting
vote_clf4
VotingClassifier(estimators=[('OvR-RF', OneVsRestClassifier(estimator=RandomForestClassifier(class_weight='balanced', max_depth=19, random_state=0), n_jobs=-1)),
                             ('OvR-HistGBoost', OneVsRestClassifier(estimator=HistGradientBoostingClassifier(learning_rate=0.09, random_state=0), n_jobs=-1)),
                             ('OvR-AdaBoost', OneVsRestClassifier(estimator=AdaBoostClassifier(learning_rate=0.7, random_state=0), n_jobs=-1))],
                 voting='soft')
vote_OvR_test = fit_predict_score(vote_clf4, X2n_train, y_train, X2n_test, y_test) # get scores as DF
fit_predict_display_scores(vote_clf4, X2n_train, y_train, X2n_test, y_test) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.37      0.18      0.25      2010
           1       0.44      0.50      0.47      6998
           2       0.43      0.66      0.52      7788
           3       0.51      0.13      0.20      4792
           4       0.64      0.60      0.62      5833
    accuracy                           0.48     27421
   macro avg       0.48      0.41      0.41     27421
weighted avg       0.49      0.48      0.45     27421
The Voting Classifier with One-vs-Rest (OvR) models and soft voting achieved an accuracy score of 0.4784 on the testing set, which is better than the results from the validation set. The model in particular has a higher f1 score for class 0 and 4 on the testing set compared to the validation set.
# View Classifier w/ Optimized Parameters
best_hgbst
HistGradientBoostingClassifier(learning_rate=0.09, random_state=0)
# Fit, Predict on Test Set and View Performance
best_hgbst_test = fit_predict_score(best_hgbst, X2n_train, y_train, X2n_test, y_test) # get scores as DF
fit_predict_display_scores(best_hgbst, X2n_train, y_train, X2n_test, y_test) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.40      0.15      0.22      2010
           1       0.44      0.49      0.46      6998
           2       0.43      0.67      0.53      7788
           3       0.50      0.13      0.21      4792
           4       0.64      0.59      0.62      5833
    accuracy                           0.48     27421
   macro avg       0.48      0.41      0.41     27421
weighted avg       0.49      0.48      0.45     27421
The Histogram-Based Gradient Boosting model with optimized parameters achieved an accuracy score of 0.4781 on the testing set, which is slightly better than the results from the validation set. This model in particular has a higher f1 score for class 3 and 4 on the testing set compared to the validation set.
# View Classifier
OvO_hgbst
OneVsOneClassifier(estimator=HistGradientBoostingClassifier(learning_rate=0.09, random_state=0), n_jobs=-1)
# Fit, Predict on Test Set and View Performance
OvO_hgbst_test = fit_predict_score(OvO_hgbst, X2n_train, y_train, X2n_test, y_test) # get scores as DF
fit_predict_display_scores(OvO_hgbst, X2n_train, y_train, X2n_test, y_test) # display confusion matrix + score
              precision    recall  f1-score   support
           0       0.40      0.15      0.21      2010
           1       0.44      0.50      0.47      6998
           2       0.43      0.67      0.52      7788
           3       0.51      0.13      0.21      4792
           4       0.64      0.59      0.62      5833
    accuracy                           0.48     27421
   macro avg       0.48      0.41      0.41     27421
weighted avg       0.49      0.48      0.45     27421
The One-vs-One Classifier with Histogram-Based Gradient Boost as the base estimator achieved an accuracy score of 0.4776 on the testing set, which is better than its result on the validation set. This model is comparable to the One-vs-Rest Classifier with Histogram-Based Gradient Boost and achieves a slightly higher testing accuracy.
# Display Final Results from Testing Evaluation
final_df = [vote2_val, vote_test, OvR_hgbst_val, OvR_hgbst_test,
OvO_hgbst_val, OvO_hgbst_test, best_hgbst_val, best_hgbst_test, vote4_val, vote_OvR_test]
final_results = pd.concat(final_df, ignore_index=True) #combine all model results
final_results.loc[[0, 1], ['Model']] = 'VotingClassifier-Optimized Models w/ Soft Vote' #rename model
final_results.loc[[2, 3], ['Model']] = 'One-vs-Rest(OvR)-HistGradientBoost' #rename model
final_results.loc[[4, 5], ['Model']] = 'One-vs-One(OvO)-HistGradientBoost' #rename model
final_results.loc[[6, 7], ['Model']] = 'HistGradientBoostingClassifier' #rename model
final_results.loc[[8, 9], ['Model']] = 'VotingClassifier-OvR w/ Soft Vote' #rename model
final_results.rename(index={0: 'Validation', 1: 'Testing', 2: 'Validation', 3: 'Testing',
4: 'Validation', 5: 'Testing', 6: 'Validation', 7: 'Testing',
8: 'Validation', 9: 'Testing'}) #rename df index
|            | Model                                          | Accuracy | Balanced Accuracy |
|------------|------------------------------------------------|----------|-------------------|
| Validation | VotingClassifier-Optimized Models w/ Soft Vote | 0.4706   | 0.4165            |
| Testing    | VotingClassifier-Optimized Models w/ Soft Vote | 0.4763   | 0.4200            |
| Validation | One-vs-Rest(OvR)-HistGradientBoost             | 0.4714   | 0.4014            |
| Testing    | One-vs-Rest(OvR)-HistGradientBoost             | 0.4771   | 0.4053            |
| Validation | One-vs-One(OvO)-HistGradientBoost              | 0.4733   | 0.4065            |
| Testing    | One-vs-One(OvO)-HistGradientBoost              | 0.4776   | 0.4080            |
| Validation | HistGradientBoostingClassifier                 | 0.4728   | 0.4059            |
| Testing    | HistGradientBoostingClassifier                 | 0.4781   | 0.4092            |
| Validation | VotingClassifier-OvR w/ Soft Vote              | 0.4728   | 0.4097            |
| Testing    | VotingClassifier-OvR w/ Soft Vote              | 0.4784   | 0.4142            |
The Voting Classifier with the One-vs-Rest models (Random Forest, Histogram-Based Gradient Boost, and AdaBoost) and soft voting achieved the highest accuracy on the test set at 0.4784, slightly higher than its validation accuracy of 0.4728. The voting ensemble approach successfully combined three tree-based models into a single model that performed better than all other models. The runner-up is the optimized Histogram-Based Gradient Boosting Classifier, which achieved a testing accuracy of 0.4781, very close to the best model. This model is faster and more efficient, so if a simpler model is preferred, it would be the better choice over the best model. The model with the highest balanced accuracy is the Voting Classifier with the optimized models and soft voting, which achieved a balanced accuracy of 0.42.
Although the voting ensembles and the multiclass methods (One-vs-Rest and One-vs-One) improved model performance and the final models have higher accuracies than the baseline models, the best model only achieved a testing accuracy of 0.4784. This indicates the models still struggle to predict the target variable, Stay Days, reliably. Nevertheless, this performance is an improvement over the models presented by Jianing Pei et al., which analyzed the same dataset. Their analysis and findings were based on the original target variable with 9 classes, rather than the 5 classes used in this analysis. Their suggestion for future analysis was to reduce the number of target classes and focus on stays of 40 days or less; this analysis implemented that suggestion by creating the 41+ days class and condensing the target to 5 classes. The best model by Jianing Pei et al. achieved an accuracy of 0.3541, whereas, with the optimization methods and voting ensembles implemented here, the final models achieved accuracies above 0.47.
There are several reasons that may be preventing the models from achieving higher performance. First, the dataset contains mostly categorical features and very few numeric features. The two main numeric features, Patient Visitors and Admission Deposit, ended up being the most salient features for the models, as shown in the feature importance plots. With so many categorical features, the number of dummy features is quite high, which may affect model performance. Most importantly, the main reason the models may be unable to predict the target is the imbalance of the dataset. Across all of the models, the f1 scores were very low for classes 0 and 3. Class 0, which represents Stay Days of 0-10, has significantly fewer instances than classes 1, 2, and 4. When there is a high imbalance of instances, especially in the target variable, it becomes more challenging for the algorithm to learn and predict. In addition, the data is skewed in terms of feature distributions, with many features not following a normal distribution. In particular, over 78% of the instances come from a single department, Gynecology, out of 5 departments. Overall, the imbalance of the dataset is likely the main factor limiting model performance.
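The imbalance described above can be quantified directly from the encoded target. A minimal check, assuming y_train is the label-encoded target from the earlier split:
# Sketch: relative frequency of each Stay Days class in the training set
print(pd.Series(y_train).value_counts(normalize=True).sort_index().round(3))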
If given more time, further work on the dataset could be explored to improve model performance. First, since 78% of the data belongs to one department, Gynecology, an analysis of only that subset of the data could be explored instead of analyzing the entire dataset; this would reduce the number of features and potentially make the dataset more balanced. In addition, Ward Type has very few instances in 2 of its 5 categories, so either those instances could be dropped so that Ward Type has only 3 categories, or the feature could be dropped altogether. Based on the feature importance plot from the Gradient Boosting Classifier, Ward_Type and Ward_Facility are not important features and thus can be dropped. Next, for the target variable, class 0 (0-10 days) has too few instances; in a future analysis the target could be transformed so that the classes are combined as follows: 0-10 days and 11-20 days become 0-20 days (class 0), 21-30 days (class 1), 31-40 days (class 2), and 41+ days (class 3), as sketched below. By condensing the target variable further to 4 classes, the class imbalance may be reduced and the models may be better able to predict the target from the features. Additional feature selection could also be performed prior to model training: using the baseline models, specifically the tree-based models, select the most salient features and build enhanced models with feature selection applied. By addressing the data imbalance, target variable imbalance, and feature selection, models built with these changes in mind may perform better. Lastly, in terms of additional model optimization, since gradient boosting models fit this dataset best, a more advanced boosting ensemble could be explored.
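A sketch of the proposed further condensing of the target, assuming the classes are label-encoded in order as 0 = 0-10 days, 1 = 11-20 days, 2 = 21-30 days, 3 = 31-40 days, and 4 = 41+ days (as in this analysis); the two smallest bins are merged into a single 0-20 day class, leaving 4 classes.
# Sketch: merge classes 0 (0-10 days) and 1 (11-20 days) into a single 0-20 days class
merge_map = {0: 0,  #0-10 days  -> 0-20 days
             1: 0,  #11-20 days -> 0-20 days
             2: 1,  #21-30 days
             3: 2,  #31-40 days
             4: 3}  #41+ days
y_train_4 = pd.Series(y_train).map(merge_map) #re-binned training target with 4 classes
y_val_4 = pd.Series(y_val).map(merge_map)     #re-binned validation target
print(y_train_4.value_counts(normalize=True).sort_index().round(3))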