Cluster analysis using the K-Means algorithm is performed on the full dataset. To perform the cluster analysis, the class label, NObeyesdad, is first removed from the dataset. The dataset is then transformed to ensure that each feature has the correct data type, and dummy variables are created for the categorical features. The resulting numeric dataset contains 2,111 rows and 43 columns.
import numpy as np
import pylab as pl
import pandas as pd
import importlib
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import completeness_score, homogeneity_score
from sklearn.metrics import silhouette_samples
%pwd
'/Users/cl'
# Read the obesity CSV file into a pandas DataFrame; row 0 of the file holds
# the column names.
df = pd.read_csv(
    '/Users/cl/ObesityDataset.csv',
    header=0,
)
# Display the loaded DataFrame:
df
Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | 21.000000 | 1.620000 | 64.000000 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.000000 | no | 0.000000 | 1.000000 | no | Public_Transportation | Normal_Weight |
1 | Female | 21.000000 | 1.520000 | 56.000000 | yes | no | 3.0 | 3.0 | Sometimes | yes | 3.000000 | yes | 3.000000 | 0.000000 | Sometimes | Public_Transportation | Normal_Weight |
2 | Male | 23.000000 | 1.800000 | 77.000000 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.000000 | no | 2.000000 | 1.000000 | Frequently | Public_Transportation | Normal_Weight |
3 | Male | 27.000000 | 1.800000 | 87.000000 | no | no | 3.0 | 3.0 | Sometimes | no | 2.000000 | no | 2.000000 | 0.000000 | Frequently | Walking | Overweight_Level_I |
4 | Male | 22.000000 | 1.780000 | 89.800000 | no | no | 2.0 | 1.0 | Sometimes | no | 2.000000 | no | 0.000000 | 0.000000 | Sometimes | Public_Transportation | Overweight_Level_II |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2106 | Female | 20.976842 | 1.710730 | 131.408528 | yes | yes | 3.0 | 3.0 | Sometimes | no | 1.728139 | no | 1.676269 | 0.906247 | Sometimes | Public_Transportation | Obesity_Type_III |
2107 | Female | 21.982942 | 1.748584 | 133.742943 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.005130 | no | 1.341390 | 0.599270 | Sometimes | Public_Transportation | Obesity_Type_III |
2108 | Female | 22.524036 | 1.752206 | 133.689352 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.054193 | no | 1.414209 | 0.646288 | Sometimes | Public_Transportation | Obesity_Type_III |
2109 | Female | 24.361936 | 1.739450 | 133.346641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.852339 | no | 1.139107 | 0.586035 | Sometimes | Public_Transportation | Obesity_Type_III |
2110 | Female | 23.664709 | 1.738836 | 133.472641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.863513 | no | 1.026452 | 0.714137 | Sometimes | Public_Transportation | Obesity_Type_III |
2111 rows × 17 columns
# Remove the class label column (NObeyesdad) by name, so the selection does not
# depend on the column's position in the file. drop() returns a new DataFrame
# and leaves df untouched, so the labels can still be read from df afterwards.
df2 = df.drop(columns=['NObeyesdad'])
df2
Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | 21.000000 | 1.620000 | 64.000000 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.000000 | no | 0.000000 | 1.000000 | no | Public_Transportation |
1 | Female | 21.000000 | 1.520000 | 56.000000 | yes | no | 3.0 | 3.0 | Sometimes | yes | 3.000000 | yes | 3.000000 | 0.000000 | Sometimes | Public_Transportation |
2 | Male | 23.000000 | 1.800000 | 77.000000 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.000000 | no | 2.000000 | 1.000000 | Frequently | Public_Transportation |
3 | Male | 27.000000 | 1.800000 | 87.000000 | no | no | 3.0 | 3.0 | Sometimes | no | 2.000000 | no | 2.000000 | 0.000000 | Frequently | Walking |
4 | Male | 22.000000 | 1.780000 | 89.800000 | no | no | 2.0 | 1.0 | Sometimes | no | 2.000000 | no | 0.000000 | 0.000000 | Sometimes | Public_Transportation |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2106 | Female | 20.976842 | 1.710730 | 131.408528 | yes | yes | 3.0 | 3.0 | Sometimes | no | 1.728139 | no | 1.676269 | 0.906247 | Sometimes | Public_Transportation |
2107 | Female | 21.982942 | 1.748584 | 133.742943 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.005130 | no | 1.341390 | 0.599270 | Sometimes | Public_Transportation |
2108 | Female | 22.524036 | 1.752206 | 133.689352 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.054193 | no | 1.414209 | 0.646288 | Sometimes | Public_Transportation |
2109 | Female | 24.361936 | 1.739450 | 133.346641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.852339 | no | 1.139107 | 0.586035 | Sometimes | Public_Transportation |
2110 | Female | 23.664709 | 1.738836 | 133.472641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.863513 | no | 1.026452 | 0.714137 | Sometimes | Public_Transportation |
2111 rows × 16 columns
# Create an independent copy to clean. Plain assignment would only bind a
# second name to the same object, so every later column assignment would also
# modify df2.
cleaned_data = df2.copy()
cleaned_data
Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | 21.000000 | 1.620000 | 64.000000 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.000000 | no | 0.000000 | 1.000000 | no | Public_Transportation |
1 | Female | 21.000000 | 1.520000 | 56.000000 | yes | no | 3.0 | 3.0 | Sometimes | yes | 3.000000 | yes | 3.000000 | 0.000000 | Sometimes | Public_Transportation |
2 | Male | 23.000000 | 1.800000 | 77.000000 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.000000 | no | 2.000000 | 1.000000 | Frequently | Public_Transportation |
3 | Male | 27.000000 | 1.800000 | 87.000000 | no | no | 3.0 | 3.0 | Sometimes | no | 2.000000 | no | 2.000000 | 0.000000 | Frequently | Walking |
4 | Male | 22.000000 | 1.780000 | 89.800000 | no | no | 2.0 | 1.0 | Sometimes | no | 2.000000 | no | 0.000000 | 0.000000 | Sometimes | Public_Transportation |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2106 | Female | 20.976842 | 1.710730 | 131.408528 | yes | yes | 3.0 | 3.0 | Sometimes | no | 1.728139 | no | 1.676269 | 0.906247 | Sometimes | Public_Transportation |
2107 | Female | 21.982942 | 1.748584 | 133.742943 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.005130 | no | 1.341390 | 0.599270 | Sometimes | Public_Transportation |
2108 | Female | 22.524036 | 1.752206 | 133.689352 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.054193 | no | 1.414209 | 0.646288 | Sometimes | Public_Transportation |
2109 | Female | 24.361936 | 1.739450 | 133.346641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.852339 | no | 1.139107 | 0.586035 | Sometimes | Public_Transportation |
2110 | Female | 23.664709 | 1.738836 | 133.472641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.863513 | no | 1.026452 | 0.714137 | Sometimes | Public_Transportation |
2111 rows × 16 columns
# FCVC, NCP, CH2O, FAF and TUE are renamed into categorical values further
# down; as a first step cast them, together with Age, from float to integer.
# astype('int') drops the fractional part (an Age of 20.98 becomes 20).
for column in ['FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'Age']:
    cleaned_data[column] = cleaned_data[column].astype('int')
# Confirm the resulting data types:
cleaned_data.dtypes
Gender object Age int64 Height float64 Weight float64 family_history_with_overweight object FAVC object FCVC int64 NCP int64 CAEC object SMOKE object CH2O int64 SCC object FAF int64 TUE int64 CALC object MTRANS object dtype: object
# Map the integer codes of each survey feature to its categorical name.
# Codes that do not appear in a mapping are left unchanged by replace().
category_names = {
    'FCVC': {1: 'Never', 2: 'Sometimes', 3: 'Always'},
    'NCP': {1: '1', 2: '2', 3: '3', 4: '3+'},
    'CH2O': {
        1: 'Less than a liter',
        2: 'Between 1 and 2 L',
        3: 'More than 2 L',
    },
    'FAF': {
        0: 'I do not have',
        1: '1 or 2 days',
        2: '2 or 4 days',
        3: '4 or 5 days',
    },
    'TUE': {0: '0-2 Hours', 1: '3-5 Hours', 2: 'More than 5 Hours'},
}
for column, names in category_names.items():
    cleaned_data[column] = cleaned_data[column].replace(names)
cleaned_data
Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | 21 | 1.620000 | 64.000000 | yes | no | Sometimes | 3 | Sometimes | no | Between 1 and 2 L | no | I do not have | 3-5 Hours | no | Public_Transportation |
1 | Female | 21 | 1.520000 | 56.000000 | yes | no | Always | 3 | Sometimes | yes | More than 2 L | yes | 4 or 5 days | 0-2 Hours | Sometimes | Public_Transportation |
2 | Male | 23 | 1.800000 | 77.000000 | yes | no | Sometimes | 3 | Sometimes | no | Between 1 and 2 L | no | 2 or 4 days | 3-5 Hours | Frequently | Public_Transportation |
3 | Male | 27 | 1.800000 | 87.000000 | no | no | Always | 3 | Sometimes | no | Between 1 and 2 L | no | 2 or 4 days | 0-2 Hours | Frequently | Walking |
4 | Male | 22 | 1.780000 | 89.800000 | no | no | Sometimes | 1 | Sometimes | no | Between 1 and 2 L | no | I do not have | 0-2 Hours | Sometimes | Public_Transportation |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2106 | Female | 20 | 1.710730 | 131.408528 | yes | yes | Always | 3 | Sometimes | no | Less than a liter | no | 1 or 2 days | 0-2 Hours | Sometimes | Public_Transportation |
2107 | Female | 21 | 1.748584 | 133.742943 | yes | yes | Always | 3 | Sometimes | no | Between 1 and 2 L | no | 1 or 2 days | 0-2 Hours | Sometimes | Public_Transportation |
2108 | Female | 22 | 1.752206 | 133.689352 | yes | yes | Always | 3 | Sometimes | no | Between 1 and 2 L | no | 1 or 2 days | 0-2 Hours | Sometimes | Public_Transportation |
2109 | Female | 24 | 1.739450 | 133.346641 | yes | yes | Always | 3 | Sometimes | no | Between 1 and 2 L | no | 1 or 2 days | 0-2 Hours | Sometimes | Public_Transportation |
2110 | Female | 23 | 1.738836 | 133.472641 | yes | yes | Always | 3 | Sometimes | no | Between 1 and 2 L | no | 1 or 2 days | 0-2 Hours | Sometimes | Public_Transportation |
2111 rows × 16 columns
# Show every column when a wide DataFrame is displayed:
pd.options.display.max_columns = 999
# One-hot encode the categorical features of the cleaned dataset:
data_numeric = pd.get_dummies(cleaned_data)
data_numeric
Age | Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21 | 1.620000 | 64.000000 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1 | 21 | 1.520000 | 56.000000 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 23 | 1.800000 | 77.000000 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 27 | 1.800000 | 87.000000 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 22 | 1.780000 | 89.800000 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2106 | 20 | 1.710730 | 131.408528 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2107 | 21 | 1.748584 | 133.742943 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2108 | 22 | 1.752206 | 133.689352 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2109 | 24 | 1.739450 | 133.346641 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2110 | 23 | 1.738836 | 133.472641 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2111 rows × 43 columns
# Persist the one-hot encoded DataFrame for future use (row index omitted):
data_numeric.to_csv('/Users/cl/Obesity_numeric.csv', index=False)
# Min-max scale a copy of the numeric dataset one column at a time:
# scaled = (value - column minimum) / (column maximum - column minimum)
df_min_max_scaled = data_numeric.copy()
for column in df_min_max_scaled.columns:
    col_min = df_min_max_scaled[column].min()
    col_max = df_min_max_scaled[column].max()
    df_min_max_scaled[column] = (df_min_max_scaled[column] - col_min) / (col_max - col_min)
# Print the normalized data:
print(df_min_max_scaled)
Age Height Weight Gender_Female Gender_Male \ 0 0.148936 0.320755 0.186567 1.0 0.0 1 0.148936 0.132075 0.126866 1.0 0.0 2 0.191489 0.660377 0.283582 0.0 1.0 3 0.276596 0.660377 0.358209 0.0 1.0 4 0.170213 0.622642 0.379104 0.0 1.0 ... ... ... ... ... ... 2106 0.127660 0.491943 0.689616 1.0 0.0 2107 0.148936 0.563366 0.707037 1.0 0.0 2108 0.170213 0.570200 0.706637 1.0 0.0 2109 0.212766 0.546132 0.704079 1.0 0.0 2110 0.191489 0.544974 0.705020 1.0 0.0 family_history_with_overweight_no family_history_with_overweight_yes \ 0 0.0 1.0 1 0.0 1.0 2 0.0 1.0 3 1.0 0.0 4 1.0 0.0 ... ... ... 2106 0.0 1.0 2107 0.0 1.0 2108 0.0 1.0 2109 0.0 1.0 2110 0.0 1.0 FAVC_no FAVC_yes FCVC_Always FCVC_Never FCVC_Sometimes NCP_1 \ 0 1.0 0.0 0.0 0.0 1.0 0.0 1 1.0 0.0 1.0 0.0 0.0 0.0 2 1.0 0.0 0.0 0.0 1.0 0.0 3 1.0 0.0 1.0 0.0 0.0 0.0 4 1.0 0.0 0.0 0.0 1.0 1.0 ... ... ... ... ... ... ... 2106 0.0 1.0 1.0 0.0 0.0 0.0 2107 0.0 1.0 1.0 0.0 0.0 0.0 2108 0.0 1.0 1.0 0.0 0.0 0.0 2109 0.0 1.0 1.0 0.0 0.0 0.0 2110 0.0 1.0 1.0 0.0 0.0 0.0 NCP_2 NCP_3 NCP_3+ CAEC_Always CAEC_Frequently CAEC_Sometimes \ 0 0.0 1.0 0.0 0.0 0.0 1.0 1 0.0 1.0 0.0 0.0 0.0 1.0 2 0.0 1.0 0.0 0.0 0.0 1.0 3 0.0 1.0 0.0 0.0 0.0 1.0 4 0.0 0.0 0.0 0.0 0.0 1.0 ... ... ... ... ... ... ... 2106 0.0 1.0 0.0 0.0 0.0 1.0 2107 0.0 1.0 0.0 0.0 0.0 1.0 2108 0.0 1.0 0.0 0.0 0.0 1.0 2109 0.0 1.0 0.0 0.0 0.0 1.0 2110 0.0 1.0 0.0 0.0 0.0 1.0 CAEC_no SMOKE_no SMOKE_yes CH2O_Between 1 and 2 L \ 0 0.0 1.0 0.0 1.0 1 0.0 0.0 1.0 0.0 2 0.0 1.0 0.0 1.0 3 0.0 1.0 0.0 1.0 4 0.0 1.0 0.0 1.0 ... ... ... ... ... 2106 0.0 1.0 0.0 0.0 2107 0.0 1.0 0.0 1.0 2108 0.0 1.0 0.0 1.0 2109 0.0 1.0 0.0 1.0 2110 0.0 1.0 0.0 1.0 CH2O_Less than a liter CH2O_More than 2 L SCC_no SCC_yes \ 0 0.0 0.0 1.0 0.0 1 0.0 1.0 0.0 1.0 2 0.0 0.0 1.0 0.0 3 0.0 0.0 1.0 0.0 4 0.0 0.0 1.0 0.0 ... ... ... ... ... 
2106 1.0 0.0 1.0 0.0 2107 0.0 0.0 1.0 0.0 2108 0.0 0.0 1.0 0.0 2109 0.0 0.0 1.0 0.0 2110 0.0 0.0 1.0 0.0 FAF_1 or 2 days FAF_2 or 4 days FAF_4 or 5 days FAF_I do not have \ 0 0.0 0.0 0.0 1.0 1 0.0 0.0 1.0 0.0 2 0.0 1.0 0.0 0.0 3 0.0 1.0 0.0 0.0 4 0.0 0.0 0.0 1.0 ... ... ... ... ... 2106 1.0 0.0 0.0 0.0 2107 1.0 0.0 0.0 0.0 2108 1.0 0.0 0.0 0.0 2109 1.0 0.0 0.0 0.0 2110 1.0 0.0 0.0 0.0 TUE_0-2 Hours TUE_3-5 Hours TUE_More than 5 Hours CALC_Always \ 0 0.0 1.0 0.0 0.0 1 1.0 0.0 0.0 0.0 2 0.0 1.0 0.0 0.0 3 1.0 0.0 0.0 0.0 4 1.0 0.0 0.0 0.0 ... ... ... ... ... 2106 1.0 0.0 0.0 0.0 2107 1.0 0.0 0.0 0.0 2108 1.0 0.0 0.0 0.0 2109 1.0 0.0 0.0 0.0 2110 1.0 0.0 0.0 0.0 CALC_Frequently CALC_Sometimes CALC_no MTRANS_Automobile \ 0 0.0 0.0 1.0 0.0 1 0.0 1.0 0.0 0.0 2 1.0 0.0 0.0 0.0 3 1.0 0.0 0.0 0.0 4 0.0 1.0 0.0 0.0 ... ... ... ... ... 2106 0.0 1.0 0.0 0.0 2107 0.0 1.0 0.0 0.0 2108 0.0 1.0 0.0 0.0 2109 0.0 1.0 0.0 0.0 2110 0.0 1.0 0.0 0.0 MTRANS_Bike MTRANS_Motorbike MTRANS_Public_Transportation \ 0 0.0 0.0 1.0 1 0.0 0.0 1.0 2 0.0 0.0 1.0 3 0.0 0.0 0.0 4 0.0 0.0 1.0 ... ... ... ... 2106 0.0 0.0 1.0 2107 0.0 0.0 1.0 2108 0.0 0.0 1.0 2109 0.0 0.0 1.0 2110 0.0 0.0 1.0 MTRANS_Walking 0 0.0 1 0.0 2 0.0 3 1.0 4 0.0 ... ... 2106 0.0 2107 0.0 2108 0.0 2109 0.0 2110 0.0 [2111 rows x 43 columns]
# Pull the class label column out of the original DataFrame and display it:
labels_df = df.loc[:, 'NObeyesdad']
labels_df
0 Normal_Weight 1 Normal_Weight 2 Normal_Weight 3 Overweight_Level_I 4 Overweight_Level_II ... 2106 Obesity_Type_III 2107 Obesity_Type_III 2108 Obesity_Type_III 2109 Obesity_Type_III 2110 Obesity_Type_III Name: NObeyesdad, Length: 2111, dtype: object
# Encode the class label strings as integers: learn the set of classes first,
# then map every label to its integer code.
le = preprocessing.LabelEncoder().fit(labels_df)
labels_num = le.transform(labels_df)
labels_num
array([1, 1, 1, ..., 4, 4, 4])
# Build and print a lookup from each integer code to its class label name:
label_names = {
    code: name
    for code, name in zip(le.transform(le.classes_), le.classes_)
}
print(label_names)
{0: 'Insufficient_Weight', 1: 'Normal_Weight', 2: 'Obesity_Type_I', 3: 'Obesity_Type_II', 4: 'Obesity_Type_III', 5: 'Overweight_Level_I', 6: 'Overweight_Level_II'}
Below is the exploration of clustering using K-means with the normalized data. Various values of k were tested, and for each value the cluster centroids were examined to determine whether any pattern appears in the data. A silhouette analysis is performed to evaluate the separation between the resulting clusters and determine the quality of the clusters. The silhouette plots display a measure of how close each point in one cluster is to points in the neighboring clusters. The mean silhouette value is calculated and used as a threshold when determining the cluster quality. Clusters with most of their coefficients above the mean silhouette value are considered better quality, which means that they are farther away from the neighboring clusters. Clusters with most of their coefficients below the mean silhouette value contain samples that are very close to the decision boundary between two neighboring clusters, and negative coefficient values indicate that samples may have been assigned to the wrong cluster. When the silhouette plot does not display any negative coefficients and the clusters' plots are thick and extend above the silhouette mean, an appropriate value of k has been selected.
# Fit K-means with k = 5 on the min-max scaled data.
# n_init is stated explicitly so the number of restarts (10) does not depend on
# the default of the installed scikit-learn version, and random_state is fixed
# so that cluster numbering and sizes are reproducible from run to run.
kmeans = KMeans(n_clusters=5, n_init=10, max_iter=500, random_state=42, verbose=1)
kmeans.fit(df_min_max_scaled)
Initialization complete Iteration 0, inertia 11787.00508682668 Iteration 1, inertia 8566.056202078382 Iteration 2, inertia 8338.71778464269 Iteration 3, inertia 8237.82330971913 Iteration 4, inertia 8176.41527470928 Iteration 5, inertia 8099.540960517189 Iteration 6, inertia 8076.56013777138 Iteration 7, inertia 8068.9930112547 Iteration 8, inertia 8068.221972230656 Iteration 9, inertia 8068.076896983779 Iteration 10, inertia 8067.991083649056 Iteration 11, inertia 8067.917526159745 Iteration 12, inertia 8067.434112719851 Iteration 13, inertia 8067.300618383514 Iteration 14, inertia 8066.690183145394 Iteration 15, inertia 8065.724907535046 Iteration 16, inertia 8065.1555932972105 Iteration 17, inertia 8064.406578178977 Iteration 18, inertia 8055.777491215358 Iteration 19, inertia 8050.3019028424 Iteration 20, inertia 8042.536097506457 Iteration 21, inertia 8026.138983270792 Iteration 22, inertia 7994.538660965364 Iteration 23, inertia 7981.791935092773 Iteration 24, inertia 7972.431678248945 Iteration 25, inertia 7970.020550287187 Iteration 26, inertia 7968.48651338652 Iteration 27, inertia 7966.955938122063 Iteration 28, inertia 7964.781849553093 Iteration 29, inertia 7958.222148843893 Iteration 30, inertia 7956.01524580361 Iteration 31, inertia 7955.555539283562 Iteration 32, inertia 7955.527509697825 Iteration 33, inertia 7955.494519204406 Converged at iteration 33: strict convergence. 
Initialization complete Iteration 0, inertia 11825.274489539268 Iteration 1, inertia 8581.769457282187 Iteration 2, inertia 8468.439310863918 Iteration 3, inertia 8354.00074441653 Iteration 4, inertia 8210.581719370295 Iteration 5, inertia 8130.050202902669 Iteration 6, inertia 8093.4953218710225 Iteration 7, inertia 8076.467461725121 Iteration 8, inertia 8059.590914960909 Iteration 9, inertia 8051.452820847045 Iteration 10, inertia 8044.333927734175 Iteration 11, inertia 8039.893289001246 Iteration 12, inertia 8036.123631679592 Iteration 13, inertia 8032.046237708075 Iteration 14, inertia 8029.674990266088 Iteration 15, inertia 8026.254886825644 Iteration 16, inertia 8023.089343537067 Iteration 17, inertia 8019.880934083217 Iteration 18, inertia 8008.990523342801 Iteration 19, inertia 7969.300639772133 Iteration 20, inertia 7919.97712322982 Iteration 21, inertia 7910.686198322982 Iteration 22, inertia 7906.238208315535 Iteration 23, inertia 7899.756117486131 Iteration 24, inertia 7892.545244873981 Iteration 25, inertia 7890.019624999294 Iteration 26, inertia 7889.072892812861 Iteration 27, inertia 7888.575279501931 Iteration 28, inertia 7888.2990556955165 Iteration 29, inertia 7887.869135061676 Iteration 30, inertia 7887.622886266341 Iteration 31, inertia 7887.258246367743 Iteration 32, inertia 7886.838669332199 Iteration 33, inertia 7886.583695027855 Iteration 34, inertia 7886.48352229928 Converged at iteration 34: strict convergence. 
Initialization complete Iteration 0, inertia 12174.525656749454 Iteration 1, inertia 8507.8234001361 Iteration 2, inertia 8262.156761189228 Iteration 3, inertia 8135.565056552313 Iteration 4, inertia 8072.3981424163885 Iteration 5, inertia 8047.041186769761 Iteration 6, inertia 8036.157681658016 Iteration 7, inertia 8032.793289742063 Iteration 8, inertia 8030.945913788686 Iteration 9, inertia 8030.700347934296 Iteration 10, inertia 8030.595081706321 Iteration 11, inertia 8030.463150369052 Iteration 12, inertia 8030.425825530467 Converged at iteration 12: strict convergence. Initialization complete Iteration 0, inertia 12478.757883097682 Iteration 1, inertia 8699.700914801515 Iteration 2, inertia 8423.74786273518 Iteration 3, inertia 8246.866008200308 Iteration 4, inertia 8137.329970995708 Iteration 5, inertia 8103.804191725505 Iteration 6, inertia 8092.207693677892 Iteration 7, inertia 8089.336054818334 Iteration 8, inertia 8087.034378963835 Iteration 9, inertia 8086.246069705694 Iteration 10, inertia 8084.619362324052 Iteration 11, inertia 8082.67643373668 Iteration 12, inertia 8082.188515787069 Converged at iteration 12: strict convergence. 
Initialization complete Iteration 0, inertia 11832.53539722714 Iteration 1, inertia 8466.1801078797 Iteration 2, inertia 8226.758916757437 Iteration 3, inertia 8115.239236717611 Iteration 4, inertia 8062.949256711981 Iteration 5, inertia 8047.895976309511 Iteration 6, inertia 8035.457973021038 Iteration 7, inertia 8028.199810257129 Iteration 8, inertia 8007.65676812841 Iteration 9, inertia 7963.399143602098 Iteration 10, inertia 7936.853393483585 Iteration 11, inertia 7924.273139615089 Iteration 12, inertia 7918.766371499822 Iteration 13, inertia 7916.290733124256 Iteration 14, inertia 7915.13212820532 Iteration 15, inertia 7913.658090371352 Iteration 16, inertia 7912.216196347794 Iteration 17, inertia 7911.340690474468 Iteration 18, inertia 7911.151814669988 Iteration 19, inertia 7911.113667420209 Converged at iteration 19: strict convergence. Initialization complete Iteration 0, inertia 11823.752663657368 Iteration 1, inertia 8276.46780690895 Iteration 2, inertia 8098.68081942101 Iteration 3, inertia 8037.29250680346 Iteration 4, inertia 7997.19138403611 Iteration 5, inertia 7925.487303831875 Iteration 6, inertia 7872.633962217118 Iteration 7, inertia 7853.57013270058 Iteration 8, inertia 7849.995965259756 Iteration 9, inertia 7843.743550028235 Iteration 10, inertia 7837.019905347132 Iteration 11, inertia 7834.827151309026 Iteration 12, inertia 7833.546296354347 Iteration 13, inertia 7833.009378377764 Iteration 14, inertia 7832.554446255171 Iteration 15, inertia 7832.210277934463 Iteration 16, inertia 7831.1753466434475 Iteration 17, inertia 7831.074055893812 Iteration 18, inertia 7831.027292617523 Iteration 19, inertia 7830.977013027618 Converged at iteration 19: strict convergence. 
Initialization complete Iteration 0, inertia 11596.309798286276 Iteration 1, inertia 8646.1997670067 Iteration 2, inertia 8394.42223224813 Iteration 3, inertia 8250.441907007784 Iteration 4, inertia 8175.292252415538 Iteration 5, inertia 8131.1038719677545 Iteration 6, inertia 8088.55452146436 Iteration 7, inertia 8032.992455465169 Iteration 8, inertia 7978.52526813498 Iteration 9, inertia 7956.801794915094 Iteration 10, inertia 7947.744891687089 Iteration 11, inertia 7941.340182646654 Iteration 12, inertia 7937.882158960003 Iteration 13, inertia 7936.406248808135 Iteration 14, inertia 7934.818469379045 Iteration 15, inertia 7934.023804051918 Iteration 16, inertia 7932.807323423373 Iteration 17, inertia 7931.7472552303025 Iteration 18, inertia 7931.066262593041 Iteration 19, inertia 7930.419081526247 Iteration 20, inertia 7915.339857061065 Iteration 21, inertia 7886.689696466735 Iteration 22, inertia 7859.242618003011 Iteration 23, inertia 7850.823479552829 Iteration 24, inertia 7847.789675688788 Iteration 25, inertia 7846.48853874517 Iteration 26, inertia 7842.455320611688 Iteration 27, inertia 7839.4277804122285 Iteration 28, inertia 7836.555716423136 Iteration 29, inertia 7835.294209939629 Iteration 30, inertia 7834.188876497395 Iteration 31, inertia 7833.097135623066 Iteration 32, inertia 7832.671449816852 Iteration 33, inertia 7832.380999582051 Iteration 34, inertia 7832.259089755594 Iteration 35, inertia 7832.232553830457 Converged at iteration 35: strict convergence. 
Initialization complete Iteration 0, inertia 12864.885199156606 Iteration 1, inertia 8534.055041539128 Iteration 2, inertia 8208.402584930189 Iteration 3, inertia 8134.677983160359 Iteration 4, inertia 8061.954032765476 Iteration 5, inertia 8012.741078486168 Iteration 6, inertia 7973.675852821764 Iteration 7, inertia 7960.156905577035 Iteration 8, inertia 7955.948758245115 Iteration 9, inertia 7937.8692456423405 Iteration 10, inertia 7929.672981125681 Iteration 11, inertia 7912.395873582756 Iteration 12, inertia 7884.993609624193 Iteration 13, inertia 7867.601997438204 Iteration 14, inertia 7864.718177831521 Iteration 15, inertia 7863.772078183434 Iteration 16, inertia 7863.723388842099 Converged at iteration 16: strict convergence. Initialization complete Iteration 0, inertia 11871.688456665612 Iteration 1, inertia 8444.666140084128 Iteration 2, inertia 8172.684939816708 Iteration 3, inertia 8022.5123919112075 Iteration 4, inertia 7989.103278787667 Iteration 5, inertia 7979.526024439475 Iteration 6, inertia 7972.321268249555 Iteration 7, inertia 7965.892255679502 Iteration 8, inertia 7959.862725195303 Iteration 9, inertia 7948.061310377594 Iteration 10, inertia 7926.876581312946 Iteration 11, inertia 7922.760473428007 Iteration 12, inertia 7920.461356227191 Iteration 13, inertia 7918.936152811791 Iteration 14, inertia 7918.355463902865 Iteration 15, inertia 7918.230916735995 Converged at iteration 15: strict convergence. 
Initialization complete Iteration 0, inertia 12258.81732786365 Iteration 1, inertia 8490.528120624003 Iteration 2, inertia 8287.639034367927 Iteration 3, inertia 8199.024276953247 Iteration 4, inertia 8142.15513015745 Iteration 5, inertia 8082.207427076215 Iteration 6, inertia 8037.963976518285 Iteration 7, inertia 8007.417500726093 Iteration 8, inertia 7978.869398652809 Iteration 9, inertia 7937.0245485312225 Iteration 10, inertia 7903.678870109956 Iteration 11, inertia 7866.78478195413 Iteration 12, inertia 7853.215930528641 Iteration 13, inertia 7847.326673615622 Iteration 14, inertia 7845.271749303697 Iteration 15, inertia 7844.525551419671 Iteration 16, inertia 7844.433188252878 Iteration 17, inertia 7844.348811651154 Converged at iteration 17: strict convergence.
KMeans(max_iter=500, n_clusters=5, verbose=1)
# Assign every instance to its nearest centroid and show the assignments as a
# single-column table:
clusters5 = kmeans.predict(df_min_max_scaled)
pd.DataFrame({"Cluster": clusters5})
Cluster | |
---|---|
0 | 2 |
1 | 0 |
2 | 2 |
3 | 3 |
4 | 3 |
... | ... |
2106 | 0 |
2107 | 0 |
2108 | 0 |
2109 | 0 |
2110 | 0 |
2111 rows × 1 columns
def cluster_sizes(clusters):
    """Return the number of instances assigned to each cluster.

    Parameters
    ----------
    clusters : array-like
        Cluster label of each instance in the data.

    Returns
    -------
    dict
        Maps each distinct cluster label to the count of instances carrying
        that label. Empty when ``clusters`` is empty.
    """
    # Count directly from the label array, so the function no longer relies on
    # a module-level DataFrame having the same length as ``clusters``.
    cluster_labels, counts = np.unique(clusters, return_counts=True)
    return {label: int(count) for label, count in zip(cluster_labels, counts)}
# Report how many instances fell into each of the five clusters:
size5 = cluster_sizes(clusters5)
for c5, count in size5.items():
    print("Size of Cluster", c5, "= ", count)
Size of Cluster 0 = 420 Size of Cluster 1 = 423 Size of Cluster 2 = 455 Size of Cluster 3 = 355 Size of Cluster 4 = 458
# The centroids give an aggregate representation and characterization of each
# cluster. Floats are displayed with two decimals to keep the table readable.
pd.set_option('display.float_format', '{:,.2f}'.format)
centroids5 = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=df_min_max_scaled.columns.values,
)
centroids5
Age | Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.19 | 0.43 | 0.52 | 1.00 | 0.00 | 0.00 | 1.00 | 0.05 | 0.95 | 0.90 | 0.06 | 0.04 | 0.03 | 0.01 | 0.96 | 0.00 | 0.01 | 0.05 | 0.92 | 0.02 | 0.99 | 0.01 | 0.65 | 0.32 | 0.03 | 0.98 | 0.02 | 0.36 | 0.03 | 0.01 | 0.60 | 0.90 | 0.08 | 0.02 | 0.00 | 0.01 | 0.96 | 0.03 | 0.02 | -0.00 | 0.00 | 0.97 | 0.01 |
1 | 0.37 | 0.51 | 0.36 | 0.33 | 0.67 | 0.06 | 0.94 | 0.05 | 0.95 | 0.13 | 0.07 | 0.80 | 0.18 | 0.19 | 0.59 | 0.05 | 0.02 | 0.04 | 0.94 | 0.00 | 0.98 | 0.02 | 0.54 | 0.40 | 0.06 | 0.98 | 0.02 | 0.27 | 0.19 | 0.02 | 0.52 | 0.78 | 0.20 | 0.01 | -0.00 | 0.06 | 0.59 | 0.35 | 0.99 | 0.00 | 0.00 | -0.00 | 0.00 |
2 | 0.16 | 0.41 | 0.30 | 0.55 | 0.45 | 0.00 | 1.00 | 0.16 | 0.84 | 0.08 | 0.06 | 0.86 | 0.35 | 0.20 | 0.43 | 0.02 | 0.02 | 0.14 | 0.83 | 0.01 | 0.99 | 0.01 | 0.53 | 0.38 | 0.10 | 0.95 | 0.05 | 0.32 | 0.17 | 0.05 | 0.46 | 0.46 | 0.44 | 0.10 | 0.00 | 0.04 | 0.11 | 0.85 | 0.00 | 0.00 | 0.00 | 0.96 | 0.03 |
3 | 0.15 | 0.37 | 0.14 | 0.65 | 0.35 | 0.98 | 0.02 | 0.29 | 0.71 | 0.34 | 0.11 | 0.55 | 0.30 | 0.07 | 0.54 | 0.10 | 0.05 | 0.34 | 0.51 | 0.10 | 0.98 | 0.02 | 0.39 | 0.48 | 0.13 | 0.86 | 0.14 | 0.31 | 0.21 | 0.07 | 0.41 | 0.53 | 0.40 | 0.07 | 0.00 | 0.04 | 0.72 | 0.24 | 0.07 | 0.00 | 0.01 | 0.85 | 0.06 |
4 | 0.18 | 0.63 | 0.42 | 0.00 | 1.00 | 0.02 | 0.98 | 0.05 | 0.95 | 0.14 | 0.17 | 0.69 | 0.09 | 0.19 | 0.71 | 0.02 | 0.02 | 0.05 | 0.93 | 0.01 | 0.96 | 0.04 | 0.66 | 0.27 | 0.08 | 0.98 | 0.02 | 0.45 | 0.12 | 0.02 | 0.41 | 0.67 | 0.27 | 0.05 | -0.00 | 0.02 | 0.97 | 0.01 | -0.00 | 0.01 | 0.00 | 0.95 | 0.03 |
# Silhouette analysis for k = 5: compute one coefficient per sample, then
# report their mean.
c5_silhouette = metrics.silhouette_samples(df_min_max_scaled, clusters5)
print('Mean Silhouette Value :', np.mean(c5_silhouette))
Mean Silhouette Value : 0.12696817428675627
def plot_silhouettes(data, clusters, metric='euclidean'):
    """Draw a horizontal-bar silhouette plot with one band of bars per cluster.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to ``silhouette_samples``.
    clusters : array-like
        Cluster label of each row in ``data``.
    metric : str, default 'euclidean'
        Distance metric forwarded to ``silhouette_samples``.

    Returns
    -------
    None
        The figure is displayed with ``pl.show()``.
    """
    from matplotlib import cm
    from sklearn.metrics import silhouette_samples

    # A plain array keeps the boolean mask below purely positional.
    clusters = np.asarray(clusters)
    cluster_labels = np.unique(clusters)
    n_clusters = cluster_labels.shape[0]

    # Forward the caller's metric so the plot reflects the requested distance
    # instead of always being computed with Euclidean distance.
    silhouette_vals = silhouette_samples(data, clusters, metric=metric)

    c_ax_lower = 0
    cticks = []
    for i, k in enumerate(cluster_labels):
        # Sort within the cluster so each band is drawn in ascending order.
        c_silhouette_vals = np.sort(silhouette_vals[clusters == k])
        c_ax_upper = c_ax_lower + len(c_silhouette_vals)
        color = cm.jet(float(i) / n_clusters)
        pl.barh(range(c_ax_lower, c_ax_upper), c_silhouette_vals, height=1.0,
                edgecolor='none', color=color)
        # Put the cluster's tick label at the vertical middle of its band.
        cticks.append((c_ax_lower + c_ax_upper) / 2)
        c_ax_lower = c_ax_upper

    # Dashed red line marks the mean silhouette coefficient over all samples.
    silhouette_avg = np.mean(silhouette_vals)
    pl.axvline(silhouette_avg, color="red", linestyle="--")
    pl.yticks(cticks, cluster_labels)
    pl.ylabel('Cluster')
    pl.xlabel('Silhouette coefficient')
    pl.tight_layout()
    pl.show()
# Silhouette plot for the K = 5 clustering:
plot_silhouettes(data=df_min_max_scaled, clusters=clusters5)
The silhouette plot above shows that cluster 0 outperformed the other clusters, with all of its coefficients above the mean silhouette value. Cluster 4 also performed well, with many of its coefficients above the mean silhouette value. The remaining three clusters did not perform as well, since most of their coefficients are below the mean silhouette value. Four of the clusters contain negative coefficients, with cluster 3 having the most, which indicates that five clusters are too many for this dataset.
# K-means with K = 3.
# random_state pins the centroid seeding so the cluster indices discussed in
# the commentary stay the same across re-runs; n_init=10 keeps the ten restarts
# seen in the log even on scikit-learn releases whose default is n_init='auto'.
kmeans3 = KMeans(n_clusters=3, n_init=10, max_iter=500, random_state=42, verbose=1)
kmeans3.fit(df_min_max_scaled)
Initialization complete Iteration 0, inertia 12879.623786128155 Iteration 1, inertia 9001.37311540421 Iteration 2, inertia 8858.998139520381 Iteration 3, inertia 8809.939108030303 Iteration 4, inertia 8791.272204868248 Iteration 5, inertia 8786.47670789417 Iteration 6, inertia 8785.4661617069 Iteration 7, inertia 8783.363746212328 Iteration 8, inertia 8772.54056168085 Iteration 9, inertia 8769.557983795672 Iteration 10, inertia 8768.483498001466 Iteration 11, inertia 8767.764124919666 Iteration 12, inertia 8766.35120571748 Iteration 13, inertia 8765.43867230093 Iteration 14, inertia 8765.398780834916 Converged at iteration 14: strict convergence. Initialization complete Iteration 0, inertia 14534.508672109176 Iteration 1, inertia 9309.441488842096 Iteration 2, inertia 9037.604181226367 Iteration 3, inertia 8938.658231693586 Iteration 4, inertia 8916.402643453937 Iteration 5, inertia 8888.331390013225 Iteration 6, inertia 8870.074074790271 Iteration 7, inertia 8861.211103290092 Iteration 8, inertia 8853.162119584285 Iteration 9, inertia 8846.868576454352 Iteration 10, inertia 8827.84407699484 Iteration 11, inertia 8799.1609794804 Iteration 12, inertia 8785.909269034124 Iteration 13, inertia 8774.601325517255 Iteration 14, inertia 8769.592571719972 Iteration 15, inertia 8767.63492356255 Iteration 16, inertia 8766.444849568958 Iteration 17, inertia 8765.971992499608 Iteration 18, inertia 8765.867441373892 Iteration 19, inertia 8765.66116613612 Iteration 20, inertia 8765.486144212005 Iteration 21, inertia 8765.461045034992 Converged at iteration 21: strict convergence. 
Initialization complete Iteration 0, inertia 13336.164818796795 Iteration 1, inertia 9567.269152448953 Iteration 2, inertia 9445.553561777502 Iteration 3, inertia 9356.780028542624 Iteration 4, inertia 9157.130002021417 Iteration 5, inertia 9032.754433029859 Iteration 6, inertia 8966.21740202369 Iteration 7, inertia 8938.469471906204 Iteration 8, inertia 8923.340927558393 Iteration 9, inertia 8916.997734419223 Iteration 10, inertia 8915.764064629098 Iteration 11, inertia 8915.37813313638 Iteration 12, inertia 8915.127678549246 Iteration 13, inertia 8914.954033413907 Iteration 14, inertia 8914.93152766286 Iteration 15, inertia 8914.917473861125 Converged at iteration 15: strict convergence. Initialization complete Iteration 0, inertia 12235.950953731766 Iteration 1, inertia 9177.059040690088 Iteration 2, inertia 8997.9480370336 Iteration 3, inertia 8886.478645540206 Iteration 4, inertia 8833.780310727954 Iteration 5, inertia 8820.17237743324 Iteration 6, inertia 8819.429241417094 Iteration 7, inertia 8819.16005670056 Iteration 8, inertia 8818.804519317317 Iteration 9, inertia 8818.682843171513 Iteration 10, inertia 8818.311487274063 Iteration 11, inertia 8818.159940170133 Iteration 12, inertia 8817.98907376872 Iteration 13, inertia 8817.865936421687 Iteration 14, inertia 8817.806917535885 Iteration 15, inertia 8817.776092265041 Iteration 16, inertia 8817.693711678381 Iteration 17, inertia 8817.668602183467 Converged at iteration 17: strict convergence. 
Initialization complete Iteration 0, inertia 14283.23672461032 Iteration 1, inertia 9655.242214042786 Iteration 2, inertia 9501.150166979987 Iteration 3, inertia 9385.8241565813 Iteration 4, inertia 9306.25122999164 Iteration 5, inertia 9258.611440798086 Iteration 6, inertia 9220.655830148746 Iteration 7, inertia 9205.884675342035 Iteration 8, inertia 9201.291862951928 Iteration 9, inertia 9200.842008469204 Iteration 10, inertia 9200.67414699048 Iteration 11, inertia 9200.629965113641 Converged at iteration 11: strict convergence. Initialization complete Iteration 0, inertia 12423.535126370873 Iteration 1, inertia 9135.382864146819 Iteration 2, inertia 9028.876396630194 Iteration 3, inertia 8968.812555470771 Iteration 4, inertia 8910.48089075226 Iteration 5, inertia 8856.621151522155 Iteration 6, inertia 8821.480910675746 Iteration 7, inertia 8818.661138265279 Iteration 8, inertia 8818.274441804033 Iteration 9, inertia 8818.095197547456 Iteration 10, inertia 8817.924137895994 Iteration 11, inertia 8817.828887772916 Iteration 12, inertia 8817.806917535885 Iteration 13, inertia 8817.776092265041 Iteration 14, inertia 8817.693711678381 Iteration 15, inertia 8817.668602183467 Converged at iteration 15: strict convergence. Initialization complete Iteration 0, inertia 15560.349375046955 Iteration 1, inertia 9451.504598782774 Iteration 2, inertia 9142.603376735178 Iteration 3, inertia 8984.738489060503 Iteration 4, inertia 8903.206814994957 Iteration 5, inertia 8849.255938606113 Iteration 6, inertia 8805.887151861752 Iteration 7, inertia 8787.427510335308 Iteration 8, inertia 8777.334874885435 Iteration 9, inertia 8773.33734831006 Iteration 10, inertia 8770.052840575934 Iteration 11, inertia 8768.924265588485 Iteration 12, inertia 8768.062375098845 Iteration 13, inertia 8766.037804380696 Iteration 14, inertia 8765.5613568244 Iteration 15, inertia 8765.461045034992 Converged at iteration 15: strict convergence. 
Initialization complete Iteration 0, inertia 14256.23978990368 Iteration 1, inertia 9202.001056480733 Iteration 2, inertia 8958.453465999479 Iteration 3, inertia 8866.570788484862 Iteration 4, inertia 8824.832909813438 Iteration 5, inertia 8819.837479151136 Iteration 6, inertia 8819.10078183813 Iteration 7, inertia 8819.035876469317 Iteration 8, inertia 8818.817563005117 Iteration 9, inertia 8818.646974003661 Iteration 10, inertia 8818.289586515795 Iteration 11, inertia 8818.159940170133 Iteration 12, inertia 8817.98907376872 Iteration 13, inertia 8817.865936421687 Iteration 14, inertia 8817.806917535887 Iteration 15, inertia 8817.776092265041 Iteration 16, inertia 8817.693711678381 Iteration 17, inertia 8817.668602183467 Converged at iteration 17: strict convergence. Initialization complete Iteration 0, inertia 14045.247730533238 Iteration 1, inertia 9577.932151433946 Iteration 2, inertia 9503.316242864512 Iteration 3, inertia 9481.915359950877 Iteration 4, inertia 9463.72742189162 Iteration 5, inertia 9441.526557119001 Iteration 6, inertia 9420.35228646382 Iteration 7, inertia 9406.373961589723 Iteration 8, inertia 9402.970170630471 Iteration 9, inertia 9400.047385253652 Iteration 10, inertia 9399.555304602607 Iteration 11, inertia 9399.498839497075 Iteration 12, inertia 9399.41018122203 Iteration 13, inertia 9399.298956532946 Iteration 14, inertia 9399.274680230332 Iteration 15, inertia 9399.252245628535 Converged at iteration 15: strict convergence. 
Initialization complete Iteration 0, inertia 13317.99475078725 Iteration 1, inertia 8977.851327856566 Iteration 2, inertia 8895.604119959466 Iteration 3, inertia 8873.687473142223 Iteration 4, inertia 8849.570415401187 Iteration 5, inertia 8805.858315057407 Iteration 6, inertia 8780.421553985545 Iteration 7, inertia 8775.53080855752 Iteration 8, inertia 8772.61887038512 Iteration 9, inertia 8769.175262861294 Iteration 10, inertia 8768.382144887562 Iteration 11, inertia 8767.704981473666 Iteration 12, inertia 8766.274753740365 Iteration 13, inertia 8765.417169729853 Iteration 14, inertia 8765.398780834916 Converged at iteration 14: strict convergence.
KMeans(max_iter=500, n_clusters=3, verbose=1)
# Assign every row to its nearest K = 3 centroid and report the cluster sizes.
clusters3 = kmeans3.predict(X=df_min_max_scaled)
size3 = cluster_sizes(clusters3)
# NOTE(review): assumes cluster_sizes returns a dict-like mapping label -> count.
for label, count in size3.items():
    print("Size of Cluster", label, "= ", count)
Size of Cluster 0 = 1058 Size of Cluster 1 = 636 Size of Cluster 2 = 417
# Tabulate the K = 3 centroids (one row per cluster, one column per feature)
# to characterise each cluster; floats are displayed with two decimals.
pd.options.display.float_format = '{:,.2f}'.format
centroids3 = pd.DataFrame(
    data=kmeans3.cluster_centers_,
    columns=df_min_max_scaled.columns.values,
)
centroids3
Age | Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.22 | 0.58 | 0.39 | -0.00 | 1.00 | 0.14 | 0.86 | 0.09 | 0.91 | 0.13 | 0.11 | 0.76 | 0.16 | 0.18 | 0.61 | 0.04 | 0.03 | 0.07 | 0.86 | 0.03 | 0.97 | 0.03 | 0.61 | 0.28 | 0.11 | 0.98 | 0.02 | 0.38 | 0.17 | 0.05 | 0.40 | 0.63 | 0.31 | 0.06 | 0.00 | 0.04 | 0.64 | 0.31 | 0.28 | 0.01 | 0.01 | 0.68 | 0.03 |
1 | 0.21 | 0.33 | 0.19 | 0.99 | 0.01 | 0.34 | 0.66 | 0.19 | 0.81 | 0.19 | 0.10 | 0.71 | 0.35 | 0.13 | 0.48 | 0.03 | 0.03 | 0.23 | 0.73 | 0.01 | 0.98 | 0.02 | 0.38 | 0.55 | 0.07 | 0.90 | 0.10 | 0.27 | 0.16 | 0.03 | 0.55 | 0.58 | 0.36 | 0.07 | 0.00 | 0.04 | 0.50 | 0.46 | 0.23 | -0.00 | 0.00 | 0.74 | 0.03 |
2 | 0.19 | 0.43 | 0.52 | 0.99 | 0.01 | 0.04 | 0.96 | 0.06 | 0.94 | 0.95 | 0.05 | 0.01 | 0.01 | 0.01 | 0.97 | 0.00 | 0.01 | 0.04 | 0.93 | 0.02 | 0.99 | 0.01 | 0.69 | 0.29 | 0.01 | 0.98 | 0.02 | 0.36 | 0.04 | 0.01 | 0.59 | 0.91 | 0.08 | 0.01 | 0.00 | 0.00 | 0.97 | 0.03 | 0.05 | -0.00 | 0.00 | 0.94 | 0.01 |
# Per-sample silhouette coefficients for the K = 3 solution, then their mean:
c3_silhouette = metrics.silhouette_samples(
    X=df_min_max_scaled,
    labels=clusters3,
)
print('Mean Silhouette Value :', c3_silhouette.mean())
Mean Silhouette Value : 0.11634874352766442
# Silhouette plot for the K = 3 clustering:
plot_silhouettes(data=df_min_max_scaled, clusters=clusters3)
The results above show the silhouette analysis for K = 3, which reveals that the algorithm performed neither better nor worse than at K = 5. The silhouette plot shows that cluster 2 outperformed the other clusters, with all of its coefficients above the mean silhouette value. Cluster 1 performed the worst: none of its coefficients are above the mean silhouette value, and some are negative. When evaluating the centroids, cluster 0 has Gender_Male with a value of 1.00 and Gender_Female with a value of 0, so cluster 0 most likely represents the male gender. Clusters 1 and 2 both have a value of 0.99 for Gender_Female and 0.01 for Gender_Male, which suggests that cluster 1 is most likely misclassified. Most likely this cluster is pulling in points that do not belong to it and is too close to cluster 0 to be its own cluster. We can conclude from the silhouette plots that three clusters are likely still too many and that two clusters may be sufficient.
# K-means with K = 2.
# random_state pins the centroid seeding so the cluster indices discussed in
# the commentary stay the same across re-runs; n_init=10 keeps the ten restarts
# seen in the log even on scikit-learn releases whose default is n_init='auto'.
kmeans2 = KMeans(n_clusters=2, n_init=10, max_iter=500, random_state=42, verbose=1)
kmeans2.fit(df_min_max_scaled)
Initialization complete Iteration 0, inertia 21141.470043487447 Iteration 1, inertia 10146.649638459037 Iteration 2, inertia 10052.393229584075 Iteration 3, inertia 9978.55949142651 Iteration 4, inertia 9931.576115637514 Iteration 5, inertia 9882.008890876072 Iteration 6, inertia 9833.033569152649 Iteration 7, inertia 9820.76522415991 Iteration 8, inertia 9810.381012107375 Iteration 9, inertia 9806.43745895125 Iteration 10, inertia 9802.52445906197 Iteration 11, inertia 9788.188974953495 Iteration 12, inertia 9784.590860736775 Iteration 13, inertia 9781.587461096071 Iteration 14, inertia 9764.62868015089 Iteration 15, inertia 9703.250240710777 Iteration 16, inertia 9654.064419604409 Iteration 17, inertia 9640.924590400316 Iteration 18, inertia 9633.269288654592 Iteration 19, inertia 9627.22637779535 Iteration 20, inertia 9626.211067157712 Iteration 21, inertia 9625.966275262466 Iteration 22, inertia 9625.900707169734 Iteration 23, inertia 9625.880356300455 Converged at iteration 23: strict convergence. Initialization complete Iteration 0, inertia 16459.557174738147 Iteration 1, inertia 9900.362639271163 Iteration 2, inertia 9489.377342057842 Iteration 3, inertia 9440.231621454297 Iteration 4, inertia 9439.746651906471 Iteration 5, inertia 9439.70314554615 Converged at iteration 5: strict convergence. Initialization complete Iteration 0, inertia 16654.425161503892 Iteration 1, inertia 9928.457371508537 Iteration 2, inertia 9838.23752341025 Iteration 3, inertia 9788.863247705292 Iteration 4, inertia 9548.861615101538 Iteration 5, inertia 9442.673615150014 Iteration 6, inertia 9439.835639639505 Iteration 7, inertia 9439.703145546147 Converged at iteration 7: strict convergence. 
Initialization complete Iteration 0, inertia 18427.311216298647 Iteration 1, inertia 10184.320163749075 Iteration 2, inertia 10006.009344174558 Iteration 3, inertia 9946.739537737863 Iteration 4, inertia 9924.402102243388 Iteration 5, inertia 9910.780896241331 Iteration 6, inertia 9905.110900156222 Iteration 7, inertia 9902.264415823509 Iteration 8, inertia 9897.72566174957 Iteration 9, inertia 9895.57672400781 Iteration 10, inertia 9894.774416642193 Iteration 11, inertia 9892.907792732634 Iteration 12, inertia 9889.535477719926 Iteration 13, inertia 9885.189090329697 Iteration 14, inertia 9882.336535503198 Iteration 15, inertia 9878.48056558997 Iteration 16, inertia 9874.201130739495 Iteration 17, inertia 9869.024206385815 Iteration 18, inertia 9859.9664768237 Iteration 19, inertia 9851.6930077618 Iteration 20, inertia 9844.315605926562 Iteration 21, inertia 9838.560305605188 Iteration 22, inertia 9828.42281943376 Iteration 23, inertia 9813.08654486762 Iteration 24, inertia 9801.820301212163 Iteration 25, inertia 9796.444293037726 Iteration 26, inertia 9794.424054751174 Iteration 27, inertia 9794.231982639427 Iteration 28, inertia 9794.184865647467 Converged at iteration 28: center shift 1.1769791973055172e-05 within tolerance 1.1825910645989139e-05. Initialization complete Iteration 0, inertia 16402.36648533896 Iteration 1, inertia 9994.565703554776 Iteration 2, inertia 9934.77322692749 Iteration 3, inertia 9909.912786027671 Iteration 4, inertia 9883.607078127476 Iteration 5, inertia 9850.433915715637 Iteration 6, inertia 9791.58073461284 Iteration 7, inertia 9734.449876536686 Iteration 8, inertia 9646.394815196176 Iteration 9, inertia 9627.336127098832 Iteration 10, inertia 9625.954342511643 Iteration 11, inertia 9625.884133923937 Iteration 12, inertia 9625.86108434419 Iteration 13, inertia 9625.840681351561 Converged at iteration 13: strict convergence. 
Initialization complete Iteration 0, inertia 14969.584620698479 Iteration 1, inertia 9901.195321252426 Iteration 2, inertia 9853.041301002952 Iteration 3, inertia 9828.346078346132 Iteration 4, inertia 9821.497722659567 Iteration 5, inertia 9816.797972478129 Iteration 6, inertia 9815.160587833674 Iteration 7, inertia 9813.203694536212 Iteration 8, inertia 9812.379447087993 Iteration 9, inertia 9811.952753836797 Iteration 10, inertia 9811.747610338476 Iteration 11, inertia 9811.640620134325 Iteration 12, inertia 9811.132844140293 Iteration 13, inertia 9810.651492658839 Iteration 14, inertia 9810.461157499503 Iteration 15, inertia 9809.569339539601 Iteration 16, inertia 9808.659582332328 Iteration 17, inertia 9808.2561123607 Iteration 18, inertia 9807.986336248636 Iteration 19, inertia 9807.906534765058 Iteration 20, inertia 9807.487360220597 Iteration 21, inertia 9806.667738269354 Iteration 22, inertia 9805.834740565373 Iteration 23, inertia 9800.048920420628 Iteration 24, inertia 9788.128577786762 Iteration 25, inertia 9779.417579204792 Iteration 26, inertia 9775.38432414417 Iteration 27, inertia 9761.684802413005 Iteration 28, inertia 9709.236797762258 Iteration 29, inertia 9653.799784178347 Iteration 30, inertia 9642.172114603334 Iteration 31, inertia 9636.202113008087 Iteration 32, inertia 9620.09415505929 Iteration 33, inertia 9617.977388582693 Iteration 34, inertia 9615.658558431482 Iteration 35, inertia 9610.65542615645 Iteration 36, inertia 9607.248540288658 Iteration 37, inertia 9603.493341174244 Iteration 38, inertia 9599.523654632054 Iteration 39, inertia 9597.642822643924 Iteration 40, inertia 9596.209483776676 Iteration 41, inertia 9595.35635092364 Iteration 42, inertia 9594.882428291552 Iteration 43, inertia 9594.678998923426 Iteration 44, inertia 9594.37636625157 Iteration 45, inertia 9594.115982262558 Iteration 46, inertia 9594.001524563828 Iteration 47, inertia 9592.870319464864 Iteration 48, inertia 9587.715138968875 Iteration 49, inertia 
9578.793233518765 Iteration 50, inertia 9556.18315100723 Iteration 51, inertia 9517.365630351003 Iteration 52, inertia 9457.17665798308 Iteration 53, inertia 9440.017782540526 Iteration 54, inertia 9439.746651906471 Iteration 55, inertia 9439.703145546147 Converged at iteration 55: strict convergence. Initialization complete Iteration 0, inertia 16595.25826599754 Iteration 1, inertia 10109.37538869094 Iteration 2, inertia 9915.261803411886 Iteration 3, inertia 9789.099147804835 Iteration 4, inertia 9778.126542918279 Iteration 5, inertia 9777.915684703536 Converged at iteration 5: strict convergence. Initialization complete Iteration 0, inertia 15375.183674102089 Iteration 1, inertia 9952.678493621155 Iteration 2, inertia 9884.256882726944 Iteration 3, inertia 9864.875145644339 Iteration 4, inertia 9859.869224463644 Iteration 5, inertia 9858.379095829157 Iteration 6, inertia 9854.219459620635 Iteration 7, inertia 9841.672028071058 Iteration 8, inertia 9832.676859254072 Iteration 9, inertia 9818.161920484818 Iteration 10, inertia 9805.876930142955 Iteration 11, inertia 9798.29221813926 Iteration 12, inertia 9794.467009896513 Iteration 13, inertia 9794.22218892144 Iteration 14, inertia 9794.184865647467 Converged at iteration 14: center shift 1.1769791973055237e-05 within tolerance 1.1825910645989139e-05. 
Initialization complete Iteration 0, inertia 18359.985955431566 Iteration 1, inertia 10101.593413896659 Iteration 2, inertia 9982.754190681128 Iteration 3, inertia 9917.840303560613 Iteration 4, inertia 9890.496715960406 Iteration 5, inertia 9876.352494100358 Iteration 6, inertia 9865.682282638423 Iteration 7, inertia 9853.175178022406 Iteration 8, inertia 9841.405257717575 Iteration 9, inertia 9828.743052973643 Iteration 10, inertia 9812.10020465191 Iteration 11, inertia 9800.966469728079 Iteration 12, inertia 9795.474672343822 Iteration 13, inertia 9794.39208071576 Iteration 14, inertia 9794.247097906948 Iteration 15, inertia 9794.203342487108 Converged at iteration 15: center shift 1.1189193902305637e-05 within tolerance 1.1825910645989139e-05. Initialization complete Iteration 0, inertia 14840.19634222251 Iteration 1, inertia 10012.892987021458 Iteration 2, inertia 9937.527748973907 Iteration 3, inertia 9874.816695079277 Iteration 4, inertia 9840.194716250331 Iteration 5, inertia 9814.311264293689 Iteration 6, inertia 9800.700957677194 Iteration 7, inertia 9771.602530465227 Iteration 8, inertia 9541.236198237983 Iteration 9, inertia 9441.157699191357 Iteration 10, inertia 9439.835639639505 Iteration 11, inertia 9439.70314554615 Converged at iteration 11: strict convergence.
KMeans(max_iter=500, n_clusters=2, verbose=1)
# Assign every row to its nearest K = 2 centroid and report the cluster sizes.
clusters2 = kmeans2.predict(X=df_min_max_scaled)
size2 = cluster_sizes(clusters2)
# NOTE(review): assumes cluster_sizes returns a dict-like mapping label -> count.
for label, count in size2.items():
    print("Size of Cluster", label, "= ", count)
Size of Cluster 0 = 1067 Size of Cluster 1 = 1044
# Tabulate the K = 2 centroids (one row per cluster, one column per feature)
# to characterise each cluster; floats are displayed with two decimals.
pd.options.display.float_format = '{:,.2f}'.format
centroids2 = pd.DataFrame(
    data=kmeans2.cluster_centers_,
    columns=df_min_max_scaled.columns.values,
)
centroids2
Age | Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.22 | 0.58 | 0.39 | -0.00 | 1.00 | 0.14 | 0.86 | 0.09 | 0.91 | 0.13 | 0.11 | 0.75 | 0.16 | 0.18 | 0.61 | 0.04 | 0.03 | 0.07 | 0.86 | 0.03 | 0.97 | 0.03 | 0.61 | 0.28 | 0.11 | 0.98 | 0.02 | 0.38 | 0.17 | 0.05 | 0.39 | 0.63 | 0.31 | 0.06 | 0.00 | 0.04 | 0.65 | 0.31 | 0.27 | 0.01 | 0.01 | 0.68 | 0.03 |
1 | 0.21 | 0.36 | 0.32 | 1.00 | 0.00 | 0.22 | 0.78 | 0.14 | 0.86 | 0.49 | 0.08 | 0.43 | 0.22 | 0.09 | 0.68 | 0.02 | 0.02 | 0.16 | 0.81 | 0.01 | 0.99 | 0.01 | 0.50 | 0.45 | 0.05 | 0.93 | 0.07 | 0.30 | 0.11 | 0.02 | 0.57 | 0.71 | 0.25 | 0.05 | 0.00 | 0.03 | 0.68 | 0.29 | 0.16 | -0.00 | 0.00 | 0.82 | 0.02 |
# Per-sample silhouette coefficients for the K = 2 solution, then their mean:
c2_silhouette = metrics.silhouette_samples(
    X=df_min_max_scaled,
    labels=clusters2,
)
print('Mean Silhouette Value :', c2_silhouette.mean())
Mean Silhouette Value : 0.13093478332005926
# Silhouette plot for the K = 2 clustering:
plot_silhouettes(data=df_min_max_scaled, clusters=clusters2)
The results above show the silhouette analysis for K = 2, which produced the best silhouette plot compared to the previous plots at K = 5 and K = 3. This silhouette plot shows that both cluster 0 and cluster 1 have coefficients above the mean silhouette value, and none of the coefficients are negative. Neither cluster is particularly thick or full, although cluster 0 appears thicker than cluster 1; of the clustering results above, this one is the most successful. When looking at the centroids, the two features that stand out from all the others and most likely characterize the clusters are Gender_Male and Gender_Female. In cluster 0, Gender_Male has a value of 1.00 while Gender_Female has a value of -0.00, and in cluster 1, Gender_Female has a value of 1.00 while Gender_Male has a value of 0.00. We can therefore conclude that cluster 0 most likely represents males and cluster 1 represents females. This evaluation shows that a pattern exists by gender and that gender may play a role in the dataset and in determining the classification of obesity levels.
data_numeric['Age'].min()  # youngest age in the dataset
14
data_numeric['Age'].max()  # oldest age in the dataset
61
# Bin Age into three quantile-based intervals (cut points at the 61st and
# 97.2nd percentiles) and preview the interval assigned to the first rows.
age_bins = pd.qcut(x=data_numeric['Age'], q=[0, .61, .972, 1])
age_bins.head()
0 (13.999, 24.0] 1 (13.999, 24.0] 2 (13.999, 24.0] 3 (24.0, 40.0] 4 (13.999, 24.0] Name: Age, dtype: category Categories (3, interval[float64]): [(13.999, 24.0] < (24.0, 40.0] < (40.0, 61.0]]
# Same quantile bins as before, now carrying generation names as labels.
generation_labels = ['Gen-Z', 'Millenials', 'Gen-X & Boomers']
age_bins = pd.qcut(
    x=data_numeric['Age'],
    q=[0, .61, .972, 1],
    labels=generation_labels,
)
# Put each generation label next to the age taken from df2 to check the binning.
age_df = pd.concat(objs=[age_bins, df2['Age']], axis=1)
age_df.columns = ['Age Group', 'Age']
age_df.head(10)
Age Group | Age | |
---|---|---|
0 | Gen-Z | 21 |
1 | Gen-Z | 21 |
2 | Gen-Z | 23 |
3 | Millenials | 27 |
4 | Gen-Z | 22 |
5 | Millenials | 29 |
6 | Gen-Z | 23 |
7 | Gen-Z | 22 |
8 | Gen-Z | 24 |
9 | Gen-Z | 22 |
# Work on a copy: plain assignment would only alias data_numeric, so replacing
# Age below would also overwrite the numeric Age column of data_numeric itself
# (and break re-running the qcut cells that read data_numeric.Age).
data_age_groups = data_numeric.copy()
# Replace the numeric Age with its generational age-group label.
data_age_groups["Age"] = age_df['Age Group']
data_age_groups.head(10)
Age | Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Gen-Z | 1.62 | 64.00 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1 | Gen-Z | 1.52 | 56.00 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | Gen-Z | 1.80 | 77.00 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | Millenials | 1.80 | 87.00 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | Gen-Z | 1.78 | 89.80 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
5 | Millenials | 1.62 | 53.00 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
6 | Gen-Z | 1.50 | 55.00 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
7 | Gen-Z | 1.64 | 53.00 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
8 | Gen-Z | 1.78 | 64.00 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
9 | Gen-Z | 1.72 | 68.00 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
# One-hot encode the binned dataset so the Age group becomes indicator columns:
df_age_groups = pd.get_dummies(data=data_age_groups)
df_age_groups.head()
Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | Age_Gen-Z | Age_Millenials | Age_Gen-X & Boomers | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.62 | 64.00 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
1 | 1.52 | 56.00 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
2 | 1.80 | 77.00 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
3 | 1.80 | 87.00 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
4 | 1.78 | 89.80 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
Next, the K-means algorithm is applied with the three generational age groups: Gen-Z, Millennials, and Gen-X and Boomers. This analysis explores whether a pattern exists based on age range, which the cluster analysis of the full dataset could not evaluate because the ages were not grouped into categories. The youngest age is 14 and the oldest age is 61. The age groups are created by binning the Age attribute and then transforming the age-group attribute into dummy variables. For exploratory purposes, K-means is performed on the dataset at K = 3, first without min-max normalization and then with min-max normalization.
# Perform K-Means Clustering with N = 3 on the age-group dataset
# (not min-max normalized at this step).
# random_state pins the centroid seeding so cluster indices are stable across
# re-runs; n_init=10 keeps the ten restarts seen in the log even on
# scikit-learn releases whose default is n_init='auto'.
kmeans = KMeans(n_clusters=3, n_init=10, max_iter=500, random_state=42, verbose=1)
kmeans.fit(df_age_groups)
Initialization complete Iteration 0, inertia 266216.09575717594 Iteration 1, inertia 215521.27371160412 Iteration 2, inertia 210755.68046562248 Iteration 3, inertia 209722.10499704749 Iteration 4, inertia 209418.75170585237 Iteration 5, inertia 209024.46758980726 Iteration 6, inertia 208957.87320545508 Iteration 7, inertia 208952.94787903182 Converged at iteration 7: center shift 0.0008306717121226465 within tolerance 0.0015358503717230955. Initialization complete Iteration 0, inertia 502841.1735953951 Iteration 1, inertia 327560.09540897014 Iteration 2, inertia 315527.1447145528 Iteration 3, inertia 309720.53549766104 Iteration 4, inertia 300890.08942123153 Iteration 5, inertia 292303.3695465401 Iteration 6, inertia 287217.5099774456 Iteration 7, inertia 283800.36697429675 Iteration 8, inertia 278425.3310745159 Iteration 9, inertia 270545.8711724134 Iteration 10, inertia 246805.06029650217 Iteration 11, inertia 232750.70198980303 Iteration 12, inertia 226653.87489398956 Iteration 13, inertia 216920.98018306002 Iteration 14, inertia 212292.1259899532 Iteration 15, inertia 209925.16585043436 Iteration 16, inertia 209641.8566251863 Iteration 17, inertia 209612.29209838895 Iteration 18, inertia 209601.88285883892 Iteration 19, inertia 209598.80247956378 Converged at iteration 19: strict convergence. Initialization complete Iteration 0, inertia 326936.9440904384 Iteration 1, inertia 226724.44726938492 Iteration 2, inertia 213994.4657317486 Iteration 3, inertia 210480.97315506768 Iteration 4, inertia 209660.8099794086 Iteration 5, inertia 209614.6304476699 Iteration 6, inertia 209601.88285883892 Iteration 7, inertia 209598.80247956383 Converged at iteration 7: strict convergence. 
Initialization complete Iteration 0, inertia 244660.38351519656 Iteration 1, inertia 211950.75265903442 Iteration 2, inertia 209849.7543072008 Iteration 3, inertia 209518.7500452044 Iteration 4, inertia 209060.11326694212 Iteration 5, inertia 208957.87320545508 Iteration 6, inertia 208952.94787903182 Converged at iteration 6: center shift 0.0008306717121226465 within tolerance 0.0015358503717230955. Initialization complete Iteration 0, inertia 293038.3352946884 Iteration 1, inertia 213135.03302011758 Iteration 2, inertia 210104.25819821077 Iteration 3, inertia 209618.1187074107 Iteration 4, inertia 209113.16767690537 Iteration 5, inertia 208962.37801962328 Iteration 6, inertia 208952.94787903185 Converged at iteration 6: center shift 0.0008306717121226769 within tolerance 0.0015358503717230955. Initialization complete Iteration 0, inertia 275553.3335700919 Iteration 1, inertia 222295.79489500792 Iteration 2, inertia 212492.7176855583 Iteration 3, inertia 209869.48135591563 Iteration 4, inertia 209628.24667814613 Iteration 5, inertia 209612.29209838895 Iteration 6, inertia 209601.88285883892 Iteration 7, inertia 209598.80247956383 Converged at iteration 7: strict convergence. Initialization complete Iteration 0, inertia 286761.1414317467 Iteration 1, inertia 210695.01280865865 Iteration 2, inertia 209668.3036758006 Iteration 3, inertia 209614.6304476699 Iteration 4, inertia 209601.88285883892 Iteration 5, inertia 209598.80247956383 Converged at iteration 5: strict convergence. 
Initialization complete Iteration 0, inertia 383050.70537416794 Iteration 1, inertia 268032.44644019724 Iteration 2, inertia 222407.19095729562 Iteration 3, inertia 213626.56770579072 Iteration 4, inertia 210456.74923156487 Iteration 5, inertia 209703.02658441686 Iteration 6, inertia 209362.03202101542 Iteration 7, inertia 209013.56709803338 Iteration 8, inertia 208954.93374886544 Iteration 9, inertia 208952.94787903185 Converged at iteration 9: center shift 0.0008306717121226467 within tolerance 0.0015358503717230955. Initialization complete Iteration 0, inertia 374254.3643704856 Iteration 1, inertia 227441.0113142989 Iteration 2, inertia 210113.7182635559 Iteration 3, inertia 209144.6971247129 Iteration 4, inertia 209037.23080006722 Iteration 5, inertia 208970.75843448716 Iteration 6, inertia 208958.73649992305 Converged at iteration 6: center shift 0.0010520324381042456 within tolerance 0.0015358503717230955. Initialization complete Iteration 0, inertia 306059.6127144407 Iteration 1, inertia 230381.17640096927 Iteration 2, inertia 216298.96923962721 Iteration 3, inertia 211115.72555699918 Iteration 4, inertia 209886.61690318544 Iteration 5, inertia 209525.64261090878 Iteration 6, inertia 209040.15129423398 Iteration 7, inertia 208957.87320545508 Iteration 8, inertia 208952.94787903182 Converged at iteration 8: center shift 0.0008306717121226166 within tolerance 0.0015358503717230955.
KMeans(max_iter=500, n_clusters=3, verbose=1)
# Assign every row to its nearest centroid, then report how many rows
# ended up in each cluster.
age_clusters = kmeans.predict(df_age_groups)
size = cluster_sizes(age_clusters)
for cluster_id in size.keys():
    print("Size of Cluster", cluster_id, "= ", size[cluster_id])
Size of Cluster 0 = 789 Size of Cluster 1 = 731 Size of Cluster 2 = 591
# Collect the fitted centroids into a table (one row per cluster, one column
# per feature) so each cluster can be characterized by its average member.
# Floats are displayed with two decimals from here on.
pd.options.display.float_format = '{:,.2f}'.format
feature_names = df_age_groups.columns.values
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=feature_names)
centroids
Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | Age_Gen-Z | Age_Millenials | Age_Gen-X & Boomers | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.70 | 82.18 | 0.39 | 0.61 | 0.13 | 0.87 | 0.12 | 0.88 | 0.15 | 0.11 | 0.75 | 0.26 | 0.19 | 0.52 | 0.02 | 0.02 | 0.07 | 0.86 | 0.04 | 0.98 | 0.02 | 0.53 | 0.35 | 0.12 | 0.97 | 0.03 | 0.31 | 0.17 | 0.05 | 0.47 | 0.65 | 0.29 | 0.06 | 0.00 | 0.06 | 0.56 | 0.38 | 0.31 | 0.00 | 0.01 | 0.64 | 0.03 | 0.64 | 0.31 | 0.04 |
1 | 1.74 | 116.59 | 0.44 | 0.56 | 0.00 | 1.00 | 0.01 | 0.99 | 0.50 | 0.07 | 0.44 | 0.08 | 0.13 | 0.79 | 0.00 | 0.01 | 0.01 | 0.99 | 0.00 | 0.98 | 0.02 | 0.62 | 0.35 | 0.03 | 1.00 | 0.00 | 0.43 | 0.03 | 0.00 | 0.54 | 0.83 | 0.15 | 0.01 | 0.00 | 0.01 | 0.85 | 0.14 | 0.16 | 0.00 | 0.00 | 0.83 | 0.00 | 0.43 | 0.56 | 0.01 |
2 | 1.65 | 55.37 | 0.69 | 0.31 | 0.48 | 0.52 | 0.23 | 0.77 | 0.29 | 0.12 | 0.59 | 0.21 | 0.07 | 0.63 | 0.09 | 0.05 | 0.30 | 0.62 | 0.03 | 0.98 | 0.02 | 0.52 | 0.41 | 0.07 | 0.87 | 0.13 | 0.28 | 0.25 | 0.05 | 0.42 | 0.50 | 0.42 | 0.09 | 0.00 | 0.02 | 0.57 | 0.41 | 0.15 | 0.01 | 0.01 | 0.79 | 0.05 | 0.90 | 0.10 | 0.00 |
centroids['Age_Gen-Z'] # centroid value of the Gen-Z dummy per cluster (presumably the share of each cluster's rows that are Gen-Z)
0 0.64 1 0.43 2 0.90 Name: Age_Gen-Z, dtype: float64
centroids['Age_Millenials'] # centroid value of the Millennials dummy per cluster (presumably the share of each cluster's rows that are Millennials)
0 0.31 1 0.56 2 0.10 Name: Age_Millenials, dtype: float64
centroids['Age_Gen-X & Boomers'] # centroid value of the Gen-X & Boomers dummy per cluster (presumably the share of each cluster's rows in that group)
0 0.04 1 0.01 2 0.00 Name: Age_Gen-X & Boomers, dtype: float64
# Silhouette analysis at K = 3: compute one silhouette coefficient per row
# and report the average over all rows.
age_silhouette = silhouette_samples(df_age_groups, age_clusters)
print('Mean Silhouette Value :', age_silhouette.mean())
Mean Silhouette Value : 0.5691256560102319
# Plot and evaluate the silhouettes for the un-normalized K = 3 clustering
# (plot_silhouettes is a helper defined outside this section):
plot_silhouettes(df_age_groups, age_clusters)
The cluster analysis without normalization produces a very healthy silhouette plot: all three clusters are full and thick, with coefficients above the mean silhouette value. Figure 2.1 below confirms that clusters form when age is grouped by range. Looking at the centroids, cluster 2 shows Gen-Z at 0.90, Millennials at 0.10, and Gen-X and Boomers at 0.00, so cluster 2 most likely represents Gen-Z.
# Compare the cluster assignments with the known class labels:
# completeness measures whether each class stays within one cluster,
# homogeneity measures whether each cluster holds a single class.
complete = completeness_score(labels_num, age_clusters)
homogene = homogeneity_score(labels_num, age_clusters)
print(f"Completeness Score for Clusters: {complete}")
print(f"Homogeneity Score for Clusters: {homogene}")
Completeness Score for Clusters: 0.7020884578966542 Homogeneity Score for Clusters: 0.39448267211636195
The completeness and homogeneity scores were calculated because the class labels are available, which allows further examination of the cluster quality. The completeness score was 0.70 (on a scale from 0 to 1), which indicates that members of a given class are largely assigned to the same cluster. The completeness score is positive and confirms that the clusters captured most of one class. The homogeneity score was much lower at 0.39, which shows that the clusters are not pure, i.e., each cluster mixes several classes. These results may indicate that age group is a factor in deciding the clusters for the data, but it may not be the main factor that affects obesity level for classification. The silhouette plots above show that a pattern exists, but we must take into consideration that the data was not scaled. As such, we will next perform K-means again with the data normalized to validate the results.
# Normalize the dataset with min-max scaling so that every feature lies in
# [0, 1] and Height/Weight no longer dominate the Euclidean distances.
df_age_groups_norm = df_age_groups.copy()
for column in df_age_groups_norm.columns:
    col_min = df_age_groups_norm[column].min()
    col_range = df_age_groups_norm[column].max() - col_min
    if col_range == 0:
        # A constant column has no spread; dividing by zero would fill it
        # with NaN, so map it to 0.0 instead.
        df_age_groups_norm[column] = 0.0
    else:
        df_age_groups_norm[column] = (df_age_groups_norm[column] - col_min) / col_range
# View normalized data:
print(df_age_groups_norm)
Height Weight Gender_Female Gender_Male \ 0 0.32 0.19 1.00 0.00 1 0.13 0.13 1.00 0.00 2 0.66 0.28 0.00 1.00 3 0.66 0.36 0.00 1.00 4 0.62 0.38 0.00 1.00 ... ... ... ... ... 2106 0.49 0.69 1.00 0.00 2107 0.56 0.71 1.00 0.00 2108 0.57 0.71 1.00 0.00 2109 0.55 0.70 1.00 0.00 2110 0.54 0.71 1.00 0.00 family_history_with_overweight_no family_history_with_overweight_yes \ 0 0.00 1.00 1 0.00 1.00 2 0.00 1.00 3 1.00 0.00 4 1.00 0.00 ... ... ... 2106 0.00 1.00 2107 0.00 1.00 2108 0.00 1.00 2109 0.00 1.00 2110 0.00 1.00 FAVC_no FAVC_yes FCVC_Always FCVC_Never FCVC_Sometimes NCP_1 \ 0 1.00 0.00 0.00 0.00 1.00 0.00 1 1.00 0.00 1.00 0.00 0.00 0.00 2 1.00 0.00 0.00 0.00 1.00 0.00 3 1.00 0.00 1.00 0.00 0.00 0.00 4 1.00 0.00 0.00 0.00 1.00 1.00 ... ... ... ... ... ... ... 2106 0.00 1.00 1.00 0.00 0.00 0.00 2107 0.00 1.00 1.00 0.00 0.00 0.00 2108 0.00 1.00 1.00 0.00 0.00 0.00 2109 0.00 1.00 1.00 0.00 0.00 0.00 2110 0.00 1.00 1.00 0.00 0.00 0.00 NCP_2 NCP_3 NCP_3+ CAEC_Always CAEC_Frequently CAEC_Sometimes \ 0 0.00 1.00 0.00 0.00 0.00 1.00 1 0.00 1.00 0.00 0.00 0.00 1.00 2 0.00 1.00 0.00 0.00 0.00 1.00 3 0.00 1.00 0.00 0.00 0.00 1.00 4 0.00 0.00 0.00 0.00 0.00 1.00 ... ... ... ... ... ... ... 2106 0.00 1.00 0.00 0.00 0.00 1.00 2107 0.00 1.00 0.00 0.00 0.00 1.00 2108 0.00 1.00 0.00 0.00 0.00 1.00 2109 0.00 1.00 0.00 0.00 0.00 1.00 2110 0.00 1.00 0.00 0.00 0.00 1.00 CAEC_no SMOKE_no SMOKE_yes CH2O_Between 1 and 2 L \ 0 0.00 1.00 0.00 1.00 1 0.00 0.00 1.00 0.00 2 0.00 1.00 0.00 1.00 3 0.00 1.00 0.00 1.00 4 0.00 1.00 0.00 1.00 ... ... ... ... ... 2106 0.00 1.00 0.00 0.00 2107 0.00 1.00 0.00 1.00 2108 0.00 1.00 0.00 1.00 2109 0.00 1.00 0.00 1.00 2110 0.00 1.00 0.00 1.00 CH2O_Less than a liter CH2O_More than 2 L SCC_no SCC_yes \ 0 0.00 0.00 1.00 0.00 1 0.00 1.00 0.00 1.00 2 0.00 0.00 1.00 0.00 3 0.00 0.00 1.00 0.00 4 0.00 0.00 1.00 0.00 ... ... ... ... ... 
2106 1.00 0.00 1.00 0.00 2107 0.00 0.00 1.00 0.00 2108 0.00 0.00 1.00 0.00 2109 0.00 0.00 1.00 0.00 2110 0.00 0.00 1.00 0.00 FAF_1 or 2 days FAF_2 or 4 days FAF_4 or 5 days FAF_I do not have \ 0 0.00 0.00 0.00 1.00 1 0.00 0.00 1.00 0.00 2 0.00 1.00 0.00 0.00 3 0.00 1.00 0.00 0.00 4 0.00 0.00 0.00 1.00 ... ... ... ... ... 2106 1.00 0.00 0.00 0.00 2107 1.00 0.00 0.00 0.00 2108 1.00 0.00 0.00 0.00 2109 1.00 0.00 0.00 0.00 2110 1.00 0.00 0.00 0.00 TUE_0-2 Hours TUE_3-5 Hours TUE_More than 5 Hours CALC_Always \ 0 0.00 1.00 0.00 0.00 1 1.00 0.00 0.00 0.00 2 0.00 1.00 0.00 0.00 3 1.00 0.00 0.00 0.00 4 1.00 0.00 0.00 0.00 ... ... ... ... ... 2106 1.00 0.00 0.00 0.00 2107 1.00 0.00 0.00 0.00 2108 1.00 0.00 0.00 0.00 2109 1.00 0.00 0.00 0.00 2110 1.00 0.00 0.00 0.00 CALC_Frequently CALC_Sometimes CALC_no MTRANS_Automobile \ 0 0.00 0.00 1.00 0.00 1 0.00 1.00 0.00 0.00 2 1.00 0.00 0.00 0.00 3 1.00 0.00 0.00 0.00 4 0.00 1.00 0.00 0.00 ... ... ... ... ... 2106 0.00 1.00 0.00 0.00 2107 0.00 1.00 0.00 0.00 2108 0.00 1.00 0.00 0.00 2109 0.00 1.00 0.00 0.00 2110 0.00 1.00 0.00 0.00 MTRANS_Bike MTRANS_Motorbike MTRANS_Public_Transportation \ 0 0.00 0.00 1.00 1 0.00 0.00 1.00 2 0.00 0.00 1.00 3 0.00 0.00 0.00 4 0.00 0.00 1.00 ... ... ... ... 2106 0.00 0.00 1.00 2107 0.00 0.00 1.00 2108 0.00 0.00 1.00 2109 0.00 0.00 1.00 2110 0.00 0.00 1.00 MTRANS_Walking Age_Gen-Z Age_Millenials Age_Gen-X & Boomers 0 0.00 1.00 0.00 0.00 1 0.00 1.00 0.00 0.00 2 0.00 1.00 0.00 0.00 3 1.00 0.00 1.00 0.00 4 0.00 1.00 0.00 0.00 ... ... ... ... ... 2106 0.00 1.00 0.00 0.00 2107 0.00 1.00 0.00 0.00 2108 0.00 1.00 0.00 0.00 2109 0.00 1.00 0.00 0.00 2110 0.00 1.00 0.00 0.00 [2111 rows x 45 columns]
# Perform K-Means clustering with K = 3 on the min-max normalized data.
# n_init is pinned to 10 (the number of restarts visible in the verbose log)
# because newer scikit-learn releases default to n_init="auto"; random_state
# is fixed so that the cluster numbering discussed in the write-up is
# reproducible from run to run.
kmeans3 = KMeans(
    n_clusters=3,
    max_iter=500,
    n_init=10,
    random_state=42,
    verbose=1,
)
kmeans3.fit(df_age_groups_norm)
Initialization complete Iteration 0, inertia 15293.30577388341 Iteration 1, inertia 9968.066331837832 Iteration 2, inertia 9920.180131785271 Iteration 3, inertia 9903.77236856074 Iteration 4, inertia 9896.176445738252 Iteration 5, inertia 9891.495588792406 Iteration 6, inertia 9889.624965572522 Iteration 7, inertia 9888.823257389276 Iteration 8, inertia 9888.487377405678 Iteration 9, inertia 9888.340285312179 Iteration 10, inertia 9888.20252958357 Iteration 11, inertia 9887.532993038576 Iteration 12, inertia 9887.504196402364 Converged at iteration 12: strict convergence. Initialization complete Iteration 0, inertia 15755.748152264905 Iteration 1, inertia 10291.16329342617 Iteration 2, inertia 10228.905860318164 Iteration 3, inertia 10188.107451127948 Iteration 4, inertia 10144.37857128053 Iteration 5, inertia 10107.001253617971 Iteration 6, inertia 10086.05688167689 Iteration 7, inertia 10077.969028399091 Iteration 8, inertia 10068.115905357135 Iteration 9, inertia 10054.640327576219 Iteration 10, inertia 10036.68992669457 Iteration 11, inertia 10012.308155223136 Iteration 12, inertia 9995.089501496252 Iteration 13, inertia 9979.257157992513 Iteration 14, inertia 9938.759373638944 Iteration 15, inertia 9911.724132573296 Iteration 16, inertia 9891.1788188847 Iteration 17, inertia 9754.311626803314 Iteration 18, inertia 9678.154213662237 Iteration 19, inertia 9667.068511677915 Iteration 20, inertia 9660.214612839805 Iteration 21, inertia 9654.574790207527 Iteration 22, inertia 9645.63794782316 Iteration 23, inertia 9644.899891129677 Iteration 24, inertia 9644.748605586845 Iteration 25, inertia 9644.595919167177 Iteration 26, inertia 9644.549083693555 Iteration 27, inertia 9641.042116346523 Iteration 28, inertia 9638.110099852393 Iteration 29, inertia 9634.979187430297 Iteration 30, inertia 9632.762887140108 Iteration 31, inertia 9632.086194242833 Iteration 32, inertia 9631.65577226331 Iteration 33, inertia 9631.569838198466 Iteration 34, inertia 9631.539746213291 
Converged at iteration 34: strict convergence. Initialization complete Iteration 0, inertia 14748.903594063975 Iteration 1, inertia 9965.186003070134 Iteration 2, inertia 9825.169923880945 Iteration 3, inertia 9744.272576918649 Iteration 4, inertia 9667.587922972407 Iteration 5, inertia 9644.10573691932 Iteration 6, inertia 9635.267693732794 Iteration 7, inertia 9633.29914147962 Iteration 8, inertia 9632.952141350772 Iteration 9, inertia 9632.87014898023 Converged at iteration 9: strict convergence. Initialization complete Iteration 0, inertia 14484.898807356742 Iteration 1, inertia 9955.81303074818 Iteration 2, inertia 9713.282859132194 Iteration 3, inertia 9671.355873128363 Iteration 4, inertia 9650.513068725491 Iteration 5, inertia 9645.317685969698 Iteration 6, inertia 9642.530506536332 Iteration 7, inertia 9641.485885635826 Iteration 8, inertia 9640.28240915911 Iteration 9, inertia 9639.857937703955 Iteration 10, inertia 9639.774644478792 Iteration 11, inertia 9639.697843285941 Iteration 12, inertia 9639.504960365535 Iteration 13, inertia 9638.95769401947 Iteration 14, inertia 9638.573356540428 Iteration 15, inertia 9636.940111858781 Iteration 16, inertia 9635.980505559797 Iteration 17, inertia 9635.293549236569 Iteration 18, inertia 9633.992196284771 Iteration 19, inertia 9631.294896772604 Iteration 20, inertia 9629.157517862288 Iteration 21, inertia 9629.034229492028 Iteration 22, inertia 9628.960271079659 Iteration 23, inertia 9628.812549454027 Iteration 24, inertia 9628.623882692866 Iteration 25, inertia 9628.59305637934 Iteration 26, inertia 9628.524787673914 Iteration 27, inertia 9628.459530177464 Converged at iteration 27: strict convergence. 
Initialization complete Iteration 0, inertia 16152.65491388627 Iteration 1, inertia 10375.533418947318 Iteration 2, inertia 10144.857929200623 Iteration 3, inertia 10053.027516666378 Iteration 4, inertia 10011.007658000435 Iteration 5, inertia 9990.350407887636 Iteration 6, inertia 9980.01909365632 Iteration 7, inertia 9973.147134155095 Iteration 8, inertia 9962.93227967642 Iteration 9, inertia 9950.804233790155 Iteration 10, inertia 9929.106524971192 Iteration 11, inertia 9893.61445739094 Iteration 12, inertia 9871.997645307263 Iteration 13, inertia 9853.145440604721 Iteration 14, inertia 9818.078861860033 Iteration 15, inertia 9757.278886917615 Iteration 16, inertia 9713.510022024357 Iteration 17, inertia 9680.926171314575 Iteration 18, inertia 9674.777601266724 Iteration 19, inertia 9673.122071852886 Iteration 20, inertia 9672.685183118592 Iteration 21, inertia 9672.518137679992 Iteration 22, inertia 9672.500575556094 Converged at iteration 22: strict convergence. Initialization complete Iteration 0, inertia 16505.93842126068 Iteration 1, inertia 10174.540781006854 Iteration 2, inertia 9896.176365470483 Iteration 3, inertia 9755.217403382474 Iteration 4, inertia 9710.856879173405 Iteration 5, inertia 9695.291352573073 Iteration 6, inertia 9683.148926900818 Iteration 7, inertia 9667.5491075782 Iteration 8, inertia 9655.586520230188 Iteration 9, inertia 9649.747659712908 Iteration 10, inertia 9645.46312963105 Iteration 11, inertia 9644.40630542299 Iteration 12, inertia 9644.101013829977 Iteration 13, inertia 9643.642758888995 Iteration 14, inertia 9643.008226227485 Iteration 15, inertia 9642.844684153424 Iteration 16, inertia 9642.701135334233 Iteration 17, inertia 9642.57061072763 Iteration 18, inertia 9642.519519077938 Iteration 19, inertia 9642.504477800321 Converged at iteration 19: strict convergence. 
Initialization complete Iteration 0, inertia 17172.960091144152 Iteration 1, inertia 10194.671271050402 Iteration 2, inertia 10127.428286254335 Iteration 3, inertia 10110.648670434139 Iteration 4, inertia 10088.305356307605 Iteration 5, inertia 10056.98392140109 Iteration 6, inertia 10014.54147542604 Iteration 7, inertia 9984.782947200316 Iteration 8, inertia 9948.035333426422 Iteration 9, inertia 9907.185271483926 Iteration 10, inertia 9884.78043245219 Iteration 11, inertia 9863.689071376455 Iteration 12, inertia 9845.824302775172 Iteration 13, inertia 9831.477466918255 Iteration 14, inertia 9805.29705603225 Iteration 15, inertia 9788.493454238513 Iteration 16, inertia 9782.947379376854 Iteration 17, inertia 9781.810910498489 Iteration 18, inertia 9781.446129264701 Iteration 19, inertia 9781.063080143778 Iteration 20, inertia 9780.165908514351 Iteration 21, inertia 9779.376078944326 Iteration 22, inertia 9778.781046860006 Iteration 23, inertia 9777.25564144428 Iteration 24, inertia 9771.593080768085 Iteration 25, inertia 9759.993073682037 Iteration 26, inertia 9741.127172080985 Iteration 27, inertia 9725.59447725809 Iteration 28, inertia 9693.057299937595 Iteration 29, inertia 9674.633488354748 Iteration 30, inertia 9668.883857942214 Iteration 31, inertia 9668.062939210902 Iteration 32, inertia 9667.795522508974 Converged at iteration 32: strict convergence. 
Initialization complete Iteration 0, inertia 16121.151887575434 Iteration 1, inertia 10180.975017955254 Iteration 2, inertia 10012.790506392183 Iteration 3, inertia 9945.149592230093 Iteration 4, inertia 9902.19445401729 Iteration 5, inertia 9854.551471824738 Iteration 6, inertia 9796.629809393482 Iteration 7, inertia 9733.824033247713 Iteration 8, inertia 9709.60496866079 Iteration 9, inertia 9701.710992691354 Iteration 10, inertia 9698.331774194536 Iteration 11, inertia 9696.490391355273 Iteration 12, inertia 9695.20652681061 Iteration 13, inertia 9695.015903631236 Iteration 14, inertia 9694.98725574485 Converged at iteration 14: strict convergence. Initialization complete Iteration 0, inertia 15719.84699885392 Iteration 1, inertia 10093.886021577151 Iteration 2, inertia 9883.872126085509 Iteration 3, inertia 9824.269228240664 Iteration 4, inertia 9779.058685068434 Iteration 5, inertia 9725.75686585067 Iteration 6, inertia 9686.098533273536 Iteration 7, inertia 9677.796926090543 Iteration 8, inertia 9674.731965248277 Iteration 9, inertia 9673.142492501147 Iteration 10, inertia 9672.65337922592 Iteration 11, inertia 9672.51584737018 Iteration 12, inertia 9672.498249204844 Converged at iteration 12: strict convergence. Initialization complete Iteration 0, inertia 15272.42924127074 Iteration 1, inertia 10146.7544705174 Iteration 2, inertia 10060.416621065493 Iteration 3, inertia 10009.051773331206 Iteration 4, inertia 9916.53892424252 Iteration 5, inertia 9817.043864074043 Iteration 6, inertia 9800.317052776645 Iteration 7, inertia 9790.614521302858 Iteration 8, inertia 9786.58080875628 Iteration 9, inertia 9785.526700118644 Iteration 10, inertia 9783.706006793813 Iteration 11, inertia 9782.330522508842 Iteration 12, inertia 9781.663433153548 Iteration 13, inertia 9781.595470810445 Iteration 14, inertia 9781.360992187556 Iteration 15, inertia 9781.307176755947 Converged at iteration 15: strict convergence.
KMeans(max_iter=500, n_clusters=3, verbose=1)
# Assign every normalized row to its nearest centroid, then report how many
# rows ended up in each cluster.
clusters_norm3 = kmeans3.predict(df_age_groups_norm)
size3 = cluster_sizes(clusters_norm3)
for cluster_id in size3.keys():
    print("Size of Cluster", cluster_id, "= ", size3[cluster_id])
Size of Cluster 0 = 541 Size of Cluster 1 = 1018 Size of Cluster 2 = 552
# Collect the centroids fitted on the normalized data into a table (one row
# per cluster, one column per feature) to characterize each cluster.
# Floats are displayed with two decimals.
pd.options.display.float_format = '{:,.2f}'.format
feature_names_norm = df_age_groups_norm.columns.values
centroids3 = pd.DataFrame(kmeans3.cluster_centers_, columns=feature_names_norm)
centroids3
Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | Age_Gen-Z | Age_Millenials | Age_Gen-X & Boomers | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.43 | 0.45 | 0.90 | 0.10 | 0.15 | 0.85 | 0.11 | 0.89 | 0.97 | 0.03 | 0.00 | 0.08 | 0.04 | 0.86 | 0.01 | 0.03 | 0.13 | 0.82 | 0.02 | 0.98 | 0.02 | 0.62 | 0.33 | 0.05 | 0.94 | 0.06 | 0.35 | 0.07 | 0.02 | 0.55 | 0.87 | 0.10 | 0.04 | 0.00 | 0.01 | 0.88 | 0.10 | 0.05 | -0.00 | 0.00 | 0.92 | 0.02 | 0.62 | 0.37 | 0.01 |
1 | 0.48 | 0.27 | 0.39 | 0.61 | 0.26 | 0.74 | 0.14 | 0.86 | 0.08 | 0.13 | 0.79 | 0.24 | 0.15 | 0.55 | 0.06 | 0.03 | 0.15 | 0.79 | 0.04 | 0.99 | 0.01 | 0.56 | 0.33 | 0.11 | 0.95 | 0.05 | 0.33 | 0.21 | 0.05 | 0.41 | 0.49 | 0.43 | 0.08 | 0.00 | 0.03 | 0.55 | 0.42 | 0.07 | 0.00 | 0.00 | 0.89 | 0.03 | 0.99 | 0.01 | 0.00 |
2 | 0.51 | 0.42 | 0.30 | 0.70 | 0.07 | 0.93 | 0.07 | 0.93 | 0.08 | 0.10 | 0.82 | 0.20 | 0.19 | 0.61 | 0.00 | 0.01 | 0.04 | 0.94 | 0.01 | 0.96 | 0.04 | 0.50 | 0.46 | 0.05 | 0.99 | 0.01 | 0.36 | 0.09 | 0.02 | 0.53 | 0.81 | 0.18 | 0.01 | 0.00 | 0.05 | 0.66 | 0.29 | 0.64 | 0.00 | 0.01 | 0.32 | 0.02 | 0.01 | 0.92 | 0.07 |
centroids3['Age_Gen-Z'] # normalized run: centroid value of the Gen-Z dummy per cluster (presumably the share of each cluster's rows that are Gen-Z)
0 0.62 1 0.99 2 0.01 Name: Age_Gen-Z, dtype: float64
centroids3['Age_Millenials'] # normalized run: centroid value of the Millennials dummy per cluster (presumably the share of each cluster's rows that are Millennials)
0 0.37 1 0.01 2 0.92 Name: Age_Millenials, dtype: float64
centroids3['Age_Gen-X & Boomers'] # normalized run: centroid value of the Gen-X & Boomers dummy per cluster (presumably the share of each cluster's rows in that group)
0 0.01 1 0.00 2 0.07 Name: Age_Gen-X & Boomers, dtype: float64
# Silhouette analysis at K = 3 on the normalized data: compute one
# silhouette coefficient per row and report the average over all rows.
age_norm_silhouette = silhouette_samples(df_age_groups_norm, clusters_norm3)
print('Mean Silhouette Value :', age_norm_silhouette.mean())
Mean Silhouette Value : 0.11905854619225616
# Plot and evaluate the silhouettes for the normalized K = 3 clustering
# (plot_silhouettes is a helper defined outside this section):
plot_silhouettes(df_age_groups_norm, clusters_norm3)
Above, the results are drastically different from those for the non-normalized data. Cluster 0 outperformed the other clusters, with all of its coefficients above the mean silhouette value. Cluster 2 performed adequately, with many of its coefficients above the mean silhouette value and only a few negative. Cluster 1 did not perform as well: many of its coefficients are negative and none of them are above the mean silhouette value. When looking at the centroids, the values of the age-group columns do not directly correspond to the silhouette plots.
# Compare the normalized-data cluster assignments with the known class
# labels: completeness measures whether each class stays within one cluster,
# homogeneity measures whether each cluster holds a single class.
complete_norm = completeness_score(labels_num, clusters_norm3)
homogene_norm = homogeneity_score(labels_num, clusters_norm3)
print(f"Completeness Score for Clusters: {complete_norm}")
print(f"Homogeneity Score for Clusters: {homogene_norm}")
Completeness Score for Clusters: 0.3552093808452009 Homogeneity Score for Clusters: 0.19224754943375816
These results show that with the normalized data, a pattern may not necessarily appear in the age groups. Moreover, when examining K-means and clustering, we can see how not scaling the data may suggest conclusions or patterns about the data when a pattern may not necessarily exist. This is validated when evaluating the completeness and homogeneity scores, which were both low. The completeness score was around 0.36 and the homogeneity score was lower at 0.19. These scores show that grouping by age is not the main determining factor for the classification of obesity levels. Age may still play a role as a key feature, but the clustering exploration does not necessarily reveal that the age groupings have a significant pattern. By building the classification models and performing feature selection, we will be able to obtain a better picture of age and age groupings and their role in classifying obesity levels.
# Create a copy of the numeric data with Age replaced by the age-group label.
# .copy() is required: plain assignment would only alias data_numeric, and
# overwriting "Age" below would then replace the numeric ages in
# data_numeric as well.
data_age_groups = data_numeric.copy()
data_age_groups["Age"] = age_df['Age Group']
data_age_groups
Age | Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Gen-Z | 1.62 | 64.00 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1 | Gen-Z | 1.52 | 56.00 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | Gen-Z | 1.80 | 77.00 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | Millenials | 1.80 | 87.00 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | Gen-Z | 1.78 | 89.80 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2106 | Gen-Z | 1.71 | 131.41 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2107 | Gen-Z | 1.75 | 133.74 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2108 | Gen-Z | 1.75 | 133.69 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2109 | Gen-Z | 1.74 | 133.35 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2110 | Gen-Z | 1.74 | 133.47 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2111 rows × 43 columns
# Add the class labels as a column to the dataset. The assignment modifies
# data_age_groups in place, and the rendered table below then shows the
# extra NObeyesdad column (44 columns in total):
data_age_groups['NObeyesdad'] = labels_df
data_age_groups
Age | Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | NObeyesdad | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Gen-Z | 1.62 | 64.00 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | Normal_Weight |
1 | Gen-Z | 1.52 | 56.00 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Normal_Weight |
2 | Gen-Z | 1.80 | 77.00 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Normal_Weight |
3 | Millenials | 1.80 | 87.00 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Overweight_Level_I |
4 | Gen-Z | 1.78 | 89.80 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Overweight_Level_II |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2106 | Gen-Z | 1.71 | 131.41 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2107 | Gen-Z | 1.75 | 133.74 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2108 | Gen-Z | 1.75 | 133.69 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2109 | Gen-Z | 1.74 | 133.35 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2110 | Gen-Z | 1.74 | 133.47 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2111 rows × 44 columns
# Keep only the rows whose age group is Gen-Z:
is_genz = data_age_groups["Age"] == 'Gen-Z'
genz_df = data_age_groups[is_genz]
genz_df
Age | Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | NObeyesdad | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Gen-Z | 1.62 | 64.00 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | Normal_Weight |
1 | Gen-Z | 1.52 | 56.00 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Normal_Weight |
2 | Gen-Z | 1.80 | 77.00 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Normal_Weight |
4 | Gen-Z | 1.78 | 89.80 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Overweight_Level_II |
6 | Gen-Z | 1.50 | 55.00 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | Normal_Weight |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2106 | Gen-Z | 1.71 | 131.41 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2107 | Gen-Z | 1.75 | 133.74 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2108 | Gen-Z | 1.75 | 133.69 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2109 | Gen-Z | 1.74 | 133.35 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2110 | Gen-Z | 1.74 | 133.47 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
1353 rows × 44 columns
# Persist the Gen-Z subset to disk; index=False leaves the row index out of the file.
genz_df.to_csv('/Users/cl/genz_dataframe.csv', index=False)

# Select the rows whose "Age" group label is 'Millenials'
# (the label must stay spelled exactly as it appears in the "Age" column).
millen_df = data_age_groups.loc[data_age_groups["Age"] == 'Millenials']

# Display the resulting subset.
millen_df
Age | Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | NObeyesdad | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | Millenials | 1.80 | 87.00 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Overweight_Level_I |
5 | Millenials | 1.62 | 53.00 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Normal_Weight |
10 | Millenials | 1.85 | 105.00 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_I |
16 | Millenials | 1.93 | 102.00 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Overweight_Level_II |
17 | Millenials | 1.53 | 78.00 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2098 | Millenials | 1.61 | 104.95 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2099 | Millenials | 1.63 | 108.09 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2100 | Millenials | 1.63 | 107.38 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2101 | Millenials | 1.63 | 107.22 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
2102 | Millenials | 1.63 | 108.11 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Obesity_Type_III |
717 rows × 44 columns
# Persist the Millennials subset to disk; index=False leaves the row index out of the file.
millen_df.to_csv('/Users/cl/millenials_dataframe.csv', index=False)

# Select the rows whose "Age" group label is 'Gen-X & Boomers'.
genxboomers_df = data_age_groups.loc[data_age_groups["Age"] == 'Gen-X & Boomers']

# Display the resulting subset.
genxboomers_df
Age | Height | Weight | Gender_Female | Gender_Male | family_history_with_overweight_no | family_history_with_overweight_yes | FAVC_no | FAVC_yes | FCVC_Always | FCVC_Never | FCVC_Sometimes | NCP_1 | NCP_2 | NCP_3 | NCP_3+ | CAEC_Always | CAEC_Frequently | CAEC_Sometimes | CAEC_no | SMOKE_no | SMOKE_yes | CH2O_Between 1 and 2 L | CH2O_Less than a liter | CH2O_More than 2 L | SCC_no | SCC_yes | FAF_1 or 2 days | FAF_2 or 4 days | FAF_4 or 5 days | FAF_I do not have | TUE_0-2 Hours | TUE_3-5 Hours | TUE_More than 5 Hours | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | NObeyesdad | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13 | Gen-X & Boomers | 1.80 | 99.00 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
21 | Gen-X & Boomers | 1.69 | 87.00 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
92 | Gen-X & Boomers | 1.78 | 84.00 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Overweight_Level_I |
133 | Gen-X & Boomers | 1.65 | 66.00 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Normal_Weight |
137 | Gen-X & Boomers | 1.60 | 80.00 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | Obesity_Type_I |
161 | Gen-X & Boomers | 1.65 | 80.00 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
169 | Gen-X & Boomers | 1.63 | 77.00 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
197 | Gen-X & Boomers | 1.75 | 118.00 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | Obesity_Type_II |
201 | Gen-X & Boomers | 1.54 | 80.00 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
232 | Gen-X & Boomers | 1.59 | 50.00 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | Normal_Weight |
252 | Gen-X & Boomers | 1.79 | 90.00 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
358 | Gen-X & Boomers | 1.75 | 110.00 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_II |
375 | Gen-X & Boomers | 1.80 | 92.00 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
492 | Gen-X & Boomers | 1.70 | 86.00 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
751 | Gen-X & Boomers | 1.72 | 82.92 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_I |
813 | Gen-X & Boomers | 1.77 | 75.63 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_I |
1013 | Gen-X & Boomers | 1.77 | 80.49 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
1017 | Gen-X & Boomers | 1.65 | 79.17 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
1034 | Gen-X & Boomers | 1.75 | 82.13 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
1062 | Gen-X & Boomers | 1.73 | 86.95 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
1063 | Gen-X & Boomers | 1.68 | 79.67 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
1088 | Gen-X & Boomers | 1.66 | 80.99 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
1101 | Gen-X & Boomers | 1.72 | 88.60 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
1158 | Gen-X & Boomers | 1.67 | 80.40 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
1162 | Gen-X & Boomers | 1.68 | 79.85 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
1179 | Gen-X & Boomers | 1.74 | 84.73 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
1208 | Gen-X & Boomers | 1.69 | 80.41 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Overweight_Level_II |
1215 | Gen-X & Boomers | 1.57 | 81.83 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1216 | Gen-X & Boomers | 1.58 | 81.94 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1267 | Gen-X & Boomers | 1.59 | 76.13 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1285 | Gen-X & Boomers | 1.65 | 86.64 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1286 | Gen-X & Boomers | 1.64 | 81.98 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1305 | Gen-X & Boomers | 1.60 | 77.35 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1325 | Gen-X & Boomers | 1.57 | 81.06 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1385 | Gen-X & Boomers | 1.57 | 81.92 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1386 | Gen-X & Boomers | 1.58 | 80.99 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1387 | Gen-X & Boomers | 1.58 | 81.92 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1489 | Gen-X & Boomers | 1.54 | 77.05 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1490 | Gen-X & Boomers | 1.59 | 77.00 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_I |
1529 | Gen-X & Boomers | 1.75 | 116.59 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_II |
1618 | Gen-X & Boomers | 1.75 | 115.81 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Obesity_Type_II |
# Persist the Gen-X & Boomers subset to disk; index=False leaves the row index out of the file.
genxboomers_df.to_csv('/Users/cl/genxboomers_dataframe.csv', index=False)