#!/usr/bin/env python
# coding: utf-8

'''
written by: Sarkar, Titli (titli.sarkar@nih.gov)
on August 25, 2025

Supervised Machine Learning With Scikit-learn and Diabetes dataset
# ### Exploratory Data Analysis (EDA) and Wrangling, Classification (k-nearest neighbor), Model Fitting, Hyperparameter Tuning, and Performance Evaluation
# ### Dataset:
# Diabetes is a chronic health condition affecting millions worldwide. Early prediction of diabetes can help in timely management and
# prevention of complications. In this article, we will walk through a Python-based machine learning project for predicting diabetes using a Diabetes Dataset from Kaggle. \
# We will use Python libraries such as `numpy`, `pandas`, `scikit-learn`, and the `K-nearest-neighbours (knn)` classification algorithm. We will also learn the importance of handling missing data and hyperparameter tuning for each model to achieve the best performance and generalizability.
# 
# **diabetes_prediction_dataset.csv**: https://www.kaggle.com/code/mahmoudbahnasy29/diabetes?select=diabetes_prediction_dataset.csv \
# This file contains medical and demographic data of patients along with their `diabetes status, whether positive or negative`. It consists of various features such as `age`, `gender`, `body mass index (BMI)`, `hypertension`, `heart disease`, `smoking history`, `HbA1c level`, and `blood glucose level`. The Dataset can be utilized to construct machine learning models that can predict the likelihood of diabetes in patients based on their medical history and demographic details.'''

# Step 1: Loading the necessary packages
# %pip install scikit-plot
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import scikitplot as skplt
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
sns.set()
# Switch matplotlib to inline rendering only when an IPython/Jupyter shell is
# present. `get_ipython` is injected by IPython; when this file is run with a
# plain Python interpreter the name does not exist and the call would raise
# NameError, so that case is skipped and the default backend is kept.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
# ignore all future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
print("Imports Done.")

# Step 2: Load the dataset
# Root folder of the project: the input CSV is read from its `inputs/`
# sub-folder and the figures produced by this script are saved directly in it.
data_dir = "/data/sarkart4/Data/BTEP_Coding_Club/Intro_to_scikit-learn/MLdiabetes_inpectplot-master/MLdiabetes_inpectplot-master/"

input_csv = f"{data_dir}inputs/diabetes_prediction_dataset.csv"
df = pd.read_csv(input_csv)
print(df.head())  # head() shows the first 5 rows by default

# Remember the (rows, columns) shape as loaded, before any rows are removed.
original_dataframe_shape = df.shape
print("original_dataframe_shape=", original_dataframe_shape)

# Describe data
# DataFrame.info() writes its report to stdout by itself and returns None, so
# it is called directly; wrapping it in print() would add a stray "None" line.
df.info(verbose=True)
# Summary statistics of the numeric columns, transposed so that each row is
# one column of the dataset.
print(df.describe().T)

# Preprocessing
# a. Missing values: print the number of nulls in every column
#    (original author's note: no missing values were found).
null_counts = df.isnull().sum()
print(null_counts)

# b. Duplicates: print how many rows repeat an earlier row, then remove them
#    and renumber the index from 0.
n_duplicates = df.duplicated().sum()
print(n_duplicates)
df = df.drop_duplicates(ignore_index=True)
print(df.shape)
print(df.describe().T)

# Exploratory Data Analysis (EDA)
# Univariate Analysis
# Columns plotted as categorical: every object-dtype column plus the three
# numeric columns named here. The set does not change while looping, so it is
# built once instead of on every iteration.
categorical_cols = set(df.select_dtypes('O').columns) | {'hypertension', 'heart_disease', 'diabetes'}

for col in df.columns:
    # One figure with two side-by-side panels per column.
    fig, axes = plt.subplots(1, 2)
    if col in categorical_cols:
        # Left panel: count plot. The figure is saved right after this panel
        # is drawn, i.e. while the right panel is still empty.
        axc = sns.countplot(x=df[col], ax=axes[0])
        if col == 'smoking_history':
            axc.tick_params(axis='x', rotation=90)
        fig.savefig(data_dir+col+"_countplot.png", dpi=100)
        # Right panel: pie chart of the same counts, drawn explicitly on
        # axes[1] rather than on whichever axes happens to be current.
        counts = df[col].value_counts()
        axes[1].pie(x=counts.values, labels=counts.index, autopct='%.2f%%')
        fig.savefig(data_dir+col+"_pieplot.png", dpi=100)
    else:
        # Left panel: histogram with a KDE curve (saved before the right
        # panel is drawn); right panel: box plot of the same column.
        sns.histplot(x=df[col], kde=True, ax=axes[0])
        fig.savefig(data_dir+col+"_histplot.png", dpi=100)
        sns.boxplot(x=df[col], ax=axes[1])
        fig.savefig(data_dir+col+"_boxplot.png", dpi=100)
    plt.show()

# Bivariate Analysis
# Box plot of each feature grouped by diabetes status. Every plot gets its own
# figure: drawing both on the current axes would overlay the second plot on the
# first, so the "age" image would also contain the "bmi" boxes.
for feature in ('bmi', 'age'):
    fig, ax = plt.subplots()
    sns.boxplot(x=df['diabetes'], y=df[feature], ax=ax)
    fig.savefig(data_dir+"diabetes_"+feature+"_boxplot.png", dpi=100)

# Encoding categorical variables
# The model used later, k-nearest neighbours, is a distance-based algorithm,
# so the two text columns `gender` and `smoking_history` are converted to
# numeric codes here.
# NOTE(review): the codes impose an order on the categories that the distance
# computation will see - confirm this is intended for `smoking_history`.

# Code tables: category label -> numeric code
mapping_gender = {'Male': 1, 'Female': 2, 'Other': 0}
mapping_smoking_history = {
    'No Info': 0,
    'never': 1,
    'ever': 2,
    'former': 3,
    'not current': 4,
    'current': 5,
}

# Replace each label by its code; a label absent from its table becomes NaN.
for column, codes in (('gender', mapping_gender), ('smoking_history', mapping_smoking_history)):
    df[column] = df[column].map(codes)

# Data Wrangling (handling missing values)
# Work on a deep copy so that `df` keeps its encoded values unchanged.
df_copy = df.copy(deep=True)

# Code 0 is what the mapping dictionaries assign to 'No Info'
# (smoking_history) and 'Other' (gender); mark those entries as missing.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_copy[['smoking_history']] = df_copy[['smoking_history']].replace(0, np.nan)
df_copy[['gender']] = df_copy[['gender']].replace(0, np.nan)

# Count the rows that contain at least one NaN. The value is printed because
# a bare expression is silently discarded when the file runs as a script.
print("Rows with at least one NaN:", df_copy.isna().any(axis=1).sum())

# Drop the rows where `smoking_history` or `gender` is NaN.
df_cleaned = df_copy.dropna(subset=['smoking_history', 'gender'])
print("Cleaned dataframe shape:", df_cleaned.shape)

# Histograms of every column of `df` (the encoded frame, before rows with
# missing codes are dropped), saved as a single image.
hist = df.hist(figsize=(15, 15))
histogram_png = f"{data_dir}dataframe_histogram.png"
plt.savefig(histogram_png)

# Pair plot of the cleaned data, coloured by the `diabetes` column.
pair_grid = sns.pairplot(data=df_cleaned, hue='diabetes', aspect=1.5)  # height=2.5
pairplot_png = f"{data_dir}dataframe_pairplot.png"
plt.savefig(pairplot_png)

# Min-max normalization: (value - column minimum) / (column maximum - column
# minimum), applied to every column of the cleaned frame.
# NOTE(review): the minimum and maximum are taken over all rows, before the
# train/test split, and the `diabetes` target column is rescaled as well -
# confirm this is acceptable for the evaluation below.
col_min = df_cleaned.min()
col_range = df_cleaned.max() - col_min
df_min_max_scaled = (df_cleaned - col_min) / col_range
print(df_min_max_scaled.describe().T)


# Feature Importance:
# Fit a random forest on the scaled data and plot the importance it assigns to
# each feature.
df2 = df_min_max_scaled
features = df2.drop(columns="diabetes")
# Take the names from the frame that is actually fitted, so the labels stay
# correct regardless of where the `diabetes` column sits in `df2`.
feature_names = features.columns

# Fixed seed so the importances are reproducible between runs, matching the
# random_state=42 used for the split and cross-validation in this script.
randfor = RandomForestClassifier(random_state=42)
randfor.fit(features, df2["diabetes"])

sp = skplt.estimators.plot_feature_importances(randfor, feature_names=feature_names, figsize=(10, 5), x_tick_rotation=90)
plt.savefig(data_dir+"feature_importances.png", dpi=300, bbox_inches='tight')
plt.show()

# Feature correlation
# Spearman's rank correlation coefficient between every pair of columns,
# shown as an annotated heatmap and saved to disk.
correlation_matrix = df2.corr(method='spearman')  # alternatives: kendall, pearson
plt.figure(figsize=(8, 5.5))  # adjust figure size as needed
heatmap_options = {
    'annot': True,
    'fmt': ".2f",
    'square': False,
    # Other colormaps to consider: RdYlGn/jet/coolwarm (vibrant),
    # viridis/cividis (for colorblind), Blues (sequential)
    'cmap': 'Blues',
}
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Spearman's rank Correlation Matrix")
plt.savefig(data_dir+"Spearmans_correlation_coefficient.png", dpi=300, bbox_inches='tight')
plt.show()

# Data splitting: hold out one third of the rows as the test set.
y = df_min_max_scaled["diabetes"]  # target column
X = df_min_max_scaled.drop(columns="diabetes")  # all remaining (feature) columns
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Step 4: ML Approach
# Model: k-Nearest Neighbours
# How to choose the value of `k`? -> hyperparameter tuning with scikit-learn's
# `GridSearchCV`, scored by accuracy under 5-fold cross-validation on the
# training set.

candidate_k = range(1, 51, 2)  # odd k values 1, 3, ..., 49
param_grid = {'n_neighbors': candidate_k}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Best k found by the search and its mean cross-validated accuracy.
best_score = grid_search.best_score_
best_k = grid_search.best_params_['n_neighbors']
print(f"Best k: {best_k}, Best accuracy: {best_score}")

# Model Training
# Initialize the KNN classifier with the k chosen by the grid search and fit
# it on the training set.
k = best_k
model_knn = KNeighborsClassifier(n_neighbors=k)
model_knn.fit(X_train, y_train)

# Cross Validation
# 5-fold cross-validation on the training data with shuffled, seeded folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model_knn, X_train, y_train, cv=kf, scoring='accuracy')
# Round only for display: the mean and standard deviation are computed from
# the unrounded fold scores so they are not distorted by rounding.
print("Cross-validation scores for each fold on train data:", np.round(scores, 2))
print("Mean accuracy:", scores.mean())
print("Standard deviation of accuracy:", scores.std())

# Prediction on the test data (not used for fitting the model)
pred_knn = model_knn.predict(X_test)
print(y_test.shape, pred_knn.shape)


# Evaluation (model Performance Analysis)
# 1. Confusion Matrix
# Cross-tabulation of true vs. predicted labels with row/column totals. It is
# printed because a bare expression shows nothing when run as a script.
print(pd.crosstab(y_test, pred_knn, rownames=['True'], colnames=['Predicted'], margins=True))

# Stored as `cm` so the imported `confusion_matrix` function is not
# overwritten and stays callable if this section is executed again.
cm = confusion_matrix(y_test, pred_knn)
# Start a new figure so the heatmap is not drawn on top of an earlier plot.
plt.figure()
p = sns.heatmap(pd.DataFrame(cm), annot=True, cmap="YlGnBu", fmt='g')
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.savefig(data_dir+"confusion_matrix.png", dpi=100)

# 2. Classification report for the test-set predictions
report_text = classification_report(y_test, pred_knn)
print(report_text)

# 3. Accuracy and error, each rounded to 2 decimals
knn_accuracy = np.round(accuracy_score(y_test, pred_knn), 2)
print("KNN Accuracy: {}".format(knn_accuracy))
rmse = np.round(np.sqrt(mean_squared_error(y_test, pred_knn)), 2)
print("Root Mean Squared Error: {}".format(rmse))

# 4. ROC - AUC:
# `auc` is not among the names imported at the top of the file, so it is
# imported here; without it the auc(...) call below raises NameError.
from sklearn.metrics import auc, roc_auc_score

# Second column of predict_proba, used as the score of the positive class
# (presumably diabetes = 1 - confirm against model_knn.classes_).
y_pred_proba = model_knn.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

roc_auc = np.round(auc(fpr, tpr), 2)
print("auc = ", roc_auc)

# Quick-look ROC curve. A new figure is started so the lines are not drawn on
# top of the previously created plot.
plt.figure()
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Knn')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show()

# ROC curve with the area shown in the legend; this is the one saved to disk.
plt.figure()
lw = 2
plt.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=lw,
    label="ROC curve (area = %0.2f)" % roc_auc,
)
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curve")
plt.legend(loc="lower right")
# Save before show(): once show() has returned the figure may already be
# closed, in which case savefig() would write an empty image.
plt.savefig(data_dir+"ROC_AUC.png", dpi=100)
plt.show()

# Area under the ROC curve computed directly from labels and scores
print("roc_auc score = ", roc_auc_score(y_test, y_pred_proba))

print("Code End.")

