# Slip

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

import statsmodels.api as sm

from sklearn.metrics import r2_score, mean_squared_error

import matplotlib.pyplot as plt  # used by the plotting snippets throughout

# Linear Regression
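# assumed setup (hypothetical file and target names; the slip itself only shows 'Salary' as the predictor):
df = pd.read_csv("salary.csv")  # hypothetical file
X = df[["Salary"]]
y = df["Expenditure"]  # hypothetical target column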

X = sm.add_constant(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

reg = sm.OLS(y_train, X_train).fit() #reg.summary()

y_pred = reg.predict(X_test)

params = reg.params

p_val = reg.pvalues['Salary']  # p-value of the Salary coefficient

print(f"Coefficients: b0: {params['const']}, b1: {params['Salary']}")

print("R2 :", r2_score(y_test, y_pred))

print("MSE :", mean_squared_error(y_test, y_pred))

residuals = reg.resid

influence = reg.get_influence()

std_residuals = influence.resid_studentized_internal

cook_distance = influence.cooks_distance[0]  # visualise with plt.stem(cook_distance)

leverage = influence.hat_matrix_diag

leverage_threshold = 3 * len(X_train.columns) / len(X_train)  # 3p/n; X_train's columns already include the constant

#cooks_threshold = 1

#cook_outliers = np.where(cook_distance > cooks_threshold)[0]

#MLR

from statsmodels.stats.outliers_influence import variance_inflation_factor

def get_vif_factors(input_df):
    vif = pd.DataFrame()
    vif["Features"] = input_df.columns
    vif["VIF"] = [variance_inflation_factor(input_df.values, i)
                  for i in range(input_df.shape[1])]
    return vif
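# example call (drop the intercept column first; the VIF of 'const' is not meaningful):
print(get_vif_factors(X_train.drop(columns=["const"])))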

#Residual plot between standardized model.fittedvalues and standardized model.resid

get_standardized_values = lambda x: (x - np.mean(x)) / np.std(x)
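# a minimal sketch of that residual plot, using the helper above:
plt.scatter(get_standardized_values(reg.fittedvalues),
            get_standardized_values(reg.resid))
plt.xlabel("Standardized fitted values")
plt.ylabel("Standardized residuals")
plt.show()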


#Logistic Regression

X = pd.get_dummies(X_features, drop_first=True)  # X_features: the predictor columns of the data

X = sm.add_constant(X)
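# assumed split (Y is the 0/1 target column, not shown on the slip; names match the usage below):
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, train_size=0.8, random_state=42)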

model_1 = sm.Logit(train_Y, train_X).fit()

significant_features = model_1.pvalues[model_1.pvalues < 0.05].index

#make model with significant features
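# a minimal sketch of model_2 (keep only the significant columns; assumes 'const' is among them):
train_X = train_X[significant_features]
test_X = test_X[significant_features]
model_2 = sm.Logit(train_Y, train_X).fit()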

from sklearn import metrics

cutoff = np.arange(0.1, 0.91, 0.01)

youdens_index = []

cutoff_index = []

predicted_probs = model_2.predict(test_X)  # predicted probabilities; only the threshold changes per iteration

for i in cutoff:
    predicted_values = (predicted_probs > i).astype(int)
    confusion_matrix = metrics.confusion_matrix(test_Y, predicted_values)
    sensitivity = confusion_matrix[1][1]/(confusion_matrix[1][1] + confusion_matrix[1][0])
    specificity = confusion_matrix[0][0]/(confusion_matrix[0][0] + confusion_matrix[0][1])
    youden_index = sensitivity + specificity - 1
    youdens_index.append(youden_index)
    cutoff_index.append(i)

print("Youden index : ", youdens_index)

max_youden = -np.inf

optimal_cutoff = -1

for i in range(len(youdens_index)):
    if youdens_index[i] > max_youden:
        max_youden = youdens_index[i]
        optimal_cutoff = cutoff_index[i]

#print( metrics.classification_report( test_Y, predicted_values ) )

'''
For a cost-based cutoff, weight the two error types, e.g.

cost = 5*confusion_matrix[0][1] + confusion_matrix[1][0]

and then pick the cutoff that minimises cost instead of maximising the Youden index.
'''
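# a minimal sketch of the cost-based variant (the 5x false-positive weight above is an assumed number):
costs = []
for i in cutoff:
    predicted_values = (predicted_probs > i).astype(int)
    cm = metrics.confusion_matrix(test_Y, predicted_values)
    costs.append(5 * cm[0][1] + cm[1][0])
cost_optimal_cutoff = cutoff[np.argmin(costs)]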

#DecisionTree
from sklearn.tree import DecisionTreeClassifier

data = pd.get_dummies(data, columns=['famhist'])  # no drop_first: keep both dummy columns for the tree

model1 = DecisionTreeClassifier(criterion='gini', max_depth=6, random_state=42) #gini

model3 = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=42) #info gain

model1.fit(X_train, y_train)

from sklearn.tree import plot_tree

plot_tree(model1, feature_names=list(X_features.columns), class_names=["No CHD", "CHD"], filled=True, rounded=True)
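# plot_tree draws on the current matplotlib figure; call plt.figure(figsize=(20, 10)) before it for a readable tree
plt.show()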

#KNN

from sklearn.neighbors import KNeighborsClassifier

from sklearn.utils import resample, shuffle
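# assumed setup: the two class subsets come from a 'Status' split (hypothetical frame name):
# joined = hr_df[hr_df.Status == 'Joined']
# not_joined = hr_df[hr_df.Status != 'Joined']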

upsampled_not_joined = resample(not_joined, replace = True, n_samples = 4000)

dfs = [joined, upsampled_not_joined]

new_df = pd.concat(dfs)

new_df = shuffle(new_df)

X = pd.get_dummies(X_features, drop_first=True)  # X_features: predictor columns of new_df

Y = new_df.Status.map(lambda x: int(x == 'Joined'))

train_X, test_X, train_Y, test_Y = train_test_split(X,Y,train_size=0.8, random_state=42)

knn_clf = KNeighborsClassifier()

knn_clf.fit( train_X, train_Y )

from sklearn.model_selection import GridSearchCV

tuned_parameters = [{ 'n_neighbors': range(5,10),'metric': ['canberra', 'euclidean','minkowski']}]

clf = GridSearchCV( KNeighborsClassifier(),tuned_parameters,cv=10,scoring='roc_auc')

clf.fit(train_X, train_Y)

print("Best score for KNN is : ",clf.best_score_)

print("Best parameter for KNN is : ", clf.best_params_)

#ensemble

from sklearn.ensemble import RandomForestClassifier

radm_clf = RandomForestClassifier(max_depth=10, n_estimators=10)

radm_clf.fit( train_X, train_Y )

from sklearn.ensemble import AdaBoostClassifier

from sklearn.linear_model import LogisticRegression

logreg_clf = LogisticRegression()
ada_clf = AdaBoostClassifier(logreg_clf, n_estimators=50)

ada_clf.fit(train_X, train_Y)

#SVM

from sklearn.svm import SVC

svm_clf = SVC(kernel='linear', C=1.0, probability=True)

svm_clf_poly = SVC(kernel='poly', degree=3, probability=True)

svm_clf_rbf = SVC(kernel='rbf', gamma=0.1, probability=True)

svm_clf_sigmoid = SVC(kernel='sigmoid', gamma='scale', coef0=0.0, probability=True)

svm_clf.fit(train_X, train_Y)
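# the other kernels are fitted the same way, e.g.
# svm_clf_rbf.fit(train_X, train_Y)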

#Clustering

# normalize data
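# a minimal sketch of that step (assumes df holds only the numeric clustering features):
from sklearn.preprocessing import StandardScaler
scaled_df = StandardScaler().fit_transform(df)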

from sklearn.cluster import KMeans

cluster_range = range(1, 10)

cluster_errors = []

#elbow method to find best no. of clusters

for num_clusters in cluster_range:
    clusters = KMeans(n_clusters=num_clusters, n_init="auto")
    clusters.fit(scaled_df)
    cluster_errors.append(clusters.inertia_)

plt.plot(cluster_range, cluster_errors, marker="o")
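# read the best k off the bend ("elbow") of this curve
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.show()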

#got k from plot

k=3

clusters = KMeans(k, random_state = 42, n_init="auto")

clusters.fit(scaled_df)

df["clusterid"] = clusters.labels_

# clusters.cluster_centers_

# clusters.labels_

#ROC Curve

from sklearn import metrics

def draw_roc_curve(model, test_X, test_y):
    test_results_df = pd.DataFrame({'actual': test_y})
    test_results_df = test_results_df.reset_index()
    predict_proba_df = pd.DataFrame(model.predict_proba(test_X.values))
    test_results_df['chd_1'] = predict_proba_df.iloc[:, 1]  # probability of the positive class
    fpr, tpr, thresholds = metrics.roc_curve(test_results_df.actual,
                                             test_results_df.chd_1,
                                             drop_intermediate=False)
    auc_score = metrics.roc_auc_score(test_results_df.actual,
                                      test_results_df.chd_1)
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
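# example call with any fitted classifier exposing predict_proba, e.g. the KNN model above:
draw_roc_curve(knn_clf, test_X, test_Y)
plt.show()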

#PCA

from sklearn.preprocessing import StandardScaler

from numpy.linalg import eig

X_norm = StandardScaler().fit_transform(X)

X_norm = pd.DataFrame(X_norm, columns=feat)  # feat: the list of feature names

X_mean_adj = X_norm - X_norm.mean()

cov_mat = X_mean_adj.cov()

val, vec = eig(cov_mat)  # cov_mat is symmetric, so np.linalg.eigh would work as well

sorted_idx = np.argsort(-val)  # order eigenpairs by decreasing eigenvalue

val = val[sorted_idx]

vec = vec[:, sorted_idx]

pc1 = np.dot(vec[:, 0], X_mean_adj.T)  # scores on the first principal component

from sklearn.decomposition import PCA

pca = PCA(n_components=2)

principal_components = pca.fit_transform(X_norm)

principal_components
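# sanity check: the manual PC1 should match sklearn's up to sign (eigenvectors are sign-ambiguous)
print(np.allclose(np.abs(pc1), np.abs(principal_components[:, 0])))
print(pca.explained_variance_ratio_)  # share of variance captured by each component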
