MLDA LAB

In [1]:

# Simple Linear Regression

# Importing the libraries


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [2]:
# Importing the dataset
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values
In [3]:
X

Out[3]:
array([[ 1.1],
[ 1.3],
[ 1.5],
[ 2. ],
[ 2.2],
[ 2.9],
[ 3. ],
[ 3.2],
[ 3.2],
[ 3.7],
[ 3.9],
[ 4. ],
[ 4. ],
[ 4.1],
[ 4.5],
[ 4.9],
[ 5.1],
[ 5.3],
[ 5.9],
[ 6. ],
[ 6.8],
[ 7.1],
[ 7.9],
[ 8.2],
[ 8.7],
[ 9. ],
[ 9.5],
[ 9.6],
[10.3],
[10.5]])
In [4]:
y

Out[4]:
array([ 39343, 46205, 37731, 43525, 39891, 56642, 60150, 54445,
64445, 57189, 63218, 55794, 56957, 57081, 61111, 67938,
66029, 83088, 81363, 93940, 91738, 98273, 101302, 113812,
109431, 105582, 116969, 112635, 122391, 121872], dtype=int64)
In [5]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0)
In [6]:
# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

Out[6]:
'from sklearn.preprocessing import StandardScaler\nsc_X = StandardScaler()\nX_train =
sc_X.fit_transform(X_train)\nX_test = sc_X.transform(X_test)\nsc_y = StandardScaler()\ny_train =
sc_y.fit_transform(y_train)'
In [7]:
# Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

Out[7]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [8]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)
In [9]:
# Visualising the Training set results
plt.scatter(X_train, y_train, color = 'red')
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

In [10]:
# Visualising the Test dataset results with y_test
plt.scatter(X_test, y_test, color = 'red')
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

In [11]:
# Visualising the Test dataset results with y_predict
plt.scatter(X_test, y_pred, color = 'red')
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()
In [14]:
print(regressor.intercept_)
26816.192244031176
In [15]:
print(regressor.coef_)
[9345.94244312]
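These two values define the fitted line salary = intercept + coefficient * years of experience. A quick hand check (illustrative only; 5 years is an arbitrary example input, not taken from the dataset):

# Illustrative check: salary for 5 years of experience from the fitted line
# 26816.19 + 9345.94 * 5 is roughly 73546
print(regressor.intercept_ + regressor.coef_[0] * 5)
print(regressor.predict([[5]]))   # same value obtained via the fitted model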
In [17]:
from sklearn import metrics
print('Mean Absolute error:',metrics.mean_absolute_error(y_test,y_pred))
print('Mean Squared error:',metrics.mean_squared_error(y_test,y_pred))
print('Root Mean Squared error:', round(np.sqrt(metrics.mean_squared_error(y_test,y_pred)), 4))
Mean Absolute error: 3426.42693743071
Mean Squared error: 21026037.329511303
Root Mean Squared error: 4585.4157
In [ ]:
In [3]:
# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

Out[3]:
'from sklearn.preprocessing import StandardScaler\nsc_X = StandardScaler()\nX_train =
sc_X.fit_transform(X_train)\nX_test = sc_X.transform(X_test)\nsc_y = StandardScaler()\ny_train =
sc_y.fit_transform(y_train)'
In [4]:
# Using the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
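A rough way to read the elbow plot numerically (an illustrative heuristic, not part of the original lab): print how much WCSS falls for each additional cluster and look for where the drops level off.

# Illustrative heuristic: successive WCSS drops from the wcss list above;
# the 'elbow' is roughly where the drops stop being large.
for k in range(2, 11):
    drop = wcss[k - 2] - wcss[k - 1]
    print('k =', k, 'WCSS drop =', round(drop, 2))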

In [5]:
from sklearn.cluster import KMeans
# Fitting K-Means to the dataset
kmeans = KMeans(n_clusters = 6, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(X)
y_kmeans

Out[5]:
array([4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 0,
4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 1, 5, 0, 5, 1, 5, 1, 5,
0, 5, 1, 5, 1, 5, 1, 5, 1, 5, 0, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5,
1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5,
1, 5, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
1, 2])
In [6]:
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
In [7]:
# Visualising the clusters
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 100, c = 'red', label = 'Cluster 1')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 100, c = 'blue', label = 'Cluster 2')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Cluster 3')
plt.scatter(X[y_kmeans == 3, 0], X[y_kmeans == 3, 1], s = 100, c = 'cyan', label = 'Cluster 4')
plt.scatter(X[y_kmeans == 4, 0], X[y_kmeans == 4, 1], s = 100, c = 'magenta', label = 'Cluster 5')
plt.scatter(X[y_kmeans == 5, 0], X[y_kmeans == 5, 1], s = 100, c = 'orange', label = 'Cluster 6')   # sixth cluster, since n_clusters = 6 above
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

In [ ]:
In [ ]:
1. import the packages - numpy, matplotlib, pandas, and those related to the algorithm
2. read the file using pandas
3. slice into dependent and independent variables
4. split the dataset into train and test - xtrain, xtest, ytrain, ytest
5. standardisation / normalisation - fit on xtrain, then apply to xtest
6. implement the algorithm on the training dataset (training the model) - xtrain, ytrain
7. predict on the test dataset using the trained model object - xtest -> y_pred
8. evaluation metrics - confusion matrix, accuracy score, precision, recall, f-score
9. visualisation
10. tuning of hyperparameters (a minimal sketch of these steps follows below)
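A minimal sketch of the workflow above, assuming a hypothetical CSV file named 'dataset.csv' whose last column is the class label (the file name and layout are illustrative, not from the lab):

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# 1-3: read the file and slice into independent (X) and dependent (y) variables
df = pd.read_csv('dataset.csv')          # hypothetical file name
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# 4: split into train and test
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=0)

# 5: standardise - fit the scaler on xtrain only, then transform xtest
sc = StandardScaler()
xtrain = sc.fit_transform(xtrain)
xtest = sc.transform(xtest)

# 6-7: train the model and predict on the test set
model = LogisticRegression(random_state=0)
model.fit(xtrain, ytrain)
y_pred = model.predict(xtest)

# 8: evaluation metrics
print(confusion_matrix(ytest, y_pred))
print(accuracy_score(ytest, y_pred))
print(classification_report(ytest, y_pred))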
In [6]:
# Logistic Regression

# Importing the libraries


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [7]:
# Importing the dataset User ID,Gender,Age,EstimatedSalary,Purchased

#Now, to predict whether a user will purchase the product or not,
#one needs to find out the relationship between Age and Estimated Salary.
#Here User ID and Gender are not important factors for this prediction.

dataset = pd.read_csv('Product_purchase.csv')
X = dataset.iloc[:, [2, 3]].values #input
y = dataset.iloc[:, 4].values #output
In [8]:
# Splitting the dataset into the Training set and Test set
#Splitting the dataset to train and test. 75% of data is used for training the model and
#25% of it is used to test the performance of our model.

from sklearn.model_selection import train_test_split


xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.25, random_state = 0)
In [9]:
# Feature Scaling
#Now, it is very important to perform feature scaling here because
#Age and Estimated Salary values lie in different ranges.
#If we don't scale the features, the Estimated Salary feature will dominate
#the Age feature when the model weighs the two inputs.

from sklearn.preprocessing import StandardScaler


sc_x = StandardScaler()
xtrain = sc_x.fit_transform(xtrain)
xtest = sc_x.transform(xtest)

print (xtrain[0:10, :])


[[ 0.58164944 -0.88670699]
[-0.60673761 1.46173768]
[-0.01254409 -0.5677824 ]
[-0.60673761 1.89663484]
[ 1.37390747 -1.40858358]
[ 1.47293972 0.99784738]
[ 0.08648817 -0.79972756]
[-0.01254409 -0.24885782]
[-0.21060859 -0.5677824 ]
[-0.21060859 -0.19087153]]
In [10]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(xtrain, ytrain)

Out[10]:
LogisticRegression(random_state=0)
In [12]:
# Predicting the Test set results
y_pred = classifier.predict(xtest)
In [13]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, y_pred)
print ("Confusion Matrix : \n", cm)

65 - true negatives: ytest = 0, y_pred = 0 (did not purchase, predicted as not purchasing)

24 - true positives: ytest = 1, y_pred = 1 (purchased, predicted as purchasing)

65 + 24 = 89 predictions done correctly out of 100 test samples = accuracy of 0.89

3 + 8 = 11 predictions went wrong

3 - false positives: ytest = 0, y_pred = 1

8 - false negatives: ytest = 1, y_pred = 0
Confusion Matrix :
 [[65  3]
 [ 8 24]]
In [20]:
65 - true negative: dataset - not purchased, algorithm - not purchased
24 - true positive: dataset - purchased, algorithm - purchased

65 + 24 = 89 correct predictions (y_test matches y_pred)

3 - false positives
8 - false negatives

3 + 8 = 11 predicted wrongly

89 correct out of 100 test samples
print("whole dataset ",X.shape)


print("train dataset ",xtrain.shape)
print("test dataset ",xtest.shape)
whole dataset (400, 2)
train dataset (300, 2)
test dataset (100, 2)
In [21]:
from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(ytest, y_pred))
Accuracy : 0.89
In [22]:
from sklearn.metrics import classification_report
print(classification_report(ytest, y_pred))
precision recall f1-score support

0 0.89 0.96 0.92 68


1 0.89 0.75 0.81 32

accuracy 0.89 100


macro avg 0.89 0.85 0.87 100
weighted avg 0.89 0.89 0.89 100
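The per-class numbers in this report follow directly from the confusion matrix printed earlier. An illustrative check for class 1 (purchased), using the values 65, 3, 8, 24 from above:

# Reproducing the class-1 row of the report from the confusion matrix values
# (illustrative check, not part of the original lab).
tn, fp, fn, tp = 65, 3, 8, 24
precision_1 = tp / (tp + fp)          # 24 / 27 ~ 0.89
recall_1    = tp / (tp + fn)          # 24 / 32 = 0.75
f1_1        = 2 * precision_1 * recall_1 / (precision_1 + recall_1)   # ~ 0.81
print(precision_1, recall_1, f1_1)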

In [ ]:
In [1]:
# load the iris dataset

from sklearn.datasets import load_iris


iris = load_iris()
In [2]:
# store the feature matrix (X) and response vector (y)
X = iris.data
y = iris.target

In [3]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
In [4]:
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

Out[4]:
GaussianNB()
In [5]:
# making predictions on the testing set
y_pred = gnb.predict(X_test)
In [6]:
# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)
Gaussian Naive Bayes model accuracy(in %): 95.0
In [7]:
import csv
import random
import math
In [8]:
def loadCsv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset
In [9]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]
In [10]:
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated
In [11]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))
In [12]:
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)
In [13]:
def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries
In [14]:
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries
In [15]:
def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent
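calculateProbability is just the Gaussian probability density function evaluated at x for the given mean and standard deviation. A quick cross-check against scipy (illustrative; the example numbers are arbitrary):

# Cross-check (illustrative, not part of the lab): the helper above should
# match scipy's Gaussian density for the same parameters.
from scipy.stats import norm
x, mu, sigma = 71.5, 73.0, 6.2          # arbitrary example values
print(calculateProbability(x, mu, sigma))
print(norm.pdf(x, loc=mu, scale=sigma))  # should print the same number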
In [16]:
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities
In [17]:
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel
In [18]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions
In [19]:
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0
In [20]:
def main():
    filename = 'data.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train = {1} and test = {2} rows'.format(len(dataset), len(trainingSet), len(testSet)))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))
main()
Split 871 rows into train = 583 and test = 288 rows
Accuracy: 20.48611111111111%
In [ ]:
In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
In [4]:
digits=load_digits()
np.shape(digits.data)

Out[4]:
(1797, 64)
In [5]:
dir(digits)

Out[5]:
['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']
In [6]:
x= digits.data
y= digits.target
In [7]:
plt.gray()
plt.matshow(digits.images[1700])
plt.show()
<Figure size 432x288 with 0 Axes>

In [8]:
print(digits.data[1700])
print(digits.target[1700])
[ 0. 0. 4. 9. 12. 16. 8. 0. 0. 0. 15. 15. 8. 8. 2. 0. 0. 4.
16. 11. 4. 1. 0. 0. 0. 8. 16. 16. 16. 14. 0. 0. 0. 0. 11. 9.
8. 16. 0. 0. 0. 0. 0. 0. 7. 16. 0. 0. 0. 0. 0. 8. 16. 12.
0. 0. 0. 0. 3. 13. 9. 1. 0. 0.]
5
In [9]:
for i in range(18,22):
    plt.matshow(digits.images[i])
In [10]:
for i in range(18,22):
    print(digits.target[i])
8
9
0
1
In [11]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
In [25]:
import warnings
warnings.filterwarnings('ignore')
model=LogisticRegression()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
print(y_pred)
[2 8 2 6 6 7 1 9 8 5 2 8 6 6 6 6 1 0 5 8 8 7 8 4 7 5 4 9 2 9 4 7 6 8 9 4 3
1 0 1 8 6 7 7 1 0 7 6 2 1 9 6 7 9 0 0 9 1 6 3 0 2 3 4 1 9 2 6 9 1 8 3 5 1
2 8 2 2 9 7 2 3 6 0 9 3 7 5 1 2 9 9 3 1 4 7 4 8 5 8 5 5 2 5 9 0 7 1 4 7 3
4 8 9 7 9 8 2 1 5 2 5 8 4 1 7 0 6 1 5 5 9 9 5 9 9 5 7 5 6 2 8 6 9 6 1 5 1
5 9 9 1 5 3 6 1 8 9 8 7 6 7 6 5 6 0 8 8 9 8 6 1 0 4 1 6 3 8 6 7 4 9 6 3 0
3 3 3 0 7 7 5 7 8 0 7 1 9 6 4 5 0 1 4 6 4 3 3 0 9 5 9 2 1 4 2 1 6 8 9 2 4
9 3 7 6 2 3 3 1 6 9 3 6 3 3 2 0 7 6 1 1 9 7 2 7 8 5 5 7 5 2 3 7 2 7 5 5 7
0 9 1 6 5 9 7 4 3 8 0 3 6 4 6 3 2 6 8 8 8 4 6 7 5 2 4 5 3 2 4 6 9 4 5 4 3
4 6 2 9 0 1 7 2 0 9 6 0 4 2 0 7 9 8 5 7 8 2 8 4 3 7 2 6 9 1 5 1 0 8 2 8 9
5 6 2 2 7 2 1 5 1 6 4 5 0 9 4 1 1 7 0 8 9 0 5 4 3 8 8]
In [26]:
np.shape(X_test)

Out[26]:
(360, 64)
In [27]:
np.shape(y_pred)

Out[27]:
(360,)
In [28]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))
0.9666666666666667
In [29]:
cm=confusion_matrix(y_test,y_pred)
print(cm)
[[27 0 0 0 0 0 0 0 0 0]
[ 0 34 0 0 0 0 0 0 1 0]
[ 0 0 35 1 0 0 0 0 0 0]
[ 0 0 0 29 0 0 0 0 0 0]
[ 0 0 0 0 29 0 0 1 0 0]
[ 0 0 0 0 0 37 0 0 0 3]
[ 0 1 0 0 0 0 43 0 0 0]
[ 0 0 0 0 1 0 0 38 0 0]
[ 0 2 1 0 0 0 0 0 36 0]
[ 0 0 0 0 0 1 0 0 0 40]]
In [30]:
import seaborn as sn
plt.figure(figsize=(10,7))
sn.heatmap(cm,annot=True)
plt.xlabel('predicted')
plt.ylabel('Actual')

Out[30]:
Text(69.0, 0.5, 'Actual')
In [ ]:
In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import mode
from sklearn.neighbors import KNeighborsClassifier
In [2]:
# K Nearest Neighbors Classification

class K_Nearest_Neighbors_Classifier() :

    def __init__( self, K ) :

        self.K = K

    # Function to store training set

    def fit( self, X_train, Y_train ) :

        self.X_train = X_train

        self.Y_train = Y_train

        # no_of_training_examples, no_of_features

        self.m, self.n = X_train.shape

    # Function for prediction

    def predict( self, X_test ) :

        self.X_test = X_test

        # no_of_test_examples, no_of_features

        self.m_test, self.n = X_test.shape

        # initialize Y_predict

        Y_predict = np.zeros( self.m_test )

        for i in range( self.m_test ) :

            x = self.X_test[i]

            # find the K nearest neighbors from current test example

            neighbors = np.zeros( self.K )

            neighbors = self.find_neighbors( x )

            # most frequent class in K neighbors

            Y_predict[i] = mode( neighbors )[0][0]

        return Y_predict

    # Function to find the K nearest neighbors to current test example

    def find_neighbors( self, x ) :

        # calculate all the euclidean distances between current
        # test example x and training set X_train

        euclidean_distances = np.zeros( self.m )

        for i in range( self.m ) :

            d = self.euclidean( x, self.X_train[i] )

            euclidean_distances[i] = d

        # sort Y_train according to euclidean_distance_array and
        # store into Y_train_sorted
        inds = euclidean_distances.argsort()

        Y_train_sorted = self.Y_train[inds]

        return Y_train_sorted[:self.K]

    # Function to calculate euclidean distance

    def euclidean( self, x, x_train ) :

        return np.sqrt( np.sum( np.square( x - x_train ) ) )
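As a small sanity check of the distance helper (illustrative only, not part of the lab), the euclidean method can be called directly on a hand-computable pair of points:

# Illustrative: distance between (0, 0) and (3, 4) should be 5.0
knn = K_Nearest_Neighbors_Classifier( K = 3 )
print( knn.euclidean( np.array([0, 0]), np.array([3, 4]) ) )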


In [3]:
# Driver code

def main() :

    # Importing dataset

    df = pd.read_csv( "diabetes.csv" )

    X = df.iloc[:,:-1].values

    Y = df.iloc[:,-1:].values

    # Splitting dataset into train and test set

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size = 1/3, random_state = 0 )

    # Model training

    model = K_Nearest_Neighbors_Classifier( K = 3 )

    model.fit( X_train, Y_train )

    model1 = KNeighborsClassifier( n_neighbors = 3 )

    model1.fit( X_train, Y_train )

    # Prediction on test set

    Y_pred = model.predict( X_test )

    Y_pred1 = model1.predict( X_test )

    # measure performance

    correctly_classified = 0

    correctly_classified1 = 0

    # counter

    count = 0

    for count in range( np.size( Y_pred ) ) :

        if Y_test[count] == Y_pred[count] :

            correctly_classified = correctly_classified + 1

        if Y_test[count] == Y_pred1[count] :

            correctly_classified1 = correctly_classified1 + 1

        count = count + 1

    print( "Accuracy on test set by our model : ", (
        correctly_classified / count ) * 100 )
    print( "Accuracy on test set by sklearn model : ", (
        correctly_classified1 / count ) * 100 )

if __name__ == "__main__" :
    main()
<ipython-input-3-f267789a957d>:26: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  model1.fit( X_train, Y_train )
Accuracy on test set by our model : 69.53125
Accuracy on test set by sklearn model : 69.53125
In [ ]:
In [1]:
#importing the packages

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [2]:
#reading the dataset using pandas

dataset=pd.read_csv("Iris.csv")
dataset

Out[2]:
SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species

0 5.1 3.5 1.4 0.2 Iris-setosa

1 4.9 3.0 1.4 0.2 Iris-setosa

2 4.7 3.2 1.3 0.2 Iris-setosa

3 4.6 3.1 1.5 0.2 Iris-setosa

4 5.0 3.6 1.4 0.2 Iris-setosa

... ... ... ... ... ...

145 6.7 3.0 5.2 2.3 Iris-virginica

146 6.3 2.5 5.0 1.9 Iris-virginica

147 6.5 3.0 5.2 2.0 Iris-virginica

148 6.2 3.4 5.4 2.3 Iris-virginica

149 5.9 3.0 5.1 1.8 Iris-virginica

150 rows × 5 columns

In [3]:
#Slice into independent(X) and dependent(y) variables

X=dataset.iloc[:,[0,1,2,3]].values
y=dataset.iloc[:,4].values
In [4]:
print(X)
[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]
[5.4 3.9 1.7 0.4]
[4.6 3.4 1.4 0.3]
[5. 3.4 1.5 0.2]
[4.4 2.9 1.4 0.2]
[4.9 3.1 1.5 0.1]
[5.4 3.7 1.5 0.2]
[4.8 3.4 1.6 0.2]
[4.8 3. 1.4 0.1]
[4.3 3. 1.1 0.1]
[5.8 4. 1.2 0.2]
[5.7 4.4 1.5 0.4]
[5.4 3.9 1.3 0.4]
[5.1 3.5 1.4 0.3]
[5.7 3.8 1.7 0.3]
[5.1 3.8 1.5 0.3]
[5.4 3.4 1.7 0.2]
[5.1 3.7 1.5 0.4]
[4.6 3.6 1. 0.2]
[5.1 3.3 1.7 0.5]
[4.8 3.4 1.9 0.2]
[5. 3. 1.6 0.2]
[5. 3.4 1.6 0.4]
[5.2 3.5 1.5 0.2]
[5.2 3.4 1.4 0.2]
[4.7 3.2 1.6 0.2]
[4.8 3.1 1.6 0.2]
[5.4 3.4 1.5 0.4]
[5.2 4.1 1.5 0.1]
[5.2 4.1 1.5 0.1]
[5.5 4.2 1.4 0.2]
[4.9 3.1 1.5 0.1]
[5. 3.2 1.2 0.2]
[5.5 3.5 1.3 0.2]
[4.9 3.1 1.5 0.1]
[4.4 3. 1.3 0.2]
[5.1 3.4 1.5 0.2]
[5. 3.5 1.3 0.3]
[4.5 2.3 1.3 0.3]
[4.4 3.2 1.3 0.2]
[5. 3.5 1.6 0.6]
[5.1 3.8 1.9 0.4]
[4.8 3. 1.4 0.3]
[5.1 3.8 1.6 0.2]
[4.6 3.2 1.4 0.2]
[5.3 3.7 1.5 0.2]
[5. 3.3 1.4 0.2]
[7. 3.2 4.7 1.4]
[6.4 3.2 4.5 1.5]
[6.9 3.1 4.9 1.5]
[5.5 2.3 4. 1.3]
[6.5 2.8 4.6 1.5]
[5.7 2.8 4.5 1.3]
[6.3 3.3 4.7 1.6]
[4.9 2.4 3.3 1. ]
[6.6 2.9 4.6 1.3]
[5.2 2.7 3.9 1.4]
[5. 2. 3.5 1. ]
[5.9 3. 4.2 1.5]
[6. 2.2 4. 1. ]
[6.1 2.9 4.7 1.4]
[5.6 2.9 3.6 1.3]
[6.7 3.1 4.4 1.4]
[5.6 3. 4.5 1.5]
[5.8 2.7 4.1 1. ]
[6.2 2.2 4.5 1.5]
[5.6 2.5 3.9 1.1]
[5.9 3.2 4.8 1.8]
[6.1 2.8 4. 1.3]
[6.3 2.5 4.9 1.5]
[6.1 2.8 4.7 1.2]
[6.4 2.9 4.3 1.3]
[6.6 3. 4.4 1.4]
[6.8 2.8 4.8 1.4]
[6.7 3. 5. 1.7]
[6. 2.9 4.5 1.5]
[5.7 2.6 3.5 1. ]
[5.5 2.4 3.8 1.1]
[5.5 2.4 3.7 1. ]
[5.8 2.7 3.9 1.2]
[6. 2.7 5.1 1.6]
[5.4 3. 4.5 1.5]
[6. 3.4 4.5 1.6]
[6.7 3.1 4.7 1.5]
[6.3 2.3 4.4 1.3]
[5.6 3. 4.1 1.3]
[5.5 2.5 4. 1.3]
[5.5 2.6 4.4 1.2]
[6.1 3. 4.6 1.4]
[5.8 2.6 4. 1.2]
[5. 2.3 3.3 1. ]
[5.6 2.7 4.2 1.3]
[5.7 3. 4.2 1.2]
[5.7 2.9 4.2 1.3]
[6.2 2.9 4.3 1.3]
[5.1 2.5 3. 1.1]
[5.7 2.8 4.1 1.3]
[6.3 3.3 6. 2.5]
[5.8 2.7 5.1 1.9]
[7.1 3. 5.9 2.1]
[6.3 2.9 5.6 1.8]
[6.5 3. 5.8 2.2]
[7.6 3. 6.6 2.1]
[4.9 2.5 4.5 1.7]
[7.3 2.9 6.3 1.8]
[6.7 2.5 5.8 1.8]
[7.2 3.6 6.1 2.5]
[7.2 3.6 6.1 2.5]
[6.5 3.2 5.1 2. ]
[6.4 2.7 5.3 1.9]
[6.8 3. 5.5 2.1]
[5.7 2.5 5. 2. ]
[5.8 2.8 5.1 2.4]
[6.4 3.2 5.3 2.3]
[6.5 3. 5.5 1.8]
[7.7 3.8 6.7 2.2]
[7.7 2.6 6.9 2.3]
[6. 2.2 5. 1.5]
[6.9 3.2 5.7 2.3]
[5.6 2.8 4.9 2. ]
[7.7 2.8 6.7 2. ]
[6.3 2.7 4.9 1.8]
[6.7 3.3 5.7 2.1]
[7.2 3.2 6. 1.8]
[6.2 2.8 4.8 1.8]
[6.1 3. 4.9 1.8]
[6.4 2.8 5.6 2.1]
[7.2 3. 5.8 1.6]
[7.4 2.8 6.1 1.9]
[7.9 3.8 6.4 2. ]
[6.4 2.8 5.6 2.2]
[6.3 2.8 5.1 1.5]
[6.1 2.6 5.6 1.4]
[7.7 3. 6.1 2.3]
[6.3 3.4 5.6 2.4]
[6.4 3.1 5.5 1.8]
[6. 3. 4.8 1.8]
[6.9 3.1 5.4 2.1]
[6.7 3.1 5.6 2.4]
[6.9 3.1 5.1 2.3]
[5.8 2.7 5.1 1.9]
[6.8 3.2 5.9 2.3]
[6.7 3.3 5.7 2.5]
[6.7 3. 5.2 2.3]
[6.3 2.5 5. 1.9]
[6.5 3. 5.2 2. ]
[6.2 3.4 5.4 2.3]
[5.9 3. 5.1 1.8]]
In [5]:
print(y)
['Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica']
In [6]:
#Splitting dataset into train and test

from sklearn.model_selection import train_test_split


xtrain,xtest,ytrain,ytest=train_test_split(X,y, test_size=0.25,random_state=0)
In [7]:
from sklearn.tree import DecisionTreeClassifier
In [8]:
clf = DecisionTreeClassifier(criterion='gini')
In [9]:
clf.fit(xtrain,ytrain)

Out[9]:
DecisionTreeClassifier()
In [10]:
print("whole dataset",X.shape)
print("train dataset",xtrain.shape)
print("test dataset",xtest.shape)
whole dataset (150, 4)
train dataset (112, 4)
test dataset (38, 4)
In [11]:
#Predicting on test dataset

y_pred = clf.predict(xtest)
In [12]:
print(y_pred)
['Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica'
'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa'
'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
'Iris-virginica' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'
'Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'
'Iris-setosa' 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor'
'Iris-setosa' 'Iris-virginica']
In [13]:
#Confusion matrix for gini

from sklearn.metrics import confusion_matrix


cm = confusion_matrix(ytest, y_pred)
print ("Confusion Matrix : \n", cm)
Confusion Matrix :
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]
In [14]:
from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(ytest, y_pred))
Accuracy : 0.9736842105263158
In [15]:
from sklearn.metrics import classification_report
print(classification_report(ytest, y_pred))
precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13


Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

In [16]:
#Entropy

clf_entropy= DecisionTreeClassifier(criterion='entropy')
clf_entropy.fit(xtrain,ytrain)

Out[16]:
DecisionTreeClassifier(criterion='entropy')
In [17]:
y_pred_entropy = clf_entropy.predict(xtest)
In [18]:
y_pred_entropy

Out[18]:
array(['Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
'Iris-virginica', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',
'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',
'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',
'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',
'Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
'Iris-virginica'], dtype=object)
In [19]:
#Confusion metrix for entropy

cm=confusion_matrix(ytest,y_pred)
print("confusion matrix:\n",cm)
confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]
In [20]:
print ("Accuracy : ", accuracy_score(ytest, y_pred))
Accuracy : 0.9736842105263158
In [21]:
print(classification_report(ytest, y_pred))
precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13


Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

In [22]:
#Gini

for i in range(2,6):
    clf_gini = DecisionTreeClassifier(criterion='gini', min_samples_split=i, min_samples_leaf=i)
    clf_gini.fit(xtrain, ytrain)
    y_pred = clf_gini.predict(xtest)   # predict with the newly fitted tree
    print("Evaluation for min_samples_split", i, "and min_samples_leaf", i)
    print("confusion matrix:\n", confusion_matrix(ytest, y_pred))
    print()
    print("Accuracy : ", accuracy_score(ytest, y_pred))
    print()
    print(classification_report(ytest, y_pred))
    print()
Evalution for min_sample_split 2 and min sample leaf 2
confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13


Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 3 and min sample leaf 3


confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13


Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 4 and min sample leaf 4


confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13


Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 5 and min sample leaf 5


confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13


Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38
In [23]:
#Entropy

for i in range(2,6):
    clf_entropy = DecisionTreeClassifier(criterion='entropy', min_samples_split=i, min_samples_leaf=i)
    clf_entropy.fit(xtrain, ytrain)
    y_pred = clf_entropy.predict(xtest)   # predict with the newly fitted tree
    print("Evaluation for min_samples_split", i, "and min_samples_leaf", i)
    print()
    print("confusion matrix:\n", confusion_matrix(ytest, y_pred))
    print()
    print("Accuracy : ", accuracy_score(ytest, y_pred))
    print()
    print(classification_report(ytest, y_pred))
Evalution for min_sample_split 2 and min sample leaf 2

confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13


Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 3 and min sample leaf 3

confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13


Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 4 and min sample leaf 4

confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13


Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 5 and min sample leaf 5

confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13


Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38
In [24]:
#Visualization

from sklearn import tree


fn=['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']
cn=['setosa', 'versicolor', 'virginica']
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
tree.plot_tree(clf,
feature_names = fn,
class_names=cn,
filled = True);

In [25]:
clf= DecisionTreeClassifier(criterion='entropy')
clf.fit(xtrain,ytrain)
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
tree.plot_tree(clf,
feature_names = fn,
class_names=cn,
filled = True);
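The fitted rules can also be inspected as plain text with sklearn's export_text; a short illustrative sketch using the same fitted tree (clf) and feature names (fn) defined above:

# Illustrative: print the learned decision rules as text
from sklearn.tree import export_text
print(export_text(clf, feature_names=fn))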
In [ ]:
#problem Statement Write a code for implementing XGBOOST Classifier
In [1]:
# importing required libraries
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
# read the train and test dataset
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')
# shape of the dataset
print('Shape of training data :',train_data.shape)
print('Shape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data
# target variable - Survived
Shape of training data : (792, 17)
Shape of testing data : (100, 17)
In [2]:
# separate the independent and target variable on training data
train_x = train_data.drop(columns=['Survived'],axis=1)
train_y = train_data['Survived']
# separate the independent and target variable on testing data
test_x = test_data.drop(columns=['Survived'],axis=1)
test_y = test_data['Survived']
In [3]:
'''
#Create the object of the XGBoost model

'''
model = XGBClassifier()
# fit the model with the training data
model.fit(train_x,train_y)
# predict the target on the train dataset
predict_train = model.predict(train_x)
print('\nTarget on train data',predict_train)
[13:42:03] WARNING: ..\src\learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric
used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set
eval_metric if you'd like to restore the old behavior.

Target on train data [0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1


0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0
1 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0
0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1
0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0
0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1
1 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0
0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0
0 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1
1 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0
1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1
1 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0
0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0 0 1 0
1 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 1 0 1 1 0 0 1 0 0 1 1 0 1 1 0 0 1 1
0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 0 0 1 0 1 0 0 0 1
0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0
0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
0 0 0 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 0 1 1 0
0 0 0 1 1 1 1 1 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0
1 0 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0
1 0 1 1 1 0 0 0 0 1 0 1 0 0 0]
C:\Users\Konduru Varshith\anaconda3\lib\site-packages\xgboost\sklearn.py:888: UserWarning: The use of
label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this
warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object;
and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
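Both warnings above state their own fix. A sketch of constructing the classifier with those options set explicitly (optional; the evaluation metric option and the label-encoder flag are taken directly from the warning text):

# Following the warning messages: set eval_metric explicitly and turn off the
# deprecated label encoder when creating the model.
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(train_x, train_y)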

In [4]:
# Accuracy Score on train dataset
accuracy_train = accuracy_score(train_y,predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)
# predict the target on the test dataset
predict_test = model.predict(test_x)
print('\nTarget on test data',predict_test)
# Accuracy Score on test dataset
accuracy_test = accuracy_score(test_y,predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)
accuracy_score on train dataset : 1.0

Target on test data [0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1


0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0
1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0]

accuracy_score on test dataset : 0.85


In [ ]:

In [1]:
#r18cs380
In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
In [3]:
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target
In [4]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.25, random_state = 0)
In [5]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain= sc_x.fit_transform(xtrain)
xtest = sc_x.transform(xtest)

print(xtrain[0:10,:])
[[ 0.01543995 -0.11925475]
[-0.09984503 -1.04039491]
[ 1.05300481 -0.11925475]
[-1.36797986 0.34131533]
[ 1.1682898 0.11103029]
[-1.0221249 1.03217045]
[-0.56098497 1.49274053]
[-1.0221249 -2.42210516]
[ 0.70714986 -0.11925475]
[ 0.93771983 0.57160037]]
In [6]:
C = 1.0 # SVM regularization parameter
svc = svm.SVC(kernel='linear', C=1,gamma='auto').fit(X, y)
In [7]:
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
h = (x_max / x_min)/100
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
In [8]:
plt.subplot(1, 1, 1)
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)

plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)


plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.title('SVC with linear kernel')
plt.show()

In [9]:
svc = svm.SVC(kernel='rbf', C=1,gamma=10).fit(X, y)
plt.subplot(1, 1, 1)
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.title('SVC with RBF kernel (gamma=10)')
plt.show()

In [10]:
svc = svm.SVC(kernel='rbf', C=1,gamma=100).fit(X, y)

plt.subplot(1, 1, 1)
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.title('SVC with RBF kernel (gamma=100)')
plt.show()

In [11]:
svc = svm.SVC(kernel='rbf', C=1,gamma='auto').fit(X, y)

plt.subplot(1, 1, 1)
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.title('SVC with RBF kernel (gamma=auto)')
plt.show()
In [12]:
svc = svm.SVC(kernel='rbf', C=100,gamma='auto').fit(X, y)

plt.subplot(1, 1, 1)
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.title('SVC with RBF kernel (C=100)')
plt.show()

In [13]:
svc = svm.SVC(kernel='rbf', C=1000,gamma='auto').fit(X, y)

plt.subplot(1, 1, 1)
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.title('SVC with RBF kernel (C=1000)')
plt.show()
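Note that the scaled train/test split prepared in cells 4-5 is never used above; every SVC is fit on the full X, y. A short illustrative sketch of scoring one configuration on the held-out split instead:

# Illustrative: train on the scaled training split and report accuracy on the
# held-out test split (the cells above fit on the full, unscaled X and y).
from sklearn.metrics import accuracy_score
svc_eval = svm.SVC(kernel='rbf', C=1, gamma='auto')
svc_eval.fit(xtrain, ytrain)
print(accuracy_score(ytest, svc_eval.predict(xtest)))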
In [ ]:
In [1]:
#multiple_linear_regression
#r18cs380
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('https://raw.githubusercontent.com/mk-gurucharan/Regression/master/Startups_Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)
[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
[1.0 0.0 0.0 162597.7 151377.59 443898.53]
[0.0 1.0 0.0 153441.51 101145.55 407934.54]
[0.0 0.0 1.0 144372.41 118671.85 383199.62]
[0.0 1.0 0.0 142107.34 91391.77 366168.42]
[0.0 0.0 1.0 131876.9 99814.71 362861.36]
[1.0 0.0 0.0 134615.46 147198.87 127716.82]
[0.0 1.0 0.0 130298.13 145530.06 323876.68]
[0.0 0.0 1.0 120542.52 148718.95 311613.29]
[1.0 0.0 0.0 123334.88 108679.17 304981.62]
[0.0 1.0 0.0 101913.08 110594.11 229160.95]
[1.0 0.0 0.0 100671.96 91790.61 249744.55]
[0.0 1.0 0.0 93863.75 127320.38 249839.44]
[1.0 0.0 0.0 91992.39 135495.07 252664.93]
[0.0 1.0 0.0 119943.24 156547.42 256512.92]
[0.0 0.0 1.0 114523.61 122616.84 261776.23]
[1.0 0.0 0.0 78013.11 121597.55 264346.06]
[0.0 0.0 1.0 94657.16 145077.58 282574.31]
[0.0 1.0 0.0 91749.16 114175.79 294919.57]
[0.0 0.0 1.0 86419.7 153514.11 0.0]
[1.0 0.0 0.0 76253.86 113867.3 298664.47]
[0.0 0.0 1.0 78389.47 153773.43 299737.29]
[0.0 1.0 0.0 73994.56 122782.75 303319.26]
[0.0 1.0 0.0 67532.53 105751.03 304768.73]
[0.0 0.0 1.0 77044.01 99281.34 140574.81]
[1.0 0.0 0.0 64664.71 139553.16 137962.62]
[0.0 1.0 0.0 75328.87 144135.98 134050.07]
[0.0 0.0 1.0 72107.6 127864.55 353183.81]
[0.0 1.0 0.0 66051.52 182645.56 118148.2]
[0.0 0.0 1.0 65605.48 153032.06 107138.38]
[0.0 1.0 0.0 61994.48 115641.28 91131.24]
[0.0 0.0 1.0 61136.38 152701.92 88218.23]
[1.0 0.0 0.0 63408.86 129219.61 46085.25]
[0.0 1.0 0.0 55493.95 103057.49 214634.81]
[1.0 0.0 0.0 46426.07 157693.92 210797.67]
[0.0 0.0 1.0 46014.02 85047.44 205517.64]
[0.0 1.0 0.0 28663.76 127056.21 201126.82]
[1.0 0.0 0.0 44069.95 51283.14 197029.42]
[0.0 0.0 1.0 20229.59 65947.93 185265.1]
[1.0 0.0 0.0 38558.51 82982.09 174999.3]
[1.0 0.0 0.0 28754.33 118546.05 172795.67]
[0.0 1.0 0.0 27892.92 84710.77 164470.71]
[1.0 0.0 0.0 23640.93 96189.63 148001.11]
[0.0 0.0 1.0 15505.73 127382.3 35534.17]
[1.0 0.0 0.0 22177.74 154806.14 28334.72]
[0.0 0.0 1.0 1000.23 124153.04 1903.93]
[0.0 1.0 0.0 1315.46 115816.21 297114.46]
[1.0 0.0 0.0 0.0 135426.92 0.0]
[0.0 0.0 1.0 542.05 51743.15 0.0]
[1.0 0.0 0.0 0.0 116983.8 45173.06]]
In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
LinearRegression()
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred})
df
Out[3]:
Real Values Predicted Values

0 103282.38 102613.307260

1 96712.80 88969.720298

2 81229.06 68377.662115

3 97427.84 98845.899455

4 105008.31 114393.773014

5 125370.37 130723.412943

6 156122.51 158922.404934

7 97483.56 98117.402057

8 71498.49 69483.670171

9 96778.92 97551.005854

In [4]:
from sklearn.metrics import r2_score
score=r2_score(y_test,y_pred)
score

Out[4]:
0.9275800339704457
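This score follows the usual definition R² = 1 - SS_res / SS_tot; a quick manual check with numpy (illustrative, should reproduce the value above):

# Manual R-squared computation from the test targets and predictions
import numpy as np
ss_res = np.sum((y_test - y_pred) ** 2)
ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)
print(1 - ss_res / ss_tot)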
In [ ]:

