0% found this document useful (0 votes)

10 views

Mlda - Lab

MLDA LAB

Uploaded by

tarunsanka15

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

10 views

Mlda - Lab

MLDA LAB

Uploaded by

tarunsanka15

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 35

In [1]:

# Simple Linear Regression

# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [2]:
# Importing the dataset
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values
In [3]:
X

Out[3]:
array([[ 1.1],
[ 1.3],
[ 1.5],
[ 2. ],
[ 2.2],
[ 2.9],
[ 3. ],
[ 3.2],
[ 3.2],
[ 3.7],
[ 3.9],
[ 4. ],
[ 4. ],
[ 4.1],
[ 4.5],
[ 4.9],
[ 5.1],
[ 5.3],
[ 5.9],
[ 6. ],
[ 6.8],
[ 7.1],
[ 7.9],
[ 8.2],
[ 8.7],
[ 9. ],
[ 9.5],
[ 9.6],
[10.3],
[10.5]])
In [4]:
y

Out[4]:
array([ 39343, 46205, 37731, 43525, 39891, 56642, 60150, 54445,
64445, 57189, 63218, 55794, 56957, 57081, 61111, 67938,
66029, 83088, 81363, 93940, 91738, 98273, 101302, 113812,
109431, 105582, 116969, 112635, 122391, 121872], dtype=int64)
In [5]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0)
In [6]:
# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

Out[6]:
'from sklearn.preprocessing import StandardScaler\nsc_X = StandardScaler()\nX_train =
sc_X.fit_transform(X_train)\nX_test = sc_X.transform(X_test)\nsc_y = StandardScaler()\ny_train =
sc_y.fit_transform(y_train)'
In [7]:
# Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

Out[7]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [8]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)
In [9]:
# Visualising the Training set results
plt.scatter(X_train, y_train, color = 'red')
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

In [10]:
# Visualising the Test dataset results with y_test
plt.scatter(X_test, y_test, color = 'red')
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

In [11]:
# Visualising the Test dataset results with y_predict
plt.scatter(X_test, y_pred, color = 'red')
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()
In [14]:
print(regressor.intercept_)
26816.192244031176
In [15]:
print(regressor.coef_)
[9345.94244312]
In [17]:
from sklearn import metrics
print('Mean Absolute error:',metrics.mean_absolute_error(y_test,y_pred))
print('Mean Squared error:',metrics.mean_squared_error(y_test,y_pred))
print('Root Absolute error:',metrics.mean_squared_error(y_test,y_pred))
Mean Absolute error: 3426.42693743071
Mean Squared error: 21026037.329511303
Root Absolute error: 21026037.329511303
In [ ]:
In [3]:
# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

Out[3]:
'from sklearn.preprocessing import StandardScaler\nsc_X = StandardScaler()\nX_train =
sc_X.fit_transform(X_train)\nX_test = sc_X.transform(X_test)\nsc_y = StandardScaler()\ny_train =
sc_y.fit_transform(y_train)'
In [4]:
# Using the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
kmeans.fit(X)
wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [5]:
from sklearn.cluster import KMeans
# Fitting K-Means to the dataset
kmeans = KMeans(n_clusters = 6, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(X)
y_kmeans

Out[5]:
array([4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 0,
4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 1, 5, 0, 5, 1, 5, 1, 5,
0, 5, 1, 5, 1, 5, 1, 5, 1, 5, 0, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5,
1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5,
1, 5, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
1, 2])
In [6]:
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroi
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
In [7]:
# Visualising the clusters
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 100, c = 'red', label = 'Cluster 1')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 100, c = 'blue', label = 'Cluster 2')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Cluster 3')
plt.scatter(X[y_kmeans == 3, 0], X[y_kmeans == 3, 1], s = 100, c = 'cyan', label = 'Cluster 4')
plt.scatter(X[y_kmeans == 4, 0], X[y_kmeans == 4, 1], s = 100, c = 'magenta', label = 'Cluster 5')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroi
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

In [ ]:
In [ ]:
1. importing the packages - numpy,matplotlib,pandas, related to algorithm
2. read the file using pandas
3. slice into dependent and independent variables
4. split dataset into train and test - xtrain,xtest, ytrain, ytest
5. standarisation / normalisation - xtrain, then on xtest
6. algorithm implement on training dataset (training the model) - xtrain, ytrain
7. predict on test dataset by using ur object of train model -xtest - y_pred
8. Evaluation metrics - confusion matrix, accuracy score, precision,f-score, recall
9. visualisation
10. Tuning of hyperparameters
In [6]:
# Logistic Regression

# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [7]:
# Importing the dataset User ID,Gender,Age,EstimatedSalary,Purchased

#Now, to predict whether a user will purchase the product or not,

#one needs to find out the relationship between Age and Estimated Salary.
#Here User ID and Gender are not important factors for finding out this.

dataset = pd.read_csv('Product_purchase.csv')
X = dataset.iloc[:, [2, 3]].values #input
y = dataset.iloc[:, 4].values #output
In [8]:
# Splitting the dataset into the Training set and Test set
#Splitting the dataset to train and test. 75% of data is used for training the model and
#25% of it is used to test the performance of our model.

from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.25, random_state = 0)
In [9]:
# Feature Scaling
#Now, it is very important to perform feature scaling here because
#Age and Estimated Salary values lie in different ranges.
#If we don’t scale the features then Estimated Salary feature will dominate Age feature
#when the model finds the nearest neighbor to a data point in data space.

from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()
xtrain = sc_x.fit_transform(xtrain)
xtest = sc_x.transform(xtest)

print (xtrain[0:10, :])

[[ 0.58164944 -0.88670699]
[-0.60673761 1.46173768]
[-0.01254409 -0.5677824 ]
[-0.60673761 1.89663484]
[ 1.37390747 -1.40858358]
[ 1.47293972 0.99784738]
[ 0.08648817 -0.79972756]
[-0.01254409 -0.24885782]
[-0.21060859 -0.5677824 ]
[-0.21060859 -0.19087153]]
In [10]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(xtrain, ytrain)

Out[10]:
LogisticRegression(random_state=0)
In [12]:
# Predicting the Test set results
y_pred = classifier.predict(xtest)
In [13]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, y_pred)
print ("Confusion Matrix : \n", cm)

65 - True Positive = ytest =1, y_pred = 1

24 - False Positive = ytest= 0, y_pred = 0

65+24 = prediction done correctly = accuracy = 89 out of testdata 100 =0.89

3+8 = predcition went wrong

3 = ytest =1, yPred =0

8 = ytest=0, ypred=1
Confusion Matrix :
[[65 3]
[ 8 24]]
In [20]:
65 - True positive - dataset - purchase , algorithm - purchase
24 - True negative - dataset - not purchase, algorithm - not purchase

65+24 = correct prediction y_test, y_pred

3- False
8 - False

3+8 - predicted wrongly

100 - 89

print("whole dataset ",X.shape)

print("train dataset ",xtrain.shape)
print("test dataset ",xtest.shape)
whole dataset (400, 2)
train dataset (300, 2)
test dataset (100, 2)
In [21]:
from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(ytest, y_pred))
Accuracy : 0.89
In [22]:
from sklearn.metrics import classification_report
print(classification_report(ytest, y_pred))
precision recall f1-score support

0 0.89 0.96 0.92 68

1 0.89 0.75 0.81 32

accuracy 0.89 100

macro avg 0.89 0.85 0.87 100
weighted avg 0.89 0.89 0.89 100

In [ ]:
In [1]:
# load the iris dataset

from sklearn.datasets import load_iris

iris = load_iris()
In [2]:
# store the feature matrix (X) and response vector (y)
X = iris.data
y = iris.target

In [3]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
In [4]:
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

Out[4]:
GaussianNB()
In [5]:
# making predictions on the testing set
y_pred = gnb.predict(X_test)
In [6]:
# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)
Gaussian Naive Bayes model accuracy(in %): 95.0
In [7]:
import csv
import random
import math
In [8]:
def loadCsv(filename):
lines = csv.reader(open(filename, "r"))
dataset = list(lines)
for i in range(len(dataset)):
dataset[i] = [float(x) for x in dataset[i]]
return dataset
In [9]:
def splitDataset(dataset, splitRatio):
trainSize = int(len(dataset) * splitRatio)
trainSet = []
copy = list(dataset)
while len(trainSet) < trainSize:
index = random.randrange(len(copy))
trainSet.append(copy.pop(index))
return [trainSet, copy]
In [10]:
def separateByClass(dataset):
separated = {}
for i in range(len(dataset)):
vector = dataset[i]
if (vector[-1] not in separated):
separated[vector[-1]] = []
separated[vector[-1]].append(vector)
return separated
In [11]:
def mean(numbers):
return sum(numbers)/float(len(numbers))
In [12]:
def stdev(numbers):
avg = mean(numbers)
variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
return math.sqrt(variance)
In [13]:
def summarize(dataset):
summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
del summaries[-1]
return summaries
In [14]:
def summarizeByClass(dataset):
separated = separateByClass(dataset)
summaries = {}
for classValue, instances in separated.items():
summaries[classValue] = summarize(instances)
return summaries
In [15]:
def calculateProbability(x, mean, stdev):
exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent
In [16]:
def calculateClassProbabilities(summaries, inputVector):
probabilities = {}
for classValue, classSummaries in summaries.items():
probabilities[classValue] = 1
for i in range(len(classSummaries)):
mean, stdev = classSummaries[i]
x = inputVector[i]
probabilities[classValue] *= calculateProbability(x, mean, stdev)
return probabilities
In [17]:
def predict(summaries, inputVector):
probabilities = calculateClassProbabilities(summaries, inputVector)
bestLabel, bestProb = None, -1
for classValue, probability in probabilities.items():
if bestLabel is None or probability > bestProb:
bestProb = probability
bestLabel = classValue
return bestLabel
In [18]:
def getPredictions(summaries, testSet):
predictions = []
for i in range(len(testSet)):
result = predict(summaries, testSet[i])
predictions.append(result)
return predictions
In [19]:
def getAccuracy(testSet, predictions):
correct = 0
for i in range(len(testSet)):
if testSet[i][-1] == predictions[i]:
correct += 1
return (correct/float(len(testSet))) * 100.0
In [20]:
def main():
filename = 'data.csv'
splitRatio = 0.67
dataset = loadCsv(filename)
trainingSet, testSet = splitDataset(dataset, splitRatio)
print('Split {0} rows into train = {1} and test = {2} rows'.format(len(dataset),len(trainingSet),len(
#prepare model
summaries = summarizeByClass(trainingSet)
#test model
predictions = getPredictions(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: {0}%'.format(accuracy))
main()
Split 871 rows into train = 583 and test = 288 rows
Accuracy: 20.48611111111111%
In [ ]:
In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
In [4]:
digits=load_digits()
np.shape(digits.data)

Out[4]:
(1797, 64)
In [5]:
dir(digits)

Out[5]:
['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']
In [6]:
x= digits.data
y= digits.target
In [7]:
plt.gray()
plt.matshow(digits.images[1700])
plt.show()
<Figure size 432x288 with 0 Axes>

In [8]:
print(digits.data[1700])
print(digits.target[1700])
[ 0. 0. 4. 9. 12. 16. 8. 0. 0. 0. 15. 15. 8. 8. 2. 0. 0. 4.
16. 11. 4. 1. 0. 0. 0. 8. 16. 16. 16. 14. 0. 0. 0. 0. 11. 9.
8. 16. 0. 0. 0. 0. 0. 0. 7. 16. 0. 0. 0. 0. 0. 8. 16. 12.
0. 0. 0. 0. 3. 13. 9. 1. 0. 0.]
5
In [9]:
for i in range(18,22):
plt.matshow(digits.images[i])
In [10]:
for i in range(18,22):
print(digits.target[i])
8
9
0
1
In [11]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
In [25]:
import warnings
warnings.filterwarnings('ignore')
model=LogisticRegression()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
print(y_pred)
[2 8 2 6 6 7 1 9 8 5 2 8 6 6 6 6 1 0 5 8 8 7 8 4 7 5 4 9 2 9 4 7 6 8 9 4 3
1 0 1 8 6 7 7 1 0 7 6 2 1 9 6 7 9 0 0 9 1 6 3 0 2 3 4 1 9 2 6 9 1 8 3 5 1
2 8 2 2 9 7 2 3 6 0 9 3 7 5 1 2 9 9 3 1 4 7 4 8 5 8 5 5 2 5 9 0 7 1 4 7 3
4 8 9 7 9 8 2 1 5 2 5 8 4 1 7 0 6 1 5 5 9 9 5 9 9 5 7 5 6 2 8 6 9 6 1 5 1
5 9 9 1 5 3 6 1 8 9 8 7 6 7 6 5 6 0 8 8 9 8 6 1 0 4 1 6 3 8 6 7 4 9 6 3 0
3 3 3 0 7 7 5 7 8 0 7 1 9 6 4 5 0 1 4 6 4 3 3 0 9 5 9 2 1 4 2 1 6 8 9 2 4
9 3 7 6 2 3 3 1 6 9 3 6 3 3 2 0 7 6 1 1 9 7 2 7 8 5 5 7 5 2 3 7 2 7 5 5 7
0 9 1 6 5 9 7 4 3 8 0 3 6 4 6 3 2 6 8 8 8 4 6 7 5 2 4 5 3 2 4 6 9 4 5 4 3
4 6 2 9 0 1 7 2 0 9 6 0 4 2 0 7 9 8 5 7 8 2 8 4 3 7 2 6 9 1 5 1 0 8 2 8 9
5 6 2 2 7 2 1 5 1 6 4 5 0 9 4 1 1 7 0 8 9 0 5 4 3 8 8]
In [26]:
np.shape(X_test)

Out[26]:
(360, 64)
In [27]:
np.shape(y_pred)

Out[27]:
(360,)
In [28]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))
0.9666666666666667
In [29]:
cm=confusion_matrix(y_test,y_pred)
print(cm)
[[27 0 0 0 0 0 0 0 0 0]
[ 0 34 0 0 0 0 0 0 1 0]
[ 0 0 35 1 0 0 0 0 0 0]
[ 0 0 0 29 0 0 0 0 0 0]
[ 0 0 0 0 29 0 0 1 0 0]
[ 0 0 0 0 0 37 0 0 0 3]
[ 0 1 0 0 0 0 43 0 0 0]
[ 0 0 0 0 1 0 0 38 0 0]
[ 0 2 1 0 0 0 0 0 36 0]
[ 0 0 0 0 0 1 0 0 0 40]]
In [30]:
import seaborn as sn
plt.figure(figsize=(10,7))
sn.heatmap(cm,annot=True)
plt.xlabel('predicted')
plt.ylabel('Actual')

Out[30]:
Text(69.0, 0.5, 'Actual')
In [ ]:
In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import mode
from sklearn.neighbors import KNeighborsClassifier
In [2]:
# K Nearest Neighbors Classification

class K_Nearest_Neighbors_Classifier() :

def init( self, K ) :

self.K = K

# Function to store training set

def fit( self, X_train, Y_train ) :

self.X_train = X_train

self.Y_train = Y_train

# no_of_training_examples, no_of_features

self.m, self.n = X_train.shape

# Function for prediction

def predict( self, X_test ) :

self.X_test = X_test

# no_of_test_examples, no_of_features

self.m_test, self.n = X_test.shape

# initialize Y_predict

Y_predict = np.zeros( self.m_test )

for i in range( self.m_test ) :

x = self.X_test[i]

# find the K nearest neighbors from current test example

neighbors = np.zeros( self.K )

neighbors = self.find_neighbors( x )

# most frequent class in K neighbors

Y_predict[i] = mode( neighbors )[0][0]

return Y_predict

# Function to find the K nearest neighbors to current test example

def find_neighbors( self, x ) :

# calculate all the euclidean distances between current

# test example x and training set X_train

euclidean_distances = np.zeros( self.m )

for i in range( self.m ) :

d = self.euclidean( x, self.X_train[i] )

euclidean_distances[i] = d

# sort Y_train according to euclidean_distance_array and

# store into Y_train_sorted
inds = euclidean_distances.argsort()

Y_train_sorted = self.Y_train[inds]

return Y_train_sorted[:self.K]

# Function to calculate euclidean distance

def euclidean( self, x, x_train ) :

return np.sqrt( np.sum( np.square( x - x_train ) ) )

In [3]:
# Driver code

def main() :

# Importing dataset

df = pd.read_csv( "diabetes.csv" )

X = df.iloc[:,:-1].values

Y = df.iloc[:,-1:].values

# Splitting dataset into train and test set

X_train, X_test, Y_train, Y_test = train_test_split(

X, Y, test_size = 1/3, random_state = 0 )

# Model training

model = K_Nearest_Neighbors_Classifier( K = 3 )

model.fit( X_train, Y_train )

model1 = KNeighborsClassifier( n_neighbors = 3 )

model1.fit( X_train, Y_train )

# Prediction on test set

Y_pred = model.predict( X_test )

Y_pred1 = model1.predict( X_test )

# measure performance

correctly_classified = 0

correctly_classified1 = 0

# counter

count = 0

for count in range( np.size( Y_pred ) ) :

if Y_test[count] == Y_pred[count] :

correctly_classified = correctly_classified + 1

if Y_test[count] == Y_pred1[count] :

correctly_classified1 = correctly_classified1 + 1

count = count + 1

print( "Accuracy on test set by our model : ", (

correctly_classified / count ) * 100 )
print( "Accuracy on test set by sklearn model : ", (
correctly_classified1 / count ) * 100 )

if __name__ == "__main__" :
main()
<ipython-input-3-f267789a957d>:26: DataConversionWarning: A column-vector y was passed when a 1d array w
as expected. Please change the shape of y to (n_samples, ), for example using ravel().
model1.fit( X_train, Y_train )
Accuracy on test set by our model : 69.53125
Accuracy on test set by sklearn model : 69.53125
In [ ]:
In [1]:
#importing the packages

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [2]:
#reading the dataset using pandas

dataset=pd.read_csv("Iris.csv")
dataset

Out[2]:
SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species

0 5.1 3.5 1.4 0.2 Iris-setosa

1 4.9 3.0 1.4 0.2 Iris-setosa

2 4.7 3.2 1.3 0.2 Iris-setosa

3 4.6 3.1 1.5 0.2 Iris-setosa

4 5.0 3.6 1.4 0.2 Iris-setosa

... ... ... ... ... ...

145 6.7 3.0 5.2 2.3 Iris-virginica

146 6.3 2.5 5.0 1.9 Iris-virginica

147 6.5 3.0 5.2 2.0 Iris-virginica

148 6.2 3.4 5.4 2.3 Iris-virginica

149 5.9 3.0 5.1 1.8 Iris-virginica

150 rows × 5 columns

In [3]:
#Slice into independent(X) and dependent(y) variables

X=dataset.iloc[:,[0,1,2,3]].values
y=dataset.iloc[:,4].values
In [4]:
print(X)
[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]
[5.4 3.9 1.7 0.4]
[4.6 3.4 1.4 0.3]
[5. 3.4 1.5 0.2]
[4.4 2.9 1.4 0.2]
[4.9 3.1 1.5 0.1]
[5.4 3.7 1.5 0.2]
[4.8 3.4 1.6 0.2]
[4.8 3. 1.4 0.1]
[4.3 3. 1.1 0.1]
[5.8 4. 1.2 0.2]
[5.7 4.4 1.5 0.4]
[5.4 3.9 1.3 0.4]
[5.1 3.5 1.4 0.3]
[5.7 3.8 1.7 0.3]
[5.1 3.8 1.5 0.3]
[5.4 3.4 1.7 0.2]
[5.1 3.7 1.5 0.4]
[4.6 3.6 1. 0.2]
[5.1 3.3 1.7 0.5]
[4.8 3.4 1.9 0.2]
[5. 3. 1.6 0.2]
[5. 3.4 1.6 0.4]
[5.2 3.5 1.5 0.2]
[5.2 3.4 1.4 0.2]
[4.7 3.2 1.6 0.2]
[4.8 3.1 1.6 0.2]
[5.4 3.4 1.5 0.4]
[5.2 4.1 1.5 0.1]
[5.2 4.1 1.5 0.1]
[5.5 4.2 1.4 0.2]
[4.9 3.1 1.5 0.1]
[5. 3.2 1.2 0.2]
[5.5 3.5 1.3 0.2]
[4.9 3.1 1.5 0.1]
[4.4 3. 1.3 0.2]
[5.1 3.4 1.5 0.2]
[5. 3.5 1.3 0.3]
[4.5 2.3 1.3 0.3]
[4.4 3.2 1.3 0.2]
[5. 3.5 1.6 0.6]
[5.1 3.8 1.9 0.4]
[4.8 3. 1.4 0.3]
[5.1 3.8 1.6 0.2]
[4.6 3.2 1.4 0.2]
[5.3 3.7 1.5 0.2]
[5. 3.3 1.4 0.2]
[7. 3.2 4.7 1.4]
[6.4 3.2 4.5 1.5]
[6.9 3.1 4.9 1.5]
[5.5 2.3 4. 1.3]
[6.5 2.8 4.6 1.5]
[5.7 2.8 4.5 1.3]
[6.3 3.3 4.7 1.6]
[4.9 2.4 3.3 1. ]
[6.6 2.9 4.6 1.3]
[5.2 2.7 3.9 1.4]
[5. 2. 3.5 1. ]
[5.9 3. 4.2 1.5]
[6. 2.2 4. 1. ]
[6.1 2.9 4.7 1.4]
[5.6 2.9 3.6 1.3]
[6.7 3.1 4.4 1.4]
[5.6 3. 4.5 1.5]
[5.8 2.7 4.1 1. ]
[6.2 2.2 4.5 1.5]
[5.6 2.5 3.9 1.1]
[5.9 3.2 4.8 1.8]
[6.1 2.8 4. 1.3]
[6.3 2.5 4.9 1.5]
[6.1 2.8 4.7 1.2]
[6.4 2.9 4.3 1.3]
[6.6 3. 4.4 1.4]
[6.8 2.8 4.8 1.4]
[6.7 3. 5. 1.7]
[6. 2.9 4.5 1.5]
[5.7 2.6 3.5 1. ]
[5.5 2.4 3.8 1.1]
[5.5 2.4 3.7 1. ]
[5.8 2.7 3.9 1.2]
[6. 2.7 5.1 1.6]
[5.4 3. 4.5 1.5]
[6. 3.4 4.5 1.6]
[6.7 3.1 4.7 1.5]
[6.3 2.3 4.4 1.3]
[5.6 3. 4.1 1.3]
[5.5 2.5 4. 1.3]
[5.5 2.6 4.4 1.2]
[6.1 3. 4.6 1.4]
[5.8 2.6 4. 1.2]
[5. 2.3 3.3 1. ]
[5.6 2.7 4.2 1.3]
[5.7 3. 4.2 1.2]
[5.7 2.9 4.2 1.3]
[6.2 2.9 4.3 1.3]
[5.1 2.5 3. 1.1]
[5.7 2.8 4.1 1.3]
[6.3 3.3 6. 2.5]
[5.8 2.7 5.1 1.9]
[7.1 3. 5.9 2.1]
[6.3 2.9 5.6 1.8]
[6.5 3. 5.8 2.2]
[7.6 3. 6.6 2.1]
[4.9 2.5 4.5 1.7]
[7.3 2.9 6.3 1.8]
[6.7 2.5 5.8 1.8]
[7.2 3.6 6.1 2.5]
[7.2 3.6 6.1 2.5]
[6.5 3.2 5.1 2. ]
[6.4 2.7 5.3 1.9]
[6.8 3. 5.5 2.1]
[5.7 2.5 5. 2. ]
[5.8 2.8 5.1 2.4]
[6.4 3.2 5.3 2.3]
[6.5 3. 5.5 1.8]
[7.7 3.8 6.7 2.2]
[7.7 2.6 6.9 2.3]
[6. 2.2 5. 1.5]
[6.9 3.2 5.7 2.3]
[5.6 2.8 4.9 2. ]
[7.7 2.8 6.7 2. ]
[6.3 2.7 4.9 1.8]
[6.7 3.3 5.7 2.1]
[7.2 3.2 6. 1.8]
[6.2 2.8 4.8 1.8]
[6.1 3. 4.9 1.8]
[6.4 2.8 5.6 2.1]
[7.2 3. 5.8 1.6]
[7.4 2.8 6.1 1.9]
[7.9 3.8 6.4 2. ]
[6.4 2.8 5.6 2.2]
[6.3 2.8 5.1 1.5]
[6.1 2.6 5.6 1.4]
[7.7 3. 6.1 2.3]
[6.3 3.4 5.6 2.4]
[6.4 3.1 5.5 1.8]
[6. 3. 4.8 1.8]
[6.9 3.1 5.4 2.1]
[6.7 3.1 5.6 2.4]
[6.9 3.1 5.1 2.3]
[5.8 2.7 5.1 1.9]
[6.8 3.2 5.9 2.3]
[6.7 3.3 5.7 2.5]
[6.7 3. 5.2 2.3]
[6.3 2.5 5. 1.9]
[6.5 3. 5.2 2. ]
[6.2 3.4 5.4 2.3]
[5.9 3. 5.1 1.8]]
In [5]:
print(y)
['Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica']
In [6]:
#Splitting dataset into train and test

from sklearn.model_selection import train_test_split

xtrain,xtest,ytrain,ytest=train_test_split(X,y, test_size=0.25,random_state=0)
In [7]:
from sklearn.tree import DecisionTreeClassifier
In [8]:
clf = DecisionTreeClassifier(criterion='gini')
In [9]:
clf.fit(xtrain,ytrain)

Out[9]:
DecisionTreeClassifier()
In [10]:
print("whole dataset",X.shape)
print("train dataset",xtrain.shape)
print("test dataset",xtest.shape)
whole dataset (150, 4)
train dataset (112, 4)
test dataset (38, 4)
In [11]:
#Predicting on test dataset

y_pred = clf.predict(xtest)
In [12]:
print(y_pred)
['Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica'
'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa'
'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
'Iris-virginica' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'
'Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'
'Iris-setosa' 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor'
'Iris-setosa' 'Iris-virginica']
In [13]:
#Confusion matrix for gini

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(ytest, y_pred)
print ("Confusion Matrix : \n", cm)
Confusion Matrix :
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]
In [14]:
from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(ytest, y_pred))
Accuracy : 0.9736842105263158
In [15]:
from sklearn.metrics import classification_report
print(classification_report(ytest, y_pred))
precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

In [16]:
#Entropy

clf_entropy= DecisionTreeClassifier(criterion='entropy')
clf_entropy.fit(xtrain,ytrain)

Out[16]:
DecisionTreeClassifier(criterion='entropy')
In [17]:
y_pred_entropy = clf_entropy.predict(xtest)
In [18]:
y_pred_entropy

Out[18]:
array(['Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
'Iris-virginica', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',
'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',
'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',
'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',
'Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
'Iris-virginica'], dtype=object)
In [19]:
#Confusion metrix for entropy

cm=confusion_matrix(ytest,y_pred)
print("confusion matrix:\n",cm)
confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]
In [20]:
print ("Accuracy : ", accuracy_score(ytest, y_pred))
Accuracy : 0.9736842105263158
In [21]:
print(classification_report(ytest, y_pred))
precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

In [22]:
#Gini

for i in range(2,6):
clf_entropy= DecisionTreeClassifier(criterion='gini',min_samples_split=i,min_samples_leaf=i)
clf_entropy.fit(xtrain,ytrain)
print("Evalution for min_sample_split",i,"and min sample leaf",i)
print("confusion matrix:\n",confusion_matrix(ytest,y_pred))
print()
print ("Accuracy : ", accuracy_score(ytest, y_pred))
print()
print(classification_report(ytest, y_pred))
print()
Evalution for min_sample_split 2 and min sample leaf 2
confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 3 and min sample leaf 3

confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 4 and min sample leaf 4

confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 5 and min sample leaf 5

confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38
In [23]:
#Entropy

for i in range(2,6):
clf_entropy= DecisionTreeClassifier(criterion='entropy',min_samples_split=i,min_samples_leaf=i)
clf_entropy.fit(xtrain,ytrain)
print("Evalution for min_sample_split",i,"and min sample leaf",i)
print()
print("confusion matrix:\n",confusion_matrix(ytest,y_pred))
print()
print ("Accuracy : ", accuracy_score(ytest, y_pred))
print()
print(classification_report(ytest, y_pred))
Evalution for min_sample_split 2 and min sample leaf 2

confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 3 and min sample leaf 3

confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 4 and min sample leaf 4

confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38

Evalution for min_sample_split 5 and min sample leaf 5

confusion matrix:
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]

Accuracy : 0.9736842105263158

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9

accuracy 0.97 38
macro avg 0.97 0.98 0.97 38
weighted avg 0.98 0.97 0.97 38
In [24]:
#Visualization

from sklearn import tree

fn=['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']
cn=['setosa', 'versicolor', 'virginica']
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
tree.plot_tree(clf,
feature_names = fn,
class_names=cn,
filled = True);

In [25]:
clf= DecisionTreeClassifier(criterion='entropy')
clf.fit(xtrain,ytrain)
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
tree.plot_tree(clf,
feature_names = fn,
class_names=cn,
filled = True);
In [ ]:
#problem Statement Write a code for implementing XGBOOST Classifier
In [1]:
# importing required libraries
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
# read the train and test dataset
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')
# shape of the dataset
print('Shape of training data :',train_data.shape)
print('Shape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data
# target variable - Survived
Shape of training data : (792, 17)
Shape of testing data : (100, 17)
In [2]:
# seperate the independent and target variable on training data
train_x = train_data.drop(columns=['Survived'],axis=1)
train_y = train_data['Survived']
# seperate the independent and target variable on testing data
test_x = test_data.drop(columns=['Survived'],axis=1)
test_y = test_data['Survived']
In [3]:
'''
#Create the object of the XGBoost model

'''
model = XGBClassifier()
# fit the model with the training data
model.fit(train_x,train_y)
# predict the target on the train dataset
predict_train = model.predict(train_x)
print('\nTarget on train data',predict_train)
[13:42:03] WARNING: ..\src\learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric
used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set
eval_metric if you'd like to restore the old behavior.

Target on train data [0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1

0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0
1 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0
0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1
0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0
0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1
1 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0
0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0
0 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1
1 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0
1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1
1 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0
0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0 0 1 0
1 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 1 0 1 1 0 0 1 0 0 1 1 0 1 1 0 0 1 1
0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 0 0 1 0 1 0 0 0 1
0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0
0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
0 0 0 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 0 1 1 0
0 0 0 1 1 1 1 1 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0
1 0 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0
1 0 1 1 1 0 0 0 0 1 0 1 0 0 0]
C:\Users\Konduru Varshith\anaconda3\lib\site-packages\xgboost\sklearn.py:888: UserWarning: The use of
label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this
warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object;
and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)

In [4]:
# Accuray Score on train dataset
accuracy_train = accuracy_score(train_y,predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)
# predict the target on the test dataset
predict_test = model.predict(test_x)
print('\nTarget on test data',predict_test)
# Accuracy Score on test dataset
accuracy_test = accuracy_score(test_y,predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)
accuracy_score on train dataset : 1.0

Target on test data [0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1

0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0
1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0]

accuracy_score on test dataset : 0.85

In [ ]:

Loading [MathJax]/extensions/Safe.js
In [1]:
#r18cs380
In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
In [3]:
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target
In [4]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.25, random_state = 0)
In [5]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain= sc_x.fit_transform(xtrain)
xtest = sc_x.transform(xtest)

print(xtrain[0:10,:])
[[ 0.01543995 -0.11925475]
[-0.09984503 -1.04039491]
[ 1.05300481 -0.11925475]
[-1.36797986 0.34131533]
[ 1.1682898 0.11103029]
[-1.0221249 1.03217045]
[-0.56098497 1.49274053]
[-1.0221249 -2.42210516]
[ 0.70714986 -0.11925475]
[ 0.93771983 0.57160037]]
In [6]:
C = 1.0 # SVM regularization parameter
svc = svm.SVC(kernel='linear', C=1,gamma='auto').fit(X, y)
In [7]:
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
h = (x_max / x_min)/100
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
In [8]:
plt.subplot(1, 1, 1)
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)

plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)

plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.title('SVC with linear kernel')
plt.show()

In [9]:
svc = svm.SVC(kernel='rbf', C=1,gamma=10).fit(X, y)
plt.subplot(1, 1, 1)
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.title('SVC with linear kernel')
plt.show()

In [10]:
svc = svm.SVC(kernel='rbf', C=1,gamma=100).fit(X, y)

In [11]:
svc = svm.SVC(kernel='rbf', C=1,gamma='auto').fit(X, y)

In [13]:
svc = svm.SVC(kernel='rbf', C=1000,gamma='auto').fit(X, y)

plt.subplot(1, 1, 1)
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.title('SVC with linear kernel')
plt.show()
In [ ]:
In [1]:
#multiple_linear_regression
#r18cs380
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('https://raw.githubusercontent.com/mk-gurucharan/Regression/master/Startups_Data.csv'
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)
[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
[1.0 0.0 0.0 162597.7 151377.59 443898.53]
[0.0 1.0 0.0 153441.51 101145.55 407934.54]
[0.0 0.0 1.0 144372.41 118671.85 383199.62]
[0.0 1.0 0.0 142107.34 91391.77 366168.42]
[0.0 0.0 1.0 131876.9 99814.71 362861.36]
[1.0 0.0 0.0 134615.46 147198.87 127716.82]
[0.0 1.0 0.0 130298.13 145530.06 323876.68]
[0.0 0.0 1.0 120542.52 148718.95 311613.29]
[1.0 0.0 0.0 123334.88 108679.17 304981.62]
[0.0 1.0 0.0 101913.08 110594.11 229160.95]
[1.0 0.0 0.0 100671.96 91790.61 249744.55]
[0.0 1.0 0.0 93863.75 127320.38 249839.44]
[1.0 0.0 0.0 91992.39 135495.07 252664.93]
[0.0 1.0 0.0 119943.24 156547.42 256512.92]
[0.0 0.0 1.0 114523.61 122616.84 261776.23]
[1.0 0.0 0.0 78013.11 121597.55 264346.06]
[0.0 0.0 1.0 94657.16 145077.58 282574.31]
[0.0 1.0 0.0 91749.16 114175.79 294919.57]
[0.0 0.0 1.0 86419.7 153514.11 0.0]
[1.0 0.0 0.0 76253.86 113867.3 298664.47]
[0.0 0.0 1.0 78389.47 153773.43 299737.29]
[0.0 1.0 0.0 73994.56 122782.75 303319.26]
[0.0 1.0 0.0 67532.53 105751.03 304768.73]
[0.0 0.0 1.0 77044.01 99281.34 140574.81]
[1.0 0.0 0.0 64664.71 139553.16 137962.62]
[0.0 1.0 0.0 75328.87 144135.98 134050.07]
[0.0 0.0 1.0 72107.6 127864.55 353183.81]
[0.0 1.0 0.0 66051.52 182645.56 118148.2]
[0.0 0.0 1.0 65605.48 153032.06 107138.38]
[0.0 1.0 0.0 61994.48 115641.28 91131.24]
[0.0 0.0 1.0 61136.38 152701.92 88218.23]
[1.0 0.0 0.0 63408.86 129219.61 46085.25]
[0.0 1.0 0.0 55493.95 103057.49 214634.81]
[1.0 0.0 0.0 46426.07 157693.92 210797.67]
[0.0 0.0 1.0 46014.02 85047.44 205517.64]
[0.0 1.0 0.0 28663.76 127056.21 201126.82]
[1.0 0.0 0.0 44069.95 51283.14 197029.42]
[0.0 0.0 1.0 20229.59 65947.93 185265.1]
[1.0 0.0 0.0 38558.51 82982.09 174999.3]
[1.0 0.0 0.0 28754.33 118546.05 172795.67]
[0.0 1.0 0.0 27892.92 84710.77 164470.71]
[1.0 0.0 0.0 23640.93 96189.63 148001.11]
[0.0 0.0 1.0 15505.73 127382.3 35534.17]
[1.0 0.0 0.0 22177.74 154806.14 28334.72]
[0.0 0.0 1.0 1000.23 124153.04 1903.93]
[0.0 1.0 0.0 1315.46 115816.21 297114.46]
[1.0 0.0 0.0 0.0 135426.92 0.0]
[0.0 0.0 1.0 542.05 51743.15 0.0]
[1.0 0.0 0.0 0.0 116983.8 45173.06]]
In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
LinearRegression()
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred})
df
Out[3]:
Real Values Predicted Values

0 103282.38 102613.307260

1 96712.80 88969.720298

2 81229.06 68377.662115

3 97427.84 98845.899455

4 105008.31 114393.773014

5 125370.37 130723.412943

6 156122.51 158922.404934

7 97483.56 98117.402057

8 71498.49 69483.670171

9 96778.92 97551.005854

In [4]:
from sklearn.metrics import r2_score
score=r2_score(y_test,y_pred)
score

Out[4]:
0.9275800339704457
In [ ]:

Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js

Regression Analysis - Cheatsheet
No ratings yet
Regression Analysis - Cheatsheet
9 pages
16BCB0126 VL2018195002535 Pe003
No ratings yet
16BCB0126 VL2018195002535 Pe003
40 pages
Final ML File
No ratings yet
Final ML File
34 pages
ML Lab
No ratings yet
ML Lab
7 pages
Presentation 1
No ratings yet
Presentation 1
2 pages
ML Shristi File
No ratings yet
ML Shristi File
49 pages
LAB-4 Report
No ratings yet
LAB-4 Report
21 pages
DM ML Practical
No ratings yet
DM ML Practical
13 pages
Minor_lab
No ratings yet
Minor_lab
4 pages
ML Lab Prgms Split
No ratings yet
ML Lab Prgms Split
3 pages
ML Codes
No ratings yet
ML Codes
9 pages
Unit2 ML Programs
No ratings yet
Unit2 ML Programs
7 pages
ml lab
No ratings yet
ml lab
23 pages
Ilovepdf Merged
No ratings yet
Ilovepdf Merged
47 pages
DA_Programs
No ratings yet
DA_Programs
44 pages
Mlext
No ratings yet
Mlext
1 page
Udacity Machine Learning Analysis Supervised Learning
100% (1)
Udacity Machine Learning Analysis Supervised Learning
504 pages
Machine Learning LAB
No ratings yet
Machine Learning LAB
20 pages
Advance AI and ML LAB
No ratings yet
Advance AI and ML LAB
16 pages
ML Remaining
No ratings yet
ML Remaining
17 pages
Experiment1111
No ratings yet
Experiment1111
25 pages
AIML PRACTICALS
No ratings yet
AIML PRACTICALS
22 pages
DOC-20241108-WA0003
No ratings yet
DOC-20241108-WA0003
16 pages
Tugas Clustering - 132021012 - Kevin Gazkia Naufal
No ratings yet
Tugas Clustering - 132021012 - Kevin Gazkia Naufal
6 pages
1st PGM
No ratings yet
1st PGM
10 pages
Practical File of AI and ML
No ratings yet
Practical File of AI and ML
26 pages
DWDM Lab All
No ratings yet
DWDM Lab All
20 pages
Programs Lab Bca
No ratings yet
Programs Lab Bca
16 pages
Ai Last 5
No ratings yet
Ai Last 5
4 pages
DataScience All 1to8
No ratings yet
DataScience All 1to8
6 pages
AIML_LAB
No ratings yet
AIML_LAB
37 pages
ML Activity Kalyan
No ratings yet
ML Activity Kalyan
21 pages
ML L - Ab
No ratings yet
ML L - Ab
13 pages
MlLabManualdocx 2024 09 04 22 02 58
No ratings yet
MlLabManualdocx 2024 09 04 22 02 58
19 pages
ml
No ratings yet
ml
17 pages
Data Science Machine Leraning222
No ratings yet
Data Science Machine Leraning222
11 pages
lab manual
No ratings yet
lab manual
9 pages
Machine Learnin
100% (2)
Machine Learnin
23 pages
Aiml Ex 4-7
No ratings yet
Aiml Ex 4-7
8 pages
ML Lab Manual
No ratings yet
ML Lab Manual
12 pages
AI LAB
No ratings yet
AI LAB
19 pages
Data Mining Practicals
No ratings yet
Data Mining Practicals
22 pages
Machine
100% (1)
Machine
45 pages
Simple Linear Regression
No ratings yet
Simple Linear Regression
11 pages
Data - Preprocessing - Tools - Ipynb - Colaboratory
No ratings yet
Data - Preprocessing - Tools - Ipynb - Colaboratory
4 pages
MLLabManual
No ratings yet
MLLabManual
24 pages
Implementing KNN Algorithm on the Iris Dataset
No ratings yet
Implementing KNN Algorithm on the Iris Dataset
7 pages
Sofcomputing Da2
No ratings yet
Sofcomputing Da2
7 pages
Classification Review
No ratings yet
Classification Review
8 pages
ML_Lab_01999676272
No ratings yet
ML_Lab_01999676272
12 pages
SVM K NN MLP With Sklearn Jupyter NoteBo
No ratings yet
SVM K NN MLP With Sklearn Jupyter NoteBo
22 pages
MachineLearning
No ratings yet
MachineLearning
10 pages
Data analytics
No ratings yet
Data analytics
10 pages
Week 7 Laboratory Activity
No ratings yet
Week 7 Laboratory Activity
12 pages
Roll NO 2020
No ratings yet
Roll NO 2020
8 pages
Pca 2382487
No ratings yet
Pca 2382487
8 pages
ML Record Print
No ratings yet
ML Record Print
20 pages
data preprocessing
No ratings yet
data preprocessing
9 pages
Rajeek8 12
No ratings yet
Rajeek8 12
21 pages
Profound Python Data Science
From Everand
Profound Python Data Science
Onder Teker
No ratings yet
Preboard-2 PAPERS 10 SET-2 - Answer Key
No ratings yet
Preboard-2 PAPERS 10 SET-2 - Answer Key
12 pages
Understanding Confusion Matrix
No ratings yet
Understanding Confusion Matrix
4 pages
Multi-Label Long Short-Term Memory-Based Framework To Analyze Drug Functions From Biological Properties
No ratings yet
Multi-Label Long Short-Term Memory-Based Framework To Analyze Drug Functions From Biological Properties
6 pages
CS246 Hw1
No ratings yet
CS246 Hw1
5 pages
Octree-Based Region Growing For Point Cloud Segmentation
No ratings yet
Octree-Based Region Growing For Point Cloud Segmentation
38 pages
Jaton 2017 We Get The Algorithms of Our Ground Truths Designing Referential Databases in Digital Image Processing
No ratings yet
Jaton 2017 We Get The Algorithms of Our Ground Truths Designing Referential Databases in Digital Image Processing
30 pages
Machine Learning For Coronavirus Covid-19 Detection From Chest X-Rays Machine Learning For Coronavirus Covid-19 Detection From Chest X-Rays
No ratings yet
Machine Learning For Coronavirus Covid-19 Detection From Chest X-Rays Machine Learning For Coronavirus Covid-19 Detection From Chest X-Rays
10 pages
Natural Language: Anguage Odels
No ratings yet
Natural Language: Anguage Odels
28 pages
Facial Expression Recognition Using Deep Learning
No ratings yet
Facial Expression Recognition Using Deep Learning
13 pages
Updated Dumps for UiPath-SAIv1 Exam
No ratings yet
Updated Dumps for UiPath-SAIv1 Exam
16 pages
Report 2
No ratings yet
Report 2
6 pages
Notes of NLP - Unit-2
No ratings yet
Notes of NLP - Unit-2
23 pages
Using Machine Learning To Identify and Evaluate Football Talented
No ratings yet
Using Machine Learning To Identify and Evaluate Football Talented
10 pages
Combining Xxsentence Similarities Measures To Identify Paraphrases
No ratings yet
Combining Xxsentence Similarities Measures To Identify Paraphrases
15 pages
Research Paper TBANTAY Final 1
No ratings yet
Research Paper TBANTAY Final 1
40 pages
Early Predicting of Students Performance in Higher
No ratings yet
Early Predicting of Students Performance in Higher
12 pages
IID3 Classifier To Diagnosis of High Blood Glucose Levels During Pregnancy
No ratings yet
IID3 Classifier To Diagnosis of High Blood Glucose Levels During Pregnancy
16 pages
Assignment 5
No ratings yet
Assignment 5
43 pages
Day1
No ratings yet
Day1
36 pages
Elex2023 Proceedings
No ratings yet
Elex2023 Proceedings
651 pages
Biomedical Image Analysis Using Python
No ratings yet
Biomedical Image Analysis Using Python
27 pages
Logistic Regression On Titanic Dataset
No ratings yet
Logistic Regression On Titanic Dataset
6 pages
healthcare-10-01993
No ratings yet
healthcare-10-01993
31 pages
Pothole Segmentation - CNN
No ratings yet
Pothole Segmentation - CNN
44 pages
revision 2 board examination
No ratings yet
revision 2 board examination
9 pages
Machine Learning Engineer Nanodegree Supervised Learning Project: Finding Donors For CharityML
No ratings yet
Machine Learning Engineer Nanodegree Supervised Learning Project: Finding Donors For CharityML
16 pages
Wood 2017
100% (1)
Wood 2017
8 pages
Part A Assignment - No - 5 PDF
No ratings yet
Part A Assignment - No - 5 PDF
8 pages
Data Mining, Klasifikasi
No ratings yet
Data Mining, Klasifikasi
88 pages
(IJCST-V11I1P5) :jitendra Maan, Harsh Maan
No ratings yet
(IJCST-V11I1P5) :jitendra Maan, Harsh Maan
6 pages

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.

Mlda - Lab

Uploaded by

Mlda - Lab

Uploaded by

In [1]:

# Simple Linear Regression

# Importing the libraries

# Importing the libraries

#Now, to predict whether a user will purchase the product or not,

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

print (xtrain[0:10, :])

65 - True Positive = ytest =1, y_pred = 1

65+24 = prediction done correctly = accuracy = 89 out of testdata 100 =0.89

3+8 = predcition went wrong

3 = ytest =1, yPred =0

65+24 = correct prediction y_test, y_pred

3+8 - predicted wrongly

print("whole dataset ",X.shape)

0 0.89 0.96 0.92 68

accuracy 0.89 100

from sklearn.datasets import load_iris

def __init__( self, K ) :

# Function to store training set

def fit( self, X_train, Y_train ) :

self.m, self.n = X_train.shape

# Function for prediction

def predict( self, X_test ) :

self.m_test, self.n = X_test.shape

Y_predict = np.zeros( self.m_test )

for i in range( self.m_test ) :

# find the K nearest neighbors from current test example

neighbors = np.zeros( self.K )

# most frequent class in K neighbors

Y_predict[i] = mode( neighbors )[0][0]

# Function to find the K nearest neighbors to current test example

def find_neighbors( self, x ) :

# calculate all the euclidean distances between current

euclidean_distances = np.zeros( self.m )

for i in range( self.m ) :

# sort Y_train according to euclidean_distance_array and

# Function to calculate euclidean distance

def euclidean( self, x, x_train ) :

return np.sqrt( np.sum( np.square( x - x_train ) ) )

# Splitting dataset into train and test set

X_train, X_test, Y_train, Y_test = train_test_split(

model.fit( X_train, Y_train )

model1 = KNeighborsClassifier( n_neighbors = 3 )

model1.fit( X_train, Y_train )

# Prediction on test set

Y_pred = model.predict( X_test )

Y_pred1 = model1.predict( X_test )

for count in range( np.size( Y_pred ) ) :

print( "Accuracy on test set by our model : ", (

0 5.1 3.5 1.4 0.2 Iris-setosa

1 4.9 3.0 1.4 0.2 Iris-setosa

2 4.7 3.2 1.3 0.2 Iris-setosa

3 4.6 3.1 1.5 0.2 Iris-setosa

4 5.0 3.6 1.4 0.2 Iris-setosa

... ... ... ... ... ...

145 6.7 3.0 5.2 2.3 Iris-virginica

146 6.3 2.5 5.0 1.9 Iris-virginica

147 6.5 3.0 5.2 2.0 Iris-virginica

148 6.2 3.4 5.4 2.3 Iris-virginica

149 5.9 3.0 5.1 1.8 Iris-virginica

150 rows × 5 columns

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix

Iris-setosa 1.00 1.00 1.00 13

Iris-setosa 1.00 1.00 1.00 13

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Evalution for min_sample_split 3 and min sample leaf 3

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Evalution for min_sample_split 4 and min sample leaf 4

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13

Evalution for min_sample_split 5 and min sample leaf 5

precision recall f1-score support

def init( self, K ) :