
ML assignment

The document outlines a series of tasks involving data analysis and machine learning techniques using Python. It includes tasks such as clustering with KMeans and hierarchical clustering, calculating utilities in a grid environment, performing factor analysis on housing data, and building a Random Forest classifier for fraud detection. Each task demonstrates data preprocessing, model training, and evaluation methods, highlighting the effectiveness of the chosen algorithms.


Task 1:

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset
fl = pd.read_csv('ecommerce.csv')

# Display the first few rows and the structure of the dataframe
print(fl.head())
print(fl.info())

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

# Normalize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(fl)

# Elbow Method: within-cluster sum of squares (inertia) for k = 1..10
Ac = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init='auto', random_state=10)
    kmeans.fit(scaled_data)
    Ac.append(kmeans.inertia_)

# Plotting the results of the Elbow Method
plt.plot(range(1, 11), Ac)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (Ac)')
plt.show()

# Silhouette Score for k = 2..10
for n_clusters in range(2, 11):
    clusterer = KMeans(n_clusters=n_clusters, n_init='auto', random_state=10)
    cluster_labels = clusterer.fit_predict(scaled_data)
    silhouette_avg = silhouette_score(scaled_data, cluster_labels)
    print(f"For n_clusters = {n_clusters}, the average silhouette_score is : {silhouette_avg}")

optimal_clusters = 4

# Applying Hierarchical Clustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Using the 'ward' method for hierarchical clustering
z = linkage(scaled_data, method='ward')

# Plotting the dendrogram
plt.figure(figsize=(10, 5))
dendrogram(z, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Customer Index')
plt.ylabel('Distance')
plt.show()

For n_clusters = 2, the average silhouette_score is : 0.3489864966095777
For n_clusters = 3, the average silhouette_score is : 0.507952731017849
For n_clusters = 4, the average silhouette_score is : 0.6109029564611685
For n_clusters = 5, the average silhouette_score is : 0.5027280434062202
For n_clusters = 6, the average silhouette_score is : 0.3968672862014622
For n_clusters = 7, the average silhouette_score is : 0.30834094932263917
For n_clusters = 8, the average silhouette_score is : 0.2998299646859838
For n_clusters = 9, the average silhouette_score is : 0.29659504567055106
For n_clusters = 10, the average silhouette_score is : 0.29147277579139513

Explanation: pandas is imported to read the file, and the data is then normalized with StandardScaler. KMeans is imported from scikit-learn and fitted to the dataset. Both the elbow method and the silhouette score are computed, and both indicate an optimum of 4 clusters. The elbow method and silhouette score are KMeans diagnostics rather than hierarchical-clustering criteria, so the number of clusters is taken from the KMeans analysis. Finally, hierarchical clustering with Ward linkage is imported from SciPy and the dendrogram is plotted.
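As a small follow-up, here is a minimal sketch (not part of the original submission) of actually assigning the 4 cluster labels, once with KMeans and once by cutting the Ward linkage; it assumes the fl, scaled_data and z objects from the code above and that all columns of fl are numeric.

from scipy.cluster.hierarchy import fcluster

# KMeans with the chosen number of clusters
final_kmeans = KMeans(n_clusters=4, n_init='auto', random_state=10)
kmeans_labels = final_kmeans.fit_predict(scaled_data)

# Equivalent flat clustering obtained by cutting the Ward linkage into 4 clusters
hier_labels = fcluster(z, t=4, criterion='maxclust')

# Attach the labels to the original dataframe and inspect the cluster profiles
fl['kmeans_cluster'] = kmeans_labels
fl['hier_cluster'] = hier_labels
print(fl.groupby('kmeans_cluster').mean())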

Task 2:
import numpy as np

# Define the grid with utilities
# For simplicity, inaccessible states are set to None and terminal states keep their rewards
grid_utilities = np.array([
    [7.41, 7.52, 7.65, 10,   7.54],
    [7.31, None, -10,  5.82, -10],
    [7.15, None, -10,  4.31, None],
    [6.98, 6.77, 6.44, 5.87, 6.12],
    [6.90, 6.80, 6.59, 6.51, 6.34]
])

# Define the reward for non-terminal states
reward = -0.1

# Define the success probability
success_prob = 0.8

# Define the failure probability (0.2, split equally between the two perpendicular directions)
failure_prob = 0.2 / 2

# Function to calculate the expected utility of a given action from a given state
def calculate_utility(state, action, grid):
    nrows, ncols = grid.shape
    x, y = state

    # Directions
    directions = {
        'UP': (-1, 0),
        'DOWN': (1, 0),
        'LEFT': (0, -1),
        'RIGHT': (0, 1)
    }

    # Calculate the new position after the intended action
    dx, dy = directions[action]
    new_x, new_y = x + dx, y + dy
    if 0 <= new_x < nrows and 0 <= new_y < ncols and grid[new_x, new_y] is not None:
        primary_utility = grid[new_x, new_y]
    else:
        primary_utility = grid[x, y]

    # Calculate the utility of the perpendicular moves
    perp_utility = 0
    perp_actions = ['LEFT', 'RIGHT'] if action in ['UP', 'DOWN'] else ['UP', 'DOWN']
    for perp_action in perp_actions:
        dx, dy = directions[perp_action]
        perp_x, perp_y = x + dx, y + dy
        if 0 <= perp_x < nrows and 0 <= perp_y < ncols and grid[perp_x, perp_y] is not None:
            perp_utility += failure_prob * grid[perp_x, perp_y]
        else:
            perp_utility += failure_prob * grid[x, y]

    # Calculate the total expected utility
    total_utility = success_prob * primary_utility + perp_utility + reward
    return total_utility

green_states = [(1, 0), (3, 2), (4, 1)]  # Placeholder positions for the green states
optimal_actions = {}
for state in green_states:
    utilities = {action: calculate_utility(state, action, grid_utilities)
                 for action in ['UP', 'DOWN', 'LEFT', 'RIGHT']}
    optimal_action = max(utilities, key=utilities.get)
    optimal_actions[state] = optimal_action

print(optimal_actions)
Running the code gives the result {(1, 0): 'UP', (3, 2): 'DOWN', (4, 1): 'LEFT'}.
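A small usage sketch (assuming it is run after the code above) prints the expected utility of each action from one green state, which makes it easy to see why 'UP' wins at (1, 0). For each action the value computed is reward + 0.8 * U(intended cell) + 0.1 * U(each perpendicular cell), where a blocked or off-grid move keeps the agent in place.

state = (1, 0)
for action in ['UP', 'DOWN', 'LEFT', 'RIGHT']:
    u = calculate_utility(state, action, grid_utilities)
    print(f"{action:>5}: expected utility = {u:.3f}")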

Task 3:
import pandas as pd  # used to load the dataset in this program
from sklearn.decomposition import FactorAnalysis  # Factor analysis from the scikit-learn library

# Load the dataset
Y = 'kc_house_data_reduced (1).csv'
dta = pd.read_csv(Y)
dta.head()

# Drop the price column from the original dataset and assign the remaining features to X
X = dta.drop('price', axis=1)
print(X)

# Create the Factor Analysis model with 2 components, i.e. reduce the many factors down to two
fact_ana = FactorAnalysis(n_components=2, random_state=0)
fact_ana.fit(X)

# components_ holds the loadings of the two latent components
components = fact_ana.components_

# Format the components for better understanding by fixing the columns and the index
components_d = pd.DataFrame(components, columns=X.columns, index=['Size', 'Quality'])
components_d

           condition     grade   sqft_above  sqft_basement  sqft_living15
Size       -0.044862  0.965322   648.830918     196.716360     558.405490
Quality     0.157787 -0.277271  -395.732030     370.346887    -132.457091

# The first component has large positive loadings on sqft_above, sqft_basement and sqft_living15, which capture the size of the house, while the second component reflects the quality of the house through condition and grade.
# Factor analysis helps to find the latent variables in the data and serves as a dimensionality reduction technique.
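As a minimal follow-up sketch (not in the original code), the fitted model's transform() can project each house onto the two latent factors, so every row gets a 'Size' and a 'Quality' score; it assumes fact_ana and X from the code above.

scores = fact_ana.transform(X)
scores_df = pd.DataFrame(scores, columns=['Size', 'Quality'])
print(scores_df.head())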

Task 4
import pandas as pd

# Load the dataset


file_path = 'bs140513_032310.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Data Preprocessing
# Encode categorical variables
le = LabelEncoder()
categorical_cols = ['customer', 'age', 'gender', 'zipcodeOri', 'merchant', 'zipMerchant', 'category']
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])

# Splitting the dataset into features and target variable


X = data.drop('fraud', axis=1)
y = data['fraud']

# Splitting the data into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the Random Forest Classifier


model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predicting and Evaluating the Model


y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(confusion_matrix = cm,display_labels=model.classes_).plot()
plt.show()

accuracy = accuracy_score(y_test, y_pred)
print('accuracy: {}'.format(accuracy))

report = classification_report(y_test, y_pred)
print('classification report:\n {}\n'.format(report))

accuracy, report

accuracy: 0.996207821473316
classification report:
               precision    recall  f1-score   support

           0        1.00      1.00      1.00    117512
           1        0.90      0.76      0.83      1417

    accuracy                            1.00    118929
   macro avg        0.95      0.88      0.91    118929
weighted avg        1.00      1.00      1.00    118929
The above results indicate that fraudulent transactions were detected with a high level of accuracy.
Explanation:
In this project pandas is imported to read the data file supplied by the user.
A Random Forest classifier is used for this task. The reason for choosing Random Forest over a K-Nearest Neighbours classifier is that KNN becomes difficult to use on a large dataset: it has to store the whole dataset, needs more time at prediction, and does not scale well. Random Forest, by contrast, handles imbalanced data reasonably well, combines many decision trees into a single model, and is capable of capturing complex interactions. Logistic regression does not handle non-linear relationships in the data as well as a Random Forest, and while Random Forest can still perform well on an unbalanced dataset, logistic regression tends to struggle more with this type of data.
In the code, the categorical columns are first converted to numerical values with scikit-learn's LabelEncoder so that the algorithm can use them in the subsequent classification. After that the fraud column is separated out as the target variable y, and the remaining columns are kept as the feature matrix X by dropping fraud from the dataset. (A StandardScaler could additionally be applied to the features so that the model is not skewed or biased towards certain features just because of their scales or units of measurement, although a Random Forest is largely insensitive to feature scaling.)
The data is then split into a training set and a testing set, and a random state of 42 is used to ensure that the same random split is reproduced every time.
A Random Forest classifier is then trained on the training data and used to predict on the test data.
Model evaluation is done with the confusion matrix and the classification report from scikit-learn. The output shows that about 1,082 of the 1,417 fraudulent transactions in the test set were detected correctly, i.e. a recall of roughly 76 percent for the fraud class, and the confusion matrix is plotted as a graph.
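Since only about 1.2% of the transactions in the test set are fraud, one hedged variation (not part of the submitted code) is to set class_weight='balanced' on the Random Forest, which re-weights the minority class during training and may trade some precision for higher recall on class 1; it assumes the X_train/X_test split and imports from the code above.

balanced_model = RandomForestClassifier(class_weight='balanced', random_state=42)
balanced_model.fit(X_train, y_train)
print(classification_report(y_test, balanced_model.predict(X_test)))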
