Air BNB Data Analysis
Air BNB Data Analysis
import os
# List every file available under the Kaggle input directory, printing its
# full path. The directory list yielded by os.walk is not needed, hence `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
In [7]: od.download(dataset)
Skipping, found downloaded files in ".\airbnbopendata" (use force=True to force downloa
d)
In [8]: import os
In [10]: os.listdir(data_dir)
['Airbnb_Open_Data.csv']
Out[10]:
Out[11]:
neighbourhood
id NAME host id host_identity_verified host name neighbourhood
group
Skylit Midtown
1 1002102 52335172823 verified Jenna Manhattan Midtown
Castle
THE VILLAGE
OF
2 1002403 78829239556 NaN Elise Manhattan Harlem
HARLEM....NEW
YORK !
Entire Apt:
Spacious
4 1003689 92037596077 verified Lyndon Manhattan East Harlem
Studio/Loft by
central park
Spare room in
102594 6092437 12312296767 verified Krik Brooklyn Williamsburg
Williamsburg
Best Location
Morningside
102595 6092990 near Columbia 77864383453 unconfirmed Mifan Manhattan
Heights
U
Comfy, bright
102596 6093542 room in 69050334417 unconfirmed Megan Brooklyn Park Slope
Brooklyn
Big Studio-One
102597 6094094 Stop from 11160591270 unconfirmed Christopher Queens Long Island City
Midtown
In [13]: print(frame.isnull().sum())
id 0
NAME 250
host id 0
host_identity_verified 289
host name 404
neighbourhood group 29
neighbourhood 16
lat 8
long 8
country 532
country code 131
instant_bookable 105
cancellation_policy 76
room type 0
Construction year 214
price 247
service fee 273
minimum nights 400
number of reviews 183
last review 15832
reviews per month 15818
review rate number 319
calculated host listings count 319
availability 365 448
house_rules 51842
license 102056
dtype: int64
To see the column names, I ran the code below.
In [14]: print(frame.columns)
Index(['id', 'NAME', 'host id', 'host_identity_verified', 'host name',
'neighbourhood group', 'neighbourhood', 'lat', 'long', 'country',
'country code', 'instant_bookable', 'cancellation_policy', 'room type',
'Construction year', 'price', 'service fee', 'minimum nights',
'number of reviews', 'last review', 'reviews per month',
'review rate number', 'calculated host listings count',
'availability 365', 'house_rules', 'license'],
dtype='object')
In [15]: print(frame.head())
id NAME host id \
0 1001254 Clean & quiet apt home by the park 80014485718
1 1002102 Skylit Midtown Castle 52335172823
2 1002403 THE VILLAGE OF HARLEM....NEW YORK ! 78829239556
3 1002755 NaN 85098326012
4 1003689 Entire Apt: Spacious Studio/Loft by central park 92037596077
host_identity_verified host name neighbourhood group neighbourhood \
0 unconfirmed Madaline Brooklyn Kensington
1 verified Jenna Manhattan Midtown
2 NaN Elise Manhattan Harlem
3 unconfirmed Garry Brooklyn Clinton Hill
4 verified Lyndon Manhattan East Harlem
number of reviews last review reviews per month review rate number \
0 9.0 10/19/2021 0.21 4.0
1 45.0 5/21/2022 0.38 4.0
2 0.0 NaN NaN 5.0
3 270.0 7/5/2019 4.64 4.0
4 9.0 11/19/2018 0.10 3.0
house_rules license
0 Clean up and treat the home the way you'd like... NaN
1 Pet friendly but please confirm with me if the... NaN
2 I encourage you to use my kitchen, cooking and... NaN
3 NaN NaN
4 Please no smoking in the house, porch or on th... NaN
[5 rows x 26 columns]
In [16]: print(frame.shape)
(102058, 26)
I used the mode (most frequent value) of each column to fill that column's null values.
In [17]: frame_copy=frame.copy()
# Impute missing values with each column's mode (most frequent value).
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on frame_copy[col]: that chained in-place form
# emits a FutureWarning in pandas 2.x and has no effect under Copy-on-Write.
# 'license' is not in the list and is therefore left untouched.
mode_fill_columns = [
    'NAME',
    'host_identity_verified',
    'host name',
    'neighbourhood group',
    'neighbourhood',
    'lat',
    'long',
    'country',
    'country code',
    'instant_bookable',
    'cancellation_policy',
    'Construction year',
    'price',
    'service fee',
    'minimum nights',
    'number of reviews',
    'last review',
    'reviews per month',
    'review rate number',
    'calculated host listings count',
    'availability 365',
    'house_rules',
]

for column in mode_fill_columns:
    column_modes = frame_copy[column].mode()
    # mode() is empty when a column holds no non-null values; there is
    # nothing to impute with in that case, so the column is skipped.
    if not column_modes.empty:
        frame_copy[column] = frame_copy[column].fillna(column_modes.iloc[0])
In [19]: print(frame.isnull().sum())
id 0
NAME 0
host id 0
host_identity_verified 0
host name 0
neighbourhood group 0
neighbourhood 0
lat 0
long 0
country 0
country code 0
instant_bookable 0
cancellation_policy 0
room type 0
Construction year 0
price 0
service fee 0
minimum nights 0
number of reviews 0
last review 0
reviews per month 0
review rate number 0
calculated host listings count 0
availability 365 0
house_rules 0
license 102056
dtype: int64
In [21]: print(frame.isnull().sum())
id 0
NAME 0
host id 0
host_identity_verified 0
host name 0
neighbourhood group 0
neighbourhood 0
lat 0
long 0
country 0
country code 0
instant_bookable 0
cancellation_policy 0
room type 0
Construction year 0
price 0
service fee 0
minimum nights 0
number of reviews 0
last review 0
reviews per month 0
review rate number 0
calculated host listings count 0
availability 365 0
house_rules 0
dtype: int64
The 'types_of_rooms' variable contains the percentage of each type of room in the dataset, rounded to one
decimal point.
The first four lines of code convert the price column from object to float data type, by removing commas
and dollar signs, and then parsing it as numeric data using pandas. The next two lines of code create two
groups based on the room type (Entire home/apt and Private room) and extract the price values for each
group. The stats.ttest_ind() function from the scipy library is then used to perform an independent two-
sample t-test between the two groups, assuming unequal variances.
Finally, the t-statistic and p-value for the t-test are printed using f-strings.
The purpose of this code is to perform a statistical analysis to test whether there is a significant difference in
mean price between the two groups of listings (Entire home/apt and Private room). The t-test is a common
method for comparing means of two groups and determining whether the difference between them is
statistically significant or not.
# Independent two-sample t-test on the two price groups; equal_var=False
# means the test does not assume the groups share the same variance.
t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=False)
# Report the test statistic and the p-value, one per line.
print(f"T-statistic: {t_stat}", f"P-value: {p_value}", sep="\n")
T-statistic: 0.10435722164673732
P-value: 0.9168860835285177
In the code below, I performed an ANOVA (Analysis of Variance) test to determine whether there are significant
differences in mean price between three groups of listings, based on their 'neighbourhood group' column.
In [24]: # create three groups based on neighbourhood_group
# Price samples for the three neighbourhood groups being compared.
group1 = frame[frame['neighbourhood group'] == 'Brooklyn']['price']
group2 = frame[frame['neighbourhood group'] == 'Manhattan']['price']
group3 = frame[frame['neighbourhood group'] == 'Queens']['price']
# perform one-way ANOVA across the three groups
# NOTE(review): this call was absent from the exported cell, leaving
# f_statistic unassigned; restored with scipy.stats.f_oneway, assuming
# `stats` is scipy.stats as in the t-test cell — confirm against the notebook.
f_statistic, p_value = stats.f_oneway(group1, group2, group3)
# print results
print("F-statistic:", f_statistic)
print("P-value:", p_value)
F-statistic: 3.1372044124800285
P-value: 0.043408308935618714
# NOTE(review): tukey_res is not assigned anywhere in the visible text;
# presumably it holds the result of a post-hoc test run after the ANOVA —
# confirm against the original notebook.
print(tukey_res)
In the code below, the data is analysed using different types of graphs: bar chart,
pie chart, box plot, histogram, scatter plot and violin plot.
# Bar chart of avg_price (one bar per index entry) with a manually built legend.
# NOTE(review): avg_price is defined outside the visible text — presumably the
# mean price per room type; confirm.
# create barplot
fig, ax = plt.subplots()
# NOTE(review): the next line is cut off in this export (ends at "widt").
# Bar colours are applied by position, in the order of avg_price.index.
ax.bar(avg_price.index, avg_price.values, color=['blue', 'green', 'red', 'orange'], widt
# create legend
# NOTE(review): the next line is cut off in this export (ends at "'Hot").
# The legend maps fixed room-type names to colours, whereas the bars are
# coloured by position; the two agree only if avg_price.index is ordered
# Private room, Entire home/apt, Shared room, <fourth entry> — verify.
colors = {'Private room': 'blue', 'Entire home/apt': 'green', 'Shared room': 'red', 'Hot
labels = list(colors.keys())
# One solid-colour rectangle per label serves as its legend handle.
handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels]
# Anchor the legend outside the axes, to the right of the plot.
ax.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left')
# display plot
plt.show()
# add legend
plt.legend(handles=boxplots['boxes'], labels=labels, bbox_to_anchor=(1.05, 1), loc='uppe