Python Forum

Full Version: Outliers remain in the scatterplot even after removal
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
I am currently using the 'train.csv' file found here: https://www.kaggle.com/c/house-prices-ad...=train.csv

after scatterplotting the features 'YearBuilt' vs 'SalePrice', I am removing the csv rows containing outliers showing in thegraph. My code removes these outliers the first time I use the 'drop' command, but after looking at the graph without the firstly detected outliers more outliers appear, but once I try to remove them, they still appear on the scatterplot, even though I can definitely see that the rows are indeed being erased. I cannot understand why it does that and what should I do to fix it.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
from scipy.stats import pearsonr  


train_data=pd.read_csv(r'train.csv')
corr_YearBuilt0= pearsonr(train_data['YearBuilt'], train_data['SalePrice'])
print(corr_YearBuilt0, 'corr_YearBuilt0')

sns.scatterplot(x=train_data['YearBuilt'],y= train_data['SalePrice'])
m, b = np.polyfit(train_data['YearBuilt'], train_data['SalePrice'], 1)
sns.regplot(x=train_data['YearBuilt'], y=train_data['SalePrice'])
plt.show()

print(train_data.shape,'+++++++++shape')

outliers_year_built= train_data['SalePrice'].between(500000, 800000, inclusive=False)  & 
train_data['YearBuilt'].between(1980, 2020, inclusive=False)
"""PRINT AND DROP ROWS CONTAINING OUTLIERS"""
if outliers_year_built.any():
    print(train_data[outliers_year_built],'outliars')
    print (train_data[outliers_year_built].index)
    print(train_data[outliers_year_built].index.values.tolist(),'----------location outliers------')
    train_data.drop(train_data.index[train_data[outliers_year_built].index.values.tolist()], 
inplace=True)
else:
    print('no outliers')

sns.scatterplot(x=train_data['YearBuilt'],y= train_data['SalePrice'])
m, b = np.polyfit(train_data['YearBuilt'], train_data['SalePrice'], 1)
sns.regplot(x=train_data['YearBuilt'], y=train_data['SalePrice'])
plt.show()

corr_YearBuilt= pearsonr(train_data['YearBuilt'], train_data['SalePrice'])
print(corr_YearBuilt, 'corr_YearBuilt')
print(train_data.shape,'+++++++++shape')

"""DROP ROWS CONTAINING ADDITIONAL OUTLIERS"""
outliers_year_built_2= train_data['SalePrice'].between(200000, 500000, inclusive=False)  & 
train_data['YearBuilt'].between(1860, 1920, inclusive=False)
if outliers_year_built_2.any():
    train_data.drop(train_data.index[train_data[outliers_year_built_2].index.values.tolist()], 
inplace=True) #THIS ROW DOES NOT SEEM TO WORK AS THE OUTLIERS KEEP APPEARING IN THE FOLLOWING 
                                                                                   #SCATTERPLOT
else:
    print('no outliers')

sns.scatterplot(x=train_data['YearBuilt'],y= train_data['SalePrice'])
m, b = np.polyfit(train_data['YearBuilt'], train_data['SalePrice'], 1)
sns.regplot(x=train_data['YearBuilt'], y=train_data['SalePrice'])
plt.show()  #THE OUTLIERS I TRIED TO REMOVE RIGHT ABOVE ARE STILL AROUND

corr_YearBuilt_2= pearsonr(train_data['YearBuilt'], train_data['SalePrice'])
print(corr_YearBuilt_2, 'corr_YearBuilt')
print(train_data.shape,'+++++++++shape')