# PROJECT:-DETECTION AND REMOVAL OF OUTLIERS FOR ANY DATASET

# FOR NORMAL DISTRIBUTION OF DATA :

# importing important modules / libraries

import pandas as p
import matplotlib.pyplot as py
import numpy as n
from scipy.stats import norm

# Read the dataset from a CSV file.

data_set=p.read_csv("PATH FOR THE STORED DATASET FILE IN CSV FORMAT")
data_set.sample(5)

# Visualize the column as a normalized histogram with the fitted normal
# (bell) curve drawn on top of it.

py.hist(data_set.COLUMN_NAME, bins=20, rwidth=0.8, density=True)
py.xlabel('ANY NAME(PREFER COLUMN NAME)')
# density=True scales the bars to a probability density, so the y-axis is a
# density and not a count.
py.ylabel('Density')

# Evaluation grid for the fitted curve, from the column minimum up to (but
# not including) the maximum in steps of 0.1. Named `rnge` so the built-in
# range() is not shadowed for the rest of the module.
rnge = n.arange(data_set.COLUMN_NAME.min(), data_set.COLUMN_NAME.max(), 0.1)
py.plot(rnge, norm.pdf(rnge, data_set.COLUMN_NAME.mean(),
                       data_set.COLUMN_NAME.std()))
# Render the figure here; without this call the next plotting command in the
# script would draw onto these same axes.
py.show()

# Outlier detection with the z-score (standard score)

# Step 1: standardize the column, z = (value - mean) / standard deviation,
# and store the result in a new 'zscore' column.

column_values = data_set.COLUMN_NAME
column_mean = column_values.mean()
column_std = column_values.std()
data_set['zscore'] = (column_values - column_mean) / column_std
data_set.head(5)

# Step 2: a row is an outlier when its z-score lies more than three standard
# deviations away from the mean (|z| > 3).

outlier_detect = data_set[data_set.zscore.abs() > 3]
outlier_detect.head()

# Number of outlier rows found.

len(outlier_detect)

#STEPS FOR REMOVAL OF OUTLIER ---

# Step - 1 : Removal of outlier ---> using z-score / standard score
#
# Keep every row with -3 <= z <= 3. The bounds are inclusive so that the
# filter mirrors the detection rule (z < -3 or z > 3): a row sitting exactly
# on -3 or +3 is not reported as an outlier and therefore must not be dropped.
# Rows whose z-score is NaN fail both comparisons and are still excluded.

data_set_with_no_outliers = data_set[(data_set.zscore >= -3) & (data_set.zscore <= 3)]
data_set_with_no_outliers.sample(5)

# Replace the working dataset with the filtered one.
data_set = data_set_with_no_outliers


#END --> Visualization of dataset with no outliers

# Normalized histogram of the filtered column with the fitted normal curve
# drawn on top of it.
py.hist(data_set.COLUMN_NAME, bins=20, rwidth=0.8, density=True)
py.xlabel('ANY NAME(PREFER COLUMN NAME)')
# density=True scales the bars to a probability density, so the y-axis is a
# density and not a count.
py.ylabel('Density')
# Evaluation grid for the curve: column minimum up to (not including) the
# maximum, in steps of 0.1.
rnge = n.arange(data_set.COLUMN_NAME.min(), data_set.COLUMN_NAME.max(), 0.1)
py.plot(rnge, norm.pdf(rnge, data_set.COLUMN_NAME.mean(), data_set.COLUMN_NAME.std()))
# Render the figure here; without this call the next plotting command in the
# script would draw onto these same axes.
py.show()

# ------------------------------------------------------------------------------------------------------------


# FOR SKEWED DISTRIBUTION OF DATA :

# importing important modules / libraries

import pandas as p
import matplotlib.pyplot as py
import seaborn as s

# Load the dataset from its CSV file.

csv_location = "PATH FOR THE STORED DATASET FILE IN CSV FORMAT"
data_set = p.read_csv(csv_location)
data_set.sample(5)

# Summary statistics of the dataset.
data_set.describe()

# Box plot of the column under study.

column_to_plot = data_set['Column_Name']
boxplot = s.boxplot(column_to_plot)
py.show()

# OUTLIER DETECTION ---
# Method: interquartile range (IQR)

# Step 1: first and third quartiles of the column.

studied_column = data_set.Column_Name
Q1 = studied_column.quantile(0.25)
Q3 = studied_column.quantile(0.75)

# Show Q1 and Q3.
(Q1, Q3)

# Step 2: the IQR is the distance between the two quartiles.

IQR = Q3 - Q1

# Show the IQR.
IQR

# Step 3: the limits lie 1.5 * IQR below Q1 and 1.5 * IQR above Q3.

fence_width = 1.5*IQR
lower_limit = Q1 - fence_width
upper_limit = Q3 + fence_width

# Show both limits.

lower_limit, upper_limit

# Step 4: a row is an outlier when its value falls strictly outside the limits.

below_lower = data_set.Column_Name < lower_limit
above_upper = data_set.Column_Name > upper_limit
outlier_data_set = data_set[below_lower | above_upper]

# Number of outlier rows found.

len(outlier_data_set)

#STEP FOR THE REMOVAL OF OUTLIERS ---

# Removal of Outlier
#
# Keep every row with lower_limit <= value <= upper_limit. The bounds are
# inclusive so that the filter mirrors the detection rule
# (value < lower_limit or value > upper_limit): a value lying exactly on a
# limit is not reported as an outlier and therefore must not be dropped.
# Rows whose value is NaN fail both comparisons and are still excluded.
data_set_with_no_outlier = data_set[(data_set.Column_Name >= lower_limit) & (data_set.Column_Name <= upper_limit)]
data_set_with_no_outlier.sample(5)

# Replace the working dataset with the filtered one, completing the removal
# of outliers from the skewed distribution of data.
data_set = data_set_with_no_outlier

# END --> Visualization of dataset with no outliers
new_boxplot = s.boxplot(data_set['Column_Name'])
py.show()

# Source code ----> END !!
# Created and built by Rohit Saktel >>>>>

