In [1]:
!pip install --upgrade numpy
Collecting numpy
  Downloading numpy-1.18.4-cp37-cp37m-manylinux1_x86_64.whl (20.2 MB)
     |████████████████████████████████| 20.2 MB 1.0 MB/s eta 0:00:01
ERROR: tensorflow 1.14.0 has requirement tensorboard<1.15.0,>=1.14.0, but you'll have tensorboard 2.0.0 which is incompatible.
ERROR: tensorflow 1.14.0 has requirement tensorflow-estimator<1.15.0rc0,>=1.14.0rc0, but you'll have tensorflow-estimator 2.0.1 which is incompatible.
ERROR: tensorflow-gpu 2.0.0 has requirement gast==0.2.2, but you'll have gast 0.3.3 which is incompatible.
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.18.1
    Uninstalling numpy-1.18.1:
      Successfully uninstalled numpy-1.18.1
Successfully installed numpy-1.18.4
In [2]:
# Imports
import numpy as np
import pandas as pd
import scipy.stats as sps

import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
from plotly.subplots import make_subplots

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from ipywidgets import interact
import ipywidgets as widgets

init_notebook_mode(connected=True)

Introduction

There is a certain set of assumptions that apply when working with regression problems. Take, for instance, linear regression, where we make the following assumptions-

1) There is a linear relationship between the independent variable and the target variable.
2) The data is homoscedastic.
3) The residuals are normally distributed.
4) There is minimal multicollinearity.

The subject of this notebook is the third point: how do we know that the residuals of a linear regression model are normally distributed? This leads to a more general question: given a dataset, can we say that it is normally distributed? It seems like a fairly trivial problem: just plot a histogram of the data and see if it looks like a normal distribution. But histograms can be deceptive; their shape depends on the number of bins you choose, which in turn depends on the number of data points available.

Thankfully, there are tools available to help us determine whether a dataset comes from a normal distribution.

In this notebook we are going to cover two graphical tools:
1) Histograms
2) Quantile-quantile (qq) plots

The tests that are used to determine if data comes from a normal distribution are called normality tests.

Before we get into it, let us set up a problem. We generate a dataset, set up a linear regression problem, fit a model to it, and compute the residuals.

In [3]:
# generate data and visualize it
np.random.seed(1)
x = np.arange(0, 1000)
noise = np.random.normal(0, 10, 1000)
slope = 0.1
b = 5.0

y = (slope * x) + b
y_noised = y + noise

# train/test split
x_train, x_test, y_train, y_test = train_test_split(x, y_noised, test_size=0.2, random_state=1)

# sklearn expects 2D feature/target arrays
x_train_reshaped = x_train.reshape(-1, 1)
y_train_reshaped = y_train.reshape(-1, 1)
x_test_reshaped = x_test.reshape(-1, 1)

# fitting the model in sklearn
lr = LinearRegression()
lr.fit(x_train_reshaped, y_train_reshaped)

pred_slope = lr.coef_
pred_b = lr.intercept_
y_pred = lr.predict(x_test_reshaped)
residuals = y_test - y_pred.reshape(-1)

# evaluating the fitted line over the full x range
model_line = (pred_slope * x) + pred_b
model_line_reshaped = model_line.reshape(-1)


fig = make_subplots(rows=1, cols=2,subplot_titles=["Linear data", "Residual Plot"])

fig.add_trace(go.Scatter(x=x,y=y_noised, mode='markers', marker={'color':'green'}, name="noised data"), row=1, col=1)
fig.add_trace(go.Scatter(x=x, y=model_line_reshaped, mode='lines', marker={'color':'red'}, name='model'), row=1, col=1)
fig.add_trace(go.Scatter(x=x_test, y=residuals, mode='markers', marker={'color':'blue'}, name='residuals'), row=1, col=2)

fig.update_xaxes(title_text="x" ,row=1, col=1)
fig.update_xaxes(title_text="x", row=1, col=2)
fig.update_yaxes(title_text="Y noised", row=1, col=1)
fig.update_yaxes(title_text="error = predicted-observed" , row=1, col=2)
fig.update_layout(title_text="Figure 1")


iplot(fig)

The problem with histograms

The plot on the right of Figure 1 is a plot of the residuals. A residual is the difference between an observed value (the green points in the left plot of Figure 1) and the corresponding predicted value (which falls on the red line). One of the assumptions of linear regression is that the residuals are drawn from a normal distribution; another way of saying this is that a histogram of the residuals should look normal-like. Take a look at Figure 2 below. With 20 bins, the histogram has a single peak around the -5 to 0 bin and falls off on either side. However, can we say that this histogram represents a normal distribution? Perhaps. As we change the bin size, our conclusion becomes murkier, since there is no easy way to say where the peak of the normal distribution is. This makes it difficult to judge which bin size is suited to interpreting the distribution.

The histogram is one graphical way to check whether data comes from a normal distribution, but it can be deceptive: changing the number of bins alters the shape of the distribution, which may lead to confusion. We need a better way to identify whether the data comes from a normal distribution. This is where quantile-quantile plots come in.

In [4]:
residual_df = pd.DataFrame(data=residuals,columns=["residuals"])

# callback function for the slider
def change_bins(number_bins):
    return px.histogram(residual_df, x="residuals", nbins=int(number_bins), title="Figure 2")

slider_obj =  widgets.FloatSlider(value=20, min=10, max=100,step=5, description="Num of bins", continuous_update=False)
interact(change_bins, number_bins=slider_obj);

A simple quantile-quantile plot using Plotly

An alternative to the histogram is the quantile-quantile (qq) plot. A qq plot is a scatter plot in which we plot the dataset values against the values a normal distribution would take at quantiles determined from the dataset: the y coordinates are the sorted dataset values, and the x coordinates are the corresponding values from the normal distribution.

To generate the data for the quantile-quantile plot, we must first define the quantiles at which we need values from the normal distribution, so we write-

num_divisions = residuals.shape[0]+1
quantiles = np.arange(1,residuals.shape[0]+1)/num_divisions

num_divisions tells us how many divisions of the normal distribution we need. If you run the code above, the first few quantile values will be-

[0.00497512, 0.00995025, 0.01492537, 0.0199005 ...]

So if we have n data points in a dataset, we need to divide the normal distribution into n+1 regions to get n values from the normal distribution.

For example, in our case we have 200 points in the dataset, hence we need 201 divisions of the normal distribution. Once we have the quantile values, we pass them to the ppf function. This is the inverse of the cumulative distribution function: for a given quantile value, it gives us the corresponding z-score of the normal distribution.

The quantile values that we feed into the ppf function are simply cumulative probabilities: each one is the fraction of the total (unit) area under the normal curve that lies to the left of the point ppf returns.

qq_x_data = sps.norm.ppf(quantiles)

These values are the x values for the qq plot; we get the y values by simply sorting the residuals-

qq_y_data = np.sort(residuals)
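
As a quick sanity check on ppf (a small illustrative aside, not part of the pipeline): since ppf is the inverse of the cumulative distribution function cdf, round-tripping a quantile recovers it, and a tiny n makes the n+1 division easy to see-

# illustrative only: n = 4 points -> 5 divisions of the normal
mini_quantiles = np.arange(1,5)/5        # [0.2, 0.4, 0.6, 0.8]
mini_z = sps.norm.ppf(mini_quantiles)    # z-scores at those quantiles
print(sps.norm.cdf(mini_z))              # recovers [0.2, 0.4, 0.6, 0.8]
print(sps.norm.ppf(0.5))                 # 0.0, the median of the standard normal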

Next, we need the data for plotting the reference line, for which we need two points to determine the slope and y intercept. Here we adopt the recommendation of Cleveland in Visualizing Data [1]: the x values come from the first and third quartiles of the normal distribution, and the y values come from the first and third quartiles of our ordered dataset. Hence we write-

# guide line data
line_x0 = sps.norm.ppf(0.25)
line_x1 = sps.norm.ppf(0.75)

line_y0 = np.quantile(residuals, 0.25)
line_y1 = np.quantile(residuals, 0.75)

Using these two points we form a line-

slope = (line_y1-line_y0)/(line_x1-line_x0)
line_intercept = line_y1 - (slope*line_x1)

x_range_line = np.arange(-3,3,0.001)
y_values_line = (slope*x_range_line) + line_intercept

Note: Do not fit a regression line to the points on the qq plot; those points will not satisfy the assumptions of linear regression (think about it: why not?).

In the below cell the full code for generating the qq plot data is shown.

In [5]:
num_divisions = residuals.shape[0]+1
quantiles = np.arange(1,residuals.shape[0]+1)/num_divisions

# scatter data 
qq_x_data = sps.norm.ppf(quantiles)
qq_y_data = np.sort(residuals)

# guide line data
line_x0 = sps.norm.ppf(0.25)
line_x1 = sps.norm.ppf(0.75)

line_y0 = np.quantile(residuals, 0.25)
line_y1 = np.quantile(residuals, 0.75)


slope = (line_y1-line_y0)/(line_x1-line_x0)
line_intercept = line_y1 - (slope*line_x1)

x_range_line = np.arange(-3,3,0.001)
y_values_line = (slope*x_range_line) + line_intercept  
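
As an aside, scipy.stats also ships a probplot helper that computes essentially the same data in one call; a minimal sketch is below. Note that it uses a slightly different plotting-position formula for the theoretical quantiles and fits its reference line by least squares rather than through the quartiles, so its numbers will differ slightly from ours.

# sketch: scipy's built-in near-equivalent of the cell above
(theoretical_q, ordered_vals), (fit_slope, fit_intercept, r) = sps.probplot(residuals, dist="norm")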

Now that we have the data for the qq plot and the reference line, we can use plotly to plot them. The code is in the next cell.

In [6]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=qq_x_data,
                         y=qq_y_data,
                         mode='markers',
                         marker={'color':'blue'},
                         name="qq data"))
fig.add_trace(go.Scatter(x=x_range_line,
                         y=y_values_line,
                         mode='lines',
                         marker={'color':'red'},
                         name="guide line"))

fig['layout'].update(title='Figure 3',
                     xaxis={
                            'title': 'Theoretical Quantiles',
                            'zeroline': True
                           },
                    yaxis={
                            'title': 'Sample Quantiles'
                          },
                   showlegend=True,
                   )


iplot(fig)

The way to read a qq plot is to see how many points fall along the reference line. If the majority of points fall on this line, we can assume that the data comes from a normal distribution. If the majority of points do not follow this trend, we can say that the data does not look normal. In that case we should usually also try other normality tests, such as the Anderson-Darling test or the Kolmogorov-Smirnov (KS) test, to be sure the data is not normal. We will delve more into this in the next notebook, where we look at how to combine and interpret graphical and hypothesis tests for normality.
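
Both of the tests just mentioned are available in scipy.stats. Here is a minimal sketch of what the calls look like on our residuals (interpreting the output properly is the subject of the next notebook; note that estimating the normal's parameters from the data, as the KS call below does, affects the resulting p-value)-

# sketch: hypothesis tests for normality on the residuals
ad_result = sps.anderson(residuals, dist='norm')                  # Anderson-Darling
ks_result = sps.kstest(residuals, 'norm',
                       args=(residuals.mean(), residuals.std()))  # KS test

print(ad_result.statistic, ad_result.critical_values)
print(ks_result)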

The main takeaway is that the qq plot is a simple graphical way to determine whether a dataset follows a normal distribution.

Quantile-quantile plots for data from various distributions

In this section, we will look at data that comes from different types of distributions and show how you can use a qq plot to compare them to a normal distribution.

Suppose we have a dataset that follows a uniform distribution; the qq plot will then look quite different.

In [7]:
uniform_data = np.random.uniform(0,10,1000)

num_divisions = uniform_data.shape[0]+1
quantiles = np.arange(1,uniform_data.shape[0]+1)/num_divisions

# scatter data 
qq_uniform_x = sps.norm.ppf(quantiles)
qq_uniform_y = np.sort(uniform_data)

# guide line through the first and third quartiles
line_x0 = sps.norm.ppf(0.25)
line_x1 = sps.norm.ppf(0.75)

line_y0 = np.quantile(uniform_data, 0.25)
line_y1 = np.quantile(uniform_data, 0.75)

slope = (line_y1-line_y0)/(line_x1-line_x0)
line_intercept = line_y1 - (slope*line_x1)

# points to plot the line 
x_uniform = np.arange(-3,3,0.001)
y_uniform = (slope*x_uniform) + line_intercept

fig = make_subplots(rows=1, cols=2,subplot_titles=["Data histogram", "QQ plot"])

fig.add_trace(go.Histogram(x=uniform_data,
                           name="uniform data"),
              row=1,
              col=1)
fig.add_trace(go.Scatter(x=qq_uniform_x,
                         y=qq_uniform_y,
                         mode='markers',
                         marker={'color':'blue'},
                         name="qq data"),
               row=1,
               col=2)
fig.add_trace(go.Scatter(x=x_uniform,
                         y=y_uniform,
                         mode='lines',
                         marker={'color':'red'},
                         name="guide line"),
              row=1,
              col=2)

fig.update_xaxes(title_text="data values", row=1, col=1)
fig.update_xaxes(title_text="theoretical quantiles", range=[-4,4], row=1, col=2)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="observed quantiles" , row=1, col=2)




iplot(fig)

You can see that the scatter points form an s-shaped curve and that many of them do not lie on the reference line. This is a good sign that the observed data does not come from a normal distribution. The same can be seen in the histogram as well.

In fact, we can do this for many other distribution types. Below is a plotly figure comparing data drawn from several distributions to the normal distribution. For this, we are going to use ipython widgets to condense multiple plots into one interactive figure.

Poke around with the arguments of the distributions in the distributions function to generate different shapes of data. For example, change the mean and the standard deviation of the lognormal distribution to see what happens to the histogram and the qq plot. This will help you see different types of data and their qq plots.

In [8]:
def get_qqdata(observed_data):
    np.random.seed(0)
    num_divisions = observed_data.shape[0]+1
    quantiles = np.arange(1,observed_data.shape[0]+1)/num_divisions

    # scatter data 
    qq_x = sps.norm.ppf(quantiles)
    qq_y = np.sort(observed_data)

    # guide line through the first and third quartiles
    line_x0 = sps.norm.ppf(0.25)
    line_x1 = sps.norm.ppf(0.75)

    line_y0 = np.quantile(observed_data, 0.25)
    line_y1 = np.quantile(observed_data, 0.75)

    slope = (line_y1-line_y0)/(line_x1-line_x0)
    line_intercept = line_y1 - (slope*line_x1)

    # points to plot the line 
    x_line = np.arange(-3,3,0.001)
    y_line = (slope*x_line) + line_intercept

    qq_data = {'qqx': qq_x, 'qqy': qq_y, 'linex': x_line, 'liney': y_line}

    return qq_data




def dist_qqplot(dist_data, dist_name):
    qq_data = get_qqdata(dist_data)


    fig = make_subplots(rows=1, cols=2,subplot_titles=["Data histogram", "QQ plot"])
    fig.add_trace(go.Histogram(x=dist_data, name=dist_name), row=1, col=1)
    fig.update_xaxes(title_text="data values" ,row=1, col=1)
    fig.add_trace(go.Scatter(x=qq_data["qqx"] , y=qq_data["qqy"], mode='markers', marker={'color':'blue'}, name="qq data"), row=1,col=2)
    fig.add_trace(go.Scatter(x=qq_data["linex"], y=qq_data["liney"], mode='lines', marker={'color':'red'}, name="guide line"), row=1,col=2)

    
    fig.update_xaxes(title_text="theoretical quantiles", range=[-4,4], row=1, col=2)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_yaxes(title_text="observed quantiles" , row=1, col=2)
    
    return iplot(fig)



def distributions(dist): 
    
    # change parameter values here to see how the qq plot can change
    selected_data ={"triangular": np.random.triangular(10,100,115,1000),
                    'lognormal': np.random.lognormal(10,1,1000),
                    'chi square': np.random.chisquare(8,1000)}
    
    return dist_qqplot(selected_data[dist], dist )

toggle_obj = widgets.ToggleButtons(description="Distribution:", options=["triangular","lognormal", "chi square"])
interact(distributions, dist=toggle_obj);