I’m trying to plot a normal distribution curve for a set of values. Unfortunately, the below code (taken from this post) doesn’t seem to be plotting the curve correctly over the histograms (please refer attached image). I’m sure I’m missing something or have done something silly but can’t seem to figure out. Can someone please help? I’ve included my code below – I’m getting the values from a dataframe but have included these as a list s
for convenience:
import numpy as np import scipy import pandas as pd from scipy.stats import norm import matplotlib.pyplot as plt from matplotlib.mlab import normpdf mu = 0 sigma = 1 n_bins = 50 s = [8, 8, 4, 4, 1, 14, 0, 10, 1, 4, 21, 9, 5, 2, 7, 6, 7, 9, 7, 3, 3, 4, 7, 9, 9, 4, 10, 8, 10, 10, 7, 10, 1, 8, 7, 8, 1, 7, 4, 15, 8, 1, 1, 6, 7, 3, 8, 8, 8, 4][![enter image description here][1]][1] fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True) #histogram n, bins, patches = axes[1].hist(s, n_bins, normed=True, alpha=.1, edgecolor='black' ) pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2)) print(pdf) median, q1, q3 = np.percentile(s, 50), np.percentile(s, 25), np.percentile(s, 75) #probability density function axes[1].plot(bins, pdf, color='orange', alpha=.6) #to ensure pdf and bins line up to use fill_between. bins_1 = bins[(bins >= q1-1.5*(q3-q1)) & (bins <= q1)] # to ensure fill starts from Q1-1.5*IQR bins_2 = bins[(bins <= q3+1.5*(q3-q1)) & (bins >= q3)] pdf_1 = pdf[:int(len(pdf)/2)] pdf_2 = pdf[int(len(pdf)/2):] pdf_1 = pdf_1[(pdf_1 >= norm(mu,sigma).pdf(q1-1.5*(q3-q1))) & (pdf_1 <= norm(mu,sigma).pdf(q1))] pdf_2 = pdf_2[(pdf_2 >= norm(mu,sigma).pdf(q3+1.5*(q3-q1))) & (pdf_2 <= norm(mu,sigma).pdf(q3))] #fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR #axes[1].fill_between(bins_1, pdf_1, 0, alpha=.6, color='orange') #axes[1].fill_between(bins_2, pdf_2, 0, alpha=.6, color='orange') #add text to bottom graph. axes[1].annotate("{:.1f}%".format(100*norm(mu, sigma).cdf(q1)), xy=((q1-1.5*(q3-q1)+q1)/2, 0), ha='center') axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3)-norm(mu, sigma).cdf(q1))), xy=(median, 0), ha='center') axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+1.5*(q3-q1)-q3)-norm(mu, sigma).cdf(q3))), xy=((q3+1.5*(q3-q1)+q3)/2, 0), ha='center') axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center') axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center') axes[1].set_ylabel('Probability Density') #top boxplot axes[0].boxplot(s, 0, 'gD', vert=False) axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5) axes[0].axis('off') plt.rcParams["figure.figsize"] = (10,10) plt.subplots_adjust(hspace=0) plt.show()
Advertisement
Answer
You have set mu
and sigma
arbitrarily to 0
and 1
respectively but you should calculate it for your actual data:
data = pd.Series(s) mu = data.mean() sigma = data.std()
Update with full working example:
import numpy as np import scipy import pandas as pd from scipy.stats import norm import matplotlib.pyplot as plt n_bins = 50 s = [8, 8, 4, 4, 1, 14, 0, 10, 1, 4, 21, 9, 5, 2, 7, 6, 7, 9, 7, 3, 3, 4, 7, 9, 9, 4, 10, 8, 10, 10, 7, 10, 1, 8, 7, 8, 1, 7, 4, 15, 8, 1, 1, 6, 7, 3, 8, 8, 8, 4] fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True) #histogram n, bins, patches = axes[1].hist(s, n_bins, density=True, alpha=.1, edgecolor='black' ) data = pd.Series(s) mu = data.mean() sigma = data.std() pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2)) median, q1, q3 = np.percentile(s, 50), np.percentile(s, 25), np.percentile(s, 75) #probability density function axes[1].plot(bins, pdf, color='orange', alpha=.6) #fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR iqr = 1.5 * (q3-q1) x1 = np.linspace(q1 - iqr, q1) x2 = np.linspace(q3, q3 + iqr) pdf1 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(x1-mu)**2/(2*sigma**2)) pdf2 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(x2-mu)**2/(2*sigma**2)) axes[1].fill_between(x1, pdf1, 0, alpha=.6, color='orange') axes[1].fill_between(x2, pdf2, 0, alpha=.6, color='orange') #add text to bottom graph. axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q1) -norm(mu, sigma).cdf(q1-iqr))), xy=(q1-iqr/2, 0), ha='center') axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3) -norm(mu, sigma).cdf(q1) )), xy=(median , 0), ha='center') axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+iqr)-norm(mu, sigma).cdf(q3) )), xy=(q3+iqr/2, 0), ha='center') axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center') axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center') axes[1].set_ylabel('Probability Density') #top boxplot axes[0].boxplot(s, 0, 'gD', vert=False) axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5) axes[0].axis('off')