I’m trying to plot a normal distribution curve for a set of values. Unfortunately, the below code (taken from this post) doesn’t seem to be plotting the curve correctly over the histograms (please refer attached image). I’m sure I’m missing something or have done something silly but can’t seem to figure out. Can someone please help? I’ve included my code below – I’m getting the values from a dataframe but have included these as a list
s
for convenience:
JavaScript
x
52
52
1
import numpy as np
2
import scipy
3
import pandas as pd
4
from scipy.stats import norm
5
import matplotlib.pyplot as plt
6
from matplotlib.mlab import normpdf
7
mu = 0
8
sigma = 1
9
n_bins = 50
10
s = [8, 8, 4, 4, 1, 14, 0, 10, 1, 4, 21, 9, 5, 2, 7, 6, 7, 9, 7, 3, 3, 4, 7, 9, 9, 4, 10, 8, 10, 10, 7, 10, 1, 8, 7, 8, 1, 7, 4, 15, 8, 1, 1, 6, 7, 3, 8, 8, 8, 4][![enter image description here][1]][1]
11
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
12
13
#histogram
14
n, bins, patches = axes[1].hist(s, n_bins, normed=True, alpha=.1, edgecolor='black' )
15
pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
16
print(pdf)
17
median, q1, q3 = np.percentile(s, 50), np.percentile(s, 25), np.percentile(s, 75)
18
19
#probability density function
20
axes[1].plot(bins, pdf, color='orange', alpha=.6)
21
22
#to ensure pdf and bins line up to use fill_between.
23
bins_1 = bins[(bins >= q1-1.5*(q3-q1)) & (bins <= q1)] # to ensure fill starts from Q1-1.5*IQR
24
bins_2 = bins[(bins <= q3+1.5*(q3-q1)) & (bins >= q3)]
25
pdf_1 = pdf[:int(len(pdf)/2)]
26
pdf_2 = pdf[int(len(pdf)/2):]
27
pdf_1 = pdf_1[(pdf_1 >= norm(mu,sigma).pdf(q1-1.5*(q3-q1))) & (pdf_1 <= norm(mu,sigma).pdf(q1))]
28
pdf_2 = pdf_2[(pdf_2 >= norm(mu,sigma).pdf(q3+1.5*(q3-q1))) & (pdf_2 <= norm(mu,sigma).pdf(q3))]
29
30
#fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
31
#axes[1].fill_between(bins_1, pdf_1, 0, alpha=.6, color='orange')
32
#axes[1].fill_between(bins_2, pdf_2, 0, alpha=.6, color='orange')
33
34
#add text to bottom graph.
35
axes[1].annotate("{:.1f}%".format(100*norm(mu, sigma).cdf(q1)), xy=((q1-1.5*(q3-q1)+q1)/2, 0), ha='center')
36
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3)-norm(mu, sigma).cdf(q1))), xy=(median, 0), ha='center')
37
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+1.5*(q3-q1)-q3)-norm(mu, sigma).cdf(q3))), xy=((q3+1.5*(q3-q1)+q3)/2, 0), ha='center')
38
axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center')
39
axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center')
40
41
axes[1].set_ylabel('Probability Density')
42
43
#top boxplot
44
axes[0].boxplot(s, 0, 'gD', vert=False)
45
axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
46
axes[0].axis('off')
47
48
plt.rcParams["figure.figsize"] = (10,10)
49
50
plt.subplots_adjust(hspace=0)
51
plt.show()
52
Advertisement
Answer
You have set mu
and sigma
arbitrarily to 0
and 1
respectively but you should calculate it for your actual data:
JavaScript
1
4
1
data = pd.Series(s)
2
mu = data.mean()
3
sigma = data.std()
4
Update with full working example:
JavaScript
1
43
43
1
import numpy as np
2
import scipy
3
import pandas as pd
4
from scipy.stats import norm
5
import matplotlib.pyplot as plt
6
n_bins = 50
7
s = [8, 8, 4, 4, 1, 14, 0, 10, 1, 4, 21, 9, 5, 2, 7, 6, 7, 9, 7, 3, 3, 4, 7, 9, 9, 4, 10, 8, 10, 10, 7, 10, 1, 8, 7, 8, 1, 7, 4, 15, 8, 1, 1, 6, 7, 3, 8, 8, 8, 4]
8
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
9
10
#histogram
11
n, bins, patches = axes[1].hist(s, n_bins, density=True, alpha=.1, edgecolor='black' )
12
data = pd.Series(s)
13
mu = data.mean()
14
sigma = data.std()
15
pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
16
median, q1, q3 = np.percentile(s, 50), np.percentile(s, 25), np.percentile(s, 75)
17
18
#probability density function
19
axes[1].plot(bins, pdf, color='orange', alpha=.6)
20
21
#fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
22
iqr = 1.5 * (q3-q1)
23
x1 = np.linspace(q1 - iqr, q1)
24
x2 = np.linspace(q3, q3 + iqr)
25
pdf1 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(x1-mu)**2/(2*sigma**2))
26
pdf2 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(x2-mu)**2/(2*sigma**2))
27
axes[1].fill_between(x1, pdf1, 0, alpha=.6, color='orange')
28
axes[1].fill_between(x2, pdf2, 0, alpha=.6, color='orange')
29
30
#add text to bottom graph.
31
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q1) -norm(mu, sigma).cdf(q1-iqr))), xy=(q1-iqr/2, 0), ha='center')
32
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3) -norm(mu, sigma).cdf(q1) )), xy=(median , 0), ha='center')
33
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+iqr)-norm(mu, sigma).cdf(q3) )), xy=(q3+iqr/2, 0), ha='center')
34
axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center')
35
axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center')
36
37
axes[1].set_ylabel('Probability Density')
38
39
#top boxplot
40
axes[0].boxplot(s, 0, 'gD', vert=False)
41
axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
42
axes[0].axis('off')
43