I am trying to create a seaborn histplot
and am almost done; however, I noticed that the categories on my x-axis are out of order.
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Snapshot of the cost -> frequency mapping, included so the problem can be
# reproduced without data.csv. It is overwritten by the Counter built below.
original_data = {
    0.0: 29076, 227.92: 26401, 473.51: 12045, 195.98: 7500, 495.0: 3750,
    53.83: 3750, 385.0: 3750, 97.08: 3750, 119.39: 3750, 118.61: 3750,
    30.0: 3750, 13000.0: 3750, 553.22: 3750, 1420.31: 3750, 1683.03: 3750,
    1360.48: 3750, 1361.16: 3750, 1486.66: 3750, 1398.5: 3750, 4324.44: 3750,
    4500.0: 3750, 1215.51: 3750, 1461.27: 3750, 772.5: 3750, 3330.0: 3750,
    915.75: 3750, 2403.1225: 3750, 1119.5: 3750, 2658.13: 3618, 492.0: 1818,
    10000.0: 1809, 0.515: 1809, 118.305: 1809, 215.0: 1809, 513.0: 1809,
    237.5: 1809, 15452.5: 1809, 377838.0: 1809, 584983.0: 1809, 10772.61: 1809,
    883.87: 1809, 110494.0: 1809, 2727.0: 1809, 1767.0: 1809, 4792.5: 1809,
    6646.5: 1809, 7323.75: 1809, 4399.5: 1809, 2737.5: 1809, 9088.5: 1809,
    6405.0: 1809, 0.36: 1809, 112.055: 1809, 247.5: 1809, 232.5: 1809,
    18000.0: 1809, 38315.0: 1809, 8100.0: 1809, 63115.34: 1809, 27551.0: 1809,
    6398.58: 1809, 78.0: 1809, 26.0: 1809, 1413.0: 1809, 2230.5: 1809,
    604.5: 1809, 4037.25: 1809, 18507.0: 1809, 732.75: 1809, 22665.0: 1809,
    12212.25: 1809, 17833.5: 1809, 4177.5: 1809, 1521.0: 1809, 2307.0: 1809,
    1873.5: 1809, 1948.5: 1809, 1182.0: 1809, 1473.0: 1695,
}

df = pd.read_csv('data.csv')
costs = df['evals'].to_numpy()
original_data = Counter(costs)

# Single source of truth for the buckets: (inclusive lower edge, exclusive
# upper edge, label). Both the per-row labels and the display order are derived
# from this table so the two can never disagree.
buckets = [
    (0, 100, '<$100'),
    (100, 500, '<$500 and >= $100'),
    (500, 2000, '<$2000 and >= $500'),
    (2000, 5000, '<$5000 and >= $2000'),
    (5000, 10000, '<$10000 and >= $5000'),
    (10000, 20000, '<$20000 and >= $10000'),
    (20000, 40000, '<$40000 and >= $20000'),
]
overflow_label = '>= $40000'


def bucket_label(cost):
    """Return the label of the bucket that contains ``cost``.

    Values matching no bucket (>= 40000, and also negative or NaN values,
    exactly as in the original if/elif chain) get ``overflow_label``.
    """
    for lower, upper, label in buckets:
        if lower <= cost < upper:
            return label
    return overflow_label


# One label per row of the CSV, in row order.
new = [bucket_label(c) for c in costs]

# Desired left-to-right order of the categories, including the overflow bucket.
order = [label for _, _, label in buckets] + [overflow_label]
# Create the (wide) figure first, then switch on the seaborn theme; the axes
# that histplot draws into are created afterwards.
plt.figure(figsize=(20, 8))
sns.set_style("darkgrid")

# Share of rows per bucket label, with a KDE overlay requested.
histogram_options = {'stat': 'probability', 'kde': True}
sns.histplot(data=new, **histogram_options)

plt.show()
Adding order
argument as shown here creates the following error(s):
Traceback (most recent call last): File "c:Userswundermahneval_plots.py", line 28, in <module> sns.histplot(data=new, stat='probability', kde=True, order=order) File "C:Python367-64libsite-packagesseaborndistributions.py", line 1435, in histplot **kwargs, File "C:Python367-64libsite-packagesseaborndistributions.py", line 508, in plot_univariate_histogram scout = self.ax.fill_between([], [], color=color, **plot_kws) File "C:Python367-64libsite-packagesmatplotlib__init__.py", line 1565, in inner return func(ax, *map(sanitize_sequence, args), **kwargs) File "C:Python367-64libsite-packagesmatplotlibaxes_axes.py", line 5229, in fill_between collection = mcoll.PolyCollection(polys, **kwargs) File "C:Python367-64libsite-packagesmatplotlibcollections.py", line 1072, in __init__ Collection.__init__(self, **kwargs) File "C:Python367-64libsite-packagesmatplotlibcollections.py", line 164, in __init__ self.update(kwargs) File "C:Python367-64libsite-packagesmatplotlibartist.py", line 1006, in update ret = [_update_property(self, k, v) for k, v in props.items()] File "C:Python367-64libsite-packagesmatplotlibartist.py", line 1006, in <listcomp> ret = [_update_property(self, k, v) for k, v in props.items()] File "C:Python367-64libsite-packagesmatplotlibartist.py", line 1002, in _update_property .format(type(self).__name__, k)) AttributeError: 'PolyCollection' object has no property 'order'
How can I force that order on my x-axis?
Advertisement
Answer
You could create a bar plot, using np.histogram
to count how many values are in each bin. The bins need to be set explicitly, as they aren’t equally spaced.
Using sns.histplot
directly on the costs
array would show bars with all different widths, which looks quite confusing. Also note that you can’t show a kde when the x-axis isn’t numeric.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.ticker import PercentFormatter, ScalarFormatter

# Mapping of cost -> number of rows that have that cost.
original_data = {
    0.0: 29076, 227.92: 26401, 473.51: 12045, 195.98: 7500, 495.0: 3750,
    53.83: 3750, 385.0: 3750, 97.08: 3750, 119.39: 3750, 118.61: 3750,
    30.0: 3750, 13000.0: 3750, 553.22: 3750, 1420.31: 3750, 1683.03: 3750,
    1360.48: 3750, 1361.16: 3750, 1486.66: 3750, 1398.5: 3750, 4324.44: 3750,
    4500.0: 3750, 1215.51: 3750, 1461.27: 3750, 772.5: 3750, 3330.0: 3750,
    915.75: 3750, 2403.1225: 3750, 1119.5: 3750, 2658.13: 3618, 492.0: 1818,
    10000.0: 1809, 0.515: 1809, 118.305: 1809, 215.0: 1809, 513.0: 1809,
    237.5: 1809, 15452.5: 1809, 377838.0: 1809, 584983.0: 1809, 10772.61: 1809,
    883.87: 1809, 110494.0: 1809, 2727.0: 1809, 1767.0: 1809, 4792.5: 1809,
    6646.5: 1809, 7323.75: 1809, 4399.5: 1809, 2737.5: 1809, 9088.5: 1809,
    6405.0: 1809, 0.36: 1809, 112.055: 1809, 247.5: 1809, 232.5: 1809,
    18000.0: 1809, 38315.0: 1809, 8100.0: 1809, 63115.34: 1809, 27551.0: 1809,
    6398.58: 1809, 78.0: 1809, 26.0: 1809, 1413.0: 1809, 2230.5: 1809,
    604.5: 1809, 4037.25: 1809, 18507.0: 1809, 732.75: 1809, 22665.0: 1809,
    12212.25: 1809, 17833.5: 1809, 4177.5: 1809, 1521.0: 1809, 2307.0: 1809,
    1873.5: 1809, 1948.5: 1809, 1182.0: 1809, 1473.0: 1695,
}

# The keys are the costs and the values are how often each cost occurs.
# Histogramming the values would bin the frequencies, so bin the keys and use
# the frequencies as weights instead.
cost_values = np.array(list(original_data.keys()), dtype=float)
cost_counts = np.array(list(original_data.values()))

# NOTE(review): `costs` keeps its original definition for any later code that
# still reads it. Be aware that it holds the frequencies, not the cost values.
costs = list(original_data.values())

# Unequal bin edges; the last edge only needs to exceed the largest cost.
bins = [0, 100, 500, 2000, 5000, 10000, 20000, 40000, 1000000]
bin_values, bin_edges = np.histogram(cost_values, bins=bins, weights=cost_counts)

# Interior bins read "< upper and >= lower" (split over two lines to keep the
# tick labels narrow); the first and last bins are open-ended.
labels = [f'< ${upper} and\n>= ${lower}'
          for lower, upper in zip(bins[1:-2], bins[2:-1])]
labels = [f'< ${bins[1]}'] + labels + [f'>= ${bins[-2]}']

fig, ax = plt.subplots(figsize=(12, 4))
# Bars are drawn in the order of `labels`, i.e. in ascending bin order.
sns.barplot(x=labels, y=bin_values / bin_values.sum(), color='dodgerblue', ax=ax)
ax.yaxis.set_major_formatter(PercentFormatter(1))  # fractions -> percentages
plt.show()
Alternatively, sns.histplot()
could be displayed with a logarithmic x-axis to make the bar widths more equal while maintaining a numeric axis. In that case a kde could be calculated on the logs of the values.
from scipy.stats import gaussian_kde

# `original_data` maps cost -> number of rows with that cost. Use the keys as
# the observations and the values as their weights, so neither the histogram
# nor the KDE is accidentally computed on the frequencies.
cost_values = np.array(list(original_data.keys()), dtype=float)
cost_counts = np.array(list(original_data.values()), dtype=float)

bins = [0, 100, 500, 2000, 5000, 10000, 20000, 40000, 100000]
fig, ax = plt.subplots(figsize=(12, 4))
sns.histplot(x=cost_values, weights=cost_counts, bins=bins,
             stat='probability', ec='black', lw=1, ax=ax)

# The KDE is estimated on log(cost), so zero (or negative) costs must be left
# out: log(0) is -inf and would make the estimate unusable.
positive = cost_values > 0
kde = gaussian_kde(np.log(cost_values[positive]), weights=cost_counts[positive])
xs = np.logspace(2, np.log10(bins[-1]), 500)
ax.plot(xs, kde(np.log(xs)), color='crimson')

# Logarithmic x-axis with plain-number ticks exactly at the bin edges.
ax.set_xscale('log')
ax.set_xticks(bins[1:-1])
ax.set_xticks([], minor=True)
ax.xaxis.set_major_formatter(ScalarFormatter())
ax.yaxis.set_major_formatter(PercentFormatter(1))
plt.show()