In [2]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
In [3]:
df1 = pd.read_csv('sales_data_sample.csv')
In [4]:
df1.shape
Out[4]:
(2823, 25)
In [5]:
df1.columns
Out[5]:
Index(['ORDERNUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER', 'SALES', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID', 'PRODUCTLINE', 'MSRP', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE', 'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE', 'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME', 'DEALSIZE'], dtype='object')
In [6]:
df1.head()
Out[6]:
ORDERNUMBER | QUANTITYORDERED | PRICEEACH | ORDERLINENUMBER | SALES | ORDERDATE | STATUS | QTR_ID | MONTH_ID | YEAR_ID | ... | ADDRESSLINE1 | ADDRESSLINE2 | CITY | STATE | POSTALCODE | COUNTRY | TERRITORY | CONTACTLASTNAME | CONTACTFIRSTNAME | DEALSIZE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10107 | 30 | 95.70 | 2 | 2871.00 | 2/24/2003 0:00 | Shipped | 1 | 2 | 2003 | ... | 897 Long Airport Avenue | NaN | NYC | NY | 10022 | USA | NaN | Yu | Kwai | Small |
1 | 10121 | 34 | 81.35 | 5 | 2765.90 | 5/7/2003 0:00 | Shipped | 2 | 5 | 2003 | ... | 59 rue de l'Abbaye | NaN | Reims | NaN | 51100 | France | EMEA | Henriot | Paul | Small |
2 | 10134 | 41 | 94.74 | 2 | 3884.34 | 7/1/2003 0:00 | Shipped | 3 | 7 | 2003 | ... | 27 rue du Colonel Pierre Avia | NaN | Paris | NaN | 75508 | France | EMEA | Da Cunha | Daniel | Medium |
3 | 10145 | 45 | 83.26 | 6 | 3746.70 | 8/25/2003 0:00 | Shipped | 3 | 8 | 2003 | ... | 78934 Hillside Dr. | NaN | Pasadena | CA | 90003 | USA | NaN | Young | Julie | Medium |
4 | 10159 | 49 | 100.00 | 14 | 5205.27 | 10/10/2003 0:00 | Shipped | 4 | 10 | 2003 | ... | 7734 Strong St. | NaN | San Francisco | CA | NaN | USA | NaN | Brown | Julie | Medium |
5 rows × 25 columns
In [7]:
bin_columns = ['PRICEEACH', 'SALES']
In [8]:
for i, col in enumerate(bin_columns):
plt.figure(i)
sns.histplot(data=df1, x = col)
In [9]:
sns.histplot(data=df1, x = 'SALES', binwidth = 2000)
Out[9]:
<AxesSubplot:xlabel='SALES', ylabel='Count'>
In [10]:
bins = [0, 2000, 4000, 6000, 8000, 10000, 12000, 14000]
df1['bins'] = pd.cut(df1['SALES'], bins)
In [11]:
df1['bins'].value_counts()
Out[11]:
(2000, 4000] 1340 (4000, 6000] 624 (0, 2000] 565 (6000, 8000] 215 (8000, 10000] 63 (10000, 12000] 13 (12000, 14000] 2 Name: bins, dtype: int64
In [ ]: