Thursday, November 3, 2022

Stratified Sampling and Fixed-Size Sampling, Plus Visualization Using Pie Charts (Nov 2022)

Download Code And Data

1. Stratified Sampling Using Pandas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the full sales dataset; every sample below is drawn from this frame.
complete_data = pd.read_csv('sales_data_sample.csv')

# Each distinct (COUNTRY, PRODUCTLINE) combination is treated as one stratum.
colslist = ['COUNTRY', 'PRODUCTLINE']
# Fraction of every stratum that is kept in the sample.
train_size = 0.33


def _sample_stratum(stratum):
    """Return a reproducible random subset of one stratum.

    The subset size is ``train_size`` of the stratum's row count, truncated
    to an integer, so a very small stratum can contribute zero rows.
    """
    rows_to_draw = int(train_size * len(stratum))
    return stratum.sample(rows_to_draw, random_state=1)


# group_keys=False keeps the original row index instead of prepending the
# group labels to it.
data_sample = complete_data.groupby(colslist, group_keys=False).apply(_sample_stratum)

complete_data.head()
complete_data.shape  # output shown in the original post: (2823, 25)

data_sample.shape  # output shown in the original post: (865, 25)


def plot_pie(labels, sizes, title = ""):
    """Draw a pie chart with the first slice pulled out of the pie.

    Args:
        labels: Sequence of slice labels, one per slice.
        sizes: Sequence of slice sizes, in the same order as ``labels``.
        title: Text shown above the chart.

    The figure is displayed immediately with ``plt.show()``; nothing is
    returned.
    """
    colors = [
        '#f47961', '#f0c419', '#255c61', '#78909c', '#6ad4cf',
        '#17aee8', '#5c6bc0', '#444b6e', '#ef4c60', '#744593',
        '#ee5691', '#9ccc65', '#708b75', '#d1cb65', '#0d8de1',
        '#a4554b', '#694f5d', '#45adb3', '#26a69a', '#bdc7cc',
    ]
    # With more labels than palette entries the slice leaves the list as is;
    # matplotlib then cycles through the colours.
    colors = colors[0:len(labels)]

    # Build the offsets from the actual number of slices. A fixed-length
    # tuple would be too short for more than 20 labels, and matplotlib
    # rejects an ``explode`` whose length differs from the data.
    explode = [0.0] * len(labels)
    if explode:
        explode[0] = 0.1  # explode 1st slice

    # Plot
    plt.figure(num=None, figsize=(9, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.pie(
        sizes,
        explode=explode,
        labels=labels,
        colors=colors,
        autopct='%1.1f%%',
        shadow=True,
        startangle=140,
    )
    plt.title(title)
    plt.axis('equal')
    plt.show()


# Number of records per country in the complete dataset, smallest first.
pie_plot_data = complete_data.groupby('COUNTRY', as_index=False)['COUNTRY'].value_counts()
pie_plot_data.sort_values(by=['count'], inplace = True)
pie_plot_data.head()
# Share of records per country in the complete dataset.
plot_pie(
    pie_plot_data['COUNTRY'].values,
    pie_plot_data['count'].values,
    title='Countries Before Sampling',
)

# Recount the records per country on the stratified sample, smallest first,
# so the two charts can be compared.
pie_plot_data = (
    data_sample
    .groupby('COUNTRY', as_index=False)['COUNTRY']
    .value_counts()
    .sort_values(by='count')
)
pie_plot_data.head()

plot_pie(
    pie_plot_data['COUNTRY'].values,
    pie_plot_data['count'].values,
    title='Countries After Sampling',
)

2. Fixed Size Sampling With Equal Representation When Number of Records is Too Large

# Draw exactly one row from every (COUNTRY, PRODUCTLINE) combination, then
# discard the original row labels in favour of a fresh 0..n-1 index.
data_sample = (
    complete_data
    .groupby(colslist, group_keys=False)
    .apply(lambda stratum: stratum.sample(n=1, random_state=1))
    .reset_index(drop=True)
)

# With one row per combination, a country's count equals the number of
# product lines it has in the data.
pie_plot_data = (
    data_sample
    .groupby('COUNTRY', as_index=False)['COUNTRY']
    .value_counts()
    .sort_values(by='count')
)
pie_plot_data.head()

pie_plot_data.tail()

plot_pie(
    pie_plot_data['COUNTRY'].values,
    pie_plot_data['count'].values,
    title='Countries After Sampling',
)

# Inspect which product lines were sampled for the USA.
data_sample.loc[data_sample['COUNTRY'] == 'USA', colslist]
Tags: Technology, Data Visualization, Machine Learning

No comments:

Post a Comment