1. Stratified Sampling Using Pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the full sales data set; every sample below is drawn from it.
complete_data = pd.read_csv('sales_data_sample.csv')

# Columns whose value combinations define the strata.
colslist = ['COUNTRY', 'PRODUCTLINE']
# Fraction of each stratum to keep in the sample.
train_size = 0.33

# Stratified sample: draw the same fraction from every (COUNTRY, PRODUCTLINE)
# group. int() truncates, so a group with fewer than 1 / train_size rows
# contributes no rows at all.
data_sample = complete_data.groupby(colslist, group_keys=False).apply(
    lambda x: x.sample(int(train_size * len(x)), random_state=1)
)

# Notebook-style inspection expressions (they only display something when run
# as notebook cells). The shapes in the comments are the outputs that were
# recorded alongside the code in the original post.
complete_data.head()
complete_data.shape  # (2823, 25)
data_sample.shape  # (865, 25)

# Slice colours used by plot_pie, in slice order.
_PIE_COLORS = [
    '#f47961', '#f0c419', '#255c61', '#78909c', '#6ad4cf',
    '#17aee8', '#5c6bc0', '#444b6e', '#ef4c60', '#744593',
    '#ee5691', '#9ccc65', '#708b75', '#d1cb65', '#0d8de1',
    '#a4554b', '#694f5d', '#45adb3', '#26a69a', '#bdc7cc',
]


def plot_pie(labels, sizes, title = ""):
    """Show a pie chart whose first slice is pulled out of the pie.

    Args:
        labels: Sequence of slice labels.
        sizes: Sequence of slice sizes, one per label.
        title: Text shown above the chart.
    """
    colors = _PIE_COLORS[:len(labels)]

    # Build the explode offsets from the actual number of slices instead of a
    # fixed 20-entry tuple, so charts with more than 20 slices do not hand
    # plt.pie an explode sequence that is shorter than the data.
    explode = [0] * len(labels)
    if explode:
        explode[0] = 0.1  # explode 1st slice

    # Plot
    plt.figure(num=None, figsize=(9, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.pie(
        sizes,
        explode=explode,
        labels=labels,
        colors=colors,
        autopct='%1.1f%%',
        shadow=True,
        startangle=140,
    )
    plt.title(title)
    plt.axis('equal')
    plt.show()


# Rows per country in the full data set, smallest count first.
pie_plot_data = complete_data.groupby('COUNTRY', as_index=False)['COUNTRY'].value_counts()
pie_plot_data.sort_values(by=['count'], inplace=True)
pie_plot_data.head()
plot_pie(
    pie_plot_data.COUNTRY.values,
    pie_plot_data['count'].values,
    'Countries Before Sampling',
)

# The same breakdown for the stratified sample, for visual comparison.
pie_plot_data = data_sample.groupby('COUNTRY', as_index=False)['COUNTRY'].value_counts()
pie_plot_data.sort_values(by=['count'], inplace=True)
pie_plot_data.head()
plot_pie(
    pie_plot_data.COUNTRY.values,
    pie_plot_data['count'].values,
    'Countries After Sampling',
)

# 2. Fixed Size Sampling With Equal Representation When Number of Records is Too Large
# Fixed-size sample: exactly one row from every (COUNTRY, PRODUCTLINE) group,
# re-indexed from 0 so the result does not carry the original row labels.
data_sample = (
    complete_data
    .groupby(colslist, group_keys=False)
    .apply(lambda group: group.sample(n=1, random_state=1))
    .reset_index(drop=True)
)

# Rows per country in the fixed-size sample, smallest count first. With one
# row per group, a country's count is the number of product lines it has.
pie_plot_data = data_sample.groupby('COUNTRY', as_index=False)['COUNTRY'].value_counts()
pie_plot_data = pie_plot_data.sort_values(by=['count'])

# Notebook-style inspection of both ends of the sorted counts.
pie_plot_data.head()
pie_plot_data.tail()

plot_pie(
    pie_plot_data.COUNTRY.values,
    pie_plot_data['count'].values,
    'Countries After Sampling',
)

# Inspect the strata columns of the rows sampled for the USA.
data_sample[data_sample['COUNTRY'] == 'USA'][colslist]
Thursday, November 3, 2022
Stratified sampling and fixed size sampling plus visualization using pie plot (Nov 2022)
Download Code And Data
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment