Scaling vs. Normalization: What's the difference?
One of the reasons it's easy to get confused between scaling and normalization is that the terms are sometimes used interchangeably and, to make it even more confusing, they are very similar! In both cases, you're transforming the values of numeric variables so that the transformed data points have specific helpful properties. The difference is that:
- in scaling, you're changing the range of your data, while
- in normalization, you're changing the shape of the distribution of your data.
A minimal sketch contrasting the two is shown below. Ref: Kaggle (Scaling and Normalization)
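To see the difference concretely, here is a small sketch (my own toy example, not from the referenced Kaggle notebook; the exponential data and the Box-Cox transform are assumptions): scaling squeezes the values into [0, 1] without changing the histogram's shape, while normalization changes the shape itself.

```python
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(0)
data = rng.exponential(size=1000)   # right-skewed toy data (assumed example)

# Scaling: the range becomes [0, 1] but the histogram keeps its skewed shape.
scaled = MinMaxScaler().fit_transform(data.reshape(-1, 1)).ravel()

# Normalization: Box-Cox reshapes the distribution to look more normal.
normalized, _ = stats.boxcox(data)

print(data.min(), data.max())                    # original range
print(scaled.min(), scaled.max())                # 0.0 1.0
print(stats.skew(data), stats.skew(normalized))  # skewness shrinks after Box-Cox
```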
Why scale data? Example:
Student | Marks1 | Marks2 | Marks3 |
---|---|---|---|
Student 1 | 280 | 70 | 60 |
Student 2 | 200 | 60 | 55 |
Student 3 | 270 | 40 | 30 |
Why do we have to scale / normalize the input for an artificial neural network?
There are two reasons why we have to normalize input features before feeding them to a neural network.
Reason 1: If one feature in the dataset has a much bigger scale than the others, that feature dominates and, as a result, the predictions of the neural network will not be accurate. Example: in employee data, Age is a two-digit number while Salary can be 7 or 8 digits (1 million, etc.). In that case, Salary will dominate the prediction of the neural network. But if we normalize those features, the values of both will lie in the range 0 to 1 (sketched below).
Reason 2: Forward propagation in a neural network involves the dot product of the weights with the input features. So, if the values are very high (for image and non-image data alike), calculating the output takes a lot of computation time and memory. The same is true during backpropagation. Consequently, the model converges slowly if the inputs are not normalized. Example: in image classification the input is large, since each pixel value ranges from 0 to 255, so normalization is very important there.
Instances where normalization is very important:
- K-Means
- K-Nearest Neighbours
- Principal Component Analysis (PCA)
- Gradient Descent
Ref: stackoverflow
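A minimal sketch of Reason 1 (the employee numbers below are made up, and min-max scaling is just one possible way to bring the features onto a common scale):

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Hypothetical employee data: Age is two digits, Salary is seven digits.
X = np.array([[25, 1_200_000],
              [32, 2_500_000],
              [45,   900_000],
              [51, 3_100_000]], dtype=float)

# After min-max scaling, both columns lie in [0, 1], so neither feature
# dominates the dot products computed inside the network.
X_scaled = MinMaxScaler().fit_transform(X)
print(X_scaled)
```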
A z-score tells you how many standard deviations a data point lies from the mean; in the two-subject marks example under "Standard Scaler" below, the z-score works out to 0.67 in both cases. But what if there is an outlier in the data? When there is an outlier (on the higher side), MinMaxScaler maps the outlier to 1 and squashes all the other points to values near 0. So, if you suspect that there are outliers in the data, don't use MinMaxScaler; use StandardScaler. Now, in code:
Min Max Scaler
In [8]:
import pandas as pd
import statsmodels as sm
In [2]:
df = pd.read_csv('student_marks.csv')
In [4]:
df
Out[4]:
 | Student | Marks1 | Marks2 | Marks3 |
---|---|---|---|---|
0 | 1 | 280 | 70 | 60 |
1 | 2 | 200 | 60 | 55 |
2 | 3 | 270 | 40 | 30 |
3 | Harshitha | 250 | 55 | 46 |
4 | Yaju | 260 | 54 | 45 |
5 | Sahaj | 230 | 55 | 35 |
In [7]:
# Short summary
df['Marks1'].describe()
Out[7]:
count      6.000000
mean     248.333333
std       29.268869
min      200.000000
25%      235.000000
50%      255.000000
75%      267.500000
max      280.000000
Name: Marks1, dtype: float64
In [9]:
import statsmodels.stats.descriptivestats as ds
In [11]:
# Long summary
desc_stats = ds.describe(df['Marks1'])
print("desc_stats using statsmodels : ", desc_stats)
desc_stats using statsmodels :                  Marks1
nobs                  6.000000
missing               0.000000
mean                248.333333
std_err               4.878145
upper_ci            257.894321
lower_ci            238.772345
std                  29.268869
iqr                  32.500000
iqr_normal           24.092286
mad                  22.222222
mad_normal           27.851425
coef_var              0.117861
range                80.000000
max                 280.000000
min                 200.000000
skew                 -0.660190
kurtosis              2.228664
jarque_bera           0.584591
jarque_bera_pval      0.746548
mode                200.000000
mode_freq             0.166667
median              255.000000
1%                  201.500000
5%                  207.500000
10%                 215.000000
25%                 235.000000
50%                 255.000000
75%                 267.500000
90%                 275.000000
95%                 277.500000
99%                 279.500000
/home/ashish/anaconda3/lib/python3.9/site-packages/statsmodels/stats/descriptivestats.py:418: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning. mode_res = stats.mode(ser.dropna())
In [14]:
marks_col = ['Marks1', 'Marks2', 'Marks3']
for i in marks_col:
    minmarks = df[i].describe().loc['min']
    maxmarks = df[i].describe().loc['max']
    print(i, minmarks, maxmarks)
Marks1 200.0 280.0
Marks2 40.0 70.0
Marks3 30.0 60.0
In [17]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_marks = scaler.fit_transform(df[marks_col])
scaled_marks = pd.DataFrame(scaled_marks)
In [19]:
scaled_marks.columns = ['Scaled1', 'Scaled2', 'Scaled3']
In [20]:
scaled_marks
Out[20]:
 | Scaled1 | Scaled2 | Scaled3 |
---|---|---|---|
0 | 1.000 | 1.000000 | 1.000000 |
1 | 0.000 | 0.666667 | 0.833333 |
2 | 0.875 | 0.000000 | 0.000000 |
3 | 0.625 | 0.500000 | 0.533333 |
4 | 0.750 | 0.466667 | 0.500000 |
5 | 0.375 | 0.500000 | 0.166667 |
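For reference (my own note, not part of the original notebook): MinMaxScaler computes x' = (x - min) / (max - min) column by column, which you can check by hand for one cell.

```python
# Marks1 for the student in row 2 is 270; the column min is 200 and the max is 280.
print((270 - 200) / (280 - 200))   # 0.875, matching row 2 of scaled_marks
```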
In [ ]:
With an outlier in the data: MinMaxScaler squashes the other values
In [21]:
dfo = pd.read_csv('student_marks_outlier.csv')
In [22]:
dfo
Out[22]:
 | Student | Marks1 | Marks2 | Marks3 |
---|---|---|---|---|
0 | 1 | 280 | 70 | 60 |
1 | 2 | 200 | 60 | 55 |
2 | 3 | 270 | 40 | 30 |
3 | Harshitha | 250 | 55 | 46 |
4 | Yaju | 260 | 54 | 45 |
5 | Sahaj | 230 | 55 | 35 |
6 | Outlier | 5000 | 5000 | 5000 |
In [23]:
scaled_marks = scaler.fit_transform(dfo[marks_col])
scaled_marks = pd.DataFrame(scaled_marks)
In [24]:
scaled_marks
Out[24]:
 | 0 | 1 | 2 |
---|---|---|---|
0 | 0.016667 | 0.006048 | 0.006036 |
1 | 0.000000 | 0.004032 | 0.005030 |
2 | 0.014583 | 0.000000 | 0.000000 |
3 | 0.010417 | 0.003024 | 0.003219 |
4 | 0.012500 | 0.002823 | 0.003018 |
5 | 0.006250 | 0.003024 | 0.001006 |
6 | 1.000000 | 1.000000 | 1.000000 |
Thing to remember: Do not use MinMaxScaler on data with outliers.
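As a hedged follow-up (my own sketch, reusing dfo and marks_col from above), here is what StandardScaler does to the same data: the outlier shows up as a large positive z-score of roughly 2.4 to 2.5, while the remaining rows sit around -0.4 instead of being forced into [0, 1].

```python
from sklearn.preprocessing import StandardScaler

# Standardize the same outlier DataFrame for comparison with MinMaxScaler above.
std_scaled = pd.DataFrame(StandardScaler().fit_transform(dfo[marks_col]),
                          columns=['Std1', 'Std2', 'Std3'])
print(std_scaled.round(2))
```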
In [ ]:
Standard Scaler
Q1: Which subject is easier?
In [1]:
import statistics as st
In [2]:
import pandas as pd
In [3]:
df = pd.read_csv('marks.csv')
In [4]:
df
Out[4]:
 | subject | marks | mean | std |
---|---|---|---|---|
0 | Sub1 | 70 | 60 | 15 |
1 | Sub2 | 72 | 68 | 6 |
Now assume there are 100K records in this format. You would not compute the z-score for each record by hand; you need something automated.
In [7]:
def get_zscore(in_row):
    # z-score: how many standard deviations the marks are from the subject's mean
    return (in_row['marks'] - in_row['mean']) / in_row['std']

df['zscore'] = df.apply(get_zscore, axis=1)
In [8]:
df
Out[8]:
 | subject | marks | mean | std | zscore |
---|---|---|---|---|---|
0 | Sub1 | 70 | 60 | 15 | 0.666667 |
1 | Sub2 | 72 | 68 | 6 | 0.666667 |
In [13]:
# How many std. away is the data point from the mean?
# This is given by zscore.
print((70-60)/15)
print((72-68)/6)
0.6666666666666666
0.6666666666666666
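Answer to Q1: since both z-scores come out to about 0.67, the performance is 0.67 standard deviations above the class mean in each subject, so judged by z-score alone neither subject stands out as easier.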
In [ ]:
Q2: Applying z-score normalization using scikit-learn on a Pandas DataFrame
In [1]:
import pandas as pd
In [2]:
df = pd.read_csv('age_fat.csv')
In [3]:
df
Out[3]:
 | age | %fat |
---|---|---|
0 | 23 | 9.5 |
1 | 23 | 26.5 |
2 | 27 | 7.8 |
3 | 27 | 17.8 |
4 | 39 | 31.4 |
5 | 41 | 25.9 |
6 | 47 | 27.4 |
7 | 49 | 27.2 |
8 | 50 | 31.2 |
9 | 52 | 34.6 |
10 | 54 | 42.5 |
11 | 54 | 28.8 |
12 | 56 | 33.4 |
13 | 57 | 30.2 |
14 | 58 | 34.1 |
15 | 58 | 32.9 |
16 | 60 | 41.2 |
17 | 61 | 35.7 |
In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data = scaler.fit_transform(df)
In [9]:
df_scaled = pd.DataFrame(data.round(2), columns = ['age', 'fat'])
In [10]:
df_scaled
Out[10]:
 | age | fat |
---|---|---|
0 | -1.83 | -2.14 |
1 | -1.83 | -0.25 |
2 | -1.51 | -2.33 |
3 | -1.51 | -1.22 |
4 | -0.58 | 0.29 |
5 | -0.42 | -0.32 |
6 | 0.04 | -0.15 |
7 | 0.20 | -0.18 |
8 | 0.28 | 0.27 |
9 | 0.43 | 0.65 |
10 | 0.59 | 1.53 |
11 | 0.59 | 0.00 |
12 | 0.74 | 0.51 |
13 | 0.82 | 0.16 |
14 | 0.90 | 0.59 |
15 | 0.90 | 0.46 |
16 | 1.06 | 1.38 |
17 | 1.13 | 0.77 |
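A small hand check (my own addition, using the df loaded above): StandardScaler standardizes each column with its mean and population standard deviation (ddof=0), so the first scaled age can be reproduced directly.

```python
mu = df['age'].mean()                # ≈ 46.44
sigma = df['age'].std(ddof=0)        # ≈ 12.85 (population std, as StandardScaler uses)
print(round((23 - mu) / sigma, 2))   # -1.83, matching row 0 of df_scaled
```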
In [ ]:
Q3: Visualizing what MinMaxScaler and StandardScaler do to the data.
In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
In [44]:
df = pd.read_csv('../2_scatterplot and boxplot using matplotlib/bodyfat.csv')
In [45]:
df = df[['Age', 'BodyFat']]
In [34]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data = scaler.fit_transform(df)
In [36]:
df_std = pd.DataFrame(data.round(2), columns = ['age_std', 'fat_std'])
In [37]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_minmax = scaler.fit_transform(df)
df_minmax = pd.DataFrame(df_minmax.round(2),
columns = ['age_minmax', 'fat_minmax'])
In [38]:
df = pd.concat([df, df_std, df_minmax], axis = 1)
In [39]:
df
Out[39]:
 | Age | BodyFat | age_std | fat_std | age_minmax | fat_minmax |
---|---|---|---|---|---|---|
0 | 23 | 12.3 | -1.74 | -0.82 | 0.02 | 0.26 |
1 | 22 | 6.1 | -1.82 | -1.56 | 0.00 | 0.13 |
2 | 22 | 25.3 | -1.82 | 0.74 | 0.00 | 0.53 |
3 | 26 | 10.4 | -1.50 | -1.05 | 0.07 | 0.22 |
4 | 24 | 28.7 | -1.66 | 1.14 | 0.03 | 0.60 |
... | ... | ... | ... | ... | ... | ... |
247 | 70 | 11.0 | 2.00 | -0.98 | 0.81 | 0.23 |
248 | 72 | 33.6 | 2.16 | 1.73 | 0.85 | 0.71 |
249 | 72 | 29.3 | 2.16 | 1.22 | 0.85 | 0.62 |
250 | 72 | 26.0 | 2.16 | 0.82 | 0.85 | 0.55 |
251 | 74 | 31.9 | 2.31 | 1.53 | 0.88 | 0.67 |
252 rows × 6 columns
In [43]:
plt.hist(df['Age'])
plt.show()
In [41]:
plt.hist(df['age_std'])
plt.show()
In [42]:
plt.hist(df['age_minmax'])
plt.show()
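Takeaway: both StandardScaler and MinMaxScaler are linear transformations, so all three Age histograms have the same shape; only the numbers on the x-axis change (raw ages, z-scores centered around 0, and values between 0 and 1, respectively).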