Tuesday, October 25, 2022

Way 3: How isin() works for Plain Pandas and how we have to use to_numpy() for it in PySpark's Pandas API (Ways in which Pandas API on PySpark differs from Plain Pandas)

Download Code

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Toy dataset: Greek letter names followed by transliterated Hindi consonants.
df = pd.DataFrame({
    'alphabets': [
        'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho', 'sigma', 'tau',
        'upsilon', 'phi', 'chi', 'psi', 'omega', # Greek Alphabets
        'ka', 'kh', 'ga', 'gh', 'ng', 'ch', 'chh', 'ja', 'jh', 'ny', 'ta', 'th', 'da', 'dh', 'na', 'ta', 'th', 'da', 'dh', 'na', 'pa', 'ph', 'ba', 'bh', 'ma',
        'ya', 'ra', 'la', 'va', 'sh', 'sh', 'sa', 'ha', 'ksh', 'tr', 'gy', 'shr' # Hindi Consonants
    ]
})

# First character of every name; the train/test comparison below is done on this column.
df['first_letter'] = df['alphabets'].str[0] # Won't work for Pandas API on PySpark

# Shuffle the row positions, then cut the shuffled positions into two parts.
# No seed is set, so the split differs from run to run.
ixs = np.random.permutation(df.shape[0])
split_pct = 0.5

# Compute the boundary once so both slices are guaranteed to share it.
split_at = round(len(ixs) * split_pct)

train_ixs = ixs[:split_at]
test_ixs = ixs[split_at:]

df_train = df.iloc[train_ixs]
df_test = df.iloc[test_ixs]

df_train.head()

df_test.head()

# Rows of the test split whose first letter never occurs in the train split.
# Plain Pandas accepts a Series as the argument of isin().
# `~` is the idiomatic boolean NOT for a mask (unary `-` on a boolean Series
# yields the same mask in Pandas, but NumPy rejects it for boolean arrays).
not_in_train_but_in_test = df_test[~df_test.first_letter.isin(df_train.first_letter)]
import pyspark
print(pyspark.__version__)
# Output: 3.3.0

from pyspark import pandas as ppd

# Same toy dataset as the plain Pandas DataFrame, built with the Pandas API on PySpark.
df_ppd = ppd.DataFrame({
    'alphabets': [
        'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho', 'sigma', 'tau',
        'upsilon', 'phi', 'chi', 'psi', 'omega', # Greek
        'ka', 'kh', 'ga', 'gh', 'ng', 'ch', 'chh', 'ja', 'jh', 'ny', 'ta', 'th', 'da', 'dh', 'na', 'ta', 'th', 'da', 'dh', 'na', 'pa', 'ph', 'ba', 'bh', 'ma',
        'ya', 'ra', 'la', 'va', 'sh', 'sh', 'sa', 'ha', 'ksh', 'tr', 'gy', 'shr' # Hindi
    ]
})

# First character of every name, taken with apply() instead of the .str[0]
# indexing used for the plain Pandas DataFrame.
df_ppd['first_letter'] = df_ppd['alphabets'].apply(lambda x: x[0])

# Reuse the shuffled positions from the plain Pandas split so both
# DataFrames are divided into the same train and test rows.
df_ppd_train = df_ppd.iloc[train_ixs]
df_ppd_test = df_ppd.iloc[test_ixs]

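Errors: We cannot filter a DataFrame built with PySpark's Pandas API using the same code we used for the plain Pandas DataFrame. As the tracebacks below show, isin() calls list() on the Series passed to it, and a PySpark Pandas Series does not implement __iter__().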

1. not_in_train_but_in_test = df_ppd_test[-(df_ppd_test.first_letter.isin(df_ppd_train.first_letter))]

---------------------------------------------------------------------------
PandasNotImplementedError                 Traceback (most recent call last)
Cell In [62], line 1
----> 1 not_in_train_but_in_test = df_ppd_test[-(df_ppd_test.first_letter.isin(df_ppd_train.first_letter))]

File ~/anaconda3/envs/mh/lib/python3.9/site-packages/pyspark/pandas/base.py:880, in IndexOpsMixin.isin(self, values)
    873 if not is_list_like(values):
    874     raise TypeError(
    875         "only list-like objects are allowed to be passed"
    876         " to isin(), you passed a [{values_type}]".format(values_type=type(values).__name__)
    877     )
    879 values = (
--> 880     cast(np.ndarray, values).tolist() if isinstance(values, np.ndarray) else list(values)
    881 )
    883 other = [SF.lit(v) for v in values]
    884 scol = self.spark.column.isin(other)

File ~/anaconda3/envs/mh/lib/python3.9/site-packages/pyspark/pandas/series.py:6485, in Series.__iter__(self)
   6484 def __iter__(self) -> None:
-> 6485     return MissingPandasLikeSeries.__iter__(self)

File ~/anaconda3/envs/mh/lib/python3.9/site-packages/pyspark/pandas/missing/__init__.py:23, in unsupported_function..unsupported_function(*args, **kwargs)
     22 def unsupported_function(*args, **kwargs):
---> 23     raise PandasNotImplementedError(
     24         class_name=class_name, method_name=method_name, reason=reason
     25     )

PandasNotImplementedError: The method `pd.Series.__iter__()` is not implemented. If you want to collect your data as an NumPy array, use 'to_numpy()' instead.

2. df_ppd_test.first_letter.isin(df_ppd_train.first_letter)

---------------------------------------------------------------------------
PandasNotImplementedError                 Traceback (most recent call last)
Cell In [63], line 1
----> 1 df_ppd_test.first_letter.isin(df_ppd_train.first_letter)

File ~/anaconda3/envs/mh/lib/python3.9/site-packages/pyspark/pandas/base.py:880, in IndexOpsMixin.isin(self, values)
    873 if not is_list_like(values):
    874     raise TypeError(
    875         "only list-like objects are allowed to be passed"
    876         " to isin(), you passed a [{values_type}]".format(values_type=type(values).__name__)
    877     )
    879 values = (
--> 880     cast(np.ndarray, values).tolist() if isinstance(values, np.ndarray) else list(values)
    881 )
    883 other = [SF.lit(v) for v in values]
    884 scol = self.spark.column.isin(other)

File ~/anaconda3/envs/mh/lib/python3.9/site-packages/pyspark/pandas/series.py:6485, in Series.__iter__(self)
   6484 def __iter__(self) -> None:
-> 6485     return MissingPandasLikeSeries.__iter__(self)

File ~/anaconda3/envs/mh/lib/python3.9/site-packages/pyspark/pandas/missing/__init__.py:23, in unsupported_function..unsupported_function(*args, **kwargs)
     22 def unsupported_function(*args, **kwargs):
---> 23     raise PandasNotImplementedError(
     24         class_name=class_name, method_name=method_name, reason=reason
     25     )

PandasNotImplementedError: The method `pd.Series.__iter__()` is not implemented. If you want to collect your data as an NumPy array, use 'to_numpy()' instead.

Use of: DataFrame.to_numpy() → numpy.ndarray

Returns: A NumPy ndarray representing the values in this DataFrame or Series. Note: This method should only be used if the resulting NumPy ndarray is expected to be small, as all the data is loaded into the driver’s memory.
# to_numpy() collects the train column into a NumPy array on the driver;
# isin() accepts an ndarray, unlike a PySpark Pandas Series.
df_ppd_test.first_letter.isin(df_ppd_train.first_letter.to_numpy())
# Output:
# 0     False
# 1     False
# 2      True
# 3      True
# 4     False
# 6     False
# 7      True
# 9     False
# 10     True
# 12     True
# 13    False
# 23     True
# 24    False
# 25    False
# 28     True
# 30     True
# 31     True
# 33     True
# 34     True
# 39     True
# 41     True
# 43     True
# 44     True
# 45     True
# 46    False
# 47    False
# 49    False
# 53     True
# 56    False
# 57    False
# 58     True
# Name: first_letter, dtype: bool

# Rows of the test split whose first letter never occurs in the train split.
# `~` is the idiomatic boolean NOT for a mask.
not_in_train_but_in_test = df_ppd_test[
    ~df_ppd_test.first_letter.isin(df_ppd_train.first_letter.to_numpy())
]
Tags: Technology,Spark,

No comments:

Post a Comment