Way 3: How isin() works for Plain Pandas and how we have to use to_numpy() for it in PySpark's Pandas API (Ways in which Pandas API on PySpark differs from Plain Pandas)
Tuesday, October 25, 2022

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
df = pd.DataFrame({
'alphabets': [
'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho', 'sigma', 'tau',
'upsilon', 'phi', 'chi', 'psi', 'omega', # Greek Alphabets
'ka', 'kh', 'ga', 'gh', 'ng', 'ch', 'chh', 'ja', 'jh', 'ny', 'ta', 'th', 'da', 'dh', 'na', 'ta', 'th', 'da', 'dh', 'na', 'pa', 'ph', 'ba', 'bh', 'ma',
'ya', 'ra', 'la', 'va', 'sh', 'sh', 'sa', 'ha', 'ksh', 'tr', 'gy', 'shr' # Hindi Consonants
]
})
df['first_letter'] = df['alphabets'].str[0]  # .str[0] won't work with the Pandas API on PySpark; see the apply() workaround further below
# Shuffle the row positions and split them 50/50 into train and test indices
ixs = np.random.permutation(df.shape[0])
split_pct = 0.5
train_ixs = ixs[:round(len(ixs) * split_pct)]
test_ixs = ixs[round(len(ixs) * split_pct):]
df_train = df.iloc[train_ixs]
df_test = df.iloc[test_ixs]
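For reference, the train_test_split imported above could produce an equivalent 50/50 split in one call; this is only a sketch (the random_state and the *_alt names are illustrative), and the manual permutation split above is what the rest of this post uses.
df_train_alt, df_test_alt = train_test_split(df, test_size=0.5, random_state=42)  # equivalent scikit-learn split (illustrative only)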
df_train.head()
df_test.head()
# Rows of the test split whose first_letter never appears in the train split.
# Use ~ to invert the boolean mask (the idiomatic pandas negation for masks).
not_in_train_but_in_test = df_test[~df_test.first_letter.isin(df_train.first_letter)]
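As a quick sanity check (a small sketch, not part of the original post), every row that survives the filter should have a first_letter that is absent from the train split:
assert not not_in_train_but_in_test['first_letter'].isin(df_train['first_letter']).any()  # holds by construction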
import pyspark
print(pyspark.__version__)
3.3.0
from pyspark import pandas as ppd
df_ppd = ppd.DataFrame({
'alphabets': [
'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho', 'sigma', 'tau',
'upsilon', 'phi', 'chi', 'psi', 'omega', # Greek
'ka', 'kh', 'ga', 'gh', 'ng', 'ch', 'chh', 'ja', 'jh', 'ny', 'ta', 'th', 'da', 'dh', 'na', 'ta', 'th', 'da', 'dh', 'na', 'pa', 'ph', 'ba', 'bh', 'ma',
'ya', 'ra', 'la', 'va', 'sh', 'sh', 'sa', 'ha', 'ksh', 'tr', 'gy', 'shr' # Hindi
]
})
df_ppd['first_letter'] = df_ppd['alphabets'].apply(lambda x: x[0])  # apply() stands in for the .str[0] used with plain pandas above
df_ppd_train = df_ppd.iloc[train_ixs]
df_ppd_test = df_ppd.iloc[test_ixs]
Errors: We cannot filter the Pandas-API-on-Spark DataFrame with the same isin()-based code that worked for the plain pandas DataFrame. Two failing attempts:
1.
not_in_train_but_in_test = df_ppd_test[-(df_ppd_test.first_letter.isin(df_ppd_train.first_letter))]
---------------------------------------------------------------------------
PandasNotImplementedError Traceback (most recent call last)
Cell In [62], line 1
----> 1 not_in_train_but_in_test = df_ppd_test[-(df_ppd_test.first_letter.isin(df_ppd_train.first_letter))]
File ~/anaconda3/envs/mh/lib/python3.9/site-packages/pyspark/pandas/base.py:880, in IndexOpsMixin.isin(self, values)
873 if not is_list_like(values):
874 raise TypeError(
875 "only list-like objects are allowed to be passed"
876 " to isin(), you passed a [{values_type}]".format(values_type=type(values).__name__)
877 )
879 values = (
--> 880 cast(np.ndarray, values).tolist() if isinstance(values, np.ndarray) else list(values)
881 )
883 other = [SF.lit(v) for v in values]
884 scol = self.spark.column.isin(other)
File ~/anaconda3/envs/mh/lib/python3.9/site-packages/pyspark/pandas/series.py:6485, in Series.__iter__(self)
6484 def __iter__(self) -> None:
-> 6485 return MissingPandasLikeSeries.__iter__(self)
File ~/anaconda3/envs/mh/lib/python3.9/site-packages/pyspark/pandas/missing/__init__.py:23, in unsupported_function.<locals>.unsupported_function(*args, **kwargs)
22 def unsupported_function(*args, **kwargs):
---> 23 raise PandasNotImplementedError(
24 class_name=class_name, method_name=method_name, reason=reason
25 )
PandasNotImplementedError: The method `pd.Series.__iter__()` is not implemented. If you want to collect your data as an NumPy array, use 'to_numpy()' instead.
2.
df_ppd_test.first_letter.isin(df_ppd_train.first_letter)
---------------------------------------------------------------------------
PandasNotImplementedError Traceback (most recent call last)
Cell In [63], line 1
----> 1 df_ppd_test.first_letter.isin(df_ppd_train.first_letter)
(same traceback as in error 1 above)
PandasNotImplementedError: The method `pd.Series.__iter__()` is not implemented. If you want to collect your data as an NumPy array, use 'to_numpy()' instead.
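Both failures have the same cause: isin() tries to materialize its argument as a plain Python list, and iterating a pandas-on-Spark Series (Series.__iter__) is deliberately not implemented because it would pull every row to the driver one element at a time. Passing something that already lives on the driver works fine; a small illustration (the letter list is just an example):
df_ppd_test.first_letter.isin(['a', 'k', 's'])  # OK: a plain Python list is list-like on the driver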
The fix: to_numpy() → numpy.ndarray (available on both pandas-on-Spark DataFrame and Series)
Returns: a NumPy ndarray representing the values in this DataFrame or Series.
Note: this method should only be used if the resulting NumPy ndarray is expected to be small, as all the data is loaded into the driver's memory.
df_ppd_test.first_letter.isin(df_ppd_train.first_letter.to_numpy())
0 False
1 False
2 True
3 True
4 False
6 False
7 True
9 False
10 True
12 True
13 False
23 True
24 False
25 False
28 True
30 True
31 True
33 True
34 True
39 True
41 True
43 True
44 True
45 True
46 False
47 False
49 False
53 True
56 False
57 False
58 True
Name: first_letter, dtype: bool
not_in_train_but_in_test = df_ppd_test[~df_ppd_test.first_letter.isin(df_ppd_train.first_letter.to_numpy())]
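Because to_numpy() collects the entire train column to the driver (see the memory note above), a distributed alternative is to drop to the underlying Spark DataFrames and use a left anti join. This is only a sketch, assuming PySpark 3.2+ for pandas_api(); the sdf_* and *_alt names are illustrative, and to_spark() drops the pandas-on-Spark index, so the result gets a fresh default index.
# Driver-memory-friendly alternative: stay distributed with a left anti join
sdf_test = df_ppd_test.to_spark()
sdf_train_letters = df_ppd_train[['first_letter']].to_spark().distinct()
not_in_train_but_in_test_alt = (
    sdf_test.join(sdf_train_letters, on='first_letter', how='left_anti')
            .pandas_api()
)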