# Notebook walkthrough: how blank and 'NA' cells in a CSV are treated when the
# file is read with plain pandas versus the pandas API on Spark
# (pyspark.pandas).
#
# Lines starting with "# Output:" reproduce the cell outputs recorded in the
# original notebook; they are not executed.

import pandas as pd
import pyspark
import seaborn as sns
from pyspark import SparkContext
from pyspark import pandas as ppd
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SQLContext

# Main entry point for DataFrame and SQL functionality.
# NOTE(review): SQLContext is deprecated since Spark 3.0 in favour of
# SparkSession; it is kept here so the names `sc` and `sqlCtx` stay available.
sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

print(pyspark.__version__)
# Output: 3.3.0

# Look at the raw file first so the blank and 'NA' cells are visible.
with open('./input/student.csv', mode='r', encoding='utf8') as f:
    data = f.readlines()
data  # bare expression: displayed by the notebook, a no-op in a plain script
# Output:
# ['sno,FirstName,LASTNAME\n',
#  'one,Ram,\n',
#  'two,,Sharma\n',
#  'three,Shyam,NA\n',
#  'four,Kabir,\n',
#  'five,NA,Singh\n']

# --- Plain pandas -----------------------------------------------------------
# The cell that created `df_student` was missing from the extracted text; it
# is restored here from the surrounding description (a pandas DataFrame read
# from the same CSV).
df_student = pd.read_csv('./input/student.csv')
df_student.head()

# When you load a Pandas DataFrame by reading from a CSV, blank values and
# 'NA' values are converted to 'NaN' values by default as shown above.

print(type(df_student))
# Output: <class 'pandas.core.frame.DataFrame'>

df_student.fillna('Not Applicable', inplace=True)  # Handles blank and 'NA' values both.
df_student

# --- Pandas API on Spark ----------------------------------------------------
df_student_pyspark = ppd.read_csv('./input/student.csv')
type(df_student_pyspark)
# Output: pyspark.pandas.frame.DataFrame

df_student_pyspark

# NOTE(review): per the original comment only blank cells become None here;
# literal 'NA' strings presumably stay untouched - confirm against the output.
df_student_pyspark.fillna('Not Applicable', inplace=True)  # Handles blank (None) values.
df_student_pyspark
Tuesday, October 25, 2022
Way 1: Reading null and 'NA' values (Ways in which the Pandas API on PySpark differs from plain Pandas)
Labels:
Spark,
Technology
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment