In [1]:
# Entry point for the RDD API. getOrCreate() is used instead of the constructor,
# so re-running this cell does not try to build a second context.
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
23/02/08 08:21:23 WARN Utils: Your hostname, ashish-Lenovo-ideapad-130-15IKB resolves to a loopback address: 127.0.1.1; using 192.168.1.108 instead (on interface wlp2s0) 23/02/08 08:21:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/08 08:21:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 23/02/08 08:21:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
1: Reading the file as an RDD
In [2]:
# Read the CSV as plain text: each RDD element is one unparsed line of the file.
rdd = sc.textFile("../in/TelecomData.csv")
Using collect()
In [5]:
# collect() returns every element of the RDD as one Python list.
# Acceptable for this 500-line file; on large data prefer take(n).
rdd.collect()
Out[5]:
['TXCUST00001,982120000,Male,N,PrePaid,Active,Active,InActive,20,N', 'TXCUST00002,982120001,Male,N,PostPaid,Active,Active,InActive,25,N', 'TXCUST00003,982120002,Male,N,PrePaid,Active,Active,InActive,20,Y', 'TXCUST00004,982120003,Male,Y,PrePaid,Active,Active,InActive,25,N', 'TXCUST00005,982120004,Male,N,PrePaid,Active,Active,InActive,15,N', ... 'TXCUST00497,982120496,Male,N,PostPaid,Active,Active,Active,15,N', 'TXCUST00498,982120497,Male,N,PostPaid,Active,Active,Active,20,N', 'TXCUST00499,982120498,Male,N,PostPaid,Active,Active,Active,25,N', 'TXCUST00500,982120499,Male,N,PostPaid,Active,Active,Active,20,N']
Using take()
In [6]:
# take(10) returns only the first 10 elements as a Python list.
rdd.take(10)
Out[6]:
['TXCUST00001,982120000,Male,N,PrePaid,Active,Active,InActive,20,N', 'TXCUST00002,982120001,Male,N,PostPaid,Active,Active,InActive,25,N', 'TXCUST00003,982120002,Male,N,PrePaid,Active,Active,InActive,20,Y', 'TXCUST00004,982120003,Male,Y,PrePaid,Active,Active,InActive,25,N', 'TXCUST00005,982120004,Male,N,PrePaid,Active,Active,InActive,15,N', 'TXCUST00006,982120005,Male,N,PrePaid,Active,Active,InActive,15,N', 'TXCUST00007,982120006,Male,N,PrePaid,Active,Active,InActive,12,N', 'TXCUST00008,982120007,Male,N,PrePaid,Active,Active,InActive,10,N', 'TXCUST00009,982120008,Male,Y,PrePaid,Active,Active,InActive,15,Y', 'TXCUST00010,982120009,Female,N,PrePaid,Active,Active,InActive,15,N']
2: Reading the file as a Spark SQL DataFrame
In [9]:
# Entry point for the DataFrame / SQL API. getOrCreate() avoids building a
# second session when the cell is re-run.
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
In [10]:
# Parse the CSV into a Spark DataFrame. The file has no header row, so the
# first line is kept as data and the columns receive the default names _c0.._c9.
df = spark.read.csv('../in/TelecomData.csv', header=False)
In [13]:
# collect() returns every row of the DataFrame as a list of Row objects.
# Acceptable for this 500-row file; on large data prefer take(n) or show().
df.collect()
Out[13]:
[Row(_c0='TXCUST00001', _c1='982120000', _c2='Male', _c3='N', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='20', _c9='N'), Row(_c0='TXCUST00002', _c1='982120001', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='InActive', _c8='25', _c9='N'), Row(_c0='TXCUST00003', _c1='982120002', _c2='Male', _c3='N', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='20', _c9='Y'), Row(_c0='TXCUST00004', _c1='982120003', _c2='Male', _c3='Y', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='25', _c9='N'), Row(_c0='TXCUST00005', _c1='982120004', _c2='Male', _c3='N', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='15', _c9='N'), ... Row(_c0='TXCUST00497', _c1='982120496', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='Active', _c8='15', _c9='N'), Row(_c0='TXCUST00498', _c1='982120497', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='Active', _c8='20', _c9='N'), Row(_c0='TXCUST00499', _c1='982120498', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='Active', _c8='25', _c9='N'), Row(_c0='TXCUST00500', _c1='982120499', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='Active', _c8='20', _c9='N')]
In [15]:
# take(5) returns the first 5 rows as a list of Row objects.
df.take(5)
Out[15]:
[Row(_c0='TXCUST00001', _c1='982120000', _c2='Male', _c3='N', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='20', _c9='N'), Row(_c0='TXCUST00002', _c1='982120001', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='InActive', _c8='25', _c9='N'), Row(_c0='TXCUST00003', _c1='982120002', _c2='Male', _c3='N', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='20', _c9='Y'), Row(_c0='TXCUST00004', _c1='982120003', _c2='Male', _c3='Y', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='25', _c9='N'), Row(_c0='TXCUST00005', _c1='982120004', _c2='Male', _c3='N', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='15', _c9='N')]
In [16]:
# show() prints a formatted text table instead of returning rows;
# with no argument it displays the top 20 rows.
df.show()
+-----------+---------+------+---+--------+------+------+--------+---+---+ | _c0| _c1| _c2|_c3| _c4| _c5| _c6| _c7|_c8|_c9| +-----------+---------+------+---+--------+------+------+--------+---+---+ |TXCUST00001|982120000| Male| N| PrePaid|Active|Active|InActive| 20| N| |TXCUST00002|982120001| Male| N|PostPaid|Active|Active|InActive| 25| N| |TXCUST00003|982120002| Male| N| PrePaid|Active|Active|InActive| 20| Y| |TXCUST00004|982120003| Male| Y| PrePaid|Active|Active|InActive| 25| N| |TXCUST00005|982120004| Male| N| PrePaid|Active|Active|InActive| 15| N| |TXCUST00006|982120005| Male| N| PrePaid|Active|Active|InActive| 15| N| |TXCUST00007|982120006| Male| N| PrePaid|Active|Active|InActive| 12| N| |TXCUST00008|982120007| Male| N| PrePaid|Active|Active|InActive| 10| N| |TXCUST00009|982120008| Male| Y| PrePaid|Active|Active|InActive| 15| Y| |TXCUST00010|982120009|Female| N| PrePaid|Active|Active|InActive| 15| N| |TXCUST00011|982120010|Female| N| PrePaid|Active|Active|InActive| 15| N| |TXCUST00012|982120011|Female| N|PostPaid|Active|Active|InActive| 20| N| |TXCUST00013|982120012|Female| N|PostPaid|Active|Active|InActive| 25| Y| |TXCUST00014|982120013|Female| N|PostPaid|Active|Active|InActive| 25| N| |TXCUST00015|982120014|Female| Y|PostPaid|Active|Active|InActive| 25| N| |TXCUST00016|982120015|Female| N|PostPaid|Active|Active|InActive| 30| N| |TXCUST00017|982120016|Female| N|PostPaid|Active|Active|InActive| 15| N| |TXCUST00018|982120017|Female| N|PostPaid|Active|Active|InActive| 15| N| |TXCUST00019|982120018|Female| N|PostPaid|Active|Active|InActive| 15| N| |TXCUST00020|982120019|Female| N|PostPaid|Active|Active|InActive| 20| N| +-----------+---------+------+---+--------+------+------+--------+---+---+ only showing top 20 rows
In [18]:
# head(5) returns the first 5 rows as a list of Row objects (same result as take(5) above).
df.head(5)
Out[18]:
[Row(_c0='TXCUST00001', _c1='982120000', _c2='Male', _c3='N', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='20', _c9='N'), Row(_c0='TXCUST00002', _c1='982120001', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='InActive', _c8='25', _c9='N'), Row(_c0='TXCUST00003', _c1='982120002', _c2='Male', _c3='N', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='20', _c9='Y'), Row(_c0='TXCUST00004', _c1='982120003', _c2='Male', _c3='Y', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='25', _c9='N'), Row(_c0='TXCUST00005', _c1='982120004', _c2='Male', _c3='N', _c4='PrePaid', _c5='Active', _c6='Active', _c7='InActive', _c8='15', _c9='N')]
In [19]:
# tail(5) returns the last 5 rows as a list of Row objects.
df.tail(5)
Out[19]:
[Row(_c0='TXCUST00496', _c1='982120495', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='Active', _c8='15', _c9='N'), Row(_c0='TXCUST00497', _c1='982120496', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='Active', _c8='15', _c9='N'), Row(_c0='TXCUST00498', _c1='982120497', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='Active', _c8='20', _c9='N'), Row(_c0='TXCUST00499', _c1='982120498', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='Active', _c8='25', _c9='N'), Row(_c0='TXCUST00500', _c1='982120499', _c2='Male', _c3='N', _c4='PostPaid', _c5='Active', _c6='Active', _c7='Active', _c8='20', _c9='N')]
3: Reading the file with pyspark.pandas (pandas API on Spark)
In [20]:
from pyspark import pandas as ppd
# Column labels for the CSV, listed in file order; the file itself carries no header row.
telecom_columns = [
    'CustomerID',
    'MobileNumber',
    'Gender',
    'SeniorCitizen',
    'Mode',
    'Calls',
    'SMS',
    'InternetServiceStatus',
    'MonthlyCharges',
    'CustomerChurn',
]
# Load the CSV as a pandas-on-Spark DataFrame with the explicit column names.
df_pandas = ppd.read_csv('../in/TelecomData.csv', names=telecom_columns)
WARNING:root:'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched. /home/ashish/anaconda3/envs/pyspark/lib/python3.9/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: If `index_col` is not specified for `read_csv`, the default index is attached which can cause additional overhead. warnings.warn(message, PandasAPIOnSparkAdviceWarning)
In [28]:
# A pandas-on-Spark DataFrame does not expose the Spark DataFrame actions used earlier:
#   df_pandas.collect() -> AttributeError: 'DataFrame' object has no attribute 'collect'
#   df_pandas.show()    -> AttributeError: 'DataFrame' object has no attribute 'show'
# take() here expects a list of row positions rather than a row count:
#   df_pandas.take(2)   -> TypeError: `indices` must be a list-like except dict or set
# Select the rows at positions 0, 1 and 2.
df_pandas.take([0, 1, 2])
Out[28]:
| | CustomerID | MobileNumber | Gender | SeniorCitizen | Mode | Calls | SMS | InternetServiceStatus | MonthlyCharges | CustomerChurn |
---|---|---|---|---|---|---|---|---|---|---|
0 | TXCUST00001 | 982120000 | Male | N | PrePaid | Active | Active | InActive | 20 | N |
1 | TXCUST00002 | 982120001 | Male | N | PostPaid | Active | Active | InActive | 25 | N |
2 | TXCUST00003 | 982120002 | Male | N | PrePaid | Active | Active | InActive | 20 | Y |
In [29]:
# head() with no argument displays the first 5 rows.
df_pandas.head()
Out[29]:
| | CustomerID | MobileNumber | Gender | SeniorCitizen | Mode | Calls | SMS | InternetServiceStatus | MonthlyCharges | CustomerChurn |
---|---|---|---|---|---|---|---|---|---|---|
0 | TXCUST00001 | 982120000 | Male | N | PrePaid | Active | Active | InActive | 20 | N |
1 | TXCUST00002 | 982120001 | Male | N | PostPaid | Active | Active | InActive | 25 | N |
2 | TXCUST00003 | 982120002 | Male | N | PrePaid | Active | Active | InActive | 20 | Y |
3 | TXCUST00004 | 982120003 | Male | Y | PrePaid | Active | Active | InActive | 25 | N |
4 | TXCUST00005 | 982120004 | Male | N | PrePaid | Active | Active | InActive | 15 | N |
In [30]:
# tail() with no argument displays the last 5 rows.
df_pandas.tail()
Out[30]:
| | CustomerID | MobileNumber | Gender | SeniorCitizen | Mode | Calls | SMS | InternetServiceStatus | MonthlyCharges | CustomerChurn |
---|---|---|---|---|---|---|---|---|---|---|
495 | TXCUST00496 | 982120495 | Male | N | PostPaid | Active | Active | Active | 15 | N |
496 | TXCUST00497 | 982120496 | Male | N | PostPaid | Active | Active | Active | 15 | N |
497 | TXCUST00498 | 982120497 | Male | N | PostPaid | Active | Active | Active | 20 | N |
498 | TXCUST00499 | 982120498 | Male | N | PostPaid | Active | Active | Active | 25 | N |
499 | TXCUST00500 | 982120499 | Male | N | PostPaid | Active | Active | Active | 20 | N |
In [ ]: