Problem Statement: Consider the Universal Identity Number data scenario with two datasets UIN Customer data and Bank account linking data. UIN Card data (UINCardData.csv): Schema Details: UIN, MobileNumber,Gender,SeniorCitizens,Income Bank account link data (BankAccountLink.csv): Schema Details: MobileNumber, LinkedtoBankAccount, BankAccountNumber Requirement Join both datasets and find the UIN number that is not linked with the Bank Account number. Print UIN number and BankAccountNumber. Save the final output to a specified HDFS directory.
In [39]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
In [73]:
uin = sc.textFile("./UINCardData.csv")
In [74]:
uin.take(5)
Out[74]:
['UIN00001,982120000,Male,N,65000', 'UIN00002,982120001,Male,N,35000', 'UIN00003,982120002,Male,N,34000', 'UIN00004,982120003,Male,Y,44000', 'UIN00005,982120004,Male,N,54000']
In [75]:
from collections import namedtuple
UINCARD = namedtuple('UINCARD', ['UIN', 'MobileNumber'])
In [76]:
uin = uin.map(lambda line:line.split(",")).map(lambda r : (r[1], UINCARD(r[0], r[1])))
In [77]:
uin.take(5)
Out[77]:
[('982120000', UINCARD(UIN='UIN00001', MobileNumber='982120000')), ('982120001', UINCARD(UIN='UIN00002', MobileNumber='982120001')), ('982120002', UINCARD(UIN='UIN00003', MobileNumber='982120002')), ('982120003', UINCARD(UIN='UIN00004', MobileNumber='982120003')), ('982120004', UINCARD(UIN='UIN00005', MobileNumber='982120004'))]
In [90]:
bankacc = sc.textFile("BankAccountLink.csv")
In [91]:
print(bankacc.count())
bankacc.take(5)
500
Out[91]:
['982120000,Y,20004562111', '982120001,Y,20004562112', '982120002,Y,20004562113', '982120003,Y,20004562114', '982120004,Y,20004562115']
In [92]:
bankacc = bankacc.filter(lambda r: r.split(",")[1] == 'N')
In [94]:
print(bankacc.count())
bankacc.take(5)
53
Out[94]:
['982120006,N,20004552111', '982120018,N,20004552112', '982120019,N,20004552113', '982120020,N,20004552114', '982120021,N,20004552115']
In [95]:
BANKACC = namedtuple('BANKACC', ['MobileNumber', 'LinkedtoBankAccount', 'BankAccountNumber'])
In [96]:
bankacc = bankacc.map(lambda line:line.split(",")).map(lambda r : (r[0], BANKACC(r[0], r[1], r[2])))
In [97]:
bankacc.take(5)
Out[97]:
[('982120006', BANKACC(MobileNumber='982120006', LinkedtoBankAccount='N', BankAccountNumber='20004552111')), ('982120018', BANKACC(MobileNumber='982120018', LinkedtoBankAccount='N', BankAccountNumber='20004552112')), ('982120019', BANKACC(MobileNumber='982120019', LinkedtoBankAccount='N', BankAccountNumber='20004552113')), ('982120020', BANKACC(MobileNumber='982120020', LinkedtoBankAccount='N', BankAccountNumber='20004552114')), ('982120021', BANKACC(MobileNumber='982120021', LinkedtoBankAccount='N', BankAccountNumber='20004552115'))]
In [98]:
result = uin.join(bankacc)
In [99]:
result.collect()
Out[99]:
[('982120006', (UINCARD(UIN='UIN00007', MobileNumber='982120006'), BANKACC(MobileNumber='982120006', LinkedtoBankAccount='N', BankAccountNumber='20004552111'))), ('982120020', (UINCARD(UIN='UIN00021', MobileNumber='982120020'), BANKACC(MobileNumber='982120020', LinkedtoBankAccount='N', BankAccountNumber='20004552114'))), ('982120048', (UINCARD(UIN='UIN00049', MobileNumber='982120048'), BANKACC(MobileNumber='982120048', LinkedtoBankAccount='N', BankAccountNumber='20004552124'))), ('982120049', (UINCARD(UIN='UIN00050', MobileNumber='982120049'), BANKACC(MobileNumber='982120049', LinkedtoBankAccount='N', BankAccountNumber='20004552125'))), ('982120297', (UINCARD(UIN='UIN00298', MobileNumber='982120297'), BANKACC(MobileNumber='982120297', LinkedtoBankAccount='N', BankAccountNumber='20004552912'))), ('982120301', (UINCARD(UIN='UIN00302', MobileNumber='982120301'), BANKACC(MobileNumber='982120301', LinkedtoBankAccount='N', BankAccountNumber='20004552916'))), ('982120313', (UINCARD(UIN='UIN00314', MobileNumber='982120313'), BANKACC(MobileNumber='982120313', LinkedtoBankAccount='N', BankAccountNumber='20004552928'))), ('982120324', (UINCARD(UIN='UIN00325', MobileNumber='982120324'), BANKACC(MobileNumber='982120324', LinkedtoBankAccount='N', BankAccountNumber='20004552939'))), ('982120018', (UINCARD(UIN='UIN00019', MobileNumber='982120018'), BANKACC(MobileNumber='982120018', LinkedtoBankAccount='N', BankAccountNumber='20004552112'))), ('982120019', (UINCARD(UIN='UIN00020', MobileNumber='982120019'), BANKACC(MobileNumber='982120019', LinkedtoBankAccount='N', BankAccountNumber='20004552113'))), ('982120021', (UINCARD(UIN='UIN00022', MobileNumber='982120021'), BANKACC(MobileNumber='982120021', LinkedtoBankAccount='N', BankAccountNumber='20004552115'))), ('982120022', (UINCARD(UIN='UIN00023', MobileNumber='982120022'), BANKACC(MobileNumber='982120022', LinkedtoBankAccount='N', BankAccountNumber='20004552116'))), ('982120030', (UINCARD(UIN='UIN00031', MobileNumber='982120030'), BANKACC(MobileNumber='982120030', LinkedtoBankAccount='N', BankAccountNumber='20004552118'))), ('982120033', (UINCARD(UIN='UIN00034', MobileNumber='982120033'), BANKACC(MobileNumber='982120033', LinkedtoBankAccount='N', BankAccountNumber='20004552121'))), ('982120046', (UINCARD(UIN='UIN00047', MobileNumber='982120046'), BANKACC(MobileNumber='982120046', LinkedtoBankAccount='N', BankAccountNumber='20004552122'))), ('982120047', (UINCARD(UIN='UIN00048', MobileNumber='982120047'), BANKACC(MobileNumber='982120047', LinkedtoBankAccount='N', BankAccountNumber='20004552123'))), ('982120052', (UINCARD(UIN='UIN00053', MobileNumber='982120052'), BANKACC(MobileNumber='982120052', LinkedtoBankAccount='N', BankAccountNumber='20004552128'))), ('982120300', (UINCARD(UIN='UIN00301', MobileNumber='982120300'), BANKACC(MobileNumber='982120300', LinkedtoBankAccount='N', BankAccountNumber='20004552915'))), ('982120309', (UINCARD(UIN='UIN00310', MobileNumber='982120309'), BANKACC(MobileNumber='982120309', LinkedtoBankAccount='N', BankAccountNumber='20004552924'))), ('982120310', (UINCARD(UIN='UIN00311', MobileNumber='982120310'), BANKACC(MobileNumber='982120310', LinkedtoBankAccount='N', BankAccountNumber='20004552925'))), ('982120312', (UINCARD(UIN='UIN00313', MobileNumber='982120312'), BANKACC(MobileNumber='982120312', LinkedtoBankAccount='N', BankAccountNumber='20004552927'))), ('982120315', (UINCARD(UIN='UIN00316', MobileNumber='982120315'), BANKACC(MobileNumber='982120315', LinkedtoBankAccount='N', BankAccountNumber='20004552930'))), ('982120316', (UINCARD(UIN='UIN00317', MobileNumber='982120316'), BANKACC(MobileNumber='982120316', LinkedtoBankAccount='N', BankAccountNumber='20004552931'))), ('982120317', (UINCARD(UIN='UIN00318', MobileNumber='982120317'), BANKACC(MobileNumber='982120317', LinkedtoBankAccount='N', BankAccountNumber='20004552932'))), ('982120318', (UINCARD(UIN='UIN00319', MobileNumber='982120318'), BANKACC(MobileNumber='982120318', LinkedtoBankAccount='N', BankAccountNumber='20004552933'))), ('982120320', (UINCARD(UIN='UIN00321', MobileNumber='982120320'), BANKACC(MobileNumber='982120320', LinkedtoBankAccount='N', BankAccountNumber='20004552935'))), ('982120325', (UINCARD(UIN='UIN00326', MobileNumber='982120325'), BANKACC(MobileNumber='982120325', LinkedtoBankAccount='N', BankAccountNumber='20004552940'))), ('982120031', (UINCARD(UIN='UIN00032', MobileNumber='982120031'), BANKACC(MobileNumber='982120031', LinkedtoBankAccount='N', BankAccountNumber='20004552119'))), ('982120051', (UINCARD(UIN='UIN00052', MobileNumber='982120051'), BANKACC(MobileNumber='982120051', LinkedtoBankAccount='N', BankAccountNumber='20004552127'))), ('982120053', (UINCARD(UIN='UIN00054', MobileNumber='982120053'), BANKACC(MobileNumber='982120053', LinkedtoBankAccount='N', BankAccountNumber='20004552129'))), ('982120055', (UINCARD(UIN='UIN00056', MobileNumber='982120055'), BANKACC(MobileNumber='982120055', LinkedtoBankAccount='N', BankAccountNumber='20004552131'))), ('982120298', (UINCARD(UIN='UIN00299', MobileNumber='982120298'), BANKACC(MobileNumber='982120298', LinkedtoBankAccount='N', BankAccountNumber='20004552913'))), ('982120299', (UINCARD(UIN='UIN00300', MobileNumber='982120299'), BANKACC(MobileNumber='982120299', LinkedtoBankAccount='N', BankAccountNumber='20004552914'))), ('982120304', (UINCARD(UIN='UIN00305', MobileNumber='982120304'), BANKACC(MobileNumber='982120304', LinkedtoBankAccount='N', BankAccountNumber='20004552919'))), ('982120306', (UINCARD(UIN='UIN00307', MobileNumber='982120306'), BANKACC(MobileNumber='982120306', LinkedtoBankAccount='N', BankAccountNumber='20004552921'))), ('982120307', (UINCARD(UIN='UIN00308', MobileNumber='982120307'), BANKACC(MobileNumber='982120307', LinkedtoBankAccount='N', BankAccountNumber='20004552922'))), ('982120314', (UINCARD(UIN='UIN00315', MobileNumber='982120314'), BANKACC(MobileNumber='982120314', LinkedtoBankAccount='N', BankAccountNumber='20004552929'))), ('982120319', (UINCARD(UIN='UIN00320', MobileNumber='982120319'), BANKACC(MobileNumber='982120319', LinkedtoBankAccount='N', BankAccountNumber='20004552934'))), ('982120029', (UINCARD(UIN='UIN00030', MobileNumber='982120029'), BANKACC(MobileNumber='982120029', LinkedtoBankAccount='N', BankAccountNumber='20004552117'))), ('982120032', (UINCARD(UIN='UIN00033', MobileNumber='982120032'), BANKACC(MobileNumber='982120032', LinkedtoBankAccount='N', BankAccountNumber='20004552110'))), ('982120050', (UINCARD(UIN='UIN00051', MobileNumber='982120050'), BANKACC(MobileNumber='982120050', LinkedtoBankAccount='N', BankAccountNumber='20004552126'))), ('982120054', (UINCARD(UIN='UIN00055', MobileNumber='982120054'), BANKACC(MobileNumber='982120054', LinkedtoBankAccount='N', BankAccountNumber='20004552130'))), ('982120056', (UINCARD(UIN='UIN00057', MobileNumber='982120056'), BANKACC(MobileNumber='982120056', LinkedtoBankAccount='N', BankAccountNumber='20004552132'))), ('982120296', (UINCARD(UIN='UIN00297', MobileNumber='982120296'), BANKACC(MobileNumber='982120296', LinkedtoBankAccount='N', BankAccountNumber='20004552911'))), ('982120302', (UINCARD(UIN='UIN00303', MobileNumber='982120302'), BANKACC(MobileNumber='982120302', LinkedtoBankAccount='N', BankAccountNumber='20004552917'))), ('982120303', (UINCARD(UIN='UIN00304', MobileNumber='982120303'), BANKACC(MobileNumber='982120303', LinkedtoBankAccount='N', BankAccountNumber='20004552918'))), ('982120305', (UINCARD(UIN='UIN00306', MobileNumber='982120305'), BANKACC(MobileNumber='982120305', LinkedtoBankAccount='N', BankAccountNumber='20004552920'))), ('982120308', (UINCARD(UIN='UIN00309', MobileNumber='982120308'), BANKACC(MobileNumber='982120308', LinkedtoBankAccount='N', BankAccountNumber='20004552923'))), ('982120311', (UINCARD(UIN='UIN00312', MobileNumber='982120311'), BANKACC(MobileNumber='982120311', LinkedtoBankAccount='N', BankAccountNumber='20004552926'))), ('982120321', (UINCARD(UIN='UIN00322', MobileNumber='982120321'), BANKACC(MobileNumber='982120321', LinkedtoBankAccount='N', BankAccountNumber='20004552936'))), ('982120322', (UINCARD(UIN='UIN00323', MobileNumber='982120322'), BANKACC(MobileNumber='982120322', LinkedtoBankAccount='N', BankAccountNumber='20004552937'))), ('982120323', (UINCARD(UIN='UIN00324', MobileNumber='982120323'), BANKACC(MobileNumber='982120323', LinkedtoBankAccount='N', BankAccountNumber='20004552938'))), ('982120326', (UINCARD(UIN='UIN00327', MobileNumber='982120326'), BANKACC(MobileNumber='982120326', LinkedtoBankAccount='N', BankAccountNumber='20004552941')))]
In [ ]: