Problem Statement: Consider the Universal Identity Number data scenario with two datasets UIN Customer data and Bank account linking data. UIN Card data (UINCardData.csv): Schema Details: UIN, MobileNumber,Gender,SeniorCitizens,Income Bank account link data (BankAccountLink.csv): Schema Details: MobileNumber, LinkedtoBankAccount, BankAccountNumber Requirement Join both datasets and find the UIN number that is not linked with the Bank Account number. Print UIN number and BankAccountNumber. Save the final output to a specified HDFS directory.
In [39]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
In [73]:
uin = sc.textFile("./UINCardData.csv")
In [74]:
uin.take(5)
Out[74]:
['UIN00001,982120000,Male,N,65000', 'UIN00002,982120001,Male,N,35000', 'UIN00003,982120002,Male,N,34000', 'UIN00004,982120003,Male,Y,44000', 'UIN00005,982120004,Male,N,54000']
In [75]:
from collections import namedtuple
UINCARD = namedtuple('UINCARD', ['UIN', 'MobileNumber'])
In [76]:
uin = uin.map(lambda line:line.split(",")).map(lambda r : (r[1], UINCARD(r[0], r[1])))
In [77]:
uin.take(5)
Out[77]:
[('982120000', UINCARD(UIN='UIN00001', MobileNumber='982120000')),
('982120001', UINCARD(UIN='UIN00002', MobileNumber='982120001')),
('982120002', UINCARD(UIN='UIN00003', MobileNumber='982120002')),
('982120003', UINCARD(UIN='UIN00004', MobileNumber='982120003')),
('982120004', UINCARD(UIN='UIN00005', MobileNumber='982120004'))]
In [90]:
bankacc = sc.textFile("BankAccountLink.csv")
In [91]:
print(bankacc.count())
bankacc.take(5)
500
Out[91]:
['982120000,Y,20004562111', '982120001,Y,20004562112', '982120002,Y,20004562113', '982120003,Y,20004562114', '982120004,Y,20004562115']
In [92]:
bankacc = bankacc.filter(lambda r: r.split(",")[1] == 'N')
In [94]:
print(bankacc.count())
bankacc.take(5)
53
Out[94]:
['982120006,N,20004552111', '982120018,N,20004552112', '982120019,N,20004552113', '982120020,N,20004552114', '982120021,N,20004552115']
In [95]:
BANKACC = namedtuple('BANKACC', ['MobileNumber', 'LinkedtoBankAccount', 'BankAccountNumber'])
In [96]:
bankacc = bankacc.map(lambda line:line.split(",")).map(lambda r : (r[0], BANKACC(r[0], r[1], r[2])))
In [97]:
bankacc.take(5)
Out[97]:
[('982120006',
BANKACC(MobileNumber='982120006', LinkedtoBankAccount='N', BankAccountNumber='20004552111')),
('982120018',
BANKACC(MobileNumber='982120018', LinkedtoBankAccount='N', BankAccountNumber='20004552112')),
('982120019',
BANKACC(MobileNumber='982120019', LinkedtoBankAccount='N', BankAccountNumber='20004552113')),
('982120020',
BANKACC(MobileNumber='982120020', LinkedtoBankAccount='N', BankAccountNumber='20004552114')),
('982120021',
BANKACC(MobileNumber='982120021', LinkedtoBankAccount='N', BankAccountNumber='20004552115'))]
In [98]:
result = uin.join(bankacc)
In [99]:
result.collect()
Out[99]:
[('982120006',
(UINCARD(UIN='UIN00007', MobileNumber='982120006'),
BANKACC(MobileNumber='982120006', LinkedtoBankAccount='N', BankAccountNumber='20004552111'))),
('982120020',
(UINCARD(UIN='UIN00021', MobileNumber='982120020'),
BANKACC(MobileNumber='982120020', LinkedtoBankAccount='N', BankAccountNumber='20004552114'))),
('982120048',
(UINCARD(UIN='UIN00049', MobileNumber='982120048'),
BANKACC(MobileNumber='982120048', LinkedtoBankAccount='N', BankAccountNumber='20004552124'))),
('982120049',
(UINCARD(UIN='UIN00050', MobileNumber='982120049'),
BANKACC(MobileNumber='982120049', LinkedtoBankAccount='N', BankAccountNumber='20004552125'))),
('982120297',
(UINCARD(UIN='UIN00298', MobileNumber='982120297'),
BANKACC(MobileNumber='982120297', LinkedtoBankAccount='N', BankAccountNumber='20004552912'))),
('982120301',
(UINCARD(UIN='UIN00302', MobileNumber='982120301'),
BANKACC(MobileNumber='982120301', LinkedtoBankAccount='N', BankAccountNumber='20004552916'))),
('982120313',
(UINCARD(UIN='UIN00314', MobileNumber='982120313'),
BANKACC(MobileNumber='982120313', LinkedtoBankAccount='N', BankAccountNumber='20004552928'))),
('982120324',
(UINCARD(UIN='UIN00325', MobileNumber='982120324'),
BANKACC(MobileNumber='982120324', LinkedtoBankAccount='N', BankAccountNumber='20004552939'))),
('982120018',
(UINCARD(UIN='UIN00019', MobileNumber='982120018'),
BANKACC(MobileNumber='982120018', LinkedtoBankAccount='N', BankAccountNumber='20004552112'))),
('982120019',
(UINCARD(UIN='UIN00020', MobileNumber='982120019'),
BANKACC(MobileNumber='982120019', LinkedtoBankAccount='N', BankAccountNumber='20004552113'))),
('982120021',
(UINCARD(UIN='UIN00022', MobileNumber='982120021'),
BANKACC(MobileNumber='982120021', LinkedtoBankAccount='N', BankAccountNumber='20004552115'))),
('982120022',
(UINCARD(UIN='UIN00023', MobileNumber='982120022'),
BANKACC(MobileNumber='982120022', LinkedtoBankAccount='N', BankAccountNumber='20004552116'))),
('982120030',
(UINCARD(UIN='UIN00031', MobileNumber='982120030'),
BANKACC(MobileNumber='982120030', LinkedtoBankAccount='N', BankAccountNumber='20004552118'))),
('982120033',
(UINCARD(UIN='UIN00034', MobileNumber='982120033'),
BANKACC(MobileNumber='982120033', LinkedtoBankAccount='N', BankAccountNumber='20004552121'))),
('982120046',
(UINCARD(UIN='UIN00047', MobileNumber='982120046'),
BANKACC(MobileNumber='982120046', LinkedtoBankAccount='N', BankAccountNumber='20004552122'))),
('982120047',
(UINCARD(UIN='UIN00048', MobileNumber='982120047'),
BANKACC(MobileNumber='982120047', LinkedtoBankAccount='N', BankAccountNumber='20004552123'))),
('982120052',
(UINCARD(UIN='UIN00053', MobileNumber='982120052'),
BANKACC(MobileNumber='982120052', LinkedtoBankAccount='N', BankAccountNumber='20004552128'))),
('982120300',
(UINCARD(UIN='UIN00301', MobileNumber='982120300'),
BANKACC(MobileNumber='982120300', LinkedtoBankAccount='N', BankAccountNumber='20004552915'))),
('982120309',
(UINCARD(UIN='UIN00310', MobileNumber='982120309'),
BANKACC(MobileNumber='982120309', LinkedtoBankAccount='N', BankAccountNumber='20004552924'))),
('982120310',
(UINCARD(UIN='UIN00311', MobileNumber='982120310'),
BANKACC(MobileNumber='982120310', LinkedtoBankAccount='N', BankAccountNumber='20004552925'))),
('982120312',
(UINCARD(UIN='UIN00313', MobileNumber='982120312'),
BANKACC(MobileNumber='982120312', LinkedtoBankAccount='N', BankAccountNumber='20004552927'))),
('982120315',
(UINCARD(UIN='UIN00316', MobileNumber='982120315'),
BANKACC(MobileNumber='982120315', LinkedtoBankAccount='N', BankAccountNumber='20004552930'))),
('982120316',
(UINCARD(UIN='UIN00317', MobileNumber='982120316'),
BANKACC(MobileNumber='982120316', LinkedtoBankAccount='N', BankAccountNumber='20004552931'))),
('982120317',
(UINCARD(UIN='UIN00318', MobileNumber='982120317'),
BANKACC(MobileNumber='982120317', LinkedtoBankAccount='N', BankAccountNumber='20004552932'))),
('982120318',
(UINCARD(UIN='UIN00319', MobileNumber='982120318'),
BANKACC(MobileNumber='982120318', LinkedtoBankAccount='N', BankAccountNumber='20004552933'))),
('982120320',
(UINCARD(UIN='UIN00321', MobileNumber='982120320'),
BANKACC(MobileNumber='982120320', LinkedtoBankAccount='N', BankAccountNumber='20004552935'))),
('982120325',
(UINCARD(UIN='UIN00326', MobileNumber='982120325'),
BANKACC(MobileNumber='982120325', LinkedtoBankAccount='N', BankAccountNumber='20004552940'))),
('982120031',
(UINCARD(UIN='UIN00032', MobileNumber='982120031'),
BANKACC(MobileNumber='982120031', LinkedtoBankAccount='N', BankAccountNumber='20004552119'))),
('982120051',
(UINCARD(UIN='UIN00052', MobileNumber='982120051'),
BANKACC(MobileNumber='982120051', LinkedtoBankAccount='N', BankAccountNumber='20004552127'))),
('982120053',
(UINCARD(UIN='UIN00054', MobileNumber='982120053'),
BANKACC(MobileNumber='982120053', LinkedtoBankAccount='N', BankAccountNumber='20004552129'))),
('982120055',
(UINCARD(UIN='UIN00056', MobileNumber='982120055'),
BANKACC(MobileNumber='982120055', LinkedtoBankAccount='N', BankAccountNumber='20004552131'))),
('982120298',
(UINCARD(UIN='UIN00299', MobileNumber='982120298'),
BANKACC(MobileNumber='982120298', LinkedtoBankAccount='N', BankAccountNumber='20004552913'))),
('982120299',
(UINCARD(UIN='UIN00300', MobileNumber='982120299'),
BANKACC(MobileNumber='982120299', LinkedtoBankAccount='N', BankAccountNumber='20004552914'))),
('982120304',
(UINCARD(UIN='UIN00305', MobileNumber='982120304'),
BANKACC(MobileNumber='982120304', LinkedtoBankAccount='N', BankAccountNumber='20004552919'))),
('982120306',
(UINCARD(UIN='UIN00307', MobileNumber='982120306'),
BANKACC(MobileNumber='982120306', LinkedtoBankAccount='N', BankAccountNumber='20004552921'))),
('982120307',
(UINCARD(UIN='UIN00308', MobileNumber='982120307'),
BANKACC(MobileNumber='982120307', LinkedtoBankAccount='N', BankAccountNumber='20004552922'))),
('982120314',
(UINCARD(UIN='UIN00315', MobileNumber='982120314'),
BANKACC(MobileNumber='982120314', LinkedtoBankAccount='N', BankAccountNumber='20004552929'))),
('982120319',
(UINCARD(UIN='UIN00320', MobileNumber='982120319'),
BANKACC(MobileNumber='982120319', LinkedtoBankAccount='N', BankAccountNumber='20004552934'))),
('982120029',
(UINCARD(UIN='UIN00030', MobileNumber='982120029'),
BANKACC(MobileNumber='982120029', LinkedtoBankAccount='N', BankAccountNumber='20004552117'))),
('982120032',
(UINCARD(UIN='UIN00033', MobileNumber='982120032'),
BANKACC(MobileNumber='982120032', LinkedtoBankAccount='N', BankAccountNumber='20004552110'))),
('982120050',
(UINCARD(UIN='UIN00051', MobileNumber='982120050'),
BANKACC(MobileNumber='982120050', LinkedtoBankAccount='N', BankAccountNumber='20004552126'))),
('982120054',
(UINCARD(UIN='UIN00055', MobileNumber='982120054'),
BANKACC(MobileNumber='982120054', LinkedtoBankAccount='N', BankAccountNumber='20004552130'))),
('982120056',
(UINCARD(UIN='UIN00057', MobileNumber='982120056'),
BANKACC(MobileNumber='982120056', LinkedtoBankAccount='N', BankAccountNumber='20004552132'))),
('982120296',
(UINCARD(UIN='UIN00297', MobileNumber='982120296'),
BANKACC(MobileNumber='982120296', LinkedtoBankAccount='N', BankAccountNumber='20004552911'))),
('982120302',
(UINCARD(UIN='UIN00303', MobileNumber='982120302'),
BANKACC(MobileNumber='982120302', LinkedtoBankAccount='N', BankAccountNumber='20004552917'))),
('982120303',
(UINCARD(UIN='UIN00304', MobileNumber='982120303'),
BANKACC(MobileNumber='982120303', LinkedtoBankAccount='N', BankAccountNumber='20004552918'))),
('982120305',
(UINCARD(UIN='UIN00306', MobileNumber='982120305'),
BANKACC(MobileNumber='982120305', LinkedtoBankAccount='N', BankAccountNumber='20004552920'))),
('982120308',
(UINCARD(UIN='UIN00309', MobileNumber='982120308'),
BANKACC(MobileNumber='982120308', LinkedtoBankAccount='N', BankAccountNumber='20004552923'))),
('982120311',
(UINCARD(UIN='UIN00312', MobileNumber='982120311'),
BANKACC(MobileNumber='982120311', LinkedtoBankAccount='N', BankAccountNumber='20004552926'))),
('982120321',
(UINCARD(UIN='UIN00322', MobileNumber='982120321'),
BANKACC(MobileNumber='982120321', LinkedtoBankAccount='N', BankAccountNumber='20004552936'))),
('982120322',
(UINCARD(UIN='UIN00323', MobileNumber='982120322'),
BANKACC(MobileNumber='982120322', LinkedtoBankAccount='N', BankAccountNumber='20004552937'))),
('982120323',
(UINCARD(UIN='UIN00324', MobileNumber='982120323'),
BANKACC(MobileNumber='982120323', LinkedtoBankAccount='N', BankAccountNumber='20004552938'))),
('982120326',
(UINCARD(UIN='UIN00327', MobileNumber='982120326'),
BANKACC(MobileNumber='982120326', LinkedtoBankAccount='N', BankAccountNumber='20004552941')))]
In [ ]: