FYI:
Country: Code
Kenya: +254
Uganda: +256
import os
import re
import json
from time import time
import pandas as pd
import matplotlib.pyplot as plt
import phonenumbers
from phonenumbers.phonenumberutil import region_code_for_country_code
from phonenumbers.phonenumberutil import region_code_for_number
import pycountry
# Read every line of every chat export found under the dataset directory.
# Files in nested sub-directories are included too, and a missing or empty
# directory simply leaves ``all_lines`` empty.
all_lines = []
for p, _subdirs, files_list in os.walk("f1/WA_Africa_202112/"):
    for file_name in files_list:
        with open(os.path.join(p, file_name), mode="r", encoding="utf-8") as chat_file:
            all_lines += chat_file.readlines()
# Discard every line that mentions "Ashish Jain".
lines_2 = [line for line in all_lines if "Ashish Jain" not in line]

# Split what is left: lines carrying a WhatsApp group invite link go to
# ``lines_wags``; all other lines go to ``lines_3``.
lines_3 = []
lines_wags = []
for line in lines_2:
    bucket = lines_wags if "https://chat.whatsapp.com/" in line else lines_3
    bucket.append(line)
# Every phone-number layout searched for, compiled once instead of on each
# line. The trailing comment on each entry shows an example of the layout.
# Patterns are not anchored, so a shorter layout can also match the front of
# a longer number; all such matches are kept.
PHONE_NUMBER_PATTERNS = [re.compile(pattern) for pattern in (
    r"[+][0-9]{3}\s[0-9][0-9][0-9]\s[0-9]{6}",  # +123 123 123456
    r"[+][0-9][0-9][0-9]\s[0-9]{9}",  # +254 111222333
    r"[+][0-9][0-9][0-9]\s[0-9][0-9]\s[0-9][0-9][0-9]\s[0-9][0-9][0-9][0-9]",  # +258 12 345 6789
    r"[+][0-9][0-9]\s[0-9][0-9][0-9]\s[0-9]{7}",  # +92 123 1234567
    r"[+][0-9][0-9][0-9]\s[0-9][0-9][0-9]\s[0-9][0-9][0-9]\s[0-9][0-9][0-9][0-9]",  # +234 123 456 1234
    r"[+][0-9][0-9]\s[0-9][0-9][0-9][0-9]\s[0-9][0-9][0-9][0-9][0-9][0-9]",  # +44 1234 123456
    r"[0][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",  # 0123412345
    r"[+][0-9]{11}",  # +12345678901
    r"[+][0-9][0-9][0-9]\s[0-9][0-9][0-9][-][0-9][0-9][0-9][0-9][0-9][0-9]",  # +212 123-123456
    r"[+][0-9]\s[\(][0-9][0-9][0-9][\)]\s[0-9]{3}[-][0-9]{4}",  # +1 (123) 123-1234
    r"[+][0-9][0-9]\s[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",  # +31 12345671234
    r"[+][0-9]{2}\s[0-9]{5}\s[0-9]{5}",  # +91 12345 12345
    r"[+][0-9]{2}\s[0-9]{3}[-][0-9]{4}[-][0-9]{5}",  # +62 123-1234-12345
    r"[+][0-9][0-9]\s[0-9][0-9][0-9]\s[0-9][0-9][0-9]\s[0-9][0-9]\s[0-9][0-9]",  # +90 123 123 27 12
    r"[+][0-9]{2}\s[0-9]{4}\s[0-9]{3}\s[0-9]{3}",  # +91 1234 123 123
    r"[+][0-9]{2}\s[0-9]{2}\s[0-9]{3}\s[0-9]{4}",  # +27 65 123 1234
    r"[+][0-9]{5}[-][0-9]{7}",  # +12345-1234567
    r"[+][0-9]{3}\s[0-9]{4}\s[0-9]{4}",  # +968 1234 1234
    r"[+][0-9]{12}",  # +123451234512
    r"[+][0-9]{3}\s[0-9]{8}",  # +229 12345678
    r"[+][0-9]{2}\s[0-9]{3}\s[0-9]{3}\s[0-9]{3}",  # +40 123 123 123
    r"[+][0-9]{2}\s[0-9]{3}\s[0-9]{3}\s[0-9]{4}",  # +98 123 123 1234
    r"[+][0-9]{3}\s[0-9]{3}\s[0-9]{3}\s[0-9]{3}",  # +123 123 123 123
    # Non-capturing group: with a capturing group, findall() returns only the
    # last "NN " fragment instead of the whole number.
    r"[+][0-9]{3}\s(?:[0-9]{2}\s){3}[0-9]{2}",  # +228 12 34 56 78
    r"[+][0-9]{3}\s[0-9]{2}\s[0-9]{6}",  # +232 30 123456
    r"[+][0-9]{3}\s[0-9]{3}\s[0-9]{2}\s[0-9]{2}\s[0-9]{2}",  # +265 123 45 67 89
    r"[+][0-9]{3}\s[0-9]{2}\s[0-9]{3}\s[0-9]{3}",  # +267 12 123 123
)]

# ``phone_numbers`` collects every match; ``lines_temp`` holds, at the same
# index, the line each match was found in.
phone_numbers = []
lines_temp = []
for i in lines_3:
    w = [match for pattern in PHONE_NUMBER_PATTERNS for match in pattern.findall(i)]
    phone_numbers += w
    lines_temp += [i] * len(w)
# Current time as a string; it becomes part of the output file name.
ts = str(time())

print("Before:", len(phone_numbers))
# Collapse duplicates and keep the survivors in sorted order.
phone_numbers = sorted(set(phone_numbers))
print("After:", len(phone_numbers))

# Persist the de-duplicated numbers as a JSON array.
output_name = "".join(("phone_numbers_", ts, ".txt"))
with open(file=output_name, mode="w", encoding="utf-8") as out_file:
    serialized = json.dumps(phone_numbers)
    out_file.write(serialized)
# Replace non-breaking spaces with plain spaces before parsing.
phone_numbers_2 = [i.replace("\xa0", " ") for i in phone_numbers]

# Parallel lists: a number that resolved to a country, and that country's name.
phn_num_list = []
cntry = []
for i in phone_numbers_2:
    try:
        pn = phonenumbers.parse(i)
    except phonenumbers.NumberParseException:
        # Not parseable without a default region (e.g. no "+" prefix): skip.
        continue
    region = region_code_for_number(pn)
    try:
        country = pycountry.countries.get(alpha_2=region) if region else None
    except LookupError:
        # NOTE(review): presumably some pycountry versions raise instead of
        # returning None for an unknown code; treated as "no country" here.
        country = None
    if country is None:
        # Numbers that cannot be tied to a country are left out, as before.
        continue
    phn_num_list.append(i)
    cntry.append(country.name)

df1 = pd.DataFrame({
    "phn": phn_num_list,
    "cntry": cntry,
})
# Number of phone numbers per country; the index is the country name.
df2 = df1.groupby('cntry').count()
def plot_pie(labels, sizes, title=""):
    """Show a pie chart with one slice per label, the first slice pulled out.

    Args:
        labels: Slice labels; ``len(labels)`` decides how many colors and
            explode offsets are handed to matplotlib.
        sizes: Slice sizes, passed to ``plt.pie`` unchanged.
        title: Text placed above the chart.
    """
    palette = ['#f47961', '#f0c419', '#255c61', '#78909c', '#6ad4cf', '#17aee8', '#5c6bc0', '#444b6e', '#ef4c60', '#744593',
               '#ee5691', '#9ccc65', '#708b75', '#d1cb65', '#0d8de1', '#a4554b', '#694f5d', '#45adb3', '#26a69a', '#bdc7cc']
    n_slices = len(labels)
    colors = palette[:n_slices]
    # Offset only the first slice. The list is sized from the label count
    # (not a fixed padding), and the final slice keeps it empty for no labels.
    explode = ([0.1] + [0] * (n_slices - 1))[:n_slices]

    # Plot
    plt.figure(num=None, figsize=(9, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
    plt.title(title)
    plt.axis('equal')
    plt.show()
# Largest per-country counts, in descending order.
# NOTE(review): despite the name, the slice keeps nine values - confirm intended.
top_10 = sorted(df2.phn.values, reverse=True)[:9]
df2.head()
# Move the group key out of the index so it becomes an ordinary column.
df2 = df2.reset_index()
df2.head()
def get_cntry_label(in_row):
    """Return the row's country name, or 'Others' when it is not a top entry.

    The row's ``'phn'`` value is looked up in the module-level ``top_10``
    collection; on a hit the row's ``'cntry'`` value is returned.
    """
    return in_row['cntry'] if in_row['phn'] in top_10 else 'Others'
# Label each row with its own country name or 'Others'.
df2['cntry_lbl'] = df2.apply(get_cntry_label, axis=1)
# Total the counts per label. GroupBy.sum() takes no column name (its first
# positional parameter is ``numeric_only``), so none is passed; 'phn' is the
# only column left to total after grouping.
df3 = df2[['cntry_lbl', 'phn']].groupby('cntry_lbl').sum()
df3
plot_pie(df3.index, df3.phn.values, 'Countries')
Pages
- Index of Lessons in Technology
- Index of Book Summaries
- Index of Book Lists And Downloads
- Index For Job Interviews Preparation
- Index of "Algorithms: Design and Analysis"
- Python Course (Index)
- Data Analytics Course (Index)
- Index of Machine Learning
- Postings Index
- Index of BITS WILP Exam Papers and Content
- Lessons in Investing
- Index of Math Lessons
- Index of Management Lessons
- Book Requests
- Index of English Lessons
- Index of Medicines
- Index of Quizzes (Educational)
Friday, December 17, 2021
Pie Plot using Python for African Countries from WhatsApp Phone Numbers Dataset
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment