Friday, December 17, 2021

Pie Plot using Python for African Countries from WhatsApp Phone Numbers Dataset



FYI:
Country: Code 
Kenya: +254
Uganda: +256

import os
import re
import json
from time import time
import pandas as pd

import matplotlib.pyplot as plt

import phonenumbers
from phonenumbers.phonenumberutil import region_code_for_country_code
from phonenumbers.phonenumberutil import region_code_for_number
import pycountry


# Read every file found under the export directory into one flat list of lines.
# Files are opened inside the walk itself so that each directory's files are
# read with their own directory path; remembering only the last (path, files)
# pair would skip every other directory and fail with a NameError when the
# directory does not exist.
all_lines = []

for dir_path, _subdirs, file_names in os.walk("f1/WA_Africa_202112/"):
    for file_name in file_names:
        with open(os.path.join(dir_path, file_name), mode='r', encoding='utf-8') as f:
            all_lines += f.readlines()

# Discard every line that contains the string "Ashish Jain".
lines_2 = [line for line in all_lines if "Ashish Jain" not in line]

# Partition the remaining lines: those containing a WhatsApp group invite
# link are collected in lines_wags, all others in lines_3.
lines_wags = []
lines_3 = []
for line in lines_2:
    target = lines_wags if "https://chat.whatsapp.com/" in line else lines_3
    target.append(line)

# Phone-number layouts searched for in every line, compiled once up front.
# The order is the order in which matches are appended for a given line.
#
# NOTE(review): the layouts overlap. For example "[+][0-9]{11}" also matches
# the first 11 digits of anything "[+][0-9]{12}" matches, so a single number
# in the text can yield both a full and a truncated entry.
PHONE_PATTERNS = [
    re.compile(pattern_text)
    for pattern_text in (
        r"[+][0-9]{3}\s[0-9][0-9][0-9]\s[0-9]{6}",  # +123 123 123456
        r"[+][0-9][0-9][0-9]\s[0-9]{9}",  # +254 111222333
        r"[+][0-9][0-9][0-9]\s[0-9][0-9]\s[0-9][0-9][0-9]\s[0-9][0-9][0-9][0-9]",  # +258 12 345 6789
        r"[+][0-9][0-9]\s[0-9][0-9][0-9]\s[0-9]{7}",  # +92 123 1234567
        r"[+][0-9][0-9][0-9]\s[0-9][0-9][0-9]\s[0-9][0-9][0-9]\s[0-9][0-9][0-9][0-9]",  # +234 123 456 1234
        r"[+][0-9][0-9]\s[0-9][0-9][0-9][0-9]\s[0-9][0-9][0-9][0-9][0-9][0-9]",  # +44 1234 123456
        r"[0][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",  # 0123412345
        r"[+][0-9]{11}",  # +12345678901
        r"[+][0-9][0-9][0-9]\s[0-9][0-9][0-9][-][0-9][0-9][0-9][0-9][0-9][0-9]",  # +212 123-123456
        r"[+][0-9]\s[\(][0-9][0-9][0-9][\)]\s[0-9]{3}[-][0-9]{4}",  # +1 (123) 123-1234
        r"[+][0-9][0-9]\s[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",  # +31 12345671234
        r"[+][0-9]{2}\s[0-9]{5}\s[0-9]{5}",  # +91 12345 12345
        r"[+][0-9]{2}\s[0-9]{3}[-][0-9]{4}[-][0-9]{5}",  # +62 123-1234-12345
        r"[+][0-9][0-9]\s[0-9][0-9][0-9]\s[0-9][0-9][0-9]\s[0-9][0-9]\s[0-9][0-9]",  # +90 123 123 27 12
        r"[+][0-9]{2}\s[0-9]{4}\s[0-9]{3}\s[0-9]{3}",  # +91 1234 123 123
        r"[+][0-9]{2}\s[0-9]{2}\s[0-9]{3}\s[0-9]{4}",  # +27 65 123 1234
        r"[+][0-9]{5}[-][0-9]{7}",  # +12345-1234567
        r"[+][0-9]{3}\s[0-9]{4}\s[0-9]{4}",  # +968 1234 1234
        r"[+][0-9]{12}",  # +123451234512
        r"[+][0-9]{3}\s[0-9]{8}",  # +229 12345678
        r"[+][0-9]{2}\s[0-9]{3}\s[0-9]{3}\s[0-9]{3}",  # +40 123 123 123
        r"[+][0-9]{2}\s[0-9]{3}\s[0-9]{3}\s[0-9]{4}",  # +98 123 123 1234
        r"[+][0-9]{3}\s[0-9]{3}\s[0-9]{3}\s[0-9]{3}",  # +123 123 123 123
        # Non-capturing group: with a capturing group re.findall returns only
        # the group's text (the last "NN " pair) instead of the whole number.
        r"[+][0-9]{3}\s(?:[0-9]{2}\s){3}[0-9]{2}",  # +228 12 34 56 78
        r"[+][0-9]{3}\s[0-9]{2}\s[0-9]{6}",  # +232 30 123456
        r"[+][0-9]{3}\s[0-9]{3}\s[0-9]{2}\s[0-9]{2}\s[0-9]{2}",  # +265 123 45 67 89
        r"[+][0-9]{3}\s[0-9]{2}\s[0-9]{3}\s[0-9]{3}",  # +267 12 123 123
    )
]

# phone_numbers collects every match; lines_temp holds, for each match, the
# line it was found in (same index as the match in phone_numbers).
phone_numbers = []
lines_temp = []
for line in lines_3:
    matches = []
    for pattern in PHONE_PATTERNS:
        matches += pattern.findall(line)

    phone_numbers += matches
    lines_temp += [line] * len(matches)



# Current epoch time as text; it becomes part of the output file name.
ts = str(time())

# Report the match count before and after removing duplicates, leaving
# phone_numbers as a sorted list of unique strings.
print("Before:", len(phone_numbers))
phone_numbers = list(set(phone_numbers))
phone_numbers.sort()
print("After:", len(phone_numbers))

# Dump the unique numbers as a JSON array.
out_name = "phone_numbers_" + ts + ".txt"
with open(out_name, mode="w", encoding="utf-8") as out_file:
    serialized = json.dumps(phone_numbers)
    out_file.write(serialized)

# Swap non-breaking spaces for plain spaces before parsing.
phone_numbers_2 = [number.replace("\xa0", " ") for number in phone_numbers]

# Parallel lists: each number that could be resolved, and its country name.
phn_num_list = []
cntry = []

for number in phone_numbers_2:
    # Numbers that cannot be parsed are skipped on purpose (best effort),
    # but only the parser's own error is caught so real bugs still surface.
    try:
        pn = phonenumbers.parse(number)
    except phonenumbers.NumberParseException:
        continue

    region = region_code_for_number(pn)
    if region is None:
        continue

    # Depending on the pycountry version an unknown code either returns
    # None or raises a LookupError; treat both as "no country".
    try:
        country = pycountry.countries.get(alpha_2=region)
    except LookupError:
        country = None
    if country is None:
        continue

    phn_num_list.append(number)
    cntry.append(country.name)

df1 = pd.DataFrame({
    "phn": phn_num_list,
    "cntry": cntry
})

# One row per country; the "phn" column holds how many numbers it has.
df2 = df1.groupby('cntry').count()


def plot_pie(labels, sizes, title=""):
    """Show a pie chart with the first slice pulled out.

    Args:
        labels: Sequence of slice labels; its length sets how many palette
            colors and explode offsets are passed to matplotlib.
        sizes: Slice sizes, passed to ``plt.pie`` unchanged.
        title: Text shown above the chart.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    palette = ['#f47961', '#f0c419', '#255c61', '#78909c', '#6ad4cf', '#17aee8', '#5c6bc0', '#444b6e', '#ef4c60', '#744593',
               '#ee5691', '#9ccc65', '#708b75', '#d1cb65', '#0d8de1', '#a4554b', '#694f5d', '#45adb3', '#26a69a', '#bdc7cc', ]
    n_slices = len(labels)
    colors = palette[0:n_slices]

    # Offset only the first slice. The list is sized from the label count
    # (not a fixed maximum) and sliced so that zero labels gives [].
    explode = ([0.1] + [0] * (n_slices - 1))[0:n_slices]

    # Plot
    plt.figure(num=None, figsize=(9, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
    plt.title(title)
    plt.axis('equal')
    plt.show()


# The nine largest per-country counts, in descending order.
# NOTE(review): the name says 10 but the slice keeps 9 values; with the
# 'Others' bucket that gives up to 10 pie slices - confirm this is intended.
top_10 = sorted(df2.phn.values, reverse=True)[0:9]

df2.head()  # notebook display; no effect when run as a script

# Turn the 'cntry' index into an ordinary column so rows can be labelled.
df2 = df2.reset_index()
df2.head()  # notebook display; no effect when run as a script


def get_cntry_label(in_row):
    """Return the row's country name if its count is in top_10, else 'Others'.

    Membership is tested on the count value, so any country whose count
    equals one of the top counts keeps its own label.
    """
    if in_row['phn'] in top_10:
        return in_row['cntry']
    return 'Others'


df2['cntry_lbl'] = df2.apply(get_cntry_label, axis=1)

# Sum the counts per label. No argument is passed to sum(): its first
# positional parameter is numeric_only, not a column name.
df3 = df2[['cntry_lbl', 'phn']].groupby('cntry_lbl').sum()
df3  # notebook display; no effect when run as a script

plot_pie(df3.index, df3.phn.values, 'Countries')
Tags: Technology, Data Visualization, Machine Learning

No comments:

Post a Comment