survival8: Creating a Taxonomy for BBC News Articles (Part 5, based on "A Hybrid Approach to Hypernym Discovery")

Tuesday, June 7, 2022

Creating a Taxonomy for BBC News Articles (Part 5, based on "A Hybrid Approach to Hypernym Discovery")


import pandas as pd
from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity # Expects 2D arrays as input
from scipy.spatial.distance import cosine # Works with 1D vectors

from sklearn.metrics import classification_report



# Load the sentence-embedding model by name; every text and label vector below comes from it.
smodel = SentenceTransformer('distilbert-base-nli-mean-tokens')



# Read the BBC news training set; later cells use its 'Text' and 'Category' columns.
df1 = pd.read_csv('bbc_news_train.csv')



# Preview the first rows of the loaded frame.
df1.head()





def get_sentence_vector(query):
    """Return the embedding vector of a single piece of text.

    The text is handed to ``smodel.encode`` as a one-element batch, and the
    first (only) row of the result is returned.
    """
    batch = smodel.encode([query])
    return batch[0]



%%time
# Embed every article body and keep the vector alongside its row.
df1['textVec'] = df1['Text'].apply(get_sentence_vector)



# Inspect the frame with the new embedding column.
df1.head()





def std_category(x):
    """Expand the abbreviated category names to their full-word form.

    'tech' becomes 'technology' and 'sport' becomes 'sports'; any other
    value is returned unchanged.
    """
    expansions = (('tech', 'technology'), ('sport', 'sports'))
    for short_name, full_name in expansions:
        if x == short_name:
            return full_name
    return x

# Normalise the dataset's short category names to the full words used as labels below.
df1['Category'] = df1['Category'].apply(std_category)

# Label embeddings computed so far, together with the model object that
# produced them (so a re-bound `smodel` never sees stale vectors).
_label_vec_cache = {'model': None, 'vectors': {}}


def get_cosine_sim(x, Y):
    """Return the cosine *distance* between a vector and an embedded label.

    Despite the function's name, the value comes from
    ``scipy.spatial.distance.cosine``, which is a distance: smaller values
    mean the two vectors are more alike.

    Args:
        x: 1-D embedding vector (e.g. one entry of the 'textVec' column).
        Y: Label text (a string); it is embedded with ``smodel``.

    Returns:
        The cosine distance between ``x`` and the embedding of ``Y``.
    """
    # This function is called once per row with the same handful of labels,
    # so encode each label only once instead of re-running the model per row.
    if _label_vec_cache['model'] is not smodel:
        _label_vec_cache['model'] = smodel
        _label_vec_cache['vectors'] = {}
    vectors = _label_vec_cache['vectors']
    if Y not in vectors:
        vectors[Y] = smodel.encode([Y])[0]

    return cosine(x, vectors[Y])

# Cosine distance from each article vector to the embedding of the word 'business'.
df1['cdist_business'] = df1['textVec'].apply(lambda x: get_cosine_sim(x, 'business'))

# List the distinct category labels present in the frame.
df1['Category'].unique()



array(['business', 'technology', 'politics', 'sports', 'entertainment'], dtype=object)



# Add one distance column per remaining category label, in the same order as before.
for category in ('technology', 'politics', 'sports', 'entertainment'):
    df1['cdist_' + category] = df1['textVec'].apply(
        lambda vec, label=category: get_cosine_sim(vec, label)
    )

def get_prediction(in_row):
    """Pick the category whose distance column holds the smallest value.

    The five ``cdist_*`` entries of ``in_row`` are scanned in a fixed order;
    the first strictly smallest one wins, and the part of its name after
    ``cdist_`` is returned. An empty string is returned when no entry is
    below the starting threshold (for instance, when every value is NaN).
    """
    distance_columns = (
        'cdist_business',
        'cdist_technology',
        'cdist_politics',
        'cdist_sports',
        'cdist_entertainment',
    )
    best_label = ""
    best_dist = 99999999  # starting threshold; a value must be below it to count
    for column in distance_columns:
        dist = in_row[column]
        if not dist < best_dist:
            continue
        best_dist = dist
        best_label = column.split('_')[1]
    return best_label

# Label every row with the category whose distance column is smallest.
df1['prediction'] = df1.apply(get_prediction, axis=1)

df1.head()





# Category labels, in the order they should appear in the report.
target_names = ['business', 'entertainment', 'politics', 'sports', 'technology']
# Pass `labels` explicitly so each display name is bound to its own category
# rather than depending on the report's default label ordering, and so an
# unexpected predicted value cannot break the name/label pairing.
print(classification_report(df1['Category'], df1['prediction'],
                            labels=target_names, target_names=target_names))





from collections import Counter
# Count the articles per value of the 'Category' column.
Counter(df1['Category'])



Counter({'business': 336,
    'technology': 261,
    'politics': 274,
    'sports': 346,
    'entertainment': 273})

survival8

Pages

Tuesday, June 7, 2022

Creating a Taxonomy for BBC News Articles (Part 5, based on "A Hybrid Approach to Hypernym Discovery")

No comments:

Post a Comment