Tuesday, June 7, 2022

Creating a Taxonomy for BBC News Articles (Part 5, based on "A Hybrid Approach to Hypernym Discovery")


from functools import lru_cache

import pandas as pd
from scipy.spatial.distance import cosine # Works with 1D vectors
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity # Expects 2D arrays as input



# Name of the pretrained sentence-embedding model and the path of the
# training data, kept as named constants so they are easy to spot and change.
MODEL_NAME = 'distilbert-base-nli-mean-tokens'
TRAIN_CSV = 'bbc_news_train.csv'

# Sentence encoder used for both article texts and category labels below.
smodel = SentenceTransformer(MODEL_NAME)

# Training articles; later cells read the 'Text' and 'Category' columns.
df1 = pd.read_csv(TRAIN_CSV)

# Notebook-style preview of the first rows.
df1.head()


def get_sentence_vector(query):
    """Return the sentence embedding of a single text.

    The model is called with a one-element list, so the first (and only)
    row of its output is the vector for *query*.
    """
    return smodel.encode([query])[0]


# %%time  -- Jupyter cell magic from the original notebook; it is only valid
# as the first line of a notebook cell, so it is kept here as a comment.
# NOTE(review): this encodes one article per call; passing the whole column
# to smodel.encode() in one batch would likely be faster -- confirm that the
# resulting vectors are acceptable before switching.
df1['textVec'] = df1['Text'].apply(get_sentence_vector)
df1.head()
def std_category(x):
    """Expand the dataset's abbreviated category names.

    'tech' becomes 'technology' and 'sport' becomes 'sports'; every other
    value is returned unchanged.
    """
    if x == 'tech':
        return 'technology'
    if x == 'sport':
        return 'sports'
    return x


df1['Category'] = df1['Category'].apply(std_category)


@lru_cache(maxsize=None)
def _label_vector(label):
    """Return the embedding of *label*, running the encoder once per label."""
    return smodel.encode([label])[0]


def get_cosine_sim(x, Y):
    """Return the cosine distance between vector *x* and the embedding of *Y*.

    Despite the name, the result is scipy's cosine *distance*
    (1 - cosine similarity), so a smaller value means a closer match.

    Args:
        x: 1-D embedding vector of an article.
        Y: Category label text to compare against.
    """
    # The label embedding is cached: it is identical for every row, and
    # re-encoding it per row was the dominant cost of this step.
    return cosine(x, _label_vector(Y))


# In the original notebook, df1['Category'].unique() displayed:
# array(['business', 'technology', 'politics', 'sports', 'entertainment'], dtype=object)
df1['Category'].unique()

# Candidate labels, in the column order used by the original notebook.
CATEGORY_LABELS = ['business', 'technology', 'politics', 'sports', 'entertainment']

# One 'cdist_<label>' column per category holding the article-to-label distance.
for _label in CATEGORY_LABELS:
    df1['cdist_' + _label] = df1['textVec'].apply(get_cosine_sim, args=(_label,))


def get_prediction(in_row):
    """Return the category whose 'cdist_<label>' value is smallest in *in_row*.

    Ties go to the label listed first in CATEGORY_LABELS. If no distance
    compares as smaller than the starting bound (e.g. all values are NaN),
    an empty string is returned.
    """
    min_dist = float('inf')
    label = ""
    for name in CATEGORY_LABELS:
        d = in_row['cdist_' + name]
        if d < min_dist:
            min_dist = d
            label = name
    return label


df1['prediction'] = df1.apply(get_prediction, axis=1)
df1.head()
# Categories to report on, in the row order of the printed report.
target_names = ['business', 'entertainment', 'politics', 'sports', 'technology']

# Passing labels= ties each report row to its label explicitly instead of
# relying on target_names happening to match the sorted order of the labels
# found in the data.
print(classification_report(df1['Category'], df1['prediction'],
                            labels=target_names, target_names=target_names))
from collections import Counter

# Number of articles per category. In the original notebook this displayed:
# Counter({'business': 336, 'technology': 261, 'politics': 274, 'sports': 346, 'entertainment': 273})
Counter(df1['Category'])
Tags: Technology, Natural Language Processing

No comments:

Post a Comment