import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity # Expects 2D arrays as input
from scipy.spatial.distance import cosine # Works with 1D vectors
from sklearn.metrics import classification_report
# Load the pretrained sentence-embedding model; every encode() call below goes through it.
smodel = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Read the BBC news training set; later code reads its 'Text' and 'Category' columns.
df1 = pd.read_csv('bbc_news_train.csv')
# Notebook-style preview of the first rows (the result is discarded when run as a plain script).
df1.head()
def get_sentence_vector(query):
    """Encode a single piece of text with the module-level ``smodel``.

    Args:
        query: The text to embed.

    Returns:
        The first (and only) entry of ``smodel.encode([query])``, i.e. the
        embedding of ``query``.
    """
    # encode() receives a one-element list, so element 0 is the vector for `query`.
    return smodel.encode([query])[0]
%%time
df1['textVec'] = df1['Text'].apply(lambda x: get_sentence_vector(x))
df1.head()
def std_category(x):
    """Map the dataset's short category names onto their full forms.

    Args:
        x: A category value, e.g. ``'tech'`` or ``'sport'``.

    Returns:
        ``'technology'`` for ``'tech'``, ``'sports'`` for ``'sport'``, and
        ``x`` unchanged for anything else.
    """
    if x == 'tech':
        return 'technology'
    if x == 'sport':
        return 'sports'
    return x
# Rewrite each value of the 'Category' column through std_category, in place.
df1['Category'] = df1['Category'].map(std_category)
# Label text -> embedding. Each label is compared against every row, so it is
# encoded once and reused instead of being re-encoded on every call.
# NOTE: entries are tied to whichever ``smodel`` produced them; clear this dict
# if ``smodel`` is replaced.
_LABEL_VECTOR_CACHE = {}


def get_cosine_sim(x, Y):
    """Return the cosine distance between vector ``x`` and the embedding of ``Y``.

    Despite the function name, the value comes from
    ``scipy.spatial.distance.cosine``, which is a *distance*
    (1 - cosine similarity): a smaller result means the two are closer.

    Args:
        x: A 1-D embedding vector.
        Y: Label text (a hashable value such as ``str``) to embed with
            ``smodel`` and compare against ``x``.

    Returns:
        The cosine distance between ``x`` and the embedding of ``Y``.
    """
    y = _LABEL_VECTOR_CACHE.get(Y)
    if y is None:
        y = smodel.encode([Y])[0]
        _LABEL_VECTOR_CACHE[Y] = y
    return cosine(x, y)
df1['Category'].unique()
# Output shown in the original notebook:
# array(['business', 'technology', 'politics', 'sports', 'entertainment'], dtype=object)

# One 'cdist_<label>' column per category: the cosine distance between each
# article's embedding and the embedding of the bare label word.
for label in ['business', 'technology', 'politics', 'sports', 'entertainment']:
    # args=(label,) passes the label as get_cosine_sim's second argument.
    df1['cdist_' + label] = df1['textVec'].apply(get_cosine_sim, args=(label,))
def get_prediction(in_row):
    """Return the label whose ``cdist_<label>`` entry in ``in_row`` is smallest.

    Args:
        in_row: A mapping-like row (e.g. a DataFrame row or a dict) holding the
            five ``cdist_*`` distance values.

    Returns:
        The label part of the column with the smallest distance; on ties the
        first column in the list below wins. Returns ``""`` when no value
        compares below infinity (for example when every distance is NaN).
    """
    dist_columns = (
        'cdist_business',
        'cdist_technology',
        'cdist_politics',
        'cdist_sports',
        'cdist_entertainment',
    )
    # Start from +inf rather than a magic number so any finite distance can win.
    min_dist = float('inf')
    label = ""
    for col in dist_columns:
        d = in_row[col]
        if d < min_dist:
            min_dist = d
            # Drop only the 'cdist_' prefix; maxsplit=1 keeps the rest of the name whole.
            label = col.split('_', 1)[1]
    return label
# Label every row with the category whose distance column is smallest.
df1['prediction'] = df1.apply(get_prediction, axis=1)
df1.head()

# Display names for the report rows.
# NOTE(review): presumably matched to the sorted label values - confirm against
# the classification_report documentation.
target_names = ['business', 'entertainment', 'politics', 'sports', 'technology']
print(
    classification_report(
        df1['Category'],
        df1['prediction'],
        target_names=target_names,
    )
)
from collections import Counter

# Tally how many rows carry each (normalised) category label.
Counter(df1['Category'])
# Output shown in the original notebook:
# Counter({'business': 336,
#          'technology': 261,
#          'politics': 274,
#          'sports': 346,
#          'entertainment': 273})
Pages
- Index of Lessons in Technology
- Index of Book Summaries
- Index of Book Lists And Downloads
- Index For Job Interviews Preparation
- Index of "Algorithms: Design and Analysis"
- Python Course (Index)
- Data Analytics Course (Index)
- Index of Machine Learning
- Postings Index
- Index of BITS WILP Exam Papers and Content
- Lessons in Investing
- Index of Math Lessons
- Downloads
- Index of Management Lessons
- Book Requests
- Index of English Lessons
- Index of Medicines
- Index of Quizzes (Educational)
Tuesday, June 7, 2022
Creating a Taxonomy for BBC News Articles (Part 5 based on - A Hybrid Approach to Hypernym Discovery)
Subscribe to:
Post Comments (Atom)




No comments:
Post a Comment