Monday, June 20, 2022

Creating a Taxonomy for BBC News Articles (Part 8 - Using Latent Dirichlet Allocation for topic modeling)


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.tokenize import word_tokenize

import sys

import gensim
from gensim import corpora
from gensim.models.lsimodel import LsiModel, stochastic_svd
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import RpModel
from gensim.matutils import corpus2dense, Dense2Corpus
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel # Latent Dirichlet Allocation and not 'Latent Discriminant Analysis'

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.cluster import KMeans
from sklearn.random_projection import SparseRandomProjection, johnson_lindenstrauss_min_dim
from sklearn.random_projection import GaussianRandomProjection
from sklearn.metrics.pairwise import euclidean_distances
import string

from collections import Counter
from preprocess import preprocess_text


Latent Dirichlet Allocation (LDA) is a generative statistical technique for discovering the topics that run through a collection of documents: each document is modelled as a mixture of topics, and each topic as a probability distribution over words.
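Before applying it to the BBC articles, here is a minimal, illustrative sketch of the gensim API on the tiny common_texts corpus that is already imported above (the toy_* names are mine and are not part of this series' pipeline):

# Toy illustration of gensim's LdaModel on the bundled common_texts corpus
toy_dictionary = Dictionary(common_texts)
toy_corpus = [toy_dictionary.doc2bow(text) for text in common_texts]
toy_lda = LdaModel(toy_corpus, num_topics=2, id2word=toy_dictionary, random_state=0)
print(toy_lda.print_topics(num_words=5))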

Below is a snapshot of the kind of results we kept getting as we trimmed more and more words out of the "probability" equations. Taken too far, this sort of trimming will lead to over-fitting. (A more systematic alternative to manual trimming is sketched after these two snapshots.)

#1

Topic Id: 0
 Probabilities: 0.006*"people" + 0.003*"mobile" + 0.003*"film" + 0.003*"phone" + 0.003*"music" + 0.003*"service" + 0.002*"party" + 0.002*"game" + 0.002*"time" + 0.002*"election" + 0.002*"government" + 0.002*"firm" + 0.002*"million" + 0.002*"way" + 0.002*"number" + 0.002*"take" + 0.002*"win" + 0.002*"player" + 0.002*"well" + 0.002*"next"

Topic Id: 1
 Probabilities: 0.004*"game" + 0.003*"time" + 0.003*"government" + 0.003*"people" + 0.003*"service" + 0.003*"company" + 0.003*"world" + 0.002*"month" + 0.002*"firm" + 0.002*"bbc" + 0.002*"phone" + 0.002*"film" + 0.002*"take" + 0.002*"home" + 0.002*"way" + 0.002*"week" + 0.002*"show" + 0.002*"market" + 0.002*"second" + 0.002*"well"

Topic Id: 2
 Probabilities: 0.006*"people" + 0.003*"world" + 0.003*"time" + 0.003*"game" + 0.003*"number" + 0.003*"government" + 0.003*"company" + 0.003*"firm" + 0.002*"month" + 0.002*"market" + 0.002*"many" + 0.002*"country" + 0.002*"film" + 0.002*"play" + 0.002*"sale" + 0.002*"tax" + 0.002*"take" + 0.002*"minister" + 0.002*"player" + 0.002*"award"

Topic Id: 3
 Probabilities: 0.005*"film" + 0.004*"time" + 0.003*"game" + 0.003*"labour" + 0.003*"world" + 0.002*"sale" + 0.002*"week" + 0.002*"election" + 0.002*"month" + 0.002*"people" + 0.002*"country" + 0.002*"show" + 0.002*"party" + 0.002*"firm" + 0.002*"want" + 0.002*"government" + 0.002*"since" + 0.002*"service" + 0.002*"many" + 0.002*"good"

Topic Id: 4
 Probabilities: 0.004*"time" + 0.003*"people" + 0.003*"game" + 0.003*"government" + 0.003*"world" + 0.002*"number" + 0.002*"company" + 0.002*"music" + 0.002*"win" + 0.002*"blair" + 0.002*"right" + 0.002*"labour" + 0.002*"show" + 0.002*"mobile" + 0.002*"england" + 0.002*"firm" + 0.002*"country" + 0.002*"next" + 0.002*"day" + 0.002*"plan"

#2

Topic Id: 0
 Probabilities: 0.005*"people" + 0.004*"time" + 0.003*"sale" + 0.003*"film" + 0.003*"mobile" + 0.003*"game" + 0.002*"company" + 0.002*"show" + 0.002*"firm" + 0.002*"million" + 0.002*"party" + 0.002*"world" + 0.002*"country" + 0.002*"labour" + 0.002*"way" + 0.002*"service" + 0.002*"week" + 0.002*"2004" + 0.002*"top" + 0.002*"music"

Topic Id: 1
 Probabilities: 0.004*"film" + 0.004*"time" + 0.003*"people" + 0.003*"government" + 0.003*"award" + 0.003*"game" + 0.002*"world" + 0.002*"music" + 0.002*"party" + 0.002*"player" + 0.002*"country" + 0.002*"blair" + 0.002*"labour" + 0.002*"win" + 0.002*"bbc" + 0.002*"home" + 0.002*"number" + 0.002*"service" + 0.002*"election" + 0.002*"top"

Topic Id: 2
 Probabilities: 0.006*"game" + 0.003*"people" + 0.003*"world" + 0.003*"time" + 0.003*"company" + 0.003*"firm" + 0.002*"phone" + 0.002*"film" + 0.002*"month" + 0.002*"number" + 0.002*"service" + 0.002*"player" + 0.002*"england" + 0.002*"market" + 0.002*"minister" + 0.002*"right" + 0.002*"home" + 0.002*"government" + 0.002*"next" + 0.002*"british"

Topic Id: 3
 Probabilities: 0.005*"people" + 0.004*"government" + 0.004*"time" + 0.003*"world" + 0.003*"election" + 0.002*"game" + 0.002*"week" + 0.002*"company" + 0.002*"labour" + 0.002*"plan" + 0.002*"service" + 0.002*"next" + 0.002*"minister" + 0.002*"win" + 0.002*"work" + 0.002*"technology" + 0.002*"way" + 0.002*"film" + 0.002*"day" + 0.002*"bbc"

Topic Id: 4
 Probabilities: 0.004*"people" + 0.003*"government" + 0.003*"firm" + 0.003*"time" + 0.003*"world" + 0.003*"market" + 0.003*"number" + 0.002*"month" + 0.002*"country" + 0.002*"price" + 0.002*"group" + 0.002*"film" + 0.002*"company" + 0.002*"lord" + 0.002*"show" + 0.002*"economy" + 0.002*"2004" + 0.002*"london" + 0.002*"report" + 0.002*"use"
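As a hedged alternative to manually curating ever-longer stop-word lists, gensim's Dictionary.filter_extremes can drop tokens that are too rare or too frequent before the bag-of-words corpus is built. This is only a sketch (it is not what the runs above used), and it assumes the clean_corpus token lists built in the code further below:

# Sketch only: prune the vocabulary instead of hand-picking extra stopwords
filtered_dictionary = Dictionary(clean_corpus)
filtered_dictionary.filter_extremes(no_below=5,    # keep tokens appearing in at least 5 articles
                                    no_above=0.5)  # drop tokens appearing in over half of them
filtered_corpus = [filtered_dictionary.doc2bow(text) for text in clean_corpus]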
The pre-processing pipeline behind these runs:

import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def cleanup_text(text):
    # Remove HTML tags and character entities
    remove = re.compile(r'<.*?>')
    text = re.sub(remove, '', text)
    text = re.sub("&#[0-9]+;", '', text)
    # Replace special characters with spaces
    reviews = ''
    for x in text:
        if x.isalnum():
            reviews = reviews + x
        else:
            reviews = reviews + ' '
    # Convert to lower case
    text = reviews.lower()
    return text

def remove_punctuations(text):
    exclude = set(string.punctuation)
    exclude.remove("-")
    text = ''.join(ch for ch in text if ch not in exclude)
    return text

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
    stop_words.update(days_of_week)
    words = word_tokenize(text)
    words = [x for x in words if len(x) > 2]
    words = [x for x in words if x not in stop_words]
    return words

def lemmatize_word(text):
    le = WordNetLemmatizer()
    text = [le.lemmatize(w) for w in text]
    return text

def preprocess_text(doc):
    doc = cleanup_text(doc)
    doc = remove_punctuations(doc)
    words = remove_stopwords(doc)
    words = lemmatize_word(words)
    doc = " ".join(words)
    return doc

def remove_additional_words(text):
    additional_stopwords = ["new", "like", "many", "also", "even", "get", "say", "according",
                            "would", "could", "know", "made", "make", "come", "didnt", "dont",
                            "doesnt", "go", "may", "back", "going", "including", "added", "set",
                            "take", "want", "use", "000", "1", "2", "3", "4", "5", "6", "7", "8",
                            "9", "10", "11", "20", "u", "one", "two", "three", "year", "first",
                            "last", "good", "best", "well", "told", "said"]
    days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
    additional_stopwords += days_of_week
    words = word_tokenize(text)
    words = [x for x in words if len(x) > 2]
    words = [x for x in words if x not in additional_stopwords]
    doc = " ".join(words)
    return doc

df1 = pd.read_csv('bbc_news_train.csv')
df1['Preprocess_text'] = df1['Text'].apply(preprocess_text)
df1['Preprocess_text'] = df1['Preprocess_text'].apply(remove_additional_words)
print(Counter(df1['Category']))

Counter({'business': 336, 'tech': 261, 'politics': 274, 'sport': 346, 'entertainment': 273})

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
clean_corpus = [doc.split() for doc in df1['Preprocess_text'].values.tolist()]
common_dictionary = Dictionary(clean_corpus)
common_corpus = [common_dictionary.doc2bow(text) for text in clean_corpus]

%%time
NO_OF_TOPICS_FOR_TRAINING = 5
NO_OF_WORDS_IN_TOPIC = 20

lda = LdaModel(common_corpus, num_topics=NO_OF_TOPICS_FOR_TRAINING, id2word=common_dictionary)

ldamodel_topics = lda.print_topics(NO_OF_TOPICS_FOR_TRAINING, NO_OF_WORDS_IN_TOPIC)
for (topic_id, probabilities) in ldamodel_topics:
    topic_string = "\n\nTopic Id: " + str(topic_id) + "\n Probabilities: " + str(probabilities)
    print(topic_string)

Topic Id: 0
 Probabilities: 0.004*"people" + 0.003*"time" + 0.003*"government" + 0.003*"film" + 0.003*"game" + 0.003*"sale" + 0.003*"company" + 0.002*"world" + 0.002*"music" + 0.002*"player" + 0.002*"country" + 0.002*"month" + 0.002*"show" + 0.002*"2004" + 0.002*"market" + 0.002*"group" + 0.002*"next" + 0.002*"bbc" + 0.002*"second" + 0.002*"technology"

Topic Id: 1
 Probabilities: 0.004*"people" + 0.004*"time" + 0.003*"world" + 0.003*"game" + 0.002*"party" + 0.002*"firm" + 0.002*"day" + 0.002*"service" + 0.002*"show" + 0.002*"way" + 0.002*"think" + 0.002*"company" + 0.002*"market" + 0.002*"next" + 0.002*"music" + 0.002*"win" + 0.002*"award" + 0.002*"british" + 0.002*"still" + 0.002*"phone"

Topic Id: 2
 Probabilities: 0.004*"game" + 0.004*"film" + 0.004*"government" + 0.003*"people" + 0.003*"time" + 0.003*"firm" + 0.003*"sale" + 0.003*"labour" + 0.002*"number" + 0.002*"market" + 0.002*"country" + 0.002*"company" + 0.002*"service" + 0.002*"tax" + 0.002*"way" + 0.002*"plan" + 0.002*"week" + 0.002*"technology" + 0.002*"blair" + 0.002*"minister"

Topic Id: 3
 Probabilities: 0.005*"people" + 0.004*"time" + 0.003*"world" + 0.003*"game" + 0.003*"labour" + 0.003*"week" + 0.003*"party" + 0.002*"month" + 0.002*"election" + 0.002*"win" + 0.002*"way" + 0.002*"show" + 0.002*"company" + 0.002*"old" + 0.002*"number" + 0.002*"play" + 0.002*"music" + 0.002*"group" + 0.002*"net" + 0.002*"mobile"

Topic Id: 4
 Probabilities: 0.006*"people" + 0.004*"time" + 0.003*"film" + 0.003*"world" + 0.003*"service" + 0.003*"government" + 0.003*"mobile" + 0.003*"company" + 0.003*"game" + 0.002*"win" + 0.002*"firm" + 0.002*"number" + 0.002*"election" + 0.002*"month" + 0.002*"phone" + 0.002*"home" + 0.002*"party" + 0.002*"country" + 0.002*"minister" + 0.002*"england"
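The topics above come from gensim's defaults, which make a single pass over the corpus. Purely as an illustrative sketch (these values are my assumptions, not settings used anywhere in this series), a few commonly tuned LdaModel arguments look like this:

# Illustrative only: more passes and a fixed seed usually give sharper, repeatable topics
lda_tuned = LdaModel(common_corpus,
                     num_topics=NO_OF_TOPICS_FOR_TRAINING,
                     id2word=common_dictionary,
                     passes=10,        # several sweeps over the corpus instead of one
                     alpha='auto',     # learn an asymmetric document-topic prior
                     random_state=42)  # reproducible topics between runs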
Finally, topic inference for the tagged training articles:

for i in range(1, len(df1)):  # note: starting at 1 skips the very first article
    tagged_topic = df1.iloc[i]["Category"]
    # inference here tokenises the raw 'Text' column rather than 'Preprocess_text'
    text_for_inference = df1.iloc[i]["Text"].split()
    bow = common_dictionary.doc2bow(text_for_inference)
    document_topics = lda.get_document_topics(bow, minimum_probability=None,
                                              minimum_phi_value=None, per_word_topics=False)
    document_topics.sort(key=lambda elem: elem[1], reverse=True)
    document_topics_string = "\n\nArticle ID: " + tagged_topic + " : "
    for (topic_id, probability) in document_topics:
        document_topics_string = document_topics_string + " Topic ID: " + str(topic_id) + ", Probability: " + str(probability) + ","
    print(document_topics_string)
    for topic, prob in document_topics:
        topic_terms_string = "\n Topic ID: " + str(topic) + " : " + lda.print_topic(topic)
        print(topic_terms_string)

For the first few articles this prints:

Article ID: business :  Topic ID: 0, Probability: 0.97704554, Topic ID: 4, Probability: 0.018701652,

 Topic ID: 0 : 0.004*"people" + 0.003*"time" + 0.003*"government" + 0.003*"film" + 0.003*"game" + 0.003*"sale" + 0.003*"company" + 0.002*"world" + 0.002*"music" + 0.002*"player"

 Topic ID: 4 : 0.006*"people" + 0.004*"time" + 0.003*"film" + 0.003*"world" + 0.003*"service" + 0.003*"government" + 0.003*"mobile" + 0.003*"company" + 0.003*"game" + 0.002*"win"

Article ID: business :  Topic ID: 4, Probability: 0.7235847, Topic ID: 0, Probability: 0.27315685,

 Topic ID: 4 : 0.006*"people" + 0.004*"time" + 0.003*"film" + 0.003*"world" + 0.003*"service" + 0.003*"government" + 0.003*"mobile" + 0.003*"company" + 0.003*"game" + 0.002*"win"

 Topic ID: 0 : 0.004*"people" + 0.003*"time" + 0.003*"government" + 0.003*"film" + 0.003*"game" + 0.003*"sale" + 0.003*"company" + 0.002*"world" + 0.002*"music" + 0.002*"player"

Article ID: tech :  Topic ID: 4, Probability: 0.9569476, Topic ID: 3, Probability: 0.040294025,

 Topic ID: 4 : 0.006*"people" + 0.004*"time" + 0.003*"film" + 0.003*"world" + 0.003*"service" + 0.003*"government" + 0.003*"mobile" + 0.003*"company" + 0.003*"game" + 0.002*"win"

 Topic ID: 3 : 0.005*"people" + 0.004*"time" + 0.003*"world" + 0.003*"game" + 0.003*"labour" + 0.003*"week" + 0.003*"party" + 0.002*"month" + 0.002*"election" + 0.002*"win"

... ... ...
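CoherenceModel is imported at the top but never used here; as a sketch, it could put a number on whether trimming words actually improves the topics, for example by comparing c_v coherence before and after the extra stop-word removal:

# Sketch: score the trained model's topics with c_v coherence
coherence_model = CoherenceModel(model=lda,
                                 texts=clean_corpus,
                                 dictionary=common_dictionary,
                                 coherence='c_v')
print("c_v coherence:", coherence_model.get_coherence())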
Tags: Machine Learning, Natural Language Processing, Technology
