Creating a Taxonomy for BBC News Articles (Part 9 - Using POS Tagging as word filter with Latent Dirichlet Allocation)
Monday, June 27, 2022

import pandas as pd
from collections import Counter

import spacy
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel  # Latent Dirichlet Allocation, not 'Linear Discriminant Analysis'
from gensim.models.coherencemodel import CoherenceModel

from preprocess import preprocess_text  # local helper module from earlier parts of this series

# Requires the small English model: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
def remove_verbs_and_adjectives(text):
    """Keep only nouns and proper nouns; verbs, adjectives, adverbs, numbers,
    pronouns, punctuation, etc. are filtered out by their POS tag."""
    doc = nlp(text)
    additional_stopwords = ["new", "like", "many", "also", "even", "get", "say", "according", "would", "could",
                            "know", "made", "make", "come", "didnt", "dont", "doesnt", "go", "may", "back",
                            "going", "including", "added", "set", "take", "want", "use",
                            "000", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "20", "u",
                            "one", "two", "three", "year", "first", "last", "good", "best", "well", "told", "said"]
    days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
    additional_stopwords += days_of_week
    # Drop every POS tag except NOUN and PROPN (proper noun).
    words = [token.text for token in doc if token.pos_ not in ["VERB", "NUM", "ADJ", "ADV", "ADP", "SCONJ", "DET",
                                                               "X", "INTJ", "CCONJ", "AUX", "PART", "PRON", "PUNCT", "SYM"]]
    words = [x for x in words if len(x) > 2]                     # drop very short tokens
    words = [x for x in words if x not in additional_stopwords]  # drop domain stopwords
    return " ".join(words)
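As a quick sanity check, here is a hedged spot-check of the filter on a made-up, already-lowercased sentence (the exact output depends on the spaCy model version, but only the surviving nouns should remain):

sample = "the government quickly announced a new tax plan on monday"
print(remove_verbs_and_adjectives(sample))
# Expected output is something like: government tax plan
# ('announced' is a VERB, 'quickly' an ADV, 'new' an ADJ, and 'monday'
#  is removed via the days_of_week list)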
df1 = pd.read_csv('bbc_news_train.csv')
%%time
df1['Preprocess_text'] = df1['Text'].apply(preprocess_text)
df1['Preprocess_text'] = df1['Preprocess_text'].apply(remove_verbs_and_adjectives)
CPU times: total: 2min 3s
Wall time: 2min 8s
df1[['Text', 'Preprocess_text']].head()
Counter(df1['Category'])
Counter({'business': 336,
'tech': 261,
'politics': 274,
'sport': 346,
'entertainment': 273})
# Create a corpus from a list of texts
clean_corpus = [doc.split() for doc in df1['Preprocess_text'].values.tolist()]
common_dictionary = Dictionary(clean_corpus)
common_corpus = [common_dictionary.doc2bow(text) for text in clean_corpus]
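To see what doc2bow produces, a few (token_id, count) pairs from the first document can be decoded back into words; the exact pairs will depend on your preprocessed text:

# Each bag-of-words entry is a (token_id, count) pair in the shared dictionary.
first_bow = common_corpus[0]
print([(common_dictionary[token_id], count) for token_id, count in first_bow[:10]])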
%%time
NO_OF_TOPICS_FOR_TRAINING = 5  # mirrors the five labeled categories counted above
NO_OF_WORDS_IN_TOPIC = 20
lda = LdaModel(common_corpus, num_topics=NO_OF_TOPICS_FOR_TRAINING, id2word=common_dictionary)
CPU times: total: 9.39 s
Wall time: 8.27 s
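Once trained, the model can also infer a topic mixture for an unseen document. A minimal sketch (the tokens below are purely illustrative, not from the dataset):

unseen = "chancellor budget tax economy growth".split()
bow = common_dictionary.doc2bow(unseen)
print(lda.get_document_topics(bow))
# e.g. [(3, 0.87), (0, 0.05), ...] -- (topic_id, probability) pairs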
ldamodel_topics = lda.print_topics(NO_OF_TOPICS_FOR_TRAINING, NO_OF_WORDS_IN_TOPIC)
for topic_id, probabilities in ldamodel_topics:
    topic_string = "\n\nTopic Id: " + str(topic_id) + "\n Probabilities: " + str(probabilities)
    print(topic_string)
Topic Id: 0
Probabilities: 0.006*"government" + 0.004*"film" + 0.004*"time" + 0.004*"labour" + 0.004*"people" + 0.004*"service" + 0.004*"election" + 0.004*"minister" + 0.003*"award" + 0.003*"blair" + 0.003*"market" + 0.003*"week" + 0.003*"party" + 0.003*"game" + 0.003*"number" + 0.003*"director" + 0.003*"brown" + 0.002*"actor" + 0.002*"company" + 0.002*"star"
Topic Id: 1
Probabilities: 0.008*"game" + 0.007*"time" + 0.006*"people" + 0.005*"world" + 0.005*"film" + 0.004*"service" + 0.003*"player" + 0.003*"bbc" + 0.003*"company" + 0.003*"home" + 0.003*"day" + 0.003*"plan" + 0.003*"show" + 0.003*"country" + 0.003*"music" + 0.003*"week" + 0.003*"team" + 0.003*"number" + 0.003*"firm" + 0.003*"party"
Topic Id: 2
Probabilities: 0.009*"film" + 0.008*"people" + 0.005*"time" + 0.005*"game" + 0.005*"phone" + 0.004*"world" + 0.004*"company" + 0.004*"month" + 0.003*"firm" + 0.003*"award" + 0.003*"market" + 0.003*"government" + 0.003*"bbc" + 0.003*"mobile" + 0.003*"day" + 0.003*"software" + 0.003*"sale" + 0.003*"director" + 0.003*"number" + 0.002*"service"
Topic Id: 3
Probabilities: 0.008*"people" + 0.006*"government" + 0.006*"country" + 0.006*"world" + 0.005*"company" + 0.005*"firm" + 0.004*"time" + 0.004*"month" + 0.004*"number" + 0.004*"market" + 0.004*"tax" + 0.004*"week" + 0.003*"way" + 0.003*"service" + 0.003*"deal" + 0.003*"group" + 0.003*"minister" + 0.003*"sale" + 0.003*"plan" + 0.003*"music"
Topic Id: 4
Probabilities: 0.006*"people" + 0.006*"game" + 0.006*"time" + 0.005*"election" + 0.004*"player" + 0.004*"england" + 0.004*"number" + 0.003*"world" + 0.003*"party" + 0.003*"music" + 0.003*"company" + 0.003*"group" + 0.003*"report" + 0.003*"bbc" + 0.003*"part" + 0.003*"service" + 0.003*"month" + 0.003*"sale" + 0.003*"government" + 0.003*"way"