import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.tokenize import word_tokenize
import sys
import gensim
from gensim import corpora
from gensim.models.lsimodel import LsiModel, stochastic_svd
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import RpModel
from gensim.matutils import corpus2dense, Dense2Corpus
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel  # Latent Dirichlet Allocation and not 'Latent Discriminant Analysis'
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.cluster import KMeans
from sklearn.random_projection import SparseRandomProjection, johnson_lindenstrauss_min_dim
from sklearn.random_projection import GaussianRandomProjection
from sklearn.metrics.pairwise import euclidean_distances
import string
from collections import Counter
from preprocess import preprocess_text
import spacy
from time import time

nlp = spacy.load("en_core_web_sm")

def remove_verbs_and_adjectives(text):
    doc = nlp(text)
    additional_stopwords = ["new", "like", "many", "also", "even", "get", "say", "according", "would",
                            "could", "know", "made", "make", "come", "didnt", "dont", "doesnt", "go",
                            "may", "back", "going", "including", "added", "set", "take", "want", "use",
                            "000", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "20", "u",
                            "one", "two", "three", "year", "first", "last", "good", "best", "well",
                            "told", "said"]
    days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
    additional_stopwords += days_of_week
    # Only NOUN and PROPN (Proper Noun) tokens allowed.
    words = [token.text for token in doc
             if token.pos_ not in ["VERB", "NUM", "ADJ", "ADV", "ADP", "SCONJ", "DET", "X", "INTJ",
                                   "CCONJ", "AUX", "PART", "PRON", "PUNCT", "SYM"]]
    words = [x for x in words if len(x) > 2]
    words = [x for x in words if x not in additional_stopwords]
    doc = " ".join(words)
    return doc

df1 = pd.read_csv('bbc_news_train.csv')

%%time
df1['Preprocess_text'] = df1['Text'].apply(preprocess_text)
df1['Preprocess_text'] = df1['Preprocess_text'].apply(remove_verbs_and_adjectives)

CPU times: total: 2min 3s
Wall time: 2min 8s

df1[['Text', 'Preprocess_text']].head()

Counter(df1['Category'])
Counter({'business': 336, 'tech': 261, 'politics': 274, 'sport': 346, 'entertainment': 273})

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
clean_corpus = [doc.split() for doc in df1['Preprocess_text'].values.tolist()]
common_dictionary = Dictionary(clean_corpus)
common_corpus = [common_dictionary.doc2bow(text) for text in clean_corpus]

%%time
NO_OF_TOPICS_FOR_TRAINING = 5
NO_OF_WORDS_IN_TOPIC = 20
lda = LdaModel(common_corpus, num_topics=NO_OF_TOPICS_FOR_TRAINING, id2word=common_dictionary)

CPU times: total: 9.39 s
Wall time: 8.27 s

ldamodel_topics = lda.print_topics(NO_OF_TOPICS_FOR_TRAINING, NO_OF_WORDS_IN_TOPIC)
for (topic_id, probabilities) in ldamodel_topics:
    topic_string = "\n\nTopic Id: " + str(topic_id) + "\n Probabilities: " + str(probabilities)
    print(topic_string)

Topic Id: 0
Probabilities: 0.006*"government" + 0.004*"film" + 0.004*"time" + 0.004*"labour" + 0.004*"people" + 0.004*"service" + 0.004*"election" + 0.004*"minister" + 0.003*"award" + 0.003*"blair" + 0.003*"market" + 0.003*"week" + 0.003*"party" + 0.003*"game" + 0.003*"number" + 0.003*"director" + 0.003*"brown" + 0.002*"actor" + 0.002*"company" + 0.002*"star"

Topic Id: 1
Probabilities: 0.008*"game" + 0.007*"time" + 0.006*"people" + 0.005*"world" + 0.005*"film" + 0.004*"service" + 0.003*"player" + 0.003*"bbc" + 0.003*"company" + 0.003*"home" + 0.003*"day" + 0.003*"plan" + 0.003*"show" + 0.003*"country" + 0.003*"music" + 0.003*"week" + 0.003*"team" + 0.003*"number" + 0.003*"firm" + 0.003*"party"

Topic Id: 2
Probabilities: 0.009*"film" + 0.008*"people" + 0.005*"time" + 0.005*"game" + 0.005*"phone" + 0.004*"world" + 0.004*"company" + 0.004*"month" + 0.003*"firm" + 0.003*"award" + 0.003*"market" + 0.003*"government" + 0.003*"bbc" + 0.003*"mobile" + 0.003*"day" + 0.003*"software" + 0.003*"sale" + 0.003*"director" + 0.003*"number" + 0.002*"service"

Topic Id: 3
Probabilities: 0.008*"people" + 0.006*"government" + 0.006*"country" + 0.006*"world" + 0.005*"company" + 0.005*"firm" + 0.004*"time" + 0.004*"month" + 0.004*"number" + 0.004*"market" + 0.004*"tax" + 0.004*"week" + 0.003*"way" + 0.003*"service" + 0.003*"deal" + 0.003*"group" + 0.003*"minister" + 0.003*"sale" + 0.003*"plan" + 0.003*"music"

Topic Id: 4
Probabilities: 0.006*"people" + 0.006*"game" + 0.006*"time" + 0.005*"election" + 0.004*"player" + 0.004*"england" + 0.004*"number" + 0.003*"world" + 0.003*"party" + 0.003*"music" + 0.003*"company" + 0.003*"group" + 0.003*"report" + 0.003*"bbc" + 0.003*"part" + 0.003*"service" + 0.003*"month" + 0.003*"sale" + 0.003*"government" + 0.003*"way"
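The five topics above can be sanity-checked in two quick ways before turning them into taxonomy nodes. The sketch below is not part of the notebook above: it tags each article with its dominant LDA topic and cross-tabulates that against the known Category labels, and it scores the model with the CoherenceModel that is already imported but unused. The Dominant_topic column name and the c_v coherence measure are illustrative choices, not something this post prescribes.

# Sketch (assumptions noted): map each article to its single most probable topic.
def dominant_topic(bow, model):
    # get_document_topics returns (topic_id, probability) pairs for one document
    doc_topics = model.get_document_topics(bow, minimum_probability=0.0)
    return max(doc_topics, key=lambda pair: pair[1])[0]

# 'Dominant_topic' is an illustrative column name, not defined in the notebook above.
df1['Dominant_topic'] = [dominant_topic(bow, lda) for bow in common_corpus]

# Compare the unsupervised topics with the five editorial categories.
print(pd.crosstab(df1['Category'], df1['Dominant_topic']))

# Score the topics with the already-imported CoherenceModel
# (the c_v measure is an assumed choice; the notebook does not evaluate coherence).
coherence_model = CoherenceModel(model=lda, texts=clean_corpus,
                                 dictionary=common_dictionary, coherence='c_v')
print("Coherence (c_v):", coherence_model.get_coherence())

If the crosstab shows, say, most sport articles concentrated in a single topic, that topic is a strong candidate for a taxonomy node; topics spread evenly across categories suggest the POS filter or the number of topics needs revisiting.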
Monday, June 27, 2022
Creating a Taxonomy for BBC News Articles (Part 9 - Using POS Tagging as a word filter with Latent Dirichlet Allocation)