Monday, June 27, 2022

Creating a Taxonomy for BBC News Articles (Part 9 - Using POS Tagging as a Word Filter with Latent Dirichlet Allocation)


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.tokenize import word_tokenize

import sys

import gensim
from gensim import corpora
from gensim.models.lsimodel import LsiModel, stochastic_svd
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import RpModel
from gensim.matutils import corpus2dense, Dense2Corpus
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel # Latent Dirichlet Allocation and not 'Latent Discriminant Analysis'

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.cluster import KMeans
from sklearn.random_projection import SparseRandomProjection, johnson_lindenstrauss_min_dim
from sklearn.random_projection import GaussianRandomProjection
from sklearn.metrics.pairwise import euclidean_distances
import string

from collections import Counter
from preprocess import preprocess_text

import spacy

from time import time 

# Requires spaCy's small English model: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")


def remove_verbs_and_adjectives(text):
    """Keep only nouns and proper nouns, then drop short tokens and extra stopwords."""
    doc = nlp(text)

    additional_stopwords = ["new", "like", "many", "also", "even", "get", "say", "according", "would", "could",
                            "know", "made", "make", "come", "didnt", "dont", "doesnt", "go", "may", "back",
                            "going", "including", "added", "set", "take", "want", "use",
                            "000", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "20", "u",
                            "one", "two", "three", "year", "first", "last", "good", "best", "well", "told", "said"]
    days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

    additional_stopwords += days_of_week

    # Only NOUN and PROPN (proper noun) tokens survive; every other POS tag is excluded.
    excluded_pos = ["VERB", "NUM", "ADJ", "ADV", "ADP", "SCONJ", "DET",
                    "X", "INTJ", "CCONJ", "AUX", "PART", "PRON", "PUNCT", "SYM"]
    words = [token.text for token in doc if token.pos_ not in excluded_pos]

    # Drop very short tokens and the custom stopwords defined above.
    words = [x for x in words if len(x) > 2]
    words = [x for x in words if x not in additional_stopwords]

    return " ".join(words)
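

A quick sanity check of the filter on a made-up sentence (the example text and the expected output are my own; the exact tokens kept can vary with the spaCy model version):

sample = "The government said it would raise taxes in London"
print(remove_verbs_and_adjectives(sample))
# Keeps only the nouns and proper nouns, roughly: "government taxes London"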



df1 = pd.read_csv('bbc_news_train.csv')



%%time
df1['Preprocess_text'] = df1['Text'].apply(preprocess_text)
df1['Preprocess_text'] = df1['Preprocess_text'].apply(remove_verbs_and_adjectives)



CPU times: total: 2min 3s
Wall time: 2min 8s
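

Since the spaCy pass takes around two minutes, one option (not in the original notebook; the file name is my own choice) is to cache the preprocessed column so later sessions can skip this step:

# Save the preprocessed text so the ~2 minute spaCy pass is not re-run each session.
df1.to_csv('bbc_news_train_preprocessed.csv', index=False)

# In a later session, reload instead of preprocessing:
# df1 = pd.read_csv('bbc_news_train_preprocessed.csv')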



df1[['Text', 'Preprocess_text']].head()
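

To get a feel for how aggressive the noun-only filter is, a quick comparison of average token counts before and after (my addition; the exact numbers depend on what preprocess_text removes):

# Rough measure of how much the POS filter shrinks each article.
tokens_before = df1['Text'].str.split().str.len().mean()
tokens_after = df1['Preprocess_text'].str.split().str.len().mean()
print(f"Mean tokens per article: {tokens_before:.0f} -> {tokens_after:.0f}")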


Counter(df1['Category'])

Counter({'business': 336, 'tech': 261, 'politics': 274, 'sport': 346, 'entertainment': 273})



# Create a corpus from a list of texts
clean_corpus = [doc.split() for doc in df1['Preprocess_text'].values.tolist()]

common_dictionary = Dictionary(clean_corpus)
common_corpus = [common_dictionary.doc2bow(text) for text in clean_corpus]



%%time
NO_OF_TOPICS_FOR_TRAINING = 5
NO_OF_WORDS_IN_TOPIC = 20

lda = LdaModel(common_corpus, num_topics=NO_OF_TOPICS_FOR_TRAINING, id2word=common_dictionary)



CPU times: total: 9.39 s
Wall time: 8.27 s



ldamodel_topics = lda.print_topics(NO_OF_TOPICS_FOR_TRAINING, NO_OF_WORDS_IN_TOPIC)

for (topic_id, probabilities) in ldamodel_topics:
    topic_string = "\n\nTopic Id: " + str(topic_id) + "\n Probabilities: " + str(probabilities)
    print(topic_string)



Topic Id: 0
Probabilities: 0.006*"government" + 0.004*"film" + 0.004*"time" + 0.004*"labour" + 0.004*"people" + 0.004*"service" + 0.004*"election" + 0.004*"minister" + 0.003*"award" + 0.003*"blair" + 0.003*"market" + 0.003*"week" + 0.003*"party" + 0.003*"game" + 0.003*"number" + 0.003*"director" + 0.003*"brown" + 0.002*"actor" + 0.002*"company" + 0.002*"star"

Topic Id: 1
Probabilities: 0.008*"game" + 0.007*"time" + 0.006*"people" + 0.005*"world" + 0.005*"film" + 0.004*"service" + 0.003*"player" + 0.003*"bbc" + 0.003*"company" + 0.003*"home" + 0.003*"day" + 0.003*"plan" + 0.003*"show" + 0.003*"country" + 0.003*"music" + 0.003*"week" + 0.003*"team" + 0.003*"number" + 0.003*"firm" + 0.003*"party"

Topic Id: 2
Probabilities: 0.009*"film" + 0.008*"people" + 0.005*"time" + 0.005*"game" + 0.005*"phone" + 0.004*"world" + 0.004*"company" + 0.004*"month" + 0.003*"firm" + 0.003*"award" + 0.003*"market" + 0.003*"government" + 0.003*"bbc" + 0.003*"mobile" + 0.003*"day" + 0.003*"software" + 0.003*"sale" + 0.003*"director" + 0.003*"number" + 0.002*"service"

Topic Id: 3
Probabilities: 0.008*"people" + 0.006*"government" + 0.006*"country" + 0.006*"world" + 0.005*"company" + 0.005*"firm" + 0.004*"time" + 0.004*"month" + 0.004*"number" + 0.004*"market" + 0.004*"tax" + 0.004*"week" + 0.003*"way" + 0.003*"service" + 0.003*"deal" + 0.003*"group" + 0.003*"minister" + 0.003*"sale" + 0.003*"plan" + 0.003*"music"

Topic Id: 4
Probabilities: 0.006*"people" + 0.006*"game" + 0.006*"time" + 0.005*"election" + 0.004*"player" + 0.004*"england" + 0.004*"number" + 0.003*"world" + 0.003*"party" + 0.003*"music" + 0.003*"company" + 0.003*"group" + 0.003*"report" + 0.003*"bbc" + 0.003*"part" + 0.003*"service" + 0.003*"month" + 0.003*"sale" + 0.003*"government" + 0.003*"way"
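

The five topics overlap noticeably ("people", "time" and "game" appear in almost every one), so it is worth scoring the model rather than eyeballing it. A minimal sketch, assuming lda, clean_corpus, common_dictionary and common_corpus are still in scope; it uses the CoherenceModel already imported above, then maps each article to its most probable topic and cross-tabulates against the labelled categories:

# Score the trained model with the c_v topic coherence measure.
coherence_model = CoherenceModel(model=lda, texts=clean_corpus,
                                 dictionary=common_dictionary, coherence='c_v')
print("Coherence (c_v):", coherence_model.get_coherence())

# Assign each article its single most probable topic and compare
# the assignments against the known BBC categories.
dominant_topic = [max(lda.get_document_topics(bow), key=lambda t: t[1])[0]
                  for bow in common_corpus]
print(pd.crosstab(df1['Category'], pd.Series(dominant_topic, name='Topic')))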
Tags: Machine Learning, Natural Language Processing, Technology
