Sunday, September 25, 2022

BERT Embedding Based Model For Bot Detection on Twitter (Sep 2022)

Download Code

import pandas as pd
import numpy as np
import re
from time import time
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

from sentence_transformers import SentenceTransformer

# Download the 'vader_lexicon' resource for nltk (the VADER sentiment
# analyzer is imported at the top of the file).
nltk.download('vader_lexicon')

# Read the tweet CSV, then keep only the rows whose 'lang' column is 'en'.
df = pd.read_csv('tweets_f234_users_vader_bert_url_len_lang.csv')
is_english = df['lang'].eq('en')
df = df[is_english]


# Sentence-BERT encoder used to embed the cleaned tweets.
# NOTE(review): this checkpoint appears to be marked deprecated on its model
# card -- confirm, and consider a newer model if embeddings are regenerated.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# We are not encoding tweets again and again. For practical purposes, the
# tweets were encoded once and the embeddings were saved on the disk; for this
# demo we read them back from the disk. Encoding is kept only as a fallback
# for when the saved file is missing.
EMBEDDINGS_CSV = 'tweet_embeddings_using_bert_f234_1664047300.csv'
try:
    df_embeddings = pd.read_csv(EMBEDDINGS_CSV)
    # CPU times: user 1min 8s, sys: 7.25 s, total: 1min 15s
    # Wall time: 1min 24s
except FileNotFoundError:
    print(f'{EMBEDDINGS_CSV} not found; encoding tweets with SBERT (slow).')
    nparray = sbert_model.encode(df['clean_tweet'].values)
    df_embeddings = pd.DataFrame(nparray)
    # CPU times: user 18h 28s, sys: 9min 51s, total: 18h 10min 19s
    # Wall time: 9h 7min 48s

print(df_embeddings.shape)
# (334679, 768)

# The embeddings are joined to `df` purely by row position below, so fail
# early (before the expensive PCA) if the two tables do not line up.
if len(df_embeddings) != len(df):
    raise ValueError(
        f'Embeddings have {len(df_embeddings)} rows but the tweet table has '
        f'{len(df)}; they must be row-aligned.'
    )

X_original_features = df[['vader_sentiment', 'url_flag', 'len', 'bert_clear_expression_conf']]
y = df['account_type']
# Timing reported at this point in the original post:
# CPU times: user 28 s, sys: 6.76 s, total: 34.8 s
# Wall time: 14.1 s

# Data with n_components=64 takes too long (> 3 mins) to train a RandomForestClassifier.
# Data with n_components=32 takes too long (> 3 mins) to train a RandomForestClassifier.
# Data with n_components=16 takes too long (> 3 mins) to train a RandomForestClassifier.
# random_state is fixed so that a randomized SVD solver, if selected, gives
# the same components on every run.
pca = PCA(n_components=8, random_state=0)

df_embeddings_pca = pca.fit_transform(df_embeddings)
print(type(df_embeddings_pca))
# <class 'numpy.ndarray'>

# Give the PCA columns string names: the original feature columns added below
# have string names, and scikit-learn rejects DataFrames whose column names
# mix int and str types.
df_embeddings_pca = pd.DataFrame(
    df_embeddings_pca,
    columns=[f'pca_{k}' for k in range(df_embeddings_pca.shape[1])],
)
print(df_embeddings_pca.shape)
print(X_original_features.shape)
# (334679, 8)
# (334679, 4)

# Append the hand-crafted features positionally (.values drops the filtered
# index of `df`, which would otherwise misalign with the fresh RangeIndex).
for col in X_original_features.columns:
    df_embeddings_pca[col] = X_original_features[col].values
print(df_embeddings_pca.shape)
# (334679, 12)

X_train, X_test, y_train, y_test = train_test_split(
    df_embeddings_pca, y, test_size=0.33, random_state=42
)

clf = RandomForestClassifier(random_state=0)
clf = clf.fit(X_train, y_train)
# With PCA(n_components=8)
# Wall time: 2min 4s

pred = clf.predict(X_test)
labels = ['bot', 'human']
print(classification_report(y_test, y_pred=pred, labels=labels))
#               precision    recall  f1-score   support
#
#          bot       0.85      0.54      0.66     23603
#        human       0.89      0.97      0.93     86842
#
#     accuracy                           0.88    110445
#    macro avg       0.87      0.76      0.79    110445
# weighted avg       0.88      0.88      0.87    110445
Tags: Technology, Natural Language Processing

No comments:

Post a Comment