# BERT embedding based model for bot detection on Twitter -- part 1:
# load the tweet table, obtain sentence embeddings, and set up the
# hand-crafted features, the label column and the PCA reducer.
#
# This was originally a Jupyter notebook; cell magics (%%time) were replaced
# by explicit wall-clock timing and recorded cell outputs are kept as comments.

import re
from pathlib import Path
from time import time

import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

nltk.download('vader_lexicon')

df = pd.read_csv('tweets_f234_users_vader_bert_url_len_lang.csv')
# Keep English tweets only.
df = df[df['lang'] == 'en']

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encoding every tweet is very expensive, so it is not repeated on every run:
# the tweets were encoded once and the embeddings were saved to disk.
# Recorded timing of the one-off encoding step:
# CPU times: user 18h 28s, sys: 9min 51s, total: 18h 10min 19s
# Wall time: 9h 7min 48s
#
# Reading the saved embeddings back is what this demo normally does.
# Recorded timing of the read:
# CPU times: user 1min 8s, sys: 7.25 s, total: 1min 15s
# Wall time: 1min 24s
embeddings_path = Path('tweet_embeddings_using_bert_f234_1664047300.csv')
embeddings_start = time()
if embeddings_path.exists():
    df_embeddings = pd.read_csv(embeddings_path)
else:
    # Cache miss: encode once, then persist so later runs take the fast path.
    nparray = sbert_model.encode(df['clean_tweet'].values)
    df_embeddings = pd.DataFrame(nparray)
    # read_csv yields string column labels; use the same labels here so both
    # branches produce an identical frame.
    df_embeddings.columns = df_embeddings.columns.astype(str)
    # index=False keeps the file at one column per embedding dimension,
    # matching the (n_rows, 768) shape recorded below.
    df_embeddings.to_csv(embeddings_path, index=False)
print(f'Embeddings ready in {time() - embeddings_start:.1f} s')

print(df_embeddings.shape)
# (334679, 768)

# Hand-crafted features that are later appended to the reduced embeddings.
X_original_features = df[['vader_sentiment', 'url_flag', 'len', 'bert_clear_expression_conf']]
y = df['account_type']

# Timing recorded in the original notebook (presumably for the PCA step):
# CPU times: user 28 s, sys: 6.76 s, total: 34.8 s
# Wall time: 14.1 s

# random_state is fixed for reproducibility, in line with the seeded split and
# classifier used later; it only has an effect when PCA picks a randomized
# solver.
pca = PCA(n_components=8, random_state=0)
# With n_components=64, training a RandomForestClassifier took longer than 3 minutes (noted as "infinite time").
# With n_components=32, training a RandomForestClassifier took longer than 3 minutes (noted as "infinite time").
# With n_components=16, training a RandomForestClassifier took longer than 3 minutes (noted as "infinite time").
# Part 2: reduce the sentence embeddings with PCA, append the hand-crafted
# features, train a random forest and report per-class metrics.
# Recorded notebook outputs are kept as comments next to the producing code.

df_embeddings_pca = pca.fit_transform(df_embeddings)
print(type(df_embeddings_pca))
# <class 'numpy.ndarray'>

# Give the components string names: the feature columns appended below are
# string-named, and scikit-learn rejects frames whose column labels mix
# str and non-str types.
df_embeddings_pca = pd.DataFrame(
    df_embeddings_pca,
    columns=[f'pca_{k}' for k in range(df_embeddings_pca.shape[1])],
)
print(df_embeddings_pca.shape)
print(X_original_features.shape)
# (334679, 8)
# (334679, 4)

# The merge below is positional (.values drops the index), so both tables
# must have exactly one row per tweet, in the same order.
if len(df_embeddings_pca) != len(X_original_features):
    raise ValueError(
        'Row count mismatch between embeddings '
        f'({len(df_embeddings_pca)}) and original features '
        f'({len(X_original_features)})'
    )

for feature_name in X_original_features.columns:
    df_embeddings_pca[feature_name] = X_original_features[feature_name].values

print(df_embeddings_pca.shape)
# (334679, 12)

X_train, X_test, y_train, y_test = train_test_split(
    df_embeddings_pca, y, test_size=0.33, random_state=42
)

# n_jobs=-1 builds the trees on all available cores; with a fixed
# random_state the fitted forest is the same as a single-threaded fit.
fit_start = time()
clf = RandomForestClassifier(random_state=0, n_jobs=-1)
clf = clf.fit(X_train, y_train)
print(f'RandomForestClassifier fit wall time: {time() - fit_start:.1f} s')
# Recorded single-threaded timing with PCA(n_components=8):
# Wall time: 2min 4s

pred = clf.predict(X_test)

labels = ['bot', 'human']
print(classification_report(y_test, y_pred=pred, labels=labels))
# Recorded output:
#               precision    recall  f1-score   support
#
#          bot       0.85      0.54      0.66     23603
#        human       0.89      0.97      0.93     86842
#
#     accuracy                           0.88    110445
#    macro avg       0.87      0.76      0.79    110445
# weighted avg       0.88      0.88      0.87    110445
Sunday, September 25, 2022
BERT Embedding Based Model For Bot Detection on Twitter (Sep 2022)
Download Code
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment