"""Bot-vs-human tweet classifier built on sentence-BERT embeddings.

Flat script reconstructed from notebook cells:

1. Load the tweets CSV and keep rows whose ``lang`` is ``'en'``.
2. Obtain one sentence embedding per tweet.  By default the embeddings are
   read from a CSV on disk; set ``RECOMPUTE_EMBEDDINGS = True`` to encode
   the ``clean_tweet`` column again with the sentence-transformers model.
3. Reduce the embeddings with PCA and append four columns taken from the
   tweets frame.
4. Train a ``RandomForestClassifier`` on a train/test split
   (``test_size=0.33``) and print a classification report for the
   ``bot`` / ``human`` labels.

Comments marked "Recorded" quote timings and outputs captured in the
original notebook run; they are not produced by this script verbatim.
"""

import re
from time import time

import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

TWEETS_CSV = 'tweets_f234_users_vader_bert_url_len_lang.csv'
EMBEDDINGS_CSV = 'tweet_embeddings_using_bert_f234_1664047300.csv'

# Encoding every tweet is very slow (see the recorded timing below), so the
# embeddings were computed once and saved to disk.  Leave this False to
# reuse the saved file; switch it to True to encode the tweets again.
RECOMPUTE_EMBEDDINGS = False

# Recorded observations from the original run:
# Data with n_components=64 takes too long (> 3 mins) to train a RandomForestClassifier.
# Data with n_components=32 takes too long (> 3 mins) to train a RandomForestClassifier.
# Data with n_components=16 takes too long (> 3 mins) to train a RandomForestClassifier.
N_PCA_COMPONENTS = 8

nltk.download('vader_lexicon')

df = pd.read_csv(TWEETS_CSV)
df = df[df['lang'] == 'en']

# --- Sentence embeddings ----------------------------------------------------
started = time()
if RECOMPUTE_EMBEDDINGS:
    # The model is only loaded when it is actually needed.
    sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
    nparray = sbert_model.encode(df['clean_tweet'].values)
    df_embeddings = pd.DataFrame(nparray)
    # Recorded: CPU times: user 18h 28s, sys: 9min 51s, total: 18h 10min 19s
    # Recorded: Wall time: 9h 7min 48s
else:
    df_embeddings = pd.read_csv(EMBEDDINGS_CSV)
    # Recorded: CPU times: user 1min 8s, sys: 7.25 s, total: 1min 15s
    # Recorded: Wall time: 1min 24s
print(f"Embeddings ready - wall time: {time() - started:.1f} s")

print(df_embeddings.shape)
# Recorded: (334679, 768)

# The extra feature columns are attached positionally (via ``.values``)
# further down, so the embeddings must have exactly one row per kept tweet.
if len(df_embeddings) != len(df):
    raise ValueError(
        f"Embeddings have {len(df_embeddings)} rows but the filtered tweets "
        f"frame has {len(df)}; they must line up row for row."
    )

X_original_features = df[['vader_sentiment', 'url_flag', 'len', 'bert_clear_expression_conf']]
y = df['account_type']

# --- Dimensionality reduction -----------------------------------------------
# Recorded (printed at this point in the notebook, presumably for the PCA
# cell): CPU times: user 28 s, sys: 6.76 s, total: 34.8 s / Wall time: 14.1 s
#
# NOTE(review): PCA is fitted on all rows before the train/test split, so the
# projection has seen the test rows.  Left as in the original so the recorded
# metrics below stay comparable; consider fitting on the training rows only.
pca = PCA(n_components=N_PCA_COMPONENTS)
df_embeddings_pca = pca.fit_transform(df_embeddings)
print(type(df_embeddings_pca))

# Give the PCA columns string names: the feature columns appended below have
# string names, and recent scikit-learn releases reject a frame whose column
# names mix ints and strings when fitting an estimator.
df_embeddings_pca = pd.DataFrame(
    df_embeddings_pca,
    columns=[f'pca_{k}' for k in range(df_embeddings_pca.shape[1])],
)
print(df_embeddings_pca.shape)
print(X_original_features.shape)
# Recorded: <class 'numpy.ndarray'>
# Recorded: (334679, 8)
# Recorded: (334679, 4)

# ``.values`` drops the (filtered, non-contiguous) tweets index so the
# columns are attached by position rather than by label.
for column in X_original_features.columns:
    df_embeddings_pca[column] = X_original_features[column].values
print(df_embeddings_pca.shape)
# Recorded: (334679, 12)

# --- Train / evaluate -------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    df_embeddings_pca, y, test_size=0.33, random_state=42
)

started = time()
clf = RandomForestClassifier(random_state=0)
clf = clf.fit(X_train, y_train)
print(f"RandomForestClassifier fit - wall time: {time() - started:.1f} s")
# Recorded: With PCA(n_components=8)
# Recorded: Wall time: 2min 4s

pred = clf.predict(X_test)
labels = ['bot', 'human']
print(classification_report(y_test, y_pred=pred, labels=labels))
# Recorded:
#               precision    recall  f1-score   support
#          bot       0.85      0.54      0.66     23603
#        human       0.89      0.97      0.93     86842
#     accuracy                           0.88    110445
#    macro avg       0.87      0.76      0.79    110445
# weighted avg       0.88      0.88      0.87    110445
Pages
▼
Sunday, September 25, 2022
BERT Embedding Based Model For Bot Detection on Twitter (Sep 2022)
Download Code
No comments:
Post a Comment