"""Baseline bot detection on Twitter using VADER sentiment features and a
RandomForestClassifier.

Pipeline (reconstructed from the original notebook):
 1. Load per-tweet data with `userid`, `account_type`, `clean_tweet` columns.
 2. Score each tweet with VADER; bucket compound scores into pos/neu/neg labels.
 3. Build per-user mean/variance sentiment features and plot them.
 4. Add URL-presence and tweet-length features.
 5. Train and evaluate a per-tweet RandomForestClassifier
    (original notebook reports accuracy ~0.77).
"""
import re

import numpy as np  # kept from the original notebook imports
import pandas as pd


def vader_label(compound):
    """Bucket a VADER compound score into 'positive'/'neutral'/'negative'.

    Uses the standard VADER thresholds: >= 0.05 positive, <= -0.05 negative,
    anything strictly in between is neutral.
    """
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'


def get_url_flag(in_tweet):
    """Return True if the tweet text contains an http(s)-style URL token."""
    return re.search(r"http[a-zA-Z0-9/\-.:%]+", in_tweet) is not None


def main():
    # Heavy third-party dependencies are imported here so that the module can
    # be imported (e.g. to unit-test the pure helpers above) without them.
    import nltk
    import seaborn as sns
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split

    nltk.download('vader_lexicon')

    df = pd.read_csv('input/tweets_of_f234_users_1663839312.csv')
    userid_accountype = df[['userid', 'account_type']].drop_duplicates()
    # Original notebook: 146 human users, 71 bot users.
    print(userid_accountype['account_type'].value_counts())

    # --- VADER sentiment scoring (slow: one analyzer call per tweet;
    # the notebook measured ~2-3 minutes wall time on this dataset) ---
    sid = SentimentIntensityAnalyzer()
    scores = [sid.polarity_scores(t)['compound'] for t in df['clean_tweet'].values]
    df['vader_sentiment'] = scores
    df['vader_label'] = [vader_label(s) for s in scores]

    # Per-user mean sentiment, split by sentiment label.
    df_mean_sentiment = df.groupby(['userid', 'vader_label']).mean().reset_index()
    df_mean_sentiment.rename({'vader_sentiment': 'mean_sentiment'},
                             axis='columns', inplace=True)

    # Per-user sentiment variance over all of that user's tweets.
    df_var_sentiment = df.groupby(['userid']).var().reset_index()
    df_var_sentiment.rename({'vader_sentiment': 'variance_sentiment'},
                            axis='columns', inplace=True)

    # NOTE(review): merging (userid, vader_label)-level means with
    # userid-level variances yields up to 3 rows per user; class counts grow
    # from 146/71 users to 435/205 rows, matching the original notebook.
    df_mean_var = df_mean_sentiment.merge(df_var_sentiment,
                                          on=['userid'], how='inner')
    df_mean_var_w_label = df_mean_var.merge(userid_accountype,
                                            on='userid', how='inner')
    print(df_mean_var_w_label['account_type'].value_counts())

    # Exploratory plots of the per-user sentiment features.
    sns.scatterplot(data=df_mean_var_w_label, x="mean_sentiment",
                    y="variance_sentiment", hue="account_type",
                    style="account_type")
    sns.scatterplot(data=df_mean_var_w_label, x="account_type",
                    y="mean_sentiment", hue="account_type",
                    style="account_type")
    sns.scatterplot(data=df_mean_var_w_label, x="account_type",
                    y="variance_sentiment", hue="account_type",
                    style="account_type")

    # URL and tweet-length features. Original notebook: bots include URLs in
    # ~59% of tweets vs ~46% for humans; mean lengths ~111.8 vs ~109.1 chars.
    df['url_flag'] = df['clean_tweet'].apply(get_url_flag)
    print(df.groupby(['account_type'])['url_flag'].value_counts())
    df['len'] = df['clean_tweet'].apply(len)
    print(df.groupby(['account_type'])['len'].mean())

    # Baseline per-tweet classifier on the three features.
    # NOTE(review): the split is per tweet, not per user, so tweets from the
    # same user can land in both train and test — confirm this leakage is
    # acceptable for a baseline before citing the reported metrics.
    X = df[['vader_sentiment', 'url_flag', 'len']]
    y = df['account_type']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
    pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred=pred, labels=['bot', 'human']))


if __name__ == "__main__":
    main()
Friday, September 23, 2022
Baseline Model For Bot Detection on Twitter Using VADER and RandomForestClassifier
Download Code
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment