Friday, September 23, 2022

Baseline Model For Bot Detection on Twitter Using VADER and RandomForestClassifier

Download Code

import pandas as pd
import numpy as np
import re
import seaborn as sns
import nltk
# VADER: rule-based sentiment analyzer tuned for social-media text.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Fetch the VADER lexicon (no-op if already present); returns True on success.
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ashish/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!

True


# Load the scraped tweets: one row per tweet, with the author's userid,
# cleaned text, and ground-truth account_type (human/bot).
df = pd.read_csv('input/tweets_of_f234_users_1663839312.csv')

# One row per user with its label — used later to attach ground truth
# to the per-user aggregate features.
userid_accountype = df[['userid', 'account_type']].drop_duplicates()


def _label_sentiment(compound):
    """Map a VADER compound score in [-1, 1] to a discrete label.

    Uses the conventional VADER thresholds: >= 0.05 positive,
    <= -0.05 negative, otherwise neutral.
    """
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'


# Score every tweet with a single analyzer instance.
# NOTE(review): this loops over ~500k tweets in Python (a couple of
# minutes of wall time in the original run) — acceptable for a baseline.
sid = SentimentIntensityAnalyzer()
pred = []
vader_label = []
for sentence in df['clean_tweet'].values:
    compound = sid.polarity_scores(sentence)['compound']
    pred.append(compound)
    vader_label.append(_label_sentiment(compound))

df['vader_sentiment'] = pred
df['vader_label'] = vader_label
# Per-(user, sentiment-label) mean compound score. Select the numeric
# column explicitly: calling .mean() on the whole frame fails on modern
# pandas because the frame also holds non-numeric columns (tweet text,
# account_type).
df_mean_sentiment = (
    df.groupby(['userid', 'vader_label'])['vader_sentiment']
    .mean()
    .reset_index()
    .rename(columns={'vader_sentiment': 'mean_sentiment'})
)

# Per-user variance of the compound score — how much an account's
# sentiment spreads across its tweets.
df_var_sentiment = (
    df.groupby(['userid'])['vader_sentiment']
    .var()
    .reset_index()
    .rename(columns={'vader_sentiment': 'variance_sentiment'})
)

# Inner join: up to three rows per user (one per sentiment label present),
# each carrying that user's single variance value.
df_mean_var = df_mean_sentiment.merge(df_var_sentiment, on=['userid'], how='inner')

# Attach the ground-truth account type to the aggregate features.
df_mean_var_w_label = df_mean_var.merge(userid_accountype, on='userid', how='inner')
# Class balance at the user-aggregate level (multiple rows per user,
# one per sentiment label present).
print(df_mean_var_w_label['account_type'].value_counts())

# Visual checks: do mean/variance of sentiment separate bots from humans?
for x_col, y_col in [
    ('mean_sentiment', 'variance_sentiment'),
    ('account_type', 'mean_sentiment'),
    ('account_type', 'variance_sentiment'),
]:
    sns.scatterplot(
        data=df_mean_var_w_label,
        x=x_col,
        y=y_col,
        hue='account_type',
        style='account_type',
    )
# Compiled once at module level instead of on every call — get_url_flag
# runs over several hundred thousand tweets.
_URL_RE = re.compile(r"http[a-zA-Z0-9/\-.:%]+")


def get_url_flag(in_tweet):
    """Return True if the tweet text contains something that looks like a URL.

    Crude matcher: 'http' followed by URL-ish characters; catches both
    http:// and https:// links.
    """
    return bool(_URL_RE.search(in_tweet))


df['url_flag'] = df['clean_tweet'].apply(get_url_flag)
# URL usage split by class — in the original run, bots linked out in
# ~59% of tweets vs ~46% for humans.
print(df.groupby(['account_type'])['url_flag'].value_counts())

# Share of tweets containing a URL per class, computed from the data
# rather than hard-coded counts.
print(df.groupby('account_type')['url_flag'].mean())

# Tweet length as an additional (weak) feature.
df['len'] = df['clean_tweet'].apply(len)
print(df.groupby(['account_type'])['len'].mean())
# Baseline: random forest on three per-tweet features.
X = df[['vader_sentiment', 'url_flag', 'len']]
y = df['account_type']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)  # fit() trains in place; no rebind needed

pred = clf.predict(X_test)
# Original run: ~0.77 accuracy, but bot recall only 0.24 — the model
# mostly predicts the majority 'human' class.
print(classification_report(y_test, y_pred=pred, labels=['bot', 'human']))

Overall accuracy: 0.77 (note that bot recall is only 0.24, so the baseline leans heavily on the majority "human" class).

Tags: Technology, Natural Language Processing

No comments:

Post a Comment