import pandas as pd
import numpy as np
import re
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Make sure the 'vader_lexicon' NLTK data package is available locally before
# the VADER analyzer is constructed further down.
nltk.download('vader_lexicon')
[nltk_data] Downloading package vader_lexicon to
[nltk_data] /home/ashish/nltk_data...
[nltk_data] Package vader_lexicon is already up-to-date!
True
# Load the tweet table (presumably one row per tweet -- confirm against the CSV).
df = pd.read_csv('input/tweets_of_f234_users_1663839312.csv')

# Distinct (userid, account_type) pairs; merged back in later to label the
# per-user aggregates.
userid_accountype = df.loc[:, ['userid', 'account_type']].drop_duplicates()

# Number of distinct pairs per account type.
userid_accountype.loc[:, 'account_type'].value_counts()
human 146
bot 71
Name: account_type, dtype: int64
%%time
# Wall time: 1min 41s
def _label_for_compound(compound):
    """Map a VADER compound score to 'positive', 'negative' or 'neutral'.

    Scores >= 0.05 are positive, scores <= -0.05 are negative, and everything
    else is neutral. The final fallback guarantees that every score gets
    exactly one label, so the label list can never fall out of step with the
    score list.
    """
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'


sid = SentimentIntensityAnalyzer()

# Compound score for every tweet, in the same order as df['clean_tweet'].
pred = [
    sid.polarity_scores(sentence)['compound']
    for sentence in df['clean_tweet'].values
]

# One label per score, aligned index-for-index with `pred`.
vader_label = [_label_for_compound(score) for score in pred]
CPU times: user 2min 40s, sys: 2 s, total: 2min 42s
Wall time: 2min 42s
# Attach the per-tweet VADER results to the tweet table.
df['vader_sentiment'] = pred
df['vader_label'] = vader_label

# Mean compound score per (user, sentiment label).
# Only the sentiment column is aggregated: calling .mean() on the whole grouped
# frame raises TypeError on pandas >= 2.0 because of the text columns.
df_mean_sentiment = (
    df.groupby(['userid', 'vader_label'])[['vader_sentiment']]
    .mean()
    .reset_index()
    .rename(columns={'vader_sentiment': 'mean_sentiment'})
)
df_mean_sentiment

# Sample variance of the compound score per user (across all of that user's
# tweets, regardless of label). Same column restriction as above.
df_var_sentiment = (
    df.groupby(['userid'])[['vader_sentiment']]
    .var()
    .reset_index()
    .rename(columns={'vader_sentiment': 'variance_sentiment'})
)
df_var_sentiment

# NOTE(review): the mean is per (user, label) while the variance is per user,
# so each user's variance is repeated on every one of their label rows.
df_mean_var = df_mean_sentiment.merge(df_var_sentiment, on=['userid'], how='inner')
df_mean_var

# Re-attach the account type for each user.
df_mean_var_w_label = df_mean_var.merge(userid_accountype, on='userid', how='inner')
df_mean_var_w_label

# Row count per account type (rows are (user, label) combinations, not users).
df_mean_var_w_label['account_type'].value_counts()
human 435
bot 205
Name: account_type, dtype: int64
# Mean vs. variance of sentiment, coloured and marked by account type.
sns.scatterplot(
    data=df_mean_var_w_label,
    x="mean_sentiment",
    y="variance_sentiment",
    hue="account_type",
    style="account_type",
)

# Mean sentiment for each account type.
sns.scatterplot(
    data=df_mean_var_w_label,
    x="account_type",
    y="mean_sentiment",
    hue="account_type",
    style="account_type",
)

# Sentiment variance for each account type.
sns.scatterplot(
    data=df_mean_var_w_label,
    x="account_type",
    y="variance_sentiment",
    hue="account_type",
    style="account_type",
)
# "http" followed by at least one URL-ish character (letters, digits, / - . : %).
_URL_PATTERN = re.compile(r"http[a-zA-Z0-9/\-.:%]+")


def get_url_flag(in_tweet):
    """Return True when *in_tweet* contains a URL-like token, else False.

    A token counts as URL-like when it starts with "http" and is followed by
    one or more characters from ``[a-zA-Z0-9/\\-.:%]``; the bare word "http"
    does not match.

    Non-string values (for example the NaN that pandas uses for an empty
    cell, or None) are treated as "no URL" instead of raising TypeError.
    """
    if not isinstance(in_tweet, str):
        return False
    return _URL_PATTERN.search(in_tweet) is not None
# Flag each tweet that contains a URL-like token.
df['url_flag'] = df['clean_tweet'].map(get_url_flag)

# Count of flagged / unflagged tweets within each account type.
df.groupby(by=['account_type'])['url_flag'].value_counts()
account_type url_flag
bot True 68899
False 47947
human False 202148
True 169643
Name: url_flag, dtype: int64
# Share of tweets that contain a URL, per account type. The mean of the boolean
# flag equals True-count / total-count, so the ratios are derived from the data
# instead of from counts copied by hand out of the previous cell's output.
url_share = df.groupby(['account_type'])['url_flag'].mean()
print(float(url_share['bot']))
print(float(url_share['human']))
0.5896564709104292
0.4562859240810025
# Character length of each cleaned tweet.
df['len'] = df['clean_tweet'].map(len)

# Average tweet length within each account type.
df.groupby(by=['account_type'])['len'].mean()
account_type
bot 111.761455
human 109.129188
Name: len, dtype: float64
# Feature matrix (sentiment score, URL flag, tweet length) and target label,
# both taken row-by-row from the tweet table.
X = df[['vader_sentiment', 'url_flag', 'len']]
y = df['account_type']

# Hold out 33% of the rows; the fixed seed makes the split reproducible.
# NOTE(review): rows are split individually, so rows belonging to the same
# userid can end up in both the training and the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Random forest with default hyper-parameters and a fixed seed.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

# NOTE: this rebinds `pred`, which earlier held the VADER compound scores.
pred = clf.predict(X_test)

labels = ['bot', 'human']
print(classification_report(y_test, pred, labels=labels))
precision recall f1-score support
bot 0.54 0.24 0.33 38217
human 0.80 0.93 0.86 123034
accuracy 0.77 161251
macro avg 0.67 0.59 0.60 161251
weighted avg 0.74 0.77 0.74 161251
Accuracy of: 0.77
Pages
- Index of Lessons in Technology
- Index of Book Summaries
- Index of Book Lists And Downloads
- Index For Job Interviews Preparation
- Index of "Algorithms: Design and Analysis"
- Python Course (Index)
- Data Analytics Course (Index)
- Index of Machine Learning
- Postings Index
- Index of BITS WILP Exam Papers and Content
- Lessons in Investing
- Index of Math Lessons
- Downloads
- Index of Management Lessons
- Book Requests
- Index of English Lessons
- Index of Medicines
- Index of Quizzes (Educational)
Friday, September 23, 2022
Baseline Model For Bot Detection on Twitter Using VADER and RandomForestClassifier
Download Code
Subscribe to:
Post Comments (Atom)











No comments:
Post a Comment