"""Baseline bot detection on Twitter using VADER sentiment features and a
RandomForestClassifier.

Pipeline (reconstructed from the original notebook):
 1. Load per-tweet data with `userid`, `account_type`, `clean_tweet` columns.
 2. Score each tweet with VADER; bucket compound scores into pos/neu/neg labels.
 3. Build per-user mean/variance sentiment features and plot them.
 4. Add URL-presence and tweet-length features.
 5. Train and evaluate a per-tweet RandomForestClassifier
    (original notebook reports accuracy ~0.77).
"""
import re

import numpy as np  # kept from the original notebook imports
import pandas as pd


def vader_label(compound):
    """Bucket a VADER compound score into 'positive'/'neutral'/'negative'.

    Uses the standard VADER thresholds: >= 0.05 positive, <= -0.05 negative,
    anything strictly in between is neutral.
    """
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'


def get_url_flag(in_tweet):
    """Return True if the tweet text contains an http(s)-style URL token."""
    return re.search(r"http[a-zA-Z0-9/\-.:%]+", in_tweet) is not None


def main():
    # Heavy third-party dependencies are imported here so that the module can
    # be imported (e.g. to unit-test the pure helpers above) without them.
    import nltk
    import seaborn as sns
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split

    nltk.download('vader_lexicon')

    df = pd.read_csv('input/tweets_of_f234_users_1663839312.csv')
    userid_accountype = df[['userid', 'account_type']].drop_duplicates()
    # Original notebook: 146 human users, 71 bot users.
    print(userid_accountype['account_type'].value_counts())

    # --- VADER sentiment scoring (slow: one analyzer call per tweet;
    # the notebook measured ~2-3 minutes wall time on this dataset) ---
    sid = SentimentIntensityAnalyzer()
    scores = [sid.polarity_scores(t)['compound'] for t in df['clean_tweet'].values]
    df['vader_sentiment'] = scores
    df['vader_label'] = [vader_label(s) for s in scores]

    # Per-user mean sentiment, split by sentiment label.
    df_mean_sentiment = df.groupby(['userid', 'vader_label']).mean().reset_index()
    df_mean_sentiment.rename({'vader_sentiment': 'mean_sentiment'},
                             axis='columns', inplace=True)

    # Per-user sentiment variance over all of that user's tweets.
    df_var_sentiment = df.groupby(['userid']).var().reset_index()
    df_var_sentiment.rename({'vader_sentiment': 'variance_sentiment'},
                            axis='columns', inplace=True)

    # NOTE(review): merging (userid, vader_label)-level means with
    # userid-level variances yields up to 3 rows per user; class counts grow
    # from 146/71 users to 435/205 rows, matching the original notebook.
    df_mean_var = df_mean_sentiment.merge(df_var_sentiment,
                                          on=['userid'], how='inner')
    df_mean_var_w_label = df_mean_var.merge(userid_accountype,
                                            on='userid', how='inner')
    print(df_mean_var_w_label['account_type'].value_counts())

    # Exploratory plots of the per-user sentiment features.
    sns.scatterplot(data=df_mean_var_w_label, x="mean_sentiment",
                    y="variance_sentiment", hue="account_type",
                    style="account_type")
    sns.scatterplot(data=df_mean_var_w_label, x="account_type",
                    y="mean_sentiment", hue="account_type",
                    style="account_type")
    sns.scatterplot(data=df_mean_var_w_label, x="account_type",
                    y="variance_sentiment", hue="account_type",
                    style="account_type")

    # URL and tweet-length features. Original notebook: bots include URLs in
    # ~59% of tweets vs ~46% for humans; mean lengths ~111.8 vs ~109.1 chars.
    df['url_flag'] = df['clean_tweet'].apply(get_url_flag)
    print(df.groupby(['account_type'])['url_flag'].value_counts())
    df['len'] = df['clean_tweet'].apply(len)
    print(df.groupby(['account_type'])['len'].mean())

    # Baseline per-tweet classifier on the three features.
    # NOTE(review): the split is per tweet, not per user, so tweets from the
    # same user can land in both train and test — confirm this leakage is
    # acceptable for a baseline before citing the reported metrics.
    X = df[['vader_sentiment', 'url_flag', 'len']]
    y = df['account_type']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
    pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred=pred, labels=['bot', 'human']))


if __name__ == "__main__":
    main()
Friday, September 23, 2022
Baseline Model For Bot Detection on Twitter Using VADER and RandomForestClassifier
Download Code
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment