survival8: 6 Labeled Datasets For Sentiment Analysis

Monday, October 3, 2022

6 Labeled Datasets For Sentiment Analysis

Download Code and Data


import pandas as pd
import seaborn as sns


1. Amazon Reviews


# Read the tab-separated Amazon review snippets, then tag every row with its source.
amazon_reviews = pd.read_table('input/amazonReviewSnippets_GroundTruth.txt')
amazon_reviews['dataset'] = 'amazon'

def get_sentiment_label(sentiment_score):
    """Turn a numeric sentiment score into a binary text label.

    Scores strictly below zero give 'Negative'; every other value
    (zero included) gives 'Positive'.
    """
    return 'Negative' if sentiment_score < 0 else 'Positive'

# Binary label derived from the numeric 'sentiment' column.
amazon_reviews['sentiment_label'] = amazon_reviews['sentiment'].map(get_sentiment_label)
# Character length of each review text.
amazon_reviews['length'] = amazon_reviews['text'].map(len)

def get_word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

# Whitespace-token count of each review text.
amazon_reviews['word_count'] = amazon_reviews['text'].map(get_word_count)

# Preview the first rows with the derived columns.
amazon_reviews.head()

# Number of rows per sentiment label.
sns.countplot(data=amazon_reviews, x='sentiment_label')

# Summary statistics of the per-text word counts.
amazon_reviews['word_count'].describe()


count    3546.000000
mean       17.300056
std        31.449383
min         1.000000
25%         9.000000
50%        15.000000
75%        21.000000
max      1220.000000
Name: word_count, dtype: float64


If the maximum number of tokens in a text exceeds 512, plain BERT embedding cannot be used and we have to use SentenceBERT as the embedding technique.

2. Movie Reviews 


# Read the tab-separated movie review snippets, then tag every row with its source.
movie_reviews = pd.read_table('input/movieReviewSnippets_GroundTruth.txt')
movie_reviews['dataset'] = 'movie reviews'

# Derived columns: binary sentiment label and whitespace-token count.
movie_reviews['sentiment_label'] = movie_reviews['sentiment'].map(get_sentiment_label)
movie_reviews['word_count'] = movie_reviews['text'].map(get_word_count)

# Preview the first five rows.
movie_reviews.head(5)

# Number of rows per sentiment label.
sns.countplot(data=movie_reviews, x='sentiment_label')

# Summary statistics of the per-text word counts.
movie_reviews['word_count'].describe()


count    10605.000000
mean        18.864875
std          8.702398
min          1.000000
25%         12.000000
50%         18.000000
75%         25.000000
max         51.000000
Name: word_count, dtype: float64


3. New York Times Editorial Snippets


# Read the tab-separated NYT editorial snippets, then tag every row with its source.
nyt_editorial_snippets = pd.read_table('input/nytEditorialSnippets_GroundTruth.txt')
nyt_editorial_snippets['dataset'] = 'nyt_editorial_snippets'

# Derived columns: binary sentiment label and whitespace-token count.
nyt_editorial_snippets['sentiment_label'] = nyt_editorial_snippets['sentiment'].map(get_sentiment_label)
nyt_editorial_snippets['word_count'] = nyt_editorial_snippets['text'].map(get_word_count)

# Preview the first rows.
nyt_editorial_snippets.head()

# Number of rows per sentiment label.
sns.countplot(data=nyt_editorial_snippets, x='sentiment_label')

# Summary statistics of the per-text word counts.
nyt_editorial_snippets['word_count'].describe()


count    5183.000000
mean       17.482925
std         8.767046
min         1.000000
25%        11.000000
50%        17.000000
75%        23.000000
max        91.000000
Name: word_count, dtype: float64


4. General Twitter Data (Tweets)


# Read the tab-separated tweets ground-truth file, then tag every row with its source.
tweets_groud_truth = pd.read_table('input/tweets_GroundTruth.txt')
tweets_groud_truth['dataset'] = 'tweets_groud_truth'

# Derived columns: binary sentiment label and whitespace-token count.
tweets_groud_truth['sentiment_label'] = tweets_groud_truth['sentiment'].map(get_sentiment_label)
tweets_groud_truth['word_count'] = tweets_groud_truth['text'].map(get_word_count)

# Preview the first rows.
tweets_groud_truth.head()

# Number of rows per sentiment label.
sns.countplot(data=tweets_groud_truth, x='sentiment_label')

# Summary statistics of the per-text word counts.
tweets_groud_truth['word_count'].describe()


count    4200.000000
mean       13.619286
std         6.720463
min         1.000000
25%         8.000000
50%        13.000000
75%        19.000000
max        32.000000
Name: word_count, dtype: float64


5. US Presidential Election of 2016


# Load the comma-separated 2016 US presidential election dataset.
us_presidential_election_2016 = pd.read_csv('input/us_politics_presidential_election_2016.csv', sep = ',')

# Keep only the columns used below. The explicit .copy() makes the result an
# independent DataFrame, so the column assignments that follow operate on it
# directly instead of on a slice of the full frame (which would trigger
# pandas' SettingWithCopyWarning).
us_presidential_election_2016 = us_presidential_election_2016[['id', 'sentiment', 'text']].copy()

# Tag every row with its source dataset.
us_presidential_election_2016['dataset'] = 'us_presidential_election_2016'
us_presidential_election_2016.head()

# The 'sentiment' column is plotted as-is here; no 'sentiment_label' is derived
# for this dataset.
sns.countplot(x ='sentiment', data = us_presidential_election_2016)

# Whitespace-token count of each text, followed by its summary statistics.
us_presidential_election_2016['word_count'] = us_presidential_election_2016['text'].apply(get_word_count)
us_presidential_election_2016['word_count'].describe()



count    13871.000000
mean        16.943912
std          5.224908
min          2.000000
25%         13.000000
50%         18.000000
75%         21.000000
max         29.000000
Name: word_count, dtype: float64


6. Stock Market Related Tweets


# Load the stock-market Twitter data (default comma separator).
stock_market_tweets = pd.read_csv('input/stock_market_twitter_data.csv')

# Tag every row with its source, consistent with the other five datasets,
# which all carry a 'dataset' column.
stock_market_tweets['dataset'] = 'stock_market_tweets'

# Note: this file's columns are capitalised ('Sentiment', 'Text'), unlike the
# lowercase 'sentiment' / 'text' columns read from the other datasets.
stock_market_tweets['sentiment_label'] = stock_market_tweets['Sentiment'].apply(get_sentiment_label)
stock_market_tweets['word_count'] = stock_market_tweets['Text'].apply(get_word_count)

# Preview the first rows.
stock_market_tweets.head()

# Number of rows per sentiment label.
sns.countplot(x ='sentiment_label', data = stock_market_tweets)

# Summary statistics of the per-text word counts.
stock_market_tweets['word_count'].describe()


count    5791.000000
mean       14.006562
std         6.595463
min         2.000000
25%         9.000000
50%        14.000000
75%        19.000000
max        32.000000
Name: word_count, dtype: float64