import pandas as pd
import seaborn as sns

# 1. Amazon Reviews
# Load the tab-separated Amazon review snippets and tag every row with its source.
amazon_reviews = pd.read_csv('input/amazonReviewSnippets_GroundTruth.txt', sep='\t')
amazon_reviews['dataset'] = 'amazon'


def get_sentiment_label(sentiment_score):
    """Map a numeric sentiment score to a binary label.

    Args:
        sentiment_score: Numeric sentiment value for one text.

    Returns:
        'Negative' when the score is below zero, otherwise 'Positive'
        (a score of exactly zero is therefore labelled 'Positive').
    """
    if sentiment_score < 0:
        return 'Negative'
    return 'Positive'


amazon_reviews['sentiment_label'] = amazon_reviews['sentiment'].apply(get_sentiment_label)
# Character length of each review text.
amazon_reviews['length'] = amazon_reviews['text'].apply(len)


def get_word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


amazon_reviews['word_count'] = amazon_reviews['text'].apply(get_word_count)
amazon_reviews.head()

# Class balance of the derived labels.
sns.countplot(x='sentiment_label', data=amazon_reviews)

amazon_reviews['word_count'].describe()
# Recorded output:
# count    3546.000000
# mean       17.300056
# std        31.449383
# min         1.000000
# 25%         9.000000
# 50%        15.000000
# 75%        21.000000
# max      1220.000000
# Name: word_count, dtype: float64

# If the maximum number of tokens in a text exceeds 512, plain BERT embeddings
# cannot be used and we have to use SentenceBERT as the embedding technique.
# 2. Movie Reviews
# Load the tab-separated movie review snippets and tag every row with its source.
movie_reviews = pd.read_csv('input/movieReviewSnippets_GroundTruth.txt', sep='\t')
movie_reviews['dataset'] = 'movie reviews'

# Derive the label and the whitespace-token count with the helpers defined
# in the Amazon section of this file.
movie_reviews['sentiment_label'] = movie_reviews['sentiment'].apply(get_sentiment_label)
movie_reviews['word_count'] = movie_reviews['text'].apply(get_word_count)
movie_reviews.head(5)

# Class balance of the derived labels.
sns.countplot(x='sentiment_label', data=movie_reviews)

movie_reviews['word_count'].describe()
# Recorded output:
# count    10605.000000
# mean        18.864875
# std          8.702398
# min          1.000000
# 25%         12.000000
# 50%         18.000000
# 75%         25.000000
# max         51.000000
# Name: word_count, dtype: float64

# 3. New York Editorial Snippets
# Load the tab-separated NYT editorial snippets and tag every row with its source.
nyt_editorial_snippets = pd.read_csv('input/nytEditorialSnippets_GroundTruth.txt', sep='\t')
nyt_editorial_snippets['dataset'] = 'nyt_editorial_snippets'

# Derive the label and the whitespace-token count with the helpers defined
# in the Amazon section of this file.
nyt_editorial_snippets['sentiment_label'] = (
    nyt_editorial_snippets['sentiment'].apply(get_sentiment_label)
)
nyt_editorial_snippets['word_count'] = nyt_editorial_snippets['text'].apply(get_word_count)
nyt_editorial_snippets.head()

# Class balance of the derived labels.
sns.countplot(x='sentiment_label', data=nyt_editorial_snippets)

nyt_editorial_snippets['word_count'].describe()
# Recorded output:
# count    5183.000000
# mean       17.482925
# std         8.767046
# min         1.000000
# 25%        11.000000
# 50%        17.000000
# 75%        23.000000
# max        91.000000
# Name: word_count, dtype: float64

# 4. General Twitter Data (Tweets)
# Load the tab-separated general tweets and tag every row with its source.
# NOTE(review): "groud" looks like a typo for "ground"; the variable name and
# the dataset tag are kept as-is because other code may rely on them.
tweets_groud_truth = pd.read_csv('input/tweets_GroundTruth.txt', sep='\t')
tweets_groud_truth['dataset'] = 'tweets_groud_truth'

# Derive the label and the whitespace-token count with the helpers defined
# in the Amazon section of this file.
tweets_groud_truth['sentiment_label'] = tweets_groud_truth['sentiment'].apply(get_sentiment_label)
tweets_groud_truth['word_count'] = tweets_groud_truth['text'].apply(get_word_count)
tweets_groud_truth.head()

# Class balance of the derived labels.
sns.countplot(x='sentiment_label', data=tweets_groud_truth)

tweets_groud_truth['word_count'].describe()
# Recorded output:
# count    4200.000000
# mean       13.619286
# std         6.720463
# min         1.000000
# 25%         8.000000
# 50%        13.000000
# 75%        19.000000
# max        32.000000
# Name: word_count, dtype: float64

# 5. US Presidential Election of 2016
# Load the comma-separated 2016 US presidential election tweets.
us_presidential_election_2016 = pd.read_csv(
    'input/us_politics_presidential_election_2016.csv', sep=','
)

# Keep only the columns used below. The explicit .copy() makes the result an
# independent frame, so the column assignments that follow do not write into
# a slice of the original (which triggers pandas' SettingWithCopyWarning).
us_presidential_election_2016 = us_presidential_election_2016[['id', 'sentiment', 'text']].copy()
us_presidential_election_2016['dataset'] = 'us_presidential_election_2016'
us_presidential_election_2016.head()

# This dataset's 'sentiment' column is plotted directly; no label is derived.
sns.countplot(x='sentiment', data=us_presidential_election_2016)

# Whitespace-token count via the helper defined in the Amazon section of this file.
us_presidential_election_2016['word_count'] = (
    us_presidential_election_2016['text'].apply(get_word_count)
)
us_presidential_election_2016['word_count'].describe()
# Recorded output:
# count    13871.000000
# mean        16.943912
# std          5.224908
# min          2.000000
# 25%         13.000000
# 50%         18.000000
# 75%         21.000000
# max         29.000000
# Name: word_count, dtype: float64

# 6. Stock Market Related Tweets
# Load the stock-market tweets (read with read_csv's default separator).
stock_market_tweets = pd.read_csv('input/stock_market_twitter_data.csv')

# Column names in this file are capitalised ('Sentiment', 'Text'). The label
# and the whitespace-token count come from the helpers defined in the Amazon
# section of this file.
stock_market_tweets['sentiment_label'] = stock_market_tweets['Sentiment'].apply(get_sentiment_label)
stock_market_tweets['word_count'] = stock_market_tweets['Text'].apply(get_word_count)
stock_market_tweets.head()

# Class balance of the derived labels.
sns.countplot(x='sentiment_label', data=stock_market_tweets)

stock_market_tweets['word_count'].describe()
# Recorded output:
# count    5791.000000
# mean       14.006562
# std         6.595463
# min         2.000000
# 25%         9.000000
# 50%        14.000000
# 75%        19.000000
# max        32.000000
# Name: word_count, dtype: float64
Monday, October 3, 2022
6 Labeled Datasets For Sentiment Analysis
Download Code and Data
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment