For this analysis, I'll be using a Reddit API wrapper called "praw" to loop through the /r/politics subreddit headlines.
In [1]:
from IPython import display
import math
from pprint import pprint
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid', context='talk', palette='Dark2')
In [2]:
"""
For this analysis, I'll be using a Reddit API wrapper,
called "praw", to loop through the /r/politics subreddit headlines.
"""
import praw
In [3]:
reddit = praw.Reddit(client_id='---------------',
                     client_secret='----------------------------',
                     user_agent='theone9807')
In [4]:
headlines = set()
In [6]:
#Now, we can iterate through the /r/politics subreddit using the API client:
for submission in reddit.subreddit('politics').new(limit=None):
    headlines.add(submission.title)
    display.clear_output()
    print(len(headlines))
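Each `submission` object carries more than its title (`limit=None` simply pages through as many posts as Reddit's listing endpoint exposes, typically on the order of a thousand). If the analysis ever needs scores or timestamps as well, a minimal sketch using a few standard PRAW attributes might look like this (the `posts` list and DataFrame here are illustrative, not part of the pipeline above):
# hypothetical variant: keep a few extra submission attributes alongside the title
posts = []
for submission in reddit.subreddit('politics').new(limit=None):
    posts.append({
        'title': submission.title,               # headline text
        'score': submission.score,               # net upvotes at fetch time
        'created_utc': submission.created_utc,   # post timestamp (Unix seconds)
    })
posts_df = pd.DataFrame(posts)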
In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
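VADER's lexicon ships as a separate NLTK resource, as do the tokenizer models and stopword list used later on. If they aren't already installed locally, a one-time download along these lines should cover everything this notebook touches (a sketch; the resource names are the standard NLTK identifiers):
# one-time NLTK resource downloads; safe to re-run
nltk.download('vader_lexicon')  # lexicon backing SentimentIntensityAnalyzer
nltk.download('punkt')          # models used by word_tokenize below
nltk.download('stopwords')      # English stopword list used below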
In [16]:
sia = SIA()
results = []
for line in headlines:
    pol_score = sia.polarity_scores(line)
    pol_score['headline'] = line
    results.append(pol_score)
pprint(results[:5], width=100)
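To see what VADER produces for a single piece of text, the analyzer can be called directly on one sentence; it returns a dict of negative, neutral, positive, and compound scores (a quick sketch with a made-up sentence):
# hypothetical single-sentence check: polarity_scores returns
# a dict with 'neg', 'neu', 'pos', and 'compound' keys
print(sia.polarity_scores("The senator praised the landmark bill"))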
In [17]:
df = pd.DataFrame.from_records(results)
df.head()
Out[17]:
In [18]:
df['label'] = 0
df.loc[df['compound'] > 0.2, 'label'] = 1
df.loc[df['compound'] < -0.2, 'label'] = -1
df.head()
Out[18]:
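The two `.loc` assignments above can also be collapsed into a single vectorized expression with NumPy; this is just an equivalent sketch of the same ±0.2 thresholding, not a change to the pipeline:
# same labelling rule expressed with np.select
conditions = [df['compound'] > 0.2, df['compound'] < -0.2]
df['label'] = np.select(conditions, [1, -1], default=0)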
In [19]:
#We have all the data we need to save, so let's do that:
df2 = df[['headline', 'label']]
df2.to_csv('reddit_headlines_labels.csv', mode='a', encoding='utf-8', index=False)
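One caveat worth noting: because the file is opened with mode='a', every run of the notebook appends new rows (and another header line) to the same CSV. A hedged clean-up sketch for reading the accumulated file back might look like:
# read the appended file back, dropping repeated header rows and duplicate headlines
saved = pd.read_csv('reddit_headlines_labels.csv')
saved = saved[saved['headline'] != 'headline']      # header rows re-written by append mode
saved = saved.drop_duplicates(subset='headline')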
In [20]:
"""
Dataset Info and Statistics
Let's first take a peek at a few positive and negative headlines:
"""
print("Positive headlines:\n")
pprint(list(df[df['label'] == 1].headline)[:5], width=200)
print("\nNegative headlines:\n")
pprint(list(df[df['label'] == -1].headline)[:5], width=200)
In [21]:
#Now let's check how many total positives and negatives we have in this dataset:
print(df.label.value_counts())
print(df.label.value_counts(normalize=True) * 100)
#The first line gives us raw value counts of the labels,
#whereas the second line provides percentages with the normalize keyword.
In [22]:
fig, ax = plt.subplots(figsize=(8, 8))
# sort by label (-1, 0, 1) so the hard-coded tick labels below line up with the bars
counts = (df.label.value_counts(normalize=True) * 100).sort_index()
sns.barplot(x=counts.index, y=counts, ax=ax)
ax.set_xticklabels(['Negative', 'Neutral', 'Positive'])
ax.set_ylabel("Percentage")
plt.show()
In [23]:
"""
Tokenization is the process of breaking a stream of text up into meaningful elements called tokens.
You can tokenize a paragraph into sentences, a sentence into words and so on.
"""
from nltk.tokenize import word_tokenize, RegexpTokenizer
example = "This is an example sentence! However, it isn't a very informative one"
print(word_tokenize(example, language='english'))
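As mentioned above, tokenization also works at the sentence level; a quick sketch with NLTK's sent_tokenize (the example paragraph is made up):
from nltk.tokenize import sent_tokenize
paragraph = "This is the first sentence. Here is a second, slightly longer one!"
print(sent_tokenize(paragraph))
# ['This is the first sentence.', 'Here is a second, slightly longer one!']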
In [24]:
#use another tokenizer that only looks at words, not punctuation
tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize(example)
Out[24]:
In [25]:
#We can grab a simple list of stopwords from NLTK:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words[:20])
In [26]:
#Let's start by creating a function that will read a list of headlines and perform lowercasing, tokenizing, and stopword removal:
def process_text(headlines):
    tokens = []
    for line in headlines:
        toks = tokenizer.tokenize(line)
        toks = [t.lower() for t in toks if t.lower() not in stop_words]
        tokens.extend(toks)
    return tokens
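As a quick sanity check, the function can be run on a couple of made-up headlines before pointing it at the real data (the input lines here are hypothetical, purely to illustrate the output shape):
# hypothetical input: lowercased, punctuation-free tokens with stopwords removed,
# pooled across both lines into a single flat list
sample = ["Senate Passes New Bill", "Voters are not happy with the new bill"]
print(process_text(sample))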
In [27]:
"""We can grab all of the positive label headlines from our dataframe,
hand them over to our function, then call NLTK's `FreqDist` function to get
the most common words in the positive headlines:
"""
pos_lines = list(df[df.label == 1].headline)
pos_tokens = process_text(pos_lines)
pos_freq = nltk.FreqDist(pos_tokens)
pos_freq.most_common(20)
Out[27]:
In [28]:
y_val = [x[1] for x in pos_freq.most_common()]
fig = plt.figure(figsize=(10,5))
plt.plot(y_val)
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.title("Word Frequency Distribution (Positive)")
plt.show()
The chart above shows the frequency pattern: the y-axis is each word's frequency and the x-axis is the words ranked from most to least frequent. The most frequent word, which in our case is 'trump', therefore sits at the far left of the curve with a frequency of 74.
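The same frequency analysis carries over directly to the negatively labelled headlines; a sketch reusing the helpers defined above:
# repeat the word-frequency analysis for the negative headlines
neg_lines = list(df[df.label == -1].headline)
neg_tokens = process_text(neg_lines)
neg_freq = nltk.FreqDist(neg_tokens)
neg_freq.most_common(20)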