Analyzing Donald Trump's Speeches: NLP, Named-Entity Recognition, and Sentiment Analysis¶

by Grayson Adkins, updated August 15, 2024

In this notebook, I analyze former President Donald Trump's campaign rally speeches using the natural language processing (NLP) libraries NLTK and spaCy, and perform sentiment analysis with a BERT-based model.

Open In Colab

Install dependencies¶

In [ ]:
!pip install -q bs4

Scrape speeches¶

I'm using speech transcripts available at rev.com, but they could just as easily be transcribed from audio files (see my notebook Call Summarization Pipeline). BeautifulSoup is used for scraping the transcripts.

In [ ]:
import requests
from bs4 import BeautifulSoup

def scrape_website_text(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors

        soup = BeautifulSoup(response.content, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)

        return text
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

# Example usage
url = "https://www.rev.com/blog/transcripts/trump-and-vance-speak-at-atlanta-rally"
website_text = scrape_website_text(url)
print(website_text)
In [ ]:
import re
from bs4 import BeautifulSoup

def extract_speaker_speech(html_content, speaker_name):
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the text from the transcript
    transcript_text = soup.get_text(separator=' ')

    # Build a pattern that matches the speaker's name followed by a timestamp, while excluding near-matches such as "Donald Trump, Jr."
    speaker_pattern = fr'\b{re.escape(speaker_name)}\b\s*(?!, Jr)\s*\(\s*\d{{2}}:\d{{2}}:\d{{2}}\s*\):\s*(.*?)(?=\s*\b[A-Za-z]+\s*[A-Za-z]*\s*\(\s*\d{{2}}:\d{{2}}:\d{{2}}\s*\)|$)'
    speaker_lines = re.findall(speaker_pattern, transcript_text, re.DOTALL)

    # Join the extracted lines into a single string
    speaker_speech = ' '.join([line.strip() for line in speaker_lines])

    # Clean up any leftover timestamps or speaker annotations
    cleaned_speech = re.sub(r'\(\s*\d{2}:\d{2}:\d{2}\s*\)', '', speaker_speech)

    return cleaned_speech.strip()

# Specify the speaker's name you want to extract
speaker_name = "Donald Trump"
speaker_speech = extract_speaker_speech(website_text, speaker_name)
print(speaker_speech)
In [ ]:
with open("speaker_text.txt", "w") as file:
    file.write(speaker_speech)

Pre-Processing: Frequency count by word¶

First, let's clean up the speech by applying some filters. Below, we count how frequently each word appears in the speech and filter out stop words (i.e., low-value words) such as 'get', 'again', 'tell', 'even', 'could', 'would', 'like', etc. We can also apply lemmatization to collapse variations of a word to a common root (left commented out in the cell below; a short sketch follows it).

In [ ]:
!pip install -q nltk wordcloud
In [ ]:
import nltk
from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Download the required NLTK data
nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer()

# Manually extend the stop words list with additional irrelevant words
additional_stop_words = {'get', 'again', 'tell', 'even', 'could', 'would', 'like', 'much', 'many', 'us', 'see', 'make', 'just', 'say', 'says', 'said', 'go', 'also', 'still', 'take', 'use', 'thank', 'their', 'it\'s', 'right', 'that', 'really', 'don\'t', 'He\'s', 'said', 'didn\'t', 'didn\' t', 'come', 'going', 'know', 'you', 'said,', 'they\' re', 'they \' re', 'want', 'don\' t', 'don \' t', 'he\' s', 'she\' s', 'that\' s', 'it\' s', 'it \' s'}
stop_words.update(additional_stop_words)

# Tokenize the text, remove stop words, and filter by word length (e.g., 4 characters or more)
filtered_words = [word for word in speaker_speech.split() if word.lower() not in stop_words and len(word) >= 4]

# # Tokenize, remove stop words, filter by word length, and apply lemmatization
# filtered_words = [
#     lemmatizer.lemmatize(word.lower())
#     for word in speaker_speech.split()
#     if word.lower() not in stop_words and len(word) >= 4
# ]
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
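
The lemmatization step is left commented out above. As a minimal sketch of what it does, assuming the WordNet data has been downloaded, NLTK's WordNetLemmatizer maps inflected forms back to a root:

In [ ]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Nouns are the default part of speech; pass pos="v" to lemmatize verbs
print(lemmatizer.lemmatize("promises"))         # -> promise
print(lemmatizer.lemmatize("voters"))           # -> voter
print(lemmatizer.lemmatize("running", pos="v")) # -> run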
In [ ]:
# Count the filtered words
filtered_word_counts = Counter(filtered_words)

# Generate a word cloud
wordcloud = WordCloud(width=800, height=500, background_color='white').generate_from_frequencies(filtered_word_counts)

# Save the word cloud as a PNG file
wordcloud.to_file("word_cloud.png")

# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
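Alongside the cloud, it can help to glance at the raw counts behind it:

In [ ]:
# Print the 15 most frequent stop-word-filtered words and their counts
print(filtered_word_counts.most_common(15))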
In [ ]:
# from collections import Counter
# import re

# def count_words(text):
#     # Convert text to lowercase to make the count case-insensitive
#     text = text.lower()

#     # Use regex to find all words (alphanumeric sequences)
#     words = re.findall(r'\b\w+\b', text)

#     # Use Counter to count occurrences of each word
#     word_count = Counter(words)

#     return dict(word_count)

# # Count words
# word_counts = count_words(speaker_speech)
# print(word_counts)
In [ ]:
import matplotlib.pyplot as plt
from collections import Counter

# Count the raw (unfiltered) words using Counter; stop words are still included here
word_counts = Counter(speaker_speech.split())

def plot_word_frequencies(word_counts, top_n=20):
    # Get the most common words and their counts
    most_common_words = word_counts.most_common(top_n)

    # Separate the words and their counts for plotting
    words, counts = zip(*most_common_words)

    # Plot the histogram
    plt.figure(figsize=(10, 6))
    plt.barh(words, counts, color='skyblue')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.title(f'Top {top_n} Most Frequent Words')
    plt.gca().invert_yaxis()  # Invert y-axis to have the most frequent word at the top
    plt.show()

# Plot the word frequencies
plot_word_frequencies(word_counts)
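Since word_counts above is built from a raw split of the text, common function words dominate the plot. The same helper can be pointed at the stop-word-filtered counts computed earlier:

In [ ]:
# Re-use the plotting helper with the filtered counts for a more informative view
plot_word_frequencies(filtered_word_counts)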
In [ ]:
import pandas as pd
from collections import Counter

def save_word_counts_to_csv(word_counts, filename="word_counts.csv"):
    # Convert the Counter object to a DataFrame
    word_counts_df = pd.DataFrame(word_counts.items(), columns=['Word', 'Count'])

    # Sort the DataFrame by count in descending order
    word_counts_df = word_counts_df.sort_values(by='Count', ascending=False)

    # Save the DataFrame to a CSV file
    word_counts_df.to_csv(filename, index=False)

    print(f"Word counts have been saved to {filename}")

# Save word count to file
save_word_counts_to_csv(word_counts)
Word counts have been saved to word_counts.csv
In [ ]:
!pip install -q wordcloud
In [ ]:
from wordcloud import WordCloud

# Count the raw (unfiltered) words using Counter (for comparison with the filtered cloud above)
word_counts = Counter(speaker_speech.split())

# Create a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)

# Display the word cloud using matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Remove axes
plt.show()

Sentence-level Sentiment Analysis¶

In [ ]:
!pip install -q transformers torch
In [ ]:
import csv
from transformers import pipeline

def analyze_sentence_sentiment(text, output_filename="sentence_sentiment_analysis_results.csv"):
    # Load the sentiment analysis pipeline with a BERT-based model
    sentiment_analyzer = pipeline("sentiment-analysis")

    # Split the text into sentences (naive split on periods; abbreviations and
    # ellipses will produce imperfect splits)
    sentences = text.split('.')

    # Open a CSV file to write the results
    with open(output_filename, mode="w", newline="") as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(["Sentence", "Sentiment", "Score"])

        # Analyze sentiment for each sentence and write the results to the CSV
        for sentence in sentences:
            if sentence.strip():  # Check if the sentence is not empty
                result = sentiment_analyzer(sentence.strip())
                sentiment = result[0]['label']
                score = result[0]['score']
                # Write the sentence, sentiment, and score to the CSV file
                writer.writerow([sentence.strip(), sentiment, score])

    print(f"Sentiment analysis results have been saved to '{output_filename}'")

# Call the function
analyze_sentence_sentiment(speaker_speech)
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Sentiment analysis results have been saved to 'sentence_sentiment_analysis_results.csv'
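
Two quick refinements, sketched below under the assumption that NLTK's punkt data is available: pinning the pipeline to the model it defaulted to above (so the warning goes away and results stay reproducible), and replacing the naive '.' split with a proper sentence tokenizer.

In [ ]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline

nltk.download('punkt')  # newer NLTK releases may also need 'punkt_tab'

# Pin the default model explicitly instead of relying on the pipeline's fallback
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

# sent_tokenize handles abbreviations and ellipses better than splitting on '.'
sentences = sent_tokenize(speaker_speech)
print(sentiment_analyzer(sentences[0]))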
In [ ]:
# # Analyze sentiment for each sentence using the BERT model
# for sentence in sentences:
#     if sentence.strip():  # Check if the sentence is not empty
#         result = sentiment_analyzer(sentence.strip())
#         sentiment = result[0]['label']
#         score = result[0]['score']
#         print(f"Sentence: {sentence.strip()}\nSentiment: {sentiment}, Score: {score:.4f}\n")

Named Entity Recognition & Entity Sentiment Analysis¶

In [ ]:
!pip install -q spacy
!python -m spacy download en_core_web_sm
In [ ]:
import spacy
import csv
from transformers import pipeline

def extract_entities_with_sentiment(text, entity_sentiment_file="entity_sentiments.csv"):
    # Load the spaCy model for English
    nlp = spacy.load("en_core_web_sm")

    # Load the sentiment analysis pipeline with a BERT-based model
    sentiment_analyzer = pipeline("sentiment-analysis")

    # Process the text with spaCy
    doc = nlp(text)

    # Entity-Level Sentiment Analysis
    with open(entity_sentiment_file, mode="w", newline="") as entity_file:
        entity_writer = csv.writer(entity_file)
        # Write the header without the "Sentence" column
        entity_writer.writerow(["Entity", "Label", "Sentiment", "Score"])

        for ent in doc.ents:
            # Extract the sentence containing the entity
            sentence = ent.sent.text
            # Perform sentiment analysis on the sentence
            sentiment_result = sentiment_analyzer(sentence)
            sentiment = sentiment_result[0]['label']
            score = sentiment_result[0]['score']
            # Write the entity, its label, and the sentiment to the CSV file (without the sentence)
            entity_writer.writerow([ent.text, ent.label_, sentiment, score])

    print(f"Entity-level sentiments have been saved to '{entity_sentiment_file}'")

# Call the function
extract_entities_with_sentiment(speaker_speech)
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Entity-level sentiments have been saved to 'entity_sentiments.csv'
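
With the per-mention results saved, a natural next step is to aggregate sentiment per entity. A minimal sketch with pandas, assuming the entity_sentiments.csv written above:

In [ ]:
import pandas as pd

entity_df = pd.read_csv("entity_sentiments.csv")

# Fold the label into a signed score so positive and negative mentions can be averaged
entity_df["signed_score"] = entity_df.apply(
    lambda row: row["Score"] if row["Sentiment"] == "POSITIVE" else -row["Score"],
    axis=1,
)

# Mention count and average sentiment per entity, most-mentioned first
summary = (
    entity_df.groupby("Entity")
    .agg(mentions=("Entity", "size"), avg_sentiment=("signed_score", "mean"))
    .sort_values("mentions", ascending=False)
)
print(summary.head(20))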