Sentiment Analysis using R Program
Use Case / Problem Statement:
The goal of this project is to perform sentiment analysis on a dataset of social media posts (in this case, YouTube comments) to understand public opinion on a specific topic, such as a brand, product, or social issue. The analysis identifies the overall sentiment (positive, negative, or neutral) and the key words and topics driving it.
#Required Packages:
install.packages("tm") # for text mining
install.packages("SnowballC") # for text stemming
install.packages("wordcloud") # word-cloud generator
install.packages("RColorBrewer") # color palettes
install.packages("syuzhet") # for sentiment analysis
install.packages("ggplot2") # for plotting graphs
#Loading the libraries:
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(syuzhet)
library(ggplot2)
#Reading the data from the csv file:
sentimental_dataset <- read.csv("/Users/Sample Datasets Kaggle/Youtube_data/UScomments.csv",
                                header = TRUE, stringsAsFactors = FALSE)  # keep the comment text as character strings
str(sentimental_dataset)
head(sentimental_dataset, n = 10)
#Loading the comment text as a corpus so it can be cleaned (the text is assumed to be in the comment_text column):
textdocument <- Corpus(VectorSource(sentimental_dataset$comment_text))
#Replacing "/", "@" and "|" with space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
textdocument <- tm_map(textdocument, toSpace, "/")
textdocument <- tm_map(textdocument, toSpace, "@")
textdocument <- tm_map(textdocument, toSpace, "\\|")
#Converting the text to lower case:
textdocument <- tm_map(textdocument, content_transformer(tolower))
#Remove Numbers:
textdocument <- tm_map(textdocument, removeNumbers)
#Remove common English stop words:
textdocument <- tm_map(textdocument, removeWords, stopwords("english"))
#Remove punctuations:
textdocument <- tm_map(textdocument, removePunctuation)
#Eliminate extra white space:
textdocument <- tm_map(textdocument, stripWhitespace)
#Text stemming - reduce words to their root form:
textdocument <- tm_map(textdocument, stemDocument)
#Building the term-document matrix:
textdocument_matrix <- TermDocumentMatrix(textdocument)
dtm_matrix <- as.matrix(textdocument_matrix)
#Sort words by decreasing frequency:
dtm_v <- sort(rowSums(dtm_matrix), decreasing = TRUE)
dtm_d <- data.frame(word = names(dtm_v), freq = dtm_v)
#Display the top 5 most frequent words:
head(dtm_d, 5)
#Plot the most frequent words:
barplot(dtm_d[1:5, ]$freq, las = 2, names.arg = dtm_d[1:5, ]$word,
        col = "lightgreen", main = "Top 5 most frequent words in YouTube comments",
        ylab = "Word frequency")
#Generate the word cloud:
set.seed(1234)
wordcloud(words = dtm_d$word, freq = dtm_d$freq, min.freq = 5,
          max.words = 100, random.order = FALSE, rot.per = 0.40,
          colors = brewer.pal(8, "Dark2"))
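The syuzhet and ggplot2 packages loaded earlier handle the sentiment-scoring step itself. Below is a minimal sketch of that step, assuming the raw comment text sits in a column named comment_text (as used above): syuzhet's NRC lexicon scores each comment for eight emotions plus positive/negative polarity, and ggplot2 plots the totals. Scoring a very large comment file can be slow, so the sketch scores a sample first.
#Score sentiment with syuzhet (a sketch; the comment_text column name is an assumption):
comment_text <- as.character(sentimental_dataset$comment_text)
comment_sample <- head(comment_text, 5000)  # sample to keep NRC scoring fast; enlarge as needed
#NRC emotion and polarity scores for each comment:
nrc_scores <- get_nrc_sentiment(comment_sample)
#Total each emotion/polarity across the scored comments:
sentiment_totals <- data.frame(sentiment = colnames(nrc_scores),
                               count = colSums(nrc_scores))
#Plot the sentiment distribution with ggplot2:
ggplot(sentiment_totals, aes(x = reorder(sentiment, -count), y = count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(title = "Sentiment of YouTube comments (NRC lexicon)",
       x = "Sentiment", y = "Word count") +
  theme_minimal()
The positive and negative columns give the overall polarity split the problem statement asks for, while the emotion columns (joy, anger, trust, and so on) hint at what is driving that sentiment.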