I'm really interested in sentiment analysis, and one of the packages I can use is the sentiment package along with Rstem. I'm currently using R 3.5.5 and RStudio v1.2.5033 because the sentiment and Rstem packages only work on this R version for me.
I know it's an old package and maybe a little outdated, but I'm still curious about how it works, especially the classify emotion function.
The only explanation I could find is that it uses a naive Bayes classifier trained on Carlo Strapparava and Alessandro Valitutti's emotions lexicon. Here is the script:
# Create Matrix
#
# Build a document-term matrix from one or more text columns.
#
# Arguments:
#   textColumns        character vector, or a matrix / data frame whose
#                      columns are pasted together row by row so that each
#                      row becomes one document.
#   language           passed to the corpus reader, to the term-matrix
#                      control list and to the stemmer.
#   minDocFreq, minWordLength, removeNumbers, removePunctuation,
#   removeStopwords, stripWhitespace, toLower, weighting
#                      forwarded unchanged in the `control` list given to
#                      DocumentTermMatrix().
#   removeSparseTerms  when > 0, sparsity threshold handed to
#                      removeSparseTerms().
#   stemWords          when TRUE, a wordStem()-based stemmer is added to the
#                      control list.
#
# Returns the object produced by DocumentTermMatrix() (after optional
# sparse-term removal).
create_matrix <- function(textColumns, language="english", minDocFreq=1, minWordLength=3, removeNumbers=TRUE, removePunctuation=TRUE, removeSparseTerms=0, removeStopwords=TRUE, stemWords=FALSE, stripWhitespace=TRUE, toLower=TRUE, weighting=weightTf)
{
  # Split a document into words and stem each one. The separator must be a
  # single space: an empty separator makes strsplit() return individual
  # characters, so the stemmer would never see a whole word.
  stem_words <- function(x) {
    tokens <- strsplit(x, " ")
    wordStem(tokens[[1]], language = language)
  }

  # NOTE(review): `minWordLength` and `minDocFreq` are option names from old
  # tm releases; confirm that the installed tm version still honours them.
  control <- list(
    language = language,
    tolower = toLower,
    removeNumbers = removeNumbers,
    removePunctuation = removePunctuation,
    stripWhitespace = stripWhitespace,
    minWordLength = minWordLength,
    stopwords = removeStopwords,
    minDocFreq = minDocFreq,
    weighting = weighting
  )

  # Slot the stemmer in after the sixth option (minWordLength), i.e. ahead of
  # the stop-word setting.
  if (stemWords == TRUE) {
    control <- append(control, list(stemming = stem_words), after = 6)
  }

  # Join the columns of every row with a space so that the last word of one
  # column and the first word of the next stay separate tokens.
  trainingColumn <- apply(as.matrix(textColumns), 1, paste, collapse = " ")

  # Re-encode each document; bytes that cannot be converted are kept in
  # escaped form (sub = "byte"). vapply() guarantees a character vector even
  # for empty input.
  trainingColumn <- vapply(
    as.vector(trainingColumn, mode = "character"),
    iconv,
    character(1),
    to = "UTF8",
    sub = "byte"
  )

  corpus <- Corpus(VectorSource(trainingColumn), readerControl = list(language = language))
  dtm <- DocumentTermMatrix(corpus, control = control)
  if (removeSparseTerms > 0) {
    dtm <- removeSparseTerms(dtm, removeSparseTerms)
  }

  gc()
  return(dtm)
}
# Classify Emotion
#
# Score every document against the emotion lexicon read from the `sentiment`
# package and report the best-scoring emotion.
#
# No model is fitted to `textColumns`: the only statistics used are the
# number of lexicon entries per emotion and the total number of entries.
# Each document is scored independently by looking its terms up in the
# lexicon.
#
# Arguments:
#   textColumns  text handed to create_matrix(); one document per row.
#   algorithm    "bayes": every lexicon hit for emotion k adds
#                abs(log(prior / n_k)) and every emotion additionally gets
#                abs(log(n_k / total)), where n_k is the number of lexicon
#                entries for k and total is the number of lexicon rows.
#                Any other value: every hit adds 1 and every emotion
#                additionally gets 0.000001.
#   prior        numerator of the per-hit weight used by "bayes".
#   verbose      when TRUE, print each document index and each lexicon hit.
#   ...          forwarded to create_matrix().
#
# Returns a matrix with one row per document and the columns ANGER, DISGUST,
# FEAR, JOY, SADNESS, SURPRISE and BEST_FIT. BEST_FIT is NA when "disgust"
# wins with a score below 3.09234 + 0.01. An input without documents yields
# a zero-row matrix with the same columns.
Classify_emotion <- function(textColumns,algorithm="bayes",prior=1.0,verbose=FALSE,...){
  emotion_names <- c("anger", "disgust", "fear", "joy", "sadness", "surprise")
  result_columns <- c("ANGER","DISGUST","FEAR","JOY","SADNESS","SURPRISE","BEST_FIT")

  dtm <- create_matrix(textColumns, ...)

  # system.file() returns "" when the file is missing. On case-sensitive file
  # systems the lexicon may live under a lower-case "data" directory, so try
  # that before giving up with a clear error.
  lexicon_file <- system.file("Data/emotions.csv.gz", package = "sentiment")
  if (!nzchar(lexicon_file)) {
    lexicon_file <- system.file("data/emotions.csv.gz", package = "sentiment")
  }
  if (!nzchar(lexicon_file)) {
    stop("emotions.csv.gz not found in package 'sentiment'", call. = FALSE)
  }
  lexicon <- read.csv(lexicon_file, header = FALSE)

  # Column 1 holds the term, column 2 the emotion label. Split the terms by
  # emotion once instead of re-filtering the lexicon for every word.
  lexicon_terms <- as.character(lexicon[, 1])
  lexicon_labels <- as.character(lexicon[, 2])
  terms_by_emotion <- lapply(
    setNames(emotion_names, emotion_names),
    function(key) lexicon_terms[which(lexicon_labels == key)]
  )
  entry_counts <- vapply(terms_by_emotion, length, integer(1))
  total_entries <- nrow(lexicon)

  # Per-hit weight and per-emotion baseline depend only on the lexicon and
  # the chosen algorithm, so compute them once.
  if (algorithm == "bayes") {
    hit_weight <- abs(log(1.0 * prior / entry_counts))
    baseline <- abs(log(entry_counts / total_entries))
  } else {
    hit_weight <- rep(1.0, length(emotion_names))
    baseline <- rep(0.000001, length(emotion_names))
  }
  hit_weight <- setNames(as.numeric(hit_weight), emotion_names)
  baseline <- as.numeric(baseline)

  n_docs <- nrow(dtm)
  if (n_docs == 0) {
    return(matrix(
      character(0),
      nrow = 0,
      ncol = length(result_columns),
      dimnames = list(NULL, result_columns)
    ))
  }

  rows <- vector("list", n_docs)
  for (i in seq_len(n_docs)) {
    if (verbose) print(paste("DOCUMENT",i))
    scores <- setNames(numeric(length(emotion_names)), emotion_names)

    # Terms that occur at least once in document i.
    words <- findFreqTerms(dtm[i, ], lowfreq = 1)
    for (word in words) {
      for (key in emotion_names) {
        # pmatch() accepts an exact match or a unique partial match and
        # yields 0 otherwise.
        if (pmatch(word, terms_by_emotion[[key]], nomatch = 0) > 0) {
          if (verbose) {
            print(paste("WORD:",word,"CAT:",key,"SCORE:",hit_weight[[key]]))
          }
          scores[[key]] <- scores[[key]] + hit_weight[[key]]
        }
      }
    }
    scores <- scores + baseline

    best_fit <- emotion_names[which.max(scores)]
    # NOTE(review): 3.09234 is presumably the "disgust" score of a document
    # without any lexicon hit under the default settings; confirm against
    # the shipped lexicon.
    if (best_fit == "disgust" && scores[["disgust"]] - 3.09234 < .01) {
      best_fit <- NA
    }

    # Mixing numbers with the label makes the row character (numeric when
    # the label is NA), exactly as before.
    rows[[i]] <- c(unname(scores), best_fit)
  }

  documents <- do.call(rbind, rows)
  colnames(documents) <- result_columns
  return(documents)
}
I've learned a little about text classification using a naive Bayes classifier.
What I don't understand is how Carlo Strapparava and Alessandro Valitutti's emotions lexicon works, especially in this sentiment package.
Does all the data we pass to these functions become test data, or does it all become training data? Or is the lexicon itself the training data?
Could someone break down the process? I really need help with this.
*Sorry for my bad English.