I have created a function to clean the corpus
#function to clean corpus
clean_corpus <- function(df, dfcol) {
wordCorpus <- VCorpus(VectorSource(data.table(df$dfcol))) #create corpus
wordCorpus <- tm_map(wordCorpus, content_transformer(stripWhitespace)) #remove whitespace
wordCorpus <- tm_map(wordCorpus, content_transformer(tolower)) #convert to lowercase
wordCorpus <- tm_map(wordCorpus, content_transformer(removeWords), c(stopwords('english'), 'john', 'james')) #remove stopwords
#replace foreign characters like "âë" to "'" to be removed in remove punctuation step
removeForeignChar <- function(x) gsub("[^0-9A-Za-z///' ]","'" , x ,ignore.case = TRUE)
wordCorpus <- tm_map(wordCorpus, content_transformer(removeForeignChar))
wordCorpus <- tm_map(wordCorpus, content_transformer(removePunctuation)) #remove punctuations
wordCorpus <- tm_map(wordCorpus, content_transformer(removeNumbers)) #remove numbers
removeURL <- function(x) gsub("http[[:alnum:]]*",'',x) #function to remove URLs
wordCorpus <- tm_map(wordCorpus, content_transformer(removeURL)) #remove URLs
return(wordCorpus)
}
Then I apply it to a dataframe called 'usvideos' and column 'title'
titleCorpus <- clean_corpus(usvideos, title)
However, when I check titleCorpus, I find that it is empty. How do I solve this problem?