Hey Stack Overflow peeps,
I'm currently analysing some small text data using a dictionary I constructed.
I've tried applying my dictionary to my DFM with the dfm_lookup function, but I cannot get any useful output.
Does anyone know what's wrong?
My dictionary has 24 keys (categories).
My code is:
## **FIRST I GATHER DATA** ###
library(rvest)
library(data.table)

# Listing whose query string asks for 2017-01-01 to 2019-11-30 with
# pageSize=3067, i.e. everything on a single page.
url <- "https://www.ft.dk/da/udvalg/udvalgene/sau/dokumenter/alle_spoergsmaal?startDate=20170101&endDate=20191130&pageSize=3067"
# Paged listing (pageSize=1000); a page number is meant to be appended.
url2 <- "https://www.ft.dk/da/udvalg/udvalgene/sau/dokumenter/alle_spoergsmaal?committeeAbbreviation=SAU&startDate=20051031&endDate=20191031&pageSize=1000&totalNumberOfRecords=18656&pageNumber="
n <- 2:19 # defines the number of pages from the website

# Download and parse the single-page listing once.
p <- read_html(url)

# Pull the three columns out of the parsed page. Presumably each selector
# yields one value per table row; data.frame() below stops with an error if
# the three vectors end up with different lengths.
spm <- p %>%
  html_nodes(".highlighted+ .column-documents .column-documents__icon-text") %>%
  html_text(trim = TRUE)
spmstillerforsøg <- p %>%
  html_nodes("td.column-documents.hidden-xs") %>%
  html_text(trim = TRUE)
dato <- p %>%
  html_nodes("td[data-title='Dato']") %>%
  html_text(trim = TRUE)

# Only `url` is scraped. Looping over `n` and calling
# read_html(paste0(url2, i)) without extracting anything from the result just
# costs 18 extra downloads, so no such loop is run here. To include the paged
# listing, repeat the extraction above for every paste0(url2, n) and append
# each resulting data frame to `dfs`.
#links <- p %>% html_nodes(xpath= "//a[@class='column-documents__link']") %>% html_attr("href")
df <- data.frame(
  spm = spm,
  spoerger = spmstillerforsøg,
  dato = dato,
  #fulltext = NA,
  stringsAsFactors = FALSE
)

# Kept as a list so that further pages can be appended before binding.
dfs <- list(df)
dfnew <- as.data.frame(rbindlist(dfs))
save(dfnew, file = "rigtigdata.Rda")
## **THEN I CREATE MY DFM AND CREATE MY DICTIONARY AND TRY AND APPLY IT**
library(quanteda)

# One document per row of `dfnew`; `spm` is the text and the remaining columns
# (`spoerger`, `dato`) are kept as document variables.
mitcorpus <- corpus(dfnew, text_field = "spm")

# Tokenise the corpus rather than the bare text column, so the document
# variables travel with the tokens and the dfm built from them.
# NOTE(review): in quanteda >= 2.0 `remove_hyphens` is reportedly replaced by
# `split_hyphens` - confirm against the installed version.
texts <- tokens(
  mitcorpus,
  what = "word",
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE,
  remove_hyphens = TRUE,
  remove_url = TRUE,
  verbose = TRUE
)

# Normalise: lower-case, drop Danish stopwords, then stem with the Danish
# stemmer. After this step every feature is a lower-case stem, so anything
# matched against the dfm has to be in the same form.
texts <- tokens_tolower(texts)
texts <- tokens_remove(texts, stopwords("danish"))
quanteda_options("language_stemmer" = "danish")
texts <- tokens_wordstem(texts)

# get actual dfm from tokens
txt.mat <- dfm(texts)
## **HERE I CREATE MY DICTIONARY **
# Topic dictionary: 24 keys, each mapped to the word forms that count as a hit
# for that topic. The key names become the feature names of any dfm produced
# by looking this dictionary up.
#
# NOTE(review): "PSO-afgift" contains a hyphen, while the tokens in this script
#   are built with remove_hyphens enabled; that entry may never match a single
#   token - confirm.
# NOTE(review): "gældsindrivelse" and "indrivelsen" are written with one "d"
#   but "skatteinddrivelse" with two, and "restanceinddrivelsesmyndighede"
#   looks truncated - confirm the intended spellings.
ordbord <- dictionary(list(
  reorganisering = c("årsværker", "personaleændringer", "personale"),
  grænsekontrol = c("Grænsekontrol", "grænsekontrollen"),
  afskrivning_restancer = c("skatterestancer", "restanceinddrivelsesmyndighede"),
  leasing = c("leasingbiler", "leasing"),
  sambeskatning = c("sambeskatning"),
  # "afgiftsstigning" was listed twice; one copy is enough.
  afgifter = c(
    "afgiften", "afgifter", "afgiftsstigning", "sukkerafgift", "tobaksafgift",
    "registreringsafgift", "PSO-afgift", "afgiftslettelser"
  ),
  gældssanering = c("gældssanering"),
  indrivelse = c("skatteinddrivelse", "gældsindrivelse", "indrivelsen"),
  indsatsplaner = c("indsatsplaner"),
  henvendelser = c("henvendelser", "henvendelse", "henvendelsen"),
  EFI = c("EFI"),
  topskat = c("topskat", "topskattelettelser", "topskatten"),
  skattelettelser = c("skattelettelser"),
  udbytteskat = c("udbytteskat", "udbyttesagen", "udbytteskandalen"),
  boligbeskatning = c(
    "boligskat", "ejendomsvurderinger", "boligbeskatning", "boligskatten",
    "grundværdi"
  ),
  moms = c("moms", "momsfritagelse", "nulmoms"),
  EU = c("EU", "ECOFIN", "toldsystem", "toldsystemet"),
  virksomhedskat = c("selskabsskat", "virksomhedsskat"),
  arveafgift = c(
    "arveafgift", "generationsskifte", "virksomhedsoverdragelse", "familieejede"
  ),
  eksport = c("eksport", "bileksport"),
  kontrol = c("skattekontrol", "kontrol", "kontrollen"),
  merprovenue = c("merprovenue", "skattehuller"),
  fradrag = c("beskæftigelsesfradrag", "fradrag", "skattefradrag", "fradraget"),
  skatteunddragelse = c("sort", "skatteunddragelse")
))
# The tokens behind `txt.mat` were stemmed with the Danish stemmer, so the
# dictionary values are stemmed the same way before the lookup; an inflected
# entry is otherwise unlikely to equal any stemmed feature. Keys are unchanged.
ordbord_stemmet <- dictionary(lapply(
  as.list(ordbord),
  function(ord) unique(char_wordstem(ord, language = "danish"))
))

# Collapse the dfm to one column per dictionary key. The dfm built in this
# script is `txt.mat`; no object called `tm` is ever created.
mindfmmedordbog <- dfm_lookup(txt.mat,
  dictionary = ordbord_stemmet)

# Printing a dfm may show only its dimensions and sparsity; the counts are
# stored in the object and can be pulled out as below.
mindfmmedordbog

# How many times each dictionary key is hit across all documents.
colSums(mindfmmedordbog)

# Per-document counts as an ordinary data frame (one row per document).
head(convert(mindfmmedordbog, to = "data.frame"))

# Where the hits occur: every match shown with its surrounding words.
head(kwic(texts, pattern = ordbord_stemmet))
This just returns: Document-feature matrix of: 3,067 documents, 24 features (99.3% sparse).
I'd appreciate any info on how to get further, especially on how I can find out how many times and when my words are being used.
Again, all help is much appreciated - I'm still learning.
Thanks in advance!