I have a problem which I could not solve through the already asked questions in this Forum. Further on in this Question you will find the Google Link to my Data-Set and the Code I wrote.
Now to my Problem: In Task 2.1, I am asked to code a two stage sampling method, which is more efficient than my single stage sample, it only works if I code
surveysample2stage <- function(data, m = 30, clustvar = "station", prob1 = NULL, nh = NULL, replace1 = FALSE, replace2 = TRUE)
If I code (replace2 = FALSE instead of TRUE)
surveysample2stage <- function(data, m = 30, clustvar = "station", prob1 = NULL, nh = NULL, replace1 = FALSE, replace2 = FALSE)
It won't work and showing me the Error "cannot take a sample larger than the population when 'replace = FALSE"
Maybe I am Blind and don't see the obvious or I am not that advanced in R yet to figure out the solution. Would be Awesome if one of you has an idea!
https://drive.google.com/file/d/1PQHzACdN0G29IkIhvWL8Y3m_G69YWhr-/view?usp=sharing
library(survey)
#check for needed packages
load("runoff_updated.RData")
View(runoff)
#load and view the requiered data
runoff$id <- c(seq(1, dim(runoff)[1], 1))
length(unique(runoff$station))
#get number of polling stations with the lenght function
### 1.2 Draw a cluster sample, based on your clustering variable "station" with $m=30$ clusters.
surveysample1stage <- function(data, m = 30, clustvar = "station", replace = FALSE){
# Store the number of clusters within the population in "H_cls"
H_cls <- length(unique(data[,clustvar]))
# Draw a Simple Random sample of the specified clusters number "m"
clustselect <- sample(unique(data[,clustvar]), size = m, replace = replace)
# Extract all data of the sampled cluster from our specified dataset
# Note: The %in% operator is used to verify if an element belongs to a vector
clustsample <- data[data[,clustvar] %in% clustselect,]
# Attach number of clusters to our extraction
clustsample$H <- H_cls
# Return the sample
return(clustsample)
}
# Draw a cluster sample using the function
set.seed(33)
sample1 <- surveysample1stage(runoff, m = 30, clustvar = "station")
### 1.3 Store this sample as a surveydesign-object. Make sure to have all parameters specified according to cluster-sampling.
sample1_design <- svydesign(id = ~station, data = sample1, fpc = ~H)
### 1.4 Estimate the population total of Sarkozy voters and its 95% confidence interval (Hint: use the confint function on your estimated population total). Interpret.
sum(sample1$vote == 0)
mean(sample1$vote == 0)
#looking in our sample data frame who voted for sarkozy (vote == 0)
confint(svymean(x = ~vote == 0, design = sample1_design))
#Interpretation: Relativly Large Area where our svymean can be (between 0,477 and 0,573)
### 1.5 Compare this estimate to the true population parameter. Calculate the design effect and Interpret.
sum(runoff$vote == 0)
mean(runoff$vote == 0)
#looking in our data frame, getting the total number of people who voted sarkozy and their percentage in the total population
#Interpretation: Yes the true population parameter is between the lower and upper bound, but that shows us again how large our sample confidence interval is!
#Design Effect (DEFF):
sample1_est <- svymean(x = ~vote == 0, design = sample1_design, deff = TRUE)
sample1_deff <- attr(sample1_est, 'deff')[1]
sample1_deff
# or just:
sample1_est
################
## Exercise 2 ##
################
### 2.1 Develop and justify an alternative two-stage sampling strategy to increase the estimate's efficiency, while keeping the overall sample size $n$ fixed at the size of the cluster sample used before.
surveysample2stage <- function(data, m = 30, clustvar = "station", prob1 = NULL, nh = NULL, replace1 = FALSE, replace2 = TRUE)
{
# Store number of clusters in the population to H_cls
H_cls <- length(unique(data[,clustvar]))
# Sample the clusters in stage 1
clustselect <- sample(unique(data[,clustvar]), prob = prob1, size = m, replace = replace1)
# Extract the data, using the %in% operator to verify if an element belongs to a vector
clustsample <- data[data[,clustvar] %in% clustselect,]
# use unique values of the defined cluster variable as clustnames
clustnames <- sort(unique(clustsample[,clustvar]))
# create empty list object for subsamples to be stored in
subsamples <- list()
# loop: for each cluster ...
for (h in 1:m)
{
# ... select elements that belong to the sampled clusters
obs <- which(clustsample[,clustvar] == clustnames[h])
# ... draw SRS of given size from those elements, store in empty list
subsamples[[h]] <- clustsample[sample(obs, size = nh[h], replace = replace2),]
# ... add length of sampled clusters to this list
subsamples[[h]]$N_h <- length(obs)
}
# convert list of data frames into single data frame;
# note: do.call() constructs and executes a function call from a function and a list of arguments to be passed to it.
subsamples_df <- do.call("rbind", subsamples)
# attach number of clusters in the population to data frame
subsamples_df$H <- H_cls
# return sample data frame
return(subsamples_df)
}
# Population size
N <- dim(runoff)[1]
# Cluster population size
N_h <- table(runoff$station)
# Number of clusters to be sampled
m <- 30
# Overall sample size
n <- dim(sample1)[1]
# Stage 1 inclusion probabilities propotionate to size (PPS)
prob1_pps <- N_h*m/N
# Constant sample sizes at stage 2
n_h_eq <- c(round(rep(n/m, m)))
# Draw the sample
set.seed(33)
sample2 <- surveysample2stage(data = runoff, clustvar = "station", m = m, prob1 = prob1_pps, replace1 = F, nh = n_h_eq)
sample2$p1 <- sample2$N_h*m/N
sample2$p2 <- n/m/sample2$N_h
sample2_design <- svydesign(ids = ~station+id, probs = ~p1+p2, pps = 'brewer', data = sample2)
### 2.2 Estimate the population total of Sarkozy voters and the design effect. Compare the results to those obtained under Exercise 1. Interpret.
sum(sample2$vote == 0)
mean(sample2$vote == 0)
sample2_est<- svymean(x = ~vote == 0, design = sample2_design, deff = TRUE)
sample2_deff <- attr(sample2_est, 'deff')[1]
sample2_deff
# or just:
sample2_est
#confidence Interval of sample2
confint(svymean(x = ~vote == 0, design = sample2_design))