Quantcast
Channel: Active questions tagged r - Stack Overflow
Viewing all articles
Browse latest Browse all 201867

"Cannot take a sample larger than the population when 'replace = FALSE" [closed]

$
0
0

I have a problem which I could not solve through the already asked questions in this Forum. Further on in this Question you will find the Google Link to my Data-Set and the Code I wrote.

Now to my Problem: In Task 2.1, I am asked to code a two stage sampling method, which is more efficient than my single stage sample, it only works if I code

surveysample2stage <- function(data, m = 30, clustvar = "station", prob1 = NULL, nh = NULL, replace1 = FALSE, replace2 = TRUE)

If I code (replace2 = FALSE instead of TRUE)

surveysample2stage <- function(data, m = 30, clustvar = "station", prob1 = NULL, nh = NULL, replace1 = FALSE, replace2 = FALSE)

It won't work and showing me the Error "cannot take a sample larger than the population when 'replace = FALSE"

Maybe I am Blind and don't see the obvious or I am not that advanced in R yet to figure out the solution. Would be Awesome if one of you has an idea!

https://drive.google.com/file/d/1PQHzACdN0G29IkIhvWL8Y3m_G69YWhr-/view?usp=sharing

library(survey)
#check for needed packages

load("runoff_updated.RData") 
View(runoff)
#load and view the requiered data 

runoff$id <- c(seq(1, dim(runoff)[1], 1))

length(unique(runoff$station)) 
#get number of polling stations with the lenght function


### 1.2 Draw a cluster sample, based on your clustering variable "station"  with $m=30$ clusters.

surveysample1stage <- function(data, m = 30, clustvar = "station", replace = FALSE){

  # Store the number of clusters within the population in "H_cls"
  H_cls <- length(unique(data[,clustvar]))

  # Draw a Simple Random sample of the specified clusters number "m"
  clustselect <- sample(unique(data[,clustvar]), size = m, replace = replace)

  # Extract all data of the sampled cluster from our specified dataset
  # Note: The %in% operator is used to verify if an element belongs to a vector
  clustsample <- data[data[,clustvar] %in% clustselect,]

  # Attach number of clusters to our extraction
  clustsample$H <- H_cls

  # Return the sample
  return(clustsample)
}

# Draw a cluster sample using the function

set.seed(33)
sample1 <- surveysample1stage(runoff, m = 30, clustvar = "station")


### 1.3 Store this sample as a surveydesign-object. Make sure to have all parameters specified according to cluster-sampling.

sample1_design <- svydesign(id = ~station, data = sample1, fpc = ~H)


### 1.4 Estimate the population total of Sarkozy voters and its 95% confidence interval (Hint: use the confint function on your estimated population total). Interpret.

sum(sample1$vote == 0) 
mean(sample1$vote == 0)
#looking in our sample data frame who voted for sarkozy (vote == 0)

confint(svymean(x = ~vote == 0, design = sample1_design))

#Interpretation: Relativly Large Area where our svymean can be (between 0,477 and 0,573)


### 1.5 Compare this estimate to the true population parameter. Calculate the design effect and Interpret. 

sum(runoff$vote == 0)
mean(runoff$vote == 0)
#looking in our data frame, getting the total number of people who voted sarkozy and their percentage in the total population

#Interpretation: Yes the true population parameter is between the lower and upper bound, but that shows us again how large our sample confidence interval is! 

#Design Effect (DEFF):

sample1_est <- svymean(x = ~vote == 0, design = sample1_design, deff = TRUE)

sample1_deff <- attr(sample1_est, 'deff')[1]
sample1_deff

# or just:
sample1_est


################
## Exercise 2 ##
################

### 2.1 Develop and justify an alternative two-stage sampling strategy to increase the estimate's efficiency, while keeping the overall sample size $n$ fixed at the size of the cluster sample used before.

surveysample2stage <- function(data, m = 30, clustvar = "station", prob1 = NULL, nh = NULL, replace1 = FALSE, replace2 = TRUE)
{  

  # Store number of clusters in the population to H_cls
  H_cls <- length(unique(data[,clustvar]))

  # Sample the clusters in stage 1
  clustselect <- sample(unique(data[,clustvar]), prob = prob1, size = m, replace = replace1)

  # Extract the data, using the %in% operator to verify if an element belongs to a vector
  clustsample <- data[data[,clustvar] %in% clustselect,]   

  # use unique values of the defined cluster variable as clustnames
  clustnames <- sort(unique(clustsample[,clustvar]))  

  # create empty list object for subsamples to be stored in
  subsamples <- list()  

  # loop: for each cluster ...  
  for (h in 1:m) 
  {

    # ... select elements that belong to the sampled clusters
    obs <- which(clustsample[,clustvar] == clustnames[h])  

    # ... draw SRS of given size from those elements, store in empty list
    subsamples[[h]] <- clustsample[sample(obs, size = nh[h], replace = replace2),]

    # ... add length of sampled clusters to this list
    subsamples[[h]]$N_h <- length(obs)

  }

  # convert list of data frames into single data frame; 
  # note: do.call() constructs and executes a function call from a function and a list of arguments to be passed to it.
  subsamples_df <- do.call("rbind", subsamples) 

  # attach number of clusters in the population to data frame
  subsamples_df$H <- H_cls  

  # return sample data frame
  return(subsamples_df)  
} 



# Population size
N <- dim(runoff)[1]

# Cluster population size
N_h <- table(runoff$station) 

# Number of clusters to be sampled
m <- 30

# Overall sample size 
n <- dim(sample1)[1]

# Stage 1 inclusion probabilities propotionate to size (PPS)
prob1_pps <- N_h*m/N

# Constant sample sizes at stage 2
n_h_eq <- c(round(rep(n/m, m)))

# Draw the sample
set.seed(33)

sample2 <- surveysample2stage(data = runoff, clustvar = "station", m = m, prob1 = prob1_pps, replace1 = F, nh = n_h_eq)

sample2$p1 <- sample2$N_h*m/N
sample2$p2 <- n/m/sample2$N_h


sample2_design <- svydesign(ids = ~station+id, probs = ~p1+p2, pps = 'brewer', data = sample2)  


### 2.2 Estimate the population total of Sarkozy voters and the design effect.  Compare the results to those obtained under Exercise 1. Interpret.

sum(sample2$vote == 0) 
mean(sample2$vote == 0)

sample2_est<- svymean(x = ~vote == 0, design = sample2_design, deff = TRUE)

sample2_deff <- attr(sample2_est, 'deff')[1]
sample2_deff

# or just:
sample2_est


#confidence Interval of sample2
confint(svymean(x = ~vote == 0, design = sample2_design))

Viewing all articles
Browse latest Browse all 201867

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>