How can I do this automatically for all columns of the object x, using base R or Base SAS?
Here is an example using R:
# Sample data ----
set.seed(123)
x <- data.frame(
  var1 = runif(100),
  var2 = runif(100),
  flag = rbinom(100, size = 1, prob = 0.7)
)
x

# Percentile break points (0%, 5%, ..., 100%) of every column ----
# vapply() iterates over the columns directly and guarantees a numeric matrix
# with one row per probability; apply() would first coerce the whole data
# frame to a matrix.
probs <- seq(0, 1, by = 0.05)
r <- vapply(
  x,
  function(col) quantile(col, probs = probs),
  numeric(length(probs))
)
res <- data.frame(item_id = rownames(r), r, row.names = NULL)

# Number the percentile rows so they can be matched against bin indices
res$group <- seq_along(res$item_id)
res

# Bin of each var1 observation ----
# findInterval() returns the index of the last break point <= the value, i.e.
# the row of `res` (and therefore the `group`) whose var1 percentile starts
# the interval the observation falls into.
x$bin_var1 <- findInterval(x$var1, res$var1)
x

# Per-bin summary of the dummy flag column ----
# no = observations with flag == 0; yes = observations with flag == 1;
# total = observations in the bin; rate_var1 = yes / total
op <- aggregate(
  x$flag,
  by = list(x$bin_var1),
  FUN = function(flag) {
    c(sum(flag == 0), sum(flag == 1), length(flag), sum(flag == 1) / length(flag))
  }
)
# aggregate() stores the four statistics in a single matrix column;
# do.call(data.frame, ...) spreads them into four ordinary columns.
op1 <- do.call(data.frame, op)
colnames(op1) <- c("group", "no", "yes", "total", "rate_var1")
op1

# Merge percentile table and per-bin summary ----
final <- merge(res, op1, by = "group")
final
In this SAS solution I'm missing how to include the rate, i.e. the ratio of observations with flag=1 to all observations.
In R I'm using the findInterval function to assign each observation to a bin and then calculate the rate, sum(flag == 1) / length(flag), per bin.
This is the part I'm not sure how to do in SAS.
Example:
/* Build the sample data set X from the 19 inline records below. */
data x;
/* Declare GROUPS as a 12-character variable before INPUT reads it, so the
   longest value ("no_constrict", 12 characters) is stored in full. */
length groups $12;
/* List input: one blank-delimited record per line, read as
   GROUPS (character) followed by the numeric variables Var1, Var2 and Flag. */
input groups Var1 Var2 Flag;
datalines;
constrict 3.50 1.09 1
constrict 0.75 1.50 0
constrict 0.70 3.50 1
no_constrict 1.10 1.70 1
no_constrict 0.90 0.45 1
no_constrict 0.55 2.75 1
no_constrict 1.40 2.33 0
constrict 2.30 1.64 1
constrict 0.85 1.415 0
no_constrict 1.80 1.80 1
no_constrict 0.95 1.36 1
no_constrict 1.50 1.36 0
constrict 0.60 1.50 0
constrict 0.95 1.90 0
constrict 1.60 0.40 1
constrict 2.35 0.03 1
no_constrict 1.10 2.20 0
constrict 0.80 3.33 0
no_constrict 0.75 1.90 0
;
/* Percentile cut-offs (0th, 10th, ..., 100th) of VAR1, VAR2 and FLAG,
   computed separately for each level of GROUPS.
   NOPRINT suppresses the printed report; the results go to data set RES,
   whose percentile columns are named <prefix><percentile>, e.g. var1_0,
   var1_10, ..., var1_100 (prefixes are matched to the VAR list by position). */
proc univariate data=x noprint;
  class groups;
  var var1 var2 flag;
  output out=res
         pctlpts=0 to 100 by 10
         pctlpre=var1_ var2_ flag_;
/* Explicit step boundary: the step executes here instead of waiting for
   whatever PROC or DATA step happens to follow. */
run;
/* Attach the per-group percentile cut-offs from RES to every observation
   of X, giving OP1 with one row per observation of X.
   The join key of RES is renamed on the way in and dropped on the way out:
   selecting a.* and b.* with GROUPS present on both sides would otherwise
   raise a "variable already exists" warning when the table is created. */
proc sql;
  create table op1(drop=_groups) as
  select a.*,
         b.*
  from x as a
  left join res(rename=(groups=_groups)) as b
    on a.groups = b._groups;
quit;