Simulation for means on the fly

A New View of Statistics

© 1997 Will G Hopkins

Go to: Previous · Contents · Search · Home

On The Fly for the Effect-Size Statistic in LONGITUDINAL STUDIES WITHOUT A CONTROL GROUP and USING SAMPLE SD TO CALCULATE EFFECT SIZE

See preamble for simulation in cross-sectional studies and longitudinal studies with no control group and using population standard deviation.

The findings here are a little different. Error in the denominator (the sample SD) pushes the required sample size UP for small-to-large effects, but it comes down again for very large effects (e.g. for r=0.9, sample size = 21, 25, 32, 31, and 15 for ES=0, 0.41, 0.88, 1.54, and 2.90 respectively). Bias is actually LOW for small-to-moderate effects, but it is within 5% of the confidence interval (for reliability=0.9, which is all I've looked at systematically). For very large effects bias is negligible, but the lower and upper confidence limits are both higher than the thresholds on the scale, at least partly because the approximation I use for the confidence interval is no longer so good at such high values of ES. All this is for nmax1=20 and nmax2=40. So it looks like it's a winner.

I have yet to write a simulation for the case of a control group, but I have no doubt the method will work. Meanwhile, at the bottom of this page I have run a short simulation to check that the formulae for confidence limits and intervals adapted from the Becker formulae are correct or accurate in this case.

 
/* Listing layout: 85-character lines, 30-line pages. */
options linesize=85 pagesize=30;
 
%macro whatever;
  /* Build dat1, the starting sample: &trialn simulated trials, each with
     &startn subjects measured twice (y1 = first test, y2 = second test).
     Reads the macro variables trialn, startn, r and es. */
  data dat1;
    do trial = 1 to &trialn;
      do id = 1 to &startn;
        /* component shared by both of the subject's scores */
        true = rannor(0);
        /* each score = sqrt(r)*shared part + sqrt(1-r)*fresh random part */
        y1 = sqrt(&r)*true + sqrt(1-&r)*rannor(0);
        /* the second score is additionally shifted by &es */
        y2 = sqrt(&r)*true + sqrt(1-&r)*rannor(0) + &es;
        output;
      end;
    end;
  run;
%mend whatever;
 
/* Settings for the simulation; the macro call on the last line builds dat1. */
%let trialn = 500;   /* number of simulated trials */
%let r      = 0.9;   /* weight of the shared component in y1 and y2 */
%let es     = 0.0;   /* shift added to the second test score */
%let startn = 10;    /* subjects per trial in the starting sample */
%let nmax1  = 20;    /* total size limit for 1st iteration */
%let nmax2  = 40;    /* total size limit for 2nd or more iterations */
%whatever;
 
/* 1st iteration.
   Keeps a tagged copy of the starting sample (dat0), estimates the effect
   size and its confidence interval for every trial, and writes to dat2 the
   extra subjects each trial needs (capped at a total of &nmax1). */

/* Copy of the starting sample, tagged for the final comparison. */
data dat0;
  set dat1;
  dataset="initial";
run;

/* One row per subject per test, the layout the two-way ANOVA needs. */
data datlong;
  set dat0;
  y=y1; test=1; output;
  y=y2; test=2; output;
  drop y1 y2;
run;

proc anova noprint data=datlong outstat=datanova;
  class id test;
  model y=id test;
  by trial;
run;

/* Keep only the row for the id effect and turn its F ratio into icc. */
data datanova;
  set datanova;
  /* Compare in upper case: the case of the effect name stored in _SOURCE_
     is not guaranteed to be "ID", and a mismatch would silently leave icc
     missing for every trial. */
  if upcase(_source_)="ID";
  icc=(f-1)/(f+1);
  keep trial icc;
run;

/* Per-trial n, means (kept under the names y1 and y2) and variances. */
proc means noprint data=dat1;
  var y1 y2;
  output out=dat n=n mean= var=vary1 vary2;
  by trial;
run;

data dat2;
  merge dat datanova;
  by trial;
  iter=1;
  /* difference in means over the root of the average of the two variances */
  samplees=(y2-y1)/sqrt((vary1+vary2)/2);
  crrctes=samplees*(1-3/(4*n-1));
  sdes=sqrt(2*(1-icc)/n+crrctes**2/2/(n-1));
  confint=2*1.96*sdes;
  /* Alternative kept from the original, based on crrctes instead of samplees:
     cipred = 6.163550E-3*crrctes**4 + 1.547127E-1*crrctes**2 + 4.029553E-1; */
  cipred = 6.163550E-3*samplees**4 + 1.547127E-1*samplees**2 + 4.029553E-1;
  /* Only trials whose current interval is wider than cipred get more subjects. */
  if cipred<confint then do;
    nnew=round((n-1)*(confint/cipred)**2)-n+1;
    /* cap the total sample size for this iteration */
    if nnew+n>&nmax1 then nnew=&nmax1-n;
    if nnew then do;
      /* The explicit OUTPUT means dat2 holds only the newly simulated
         subjects; y1 and y2 (the means until now) are overwritten. */
      do id=n+1 to nnew+n;
        true=rannor(0);
        y1=sqrt(&r)*true+sqrt(1-&r)*rannor(0);
        y2=sqrt(&r)*true+sqrt(1-&r)*rannor(0)+&es;
        output;
      end;
    end;
  end;
  keep trial y1 y2 id nnew iter;
run;

/*
proc univariate noprint;
var samplees;
output mean=mean pctlpre=Q pctlpts=2.5 50 97.5;
title "Sampling distn for es in x-over, r=&r es=&es n=&startn";
title2 "for sample effect size";
 
proc print;
 
proc means mean std min max maxdec=2 data=dat2;
var samplees crrctes confint cipred;
title "Stats for es in x-over, r=&r es=&es n=&startn";
 
run;
*/
 
 
/* 2nd iteration.
   Appends the subjects added so far (dat2) to dat1, re-estimates the effect
   size and confidence interval per trial, and writes the next batch of extra
   subjects to dat2 (capped at a total of &nmax2). */

data dat1;
  set dat1 dat2;
run;

/* The appended rows follow the original ones, so restore trial order
   for the BY-group processing below. */
proc sort data=dat1;
  by trial;
run;

*proc print;
*run;

/* One row per subject per test, the layout the two-way ANOVA needs. */
data datlong;
  set dat1;
  y=y1; test=1; output;
  y=y2; test=2; output;
  drop y1 y2;
run;

proc anova noprint data=datlong outstat=datanova;
  class id test;
  model y=id test;
  by trial;
run;

/* Keep only the row for the id effect and turn its F ratio into icc. */
data datanova;
  set datanova;
  /* Compare in upper case: the case of the effect name stored in _SOURCE_
     is not guaranteed to be "ID", and a mismatch would silently leave icc
     missing for every trial. */
  if upcase(_source_)="ID";
  icc=(f-1)/(f+1);
  keep trial icc;
run;

/* Per-trial n, means (kept under the names y1 and y2) and variances. */
proc means noprint data=dat1;
  var y1 y2;
  output out=dat n=n mean= var=vary1 vary2;
  by trial;
run;

data dat2;
  merge dat datanova;
  by trial;
  iter=2;
  /* difference in means over the root of the average of the two variances */
  samplees=(y2-y1)/sqrt((vary1+vary2)/2);
  crrctes=samplees*(1-3/(4*n-1));
  sdes=sqrt(2*(1-icc)/n+crrctes**2/2/(n-1));
  confint=2*1.96*sdes;
  /* Alternative kept from the original, based on crrctes instead of samplees:
     cipred = 6.163550E-3*crrctes**4 + 1.547127E-1*crrctes**2 + 4.029553E-1; */
  cipred = 6.163550E-3*samplees**4 + 1.547127E-1*samplees**2 + 4.029553E-1;
  /* Only trials whose current interval is wider than cipred get more subjects. */
  if cipred<confint then do;
    nnew=round((n-1)*(confint/cipred)**2)-n+1;
    /* cap the total sample size for this iteration */
    if nnew+n>&nmax2 then nnew=&nmax2-n;
    if nnew then do;
      /* The explicit OUTPUT means dat2 holds only the newly simulated
         subjects; y1 and y2 (the means until now) are overwritten. */
      do id=n+1 to nnew+n;
        true=rannor(0);
        y1=sqrt(&r)*true+sqrt(1-&r)*rannor(0);
        y2=sqrt(&r)*true+sqrt(1-&r)*rannor(0)+&es;
        output;
      end;
    end;
  end;
  keep trial y1 y2 id nnew iter;
run;
 
/* 3rd iteration.
   Same steps as the 2nd iteration, except that the number of extra subjects
   is not capped here. */

data dat1;
  set dat1 dat2;
run;

/* The appended rows follow the original ones, so restore trial order
   for the BY-group processing below. */
proc sort data=dat1;
  by trial;
run;

/* One row per subject per test, the layout the two-way ANOVA needs. */
data datlong;
  set dat1;
  y=y1; test=1; output;
  y=y2; test=2; output;
  drop y1 y2;
run;

proc anova noprint data=datlong outstat=datanova;
  class id test;
  model y=id test;
  by trial;
run;

/* Keep only the row for the id effect and turn its F ratio into icc. */
data datanova;
  set datanova;
  /* Compare in upper case: the case of the effect name stored in _SOURCE_
     is not guaranteed to be "ID", and a mismatch would silently leave icc
     missing for every trial. */
  if upcase(_source_)="ID";
  icc=(f-1)/(f+1);
  keep trial icc;
run;

/* Per-trial n, means (kept under the names y1 and y2) and variances. */
proc means noprint data=dat1;
  var y1 y2;
  output out=dat n=n mean= var=vary1 vary2;
  by trial;
run;

data dat2;
  merge dat datanova;
  by trial;
  iter=3;
  /* difference in means over the root of the average of the two variances */
  samplees=(y2-y1)/sqrt((vary1+vary2)/2);
  crrctes=samplees*(1-3/(4*n-1));
  sdes=sqrt(2*(1-icc)/n+crrctes**2/2/(n-1));
  confint=2*1.96*sdes;
  /* Alternative kept from the original, based on crrctes instead of samplees:
     cipred = 6.163550E-3*crrctes**4 + 1.547127E-1*crrctes**2 + 4.029553E-1; */
  cipred = 6.163550E-3*samplees**4 + 1.547127E-1*samplees**2 + 4.029553E-1;
  /* Only trials whose current interval is wider than cipred get more subjects. */
  if cipred<confint then do;
    nnew=round((n-1)*(confint/cipred)**2)-n+1;
    if nnew then do;
      /* NOTE(review): the size cap was commented out in the original, so the
         total can exceed nmax2 in this iteration even though nmax2 is
         described as the limit for the 2nd or more iterations - confirm
         that this is intended. Disabled statement:
           if nnew+n>&nmax2 then nnew=&nmax2-n;                              */
      /* The explicit OUTPUT means dat2 holds only the newly simulated
         subjects; y1 and y2 (the means until now) are overwritten. */
      do id=n+1 to nnew+n;
        true=rannor(0);
        y1=sqrt(&r)*true+sqrt(1-&r)*rannor(0);
        y2=sqrt(&r)*true+sqrt(1-&r)*rannor(0)+&es;
        output;
      end;
    end;
  end;
  keep trial y1 y2 id nnew iter;
run;
 
 
 
/* Output results.
   Adds the last batch of subjects to dat1, tags it as the final sample,
   stacks it with the initial sample (dat0) and prints the mean n, corrected
   effect size, confidence interval and confidence limits for each. */

data dat1;
  set dat1 dat2;
  dataset="final";
run;

/* dat0 comes first so that dataset takes the length of "initial". */
data datboth;
  set dat0 dat1;
run;

proc sort data=datboth;
  by dataset trial;
run;

/* One row per subject per test, the layout the two-way ANOVA needs. */
data datlong;
  set datboth;
  y=y1; test=1; output;
  y=y2; test=2; output;
  drop y1 y2;
run;

proc anova noprint data=datlong outstat=datanova;
  class id test;
  model y=id test;
  by dataset trial;
run;

/* Keep only the row for the id effect and turn its F ratio into icc. */
data datanova;
  set datanova;
  /* Compare in upper case: the case of the effect name stored in _SOURCE_
     is not guaranteed to be "ID", and a mismatch would silently leave icc
     missing for every trial. */
  if upcase(_source_)="ID";
  icc=(f-1)/(f+1);
  keep trial icc dataset;
run;

/* Per-trial n, means (kept under the names y1 and y2) and variances. */
proc means noprint data=datboth;
  var y1 y2;
  output out=dat n=n mean= var=vary1 vary2;
  by dataset trial;
run;

data dat2;
  merge dat datanova;
  by dataset trial;
  samplees=(y2-y1)/sqrt((vary1+vary2)/2);
  crrctes=samplees*(1-3/(4*n-1));
  sdes=sqrt(2*(1-icc)/n+crrctes**2/2/(n-1));
  /* interval from the t quantile here (the iterations used 1.96) */
  confint=2*tinv(0.975,n-1)*sdes;
  /* limits from the non-central t quantiles, rescaled to effect-size units */
  confliml=sqrt(2*(1-icc)/n)*tinv(0.025,n-1,sqrt(n/(2*(1-icc)))*crrctes);
  conflimu=sqrt(2*(1-icc)/n)*tinv(0.975,n-1,sqrt(n/(2*(1-icc)))*crrctes);
run;

/* Average over trials, separately for the initial and final samples. */
proc means noprint data=dat2;
  var n crrctes confint confliml conflimu;
  by dataset;
  output out=datsumm mean=;
run;

proc print noobs data=datsumm;
  var dataset n crrctes confint confliml conflimu;
  format _numeric_ 5.2 n 4.;
  title "ES stats rely=&r es=&es startn=&startn trials=&trialn";
  title2 "nmax1=&nmax1 nmax2=&nmax2, longitudinal study, no control group";
  title3 "using sample SD and ICCs";
run;
 
/* Sampling distribution of the corrected effect size, and the final sample
   size, over the trials of the final sample. */
data datfinal;
  set dat2;
  if dataset="final";
run;

/* Mean plus the 2.5th, 50th and 97.5th percentiles of crrctes across trials. */
proc univariate noprint data=datfinal;
  var crrctes;
  output out=datdist mean=mean pctlpre=Q pctlpts=2.5 50 97.5;
run;

proc print noobs data=datdist;
  format _numeric_ 5.2;
  title "Sampling distn for es, r=&r es=&es n=&startn";
  title2 "for corrected final sample effect size";
run;

proc means mean std min max maxdec=0 data=datfinal;
  var n;
  /* The original title referenced an nmax macro variable that is never
     defined; report the two limits that actually exist. */
  title "Stats for final sample size  r=&r es=&es n=&startn nmax1=&nmax1 nmax2=&nmax2";
run;
 
/* Number of extra observations added at each iteration.
   Every input data set is named explicitly so that no step depends on which
   data set happens to be the most recently created one. */
proc sort data=dat1;
  by trial iter;
run;

/* One row per trial per iteration; nnew is the same on every row of such a
   group, so its mean is simply that value. */
proc means noprint data=dat1;
  var nnew;
  output out=datnnew mean=;
  by trial iter;
run;

proc sort data=datnnew;
  by iter;
run;

/* Summarise across trials within each iteration. */
proc means noprint data=datnnew;
  var nnew;
  by iter;
  output out=datiter n=n mean= std=std min=min max=max;
run;

/* Drop the group with missing iter (rows of the starting sample). */
data datiter;
  set datiter;
  if iter;
run;

proc print noobs data=datiter;
  var iter n nnew std min max;
  format _numeric_ 5.0;
  title "Number of extra observations at each iteration";
  title2 "for rely=&r es=&es startn=&startn nmax1=&nmax1 nmax2=&nmax2";
run;

Here's the bit that checks the formulae when there is a control group.

 
%macro data;
  /* Build dat1 for the control-group check: &trialn trials with &startn/2
     rows each. Every row holds two independent subjects: one measured as
     y1, y2 with no shift, and one measured as y3, y4 with &es added to y4.
     Reads the macro variables trialn, startn, r and es. */
  data dat1;
    do trial = 1 to &trialn;
      do id = 1 to &startn/2;
        /* first subject: nothing added to the second score */
        true = rannor(0);
        y1 = sqrt(&r)*true + sqrt(1-&r)*rannor(0);
        y2 = sqrt(&r)*true + sqrt(1-&r)*rannor(0);
        /* second subject: a fresh shared component, second score shifted */
        true = rannor(0);
        y3 = sqrt(&r)*true + sqrt(1-&r)*rannor(0);
        y4 = sqrt(&r)*true + sqrt(1-&r)*rannor(0) + &es;
        /* change in the shifted pair minus change in the unshifted pair */
        ydiff = y4 - y3 - (y2 - y1);
        output;
      end;
    end;
    keep trial ydiff y1-y4;
  run;
%mend data;
 
/* Settings for the control-group check; the macro call builds dat1. */
%let trialn = 1000;  /* no. of trials */
%let r      = 0.7;   /* reliability */
%let es     = 0.4;   /* effect size */
%let startn = 20;    /* initial size (=startn/2 in each group) */
%data;
 
/* Per-trial summary: n, the mean of ydiff (stored as ydiff), and the
   variances of ydiff (d) and of y1 to y4. */
proc means noprint data=dat1;
  var ydiff y1 y2 y3 y4;
  output out=dat n=n mean=ydiff var=d vary1 vary2 vary3 vary4;
  by trial;
run;

/* Effect size per trial with approximate and non-central-t intervals. */
data dat2;
  set dat;
  iter=1;
  /* mean difference in change over the root of the average of all four variances */
  samplees=ydiff/sqrt((vary1+vary2+vary3+vary4)/4);
  /* Alternative kept from the original, using only vary1 and vary3:
     samplees=ydiff/sqrt((vary1+vary3)/2); */
  crrctes=samplees*(1-3/(8*n-1));
  /* sic! n-2 is best empirically */
  sdes=sqrt(4*(1-&r)/n+crrctes**2/4/(n-2));
  ciapprox=2*tinv(0.975,2*(n-1))*sdes;
  cipred = 6.163550E-3*crrctes**4 + 1.547127E-1*crrctes**2 + 4.029553E-1;
  /* limits from the non-central t quantiles, rescaled to effect-size units */
  confliml=sqrt(4*(1-&r)/n)*tinv(0.025,2*(n-1),crrctes*sqrt(n/4/(1-&r)));
  conflimu=sqrt(4*(1-&r)/n)*tinv(0.975,2*(n-1),crrctes*sqrt(n/4/(1-&r)));
  confint=conflimu-confliml;
run;

/* Mean, SD and the 2.5th, 50th and 97.5th percentiles of samplees across
   trials; the unnamed output data set is what the PROC PRINT below lists. */
proc univariate noprint data=dat2;
  var samplees;
  output mean=mean std=std pctlpre=Q pctlpts=2.5 50 97.5;
  title "Sampling distn for es, r=&r es=&es n=&startn";
  title2 "for sample effect size";
run;

proc print;
run;

/* Compare the interval estimates averaged over trials. */
proc means mean std min max maxdec=3 data=dat2;
  var samplees sdes crrctes confliml conflimu confint ciapprox cipred;
  title "Stats for es, r=&r es=&es n=&startn";
run;

Note: the degrees of freedom for the non-central t statistic (2*(n-1)) apply to calculation of the effect size using pre-test values of the SDs only, not pre- and post-test values as I have used. But if reliability is practically perfect, as it often is, the degrees of freedom are effectively unaltered. (If reliability is zero, there are twice as many degrees of freedom.) For lower reliability, use of all four standard deviations in the effect-size calculation will make the actual confidence interval narrower than that calculated. Someone needs to work out an adjustment for the degrees of freedom.

Go to: Previous · Contents · Search · Home

A New View of Statistics	© 1997 Will G Hopkins
Go to: Previous · Contents · Search · Home