|
A New View of Statistics | |
SAS program to see how much bias there is when you use CI=0.20 as the stopping rule.
Lower down: what happens when you use statistical significance as the stopping rule?
Main finding: correlations are biased high by 0.01 at most
using on the fly, but by 0.10 using statistical significance.
First I checked whether the sample sizes for correlations in the
middle of each step of the magnitude scale do indeed give CIs of 0.20;
*Listing layout is 80 columns wide with 30 lines per page;
options linesize=80 pagesize=30;

*Simulation settings;
*rho is the population correlation and startn the sample size per trial;
%let rho=0.823; *others were 0.61, 0.404, 0.202, 0.0;
%let startn=46; *others were 155, 270, 350, 380;

*Macro DATA writes DAT1 with 1000 trials of startn simulated x and y values;
*where y is built from x plus independent normal noise so that the;
*population correlation between them equals rho;
%macro data;
  data dat1;
    drop id;
    do trial=1 to 1000;
      do id=1 to &startn;
        x=rannor(0);
        y=&rho*x+sqrt(1-&rho**2)*rannor(0);
        output;
      end;
    end;
  run;
%mend;
%data;

*One sample correlation of y with x per trial, written to DAT;
proc corr data=dat1 noprint outp=dat;
  by trial;
  with y;
  var x;
run;

*Keep only the CORR rows so that column x carries the sample correlation;
data;
  set dat;
  if _type_="CORR";
run;

*Percentiles of the sample correlations across trials;
proc univariate noprint;
  var x;
  output pctlpre=Q pctlpts=2.5 50 97.5;
run;

title "Median and 95% CLs for samples";
proc print;
run;
OK here is the main program to check for bias in the "on the fly" method;
To estimate the sample size that gives a confidence interval of 0.20 for
any correlation, I derived the sample sizes for correlations in the
middle of each step of the magnitude scale, by trial and error on a
hand calculator. I got these numbers and correlations:
r n
0.0 380
0.202 350
0.405 270
0.61 155
0.823 46
-0.202 350
-0.405 270
-0.61 155
-0.823 46
I checked these, as shown in the simulation above.
I then fit a fourth-order polynomial to the data using a graphing program:
n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2
This is the curve shown on the page devoted to On the Fly for
Correlations, except that I plateaued it for r>0.82.
I have included sample-size limits on the iterations, to see if
that makes any difference.
*Macro DATA builds DAT1 holding trialn trials with startn simulated;
*x and y values each, whose population correlation is rho;
%macro data;
data dat1;
  do trial=1 to &trialn;
    do id=1 to &startn;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
  drop id;
%mend;

*Simulation settings;
%let rho=0.7; *population correlation;
%let startn=45; *initial sample size;
%let nmax1=100; *total size limit for 1st iteration;
%let nmax2=100; *total size limit for 2nd iteration (the cap is commented out in the 3rd and 4th);
%let trialn=400; *no of trials for the simulation;

%data;

*keep a labelled copy of the starting samples for the final comparison;
data dat0;
  set;
  dataset="initial";

*sample correlation of y with x for each trial;
proc corr noprint outp=dat;
  var x;
  with y;
  by trial;

*1st iteration: add next lot of observations;
*lag(r) runs on every row before the subsetting if, so on a CORR row nprev;
*holds the value from the row just before it, which the code uses as the;
*current sample size of the trial;
data dat2;
  set dat(rename=(x=r));
  iter=1;
  nprev=lag(r);
  if _type_="CORR";
  r=abs(r);
  *sample size given by the fitted fourth-order polynomial at the observed r;
  n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
  *only the extra observations are output, and only when more are needed;
  if n>nprev+0.5 then do;
    nnew=round(n-nprev);
    *do not let the trial total exceed nmax1;
    if nnew+nprev>&nmax1 then nnew=&nmax1-nprev;
    do i=1 to nnew;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
  drop i;

*2nd iteration;
*append the extra observations and recompute the correlation per trial;
data dat1;
  set dat1 dat2;
proc sort;
  by trial;
proc corr noprint outp=dat;
  var x;
  with y;
  by trial;

*add next lot of observations;
data dat2;
  set dat(rename=(x=r));
  iter=2;
  nprev=lag(r);
  if _type_="CORR";
  r=abs(r);
  n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
  if n>nprev+0.5 then do;
    nnew=round(n-nprev);
    *do not let the trial total exceed nmax2;
    if nnew+nprev>&nmax2 then nnew=&nmax2-nprev;
    do i=1 to nnew;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
  drop i;

*3rd iteration;
data dat1;
  set dat1 dat2;
proc sort;
  by trial;
proc corr noprint outp=dat;
  var x;
  with y;
  by trial;

*add next lot of observations;
*NOTE(review): the nmax2 cap is disabled here, confirm that is intended;
data dat2;
  set dat(rename=(x=r));
  iter=3;
  nprev=lag(r);
  if _type_="CORR";
  r=abs(r);
  n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
  if n>nprev+0.5 then do;
    nnew=round(n-nprev);
    * if nnew+nprev>&nmax2 then nnew=&nmax2-nprev;
    do i=1 to nnew;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
  drop i;

*4th iteration;
data dat1;
  set dat1 dat2;
proc sort;
  by trial;
proc corr noprint outp=dat;
  var x;
  with y;
  by trial;

*add next lot of observations;
*NOTE(review): the nmax2 cap is disabled here, confirm that is intended;
data dat2;
  set dat(rename=(x=r));
  iter=4;
  nprev=lag(r);
  if _type_="CORR";
  r=abs(r);
  n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
  if n>nprev+0.5 then do;
    nnew=round(n-nprev);
    * if nnew+nprev>&nmax2 then nnew=&nmax2-nprev;
    do i=1 to nnew;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
  drop i;

*output results;
*label the accumulated samples and stack them under the initial samples;
data dat1;
  set dat1 dat2;
  dataset="final";
data datboth;
  set dat0 dat1;
proc sort;
  by dataset trial;
proc corr noprint outp=dat;
  var x;
  with y;
  by dataset trial;

*one row per dataset and trial with the sample size taken from the row;
*before CORR and the Fisher z transform of the signed sample correlation;
data dat2;
  set dat(rename=(x=r));
  nfinal=lag(r);
  if _type_="CORR";
  z=0.5*log((1+r)/(1-r));

*average z and sample size over trials, separately for initial and final;
proc means noprint;
  var z nfinal;
  by dataset;
  output n=n_trials mean=z n_final;

*back-transform the mean z to a correlation;
data;
  set;
  mean_r=(exp(2*z)-1)/(exp(2*z)+1);
proc print noobs;
  var dataset n_trials n_final mean_r;
  format n_final 4. mean_r 6.3;
  title "Mean final sample sizes and correlations (via z transform)";
  *the titles report nmax1 and nmax2 because no macro variable called nmax is defined;
  title2 "for startn=&startn rho=&rho nmax1=&nmax1 nmax2=&nmax2";

*sampling distribution of the final sample correlation;
data datfinal;
  set dat2;
  if dataset="final";
  sampler=(exp(2*z)-1)/(exp(2*z)+1); *restores the raw r;
proc univariate noprint;
  var sampler ;
  output mean=mean pctlpre=Q pctlpts=2.5 50 97.5;
proc print noobs;
  format _numeric_ 5.2;
  title "Sampling distn for r rho=&rho n=&startn nmax1=&nmax1 nmax2=&nmax2";
  title2 "for final sample correlation";

*summary of the final sample sizes;
proc means n mean std min max maxdec=0 data=datfinal;
  var nfinal;
  title "Stats for final sample size r=&rho n=&startn nmax1=&nmax1 nmax2=&nmax2";

*extra observations per iteration;
*first reduce to one nnew value per trial and iteration, then summarise;
*across trials within each iteration;
proc sort data=dat1;
  by trial iter;
proc means noprint;
  var nnew;
  output mean=;
  by trial iter;
proc sort;
  by iter;
proc means noprint;
  var nnew;
  by iter;
  output n=n mean= std=std min=min max=max;

*drop the rows from the starting samples, which have a missing iter;
data;
  set;
  if iter;
proc print noobs;
  var iter n nnew std min max;
  format _numeric_ 5.0;
  title "Number of extra observations at each iteration";
  title2 "for rho=&rho startn=&startn nmax1=&nmax1 nmax2=&nmax2";
run;
************;
*Now try statistical significance as the stopping rule;
*Simulation of statistical significance as the stopping rule;
*Macro DATA builds DAT1 holding trialn trials with startn simulated;
*x and y values each, whose population correlation is rho;
%macro data;
data dat1;
do trial=1 to &trialn;
do id=1 to &startn;
x=rannor(0);
y=&rho*x+sqrt(1-&rho**2)*rannor(0);
output;
end;
end;
drop id;
%mend;
%let rho=0.3; *population correlation;
%let startn=45; *initial sample size;
%let trialn=300; *no of trials for the simulation;
%data;
*keep a labelled copy of the starting samples for the final comparison;
data dat0;
set;
dataset="initial";
*sample correlation of y with x for each trial;
proc corr noprint outp=dat;
var x;
with y;
by trial;
*add next lot of observations;
*lag(r) runs on every row before the subsetting if, so on a CORR row nprev;
*holds the value from the row just before it, which the code uses as the;
*current sample size (presumably the N row of the outp data set, confirm);
data dat2;
set dat(rename=(x=r));
nprev=lag(r);
if _type_="CORR";
*r is overwritten with its Fisher z transform;
r=0.5*log((1+r)/(1-r));
*more observations are added only while z is below 1.96 divided by sqrt(nprev-3);
if r<1.96/sqrt(nprev-3) then do;
*n is the sample size at which the observed z would equal that threshold;
n=3+(1.96/r)**2;
nnew=round(n-nprev);
*at most 800 observations are added in one iteration;
if nnew>800 then nnew=800;
*only the extra observations are output and the loop is skipped when nnew is below 1;
do i=1 to nnew;
x=rannor(0);
y=&rho*x+sqrt(1-&rho**2)*rannor(0);
output;
end;
end;
drop i;
*2nd iteration;
*append the extra observations and recompute the correlation per trial;
data dat1;
set dat1 dat2;
proc sort;
by trial;
proc corr noprint outp=dat;
var x;
with y;
by trial;
*add next lot of observations;
*same rule as in the 1st iteration, applied to the enlarged samples;
data dat2;
set dat(rename=(x=r));
nprev=lag(r);
if _type_="CORR";
r=0.5*log((1+r)/(1-r));
if r<1.96/sqrt(nprev-3) then do;
n=3+(1.96/r)**2;
nnew=round(n-nprev);
if nnew>800 then nnew=800;
do i=1 to nnew;
x=rannor(0);
y=&rho*x+sqrt(1-&rho**2)*rannor(0);
output;
end;
end;
drop i;
*3rd iteration;
data dat1;
set dat1 dat2;
proc sort;
by trial;
proc corr noprint outp=dat;
var x;
with y;
by trial;
*add next lot of observations;
*same rule again, this is the last round of additions;
data dat2;
set dat(rename=(x=r));
nprev=lag(r);
if _type_="CORR";
r=0.5*log((1+r)/(1-r));
if r<1.96/sqrt(nprev-3) then do;
n=3+(1.96/r)**2;
nnew=round(n-nprev);
if nnew>800 then nnew=800;
do i=1 to nnew;
x=rannor(0);
y=&rho*x+sqrt(1-&rho**2)*rannor(0);
output;
end;
end;
drop i;
*output results;
*label the accumulated samples and stack them under the initial samples;
data dat1;
set dat1 dat2;
dataset="final";
data;
set dat0 dat1;
proc sort;
by dataset trial;
proc corr noprint outp=dat;
var x;
with y;
by dataset trial;
*one row per dataset and trial with the value from the row before CORR;
*stored as nfinal and the Fisher z transform of the sample correlation;
data;
set dat(rename=(x=r));
nfinal=lag(r);
if _type_="CORR";
z=0.5*log((1+r)/(1-r));
*average z and nfinal over trials, separately for initial and final;
proc means noprint;
var z nfinal;
by dataset;
output n=n_trials mean=z n_final;
*back-transform the mean z to a correlation;
data;
set;
mean_r=(exp(2*z)-1)/(exp(2*z)+1);
proc print noobs;
var dataset n_trials n_final mean_r;
title "Mean final sample sizes and correlations (via z transform)";
title2 "For stat sig method. r=&rho startn=&startn";
run;
Go to: Previous · Contents · Search
· Home
resources=AT=sportsci.org · webmaster=AT=sportsci.org · Sportsci Homepage · Copyright
©1997
Last updated 1 June 97