A New View of Statistics Go to: Previous · Contents · Search · Home
On-The-Fly Sampling for a CORRELATION COEFFICIENT

SAS program to see how much bias there is when you use CI=0.20 as the stopping rule.

Lower down: what happens when you use statistical significance as the stopping rule?

Main finding: correlations are biased high by 0.01 at most using on the fly, but by 0.10 using statistical significance.

```First I checked whether the sample sizes for correlations in the
middle of each step of the magnitude scale do indeed give CIs of 0.20;

*Page layout for the listing output: 80 columns wide, 30 lines per page;
options linesize=80 pagesize=30;

*Macro DATA: simulate 1000 trials, each with startn pairs of correlated x and y;
*The macro variables rho and startn must be set before the macro is called;
%macro data;
  %* x is standard normal and y is rho times x plus independent normal noise;
  %* scaled by sqrt of 1-rho**2, so the population correlation of x and y is rho;
  %* A seed of 0 is passed to every RANNOR call;
  data dat1(drop=id);
    do trial=1 to 1000;
      do id=1 to &startn;
        x=rannor(0);
        y=&rho*x+sqrt(1-&rho**2)*rannor(0);
        output;
      end;
    end;
%mend data;
*Settings for the check, then generate the samples;
*  rho    = population correlation. Other runs used 0.61, 0.404, 0.202, 0.0;
*  startn = (initial) sample size.  Other runs used 155, 270, 350, 380;
%let startn=46;
%let rho=0.823;
%data;

*Correlation of y with x inside every trial. The statistics are written to;
*data set dat, with the values stored in the column named x;
proc corr data=dat1 outp=dat noprint;
  by trial;
  with y;
  var x;
run;

*Keep the correlation rows only, so x now holds one sample r per trial;
data;
  set dat;
  if _type_ ne "CORR" then delete;
run;

*2.5th, 50th and 97.5th percentiles of the sample correlations;
title "Median and 95% CLs for samples";
proc univariate noprint;
  var x;
  output pctlpre=Q pctlpts=2.5 50 97.5;
run;

proc print;
run;

OK here is the main program to check for bias in the "on the fly" method;

To estimate the sample size that gives a confidence interval of 0.20 for
any correlation, I derived the sample sizes for correlations in the
middle of each step of the magnitude scale, by trial and error on a
hand calculator.  I got these correlations and sample sizes:
r       n
0.0     380
0.202   350
0.405   270
0.61    155
0.823   46
-0.202  350
-0.405  270
-0.61   155
-0.823  46

I checked these, as shown in the simulation above.

I then fit a fourth-order polynomial to the data using a graphing program:
n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2

This is the curve shown on the page devoted to On the Fly for
Correlations, except that I plateaued it for r>0.82.

I have included sample-size limits on the iterations, to see if
that makes any difference.

%macro data;
  %* Simulate trialn trials, each with startn pairs of correlated x and y;
  %* x is standard normal and y is rho times x plus independent normal noise;
  %* scaled by sqrt of 1-rho**2, so the population correlation of x and y is rho;
  %* The macro variables trialn, startn and rho must be set before the call;
  data dat1(drop=id);
    do trial=1 to &trialn;
      do id=1 to &startn;
        x=rannor(0);
        y=&rho*x+sqrt(1-&rho**2)*rannor(0);
        output;
      end;
    end;
%mend data;

*Simulation settings, then generate the initial samples;
*  rho    = population correlation;
*  startn = initial sample size in every trial;
*  nmax1  = limit on the total sample size in the 1st iteration;
*  nmax2  = limit on the total sample size in the 2nd iteration. The same;
*           limit is present but commented out in the 3rd and 4th iterations;
*  trialn = number of trials for the simulation;
%let trialn=400;
%let rho=0.7;
%let startn=45;
%let nmax1=100;
%let nmax2=100;
%data;

*Keep a labelled copy of the initial samples for the final comparison;
data dat0;
  set dat1;
  dataset="initial";
run;

*1st iteration: sample correlation of y with x in every trial;
proc corr data=dat0 outp=dat noprint;
  by trial;
  with y;
  var x;
run;

*Generate the extra observations each trial needs. Only the new observations;
*are written to dat2, and trials that need none contribute no rows;
data dat2(drop=i);
  set dat(rename=(x=r));
  iter=1;
  *LAG has to run on every row, before any row is deleted. On the CORR row it;
  *then returns the value from the row just before it, used as the current n;
  nprev=lag(r);
  if _type_ ne "CORR" then delete;
  r=abs(r);
  *fitted sample size for a confidence interval of 0.20 at this r;
  n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
  if n>nprev+0.5 then do;
    *add the shortfall, but keep the total sample size within nmax1;
    nnew=min(round(n-nprev), &nmax1-nprev);
    do i=1 to nnew;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
run;

*2nd iteration;

*Append the observations from the previous iteration and regroup by trial;
data dat1;
  set dat1 dat2;
run;

proc sort data=dat1;
  by trial;
run;

*Sample correlation of y with x in every enlarged trial;
proc corr data=dat1 outp=dat noprint;
  by trial;
  with y;
  var x;
run;

*Generate the extra observations each trial still needs;
data dat2(drop=i);
  set dat(rename=(x=r));
  iter=2;
  *LAG has to run on every row, before any row is deleted. On the CORR row it;
  *then returns the value from the row just before it, used as the current n;
  nprev=lag(r);
  if _type_ ne "CORR" then delete;
  r=abs(r);
  *fitted sample size for a confidence interval of 0.20 at this r;
  n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
  if n>nprev+0.5 then do;
    *add the shortfall, but keep the total sample size within nmax2;
    nnew=min(round(n-nprev), &nmax2-nprev);
    do i=1 to nnew;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
run;

*3rd iteration;

*Append the observations from the previous iteration and regroup by trial;
data dat1;
  set dat1 dat2;
run;

proc sort data=dat1;
  by trial;
run;

*Sample correlation of y with x in every enlarged trial;
proc corr data=dat1 outp=dat noprint;
  by trial;
  with y;
  var x;
run;

*Generate the extra observations each trial still needs;
data dat2(drop=i);
  set dat(rename=(x=r));
  iter=3;
  *LAG has to run on every row, before any row is deleted. On the CORR row it;
  *then returns the value from the row just before it, used as the current n;
  nprev=lag(r);
  if _type_ ne "CORR" then delete;
  r=abs(r);
  *fitted sample size for a confidence interval of 0.20 at this r;
  n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
  if n>nprev+0.5 then do;
    *the limit on total sample size is deliberately not applied in this iteration;
    nnew=round(n-nprev);
    do i=1 to nnew;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
run;

*4th iteration;

*Append the observations from the previous iteration and regroup by trial;
data dat1;
  set dat1 dat2;
run;

proc sort data=dat1;
  by trial;
run;

*Sample correlation of y with x in every enlarged trial;
proc corr data=dat1 outp=dat noprint;
  by trial;
  with y;
  var x;
run;

*Generate the extra observations each trial still needs;
data dat2(drop=i);
  set dat(rename=(x=r));
  iter=4;
  *LAG has to run on every row, before any row is deleted. On the CORR row it;
  *then returns the value from the row just before it, used as the current n;
  nprev=lag(r);
  if _type_ ne "CORR" then delete;
  r=abs(r);
  *fitted sample size for a confidence interval of 0.20 at this r;
  n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
  if n>nprev+0.5 then do;
    *the limit on total sample size is deliberately not applied in this iteration;
    nnew=round(n-nprev);
    do i=1 to nnew;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
run;

*output results;

*Append the last batch of extra observations and label the result as final;
data dat1;
  set dat1 dat2;
  dataset="final";
run;

*Stack the initial samples (dat0) on top of the final samples (dat1);
data datboth;
  set dat0 dat1;
run;

proc sort data=datboth;
  by dataset trial;
run;

*Sample correlation for every trial, separately for initial and final data;
proc corr data=datboth noprint outp=dat;
  var x;
  with y;
  by dataset trial;
run;

data dat2;
  set dat(rename=(x=r));
  *LAG has to run on every row, before the subsetting IF. On the CORR row it;
  *then returns the value from the row just before it, used as the sample size;
  nfinal=lag(r);
  if _type_="CORR";
  *Fisher z transform of r, so that the correlations can be averaged;
  z=0.5*log((1+r)/(1-r));
run;

*Number of trials, mean z and mean sample size for each of the two data sets;
proc means data=dat2 noprint;
  var z nfinal;
  by dataset;
  output n=n_trials mean=z n_final;
run;

*Back-transform the mean z to a correlation;
data;
  set;
  mean_r=(exp(2*z)-1)/(exp(2*z)+1);
run;

proc print noobs;
  var dataset n_trials n_final mean_r;
  format n_final 4. mean_r 6.3;
  title "Mean final sample sizes and correlations (via z transform)";
  *report both sample-size limits. Only nmax1 and nmax2 are defined in this;
  *program, so a reference to a plain nmax would not resolve;
  title2 "for startn=&startn rho=&rho  nmax1=&nmax1 nmax2=&nmax2";
run;

*Sampling distribution of the correlation in the final samples;
data datfinal;
  set dat2;
  if dataset="final";
  sampler=(exp(2*z)-1)/(exp(2*z)+1); *restores the raw r;
run;

*Mean plus 2.5th, 50th and 97.5th percentiles of the final sample correlations;
proc univariate data=datfinal noprint;
  var sampler ;
  output mean=mean pctlpre=Q pctlpts=2.5 50 97.5;
run;

proc print noobs;
  format _numeric_ 5.2;
  *titles report nmax1 and nmax2, the only sample-size limits this program defines;
  title "Sampling distn for r  rho=&rho n=&startn nmax1=&nmax1 nmax2=&nmax2";
  title2 "for final sample correlation";
run;

*Distribution of the final sample size across trials;
proc means n mean std min max maxdec=0 data=datfinal;
  var nfinal;
  title "Stats for final sample size  r=&rho n=&startn nmax1=&nmax1 nmax2=&nmax2";
run;

*Extra observations added at each iteration. Rows from the initial samples;
*have iter missing. Rows added in an iteration carry its iter value and nnew;
proc sort data=dat1;
  by trial iter;
run;

*Collapse to one row per trial and iteration;
proc means data=dat1 noprint;
  by trial iter;
  var nnew;
  output mean=;
run;

proc sort;
  by iter;
run;

*Across trials: count, mean, SD, min and max of nnew for each iteration;
proc means noprint;
  by iter;
  var nnew;
  output n=n mean= std=std min=min max=max;
run;

*Drop the group that belongs to the initial samples, where iter is missing;
data;
  set;
  if not iter then delete;
run;

title "Number of extra observations at each iteration";
title2 "for rho=&rho startn=&startn nmax1=&nmax1 nmax2=&nmax2";
proc print noobs;
  var iter n nnew std min max;
  format _numeric_ 5.0;
run;

************;

*Now try statistical significance as the stopping rule;

%macro data;
  %* Simulate trialn trials, each with startn pairs of correlated x and y;
  %* x is standard normal and y is rho times x plus independent normal noise;
  %* scaled by sqrt of 1-rho**2, so the population correlation of x and y is rho;
  %* The macro variables trialn, startn and rho must be set before the call;
  data dat1(drop=id);
    do trial=1 to &trialn;
      do id=1 to &startn;
        x=rannor(0);
        y=&rho*x+sqrt(1-&rho**2)*rannor(0);
        output;
      end;
    end;
%mend data;

*Settings for the statistical-significance runs, then generate the samples;
*  rho    = population correlation;
*  startn = initial sample size in every trial;
*  trialn = number of trials for the simulation;
%let trialn=300;
%let startn=45;
%let rho=0.3;
%data;

*Keep a labelled copy of the initial samples for the final comparison;
data dat0;
  set dat1;
  dataset="initial";
run;

*1st iteration: sample correlation of y with x in every trial;
proc corr data=dat0 outp=dat noprint;
  by trial;
  with y;
  var x;
run;

*Generate extra observations for every trial whose z is below the threshold;
data dat2(drop=i);
  set dat(rename=(x=r));
  *LAG has to run on every row, before any row is deleted. On the CORR row it;
  *then returns the value from the row just before it, used as the current n;
  nprev=lag(r);
  if _type_ ne "CORR" then delete;
  *from here on r holds the Fisher z transform of the sample correlation;
  r=0.5*log((1+r)/(1-r));
  *z is below 1.96/sqrt(n-3), so the trial goes on sampling;
  if r<1.96/sqrt(nprev-3) then do;
    *total sample size at which this z would equal 1.96/sqrt(n-3);
    n=3+(1.96/r)**2;
    nnew=round(n-nprev);
    *never add more than 800 observations in one iteration;
    if nnew>800 then nnew=800;
    do i=1 to nnew;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
run;

*2nd iteration;

*Append the observations from the previous iteration and regroup by trial;
data dat1;
  set dat1 dat2;
run;

proc sort data=dat1;
  by trial;
run;

*Sample correlation of y with x in every enlarged trial;
proc corr data=dat1 outp=dat noprint;
  by trial;
  with y;
  var x;
run;

*Generate extra observations for every trial whose z is below the threshold;
data dat2(drop=i);
  set dat(rename=(x=r));
  *LAG has to run on every row, before any row is deleted. On the CORR row it;
  *then returns the value from the row just before it, used as the current n;
  nprev=lag(r);
  if _type_ ne "CORR" then delete;
  *from here on r holds the Fisher z transform of the sample correlation;
  r=0.5*log((1+r)/(1-r));
  *z is below 1.96/sqrt(n-3), so the trial goes on sampling;
  if r<1.96/sqrt(nprev-3) then do;
    *total sample size at which this z would equal 1.96/sqrt(n-3);
    n=3+(1.96/r)**2;
    nnew=round(n-nprev);
    *never add more than 800 observations in one iteration;
    if nnew>800 then nnew=800;
    do i=1 to nnew;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
run;

*3rd iteration;

*Append the observations from the previous iteration and regroup by trial;
data dat1;
  set dat1 dat2;
run;

proc sort data=dat1;
  by trial;
run;

*Sample correlation of y with x in every enlarged trial;
proc corr data=dat1 outp=dat noprint;
  by trial;
  with y;
  var x;
run;

*Generate extra observations for every trial whose z is below the threshold;
data dat2(drop=i);
  set dat(rename=(x=r));
  *LAG has to run on every row, before any row is deleted. On the CORR row it;
  *then returns the value from the row just before it, used as the current n;
  nprev=lag(r);
  if _type_ ne "CORR" then delete;
  *from here on r holds the Fisher z transform of the sample correlation;
  r=0.5*log((1+r)/(1-r));
  *z is below 1.96/sqrt(n-3), so the trial goes on sampling;
  if r<1.96/sqrt(nprev-3) then do;
    *total sample size at which this z would equal 1.96/sqrt(n-3);
    n=3+(1.96/r)**2;
    nnew=round(n-nprev);
    *never add more than 800 observations in one iteration;
    if nnew>800 then nnew=800;
    do i=1 to nnew;
      x=rannor(0);
      y=&rho*x+sqrt(1-&rho**2)*rannor(0);
      output;
    end;
  end;
run;

*output results;

*Append the last batch of extra observations and label the result as final;
data dat1;
  set dat1 dat2;
  dataset="final";
run;

*Stack the initial samples (dat0) on top of the final samples (dat1);
data;
  set dat0 dat1;
run;

proc sort;
  by dataset trial;
run;

*Sample correlation for every trial, separately for initial and final data;
proc corr outp=dat noprint;
  by dataset trial;
  with y;
  var x;
run;

data;
  set dat(rename=(x=r));
  *LAG has to run on every row, before any row is deleted. On the CORR row it;
  *then returns the value from the row just before it, used as the sample size;
  nfinal=lag(r);
  if _type_ ne "CORR" then delete;
  *Fisher z transform of r, so that the correlations can be averaged;
  z=0.5*log((1+r)/(1-r));
run;

*Number of trials, mean z and mean sample size for each of the two data sets;
proc means noprint;
  by dataset;
  var z nfinal;
  output n=n_trials mean=z n_final;
run;

*Back-transform the mean z to a correlation;
data;
  set;
  mean_r=(exp(2*z)-1)/(exp(2*z)+1);
run;

title "Mean final sample sizes and correlations (via z transform)";
title2 "For stat sig method. r=&rho startn=&startn";
proc print noobs;
  var dataset n_trials n_final mean_r;

run;```

Go to: Previous · Contents · Search · Home