A New View of Statistics

© 1997 Will G Hopkins

Go to: Previous · Contents · Search · Home


On-The-Fly Sampling for a CORRELATION COEFFICIENT

SAS program to see how much bias there is when you use CI=0.20 as the stopping rule.

Lower down: what happens when you use statistical significance as the stopping rule?

Main finding: correlations are biased high by 0.01 at most with on-the-fly sampling, but by 0.10 when statistical significance is the stopping rule.

First I checked whether the sample sizes for correlations in the
middle of each step of the magnitude scale do indeed give CIs of 0.20:
 
*page layout for the listing output;
options linesize=80 pagesize=30;
 
*generate correlated y and x for 1000 simulated trials of startn observations each;
*x is a standard normal deviate and y = rho*x + sqrt(1-rho**2)*e where e is a second independent normal deviate;
*this gives y unit variance and a population correlation of rho with x;
*rho is wrapped in parentheses because SAS evaluates -0.5**2 as -(0.5**2) so a negative rho would otherwise give sqrt(1+rho**2);
*reads macro variables rho and startn and writes data set dat1 with variables trial x y;
%macro data;
data dat1;
  do trial=1 to 1000;
    do id=1 to &startn;
      x=rannor(0);
      y=(&rho)*x+sqrt(1-(&rho)**2)*rannor(0);
      output;
    end;
  end;
  drop id;

%mend;
*settings for this check - rho is the population correlation and startn the (initial) sample size being tested;
*pairs tried as rho and startn - 0.823 and 46 then 0.61 and 155 then 0.404 and 270 then 0.202 and 350 then 0.0 and 380;
*NOTE(review) - the table further down this page lists 0.405 where this list has 0.404 - confirm which was used;
%let startn=46;
%let rho=0.823;
*run the generator macro to build dat1;
%data;
 
*correlation of y with x within each trial - statistics are written to dat and nothing is printed;
proc corr data=dat1 outp=dat noprint;
  by trial;
  with y;
  var x;
run;

*keep only the CORR row of each trial so that x holds the sample correlation;
data;
  set dat;
  where _type_="CORR";
run;

*2.5th 50th and 97.5th percentiles of the sample correlation across trials;
title "Median and 95% CLs for samples";
proc univariate noprint;
  output pctlpre=Q pctlpts=2.5 50 97.5;
  var x;
run;

proc print;
run;
 
 
 
OK here is the main program to check for bias in the "on the fly" method;
 
To estimate the sample size to give confidence interval of 0.20 for 
any correlation, I derived the sample sizes for correlations in the 
middle of each step of the magnitude scale, by trial and error on a 
hand calculator.  I got these numbers and correlations:
r       n
0.0     380
0.202   350
0.405   270
0.61    155
0.823   46
-0.202  350
-0.405  270
-0.61   155
-0.823  46
 
I checked these, as shown in the simulation above.
 
I then fit a fourth-order polynomial to the data using a graphing program:
n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2
 
This is the curve shown on the page devoted to On the Fly for 
Correlations, except that I plateaued it for r>0.82.
 
I have included sample-size limits on the iterations, to see if 
that makes any difference.
 
 
 
 
*generate the initial samples - trialn simulated trials of startn observations each;
*x is a standard normal deviate and y = rho*x + sqrt(1-rho**2)*e where e is a second independent normal deviate;
*this gives y unit variance and a population correlation of rho with x;
*rho is wrapped in parentheses because SAS evaluates -0.5**2 as -(0.5**2) so a negative rho would otherwise give sqrt(1+rho**2);
*reads macro variables rho startn and trialn and writes data set dat1 with variables trial x y;
%macro data;
data dat1;
  do trial=1 to &trialn;
    do id=1 to &startn;
      x=rannor(0);
      y=(&rho)*x+sqrt(1-(&rho)**2)*rannor(0);
      output;
    end;
  end;
  drop id;

%mend;
 
*simulation settings;
*trialn = number of trials for the simulation;
*startn = initial sample size in every trial;
*rho    = population correlation;
*nmax1  = limit on total sample size applied at the 1st iteration;
*nmax2  = limit on total sample size for the 2nd or later iterations (applied at the 2nd - the same line is commented out in the 3rd and 4th);
%let trialn=400;
%let startn=45;
%let rho=0.7;
%let nmax1=100;
%let nmax2=100;
*generate the initial samples into dat1;
%data;
 
*copy of the initial samples labelled for the comparison made at the end;
data dat0;
  set dat1;
  dataset="initial";
run;

*per-trial statistics for y with x written to dat - the next step relies on the row just before each CORR row holding the sample size;
proc corr data=dat0 outp=dat noprint;
  by trial;
  with y;
  var x;
run;
 
*add next lot of observations (1st iteration);
*input is dat from proc corr - its statistic column x is renamed to r so that x is free for the new data;
*output dat2 holds only the newly generated observations;
data dat2;
set dat(rename=(x=r));
iter=1;
*lag has to run on every row before the subsetting if so that on the CORR row it returns the value of the row just before it which this program uses as the current sample size;
nprev=lag(r);
if _type_="CORR";
*sample size required at the observed magnitude of r from the fitted polynomial;
r=abs(r);
n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
*top up only when at least one more observation is called for;
if n>nprev+0.5 then do;
  nnew=round(n-nprev);
  *keep the total within the 1st-iteration limit nmax1;
  if nnew+nprev>&nmax1 then nnew=&nmax1-nprev;
  do i=1 to nnew;
      x=rannor(0);
      *rho is parenthesised because SAS evaluates -0.5**2 as -(0.5**2) which would corrupt the variance term for a negative rho;
      y=(&rho)*x+sqrt(1-(&rho)**2)*rannor(0);
      output;
      end;
  end;
drop i;
 
 
*2nd iteration;

*append the observations added in the previous iteration and regroup by trial;
data dat1;
set dat1 dat2;

proc sort;
by trial;

*recompute the per-trial statistics on the enlarged samples;
proc corr noprint outp=dat;
var x;
with y;
by trial;

*add next lot of observations;
*output dat2 holds only the newly generated observations;
data dat2;
set dat(rename=(x=r));
iter=2;
*lag has to run on every row before the subsetting if so that on the CORR row it returns the value of the row just before it which this program uses as the current sample size;
nprev=lag(r);
if _type_="CORR";
*sample size required at the observed magnitude of r from the fitted polynomial;
r=abs(r);
n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
*top up only when at least one more observation is called for;
if n>nprev+0.5 then do;
  nnew=round(n-nprev);
  *keep the total within the limit nmax2;
  if nnew+nprev>&nmax2 then nnew=&nmax2-nprev;
  do i=1 to nnew;
      x=rannor(0);
      *rho is parenthesised because SAS evaluates -0.5**2 as -(0.5**2) which would corrupt the variance term for a negative rho;
      y=(&rho)*x+sqrt(1-(&rho)**2)*rannor(0);
      output;
      end;
  end;
drop i;
 
*3rd iteration;

*append the observations added in the previous iteration and regroup by trial;
data dat1;
set dat1 dat2;

proc sort;
by trial;

*recompute the per-trial statistics on the enlarged samples;
proc corr noprint outp=dat;
var x;
with y;
by trial;

*add next lot of observations;
*output dat2 holds only the newly generated observations;
data dat2;
set dat(rename=(x=r));
iter=3;
*lag has to run on every row before the subsetting if so that on the CORR row it returns the value of the row just before it which this program uses as the current sample size;
nprev=lag(r);
if _type_="CORR";
*sample size required at the observed magnitude of r from the fitted polynomial;
r=abs(r);
n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
*top up only when at least one more observation is called for;
if n>nprev+0.5 then do;
  nnew=round(n-nprev);
  *NOTE(review) - the size limit is commented out in this iteration so the total can grow past nmax2 - confirm this is intended;
*  if nnew+nprev>&nmax2 then nnew=&nmax2-nprev;
  do i=1 to nnew;
      x=rannor(0);
      *rho is parenthesised because SAS evaluates -0.5**2 as -(0.5**2) which would corrupt the variance term for a negative rho;
      y=(&rho)*x+sqrt(1-(&rho)**2)*rannor(0);
      output;
      end;
  end;
drop i;
 
 
*4th iteration;

*append the observations added in the previous iteration and regroup by trial;
data dat1;
set dat1 dat2;

proc sort;
by trial;

*recompute the per-trial statistics on the enlarged samples;
proc corr noprint outp=dat;
var x;
with y;
by trial;

*add next lot of observations;
*output dat2 holds only the newly generated observations;
data dat2;
set dat(rename=(x=r));
iter=4;
*lag has to run on every row before the subsetting if so that on the CORR row it returns the value of the row just before it which this program uses as the current sample size;
nprev=lag(r);
if _type_="CORR";
*sample size required at the observed magnitude of r from the fitted polynomial;
r=abs(r);
n = 3.586814E+2*r**4 -7.362850E+2*r**2 + 3.800062E+2;
*top up only when at least one more observation is called for;
if n>nprev+0.5 then do;
  nnew=round(n-nprev);
  *NOTE(review) - the size limit is commented out in this iteration so the total can grow past nmax2 - confirm this is intended;
*  if nnew+nprev>&nmax2 then nnew=&nmax2-nprev;
  do i=1 to nnew;
      x=rannor(0);
      *rho is parenthesised because SAS evaluates -0.5**2 as -(0.5**2) which would corrupt the variance term for a negative rho;
      y=(&rho)*x+sqrt(1-(&rho)**2)*rannor(0);
      output;
      end;
  end;
drop i;
 
 
 
*output results;
*append the last lot of added observations and label the result as the final sample;
data dat1;
set dat1 dat2;
dataset="final";

*stack initial and final samples so that both are analysed in one pass;
data datboth;
set dat0 dat1;

proc sort;
by dataset trial;

proc corr noprint outp=dat;
var x;
with y;
by dataset trial;

*one row per dataset and trial - r is the sample correlation and z its Fisher z transform;
*nfinal is the value on the row just before the CORR row which this program uses as the sample size;
data dat2;
set dat(rename=(x=r));
nfinal=lag(r);
if _type_="CORR";
z=0.5*log((1+r)/(1-r));

*count the trials and average z and the sample size within each dataset;
proc means noprint;
var z nfinal;
by dataset;
output n=n_trials mean=z n_final;

*back-transform the mean z to a correlation;
data;
set;
mean_r=(exp(2*z)-1)/(exp(2*z)+1);

proc print noobs;
var dataset n_trials n_final mean_r;
format n_final 4. mean_r 6.3;
title "Mean final sample sizes and correlations (via z transform)";
*the title reports nmax1 and nmax2 because no macro variable called nmax is defined in this program;
title2 "for startn=&startn rho=&rho  nmax1=&nmax1 nmax2=&nmax2";
 
*sampling distribution of the final sample correlation and statistics for the final sample size;
*input is dat2 with one row per dataset and trial carrying z and nfinal;
data datfinal;
set dat2;
if dataset="final";
sampler=(exp(2*z)-1)/(exp(2*z)+1); *restores the raw r;

*mean and 2.5th 50th and 97.5th percentiles of the final sample correlation;
proc univariate noprint;
var sampler ;
output mean=mean pctlpre=Q pctlpts=2.5 50 97.5;

proc print noobs;
format _numeric_ 5.2;
*the titles report nmax1 and nmax2 because no macro variable called nmax is defined in this program;
title "Sampling distn for r  rho=&rho n=&startn nmax1=&nmax1 nmax2=&nmax2";
title2 "for final sample correlation";

proc means n mean std min max maxdec=0 data=datfinal;
var nfinal;
title "Stats for final sample size  r=&rho n=&startn nmax1=&nmax1 nmax2=&nmax2";
 
*mean of nnew for every combination of trial and iteration in the final data;
proc sort data=dat1;
  by trial iter;
run;

proc means noprint;
  by trial iter;
  var nnew;
  output mean=;
run;

*summarise those per-trial values across trials within each iteration;
proc sort;
  by iter;
run;

proc means noprint;
  by iter;
  var nnew;
  output n=n mean= std=std min=min max=max;
run;

*drop the group whose iter is missing or zero;
data;
  set;
  if iter;
run;

title "Number of extra observations at each iteration";
title2 "for rho=&rho startn=&startn nmax1=&nmax1 nmax2=&nmax2";
proc print noobs;
  format _numeric_ 5.0;
  var iter n nnew std min max;
run;
 
 
 
************;
 
*Now try statistical significance as the stopping rule;
 
*generate the initial samples - trialn simulated trials of startn observations each;
*x is a standard normal deviate and y = rho*x + sqrt(1-rho**2)*e where e is a second independent normal deviate;
*this gives y unit variance and a population correlation of rho with x;
*rho is wrapped in parentheses because SAS evaluates -0.5**2 as -(0.5**2) so a negative rho would otherwise give sqrt(1+rho**2);
*reads macro variables rho startn and trialn and writes data set dat1 with variables trial x y;
%macro data;
data dat1;
  do trial=1 to &trialn;
    do id=1 to &startn;
      x=rannor(0);
      y=(&rho)*x+sqrt(1-(&rho)**2)*rannor(0);
      output;
    end;
  end;
  drop id;

%mend;
 
*simulation settings;
*trialn = number of trials for the simulation;
*startn = initial sample size in every trial;
*rho    = population correlation;
%let trialn=300;
%let startn=45;
%let rho=0.3;
*generate the initial samples into dat1;
%data;
 
*copy of the initial samples labelled for the comparison made at the end;
data dat0;
  set dat1;
  dataset="initial";
run;

*per-trial statistics for y with x written to dat - the next step relies on the row just before each CORR row holding the sample size;
proc corr data=dat0 outp=dat noprint;
  by trial;
  with y;
  var x;
run;
 
 
*add next lot of observations (1st iteration of the statistical-significance rule);
*input is dat from proc corr - its statistic column x is renamed to r so that x is free for the new data;
*output dat2 holds only the newly generated observations;
data dat2;
set dat(rename=(x=r));
*lag has to run on every row before the subsetting if so that on the CORR row it returns the value of the row just before it which this program uses as the current sample size;
nprev=lag(r);
if _type_="CORR";
*from here on r holds the Fisher z transform of the sample correlation;
r=0.5*log((1+r)/(1-r));
*keep sampling while z is below 1.96 standard errors where the standard error is 1/sqrt(nprev-3);
*NOTE(review) - the test is one-sided so a negative z never counts as significant and always triggers more sampling - confirm this is intended;
if r<1.96/sqrt(nprev-3) then do;
  *sample size at which the observed z would just reach 1.96 standard errors;
  n=3+(1.96/r)**2;
  nnew=round(n-nprev);
  *never add more than 800 observations in one iteration;
  if nnew>800 then nnew=800;
  do i=1 to nnew;
      x=rannor(0);
      *rho is parenthesised because SAS evaluates -0.5**2 as -(0.5**2) which would corrupt the variance term for a negative rho;
      y=(&rho)*x+sqrt(1-(&rho)**2)*rannor(0);
      output;
      end;
  end;
drop i;
 
 
*2nd iteration;

*append the observations added in the previous iteration and regroup by trial;
data dat1;
set dat1 dat2;

proc sort;
by trial;

*recompute the per-trial statistics on the enlarged samples;
proc corr noprint outp=dat;
var x;
with y;
by trial;

*add next lot of observations;
*output dat2 holds only the newly generated observations;
data dat2;
set dat(rename=(x=r));
*lag has to run on every row before the subsetting if so that on the CORR row it returns the value of the row just before it which this program uses as the current sample size;
nprev=lag(r);
if _type_="CORR";
*from here on r holds the Fisher z transform of the sample correlation;
r=0.5*log((1+r)/(1-r));
*keep sampling while z is below 1.96 standard errors where the standard error is 1/sqrt(nprev-3);
*NOTE(review) - the test is one-sided so a negative z never counts as significant and always triggers more sampling - confirm this is intended;
if r<1.96/sqrt(nprev-3) then do;
  *sample size at which the observed z would just reach 1.96 standard errors;
  n=3+(1.96/r)**2;
  nnew=round(n-nprev);
  *never add more than 800 observations in one iteration;
  if nnew>800 then nnew=800;
  do i=1 to nnew;
      x=rannor(0);
      *rho is parenthesised because SAS evaluates -0.5**2 as -(0.5**2) which would corrupt the variance term for a negative rho;
      y=(&rho)*x+sqrt(1-(&rho)**2)*rannor(0);
      output;
      end;
  end;
drop i;
 
*3rd iteration;

*append the observations added in the previous iteration and regroup by trial;
data dat1;
set dat1 dat2;

proc sort;
by trial;

*recompute the per-trial statistics on the enlarged samples;
proc corr noprint outp=dat;
var x;
with y;
by trial;

*add next lot of observations;
*output dat2 holds only the newly generated observations;
data dat2;
set dat(rename=(x=r));
*lag has to run on every row before the subsetting if so that on the CORR row it returns the value of the row just before it which this program uses as the current sample size;
nprev=lag(r);
if _type_="CORR";
*from here on r holds the Fisher z transform of the sample correlation;
r=0.5*log((1+r)/(1-r));
*keep sampling while z is below 1.96 standard errors where the standard error is 1/sqrt(nprev-3);
*NOTE(review) - the test is one-sided so a negative z never counts as significant and always triggers more sampling - confirm this is intended;
if r<1.96/sqrt(nprev-3) then do;
  *sample size at which the observed z would just reach 1.96 standard errors;
  n=3+(1.96/r)**2;
  nnew=round(n-nprev);
  *never add more than 800 observations in one iteration;
  if nnew>800 then nnew=800;
  do i=1 to nnew;
      x=rannor(0);
      *rho is parenthesised because SAS evaluates -0.5**2 as -(0.5**2) which would corrupt the variance term for a negative rho;
      y=(&rho)*x+sqrt(1-(&rho)**2)*rannor(0);
      output;
      end;
  end;
drop i;
 
 
*output results;
*append the last lot of added observations and label everything as the final sample;
data dat1;
  set dat1 dat2;
  dataset="final";
run;

*stack initial and final samples so that both are analysed in one pass;
data;
  set dat0 dat1;
run;

proc sort;
  by dataset trial;
run;

proc corr outp=dat noprint;
  by dataset trial;
  with y;
  var x;
run;

*one row per dataset and trial - r is the sample correlation and z its Fisher z transform;
*nfinal is the value on the row just before the CORR row which this program uses as the sample size;
data;
  set dat(rename=(x=r));
  nfinal=lag(r);
  if _type_="CORR";
  z=0.5*log((1+r)/(1-r));
run;

*count the trials and average z and the sample size within each dataset;
proc means noprint;
  by dataset;
  var z nfinal;
  output n=n_trials mean=z n_final;
run;

*back-transform the mean z to a correlation;
data;
  set;
  mean_r=(exp(2*z)-1)/(exp(2*z)+1);
run;

title "Mean final sample sizes and correlations (via z transform)";
title2 "For stat sig method. r=&rho startn=&startn";
proc print noobs;
  var dataset n_trials n_final mean_r;
run;


Go to: Previous · Contents · Search · Home
resources=AT=sportsci.org · webmaster=AT=sportsci.org · Sportsci Homepage · Copyright ©1997
Last updated 1 June 97