s-lang/modules/stats.sl

import ("stats");

% This file contains the following public functions:
%
%   ks_test          One sample Kolmogorov test
%   ad_test          Anderson-Darling test
%   ks_test2         Two sample Smirnov test
%   mw_test	     Two sample Mann-Whitney-Wilcoxon test
%   chisqr_test	     Chisqr-test
%   t_test           Student t test
%   t_test2          Two-sample Student t test
%   welch_t_test
%   spearman_r       Two-sample Spearman rank test
%   kendall_tau      Kendall tau correlation test
%   mann_kendall     Mann-Kendall trend test
%   pearson_r        Pearson's r correlation test
%   correlation      2 sample correlation
%   z_test
%   f_test2          2 sample F test
%   skewness
%   kurtosis
%
autoload ("ad_ktest", "statslib/ad_test");
autoload ("ad_test", "statslib/ad_test");
autoload ("ks_test", "statslib/ks_test");
autoload ("ks_test2", "statslib/ks_test");
autoload ("kuiper_test", "statslib/kuiper");
autoload ("kuiper_test2", "statslib/kuiper");

define normal_cdf ()
{
   variable m, s, a;
   variable nargs = _NARGS;

   switch (nargs)
     {
      case 1:
	m = NULL, s = NULL;
     }
     {
      case 3:
	(m, s) = ();
     }
     {
	_pop_n (nargs);
	usage ("cdf = normal_cdf (A [, mean, stddev])");
     }
   a = ();

   if (nargs != 1)
     a = (a-m)/double(s);

   if (typeof (a) == Array_Type)
     return array_map (Double_Type, &_normal_cdf, a);

   return _normal_cdf (a);
}

define poisson_cdf ()
{
   variable lam, n;
   if (_NARGS != 2)
     {
	_pop_n (_NARGS);
	usage ("cdf = poisson_cdf (lambda, n)");
     }
   (lam, n) = ();

   if ((typeof (n) == Array_Type) or (typeof (lam) == Array_Type))
     return array_map (Double_Type, &_poisson_cdf, lam, n);

   return _poisson_cdf (lam, n);
}

define sample_mean ()
{
   variable args = __pop_args (_NARGS);
   return mean (__push_args(args));
}

% These functions return the biased stddev
define sample_stddev ()
{
   variable x = ();
   variable n = 1.0*length (x);
   return stddev(x) * sqrt((n-1.0)/n);
}

private define get_mean_stddev (x)
{
   variable m = mean(x);
   variable n = 1.0*length (x);
   variable s = stddev(x) * sqrt((n-1.0)/n);
   return m, s, n;
}

define skewness ()
{
   if (_NARGS != 1)
     usage ("s = %s(A);", _function_name ());
   variable x = ();
   variable m, s, n;
   (m, s, n) = get_mean_stddev (x);

   x = sum (((x - m)/s)^3)/n;

   if ((s == 0.0) && isnan (x))
     x = 0.0;

   return x;
}

define kurtosis ()
{
   if (_NARGS != 1)
     usage ("s = %s(A);", _function_name ());
   variable x = ();
   variable m, s, n;
   (m, s, n) = get_mean_stddev (x);

   x = sum (((x - m)/s)^4)/n - 3.0;

   if ((s == 0.0) && isnan (x))
     x = 0.0;

   return x;
}

define covariance ()
{
   variable n = _NARGS;
   if (n == 0)
     usage ("Sigma = covariance (X1, X2, ..., Xn [;qualifiers])\n" +
	    "Qualifiers:\n" +
	    " mu=[mu1,mu2,..,muN]  (expected values E(Xi))"
	   );

   variable Xs = __pop_list (n);
   variable i, m = length (Xs[0]);
   _for i (0, n-1, 1)
     {
	if (length (Xs[i]) != m)
	  throw InvalidParmError, "Arrays must be of the same size";
     }
   variable mus = qualifier ("mu");
   variable norm = 1.0;
   if (mus == NULL)
     {
	mus = Double_Type[n];
	_for i (0, n-1, 1)
	  mus[i] = mean (Xs[i]);
	norm = m/(m-1.0);
     }
   if (length (mus) != n)
     throw InvalidParmError, "The value mu qualifier has the wrong length";

   variable cov = Double_Type[n,n];
   _for i (0, n-1, 1)
     {
	variable j;
	variable dx_i = Xs[i]-mus[i];
	_for j (i, n-1, 1)
	  {
	     variable c = norm * mean (dx_i*(Xs[j] - mus[j]));
	     cov[i,j] = c;
	     cov[j,i] = c;
	  }
     }
   return cov;
}

% This function assumes the distribution is symmetric
private define map_cdf_to_pval (cdf)
{
   variable side = qualifier ("side", NULL);

   variable pval = cdf;		       %  side="<"
   if (side == ">")
     pval = 1.0 - cdf;
   else if (side != "<")	       %  double-sided
     pval = 2.0 * _min (1.0-pval, pval);

   return pval;
}

define chisqr_test ()
{
   variable t_ref = NULL;
   variable nr = _NARGS;
   if (nr > 1)
     {
	t_ref = ();
	if (typeof (t_ref) == Ref_Type)
	  nr--;
	else
	  {
	     t_ref;		       %  push it back
	     t_ref = NULL;
	  }
     }

   if (nr < 2)
     {
	usage ("p=%s(X,Y,...,Z [,&T])", _function_name);
     }
   variable args = __pop_args (nr);
   variable datasets = Array_Type[nr];
   variable nc = length (args[0].value);
   variable c = Double_Type[nc];

   _for (0, nr-1, 1)
     {
	variable i = ();
	variable d = args[i].value;
	if (length (d) != nc)
	  verror ("The chisqr test requires datasets to be of the same length");
	datasets[i] = d;
	c += d;
     }
   variable N = sum (c);
   variable t = 0.0;
   _for (0, nr-1, 1)
     {
        i = ();
	d = datasets[i];
	variable e = sum (d)/N * c;
	t += sum((d-e)^2/e);
     }

   if (t_ref != NULL)
     @t_ref = t;

   return 1.0 - chisqr_cdf ((nr-1)*(nc-1), t);
}

% Usage: r = compute_rank (X, [&tie_fun [,&tied_groups]])
% Here, if tied_groups is non-NULL, it will be an array whose length
% represents the number of tied groups, and each element being the number
% within the kth group.
private define compute_rank ()
{
   variable x, tie_fun = &mean, group_ties_ref = NULL;
   if (_NARGS == 3)
     group_ties_ref = ();
   if (_NARGS >= 2)
     tie_fun = ();
   x = ();
   if (tie_fun == NULL)
     tie_fun = &mean;

   variable indx = array_sort (x);
   x = x[indx];
   variable n = length (x);
   variable r = double([1:n]);

   % Worry about ties
   variable ties;
   () = wherediff (x, &ties);
   % Here, ties is an array of indices {j} where x[j-1]==x[j].
   % We want those where x[j] == x[j+1].
   ties -= 1;

   variable m = length (ties);
   variable group_ties = Int_Type[0];
   if (m)
     {
	variable i = 0;
	variable g = 0;
	group_ties = Int_Type[m];
	while (i < m)
	  {
	     variable ties_i = ties[i];
	     variable j = i;
	     j++;
	     variable dties = ties_i - i;
	     while ((j < m) && (dties + j == ties[j]))
	       j++;

	     variable dn = j - i;
	     i = [ties_i:ties_i+dn];
	     r[i] = (@tie_fun)(r[i]);
	     group_ties[g] = dn+1;
	     i = j;
	     g++;
	  }
	group_ties = group_ties[[0:g-1]];
     }

   if (group_ties_ref != NULL)
     @group_ties_ref = group_ties;

   % Now put r back in the order of x before it was sorted.
   return r[array_sort(indx)];
}

% Min sum:  1+2+...+n = n*(n+1)/2
% Max sum:  (m+1) + (m+2) + ... (m+n) = n*m + n*(n+1)/2
% Average: (n*(n+1) + n*m)/2 = n*(n+m+1)/2
define mw_test ()
{
   variable w_ref = NULL;
   if (_NARGS == 3)
     w_ref = ();
   else if (_NARGS != 2)
     {
	usage ("p = %s (X1, Y1 [,&w]);  %% Two-Sample Mann-Whitney",
	       _function_name ());
     }
   variable x, y;
   (x, y) = ();
   variable side = qualifier ("side", NULL);

   variable n = length (x), m = length (y);
   variable N = m+n;
   variable mn = m*n;

   variable gties;
   variable r = compute_rank ([x,y], &mean, &gties);
   variable w = sum (r[[0:n-1]]);

   variable has_ties = length (gties);
#iffalse
   if (has_ties)
     vmessage ("*** Warning: mw_test: ties found--- using asymptotic cdf");
#endif

   variable p;

   if (has_ties || ((m > 50) && (n > 50)))
     {
	% Asymptotic
	variable wstar = w - 0.5*n*(N+1);
	variable vw = (mn/12.0)*(N+1 - sum((gties-1)*gties*(gties+1))/(N*(N-1)));

	p = normal_cdf (wstar/sqrt(vw));

	if (side == ">")
	  p = 1.0 - p;
	else if (side != "<")
	  p = 2 * _min (p, 1.0-p);
     }
   else
     {
	% exact
	if (side == ">")
	  p = 1.0 - mann_whitney_cdf (n, m, w);
	else if (side == "<")
	  p = mann_whitney_cdf (n, m, w);
	else
	  {
	     p = mann_whitney_cdf (n, m, w);
	     p = 2 * _min (p, 1-p);
	  }
     }

   if (w_ref != NULL)
     @w_ref = w;

   return p;
}

define t_test ()
{
   variable x, mu;
   variable tref = NULL;

   if (_NARGS == 2)
     (x,mu) = ();
   else if (_NARGS == 3)
     (x,mu,tref) = ();
   else
     {
	usage ("p = t_test (X, mu [,&t] [; qualifiers]);  %% Student's t-test\n"
	       + "Qualifiers:\n"
	       + " side=\"<\" | \">\""
	      );
     }

   variable n = length (x);
   variable stat = sqrt(n)*((mean(x) - mu)/stddev(x));
   if (tref != NULL) @tref = stat;

   return map_cdf_to_pval (student_t_cdf(stat, n-1) ;; __qualifiers);
}

define t_test2 ()
{
   variable x, y;
   variable tref = NULL;

   if (_NARGS == 2)
     (x,y) = ();
   else if (_NARGS == 3)
     (x,y,tref) = ();
   else
     {
	usage ("p = t_test2 (X, Y [,&t] [; qualifiers]);  %% Student's 2 sample (unpaired) t-test\n"
	       + "Qualifiers:\n"
	       + " side=\"<\" | \">\""
	      );
     }
   variable side = qualifier ("side", NULL);

   variable nx = length (x), mx = mean(x), sx = stddev (x);
   variable ny = length (y), my = mean(y), sy = stddev (y);
   variable df = nx+ny-2;
   variable stat
     = (mx-my)/sqrt((((nx-1)*sx*sx+(ny-1)*sy*sy)*(nx+ny))/(nx*ny*df));

   if (tref != NULL) @tref = stat;

   return map_cdf_to_pval (student_t_cdf(stat, df) ;; __qualifiers);
}

define welch_t_test ()
{
   variable x, y;
   variable tref = NULL;

   if (_NARGS == 2)
     (x,y) = ();
   else if (_NARGS == 3)
     (x,y,tref) = ();
   else
     {
	usage ("p = welch_t_test2 (X, Y [,&t] [; qualifiers]);  %% Welch's 2 sample t-test\n"
	       + "Qualifiers:\n"
	       + " side=\"<\" | \">\""
	      );
     }
   variable side = qualifier ("side", NULL);

   variable nx = length (x), mx = mean(x), sx = stddev (x), vx = sx*sx/nx;
   variable ny = length (y), my = mean(y), sy = stddev (y), vy = sy*sy/ny;
   variable vxvy = vx+vy;
   variable stat = (mx-my)/sqrt(vxvy);
   variable df = (vxvy*vxvy)/((vx*vx)/(nx-1) + (vy*vy)/(ny-1));

   if (tref != NULL) @tref = stat;

   return map_cdf_to_pval (student_t_cdf(stat, df) ;; __qualifiers);
}

define z_test ()
{
   variable x, mu, sigma;
   variable tref = NULL;

   if (_NARGS == 4)
     tref = ();
   else if (_NARGS != 3)
     {
	usage ("p = z_test (X, mu, sigma [,&stat] [; qualifiers]);\n"
	       + "Qualifiers:\n"
	       + " side=\"<\" | \">\""
	      );
     }
   (x, mu, sigma) = ();
   variable side = qualifier ("side", NULL);

   variable n = length (x);
   variable stat = (mean(x)-mu)/(sigma/sqrt(n));
   if (tref != NULL) @tref = stat;

   return map_cdf_to_pval (normal_cdf(stat) ;; __qualifiers);
}

define f_test2 ()
{
   variable x, y;
   variable tref = NULL;

   if (_NARGS == 2)
     (x,y) = ();
   else if (_NARGS == 3)
     (x,y,tref) = ();
   else
     {
	usage ("p = f_test2 (X, Y [,&t] [; qualifiers]);  %% 2 sample F-test\n"
	       + "Qualifiers:\n"
	       + " side=\"<\" | \">\""
	      );
     }
   variable side = qualifier ("side", NULL);

   variable v1 = stddev(x)^2;
   variable v2 = stddev(y)^2;
   variable n1 = length(x)-1;
   variable n2 = length(y)-1;
   variable swap = 0;
   if (v1 < v2)
     {
	swap = 1;
	(v1, v2) = (v2, v1);
	(n1, n2) = (n2, n1);
     }
   variable stat = (v1/v2);

   variable pval = f_cdf (stat, n1, n2);
   if (side == ">")
     {
	if (swap)
	  pval = 1.0 - pval;
     }
   else if (side == "<")
     {
	ifnot (swap)
	  pval = 1.0 - pval;
     }
   else
     pval = 2.0 * _min (1.0-pval, pval);

   if (tref != NULL) @tref = stat;
   return pval;
}

define spearman_r ()
{
   variable w_ref = NULL;
   if (_NARGS == 3)
     w_ref = ();
   else if (_NARGS != 2)
     {
	usage ("p = %s (X1, Y1 [,&r]);  %% Spearman's rank correlation",
	       _function_name ());
     }
   variable x, y;
   (x, y) = ();
   variable n = length (y), m = length (x);

   variable gties_x, gties_y;
   variable rx = compute_rank (x, &mean, &gties_x);
   variable ry = compute_rank (y, &mean, &gties_y);

   variable d = sum ((rx-ry)^2);
   variable cx = sum(gties_x*(gties_x*gties_x-1.0));
   variable cy = sum(gties_y*(gties_y*gties_y-1.0));

   variable den = double(n) * (n+1.0) * (n-1.0);

   variable r = (1.0 - 6.0*(d+(cx+cy)/12.0)/den)
     / sqrt((1.0-cx/den)*(1.0-cy/den));
   if (w_ref != NULL)
     @w_ref = r;

   variable t = r * sqrt ((n-2)/(1-r*r));

   return map_cdf_to_pval (student_t_cdf(t,n-2) ;; __qualifiers);
}

% This function is assumed to always pass back a new array.
private define compute_integer_rank (x, is_sorted)
{
   variable n = length (x);
   variable indx = NULL, rev_indx = NULL;
   ifnot (is_sorted)
     {
	indx = array_sort (x);
	x = x[indx];
	% Create a reverse-permutation to restore the array order
	% upon return.
	rev_indx = [0:n-1];
	rev_indx[indx] = @rev_indx;
     }

   variable r = [1:n];

   % Account for ties
   variable ties;
   () = wherediff (x, &ties);
   % Here, ties is an array of indices {j} where x[j-1]==x[j].
   % We want those where x[j] == x[j+1].
   ties -= 1;

   variable m = length (ties);
   variable i = 0, j;
   while (i < m)
     {
	variable ties_i = ties[i];
	j = i;
	j++;
	variable dties = ties_i - i;
	while ((j < m) && (dties + j == ties[j]))
	  j++;

	variable dn = j - i;
	i = [ties_i:ties_i+dn];
	r[i] = r[ties_i];
	i = j;
     }

   if (indx == NULL)
     return r;

   return r[rev_indx];
}

define kendall_tau ()
{
   variable w_ref = NULL;
   if (_NARGS == 3)
     w_ref = ();
   else if (_NARGS != 2)
     {
	usage ("p = %s (X1, Y1 [,&r]);  %% Kendall's tau correlation",
	       _function_name ());
     }

   variable x, y;
   (x, y) = ();
   variable n = length (x);
   if (n != length (y))
     throw InvalidParmError, "Arrays must be the same length for kendall_tau";

   % _kendall_tau will modify the contents of the arrays.  Be sure to
   % pass new instances to it.  The sort operation below will achieve
   % that.
   variable i = array_sort (x);
   x = compute_integer_rank (x[i], 1);
   y = compute_integer_rank (y[i], 0);

   variable tau, z;

   (tau, z) = _kendall_tau (x, y);

   if (w_ref != NULL)
     @w_ref = tau;

   return map_cdf_to_pval (z ;; __qualifiers);
}

define mann_kendall ()
{
   variable w_ref = NULL;
   if (_NARGS == 2)
     w_ref = ();
   else if (_NARGS != 1)
     {
	usage ("p = %s (X [,&r]);  %% Mann-Kendall trend test",
	       _function_name ());
     }

   variable x;
   x = ();
   variable n = length (x);
   variable i = [0:n-1];

   % _kendall_tau will modify the contents of the arrays.  Be sure to
   % pass new instances to it.  compute_integer_rank will create a new
   % instance.
   x = compute_integer_rank (x, 0);
   variable tau, z;
   (tau, z) = _kendall_tau (i, x);

   if (w_ref != NULL)
     @w_ref = tau;

   return map_cdf_to_pval (z ;; __qualifiers);
}

define pearson_r ()
{
   variable w_ref = NULL;
   if (_NARGS == 3)
     w_ref = ();
   else if (_NARGS != 2)
     {
	usage ("p = %s (X1, Y1 [,&r] [; qualifiers]);  %% Pearson's r correlation\n", +
	       "Qualifiers:\n" +
	       " side=\"<\" | \">\"",
	       _function_name ());
     }

   variable x, y;
   (x, y) = ();
   variable n = length(x);
   % Note: covariance handles the 1/(N-1) normalization factor
   variable r = covariance (x, y)[0,1]/(stddev(x)*stddev(y));
   if (w_ref != NULL)
     @w_ref = r;

   % This is meaningful only for gaussian distributions
   variable df = length(x)-2;
   r = sqrt(df)*r/sqrt(1-r*r);
   return map_cdf_to_pval (student_t_cdf (r, df) ;; __qualifiers);
}

define correlation ()
{
   if (_NARGS != 2)
     usage ("c = correlation (X, Y);");
   variable x, y; (x,y) = ();
   variable n = length(x);
   if (n != length(y))
     throw InvalidParmError, "Arrays must be the same length";
   variable mx = mean(x), sx = stddev(x), my = mean(y), sy = stddev(y);
   return sum ((x-mx)*(y-my))/((n-1)*sx*sy);
}

provide ("stats");