701 строка
15 KiB
Plaintext
701 строка
15 KiB
Plaintext
import ("stats");
|
|
|
|
% This file contains the following public functions:
|
|
%
|
|
% ks_test One sample Kolmogorov test
|
|
% ad_test Anderson-Darling test
|
|
% ks_test2 Two sample Smirnov test
|
|
% mw_test Two sample Mann-Whitney-Wilcoxon test
|
|
% chisqr_test Chisqr-test
|
|
% t_test Student t test
|
|
% t_test2 Two-sample Student t test
|
|
% welch_t_test
|
|
% spearman_r Two-sample Spearman rank test
|
|
% kendall_tau Kendall tau correlation test
|
|
% mann_kendall Mann-Kendall trend test
|
|
% pearson_r Pearson's r correlation test
|
|
% correlation 2 sample correlation
|
|
% z_test
|
|
% f_test2 2 sample F test
|
|
% skewness
|
|
% kurtosis
|
|
%
|
|
autoload ("ad_ktest", "statslib/ad_test");
|
|
autoload ("ad_test", "statslib/ad_test");
|
|
autoload ("ks_test", "statslib/ks_test");
|
|
autoload ("ks_test2", "statslib/ks_test");
|
|
autoload ("kuiper_test", "statslib/kuiper");
|
|
autoload ("kuiper_test2", "statslib/kuiper");
|
|
|
|
define normal_cdf ()
|
|
{
|
|
variable m, s, a;
|
|
variable nargs = _NARGS;
|
|
|
|
switch (nargs)
|
|
{
|
|
case 1:
|
|
m = NULL, s = NULL;
|
|
}
|
|
{
|
|
case 3:
|
|
(m, s) = ();
|
|
}
|
|
{
|
|
_pop_n (nargs);
|
|
usage ("cdf = normal_cdf (A [, mean, stddev])");
|
|
}
|
|
a = ();
|
|
|
|
if (nargs != 1)
|
|
a = (a-m)/double(s);
|
|
|
|
if (typeof (a) == Array_Type)
|
|
return array_map (Double_Type, &_normal_cdf, a);
|
|
|
|
return _normal_cdf (a);
|
|
}
|
|
|
|
define poisson_cdf ()
|
|
{
|
|
variable lam, n;
|
|
if (_NARGS != 2)
|
|
{
|
|
_pop_n (_NARGS);
|
|
usage ("cdf = poisson_cdf (lambda, n)");
|
|
}
|
|
(lam, n) = ();
|
|
|
|
if ((typeof (n) == Array_Type) or (typeof (lam) == Array_Type))
|
|
return array_map (Double_Type, &_poisson_cdf, lam, n);
|
|
|
|
return _poisson_cdf (lam, n);
|
|
}
|
|
|
|
define sample_mean ()
|
|
{
|
|
variable args = __pop_args (_NARGS);
|
|
return mean (__push_args(args));
|
|
}
|
|
|
|
% These functions return the biased stddev
|
|
define sample_stddev ()
|
|
{
|
|
variable x = ();
|
|
variable n = 1.0*length (x);
|
|
return stddev(x) * sqrt((n-1.0)/n);
|
|
}
|
|
|
|
private define get_mean_stddev (x)
|
|
{
|
|
variable m = mean(x);
|
|
variable n = 1.0*length (x);
|
|
variable s = stddev(x) * sqrt((n-1.0)/n);
|
|
return m, s, n;
|
|
}
|
|
|
|
define skewness ()
|
|
{
|
|
if (_NARGS != 1)
|
|
usage ("s = %s(A);", _function_name ());
|
|
variable x = ();
|
|
variable m, s, n;
|
|
(m, s, n) = get_mean_stddev (x);
|
|
|
|
x = sum (((x - m)/s)^3)/n;
|
|
|
|
if ((s == 0.0) && isnan (x))
|
|
x = 0.0;
|
|
|
|
return x;
|
|
}
|
|
|
|
define kurtosis ()
|
|
{
|
|
if (_NARGS != 1)
|
|
usage ("s = %s(A);", _function_name ());
|
|
variable x = ();
|
|
variable m, s, n;
|
|
(m, s, n) = get_mean_stddev (x);
|
|
|
|
x = sum (((x - m)/s)^4)/n - 3.0;
|
|
|
|
if ((s == 0.0) && isnan (x))
|
|
x = 0.0;
|
|
|
|
return x;
|
|
}
|
|
|
|
define covariance ()
|
|
{
|
|
variable n = _NARGS;
|
|
if (n == 0)
|
|
usage ("Sigma = covariance (X1, X2, ..., Xn [;qualifiers])\n" +
|
|
"Qualifiers:\n" +
|
|
" mu=[mu1,mu2,..,muN] (expected values E(Xi))"
|
|
);
|
|
|
|
variable Xs = __pop_list (n);
|
|
variable i, m = length (Xs[0]);
|
|
_for i (0, n-1, 1)
|
|
{
|
|
if (length (Xs[i]) != m)
|
|
throw InvalidParmError, "Arrays must be of the same size";
|
|
}
|
|
variable mus = qualifier ("mu");
|
|
variable norm = 1.0;
|
|
if (mus == NULL)
|
|
{
|
|
mus = Double_Type[n];
|
|
_for i (0, n-1, 1)
|
|
mus[i] = mean (Xs[i]);
|
|
norm = m/(m-1.0);
|
|
}
|
|
if (length (mus) != n)
|
|
throw InvalidParmError, "The value mu qualifier has the wrong length";
|
|
|
|
variable cov = Double_Type[n,n];
|
|
_for i (0, n-1, 1)
|
|
{
|
|
variable j;
|
|
variable dx_i = Xs[i]-mus[i];
|
|
_for j (i, n-1, 1)
|
|
{
|
|
variable c = norm * mean (dx_i*(Xs[j] - mus[j]));
|
|
cov[i,j] = c;
|
|
cov[j,i] = c;
|
|
}
|
|
}
|
|
return cov;
|
|
}
|
|
|
|
% This function assumes the distribution is symmetric
|
|
private define map_cdf_to_pval (cdf)
|
|
{
|
|
variable side = qualifier ("side", NULL);
|
|
|
|
variable pval = cdf; % side="<"
|
|
if (side == ">")
|
|
pval = 1.0 - cdf;
|
|
else if (side != "<") % double-sided
|
|
pval = 2.0 * _min (1.0-pval, pval);
|
|
|
|
return pval;
|
|
}
|
|
|
|
define chisqr_test ()
|
|
{
|
|
variable t_ref = NULL;
|
|
variable nr = _NARGS;
|
|
if (nr > 1)
|
|
{
|
|
t_ref = ();
|
|
if (typeof (t_ref) == Ref_Type)
|
|
nr--;
|
|
else
|
|
{
|
|
t_ref; % push it back
|
|
t_ref = NULL;
|
|
}
|
|
}
|
|
|
|
if (nr < 2)
|
|
{
|
|
usage ("p=%s(X,Y,...,Z [,&T])", _function_name);
|
|
}
|
|
variable args = __pop_args (nr);
|
|
variable datasets = Array_Type[nr];
|
|
variable nc = length (args[0].value);
|
|
variable c = Double_Type[nc];
|
|
|
|
_for (0, nr-1, 1)
|
|
{
|
|
variable i = ();
|
|
variable d = args[i].value;
|
|
if (length (d) != nc)
|
|
verror ("The chisqr test requires datasets to be of the same length");
|
|
datasets[i] = d;
|
|
c += d;
|
|
}
|
|
variable N = sum (c);
|
|
variable t = 0.0;
|
|
_for (0, nr-1, 1)
|
|
{
|
|
i = ();
|
|
d = datasets[i];
|
|
variable e = sum (d)/N * c;
|
|
t += sum((d-e)^2/e);
|
|
}
|
|
|
|
if (t_ref != NULL)
|
|
@t_ref = t;
|
|
|
|
return 1.0 - chisqr_cdf ((nr-1)*(nc-1), t);
|
|
}
|
|
|
|
% Usage: r = compute_rank (X, [&tie_fun [,&tied_groups]])
|
|
% Here, if tied_groups is non-NULL, it will be an array whose length
|
|
% represents the number of tied groups, and each element being the number
|
|
% within the kth group.
|
|
private define compute_rank ()
|
|
{
|
|
variable x, tie_fun = &mean, group_ties_ref = NULL;
|
|
if (_NARGS == 3)
|
|
group_ties_ref = ();
|
|
if (_NARGS >= 2)
|
|
tie_fun = ();
|
|
x = ();
|
|
if (tie_fun == NULL)
|
|
tie_fun = &mean;
|
|
|
|
variable indx = array_sort (x);
|
|
x = x[indx];
|
|
variable n = length (x);
|
|
variable r = double([1:n]);
|
|
|
|
% Worry about ties
|
|
variable ties;
|
|
() = wherediff (x, &ties);
|
|
% Here, ties is an array of indices {j} where x[j-1]==x[j].
|
|
% We want those where x[j] == x[j+1].
|
|
ties -= 1;
|
|
|
|
variable m = length (ties);
|
|
variable group_ties = Int_Type[0];
|
|
if (m)
|
|
{
|
|
variable i = 0;
|
|
variable g = 0;
|
|
group_ties = Int_Type[m];
|
|
while (i < m)
|
|
{
|
|
variable ties_i = ties[i];
|
|
variable j = i;
|
|
j++;
|
|
variable dties = ties_i - i;
|
|
while ((j < m) && (dties + j == ties[j]))
|
|
j++;
|
|
|
|
variable dn = j - i;
|
|
i = [ties_i:ties_i+dn];
|
|
r[i] = (@tie_fun)(r[i]);
|
|
group_ties[g] = dn+1;
|
|
i = j;
|
|
g++;
|
|
}
|
|
group_ties = group_ties[[0:g-1]];
|
|
}
|
|
|
|
if (group_ties_ref != NULL)
|
|
@group_ties_ref = group_ties;
|
|
|
|
% Now put r back in the order of x before it was sorted.
|
|
return r[array_sort(indx)];
|
|
}
|
|
|
|
% Min sum: 1+2+...+n = n*(n+1)/2
|
|
% Max sum: (m+1) + (m+2) + ... (m+n) = n*m + n*(n+1)/2
|
|
% Average: (n*(n+1) + n*m)/2 = n*(n+m+1)/2
|
|
define mw_test ()
|
|
{
|
|
variable w_ref = NULL;
|
|
if (_NARGS == 3)
|
|
w_ref = ();
|
|
else if (_NARGS != 2)
|
|
{
|
|
usage ("p = %s (X1, Y1 [,&w]); %% Two-Sample Mann-Whitney",
|
|
_function_name ());
|
|
}
|
|
variable x, y;
|
|
(x, y) = ();
|
|
variable side = qualifier ("side", NULL);
|
|
|
|
variable n = length (x), m = length (y);
|
|
variable N = m+n;
|
|
variable mn = m*n;
|
|
|
|
variable gties;
|
|
variable r = compute_rank ([x,y], &mean, >ies);
|
|
variable w = sum (r[[0:n-1]]);
|
|
|
|
variable has_ties = length (gties);
|
|
#iffalse
|
|
if (has_ties)
|
|
vmessage ("*** Warning: mw_test: ties found--- using asymptotic cdf");
|
|
#endif
|
|
|
|
variable p;
|
|
|
|
if (has_ties || ((m > 50) && (n > 50)))
|
|
{
|
|
% Asymptotic
|
|
variable wstar = w - 0.5*n*(N+1);
|
|
variable vw = (mn/12.0)*(N+1 - sum((gties-1)*gties*(gties+1))/(N*(N-1)));
|
|
|
|
p = normal_cdf (wstar/sqrt(vw));
|
|
|
|
if (side == ">")
|
|
p = 1.0 - p;
|
|
else if (side != "<")
|
|
p = 2 * _min (p, 1.0-p);
|
|
}
|
|
else
|
|
{
|
|
% exact
|
|
if (side == ">")
|
|
p = 1.0 - mann_whitney_cdf (n, m, w);
|
|
else if (side == "<")
|
|
p = mann_whitney_cdf (n, m, w);
|
|
else
|
|
{
|
|
p = mann_whitney_cdf (n, m, w);
|
|
p = 2 * _min (p, 1-p);
|
|
}
|
|
}
|
|
|
|
if (w_ref != NULL)
|
|
@w_ref = w;
|
|
|
|
return p;
|
|
}
|
|
|
|
define t_test ()
|
|
{
|
|
variable x, mu;
|
|
variable tref = NULL;
|
|
|
|
if (_NARGS == 2)
|
|
(x,mu) = ();
|
|
else if (_NARGS == 3)
|
|
(x,mu,tref) = ();
|
|
else
|
|
{
|
|
usage ("p = t_test (X, mu [,&t] [; qualifiers]); %% Student's t-test\n"
|
|
+ "Qualifiers:\n"
|
|
+ " side=\"<\" | \">\""
|
|
);
|
|
}
|
|
|
|
variable n = length (x);
|
|
variable stat = sqrt(n)*((mean(x) - mu)/stddev(x));
|
|
if (tref != NULL) @tref = stat;
|
|
|
|
return map_cdf_to_pval (student_t_cdf(stat, n-1) ;; __qualifiers);
|
|
}
|
|
|
|
define t_test2 ()
|
|
{
|
|
variable x, y;
|
|
variable tref = NULL;
|
|
|
|
if (_NARGS == 2)
|
|
(x,y) = ();
|
|
else if (_NARGS == 3)
|
|
(x,y,tref) = ();
|
|
else
|
|
{
|
|
usage ("p = t_test2 (X, Y [,&t] [; qualifiers]); %% Student's 2 sample (unpaired) t-test\n"
|
|
+ "Qualifiers:\n"
|
|
+ " side=\"<\" | \">\""
|
|
);
|
|
}
|
|
variable side = qualifier ("side", NULL);
|
|
|
|
variable nx = length (x), mx = mean(x), sx = stddev (x);
|
|
variable ny = length (y), my = mean(y), sy = stddev (y);
|
|
variable df = nx+ny-2;
|
|
variable stat
|
|
= (mx-my)/sqrt((((nx-1)*sx*sx+(ny-1)*sy*sy)*(nx+ny))/(nx*ny*df));
|
|
|
|
if (tref != NULL) @tref = stat;
|
|
|
|
return map_cdf_to_pval (student_t_cdf(stat, df) ;; __qualifiers);
|
|
}
|
|
|
|
define welch_t_test ()
|
|
{
|
|
variable x, y;
|
|
variable tref = NULL;
|
|
|
|
if (_NARGS == 2)
|
|
(x,y) = ();
|
|
else if (_NARGS == 3)
|
|
(x,y,tref) = ();
|
|
else
|
|
{
|
|
usage ("p = welch_t_test2 (X, Y [,&t] [; qualifiers]); %% Welch's 2 sample t-test\n"
|
|
+ "Qualifiers:\n"
|
|
+ " side=\"<\" | \">\""
|
|
);
|
|
}
|
|
variable side = qualifier ("side", NULL);
|
|
|
|
variable nx = length (x), mx = mean(x), sx = stddev (x), vx = sx*sx/nx;
|
|
variable ny = length (y), my = mean(y), sy = stddev (y), vy = sy*sy/ny;
|
|
variable vxvy = vx+vy;
|
|
variable stat = (mx-my)/sqrt(vxvy);
|
|
variable df = (vxvy*vxvy)/((vx*vx)/(nx-1) + (vy*vy)/(ny-1));
|
|
|
|
if (tref != NULL) @tref = stat;
|
|
|
|
return map_cdf_to_pval (student_t_cdf(stat, df) ;; __qualifiers);
|
|
}
|
|
|
|
define z_test ()
|
|
{
|
|
variable x, mu, sigma;
|
|
variable tref = NULL;
|
|
|
|
if (_NARGS == 4)
|
|
tref = ();
|
|
else if (_NARGS != 3)
|
|
{
|
|
usage ("p = z_test (X, mu, sigma [,&stat] [; qualifiers]);\n"
|
|
+ "Qualifiers:\n"
|
|
+ " side=\"<\" | \">\""
|
|
);
|
|
}
|
|
(x, mu, sigma) = ();
|
|
variable side = qualifier ("side", NULL);
|
|
|
|
variable n = length (x);
|
|
variable stat = (mean(x)-mu)/(sigma/sqrt(n));
|
|
if (tref != NULL) @tref = stat;
|
|
|
|
return map_cdf_to_pval (normal_cdf(stat) ;; __qualifiers);
|
|
}
|
|
|
|
define f_test2 ()
|
|
{
|
|
variable x, y;
|
|
variable tref = NULL;
|
|
|
|
if (_NARGS == 2)
|
|
(x,y) = ();
|
|
else if (_NARGS == 3)
|
|
(x,y,tref) = ();
|
|
else
|
|
{
|
|
usage ("p = f_test2 (X, Y [,&t] [; qualifiers]); %% 2 sample F-test\n"
|
|
+ "Qualifiers:\n"
|
|
+ " side=\"<\" | \">\""
|
|
);
|
|
}
|
|
variable side = qualifier ("side", NULL);
|
|
|
|
variable v1 = stddev(x)^2;
|
|
variable v2 = stddev(y)^2;
|
|
variable n1 = length(x)-1;
|
|
variable n2 = length(y)-1;
|
|
variable swap = 0;
|
|
if (v1 < v2)
|
|
{
|
|
swap = 1;
|
|
(v1, v2) = (v2, v1);
|
|
(n1, n2) = (n2, n1);
|
|
}
|
|
variable stat = (v1/v2);
|
|
|
|
variable pval = f_cdf (stat, n1, n2);
|
|
if (side == ">")
|
|
{
|
|
if (swap)
|
|
pval = 1.0 - pval;
|
|
}
|
|
else if (side == "<")
|
|
{
|
|
ifnot (swap)
|
|
pval = 1.0 - pval;
|
|
}
|
|
else
|
|
pval = 2.0 * _min (1.0-pval, pval);
|
|
|
|
if (tref != NULL) @tref = stat;
|
|
return pval;
|
|
}
|
|
|
|
define spearman_r ()
|
|
{
|
|
variable w_ref = NULL;
|
|
if (_NARGS == 3)
|
|
w_ref = ();
|
|
else if (_NARGS != 2)
|
|
{
|
|
usage ("p = %s (X1, Y1 [,&r]); %% Spearman's rank correlation",
|
|
_function_name ());
|
|
}
|
|
variable x, y;
|
|
(x, y) = ();
|
|
variable n = length (y), m = length (x);
|
|
|
|
variable gties_x, gties_y;
|
|
variable rx = compute_rank (x, &mean, >ies_x);
|
|
variable ry = compute_rank (y, &mean, >ies_y);
|
|
|
|
variable d = sum ((rx-ry)^2);
|
|
variable cx = sum(gties_x*(gties_x*gties_x-1.0));
|
|
variable cy = sum(gties_y*(gties_y*gties_y-1.0));
|
|
|
|
variable den = double(n) * (n+1.0) * (n-1.0);
|
|
|
|
variable r = (1.0 - 6.0*(d+(cx+cy)/12.0)/den)
|
|
/ sqrt((1.0-cx/den)*(1.0-cy/den));
|
|
if (w_ref != NULL)
|
|
@w_ref = r;
|
|
|
|
variable t = r * sqrt ((n-2)/(1-r*r));
|
|
|
|
return map_cdf_to_pval (student_t_cdf(t,n-2) ;; __qualifiers);
|
|
}
|
|
|
|
% This function is assumed to always pass back a new array.
|
|
private define compute_integer_rank (x, is_sorted)
|
|
{
|
|
variable n = length (x);
|
|
variable indx = NULL, rev_indx = NULL;
|
|
ifnot (is_sorted)
|
|
{
|
|
indx = array_sort (x);
|
|
x = x[indx];
|
|
% Create a reverse-permutation to restore the array order
|
|
% upon return.
|
|
rev_indx = [0:n-1];
|
|
rev_indx[indx] = @rev_indx;
|
|
}
|
|
|
|
variable r = [1:n];
|
|
|
|
% Account for ties
|
|
variable ties;
|
|
() = wherediff (x, &ties);
|
|
% Here, ties is an array of indices {j} where x[j-1]==x[j].
|
|
% We want those where x[j] == x[j+1].
|
|
ties -= 1;
|
|
|
|
variable m = length (ties);
|
|
variable i = 0, j;
|
|
while (i < m)
|
|
{
|
|
variable ties_i = ties[i];
|
|
j = i;
|
|
j++;
|
|
variable dties = ties_i - i;
|
|
while ((j < m) && (dties + j == ties[j]))
|
|
j++;
|
|
|
|
variable dn = j - i;
|
|
i = [ties_i:ties_i+dn];
|
|
r[i] = r[ties_i];
|
|
i = j;
|
|
}
|
|
|
|
if (indx == NULL)
|
|
return r;
|
|
|
|
return r[rev_indx];
|
|
}
|
|
|
|
define kendall_tau ()
|
|
{
|
|
variable w_ref = NULL;
|
|
if (_NARGS == 3)
|
|
w_ref = ();
|
|
else if (_NARGS != 2)
|
|
{
|
|
usage ("p = %s (X1, Y1 [,&r]); %% Kendall's tau correlation",
|
|
_function_name ());
|
|
}
|
|
|
|
variable x, y;
|
|
(x, y) = ();
|
|
variable n = length (x);
|
|
if (n != length (y))
|
|
throw InvalidParmError, "Arrays must be the same length for kendall_tau";
|
|
|
|
% _kendall_tau will modify the contents of the arrays. Be sure to
|
|
% pass new instances to it. The sort operation below will achieve
|
|
% that.
|
|
variable i = array_sort (x);
|
|
x = compute_integer_rank (x[i], 1);
|
|
y = compute_integer_rank (y[i], 0);
|
|
|
|
variable tau, z;
|
|
|
|
(tau, z) = _kendall_tau (x, y);
|
|
|
|
if (w_ref != NULL)
|
|
@w_ref = tau;
|
|
|
|
return map_cdf_to_pval (z ;; __qualifiers);
|
|
}
|
|
|
|
define mann_kendall ()
|
|
{
|
|
variable w_ref = NULL;
|
|
if (_NARGS == 2)
|
|
w_ref = ();
|
|
else if (_NARGS != 1)
|
|
{
|
|
usage ("p = %s (X [,&r]); %% Mann-Kendall trend test",
|
|
_function_name ());
|
|
}
|
|
|
|
variable x;
|
|
x = ();
|
|
variable n = length (x);
|
|
variable i = [0:n-1];
|
|
|
|
% _kendall_tau will modify the contents of the arrays. Be sure to
|
|
% pass new instances to it. compute_integer_rank will create a new
|
|
% instance.
|
|
x = compute_integer_rank (x, 0);
|
|
variable tau, z;
|
|
(tau, z) = _kendall_tau (i, x);
|
|
|
|
if (w_ref != NULL)
|
|
@w_ref = tau;
|
|
|
|
return map_cdf_to_pval (z ;; __qualifiers);
|
|
}
|
|
|
|
define pearson_r ()
|
|
{
|
|
variable w_ref = NULL;
|
|
if (_NARGS == 3)
|
|
w_ref = ();
|
|
else if (_NARGS != 2)
|
|
{
|
|
usage ("p = %s (X1, Y1 [,&r] [; qualifiers]); %% Pearson's r correlation\n", +
|
|
"Qualifiers:\n" +
|
|
" side=\"<\" | \">\"",
|
|
_function_name ());
|
|
}
|
|
|
|
variable x, y;
|
|
(x, y) = ();
|
|
variable n = length(x);
|
|
% Note: covariance handles the 1/(N-1) normalization factor
|
|
variable r = covariance (x, y)[0,1]/(stddev(x)*stddev(y));
|
|
if (w_ref != NULL)
|
|
@w_ref = r;
|
|
|
|
% This is meaningful only for gaussian distributions
|
|
variable df = length(x)-2;
|
|
r = sqrt(df)*r/sqrt(1-r*r);
|
|
return map_cdf_to_pval (student_t_cdf (r, df) ;; __qualifiers);
|
|
}
|
|
|
|
define correlation ()
|
|
{
|
|
if (_NARGS != 2)
|
|
usage ("c = correlation (X, Y);");
|
|
variable x, y; (x,y) = ();
|
|
variable n = length(x);
|
|
if (n != length(y))
|
|
throw InvalidParmError, "Arrays must be the same length";
|
|
variable mx = mean(x), sx = stddev(x), my = mean(y), sy = stddev(y);
|
|
return sum ((x-mx)*(y-my))/((n-1)*sx*sy);
|
|
}
|
|
|
|
provide ("stats");
|