/* Copyright (C) 2017,2018 John E. Davis This file is part of the S-Lang Library. The S-Lang Library is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. The S-Lang Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "config.h" #include #include #include #include #include #include "_slint.h" #include "stats-module.h" static double alnorm (double x, int use_upper) { double cdf; cdf = 0.5 * (1.0 + erf (x/sqrt(2.0))); if (use_upper) return 1.0 - cdf; return cdf; } static int prtaus_large_n (_pSLint64_Type is, _pSLint64_Type n, double *probp) { double h[15]; double x, r, sc, p; int i; /* PROBABILITIES CALCULATED BY MODIFIED EDGEWORTH SERIES FOR N GREATER THAN 8 */ /* CALCULATION OF TCHEBYCHEFF-HERMITE POLYNOMIALS */ x = (is-1.0) / sqrt((6.0 + n*(5.0-n*(3.0+2*n)))/(-18.0)); h[0] = x; h[1] = x*x - 1.0; for (i = 2; i < 15; i++) { h[i] = x * h[i-1] - (i-1.0)*h[i-2]; } r = 1.0 / n; sc = r*(h[2]*(r*(r*(r*0.506f - 0.5325f) + 0.045f) - 0.09f) + r*(h[4]*(r*(r*0.3214f - 0.036735f) + 0.036735f) + h[6]*(r*(r*0.07787f - 0.023336f) + 0.00405f) + r*(h[8]*(-.0033061f - r*0.0065166f) + h[10]*(r*0.0025927f - 1.215e-4f) + r*(h[12]*1.4878e-4f + h[14]*2.7338e-6f)))); p = alnorm (x,1) + sc*0.398942*exp(-0.5*x*x); if (p < 0.0) p = 0.0; if (p > 1.0) p = 1.0; *probp = p; return 0; } /* * ALGORITHM AS 71 APPL. STATIST. (1974) VOL.23, NO.1 * * GIVEN A VALUE OF IS CALCULATED FROM TWO RANKINGS (WITHOUT TIES) * OF N OBJECTS, THE FUNCTION COMPUTES THE PROBABILITY OF * OBTAINING A VALUE GREATER THAN, OR EQUAL TO, IS. * */ static int prtaus (_pSLint64_Type is, _pSLint64_Type n, double *probp) { #define MAX_N_EXACT 30 #define MAX_L_COLS ((MAX_N_EXACT)*(MAX_N_EXACT-1)/4 + 2) _pSLint64_Type row0[MAX_L_COLS]; /* allow 1-based indexing */ _pSLint64_Type row1[MAX_L_COLS]; /* allow 1-based indexing */ _pSLint64_Type *curr, *prev; _pSLint64_Type i, il, k, m, im; if (n > MAX_N_EXACT) return prtaus_large_n (is, n, probp); /* Use recurrence relation for n <= MAX_N_EXACT */ *probp = 1.0; m = n*(n-1)/2; if (is >= 0) m -= is; else m += is; if ((m == 0) && (is <= 0)) return 0; if (is < 0) m = m - 2; im = m/2; memset (row0, 0, (im+1)*sizeof(_pSLint64_Type)); memset (row1, 0, (im+1)*sizeof(_pSLint64_Type)); prev = row0; prev[0] = 1; curr = row1; curr[0] = 1; il = 0; i = 1; m = 1; while (i < n) { _pSLint64_Type *tmp; _pSLint64_Type in, io; tmp = curr; curr = prev; prev = tmp; il += i; i++; m = m*i; k = (im < il) ? im : il; k++; in = 1; io = (i <= k) ? i : k; while (in < io) { curr[in] = curr[in-1] + prev[in]; in++; } io = 0; while (in < k) { curr[in] = curr[in-1] + prev[in] - prev[io]; io++; in++; } } k = 0; for (i = 0; i <= im; i++) k += curr[i]; *probp = ((double) k)/m; if (is < 0) *probp = 1.0 - *probp; return 0; } static _pSLuint64_Type kendall_insertion_sort (SLindex_Type *arr, size_t num) { size_t maxj, i; _pSLuint64_Type nexch; if (num < 2) return 0; nexch = 0; i = maxj = num - 1; while (i--) { size_t j = i; SLindex_Type val = arr[i]; while ((j < maxj) && (arr[j+1] < val)) { arr[j] = arr[j+1]; j++; } arr[j] = val; nexch += (j - i); } return nexch; } static _pSLuint64_Type kendall_merge (SLindex_Type *left, size_t left_num, SLindex_Type *right, size_t right_num, SLindex_Type *work) { _pSLuint64_Type nexch = 0; while (left_num && right_num) { if (*right < *left) { *work++ = *right++; right_num--; nexch += left_num; continue; } *work++ = *left++; left_num--; } if (left_num) memcpy(work, left, left_num * sizeof(SLindex_Type)); else if (right_num) memcpy(work, right, right_num * sizeof(SLindex_Type)); return nexch; } static _pSLuint64_Type kendall_merge_sort(SLindex_Type *a, size_t num, SLindex_Type *work) { _pSLuint64_Type nexch; size_t left_num, right_num; SLindex_Type *left, *right; if (num < 8) return kendall_insertion_sort (a, num); left = a; left_num = num/2; right = a + left_num; right_num = num - left_num; nexch = kendall_merge_sort(left, left_num, work); nexch += kendall_merge_sort(right, right_num, work); nexch += kendall_merge(left, left_num, right, right_num, work); memcpy(a, work, num * sizeof(SLindex_Type)); return nexch; } /* This also computes the quantities needed for the p-value. From wikipedia: * v = sum(x*(x-1)*(2*x+5)) * v1 = sum (x*(x-1)) * v2 = sum(x*(x-1)*(x-2)) * where x is the number of tied values in a group. */ static _pSLuint64_Type kendall_count_tied_pairs (SLindex_Type *a, size_t num, _pSLuint64_Type *v, _pSLuint64_Type *v1, _pSLuint64_Type *v2) { _pSLuint64_Type n = 0; size_t i; i = 1; while (i < num) { _pSLuint64_Type di, dn; size_t i0; if (a[i] != a[i-1]) { i++; continue; } /* In a group with ties */ i0 = i-1; i++; while ((i < num) && (a[i] == a[i-1])) i++; di = i-i0; dn = di*(di-1); *v1 += dn; *v2 += dn*(di-2); *v += dn * (2*di+5); n += dn/2; i++; } return n; } /* * This function assumes that the input arrays are sorted on the first array. * That is, the slang code that wraps this will have to do: * * i = array_sort (a); * a = a[i]; b = [i]; * * The basic idea is the following: * * The total number of pairs formed from an array of size N is * n0 = N*(N-1)/2. Consider 2 such arrays A and B. Then consider the ith and * jth elemnts of the arrays, where j>i. One of the following will be true: * * A[i]>A[j] and B[i]>B[j] "concordent" * A[i]A[j] and B[i]B[j] "disconcordent" * A[i]=A[j] and B[i]!=B[j] "A is tied" * A[i]!=A[j] and B[i]=B[j] "B is tied" * A[i]=A[j] and B[i]=B[j] "joint tie" * * Then: n0 = nc + nd + (t + v) + u * n0 = nc + nd + (u + v) + t * where nc=num concordent, nd = num disconcordent, t=A ties, u=B ties, v=joint ties * * Let t+v = nA = total number of ties in A (includes joint) * u+v = nB = total number of ties in B (includes joint) * Then: n0 = nc + nd + nA + u * * nc + nd = n0 - nA - u * = n0 - nA - (nB - v) * = n0 + v - (nA + nB) * Knight indicates that nd is equal to the number of exchanges (ne) in sorting B. * Hence: * * nc - nd = (nc + nd) - 2*ne * = (n0 + v) - (nA + nB + 2*ne) */ double _pSLstats_kendall_tau (SLindex_Type *a, SLindex_Type *b, size_t num, double *taup) { double tau, sigma, prob; _pSLuint64_Type n0, na, nb, ne, v; _pSLuint64_Type va, va1, va2, vb, vb1, vb2; size_t i; n0 = num; n0 = n0*(n0-1)/2; na = v = 0; va = va1 = va2 = 0; vb = vb1 = vb2 = 0; i = 1; while (i < num) { _pSLuint64_Type di; size_t i0; if (a[i-1] != a[i]) { i++; continue; } i0 = i - 1; i++; while ((i < num) && (a[i-1] == a[i])) i++; di = i-i0; na += di*(di-1)/2; (void) kendall_insertion_sort (b+i0, di); v += kendall_count_tied_pairs(b+i0, di, &va, &va1, &va2); i++; /* ok to do since if i= is. */ (void) prtaus (is, num, &prob); prob = 1.0 - prob; /* prob of statistic < is */ /* fprintf (stdout, "prtaus: num=%lu, is=%lu, prob=%g\n", num, is, prob); */ return prob; } /* Otherwise, use normal distributuon */ tau = (n0 + v - na - nb - ne); /* This should be >= 0 */ tau -= ne; /* may be < 0 */ /* From wikipedia */ sigma = (n0*(4.0*num+10.0) - va - vb)/18.0; sigma += va1*(double)vb1/(4.0*n0); sigma += va2*(double)vb2/(18.0*n0*(num-2)); sigma = sqrt (sigma); *taup = tau/sqrt(n0-na)/sqrt(n0-nb); /* avoid overflow */ /* To compute the probability use the continuity correction recommended by Kendall */ if (tau > 0) tau -= 1; else if (tau < 0) tau += 1; return 0.5 * (1.0 + erf (tau/sigma/sqrt(2.0))); }