s-lang/modules/stats_kendall.c

/*
Copyright (C) 2017,2018 John E. Davis

This file is part of the S-Lang Library.

The S-Lang Library is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.

The S-Lang Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <slang.h>

#include "_slint.h"
#include "stats-module.h"

static double alnorm (double x, int use_upper)
{
   double cdf;
   cdf = 0.5 * (1.0 + erf (x/sqrt(2.0)));
   if (use_upper) return 1.0 - cdf;
   return cdf;
}

static int prtaus_large_n (_pSLint64_Type is, _pSLint64_Type n, double *probp)
{
   double h[15];
   double x, r, sc, p;
   int i;

   /* PROBABILITIES CALCULATED BY MODIFIED EDGEWORTH SERIES FOR N GREATER THAN 8 */
   /* CALCULATION OF TCHEBYCHEFF-HERMITE POLYNOMIALS */
   x = (is-1.0) / sqrt((6.0 + n*(5.0-n*(3.0+2*n)))/(-18.0));
   h[0] = x;
   h[1] = x*x - 1.0;
   for (i = 2; i < 15; i++)
     {
	h[i] = x * h[i-1] - (i-1.0)*h[i-2];
     }

   r = 1.0 / n;
   sc = r*(h[2]*(r*(r*(r*0.506f - 0.5325f) + 0.045f) - 0.09f)
	   + r*(h[4]*(r*(r*0.3214f - 0.036735f) + 0.036735f)
		+ h[6]*(r*(r*0.07787f - 0.023336f) + 0.00405f)
		+ r*(h[8]*(-.0033061f - r*0.0065166f)
		     + h[10]*(r*0.0025927f - 1.215e-4f)
		     + r*(h[12]*1.4878e-4f + h[14]*2.7338e-6f))));

   p = alnorm (x,1) + sc*0.398942*exp(-0.5*x*x);
   if (p < 0.0) p = 0.0;
   if (p > 1.0) p = 1.0;
   *probp = p;

   return 0;
}


/*
 * ALGORITHM AS 71  APPL. STATIST. (1974) VOL.23, NO.1
 *
 * GIVEN A VALUE OF IS CALCULATED FROM TWO RANKINGS (WITHOUT TIES)
 * OF N OBJECTS, THE FUNCTION COMPUTES THE PROBABILITY OF
 * OBTAINING A VALUE GREATER THAN, OR EQUAL TO, IS.
 *
 */
static int prtaus (_pSLint64_Type is, _pSLint64_Type n, double *probp)
{
#define MAX_N_EXACT 30
#define MAX_L_COLS ((MAX_N_EXACT)*(MAX_N_EXACT-1)/4 + 2)
   _pSLint64_Type row0[MAX_L_COLS];  /* allow 1-based indexing */
   _pSLint64_Type row1[MAX_L_COLS];  /* allow 1-based indexing */
   _pSLint64_Type *curr, *prev;
   _pSLint64_Type i, il, k, m, im;

   if (n > MAX_N_EXACT)
     return prtaus_large_n (is, n, probp);

   /* Use recurrence relation for n <= MAX_N_EXACT */

   *probp = 1.0;

   m = n*(n-1)/2;
   if (is >= 0) m -= is; else m += is;
   if ((m == 0) && (is <= 0)) return 0;

   if (is < 0) m = m - 2;
   im = m/2;

   memset (row0, 0, (im+1)*sizeof(_pSLint64_Type));
   memset (row1, 0, (im+1)*sizeof(_pSLint64_Type));
   prev = row0; prev[0] = 1;
   curr = row1; curr[0] = 1;

   il = 0;
   i = 1;
   m = 1;
   while (i < n)
     {
	_pSLint64_Type *tmp;
	_pSLint64_Type in, io;

	tmp = curr; curr = prev; prev = tmp;
        il += i;
        i++;
        m = m*i;
	k = (im < il) ? im : il;

	k++;
	in = 1;
	io = (i <= k) ? i : k;
	while (in < io)
	  {
	     curr[in] = curr[in-1] + prev[in];
	     in++;
	  }
	io = 0;
	while (in < k)
	  {
	     curr[in] = curr[in-1] + prev[in] - prev[io];
	     io++;
	     in++;
	  }
     }

   k = 0;
   for (i = 0; i <= im; i++)
     k += curr[i];

   *probp = ((double) k)/m;
   if (is < 0) *probp = 1.0 - *probp;

   return 0;
}

static _pSLuint64_Type kendall_insertion_sort (SLindex_Type *arr, size_t num)
{
   size_t maxj, i;
   _pSLuint64_Type nexch;

   if (num < 2) return 0;

   nexch = 0;
   i = maxj = num - 1;
   while (i--)
     {
        size_t j = i;
        SLindex_Type val = arr[i];

	while ((j < maxj) && (arr[j+1] < val))
	  {
	     arr[j] = arr[j+1];
	     j++;
	  }
        arr[j] = val;
	nexch += (j - i);
    }
   return nexch;
}

static _pSLuint64_Type kendall_merge (SLindex_Type *left, size_t left_num,
			    SLindex_Type *right, size_t right_num,
			    SLindex_Type *work)
{
   _pSLuint64_Type nexch = 0;

   while (left_num && right_num)
     {
        if (*right < *left)
	  {
	     *work++ = *right++;
	     right_num--;
	     nexch += left_num;
	     continue;
	  }

	*work++ = *left++;
	left_num--;
     }

   if (left_num)
     memcpy(work, left, left_num * sizeof(SLindex_Type));
   else if (right_num)
     memcpy(work, right, right_num * sizeof(SLindex_Type));

   return nexch;
}

static _pSLuint64_Type kendall_merge_sort(SLindex_Type *a, size_t num, SLindex_Type *work)
{
   _pSLuint64_Type nexch;
   size_t left_num, right_num;
   SLindex_Type *left, *right;

   if (num < 8)
     return kendall_insertion_sort (a, num);

   left = a;
   left_num = num/2;
   right = a + left_num;
   right_num = num - left_num;

   nexch = kendall_merge_sort(left, left_num, work);
   nexch += kendall_merge_sort(right, right_num, work);
   nexch += kendall_merge(left, left_num, right, right_num, work);

   memcpy(a, work, num * sizeof(SLindex_Type));
   return nexch;
}

/* This also computes the quantities needed for the p-value.  From wikipedia:
 *   v = sum(x*(x-1)*(2*x+5))
 *   v1 = sum (x*(x-1))
 *   v2 = sum(x*(x-1)*(x-2))
 * where x is the number of tied values in a group.
 */
static _pSLuint64_Type
kendall_count_tied_pairs (SLindex_Type *a, size_t num,
			  _pSLuint64_Type *v, _pSLuint64_Type *v1, _pSLuint64_Type *v2)
{
   _pSLuint64_Type n = 0;
   size_t i;

   i = 1;
   while (i < num)
     {
	_pSLuint64_Type di, dn;
	size_t i0;

	if (a[i] != a[i-1])
	  {
	     i++;
	     continue;
	  }

	/* In a group with ties */
	i0 = i-1;
	i++;
	while ((i < num) && (a[i] == a[i-1]))
	  i++;

	di = i-i0;
	dn = di*(di-1);
	*v1 += dn;
	*v2 += dn*(di-2);
	*v += dn * (2*di+5);
	n += dn/2;

	i++;
     }

   return n;
}

/*
 * This function assumes that the input arrays are sorted on the first array.
 * That is, the slang code that wraps this will have to do:
 *
 *    i = array_sort (a);
 *    a = a[i]; b = [i];
 *
 * The basic idea is the following:
 *
 * The total number of pairs formed from an array of size N is
 * n0 = N*(N-1)/2.  Consider 2 such arrays A and B.  Then consider the ith and
 * jth elemnts of the arrays, where j>i.  One of the following will be true:
 *
 *     A[i]>A[j] and B[i]>B[j]     "concordent"
 *     A[i]<A[j] and B[i]<B[j]     "concordent"
 *     A[i]>A[j] and B[i]<B[j]     "disconcordent"
 *     A[i]<A[j] and B[i]>B[j]     "disconcordent"
 *     A[i]=A[j] and B[i]!=B[j]    "A is tied"
 *     A[i]!=A[j] and B[i]=B[j]    "B is tied"
 *     A[i]=A[j] and B[i]=B[j]     "joint tie"
 *
 * Then: n0 = nc + nd + (t + v) + u
 *       n0 = nc + nd + (u + v) + t
 * where nc=num concordent, nd = num disconcordent, t=A ties, u=B ties, v=joint ties
 *
 * Let t+v = nA = total number of ties in A (includes joint)
 *     u+v = nB = total number of ties in B (includes joint)
 * Then: n0 = nc + nd + nA + u
 *
 *   nc + nd = n0 - nA - u
 *           = n0 - nA - (nB - v)
 *           = n0 + v - (nA + nB)
 * Knight indicates that nd is equal to the number of exchanges (ne) in sorting B.
 * Hence:
 *
 *   nc - nd = (nc + nd) - 2*ne
 *           = (n0 + v) - (nA + nB + 2*ne)
 */
double _pSLstats_kendall_tau (SLindex_Type *a, SLindex_Type *b, size_t num, double *taup)
{
   double tau, sigma, prob;
   _pSLuint64_Type n0, na, nb, ne, v;
   _pSLuint64_Type va, va1, va2, vb, vb1, vb2;
   size_t i;

   n0 = num;
   n0 = n0*(n0-1)/2;

   na = v = 0;
   va = va1 = va2 = 0;
   vb = vb1 = vb2 = 0;

   i = 1;
   while (i < num)
     {
	_pSLuint64_Type di;
	size_t i0;

	if (a[i-1] != a[i])
	  {
	     i++;
	     continue;
	  }

	i0 = i - 1;
	i++;
	while ((i < num) && (a[i-1] == a[i]))
	  i++;

	di = i-i0;
	na += di*(di-1)/2;

	(void) kendall_insertion_sort (b+i0, di);
	v += kendall_count_tied_pairs(b+i0, di, &va, &va1, &va2);
	i++;			       /* ok to do since if i<num, the a[i-1]!=a[i] */
     }

   /* Sort b, using a as workspace */
   ne = kendall_merge_sort (b, num, a);
   nb = kendall_count_tied_pairs (b, num, &vb, &vb1, &vb2);

   /* Of no ties, use exact probability distribution */
   if ((na == 0) && (nb == 0))
     {
	_pSLint64_Type is;

	if (n0 < 2*ne)
	  is = -(_pSLint64_Type)(2*ne - n0);
	else
	  is = (_pSLint64_Type)(n0 - 2*ne);

	*taup = ((double)is)/n0;

	/* prob is probability of getting a statistic >= is. */
	(void) prtaus (is, num, &prob);
	prob = 1.0 - prob;	       /* prob of statistic < is */
	/* fprintf (stdout, "prtaus: num=%lu, is=%lu, prob=%g\n", num, is, prob); */
	return prob;
     }

   /* Otherwise, use normal distributuon */
   tau = (n0 + v - na - nb - ne);      /* This should be >= 0 */
   tau -= ne;			       /* may be < 0 */

   /* From wikipedia */
   sigma = (n0*(4.0*num+10.0) - va - vb)/18.0;
   sigma += va1*(double)vb1/(4.0*n0);
   sigma += va2*(double)vb2/(18.0*n0*(num-2));
   sigma = sqrt (sigma);

   *taup = tau/sqrt(n0-na)/sqrt(n0-nb);  /* avoid overflow */

   /* To compute the probability use the continuity correction recommended by Kendall */
   if (tau > 0) tau -= 1; else if (tau < 0) tau += 1;
   return 0.5 * (1.0 + erf (tau/sigma/sqrt(2.0)));
}