swilk.go

package onlinestats

import (
	"math"
	"sort"
	"strconv"
)

func SWilk(x []float64) (float64, float64, error) {

	data := make([]float64, len(x)+1)
	copy(data[1:], x)
	sort.Float64s(data[1:])
	data[0] = math.NaN()

	length := len(x)
	w, pw, err := swilkHelper(data, length, nil)
	return w, pw, err
}

// Calculate the Shapiro-Wilk W test and its significance level
// Based on the public domain code at https://joinup.ec.europa.eu/svn/sextante/soft/sextante_lib/trunk/algorithms/src/es/unex/sextante/tables/normalityTest/SWilk.java

/*
 * Constants and polynomial coefficients for swilk(). NOTE: FORTRAN counts the elements of the array x[length] as
 * x[1] through x[length], not x[0] through x[length-1]. To avoid making pervasive, subtle changes to the algorithm
 * (which would inevitably introduce pervasive, subtle bugs) the referenced arrays are padded with an unused 0th
 * element, and the algorithm is ported so as to continue accessing from [1] through [length].
 */
var c1 = []float64{math.NaN(), 0.0E0, 0.221157E0, -0.147981E0, -0.207119E1, 0.4434685E1, -0.2706056E1}
var c2 = []float64{math.NaN(), 0.0E0, 0.42981E-1, -0.293762E0, -0.1752461E1, 0.5682633E1, -0.3582633E1}
var c3 = []float64{math.NaN(), 0.5440E0, -0.39978E0, 0.25054E-1, -0.6714E-3}
var c4 = []float64{math.NaN(), 0.13822E1, -0.77857E0, 0.62767E-1, -0.20322E-2}
var c5 = []float64{math.NaN(), -0.15861E1, -0.31082E0, -0.83751E-1, 0.38915E-2}
var c6 = []float64{math.NaN(), -0.4803E0, -0.82676E-1, 0.30302E-2}
var c7 = []float64{math.NaN(), 0.164E0, 0.533E0}
var c8 = []float64{math.NaN(), 0.1736E0, 0.315E0}
var c9 = []float64{math.NaN(), 0.256E0, -0.635E-2}
var g = []float64{math.NaN(), -0.2273E1, 0.459E0}

const (
	z90   = 0.12816E1
	z95   = 0.16449E1
	z99   = 0.23263E1
	zm    = 0.17509E1
	zss   = 0.56268E0
	bf1   = 0.8378E0
	xx90  = 0.556E0
	xx95  = 0.622E0
	sqrth = 0.70711E0
	th    = 0.375E0
	small = 1E-19
	pi6   = 0.1909859E1
	stqr  = 0.1047198E1
	upper = true
)

/**
* ALGORITHM AS R94 APPL. STATIST. (1995) VOL.44, NO.4
*
* Calculates Shapiro-Wilk normality test and P-value for sample sizes 3 <= n <= 5000 .
* Corrects AS 181, which was found to be inaccurate for n > 50.
*
* As described above with the constants, the data arrays x[] and a[] are referenced with a base element of 1 (like FORTRAN)
* instead of 0 (like Java) to avoid screwing up the algorithm. To pass in 100 data points, declare x[101] and fill elements
* x[1] through x[100] with data. x[0] will be ignored.
*
* @param x
*                Input; Data set to analyze; 100 points go in x[101] array from x[1] through x[100]
* @param n
*                Input; Number of data points in x
* @param a
*                Shapiro-Wilk coefficients.  Can be nil, or pre-computed by swilkCoeffs and passed in.
 */

type SwilkFault int

func (s SwilkFault) Error() string {
	return "swilk fault " + strconv.Itoa(int(s))
}

func swilkHelper(x []float64, n int, a []float64) (w float64, pw float64, err error) {

	if n > 5000 {
		return 0, 0, SwilkFault(2)
	}

	pw = 1.0
	if w >= 0.0 {
		w = 1.0
	}
	an := float64(n)
	if n < 3 {
		return 0, 0, SwilkFault(1)
	}

	if a == nil {
		a = SwilkCoeffs(n)
	}

	if n < 3 {
		return
	}

	// If W input as negative, calculate significance level of -W
	var w1, xx float64
	if w < 0.0 {
		w1 = 1.0 + w
	} else {

		// Check for zero range

		range_ := x[n] - x[1]
		if range_ < small {
			return 0, 0, SwilkFault(6)
		}

		// Check for correct sort order on range - scaled X
		// TODO(dgryski): did the FORTRAN code puke on out-of-order X ? with ifault=7 ?
		xx = x[1] / range_
		sx := xx
		sa := -a[1]
		j := n - 1
		for i := 2; i <= n; i++ {
			xi := x[i] / range_
			// IF (XX-XI .GT. SMALL) PRINT *,' ANYTHING'
			sx += xi
			if i != j {
				sa += float64(sign(1, i-j)) * a[imin(i, j)]
			}
			xx = xi
			j--
		}

		// Calculate W statistic as squared correlation between data and coefficients
		sa /= float64(n)
		sx /= float64(n)
		ssa := 0.0
		ssx := 0.0
		sax := 0.0
		j = n
		var asa float64
		for i := 1; i <= n; i++ {
			if i != j {
				asa = float64(sign(1, i-j))*a[imin(i, j)] - sa
			} else {
				asa = -sa
			}
			xsx := x[i]/range_ - sx
			ssa += asa * asa
			ssx += xsx * xsx
			sax += asa * xsx
			j--
		}

		// W1 equals (1-W) calculated to avoid excessive rounding error
		// for W very near 1 (a potential problem in very large samples)

		ssassx := math.Sqrt(ssa * ssx)
		w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx)
	}
	w = 1.0 - w1

	// Calculate significance level for W (exact for N=3)

	if n == 3 {
		pw = pi6 * (math.Asin(math.Sqrt(w)) - stqr)
		return w, pw, nil
	}
	y := math.Log(w1)
	xx = math.Log(an)
	m := 0.0
	s := 1.0
	if n <= 11 {
		gamma := poly(g, 2, an)
		if y >= gamma {
			pw = small
			return w, pw, nil
		}
		y = -math.Log(gamma - y)
		m = poly(c3, 4, an)
		s = math.Exp(poly(c4, 4, an))
	} else {
		m = poly(c5, 4, xx)
		s = math.Exp(poly(c6, 3, xx))
	}
	pw = alnorm((y-m)/s, upper)

	return w, pw, nil
}

// Precomputes the coefficients array a for SWilk
func SwilkCoeffs(n int) []float64 {

	a := make([]float64, n+1)

	an := float64(n)

	n2 := n / 2

	if n == 3 {
		a[1] = sqrth
	} else {
		an25 := an + 0.25
		summ2 := 0.0
		for i := 1; i <= n2; i++ {
			a[i] = ppnd((float64(i) - th) / an25)
			summ2 += a[i] * a[i]
		}
		summ2 *= 2.0
		ssumm2 := math.Sqrt(summ2)
		rsn := 1.0 / math.Sqrt(an)
		a1 := poly(c1, 6, rsn) - a[1]/ssumm2

		// Normalize coefficients

		var i1 int
		var fac float64
		if n > 5 {
			i1 = 3
			a2 := -a[2]/ssumm2 + poly(c2, 6, rsn)
			fac = math.Sqrt((summ2 - 2.0*a[1]*a[1] - 2.0*a[2]*a[2]) / (1.0 - 2.0*a1*a1 - 2.0*a2*a2))
			a[1] = a1
			a[2] = a2
		} else {
			i1 = 2
			fac = math.Sqrt((summ2 - 2.0*a[1]*a[1]) / (1.0 - 2.0*a1*a1))
			a[1] = a1
		}
		for i := i1; i <= n2; i++ {
			a[i] = -a[i] / fac
		}
	}

	return a
}

/**
 * Constructs an int with the absolute value of x and the sign of y
 *
 * @param x
 *                int to copy absolute value from
 * @param y
 *                int to copy sign from
 * @return int with absolute value of x and sign of y
 */
func sign(x int, y int) int {
	var result = x
	if x < 0 {
		result = -x
	}
	if y < 0 {
		result = -result
	}
	return result
}

// Constants & polynomial coefficients for ppnd(), slightly renamed to avoid conflicts. Could define
// them inside ppnd(), but static constants are more efficient.

// Coefficients for P close to 0.5
const (
	a0_p = 3.3871327179E+00
	a1_p = 5.0434271938E+01
	a2_p = 1.5929113202E+02
	a3_p = 5.9109374720E+01
	b1_p = 1.7895169469E+01
	b2_p = 7.8757757664E+01
	b3_p = 6.7187563600E+01

	// Coefficients for P not close to 0, 0.5 or 1 (names changed to avoid conflict with swilk())
	c0_p = 1.4234372777E+00
	c1_p = 2.7568153900E+00
	c2_p = 1.3067284816E+00
	c3_p = 1.7023821103E-01
	d1_p = 7.3700164250E-01
	d2_p = 1.2021132975E-01

	// Coefficients for P near 0 or 1.
	e0_p = 6.6579051150E+00
	e1_p = 3.0812263860E+00
	e2_p = 4.2868294337E-01
	e3_p = 1.7337203997E-02
	f1_p = 2.4197894225E-01
	f2_p = 1.2258202635E-02

	split1 = 0.425
	split2 = 5.0
	const1 = 0.180625
	const2 = 1.6
)

/**
 * ALGORITHM AS 241 APPL. STATIST. (1988) VOL. 37, NO. 3, 477-484.
 *
 * Produces the normal deviate Z corresponding to a given lower tail area of P; Z is accurate to about 1 part in 10**7.
 *
 * @param p
 * @return
 */
func ppnd(p float64) float64 {
	q := p - 0.5
	var r float64
	if math.Abs(q) <= split1 {
		r = const1 - q*q
		return q * (((a3_p*r+a2_p)*r+a1_p)*r + a0_p) / (((b3_p*r+b2_p)*r+b1_p)*r + 1.0)
	} else {
		if q < 0.0 {
			r = p
		} else {
			r = 1.0 - p
		}
		if r <= 0.0 {
			return 0.0
		}
		r = math.Sqrt(-math.Log(r))
		var normal_dev float64
		if r <= split2 {
			r -= const2
			normal_dev = (((c3_p*r+c2_p)*r+c1_p)*r + c0_p) / ((d2_p*r+d1_p)*r + 1.0)
		} else {
			r -= split2
			normal_dev = (((e3_p*r+e2_p)*r+e1_p)*r + e0_p) / ((f2_p*r+f1_p)*r + 1.0)
		}
		if q < 0.0 {
			normal_dev = -normal_dev
		}
		return normal_dev
	}
}

/**
 * Algorithm AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2
 *
 * Calculates the algebraic polynomial of order nord-1 with array of coefficients c. Zero order coefficient is c[1]
 *
 * @param c
 * @param nord
 * @param x
 * @return
 */
func poly(c []float64, nord int, x float64) float64 {
	poly := c[1]
	if nord == 1 {
		return poly
	}
	p := x * c[nord]
	if nord != 2 {
		n2 := nord - 2
		j := n2 + 1
		for i := 1; i <= n2; i++ {
			p = (p + c[j]) * x
			j--
		}
	}
	poly += p
	return poly
}

// Constants & polynomial coefficients for alnorm(), slightly renamed to avoid conflicts.
const (
	con_a    = 1.28
	ltone_a  = 7.0
	utzero_a = 18.66
	p_a      = 0.398942280444
	q_a      = 0.39990348504
	r_a      = 0.398942280385
	a1_a     = 5.75885480458
	a2_a     = 2.62433121679
	a3_a     = 5.92885724438

	b1_a = -29.8213557807
	b2_a = 48.6959930692

	c1_a = -3.8052E-8
	c2_a = 3.98064794E-4
	c3_a = -0.151679116635
	c4_a = 4.8385912808
	c5_a = 0.742380924027
	c6_a = 3.99019417011

	d1_a = 1.00000615302
	d2_a = 1.98615381364
	d3_a = 5.29330324926
	d4_a = -15.1508972451
	d5_a = 30.789933034
)

/**
 * Algorithm AS66 Applied Statistics (1973) vol.22, no.3
 *
 * Evaluates the tail area of the standardised normal curve from x to infinity if upper is true or from minus infinity to x if
 * upper is false.
 */
func alnorm(x float64, upper bool) float64 {
	up := upper
	z := x
	if z < 0.0 {
		up = !up
		z = -z
	}
	var fn_val float64
	if z > ltone_a && (!up || z > utzero_a) {
		fn_val = 0.0
	} else {
		y := 0.5 * z * z
		if z <= con_a {
			fn_val = 0.5 - z*(p_a-q_a*y/(y+a1_a+b1_a/(y+a2_a+b2_a/(y+a3_a))))
		} else {
			fn_val = r_a * math.Exp(-y) / (z + c1_a + d1_a/(z+c2_a+d2_a/(z+c3_a+d3_a/(z+c4_a+d4_a/(z+c5_a+d5_a/(z+c6_a))))))
		}
	}
	if !up {
		fn_val = 1.0 - fn_val
	}
	return fn_val
}

func imin(i, j int) int {
	if i < j {
		return i
	}
	return j
}