Source code for nmmn.stats

"""
Statistical methods
=====================

- Fit residuals
- Computing prediction and confidence bands
- Comparing goodness-of-fit of different models
- Operations on statistical distributions
- Custom statistical distributions
- p-values and significance
"""

import numpy as np
import scipy
import scipy.stats
import scipy.special   # erf, erfinv and betainc are used throughout this module
from . import misc





# Residuals of a fit
# ===================
# Residual sum of squares, raw scatter about best-fit and 
# intrinsic scatter.

def scatterfit(x,y,a=None,b=None):
    """
    Compute the mean deviation of the data about the linear model *y=ax+b*, if
    A and B are provided as arguments. Otherwise, compute the mean deviation
    about the best-fit line.

    :param x,y: assumed to be Numpy arrays.
    :param a,b: scalars.
    :rtype: float sd with the mean deviation.
    """
    if a is None:
        # Performs linear regression
        a, b, r, p, err = scipy.stats.linregress(x,y)

    # Std. deviation of an individual measurement (Bevington, eq. 6.15)
    N=np.size(x)
    sd=1./(N-2.)*np.sum((y-a*x-b)**2)
    sd=np.sqrt(sd)

    return sd

def scatterfitg(ydata,ymod,deg):
    """
    Compute the mean deviation of the data about the model given in the array
    ymod, tabulated exactly like ydata.

    Usage:

    >>> sd=scatterfitg(ydata,ymod,n)

    :param ydata: data
    :param ymod: model evaluated at xdata
    :param n: number of free parameters in the model
    :type ydata,ymod: Numpy arrays
    :rtype: float sd with the mean deviation.
    """
    # Std. deviation of an individual measurement (Bevington, eq. 6.15)
    N=np.size(ydata)
    sd=1./(N-deg)*np.sum((ydata-ymod)**2)
    sd=np.sqrt(sd)

    return sd

def scatpratt(x,y,errx,erry,a,b):
    """
    This is an alternative way of computing the "raw" scatter about the
    best-fit linear relation in the presence of measurement errors, proposed by
    Pratt et al. 2009, A&A, 498, 361. In the words of Cavagnolo et al. (2010):

    *We have quantified the total scatter about the best-fit relation using a
    weighted estimate of the orthogonal distances to the best-fit line (see
    Pratt et al. 2009).*

    Usage:

    >>> sd=scatpratt(x,y,errx,erry,a,b)

    :param x,y: X and Y data arrays
    :param errx,erry: standard deviations in X and Y
    :param a,b: slope and intercept of best-fit linear relation
    :rtype: float sd with the scatter about the best-fit.

    v1 Mar 20 2012
    """
    N=np.size(x)

    # Equation 4 from Pratt et al. 2009
    sdsq=erry**2+a**2*errx**2
    wden=1./N*np.sum(1./sdsq)   # denominator of w
    w=1./sdsq/wden

    # Equation 3 from Pratt et al. 2009
    sdrawsq=1./(N-2.)*np.sum(w*(y-a*x-b)**2)
    sdraw=np.sqrt(sdrawsq)

    return sdraw

def residual(ydata,ymod):
    """
    Compute the residual sum of squares (RSS), also known as the sum of squared
    residuals (see http://en.wikipedia.org/wiki/Residual_sum_of_squares).

    Usage:

    >>> rss=residual(ydata,ymod)

    where

    - ydata : data
    - ymod : model evaluated at xdata

    ydata, ymod assumed to be Numpy arrays.

    Returns the float rss with the residual sum of squares.

    Dec. 2011
    """
    rss=np.sum((ydata-ymod)**2)

    return rss

def chisq(x,y,a=None,b=None,sd=None):
    """
    Returns the chi-square error statistic as the sum of squared errors between
    Y(i) and AX(i) + B. If individual standard deviations (array sd) are
    supplied, then the chi-square error statistic is computed as the sum of
    squared errors divided by the standard deviations. Inspired by the IDL
    procedure linfit.pro. See http://en.wikipedia.org/wiki/Goodness_of_fit for
    reference.

    If a linear model is not provided via A,B (y=ax+b), then the method
    computes the chi-square using the best-fit line.

    x,y,sd assumed to be Numpy arrays. a,b scalars.

    Returns the float chisq with the chi-square statistic.
    """
    if a is None:
        # Performs linear regression
        a, b, r, p, err = scipy.stats.linregress(x,y)

    # Chi-square statistic (Bevington, eq. 6.9)
    if sd is None:
        chisq=np.sum((y-a*x-b)**2)
    else:
        chisq=np.sum( ((y-a*x-b)/sd)**2 )

    return chisq

def chisqg(ydata,ymod,sd=None):
    """
    Returns the chi-square error statistic as the sum of squared errors between
    Ydata(i) and Ymodel(i). If individual standard deviations (array sd) are
    supplied, then the chi-square error statistic is computed as the sum of
    squared errors divided by the standard deviations. Inspired by the IDL
    procedure linfit.pro. See http://en.wikipedia.org/wiki/Goodness_of_fit for
    reference.

    ydata, ymod, sd assumed to be Numpy arrays.

    Returns the float chisq with the chi-square statistic.
    """
    # Chi-square statistic (Bevington, eq. 6.9)
    if sd is None:
        chisq=np.sum((ydata-ymod)**2)
    else:
        chisq=np.sum( ((ydata-ymod)/sd)**2 )

    return chisq

def chisqxy(x,y,errx,erry,a,b):
    """
    Returns the chi-square error statistic for a linear fit, computed taking
    into account the errors in both X and Y (i.e. the effective variance).
    See equation 3 in Ascenso et al. 2012, A&A or Yee & Ellingson 2003, ApJ.

    Usage:

    >>> chisq=chisqxy(xdata,ydata,errx,erry,a,b)

    where

    - xdata,ydata : data
    - errx,erry : measurement uncertainties in the data
    - a,b : slope and intercept of the best-fit linear regression model
    """
    sdsq=erry**2+a**2*errx**2
    chisq=np.sum( (y-a*x-b)**2/sdsq )

    return chisq

def intscat(x,y,errx,erry,a,b):
    """
    Estimates the intrinsic scatter about the best-fit, taking into account the
    errors in X and Y and the "raw" scatter. Inspired by Pratt et al. 2009,
    A&A, 498, 361.

    Usage:

    >>> sd=intscat(x,y,errx,erry,a,b)

    where

    - x,y : X and Y data arrays
    - errx,erry : standard deviations in X and Y
    - a,b : slope and intercept of best-fit linear relation

    Returns the float sd with the scatter about the best-fit.

    v1 Apr 9th 2012
    """
    # Raw scatter
    sdraw=scatpratt(x,y,errx,erry,a,b)
    sdrawsq=sdraw**2

    # Statistical variance, eq. 4 from Pratt et al. 2009
    sdsq=erry**2+a**2*errx**2

    # Intrinsic scatter for each data point
    intscati=np.sqrt( np.abs(sdsq-sdrawsq) )

    return np.mean(intscati)

def linregress_error(x,y):
    """
    Compute the uncertainties in the parameters A,B of the least-squares linear
    regression fit y=ax+b to the data (scipy.stats.linregress).

    x,y assumed to be Numpy arrays.

    Returns the sequence sigma_a, sigma_b with the standard deviations in A and B.
    """
    # Performs linear regression
    a, b, r, p, err = scipy.stats.linregress(x,y)

    # Std. deviation of an individual measurement (Bevington, eq. 6.15)
    N=np.size(x)
    sd=1./(N-2.)*np.sum((y-a*x-b)**2)
    sd=np.sqrt(sd)

    # Std. deviation in the parameters
    Delta=N*np.sum(x**2)-(np.sum(x))**2
    # Bevington, eq. 6.13
    sdb=sd**2/Delta*np.sum(x**2)
    sdb=np.sqrt(sdb)
    # Bevington, eq. 6.23
    sda=N*sd**2/Delta
    sda=np.sqrt(sda)

    return sda,sdb

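# Illustrative sketch (not part of the original module): how the residual helpers
# above fit together on synthetic data. The dataset and the helper name
# _demo_residuals are assumptions made purely for illustration.
def _demo_residuals():
    """Minimal sketch exercising scatterfit, chisqg and linregress_error."""
    rng=np.random.default_rng(42)
    x=np.linspace(0.,10.,50)
    y=2.*x+1.+rng.normal(0.,0.5,x.size)    # y=2x+1 plus gaussian noise

    a, b, r, p, err = scipy.stats.linregress(x,y)
    print("scatter about best fit:", scatterfit(x,y,a,b))
    print("chi-square of the model:", chisqg(y,a*x+b))
    print("sigma_a, sigma_b:", linregress_error(x,y))
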
# Computing prediction and confidence bands
# ===========================================
def confband(xd,yd,a,b,conf=0.95,x=None):
    """
    Calculates the confidence band of the linear regression model at the
    desired confidence level, using analytical methods. The 2sigma confidence
    interval is 95% sure to contain the best-fit regression line. This is not
    the same as saying it will contain 95% of the data points.

    Arguments:

    - conf: desired confidence level, by default 0.95 (2 sigma)
    - xd,yd: data arrays
    - a,b: linear fit parameters as in y=ax+b
    - x: (optional) array with x values to calculate the confidence band. If
      none is provided, will by default generate 100 points in the original
      x-range of the data.

    Returns:
    Sequence (lcb,ucb,x) with the arrays holding the lower and upper confidence
    bands corresponding to the [input] x array.

    Usage:

    >>> lcb,ucb,x=nemmen.confband(all.kp,all.lg,a,b,conf=0.95)

    calculates the confidence bands for the given input arrays

    >>> pylab.fill_between(x, lcb, ucb, alpha=0.3, facecolor='gray')

    plots a shaded area containing the confidence band

    References:

    1. http://en.wikipedia.org/wiki/Simple_linear_regression, see Section Confidence intervals
    2. http://www.weibull.com/DOEWeb/confidence_intervals_in_simple_linear_regression.htm

    v1 Dec. 2011
    v2 Jun. 2012: corrected bug in computing dy
    """
    alpha=1.-conf   # significance
    n=xd.size   # data sample size

    if x is None:
        x=np.linspace(xd.min(),xd.max(),100)

    # Predicted values (best-fit model)
    y=a*x+b

    # Auxiliary definitions
    sd=scatterfit(xd,yd,a,b)    # scatter of data about the model
    sxd=np.sum((xd-xd.mean())**2)
    sx=(x-xd.mean())**2 # array

    # Quantile of Student's t distribution for p=1-alpha/2
    q=scipy.stats.t.ppf(1.-alpha/2.,n-2)

    # Confidence band
    dy=q*sd*np.sqrt( 1./n + sx/sxd )
    ucb=y+dy    # upper confidence band
    lcb=y-dy    # lower confidence band

    return lcb,ucb,x

def predband(xd,yd,a,b,conf=0.95,x=None):
    """
    Calculates the prediction band of the linear regression model at the
    desired confidence level, using analytical methods.

    Clarification of the difference between confidence and prediction bands:

    "The 2sigma confidence interval is 95% sure to contain the best-fit
    regression line. This is not the same as saying it will contain 95% of the
    data points. The prediction bands are further from the best-fit line than
    the confidence bands, a lot further if you have many data points. The 95%
    prediction interval is the area in which you expect 95% of all data points
    to fall." (from http://graphpad.com/curvefit/linear_regression.htm)

    Arguments:

    - conf: desired confidence level, by default 0.95 (2 sigma)
    - xd,yd: data arrays
    - a,b: linear fit parameters as in y=ax+b
    - x: (optional) array with x values to calculate the prediction band. If
      none is provided, will by default generate 100 points in the original
      x-range of the data.

    Usage:

    >>> lpb,upb,x=nemmen.predband(all.kp,all.lg,a,b,conf=0.95)

    calculates the prediction bands for the given input arrays

    >>> pylab.fill_between(x, lpb, upb, alpha=0.3, facecolor='gray')

    plots a shaded area containing the prediction band

    :returns: Sequence (lpb,upb,x) with the arrays holding the lower and upper
      prediction bands corresponding to the [input] x array.

    References:

    1. `Introduction to Simple Linear Regression, Gerard E. Dallal, Ph.D. <http://www.JerryDallal.com/LHSP/slr.htm>`_
    """
    alpha=1.-conf   # significance
    n=xd.size   # data sample size

    if x is None:
        x=np.linspace(xd.min(),xd.max(),100)

    # Predicted values (best-fit model)
    y=a*x+b

    # Auxiliary definitions
    sd=scatterfit(xd,yd,a,b)    # scatter of data about the model
    sxd=np.sum((xd-xd.mean())**2)
    sx=(x-xd.mean())**2 # array

    # Quantile of Student's t distribution for p=1-alpha/2
    q=scipy.stats.t.ppf(1.-alpha/2.,n-2)

    # Prediction band
    dy=q*sd*np.sqrt( 1.+1./n + sx/sxd )
    upb=y+dy    # upper prediction band
    lpb=y-dy    # lower prediction band

    return lpb,upb,x

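# Illustrative sketch (not part of the original module): plotting the analytical
# confidence and prediction bands for a linear fit. Assumes matplotlib is
# available; the dataset and the helper name _demo_bands are hypothetical.
def _demo_bands():
    """Minimal sketch comparing confband and predband on synthetic data."""
    import matplotlib.pyplot as plt

    rng=np.random.default_rng(0)
    xd=np.linspace(0.,5.,30)
    yd=1.5*xd+0.3+rng.normal(0.,0.4,xd.size)
    a, b, r, p, err = scipy.stats.linregress(xd,yd)

    lcb,ucb,x=confband(xd,yd,a,b,conf=0.95)    # narrow band around the best-fit line
    lpb,upb,x=predband(xd,yd,a,b,conf=0.95)    # wider band expected to contain ~95% of the points

    plt.plot(xd,yd,'o')
    plt.plot(x,a*x+b)
    plt.fill_between(x,lcb,ucb,alpha=0.3,facecolor='gray')
    plt.fill_between(x,lpb,upb,alpha=0.15,facecolor='gray')
    plt.show()
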
def confbandnl(xd,yd,fun,par,varcov,deg,conf=0.95,x=None):
    """
    Calculates the confidence band of a nonlinear model at the desired
    confidence level, using analytical methods.

    Arguments:

    - xd,yd: data arrays
    - fun : function f(v) - the model - which returns a scalar. v is an array
      such that v[0]=x (scalar), v[i>0] = parameters of the model
    - par : array or list with structure [par0, par1, par2, ...] with the
      best-fit parameters that will be fed into fun
    - varcov : variance-covariance matrix obtained from the nonlinear fit
    - deg : number of free parameters in the model
    - conf: desired confidence level, by default 0.95 (2 sigma)
    - x: (optional) array with x values to calculate the confidence band. If
      none is provided, will by default generate 100 points in the original
      x-range of the data.

    Usage:

    >>> lcb,ucb,x=nemmen.confbandnl(all.kp,all.lg,broken,bfit,bcov,4,conf=0.95)

    calculates the confidence bands for the given input arrays

    >>> pylab.fill_between(x, lcb, ucb, alpha=0.3, facecolor='gray')

    plots a shaded area containing the confidence band

    :returns: Sequence (lcb,ucb,x) with the arrays holding the lower and upper
      confidence bands corresponding to the [input] x array.

    References:

    1. `How does Prism compute confidence and prediction bands for nonlinear regression? <http://www.graphpad.com/faq/viewfaq.cfm?faq=1099>`_
    2. http://stats.stackexchange.com/questions/15423/how-to-compute-prediction-bands-for-non-linear-regression
    3. see also my notebook
    """
    import numdifftools

    alpha=1.-conf   # significance
    n=xd.size   # data sample size

    if x is None:
        x=np.linspace(xd.min(),xd.max(),100)

    # Gradient (needs to be evaluated)
    dfun=numdifftools.Gradient(fun)

    # Quantile of Student's t distribution for p=1-alpha/2
    q=scipy.stats.t.ppf(1.-alpha/2.,n-2)

    # Residual sum of squares
    rss=residual(yd, misc.evalfun(fun,xd,par))

    grad,p=[],[]
    y=misc.evalfun(fun,x,par)
    v=np.zeros_like(x)

    for i in range(x.size):
        # List: arrays consisting of [x[i], par1, par2, ...]
        p.append( np.concatenate(([x[i]],par)) )
        # List: each element -> gradient evaluated at each xi
        grad.append(dfun(p[i]))
        # Before processing the grad array, eliminates the first element
        temp=np.dot(np.transpose(grad[i][1:]), varcov)
        v[i]=np.dot(temp, grad[i][1:])

    # Confidence band
    dy=q*np.sqrt( v*rss/(n-deg) )
    ucb=y+dy    # upper confidence band
    lcb=y-dy    # lower confidence band

    return lcb,ucb,x

def predbandnl(xd,yd,fun,par,varcov,deg,conf=0.95,x=None):
    """
    Calculates the prediction band of a nonlinear model at the desired
    confidence level, using analytical methods.

    Arguments:

    - xd,yd: data arrays
    - fun : function f(v) - the model - which returns a scalar. v is an array
      such that v[0]=x, v[i>0] = parameters of the model
    - par : array or list with structure [par0, par1, par2, ...] with the
      best-fit parameters that will be fed into fun
    - varcov : variance-covariance matrix obtained from the nonlinear fit
    - deg : number of free parameters in the model
    - conf: desired confidence level, by default 0.95 (2 sigma)
    - x: (optional) array with x values to calculate the prediction band. If
      none is provided, will by default generate 100 points in the original
      x-range of the data.

    Usage:

    >>> lpb,upb,x=nemmen.predbandnl(all.kp,all.lg,broken,bfit,bcov,4,conf=0.95)

    calculates the prediction bands for the given input arrays

    >>> pylab.fill_between(x, lpb, upb, alpha=0.3, facecolor='gray')

    plots a shaded area containing the prediction band

    :returns: Sequence (lpb,upb,x) with the arrays holding the lower and upper
      prediction bands corresponding to the [input] x array.

    References:

    1. http://www.graphpad.com/faq/viewfaq.cfm?faq=1099, "How does Prism compute confidence and prediction bands for nonlinear regression?"
    2. http://stats.stackexchange.com/questions/15423/how-to-compute-prediction-bands-for-non-linear-regression
    3. see also my notebook
    """
    import numdifftools

    alpha=1.-conf   # significance
    n=xd.size   # data sample size

    if x is None:
        x=np.linspace(xd.min(),xd.max(),100)

    # Gradient (needs to be evaluated)
    dfun=numdifftools.Gradient(fun)

    # Quantile of Student's t distribution for p=1-alpha/2
    q=scipy.stats.t.ppf(1.-alpha/2.,n-2)

    # Residual sum of squares
    rss=residual(yd, misc.evalfun(fun,xd,par))

    grad,p=[],[]
    y=misc.evalfun(fun,x,par)
    v=np.zeros_like(x)

    for i in range(x.size):
        # List: arrays consisting of [x[i], par1, par2, ...]
        p.append( np.concatenate(([x[i]],par)) )
        # List: each element -> gradient evaluated at each xi
        grad.append(dfun(p[i]))
        # Before processing the grad array, eliminates the first element
        temp=np.dot(np.transpose(grad[i][1:]), varcov)
        v[i]=np.dot(temp, grad[i][1:])

    # Prediction band
    dy=q*np.sqrt( (1.+v)*rss/(n-deg) )
    upb=y+dy    # upper prediction band
    lpb=y-dy    # lower prediction band

    return lpb,upb,x

def confbandmc(x,par,varcov,n=100000,sigmas=1.):
    """
    Calculates the confidence band (1 sigma by default) of a linear model
    without needing to specify the data points, by doing a Monte Carlo
    simulation using the multivariate normal distribution. Assumes the
    parameters follow a 2D normal distribution.

    Arguments:

    - x: array with x values to calculate the confidence band.
    - par : array or list with structure [par0, par1, par2, ...] with the best-fit parameters
    - varcov : variance-covariance matrix of the parameters
    - n : number of (a,b) points generated from the multivariate gaussian
    - sigmas : number of standard deviations contained within the confidence band

    Output:

    - lcb, ucb : lower and upper confidence bands
    - y : values of the model tabulated at x

    Usage:

    >>> lcb,ucb,y=nemmen.confbandmc(x,bfit,bcov)

    calculates the confidence bands for the given input arrays

    >>> pylab.fill_between(x, lcb, ucb, alpha=0.3, facecolor='gray')

    plots a shaded area containing the confidence band
    """
    # Generates many realizations of a and b from the multinormal distribution
    ar,br = np.random.multivariate_normal(par,varcov,n).T

    erry=[] # will contain the std deviation in y
    y=[]    # values of y for each x

    for xi in x:
        yr=ar*xi+br
        erry.append( yr.std() )
        y.append( yr.mean() )

    erry=np.array(erry)
    y=np.array(y)

    ucb=y+sigmas*erry   # upper confidence band
    lcb=y-sigmas*erry   # lower confidence band

    return lcb,ucb,y

def credbandmc(x,slope,inter,sigmas=1.):
    """
    Calculates the confidence (or credibility, in the case of a Bayesian
    analysis) band of a linear regression model, given the posterior
    probability distributions of the slope and intercept (presumably computed
    via Bayesian regression, see bayeslin.pro). These distributions do not need
    to be normal.

    Arguments:

    - x: array with x values to calculate the confidence band.
    - slope, inter: posterior distributions of slope and intercept for linear regression
    - sigmas : number of standard deviations contained within the confidence band

    Output:

    - lcb, ucb : lower and upper confidence bands
    - y : best-fit values of the model tabulated at x

    Usage:

    >>> lcb,ucb,y=nemmen.credbandmc(x,a,b)

    calculates the credibility bands for the given input arrays

    >>> pylab.fill_between(x, lcb, ucb, alpha=0.3, facecolor='gray')

    plots a shaded area containing the credibility band

    v1 Jun. 2012: inspired by private communication with B. Kelly.
    """
    a,b = slope,inter

    # Define the confidence/credibility interval
    conf=1.-scipy.special.erf(sigmas/np.sqrt(2.))

    lcb,ucb,ym=np.zeros_like(x),np.zeros_like(x),np.zeros_like(x)

    for i, xi in enumerate(x):
        # Distribution of y for each x
        yp=a*xi+b   # 'p' as in posterior

        # Lower confidence band
        lcb[i]=scipy.stats.scoreatpercentile(yp,conf*100./2.)
        # Upper confidence band
        ucb[i]=scipy.stats.scoreatpercentile(yp,100.*(1.-conf/2.))
        # Median of the model
        ym[i]=np.median(yp)

    return lcb,ucb,ym

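# Illustrative sketch (not part of the original module): using credbandmc with
# mock posterior samples of the slope and intercept. The gaussian posteriors
# below and the helper name _demo_credbandmc are assumptions for demonstration.
def _demo_credbandmc():
    """Minimal sketch of a credibility band built from posterior samples."""
    rng=np.random.default_rng(1)
    slope=rng.normal(2.,0.1,5000)   # mock posterior of the slope
    inter=rng.normal(1.,0.2,5000)   # mock posterior of the intercept
    x=np.linspace(0.,10.,50)

    # ym is the median model; lcb/ucb bracket the 68% credibility region
    lcb,ucb,ym=credbandmc(x,slope,inter,sigmas=1.)
    return lcb,ucb,ym
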
def predband_linboot(x,a,b,n):
    """
    Computes information for plotting the prediction band ("bow tie" plot)
    around the best-fit. Run this after running `linboot` below.

    Usage: plots the prediction band and best-fit given data x,y

    >>> n=1000
    >>> a,b=nmmn.stats.linboot(x,y,n)
    >>> xMock,yBest,yStd=nmmn.stats.predband_linboot(x,a,b,n)
    >>> plt.plot(x,y,'o')
    >>> plt.plot(xMock,yBest)
    >>> plt.fill_between(xMock,yBest-yStd,yBest+yStd,alpha=0.3, facecolor='gray')

    :param x: array of x data values
    :param a: array of slopes from bootstrapped fits
    :param b: array of intercepts from bootstrapped fits
    :param n: number of bootstrapping resamples that were generated
    :returns: arrays xsim (mock x-values spanning the same interval as the data), the corresponding best-fit values yBest, and yStd (prediction band)
    """
    xsim=np.linspace(x.min(),x.max(),50)

    for i in range(n):
        if i==0:
            yarr=a[i]*xsim+b[i]
        else:
            yarr2=a[i]*xsim+b[i]
            yarr=np.vstack((yarr,yarr2))

    # best-fit values
    amed=np.median(a)
    bmed=np.median(b)
    yBest=amed*xsim+bmed

    # prediction band
    yStd=np.std(yarr,axis=0)

    return xsim,yBest,yStd

# Monte Carlo simulations, generate random numbers
# ====================================================
def gauss2d(par,varcov,n=10000):
    """
    Calculates random numbers drawn from the multinormal distribution in two
    dimensions. Also computes the probability associated with each number,
    which can be used to compute confidence levels.

    Arguments:

    - par : array or list with structure [par0, par1, par2, ...] with the best-fit parameters
    - varcov : variance-covariance matrix of the parameters
    - n : number of (a,b) points generated from the multivariate gaussian

    Output:

    - x,y : arrays of random values generated from the multinormal distribution
    - prob : probability associated with each value

    Usage:

    >>> x,y,prob=gauss2d(par,varcov,100000)

    v1 Apr. 2012
    """
    # Generates many realizations of a and b from the multinormal distribution
    x,y = np.random.multivariate_normal(par,varcov,n).T

    # Best-fit values
    x0,y0 = par[0], par[1]

    dx,dy = x-x0,y-y0
    sigx,sigy=np.sqrt(varcov[0,0]),np.sqrt(varcov[1,1])
    cov=varcov[0,1]
    rho=cov/(sigx*sigy) # correlation coefficient

    numchisq=(dx/sigx)**2+(dy/sigy)**2-2.*rho*(dx/sigx)*(dy/sigy)
    chisq=numchisq/(1.-rho**2)

    prob=np.exp(-chisq/2.)

    return x,y,prob

def bootcorr(x,y,nboot):
    """
    Given (X,Y) data points with intrinsic scatter, computes the bootstrapped
    Pearson and Spearman correlation coefficients. This will give you an idea
    of how much outliers affect your correlation.

    Usage:

    >>> r,rho=bootcorr(x,y,100000)

    performs 100000 bootstrapping realizations on the arrays x and y.

    :returns: *r* - array with bootstrapped Pearson statistics
    :returns: *rho* - array with bootstrapped Spearman statistics
    """
    from . import lsd

    r,rho=[],[]

    for i in range(nboot):
        [xsim,ysim]=lsd.bootstrap([x,y])

        # Pearson r
        rsim=scipy.stats.pearsonr(xsim,ysim)
        r.append(rsim[0])

        # Spearman rho
        rhosim=scipy.stats.spearmanr(xsim,ysim)
        rho.append(rhosim[0])

    r,rho=np.array(r),np.array(rho)

    results=np.array([ np.median(r), r.std(), np.median(rho), rho.std() ])
    print("<r> err_r <rho> err_rho")
    print(np.round(results, 2))

    results=np.array([ r2p(np.median(r)-np.abs(r.std()),x.size), r2p(np.median(r),x.size), r2p(np.median(r)+np.abs(r.std()),x.size) ])
    print("Prob. <- <r>-std, <r>, <r>+std")
    print(results)

    print("Rejection of H0 respectively at")
    for p in results:
        print(round(p2sig(p),2))

    return r,rho

def gen_ts(y,erry,n,zeropad=True):
    r"""
    Given a time series (TS) with uncertainties on the signal, this will
    generate *n* new TS with y-values distributed according to the error bars.
    Output will be a :math:`n \times \mathrm{size}(t)` array. Each row of this
    array contains a simulated TS.

    :param y: array of y-values for the time series (do not need to be in order)
    :param erry: array of 1-sigma errors on the y-values
    :param n: number of mock TS to generate
    :param zeropad: are y-values<0 not allowed? `True` will turn any value<0 into 0
    :returns: `n x size(t)` array. Each row of this array contains a simulated TS
    """
    ysim=np.empty((n,y.size))

    for i in range(y.size):
        # generate new points given a normal distribution
        ysim[:,i]=np.random.normal(y[i],erry[i],n)

    # makes sure no value is smaller than zero
    if zeropad:
        ysim[ysim<0]=0.

    return ysim

def random(a,b,n=None):
    """
    Generates an array of random uniformly distributed floats in the
    interval *[a,b)*.

    >>> random(0.3,0.4,1000)
    """
    return (b - a) * np.random.random_sample(n) + a

def linboot(x,y,n):
    """
    Performs the usual linear regression *with bootstrapping*.

    Usage:

    >>> a,b=linboot(x,y,10000)

    performs 10000 bootstrapped fits on the arrays x and y. Plot the best-fit with

    >>> amed=np.median(a)
    >>> bmed=np.median(b)
    >>> plt.plot(x,y,'o')
    >>> plt.plot(x,amed*x+bmed)

    :param x: array of x data values
    :param y: array of y data values
    :param n: number of bootstrapping resamples that will be generated
    :returns: arrays `a` and `b`; each element is a different bootstrapped fit
    """
    from . import lsd

    # y = A*x + B
    a, b=[],[]

    for i in range(n):
        # This is needed for small datasets. With datasets of 4 points or less,
        # bootstrapping can generate a mock array with 4 repeated points. This
        # will cause an error in the linear regression.
        allEquals=True
        while allEquals:
            [xsim,ysim]=lsd.bootstrap([x,y])
            allEquals=lsd.allEqual(xsim)

        # Linear fit
        asim, bsim, rsim, p, err = scipy.stats.linregress(xsim,ysim)
        a.append(asim)
        b.append(bsim)

    a,b=np.array(a),np.array(b)

    return a,b

# Comparing goodness-of-fit of different models
# ================================================
# F-test, Akaike information criterion and reduced chi-squared
def ftest(rss1,rss2,n,p1,p2):
    r"""
    Carries out the F-test to decide which model fits the data better. Computes
    the F statistic, the associated p-value and the significance in sigmas with
    which we can reject the null hypothesis. You can also give the
    :math:`\chi^2` values instead of RSS if you have y-errors.

    Note that p2>p1 must be obeyed, i.e. model 2 is "nested" within model 1.

    Usage:

    >>> fstat,pvalue,conf = ftest(rss1,rss2,n,p1,p2)

    Arguments:

    - rss1, rss2 : residual sum of squares for models 1 and 2
    - n : sample size, i.e. number of data points
    - p1, p2 : number of free parameters in the models

    Returns:

    - fstat : the F statistic
    - pvalue : p-value associated with the F statistic
    - conf : significance in sigmas with which we can reject the null hypothesis

    References:

    1. http://en.wikipedia.org/wiki/F-test
    2. http://graphpad.com/curvefit/2_models__1_dataset.htm, Graphpad.com, Comparing the fits of two models (CurveFit.com)

    v1 Dec. 2011
    v2 Jan 16 2012: added comment regarding y-errors
    """
    # F statistic for nested models: the decrease in RSS per extra parameter,
    # normalized by the RSS per degree of freedom of model 2. This replaces the
    # call to scipy.stats.f_value, which is no longer available in recent scipy
    # versions, and is algebraically equivalent to it.
    fstat=( (rss1-rss2)/(p2-p1) ) / ( rss2/(n-p2) )
    pvalue=1.-scipy.stats.f.cdf(fstat,p2-p1,n-p2)
    conf=np.sqrt(2.)*scipy.special.erfinv(1.-pvalue)

    return fstat, pvalue, conf

def aic(k,n,rss,errors=False):
    """
    Computes the Akaike information criterion, which is "a measure of the
    relative goodness of fit of a statistical model". Unlike the F-test, it
    does not assume that one model is a particular case of the more complex one.

    If errors==False then assume the errors are the same for every data point.
    Otherwise, assumes you are providing the chi-squared values instead of RSS.

    Usage:

    >>> aic = aic(k,n,rss)

    :param rss: residual sum of squares for the model in case errors=False, otherwise assumes you are giving the chi-square values
    :param n: sample size, i.e. number of data points
    :param k: number of free parameters in the model
    :returns: AIC statistic

    References:

    1. Documentation for Origin software on fit comparison: http://www.originlab.com/index.aspx?go=Products/Origin/DataAnalysis/CurveFitting/NonlinearFitting&pid=1195 (first heard about this test there)
    2. http://en.wikipedia.org/wiki/Akaike_information_criterion

    v1 Dec 2011
    """
    if errors==False:
        # AIC assuming the errors are identical and given the residual sum of squares,
        # see http://en.wikipedia.org/wiki/Akaike_information_criterion#Relevance_to_chi-squared_fitting
        aicstat=n*np.log(rss/n)+2.*k
    else:
        # If you have different errors for the data points, it will assume rss=chisq
        aicstat=rss+2.*k

    # AICc = AIC with a correction for finite sample size
    aicc=aicstat+2.*k*(k+1.)/(n-k-1.)

    return aicc

def bic(k,n,rss,errors=False):
    """
    Computes the Bayesian information criterion, which is "a criterion for
    model selection among a finite set of models". From wikipedia: "The BIC
    [...] introduces a penalty term for the number of parameters in the model.
    The penalty term is larger in BIC than in AIC."

    In order to use BIC to quantify the evidence against a specific model,
    check out the section "use of BIC" in the presentation "171:290 Model
    Selection, Lecture VI: The Bayesian Information Criterion" by Joseph
    Cavanaugh (http://myweb.uiowa.edu/cavaaugh/ms_lec_6_ho.pdf).

    If errors==False then assume the errors are the same for every data point.
    Otherwise, assumes you are providing the chi-squared values instead of RSS.

    Usage:

    >>> bic = bic(k,n,rss)

    Arguments:

    - rss : residual sum of squares for the model in case errors=False, otherwise assumes you are giving the chi-square values
    - n : sample size, i.e. number of data points
    - k : number of free parameters in the model

    Returns: BIC statistic

    References:

    1. http://en.wikipedia.org/wiki/Bayesian_information_criterion
    2. "Model Selection, Lecture VI: The Bayesian Information Criterion" by Joseph Cavanaugh (http://myweb.uiowa.edu/cavaaugh/ms_lec_6_ho.pdf)

    v1 Apr 2012
    """
    if errors==False:
        # BIC assuming the errors are identical and given the residual sum of squares,
        # using the unbiased variance
        bicstat=n*np.log(rss/(n-1.))+k*np.log(n)
    else:
        # If you have different errors for the data points, it will assume rss=chisq
        bicstat=rss+k*np.log(n)

    return bicstat

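# Illustrative sketch (not part of the original module): comparing a straight
# line against a quadratic with ftest, aic and bic. The synthetic dataset, the
# use of numpy.polyfit and the helper name _demo_modelcomparison are assumptions
# made only for illustration.
def _demo_modelcomparison():
    """Minimal sketch of nested-model comparison with ftest/aic/bic."""
    rng=np.random.default_rng(3)
    x=np.linspace(0.,5.,40)
    y=0.5*x**2+x+rng.normal(0.,0.5,x.size)

    # Model 1: line (2 free parameters); model 2: parabola (3 free parameters)
    lin=np.polyval(np.polyfit(x,y,1),x)
    par=np.polyval(np.polyfit(x,y,2),x)
    rss1,rss2=residual(y,lin),residual(y,par)

    fstat,pvalue,conf=ftest(rss1,rss2,x.size,2,3)
    print("F-test: p=%g, %0.1f sigma" % (pvalue,conf))
    print("AICc line vs parabola:", aic(2,x.size,rss1), aic(3,x.size,rss2))
    print("BIC  line vs parabola:", bic(2,x.size,rss1), bic(3,x.size,rss2))
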
def redchisq(x,y,a=None,b=None,sd=None):
    """
    Returns the reduced chi-square error statistic for a linear fit: chisq/nu,
    where nu is the number of degrees of freedom. If individual standard
    deviations (array sd) are supplied, then the chi-square error statistic is
    computed as the sum of squared errors divided by the standard deviations.
    See http://en.wikipedia.org/wiki/Goodness_of_fit for reference.

    If a linear model is not provided via A,B (y=ax+b), then the method
    computes the chi-square using the best-fit line.

    x,y,sd assumed to be Numpy arrays. a,b scalars.

    Returns the float chisq/nu with the reduced chi-square statistic.
    """
    if a is None:
        # Performs linear regression
        a, b, r, p, err = scipy.stats.linregress(x,y)

    # Chi-square statistic
    if sd is None:
        chisq=np.sum((y-a*x-b)**2)
    else:
        chisq=np.sum( ((y-a*x-b)/sd)**2 )

    # Number of degrees of freedom assuming 2 free parameters
    nu=x.size-3

    return chisq/nu

def redchisqxy(x,y,errx,erry,a,b):
    r"""
    Returns the reduced chi-square error statistic for a linear fit,
    :math:`\chi^2/\nu`, where :math:`\nu` is the number of degrees of freedom.
    The chi-square statistic is computed taking into account the errors in both
    X and Y (i.e. the effective variance). See equation 3 in Ascenso et al.
    2012, A&A or Yee & Ellingson 2003, ApJ.

    Usage:

    >>> chisq=redchisqxy(xdata,ydata,errx,erry,a,b)

    where

    - xdata,ydata : data
    - errx,erry : measurement uncertainties in the data
    - a,b : slope and intercept of the best-fit linear regression model
    """
    sdsq=erry**2+a**2*errx**2
    chisq=np.sum( (y-a*x-b)**2/sdsq )

    # Number of degrees of freedom assuming 2 free parameters
    nu=x.size-3

    return chisq/nu

def redchisqg(ydata,ymod,deg=2,sd=None):
    """
    Returns the reduced chi-square error statistic for an arbitrary model,
    chisq/nu, where nu is the number of degrees of freedom. If individual
    standard deviations (array sd) are supplied, then the chi-square error
    statistic is computed as the sum of squared errors divided by the standard
    deviations. See http://en.wikipedia.org/wiki/Goodness_of_fit for reference.

    ydata,ymod,sd assumed to be Numpy arrays. deg integer.

    Usage:

    >>> chisq=redchisqg(ydata,ymod,n,sd)

    where

    - ydata : data
    - ymod : model evaluated at the same x points as ydata
    - n : number of free parameters in the model
    - sd : uncertainties in ydata
    """
    # Chi-square statistic
    if sd is None:
        chisq=np.sum((ydata-ymod)**2)
    else:
        chisq=np.sum( ((ydata-ymod)/sd)**2 )

    # Number of degrees of freedom, given deg free parameters
    nu=ydata.size-1-deg

    return chisq/nu

def r2(ydata,ymod):
    """
    Computes the "coefficient of determination" R^2. According to wikipedia,
    "It is the proportion of variability in a data set that is accounted for by
    the statistical model. It provides a measure of how well future outcomes
    are likely to be predicted by the model."

    This method was inspired by
    http://stackoverflow.com/questions/3460357/calculating-the-coefficient-of-determination-in-python

    References:

    1. http://en.wikipedia.org/wiki/Coefficient_of_determination
    2. http://stackoverflow.com/questions/3460357/calculating-the-coefficient-of-determination-in-python

    v1 Apr 18th 2012
    """
    ss_tot = np.sum( (ydata-ydata.mean())**2 )
    ss_err = np.sum( (ydata-ymod)**2 )

    return 1.-ss_err/ss_tot

def fitstats(xdata,ydata,errx,erry,a,b):
    """
    Computes different useful statistics for the given (X +- errX, Y +- errY)
    arrays of data. Also quantifies the goodness-of-fit of the provided linear
    fit with slope 'a' and intercept 'b'.

    Usage:

    >>> r,rho,rchisq,scat,iscat,r2=fitstats(xdata,ydata,errx,erry,a,b)

    :param xdata,ydata: data
    :param errx,erry: measurement uncertainties in the data
    :param a,b: slope and intercept of the best-fit linear regression model

    Returns:

    - Pearson 'r'
    - Spearman 'rho'
    - Reduced chi-squared 'chi^2_nu'
    - raw scatter about the best-fit
    - intrinsic scatter
    - Coefficient of determination 'R^2'
    """
    # Pearson r
    r=scipy.stats.pearsonr(xdata,ydata)

    # Spearman rho
    rho=scipy.stats.spearmanr(xdata,ydata)

    # Reduced chi-squared
    rchisq=redchisqxy(xdata,ydata,errx,erry,a,b)

    # raw scatter
    scat=scatpratt(xdata,ydata,errx,erry,a,b)

    # intrinsic scatter
    iscat=intscat(xdata,ydata,errx,erry,a,b)

    # Coefficient of determination R^2
    rtwo=r2(ydata,a*xdata+b)

    return r,rho,rchisq,scat,iscat,rtwo

# Operations on statistical distributions
# =======================================
def splitstd(x):
    """
    Given the input distribution, this method computes two standard deviations:
    the left and right side spreads. This is especially useful if you are
    dealing with a split normal distribution from your data and you want to
    quantify the lower and upper error bars without having to fit a split
    normal distribution.

    This is a quick nonparametric method, as opposed to the more
    computationally intensive minimization involved when fitting a function.

    Algorithm:

    - divide the original distribution in two distributions at the mode
    - create two new distributions, which are the symmetrical versions of the left and right side of the original one
    - compute the standard deviations for the new mirror distributions

    :param x: array or list containing the distribution
    :returns: left.stddev, right.stddev

    .. note:: do not forget to inspect *x*.
    """
    #med=median(x)
    med=mode(x)

    x1=x[ np.where(x<med) ] # left side
    x1mirror=x1+2.*np.abs(x1-med)   # mirror side
    x1new=np.concatenate((x1,x1mirror))

    x2=x[ np.where(x>=med) ]    # right side
    x2mirror=x2-2.*np.abs(x2-med)   # mirror side
    x2new=np.concatenate((x2,x2mirror))

    return x1new.std(), x2new.std()

def mode(x,**kwargs):
    """
    Finds the mode of a distribution, i.e. the value where the PDF peaks.

    :param x: input list/array with the distribution
    """
    from . import lsd

    yh,xh=np.histogram(x,50,density=True,**kwargs)

    # shift the bin edges by half a bin to get the bin centers
    dxh=(xh[1]-xh[0])/2.
    xh=xh+dxh

    return xh[ lsd.search(yh.max(),yh) ]

# Custom statistical distributions
# ==================================
def randomvariate(pdf,n=1000,xmin=0,xmax=1):
    """
    Rejection method for random number generation: uses the rejection method
    for generating random numbers derived from an arbitrary probability
    distribution. For reference, see Bevington's book, page 84. Based on
    rejection*.py.

    Usage:

    >>> randomvariate(P,N,xmin,xmax)

    where

    :param P: probability distribution function from which you want to generate random numbers
    :param N: desired number of random values
    :param xmin,xmax: range of random numbers desired
    :returns: the sequence (ran,ntrials) where
        ran : array of shape N with the random variates that follow the input P
        ntrials : number of trials the code needed to achieve N

    Here is the algorithm:

    - generate x' in the desired range
    - generate y' between Pmin and Pmax (Pmax is the maximal value of your pdf)
    - if y'<P(x') accept x', otherwise reject
    - repeat until desired number is achieved

    v1 Nov. 2011
    """
    # Calculates the minimal and maximum values of the PDF in the desired
    # interval. The rejection method needs these values in order to work
    # properly.
    x=np.linspace(xmin,xmax,1000)
    y=pdf(x)
    pmin=0.
    pmax=y.max()

    # Counters
    naccept=0
    ntrial=0

    # Keeps generating numbers until we achieve the desired n
    ran=[]  # output list of random numbers
    while naccept<n:
        x=np.random.uniform(xmin,xmax)  # x'
        y=np.random.uniform(pmin,pmax)  # y'

        if y<pdf(x):
            ran.append(x)
            naccept=naccept+1
        ntrial=ntrial+1

    ran=np.asarray(ran)

    return ran,ntrial

def splitnorm(x,sig1=1.,sig2=1.,mu=0.):
    """
    Split normal distribution (or asymmetric gaussian) PDF. Useful when dealing
    with asymmetric error bars.

    See http://en.wikipedia.org/wiki/Split_normal_distribution.

    :param x: input array where the PDF will be computed
    :param sig1: left standard deviation
    :param sig2: right std. dev.
    :param mu: mode
    :returns: probability distribution function for the array x
    """
    const=np.sqrt(2./np.pi)/(sig1 + sig2)

    p=np.where(x<mu,
        np.exp(-((x-mu)**2/(2.*sig1**2)))*const,
        np.exp(-((x-mu)**2/(2.*sig2**2)))*const)

    return p

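# Illustrative sketch (not part of the original module): drawing random numbers
# from the split normal PDF with the rejection sampler above and recovering the
# two spreads with splitstd. The chosen sig1/sig2/mu values and the helper name
# _demo_splitnorm are arbitrary assumptions for the example.
def _demo_splitnorm():
    """Minimal sketch tying randomvariate, splitnorm and splitstd together."""
    pdf=lambda x: splitnorm(x,sig1=1.,sig2=3.,mu=0.)
    ran,ntrials=randomvariate(pdf,n=20000,xmin=-10.,xmax=20.)

    # splitstd should recover spreads close to (1,3)
    print("left/right spreads:", splitstd(ran))
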
class splitnorm_gen(scipy.stats.rv_continuous):
    """
    Split normal distribution defined using scipy.stats. Can be called as any
    scipy.stats distribution. I took the effort of defining the extra CDF and
    PPF methods below in order to speed up calls to this class (I have
    particularly in mind the package *mcerp*).

    :param x: input array where the PDF will be computed
    :param sig1: left standard deviation
    :param sig2: right std. dev.
    :param mu: mode

    Examples:

    Defines distribution with sig1=1, sig2=3, mu=0:

    >>> split = splitnorm_gen(name='splitnorm', shapes='sig1, sig2, mu')
    >>> s=split(1,3,0.0001)
    >>> x=np.linspace(-10,10,100)

    Computes PDF:

    >>> s.pdf(x)

    Computes CDF:

    >>> s.cdf(x)

    Generates 100 random numbers:

    >>> s.rvs(100)

    .. warning:: for some reason, this fails if mu=0.
    """
    def _pdf(self, x, sig1, sig2, mu):
        const=np.sqrt(2./np.pi)/(sig1 + sig2)

        p=np.where(x<=mu,
            np.exp(-((x-mu)**2/(2.*sig1**2)))*const,
            np.exp(-((x-mu)**2/(2.*sig2**2)))*const)

        return p

    def _cdf(self, x, sig1, sig2, mu):
        const=1./(sig1+sig2)

        c=np.where(x<=mu,
            sig1*const*(1.+scipy.special.erf((x-mu)/(np.sqrt(2.)*sig1))),
            const*(sig1+sig2*scipy.special.erf((x-mu)/(np.sqrt(2.)*sig2))) )

        return c

    def _ppf(self, q, sig1, sig2, mu):
        nu=sig1/(sig1 + sig2)

        pf=np.where(q<=nu,
            np.sqrt(2.)*sig1*scipy.special.erfinv((sig1 + sig2)/sig1*q - 1.) + mu,
            np.sqrt(2.)*sig2*scipy.special.erfinv(((sig1 + sig2)*q - sig1)/sig2) + mu )

        return pf

    def _stats(self, sig1, sig2, mu):
        mean=mu+np.sqrt(2./np.pi)*(sig2 - sig1)
        var=(1.-2./np.pi)*(sig2-sig1)**2 + sig1*sig2
        skew=np.sqrt(2./np.pi)*(sig2-sig1)*( (4./np.pi-1.)*(sig2-sig1)**2+sig1*sig2 )
        kurt=None

        return mean, var, skew, kurt

# p-values and significance
# ===========================
def r2p(r,n):
    """
    Given a value of the Pearson r coefficient, this method gives the
    corresponding p-value of the null hypothesis (i.e. no correlation).

    Code copied from the scipy.stats.pearsonr source.

    Usage:

    >>> p=r2p(r,n)

    where 'r' is the Pearson coefficient and n is the number of data points.

    :returns: p-value of the null hypothesis.
    """
    import scipy.special

    df = n-2

    if abs(r) == 1.0:
        prob = 0.0
    else:
        t_squared = r*r * (df / ((1.0 - r) * (1.0 + r)))
        prob = scipy.special.betainc(0.5*df, 0.5, df / (df + t_squared))

    return prob

def p2sig(p):
    """
    Given the p-value Pnull (i.e. the probability of the null hypothesis),
    evaluates the confidence level with which we can reject the hypothesis,
    in standard deviations.

    Inspired by prob2sig.nb.

    Usage:

    >>> s=p2sig(0.001)
    """
    sig=np.sqrt(2.)*scipy.special.erfinv(1.-p)

    return sig

def conf2sig(p):
    """
    Given a confidence p-value, translates it into standard deviations. E.g.,
    p=0.683 -> 1 sigma, p=0.954 -> 2 sigma, etc.

    Inspired by the p2sig method.

    Usage:

    >>> s=conf2sig(0.683)
    """
    sig=np.sqrt(2.)*scipy.special.erfinv(p)

    return sig

def sig2conf(sig):
    """
    Given a number of std. deviations, translates it into a p-value. E.g.,
    1 sigma -> p=0.683, 2 sigma -> p=0.954, etc.

    Inspired by the p2sig method.

    Usage:

    >>> s=sig2conf(5.)
    """
    return scipy.special.erf(sig/np.sqrt(2.))

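# Illustrative sketch (not part of the original module): converting a Pearson r
# into a p-value and then into a significance in sigmas. The values r=0.85 and
# n=20, and the helper name _demo_significance, are arbitrary choices for the example.
def _demo_significance():
    """Minimal sketch chaining r2p and p2sig."""
    p=r2p(0.85,20)  # p-value of the null hypothesis (no correlation)
    print("p-value:", p, "->", round(p2sig(p),2), "sigma")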