"""
Statistical methods
===================
- fit residuals
- computing prediction and confidence bands
- comparing goodness-of-fit of different models
- operations on statistical distributions
- custom statistical distributions
- p-values and significance
"""
import numpy as np
import scipy
import scipy.stats
import scipy.special  # erf, erfinv, betainc are used throughout this module
from . import misc
# Residuals of a fit
# ===================
# Residual sum of squares, raw scatter about best-fit and
# intrinsic scatter.
def scatterfit(x,y,a=None,b=None):
"""
Compute the mean deviation of the data about the linear model *y=ax+b*, if A and B
are provided as arguments. Otherwise, compute the mean deviation about
the best-fit line.
:param x,y: assumed to be Numpy arrays.
:param a,b: scalars.
:rtype: float sd with the mean deviation.
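Illustrative example (hypothetical, equal-length 1D arrays; not part of the original docs):
>>> x=np.linspace(0.,10.,50)
>>> y=2.*x+1.+np.random.normal(0.,0.5,50)
>>> sd=scatterfit(x,y)        # scatter about the best-fit line
>>> sd=scatterfit(x,y,2.,1.)  # scatter about the model y=2x+1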
"""
if a is None:
# Performs linear regression
a, b, r, p, err = scipy.stats.linregress(x,y)
# Std. deviation of an individual measurement (Bevington, eq. 6.15)
N=np.size(x)
sd=1./(N-2.)* np.sum((y-a*x-b)**2)
sd=np.sqrt(sd)
return sd
def scatterfitg(ydata,ymod,deg):
"""
Compute the mean deviation of the data about the model given in
the array ymod, tabulated exactly like ydata.
Usage:
>>> sd=scatterfitg(ydata,ymod,deg)
:param ydata: data
:param ymod: model evaluated at xdata
:param deg: number of free parameters in the model
:type ydata,ymod: Numpy arrays
:rtype: float sd with the mean deviation.
"""
# Std. deviation of an individual measurement (Bevington, eq. 6.15)
N=np.size(ydata)
sd=1./(N-deg)* np.sum((ydata-ymod)**2)
sd=np.sqrt(sd)
return sd
def scatpratt(x,y,errx,erry,a,b):
"""
This is an alternative way of computing the "raw" scatter about the best-fit
linear relation in the presence of measurement errors, proposed by
Pratt et al. 2009, A&A, 498, 361.
In the words of Cavagnolo et al. (2010):
*We have quantified the total scatter about the best-fit relation using a
weighted estimate of the orthogonal distances to the best-fit line (see
Pratt et al. 2009).*
Usage:
>>> sd=scatpratt(x,y,errx,erry,a,b)
:param x,y: X and Y data arrays
:param errx, erry: standard deviations in X and Y
:param a,b: slope and intercept of best-fit linear relation
:rtype: float sd with the scatter about the best-fit.
v1 Mar 20 2012
"""
N=np.size(x)
# Equation 4 from Pratt et al. 2009
sdsq=erry**2+a**2*errx**2
wden=1./N*np.sum(1./sdsq) # Denominator of w
w=1./sdsq/wden
# Equation 3 from Pratt et al. 2009
sdrawsq=1./(N-2.)*np.sum(w*(y-a*x-b)**2)
sdraw=np.sqrt(sdrawsq)
return sdraw
def residual(ydata,ymod):
"""
Compute the residual sum of squares (RSS) also known as the sum of
squared residuals (see http://en.wikipedia.org/wiki/Residual_sum_of_squares).
Usage:
>>> rss=residual(ydata,ymod)
where
ydata : data
ymod : model evaluated at xdata
ydata,ymod assumed to be Numpy arrays.
Returns the float rss with the residual sum of squares.
Dec. 2011
"""
# Residual sum of squares
rss=np.sum((ydata-ymod)**2)
return rss
def chisq(x,y,a=None,b=None,sd=None):
"""
Returns the chi-square error statistic as the sum of squared errors between
Y(i) and AX(i) + B. If individual standard deviations (array sd) are supplied,
then the chi-square error statistic is computed as the sum of squared errors
divided by the standard deviations. Inspired by the IDL procedure linfit.pro.
See http://en.wikipedia.org/wiki/Goodness_of_fit for reference.
If a linear model is not provided via A,B (y=ax+b) then the method computes
the chi-square using the best-fit line.
x,y,sd assumed to be Numpy arrays. a,b scalars.
Returns the float chisq with the chi-square statistic.
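Illustrative usage (hypothetical arrays; not part of the original docs):
>>> chisquare=chisq(x,y)          # about the best-fit line, unweighted
>>> chisquare=chisq(x,y,a,b,sd)   # about y=ax+b, weighted by the errors sd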
"""
if a is None:
# Performs linear regression
a, b, r, p, err = scipy.stats.linregress(x,y)
# Chi-square statistic (Bevington, eq. 6.9)
if sd is None:
chisq=np.sum((y-a*x-b)**2)
else:
chisq=np.sum( ((y-a*x-b)/sd)**2 )
return chisq
def chisqg(ydata,ymod,sd=None):
"""
Returns the chi-square error statistic as the sum of squared errors between
Ydata(i) and Ymodel(i). If individual standard deviations (array sd) are supplied,
then the chi-square error statistic is computed as the sum of squared errors
divided by the standard deviations. Inspired by the IDL procedure linfit.pro.
See http://en.wikipedia.org/wiki/Goodness_of_fit for reference.
ydata, ymod, sd assumed to be Numpy arrays.
Returns the float chisq with the chi-square statistic.
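Illustrative usage (ydata is the data, ymod the model evaluated at the same abscissas; sd optional):
>>> chisquare=chisqg(ydata,ymod,sd)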
"""
# Chi-square statistic (Bevington, eq. 6.9)
if sd is None:
chisq=np.sum((ydata-ymod)**2)
else:
chisq=np.sum( ((ydata-ymod)/sd)**2 )
return chisq
def chisqxy(x,y,errx,erry,a,b):
"""
Returns the chi-square error statistic for a linear fit,
computed taking into account the errors in both X and Y (i.e. the
effective variance). See equation 3 in Ascenso et al. 2012, A&A or
Yee & Ellingson 2003 ApJ.
Usage:
>>> chisq=chisqxy(xdata,ydata,errx,erry,a,b)
where
- xdata,ydata : data
- errx,erry : measurement uncertainties in the data
- a,b : slope and intercept of the best-fit linear regression model
"""
sdsq=erry**2+a**2*errx**2
chisq=np.sum( (y-a*x-b)**2/sdsq )
return chisq
def intscat(x,y,errx,erry,a,b):
"""
Estimates the intrinsic scatter about the best-fit, taking into account
the errors in X and Y and the "raw" scatter. Inspired by Pratt et al. 2009,
A&A, 498, 361.
Usage:
>>> sd=intscat(x,y,errx,erry,a,b)
where
- x,y : X and Y data arrays
- errx, erry : standard deviations in X and Y
- a,b : slope and intercept of best-fit linear relation
Returns the float sd with the scatter about the best-fit.
v1 Apr 9th 2012
"""
# Raw scatter
sdraw=scatpratt(x,y,errx,erry,a,b)
sdrawsq=sdraw**2
# Statistical variance, eq. 4 from Pratt et al. 2009
sdsq=erry**2+a**2*errx**2
# Intrinsic scatter for each data point
intscati=np.sqrt( np.abs(sdsq-sdrawsq) )
return np.mean(intscati)
def linregress_error(x,y):
"""
Compute the uncertainties in the parameters A,B of the least-squares
linear regression fit y=ax+b to the data (scipy.stats.linregress).
x,y assumed to be Numpy arrays.
Returns the sequence sigma_a, sigma_b with the standard deviations in A and B.
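Illustrative example (x, y are hypothetical data arrays; not from the original docs):
>>> a, b, r, p, err = scipy.stats.linregress(x,y)
>>> sigma_a, sigma_b = linregress_error(x,y)
>>> print("slope = %.2f +- %.2f" % (a, sigma_a))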
"""
# Performs linear regression
a, b, r, p, err = scipy.stats.linregress(x,y)
# Std. deviation of an individual measurement (Bevington, eq. 6.15)
N=np.size(x)
sd=1./(N-2.)* np.sum((y-a*x-b)**2); sd=np.sqrt(sd)
# Std. deviation in parameters
Delta=N*np.sum(x**2)-(np.sum(x))**2 # Bevington, eq. 6.13
sdb=sd**2/Delta*np.sum(x**2); sdb=np.sqrt(sdb) # Bevington, eq. 6.23
sda=N*sd**2/Delta; sda=np.sqrt(sda)
return sda,sdb
# Computing prediction and confidence bands
# ===========================================
#
#
def confband(xd,yd,a,b,conf=0.95,x=None):
"""
Calculates the confidence band of the linear regression model at the desired confidence
level, using analytical methods. The 2sigma confidence interval is 95% sure to contain
the best-fit regression line. This is not the same as saying it will contain 95% of
the data points.
Arguments:
- conf: desired confidence level, by default 0.95 (2 sigma)
- xd,yd: data arrays
- a,b: linear fit parameters as in y=ax+b
- x: (optional) array with x values to calculate the confidence band. If none is provided, will by default generate 100 points in the original x-range of the data.
Returns:
Sequence (lcb,ucb,x) with the arrays holding the lower and upper confidence bands
corresponding to the [input] x array.
Usage:
>>> lcb,ucb,x=nemmen.confband(all.kp,all.lg,a,b,conf=0.95)
calculates the confidence bands for the given input arrays
>>> pylab.fill_between(x, lcb, ucb, alpha=0.3, facecolor='gray')
plots a shaded area containing the confidence band
References:
1. http://en.wikipedia.org/wiki/Simple_linear_regression, see Section Confidence intervals
2. http://www.weibull.com/DOEWeb/confidence_intervals_in_simple_linear_regression.htm
v1 Dec. 2011
v2 Jun. 2012: corrected bug in computing dy
"""
alpha=1.-conf # significance
n=xd.size # data sample size
if x is None: x=np.linspace(xd.min(),xd.max(),100)
# Predicted values (best-fit model)
y=a*x+b
# Auxiliary definitions
sd=scatterfit(xd,yd,a,b) # Scatter of data about the model
sxd=np.sum((xd-xd.mean())**2)
sx=(x-xd.mean())**2 # array
# Quantile of Student's t distribution for p=1-alpha/2
q=scipy.stats.t.ppf(1.-alpha/2.,n-2)
# Confidence band
dy=q*sd*np.sqrt( 1./n + sx/sxd )
ucb=y+dy # Upper confidence band
lcb=y-dy # Lower confidence band
return lcb,ucb,x
def predband(xd,yd,a,b,conf=0.95,x=None):
"""
Calculates the prediction band of the linear regression model at the desired confidence
level, using analytical methods.
Clarification of the difference between confidence and prediction bands:
"The 2sigma confidence interval is 95% sure to contain the best-fit regression line.
This is not the same as saying it will contain 95% of the data points. The prediction bands are
further from the best-fit line than the confidence bands, a lot further if you have many data
points. The 95% prediction interval is the area in which you expect 95% of all data points to fall."
(from http://graphpad.com/curvefit/linear_regression.htm)
Arguments:
- conf: desired confidence level, by default 0.95 (2 sigma)
- xd,yd: data arrays
- a,b: linear fit parameters as in y=ax+b
- x: (optional) array with x values to calculate the confidence band. If none is provided, will
by default generate 100 points in the original x-range of the data.
Usage:
>>> lpb,upb,x=nemmen.predband(all.kp,all.lg,a,b,conf=0.95)
calculates the prediction bands for the given input arrays
>>> pylab.fill_between(x, lpb, upb, alpha=0.3, facecolor='gray')
plots a shaded area containing the prediction band
:returns: Sequence (lpb,upb,x) with the arrays holding the lower and upper prediction bands
corresponding to the [input] x array.
References:
1. `Introduction to Simple Linear Regression, Gerard
E. Dallal, Ph.D. <http://www.JerryDallal.com/LHSP/slr.htm>`_
"""
alpha=1.-conf # significance
n=xd.size # data sample size
if x is None: x=np.linspace(xd.min(),xd.max(),100)
# Predicted values (best-fit model)
y=a*x+b
# Auxiliary definitions
sd=scatterfit(xd,yd,a,b) # Scatter of data about the model
sxd=np.sum((xd-xd.mean())**2)
sx=(x-xd.mean())**2 # array
# Quantile of Student's t distribution for p=1-alpha/2
q=scipy.stats.t.ppf(1.-alpha/2.,n-2)
# Prediction band
dy=q*sd*np.sqrt( 1.+1./n + sx/sxd )
upb=y+dy # Upper prediction band
lpb=y-dy # Lower prediction band
return lpb,upb,x
def confbandnl(xd,yd,fun,par,varcov,deg,conf=0.95,x=None):
"""
Calculates the confidence band of a nonlinear model at the desired confidence
level, using analytical methods.
Arguments:
- xd,yd: data arrays
- fun : function f(v) - the model - which returns a scalar. v is an array
such that v[0]=x (scalar), v[i>0] = parameters of the model
- par : array or list with structure [par0, par1, par2, ...] with the best-fit
parameters that will be fed into fun
- varcov : variance-covariance matrix obtained from the nonlinear fit
- deg : number of free parameters in the model
- conf: desired confidence level, by default 0.95 (2 sigma)
- x: (optional) array with x values to calculate the confidence band. If none is provided, will
by default generate 100 points in the original x-range of the data.
Usage:
>>> lcb,ucb,x=nemmen.confbandnl(all.kp,all.lg,broken,bfit,bcov,4,conf=0.95)
calculates the confidence bands for the given input arrays
>>> pylab.fill_between(x, lcb, ucb, alpha=0.3, facecolor='gray')
plots a shaded area containing the confidence band
:returns: Sequence (lcb,ucb,x) with the arrays holding the lower and upper confidence bands
corresponding to the [input] x array.
References:
1. `How does Prism compute confidence and prediction bands for nonlinear regression? <http://www.graphpad.com/faq/viewfaq.cfm?faq=1099>`_
2. http://stats.stackexchange.com/questions/15423/how-to-compute-prediction-bands-for-non-linear-regression
3. see also my notebook
"""
import numdifftools
alpha=1.-conf # significance
n=xd.size # data sample size
if x is None: x=np.linspace(xd.min(),xd.max(),100)
# Gradient (needs to be evaluated)
dfun=numdifftools.Gradient(fun)
# Quantile of Student's t distribution for p=1-alpha/2
q=scipy.stats.t.ppf(1.-alpha/2.,n-2)
# Residual sum of squares
rss=residual(yd, misc.evalfun(fun,xd,par) )
grad,p=[],[]
i=0
y=misc.evalfun(fun,x,par)
v=np.zeros_like(x)
for i in range(x.size):
# List: arrays consisting of [x[i], par1, par2, ...]
p.append( np.concatenate(([x[i]],par)) )
# List: each element -> gradient evaluated at each xi
grad.append(dfun(p[i]))
# Before processing the grad array, eliminates the first element
temp=np.dot(np.transpose(grad[i][1:]), varcov)
v[i]=np.dot(temp, grad[i][1:])
# Confidence band
dy=q*np.sqrt( v*rss/(n-deg) )
ucb=y+dy # Upper confidence band
lcb=y-dy # Lower confidence band
return lcb,ucb,x
def predbandnl(xd,yd,fun,par,varcov,deg,conf=0.95,x=None):
"""
Calculates the prediction band of a nonlinear model at the desired confidence
level, using analytical methods.
Arguments:
- xd,yd: data arrays
- fun : function f(v) - the model - which returns a scalar. v is an array
such that v[0]=x, v[i>0] = parameters of the model
- par : array or list with structure [par0, par1, par2, ...] with the best-fit
parameters that will be fed into fun
- varcov : variance-covariance matrix obtained from the nonlinear fit
- deg : number of free parameters in the model
- conf: desired confidence level, by default 0.95 (2 sigma)
- x: (optional) array with x values to calculate the confidence band. If none is provided, will
by default generate 100 points in the original x-range of the data.
Usage:
>>> lpb,upb,x=nemmen.predbandnl(all.kp,all.lg,broken,bfit,bcov,4,conf=0.95)
calculates the prediction bands for the given input arrays
>>> pylab.fill_between(x, lpb, upb, alpha=0.3, facecolor='gray')
plots a shaded area containing the prediction band
:returns: Sequence (lpb,upb,x) with the arrays holding the lower and upper prediction bands
corresponding to the [input] x array.
References:
1. http://www.graphpad.com/faq/viewfaq.cfm?faq=1099, "How does Prism compute confidence and prediction bands for nonlinear regression?"
2. http://stats.stackexchange.com/questions/15423/how-to-compute-prediction-bands-for-non-linear-regression
3. see also my notebook
"""
import numdifftools
alpha=1.-conf # significance
n=xd.size # data sample size
if x is None: x=np.linspace(xd.min(),xd.max(),100)
# Gradient (needs to be evaluated)
dfun=numdifftools.Gradient(fun)
# Quantile of Student's t distribution for p=1-alpha/2
q=scipy.stats.t.ppf(1.-alpha/2.,n-2)
# Residual sum of squares
rss=residual(yd, misc.evalfun(fun,xd,par) )
grad,p=[],[]
i=0
y=misc.evalfun(fun,x,par)
v=np.zeros_like(x)
for i in range(x.size):
# List: arrays consisting of [x[i], par1, par2, ...]
p.append( np.concatenate(([x[i]],par)) )
# List: each element -> gradient evaluated at each xi
grad.append(dfun(p[i]))
# Before processing the grad array, eliminates the first element
temp=np.dot(np.transpose(grad[i][1:]), varcov)
v[i]=np.dot(temp, grad[i][1:])
# Prediction band
dy=q*np.sqrt( (1.+v)*rss/(n-deg) )
upb=y+dy # Upper prediction band
lpb=y-dy # Lower prediction band
return lpb,upb,x
def confbandmc(x,par,varcov,n=100000,sigmas=1.):
"""
Calculates the 1sigma confidence band of a linear model without needing
to specify the data points, by doing a Monte Carlo simulation using the
Multivariate normal distribution. Assumes the parameters follow a 2D
normal distribution.
Arguments:
- x: array with x values to calculate the confidence band.
- par : array or list with structure [par0, par1, par2, ...] with the best-fit
parameters
- varcov : variance-covariance matrix of the parameters
- n : number of (a,b) points generated from the multivariate gaussian
- sigmas : number of standard deviations contained within the confidence
band
Output:
- lcb, ucb : lower and upper confidence bands
- y : values of the model tabulated at x
Usage:
>>> lcb,ucb,y=nemmen.confbandmc(x,bfit,bcov)
calculates the confidence bands for the given input arrays
>>> pylab.fill_between(x, lcb, ucb, alpha=0.3, facecolor='gray')
plots a shaded area containing the confidence band
"""
# Generates many realizations of a and b from the multinormal distribution
ar,br = np.random.multivariate_normal(par,varcov,n).T
erry=[] # will contain the std deviation in y
y=[] # values of y for each x
for xi in x:
yr=ar*xi+br
erry.append( yr.std() )
y.append( yr.mean() )
erry=np.array(erry)
y=np.array(y)
ucb=y+sigmas*erry # Upper confidence band
lcb=y-sigmas*erry # Lower confidence band
return lcb,ucb,y
def credbandmc(x,slope,inter,sigmas=1.):
"""
Calculates the confidence (or credibility in case of Bayesian analysis)
band of a linear regression model, given the posterior probability distributions
of the slope and intercept (presumably computed via Bayesian regression,
see bayeslin.pro). These distributions do not need to be normal.
Arguments:
- x: array with x values to calculate the confidence band.
- slope, inter: posterior distributions of slope and intercept for linear
regression
- sigmas : number of standard deviations contained within the confidence
band
Output:
- lcb, ucb : lower and upper confidence bands
- y : best-fit values of the model tabulated at x
Usage:
>>> lcb,ucb,y=nemmen.credbandmc(x,slope,inter)
calculates the credibility bands for the given posterior arrays
>>> pylab.fill_between(x, lcb, ucb, alpha=0.3, facecolor='gray')
plots a shaded area containing the credibility band
v1 Jun. 2012: inspired by private communication with B. Kelly.
"""
a,b = slope,inter
# Two-tailed significance corresponding to the desired number of sigmas
# (e.g. sigmas=1 -> conf=0.317)
conf=1.-scipy.special.erf(sigmas/np.sqrt(2.))
lcb,ucb,ym=np.zeros_like(x),np.zeros_like(x),np.zeros_like(x)
for i, xi in enumerate(x):
# Distribution of y for each x
yp=a*xi+b # 'p' as in posterior
# Lower confidence band
lcb[i]=scipy.stats.scoreatpercentile(yp,conf*100./2.)
# Upper confidence band
ucb[i]=scipy.stats.scoreatpercentile(yp,100.*(1-conf/2.))
ym[i]=np.median(yp)
return lcb,ucb,ym
def predband_linboot(x,a,b,n):
"""
Computes information for plotting the prediction band ("bow tie" plot) around
best-fit. Run this after running `linboot` below.
Usage:
Plots prediction band and best-fit given data x,y
>>> n=1000
>>> a,b=nmmn.stats.linboot(x,y,n)
>>> xMock,yBest,yStd=nmmn.stats.predband_linboot(x,a,b,n)
>>> plt.plot(x,y,'o')
>>> plt.plot(xMock,yBest)
>>> plt.fill_between(xMock,yBest-yStd,yBest+yStd,alpha=0.3, facecolor='gray')
:param x: array of x data values
:param a: array of slopes from bootstrapped fits
:param b: array of intercepts from bootstrapped fits
:param n: number of bootstrapping resamples that will be generated
:returns: arrays x (mock x-values with same interval as data), corresponding best-fit values, yStd (prediction band)
"""
xsim=np.linspace(x.min(),x.max(),50)
for i in range(n):
if i==0:
yarr=a[i]*xsim+b[i]
else:
yarr2=a[i]*xsim+b[i]
yarr=np.vstack((yarr,yarr2))
# best-fit values
amed=np.median(a)
bmed=np.median(b)
yBest=amed*xsim+bmed
# prediction band
yStd=np.std(yarr,axis=0)
return xsim,yBest,yStd
# Monte Carlo simulations, generate random numbers
# ===================================================
def gauss2d(par,varcov,n=10000):
"""
Calculates random numbers drawn from the multinormal distribution in
two dimensions. Also computes the probability associated with each
number, which can be used to compute confidence levels.
Arguments:
- par : array or list with structure [par0, par1, par2, ...] with the best-fit
parameters
- varcov : variance-covariance matrix of the parameters
- n : number of (a,b) points generated from the multivariate gaussian
Output:
- x,y : arrays of random values generated from the multinormal distribution
- prob : probability associated with each value
Usage:
>>> x,y,prob=gauss2d(par,varcov,100000)
v1 Apr. 2012
"""
# Generates many realizations of a and b from the multinormal distribution
x,y = np.random.multivariate_normal(par,varcov,n).T
# Best-fit values
x0,y0 = par[0], par[1]
dx,dy = x-x0,y-y0
sigx,sigy=np.sqrt(varcov[0,0]),np.sqrt(varcov[1,1])
cov=varcov[0,1]
rho=cov/(sigx*sigy) # correlation coefficient
numchisq=(dx/sigx)**2+(dy/sigy)**2-2.*rho*(dx/sigx)*(dy/sigy)
chisq=numchisq/(1.-rho**2)
prob=np.exp(-chisq/2.)
return x,y,prob
def bootcorr(x,y,nboot):
"""
Given (X,Y) data points with intrinsic scatter, computes the
bootstrapped Pearson and Spearman correlation coefficients. This
will give you an idea of how much outliers affect your correlation.
Usage:
>>> r,rho=bootcorr(x,y,100000)
performs 100000 bootstrapping realizations on the arrays x and y.
:returns: *r* - array with bootstrapped Pearson statistics
:returns: *rho* - bootstrapped array with Spearman statistics
"""
from . import lsd
r,rho=[],[]
for i in range(nboot):
[xsim,ysim]=lsd.bootstrap([x,y])
# Pearson r
rsim=scipy.stats.pearsonr(xsim,ysim)
r.append(rsim[0])
# Spearman rho
rhosim=scipy.stats.spearmanr(xsim,ysim)
rho.append(rhosim[0])
r,rho=np.array(r),np.array(rho)
results=np.array([ np.median(r), r.std(), np.median(rho), rho.std() ])
print("<r> err_r <rho> errrho")
print(np.round(results, 2))
results=np.array([ r2p(np.median(r)-np.abs(r.std()),x.size), r2p(np.median(r),x.size), r2p(np.median(r)+np.abs(r.std()),x.size) ])
print("Prob. <- <r>-std, <r>, <r>+std")
print(results)
print("Rejection of H0 respectively at")
for p in results:
print(round(p2sig(p),2) )
return r,rho
def gen_ts(y,erry,n,zeropad=True):
"""
Given a time series (TS) with uncertainties on the signal, this will generate
*n* new TS with y-values distributed according to the error bars.
Output will be a :math:`n \\times {\\rm size}(t)` array. Each row of this
array contains a simulated TS.
:param y: array of y-values for time series (do not need to be in order)
:param erry: array of 1-sigma errors on y-values
:param n: number of Mock TS to generate
:param zeropad: are y-values<0 not allowed? `True` will make any values<0 into 0
:returns: `n x size(t)` array. Each row of this array contains a simulated TS
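Illustrative usage (y, erry are hypothetical arrays of measurements and their 1-sigma errors):
>>> sims=gen_ts(y,erry,1000)
>>> sims.shape   # (1000, y.size)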
"""
ysim=np.empty((n,y.size))
for i in range(y.size):
# generate new points given normal distribution
ysim[:,i]=np.random.normal(y[i],erry[i],n)
# if requested, makes sure no value is smaller than zero
if zeropad: ysim[ysim<0]=0.
return ysim
def random(a,b,n=None):
"""
Generates an array of random uniformly distributed floats in the
interval *[a,b)*.
>>> random(0.3,0.4,1000)
"""
return (b - a) * np.random.random_sample(n) + a
def linboot(x,y,n):
"""
Performs the usual linear regression *with bootstrapping*.
Usage:
>>> a,b=linboot(x,y,10000)
performs 10000 bootstrapped fits on the arrays x and y.
Plot the best-fit with
>>> amed=np.median(a)
>>> bmed=np.median(b)
>>> plt.plot(x,y,'o')
>>> plt.plot(x,amed*x+bmed)
:param x: array of x data values
:param y: array of y data values
:param n: number of bootstrapping resamples that will be generated
:returns: arrays `a` and `b`, each element is a different bootstrapped fit
"""
from . import lsd
# y = A*x + B
a, b=[],[]
for i in range(n):
# This is needed for small datasets. With only a few data points,
# bootstrapping can generate a resample where all points are identical,
# which breaks the linear regression.
allEquals=True
while allEquals:
[xsim,ysim]=lsd.bootstrap([x,y])
allEquals=lsd.allEqual(xsim)
# Linear fit
asim, bsim, rsim, p, err = scipy.stats.linregress(xsim,ysim)
a.append(asim)
b.append(bsim)
a,b=np.array(a),np.array(b)
return a,b
# Comparing goodness-of-fit of different models
# ================================================
# F-test, Akaike information criterion and reduced chi-squared
def ftest(rss1,rss2,n,p1,p2):
"""
Carries out the F-test to decide which model fits the data better.
Computes the F statistic, associated p-value and the significance
in sigmas with which we can reject the null hypothesis. You can also
give the :math:`\chi^2` values instead of RSS if you have y-errors.
Note that p2>p1 must be obeyed, i.e. model 1 (with fewer parameters) is "nested" within model 2.
Usage:
>>> fstat,pvalue,conf = ftest(rss1,rss2,n,p1,p2)
Arguments:
- rss1, rss2 : residual sum of squares for models 1 and 2
- n : sample size, i.e. number of data points
- p1, p2 : number of free parameters in the models
Returns:
- fstat : the F statistic
- pvalue : p-value associated with the F statistic
- conf : significance in sigmas with which we can reject the null hypothesis
References:
1. http://en.wikipedia.org/wiki/F-test
2. http://graphpad.com/curvefit/2_models__1_dataset.htm, Graphpad.com, Comparing the fits of two models (CurveFit.com)
v1 Dec. 2011
v2 Jan 16 2012: added comment regarding y-errors
"""
# F statistic for nested models: F = [(rss1-rss2)/(p2-p1)] / [rss2/(n-p2)]
# (equivalent to the old scipy.stats.f_value(rss1,rss2,n-p1,n-p2), which is not available in recent scipy)
fstat=((rss1-rss2)/(p2-p1))/(rss2/(n-p2))
pvalue=1.-scipy.stats.f.cdf(fstat,p2-p1,n-p2)
conf=np.sqrt(2.)*scipy.special.erfinv(1.-pvalue)
return fstat, pvalue, conf
def aic(k,n,rss,errors=False):
"""
Computes the Akaike information criterion which is "a measure of the relative goodness of
fit of a statistical model". Unlike the F-test, it does not assume that one model is
a particular case of the more complex one. If errors==False then assume the
errors are the same for every data point. Otherwise, assumes you are providing
the chi-squared values instead of RSS.
Usage:
>>> aic = aic(k,n,rss)
:param rss: residual sum of squares for the model in case errors=False, otherwise assumes you are giving the chi-square values
:param n: sample size, i.e. number of data points
:param k: number of free parameters in the model
:returns: AICc statistic (AIC corrected for finite sample size)
References:
1. Documentation for Origin software on fit comparison: http://www.originlab.com/index.aspx?go=Products/Origin/DataAnalysis/CurveFitting/NonlinearFitting&pid=1195 (first heard about this test there)
2. http://en.wikipedia.org/wiki/Akaike_information_criterion
v1 Dec 2011
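Illustrative comparison of two models fitted to the same n data points (rss1, rss2 are hypothetical):
>>> aic1=aic(2,n,rss1)   # model 1, 2 free parameters
>>> aic2=aic(4,n,rss2)   # model 2, 4 free parameters
The model with the smaller AICc is preferred.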
"""
if errors==False:
# AIC assuming the errors are identical and given the residual sum of squares,
# see http://en.wikipedia.org/wiki/Akaike_information_criterion#Relevance_to_chi-squared_fitting
aicstat=n*np.log(rss/n)+2.*k
else:
# If you have different errors for the data points, it will assume rss=chisq
aicstat=rss+2.*k
# AICc = AIC with a correction for finite sample size
aicc=aicstat+2.*k*(k+1.)/(n-k-1.)
return aicc
def bic(k,n,rss,errors=False):
"""
Computes the Bayesian information criterion which is "a criterion for model selection
among a finite set of models". From wikipedia: "The BIC [...] introduces a
penalty term for the number of parameters in the model. The penalty term is larger
in BIC than in AIC."
In order to use BIC to quantify the evidence against a specific model, check out
the section "use of BIC" in the presentation "171:290 Model Selection, Lecture VI:
The Bayesian Information Criterion" by Joseph Cavanaugh
(http://myweb.uiowa.edu/cavaaugh/ms_lec_6_ho.pdf).
If errors==False then assume the errors are the same for every data point.
Otherwise, assumes you are providing the chi-squared values instead of RSS.
Usage:
>>> bic = bic(k,n,rss)
Arguments:
- rss : residual sum of squares for the model in case errors=False, otherwise assumes you are giving the chi-square values
- n : sample size, i.e. number of data points
- k : number of free parameters in the model
Returns: BIC statistic
References:
1. http://en.wikipedia.org/wiki/Bayesian_information_criterion
2. Model Selection, Lecture VI: The Bayesian Information Criterion" by Joseph Cavanaugh (http://myweb.uiowa.edu/cavaaugh/ms_lec_6_ho.pdf)
v1 Apr 2012
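Illustrative usage, analogous to aic above (hypothetical values):
>>> bic1=bic(2,n,rss1)
>>> bic2=bic(4,n,rss2)
The model with the smaller BIC is preferred; a difference larger than about 6 is usually read as strong evidence.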
"""
if errors==False:
# BIC assuming the errors are identical and given the residual sum of squares,
# using the unbiased variance
bicstat=n*np.log(rss/(n-1.))+k*np.log(n)
else:
# If you have different errors for the data points, it will assume rss=chisq
bicstat=rss+k*np.log(n)
return bicstat
def redchisq(x,y,a=None,b=None,sd=None):
"""
Returns the reduced chi-square error statistic for a linear fit:
chisq/nu
where nu is the number of degrees of freedom. If individual standard deviations
(array sd) are supplied, then the chi-square error statistic is computed as the
sum of squared errors divided by the standard deviations.
See http://en.wikipedia.org/wiki/Goodness_of_fit for reference.
If a linear model is not provided via A,B (y=ax+b) then the method computes
the chi-square using the best-fit line.
x,y,sd assumed to be Numpy arrays. a,b scalars.
Returns the float chisq/nu with the reduced chi-square statistic.
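Illustrative usage (hypothetical arrays; not part of the original docs):
>>> rchisq=redchisq(x,y)          # about the best-fit line, unweighted
>>> rchisq=redchisq(x,y,a,b,sd)   # about y=ax+b, weighted by the errors sd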
"""
if a is None:
# Performs linear regression
a, b, r, p, err = scipy.stats.linregress(x,y)
# Chi-square statistic
if sd is None:
chisq=np.sum((y-a*x-b)**2)
else:
chisq=np.sum( ((y-a*x-b)/sd)**2 )
# Number of degrees of freedom (n - 1 - 2 free parameters)
nu=x.size-3
return chisq/nu
def redchisqxy(x,y,errx,erry,a,b):
"""
Returns the reduced chi-square error statistic for a linear fit :math:`\chi^2/\nu`
where nu is the number of degrees of freedom. The chi-square statistic is
computed taking into account the errors in both X and Y (i.e. the
effective variance). See equation 3 in Ascenso et al. 2012, A&A or
Yee & Ellingson 2003 ApJ.
Usage:
>>> chisq=redchisqxy(xdata,ydata,errx,erry,a,b)
where
- xdata,ydata : data
- errx,erry : measurement uncertainties in the data
- a,b : slope and intercept of the best-fit linear regression model
"""
sdsq=erry**2+a**2*errx**2
chisq=np.sum( (y-a*x-b)**2/sdsq )
# Number of degrees of freedom (n - 1 - 2 free parameters)
nu=x.size-3
return chisq/nu
def redchisqg(ydata,ymod,deg=2,sd=None):
"""
Returns the reduced chi-square error statistic for an arbitrary model,
chisq/nu, where nu is the number of degrees of freedom. If individual
standard deviations (array sd) are supplied, then the chi-square error
statistic is computed as the sum of squared errors divided by the standard
deviations. See http://en.wikipedia.org/wiki/Goodness_of_fit for reference.
ydata,ymod,sd assumed to be Numpy arrays. deg integer.
Usage:
>>> chisq=redchisqg(ydata,ymod,n,sd)
where
- ydata : data
- ymod : model evaluated at the same x points as ydata
- n : number of free parameters in the model
- sd : uncertainties in ydata
"""
# Chi-square statistic
if sd is None:
chisq=np.sum((ydata-ymod)**2)
else:
chisq=np.sum( ((ydata-ymod)/sd)**2 )
# Number of degrees of freedom (n - 1 - deg free parameters)
nu=ydata.size-1-deg
return chisq/nu
def r2(ydata,ymod):
"""
Computes the "coefficient of determination" R^2. According to wikipedia,
"It is the proportion of variability in a data set that is accounted for
by the statistical model. It provides a measure of how well future outcomes
are likely to be predicted by the model."
This method was inspired by http://stackoverflow.com/questions/3460357/calculating-the-coefficient-of-determination-in-python
References:
1. http://en.wikipedia.org/wiki/Coefficient_of_determination
2. http://stackoverflow.com/questions/3460357/calculating-the-coefficient-of-determination-in-python
v1 Apr 18th 2012
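Illustrative usage (ydata are the data, ymod the model evaluated at the same points):
>>> R2=r2(ydata,ymod)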
"""
ss_tot = np.sum( (ydata-ydata.mean())**2 )
ss_err = np.sum( (ydata-ymod)**2 )
return 1.-ss_err/ss_tot
def fitstats(xdata,ydata,errx,erry,a,b):
"""
Computes different useful statistics for the given (X +- errX, Y +- errY)
arrays of data. Also quantifies the goodness-of-fit of the provided
linear fit with slope 'a' and intercept 'b'.
Usage:
>>> r,rho,rchisq,scat,iscat,r2=fitstats(xdata,ydata,errx,erry,a,b)
:param xdata,ydata: data
:param errx,erry: measurement uncertainties in the data
:param a,b: slope and intercept of the best-fit linear regression model
Returns:
- Pearson 'r' and its p-value (tuple returned by scipy.stats.pearsonr)
- Spearman 'rho' and its p-value (tuple returned by scipy.stats.spearmanr)
- Reduced chi-squared 'chi^2_nu'
- raw scatter about the best-fit
- intrinsic scatter
- Coefficient of determination 'R^2'
"""
# Pearson r
r=scipy.stats.pearsonr(xdata,ydata)
# Spearman rho
rho=scipy.stats.spearmanr(xdata,ydata)
# Reduced chi-squared
rchisq=redchisqxy(xdata,ydata,errx,erry,a,b)
# raw scatter
scat=scatpratt(xdata,ydata,errx,erry,a,b)
# intrinsic scatter
iscat=intscat(xdata,ydata,errx,erry,a,b)
# Coefficient of determination R^2
rtwo=r2(ydata,a*xdata+b)
return r,rho,rchisq,scat,iscat,rtwo
# Operations on statistical distributions
# =======================================
#
def splitstd(x):
"""
Given the input distribution, this method computes two standard deviations:
the left and right side spreads. This is especially useful if you are dealing
with a split normal distribution from your data and you want to quantify the
lower and upper error bars without having to fit a split normal distribution.
This is a quick nonparametric method, as opposed to the more computationally
intensive minimization involved when fitting a function.
Algorithm:
- divide the original distribution in two distributions at the mode
- create two new distributions, which are the symmetrical versions of the left and right side of the original one
- compute the standard deviations for the new mirror distributions
:param x: array or list containing the distribution
:returns: left.stddev, right.stddev
.. note:: do not forget to inspect *x*.
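Illustrative usage (a hypothetical asymmetric distribution; not from the original docs):
>>> x=np.random.lognormal(0.,0.5,100000)
>>> sigleft,sigright=splitstd(x)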
"""
#med=median(x)
med=mode(x)
x1=x[ np.where(x<med) ] # left side
x1mirror=x1+2.*np.abs(x1-med) # mirror side
x1new=np.concatenate((x1,x1mirror))
x2=x[ np.where(x>=med) ] # right side
x2mirror=x2-2.*np.abs(x2-med) # mirror side
x2new=np.concatenate((x2,x2mirror))
return x1new.std(), x2new.std()
def mode(x,**kwargs):
"""
Finds the mode of a distribution, i.e. the value where the PDF peaks.
:param x: input list/array with the distribution
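Illustrative usage:
>>> m=mode(np.random.normal(5.,1.,100000))   # should be close to 5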
"""
from . import lsd
yh,xh=np.histogram(x,50,density=True,**kwargs)
dxh=(xh[1]-xh[0])/2.
xh=xh+dxh
return xh[ lsd.search(yh.max(),yh) ]
# Custom statistical distributions
# ==================================
#
def randomvariate(pdf,n=1000,xmin=0,xmax=1):
"""
Rejection method for random number generation:
Uses the rejection method for generating random numbers derived from an arbitrary
probability distribution. For reference, see Bevington's book, page 84. Based on
rejection*.py.
Usage:
>>> randomvariate(P,N,xmin,xmax)
where
:param P: probability distribution function from which you want to generate random numbers
:param N: desired number of random values
:param xmin,xmax: range of random numbers desired
:returns: the sequence (ran,ntrials) where
ran : array of shape N with the random variates that follow the input P
ntrials : number of trials the code needed to achieve N
Here is the algorithm:
- generate x' in the desired range
- generate y' between Pmin and Pmax (Pmax is the maximal value of your pdf)
- if y'<P(x') accept x', otherwise reject
- repeat until desired number is achieved
v1 Nov. 2011
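Illustrative example (draws numbers following the splitnorm PDF defined in this module):
>>> ran,ntrials=randomvariate(lambda x: splitnorm(x,1.,3.,0.), 1000, -10, 10)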
"""
# Calculates the minimal and maximum values of the PDF in the desired
# interval. The rejection method needs these values in order to work
# properly.
x=np.linspace(xmin,xmax,1000)
y=pdf(x)
pmin=0.
pmax=y.max()
# Counters
naccept=0
ntrial=0
# Keeps generating numbers until we achieve the desired n
ran=[] # output list of random numbers
while naccept<n:
x=np.random.uniform(xmin,xmax) # x'
y=np.random.uniform(pmin,pmax) # y'
if y<pdf(x):
ran.append(x)
naccept=naccept+1
ntrial=ntrial+1
ran=np.asarray(ran)
return ran,ntrial
def splitnorm(x,sig1=1.,sig2=1.,mu=0.):
"""
Split normal distribution (or asymmetric gaussian) PDF. Useful when dealing
with asymmetric error bars.
See http://en.wikipedia.org/wiki/Split_normal_distribution.
:param x: input array where the PDF will be computed
:param sig1: left standard deviation
:param sig2: right std. dev.
:param mu: mode
:returns: probability distribution function for the array x
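Illustrative usage (assumes matplotlib.pyplot imported as plt; not part of the original docs):
>>> x=np.linspace(-10,10,200)
>>> p=splitnorm(x,sig1=1.,sig2=3.,mu=0.)
>>> plt.plot(x,p)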
"""
const=np.sqrt(2./np.pi)/(sig1 + sig2)
p=np.where(x<mu,
np.exp(-((x-mu)**2/(2.*sig1**2)))*const,
np.exp(-((x-mu)**2/(2.*sig2**2)))*const)
return p
class splitnorm_gen(scipy.stats.rv_continuous):
"""
Split normal distribution defined using scipy.stats. Can be called as
any scipy.stats distribution. I took the effort of defining the extra
CDF and PPF methods below in order to speedup calls to this class
(I have particularly in mind the package *mcerp*).
:param x: input array where the PDF will be computed
:param sig1: left standard deviation
:param sig2: right std. dev.
:param mu: mode
Examples:
Defines distribution with sig1=1, sig2=3, mu=0:
>>> split = splitnorm_gen(name='splitnorm', shapes='sig1, sig2, mu')
>>> s=split(1,3,0.0001)
>>> x=np.linspace(-10,10,100)
Computes PDF:
>>> s.pdf(x)
Computes CDF:
>>> s.cdf(x)
Generates 100 random numbers:
>>> s.rvs(100)
.. warning:: for some reason, this fails if mu=0.
"""
def _pdf(self, x, sig1, sig2, mu):
const=np.sqrt(2./np.pi)/(sig1 + sig2)
p=np.where(x<=mu,
np.exp(-((x-mu)**2/(2.*sig1**2)))*const,
np.exp(-((x-mu)**2/(2.*sig2**2)))*const)
return p
def _cdf(self, x, sig1, sig2, mu):
const=1./(sig1+sig2)
c=np.where(x<=mu,
sig1*const*(1.+scipy.special.erf((x-mu)/(np.sqrt(2.)*sig1))),
const*(sig1+sig2*scipy.special.erf((x-mu)/(np.sqrt(2.)*sig2)))
)
return c
def _ppf(self,q,sig1,sig2,mu):
nu=sig1/(sig1 + sig2)
pf=np.where(q<=nu,
np.sqrt(2.)*sig1*scipy.special.erfinv((sig1 + sig2)/sig1*q - 1.) + mu,
np.sqrt(2)*sig2*scipy.special.erfinv(((sig1 + sig2)*q - sig1)/sig2) + mu
)
return pf
def _stats(self, sig1, sig2, mu):
mean=mu+np.sqrt(2./np.pi)*(sig2 - sig1)
var=(1.-2./np.pi)*(sig2-sig1)**2 + sig1*sig2
skew=np.sqrt(2./np.pi)*(sig2-sig1)*( (4./np.pi-1.)*(sig2-sig1)**2+sig1*sig2 )
kurt=None
return mean, var, skew, kurt
# p-values and significance
# ===========================
#
def r2p(r,n):
"""
Given a value of the Pearson r coefficient, this method gives the
corresponding p-value of the null hypothesis (i.e. no correlation).
Code copied from scipy.stats.pearsonr source.
Usage:
>>> p=r2p(r,n)
where 'r' is the Pearson coefficient and n is the number of data
points.
:returns: p-value of the null hypothesis.
"""
import scipy.special
df = n-2
if abs(r) == 1.0:
prob = 0.0
else:
t_squared = r*r * (df / ((1.0 - r) * (1.0 + r)))
prob = scipy.special.betainc(0.5*df, 0.5, df / (df + t_squared))
return prob
def p2sig(p):
"""
Given a p-value Pnull (i.e. the probability of the null hypothesis),
evaluates the confidence level with which we can reject the hypothesis, in standard
deviations.
Inspired by prob2sig.nb.
Usage:
>>> s=p2sig(0.001)
"""
sig=np.sqrt(2.)*scipy.special.erfinv(1.-p)
return sig
def conf2sig(p):
"""
Given a confidence p-value, translates it into standard deviations.
E.g., p=0.683 -> 1sigma, p=0.954 -> 2sigma, etc.
Inspired by the p2sig method.
Usage:
>>> s=conf2sig(0.683)
"""
sig=np.sqrt(2.)*scipy.special.erfinv(p)
return sig
def sig2conf(sig):
"""
Given a number of std. deviations, translates it into a p-value.
E.g., 1sigma -> p=0.683, 2sigma -> p=0.954, etc.
Inspired by the p2sig method.
Usage:
>>> s=sig2conf(5.)
"""
return scipy.special.erf(sig/np.sqrt(2.))