From 782f52e37a1e4bccc7eaa8cba9a39b8b2cb105ed Mon Sep 17 00:00:00 2001 From: Justin Bois Date: Fri, 4 Nov 2016 00:29:14 -0700 Subject: [PATCH 001/157] Fixed bug in approx_hess3 related to floats and tuples. --- statsmodels/tools/numdiff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/tools/numdiff.py b/statsmodels/tools/numdiff.py index ae78a2accdb..d113c2423d5 100644 --- a/statsmodels/tools/numdiff.py +++ b/statsmodels/tools/numdiff.py @@ -349,7 +349,7 @@ def approx_hess3(x, f, epsilon=None, args=(), kwargs={}): hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) - f(*((x + ee[i, :] - ee[j, :],) + args), **kwargs) - (f(*((x - ee[i, :] + ee[j, :],) + args), **kwargs) - - f(*((x - ee[i, :] - ee[j, :],) + args), **kwargs),) + - f(*((x - ee[i, :] - ee[j, :],) + args), **kwargs)) )/(4.*hess[i, j]) hess[j, i] = hess[i, j] return hess From bd3ab9abbd421f096a651f74146874fb1985ccf3 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 18 Jan 2018 08:46:38 -0500 Subject: [PATCH 002/157] add additional em tests --- .../multivariate/tests/test_ml_factor.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/statsmodels/multivariate/tests/test_ml_factor.py b/statsmodels/multivariate/tests/test_ml_factor.py index 7f3810bb3e5..5a5573f29f7 100644 --- a/statsmodels/multivariate/tests/test_ml_factor.py +++ b/statsmodels/multivariate/tests/test_ml_factor.py @@ -62,6 +62,45 @@ def test_exact(): rslt.summary() # smoke test +def test_exact_em(): + # Test if we can recover exact factor-structured matrices with + # default starting values using the EM algorithm. + + np.random.seed(23324) + + # Works for larger k_var but slow for routine testing. + for k_var in 5, 10, 25: + for n_factor in 1, 2, 3: + load = np.random.normal(size=(k_var, n_factor)) + uniq = np.linspace(1, 2, k_var) + c = np.dot(load, load.T) + c.flat[::c.shape[0]+1] += uniq + s = np.sqrt(np.diag(c)) + c /= np.outer(s, s) + fa = Factor(corr=c, n_factor=n_factor, method='ml') + load_e, uniq_e = fa._fit_ml_em(200) + c_e = np.dot(load_e, load_e.T) + c_e.flat[::c_e.shape[0]+1] += uniq_e + assert_allclose(c_e, c, rtol=1e-4, atol=1e-4) + + +def test_em(): + + n_factor = 1 + cor = np.asarray([[1, 0.5, 0.3], [0.5, 1, 0], [0.3, 0, 1]]) + + fa = Factor(corr=cor, n_factor=n_factor, method='ml') + rslt = fa.fit(opt={'gtol': 1e-4}) + load_opt = rslt.loadings + uniq_opt = rslt.uniqueness + + load_em, uniq_em = fa._fit_ml_em(1000) + cc = np.dot(load_em, load_em.T) + cc.flat[::cc.shape[0]+1] += uniq_em + + assert_allclose(cc, rslt.fitted_cov, rtol=1e-2, atol=1e-2) + + def test_1factor(): """ # R code: From 8031f0c24de96dfd29a9563b60e8b6db40012e4b Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 18 Jan 2018 10:34:54 -0500 Subject: [PATCH 003/157] linear algebra optimization for EM --- statsmodels/multivariate/factor.py | 21 ++++++++++++------- .../multivariate/tests/test_ml_factor.py | 2 +- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/statsmodels/multivariate/factor.py b/statsmodels/multivariate/factor.py index 076eda5b465..7503983c845 100644 --- a/statsmodels/multivariate/factor.py +++ b/statsmodels/multivariate/factor.py @@ -448,26 +448,33 @@ def _fit_ml_em(self, iter): for k in range(iter): loadu = load / uniq[:, None] + f = np.dot(load.T, loadu) f.flat[::f.shape[0]+1] += 1 - flu = np.linalg.solve(f, loadu.T) - lflu = np.dot(load, flu) - e = self.corr - np.dot(lflu, self.corr) - e /= uniq[:, None] + r = np.linalg.solve(f, loadu.T) + + q = np.dot(loadu.T, load) - d = 
np.dot(load.T, e) + h = np.dot(r, load) - c = load - np.dot(lflu, load) + c = load - np.dot(load, h) c /= uniq[:, None] + g = np.dot(q, r) + + e = np.dot(g, self.corr) + + d = np.dot(loadu.T, self.corr) - e + a = np.dot(d, c) a -= np.dot(load.T, c) a.flat[::a.shape[0]+1] += 1 + b = np.dot(self.corr, c) load = np.linalg.solve(a, b.T).T - uniq = np.diag(self.corr - np.dot(load, d)) + uniq = np.diag(self.corr) - (load * d.T).sum(1) return load, uniq diff --git a/statsmodels/multivariate/tests/test_ml_factor.py b/statsmodels/multivariate/tests/test_ml_factor.py index 5a5573f29f7..9f29f8c3810 100644 --- a/statsmodels/multivariate/tests/test_ml_factor.py +++ b/statsmodels/multivariate/tests/test_ml_factor.py @@ -90,7 +90,7 @@ def test_em(): cor = np.asarray([[1, 0.5, 0.3], [0.5, 1, 0], [0.3, 0, 1]]) fa = Factor(corr=cor, n_factor=n_factor, method='ml') - rslt = fa.fit(opt={'gtol': 1e-4}) + rslt = fa.fit(opt={'gtol': 1e-3}) load_opt = rslt.loadings uniq_opt = rslt.uniqueness From df5eefc43168fa5ce7cb068e91f70a956c8820b7 Mon Sep 17 00:00:00 2001 From: Juan Escamilla Date: Fri, 19 Jan 2018 20:12:37 +0000 Subject: [PATCH 004/157] Bug: Bug fixed. Change to 'inverse of sigma (np.linalg.inv)' instead of the 'pseudo-inverse (np.linalg.pinv)'. Pseudo-inverse does not always preserve positive definiteness of sigma. --- statsmodels/regression/linear_model.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/statsmodels/regression/linear_model.py b/statsmodels/regression/linear_model.py index 5aaa58bdd1a..fa3c6935de6 100644 --- a/statsmodels/regression/linear_model.py +++ b/statsmodels/regression/linear_model.py @@ -165,8 +165,12 @@ def _get_sigma(sigma, nobs): if sigma.shape != (nobs, nobs): raise ValueError("Sigma must be a scalar, 1d of length %s or a 2d " "array of shape %s x %s" % (nobs, nobs, nobs)) - cholsigmainv = np.linalg.cholesky(np.linalg.pinv(sigma)).T - + ## Bug fix: np.linalg.pinv does not preserve positive definiteness. + ## Proposal: Use np.linalg.inv(sigma). The current implementation uses a squared matrix. I do not understand why the original author decided to use a pseudo inverse matrix instead of the actual inverse. + ## Original line + #cholsigmainv = np.linalg.cholesky(np.linalg.pinv(sigma)).T + ## Bug fix: + cholsigmainv = np.linalg.cholesky(np.linalg.inv(sigma)).T return sigma, cholsigmainv From 1d568c3ecc36a2c11d1db417ec3251700da49753 Mon Sep 17 00:00:00 2001 From: Juan Escamilla Date: Sat, 20 Jan 2018 20:46:58 +0000 Subject: [PATCH 005/157] BUG: changed np.linalg.pinv to np.linalg.inv in _get_sigma (Removed comments) --- statsmodels/regression/linear_model.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/statsmodels/regression/linear_model.py b/statsmodels/regression/linear_model.py index fa3c6935de6..f57fd0f7d85 100644 --- a/statsmodels/regression/linear_model.py +++ b/statsmodels/regression/linear_model.py @@ -165,11 +165,6 @@ def _get_sigma(sigma, nobs): if sigma.shape != (nobs, nobs): raise ValueError("Sigma must be a scalar, 1d of length %s or a 2d " "array of shape %s x %s" % (nobs, nobs, nobs)) - ## Bug fix: np.linalg.pinv does not preserve positive definiteness. - ## Proposal: Use np.linalg.inv(sigma). The current implementation uses a squared matrix. I do not understand why the original author decided to use a pseudo inverse matrix instead of the actual inverse. 
- ## Original line - #cholsigmainv = np.linalg.cholesky(np.linalg.pinv(sigma)).T - ## Bug fix: cholsigmainv = np.linalg.cholesky(np.linalg.inv(sigma)).T return sigma, cholsigmainv From 4e311c7249cf4fc52040b88cb436bf21d77a2f2e Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 21 Jan 2018 12:03:37 -0500 Subject: [PATCH 006/157] minor code format changes --- statsmodels/multivariate/factor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/statsmodels/multivariate/factor.py b/statsmodels/multivariate/factor.py index 7503983c845..7b038381b62 100644 --- a/statsmodels/multivariate/factor.py +++ b/statsmodels/multivariate/factor.py @@ -453,18 +453,14 @@ def _fit_ml_em(self, iter): f.flat[::f.shape[0]+1] += 1 r = np.linalg.solve(f, loadu.T) - q = np.dot(loadu.T, load) - h = np.dot(r, load) c = load - np.dot(load, h) c /= uniq[:, None] g = np.dot(q, r) - e = np.dot(g, self.corr) - d = np.dot(loadu.T, self.corr) - e a = np.dot(d, c) From 69b05efccf0fbda07103d0c24280fd17c7f5764f Mon Sep 17 00:00:00 2001 From: tommyod Date: Sun, 21 Jan 2018 21:02:22 +0100 Subject: [PATCH 007/157] STY: Fixed some pep8 violations, removed unused imports --- statsmodels/nonparametric/bandwidths.py | 2 - statsmodels/nonparametric/kde.py | 56 +++++++++++-------------- statsmodels/nonparametric/kdetools.py | 24 +++++------ 3 files changed, 37 insertions(+), 45 deletions(-) diff --git a/statsmodels/nonparametric/bandwidths.py b/statsmodels/nonparametric/bandwidths.py index d4830dc323b..48286822812 100644 --- a/statsmodels/nonparametric/bandwidths.py +++ b/statsmodels/nonparametric/bandwidths.py @@ -4,8 +4,6 @@ from scipy.stats import scoreatpercentile as sap from statsmodels.sandbox.nonparametric import kernels -#from scipy.stats import norm - def _select_sigma(X): """ Returns the smaller of std(X, ddof=1) or normalized IQR(X) over axis 0. diff --git a/statsmodels/nonparametric/kde.py b/statsmodels/nonparametric/kde.py index 04d5fb72eaa..95078421772 100644 --- a/statsmodels/nonparametric/kde.py +++ b/statsmodels/nonparametric/kde.py @@ -13,16 +13,12 @@ """ from __future__ import absolute_import, print_function, division from statsmodels.compat.python import range -# for 2to3 with extensions -import warnings - import numpy as np from scipy import integrate, stats from statsmodels.sandbox.nonparametric import kernels -from statsmodels.tools.decorators import (cache_readonly, - resettable_cache) +from statsmodels.tools.decorators import (cache_readonly, resettable_cache) from . import bandwidths -from .kdetools import (forrt, revrt, silverman_transform, counts) +from .kdetools import (forrt, revrt, silverman_transform) from .linbin import fast_linbin #### Kernels Switch for estimators #### @@ -169,7 +165,6 @@ def cdf(self): Will not work if fit has not been called. 
""" _checkisfit(self) - density = self.density kern = self.kernel if kern.domain is None: # TODO: test for grid point at domain bound a,b = -np.inf,np.inf @@ -181,8 +176,8 @@ def cdf(self): support = np.r_[a,support] gridsize = len(support) endog = self.endog - probs = [integrate.quad(func, support[i-1], support[i], - args=endog)[0] for i in range(1,gridsize)] + probs = [integrate.quad(func, support[i - 1], support[i], + args=endog)[0] for i in range(1, gridsize)] return np.cumsum(probs) @cache_readonly @@ -226,16 +221,15 @@ def entr(x,s): pdf = kern.density(s,x) return pdf*np.log(pdf+1e-12) - pdf = self.density kern = self.kernel if kern.domain is not None: - a,b = self.domain + a, b = self.domain else: - a,b = -np.inf,np.inf + a, b = -np.inf, np.inf endog = self.endog #TODO: below could run into integr problems, cf. stats.dist._entropy - return -integrate.quad(entr, a,b, args=(endog,))[0] + return -integrate.quad(entr, a, b, args=(endog,))[0] @cache_readonly def icdf(self): @@ -249,8 +243,7 @@ def icdf(self): """ _checkisfit(self) gridsize = len(self.density) - return stats.mstats.mquantiles(self.endog, np.linspace(0,1, - gridsize)) + return stats.mstats.mquantiles(self.endog, np.linspace(0, 1, gridsize)) def evaluate(self, point): """ @@ -268,7 +261,7 @@ def evaluate(self, point): #### Kernel Density Estimator Functions #### def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None, - adjust=1, clip=(-np.inf,np.inf), cut=3, retgrid=True): + adjust=1, clip=(-np.inf, np.inf), cut=3, retgrid=True): """ Rosenblatt-Parzen univariate kernel density estimator. @@ -320,8 +313,8 @@ def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None """ X = np.asarray(X) if X.ndim == 1: - X = X[:,None] - clip_x = np.logical_and(X>clip[0], X clip[0], X < clip[1]) X = X[clip_x] nobs = len(X) # after trim @@ -336,6 +329,7 @@ def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None else: # ensure weights is a numpy array weights = np.asarray(weights) + if len(weights) != len(clip_x): msg = "The length of the weights must be the same as the given X." raise ValueError(msg) @@ -352,11 +346,11 @@ def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None bw = bandwidths.select_bandwidth(X, bw, kern) bw *= adjust - a = np.min(X,axis=0) - cut*bw - b = np.max(X,axis=0) + cut*bw + a = np.min(X, axis=0) - cut * bw + b = np.max(X, axis=0) + cut * bw grid = np.linspace(a, b, gridsize) - k = (X.T - grid[:,None])/bw # uses broadcasting to make a gridsize x nobs + k = (X.T - grid[:, None])/bw # uses broadcasting to make a gridsize x nobs # set kernel bandwidth kern.seth(bw) @@ -370,9 +364,9 @@ def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None else: k = kern(k) # estimate density - k[k<0] = 0 # get rid of any negative values, do we need this? + k[k < 0] = 0 # get rid of any negative values, do we need this? - dens = np.dot(k,weights)/(q*bw) + dens = np.dot(k, weights)/(q*bw) if retgrid: return dens, grid, bw @@ -380,7 +374,7 @@ def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None return dens, bw def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None, - adjust=1, clip=(-np.inf,np.inf), cut=3, retgrid=True): + adjust=1, clip=(-np.inf, np.inf), cut=3, retgrid=True): """ Rosenblatt-Parzen univariate kernel density estimator @@ -451,7 +445,7 @@ def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=N Series C. 31.2, 93-9. 
""" X = np.asarray(X) - X = X[np.logical_and(X>clip[0], X clip[0], X < clip[1])] # won't work for two columns. # will affect underlying data? # Get kernel object corresponding to selection @@ -467,13 +461,13 @@ def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=N # 1 Make grid and discretize the data if gridsize == None: - gridsize = np.max((nobs,512.)) + gridsize = np.max((nobs, 512.)) gridsize = 2**np.ceil(np.log2(gridsize)) # round to next power of 2 - a = np.min(X)-cut*bw - b = np.max(X)+cut*bw - grid,delta = np.linspace(a,b,gridsize,retstep=True) - RANGE = b-a + a = np.min(X) - cut * bw + b = np.max(X) + cut * bw + grid,delta = np.linspace(a, b, int(gridsize), retstep=True) + RANGE = b - a #TODO: Fix this? # This is the Silverman binning function, but I believe it's buggy (SS) @@ -491,7 +485,7 @@ def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=N # binned /= (nobs)*delta**2 # normalize binned to sum to 1/delta #NOTE: THE ABOVE IS WRONG, JUST TRY WITH LINEAR BINNING - binned = fast_linbin(X,a,b,gridsize)/(delta*nobs) + binned = fast_linbin(X, a, b, gridsize) / (delta * nobs) # step 2 compute FFT of the weights, using Munro (1976) FFT convention y = forrt(binned) diff --git a/statsmodels/nonparametric/kdetools.py b/statsmodels/nonparametric/kdetools.py index 04fb39e7569..99b55796c6b 100644 --- a/statsmodels/nonparametric/kdetools.py +++ b/statsmodels/nonparametric/kdetools.py @@ -3,23 +3,23 @@ from statsmodels.compat.python import range import numpy as np -def forrt(X,m=None): +def forrt(X, m=None): """ RFFT with order like Munro (1976) FORTT routine. """ if m is None: m = len(X) - y = np.fft.rfft(X,m)/m - return np.r_[y.real,y[1:-1].imag] + y = np.fft.rfft(X, m) / m + return np.r_[y.real, y[1:-1].imag] -def revrt(X,m=None): +def revrt(X, m=None): """ Inverse of forrt. Equivalent to Munro (1976) REVRT routine. """ if m is None: m = len(X) - i = int(m // 2+1) - y = X[:i] + np.r_[0,X[i:],0]*1j + i = int(m // 2 + 1) + y = X[:i] + np.r_[0, X[i:], 0] * 1j return np.fft.irfft(y)*m def silverman_transform(bw, M, RANGE): @@ -33,12 +33,12 @@ def silverman_transform(bw, M, RANGE): J = np.arange(M/2+1) FAC1 = 2*(np.pi*bw/RANGE)**2 JFAC = J**2*FAC1 - BC = 1 - 1./3 * (J*1./M*np.pi)**2 + BC = 1 - 1. 
/ 3 * (J * 1./M*np.pi)**2 FAC = np.exp(-JFAC)/BC - kern_est = np.r_[FAC,FAC[1:-1]] + kern_est = np.r_[FAC, FAC[1:-1]] return kern_est -def counts(x,v): +def counts(x, v): """ Counts the number of elements of x that fall within the grid points v @@ -46,12 +46,12 @@ def counts(x,v): ----- Using np.digitize and np.bincount """ - idx = np.digitize(x,v) + idx = np.digitize(x, v) try: # numpy 1.6 return np.bincount(idx, minlength=len(v)) except: bc = np.bincount(idx) - return np.r_[bc,np.zeros(len(v)-len(bc))] + return np.r_[bc, np.zeros(len(v) - len(bc))] -def kdesum(x,axis=0): +def kdesum(x, axis=0): return np.asarray([np.sum(x[i] - x, axis) for i in range(len(x))]) From 252bdfcf163b8d08cd81d20845755185bd0d6f7e Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 25 Jan 2018 14:29:23 -0500 Subject: [PATCH 008/157] initial commit --- statsmodels/regression/mixed_linear_model.py | 21 ++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/statsmodels/regression/mixed_linear_model.py b/statsmodels/regression/mixed_linear_model.py index f97f1dd64a4..804d8ed6b8f 100644 --- a/statsmodels/regression/mixed_linear_model.py +++ b/statsmodels/regression/mixed_linear_model.py @@ -2086,18 +2086,21 @@ class MixedLMResults(base.LikelihoodModelResults, base.ResultMixin): **Attributes** model : class instance - Pointer to PHreg model instance that called fit. + Pointer to MixedLM model instance that called fit. normalized_cov_params : array The sampling covariance matrix of the estimates fe_params : array The fitted fixed-effects coefficients - re_params : array + cov_re : array The fitted random-effects covariance matrix bse_fe : array The standard errors of the fitted fixed effects coefficients bse_re : array The standard errors of the fitted random effects covariance - matrix + matrix and variance components. The first `k_re * (k_re + 1)` + parameters are the standard errors for the lower triangle of + `cov_re`, the remaining elements are the standard errors for + the variance components. See Also -------- @@ -2162,11 +2165,17 @@ def bse_fe(self): @cache_readonly def bse_re(self): """ - Returns the standard errors of the variance parameters. Note - that the sampling distribution of variance parameters is + Returns the standard errors of the variance parameters. + + The first `k_re x (k_re + 1)` elements of the returned array + are the standard errors of the lower tirangle of `cov_re`. + The remaining elements are the standard errors of the variance + components. + + Note that the sampling distribution of variance parameters is strongly skewed unless the sample size is large, so these standard errors may not give meaningful confidence intervals - of p-values if used in the usual way. + or p-values if used in the usual way. 
""" p = self.model.exog.shape[1] return np.sqrt(self.scale * np.diag(self.cov_params())[p:]) From 877c46c5f35272804944d1910550997e077aab83 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 25 Jan 2018 15:26:24 -0500 Subject: [PATCH 009/157] pep8 fixes --- statsmodels/regression/mixed_linear_model.py | 198 +++++++++---------- 1 file changed, 99 insertions(+), 99 deletions(-) diff --git a/statsmodels/regression/mixed_linear_model.py b/statsmodels/regression/mixed_linear_model.py index 804d8ed6b8f..267e7fade5e 100644 --- a/statsmodels/regression/mixed_linear_model.py +++ b/statsmodels/regression/mixed_linear_model.py @@ -187,7 +187,7 @@ def _multi_dot_three(A, B, C): cost1 = (A.shape[0] * A.shape[1] * B.shape[1] + # (AB) A.shape[0] * B.shape[1] * C.shape[1]) # (--)C # cost2 = cost((AB)C) - cost2 = (B.shape[0] * B.shape[1] * C.shape[1] + # (BC) + cost2 = (B.shape[0] * B.shape[1] * C.shape[1] + # (BC) A.shape[0] * A.shape[1] * C.shape[1]) # A(--) if cost1 < cost2: @@ -227,6 +227,7 @@ def _get_exog_re_names(self, exog_re): defnames = ["x_re{0:1d}".format(k + 1) for k in range(exog_re.shape[1])] return defnames + class MixedLMParams(object): """ This class represents a parameter state for a mixed linear model. @@ -256,7 +257,6 @@ def __init__(self, k_fe, k_re, k_vc): self.k_tot = self.k_fe + self.k_re2 + self.k_vc self._ix = np.tril_indices(self.k_re) - def from_packed(params, k_fe, k_re, use_sqrt, has_fe): """ Create a MixedLMParams object from packed parameter vector. @@ -319,7 +319,8 @@ def from_packed(params, k_fe, k_re, use_sqrt, has_fe): from_packed = staticmethod(from_packed) - def from_components(fe_params=None, cov_re=None, cov_re_sqrt=None, vcomp=None): + def from_components(fe_params=None, cov_re=None, cov_re_sqrt=None, + vcomp=None): """ Create a MixedLMParams object from each parameter component. @@ -377,7 +378,6 @@ def copy(self): obj.vcomp = self.vcomp.copy() return obj - def get_packed(self, use_sqrt, has_fe=False): """ Return the model parameters packed into a single vector. @@ -574,7 +574,8 @@ class MixedLM(base.LikelihoodModel): A mixed model with fixed effects for the columns of ``exog`` and independent random coefficients for the columns of ``exog_re``: - >>> free = MixedLMParams.from_components(fe_params=np.ones(exog.shape[1]), \ + >>> free = MixedLMParams.from_components( + fe_params=np.ones(exog.shape[1]), cov_re=np.eye(exog_re.shape[1])) >>> model = sm.MixedLM(endog, exog, groups, exog_re=exog_re) >>> result = model.fit(free=free) @@ -599,7 +600,8 @@ def __init__(self, endog, exog, groups, exog_re=None, _allowed_kwargs = ["missing_idx", "design_info", "formula"] for x in kwargs.keys(): if x not in _allowed_kwargs: - raise ValueError("argument %s not permitted for MixedLM initialization" % x) + raise ValueError( + "argument %s not permitted for MixedLM initialization" % x) self.use_sqrt = use_sqrt @@ -632,7 +634,8 @@ def __init__(self, endog, exog, groups, exog_re=None, self._init_keys.extend(["use_sqrt", "exog_vc"]) - self.k_fe = exog.shape[1] # Number of fixed effects parameters + # Number of fixed effects parameters + self.k_fe = exog.shape[1] if exog_re is None and exog_vc is None: # Default random effects structure (random intercepts). 
@@ -668,7 +671,7 @@ def __init__(self, endog, exog, groups, exog_re=None, # HACK: could've been set in from_formula already # needs refactor (param_names, exog_re_names, - exog_re_names_full) = self._make_param_names(exog_re) + exog_re_names_full) = self._make_param_names(exog_re) self.data.param_names = param_names self.data.exog_re_names = exog_re_names self.data.exog_re_names_full = exog_re_names_full @@ -680,7 +683,7 @@ def __init__(self, endog, exog, groups, exog_re=None, group_labels = list(set(groups)) group_labels.sort() row_indices = dict((s, []) for s in group_labels) - for i,g in enumerate(groups): + for i, g in enumerate(groups): row_indices[g].append(i) self.row_indices = row_indices self.group_labels = group_labels @@ -717,7 +720,6 @@ def __init__(self, endog, exog, groups, exog_re=None, # Precompute this self._lin, self._quad = self._reparam() - def _setup_vcomp(self, exog_vc): if exog_vc is None: exog_vc = {} @@ -727,7 +729,6 @@ def _setup_vcomp(self, exog_vc): vc_names.sort() self._vc_names = vc_names - def _make_param_names(self, exog_re): """ Returns the full parameter names list, just the exogenous random @@ -752,7 +753,6 @@ def _make_param_names(self, exog_re): return exog_names + param_names + vc_names, exog_re_names, param_names - @classmethod def from_formula(cls, formula, data, re_formula=None, vc_formula=None, subset=None, use_sparse=False, missing='none', *args, @@ -853,7 +853,8 @@ def from_formula(cls, formula, data, re_formula=None, vc_formula=None, """ if "groups" not in kwargs.keys(): - raise AttributeError("'groups' is a required keyword argument in MixedLM.from_formula") + raise AttributeError("'groups' is a required keyword argument " + + "in MixedLM.from_formula") groups = kwargs["groups"] # If `groups` is a variable name, retrieve the data for the @@ -864,9 +865,11 @@ def from_formula(cls, formula, data, re_formula=None, vc_formula=None, groups = np.asarray(data[groups]) del kwargs["groups"] - # Bypass all upstream missing data handling to properly handle variance components + # Bypass all upstream missing data handling to properly handle + # variance components if missing == 'drop': - data, groups = _handle_missing(data, groups, formula, re_formula, vc_formula) + data, groups = _handle_missing(data, groups, formula, re_formula, + vc_formula) missing = 'none' if re_formula is not None: @@ -883,7 +886,8 @@ def from_formula(cls, formula, data, re_formula=None, vc_formula=None, eval_env = EvalEnvironment({}) exog_re = patsy.dmatrix(re_formula, data, eval_env=eval_env) exog_re_names = exog_re.design_info.column_names - exog_re_names = [x.replace("Intercept", group_name) for x in exog_re_names] + exog_re_names = [x.replace("Intercept", group_name) + for x in exog_re_names] exog_re = np.asarray(exog_re) if exog_re.ndim == 1: exog_re = exog_re[:, None] @@ -911,8 +915,9 @@ def from_formula(cls, formula, data, re_formula=None, vc_formula=None, for group_ix, group in enumerate(kylist): ii = gb.groups[group] vcg = vc_formula[vc_name] - mat = patsy.dmatrix(vcg, data.loc[ii, :], eval_env=eval_env, - return_type='dataframe') + mat = patsy.dmatrix( + vcg, data.loc[ii, :], eval_env=eval_env, + return_type='dataframe') if use_sparse: exog_vc[vc_name][group] = sparse.csr_matrix(mat) else: @@ -921,12 +926,9 @@ def from_formula(cls, formula, data, re_formula=None, vc_formula=None, else: exog_vc = None - mod = super(MixedLM, cls).from_formula(formula, data, - subset=None, - exog_re=exog_re, - exog_vc=exog_vc, - groups=groups, - *args, **kwargs) + mod = super(MixedLM, 
cls).from_formula( + formula, data, subset=None, exog_re=exog_re, + exog_vc=exog_vc, groups=groups, *args, **kwargs) # expand re names to account for pairs of RE (param_names, @@ -940,7 +942,6 @@ def from_formula(cls, formula, data, re_formula=None, vc_formula=None, return mod - def predict(self, params, exog=None): """ Return predicted values from a design matrix. @@ -972,7 +973,6 @@ def predict(self, params, exog=None): return np.dot(exog, params) - def group_list(self, array): """ Returns `array` split into subarrays corresponding to the @@ -989,7 +989,6 @@ def group_list(self, array): return [np.array(array[self.row_indices[k], :]) for k in self.group_labels] - def fit_regularized(self, start_params=None, method='l1', alpha=0, ceps=1e-4, ptol=1e-6, maxit=200, **fit_kwargs): """ @@ -1093,7 +1092,8 @@ def fit_regularized(self, start_params=None, method='l1', alpha=0, ex_r, ex2_r = self._aex_r[group_ix], self._aex_r2[group_ix] resid = resid_all[self.row_indices[group]] - solver = _smw_solver(scale, ex_r, ex2_r, cov_re_inv, 1 / vc_var) + solver = _smw_solver(scale, ex_r, ex2_r, cov_re_inv, + 1 / vc_var) x = exog[:, j] u = solver(x) @@ -1125,7 +1125,7 @@ def fit_regularized(self, start_params=None, method='l1', alpha=0, ii[self.k_fe:] = True ii = np.flatnonzero(ii) hess1 = hess[ii, :][:, ii] - pcov[np.ix_(ii,ii)] = np.linalg.inv(-hess1) + pcov[np.ix_(ii, ii)] = np.linalg.inv(-hess1) params_object = MixedLMParams.from_components(fe_params, cov_re=cov_re) @@ -1145,7 +1145,6 @@ def fit_regularized(self, start_params=None, method='l1', alpha=0, return MixedLMResultsWrapper(results) - def get_fe_params(self, cov_re, vcomp): """ Use GLS to update the fixed effects parameter estimates. @@ -1164,7 +1163,7 @@ def get_fe_params(self, cov_re, vcomp): return np.array([]) if self.k_re == 0: - cov_re_inv = np.empty((0,0)) + cov_re_inv = np.empty((0, 0)) else: cov_re_inv = np.linalg.inv(cov_re) @@ -1172,7 +1171,9 @@ def get_fe_params(self, cov_re, vcomp): if not hasattr(self, "_endex_li"): self._endex_li = [] for group_ix, _ in enumerate(self.group_labels): - mat = np.concatenate((self.exog_li[group_ix], self.endog_li[group_ix][:, None]), axis=1) + mat = np.concatenate( + (self.exog_li[group_ix], + self.endog_li[group_ix][:, None]), axis=1) self._endex_li.append(mat) xtxy = 0. @@ -1188,7 +1189,6 @@ def get_fe_params(self, cov_re, vcomp): return fe_params - def _reparam(self): """ Returns parameters of the map converting parameters from the @@ -1230,7 +1230,7 @@ def _reparam(self): # Quadratic terms for random effects covariance. ii = np.tril_indices(k_re) - ix = [(a,b) for a,b in zip(ii[0], ii[1])] + ix = [(a, b) for a, b in zip(ii[0], ii[1])] for i1 in range(k_re2): for i2 in range(k_re2): ix1 = ix[i1] @@ -1249,7 +1249,6 @@ def _reparam(self): return lin, quad - def _expand_vcomp(self, vcomp, group): """ Replicate variance parameters to match a group's design. 
@@ -1270,13 +1269,13 @@ def _expand_vcomp(self, vcomp, group): vc_var = [] for j, k in enumerate(self._vc_names): if group in self.exog_vc[k]: - vc_var.append(vcomp[j] * np.ones(self.exog_vc[k][group].shape[1])) + vc_var.append( + vcomp[j] * np.ones(self.exog_vc[k][group].shape[1])) if len(vc_var) > 0: return np.concatenate(vc_var) else: return np.empty(0) - def _augment_exog(self, group_ix): """ Concatenate the columns for variance components to the columns @@ -1290,7 +1289,7 @@ def _augment_exog(self, group_ix): group = self.group_labels[group_ix] ex = [ex_r] if self.k_re > 0 else [] any_sparse = False - for j,k in enumerate(self._vc_names): + for j, k in enumerate(self._vc_names): if group not in self.exog_vc[k]: continue ex.append(self.exog_vc[k][group]) @@ -1306,7 +1305,6 @@ def _augment_exog(self, group_ix): return ex - def loglike(self, params, profile_fe=True): """ Evaluate the (profile) log-likelihood of the linear mixed @@ -1384,7 +1382,8 @@ def loglike(self, params, profile_fe=True): resid = resid_all[self.row_indices[group]] # Part 1 of the log likelihood (for both ML and REML) - ld = _smw_logdet(1., ex_r, ex2_r, cov_re_inv, 1 / vc_var, cov_aug_logdet) + ld = _smw_logdet(1., ex_r, ex2_r, cov_re_inv, 1 / vc_var, + cov_aug_logdet) likeval -= ld / 2. # Part 2 of the log likelihood (for both ML and REML) @@ -1398,7 +1397,7 @@ def loglike(self, params, profile_fe=True): if self.reml: likeval -= (self.n_totobs - self.k_fe) * np.log(qf) / 2. - _,ld = np.linalg.slogdet(xvx) + _, ld = np.linalg.slogdet(xvx) likeval -= ld / 2. likeval -= (self.n_totobs - self.k_fe) * np.log(2 * np.pi) / 2. likeval += ((self.n_totobs - self.k_fe) * @@ -1412,7 +1411,6 @@ def loglike(self, params, profile_fe=True): return likeval - def _gen_dV_dPar(self, ex_r, solver, group, max_ix=None): """ A generator that yields the element-wise derivative of the @@ -1439,8 +1437,9 @@ def _gen_dV_dPar(self, ex_r, solver, group, max_ix=None): for j2 in range(j1 + 1): if max_ix is not None and jj > max_ix: return - mat_l, mat_r = ex_r[:,j1:j1+1], ex_r[:,j2:j2+1] # Need 2d - vsl, vsr = axr[:,j1:j1+1], axr[:,j2:j2+1] + # Need 2d + mat_l, mat_r = ex_r[:, j1:j1+1], ex_r[:, j2:j2+1] + vsl, vsr = axr[:, j1:j1+1], axr[:, j2:j2+1] yield jj, mat_l, mat_r, vsl, vsr, j1 == j2 jj += 1 @@ -1454,7 +1453,6 @@ def _gen_dV_dPar(self, ex_r, solver, group, max_ix=None): yield jj, mat, mat, axmat, axmat, True jj += 1 - def score(self, params, profile_fe=True): """ Returns the score vector of the profile log-likelihood. @@ -1467,17 +1465,19 @@ def score(self, params, profile_fe=True): """ if type(params) is not MixedLMParams: - params = MixedLMParams.from_packed(params, self.k_fe, - self.k_re, self.use_sqrt, - has_fe=False) + params = MixedLMParams.from_packed( + params, self.k_fe, self.k_re, self.use_sqrt, + has_fe=False) if profile_fe: params.fe_params = self.get_fe_params(params.cov_re, params.vcomp) if self.use_sqrt: - score_fe, score_re, score_vc = self.score_sqrt(params, calc_fe=not profile_fe) + score_fe, score_re, score_vc = self.score_sqrt( + params, calc_fe=not profile_fe) else: - score_fe, score_re, score_vc = self.score_full(params, calc_fe=not profile_fe) + score_fe, score_re, score_vc = self.score_full( + params, calc_fe=not profile_fe) if self._freepat is not None: score_fe *= self._freepat.fe_params @@ -1489,7 +1489,6 @@ def score(self, params, profile_fe=True): else: return np.concatenate((score_fe, score_re, score_vc)) - def score_full(self, params, calc_fe): """ Returns the score with respect to untransformed parameters. 
@@ -1563,7 +1562,7 @@ def score_full(self, params, calc_fe): # V^{-1} exog' dV/dQ_jj exog V^{-1}, where Q_jj is the jj^th # covariance parameter. - xtax = [0.,] * (self.k_re2 + self.k_vc) + xtax = [0., ] * (self.k_re2 + self.k_vc) # Temporary related to the gradient of log |V| dlv = np.zeros(self.k_re2 + self.k_vc) @@ -1591,7 +1590,8 @@ def score_full(self, params, calc_fe): # Contributions to the covariance parameter gradient vir = solver(resid) - for jj, matl, matr, vsl, vsr, sym in self._gen_dV_dPar(ex_r, solver, group): + for (jj, matl, matr, vsl, vsr, sym) in\ + self._gen_dV_dPar(ex_r, solver, group): dlv[jj] = _dotsum(matr, vsl) if not sym: dlv[jj] += _dotsum(matl, vsr) @@ -1644,7 +1644,6 @@ def score_full(self, params, calc_fe): return score_fe, score_re, score_vc - def score_sqrt(self, params, calc_fe=True): """ Returns the score with respect to transformed parameters. @@ -1690,7 +1689,6 @@ def score_sqrt(self, params, calc_fe=True): return score_fe, score_re, score_vc - def hessian(self, params): """ Returns the model's Hessian matrix. @@ -1737,7 +1735,7 @@ def hessian(self, params): rvir = 0. xtvix = 0. - xtax = [0.,] * (self.k_re2 + self.k_vc) + xtax = [0., ] * (self.k_re2 + self.k_vc) m = self.k_re2 + self.k_vc B = np.zeros(m) D = np.zeros((m, m)) @@ -1761,7 +1759,8 @@ def hessian(self, params): vir = solver(resid) rvir += np.dot(resid, vir) - for jj1, matl1, matr1, vsl1, vsr1, sym1 in self._gen_dV_dPar(ex_r, solver, group): + for (jj1, matl1, matr1, vsl1, vsr1, sym1) in\ + self._gen_dV_dPar(ex_r, solver, group): ul = _dot(viexog.T, matl1) ur = _dot(matr1.T, vir) @@ -1788,14 +1787,19 @@ def hessian(self, params): if not sym1: E.append((vsr1, matl1)) - for jj2, matl2, matr2, vsl2, vsr2, sym2 in self._gen_dV_dPar(ex_r, solver, group, jj1): + for (jj2, matl2, matr2, vsl2, vsr2, sym2) in\ + self._gen_dV_dPar(ex_r, solver, group, jj1): - re = sum([_multi_dot_three(matr2.T, x[0], x[1].T) for x in E]) - vt = 2 * _dot(_multi_dot_three(vir[None, :], matl2, re), vir[:, None]) + re = sum([_multi_dot_three(matr2.T, x[0], x[1].T) + for x in E]) + vt = 2 * _dot(_multi_dot_three(vir[None, :], matl2, re), + vir[:, None]) if not sym2: - le = sum([_multi_dot_three(matl2.T, x[0], x[1].T) for x in E]) - vt += 2 * _dot(_multi_dot_three(vir[None, :], matr2, le), vir[:, None]) + le = sum([_multi_dot_three(matl2.T, x[0], x[1].T) + for x in E]) + vt += 2 * _dot(_multi_dot_three( + vir[None, :], matr2, le), vir[:, None]) D[jj1, jj2] += vt if jj1 != jj2: @@ -1846,7 +1850,6 @@ def hessian(self, params): return hess - def get_scale(self, fe_params, cov_re, vcomp): """ Returns the estimated error variance based on given estimates @@ -1898,7 +1901,6 @@ def get_scale(self, fe_params, cov_re, vcomp): return qf - def fit(self, start_params=None, reml=True, niter_sa=0, do_cg=True, fe_pen=None, cov_pen=None, free=None, full_output=False, method='bfgs', **kwargs): @@ -1971,17 +1973,16 @@ def fit(self, start_params=None, reml=True, niter_sa=0, else: # It's a packed array if len(start_params) == self.k_fe + self.k_re2 + self.k_vc: - params = MixedLMParams.from_packed(start_params, self.k_fe, - self.k_re, self.use_sqrt, - has_fe=True) + params = MixedLMParams.from_packed( + start_params, self.k_fe, self.k_re, self.use_sqrt, + has_fe=True) elif len(start_params) == self.k_re2 + self.k_vc: - params = MixedLMParams.from_packed(start_params, self.k_fe, - self.k_re, self.use_sqrt, - has_fe=False) + params = MixedLMParams.from_packed( + start_params, self.k_fe, self.k_re, self.use_sqrt, + has_fe=False) else: raise 
ValueError("invalid start_params") - if do_cg: kwargs["retall"] = hist is not None if "disp" not in kwargs: @@ -2012,8 +2013,8 @@ def fit(self, start_params=None, reml=True, niter_sa=0, # Convert to the final parameterization (i.e. undo the square # root transform of the covariance matrix, and the profiling # over the error variance). - params = MixedLMParams.from_packed(params, self.k_fe, self.k_re, - use_sqrt=self.use_sqrt, has_fe=False) + params = MixedLMParams.from_packed( + params, self.k_fe, self.k_re, use_sqrt=self.use_sqrt, has_fe=False) cov_re_unscaled = params.cov_re vcomp_unscaled = params.vcomp fe_params = self.get_fe_params(cov_re_unscaled, vcomp_unscaled) @@ -2022,8 +2023,9 @@ def fit(self, start_params=None, reml=True, niter_sa=0, cov_re = scale * cov_re_unscaled vcomp = scale * vcomp_unscaled - if (((self.k_re > 0) and (np.min(np.abs(np.diag(cov_re))) < 0.01)) or - ((self.k_vc > 0) and (np.min(np.abs(vcomp)) < 0.01))): + f1 = (self.k_re > 0) and (np.min(np.abs(np.diag(cov_re))) < 0.01) + f2 = (self.k_vc > 0) and (np.min(np.abs(vcomp)) < 0.01) + if f1 or f2: msg = "The MLE may be on the boundary of the parameter space." warnings.warn(msg, ConvergenceWarning) @@ -2044,7 +2046,8 @@ def fit(self, start_params=None, reml=True, niter_sa=0, else: pcov = np.linalg.inv(-hess) if np.any(hess_diag >= 0): - msg = "The Hessian matrix at the estimated parameter values is not positive definite." + msg = ("The Hessian matrix at the estimated parameter values " + + "is not positive definite.") warnings.warn(msg, ConvergenceWarning) # Prepare a results class instance @@ -2114,7 +2117,6 @@ def __init__(self, model, params, cov_params): self.nobs = self.model.nobs self.df_resid = self.nobs - np_matrix_rank(self.model.exog) - @cache_readonly def fittedvalues(self): """ @@ -2140,7 +2142,6 @@ def fittedvalues(self): return fit - @cache_readonly def resid(self): """ @@ -2151,7 +2152,6 @@ def resid(self): """ return self.model.endog - self.fittedvalues - @cache_readonly def bse_fe(self): """ @@ -2161,7 +2161,6 @@ def bse_fe(self): p = self.model.exog.shape[1] return np.sqrt(np.diag(self.cov_params())[0:p]) - @cache_readonly def bse_re(self): """ @@ -2180,7 +2179,6 @@ def bse_re(self): p = self.model.exog.shape[1] return np.sqrt(self.scale * np.diag(self.cov_params())[p:]) - def _expand_re_names(self, group): names = list(self.model.data.exog_re_names) @@ -2191,7 +2189,6 @@ def _expand_re_names(self, group): names.extend(na) return names - @cache_readonly def random_effects(self): """ @@ -2206,7 +2203,8 @@ def random_effects(self): try: cov_re_inv = np.linalg.inv(self.cov_re) except np.linalg.LinAlgError: - raise ValueError("Cannot predict random effects from singular covariance structure.") + raise ValueError("Cannot predict random effects from " + + "singular covariance structure.") vcomp = self.vcomp k_re = self.k_re @@ -2216,7 +2214,8 @@ def random_effects(self): endog = self.model.endog_li[group_ix] exog = self.model.exog_li[group_ix] - ex_r, ex2_r = self.model._aex_r[group_ix], self.model._aex_r2[group_ix] + ex_r = self.model._aex_r[group_ix] + ex2_r = self.model._aex_r2[group_ix] vc_var = self.model._expand_vcomp(vcomp, group) # Get the residuals relative to fixed effects @@ -2225,18 +2224,19 @@ def random_effects(self): expval = np.dot(exog, self.fe_params) resid = resid - expval - solver = _smw_solver(self.scale, ex_r, ex2_r, cov_re_inv, 1 / vc_var) + solver = _smw_solver(self.scale, ex_r, ex2_r, cov_re_inv, + 1 / vc_var) vir = solver(resid) xtvir = _dot(ex_r.T, vir) xtvir[0:k_re] = 
np.dot(self.cov_re, xtvir[0:k_re]) xtvir[k_re:] *= vc_var - ranef_dict[group] = pd.Series(xtvir, index=self._expand_re_names(group)) + ranef_dict[group] = pd.Series( + xtvir, index=self._expand_re_names(group)) return ranef_dict - @cache_readonly def random_effects_cov(self): """ @@ -2261,11 +2261,13 @@ def random_effects_cov(self): ranef_dict = {} for group_ix in range(self.model.n_groups): - ex_r, ex2_r = self.model._aex_r[group_ix], self.model._aex_r2[group_ix] + ex_r = self.model._aex_r[group_ix] + ex2_r = self.model._aex_r2[group_ix] label = self.model.group_labels[group_ix] vc_var = self.model._expand_vcomp(vcomp, group_ix) - solver = _smw_solver(self.scale, ex_r, ex2_r, cov_re_inv, 1 / vc_var) + solver = _smw_solver(self.scale, ex_r, ex2_r, cov_re_inv, + 1 / vc_var) n = ex_r.shape[0] m = self.cov_re.shape[0] @@ -2285,8 +2287,8 @@ def random_effects_cov(self): return ranef_dict - - # Need to override since t-tests are only used for fixed effects parameters. + # Need to override since t-tests are only used for fixed effects + # parameters. def t_test(self, r_matrix, scale=None, use_t=None): """ Compute a t-test for a each linear hypothesis of the form Rb = q @@ -2316,15 +2318,16 @@ def t_test(self, r_matrix, scale=None, use_t=None): """ if r_matrix.shape[1] != self.k_fe: - raise ValueError("r_matrix for t-test should have %d columns" % self.k_fe) + raise ValueError("r_matrix for t-test should have %d columns" + % self.k_fe) d = self.k_re2 + self.k_vc z0 = np.zeros((r_matrix.shape[0], d)) r_matrix = np.concatenate((r_matrix, z0), axis=1) - tst_rslt = super(MixedLMResults, self).t_test(r_matrix, scale=scale, use_t=use_t) + tst_rslt = super(MixedLMResults, self).t_test( + r_matrix, scale=scale, use_t=use_t) return tst_rslt - def summary(self, yname=None, xname_fe=None, xname_re=None, title=None, alpha=.05): """ @@ -2416,7 +2419,7 @@ def summary(self, yname=None, xname_fe=None, xname_re=None, sdf = pd.DataFrame(index=self.model.data.param_names, data=sdf) sdf.columns = ['Coef.', 'Std.Err.', 'z', 'P>|z|', - '[' + str(alpha/2), str(1-alpha/2) + ']'] + '[' + str(alpha/2), str(1-alpha/2) + ']'] for col in sdf.columns: sdf[col] = [float_fmt % x if np.isfinite(x) else "" for x in sdf[col]] @@ -2425,12 +2428,10 @@ def summary(self, yname=None, xname_fe=None, xname_re=None, return smry - @cache_readonly def llf(self): return self.model.loglike(self.params_object, profile_fe=False) - @cache_readonly def aic(self): if self.reml: @@ -2441,7 +2442,6 @@ def aic(self): df = self.params.size + 1 return -2 * (self.llf - df) - @cache_readonly def bic(self): if self.reml: @@ -2452,7 +2452,6 @@ def bic(self): df = self.params.size + 1 return -2 * self.llf + np.log(self.nobs) * df - def profile_re(self, re_ix, vtype, num_low=5, dist_low=1., num_high=5, dist_high=1.): """ @@ -2618,6 +2617,7 @@ def _handle_missing(data, groups, formula, re_formula, vc_formula): for fml in forms: # Unicode conversion is for Py2 compatability rl = StringIO(fml) + def rlu(): line = rl.readline() return asunicode(line, 'ascii') From 822af00abb802a56f86983a748601c170dcf4aba Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 28 Jan 2018 23:49:03 -0500 Subject: [PATCH 010/157] minor formatting --- statsmodels/regression/tests/test_lme.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/regression/tests/test_lme.py b/statsmodels/regression/tests/test_lme.py index e58abf4cf22..8649d1381d5 100644 --- a/statsmodels/regression/tests/test_lme.py +++ b/statsmodels/regression/tests/test_lme.py @@ -586,7 
+586,7 @@ def do1(reml, irf, ds_ix): mdf = md.fit(gtol=1e-7, reml=reml) else: mdf = md.fit(gtol=1e-7, reml=reml) - + else: # Independent random effects k_fe = rslt.exog_fe.shape[1] k_re = rslt.exog_re.shape[1] From b49d3a714eb8737deee296cc0ba19e4b94ddcf37 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Wed, 31 Jan 2018 20:46:28 -0500 Subject: [PATCH 011/157] REF: use full dtype in issubdtype call --- statsmodels/tsa/kalmanf/kalmanfilter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/statsmodels/tsa/kalmanf/kalmanfilter.py b/statsmodels/tsa/kalmanf/kalmanfilter.py index 2391d58844c..9389b95bdc9 100644 --- a/statsmodels/tsa/kalmanf/kalmanfilter.py +++ b/statsmodels/tsa/kalmanf/kalmanfilter.py @@ -574,10 +574,10 @@ def geterrors(cls, y, k, k_ar, k_ma, k_lags, nobs, Z_mat, m, R_mat, T_mat, """ Returns just the errors of the Kalman Filter """ - if issubdtype(paramsdtype, float): + if issubdtype(paramsdtype, np.float64): return kalman_loglike.kalman_filter_double(y, k, k_ar, k_ma, k_lags, int(nobs), Z_mat, R_mat, T_mat)[0] - elif issubdtype(paramsdtype, complex): + elif issubdtype(paramsdtype, np.complex128): return kalman_loglike.kalman_filter_complex(y, k, k_ar, k_ma, k_lags, int(nobs), Z_mat, R_mat, T_mat)[0] else: @@ -643,11 +643,11 @@ def loglike(cls, params, arma_model, set_sigma2=True): #TODO: this won't work for time-varying parameters (y, k, nobs, k_ar, k_ma, k_lags, newparams, Z_mat, m, R_mat, T_mat, paramsdtype) = cls._init_kalman_state(params, arma_model) - if issubdtype(paramsdtype, float): + if issubdtype(paramsdtype, np.float64): loglike, sigma2 = kalman_loglike.kalman_loglike_double(y, k, k_ar, k_ma, k_lags, int(nobs), Z_mat, R_mat, T_mat) - elif issubdtype(paramsdtype, complex): + elif issubdtype(paramsdtype, np.complex128): loglike, sigma2 = kalman_loglike.kalman_loglike_complex(y, k, k_ar, k_ma, k_lags, int(nobs), Z_mat.astype(complex), From 3cb9161133ed018512f4b2f97adcb21a6eefc83e Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sat, 3 Feb 2018 08:47:51 +0100 Subject: [PATCH 012/157] Replaced https by http in a statsmodels link. We currently do not support https, and chrome shows a big warning when trying to access via https. --- CONTRIBUTING.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 2cf3d85b321..6afe83c8b60 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -21,7 +21,7 @@ For a pull request to be accepted, you must meet the below requirements. This gr - **One branch. One feature.** Branches are cheap and github makes it easy to merge and delete branches with a few clicks. Avoid the temptation to lump in a bunch of unrelated changes when working on a feature, if possible. This helps us keep track of what has changed when preparing a release. - Commit messages should be clear and concise. This means a subject line of less than 80 characters, and, if necessary, a blank line followed by a commit message body. We have an `informal commit format standard `_ that we try to adhere to. You can see what this looks like in practice by ``git log --oneline -n 10``. If your commit references or closes a specific issue, you can close it by mentioning it in the `commit message `_. (*For maintainers*: These suggestions go for Merge commit comments too. These are partially the record for release notes.) -- Code submissions must always include tests. See our `notes on testing `_. +- Code submissions must always include tests. See our `notes on testing `_. 
- Each function, class, method, and attribute needs to be documented using docstrings. We conform to the `numpy docstring standard `_. - If you are adding new functionality, you need to add it to the documentation by editing (or creating) the appropriate file in ``docs/source``. - Make sure your documentation changes parse correctly. Change into the top-level ``docs/`` directory and type:: From 20f58a7940dc71e50e3c84c63e5c024d724cfb7b Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Sat, 3 Feb 2018 14:07:01 +0000 Subject: [PATCH 013/157] BUG: Check dtype for medcouple Ensure input is double in medcouple closes #4243 --- statsmodels/stats/stattools.py | 1 + statsmodels/stats/tests/test_statstools.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/statsmodels/stats/stattools.py b/statsmodels/stats/stattools.py index c2fa3da62d1..6cd91830a65 100644 --- a/statsmodels/stats/stattools.py +++ b/statsmodels/stats/stattools.py @@ -445,6 +445,7 @@ def medcouple(y, axis=0): distributions" Computational Statistics & Data Analysis, vol. 52, pp. 5186-5201, August 2008. """ + y = np.asarray(y, dtype=np.double) # GH 4243 if axis is None: return _medcouple_1d(y.ravel()) diff --git a/statsmodels/stats/tests/test_statstools.py b/statsmodels/stats/tests/test_statstools.py index 081d9a18767..2cae15466d5 100644 --- a/statsmodels/stats/tests/test_statstools.py +++ b/statsmodels/stats/tests/test_statstools.py @@ -2,7 +2,7 @@ # TODO: Test robust kurtosis import numpy as np import pandas as pd -from numpy.testing import (assert_almost_equal, assert_raises) +from numpy.testing import (assert_almost_equal, assert_raises, assert_equal) from statsmodels.stats.stattools import (omni_normtest, jarque_bera, durbin_watson, _medcouple_1d, medcouple, robust_kurtosis, robust_skewness) @@ -192,11 +192,15 @@ def test_medcouple_symmetric(self): mc = medcouple(np.arange(5.0)) assert_almost_equal(mc, 0) - def test_medcouple_nonzero(self): mc = medcouple(np.array([1, 2, 7, 9, 10.0])) assert_almost_equal(mc, -0.3333333) + def test_medcouple_int(self): + # GH 4243 + mc1 = medcouple(np.array([1, 2, 7, 9, 10])) + mc2 = medcouple(np.array([1, 2, 7, 9, 10.0])) + assert_equal(mc1, mc2) def test_medcouple_symmetry(self): x = np.random.standard_normal(100) @@ -204,7 +208,6 @@ def test_medcouple_symmetry(self): mcn = medcouple(-x) assert_almost_equal(mcp + mcn, 0) - def test_durbin_watson(self): x = np.random.standard_normal(100) dw = sum(np.diff(x)**2.0) / np.dot(x, x) From 2cc80264f3ed9127ca8abb9c29cc15e3875db82c Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sun, 4 Feb 2018 08:05:57 +0100 Subject: [PATCH 014/157] Fixed non-utf8 character in data. 
--- statsmodels/tsa/vector_ar/data/e6.dat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/tsa/vector_ar/data/e6.dat b/statsmodels/tsa/vector_ar/data/e6.dat index 5063953091a..697e4684b38 100644 --- a/statsmodels/tsa/vector_ar/data/e6.dat +++ b/statsmodels/tsa/vector_ar/data/e6.dat @@ -1,7 +1,7 @@ /* sample: 1972Q2 -- 1998Q4 West German data until 1990Q2, all of Germany aferwards -Dp - \Delta log gdp deflator (source: Deutsches Institut für Wirtschaftsforschung, +Dp - \Delta log gdp deflator (source: Deutsches Institut für Wirtschaftsforschung, Volkswirtschaftliche Gesamtrechnung) R - nominal long term interest rate (Umlaufsrendite) (source: Monatsberichte der Deutschen Bundesbank, From a054ca87467eb2bc9e42b8c69885d1a6618ff116 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sat, 10 Feb 2018 14:02:06 -0500 Subject: [PATCH 015/157] fix spelling error in docstring --- statsmodels/regression/mixed_linear_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/regression/mixed_linear_model.py b/statsmodels/regression/mixed_linear_model.py index 267e7fade5e..5209b83cfd6 100644 --- a/statsmodels/regression/mixed_linear_model.py +++ b/statsmodels/regression/mixed_linear_model.py @@ -2167,7 +2167,7 @@ def bse_re(self): Returns the standard errors of the variance parameters. The first `k_re x (k_re + 1)` elements of the returned array - are the standard errors of the lower tirangle of `cov_re`. + are the standard errors of the lower triangle of `cov_re`. The remaining elements are the standard errors of the variance components. From 9542d53fee8c839537f88a094bd9d6d1e23d30dc Mon Sep 17 00:00:00 2001 From: Dror Atariah Date: Tue, 13 Feb 2018 20:21:13 +0100 Subject: [PATCH 016/157] Fixed docs as per GH4253 https://github.com/statsmodels/statsmodels/issues/4253#issuecomment-365321531 --- statsmodels/stats/power.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/statsmodels/stats/power.py b/statsmodels/stats/power.py index c007c397664..8861e7931cf 100644 --- a/statsmodels/stats/power.py +++ b/statsmodels/stats/power.py @@ -628,7 +628,7 @@ def __init__(self, ddof=0, **kwds): def power(self, effect_size, nobs1, alpha, ratio=1, alternative='two-sided'): - '''Calculate the power of a t-test for two independent sample + '''Calculate the power of a z-test for two independent sample Parameters ---------- @@ -647,8 +647,6 @@ def power(self, effect_size, nobs1, alpha, ratio=1, ratio : float ratio of the number of observations in sample 2 relative to sample 1. see description of nobs1 - The default for ratio is 1; to solve for ratio given the other - arguments it has to be explicitly set to None. alternative : string, 'two-sided' (default), 'larger', 'smaller' extra argument to choose whether the power is calculated for a two-sided (default) or one sided test. 
The one-sided test can be From 62cd8086c67504605681514b8777cc17f66fe075 Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Mon, 19 Feb 2018 12:17:29 -0600 Subject: [PATCH 017/157] Install conda3 instead --- .travis.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 30c7b21f646..7659915b948 100644 --- a/.travis.yml +++ b/.travis.yml @@ -69,11 +69,10 @@ notifications: # Setup anaconda before_install: - - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh + - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - chmod +x miniconda.sh - - ./miniconda.sh -b + - ./miniconda.sh -b -p /home/travis/miniconda - export PATH=/home/travis/miniconda/bin:$PATH - - export PATH=/home/travis/miniconda2/bin:$PATH - export MKL_NUM_THREADS=1 - export NUMEXPR_NUM_THREADS=1 - export OMP_NUM_THREADS=1 From 1b386fc4c8a5c075a7e06b7a0a71bac934024da5 Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Mon, 19 Feb 2018 15:34:23 -0600 Subject: [PATCH 018/157] Replace genfromtext with read_csv --- statsmodels/tsa/vector_ar/util.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/statsmodels/tsa/vector_ar/util.py b/statsmodels/tsa/vector_ar/util.py index 806936e4d0a..daf38299de5 100644 --- a/statsmodels/tsa/vector_ar/util.py +++ b/statsmodels/tsa/vector_ar/util.py @@ -9,6 +9,7 @@ import numpy as np import scipy.stats as stats import scipy.linalg.decomp as decomp +import pandas as pd import statsmodels.tsa.tsatools as tsa @@ -108,6 +109,7 @@ def comp_matrix(coefs): #------------------------------------------------------------------------------- # Miscellaneous stuff + def parse_lutkepohl_data(path): # pragma: no cover """ Parse data files from Lütkepohl (2005) book @@ -137,7 +139,8 @@ def parse_lutkepohl_data(path): # pragma: no cover year, freq, start_point = m.groups() break - data = np.genfromtxt(path, names=True, skip_header=to_skip+1) + data = (pd.read_csv(path, delimiter=r"\s+", header=to_skip+1) + .to_records(index=False)) n = len(data) @@ -284,4 +287,4 @@ def seasonal_dummies(n_seasons, len_endog, first_period=0, centered=False): if centered: season_exog -= 1 / n_seasons - return season_exog \ No newline at end of file + return season_exog From e06c881b9c1b5485fecb634fe83a38030e4631ec Mon Sep 17 00:00:00 2001 From: Nick DeRobertis Date: Sun, 25 Feb 2018 13:06:55 -0500 Subject: [PATCH 019/157] TST: Modify test for summary col regressor order to catch issue #3767 --- statsmodels/iolib/tests/test_summary2.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/statsmodels/iolib/tests/test_summary2.py b/statsmodels/iolib/tests/test_summary2.py index 36e248b82a1..cb0654a0537 100644 --- a/statsmodels/iolib/tests/test_summary2.py +++ b/statsmodels/iolib/tests/test_summary2.py @@ -1,6 +1,7 @@ import warnings import numpy as np +import pandas as pd from numpy.testing import assert_equal from statsmodels.iolib.summary2 import summary_col @@ -61,17 +62,19 @@ def test_summary_col_ordering_preserved(self): x = [1, 5, 7, 3, 5] x = add_constant(x) x2 = np.concatenate([x, np.array([[3], [9], [-1], [4], [0]])], 1) + x2 = pd.DataFrame(x2, columns=['const', 'b', 'a']) y1 = [6, 4, 2, 7, 4] y2 = [8, 5, 0, 12, 4] reg1 = OLS(y1, x2).fit() reg2 = OLS(y2, x2).fit() + info_dict = {'R2': lambda x: '{:.3f}'.format(int(x.rsquared)), 'N': lambda x: '{0:d}'.format(int(x.nobs))} original = actual = summary_col([reg1, reg2], float_format='%0.4f') - actual = 
summary_col([reg1, reg2], regressor_order=['x2', 'x1'], + actual = summary_col([reg1, reg2], regressor_order=['a', 'b'], float_format='%0.4f', info_dict=info_dict) - variables = ('const', 'x1', 'x2') + variables = ('const', 'b', 'a') for line in str(original).split('\n'): for variable in variables: if line.startswith(variable): From 8e5b0a330cbd142e1c23315f00822cfbd3899d08 Mon Sep 17 00:00:00 2001 From: Nick DeRobertis Date: Sun, 25 Feb 2018 13:13:52 -0500 Subject: [PATCH 020/157] BUG: Fix summary col reordering regressors without reordering coefficients, fixing #3767 --- statsmodels/iolib/summary2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/iolib/summary2.py b/statsmodels/iolib/summary2.py index 742814fd042..27eabfa8711 100644 --- a/statsmodels/iolib/summary2.py +++ b/statsmodels/iolib/summary2.py @@ -488,7 +488,7 @@ def summary_col(results, float_format='%.4f', model_names=(), stars=False, order = ordered + list(np.unique(unordered)) f = lambda idx: sum([[x + 'coef', x + 'stde'] for x in idx], []) - summ.index = f(np.unique(varnames)) + summ.index = f(pd.unique(varnames)) summ = summ.reindex(f(order)) summ.index = [x[:-4] for x in summ.index] if drop_omitted: From 6e1d1c0ff645e3cde990187d26fc25d9742e17e1 Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Sun, 25 Feb 2018 18:57:43 -0600 Subject: [PATCH 021/157] Notebook shoudl probably use Gaussian distribution for GLM fit --- .../notebooks/distributed_estimation.ipynb | 43 ++++++++----------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/examples/notebooks/distributed_estimation.ipynb b/examples/notebooks/distributed_estimation.ipynb index 0a60f1f011c..89f1dc61ffd 100644 --- a/examples/notebooks/distributed_estimation.ipynb +++ b/examples/notebooks/distributed_estimation.ipynb @@ -9,13 +9,12 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", + "from scipy.stats.distributions import norm\n", "from statsmodels.base.distributed_estimation import DistributedModel\n", "\n", "def _exog_gen(exog, partitions):\n", @@ -52,16 +51,14 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "X = np.random.normal(size=(1000, 25))\n", "beta = np.random.normal(size=25)\n", "beta *= np.random.randint(0, 2, size=25)\n", - "y = X.dot(beta) + np.random.normal(size=1000)\n", + "y = norm.rvs(loc=X.dot(beta))\n", "m = 5" ] }, @@ -74,10 +71,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "debiased_OLS_mod = DistributedModel(m)\n", @@ -94,17 +89,17 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { - "collapsed": false + "scrolled": false }, "outputs": [], "source": [ "from statsmodels.genmod.generalized_linear_model import GLM\n", - "from statsmodels.genmod.families import Binomial\n", + "from statsmodels.genmod.families import Gaussian\n", "\n", "debiased_GLM_mod = DistributedModel(m, model_class=GLM,\n", - " init_kwds={\"family\": Binomial()})\n", + " init_kwds={\"family\": Gaussian()})\n", "debiased_GLM_fit = debiased_GLM_mod.fit(zip(_endog_gen(y, m), _exog_gen(X, m)),\n", " fit_kwds={\"alpha\": 0.2})" ] @@ -118,10 +113,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - 
"collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from statsmodels.base.distributed_estimation import _est_regularized_naive, _join_naive\n", @@ -142,10 +135,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from statsmodels.base.distributed_estimation import _est_unregularized_naive, DistributedResults\n", @@ -175,7 +166,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.10" + "version": "2.7.14" } }, "nbformat": 4, From b0238bbf52c8f961131516eed5e674d12232980d Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Sun, 25 Feb 2018 22:20:30 -0600 Subject: [PATCH 022/157] Add pytest folders to .gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index b84ddc83c8d..5b21d2d270b 100644 --- a/.gitignore +++ b/.gitignore @@ -83,3 +83,7 @@ statsmodels/tsa/statespace/_smoothers/_conventional.pyx statsmodels/tsa/statespace/_smoothers/_univariate.pyx statsmodels/tsa/statespace/_smoothers/_alternative.pyx statsmodels/tsa/statespace/_smoothers/_classical.pyx + +#pytest +.cache +.pytest_cache From 5a9fbe4028bfc7e9feb9a72edb4fde1734e6ebad Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Sun, 25 Feb 2018 23:29:45 -0600 Subject: [PATCH 023/157] All we have to do is update conda --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7659915b948..119dc01c631 100644 --- a/.travis.yml +++ b/.travis.yml @@ -77,8 +77,7 @@ before_install: - export NUMEXPR_NUM_THREADS=1 - export OMP_NUM_THREADS=1 - conda config --set always_yes yes - # Temporarily disabled until conda is fixed - # - conda update --quiet conda + - conda update --quiet conda # Fix for headless TravisCI - "export DISPLAY=:99.0" - "sh -e /etc/init.d/xvfb start" From 28c929fa9177c658632a638826edecaf821fcc93 Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Mon, 26 Feb 2018 00:00:20 -0600 Subject: [PATCH 024/157] maxiter in ARIMA is 500 [skip ci] --- statsmodels/tsa/arima_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/statsmodels/tsa/arima_model.py b/statsmodels/tsa/arima_model.py index 8cadbf6c4a4..dea0e22aa9e 100644 --- a/statsmodels/tsa/arima_model.py +++ b/statsmodels/tsa/arima_model.py @@ -850,7 +850,7 @@ def fit(self, start_params=None, trend='c', method="css-mle", approximate the Hessian, projected gradient tolerance of 1e-8 and factr = 1e2. You can change these by using kwargs. maxiter : int, optional - The maximum number of function evaluations. Default is 50. + The maximum number of function evaluations. Default is 500. tol : float The convergence tolerance. Default is 1e-08. full_output : bool, optional @@ -1104,7 +1104,7 @@ def fit(self, start_params=None, trend='c', method="css-mle", approximate the Hessian, projected gradient tolerance of 1e-8 and factr = 1e2. You can change these by using kwargs. maxiter : int, optional - The maximum number of function evaluations. Default is 50. + The maximum number of function evaluations. Default is 500. tol : float The convergence tolerance. Default is 1e-08. full_output : bool, optional From e67ce58ba1da341dad7e52564aa1a4cbfb4e8098 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Sat, 3 Mar 2018 10:52:40 -0500 Subject: [PATCH 025/157] BUG: Results arparams, etc. 
w/ nonconsecutive lags --- statsmodels/tsa/statespace/sarimax.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/statsmodels/tsa/statespace/sarimax.py b/statsmodels/tsa/statespace/sarimax.py index 7062d2cd0fe..1042341d64a 100644 --- a/statsmodels/tsa/statespace/sarimax.py +++ b/statsmodels/tsa/statespace/sarimax.py @@ -1785,9 +1785,19 @@ def __init__(self, model, params, filter_results, cov_type='opg', self.param_terms = self.model.param_terms start = end = 0 for name in self.param_terms: - end += self.model_orders[name] + if name == 'ar': + k = self.model.k_ar_params + elif name == 'ma': + k = self.model.k_ma_params + elif name == 'seasonal_ar': + k = self.model.k_seasonal_ar_params + elif name == 'seasonal_ma': + k = self.model.k_seasonal_ma_params + else: + k = self.model_orders[name] + end += k setattr(self, '_params_%s' % name, self.params[start:end]) - start += self.model_orders[name] + start += k # Handle removing data self._data_attr_model.extend(['orig_endog', 'orig_exog']) From 70b2b86275b3006cc0fcdf6904dcce6ea1b5a0b3 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Sat, 3 Mar 2018 11:54:32 -0500 Subject: [PATCH 026/157] ENH: Add seasonalarparams, seasonalmaparams. --- statsmodels/tsa/statespace/sarimax.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/statsmodels/tsa/statespace/sarimax.py b/statsmodels/tsa/statespace/sarimax.py index 1042341d64a..7e176bb6911 100644 --- a/statsmodels/tsa/statespace/sarimax.py +++ b/statsmodels/tsa/statespace/sarimax.py @@ -1842,18 +1842,40 @@ def mafreq(self): def arparams(self): """ (array) Autoregressive parameters actually estimated in the model. - Does not include parameters whose values are constrained to be zero. + Does not include seasonal autoregressive parameters (see + `seasonalarparams`) or parameters whose values are constrained to be + zero. """ return self._params_ar + @cache_readonly + def seasonalarparams(self): + """ + (array) Seasonal autoregressive parameters actually estimated in the + model. Does not include nonseasonal autoregressive parameters (see + `arparams`) or parameters whose values are constrained to be zero. + """ + return self._params_seasonal_ar + @cache_readonly def maparams(self): """ (array) Moving average parameters actually estimated in the model. - Does not include parameters whose values are constrained to be zero. + Does not include seasonal moving average parameters (see + `seasonalmaparams`) or parameters whose values are constrained to be + zero. """ return self._params_ma + @cache_readonly + def seasonalmaparams(self): + """ + (array) Seasonal moving average parameters actually estimated in the + model. Does not include nonseasonal moving average parameters (see + `maparams`) or parameters whose values are constrained to be zero. + """ + return self._params_seasonal_ma + def get_prediction(self, start=None, end=None, dynamic=False, index=None, exog=None, **kwargs): """ From b9e6c00ae6e07e3ec3cbb3224e04d4879caaaea6 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Sat, 3 Mar 2018 12:07:25 -0500 Subject: [PATCH 027/157] TST: Add test for arparams, maparams, etc. 
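The behavior pinned down by this test can be reproduced on a small model with
gaps in the lag polynomial. A minimal sketch (arbitrary parameter values and
data, not part of the patch itself):

    import numpy as np
    from statsmodels.tsa.statespace.sarimax import SARIMAX

    np.random.seed(0)
    endog = np.random.normal(size=200)
    # AR lags 1 and 3 only (lag 2 constrained to zero), plus one MA lag.
    mod = SARIMAX(endog, order=([1, 0, 1], 0, 1))
    res = mod.filter(np.r_[0.3, 0.1, 0.2, 1.0])  # ar.L1, ar.L3, ma.L1, sigma2
    print(res.arparams)  # [0.3, 0.1] -- only the two estimated AR coefficients
    print(res.maparams)  # [0.2]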
--- statsmodels/tsa/statespace/tests/test_sarimax.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/statsmodels/tsa/statespace/tests/test_sarimax.py b/statsmodels/tsa/statespace/tests/test_sarimax.py index b551a3d0877..8f26d41e47c 100644 --- a/statsmodels/tsa/statespace/tests/test_sarimax.py +++ b/statsmodels/tsa/statespace/tests/test_sarimax.py @@ -1776,6 +1776,13 @@ def setup_class(cls, *args, **kwargs): kwargs['exog'] = (endog - np.floor(endog))**2 super(Test_sarimax_exogenous, cls).setup_class(50, *args, **kwargs) + def test_results_params(self): + result = self.model.filter(self.true_params) + assert_allclose(self.true_params[1:4], result.arparams) + assert_allclose(self.true_params[4:6], result.maparams) + assert_allclose(self.true_params[6:9], result.seasonalarparams) + assert_allclose(self.true_params[9:11], result.seasonalmaparams) + class Test_sarimax_exogenous_not_hamilton(SARIMAXCoverageTest): # // SARIMAX and exogenous # arima wpi x, arima(3,2,2) sarima(3,2,2,4) noconstant vce(oim) From 20961c6ad0ff502c660c79daf877d1e8d63defb4 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 3 Mar 2018 10:09:52 -0800 Subject: [PATCH 028/157] flake8 fixups in model.py --- statsmodels/base/model.py | 191 +++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 106 deletions(-) diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py index 51a433cb1b7..cb350d7d356 100644 --- a/statsmodels/base/model.py +++ b/statsmodels/base/model.py @@ -1,5 +1,5 @@ from __future__ import print_function -from statsmodels.compat.python import iterkeys, lzip, range, reduce +from statsmodels.compat.python import lzip, range, reduce import numpy as np from scipy import stats from statsmodels.base.data import handle_data @@ -53,8 +53,8 @@ class Model(object): `endog` and `exog` are references to any data provided. So if the data is already stored in numpy arrays and it is changed then `endog` and `exog` will change as well. - """ % {'params_doc' : _model_params_doc, - 'extra_params_doc' : _missing_param_doc + _extra_param_doc} + """ % {'params_doc': _model_params_doc, + 'extra_params_doc': _missing_param_doc + _extra_param_doc} def __init__(self, endog, exog=None, **kwargs): missing = kwargs.pop('missing', 'none') @@ -74,16 +74,14 @@ def __init__(self, endog, exog=None, **kwargs): if hasconst is not None: self._init_keys.append('hasconst') - def _get_init_kwds(self): """return dictionary with extra keys used in model.__init__ """ kwds = dict(((key, getattr(self, key, None)) - for key in self._init_keys)) + for key in self._init_keys)) return kwds - def _handle_data(self, endog, exog, missing, hasconst, **kwargs): data = handle_data(endog, exog, missing, hasconst, **kwargs) # kwargs arrays could have changed, easier to just attach here @@ -98,7 +96,8 @@ def _handle_data(self, endog, exog, missing, hasconst, **kwargs): return data @classmethod - def from_formula(cls, formula, data, subset=None, drop_cols=None, *args, **kwargs): + def from_formula(cls, formula, data, subset=None, drop_cols=None, + *args, **kwargs): """ Create a Model from a formula and dataframe. @@ -135,8 +134,8 @@ def from_formula(cls, formula, data, subset=None, drop_cols=None, *args, **kwarg args and kwargs are passed on to the model instantiation. E.g., a numpy structured or rec array, a dictionary, or a pandas DataFrame. """ - #TODO: provide a docs template for args/kwargs from child models - #TODO: subset could use syntax. issue #469. 
+ # TODO: provide a docs template for args/kwargs from child models + # TODO: subset could use syntax. issue #469. if subset is not None: data = data.loc[subset] eval_env = kwargs.pop('eval_env', None) @@ -164,7 +163,7 @@ def from_formula(cls, formula, data, subset=None, drop_cols=None, *args, **kwarg try: cols.remove(col) except ValueError: - pass # OK if not present + pass # OK if not present design_info = design_info.builder.subset(cols).design_info kwargs.update({'missing_idx': missing_idx, @@ -465,7 +464,7 @@ def hess(params, *args): retall=retall, full_output=full_output) - #NOTE: this is for fit_regularized and should be generalized + # NOTE: this is for fit_regularized and should be generalized cov_params_func = kwargs.setdefault('cov_params_func', None) if cov_params_func: Hinv = cov_params_func(self, xopt, retvals) @@ -490,18 +489,15 @@ def hess(params, *args): if 'cov_type' in kwargs: cov_kwds = kwargs.get('cov_kwds', {}) - kwds = {'cov_type':kwargs['cov_type'], 'cov_kwds':cov_kwds} + kwds = {'cov_type': kwargs['cov_type'], 'cov_kwds': cov_kwds} else: kwds = {} if 'use_t' in kwargs: kwds['use_t'] = kwargs['use_t'] - #prints for debugging - #print('kwargs inLikelihoodModel.fit', kwargs) - #print('kwds inLikelihoodModel.fit', kwds) - #TODO: add Hessian approximation and change the above if needed + # TODO: add Hessian approximation and change the above if needed mlefit = LikelihoodModelResults(self, xopt, Hinv, scale=1., **kwds) - #TODO: hardcode scale? + # TODO: hardcode scale? if isinstance(retvals, dict): mlefit.mle_retvals = retvals if warn_convergence and not retvals['converged']: @@ -514,7 +510,7 @@ def hess(params, *args): return mlefit -#TODO: the below is unfinished +# TODO: the below is unfinished class GenericLikelihoodModel(LikelihoodModel): """ Allows the fitting of any likelihood function via maximum likelihood. @@ -565,27 +561,26 @@ class GenericLikelihoodModel(LikelihoodModel): def __init__(self, endog, exog=None, loglike=None, score=None, hessian=None, missing='none', extra_params_names=None, **kwds): - # let them be none in case user wants to use inheritance - if not loglike is None: + # let them be none in case user wants to use inheritance + if loglike is not None: self.loglike = loglike - if not score is None: + if score is not None: self.score = score - if not hessian is None: + if hessian is not None: self.hessian = hessian self.__dict__.update(kwds) # TODO: data structures? - #TODO temporary solution, force approx normal - #self.df_model = 9999 - #somewhere: CacheWriteWarning: 'df_model' cannot be overwritten + # TODO temporary solution, force approx normal + # self.df_model = 9999 + # somewhere: CacheWriteWarning: 'df_model' cannot be overwritten super(GenericLikelihoodModel, self).__init__(endog, exog, missing=missing) # this won't work for ru2nmnl, maybe np.ndim of a dict? 
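        # Note: the parameter count is inferred from the width of `exog` when
        # it is available; `extra_params_names` (e.g. the name of a dispersion
        # parameter) extends both `exog_names` and `nparams`, which is what
        # lets fit() warn later when the two get out of sync.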
if exog is not None: - #try: self.nparams = (exog.shape[1] if np.ndim(exog) == 2 else 1) if extra_params_names is not None: @@ -601,7 +596,7 @@ def _set_extra_params_names(self, extra_params_names): self.nparams = len(self.exog_names) - #this is redundant and not used when subclassing + # this is redundant and not used when subclassing def initialize(self): if not self.score: # right now score is not optional self.score = approx_fprime @@ -610,9 +605,9 @@ def initialize(self): else: # can use approx_hess_p if we have a gradient if not self.hessian: pass - #Initialize is called by - #statsmodels.model.LikelihoodModel.__init__ - #and should contain any preprocessing that needs to be done for a model + # Initialize is called by + # statsmodels.model.LikelihoodModel.__init__ + # and should contain any preprocessing that needs to be done for a model if self.exog is not None: # assume constant er = np_matrix_rank(self.exog) @@ -624,7 +619,7 @@ def initialize(self): super(GenericLikelihoodModel, self).initialize() def expandparams(self, params): - ''' + """ expand to full parameter array when some parameters are fixed Parameters @@ -649,7 +644,7 @@ def expandparams(self, params): this could also be replaced by a more general parameter transformation. - ''' + """ paramsfull = self.fixed_params.copy() paramsfull[self.fixed_paramsmask] = params return paramsfull @@ -667,26 +662,26 @@ def loglikeobs(self, params): return -self.nloglikeobs(params) def score(self, params): - ''' + """ Gradient of log-likelihood evaluated at params - ''' + """ kwds = {} kwds.setdefault('centered', True) return approx_fprime(params, self.loglike, **kwds).ravel() def score_obs(self, params, **kwds): - ''' + """ Jacobian/Gradient of log-likelihood evaluated at params for each observation. - ''' - #kwds.setdefault('epsilon', 1e-4) + """ + # kwds.setdefault('epsilon', 1e-4) kwds.setdefault('centered', True) return approx_fprime(params, self.loglikeobs, **kwds) def hessian(self, params): - ''' + """ Hessian of log-likelihood evaluated at params - ''' + """ from statsmodels.tools.numdiff import approx_hess # need options for hess (epsilon) return approx_hess(params, self.loglike) @@ -736,20 +731,20 @@ def fit(self, start_params=None, method='nm', maxiter=500, full_output=1, disp=disp, callback=callback, **kwargs) genericmlefit = GenericLikelihoodModelResults(self, mlefit) - #amend param names + # amend param names exog_names = [] if (self.exog_names is None) else self.exog_names k_miss = len(exog_names) - len(mlefit.params) if not k_miss == 0: if k_miss < 0: - self._set_extra_params_names( - ['par%d' % i for i in range(-k_miss)]) + self._set_extra_params_names(['par%d' % i + for i in range(-k_miss)]) else: # I don't want to raise after we have already fit() import warnings warnings.warn('more exog_names than parameters', ValueWarning) return genericmlefit - #fit.__doc__ += LikelihoodModel.fit.__doc__ + # fit.__doc__ += LikelihoodModel.fit.__doc__ class Results(object): @@ -838,12 +833,11 @@ def predict(self, exog=None, transform=True, *args, **kwargs): return predict_results - def summary(self): pass -#TODO: public method? +# TODO: public method? 
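As a usage sketch of the prediction path above (hypothetical data frame, not
part of this patch): when a model is built from a formula, `predict`
re-applies the formula's transforms to the new exog before calling the
model-level predict:

    import numpy as np
    import pandas as pd
    import statsmodels.formula.api as smf

    df = pd.DataFrame({'y': [1., 2., 4., 8.], 'x': [1., 2., 3., 4.]})
    res = smf.ols('y ~ np.log(x)', data=df).fit()
    print(res.predict(pd.DataFrame({'x': [5., 6.]})))  # np.log(x) built internally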
class LikelihoodModelResults(Results): """ Class to contain results from likelihood models @@ -1023,7 +1017,7 @@ def __init__(self, model, params, normalized_cov_params=None, scale=1., if cov_type == 'nonrobust': self.cov_type = 'nonrobust' - self.cov_kwds = {'description' : 'Standard Errors assume that the ' + + self.cov_kwds = {'description': 'Standard Errors assume that the ' + 'covariance matrix of the errors is correctly ' + 'specified.'} else: @@ -1033,28 +1027,26 @@ def __init__(self, model, params, normalized_cov_params=None, scale=1., use_t = self.use_t # TODO: we shouldn't need use_t in get_robustcov_results get_robustcov_results(self, cov_type=cov_type, use_self=True, - use_t=use_t, **cov_kwds) - + use_t=use_t, **cov_kwds) def normalized_cov_params(self): raise NotImplementedError - def _get_robustcov_results(self, cov_type='nonrobust', use_self=True, - use_t=None, **cov_kwds): + use_t=None, **cov_kwds): from statsmodels.base.covtype import get_robustcov_results if cov_kwds is None: cov_kwds = {} if cov_type == 'nonrobust': self.cov_type = 'nonrobust' - self.cov_kwds = {'description' : 'Standard Errors assume that the ' + + self.cov_kwds = {'description': 'Standard Errors assume that the ' + 'covariance matrix of the errors is correctly ' + 'specified.'} else: # TODO: we shouldn't need use_t in get_robustcov_results get_robustcov_results(self, cov_type=cov_type, use_self=True, - use_t=use_t, **cov_kwds) + use_t=use_t, **cov_kwds) @cache_readonly def llf(self): @@ -1082,13 +1074,12 @@ def tvalues(self): def pvalues(self): if self.use_t: df_resid = getattr(self, 'df_resid_inference', self.df_resid) - return stats.t.sf(np.abs(self.tvalues), df_resid)*2 + return stats.t.sf(np.abs(self.tvalues), df_resid) * 2 else: - return stats.norm.sf(np.abs(self.tvalues))*2 - + return stats.norm.sf(np.abs(self.tvalues)) * 2 def cov_params(self, r_matrix=None, column=None, scale=None, cov_p=None, - other=None): + other=None): """ Returns the variance/covariance matrix. 
@@ -1143,7 +1134,7 @@ def cov_params(self, r_matrix=None, column=None, scale=None, cov_p=None, dot_fun = np.dot if (cov_p is None and self.normalized_cov_params is None and - not hasattr(self, 'cov_params_default')): + not hasattr(self, 'cov_params_default')): raise ValueError('need covariance of parameters for computing ' '(unnormalized) covariances') if column is not None and (r_matrix is not None or other is not None): @@ -1165,7 +1156,6 @@ def cov_params(self, r_matrix=None, column=None, scale=None, cov_p=None, if column.shape == (): return cov_p[column, column] else: - #return cov_p[column][:, column] return cov_p[column[:, None], column] elif r_matrix is not None: r_matrix = np.asarray(r_matrix) @@ -1180,9 +1170,8 @@ def cov_params(self, r_matrix=None, column=None, scale=None, cov_p=None, else: # if r_matrix is None and column is None: return cov_p - #TODO: make sure this works as needed for GLMs - def t_test(self, r_matrix, cov_p=None, scale=None, - use_t=None): + # TODO: make sure this works as needed for GLMs + def t_test(self, r_matrix, cov_p=None, scale=None, use_t=None): """ Compute a t-test for a each linear hypothesis of the form Rb = q @@ -1280,7 +1269,7 @@ def t_test(self, r_matrix, cov_p=None, scale=None, num_params = r_matrix.shape[1] if (cov_p is None and self.normalized_cov_params is None and - not hasattr(self, 'cov_params_default')): + not hasattr(self, 'cov_params_default')): raise ValueError('Need covariance of parameters for computing ' 'T statistics') if num_params != self.params.shape[0]: @@ -1296,7 +1285,7 @@ def t_test(self, r_matrix, cov_p=None, scale=None, "number of rows") if use_t is None: - #switch to use_t false if undefined + # switch to use_t false if undefined use_t = (hasattr(self, 'use_t') and self.use_t) _t = _sd = None @@ -1418,7 +1407,7 @@ def f_test(self, r_matrix, cov_p=None, scale=1.0, invcov=None): invcov=invcov, use_f=True) return res - #TODO: untested for GLMs? + # TODO: untested for GLMs? def wald_test(self, r_matrix, cov_p=None, scale=1.0, invcov=None, use_f=None): """ @@ -1472,7 +1461,7 @@ def wald_test(self, r_matrix, cov_p=None, scale=1.0, invcov=None, where the rank of the covariance of the noise is not full. """ if use_f is None: - #switch to use_t false if undefined + # switch to use_t false if undefined use_f = (hasattr(self, 'use_t') and self.use_t) from patsy import DesignInfo @@ -1520,9 +1509,8 @@ def wald_test(self, r_matrix, cov_p=None, scale=1.0, invcov=None, return ContrastResults(chi2=F, df_denom=J, statistic=F, distribution='chi2', distargs=(J,)) - def wald_test_terms(self, skip_single=False, extra_constraints=None, - combine_terms=None): + combine_terms=None): """ Compute a sequence of Wald tests for terms over multiple columns @@ -1589,7 +1577,6 @@ def wald_test_terms(self, skip_single=False, extra_constraints=None, if design_info is None and extra_constraints is None: raise ValueError('no constraints, nothing to do') - identity = np.eye(len(result.params)) constraints = [] combined = defaultdict(list) @@ -1658,7 +1645,6 @@ def wald_test_terms(self, skip_single=False, extra_constraints=None, res.temp = constraints + combined_constraints + extra_constraints return res - def conf_int(self, alpha=.05, cols=None, method='default'): """ Returns the confidence interval of the fitted parameters. 
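        # Usage sketch for the single-hypothesis machinery this method builds
        # on (string constraints are parsed by patsy, as in t_test above), for
        # a hypothetical results instance with regressors x2 and x3:
        #
        #     res.t_test('x2 = 0')             # one linear restriction
        #     res.wald_test('x2 = 0, x3 = 0')  # joint restriction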
@@ -1734,7 +1720,7 @@ def conf_int(self, alpha=.05, cols=None, method='default'): return np.asarray(lzip(lower, upper)) def save(self, fname, remove_data=False): - ''' + """ save a pickle of this instance Parameters @@ -1751,8 +1737,7 @@ def save(self, fname, remove_data=False): ----- If remove_data is true and the model result does not implement a remove_data method then this will raise an exception. - - ''' + """ from statsmodels.iolib.smpickle import save_pickle @@ -1763,7 +1748,7 @@ def save(self, fname, remove_data=False): @classmethod def load(cls, fname): - ''' + """ load a pickle, (class method) Parameters @@ -1774,14 +1759,13 @@ def load(cls, fname): Returns ------- unpickled instance - - ''' + """ from statsmodels.iolib.smpickle import load_pickle return load_pickle(fname) def remove_data(self): - '''remove data arrays, all nobs arrays from result and model + """remove data arrays, all nobs arrays from result and model This reduces the size of the instance, so it can be pickled with less memory. Currently tested for use with predict from an unpickled @@ -1809,18 +1793,14 @@ def remove_data(self): result._data_attr_model : arrays attached to the model instance but not to the results instance - ''' + """ def wipe(obj, att): - #get to last element in attribute path + # get to last element in attribute path p = att.split('.') att_ = p.pop(-1) try: obj_ = reduce(getattr, [obj] + p) - - #print(repr(obj), repr(att)) - #print(hasattr(obj_, att_)) if hasattr(obj_, att_): - #print('removing3', att_) setattr(obj_, att_, None) except AttributeError: pass @@ -1828,7 +1808,6 @@ def wipe(obj, att): model_only = ['model.' + i for i in getattr(self, "_data_attr_model", [])] model_attr = ['model.' + i for i in self.model._data_attr] for att in self._data_attr + model_attr + model_only: - #print('removing', att) wipe(self, att) data_in_cache = getattr(self, 'data_in_cache', []) @@ -1857,7 +1836,7 @@ class LikelihoodResultsWrapper(wrap.ResultsWrapper): 'conf_int': 'columns' } -wrap.populate_wrapper(LikelihoodResultsWrapper, +wrap.populate_wrapper(LikelihoodResultsWrapper, # noqa:E305 LikelihoodModelResults) @@ -1887,54 +1866,54 @@ def bic(self): @cache_readonly def score_obsv(self): - '''cached Jacobian of log-likelihood - ''' + """cached Jacobian of log-likelihood + """ return self.model.score_obs(self.params) @cache_readonly def hessv(self): - '''cached Hessian of log-likelihood - ''' + """cached Hessian of log-likelihood + """ return self.model.hessian(self.params) @cache_readonly def covjac(self): - ''' + """ covariance of parameters based on outer product of jacobian of log-likelihood - ''' - ## if not hasattr(self, '_results'): - ## raise ValueError('need to call fit first') - ## #self.fit() - ## self.jacv = jacv = self.jac(self._results.params) + """ + # if not hasattr(self, '_results'): + # raise ValueError('need to call fit first') + # #self.fit() + # self.jacv = jacv = self.jac(self._results.params) jacv = self.score_obsv return np.linalg.inv(np.dot(jacv.T, jacv)) @cache_readonly def covjhj(self): - '''covariance of parameters based on HJJH + """covariance of parameters based on HJJH dot product of Hessian, Jacobian, Jacobian, Hessian of likelihood name should be covhjh - ''' + """ jacv = self.score_obsv hessv = self.hessv hessinv = np.linalg.inv(hessv) - ## self.hessinv = hessin = self.cov_params() + # self.hessinv = hessin = self.cov_params() return np.dot(hessinv, np.dot(np.dot(jacv.T, jacv), hessinv)) @cache_readonly def bsejhj(self): - '''standard deviation of parameter estimates 
based on covHJH - ''' + """standard deviation of parameter estimates based on covHJH + """ return np.sqrt(np.diag(self.covjhj)) @cache_readonly def bsejac(self): - '''standard deviation of parameter estimates based on covjac - ''' + """standard deviation of parameter estimates based on covjac + """ return np.sqrt(np.diag(self.covjac)) def bootstrap(self, nrep=100, method='nm', disp=0, store=1): @@ -1977,8 +1956,8 @@ def bootstrap(self, nrep=100, method='nm', disp=0, store=1): hascloneattr = True if hasattr(self, 'cloneattr') else False for i in range(nrep): rvsind = np.random.randint(self.nobs, size=self.nobs) - #this needs to set startparam and get other defining attributes - #need a clone method on model + # this needs to set startparam and get other defining attributes + # need a clone method on model fitmod = self.model.__class__(self.endog[rvsind], self.exog[rvsind, :]) if hascloneattr: @@ -1993,8 +1972,8 @@ def bootstrap(self, nrep=100, method='nm', disp=0, store=1): return results.mean(0), results.std(0), results def get_nlfun(self, fun): - #I think this is supposed to get the delta method that is currently - #in miscmodels count (as part of Poisson example) + # I think this is supposed to get the delta method that is currently + # in miscmodels count (as part of Poisson example) pass @@ -2125,7 +2104,7 @@ def summary(self, yname=None, xname=None, title=None, alpha=.05): if title is None: title = self.model.__class__.__name__ + ' ' + "Results" - #create summary table instance + # create summary table instance from statsmodels.iolib.summary import Summary smry = Summary() smry.add_table_2cols(self, gleft=top_left, gright=top_right, From 0e4e305b5a9e6bcec22d04942440dfd7c214ec51 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 3 Mar 2018 15:48:04 -0800 Subject: [PATCH 029/157] get rid of redundant import --- statsmodels/discrete/tests/test_sandwich_cov.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/statsmodels/discrete/tests/test_sandwich_cov.py b/statsmodels/discrete/tests/test_sandwich_cov.py index c1e9745782a..b470f49cb95 100644 --- a/statsmodels/discrete/tests/test_sandwich_cov.py +++ b/statsmodels/discrete/tests/test_sandwich_cov.py @@ -14,7 +14,6 @@ from statsmodels.genmod import families from statsmodels.genmod.families import links from statsmodels.regression.linear_model import OLS -import statsmodels.stats.sandwich_covariance as sc from statsmodels.base.covtype import get_robustcov_results import statsmodels.stats.sandwich_covariance as sw from statsmodels.tools.tools import add_constant @@ -67,8 +66,8 @@ def test_basic(self): @classmethod def get_robust_clu(cls): res1 = cls.res1 - cov_clu = sc.cov_cluster(res1, group) - cls.bse_rob = sc.se_cov(cov_clu) + cov_clu = sw.cov_cluster(res1, group) + cls.bse_rob = sw.se_cov(cov_clu) nobs, k_vars = res1.model.exog.shape k_params = len(res1.params) @@ -262,7 +261,7 @@ def setup_class(cls): df_correction=True, #TODO has no effect use_t=False, #True, use_self=True) - cls.bse_rob = cls.res1.bse #sc.se_cov(cov_clu) + cls.bse_rob = cls.res1.bse #sw.se_cov(cov_clu) nobs, k_vars = res1.model.exog.shape k_params = len(res1.params) @@ -393,9 +392,9 @@ def setup_class(cls): # mod_nb = smd.NegativeBinomial(endog, exog) # res_nb = mod_nb.fit() # -# cov_clu_nb = sc.cov_cluster(res_nb, group) +# cov_clu_nb = sw.cov_cluster(res_nb, group) # k_params = k_vars + 1 -# print sc.se_cov(cov_clu_nb / ((nobs-1.) / float(nobs - k_params))) +# print sw.se_cov(cov_clu_nb / ((nobs-1.) 
/ float(nobs - k_params))) # # wt = res_nb.wald_test(np.eye(len(res_nb.params))[1:3], cov_p=cov_clu_nb/((nobs-1.) / float(nobs - k_params))) # print wt From d0d1348453b86b9a868a68bdfebd99cdf40df217 Mon Sep 17 00:00:00 2001 From: Josef Date: Sat, 3 Mar 2018 21:10:08 -0500 Subject: [PATCH 030/157] TST: test_arima111_predict_exog_2127 skip for scipy < 0.16 --- statsmodels/tsa/tests/test_arima.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/statsmodels/tsa/tests/test_arima.py b/statsmodels/tsa/tests/test_arima.py index 9400adf392a..cdb890c4733 100644 --- a/statsmodels/tsa/tests/test_arima.py +++ b/statsmodels/tsa/tests/test_arima.py @@ -3,6 +3,7 @@ import os import warnings +from distutils.version import LooseVersion import numpy as np from numpy.testing import (assert_almost_equal, assert_, assert_allclose, @@ -21,6 +22,9 @@ from statsmodels.tsa.tests.results import results_arma, results_arima from statsmodels.tsa.arima_process import arma_generate_sample +import scipy # only needed for version check +scipy_old = LooseVersion(scipy.__version__) < '0.16' + try: import matplotlib.pyplot as plt have_matplotlib = True @@ -2054,7 +2058,7 @@ def test_arima_diff2(): 229.457] assert_almost_equal(predicted, predicted_res, 3) - +@skipif(scipy_old, reason='scipy is old, test might fail') def test_arima111_predict_exog_2127(): # regression test for issue #2127 ef = [ 0.03005, 0.03917, 0.02828, 0.03644, 0.03379, 0.02744, From 2486c334c2b5dc958e3475821cdb4c6140d2ea04 Mon Sep 17 00:00:00 2001 From: Jeroen Van Goey Date: Mon, 5 Mar 2018 17:08:20 +0100 Subject: [PATCH 031/157] Fix typo regressaors -> regressors --- examples/python/regression_diagnostics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/python/regression_diagnostics.py b/examples/python/regression_diagnostics.py index 1677b678c42..7c596f42c74 100644 --- a/examples/python/regression_diagnostics.py +++ b/examples/python/regression_diagnostics.py @@ -19,7 +19,7 @@ url = 'http://vincentarelbundock.github.io/Rdatasets/csv/HistData/Guerry.csv' dat = pd.read_csv(url) -# Fit regression model (using the natural log of one of the regressaors) +# Fit regression model (using the natural log of one of the regressors) results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit() # Inspect the results From c32b3eafccfb4093c34682a69d8322aa18edab21 Mon Sep 17 00:00:00 2001 From: Jeroen Van Goey Date: Mon, 5 Mar 2018 17:09:24 +0100 Subject: [PATCH 032/157] Fix typo in notebook regressaors -> regressors --- examples/notebooks/regression_diagnostics.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/notebooks/regression_diagnostics.ipynb b/examples/notebooks/regression_diagnostics.ipynb index 7bed710d2cb..b90ea645b65 100644 --- a/examples/notebooks/regression_diagnostics.ipynb +++ b/examples/notebooks/regression_diagnostics.ipynb @@ -46,7 +46,7 @@ "url = 'http://vincentarelbundock.github.io/Rdatasets/csv/HistData/Guerry.csv'\n", "dat = pd.read_csv(url)\n", "\n", - "# Fit regression model (using the natural log of one of the regressaors)\n", + "# Fit regression model (using the natural log of one of the regressors)\n", "results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()\n", "\n", "# Inspect the results\n", @@ -258,4 +258,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From 0487024988a9879c1c0ad7bf13064e7f87d96578 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 2 Jan 2018 23:41:43 -0500 Subject: [PATCH 
033/157] initial commit --- statsmodels/regression/mixed_glm.py | 523 ++++++++++++++++++ .../regression/tests/test_mixed_glm.py | 171 ++++++ 2 files changed, 694 insertions(+) create mode 100644 statsmodels/regression/mixed_glm.py create mode 100644 statsmodels/regression/tests/test_mixed_glm.py diff --git a/statsmodels/regression/mixed_glm.py b/statsmodels/regression/mixed_glm.py new file mode 100644 index 00000000000..34d71f23838 --- /dev/null +++ b/statsmodels/regression/mixed_glm.py @@ -0,0 +1,523 @@ +import numpy as np +from scipy.optimize import minimize +from scipy import sparse +from statsmodels.iolib import summary2 +import pandas as pd +import statsmodels +import warnings + +# Gauss-Legendre weights +glw = [[0.2955242247147529, -0.1488743389816312], + [0.2955242247147529, 0.1488743389816312], + [0.2692667193099963, -0.4333953941292472], + [0.2692667193099963, 0.4333953941292472], + [0.2190863625159820, -0.6794095682990244], + [0.2190863625159820, 0.6794095682990244], + [0.1494513491505806, -0.8650633666889845], + [0.1494513491505806, 0.8650633666889845], + [0.0666713443086881, -0.9739065285171717], + [0.0666713443086881, 0.9739065285171717]] + + +class MixedGLM(object): + """ + Fit a generalized linear mixed model. + + The class implements the Laplace approximation to the posterior + distribution. See subclasses, e.g. BinomialMixedGLM for other + estimation approaches. + + Parameters + ---------- + endog : array-like + Vector of response values. + exog_fe : array-like + Array of covariates for the fixed effects part of the mean + structure. + exog_vc : array-like + Array of covariates for the random part of the model. A + scipy.sparse array may be provided, or else the passed + array will be converted to sparse internally. + ident : array-like + Array of labels showing which random terms have a common + variance. + vc_p : float + Prior standard deviation for variance component + parameters. + fe_p : float + Prior standard deviation for fixed effects parameters. + family : statsmodels.genmod.families instance + The GLM family. + fep_names : list of strings + The names of the fixed effects parameters (correspinding + to columns of exog_fe). + vcp_names : list of strings + The names of the variance component parameters (corresponding + to distinct labels in ident). + + Returns + ------- + MixedGLMResults object + + Notes + ----- + All random effects are modeled as being independent Gaussian + values. Every column of `exog_vc` has a distinct realized random + effect that used to form the inear predictors. Two columns of + `exog_vc` that have the same value in `ident` are constrinaed to + have the same variance. + + There are three types of values in the posterior: fixed effects + parameters (fep), corresponding to the columns of `exog_fe`, + random effects realizations (vc), corresponding to the columns of + `exog_vc`, and the variances of the random effects, corresponding + to the unique labels in `ident`. 
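    As a concrete sketch of the layout described above (hypothetical data,
    not part of the patch): a random-intercept model with 50 observations in
    10 groups, all intercepts sharing one variance parameter, would use

        import numpy as np
        from scipy import sparse

        groups = np.repeat(np.arange(10), 5)
        exog_vc = sparse.csr_matrix(
            (np.ones(50), (np.arange(50), groups)), shape=(50, 10))
        ident = np.zeros(10, dtype=int)  # one shared variance component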
+ """ + + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5, + fe_p=0.5, family=None, fep_names=None, + vcp_names=None): + + if family is None: + family = statsmodels.genmod.families.Gaussian() + + # Get the fixed effects parameter names + if fep_names is None: + if hasattr(exog_fe, "columns"): + fep_names = exog_fe.columns.tolist() + else: + fep_names = ["FE_%d" % (k + 1) + for k in range(exog_fe.shape[1])] + self.fep_names = fep_names + + # Get the variance parameter names + if vcp_names is None: + if hasattr(exog_vc, "columns"): + vcp_names = exog_vc.columns.tolist() + else: + vcp_names = ["VC_%d" % (k + 1) + for k in range(int(max(ident)) + 1)] + self.vcp_names = vcp_names + + self.endog = np.asarray(endog) + self.exog_fe = np.asarray(exog_fe) + + if sparse.issparse(exog_vc): + self.exog_vc = exog_vc + else: + self.exog_vc = sparse.csr_matrix(exog_vc) + + self.ident = ident.astype(np.int) + self.family = family + self.vcp_p = float(vcp_p) + self.fe_p = float(fe_p) + + # Number of fixed effects parameters + if self.exog_fe is None: + self.k_fep = 0 + else: + self.k_fep = exog_fe.shape[1] + + # Number of variance component structure parameters and + # variance component realizations. + if self.exog_vc is None: + self.k_vc = 0 + self.k_vcp = 0 + else: + self.k_vc = exog_vc.shape[1] + self.k_vcp = max(self.ident) + 1 + + def _unpack(self, vec): + + ii = 0 + + # Fixed effects parameters + fep = vec[:ii+self.k_fep] + ii += self.k_fep + + # Variance component parameters (standard deviations) + vcp = vec[ii:ii+self.k_vcp] + ii += self.k_vcp + + # Variance component realizations + vc = vec[ii:] + + return fep, vcp, vc + + def logposterior(self, params): + """ + Returns the overall log-density log p(y, fe, vc, vcp), which + differs by an additive constant from the log posterior log + p(fe, vc, vcp | y). + """ + + fep, vcp, vc = self._unpack(params) + + # Contributions from p(y | vc) + lp = 0 + if self.k_fep > 0: + lp += np.dot(self.exog_fe, fep) + if self.k_vc > 0: + lp += self.exog_vc.dot(vc) + + mu = self.family.link.inverse(lp) + ll = self.family.loglike(self.endog, mu) + + if self.k_vc > 0: + + # Contribution from p(vc | vcp) + vcp0 = vcp[self.ident] + s = np.exp(vcp0) + ll -= 0.5 * np.sum(vc**2 / s**2) + np.sum(vcp0) + + # Prior for vc parameters + ll -= 0.5 * np.sum(vcp**2 / self.vcp_p**2) + + # Contributions from p(fep) + if self.k_fep > 0: + ll -= 0.5 * np.sum(fep**2 / self.fe_p**2) + + return ll + + def logposterior_grad(self, params): + """ + The gradient of the log posterior. 
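        A quick numerical check of this gradient (hypothetical tiny data set;
        the module path is the one introduced by this patch series):

            import numpy as np
            from scipy.optimize import approx_fprime
            import statsmodels.api as sm
            from statsmodels.regression.mixed_glm import MixedGLM

            np.random.seed(0)
            y = np.random.normal(size=20)
            x = np.ones((20, 1))
            z = np.kron(np.eye(4), np.ones((5, 1)))
            m = MixedGLM(y, x, z, ident=np.zeros(4, dtype=int),
                         family=sm.families.Gaussian())
            p = np.random.normal(size=m.k_fep + m.k_vcp + m.k_vc)
            print(np.abs(approx_fprime(p, m.logposterior, 1e-6)
                         - m.logposterior_grad(p)).max())  # ~0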
+ """ + + fep, vcp, vc = self._unpack(params) + + if self.k_fep > 0: + lp = np.dot(self.exog_fe, fep) + if self.k_vc > 0: + lp += self.exog_vc.dot(vc) + + mu = self.family.link.inverse(lp) + + score_factor = (self.endog - mu) / self.family.link.deriv(mu) + score_factor /= self.family.variance(mu) + + te = [None, None, None] + + # Contributions from p(y | x, z, vc) + if self.k_fep > 0: + te[0] = np.dot(score_factor, self.exog_fe) + if self.k_vc > 0: + te[2] = self.exog_vc.transpose().dot(score_factor) + + if self.k_vc > 0: + # Contributions from p(vc | vcp) + # vcp0 = vcp[self.ident] + # s = np.exp(vcp0) + # ll -= 0.5 * np.sum(vc**2 / s**2) + np.sum(vcp0) + vcp0 = vcp[self.ident] + s = np.exp(vcp0) + u = vc**2 / s**2 - 1 + te[1] = np.bincount(self.ident, weights=u) + te[2] -= vc / s**2 + + # Contributions from p(vcp) + # ll -= 0.5 * np.sum(vcp**2 / self.vcp_p**2) + te[1] -= vcp / self.vcp_p**2 + + # Contributions from p(fep) + if self.k_fep > 0: + te[0] -= fep / self.fe_p**2 + + te = [x for x in te if x is not None] + return np.concatenate(te) + + def _get_start(self): + start_fep = np.zeros(self.k_fep) + start_vcp = np.ones(self.k_vcp) + start_vc = np.random.normal(size=self.k_vc) + start = np.concatenate((start_fep, start_vcp, start_vc)) + return start + + def fit_map(self, method="BFGS", minim_opts=None): + """ + Construct the Laplace approximation to the posterior + distribution. + """ + + def fun(params): + return -self.logposterior(params) + + def grad(params): + return -self.logposterior_grad(params) + + start = self._get_start() + + r = minimize(fun, start, method=method, jac=grad, options=minim_opts) + if not r.success: + msg = ("Laplace fitting did not converge, |gradient|=%.6f" % + np.sqrt(np.sum(r.jac**2))) + warnings.warn(msg) + + return MixedGLMResults(self, r.x, r.hess_inv, optim_retvals=r) + + # Overall mean and variance of the linear predictor under the + # given distribution parameters. + def _lp_stats(self, fep_mean, fep_sd, vc_mean, vc_sd): + + tm = np.dot(self.exog_fe, fep_mean) + tv = np.dot(self.exog_fe**2, fep_sd**2) + tm += self.exog_vc.dot(vc_mean) + tv += self.exog_vc.power(2).dot(vc_sd**2) + + return tm, tv + + def fit_vb(self, mean=None, sd=None, minim_opts=None): + """ + Fit the model using variational Bayes. + + Parameters: + ----------- + mean : array-like + Starting value for VB mean vector + sd : array-like + Starting value for VB standard deviation vector + n_iter : integer + Number of iterations + + Notes + ----- + The goal is to find a factored Gaussian approximation + q1*q2*... to the posterior distribution, approximately + minimizing the KL divergence from the factored approximation + to the actual posterior. The KL divergence, or ELBO function + has the form + + E* log p(y, fe, vcp, vc) - E* log q + + where E* is expectation with respect to the product of qj. + + References + ---------- + https://arxiv.org/pdf/1601.00670.pdf + """ + + n = self.k_fep + self.k_vcp + self.k_vc + if mean is None: + m = np.zeros(n) + else: + m = mean + if sd is None: + s = -0.5 + 0.1 * np.random.normal(size=n) + else: + s = sd + + # Don't allow the variance parameter starting mean values to + # be too small. + i1, i2 = self.k_fep, self.k_fep + self.k_vcp + m[i1:i2] = np.where(m[i1:i2] < -1, -1, m[i1:i2]) + + # Don't allow the posterior standard deviation starting values + # to be too small. 
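        # Note: the packed state here follows _unpack: the first k_fep
        # entries are fixed effects, the next k_vcp are variance component
        # parameters, and the rest are random effect realizations. `s` holds
        # *log* standard deviations, which the elbo closures below
        # exponentiate so the optimization stays unconstrained.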
+ s = np.where(s < -1, -1, s) + + def elbo(x): + n = len(x) // 2 + return -self.vb_elbo(x[:n], np.exp(x[n:])) + + def elbo_grad(x): + n = len(x) // 2 + gm, gs = self.vb_elbo_grad(x[:n], np.exp(x[n:])) + gs *= np.exp(x[n:]) + return -np.concatenate((gm, gs)) + + start = np.concatenate((m, s)) + mm = minimize(elbo, start, jac=elbo_grad, method="bfgs", + options=minim_opts) + if not mm.success: + warnings.warn("VB fitting did not converge") + + n = len(mm.x) // 2 + return MixedGLMResults(self, mm.x[0:n], np.exp(2*mm.x[n:]), mm) + + +class MixedGLMResults(object): + + def __init__(self, model, params, cov_params, + optim_retvals=None): + + self.model = model + self.params = params + self.cov_params = cov_params + self.optim_retvals = optim_retvals + + self.fe_params, self.vcp_params, self.vc_params = ( + model._unpack(params)) + + if cov_params.ndim == 2: + cp = np.diag(cov_params) + else: + cp = cov_params + self.fe_sd, self.vcp_sd, self.vc_sd = model._unpack(cp) + self.fe_sd = np.sqrt(self.fe_sd) + self.vcp_sd = np.sqrt(self.vcp_sd) + self.vc_sd = np.sqrt(self.vc_sd) + + def summary(self): + + df = pd.DataFrame() + m = self.model.k_fep + self.model.k_vcp + df["Type"] = (["F" for k in range(self.model.k_fep)] + + ["R" for k in range(self.model.k_vcp)]) + + df["Post. Mean"] = self.params[0:m] + + if self.cov_params.ndim == 2: + v = np.diag(self.cov_params)[0:m] + df["Post. SD"] = np.sqrt(v) + else: + df["Post. SD"] = np.sqrt(self.cov_params[0:m]) + + # Convert variance parameters to natural scale + df["VC"] = np.exp(df["Post. Mean"]) + df["VC (LB)"] = np.exp(df["Post. Mean"] - 2*df["Post. SD"]) + df["VC (UB)"] = np.exp(df["Post. Mean"] + 2*df["Post. SD"]) + df["VC"] = ["%.3f" % x for x in df.VC] + df["VC (LB)"] = ["%.3f" % x for x in df["VC (LB)"]] + df["VC (UB)"] = ["%.3f" % x for x in df["VC (UB)"]] + df.loc[df.index < self.model.k_fep, "VC"] = "" + df.loc[df.index < self.model.k_fep, "VC (LB)"] = "" + df.loc[df.index < self.model.k_fep, "VC (UB)"] = "" + + df.index = self.model.fep_names + self.model.vcp_names + + summ = summary2.Summary() + summ.add_title(self.model.family.__class__.__name__ + + " Mixed GLM Results") + summ.add_df(df) + + return summ + + +class BinomialMixedGLM(MixedGLM): + + # Integration range (from -rng to +rng). The integrals are with + # respect to a standard Gaussian distribution so (-5, 5) will be + # sufficient in many cases. + rng = 5 + + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5, + fe_p=0.5, fep_names=None, vcp_names=None): + + super().__init__(endog, exog_fe, exog_vc, ident, vcp_p, fe_p, + family=statsmodels.genmod.families.Binomial(), + fep_names=fep_names, vcp_names=vcp_names) + + def vb_elbo(self, vb_mean, vb_sd): + """ + Returns the evidence lower bound (ELBO) for the model. 
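        The hard-coded `glw` table at the top of the module is the 10-point
        Gauss-Legendre rule on [-1, 1]; integrals against the N(0, 1) density
        are rescaled to (-rng, rng). A quick check of the convention used in
        the loop below:

            import numpy as np

            nodes, weights = np.polynomial.legendre.leggauss(10)
            r = 5.0
            phi = lambda z: np.exp(-z**2 / 2) / np.sqrt(2 * np.pi)
            # r * sum(w_i * f(r * x_i)) approximates the integral over (-r, r)
            print(r * np.sum(weights * phi(r * nodes)))  # ~1, the N(0,1) mass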
+ """ + + fep_mean, vcp_mean, vc_mean = self._unpack(vb_mean) + fep_sd, vcp_sd, vc_sd = self._unpack(vb_sd) + + tm, tv = self._lp_stats(fep_mean, fep_sd, vc_mean, vc_sd) + + def h(z): + x = np.log(1 + np.exp(tm + np.sqrt(tv)*z)) + x *= np.exp(-z**2 / 2) + x /= np.sqrt(2*np.pi) + return x + + # p(y | vc) contributions + iv = 0 + for w in glw: + iv += h(self.rng * w[1]) * w[0] + iv *= -self.rng + iv += self.endog * tm + iv = iv.sum() + + # p(vc | vcp) contributions + m = vcp_mean[self.ident] + s = vcp_sd[self.ident] + iv -= np.sum((vc_mean**2 + vc_sd**2) * np.exp(-2*m + 2*s**2)) / 2 + iv -= np.sum(m) + + # p(vcp) contributions + iv -= 0.5 * (vcp_mean**2 + vcp_sd**2).sum() / self.vcp_p**2 + + # p(b) contributions + iv -= 0.5 * (fep_mean**2 + fep_sd**2).sum() / self.fe_p**2 + + return iv + np.sum(np.log(vb_sd)) + + def vb_elbo_grad(self, vb_mean, vb_sd): + """ + Returns the gradient of the evidence lower bound (ELBO). + """ + + fep_mean, vcp_mean, vc_mean = self._unpack(vb_mean) + fep_sd, vcp_sd, vc_sd = self._unpack(vb_sd) + + tm, tv = self._lp_stats(fep_mean, fep_sd, vc_mean, vc_sd) + + def h(z): + x = np.log(1 + np.exp(tm + np.sqrt(tv)*z)) + x *= np.exp(-z**2 / 2) + x /= np.sqrt(2*np.pi) + return x + + def h1(z): + u = tm + np.sqrt(tv)*z + x = np.exp(u) / (1 + np.exp(u)) + x *= np.exp(-z**2 / 2) + x /= np.sqrt(2*np.pi) + return x + + fep_mean_grad = 0. + fep_sd_grad = 0. + vcp_mean_grad = 0. + vcp_sd_grad = 0. + vc_mean_grad = 0. + vc_sd_grad = 0. + + # p(y | vc) contributions + for w in glw: + x = self.rng * w[1] + u = h1(x) + r = u / np.sqrt(tv) + fep_mean_grad += w[0] * np.dot(u, self.exog_fe) + vc_mean_grad += w[0] * self.exog_vc.transpose().dot(u) + fep_sd_grad += w[0] * x * np.dot(r, self.exog_fe**2 * fep_sd) + v = self.exog_vc.power(2).multiply(vc_sd).transpose().dot(r) + v = np.squeeze(np.asarray(v)) + vc_sd_grad += w[0] * x * v + fep_mean_grad *= -self.rng + vc_mean_grad *= -self.rng + fep_sd_grad *= -self.rng + vc_sd_grad *= -self.rng + fep_mean_grad += np.dot(self.endog, self.exog_fe) + vc_mean_grad += self.exog_vc.transpose().dot(self.endog) + + # p(vc | vcp) contributions + m = vcp_mean[self.ident] + s = vcp_sd[self.ident] + u = vc_mean**2 + vc_sd**2 + ve = np.exp(2*(s**2 - m)) + dm = u * ve - 1 + ds = -2 * u * ve * s + vcp_mean_grad += np.bincount(self.ident, weights=dm) + vcp_sd_grad += np.bincount(self.ident, weights=ds) + + vc_mean_grad -= vc_mean * ve + vc_sd_grad -= vc_sd * ve + + # p(vcp) contributions + vcp_mean_grad -= vcp_mean / self.vcp_p**2 + vcp_sd_grad -= vcp_sd / self.vcp_p**2 + + # p(b) contributions + fep_mean_grad -= fep_mean / self.fe_p**2 + fep_sd_grad -= fep_sd / self.fe_p**2 + + mean_grad = np.concatenate((fep_mean_grad, vcp_mean_grad, + vc_mean_grad)) + sd_grad = np.concatenate((fep_sd_grad, vcp_sd_grad, vc_sd_grad)) + + sd_grad += 1 / vb_sd + + return mean_grad, sd_grad diff --git a/statsmodels/regression/tests/test_mixed_glm.py b/statsmodels/regression/tests/test_mixed_glm.py new file mode 100644 index 00000000000..9a5748a7963 --- /dev/null +++ b/statsmodels/regression/tests/test_mixed_glm.py @@ -0,0 +1,171 @@ +import numpy as np +from .mixed_glm import MixedGLM, BinomialMixedGLM +import statsmodels.api as sm +from scipy import sparse +from numpy.testing import assert_allclose +from scipy.optimize import approx_fprime +import warnings + + +def gen_simple_logit(s): + + np.random.seed(3799) + + nc = 100 + cs = 500 + + exog_vc = np.kron(np.eye(nc), np.ones((cs, 1))) + exog_fe = np.random.normal(size=(nc*cs, 2)) + vc = s*np.random.normal(size=nc) + lp = 
np.dot(exog_fe, np.r_[1, -1]) + np.dot(exog_vc, vc) + pr = 1 / (1 + np.exp(-lp)) + y = 1*(np.random.uniform(size=nc*cs) < pr) + ident = np.zeros(nc, dtype=np.int) + + return y, exog_fe, exog_vc, ident + + +def gen_logit_crossed(s1, s2): + + np.random.seed(3799) + + nc = 100 + cs = 500 + + a = np.kron(np.eye(nc), np.ones((cs, 1))) + b = np.kron(np.ones((cs, 1)), np.eye(nc)) + exog_vc = np.concatenate((a, b), axis=1) + + exog_fe = np.random.normal(size=(nc*cs, 1)) + vc = s1 * np.random.normal(size=2*nc) + vc[nc:] *= s2 / s1 + lp = np.dot(exog_fe, np.r_[-0.5]) + np.dot(exog_vc, vc) + pr = 1 / (1 + np.exp(-lp)) + y = 1*(np.random.uniform(size=nc*cs) < pr) + ident = np.zeros(2*nc, dtype=np.int) + ident[nc:] = 1 + + return y, exog_fe, exog_vc, ident + + +def test_logit_map(): + + y, exog_fe, exog_vc, ident = gen_simple_logit(2) + exog_vc = sparse.csr_matrix(exog_vc) + + glmm = MixedGLM(y, exog_fe, exog_vc, ident, family=sm.families.Binomial()) + rslt = glmm.fit_map(minim_opts={"gtol": 1e-4}) + + assert_allclose(glmm.logposterior_grad(rslt.params), + np.zeros_like(rslt.params), atol=1e-4) + + +def test_logit_map_crossed(): + + y, exog_fe, exog_vc, ident = gen_logit_crossed(1, 2) + exog_vc = sparse.csr_matrix(exog_vc) + + glmm = MixedGLM(y, exog_fe, exog_vc, ident, family=sm.families.Binomial()) + rslt = glmm.fit_map(minim_opts={"gtol": 1e-4}) + + assert_allclose(glmm.logposterior_grad(rslt.params), + np.zeros_like(rslt.params), atol=1e-4) + + +def test_logit_elbo_grad(): + + for j in range(2): + + if j == 0: + y, exog_fe, exog_vc, ident = gen_simple_logit(2) + else: + y, exog_fe, exog_vc, ident = gen_logit_crossed(1, 2) + + exog_vc = sparse.csr_matrix(exog_vc) + + glmm1 = BinomialMixedGLM(y, exog_fe, exog_vc, ident) + rslt1 = glmm1.fit_map(minim_opts={"gtol": 1e-4}) + + n = glmm1.k_fep + glmm1.k_vcp + glmm1.k_vc + + for k in range(3): + + if k == 0: + vb_mean = rslt1.params + vb_sd = np.ones_like(vb_mean) + elif k == 1: + vb_mean = np.zeros(len(vb_mean)) + vb_sd = np.ones_like(vb_mean) + else: + vb_mean = np.random.normal(size=len(vb_mean)) + vb_sd = np.random.uniform(1, 2, size=len(vb_mean)) + + mean_grad, sd_grad = glmm1.vb_elbo_grad(vb_mean, vb_sd) + + def elbo(vec): + n = len(vec) // 2 + return glmm1.vb_elbo(vec[:n], vec[n:]) + + x = np.concatenate((vb_mean, vb_sd)) + g1 = approx_fprime(x, elbo, 1e-5) + n = len(x) // 2 + + mean_grad_n = g1[:n] + sd_grad_n = g1[n:] + + assert_allclose(mean_grad, mean_grad_n, atol=1e-2, rtol=1e-2) + assert_allclose(sd_grad, sd_grad_n, atol=1e-2, rtol=1e-2) + + +def test_logit_vb(): + + y, exog_fe, exog_vc, ident = gen_simple_logit(0) + exog_vc = sparse.csr_matrix(exog_vc) + + glmm1 = BinomialMixedGLM(y, exog_fe, exog_vc, ident) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + rslt1 = glmm1.fit_map(minim_opts={"gtol": 1e-4, "maxiter": 5}) + + glmm2 = BinomialMixedGLM(y, exog_fe, exog_vc, ident) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + rslt2 = glmm2.fit_vb(mean=rslt1.params, minim_opts={"maxiter": 2}) + + rslt1.summary() + rslt2.summary() + + assert_allclose(rslt1.params[0:5], np.r_[ + 0.64644962, -0.61266869, -1., -0.00961027, 0.02411796], + rtol=1e-4, atol=1e-4) + + assert_allclose(rslt2.params[0:5], np.r_[ + 0.9017295, -0.95958884, -0.70822657, -0.00711374, 0.02673195], + rtol=1e-4, atol=1e-4) + + +def test_logit_vb_crossed(): + + y, exog_fe, exog_vc, ident = gen_logit_crossed(1, 2) + exog_vc = sparse.csr_matrix(exog_vc) + + glmm1 = BinomialMixedGLM(y, exog_fe, exog_vc, ident) + with warnings.catch_warnings(): 
+ warnings.simplefilter("ignore") + rslt1 = glmm1.fit_map(minim_opts={"gtol": 1e-4, "maxiter": 2}) + + glmm2 = BinomialMixedGLM(y, exog_fe, exog_vc, ident) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + rslt2 = glmm2.fit_vb(mean=rslt1.params, minim_opts={"maxiter": 2}) + + rslt1.summary() + rslt2.summary() + + assert_allclose(rslt1.params[0:5], np.r_[ + -0.84192649, 0.81152304, 0.81056098, -0.76727982, -0.94713751], + rtol=1e-4, atol=1e-4) + + assert_allclose(rslt2.params[0:5], np.r_[ + -0.68311938, 0.75472554, 0.75218755, -0.71387273, -0.76462306], + rtol=1e-4, atol=1e-4) From 5b528539400ea51aa71d6a985b1cb91a85cfd685 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Wed, 3 Jan 2018 00:15:00 -0500 Subject: [PATCH 034/157] fix import path --- statsmodels/regression/mixed_glm.py | 2 ++ statsmodels/regression/tests/test_mixed_glm.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/statsmodels/regression/mixed_glm.py b/statsmodels/regression/mixed_glm.py index 34d71f23838..33c63e49ccb 100644 --- a/statsmodels/regression/mixed_glm.py +++ b/statsmodels/regression/mixed_glm.py @@ -520,4 +520,6 @@ def h1(z): sd_grad += 1 / vb_sd + # print("|G|=%f" % np.sqrt(np.sum(mean_grad**2) + np.sum(sd_grad**2))) + return mean_grad, sd_grad diff --git a/statsmodels/regression/tests/test_mixed_glm.py b/statsmodels/regression/tests/test_mixed_glm.py index 9a5748a7963..9330af9bf77 100644 --- a/statsmodels/regression/tests/test_mixed_glm.py +++ b/statsmodels/regression/tests/test_mixed_glm.py @@ -1,5 +1,5 @@ import numpy as np -from .mixed_glm import MixedGLM, BinomialMixedGLM +from statsmodels.regression.mixed_glm import MixedGLM, BinomialMixedGLM import statsmodels.api as sm from scipy import sparse from numpy.testing import assert_allclose From e10942402a8d4da07afb4568fb90ddcf6675b90f Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Wed, 3 Jan 2018 23:02:00 -0500 Subject: [PATCH 035/157] rename classes --- statsmodels/regression/mixed_glm.py | 70 +++++++++++++------ .../regression/tests/test_mixed_glm.py | 21 +++--- 2 files changed, 61 insertions(+), 30 deletions(-) diff --git a/statsmodels/regression/mixed_glm.py b/statsmodels/regression/mixed_glm.py index 33c63e49ccb..b1e96897ff6 100644 --- a/statsmodels/regression/mixed_glm.py +++ b/statsmodels/regression/mixed_glm.py @@ -1,3 +1,4 @@ +from __future__ import division import numpy as np from scipy.optimize import minimize from scipy import sparse @@ -19,9 +20,9 @@ [0.0666713443086881, 0.9739065285171717]] -class MixedGLM(object): +class BayesMixedGLM(object): """ - Fit a generalized linear mixed model. + Fit a generalized linear mixed model using Bayesian methods. The class implements the Laplace approximation to the posterior distribution. See subclasses, e.g. BinomialMixedGLM for other @@ -61,17 +62,26 @@ class MixedGLM(object): Notes ----- - All random effects are modeled as being independent Gaussian - values. Every column of `exog_vc` has a distinct realized random - effect that used to form the inear predictors. Two columns of - `exog_vc` that have the same value in `ident` are constrinaed to - have the same variance. - There are three types of values in the posterior: fixed effects parameters (fep), corresponding to the columns of `exog_fe`, random effects realizations (vc), corresponding to the columns of - `exog_vc`, and the variances of the random effects, corresponding - to the unique labels in `ident`. 
+ `exog_vc`, and the variances of the random effects (vcp), + corresponding to the unique labels in `ident`. + + All random effects are modeled as being independent Gaussian + values, given the variance parameters. Every column of `exog_vc` + has a distinct realized random effect that is used to form the + linear predictors. The elements of `ident` index the distinct + random effect variance parameters. Two columns of `exog_vc` that + have the same value in `ident` are constrained to have the same + variance. + + The random effect standard deviation parameters (vcp) have + log-normal prior distributions with mean 0 and standard deviation + `vcp_p`. + + The prior for the fixed effects parameters is Gaussian with mean 0 + and standard deviation `fe_p`. """ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5, @@ -80,6 +90,7 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5, if family is None: family = statsmodels.genmod.families.Gaussian() + warnings.Warn("Defaulting to Gaussian family") # Get the fixed effects parameter names if fep_names is None: @@ -186,8 +197,9 @@ def logposterior_grad(self, params): fep, vcp, vc = self._unpack(params) + lp = 0 if self.k_fep > 0: - lp = np.dot(self.exog_fe, fep) + lp += np.dot(self.exog_fe, fep) if self.k_vc > 0: lp += self.exog_vc.dot(vc) @@ -253,7 +265,7 @@ def grad(params): np.sqrt(np.sum(r.jac**2))) warnings.warn(msg) - return MixedGLMResults(self, r.x, r.hess_inv, optim_retvals=r) + return BayesMixedGLMResults(self, r.x, r.hess_inv, optim_retvals=r) # Overall mean and variance of the linear predictor under the # given distribution parameters. @@ -266,7 +278,7 @@ def _lp_stats(self, fep_mean, fep_sd, vc_mean, vc_sd): return tm, tv - def fit_vb(self, mean=None, sd=None, minim_opts=None): + def fit_vb(self, mean=None, sd=None, minim_opts=None, verbose=False): """ Fit the model using variational Bayes. @@ -276,8 +288,8 @@ def fit_vb(self, mean=None, sd=None, minim_opts=None): Starting value for VB mean vector sd : array-like Starting value for VB standard deviation vector - n_iter : integer - Number of iterations + minim_opts : dict-like + Options passed to scipy.minimize Notes ----- @@ -296,6 +308,14 @@ def fit_vb(self, mean=None, sd=None, minim_opts=None): https://arxiv.org/pdf/1601.00670.pdf """ + if type(self) is BayesMixedGLM: + msg = ("To fit the model using variational Bayes, create a " + + "class for the appropriate family type, e.g. " + + "BinomialBayesMixedGLM.") + raise ValueError(msg) + + self.verbose = verbose + n = self.k_fep + self.k_vcp + self.k_vc if mean is None: m = np.zeros(n) @@ -332,10 +352,10 @@ def elbo_grad(x): warnings.warn("VB fitting did not converge") n = len(mm.x) // 2 - return MixedGLMResults(self, mm.x[0:n], np.exp(2*mm.x[n:]), mm) + return BayesMixedGLMResults(self, mm.x[0:n], np.exp(2*mm.x[n:]), mm) -class MixedGLMResults(object): +class BayesMixedGLMResults(object): def __init__(self, model, params, cov_params, optim_retvals=None): @@ -393,19 +413,23 @@ def summary(self): return summ -class BinomialMixedGLM(MixedGLM): +class BinomialBayesMixedGLM(BayesMixedGLM): # Integration range (from -rng to +rng). The integrals are with # respect to a standard Gaussian distribution so (-5, 5) will be # sufficient in many cases. 
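    # For scale: the N(0, 1) mass outside (-5, 5) is roughly 5.7e-7
    # (2 * scipy.stats.norm.sf(5)), so truncating the quadrature to this
    # interval costs essentially nothing.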
rng = 5 + verbose = False + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5, fe_p=0.5, fep_names=None, vcp_names=None): - super().__init__(endog, exog_fe, exog_vc, ident, vcp_p, fe_p, - family=statsmodels.genmod.families.Binomial(), - fep_names=fep_names, vcp_names=vcp_names) + super(BinomialBayesMixedGLM, self).__init__( + endog=endog, exog_fe=exog_fe, exog_vc=exog_vc, + ident=ident, vcp_p=vcp_p, fe_p=fe_p, + family=statsmodels.genmod.families.Binomial(), + fep_names=fep_names, vcp_names=vcp_names) def vb_elbo(self, vb_mean, vb_sd): """ @@ -520,6 +544,8 @@ def h1(z): sd_grad += 1 / vb_sd - # print("|G|=%f" % np.sqrt(np.sum(mean_grad**2) + np.sum(sd_grad**2))) + if self.verbose: + print("|G|=%f" % np.sqrt(np.sum(mean_grad**2) + + np.sum(sd_grad**2))) return mean_grad, sd_grad diff --git a/statsmodels/regression/tests/test_mixed_glm.py b/statsmodels/regression/tests/test_mixed_glm.py index 9330af9bf77..df0d0f1cc99 100644 --- a/statsmodels/regression/tests/test_mixed_glm.py +++ b/statsmodels/regression/tests/test_mixed_glm.py @@ -1,5 +1,6 @@ import numpy as np -from statsmodels.regression.mixed_glm import MixedGLM, BinomialMixedGLM +from statsmodels.regression.mixed_glm import ( + BayesMixedGLM, BinomialBayesMixedGLM) import statsmodels.api as sm from scipy import sparse from numpy.testing import assert_allclose @@ -53,7 +54,8 @@ def test_logit_map(): y, exog_fe, exog_vc, ident = gen_simple_logit(2) exog_vc = sparse.csr_matrix(exog_vc) - glmm = MixedGLM(y, exog_fe, exog_vc, ident, family=sm.families.Binomial()) + glmm = BayesMixedGLM(y, exog_fe, exog_vc, ident, + family=sm.families.Binomial()) rslt = glmm.fit_map(minim_opts={"gtol": 1e-4}) assert_allclose(glmm.logposterior_grad(rslt.params), @@ -65,7 +67,8 @@ def test_logit_map_crossed(): y, exog_fe, exog_vc, ident = gen_logit_crossed(1, 2) exog_vc = sparse.csr_matrix(exog_vc) - glmm = MixedGLM(y, exog_fe, exog_vc, ident, family=sm.families.Binomial()) + glmm = BayesMixedGLM(y, exog_fe, exog_vc, ident, + family=sm.families.Binomial()) rslt = glmm.fit_map(minim_opts={"gtol": 1e-4}) assert_allclose(glmm.logposterior_grad(rslt.params), @@ -83,7 +86,7 @@ def test_logit_elbo_grad(): exog_vc = sparse.csr_matrix(exog_vc) - glmm1 = BinomialMixedGLM(y, exog_fe, exog_vc, ident) + glmm1 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident) rslt1 = glmm1.fit_map(minim_opts={"gtol": 1e-4}) n = glmm1.k_fep + glmm1.k_vcp + glmm1.k_vc @@ -122,12 +125,13 @@ def test_logit_vb(): y, exog_fe, exog_vc, ident = gen_simple_logit(0) exog_vc = sparse.csr_matrix(exog_vc) - glmm1 = BinomialMixedGLM(y, exog_fe, exog_vc, ident) + glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, + family=sm.families.Binomial()) with warnings.catch_warnings(): warnings.simplefilter("ignore") rslt1 = glmm1.fit_map(minim_opts={"gtol": 1e-4, "maxiter": 5}) - glmm2 = BinomialMixedGLM(y, exog_fe, exog_vc, ident) + glmm2 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident) with warnings.catch_warnings(): warnings.simplefilter("ignore") rslt2 = glmm2.fit_vb(mean=rslt1.params, minim_opts={"maxiter": 2}) @@ -149,12 +153,13 @@ def test_logit_vb_crossed(): y, exog_fe, exog_vc, ident = gen_logit_crossed(1, 2) exog_vc = sparse.csr_matrix(exog_vc) - glmm1 = BinomialMixedGLM(y, exog_fe, exog_vc, ident) + glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, + family=sm.families.Binomial()) with warnings.catch_warnings(): warnings.simplefilter("ignore") rslt1 = glmm1.fit_map(minim_opts={"gtol": 1e-4, "maxiter": 2}) - glmm2 = BinomialMixedGLM(y, exog_fe, exog_vc, ident) + glmm2 = 
BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident) with warnings.catch_warnings(): warnings.simplefilter("ignore") rslt2 = glmm2.fit_vb(mean=rslt1.params, minim_opts={"maxiter": 2}) From 9e28c753a1bca69cac2b8ba057506d484b35728f Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Wed, 3 Jan 2018 23:14:21 -0500 Subject: [PATCH 036/157] docstring work --- statsmodels/regression/mixed_glm.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/statsmodels/regression/mixed_glm.py b/statsmodels/regression/mixed_glm.py index b1e96897ff6..93ed4aea341 100644 --- a/statsmodels/regression/mixed_glm.py +++ b/statsmodels/regression/mixed_glm.py @@ -25,8 +25,8 @@ class BayesMixedGLM(object): Fit a generalized linear mixed model using Bayesian methods. The class implements the Laplace approximation to the posterior - distribution. See subclasses, e.g. BinomialMixedGLM for other - estimation approaches. + distribution. See subclasses, e.g. BinomialBayesMixedGLM for + other estimation approaches. Parameters ---------- @@ -43,14 +43,15 @@ class BayesMixedGLM(object): Array of labels showing which random terms have a common variance. vc_p : float - Prior standard deviation for variance component - parameters. + Prior standard deviation for variance component parameters + (the prior standard deviation of log(s) is vc_p, where s is + the standard deviation of a random effect). fe_p : float Prior standard deviation for fixed effects parameters. family : statsmodels.genmod.families instance The GLM family. fep_names : list of strings - The names of the fixed effects parameters (correspinding + The names of the fixed effects parameters (corresponding to columns of exog_fe). vcp_names : list of strings The names of the variance component parameters (corresponding @@ -324,7 +325,9 @@ def fit_vb(self, mean=None, sd=None, minim_opts=None, verbose=False): if sd is None: s = -0.5 + 0.1 * np.random.normal(size=n) else: - s = sd + # s is parameterized on the log-scale internally + # (transparent to caller) + s = np.log(sd) # Don't allow the variance parameter starting mean values to # be too small. From a0bd626f6b9cf04041d4e8d8ae0527d5d95de5f1 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 4 Jan 2018 19:35:44 -0500 Subject: [PATCH 037/157] modify tests to be faster --- statsmodels/regression/mixed_glm.py | 42 ++++++++++--------- .../regression/tests/test_mixed_glm.py | 38 +++++++---------- 2 files changed, 39 insertions(+), 41 deletions(-) diff --git a/statsmodels/regression/mixed_glm.py b/statsmodels/regression/mixed_glm.py index 93ed4aea341..8c6974ad9ee 100644 --- a/statsmodels/regression/mixed_glm.py +++ b/statsmodels/regression/mixed_glm.py @@ -63,19 +63,19 @@ class BayesMixedGLM(object): Notes ----- - There are three types of values in the posterior: fixed effects - parameters (fep), corresponding to the columns of `exog_fe`, - random effects realizations (vc), corresponding to the columns of - `exog_vc`, and the variances of the random effects (vcp), - corresponding to the unique labels in `ident`. + There are three types of values in the posterior distribution: + fixed effects parameters (fep), corresponding to the columns of + `exog_fe`, random effects realizations (vc), corresponding to the + columns of `exog_vc`, and the variances of the random effects + realizations (vcp), corresponding to the unique labels in `ident`. All random effects are modeled as being independent Gaussian - values, given the variance parameters. 
Every column of `exog_vc` + values (given the variance parameters). Every column of `exog_vc` has a distinct realized random effect that is used to form the - linear predictors. The elements of `ident` index the distinct - random effect variance parameters. Two columns of `exog_vc` that - have the same value in `ident` are constrained to have the same - variance. + linear predictors. The elements of `ident` determine the distinct + random effect variance parameters. Two random effect realizations + that have the same value in `ident` are constrained to have the + same variance. The random effect standard deviation parameters (vcp) have log-normal prior distributions with mean 0 and standard deviation @@ -147,20 +147,23 @@ def _unpack(self, vec): fep = vec[:ii+self.k_fep] ii += self.k_fep - # Variance component parameters (standard deviations) + # Variance component structure parameters (standard + # deviations). These are on the log scale. The standard + # deviation for random effect j is exp(vcp[ident[j]]). vcp = vec[ii:ii+self.k_vcp] ii += self.k_vcp - # Variance component realizations + # Random effect realizations vc = vec[ii:] return fep, vcp, vc def logposterior(self, params): """ - Returns the overall log-density log p(y, fe, vc, vcp), which - differs by an additive constant from the log posterior log - p(fe, vc, vcp | y). + The overall log-density: log p(y, fe, vc, vcp). + + This differs by an additive constant from the log posterior + log p(fe, vc, vcp | y). """ fep, vcp, vc = self._unpack(params) @@ -268,7 +271,7 @@ def grad(params): return BayesMixedGLMResults(self, r.x, r.hess_inv, optim_retvals=r) - # Overall mean and variance of the linear predictor under the + # Returns the mean and variance of the linear predictor under the # given distribution parameters. def _lp_stats(self, fep_mean, fep_sd, vc_mean, vc_sd): @@ -281,7 +284,7 @@ def _lp_stats(self, fep_mean, fep_sd, vc_mean, vc_sd): def fit_vb(self, mean=None, sd=None, minim_opts=None, verbose=False): """ - Fit the model using variational Bayes. + Fit a model using variational Bayes. 
Parameters: ----------- @@ -325,8 +328,9 @@ def fit_vb(self, mean=None, sd=None, minim_opts=None, verbose=False): if sd is None: s = -0.5 + 0.1 * np.random.normal(size=n) else: - # s is parameterized on the log-scale internally - # (transparent to caller) + # s is parameterized on the log-scale internally when + # optimizing the ELBO function (this is transparent to the + # caller) s = np.log(sd) # Don't allow the variance parameter starting mean values to diff --git a/statsmodels/regression/tests/test_mixed_glm.py b/statsmodels/regression/tests/test_mixed_glm.py index df0d0f1cc99..0ff7a8ec25a 100644 --- a/statsmodels/regression/tests/test_mixed_glm.py +++ b/statsmodels/regression/tests/test_mixed_glm.py @@ -8,13 +8,10 @@ import warnings -def gen_simple_logit(s): +def gen_simple_logit(nc, cs, s): np.random.seed(3799) - nc = 100 - cs = 500 - exog_vc = np.kron(np.eye(nc), np.ones((cs, 1))) exog_fe = np.random.normal(size=(nc*cs, 2)) vc = s*np.random.normal(size=nc) @@ -26,13 +23,10 @@ def gen_simple_logit(s): return y, exog_fe, exog_vc, ident -def gen_logit_crossed(s1, s2): +def gen_logit_crossed(nc, cs, s1, s2): np.random.seed(3799) - nc = 100 - cs = 500 - a = np.kron(np.eye(nc), np.ones((cs, 1))) b = np.kron(np.ones((cs, 1)), np.eye(nc)) exog_vc = np.concatenate((a, b), axis=1) @@ -51,7 +45,7 @@ def gen_logit_crossed(s1, s2): def test_logit_map(): - y, exog_fe, exog_vc, ident = gen_simple_logit(2) + y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 2) exog_vc = sparse.csr_matrix(exog_vc) glmm = BayesMixedGLM(y, exog_fe, exog_vc, ident, @@ -64,7 +58,7 @@ def test_logit_map(): def test_logit_map_crossed(): - y, exog_fe, exog_vc, ident = gen_logit_crossed(1, 2) + y, exog_fe, exog_vc, ident = gen_logit_crossed(10, 10, 1, 2) exog_vc = sparse.csr_matrix(exog_vc) glmm = BayesMixedGLM(y, exog_fe, exog_vc, ident, @@ -80,9 +74,9 @@ def test_logit_elbo_grad(): for j in range(2): if j == 0: - y, exog_fe, exog_vc, ident = gen_simple_logit(2) + y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 2) else: - y, exog_fe, exog_vc, ident = gen_logit_crossed(1, 2) + y, exog_fe, exog_vc, ident = gen_logit_crossed(10, 10, 1, 2) exog_vc = sparse.csr_matrix(exog_vc) @@ -122,55 +116,55 @@ def elbo(vec): def test_logit_vb(): - y, exog_fe, exog_vc, ident = gen_simple_logit(0) + y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 0) exog_vc = sparse.csr_matrix(exog_vc) glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, family=sm.families.Binomial()) with warnings.catch_warnings(): warnings.simplefilter("ignore") - rslt1 = glmm1.fit_map(minim_opts={"gtol": 1e-4, "maxiter": 5}) + rslt1 = glmm1.fit_map() glmm2 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident) with warnings.catch_warnings(): warnings.simplefilter("ignore") - rslt2 = glmm2.fit_vb(mean=rslt1.params, minim_opts={"maxiter": 2}) + rslt2 = glmm2.fit_vb(rslt1.params) rslt1.summary() rslt2.summary() assert_allclose(rslt1.params[0:5], np.r_[ - 0.64644962, -0.61266869, -1., -0.00961027, 0.02411796], + 0.75330405, -0.71643228, -1., -0.00959806, 0.00450254], rtol=1e-4, atol=1e-4) assert_allclose(rslt2.params[0:5], np.r_[ - 0.9017295, -0.95958884, -0.70822657, -0.00711374, 0.02673195], + 0.79338836, -0.7599833, -0.64149356, -0.24772884, 0.10775366], rtol=1e-4, atol=1e-4) def test_logit_vb_crossed(): - y, exog_fe, exog_vc, ident = gen_logit_crossed(1, 2) + y, exog_fe, exog_vc, ident = gen_logit_crossed(10, 10, 1, 2) exog_vc = sparse.csr_matrix(exog_vc) glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, family=sm.families.Binomial()) with 
warnings.catch_warnings(): warnings.simplefilter("ignore") - rslt1 = glmm1.fit_map(minim_opts={"gtol": 1e-4, "maxiter": 2}) + rslt1 = glmm1.fit_map() glmm2 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident) with warnings.catch_warnings(): warnings.simplefilter("ignore") - rslt2 = glmm2.fit_vb(mean=rslt1.params, minim_opts={"maxiter": 2}) + rslt2 = glmm2.fit_vb(mean=rslt1.params) rslt1.summary() rslt2.summary() assert_allclose(rslt1.params[0:5], np.r_[ - -0.84192649, 0.81152304, 0.81056098, -0.76727982, -0.94713751], + -0.54307398, -1., -1., -0.0096403, 0.00232701], rtol=1e-4, atol=1e-4) assert_allclose(rslt2.params[0:5], np.r_[ - -0.68311938, 0.75472554, 0.75218755, -0.71387273, -0.76462306], + -0.70834417, -0.3571011, 0.19126823, -0.36074489, 0.058976], rtol=1e-4, atol=1e-4) From dbacfbf7390237f286448d50a6bbdc6e831d94ce Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 4 Jan 2018 19:38:28 -0500 Subject: [PATCH 038/157] rename to specify bayes --- statsmodels/regression/{mixed_glm.py => bayes_mixed_glm.py} | 0 .../tests/{test_mixed_glm.py => test_bayes_mixed_glm.py} | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename statsmodels/regression/{mixed_glm.py => bayes_mixed_glm.py} (100%) rename statsmodels/regression/tests/{test_mixed_glm.py => test_bayes_mixed_glm.py} (98%) diff --git a/statsmodels/regression/mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py similarity index 100% rename from statsmodels/regression/mixed_glm.py rename to statsmodels/regression/bayes_mixed_glm.py diff --git a/statsmodels/regression/tests/test_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py similarity index 98% rename from statsmodels/regression/tests/test_mixed_glm.py rename to statsmodels/regression/tests/test_bayes_mixed_glm.py index 0ff7a8ec25a..740f4cd1da9 100644 --- a/statsmodels/regression/tests/test_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -1,5 +1,5 @@ import numpy as np -from statsmodels.regression.mixed_glm import ( +from statsmodels.regression.bayes_mixed_glm import ( BayesMixedGLM, BinomialBayesMixedGLM) import statsmodels.api as sm from scipy import sparse From 371aed75cd434b5c0cf4bec8aae183db6f5ca04b Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 4 Jan 2018 20:07:21 -0500 Subject: [PATCH 039/157] docstring work: --- statsmodels/regression/bayes_mixed_glm.py | 42 +++++++++++++++-------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 8c6974ad9ee..1e392d14cd2 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -20,14 +20,9 @@ [0.0666713443086881, 0.9739065285171717]] -class BayesMixedGLM(object): - """ +_init_doc = r""" Fit a generalized linear mixed model using Bayesian methods. - - The class implements the Laplace approximation to the posterior - distribution. See subclasses, e.g. BinomialBayesMixedGLM for - other estimation approaches. - +{fit_method} Parameters ---------- endog : array-like @@ -85,6 +80,21 @@ class BayesMixedGLM(object): and standard deviation `fe_p`. """ +_laplace_fit_method = """ + The class implements the Laplace approximation to the posterior + distribution. See subclasses, e.g. BinomialBayesMixedGLM for + other estimation approaches. +""" + +_vb_fit_method = """ + The class implements a variational Bayes approximation to the + posterior. See the docstring to `fit_vb` for more information. 
+""" + +class BayesMixedGLM(object): + + __doc__ = _init_doc.format(fit_method=_laplace_fit_method) + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5, fe_p=0.5, family=None, fep_names=None, vcp_names=None): @@ -271,6 +281,9 @@ def grad(params): return BayesMixedGLMResults(self, r.x, r.hess_inv, optim_retvals=r) + +class _VariationalBayesMixedGLM(BayesMixedGLM): + # Returns the mean and variance of the linear predictor under the # given distribution parameters. def _lp_stats(self, fep_mean, fep_sd, vc_mean, vc_sd): @@ -284,7 +297,7 @@ def _lp_stats(self, fep_mean, fep_sd, vc_mean, vc_sd): def fit_vb(self, mean=None, sd=None, minim_opts=None, verbose=False): """ - Fit a model using variational Bayes. + Fit a model using the variational Bayes mean field approximation. Parameters: ----------- @@ -294,6 +307,9 @@ def fit_vb(self, mean=None, sd=None, minim_opts=None, verbose=False): Starting value for VB standard deviation vector minim_opts : dict-like Options passed to scipy.minimize + verbose : bool + If True, print the gradient norm to the screen each time + it is calculated. Notes ----- @@ -312,12 +328,6 @@ def fit_vb(self, mean=None, sd=None, minim_opts=None, verbose=False): https://arxiv.org/pdf/1601.00670.pdf """ - if type(self) is BayesMixedGLM: - msg = ("To fit the model using variational Bayes, create a " + - "class for the appropriate family type, e.g. " + - "BinomialBayesMixedGLM.") - raise ValueError(msg) - self.verbose = verbose n = self.k_fep + self.k_vcp + self.k_vc @@ -420,7 +430,9 @@ def summary(self): return summ -class BinomialBayesMixedGLM(BayesMixedGLM): +class BinomialBayesMixedGLM(_VariationalBayesMixedGLM): + + __doc__ = _init_doc.format(fit_method=_vb_fit_method) # Integration range (from -rng to +rng). The integrals are with # respect to a standard Gaussian distribution so (-5, 5) will be From 77e341163588e7992cfe31a3f3d0219e253a5580 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 4 Jan 2018 20:41:17 -0500 Subject: [PATCH 040/157] remove common pieces to a superclass --- statsmodels/regression/bayes_mixed_glm.py | 90 +++++++++++++++-------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 1e392d14cd2..87886e25d52 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -91,6 +91,7 @@ posterior. See the docstring to `fit_vb` for more information. """ + class BayesMixedGLM(object): __doc__ = _init_doc.format(fit_method=_laplace_fit_method) @@ -371,6 +372,52 @@ def elbo_grad(x): n = len(mm.x) // 2 return BayesMixedGLMResults(self, mm.x[0:n], np.exp(2*mm.x[n:]), mm) + # Handle terms in the ELBO that are common to all models. 
+ def _elbo_common(self, fep_mean, fep_sd, vcp_mean, vcp_sd, vc_mean, vc_sd): + + iv = 0 + + # p(vc | vcp) contributions + m = vcp_mean[self.ident] + s = vcp_sd[self.ident] + iv -= np.sum((vc_mean**2 + vc_sd**2) * np.exp(-2*m + 2*s**2)) / 2 + iv -= np.sum(m) + + # p(vcp) contributions + iv -= 0.5 * (vcp_mean**2 + vcp_sd**2).sum() / self.vcp_p**2 + + # p(b) contributions + iv -= 0.5 * (fep_mean**2 + fep_sd**2).sum() / self.fe_p**2 + + return iv + + def _elbo_grad_common(self, fep_mean, fep_sd, vcp_mean, vcp_sd, + vc_mean, vc_sd): + + # p(vc | vcp) contributions + m = vcp_mean[self.ident] + s = vcp_sd[self.ident] + u = vc_mean**2 + vc_sd**2 + ve = np.exp(2*(s**2 - m)) + dm = u * ve - 1 + ds = -2 * u * ve * s + vcp_mean_grad = np.bincount(self.ident, weights=dm) + vcp_sd_grad = np.bincount(self.ident, weights=ds) + + vc_mean_grad = -vc_mean.copy() * ve + vc_sd_grad = -vc_sd.copy() * ve + + # p(vcp) contributions + vcp_mean_grad -= vcp_mean / self.vcp_p**2 + vcp_sd_grad -= vcp_sd / self.vcp_p**2 + + # p(b) contributions + fep_mean_grad = -fep_mean.copy() / self.fe_p**2 + fep_sd_grad = -fep_sd.copy() / self.fe_p**2 + + return (fep_mean_grad, fep_sd_grad, vcp_mean_grad, vcp_sd_grad, + vc_mean_grad, vc_sd_grad) + class BayesMixedGLMResults(object): @@ -474,23 +521,14 @@ def h(z): iv += self.endog * tm iv = iv.sum() - # p(vc | vcp) contributions - m = vcp_mean[self.ident] - s = vcp_sd[self.ident] - iv -= np.sum((vc_mean**2 + vc_sd**2) * np.exp(-2*m + 2*s**2)) / 2 - iv -= np.sum(m) - - # p(vcp) contributions - iv -= 0.5 * (vcp_mean**2 + vcp_sd**2).sum() / self.vcp_p**2 - - # p(b) contributions - iv -= 0.5 * (fep_mean**2 + fep_sd**2).sum() / self.fe_p**2 + iv += self._elbo_common(fep_mean, fep_sd, vcp_mean, vcp_sd, + vc_mean, vc_sd) return iv + np.sum(np.log(vb_sd)) def vb_elbo_grad(self, vb_mean, vb_sd): """ - Returns the gradient of the evidence lower bound (ELBO). + Returns the gradient of the model's evidence lower bound (ELBO). 
""" fep_mean, vcp_mean, vc_mean = self._unpack(vb_mean) @@ -536,26 +574,16 @@ def h1(z): fep_mean_grad += np.dot(self.endog, self.exog_fe) vc_mean_grad += self.exog_vc.transpose().dot(self.endog) - # p(vc | vcp) contributions - m = vcp_mean[self.ident] - s = vcp_sd[self.ident] - u = vc_mean**2 + vc_sd**2 - ve = np.exp(2*(s**2 - m)) - dm = u * ve - 1 - ds = -2 * u * ve * s - vcp_mean_grad += np.bincount(self.ident, weights=dm) - vcp_sd_grad += np.bincount(self.ident, weights=ds) - - vc_mean_grad -= vc_mean * ve - vc_sd_grad -= vc_sd * ve + (fep_mean_grad_i, fep_sd_grad_i, vcp_mean_grad_i, vcp_sd_grad_i, + vc_mean_grad_i, vc_sd_grad_i) = self._elbo_grad_common( + fep_mean, fep_sd, vcp_mean, vcp_sd, vc_mean, vc_sd) - # p(vcp) contributions - vcp_mean_grad -= vcp_mean / self.vcp_p**2 - vcp_sd_grad -= vcp_sd / self.vcp_p**2 - - # p(b) contributions - fep_mean_grad -= fep_mean / self.fe_p**2 - fep_sd_grad -= fep_sd / self.fe_p**2 + fep_mean_grad += fep_mean_grad_i + fep_sd_grad += fep_sd_grad_i + vcp_mean_grad += vcp_mean_grad_i + vcp_sd_grad += vcp_sd_grad_i + vc_mean_grad += vc_mean_grad_i + vc_sd_grad += vc_sd_grad_i mean_grad = np.concatenate((fep_mean_grad, vcp_mean_grad, vc_mean_grad)) From a99f3dfe5d521e03ca42b984d23bc95402a39a70 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 4 Jan 2018 22:09:34 -0500 Subject: [PATCH 041/157] modify to work with older scipy --- statsmodels/regression/bayes_mixed_glm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 87886e25d52..b2a5892938e 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -292,7 +292,7 @@ def _lp_stats(self, fep_mean, fep_sd, vc_mean, vc_sd): tm = np.dot(self.exog_fe, fep_mean) tv = np.dot(self.exog_fe**2, fep_sd**2) tm += self.exog_vc.dot(vc_mean) - tv += self.exog_vc.power(2).dot(vc_sd**2) + tv += self.exog_vc2.dot(vc_sd**2) return tm, tv @@ -497,6 +497,9 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5, family=statsmodels.genmod.families.Binomial(), fep_names=fep_names, vcp_names=vcp_names) + # power would be better but not available in older scipy + self.exog_vc2 = self.exog_vc.multiply(self.exog_vc) + def vb_elbo(self, vb_mean, vb_sd): """ Returns the evidence lower bound (ELBO) for the model. 
@@ -564,9 +567,10 @@ def h1(z):
             fep_mean_grad += w[0] * np.dot(u, self.exog_fe)
             vc_mean_grad += w[0] * self.exog_vc.transpose().dot(u)
             fep_sd_grad += w[0] * x * np.dot(r, self.exog_fe**2 * fep_sd)
-            v = self.exog_vc.power(2).multiply(vc_sd).transpose().dot(r)
+            v = self.exog_vc2.multiply(vc_sd).transpose().dot(r)
             v = np.squeeze(np.asarray(v))
             vc_sd_grad += w[0] * x * v
+
         fep_mean_grad *= -self.rng
         vc_mean_grad *= -self.rng
         fep_sd_grad *= -self.rng

From 32d4a8a316639c743d6642a3f78513e1307db22a Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Thu, 4 Jan 2018 22:13:14 -0500
Subject: [PATCH 042/157] avoid repeated calculations

---
 statsmodels/regression/bayes_mixed_glm.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py
index b2a5892938e..02091fba252 100644
--- a/statsmodels/regression/bayes_mixed_glm.py
+++ b/statsmodels/regression/bayes_mixed_glm.py
@@ -285,6 +285,17 @@ class _VariationalBayesMixedGLM(BayesMixedGLM):

+    def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5,
+                 fe_p=0.5, family=None, fep_names=None, vcp_names=None):
+
+        super(_VariationalBayesMixedGLM, self).__init__(
+            endog=endog, exog_fe=exog_fe, exog_vc=exog_vc,
+            ident=ident, vcp_p=vcp_p, fe_p=fe_p,
+            family=family, fep_names=fep_names, vcp_names=vcp_names)
+
+        # power would be better but not available in older scipy
+        self.exog_vc2 = self.exog_vc.multiply(self.exog_vc)
+
     # Returns the mean and variance of the linear predictor under the
     # given distribution parameters.
     def _lp_stats(self, fep_mean, fep_sd, vc_mean, vc_sd):
@@ -497,9 +508,6 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5,
             family=statsmodels.genmod.families.Binomial(),
             fep_names=fep_names, vcp_names=vcp_names)

-        # power would be better but not available in older scipy
-        self.exog_vc2 = self.exog_vc.multiply(self.exog_vc)
-
     def vb_elbo(self, vb_mean, vb_sd):
         """
         Returns the evidence lower bound (ELBO) for the model.

From 2d07ba8ee904b0325375cb5742431b3672138923 Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Thu, 4 Jan 2018 22:34:25 -0500
Subject: [PATCH 043/157] add docstring for results

---
 statsmodels/regression/bayes_mixed_glm.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py
index 02091fba252..5b2d837c0a8 100644
--- a/statsmodels/regression/bayes_mixed_glm.py
+++ b/statsmodels/regression/bayes_mixed_glm.py
@@ -431,6 +431,24 @@ def _elbo_grad_common(self, fep_mean, fep_sd, vcp_mean, vcp_sd,

 class BayesMixedGLMResults(object):
+    """
+    Attributes
+    ----------
+    fe_mean : array-like
+        Posterior mean of the fixed effects coefficients.
+    fe_sd : array-like
+        Posterior standard deviation of the fixed effects coefficients
+    vcp_mean : array-like
+        Posterior mean of the logged variance component standard
+        deviations.
+    vcp_sd : array-like
+        Posterior standard deviation of the logged variance component
+        standard deviations.
+ vc_mean : array-like + Posterior mean of the random coefficients + vc_sd : array-like + Posterior standard deviation of the random coefficients + """ def __init__(self, model, params, cov_params, optim_retvals=None): @@ -440,7 +458,7 @@ def __init__(self, model, params, cov_params, self.cov_params = cov_params self.optim_retvals = optim_retvals - self.fe_params, self.vcp_params, self.vc_params = ( + self.fe_mean, self.vcp_mean, self.vc_mean = ( model._unpack(params)) if cov_params.ndim == 2: From bfaa13e4b7be6e9643fd6835ee0895b70a9b63ae Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 5 Jan 2018 17:31:31 -0500 Subject: [PATCH 044/157] change prior defaults --- statsmodels/regression/bayes_mixed_glm.py | 17 ++++--- .../regression/tests/test_bayes_mixed_glm.py | 45 +++++++++---------- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 5b2d837c0a8..42545f2b0c8 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -74,7 +74,9 @@ The random effect standard deviation parameters (vcp) have log-normal prior distributions with mean 0 and standard deviation - `vcp_p`. + `vcp_p`. Note that for some families, e.g. Binomial, the + posterior mode may be difficult to find numerically if `vcp_p` is + set too large. Setting `vcp_p` to 0.5 seems to work well. The prior for the fixed effects parameters is Gaussian with mean 0 and standard deviation `fe_p`. @@ -96,8 +98,8 @@ class BayesMixedGLM(object): __doc__ = _init_doc.format(fit_method=_laplace_fit_method) - def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5, - fe_p=0.5, family=None, fep_names=None, + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=2, + fe_p=2, family=None, fep_names=None, vcp_names=None): if family is None: @@ -251,6 +253,7 @@ def logposterior_grad(self, params): te[0] -= fep / self.fe_p**2 te = [x for x in te if x is not None] + return np.concatenate(te) def _get_start(self): @@ -285,8 +288,8 @@ def grad(params): class _VariationalBayesMixedGLM(BayesMixedGLM): - def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5, - fe_p=0.5, family=None, fep_names=None, vcp_names=None): + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=2, + fe_p=2, family=None, fep_names=None, vcp_names=None): super(_VariationalBayesMixedGLM, self).__init__( endog=endog, exog_fe=exog_fe, exog_vc=exog_vc, @@ -517,8 +520,8 @@ class BinomialBayesMixedGLM(_VariationalBayesMixedGLM): verbose = False - def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=0.5, - fe_p=0.5, fep_names=None, vcp_names=None): + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=2, + fe_p=2, fep_names=None, vcp_names=None): super(BinomialBayesMixedGLM, self).__init__( endog=endog, exog_fe=exog_fe, exog_vc=exog_vc, diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 740f4cd1da9..7f6ce08a2b8 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -5,7 +5,6 @@ from scipy import sparse from numpy.testing import assert_allclose from scipy.optimize import approx_fprime -import warnings def gen_simple_logit(nc, cs, s): @@ -49,11 +48,12 @@ def test_logit_map(): exog_vc = sparse.csr_matrix(exog_vc) glmm = BayesMixedGLM(y, exog_fe, exog_vc, ident, - family=sm.families.Binomial()) - rslt = glmm.fit_map(minim_opts={"gtol": 1e-4}) + 
family=sm.families.Binomial(), + vcp_p=0.5) + rslt = glmm.fit_map() assert_allclose(glmm.logposterior_grad(rslt.params), - np.zeros_like(rslt.params), atol=1e-4) + np.zeros_like(rslt.params), atol=1e-3) def test_logit_map_crossed(): @@ -62,8 +62,9 @@ def test_logit_map_crossed(): exog_vc = sparse.csr_matrix(exog_vc) glmm = BayesMixedGLM(y, exog_fe, exog_vc, ident, - family=sm.families.Binomial()) - rslt = glmm.fit_map(minim_opts={"gtol": 1e-4}) + family=sm.families.Binomial(), + vcp_p=0.5) + rslt = glmm.fit_map() assert_allclose(glmm.logposterior_grad(rslt.params), np.zeros_like(rslt.params), atol=1e-4) @@ -80,7 +81,7 @@ def test_logit_elbo_grad(): exog_vc = sparse.csr_matrix(exog_vc) - glmm1 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident) + glmm1 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5) rslt1 = glmm1.fit_map(minim_opts={"gtol": 1e-4}) n = glmm1.k_fep + glmm1.k_vcp + glmm1.k_vc @@ -119,16 +120,13 @@ def test_logit_vb(): y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 0) exog_vc = sparse.csr_matrix(exog_vc) - glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, - family=sm.families.Binomial()) - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - rslt1 = glmm1.fit_map() + glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, + fe_p=0.5, family=sm.families.Binomial()) + rslt1 = glmm1.fit_map() - glmm2 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident) - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - rslt2 = glmm2.fit_vb(rslt1.params) + glmm2 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, + fe_p=0.5) + rslt2 = glmm2.fit_vb(rslt1.params) rslt1.summary() rslt2.summary() @@ -147,16 +145,13 @@ def test_logit_vb_crossed(): y, exog_fe, exog_vc, ident = gen_logit_crossed(10, 10, 1, 2) exog_vc = sparse.csr_matrix(exog_vc) - glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, - family=sm.families.Binomial()) - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - rslt1 = glmm1.fit_map() + glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, + fe_p=0.5, family=sm.families.Binomial()) + rslt1 = glmm1.fit_map() - glmm2 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident) - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - rslt2 = glmm2.fit_vb(mean=rslt1.params) + glmm2 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, + fe_p=0.5) + rslt2 = glmm2.fit_vb(mean=rslt1.params) rslt1.summary() rslt2.summary() From ac7ef804b56d7c042a588b947374f12867a62adc Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 5 Jan 2018 19:01:06 -0500 Subject: [PATCH 045/157] add formula support --- statsmodels/regression/bayes_mixed_glm.py | 83 ++++++++++++++++++- .../regression/tests/test_bayes_mixed_glm.py | 66 ++++++++++++++- 2 files changed, 145 insertions(+), 4 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 42545f2b0c8..0dd5dc60651 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -6,6 +6,7 @@ import pandas as pd import statsmodels import warnings +import patsy # Gauss-Legendre weights glw = [[0.2955242247147529, -0.1488743389816312], @@ -98,7 +99,7 @@ class BayesMixedGLM(object): __doc__ = _init_doc.format(fit_method=_laplace_fit_method) - def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=2, + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2, family=None, fep_names=None, vcp_names=None): @@ -263,6 +264,69 @@ def _get_start(self): 
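    # The packed vector built above uses the same [fep | vcp | vc]
    # layout that _unpack expects; as a sketch, with k_fep=2, k_vcp=1
    # and k_vc=3, a length-6 vector vec splits as
    #
    #     fep, vcp, vc = vec[:2], vec[2:3], vec[3:]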
start = np.concatenate((start_fep, start_vcp, start_vc)) return start + @classmethod + def from_formula(cls, formula, vc_formulas, data, family=None, + vcp_p=1, fe_p=2, vcp_names=None): + + """ + Fit a BayesMixedGLM using a formula. + + Parameters + ---------- + formula : string + Formula for the endog and fixed effects terms (use ~ to separate + dependent and independent expressions). + vc_formula : list of strings + Each element of the list is a one-sided formula that + creates one collection of random effects with a common + variance prameter. If using a categorical expression to + produce variance components, note that generally `0 + ...` + should be used so that an intercept is not included. + data : data frame + The data to which the formulas are applied. + family : genmod.families instance + A GLM family. + vcp_p : float + The prior standard deviation for the logarithms of the standard + deviations of the random effects. + fe_p : float + The prior standard deviation for the fixed effects parameters. + vcp_names : list + Names of variance component parameters + """ + + if not type(vc_formulas) is list: + vc_formulas = [vc_formulas] + + endog, exog_fe = patsy.dmatrices(formula, data, + return_type='dataframe') + + ident = [] + exog_vc = [] + for j, fml in enumerate(vc_formulas): + mat = patsy.dmatrix(fml, data, return_type='dataframe') + exog_vc.append(mat) + ident.append(j * np.ones(mat.shape[1])) + exog_vc = pd.concat(exog_vc, axis=1) + + if vcp_names is None: + vcp_names = ["VC_%d" % (k + 1) for k in range(len(vc_formulas))] + else: + vcp_names = exog_vc.columns.tolist() + ident = np.concatenate(ident) + + endog = np.squeeze(np.asarray(endog)) + + fep_names = exog_fe.columns.tolist() + exog_fe = np.asarray(exog_fe) + exog_vc = sparse.csr_matrix(np.asarray(exog_vc)) + + mod = BayesMixedGLM(endog, exog_fe, exog_vc, ident, vcp_p, fe_p, + family=family, fep_names=fep_names, + vcp_names=vcp_names) + + return mod + def fit_map(self, method="BFGS", minim_opts=None): """ Construct the Laplace approximation to the posterior @@ -288,7 +352,7 @@ def grad(params): class _VariationalBayesMixedGLM(BayesMixedGLM): - def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=2, + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2, family=None, fep_names=None, vcp_names=None): super(_VariationalBayesMixedGLM, self).__init__( @@ -520,7 +584,7 @@ class BinomialBayesMixedGLM(_VariationalBayesMixedGLM): verbose = False - def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=2, + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2, fep_names=None, vcp_names=None): super(BinomialBayesMixedGLM, self).__init__( @@ -529,6 +593,19 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=2, family=statsmodels.genmod.families.Binomial(), fep_names=fep_names, vcp_names=vcp_names) + @classmethod + def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2): + + fam = statsmodels.genmod.families.Binomial() + x = BayesMixedGLM.from_formula(formula, vc_formulas, data, + family=fam, vcp_p=vcp_p, fe_p=fe_p) + + return BinomialBayesMixedGLM(endog=x.endog, exog_fe=x.exog_fe, + exog_vc=x.exog_vc, ident=x.ident, + vcp_p=x.vcp_p, fe_p=x.fe_p, + fep_names=x.fep_names, + vcp_names=x.vcp_names) + def vb_elbo(self, vb_mean, vb_sd): """ Returns the evidence lower bound (ELBO) for the model. 
diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 7f6ce08a2b8..5d318a42eca 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -2,6 +2,7 @@ from statsmodels.regression.bayes_mixed_glm import ( BayesMixedGLM, BinomialBayesMixedGLM) import statsmodels.api as sm +import pandas as pd from scipy import sparse from numpy.testing import assert_allclose from scipy.optimize import approx_fprime @@ -42,6 +43,34 @@ def gen_logit_crossed(nc, cs, s1, s2): return y, exog_fe, exog_vc, ident +def gen_logit_crossed_pandas(nc, cs, s1, s2): + + np.random.seed(3799) + + a = np.kron(np.arange(nc), np.ones(cs)) + b = np.kron(np.ones(cs), np.arange(nc)) + fe = np.ones(nc * cs) + + vc = np.zeros(nc * cs) + for i in np.unique(a): + ii = np.flatnonzero(a == i) + vc[ii] += s1*np.random.normal() + for i in np.unique(b): + ii = np.flatnonzero(b == i) + vc[ii] += s2*np.random.normal() + + lp = -0.5 * fe + vc + pr = 1 / (1 + np.exp(-lp)) + y = 1*(np.random.uniform(size=nc*cs) < pr) + + ident = np.zeros(2*nc, dtype=np.int) + ident[nc:] = 1 + + df = pd.DataFrame({"fe": fe, "a": a, "b": b, "y": y}) + + return df + + def test_logit_map(): y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 2) @@ -70,6 +99,22 @@ def test_logit_map_crossed(): np.zeros_like(rslt.params), atol=1e-4) +def test_logit_map_crosed_formula(): + + data = gen_logit_crossed_pandas(10, 10, 1, 2) + + fml = "y ~ fe" + fml_vc = ["0 + C(a)", "0 + C(b)"] + glmm = BayesMixedGLM.from_formula( + fml, fml_vc, data, family=sm.families.Binomial(), vcp_p=0.5) + rslt = glmm.fit_map() + + assert_allclose(glmm.logposterior_grad(rslt.params), + np.zeros_like(rslt.params), atol=1e-4) + + rslt.summary() + + def test_logit_elbo_grad(): for j in range(2): @@ -143,7 +188,6 @@ def test_logit_vb(): def test_logit_vb_crossed(): y, exog_fe, exog_vc, ident = gen_logit_crossed(10, 10, 1, 2) - exog_vc = sparse.csr_matrix(exog_vc) glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, fe_p=0.5, family=sm.families.Binomial()) @@ -163,3 +207,23 @@ def test_logit_vb_crossed(): assert_allclose(rslt2.params[0:5], np.r_[ -0.70834417, -0.3571011, 0.19126823, -0.36074489, 0.058976], rtol=1e-4, atol=1e-4) + + +def test_logit_vb_crossed_formula(): + + data = gen_logit_crossed_pandas(10, 10, 1, 2) + + fml = "y ~ fe" + fml_vc = ["0 + C(a)", "0 + C(b)"] + glmm1 = BinomialBayesMixedGLM.from_formula( + fml, fml_vc, data, vcp_p=0.5) + rslt1 = glmm1.fit_vb() + + glmm2 = BinomialBayesMixedGLM(glmm1.endog, glmm1.exog_fe, glmm1.exog_vc, + glmm1.ident, vcp_p=0.5) + rslt2 = glmm2.fit_vb() + + assert_allclose(rslt1.params, rslt2.params, atol=1e-4) + + rslt1.summary() + rslt2.summary() From 45782c2566c656a7c97d54763ab5857f2b87983a Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 5 Jan 2018 21:23:53 -0500 Subject: [PATCH 046/157] check passed starting values for proper length --- statsmodels/regression/bayes_mixed_glm.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 0dd5dc60651..43ed92dd09d 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -410,13 +410,21 @@ def fit_vb(self, mean=None, sd=None, minim_opts=None, verbose=False): self.verbose = verbose n = self.k_fep + self.k_vcp + self.k_vc + ml = self.k_fep + self.k_vcp + self.k_vc if mean is None: m = 
np.zeros(n) else: + if len(mean) != ml: + raise ValueError("mean has incorrect length, %d != %d" % + (len(mean), ml)) m = mean if sd is None: s = -0.5 + 0.1 * np.random.normal(size=n) else: + if len(sd) != ml: + raise ValueError("sd has incorrect length, %d != %d" % + (len(sd), ml)) + # s is parameterized on the log-scale internally when # optimizing the ELBO function (this is transparent to the # caller) @@ -594,11 +602,13 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fep_names=fep_names, vcp_names=vcp_names) @classmethod - def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2): + def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2, + vcp_names=None): fam = statsmodels.genmod.families.Binomial() x = BayesMixedGLM.from_formula(formula, vc_formulas, data, - family=fam, vcp_p=vcp_p, fe_p=fe_p) + family=fam, vcp_p=vcp_p, fe_p=fe_p, + vcp_names=vcp_names) return BinomialBayesMixedGLM(endog=x.endog, exog_fe=x.exog_fe, exog_vc=x.exog_vc, ident=x.ident, From a5c2f19c54dac693c1ef7120e74ec66872d77f94 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sat, 6 Jan 2018 14:52:00 -0500 Subject: [PATCH 047/157] add poisson family for vb --- statsmodels/regression/bayes_mixed_glm.py | 170 ++++++++++++- .../regression/tests/test_bayes_mixed_glm.py | 224 ++++++++++++++---- 2 files changed, 336 insertions(+), 58 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 43ed92dd09d..9f9006878a2 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -62,8 +62,9 @@ There are three types of values in the posterior distribution: fixed effects parameters (fep), corresponding to the columns of `exog_fe`, random effects realizations (vc), corresponding to the - columns of `exog_vc`, and the variances of the random effects - realizations (vcp), corresponding to the unique labels in `ident`. + columns of `exog_vc`, and the standard deviations of the random + effects realizations (vcp), corresponding to the unique labels in + `ident`. All random effects are modeled as being independent Gaussian values (given the variance parameters). Every column of `exog_vc` @@ -75,9 +76,11 @@ The random effect standard deviation parameters (vcp) have log-normal prior distributions with mean 0 and standard deviation - `vcp_p`. Note that for some families, e.g. Binomial, the - posterior mode may be difficult to find numerically if `vcp_p` is - set too large. Setting `vcp_p` to 0.5 seems to work well. + `vcp_p`. + + Note that for some families, e.g. Binomial, the posterior mode may + be difficult to find numerically if `vcp_p` is set to too large of + a value. Setting `vcp_p` to 0.5 seems to work well. The prior for the fixed effects parameters is Gaussian with mean 0 and standard deviation `fe_p`. 
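# As a rough calibration of this prior (a sketch, not part of the
# code): if log(s) ~ N(0, vcp_p**2), the central 95% prior interval
# for a random-effect standard deviation s is exp(+/- 1.96 * vcp_p).
#
#     import numpy as np
#     np.exp(np.r_[-1.96, 1.96] * 0.5)   # vcp_p = 0.5 -> about [0.38, 2.66]
#
# so even vcp_p = 0.5 admits a fairly wide range of plausible scales.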
@@ -118,11 +121,8 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, # Get the variance parameter names if vcp_names is None: - if hasattr(exog_vc, "columns"): - vcp_names = exog_vc.columns.tolist() - else: - vcp_names = ["VC_%d" % (k + 1) - for k in range(int(max(ident)) + 1)] + vcp_names = ["VC_%d" % (k + 1) + for k in range(int(max(ident)) + 1)] self.vcp_names = vcp_names self.endog = np.asarray(endog) @@ -312,7 +312,7 @@ def from_formula(cls, formula, vc_formulas, data, family=None, if vcp_names is None: vcp_names = ["VC_%d" % (k + 1) for k in range(len(vc_formulas))] else: - vcp_names = exog_vc.columns.tolist() + vcp_names = vcp_names ident = np.concatenate(ident) endog = np.squeeze(np.asarray(endog)) @@ -351,6 +351,10 @@ def grad(params): class _VariationalBayesMixedGLM(BayesMixedGLM): + """ + A private base class for family-specific variational Bayes GLM + implementations. + """ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2, family=None, fep_names=None, vcp_names=None): @@ -404,6 +408,8 @@ def fit_vb(self, mean=None, sd=None, minim_opts=None, verbose=False): References ---------- + Blei, Kucukelbir, McAuliffe (2017). Variational Inference: A + review for Statisticians https://arxiv.org/pdf/1601.00670.pdf """ @@ -417,7 +423,7 @@ def fit_vb(self, mean=None, sd=None, minim_opts=None, verbose=False): if len(mean) != ml: raise ValueError("mean has incorrect length, %d != %d" % (len(mean), ml)) - m = mean + m = mean.copy() if sd is None: s = -0.5 + 0.1 * np.random.normal(size=n) else: @@ -466,7 +472,7 @@ def _elbo_common(self, fep_mean, fep_sd, vcp_mean, vcp_sd, vc_mean, vc_sd): # p(vc | vcp) contributions m = vcp_mean[self.ident] s = vcp_sd[self.ident] - iv -= np.sum((vc_mean**2 + vc_sd**2) * np.exp(-2*m + 2*s**2)) / 2 + iv -= np.sum((vc_mean**2 + vc_sd**2) * np.exp(2*(s**2 - m))) / 2 iv -= np.sum(m) # p(vcp) contributions @@ -716,3 +722,141 @@ def h1(z): np.sum(sd_grad**2))) return mean_grad, sd_grad + + +class PoissonBayesMixedGLM(_VariationalBayesMixedGLM): + + __doc__ = _init_doc.format(fit_method=_vb_fit_method) + + # Integration range (from -rng to +rng). The integrals are with + # respect to a standard Gaussian distribution so (-5, 5) will be + # sufficient in many cases. + rng = 5 + + verbose = False + + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, + fe_p=2, fep_names=None, vcp_names=None): + + super(PoissonBayesMixedGLM, self).__init__( + endog=endog, exog_fe=exog_fe, exog_vc=exog_vc, + ident=ident, vcp_p=vcp_p, fe_p=fe_p, + family=statsmodels.genmod.families.Poisson(), + fep_names=fep_names, vcp_names=vcp_names) + + @classmethod + def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2, + vcp_names=None): + + fam = statsmodels.genmod.families.Poisson() + x = BayesMixedGLM.from_formula(formula, vc_formulas, data, + family=fam, vcp_p=vcp_p, fe_p=fe_p, + vcp_names=vcp_names) + + return PoissonBayesMixedGLM(endog=x.endog, exog_fe=x.exog_fe, + exog_vc=x.exog_vc, ident=x.ident, + vcp_p=x.vcp_p, fe_p=x.fe_p, + fep_names=x.fep_names, + vcp_names=x.vcp_names) + + def vb_elbo(self, vb_mean, vb_sd): + """ + Returns the evidence lower bound (ELBO) for the model. 
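+
+        For an observation y = k with linear predictor lp, the rate is
+        lam = exp(lp), and the Poisson log-likelihood (dropping the
+        constant -log(k!) term) can be written in the equivalent forms
+        below, the last of which is the one the quadrature uses: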
+ + -lam + k*log(lam) + + -exp(lp) + k*lp + + k*lp - exp(lp) + + """ + + fep_mean, vcp_mean, vc_mean = self._unpack(vb_mean) + fep_sd, vcp_sd, vc_sd = self._unpack(vb_sd) + + tm, tv = self._lp_stats(fep_mean, fep_sd, vc_mean, vc_sd) + + def h(z): + y = np.exp(tm + np.sqrt(tv)*z) + y *= np.exp(-z**2 / 2) + y /= np.sqrt(2*np.pi) + return y + + # p(y | vc) contributions + iv = 0 + for w in glw: + iv += h(self.rng * w[1]) * w[0] + iv *= -self.rng + iv += self.endog * tm + iv = iv.sum() + + iv += self._elbo_common(fep_mean, fep_sd, vcp_mean, vcp_sd, + vc_mean, vc_sd) + + r = iv + np.sum(np.log(vb_sd)) + return r + + def vb_elbo_grad(self, vb_mean, vb_sd): + """ + Returns the gradient of the model's evidence lower bound (ELBO). + """ + + fep_mean, vcp_mean, vc_mean = self._unpack(vb_mean) + fep_sd, vcp_sd, vc_sd = self._unpack(vb_sd) + + tm, tv = self._lp_stats(fep_mean, fep_sd, vc_mean, vc_sd) + + def h(z): + y = np.exp(tm + np.sqrt(tv)*z) + y *= np.exp(-z**2 / 2) + y /= np.sqrt(2*np.pi) + return y + + fep_mean_grad = 0. + fep_sd_grad = 0. + vcp_mean_grad = 0. + vcp_sd_grad = 0. + vc_mean_grad = 0. + vc_sd_grad = 0. + + # p(y | vc) contributions + for w in glw: + x = self.rng * w[1] + u = h(x) + r = u / np.sqrt(tv) + fep_mean_grad += w[0] * np.dot(u, self.exog_fe) + vc_mean_grad += w[0] * self.exog_vc.transpose().dot(u) + fep_sd_grad += w[0] * x * np.dot(r, self.exog_fe**2 * fep_sd) + v = self.exog_vc2.multiply(vc_sd).transpose().dot(r) + v = np.squeeze(np.asarray(v)) + vc_sd_grad += w[0] * x * v + + fep_mean_grad *= -self.rng + vc_mean_grad *= -self.rng + fep_sd_grad *= -self.rng + vc_sd_grad *= -self.rng + fep_mean_grad += np.dot(self.endog, self.exog_fe) + vc_mean_grad += self.exog_vc.transpose().dot(self.endog) + + (fep_mean_grad_i, fep_sd_grad_i, vcp_mean_grad_i, vcp_sd_grad_i, + vc_mean_grad_i, vc_sd_grad_i) = self._elbo_grad_common( + fep_mean, fep_sd, vcp_mean, vcp_sd, vc_mean, vc_sd) + + fep_mean_grad += fep_mean_grad_i + fep_sd_grad += fep_sd_grad_i + vcp_mean_grad += vcp_mean_grad_i + vcp_sd_grad += vcp_sd_grad_i + vc_mean_grad += vc_mean_grad_i + vc_sd_grad += vc_sd_grad_i + + mean_grad = np.concatenate((fep_mean_grad, vcp_mean_grad, + vc_mean_grad)) + sd_grad = np.concatenate((fep_sd_grad, vcp_sd_grad, vc_sd_grad)) + + sd_grad += 1 / vb_sd + + if self.verbose: + print("|G|=%f" % np.sqrt(np.sum(mean_grad**2) + + np.sum(sd_grad**2))) + + return mean_grad, sd_grad diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 5d318a42eca..1fffb7f4366 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -1,6 +1,6 @@ import numpy as np from statsmodels.regression.bayes_mixed_glm import ( - BayesMixedGLM, BinomialBayesMixedGLM) + BayesMixedGLM, BinomialBayesMixedGLM, PoissonBayesMixedGLM) import statsmodels.api as sm import pandas as pd from scipy import sparse @@ -23,7 +23,22 @@ def gen_simple_logit(nc, cs, s): return y, exog_fe, exog_vc, ident -def gen_logit_crossed(nc, cs, s1, s2): +def gen_simple_poisson(nc, cs, s): + + np.random.seed(3799) + + exog_vc = np.kron(np.eye(nc), np.ones((cs, 1))) + exog_fe = np.random.normal(size=(nc*cs, 2)) + vc = s*np.random.normal(size=nc) + lp = np.dot(exog_fe, np.r_[0.1, -0.1]) + np.dot(exog_vc, vc) + r = np.exp(lp) + y = np.random.poisson(r) + ident = np.zeros(nc, dtype=np.int) + + return y, exog_fe, exog_vc, ident + + +def gen_crossed_logit(nc, cs, s1, s2): np.random.seed(3799) @@ -43,7 +58,27 @@ def 
gen_logit_crossed(nc, cs, s1, s2): return y, exog_fe, exog_vc, ident -def gen_logit_crossed_pandas(nc, cs, s1, s2): +def gen_crossed_poisson(nc, cs, s1, s2): + + np.random.seed(3799) + + a = np.kron(np.eye(nc), np.ones((cs, 1))) + b = np.kron(np.ones((cs, 1)), np.eye(nc)) + exog_vc = np.concatenate((a, b), axis=1) + + exog_fe = np.random.normal(size=(nc*cs, 1)) + vc = s1 * np.random.normal(size=2*nc) + vc[nc:] *= s2 / s1 + lp = np.dot(exog_fe, np.r_[-0.5]) + np.dot(exog_vc, vc) + r = np.exp(lp) + y = np.random.poisson(r) + ident = np.zeros(2*nc, dtype=np.int) + ident[nc:] = 1 + + return y, exog_fe, exog_vc, ident + + +def gen_crossed_logit_pandas(nc, cs, s1, s2): np.random.seed(3799) @@ -71,7 +106,7 @@ def gen_logit_crossed_pandas(nc, cs, s1, s2): return df -def test_logit_map(): +def test_simple_logit_map(): y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 2) exog_vc = sparse.csr_matrix(exog_vc) @@ -85,9 +120,27 @@ def test_logit_map(): np.zeros_like(rslt.params), atol=1e-3) -def test_logit_map_crossed(): +def test_simple_poisson_map(): - y, exog_fe, exog_vc, ident = gen_logit_crossed(10, 10, 1, 2) + y, exog_fe, exog_vc, ident = gen_simple_poisson(10, 10, 0.2) + exog_vc = sparse.csr_matrix(exog_vc) + + glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, + family=sm.families.Poisson(), + vcp_p=0.5) + rslt1 = glmm1.fit_map() + assert_allclose(glmm1.logposterior_grad(rslt1.params), + np.zeros_like(rslt1.params), atol=1e-3) + + # This should give the same answer as above + glmm2 = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, + vcp_p=0.5) + rslt2 = glmm2.fit_map() + assert_allclose(rslt1.params, rslt2.params, atol=1e-4) + +def test_crossed_logit_map(): + + y, exog_fe, exog_vc, ident = gen_crossed_logit(10, 10, 1, 2) exog_vc = sparse.csr_matrix(exog_vc) glmm = BayesMixedGLM(y, exog_fe, exog_vc, ident, @@ -99,9 +152,23 @@ def test_logit_map_crossed(): np.zeros_like(rslt.params), atol=1e-4) +def test_crossed_poisson_map(): + + y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 2) + exog_vc = sparse.csr_matrix(exog_vc) + + glmm = BayesMixedGLM(y, exog_fe, exog_vc, ident, + family=sm.families.Poisson(), + vcp_p=0.5) + rslt = glmm.fit_map() + + assert_allclose(glmm.logposterior_grad(rslt.params), + np.zeros_like(rslt.params), atol=1e-4) + + def test_logit_map_crosed_formula(): - data = gen_logit_crossed_pandas(10, 10, 1, 2) + data = gen_crossed_logit_pandas(10, 10, 1, 2) fml = "y ~ fe" fml_vc = ["0 + C(a)", "0 + C(b)"] @@ -115,52 +182,70 @@ def test_logit_map_crosed_formula(): rslt.summary() -def test_logit_elbo_grad(): +def test_elbo_grad(): - for j in range(2): + for f in range(2): + for j in range(2): - if j == 0: - y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 2) - else: - y, exog_fe, exog_vc, ident = gen_logit_crossed(10, 10, 1, 2) + if f == 0: + if j == 0: + y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 2) + else: + y, exog_fe, exog_vc, ident = gen_crossed_logit( + 10, 10, 1, 2) + elif f == 1: + if j == 0: + y, exog_fe, exog_vc, ident = gen_simple_poisson( + 10, 10, 0.5) + else: + y, exog_fe, exog_vc, ident = gen_crossed_poisson( + 10, 10, 1, 0.5) - exog_vc = sparse.csr_matrix(exog_vc) + exog_vc = sparse.csr_matrix(exog_vc) - glmm1 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5) - rslt1 = glmm1.fit_map(minim_opts={"gtol": 1e-4}) + if f == 0: + glmm1 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, + vcp_p=0.5) + else: + glmm1 = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, + vcp_p=0.5) - n = glmm1.k_fep + glmm1.k_vcp + glmm1.k_vc + rslt1 = 
glmm1.fit_map() - for k in range(3): + n = glmm1.k_fep + glmm1.k_vcp + glmm1.k_vc - if k == 0: - vb_mean = rslt1.params - vb_sd = np.ones_like(vb_mean) - elif k == 1: - vb_mean = np.zeros(len(vb_mean)) - vb_sd = np.ones_like(vb_mean) - else: - vb_mean = np.random.normal(size=len(vb_mean)) - vb_sd = np.random.uniform(1, 2, size=len(vb_mean)) + for k in range(3): + + if k == 0: + vb_mean = rslt1.params + vb_sd = np.ones_like(vb_mean) + elif k == 1: + vb_mean = np.zeros(len(vb_mean)) + vb_sd = np.ones_like(vb_mean) + else: + vb_mean = np.random.normal(size=len(vb_mean)) + vb_sd = np.random.uniform(1, 2, size=len(vb_mean)) - mean_grad, sd_grad = glmm1.vb_elbo_grad(vb_mean, vb_sd) + mean_grad, sd_grad = glmm1.vb_elbo_grad(vb_mean, vb_sd) - def elbo(vec): - n = len(vec) // 2 - return glmm1.vb_elbo(vec[:n], vec[n:]) + def elbo(vec): + n = len(vec) // 2 + return glmm1.vb_elbo(vec[:n], vec[n:]) - x = np.concatenate((vb_mean, vb_sd)) - g1 = approx_fprime(x, elbo, 1e-5) - n = len(x) // 2 + x = np.concatenate((vb_mean, vb_sd)) + g1 = approx_fprime(x, elbo, 1e-5) + n = len(x) // 2 - mean_grad_n = g1[:n] - sd_grad_n = g1[n:] + mean_grad_n = g1[:n] + sd_grad_n = g1[n:] - assert_allclose(mean_grad, mean_grad_n, atol=1e-2, rtol=1e-2) - assert_allclose(sd_grad, sd_grad_n, atol=1e-2, rtol=1e-2) + assert_allclose(mean_grad, mean_grad_n, atol=1e-2, + rtol=1e-2) + assert_allclose(sd_grad, sd_grad_n, atol=1e-2, + rtol=1e-2) -def test_logit_vb(): +def test_simple_logit_vb(): y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 0) exog_vc = sparse.csr_matrix(exog_vc) @@ -177,7 +262,7 @@ def test_logit_vb(): rslt2.summary() assert_allclose(rslt1.params[0:5], np.r_[ - 0.75330405, -0.71643228, -1., -0.00959806, 0.00450254], + 0.75330405, -0.71643228, -2.49091288, -0.00959806, 0.00450254], rtol=1e-4, atol=1e-4) assert_allclose(rslt2.params[0:5], np.r_[ @@ -185,9 +270,33 @@ def test_logit_vb(): rtol=1e-4, atol=1e-4) -def test_logit_vb_crossed(): +def test_simple_poisson_vb(): + + y, exog_fe, exog_vc, ident = gen_simple_poisson(10, 10, 1) + exog_vc = sparse.csr_matrix(exog_vc) + + glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, + family=sm.families.Poisson()) + rslt1 = glmm1.fit_map() + + glmm2 = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5) + rslt2 = glmm2.fit_vb(rslt1.params) + + rslt1.summary() + rslt2.summary() + + assert_allclose(rslt1.params[0:5], np.r_[ + -0.07233493, -0.06706505, -0.47159649, 1.12575122, -1.02442201], + rtol=1e-4, atol=1e-4) + + assert_allclose(rslt2.params[0:5], np.r_[ + -0.07088814, -0.06373107, -0.22770786, 1.12923746, -1.26161339], + rtol=1e-4, atol=1e-4) + + +def test_crossed_logit_vb(): - y, exog_fe, exog_vc, ident = gen_logit_crossed(10, 10, 1, 2) + y, exog_fe, exog_vc, ident = gen_crossed_logit(10, 10, 1, 2) glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, fe_p=0.5, family=sm.families.Binomial()) @@ -201,7 +310,8 @@ def test_logit_vb_crossed(): rslt2.summary() assert_allclose(rslt1.params[0:5], np.r_[ - -0.54307398, -1., -1., -0.0096403, 0.00232701], + -5.43073978e-01, -2.46197518e+00, -2.36582801e+00, + -9.64030461e-03, 2.32701078e-03], rtol=1e-4, atol=1e-4) assert_allclose(rslt2.params[0:5], np.r_[ @@ -209,9 +319,9 @@ def test_logit_vb_crossed(): rtol=1e-4, atol=1e-4) -def test_logit_vb_crossed_formula(): +def test_crossed_logit_vb_formula(): - data = gen_logit_crossed_pandas(10, 10, 1, 2) + data = gen_crossed_logit_pandas(10, 10, 1, 2) fml = "y ~ fe" fml_vc = ["0 + C(a)", "0 + C(b)"] @@ -227,3 +337,27 @@ def test_logit_vb_crossed_formula(): 
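    # The two fits should agree because from_formula only repackages
    # the design arrays; the summary() calls below are smoke tests.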
rslt1.summary() rslt2.summary() + + +def test_crossed_poisson_vb(): + + y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 2) + + glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, + fe_p=0.5, family=sm.families.Poisson()) + rslt1 = glmm1.fit_map() + + glmm2 = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, + fe_p=0.5) + rslt2 = glmm2.fit_vb(mean=rslt1.params) + + rslt1.summary() + rslt2.summary() + + assert_allclose(rslt1.params[0:5], np.r_[ + -0.46012702, 0.20564564, 0.48015114, -0.09004295, 0.92886591], + rtol=1e-4, atol=1e-4) + + assert_allclose(rslt2.params[0:5], np.r_[ + -0.45982888, 0.24911954, 0.53073859, -0.10369905, 0.91920463], + rtol=1e-4, atol=1e-4) From c9409744d221cfe4a115e3063e6de3d0695d0ad6 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sat, 6 Jan 2018 22:42:34 -0500 Subject: [PATCH 048/157] refactor to better dry --- statsmodels/regression/bayes_mixed_glm.py | 261 +++++++----------- .../regression/tests/test_bayes_mixed_glm.py | 3 +- 2 files changed, 107 insertions(+), 157 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 9f9006878a2..330bc0fc9fb 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -356,6 +356,13 @@ class _VariationalBayesMixedGLM(BayesMixedGLM): implementations. """ + # Integration range (from -rng to +rng). The integrals are with + # respect to a standard Gaussian distribution so (-5, 5) will be + # sufficient in many cases. + rng = 5 + + verbose = False + def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2, family=None, fep_names=None, vcp_names=None): @@ -378,6 +385,92 @@ def _lp_stats(self, fep_mean, fep_sd, vc_mean, vc_sd): return tm, tv + def vb_elbo(self, h, tm, fep_mean, vcp_mean, vc_mean, fep_sd, + vcp_sd, vc_sd): + """ + Returns the evidence lower bound (ELBO) for the model. + + This function calculates the family-specific ELBO function + based on information provided from a subclass. + + Parameters + ---------- + h : function + Implements log p(y, fep, vcp, vc) in the form of a function of z, + where z is a standard normal random variable. + """ + + # p(y | vc) contributions + iv = 0 + for w in glw: + iv += h(self.rng * w[1]) * w[0] + iv *= -self.rng + iv += self.endog * tm + iv = iv.sum() + + # p(vc | vcp) * p(vcp) * p(fep) contributions + iv += self._elbo_common(fep_mean, fep_sd, vcp_mean, vcp_sd, + vc_mean, vc_sd) + + r = (iv + np.sum(np.log(fep_sd)) + np.sum(np.log(vcp_sd)) + + np.sum(np.log(vc_sd))) + + return r + + def vb_elbo_grad(self, h, tm, tv, fep_mean, vcp_mean, vc_mean, fep_sd, + vcp_sd, vc_sd): + + fep_mean_grad = 0. + fep_sd_grad = 0. + vcp_mean_grad = 0. + vcp_sd_grad = 0. + vc_mean_grad = 0. + vc_sd_grad = 0. 
+ + # p(y | vc) contributions + for w in glw: + x = self.rng * w[1] + u = h(x) + r = u / np.sqrt(tv) + fep_mean_grad += w[0] * np.dot(u, self.exog_fe) + vc_mean_grad += w[0] * self.exog_vc.transpose().dot(u) + fep_sd_grad += w[0] * x * np.dot(r, self.exog_fe**2 * fep_sd) + v = self.exog_vc2.multiply(vc_sd).transpose().dot(r) + v = np.squeeze(np.asarray(v)) + vc_sd_grad += w[0] * x * v + + fep_mean_grad *= -self.rng + vc_mean_grad *= -self.rng + fep_sd_grad *= -self.rng + vc_sd_grad *= -self.rng + fep_mean_grad += np.dot(self.endog, self.exog_fe) + vc_mean_grad += self.exog_vc.transpose().dot(self.endog) + + (fep_mean_grad_i, fep_sd_grad_i, vcp_mean_grad_i, vcp_sd_grad_i, + vc_mean_grad_i, vc_sd_grad_i) = self._elbo_grad_common( + fep_mean, fep_sd, vcp_mean, vcp_sd, vc_mean, vc_sd) + + fep_mean_grad += fep_mean_grad_i + fep_sd_grad += fep_sd_grad_i + vcp_mean_grad += vcp_mean_grad_i + vcp_sd_grad += vcp_sd_grad_i + vc_mean_grad += vc_mean_grad_i + vc_sd_grad += vc_sd_grad_i + + fep_sd_grad += 1 / fep_sd + vcp_sd_grad += 1 / vcp_sd + vc_sd_grad += 1 / vc_sd + + mean_grad = np.concatenate((fep_mean_grad, vcp_mean_grad, + vc_mean_grad)) + sd_grad = np.concatenate((fep_sd_grad, vcp_sd_grad, vc_sd_grad)) + + if self.verbose: + print("|G|=%f" % np.sqrt(np.sum(mean_grad**2) + + np.sum(sd_grad**2))) + + return mean_grad, sd_grad + def fit_vb(self, mean=None, sd=None, minim_opts=None, verbose=False): """ Fit a model using the variational Bayes mean field approximation. @@ -591,13 +684,6 @@ class BinomialBayesMixedGLM(_VariationalBayesMixedGLM): __doc__ = _init_doc.format(fit_method=_vb_fit_method) - # Integration range (from -rng to +rng). The integrals are with - # respect to a standard Gaussian distribution so (-5, 5) will be - # sufficient in many cases. - rng = 5 - - verbose = False - def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2, fep_names=None, vcp_names=None): @@ -629,7 +715,6 @@ def vb_elbo(self, vb_mean, vb_sd): fep_mean, vcp_mean, vc_mean = self._unpack(vb_mean) fep_sd, vcp_sd, vc_sd = self._unpack(vb_sd) - tm, tv = self._lp_stats(fep_mean, fep_sd, vc_mean, vc_sd) def h(z): @@ -638,18 +723,8 @@ def h(z): x /= np.sqrt(2*np.pi) return x - # p(y | vc) contributions - iv = 0 - for w in glw: - iv += h(self.rng * w[1]) * w[0] - iv *= -self.rng - iv += self.endog * tm - iv = iv.sum() - - iv += self._elbo_common(fep_mean, fep_sd, vcp_mean, vcp_sd, - vc_mean, vc_sd) - - return iv + np.sum(np.log(vb_sd)) + return super(BinomialBayesMixedGLM, self).vb_elbo( + h, tm, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) def vb_elbo_grad(self, vb_mean, vb_sd): """ @@ -658,83 +733,23 @@ def vb_elbo_grad(self, vb_mean, vb_sd): fep_mean, vcp_mean, vc_mean = self._unpack(vb_mean) fep_sd, vcp_sd, vc_sd = self._unpack(vb_sd) - tm, tv = self._lp_stats(fep_mean, fep_sd, vc_mean, vc_sd) def h(z): - x = np.log(1 + np.exp(tm + np.sqrt(tv)*z)) - x *= np.exp(-z**2 / 2) - x /= np.sqrt(2*np.pi) - return x - - def h1(z): u = tm + np.sqrt(tv)*z x = np.exp(u) / (1 + np.exp(u)) x *= np.exp(-z**2 / 2) x /= np.sqrt(2*np.pi) return x - fep_mean_grad = 0. - fep_sd_grad = 0. - vcp_mean_grad = 0. - vcp_sd_grad = 0. - vc_mean_grad = 0. - vc_sd_grad = 0. 
- - # p(y | vc) contributions - for w in glw: - x = self.rng * w[1] - u = h1(x) - r = u / np.sqrt(tv) - fep_mean_grad += w[0] * np.dot(u, self.exog_fe) - vc_mean_grad += w[0] * self.exog_vc.transpose().dot(u) - fep_sd_grad += w[0] * x * np.dot(r, self.exog_fe**2 * fep_sd) - v = self.exog_vc2.multiply(vc_sd).transpose().dot(r) - v = np.squeeze(np.asarray(v)) - vc_sd_grad += w[0] * x * v - - fep_mean_grad *= -self.rng - vc_mean_grad *= -self.rng - fep_sd_grad *= -self.rng - vc_sd_grad *= -self.rng - fep_mean_grad += np.dot(self.endog, self.exog_fe) - vc_mean_grad += self.exog_vc.transpose().dot(self.endog) - - (fep_mean_grad_i, fep_sd_grad_i, vcp_mean_grad_i, vcp_sd_grad_i, - vc_mean_grad_i, vc_sd_grad_i) = self._elbo_grad_common( - fep_mean, fep_sd, vcp_mean, vcp_sd, vc_mean, vc_sd) - - fep_mean_grad += fep_mean_grad_i - fep_sd_grad += fep_sd_grad_i - vcp_mean_grad += vcp_mean_grad_i - vcp_sd_grad += vcp_sd_grad_i - vc_mean_grad += vc_mean_grad_i - vc_sd_grad += vc_sd_grad_i - - mean_grad = np.concatenate((fep_mean_grad, vcp_mean_grad, - vc_mean_grad)) - sd_grad = np.concatenate((fep_sd_grad, vcp_sd_grad, vc_sd_grad)) - - sd_grad += 1 / vb_sd - - if self.verbose: - print("|G|=%f" % np.sqrt(np.sum(mean_grad**2) + - np.sum(sd_grad**2))) - - return mean_grad, sd_grad + return super(BinomialBayesMixedGLM, self).vb_elbo_grad( + h, tm, tv, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) class PoissonBayesMixedGLM(_VariationalBayesMixedGLM): __doc__ = _init_doc.format(fit_method=_vb_fit_method) - # Integration range (from -rng to +rng). The integrals are with - # respect to a standard Gaussian distribution so (-5, 5) will be - # sufficient in many cases. - rng = 5 - - verbose = False - def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2, fep_names=None, vcp_names=None): @@ -754,26 +769,18 @@ def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2, vcp_names=vcp_names) return PoissonBayesMixedGLM(endog=x.endog, exog_fe=x.exog_fe, - exog_vc=x.exog_vc, ident=x.ident, - vcp_p=x.vcp_p, fe_p=x.fe_p, - fep_names=x.fep_names, - vcp_names=x.vcp_names) + exog_vc=x.exog_vc, ident=x.ident, + vcp_p=x.vcp_p, fe_p=x.fe_p, + fep_names=x.fep_names, + vcp_names=x.vcp_names) def vb_elbo(self, vb_mean, vb_sd): """ Returns the evidence lower bound (ELBO) for the model. - - -lam + k*log(lam) - - -exp(lp) + k*lp - - k*lp - exp(lp) - """ fep_mean, vcp_mean, vc_mean = self._unpack(vb_mean) fep_sd, vcp_sd, vc_sd = self._unpack(vb_sd) - tm, tv = self._lp_stats(fep_mean, fep_sd, vc_mean, vc_sd) def h(z): @@ -782,19 +789,8 @@ def h(z): y /= np.sqrt(2*np.pi) return y - # p(y | vc) contributions - iv = 0 - for w in glw: - iv += h(self.rng * w[1]) * w[0] - iv *= -self.rng - iv += self.endog * tm - iv = iv.sum() - - iv += self._elbo_common(fep_mean, fep_sd, vcp_mean, vcp_sd, - vc_mean, vc_sd) - - r = iv + np.sum(np.log(vb_sd)) - return r + return super(PoissonBayesMixedGLM, self).vb_elbo( + h, tm, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) def vb_elbo_grad(self, vb_mean, vb_sd): """ @@ -803,7 +799,6 @@ def vb_elbo_grad(self, vb_mean, vb_sd): fep_mean, vcp_mean, vc_mean = self._unpack(vb_mean) fep_sd, vcp_sd, vc_sd = self._unpack(vb_sd) - tm, tv = self._lp_stats(fep_mean, fep_sd, vc_mean, vc_sd) def h(z): @@ -812,51 +807,5 @@ def h(z): y /= np.sqrt(2*np.pi) return y - fep_mean_grad = 0. - fep_sd_grad = 0. - vcp_mean_grad = 0. - vcp_sd_grad = 0. - vc_mean_grad = 0. - vc_sd_grad = 0. 
- - # p(y | vc) contributions - for w in glw: - x = self.rng * w[1] - u = h(x) - r = u / np.sqrt(tv) - fep_mean_grad += w[0] * np.dot(u, self.exog_fe) - vc_mean_grad += w[0] * self.exog_vc.transpose().dot(u) - fep_sd_grad += w[0] * x * np.dot(r, self.exog_fe**2 * fep_sd) - v = self.exog_vc2.multiply(vc_sd).transpose().dot(r) - v = np.squeeze(np.asarray(v)) - vc_sd_grad += w[0] * x * v - - fep_mean_grad *= -self.rng - vc_mean_grad *= -self.rng - fep_sd_grad *= -self.rng - vc_sd_grad *= -self.rng - fep_mean_grad += np.dot(self.endog, self.exog_fe) - vc_mean_grad += self.exog_vc.transpose().dot(self.endog) - - (fep_mean_grad_i, fep_sd_grad_i, vcp_mean_grad_i, vcp_sd_grad_i, - vc_mean_grad_i, vc_sd_grad_i) = self._elbo_grad_common( - fep_mean, fep_sd, vcp_mean, vcp_sd, vc_mean, vc_sd) - - fep_mean_grad += fep_mean_grad_i - fep_sd_grad += fep_sd_grad_i - vcp_mean_grad += vcp_mean_grad_i - vcp_sd_grad += vcp_sd_grad_i - vc_mean_grad += vc_mean_grad_i - vc_sd_grad += vc_sd_grad_i - - mean_grad = np.concatenate((fep_mean_grad, vcp_mean_grad, - vc_mean_grad)) - sd_grad = np.concatenate((fep_sd_grad, vcp_sd_grad, vc_sd_grad)) - - sd_grad += 1 / vb_sd - - if self.verbose: - print("|G|=%f" % np.sqrt(np.sum(mean_grad**2) + - np.sum(sd_grad**2))) - - return mean_grad, sd_grad + return super(PoissonBayesMixedGLM, self).vb_elbo_grad( + h, tm, tv, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 1fffb7f4366..2852e63986e 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -138,6 +138,7 @@ def test_simple_poisson_map(): rslt2 = glmm2.fit_map() assert_allclose(rslt1.params, rslt2.params, atol=1e-4) + def test_crossed_logit_map(): y, exog_fe, exog_vc, ident = gen_crossed_logit(10, 10, 1, 2) @@ -348,7 +349,7 @@ def test_crossed_poisson_vb(): rslt1 = glmm1.fit_map() glmm2 = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, - fe_p=0.5) + fe_p=0.5) rslt2 = glmm2.fit_vb(mean=rslt1.params) rslt1.summary() From be12115aac3d5e7f2a54e04022d6f162a0214d09 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sat, 6 Jan 2018 22:46:16 -0500 Subject: [PATCH 049/157] fix self assignment --- statsmodels/regression/bayes_mixed_glm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 330bc0fc9fb..25ac8fa06db 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -311,8 +311,7 @@ def from_formula(cls, formula, vc_formulas, data, family=None, if vcp_names is None: vcp_names = ["VC_%d" % (k + 1) for k in range(len(vc_formulas))] - else: - vcp_names = vcp_names + ident = np.concatenate(ident) endog = np.squeeze(np.asarray(endog)) From 377f84e1e7805c9a273791a9e0b9d4816574d021 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sat, 6 Jan 2018 23:14:38 -0500 Subject: [PATCH 050/157] privatize base class, restructure as mixin --- statsmodels/regression/bayes_mixed_glm.py | 56 ++++++++----------- .../regression/tests/test_bayes_mixed_glm.py | 42 ++++++-------- 2 files changed, 42 insertions(+), 56 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 25ac8fa06db..ac19c4b4ad0 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -3,8 +3,8 @@ from 
scipy.optimize import minimize from scipy import sparse from statsmodels.iolib import summary2 +import statsmodels.api as sm import pandas as pd -import statsmodels import warnings import patsy @@ -98,7 +98,7 @@ """ -class BayesMixedGLM(object): +class _BayesMixedGLM(object): __doc__ = _init_doc.format(fit_method=_laplace_fit_method) @@ -107,7 +107,7 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, vcp_names=None): if family is None: - family = statsmodels.genmod.families.Gaussian() + family = sm.families.Gaussian() warnings.Warn("Defaulting to Gaussian family") # Get the fixed effects parameter names @@ -153,6 +153,9 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, self.k_vc = exog_vc.shape[1] self.k_vcp = max(self.ident) + 1 + # power would be better but not available in older scipy + self.exog_vc2 = self.exog_vc.multiply(self.exog_vc) + def _unpack(self, vec): ii = 0 @@ -320,9 +323,9 @@ def from_formula(cls, formula, vc_formulas, data, family=None, exog_fe = np.asarray(exog_fe) exog_vc = sparse.csr_matrix(np.asarray(exog_vc)) - mod = BayesMixedGLM(endog, exog_fe, exog_vc, ident, vcp_p, fe_p, - family=family, fep_names=fep_names, - vcp_names=vcp_names) + mod = _BayesMixedGLM(endog, exog_fe, exog_vc, ident, vcp_p, fe_p, + family=family, fep_names=fep_names, + vcp_names=vcp_names) return mod @@ -349,10 +352,10 @@ def grad(params): return BayesMixedGLMResults(self, r.x, r.hess_inv, optim_retvals=r) -class _VariationalBayesMixedGLM(BayesMixedGLM): +class _VariationalBayesMixedGLM(_BayesMixedGLM): """ - A private base class for family-specific variational Bayes GLM - implementations. + A mixin providing generic (not family-specific) methods for + variational Bayes mean field fitting. """ # Integration range (from -rng to +rng). The integrals are with @@ -362,17 +365,6 @@ class _VariationalBayesMixedGLM(BayesMixedGLM): verbose = False - def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, - fe_p=2, family=None, fep_names=None, vcp_names=None): - - super(_VariationalBayesMixedGLM, self).__init__( - endog=endog, exog_fe=exog_fe, exog_vc=exog_vc, - ident=ident, vcp_p=vcp_p, fe_p=fe_p, - family=family, fep_names=fep_names, vcp_names=vcp_names) - - # power would be better but not available in older scipy - self.exog_vc2 = self.exog_vc.multiply(self.exog_vc) - # Returns the mean and variance of the linear predictor under the # given distribution parameters. 
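The restructuring in this patch relies on cooperative multiple
inheritance: the concrete family classes list the variational mixin
first, so the mean field fitting machinery is found ahead of the base
model in the MRO, while construction and priors still come from
_BayesMixedGLM. A minimal sketch of the pattern (class and method
bodies here are illustrative only, not the module's API):

>>> class _Base(object):
...     def fit_map(self):
...         return "laplace"
>>> class _VBMixin(object):
...     def fit_vb(self):
...         return "mean field"
>>> class ConcreteFamily(_VBMixin, _Base):
...     pass
>>> m = ConcreteFamily()
>>> (m.fit_map(), m.fit_vb())
('laplace', 'mean field')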
def _lp_stats(self, fep_mean, fep_sd, vc_mean, vc_sd): @@ -679,7 +671,7 @@ def summary(self): return summ -class BinomialBayesMixedGLM(_VariationalBayesMixedGLM): +class BinomialBayesMixedGLM(_VariationalBayesMixedGLM, _BayesMixedGLM): __doc__ = _init_doc.format(fit_method=_vb_fit_method) @@ -689,17 +681,17 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, super(BinomialBayesMixedGLM, self).__init__( endog=endog, exog_fe=exog_fe, exog_vc=exog_vc, ident=ident, vcp_p=vcp_p, fe_p=fe_p, - family=statsmodels.genmod.families.Binomial(), + family=sm.families.Binomial(), fep_names=fep_names, vcp_names=vcp_names) @classmethod def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2, vcp_names=None): - fam = statsmodels.genmod.families.Binomial() - x = BayesMixedGLM.from_formula(formula, vc_formulas, data, - family=fam, vcp_p=vcp_p, fe_p=fe_p, - vcp_names=vcp_names) + fam = sm.families.Binomial() + x = _BayesMixedGLM.from_formula(formula, vc_formulas, data, + family=fam, vcp_p=vcp_p, fe_p=fe_p, + vcp_names=vcp_names) return BinomialBayesMixedGLM(endog=x.endog, exog_fe=x.exog_fe, exog_vc=x.exog_vc, ident=x.ident, @@ -745,7 +737,7 @@ def h(z): h, tm, tv, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) -class PoissonBayesMixedGLM(_VariationalBayesMixedGLM): +class PoissonBayesMixedGLM(_VariationalBayesMixedGLM, _BayesMixedGLM): __doc__ = _init_doc.format(fit_method=_vb_fit_method) @@ -755,17 +747,17 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, super(PoissonBayesMixedGLM, self).__init__( endog=endog, exog_fe=exog_fe, exog_vc=exog_vc, ident=ident, vcp_p=vcp_p, fe_p=fe_p, - family=statsmodels.genmod.families.Poisson(), + family=sm.families.Poisson(), fep_names=fep_names, vcp_names=vcp_names) @classmethod def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2, vcp_names=None): - fam = statsmodels.genmod.families.Poisson() - x = BayesMixedGLM.from_formula(formula, vc_formulas, data, - family=fam, vcp_p=vcp_p, fe_p=fe_p, - vcp_names=vcp_names) + fam = sm.families.Poisson() + x = _BayesMixedGLM.from_formula(formula, vc_formulas, data, + family=fam, vcp_p=vcp_p, fe_p=fe_p, + vcp_names=vcp_names) return PoissonBayesMixedGLM(endog=x.endog, exog_fe=x.exog_fe, exog_vc=x.exog_vc, ident=x.ident, diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 2852e63986e..484f6c7d6b0 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -1,7 +1,6 @@ import numpy as np from statsmodels.regression.bayes_mixed_glm import ( - BayesMixedGLM, BinomialBayesMixedGLM, PoissonBayesMixedGLM) -import statsmodels.api as sm + BinomialBayesMixedGLM, PoissonBayesMixedGLM) import pandas as pd from scipy import sparse from numpy.testing import assert_allclose @@ -111,9 +110,8 @@ def test_simple_logit_map(): y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 2) exog_vc = sparse.csr_matrix(exog_vc) - glmm = BayesMixedGLM(y, exog_fe, exog_vc, ident, - family=sm.families.Binomial(), - vcp_p=0.5) + glmm = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, + vcp_p=0.5) rslt = glmm.fit_map() assert_allclose(glmm.logposterior_grad(rslt.params), @@ -125,9 +123,8 @@ def test_simple_poisson_map(): y, exog_fe, exog_vc, ident = gen_simple_poisson(10, 10, 0.2) exog_vc = sparse.csr_matrix(exog_vc) - glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, - family=sm.families.Poisson(), - vcp_p=0.5) + glmm1 = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, + 
vcp_p=0.5) rslt1 = glmm1.fit_map() assert_allclose(glmm1.logposterior_grad(rslt1.params), np.zeros_like(rslt1.params), atol=1e-3) @@ -144,9 +141,8 @@ def test_crossed_logit_map(): y, exog_fe, exog_vc, ident = gen_crossed_logit(10, 10, 1, 2) exog_vc = sparse.csr_matrix(exog_vc) - glmm = BayesMixedGLM(y, exog_fe, exog_vc, ident, - family=sm.families.Binomial(), - vcp_p=0.5) + glmm = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, + vcp_p=0.5) rslt = glmm.fit_map() assert_allclose(glmm.logposterior_grad(rslt.params), @@ -158,9 +154,8 @@ def test_crossed_poisson_map(): y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 2) exog_vc = sparse.csr_matrix(exog_vc) - glmm = BayesMixedGLM(y, exog_fe, exog_vc, ident, - family=sm.families.Poisson(), - vcp_p=0.5) + glmm = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, + vcp_p=0.5) rslt = glmm.fit_map() assert_allclose(glmm.logposterior_grad(rslt.params), @@ -173,8 +168,8 @@ def test_logit_map_crosed_formula(): fml = "y ~ fe" fml_vc = ["0 + C(a)", "0 + C(b)"] - glmm = BayesMixedGLM.from_formula( - fml, fml_vc, data, family=sm.families.Binomial(), vcp_p=0.5) + glmm = BinomialBayesMixedGLM.from_formula( + fml, fml_vc, data, vcp_p=0.5) rslt = glmm.fit_map() assert_allclose(glmm.logposterior_grad(rslt.params), @@ -251,8 +246,8 @@ def test_simple_logit_vb(): y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 0) exog_vc = sparse.csr_matrix(exog_vc) - glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, - fe_p=0.5, family=sm.families.Binomial()) + glmm1 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, + fe_p=0.5) rslt1 = glmm1.fit_map() glmm2 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, @@ -276,8 +271,7 @@ def test_simple_poisson_vb(): y, exog_fe, exog_vc, ident = gen_simple_poisson(10, 10, 1) exog_vc = sparse.csr_matrix(exog_vc) - glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, - family=sm.families.Poisson()) + glmm1 = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5) rslt1 = glmm1.fit_map() glmm2 = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5) @@ -299,8 +293,8 @@ def test_crossed_logit_vb(): y, exog_fe, exog_vc, ident = gen_crossed_logit(10, 10, 1, 2) - glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, - fe_p=0.5, family=sm.families.Binomial()) + glmm1 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, + fe_p=0.5) rslt1 = glmm1.fit_map() glmm2 = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, @@ -344,8 +338,8 @@ def test_crossed_poisson_vb(): y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 2) - glmm1 = BayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, - fe_p=0.5, family=sm.families.Poisson()) + glmm1 = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, + fe_p=0.5) rslt1 = glmm1.fit_map() glmm2 = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, From d9234557e7009f197889de2e416656d3c6c38913 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 7 Jan 2018 12:42:32 -0500 Subject: [PATCH 051/157] modify unstable test --- statsmodels/regression/bayes_mixed_glm.py | 27 ++++++++++++++++++- .../regression/tests/test_bayes_mixed_glm.py | 2 +- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index ac19c4b4ad0..6eb49d79045 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -1,3 +1,28 @@ +""" +Bayesian inference for generalized linear mixed models. 
+ +Currently only families without additional scale or shape parameters +are supported (binomial and Poisson). + +Two estimation approaches are supported: Laplace approximation +(maximum a posteriori), and variational Bayes (mean field +approximation to the posterior). + +Random effects are required to be independent in this implementation. + +The `exog_vc` matrix is the design matrix for the random effects. +Every column of `exog_vc` corresponds to an independent realizattion +of a random effect. These random effects have mean zero and an +unknown standard deviation. The standard deviation parameters are +constrained, so that a subset of the columns of `exog_vc` will have a +common variance. These subsets are specified through the parameer +`ident`. + +In many applications, `exog_vc` will be sparse. A sparse matrix may +be passed when constructing a model class. If a dense matrix is +passed, it will be converted internally to a sparse matrix. +""" + from __future__ import division import numpy as np from scipy.optimize import minimize @@ -352,7 +377,7 @@ def grad(params): return BayesMixedGLMResults(self, r.x, r.hess_inv, optim_retvals=r) -class _VariationalBayesMixedGLM(_BayesMixedGLM): +class _VariationalBayesMixedGLM(object): """ A mixin providing generic (not family-specific) methods for variational Bayes mean field fitting. diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 484f6c7d6b0..838e38cd937 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -151,7 +151,7 @@ def test_crossed_logit_map(): def test_crossed_poisson_map(): - y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 2) + y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 1) exog_vc = sparse.csr_matrix(exog_vc) glmm = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, From 3cbb0f28cb548116009cb57e80c08495a3b5f297 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 7 Jan 2018 12:48:53 -0500 Subject: [PATCH 052/157] rectify method names --- statsmodels/regression/bayes_mixed_glm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 6eb49d79045..be6a7389503 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -401,8 +401,8 @@ def _lp_stats(self, fep_mean, fep_sd, vc_mean, vc_sd): return tm, tv - def vb_elbo(self, h, tm, fep_mean, vcp_mean, vc_mean, fep_sd, - vcp_sd, vc_sd): + def vb_elbo_base(self, h, tm, fep_mean, vcp_mean, vc_mean, + fep_sd, vcp_sd, vc_sd): """ Returns the evidence lower bound (ELBO) for the model. @@ -433,8 +433,8 @@ def vb_elbo(self, h, tm, fep_mean, vcp_mean, vc_mean, fep_sd, return r - def vb_elbo_grad(self, h, tm, tv, fep_mean, vcp_mean, vc_mean, fep_sd, - vcp_sd, vc_sd): + def vb_elbo_grad_base(self, h, tm, tv, fep_mean, vcp_mean, vc_mean, + fep_sd, vcp_sd, vc_sd): fep_mean_grad = 0. fep_sd_grad = 0. 
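Both renamed base methods accumulate the p(y | vc) term by numerical
integration against a standard Gaussian, truncated to (-rng, rng),
using the fixed quadrature table glw (apparently a Gauss-Legendre
rule). A rough standalone sketch of the same approximation, with
numpy's leggauss standing in for the hard-coded table and an
illustrative integrand:

>>> import numpy as np
>>> x, w = np.polynomial.legendre.leggauss(20)
>>> rng = 5.0
>>> h = lambda z: np.log(1 + np.exp(z))   # illustrative integrand
>>> z = rng * x
>>> iv = rng * np.sum(w * h(z) * np.exp(-z ** 2 / 2)) / np.sqrt(2 * np.pi)

Here iv approximates E[h(Z)] for Z ~ N(0, 1); evaluated elementwise
over the linear predictors, this is the building block of the ELBO.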
@@ -739,7 +739,7 @@ def h(z): x /= np.sqrt(2*np.pi) return x - return super(BinomialBayesMixedGLM, self).vb_elbo( + return self.vb_elbo_base( h, tm, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) def vb_elbo_grad(self, vb_mean, vb_sd): @@ -758,7 +758,7 @@ def h(z): x /= np.sqrt(2*np.pi) return x - return super(BinomialBayesMixedGLM, self).vb_elbo_grad( + return self.vb_elbo_grad_base( h, tm, tv, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) @@ -805,7 +805,7 @@ def h(z): y /= np.sqrt(2*np.pi) return y - return super(PoissonBayesMixedGLM, self).vb_elbo( + return self.vb_elbo_base( h, tm, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) def vb_elbo_grad(self, vb_mean, vb_sd): @@ -823,5 +823,5 @@ def h(z): y /= np.sqrt(2*np.pi) return y - return super(PoissonBayesMixedGLM, self).vb_elbo_grad( + return self.vb_elbo_grad_base( h, tm, tv, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) From bdb3f40217c61fea3299b4a7e668174c19505c91 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 7 Jan 2018 12:50:30 -0500 Subject: [PATCH 053/157] remove unneeded assignment --- statsmodels/regression/tests/test_bayes_mixed_glm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 838e38cd937..d293d9f8c14 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -208,8 +208,6 @@ def test_elbo_grad(): rslt1 = glmm1.fit_map() - n = glmm1.k_fep + glmm1.k_vcp + glmm1.k_vc - for k in range(3): if k == 0: From 21e77dd10cd40a2ff1cce643f6990bc9359ab838 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 7 Jan 2018 17:35:06 -0500 Subject: [PATCH 054/157] poisson test fails on old scipy --- statsmodels/regression/tests/test_bayes_mixed_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index d293d9f8c14..3f15957a98e 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -334,7 +334,7 @@ def test_crossed_logit_vb_formula(): def test_crossed_poisson_vb(): - y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 2) + y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 0.5) glmm1 = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5, fe_p=0.5) From 7e2ec7d40b2f81002714b3c525c5cb88134fbd30 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 7 Jan 2018 18:47:00 -0500 Subject: [PATCH 055/157] more work on poisson vb test --- statsmodels/regression/tests/test_bayes_mixed_glm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 3f15957a98e..4452f92a4eb 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -348,9 +348,9 @@ def test_crossed_poisson_vb(): rslt2.summary() assert_allclose(rslt1.params[0:5], np.r_[ - -0.46012702, 0.20564564, 0.48015114, -0.09004295, 0.92886591], + -0.54855281, 0.10458834, -0.68777741, -0.01699925, 0.77200546], rtol=1e-4, atol=1e-4) assert_allclose(rslt2.params[0:5], np.r_[ - -0.45982888, 0.24911954, 0.53073859, -0.10369905, 0.91920463], + -0.54691502, 0.22297158, -0.52673802, -0.06218684, 0.74385237], rtol=1e-4, atol=1e-4) From fef9661cebc125882d58a146b98ba06f5826ba6d Mon 
Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 8 Jan 2018 12:04:16 -0500 Subject: [PATCH 056/157] more checking of input parameters --- statsmodels/regression/bayes_mixed_glm.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index be6a7389503..4a43c37eeed 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -135,6 +135,10 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, family = sm.families.Gaussian() warnings.Warn("Defaulting to Gaussian family") + if len(ident) != exog_vc.shape[1]: + msg = "len(ident) should match the number of columns of exog_vc" + raise ValueError(msg) + # Get the fixed effects parameter names if fep_names is None: if hasattr(exog_fe, "columns"): @@ -148,6 +152,10 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, if vcp_names is None: vcp_names = ["VC_%d" % (k + 1) for k in range(int(max(ident)) + 1)] + else: + if len(vcp_names) != len(set(ident)): + msg = "The lengths of vcp_names and ident should be the same" + raise ValueError(msg) self.vcp_names = vcp_names self.endog = np.asarray(endog) From 60a5e9c51ca4928b184c1ce9c8c2b9b061deb164 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Wed, 10 Jan 2018 00:09:39 -0500 Subject: [PATCH 057/157] work on docstrings --- statsmodels/regression/bayes_mixed_glm.py | 41 ++++++++++++++++------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 4a43c37eeed..15937a89fd8 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -48,7 +48,12 @@ _init_doc = r""" Fit a generalized linear mixed model using Bayesian methods. -{fit_method} + + The class implements the Laplace approximation to the posterior + distribution (`fit_map`) and a variational Bayes approximation to + the posterior (`fit_vb`). See the fit method docstrings for more + information about the fitting approaches. + Parameters ---------- endog : array-like @@ -109,24 +114,36 @@ The prior for the fixed effects parameters is Gaussian with mean 0 and standard deviation `fe_p`. + + Examples + --------{example} """ -_laplace_fit_method = """ - The class implements the Laplace approximation to the posterior - distribution. See subclasses, e.g. BinomialBayesMixedGLM for - other estimation approaches. +_logit_example = """ + A binomial (logistic) random effects model with random intercepts + for villages and random slopes for year within villages: + + >>> data['year_cen'] = data['Year'] - data.Year.mean() + >>> random = ['0 + C(Village)', '0 + C(Village)*year_cen'] + >>> model = BinomialBayesMixedGLM.from_formula('y ~ year_cen', + random, data) + >>> result = model.fit() """ -_vb_fit_method = """ - The class implements a variational Bayes approximation to the - posterior. See the docstring to `fit_vb` for more information. 
+_poisson_example = """ + A Poisson random effects model with random intercepts for villages + and random slopes for year within villages: + + >>> data['year_cen'] = data['Year'] - data.Year.mean() + >>> random = ['0 + C(Village)', '0 + C(Village)*year_cen'] + >>> model = PoissonBayesMixedGLM.from_formula('y ~ year_cen', + random, data) + >>> result = model.fit() """ class _BayesMixedGLM(object): - __doc__ = _init_doc.format(fit_method=_laplace_fit_method) - def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2, family=None, fep_names=None, vcp_names=None): @@ -706,7 +723,7 @@ def summary(self): class BinomialBayesMixedGLM(_VariationalBayesMixedGLM, _BayesMixedGLM): - __doc__ = _init_doc.format(fit_method=_vb_fit_method) + __doc__ = _init_doc.format(example=_logit_example) def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2, fep_names=None, vcp_names=None): @@ -772,7 +789,7 @@ def h(z): class PoissonBayesMixedGLM(_VariationalBayesMixedGLM, _BayesMixedGLM): - __doc__ = _init_doc.format(fit_method=_vb_fit_method) + __doc__ = _init_doc.format(example=_poisson_example) def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2, fep_names=None, vcp_names=None): From 90d4e34ce352847303fc8277ce3a31cc0037c297 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 11 Jan 2018 21:55:59 -0500 Subject: [PATCH 058/157] add method to extract random effect parameters --- statsmodels/regression/bayes_mixed_glm.py | 93 ++++++++++++++----- .../regression/tests/test_bayes_mixed_glm.py | 4 + 2 files changed, 73 insertions(+), 24 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 15937a89fd8..e04eb1e016e 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -146,7 +146,7 @@ class _BayesMixedGLM(object): def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2, family=None, fep_names=None, - vcp_names=None): + vcp_names=None, vc_names=None): if family is None: family = sm.families.Gaussian() @@ -175,6 +175,10 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, raise ValueError(msg) self.vcp_names = vcp_names + # Get the variance component realization (random effect) + # names. + self.vc_names = vc_names + self.endog = np.asarray(endog) self.exog_fe = np.asarray(exog_fe) @@ -319,7 +323,7 @@ def _get_start(self): @classmethod def from_formula(cls, formula, vc_formulas, data, family=None, - vcp_p=1, fe_p=2, vcp_names=None): + vcp_p=1, fe_p=2, vcp_names=None, vc_names=None): """ Fit a BayesMixedGLM using a formula. @@ -346,6 +350,8 @@ def from_formula(cls, formula, vc_formulas, data, family=None, The prior standard deviation for the fixed effects parameters. 
vcp_names : list Names of variance component parameters + vc_names : list + Names of random effects realizations """ if not type(vc_formulas) is list: @@ -361,6 +367,7 @@ def from_formula(cls, formula, vc_formulas, data, family=None, exog_vc.append(mat) ident.append(j * np.ones(mat.shape[1])) exog_vc = pd.concat(exog_vc, axis=1) + vc_names = exog_vc.columns.tolist() if vcp_names is None: vcp_names = ["VC_%d" % (k + 1) for k in range(len(vc_formulas))] @@ -375,7 +382,7 @@ def from_formula(cls, formula, vc_formulas, data, family=None, mod = _BayesMixedGLM(endog, exog_fe, exog_vc, ident, vcp_p, fe_p, family=family, fep_names=fep_names, - vcp_names=vcp_names) + vcp_names=vcp_names, vc_names=vc_names) return mod @@ -720,34 +727,72 @@ def summary(self): return summ + def random_effects(self, term=None): + """ + Posterior mean and standard deviation of random effects. + + Parameters + ---------- + term : int or None + If None, results for all random effects are returned. If + an integer, returns results for a given set of random + effects. The value of `term` refers to an element of the + `ident` vector, or to a position in the `vc_formulas` + list. + + Returns + ------- + Data frame of posterior means and posterior standard + deviations of random effects. + """ + + z = self.vc_mean + s = self.vc_sd + na = self.model.vc_names + + if term is not None: + ii = np.flatnonzero(self.model.ident == term) + z = z[ii] + s = s[ii] + na = [na[i] for i in ii] + + x = pd.DataFrame({"Mean": z, "SD": s}) + + if na is not None: + x.index = na + + return x + class BinomialBayesMixedGLM(_VariationalBayesMixedGLM, _BayesMixedGLM): __doc__ = _init_doc.format(example=_logit_example) def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, - fe_p=2, fep_names=None, vcp_names=None): + fe_p=2, fep_names=None, vcp_names=None, + vc_names=None): super(BinomialBayesMixedGLM, self).__init__( endog=endog, exog_fe=exog_fe, exog_vc=exog_vc, ident=ident, vcp_p=vcp_p, fe_p=fe_p, family=sm.families.Binomial(), - fep_names=fep_names, vcp_names=vcp_names) + fep_names=fep_names, vcp_names=vcp_names, + vc_names=vc_names) @classmethod def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2, - vcp_names=None): + vcp_names=None, vc_names=None): fam = sm.families.Binomial() - x = _BayesMixedGLM.from_formula(formula, vc_formulas, data, - family=fam, vcp_p=vcp_p, fe_p=fe_p, - vcp_names=vcp_names) + x = _BayesMixedGLM.from_formula( + formula, vc_formulas, data, family=fam, vcp_p=vcp_p, fe_p=fe_p, + vcp_names=vcp_names, vc_names=vc_names) - return BinomialBayesMixedGLM(endog=x.endog, exog_fe=x.exog_fe, - exog_vc=x.exog_vc, ident=x.ident, - vcp_p=x.vcp_p, fe_p=x.fe_p, - fep_names=x.fep_names, - vcp_names=x.vcp_names) + return BinomialBayesMixedGLM( + endog=x.endog, exog_fe=x.exog_fe, exog_vc=x.exog_vc, + ident=x.ident, vcp_p=x.vcp_p, fe_p=x.fe_p, + fep_names=x.fep_names, vcp_names=x.vcp_names, + vc_names=x.vc_names) def vb_elbo(self, vb_mean, vb_sd): """ @@ -802,18 +847,18 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, @classmethod def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2, - vcp_names=None): + vcp_names=None, vc_names=None): fam = sm.families.Poisson() - x = _BayesMixedGLM.from_formula(formula, vc_formulas, data, - family=fam, vcp_p=vcp_p, fe_p=fe_p, - vcp_names=vcp_names) - - return PoissonBayesMixedGLM(endog=x.endog, exog_fe=x.exog_fe, - exog_vc=x.exog_vc, ident=x.ident, - vcp_p=x.vcp_p, fe_p=x.fe_p, - fep_names=x.fep_names, - vcp_names=x.vcp_names) + x = _BayesMixedGLM.from_formula( + 
formula, vc_formulas, data, family=fam, vcp_p=vcp_p, fe_p=fe_p, + vcp_names=vcp_names, vc_names=vc_names) + + return PoissonBayesMixedGLM( + endog=x.endog, exog_fe=x.exog_fe, exog_vc=x.exog_vc, + ident=x.ident, vcp_p=x.vcp_p, fe_p=x.fe_p, + fep_names=x.fep_names, vcp_names=x.vcp_names, + vc_names=x.vc_names) def vb_elbo(self, vb_mean, vb_sd): """ diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 4452f92a4eb..b478156616d 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -177,6 +177,10 @@ def test_logit_map_crosed_formula(): rslt.summary() + r = rslt.random_effects(0) + assert_allclose(r.iloc[0, :].values, + np.r_[-0.0258016, 0.0937002], atol=1e-4) + def test_elbo_grad(): From 12f85937b1bcfb68551e624042fd368887b6cada Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 11 Jan 2018 22:46:39 -0500 Subject: [PATCH 059/157] make test more robust --- statsmodels/regression/tests/test_bayes_mixed_glm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index b478156616d..8f3a418b2a7 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -162,9 +162,9 @@ def test_crossed_poisson_map(): np.zeros_like(rslt.params), atol=1e-4) -def test_logit_map_crosed_formula(): +def test_logit_map_crossed_formula(): - data = gen_crossed_logit_pandas(10, 10, 1, 2) + data = gen_crossed_logit_pandas(10, 10, 1, 0.5) fml = "y ~ fe" fml_vc = ["0 + C(a)", "0 + C(b)"] @@ -179,7 +179,7 @@ def test_logit_map_crosed_formula(): r = rslt.random_effects(0) assert_allclose(r.iloc[0, :].values, - np.r_[-0.0258016, 0.0937002], atol=1e-4) + np.r_[-0.02004904, 0.10013856], atol=1e-4) def test_elbo_grad(): From 39aef6586c3197436c26c5c86608c456351f0876 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 12 Jan 2018 20:55:04 -0500 Subject: [PATCH 060/157] add tests --- .../regression/tests/test_bayes_mixed_glm.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 8f3a418b2a7..9814df492c4 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -286,10 +286,18 @@ def test_simple_poisson_vb(): -0.07233493, -0.06706505, -0.47159649, 1.12575122, -1.02442201], rtol=1e-4, atol=1e-4) + assert_allclose(rslt1.cov_params.flat[0:5], np.r_[ + 0.00737842, 0.0012137, -0.00033823, -0.00029894, 0.00072396], + rtol=1e-4, atol=1e-4) + assert_allclose(rslt2.params[0:5], np.r_[ -0.07088814, -0.06373107, -0.22770786, 1.12923746, -1.26161339], rtol=1e-4, atol=1e-4) + assert_allclose(rslt2.cov_params[0:5], np.r_[ + 0.00747782, 0.0092554, 0.04508904, 0.02934488, 0.20312746], + rtol=1e-4, atol=1e-4) + def test_crossed_logit_vb(): @@ -311,10 +319,18 @@ def test_crossed_logit_vb(): -9.64030461e-03, 2.32701078e-03], rtol=1e-4, atol=1e-4) + assert_allclose(rslt1.cov_params.flat[0:5], np.r_[ + 0.03937444, 0.00218164, 0.00599386, 0.00039312, 0.00017214], + rtol=1e-4, atol=1e-4) + assert_allclose(rslt2.params[0:5], np.r_[ -0.70834417, -0.3571011, 0.19126823, -0.36074489, 0.058976], rtol=1e-4, atol=1e-4) + assert_allclose(rslt2.cov_params[0:5], np.r_[ + 0.05212492, 0.04729656, 0.03916944, 0.25921842, 0.25782576], 
+ rtol=1e-4, atol=1e-4) + def test_crossed_logit_vb_formula(): From fb9363cf65310e9ac78c4e61a377e3af038cb1c5 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 12 Jan 2018 22:28:56 -0500 Subject: [PATCH 061/157] continue working on tests --- .../regression/tests/test_bayes_mixed_glm.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 9814df492c4..88a1c2855f5 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -2,6 +2,8 @@ from statsmodels.regression.bayes_mixed_glm import ( BinomialBayesMixedGLM, PoissonBayesMixedGLM) import pandas as pd +from distutils.version import LooseVersion +import scipy from scipy import sparse from numpy.testing import assert_allclose from scipy.optimize import approx_fprime @@ -177,9 +179,10 @@ def test_logit_map_crossed_formula(): rslt.summary() - r = rslt.random_effects(0) - assert_allclose(r.iloc[0, :].values, - np.r_[-0.02004904, 0.10013856], atol=1e-4) + if LooseVersion(scipy.__version__) >= LooseVersion("0.19.0"): + r = rslt.random_effects(0) + assert_allclose(r.iloc[0, :].values, + np.r_[-0.02004904, 0.10013856], atol=1e-4) def test_elbo_grad(): @@ -286,6 +289,9 @@ def test_simple_poisson_vb(): -0.07233493, -0.06706505, -0.47159649, 1.12575122, -1.02442201], rtol=1e-4, atol=1e-4) + if LooseVersion(scipy.__version__) < LooseVersion("0.19.0"): + return + assert_allclose(rslt1.cov_params.flat[0:5], np.r_[ 0.00737842, 0.0012137, -0.00033823, -0.00029894, 0.00072396], rtol=1e-4, atol=1e-4) @@ -319,6 +325,9 @@ def test_crossed_logit_vb(): -9.64030461e-03, 2.32701078e-03], rtol=1e-4, atol=1e-4) + if LooseVersion(scipy.__version__) < LooseVersion("0.19.0"): + return + assert_allclose(rslt1.cov_params.flat[0:5], np.r_[ 0.03937444, 0.00218164, 0.00599386, 0.00039312, 0.00017214], rtol=1e-4, atol=1e-4) From 361d27422f75bd68e90b591a0a8837e8442ea561 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 12 Jan 2018 22:59:13 -0500 Subject: [PATCH 062/157] use numdiff for hessian --- statsmodels/regression/bayes_mixed_glm.py | 6 +++++- .../regression/tests/test_bayes_mixed_glm.py | 20 ++++++------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index e04eb1e016e..710908291e1 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -406,7 +406,11 @@ def grad(params): np.sqrt(np.sum(r.jac**2))) warnings.warn(msg) - return BayesMixedGLMResults(self, r.x, r.hess_inv, optim_retvals=r) + from statsmodels.tools.numdiff import approx_fprime + hess = approx_fprime(r.x, grad) + hess_inv = np.linalg.inv(hess) + + return BayesMixedGLMResults(self, r.x, hess_inv, optim_retvals=r) class _VariationalBayesMixedGLM(object): diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/regression/tests/test_bayes_mixed_glm.py index 88a1c2855f5..df9b5f731c3 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/regression/tests/test_bayes_mixed_glm.py @@ -2,8 +2,6 @@ from statsmodels.regression.bayes_mixed_glm import ( BinomialBayesMixedGLM, PoissonBayesMixedGLM) import pandas as pd -from distutils.version import LooseVersion -import scipy from scipy import sparse from numpy.testing import assert_allclose from scipy.optimize import approx_fprime @@ -179,10 +177,9 
@@ def test_logit_map_crossed_formula(): rslt.summary() - if LooseVersion(scipy.__version__) >= LooseVersion("0.19.0"): - r = rslt.random_effects(0) - assert_allclose(r.iloc[0, :].values, - np.r_[-0.02004904, 0.10013856], atol=1e-4) + r = rslt.random_effects(0) + assert_allclose(r.iloc[0, :].values, + np.r_[-0.02004904, 0.094014], atol=1e-4) def test_elbo_grad(): @@ -289,11 +286,8 @@ def test_simple_poisson_vb(): -0.07233493, -0.06706505, -0.47159649, 1.12575122, -1.02442201], rtol=1e-4, atol=1e-4) - if LooseVersion(scipy.__version__) < LooseVersion("0.19.0"): - return - assert_allclose(rslt1.cov_params.flat[0:5], np.r_[ - 0.00737842, 0.0012137, -0.00033823, -0.00029894, 0.00072396], + 0.00790914, 0.00080666, -0.00050719, 0.00022648, 0.00046235], rtol=1e-4, atol=1e-4) assert_allclose(rslt2.params[0:5], np.r_[ @@ -325,11 +319,9 @@ def test_crossed_logit_vb(): -9.64030461e-03, 2.32701078e-03], rtol=1e-4, atol=1e-4) - if LooseVersion(scipy.__version__) < LooseVersion("0.19.0"): - return - assert_allclose(rslt1.cov_params.flat[0:5], np.r_[ - 0.03937444, 0.00218164, 0.00599386, 0.00039312, 0.00017214], + 4.12927123e-02, -2.04448923e-04, 4.64829219e-05, + 1.20377543e-04, -1.45003234e-04], rtol=1e-4, atol=1e-4) assert_allclose(rslt2.params[0:5], np.r_[ From e6dd95f164ba9739d5a163eb9004a1bfb4cc5cc9 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 14 Jan 2018 11:55:05 -0500 Subject: [PATCH 063/157] work on docstrings --- statsmodels/regression/bayes_mixed_glm.py | 79 ++++++++++++++++------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 710908291e1..5271b43667a 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -11,16 +11,33 @@ Random effects are required to be independent in this implementation. The `exog_vc` matrix is the design matrix for the random effects. -Every column of `exog_vc` corresponds to an independent realizattion -of a random effect. These random effects have mean zero and an -unknown standard deviation. The standard deviation parameters are -constrained, so that a subset of the columns of `exog_vc` will have a -common variance. These subsets are specified through the parameer -`ident`. +Every column of `exog_vc` corresponds to an independent realization of +a random effect. These random effects have mean zero and an unknown +standard deviation. The standard deviation parameters are constrained +to be equal within subsets of the columns. These subsets are specified +through the parameer `ident` when not using formulas. When formulas +are used, the columns of `exog_vc` derived from a common formula are +constrained to have the same standard deviation. In many applications, `exog_vc` will be sparse. A sparse matrix may be passed when constructing a model class. If a dense matrix is -passed, it will be converted internally to a sparse matrix. +passed, it will be converted internally to a sparse matrix. There +currently is no way to avoid creating a temporary dense version of +`exog_vc` when using formulas. + +Model and parameterization +-------------------------- +The joint density of data and parameters factors as: + + p(y | vc, fep) p(vc | vcp) p(vcp) p(fe) + +The terms p(vcp) and p(fe) are prior distributions that are taken to +be Gaussian (the vcp parameters are log standard deviations so the +variance parameters have log-normal distributions). 
The random +effects distribution p(vc | vcp) is independent Gaussian (random +effect realizations are independent within and between values of the +`ident` array). The model p(y | vc, fep) is based on the specific GLM +being fit. """ from __future__ import division @@ -51,8 +68,8 @@ The class implements the Laplace approximation to the posterior distribution (`fit_map`) and a variational Bayes approximation to - the posterior (`fit_vb`). See the fit method docstrings for more - information about the fitting approaches. + the posterior (`fit_vb`). See the two fit method docstrings for + more information about the fitting approaches. Parameters ---------- @@ -66,8 +83,8 @@ scipy.sparse array may be provided, or else the passed array will be converted to sparse internally. ident : array-like - Array of labels showing which random terms have a common - variance. + Array of labels showing which random terms (columns of + `exog_vc`) have a common variance. vc_p : float Prior standard deviation for variance component parameters (the prior standard deviation of log(s) is vc_p, where s is @@ -77,11 +94,14 @@ family : statsmodels.genmod.families instance The GLM family. fep_names : list of strings - The names of the fixed effects parameters (corresponding - to columns of exog_fe). + The names of the fixed effects parameters (corresponding to + columns of exog_fe). If None, default names are constructed. vcp_names : list of strings The names of the variance component parameters (corresponding - to distinct labels in ident). + to distinct labels in ident). If None, default names are + constructed. + vc_names : list of strings + The nmes of the random effect realizations. Returns ------- @@ -102,7 +122,9 @@ linear predictors. The elements of `ident` determine the distinct random effect variance parameters. Two random effect realizations that have the same value in `ident` are constrained to have the - same variance. + same variance. When fitting with a formula, `ident` is + constructed internally (each element of `vc_formulas` yields a + distinct label in `ident`). 
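    For example, in the crossed designs used by the tests, each of two
    random effects contributes nc columns to `exog_vc`, and a matching
    `ident` is built as (nc = 10 here only for illustration):

    >>> import numpy as np
    >>> nc = 10
    >>> ident = np.zeros(2 * nc, dtype=np.int)
    >>> ident[nc:] = 1

    so the first nc columns share one variance parameter and the last
    nc columns share another.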
The random effect standard deviation parameters (vcp) have log-normal prior distributions with mean 0 and standard deviation @@ -121,7 +143,7 @@ _logit_example = """ A binomial (logistic) random effects model with random intercepts - for villages and random slopes for year within villages: + for villages and random slopes for each year within each village: >>> data['year_cen'] = data['Year'] - data.Year.mean() >>> random = ['0 + C(Village)', '0 + C(Village)*year_cen'] @@ -132,7 +154,7 @@ _poisson_example = """ A Poisson random effects model with random intercepts for villages - and random slopes for year within villages: + and random slopes for each year within each village: >>> data['year_cen'] = data['Year'] - data.Year.mean() >>> random = ['0 + C(Village)', '0 + C(Village)*year_cen'] @@ -144,14 +166,10 @@ class _BayesMixedGLM(object): - def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, - fe_p=2, family=None, fep_names=None, + def __init__(self, endog, exog_fe, exog_vc, ident, family, + vcp_p=1, fe_p=2, fep_names=None, vcp_names=None, vc_names=None): - if family is None: - family = sm.families.Gaussian() - warnings.Warn("Defaulting to Gaussian family") - if len(ident) != exog_vc.shape[1]: msg = "len(ident) should match the number of columns of exog_vc" raise ValueError(msg) @@ -380,8 +398,8 @@ def from_formula(cls, formula, vc_formulas, data, family=None, exog_fe = np.asarray(exog_fe) exog_vc = sparse.csr_matrix(np.asarray(exog_vc)) - mod = _BayesMixedGLM(endog, exog_fe, exog_vc, ident, vcp_p, fe_p, - family=family, fep_names=fep_names, + mod = _BayesMixedGLM(endog, exog_fe, exog_vc, ident, family, + vcp_p, fe_p, fep_names=fep_names, vcp_names=vcp_names, vc_names=vc_names) return mod @@ -390,6 +408,17 @@ def fit_map(self, method="BFGS", minim_opts=None): """ Construct the Laplace approximation to the posterior distribution. + + Parameters + ---------- + method : string + Optimization method for finding the posterior mode. + minim_opts : dict-like + Options passed to scipy.minimize. + + Returns + ------- + BayesMixedGLMResults instance. 
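        A typical call, using the simulation helpers from the tests
        shown earlier (any comparable y, exog_fe, exog_vc, ident
        arrays would do):

        >>> y, exog_fe, exog_vc, ident = gen_simple_logit(10, 10, 2)
        >>> exog_vc = sparse.csr_matrix(exog_vc)
        >>> glmm = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5)
        >>> rslt = glmm.fit_map(minim_opts={"gtol": 1e-4})
        >>> rslt.summary()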
""" def fun(params): From 3d40ce2c88f5c3d9f44a47e74ef95ef792679621 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 14 Jan 2018 12:04:45 -0500 Subject: [PATCH 064/157] mored documentation work --- statsmodels/regression/bayes_mixed_glm.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/regression/bayes_mixed_glm.py index 5271b43667a..063745ea3be 100644 --- a/statsmodels/regression/bayes_mixed_glm.py +++ b/statsmodels/regression/bayes_mixed_glm.py @@ -139,6 +139,18 @@ Examples --------{example} + + + References + ---------- + Introduction to generalized linear mixed models: + https://stats.idre.ucla.edu/other/mult-pkg/introduction-to-generalized-linear-mixed-models + + SAS documentation: + https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_intromix_a0000000215.htm + + An assessment of estimation methods for generalized linear mixed models with binary outcomes + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3866838/ """ _logit_example = """ From 415d5f3da128b647fe7a429703748705bb7730a1 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 18 Jan 2018 10:40:49 -0500 Subject: [PATCH 065/157] change import path --- statsmodels/{regression => genmod}/bayes_mixed_glm.py | 0 .../{regression => genmod}/tests/test_bayes_mixed_glm.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename statsmodels/{regression => genmod}/bayes_mixed_glm.py (100%) rename statsmodels/{regression => genmod}/tests/test_bayes_mixed_glm.py (99%) diff --git a/statsmodels/regression/bayes_mixed_glm.py b/statsmodels/genmod/bayes_mixed_glm.py similarity index 100% rename from statsmodels/regression/bayes_mixed_glm.py rename to statsmodels/genmod/bayes_mixed_glm.py diff --git a/statsmodels/regression/tests/test_bayes_mixed_glm.py b/statsmodels/genmod/tests/test_bayes_mixed_glm.py similarity index 99% rename from statsmodels/regression/tests/test_bayes_mixed_glm.py rename to statsmodels/genmod/tests/test_bayes_mixed_glm.py index df9b5f731c3..ca9cdf0cd80 100644 --- a/statsmodels/regression/tests/test_bayes_mixed_glm.py +++ b/statsmodels/genmod/tests/test_bayes_mixed_glm.py @@ -1,5 +1,5 @@ import numpy as np -from statsmodels.regression.bayes_mixed_glm import ( +from statsmodels.genmod.bayes_mixed_glm import ( BinomialBayesMixedGLM, PoissonBayesMixedGLM) import pandas as pd from scipy import sparse From 542d9fc5c11b898403ad8e1f8fc91763222dea8b Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 21 Jan 2018 17:40:12 -0500 Subject: [PATCH 066/157] add classes to genmod api --- statsmodels/genmod/api.py | 1 + statsmodels/genmod/bayes_mixed_glm.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/statsmodels/genmod/api.py b/statsmodels/genmod/api.py index 63d7515b7db..483131672f4 100644 --- a/statsmodels/genmod/api.py +++ b/statsmodels/genmod/api.py @@ -1,4 +1,5 @@ from .generalized_linear_model import GLM from .generalized_estimating_equations import GEE, OrdinalGEE, NominalGEE +from .bayes_mixed_glm import BinomialBayesMixedGLM, PoissonBayesMixedGLM from . import families from . 
import cov_struct diff --git a/statsmodels/genmod/bayes_mixed_glm.py b/statsmodels/genmod/bayes_mixed_glm.py index 063745ea3be..6260d1d4966 100644 --- a/statsmodels/genmod/bayes_mixed_glm.py +++ b/statsmodels/genmod/bayes_mixed_glm.py @@ -45,7 +45,7 @@ from scipy.optimize import minimize from scipy import sparse from statsmodels.iolib import summary2 -import statsmodels.api as sm +from statsmodels.genmod import families import pandas as pd import warnings import patsy @@ -820,7 +820,7 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, super(BinomialBayesMixedGLM, self).__init__( endog=endog, exog_fe=exog_fe, exog_vc=exog_vc, ident=ident, vcp_p=vcp_p, fe_p=fe_p, - family=sm.families.Binomial(), + family=families.Binomial(), fep_names=fep_names, vcp_names=vcp_names, vc_names=vc_names) @@ -828,7 +828,7 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2, vcp_names=None, vc_names=None): - fam = sm.families.Binomial() + fam = families.Binomial() x = _BayesMixedGLM.from_formula( formula, vc_formulas, data, family=fam, vcp_p=vcp_p, fe_p=fe_p, vcp_names=vcp_names, vc_names=vc_names) @@ -887,14 +887,14 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, super(PoissonBayesMixedGLM, self).__init__( endog=endog, exog_fe=exog_fe, exog_vc=exog_vc, ident=ident, vcp_p=vcp_p, fe_p=fe_p, - family=sm.families.Poisson(), + family=families.Poisson(), fep_names=fep_names, vcp_names=vcp_names) @classmethod def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2, vcp_names=None, vc_names=None): - fam = sm.families.Poisson() + fam = families.Poisson() x = _BayesMixedGLM.from_formula( formula, vc_formulas, data, family=fam, vcp_p=vcp_p, fe_p=fe_p, vcp_names=vcp_names, vc_names=vc_names) From 293a20cf5da44b67396b29e2a07479015f1ea00a Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 21 Jan 2018 17:58:49 -0500 Subject: [PATCH 067/157] Docs, release notes, api, etc. --- docs/source/mixed_glm.rst | 33 ++++++++++++++++++++++++++++++ docs/source/release/version0.9.rst | 7 +++++++ 2 files changed, 40 insertions(+) create mode 100644 docs/source/mixed_glm.rst diff --git a/docs/source/mixed_glm.rst b/docs/source/mixed_glm.rst new file mode 100644 index 00000000000..3b6a5ffb35c --- /dev/null +++ b/docs/source/mixed_glm.rst @@ -0,0 +1,33 @@ +.. currentmodule:: statsmodels.genmod.bayes_mixed_glm + +Generalized Linear Mixed Effects Models +======================================= + +Generalized Linear Mixed Effects (GLIMMIX) models are generalized +linear models with random effects in the linear predictors. +Statsmodels currently supports estimation of binomial and Poisson +GLIMMIX models using two Bayesian methods: the Laplace approximation +to the posterior, and a variational Bayes approximation to the +posterior. Both methods provide point estimates (posterior means) and +assements of uncertainty (posterior standard deviation). + +The current implimentation only supports independent random effects. + +Technical Documentation +----------------------- + +Unlike Statsmodels mixed linear models, the GLIMMIX implementation is +not group-based. Groups are created by interacting all random effects +with a categorical variable. Note that this creates large, sparse +random effects design matrices `exog_vc`. Internally, `exog_vc` is +converted to a scipy sparse matrix. When passing the arguments +directly to the class initializer, a sparse matrix may be passed. 
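+For instance (the arrays y, exog_fe, exog_vc_dense and ident below
+are placeholders), the dense intermediate can be avoided by building
+the random effects design in sparse form before constructing the
+model:
+
+    >>> from scipy import sparse
+    >>> from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM
+    >>> exog_vc = sparse.csr_matrix(exog_vc_dense)
+    >>> model = BinomialBayesMixedGLM(y, exog_fe, exog_vc, ident, vcp_p=0.5)
+    >>> result = model.fit_vb()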
+When using formulas, a dense matrix is created then converted to
+sparse.  For very large problems, it may not be feasible to use
+formulas due to the size of this dense intermediate matrix.
+
+References
+^^^^^^^^^^
+
+Blei, Kucukelbir, McAuliffe (2017).  Variational Inference: A review
+for Statisticians https://arxiv.org/pdf/1601.00670.pdf
diff --git a/docs/source/release/version0.9.rst b/docs/source/release/version0.9.rst
index ae48165db80..e380d059f40 100644
--- a/docs/source/release/version0.9.rst
+++ b/docs/source/release/version0.9.rst
@@ -28,6 +28,13 @@ are mentioned in the docstrings.
 
 The following major new features appear in this version.
 
+Generalized linear mixed models
+-------------------------------
+
+Limited support for GLIMMIX models is now included in the genmod
+module.  Binomial and Poisson models with independent random effects
+can be fit using Bayesian methods (Laplace and mean field
+approximations to the posterior).
 
 Documentation
 -------------

From 51c56ae11d2f73e50e6b9c21bca346b0d4c076f6 Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Sun, 21 Jan 2018 18:01:59 -0500
Subject: [PATCH 068/157] spelling errors in docs

---
 docs/source/mixed_glm.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/mixed_glm.rst b/docs/source/mixed_glm.rst
index 3b6a5ffb35c..c8623da0b87 100644
--- a/docs/source/mixed_glm.rst
+++ b/docs/source/mixed_glm.rst
@@ -9,9 +9,9 @@ Statsmodels currently supports estimation of binomial and Poisson
 GLIMMIX models using two Bayesian methods: the Laplace approximation
 to the posterior, and a variational Bayes approximation to the
 posterior. Both methods provide point estimates (posterior means) and
-assements of uncertainty (posterior standard deviation).
+assessments of uncertainty (posterior standard deviation).
 
-The current implimentation only supports independent random effects.
+The current implementation only supports independent random effects.
 
 Technical Documentation
 -----------------------

From c231dba9709d970d43adc5198dd83051b57b1ebb Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Sat, 3 Feb 2018 16:39:00 -0500
Subject: [PATCH 069/157] refactor so that h() is easier to define

---
 statsmodels/genmod/bayes_mixed_glm.py | 58 ++++++++++++++-------------
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/statsmodels/genmod/bayes_mixed_glm.py b/statsmodels/genmod/bayes_mixed_glm.py
index 6260d1d4966..2068a19b0ff 100644
--- a/statsmodels/genmod/bayes_mixed_glm.py
+++ b/statsmodels/genmod/bayes_mixed_glm.py
@@ -149,7 +149,8 @@
     SAS documentation:
     https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_intromix_a0000000215.htm
 
-    An assessment of estimation methods for generalized linear mixed models with binary outcomes
+    An assessment of estimation methods for generalized linear mixed
+    models with binary outcomes
     https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3866838/
 """
 
@@ -488,16 +489,22 @@ def vb_elbo_base(self, h, tm, fep_mean, vcp_mean, vc_mean,
                       fep_sd, vcp_sd, vc_sd):
 
         Parameters
         ----------
-        h : function
-            Implements log p(y, fep, vcp, vc) in the form of a function of z,
-            where z is a standard normal random variable.
+        h : function mapping 1d vector to 1d vector
+            The contribution of the model to the ELBO function can be
+            expressed as y_i*lp_i + Eh_i(z), where y_i and lp_i are
+            the response and linear predictor for observation i, and z
+            is a standard normal random variable.  This formulation
+            can be achieved for any GLM with a canonical link
+            function.
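+
+            For example, for a binomial model with the logit link,
+            h(z) = -log(1 + exp(tm + sqrt(tv)*z)); this is the h
+            defined in BinomialBayesMixedGLM.vb_elbo below.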
""" # p(y | vc) contributions iv = 0 for w in glw: - iv += h(self.rng * w[1]) * w[0] - iv *= -self.rng + z = self.rng * w[1] + iv += w[0] * h(z) * np.exp(-z**2 / 2) + iv /= np.sqrt(2*np.pi) + iv *= self.rng iv += self.endog * tm iv = iv.sum() @@ -512,6 +519,11 @@ def vb_elbo_base(self, h, tm, fep_mean, vcp_mean, vc_mean, def vb_elbo_grad_base(self, h, tm, tv, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd): + """ + Return the gradient of the ELBO function. + + See vb_elbo_base for parameters. + """ fep_mean_grad = 0. fep_sd_grad = 0. @@ -522,20 +534,20 @@ def vb_elbo_grad_base(self, h, tm, tv, fep_mean, vcp_mean, vc_mean, # p(y | vc) contributions for w in glw: - x = self.rng * w[1] - u = h(x) + z = self.rng * w[1] + u = h(z) * np.exp(-z**2 / 2) / np.sqrt(2*np.pi) r = u / np.sqrt(tv) fep_mean_grad += w[0] * np.dot(u, self.exog_fe) vc_mean_grad += w[0] * self.exog_vc.transpose().dot(u) - fep_sd_grad += w[0] * x * np.dot(r, self.exog_fe**2 * fep_sd) + fep_sd_grad += w[0] * z * np.dot(r, self.exog_fe**2 * fep_sd) v = self.exog_vc2.multiply(vc_sd).transpose().dot(r) v = np.squeeze(np.asarray(v)) - vc_sd_grad += w[0] * x * v + vc_sd_grad += w[0] * z * v - fep_mean_grad *= -self.rng - vc_mean_grad *= -self.rng - fep_sd_grad *= -self.rng - vc_sd_grad *= -self.rng + fep_mean_grad *= self.rng + vc_mean_grad *= self.rng + fep_sd_grad *= self.rng + vc_sd_grad *= self.rng fep_mean_grad += np.dot(self.endog, self.exog_fe) vc_mean_grad += self.exog_vc.transpose().dot(self.endog) @@ -849,10 +861,7 @@ def vb_elbo(self, vb_mean, vb_sd): tm, tv = self._lp_stats(fep_mean, fep_sd, vc_mean, vc_sd) def h(z): - x = np.log(1 + np.exp(tm + np.sqrt(tv)*z)) - x *= np.exp(-z**2 / 2) - x /= np.sqrt(2*np.pi) - return x + return -np.log(1 + np.exp(tm + np.sqrt(tv)*z)) return self.vb_elbo_base( h, tm, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) @@ -869,9 +878,7 @@ def vb_elbo_grad(self, vb_mean, vb_sd): def h(z): u = tm + np.sqrt(tv)*z x = np.exp(u) / (1 + np.exp(u)) - x *= np.exp(-z**2 / 2) - x /= np.sqrt(2*np.pi) - return x + return -x return self.vb_elbo_grad_base( h, tm, tv, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) @@ -915,10 +922,7 @@ def vb_elbo(self, vb_mean, vb_sd): tm, tv = self._lp_stats(fep_mean, fep_sd, vc_mean, vc_sd) def h(z): - y = np.exp(tm + np.sqrt(tv)*z) - y *= np.exp(-z**2 / 2) - y /= np.sqrt(2*np.pi) - return y + return -np.exp(tm + np.sqrt(tv)*z) return self.vb_elbo_base( h, tm, fep_mean, vcp_mean, vc_mean, fep_sd, vcp_sd, vc_sd) @@ -933,9 +937,7 @@ def vb_elbo_grad(self, vb_mean, vb_sd): tm, tv = self._lp_stats(fep_mean, fep_sd, vc_mean, vc_sd) def h(z): - y = np.exp(tm + np.sqrt(tv)*z) - y *= np.exp(-z**2 / 2) - y /= np.sqrt(2*np.pi) + y = -np.exp(tm + np.sqrt(tv)*z) return y return self.vb_elbo_grad_base( From 90ad21c272c77f0f7f4a88401f508c82be882570 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 26 Feb 2018 08:46:33 -0500 Subject: [PATCH 070/157] change formula interface to resemble MixedLM --- statsmodels/genmod/bayes_mixed_glm.py | 38 +++++++++---------- .../genmod/tests/test_bayes_mixed_glm.py | 6 +-- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/statsmodels/genmod/bayes_mixed_glm.py b/statsmodels/genmod/bayes_mixed_glm.py index 2068a19b0ff..71f5b05f050 100644 --- a/statsmodels/genmod/bayes_mixed_glm.py +++ b/statsmodels/genmod/bayes_mixed_glm.py @@ -8,7 +8,8 @@ (maximum a posteriori), and variational Bayes (mean field approximation to the posterior). -Random effects are required to be independent in this implementation. 
+All realizations of random effects are required to be mutually
+independent in this implementation.
 
 The `exog_vc` matrix is the design matrix for the random effects.
 Every column of `exog_vc` corresponds to an independent realization of
@@ -354,7 +355,7 @@ def _get_start(self):
 
     @classmethod
     def from_formula(cls, formula, vc_formulas, data, family=None,
-                     vcp_p=1, fe_p=2, vcp_names=None, vc_names=None):
+                     vcp_p=1, fe_p=2, vc_names=None):
         """
         Fit a BayesMixedGLM using a formula.
 
@@ -364,12 +365,12 @@ def from_formula(cls, formula, vc_formulas, data, family=None,
         formula : string
             Formula for the endog and fixed effects terms (use ~ to
             separate dependent and independent expressions).
-        vc_formula : list of strings
-            Each element of the list is a one-sided formula that
-            creates one collection of random effects with a common
-            variance prameter.  If using a categorical expression to
-            produce variance components, note that generally `0 + ...`
-            should be used so that an intercept is not included.
+        vc_formulas : dictionary
+            vc_formulas[name] is a one-sided formula that creates one
+            collection of random effects with a common variance
+            parameter.  If using a categorical expression to produce
+            variance components, note that generally `0 + ...` should
+            be used so that an intercept is not included.
         data : data frame
             The data to which the formulas are applied.
         family : genmod.families instance
@@ -379,30 +380,26 @@ def from_formula(cls, formula, vc_formulas, data, family=None,
             deviations of the random effects.
         fe_p : float
             The prior standard deviation for the fixed effects parameters.
-        vcp_names : list
-            Names of variance component parameters
         vc_names : list
             Names of random effects realizations
         """
 
-        if not type(vc_formulas) is list:
-            vc_formulas = [vc_formulas]
 
         endog, exog_fe = patsy.dmatrices(formula, data,
                                          return_type='dataframe')
 
         ident = []
         exog_vc = []
-        for j, fml in enumerate(vc_formulas):
+        vcp_names = []
+        j = 0
+        for na, fml in vc_formulas.items():
             mat = patsy.dmatrix(fml, data, return_type='dataframe')
             exog_vc.append(mat)
+            vcp_names.append(na)
             ident.append(j * np.ones(mat.shape[1]))
+            j += 1
 
         exog_vc = pd.concat(exog_vc, axis=1)
         vc_names = exog_vc.columns.tolist()
 
-        if vcp_names is None:
-            vcp_names = ["VC_%d" % (k + 1) for k in range(len(vc_formulas))]
-
         ident = np.concatenate(ident)
 
         endog = np.squeeze(np.asarray(endog))
@@ -808,7 +805,8 @@ def random_effects(self, term=None):
         na = self.model.vc_names
 
         if term is not None:
-            ii = np.flatnonzero(self.model.ident == term)
+            termix = self.model.vcp_names.index(term)
+            ii = np.flatnonzero(self.model.ident == termix)
             z = z[ii]
             s = s[ii]
             na = [na[i] for i in ii]
@@ -838,12 +836,12 @@ def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1,
 
     @classmethod
     def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2,
-                     vcp_names=None, vc_names=None):
+                     vc_names=None):
 
         fam = families.Binomial()
         x = _BayesMixedGLM.from_formula(
            formula, vc_formulas, data, family=fam, vcp_p=vcp_p,
-            fe_p=fe_p, vcp_names=vcp_names, vc_names=vc_names)
+            fe_p=fe_p, vc_names=vc_names)
 
         return BinomialBayesMixedGLM(
             endog=x.endog, exog_fe=x.exog_fe, exog_vc=x.exog_vc,
diff --git a/statsmodels/genmod/tests/test_bayes_mixed_glm.py b/statsmodels/genmod/tests/test_bayes_mixed_glm.py
index ca9cdf0cd80..4edf42b518c 100644
--- a/statsmodels/genmod/tests/test_bayes_mixed_glm.py
+++ b/statsmodels/genmod/tests/test_bayes_mixed_glm.py
@@ -167,7 +167,7 @@ def test_logit_map_crossed_formula():
     data = gen_crossed_logit_pandas(10, 10, 1, 0.5)
 
     fml = "y ~ fe"
-    fml_vc = ["0 + C(a)", "0 + 
C(b)"] + fml_vc = {"a": "0 + C(a)", "b": "0 + C(b)"} glmm = BinomialBayesMixedGLM.from_formula( fml, fml_vc, data, vcp_p=0.5) rslt = glmm.fit_map() @@ -177,7 +177,7 @@ def test_logit_map_crossed_formula(): rslt.summary() - r = rslt.random_effects(0) + r = rslt.random_effects("a") assert_allclose(r.iloc[0, :].values, np.r_[-0.02004904, 0.094014], atol=1e-4) @@ -338,7 +338,7 @@ def test_crossed_logit_vb_formula(): data = gen_crossed_logit_pandas(10, 10, 1, 2) fml = "y ~ fe" - fml_vc = ["0 + C(a)", "0 + C(b)"] + fml_vc = {"a": "0 + C(a)", "b": "0 + C(b)"} glmm1 = BinomialBayesMixedGLM.from_formula( fml, fml_vc, data, vcp_p=0.5) rslt1 = glmm1.fit_vb() From 8c13ee650b50261770a817eeeed82f64ab4c97f7 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 5 Mar 2018 17:22:27 -0800 Subject: [PATCH 071/157] fix unclosed files --- statsmodels/datasets/template_data.py | 4 +-- statsmodels/discrete/tests/test_discrete.py | 3 +- statsmodels/regression/tests/test_lme.py | 8 +++--- statsmodels/stats/tests/test_diagnostic.py | 10 ++++--- statsmodels/tsa/tests/results/results_arma.py | 28 ++++++++++++------- statsmodels/tsa/tests/test_arima.py | 5 ++-- statsmodels/tsa/vector_ar/tests/test_coint.py | 5 ++-- 7 files changed, 38 insertions(+), 25 deletions(-) diff --git a/statsmodels/datasets/template_data.py b/statsmodels/datasets/template_data.py index 919fac6fcc3..ba489419a30 100644 --- a/statsmodels/datasets/template_data.py +++ b/statsmodels/datasets/template_data.py @@ -58,6 +58,6 @@ def load_pandas(): def _get_data(): filepath = dirname(abspath(__file__)) ##### EDIT THE FOLLOWING TO POINT TO DatasetName.csv ##### - data = np.recfromtxt(open(filepath + '/DatasetName.csv', 'rb'), - delimiter=",", names=True, dtype=float) + with open(filepath + '/DatasetName.csv', 'rb') as fd: + data = np.recfromtxt(fd, delimiter=",", names=True, dtype=float) return data diff --git a/statsmodels/discrete/tests/test_discrete.py b/statsmodels/discrete/tests/test_discrete.py index 7173f2bbf8d..b7074809d30 100644 --- a/statsmodels/discrete/tests/test_discrete.py +++ b/statsmodels/discrete/tests/test_discrete.py @@ -1508,7 +1508,8 @@ def test_issue_339(): smry = "\n".join(res1.summary().as_text().split('\n')[9:]) cur_dir = os.path.dirname(os.path.abspath(__file__)) test_case_file = os.path.join(cur_dir, 'results', 'mn_logit_summary.txt') - test_case = open(test_case_file, 'r').read() + with open(test_case_file, 'r') as fd: + test_case = fd.read() np.testing.assert_equal(smry, test_case[:-1]) # smoke test for summary2 res1.summary2() # see #3651 diff --git a/statsmodels/regression/tests/test_lme.py b/statsmodels/regression/tests/test_lme.py index 8649d1381d5..0728c4988ae 100644 --- a/statsmodels/regression/tests/test_lme.py +++ b/statsmodels/regression/tests/test_lme.py @@ -56,10 +56,10 @@ def __init__(self, meth, irfs, ds_ix): cur_dir = os.path.dirname(os.path.abspath(__file__)) rdir = os.path.join(cur_dir, 'results') fname = os.path.join(rdir, "lme%02d.csv" % ds_ix) - fid = open(fname) - rdr = csv.reader(fid) - header = next(rdr) - data = [[float(x) for x in line] for line in rdr] + with open(fname) as fid: + rdr = csv.reader(fid) + header = next(rdr) + data = [[float(x) for x in line] for line in rdr] data = np.asarray(data) # Split into exog, endog, etc. 
diff --git a/statsmodels/stats/tests/test_diagnostic.py b/statsmodels/stats/tests/test_diagnostic.py index e113d81a2ea..1160c0e24e6 100644 --- a/statsmodels/stats/tests/test_diagnostic.py +++ b/statsmodels/stats/tests/test_diagnostic.py @@ -637,8 +637,9 @@ def test_influence(self): #this test is slow infl = oi.OLSInfluence(res) - fp = open(os.path.join(cur_dir,"results/influence_lsdiag_R.json")) - lsdiag = json.load(fp) + path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json") + with open(path, 'r') as fp: + lsdiag = json.load(fp) #basic assert_almost_equal(np.array(lsdiag['cov.scaled']).reshape(3, 3), @@ -782,8 +783,9 @@ def test_influence_wrapped(): assert_(isinstance(df, DataFrame)) #this test is slow - fp = open(os.path.join(cur_dir,"results/influence_lsdiag_R.json")) - lsdiag = json.load(fp) + path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json") + with open(path, "r") as fp: + lsdiag = json.load(fp) c0, c1 = infl.cooks_distance #TODO: what's c1, it's pvalues? -ss diff --git a/statsmodels/tsa/tests/results/results_arma.py b/statsmodels/tsa/tests/results/results_arma.py index 67c87d9ab36..fe3234299d0 100644 --- a/statsmodels/tsa/tests/results/results_arma.py +++ b/statsmodels/tsa/tests/results/results_arma.py @@ -5,24 +5,32 @@ from numpy import genfromtxt current_path = os.path.dirname(os.path.abspath(__file__)) -yhat_mle = genfromtxt(open(current_path+"/yhat_exact_nc.csv", "rb"), delimiter=",", skip_header = 1, dtype=float) +with open(current_path+"/yhat_exact_nc.csv", "rb") as fd: + yhat_mle = genfromtxt(fd, delimiter=",", skip_header = 1, dtype=float) -yhat_css = genfromtxt(open(current_path+"/yhat_css_nc.csv", "rb"), delimiter=",", skip_header = 1, dtype=float) +with open(current_path+"/yhat_css_nc.csv", "rb") as fd: + yhat_css = genfromtxt(fd, delimiter=",", skip_header = 1, dtype=float) -yhatc_mle = genfromtxt(open(current_path+"/yhat_exact_c.csv", "rb"), delimiter=",", skip_header = 1, dtype=float) +with open(current_path+"/yhat_exact_c.csv", "rb") as fd: + yhatc_mle = genfromtxt(fd, delimiter=",", skip_header = 1, dtype=float) -yhatc_css = genfromtxt(open(current_path+"/yhat_css_c.csv", "rb"), delimiter=",", skip_header = 1, dtype=float) +with open(current_path+"/yhat_css_c.csv", "rb") as fd: + yhatc_css = genfromtxt(fd, delimiter=",", skip_header = 1, dtype=float) -resids_mle = genfromtxt(open(current_path+"/resids_exact_nc.csv", "rb"), delimiter=",", skip_header = 1, dtype=float) +with open(current_path+"/resids_exact_nc.csv", "rb") as fd: + resids_mle = genfromtxt(fd, delimiter=",", skip_header = 1, dtype=float) -resids_css = genfromtxt(open(current_path+"/resids_css_nc.csv", "rb"), delimiter=",", skip_header = 1, dtype=float) +with open(current_path+"/resids_css_nc.csv", "rb") as fd: + resids_css = genfromtxt(fd, delimiter=",", skip_header = 1, dtype=float) -residsc_mle = genfromtxt(open(current_path+"/resids_exact_c.csv", "rb"), delimiter=",", skip_header = 1, dtype=float) +with open(current_path+"/resids_exact_c.csv", "rb") as fd: + residsc_mle = genfromtxt(fd, delimiter=",", skip_header = 1, dtype=float) -residsc_css = genfromtxt(open(current_path+"/resids_css_c.csv", "rb"), delimiter=",", skip_header = 1, dtype=float) +with open(current_path+"/resids_css_c.csv", "rb") as fd: + residsc_css = genfromtxt(fd, delimiter=",", skip_header = 1, dtype=float) -forecast_results = genfromtxt(open(current_path+"/results_arma_forecasts.csv", - "rb"), names=True, delimiter=",", dtype=float) +with open(current_path+"/results_arma_forecasts.csv", "rb") as fd: + 
forecast_results = genfromtxt(fd, names=True, delimiter=",", dtype=float) class Y_arma11(object): def __init__(self, method="mle"): diff --git a/statsmodels/tsa/tests/test_arima.py b/statsmodels/tsa/tests/test_arima.py index cdb890c4733..8bb740a426d 100644 --- a/statsmodels/tsa/tests/test_arima.py +++ b/statsmodels/tsa/tests/test_arima.py @@ -37,8 +37,9 @@ DECIMAL_1 = 1 current_path = os.path.dirname(os.path.abspath(__file__)) -y_arma = np.genfromtxt(open(current_path + '/results/y_arma_data.csv', "rb"), - delimiter=",", skip_header=1, dtype=float) +ydata_path = os.path.join(current_path, 'results', 'y_arma_data.csv') +with open(ydata_path, "rb") as fd: + y_arma = np.genfromtxt(fd, delimiter=",", skip_header=1, dtype=float) cpi_dates = PeriodIndex(start='1959q1', end='2009q3', freq='Q') sun_dates = PeriodIndex(start='1700', end='2008', freq='A') diff --git a/statsmodels/tsa/vector_ar/tests/test_coint.py b/statsmodels/tsa/vector_ar/tests/test_coint.py index a808badfea9..e3001511174 100644 --- a/statsmodels/tsa/vector_ar/tests/test_coint.py +++ b/statsmodels/tsa/vector_ar/tests/test_coint.py @@ -13,8 +13,9 @@ from statsmodels.tsa.vector_ar.vecm import coint_johansen current_path = os.path.dirname(os.path.abspath(__file__)) -dta = np.genfromtxt(open(os.path.join(current_path, "Matlab_results", - "test_coint.csv"), "rb")) +dta_path = os.path.join(current_path, "Matlab_results", "test_coint.csv") +with open(dta_path, "rb") as fd: + dta = np.genfromtxt(fd) class CheckCointJoh(object): From 3290542397dfb60f0d1ffacf2d1b2b6423fb15c8 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 12 Mar 2018 20:56:26 -0400 Subject: [PATCH 072/157] remove old survival files --- statsmodels/sandbox/cox.py | 303 ------------------- statsmodels/sandbox/km_class.py | 240 --------------- statsmodels/sandbox/survival.py | 18 -- statsmodels/sandbox/survival2.py | 499 ------------------------------- 4 files changed, 1060 deletions(-) delete mode 100644 statsmodels/sandbox/cox.py delete mode 100644 statsmodels/sandbox/km_class.py delete mode 100644 statsmodels/sandbox/survival.py delete mode 100644 statsmodels/sandbox/survival2.py diff --git a/statsmodels/sandbox/cox.py b/statsmodels/sandbox/cox.py deleted file mode 100644 index 555d2a5c8b8..00000000000 --- a/statsmodels/sandbox/cox.py +++ /dev/null @@ -1,303 +0,0 @@ -'''Cox proportional hazards regression model. - - -some dimension problems -fixed import errors -currently produces parameter estimate but then raises exception for other results - - -finally, after running the script several times, I get a OSError with too many -open file handles - -updates and changes : - -as of 2010-05-15 -AttributeError: 'CoxPH' object has no attribute 'cachedir' -Traceback (most recent call last): - File "C:\...\scikits\statsmodels\sandbox\cox.py", line 244, in - res = c.newton([0.4]) -AttributeError: 'CoxPH' object has no attribute 'newton' - -replaced newton by call to new fit method for mle with bfgs - -feels very slow -need testcase before trying to fix - -''' -from __future__ import print_function -from statsmodels.compat.python import iterkeys, range -import shutil -import tempfile - -import numpy as np - - -from statsmodels.base import model -from . import survival - -class Discrete(object): - - """ - A simple little class for working with discrete random vectors. 
- - Note: assumes x is 2-d and observations are in 0 axis, variables in 1 axis - """ - - def __init__(self, x, w=None): - self.x = np.squeeze(x) - if self.x.shape == (): - self.x = np.array([self.x]) -## #JP added and removed again b/c still broadcast error -## if self.x.ndim == 1: -## self.x = self.x[:,None] - self.n = self.x.shape[0] - if w is None: - w = np.ones(self.n, np.float64) - else: - if w.shape[0] != self.n: - raise ValueError('incompatible shape for weights w') - if np.any(np.less(w, 0)): - raise ValueError('weights should be non-negative') - self.w = w*1.0 / w.sum() - - def mean(self, f=None): #JP: this is expectation, "expect" in mine - if f is None: - fx = self.x - else: - fx = f(self.x) - return (fx * self.w).sum() - - def cov(self): - mu = self.mean() #JP: call to method (confusing name) - dx = self.x - mu#np.multiply.outer(mu, self.x.shape[1]) - return np.dot(dx, np.transpose(dx)) -## if dx.ndim == 1: -## dx = dx[:,None] -## return np.dot(dx.T, dx) - -class Observation(survival.RightCensored): - - def __getitem__(self, item): - if self.namespace is not None: - return self.namespace[item] - else: - return getattr(self, item) - - def __init__(self, time, delta, namespace=None): - self.namespace = namespace - survival.RightCensored.__init__(self, time, delta) - - def __call__(self, formula, time=None, **extra): - return formula(namespace=self, time=time, **extra) - -class CoxPH(model.LikelihoodModel): - """Cox proportional hazards regression model.""" - - def __init__(self, subjects, formula, time_dependent=False): - self.subjects, self.formula = subjects, formula - self.time_dependent = time_dependent - self.initialize(self.subjects) - - def initialize(self, subjects): - print('called initialize') - self.failures = {} - for i in range(len(subjects)): - s = subjects[i] - if s.delta: - if s.time not in self.failures: - self.failures[s.time] = [i] - else: - self.failures[s.time].append(i) - - self.failure_times = list(iterkeys(self.failures)) - self.failure_times.sort() - - def cache(self): - if self.time_dependent: - self.cachedir = tempfile.mkdtemp() - - self.design = {} - self.risk = {} - first = True - - for t in iterkeys(self.failures): - if self.time_dependent: - d = np.array([s(self.formula, time=t) - for s in self.subjects]).astype(float)[:,None] - dshape = d.shape - dfile = file(tempfile.mkstemp(dir=self.cachedir)[1], 'w') - d.tofile(dfile) - dfile.close() - del(d) - self.design[t] = np.memmap(dfile.name, - dtype=np.dtype(float), - shape=dshape) - elif first: - d = np.array([s(self.formula, time=t) - for s in self.subjects]).astype(np.float64) - self.design[t] = d - else: - self.design[t] = d - self.risk[t] = np.compress([s.atrisk(t) for s in self.subjects], - np.arange(self.design[t].shape[0]),axis=-1) -# this raised exception on exit, - def __del__(self): - try: - shutil.rmtree(self.cachedir, ignore_errors=True) - except AttributeError: - print("AttributeError: 'CoxPH' object has no attribute 'cachedir'") - pass - - def loglike(self, b, ties='breslow'): - - logL = 0 - for t in iterkeys(self.failures): - fail = self.failures[t] - d = len(fail) - risk = self.risk[t] - Zb = np.dot(self.design[t], b) - - logL += Zb[fail].sum() - - if ties == 'breslow': - s = np.exp(Zb[risk]).sum() - logL -= np.log(np.exp(Zb[risk]).sum()) * d - elif ties == 'efron': - s = np.exp(Zb[risk]).sum() - r = np.exp(Zb[fail]).sum() - for j in range(d): - logL -= np.log(s - j * r / d) - elif ties == 'cox': - raise NotImplementedError('Cox tie breaking method not \ -implemented') - else: - raise 
NotImplementedError('tie breaking method not recognized') - return logL - - def score(self, b, ties='breslow'): - - score = 0 - for t in iterkeys(self.failures): - fail = self.failures[t] - d = len(fail) - risk = self.risk[t] - Z = self.design[t] - - score += Z[fail].sum() - - if ties == 'breslow': - w = np.exp(np.dot(Z, b)) - rv = Discrete(Z[risk], w=w[risk]) - score -= rv.mean() * d - elif ties == 'efron': - w = np.exp(np.dot(Z, b)) - score += Z[fail].sum() - for j in range(d): - efron_w = w - efron_w[fail] -= i * w[fail] / float(d) - rv = Discrete(Z[risk], w=efron_w[risk]) - score -= rv.mean() - elif ties == 'cox': - raise NotImplementedError('Cox tie breaking method not \ -implemented') - else: - raise NotImplementedError('tie breaking method not recognized') - return np.array([score]) - - def information(self, b, ties='breslow'): - - info = 0 #np.zeros((len(b),len(b))) #0 - score = 0 - for t in iterkeys(self.failures): - fail = self.failures[t] - d = len(fail) - risk = self.risk[t] - Z = self.design[t] - - if ties == 'breslow': - w = np.exp(np.dot(Z, b)) - rv = Discrete(Z[risk], w=w[risk]) - info += rv.cov() - elif ties == 'efron': - w = np.exp(np.dot(Z, b)) - score += Z[fail].sum() - for j in range(d): - efron_w = w - efron_w[fail] -= i * w[fail] / d - rv = Discrete(Z[risk], w=efron_w[risk]) - info += rv.cov() - elif ties == 'cox': - raise NotImplementedError('Cox tie breaking method not \ -implemented') - else: - raise NotImplementedError('tie breaking method not recognized') - return score - -if __name__ == '__main__': - import numpy.random as R - n = 100 - X = np.array([0]*n + [1]*n) - b = 0.4 - lin = 1 + b*X - Y = R.standard_exponential((2*n,)) / lin - delta = R.binomial(1, 0.9, size=(2*n,)) - - subjects = [Observation(Y[i], delta[i]) for i in range(2*n)] - for i in range(2*n): - subjects[i].X = X[i] - - import statsmodels.sandbox.formula as F - x = F.Quantitative('X') - f = F.Formula(x) - - c = CoxPH(subjects, f) - -# c.cache() - # temp file cleanup doesn't work on windows - c = CoxPH(subjects, f, time_dependent=True) - c.cache() #this creates tempfile cache, - # no tempfile cache is created in normal use of CoxPH - - - #res = c.newton([0.4]) #doesn't work anymore - res=c.fit([0.4],method="bfgs") - print(res.params) - print(dir(c)) - #print c.fit(Y) - #c.information(res.params) #raises exception - - ''' - Note: Replacement for c.newton - - >>> c.fit() - Traceback (most recent call last): - File "", line 1, in - c.fit() - File "C:\Josef\eclipsegworkspace\statsmodels-josef-experimental\scikits\statsmodels\model.py", line 132, in fit - start_params = [0]*self.exog.shape[1] # will fail for shape (K,) - AttributeError: 'CoxPH' object has no attribute 'exog' - >>> c.fit([0.4]) - Traceback (most recent call last): - File "", line 1, in - c.fit([0.4]) - File "C:\Josef\eclipsegworkspace\statsmodels-josef-experimental\scikits\statsmodels\model.py", line 148, in fit - H = self.hessian(history[-1]) - File "C:\Josef\eclipsegworkspace\statsmodels-josef-experimental\scikits\statsmodels\model.py", line 115, in hessian - raise NotImplementedError - NotImplementedError - >>> c.fit([0.4],method="bfgs") - Optimization terminated successfully. - Current function value: 802.354181 - Iterations: 3 - Function evaluations: 5 - Gradient evaluations: 5 - - >>> res=c.fit([0.4],method="bfgs") - Optimization terminated successfully. 
- Current function value: 802.354181 - Iterations: 3 - Function evaluations: 5 - Gradient evaluations: 5 - >>> res.params - array([ 0.34924421]) -''' diff --git a/statsmodels/sandbox/km_class.py b/statsmodels/sandbox/km_class.py deleted file mode 100644 index d63478e17e9..00000000000 --- a/statsmodels/sandbox/km_class.py +++ /dev/null @@ -1,240 +0,0 @@ -#a class for the Kaplan-Meier estimator -from statsmodels.compat.python import range -import numpy as np -from math import sqrt -import matplotlib.pyplot as plt - -class KAPLAN_MEIER(object): - def __init__(self, data, timesIn, groupIn, censoringIn): - raise RuntimeError('Newer version of Kaplan-Meier class available in survival2.py') - #store the inputs - self.data = data - self.timesIn = timesIn - self.groupIn = groupIn - self.censoringIn = censoringIn - - def fit(self): - #split the data into groups based on the predicting variable - #get a set of all the groups - groups = list(set(self.data[:,self.groupIn])) - #create an empty list to store the data for different groups - groupList = [] - #create an empty list for each group and add it to groups - for i in range(len(groups)): - groupList.append([]) - #iterate through all the groups in groups - for i in range(len(groups)): - #iterate though the rows of dataArray - for j in range(len(self.data)): - #test if this row has the correct group - if self.data[j,self.groupIn] == groups[i]: - #add the row to groupList - groupList[i].append(self.data[j]) - #create an empty list to store the times for each group - timeList = [] - #iterate through all the groups - for i in range(len(groupList)): - #create an empty list - times = [] - #iterate through all the rows of the group - for j in range(len(groupList[i])): - #get a list of all the times in the group - times.append(groupList[i][j][self.timesIn]) - #get a sorted set of the times and store it in timeList - times = list(sorted(set(times))) - timeList.append(times) - #get a list of the number at risk and events at each time - #create an empty list to store the results in - timeCounts = [] - #create an empty list to hold points for plotting - points = [] - #create a list for points where censoring occurs - censoredPoints = [] - #iterate trough each group - for i in range(len(groupList)): - #initialize a variable to estimate the survival function - survival = 1 - #initialize a variable to estimate the variance of - #the survival function - varSum = 0 - #initialize a counter for the number at risk - riskCounter = len(groupList[i]) - #create a list for the counts for this group - counts = [] - ##create a list for points to plot - x = [] - y = [] - #iterate through the list of times - for j in range(len(timeList[i])): - if j != 0: - if j == 1: - #add an indicator to tell if the time - #starts a new group - groupInd = 1 - #add (0,1) to the list of points - x.append(0) - y.append(1) - #add the point time to the right of that - x.append(timeList[i][j-1]) - y.append(1) - #add the point below that at survival - x.append(timeList[i][j-1]) - y.append(survival) - #add the survival to y - y.append(survival) - else: - groupInd = 0 - #add survival twice to y - y.append(survival) - y.append(survival) - #add the time twice to x - x.append(timeList[i][j-1]) - x.append(timeList[i][j-1]) - #add each censored time, number of censorings and - #its survival to censoredPoints - censoredPoints.append([timeList[i][j-1], - censoringNum,survival,groupInd]) - #add the count to the list - counts.append([timeList[i][j-1],riskCounter, - eventCounter,survival, - 
sqrt(((survival)**2)*varSum)]) - #increment the number at risk - riskCounter += -1*(riskChange) - #initialize a counter for the change in the number at risk - riskChange = 0 - #initialize a counter to zero - eventCounter = 0 - #intialize a counter to tell when censoring occurs - censoringCounter = 0 - censoringNum = 0 - #iterate through the observations in each group - for k in range(len(groupList[i])): - #check of the observation has the given time - if (groupList[i][k][self.timesIn]) == (timeList[i][j]): - #increment the number at risk counter - riskChange += 1 - #check if this is an event or censoring - if groupList[i][k][self.censoringIn] == 1: - #add 1 to the counter - eventCounter += 1 - else: - censoringNum += 1 - #check if there are any events at this time - if eventCounter != censoringCounter: - censoringCounter = eventCounter - #calculate the estimate of the survival function - survival *= ((float(riskCounter) - - eventCounter)/(riskCounter)) - try: - #calculate the estimate of the variance - varSum += (eventCounter)/((riskCounter) - *(float(riskCounter)- - eventCounter)) - except ZeroDivisionError: - varSum = 0 - #append the last row to counts - counts.append([timeList[i][len(timeList[i])-1], - riskCounter,eventCounter,survival, - sqrt(((survival)**2)*varSum)]) - #add the last time once to x - x.append(timeList[i][len(timeList[i])-1]) - x.append(timeList[i][len(timeList[i])-1]) - #add the last survival twice to y - y.append(survival) - #y.append(survival) - censoredPoints.append([timeList[i][len(timeList[i])-1], - censoringNum,survival,1]) - #add the list for the group to al ist for all the groups - timeCounts.append(np.array(counts)) - points.append([x,y]) - #returns a list of arrays, where each array has as it columns: the time, - #the number at risk, the number of events, the estimated value of the - #survival function at that time, and the estimated standard error at - #that time, in that order - self.results = timeCounts - self.points = points - self.censoredPoints = censoredPoints - - def plot(self): - x = [] - #iterate through the groups - for i in range(len(self.points)): - #plot x and y - plt.plot(np.array(self.points[i][0]),np.array(self.points[i][1])) - #create lists of all the x and y values - x += self.points[i][0] - for j in range(len(self.censoredPoints)): - #check if censoring is occuring - if (self.censoredPoints[j][1] != 0): - #if this is the first censored point - if (self.censoredPoints[j][3] == 1) and (j == 0): - #calculate a distance beyond 1 to place it - #so all the points will fit - dx = ((1./((self.censoredPoints[j][1])+1.)) - *(float(self.censoredPoints[j][0]))) - #iterate through all the censored points at this time - for k in range(self.censoredPoints[j][1]): - #plot a vertical line for censoring - plt.vlines((1+((k+1)*dx)), - self.censoredPoints[j][2]-0.03, - self.censoredPoints[j][2]+0.03) - #if this censored point starts a new group - elif ((self.censoredPoints[j][3] == 1) and - (self.censoredPoints[j-1][3] == 1)): - #calculate a distance beyond 1 to place it - #so all the points will fit - dx = ((1./((self.censoredPoints[j][1])+1.)) - *(float(self.censoredPoints[j][0]))) - #iterate through all the censored points at this time - for k in range(self.censoredPoints[j][1]): - #plot a vertical line for censoring - plt.vlines((1+((k+1)*dx)), - self.censoredPoints[j][2]-0.03, - self.censoredPoints[j][2]+0.03) - #if this is the last censored point - elif j == (len(self.censoredPoints) - 1): - #calculate a distance beyond the previous time - #so that all 
the points will fit - dx = ((1./((self.censoredPoints[j][1])+1.)) - *(float(self.censoredPoints[j][0]))) - #iterate through all the points at this time - for k in range(self.censoredPoints[j][1]): - #plot a vertical line for censoring - plt.vlines((self.censoredPoints[j-1][0]+((k+1)*dx)), - self.censoredPoints[j][2]-0.03, - self.censoredPoints[j][2]+0.03) - #if this is a point in the middle of the group - else: - #calcuate a distance beyond the current time - #to place the point, so they all fit - dx = ((1./((self.censoredPoints[j][1])+1.)) - *(float(self.censoredPoints[j+1][0]) - - self.censoredPoints[j][0])) - #iterate through all the points at this time - for k in range(self.censoredPoints[j][1]): - #plot a vetical line for censoring - plt.vlines((self.censoredPoints[j][0]+((k+1)*dx)), - self.censoredPoints[j][2]-0.03, - self.censoredPoints[j][2]+0.03) - #set the size of the plot so it extends to the max x and above 1 for y - plt.xlim((0,np.max(x))) - plt.ylim((0,1.05)) - #label the axes - plt.xlabel('time') - plt.ylabel('survival') - plt.show() - - def show_results(self): - #start a string that will be a table of the results - resultsString = '' - #iterate through all the groups - for i in range(len(self.results)): - #label the group and header - resultsString += ('Group {0}\n\n'.format(i) + - 'Time At Risk Events Survival Std. Err\n') - for j in self.results[i]: - #add the results to the string - resultsString += ( - '{0:<9d}{1:<12d}{2:<11d}{3:<13.4f}{4:<6.4f}\n'.format( - int(j[0]),int(j[1]),int(j[2]),j[3],j[4])) - print(resultsString) diff --git a/statsmodels/sandbox/survival.py b/statsmodels/sandbox/survival.py deleted file mode 100644 index 83d5ce0a948..00000000000 --- a/statsmodels/sandbox/survival.py +++ /dev/null @@ -1,18 +0,0 @@ -import numpy as np - -class SurvivalTime(object): - def __init__(self, time, delta): - self.time, self.delta = time, delta - - def atrisk(self, time): - raise NotImplementedError - -class RightCensored(SurvivalTime): - - def atrisk(self, time): - return np.less_equal.outer(time, self.time) - -class LeftCensored(SurvivalTime): - - def atrisk(self, time): - return np.greater_equal.outer(time, self.time) diff --git a/statsmodels/sandbox/survival2.py b/statsmodels/sandbox/survival2.py deleted file mode 100644 index bbba55503b0..00000000000 --- a/statsmodels/sandbox/survival2.py +++ /dev/null @@ -1,499 +0,0 @@ -#Kaplan-Meier Estimator - -import numpy as np -import numpy.linalg as la -import matplotlib.pyplot as plt -from scipy import stats -from statsmodels.iolib.table import SimpleTable - -class KaplanMeier(object): - - """ - KaplanMeier(...) - KaplanMeier(data, endog, exog=None, censoring=None) - - Create an object of class KaplanMeier for estimating - Kaplan-Meier survival curves. - - Parameters - ---------- - data: array_like - An array, with observations in each row, and - variables in the columns - - endog: index (starting at zero) of the column - containing the endogenous variable (time) - - exog: index of the column containing the exogenous - variable (must be catagorical). 
If exog = None, this - is equivalent to a single survival curve - - censoring: index of the column containing an indicator - of whether an observation is an event, or a censored - observation, with 0 for censored, and 1 for an event - - Attributes - ----------- - censorings: List of censorings associated with each unique - time, at each value of exog - - events: List of the number of events at each unique time - for each value of exog - - results: List of arrays containing estimates of the value - value of the survival function and its standard error - at each unique time, for each value of exog - - ts: List of unique times for each value of exog - - Methods - ------- - fit: Calcuate the Kaplan-Meier estimates of the survival - function and its standard error at each time, for each - value of exog - - plot: Plot the survival curves using matplotlib.plyplot - - summary: Display the results of fit in a table. Gives results - for all (including censored) times - - test_diff: Test for difference between survival curves - - Examples - -------- - >>> import statsmodels.api as sm - >>> import matplotlib.pyplot as plt - >>> import numpy as np - >>> from statsmodels.sandbox.survival2 import KaplanMeier - >>> dta = sm.datasets.strikes.load() - >>> dta = dta.values()[-1] - >>> dta[range(5),:] - array([[ 7.00000000e+00, 1.13800000e-02], - [ 9.00000000e+00, 1.13800000e-02], - [ 1.30000000e+01, 1.13800000e-02], - [ 1.40000000e+01, 1.13800000e-02], - [ 2.60000000e+01, 1.13800000e-02]]) - >>> km = KaplanMeier(dta,0) - >>> km.fit() - >>> km.plot() - - Doing - - >>> km.summary() - - will display a table of the estimated survival and standard errors - for each time. The first few lines are - - Kaplan-Meier Curve - ===================================== - Time Survival Std. Err - ------------------------------------- - 1.0 0.983870967742 0.0159984306572 - 2.0 0.91935483871 0.0345807888235 - 3.0 0.854838709677 0.0447374942184 - 4.0 0.838709677419 0.0467104592871 - 5.0 0.822580645161 0.0485169952543 - - Doing - - >>> plt.show() - - will plot the survival curve - - Mutliple survival curves: - - >>> km2 = KaplanMeier(dta,0,exog=1) - >>> km2.fit() - - km2 will estimate a survival curve for each value of industrial - production, the column of dta with index one (1). 
- - With censoring: - - >>> censoring = np.ones_like(dta[:,0]) - >>> censoring[dta[:,0] > 80] = 0 - >>> dta = np.c_[dta,censoring] - >>> dta[range(5),:] - array([[ 7.00000000e+00, 1.13800000e-02, 1.00000000e+00], - [ 9.00000000e+00, 1.13800000e-02, 1.00000000e+00], - [ 1.30000000e+01, 1.13800000e-02, 1.00000000e+00], - [ 1.40000000e+01, 1.13800000e-02, 1.00000000e+00], - [ 2.60000000e+01, 1.13800000e-02, 1.00000000e+00]]) - - >>> km3 = KaplanMeier(dta,0,exog=1,censoring=2) - >>> km3.fit() - - Test for difference of survival curves - - >>> log_rank = km3.test_diff([0.0645,-0.03957]) - - The zeroth element of log_rank is the chi-square test statistic - for the difference between the survival curves for exog = 0.0645 - and exog = -0.03957, the index one element is the degrees of freedom for - the test, and the index two element is the p-value for the test - - Groups with nan names - - >>> groups = np.ones_like(dta[:,1]) - >>> groups = groups.astype('S4') - >>> groups[dta[:,1] > 0] = 'high' - >>> groups[dta[:,1] <= 0] = 'low' - >>> dta = dta.astype('S4') - >>> dta[:,1] = groups - >>> dta[range(5),:] - array([['7.0', 'high', '1.0'], - ['9.0', 'high', '1.0'], - ['13.0', 'high', '1.0'], - ['14.0', 'high', '1.0'], - ['26.0', 'high', '1.0']], - dtype='|S4') - >>> km4 = KaplanMeier(dta,0,exog=1,censoring=2) - >>> km4.fit() - - """ - - def __init__(self, data, endog, exog=None, censoring=None): - self.exog = exog - self.censoring = censoring - cols = [endog] - self.endog = 0 - if exog != None: - cols.append(exog) - self.exog = 1 - if censoring != None: - cols.append(censoring) - if exog != None: - self.censoring = 2 - else: - self.censoring = 1 - data = data[:,cols] - if data.dtype == float or data.dtype == int: - self.data = data[~np.isnan(data).any(1)] - else: - t = (data[:,self.endog]).astype(float) - if exog != None: - evec = data[:,self.exog] - evec = evec[~np.isnan(t)] - if censoring != None: - cvec = (data[:,self.censoring]).astype(float) - cvec = cvec[~np.isnan(t)] - t = t[~np.isnan(t)] - if censoring != None: - t = t[~np.isnan(cvec)] - if exog != None: - evec = evec[~np.isnan(cvec)] - cvec = cvec[~np.isnan(cvec)] - cols = [t] - if exog != None: - cols.append(evec) - if censoring != None: - cols.append(cvec) - data = (np.array(cols)).transpose() - self.data = data - - def fit(self): - """ - Calculate the Kaplan-Meier estimator of the survival function - """ - self.results = [] - self.ts = [] - self.censorings = [] - self.event = [] - if self.exog == None: - self.fitting_proc(self.data) - else: - groups = np.unique(self.data[:,self.exog]) - self.groups = groups - for g in groups: - group = self.data[self.data[:,self.exog] == g] - self.fitting_proc(group) - - def plot(self): - """ - Plot the estimated survival curves. 
After using this method - do - - plt.show() - - to display the plot - """ - plt.figure() - if self.exog == None: - self.plotting_proc(0) - else: - for g in range(len(self.groups)): - self.plotting_proc(g) - plt.ylim(ymax=1.05) - plt.ylabel('Survival') - plt.xlabel('Time') - - def summary(self): - """ - Print a set of tables containing the estimates of the survival - function, and its standard errors - """ - if self.exog == None: - self.summary_proc(0) - else: - for g in range(len(self.groups)): - self.summary_proc(g) - - def fitting_proc(self, group): - """ - For internal use - """ - t = ((group[:,self.endog]).astype(float)).astype(int) - if self.censoring == None: - events = np.bincount(t) - t = np.unique(t) - events = events[:,list(t)] - events = events.astype(float) - eventsSum = np.cumsum(events) - eventsSum = np.r_[0,eventsSum] - n = len(group) - eventsSum[:-1] - else: - censoring = ((group[:,self.censoring]).astype(float)).astype(int) - reverseCensoring = -1*(censoring - 1) - events = np.bincount(t,censoring) - censored = np.bincount(t,reverseCensoring) - t = np.unique(t) - censored = censored[:,list(t)] - censored = censored.astype(float) - censoredSum = np.cumsum(censored) - censoredSum = np.r_[0,censoredSum] - events = events[:,list(t)] - events = events.astype(float) - eventsSum = np.cumsum(events) - eventsSum = np.r_[0,eventsSum] - n = len(group) - eventsSum[:-1] - censoredSum[:-1] - (self.censorings).append(censored) - survival = np.cumprod(1-events/n) - var = ((survival*survival) * - np.cumsum(events/(n*(n-events)))) - se = np.sqrt(var) - (self.results).append(np.array([survival,se])) - (self.ts).append(t) - (self.event).append(events) - - def plotting_proc(self, g): - """ - For internal use - """ - survival = self.results[g][0] - t = self.ts[g] - e = (self.event)[g] - if self.censoring != None: - c = self.censorings[g] - csurvival = survival[c != 0] - ct = t[c != 0] - if len(ct) != 0: - plt.vlines(ct,csurvival+0.02,csurvival-0.02) - x = np.repeat(t[e != 0], 2) - y = np.repeat(survival[e != 0], 2) - if self.ts[g][-1] in t[e != 0]: - x = np.r_[0,x] - y = np.r_[1,1,y[:-1]] - else: - x = np.r_[0,x,self.ts[g][-1]] - y = np.r_[1,1,y] - plt.plot(x,y) - - def summary_proc(self, g): - """ - For internal use - """ - if self.exog != None: - myTitle = ('exog = ' + str(self.groups[g]) + '\n') - else: - myTitle = "Kaplan-Meier Curve" - table = np.transpose(self.results[g]) - table = np.c_[np.transpose(self.ts[g]),table] - table = SimpleTable(table, headers=['Time','Survival','Std. Err'], - title = myTitle) - print(table) - - def test_diff(self, groups, rho=None, weight=None): - - """ - test_diff(groups, rho=0) - - Test for difference between survival curves - - Parameters - ---------- - groups: A list of the values for exog to test for difference. - tests the null hypothesis that the survival curves for all - values of exog in groups are equal - - rho: compute the test statistic with weight S(t)^rho, where - S(t) is the pooled estimate for the Kaplan-Meier survival function. - If rho = 0, this is the logrank test, if rho = 0, this is the - Peto and Peto modification to the Gehan-Wilcoxon test. 
- - weight: User specified function that accepts as its sole arguement - an array of times, and returns an array of weights for each time - to be used in the test - - Returns - ------- - An array whose zeroth element is the chi-square test statistic for - the global null hypothesis, that all survival curves are equal, - the index one element is degrees of freedom for the test, and the - index two element is the p-value for the test. - - Examples - -------- - - >>> import statsmodels.api as sm - >>> import matplotlib.pyplot as plt - >>> import numpy as np - >>> from statsmodels.sandbox.survival2 import KaplanMeier - >>> dta = sm.datasets.strikes.load() - >>> dta = dta.values()[-1] - >>> censoring = np.ones_like(dta[:,0]) - >>> censoring[dta[:,0] > 80] = 0 - >>> dta = np.c_[dta,censoring] - >>> km = KaplanMeier(dta,0,exog=1,censoring=2) - >>> km.fit() - - Test for difference of survival curves - - >>> log_rank = km3.test_diff([0.0645,-0.03957]) - - The zeroth element of log_rank is the chi-square test statistic - for the difference between the survival curves using the log rank test - for exog = 0.0645 and exog = -0.03957, the index one element - is the degrees of freedom for the test, and the index two element - is the p-value for the test - - >>> wilcoxon = km.test_diff([0.0645,-0.03957], rho=1) - - wilcoxon is the equivalent information as log_rank, but for the - Peto and Peto modification to the Gehan-Wilcoxon test. - - User specified weight functions - - >>> log_rank = km3.test_diff([0.0645,-0.03957], weight=np.ones_like) - - This is equivalent to the log rank test - - More than two groups - - >>> log_rank = km.test_diff([0.0645,-0.03957,0.01138]) - - The test can be performed with arbitrarily many groups, so long as - they are all in the column exog - """ - groups = np.asarray(groups) - if self.exog == None: - raise ValueError("Need an exogenous variable for logrank test") - - elif (np.in1d(groups,self.groups)).all(): - data = self.data[np.in1d(self.data[:,self.exog],groups)] - t = ((data[:,self.endog]).astype(float)).astype(int) - tind = np.unique(t) - NK = [] - N = [] - D = [] - Z = [] - if rho != None and weight != None: - raise ValueError("Must use either rho or weights, not both") - - elif rho != None: - s = KaplanMeier(data,self.endog,censoring=self.censoring) - s.fit() - s = (s.results[0][0]) ** (rho) - s = np.r_[1,s[:-1]] - - elif weight != None: - s = weight(tind) - - else: - s = np.ones_like(tind) - - if self.censoring == None: - for g in groups: - dk = np.bincount((t[data[:,self.exog] == g])) - d = np.bincount(t) - if np.max(tind) != len(dk): - dif = np.max(tind) - len(dk) + 1 - dk = np.r_[dk,[0]*dif] - dk = dk[:,list(tind)] - d = d[:,list(tind)] - dk = dk.astype(float) - d = d.astype(float) - dkSum = np.cumsum(dk) - dSum = np.cumsum(d) - dkSum = np.r_[0,dkSum] - dSum = np.r_[0,dSum] - nk = len(data[data[:,self.exog] == g]) - dkSum[:-1] - n = len(data) - dSum[:-1] - d = d[n>1] - dk = dk[n>1] - nk = nk[n>1] - n = n[n>1] - s = s[n>1] - ek = (nk * d)/(n) - Z.append(np.sum(s * (dk - ek))) - NK.append(nk) - N.append(n) - D.append(d) - else: - for g in groups: - censoring = ((data[:,self.censoring]).astype(float)).astype(int) - reverseCensoring = -1*(censoring - 1) - censored = np.bincount(t,reverseCensoring) - ck = np.bincount((t[data[:,self.exog] == g]), - reverseCensoring[data[:,self.exog] == g]) - dk = np.bincount((t[data[:,self.exog] == g]), - censoring[data[:,self.exog] == g]) - d = np.bincount(t,censoring) - if np.max(tind) != len(dk): - dif = np.max(tind) - len(dk) + 1 - 
dk = np.r_[dk,[0]*dif] - ck = np.r_[ck,[0]*dif] - dk = dk[:,list(tind)] - ck = ck[:,list(tind)] - d = d[:,list(tind)] - dk = dk.astype(float) - d = d.astype(float) - ck = ck.astype(float) - dkSum = np.cumsum(dk) - dSum = np.cumsum(d) - ck = np.cumsum(ck) - ck = np.r_[0,ck] - dkSum = np.r_[0,dkSum] - dSum = np.r_[0,dSum] - censored = censored[:,list(tind)] - censored = censored.astype(float) - censoredSum = np.cumsum(censored) - censoredSum = np.r_[0,censoredSum] - nk = (len(data[data[:,self.exog] == g]) - dkSum[:-1] - - ck[:-1]) - n = len(data) - dSum[:-1] - censoredSum[:-1] - d = d[n>1] - dk = dk[n>1] - nk = nk[n>1] - n = n[n>1] - s = s[n>1] - ek = (nk * d)/(n) - Z.append(np.sum(s * (dk - ek))) - NK.append(nk) - N.append(n) - D.append(d) - Z = np.array(Z) - N = np.array(N) - D = np.array(D) - NK = np.array(NK) - sigma = -1 * np.dot((NK/N) * ((N - D)/(N - 1)) * D - * np.array([(s ** 2)]*len(D)) - ,np.transpose(NK/N)) - np.fill_diagonal(sigma, np.diagonal(np.dot((NK/N) - * ((N - D)/(N - 1)) * D - * np.array([(s ** 2)]*len(D)) - ,np.transpose(1 - (NK/N))))) - chisq = np.dot(np.transpose(Z),np.dot(la.pinv(sigma), Z)) - df = len(groups) - 1 - return np.array([chisq, df, stats.chi2.sf(chisq,df)]) - else: - raise ValueError("groups must be in column exog") From 3cb06e105d98aad43e1e5b85a62189595c9cd83a Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 6 Mar 2018 09:58:16 -0500 Subject: [PATCH 073/157] New tests for mixedlm --- statsmodels/regression/tests/test_lme.py | 56 ++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/statsmodels/regression/tests/test_lme.py b/statsmodels/regression/tests/test_lme.py index 0728c4988ae..94559b8f860 100644 --- a/statsmodels/regression/tests/test_lme.py +++ b/statsmodels/regression/tests/test_lme.py @@ -363,6 +363,62 @@ def test_sparse(self): assert_allclose(result.params, result2.params) assert_allclose(result.bse, result2.bse) + def test_dietox(self): + # dietox data from geepack + # + # Fit in R using + # + # library(geepack) + # rm = lmer(Weight ~ Time + (1 | Pig), data=dietox) + # rm = lmer(Weight ~ Time + (1 | Pig), REML=FALSE, data=dietox) + + cur_dir = os.path.dirname(os.path.abspath(__file__)) + rdir = os.path.join(cur_dir, 'results') + fname = os.path.join(rdir, 'dietox.csv') + + # REML + data = pd.read_csv(fname) + model = MixedLM.from_formula("Weight ~ Time", groups="Pig", + data = data) + result = model.fit() + + # fixef(rm) + assert_allclose(result.fe_params, np.r_[15.723523, 6.942505], rtol=1e-5) + + # sqrt(diag(vcov(rm))) + assert_allclose(result.bse[0:2], np.r_[0.78805374, 0.03338727], rtol=1e-5) + + # attr(VarCorr(rm), "sc")^2 + assert_allclose(result.scale, 11.36692, rtol=1e-5) + + # VarCorr(rm)[[1]][[1]] + assert_allclose(result.cov_re, 40.39395, rtol=1e-5) + + # logLik(rm) + assert_allclose(model.loglike(result.params_object), -2404.775, rtol=1e-5) + + # ML + data = pd.read_csv(fname) + model = MixedLM.from_formula("Weight ~ Time", groups="Pig", + data = data) + result = model.fit(reml=False) + + # fixef(rm) + assert_allclose(result.fe_params, np.r_[15.723517, 6.942506], rtol=1e-5) + + # sqrt(diag(vcov(rm))) + assert_allclose(result.bse[0:2], np.r_[0.7829397, 0.0333661], rtol=1e-5) + + # attr(VarCorr(rm), "sc")^2 + assert_allclose(result.scale, 11.35251, rtol=1e-5) + + # VarCorr(rm)[[1]][[1]] + assert_allclose(result.cov_re, 39.82097, rtol=1e-5) + + # logLik(rm) + assert_allclose(model.loglike(result.params_object), -2402.932, rtol=1e-5) + + def test_pastes_vcomp(self): # pastes data from lme4 # From 
71245ed3f16a3eb3c9ec9056d5b83eb5eea2c184 Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Tue, 6 Mar 2018 10:23:02 -0500
Subject: [PATCH 074/157] test dietox with random slope model

---
 statsmodels/regression/tests/test_lme.py | 65 ++++++++++++++++++++++--
 1 file changed, 62 insertions(+), 3 deletions(-)

diff --git a/statsmodels/regression/tests/test_lme.py b/statsmodels/regression/tests/test_lme.py
index 94559b8f860..201e81e88fb 100644
--- a/statsmodels/regression/tests/test_lme.py
+++ b/statsmodels/regression/tests/test_lme.py
@@ -364,7 +364,7 @@ def test_sparse(self):
         assert_allclose(result.bse, result2.bse)
 
     def test_dietox(self):
-        # dietox data from geepack
+        # dietox data from geepack using random intercepts
         #
         # Fit in R using
         #
@@ -379,7 +379,7 @@ def test_dietox(self):
         # REML
         data = pd.read_csv(fname)
         model = MixedLM.from_formula("Weight ~ Time", groups="Pig",
-                                     data = data)
+                                     data=data)
         result = model.fit()
 
         # fixef(rm)
@@ -400,7 +400,7 @@ def test_dietox(self):
         # ML
         data = pd.read_csv(fname)
         model = MixedLM.from_formula("Weight ~ Time", groups="Pig",
-                                     data = data)
+                                     data=data)
         result = model.fit(reml=False)
 
         # fixef(rm)
@@ -419,6 +419,65 @@ def test_dietox(self):
         assert_allclose(model.loglike(result.params_object), -2402.932, rtol=1e-5)
 
+    def test_dietox_slopes(self):
+        # dietox data from geepack using random intercepts and slopes
+        #
+        # Fit in R using
+        #
+        # library(geepack)
+        # rm = lmer(Weight ~ Time + (1 + Time | Pig), data=dietox)
+        # rm = lmer(Weight ~ Time + (1 + Time | Pig), REML=FALSE, data=dietox)
+
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+        rdir = os.path.join(cur_dir, 'results')
+        fname = os.path.join(rdir, 'dietox.csv')
+
+        # REML
+        data = pd.read_csv(fname)
+        model = MixedLM.from_formula("Weight ~ Time", groups="Pig",
+                                     re_formula="1 + Time", data=data)
+        result = model.fit(method='powell')
+
+        # fixef(rm)
+        assert_allclose(result.fe_params, np.r_[15.738650, 6.939014], rtol=1e-5)
+
+        # sqrt(diag(vcov(rm)))
+        assert_allclose(result.bse[0:2], np.r_[0.5501253, 0.0798254], rtol=1e-3)
+
+        # attr(VarCorr(rm), "sc")^2
+        assert_allclose(result.scale, 6.03745, rtol=1e-3)
+
+        # as.numeric(VarCorr(rm)[[1]])
+        assert_allclose(result.cov_re.values.ravel(),
+                        np.r_[19.4934552, 0.2938323, 0.2938323, 0.4160620],
+                        rtol=1e-1)
+
+        # logLik(rm)
+        assert_allclose(model.loglike(result.params_object), -2217.047, rtol=1e-5)
+
+        # ML
+        data = pd.read_csv(fname)
+        model = MixedLM.from_formula("Weight ~ Time", groups="Pig",
+                                     re_formula="1 + Time", data=data)
+        result = model.fit(method='powell', reml=False)
+
+        # fixef(rm)
+        assert_allclose(result.fe_params, np.r_[15.73863, 6.93902], rtol=1e-5)
+
+        # sqrt(diag(vcov(rm)))
+        assert_allclose(result.bse[0:2], np.r_[0.54629282, 0.07926954], rtol=1e-3)
+
+        # attr(VarCorr(rm), "sc")^2
+        assert_allclose(result.scale, 6.037441, rtol=1e-3)
+
+        # as.numeric(VarCorr(rm)[[1]])
+        assert_allclose(result.cov_re.values.ravel(),
+                        np.r_[19.190922, 0.293568, 0.293568, 0.409695], rtol=1e-2)
+
+        # logLik(rm)
+        assert_allclose(model.loglike(result.params_object), -2215.753, rtol=1e-5)
+
     def test_pastes_vcomp(self):
         # pastes data from lme4
         #

From 4687151f10526a6ad4f349d68080db1f1316049b Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Tue, 6 Mar 2018 10:25:21 -0500
Subject: [PATCH 075/157] add dietox data file for testing

---
 .../regression/tests/results/dietox.csv | 862 ++++++++++++++++++
 1 file changed, 862 insertions(+)
 create mode 100644 statsmodels/regression/tests/results/dietox.csv

diff --git a/statsmodels/regression/tests/results/dietox.csv 
b/statsmodels/regression/tests/results/dietox.csv new file mode 100644 index 00000000000..d99e09ce134 --- /dev/null +++ b/statsmodels/regression/tests/results/dietox.csv @@ -0,0 +1,862 @@ +"Weight","Feed","Time","Pig","Evit","Cu","Litter" +"1",26.5,NA,1,4601,1,1,1 +"2",27.59999,5.200005,2,4601,1,1,1 +"3",36.5,17.6,3,4601,1,1,1 +"4",40.29999,28.5,4,4601,1,1,1 +"5",49.09998,45.200001,5,4601,1,1,1 +"6",55.39999,56.900002,6,4601,1,1,1 +"7",59.59998,71.700005,7,4601,1,1,1 +"8",67,86.800001,8,4601,1,1,1 +"9",76.59998,104.900002,9,4601,1,1,1 +"10",86.5,123,10,4601,1,1,1 +"11",91.59998,140.900002,11,4601,1,1,1 +"12",98.59998,160,12,4601,1,1,1 +"13",27,NA,1,4643,1,1,2 +"14",31.79999,6.400002,2,4643,1,1,2 +"15",39,21.5,3,4643,1,1,2 +"16",44.79999,33.900002,4,4643,1,1,2 +"17",50.89999,44.900002,5,4643,1,1,2 +"18",57.39999,58.8,6,4643,1,1,2 +"19",62.5,73.700001,7,4643,1,1,2 +"20",71.69995,92.800003,8,4643,1,1,2 +"21",78.19995,111.5,9,4643,1,1,2 +"22",85.79999,132.800003,10,4643,1,1,2 +"23",91.79999,148.400001,11,4643,1,1,2 +"24",100.19995,170,12,4643,1,1,2 +"25",17,NA,1,4756,3,1,3 +"26",19,9.900002,2,4756,3,1,3 +"27",23.59999,18.5,3,4756,3,1,3 +"28",30,28.900002,4,4756,3,1,3 +"29",36.29999,38.100002,5,4756,3,1,3 +"30",42.39999,52.799999,6,4756,3,1,3 +"31",50.5,66.799999,7,4756,3,1,3 +"32",57.59998,86,8,4756,3,1,3 +"33",66.59998,106,9,4756,3,1,3 +"34",74.19995,123,10,4756,3,1,3 +"35",83,141.900002,11,4756,3,1,3 +"36",89.29999,163.700001,12,4756,3,1,3 +"37",26.89999,NA,1,4757,1,1,3 +"38",32.29999,9.900002,2,4757,1,1,3 +"39",38.59998,20.200005,3,4757,1,1,3 +"40",46,30,4,4757,1,1,3 +"41",57.79999,40,5,4757,1,1,3 +"42",58.59998,62.800003,6,4757,1,1,3 +"43",65.89996,78.300003,7,4757,1,1,3 +"44",71.19995,97.900002,8,4757,1,1,3 +"45",79.39996,115.800003,9,4757,1,1,3 +"46",81,129,10,4757,1,1,3 +"47",91.69995,150,11,4757,1,1,3 +"48",91,156.800003,12,4757,1,1,3 +"49",29,NA,1,4854,2,1,6 +"50",32.69998,8.200001,2,4854,2,1,6 +"51",38.29999,19.800003,3,4854,2,1,6 +"52",45.5,30,4,4854,2,1,6 +"53",48.59998,40,5,4854,2,1,6 +"54",56,50,6,4854,2,1,6 +"55",62.59998,70,7,4854,2,1,6 +"56",70.19995,86.200001,8,4854,2,1,6 +"57",73.79999,100,9,4854,2,1,6 +"58",80.29999,120.5,10,4854,2,1,6 +"59",85.5,138.5,11,4854,2,1,6 +"60",93.09998,158,12,4854,2,1,6 +"61",32,NA,1,4856,1,1,6 +"62",35.19998,8.1,2,4856,1,1,6 +"63",41.5,19.600006,3,4856,1,1,6 +"64",47.69998,27.6,4,4856,1,1,6 +"65",53.79999,41.800003,5,4856,1,1,6 +"66",61.59998,58.600006,6,4856,1,1,6 +"67",70,75.200001,7,4856,1,1,6 +"68",79.09998,91.700005,8,4856,1,1,6 +"69",85.69995,109,9,4856,1,1,6 +"70",92.5,131,10,4856,1,1,6 +"71",101.09998,153,11,4856,1,1,6 +"72",109,177.6,12,4856,1,1,6 +"73",21.2,NA,1,5497,1,1,8 +"74",25.79999,6.300001,2,5497,1,1,8 +"75",29.39999,16.5,3,5497,1,1,8 +"76",36.79999,27.200001,4,5497,1,1,8 +"77",43.19998,40.200005,5,5497,1,1,8 +"78",51.09998,54.800003,6,5497,1,1,8 +"79",59.29999,68,7,5497,1,1,8 +"80",66,85.200001,8,5497,1,1,8 +"81",72.79999,101.100006,9,5497,1,1,8 +"82",78.59998,119.5,10,5497,1,1,8 +"83",93.79999,144.200001,11,5497,1,1,8 +"84",95.19995,158.5,12,5497,1,1,8 +"85",29.59999,NA,1,5502,3,1,8 +"86",34.19998,9,2,5502,3,1,8 +"87",43,21.400002,3,5502,3,1,8 +"88",50,36.5,4,5502,3,1,8 +"89",57,51,5,5502,3,1,8 +"90",63.5,66.800001,6,5502,3,1,8 +"91",72.69995,80.100006,7,5502,3,1,8 +"92",78.69995,96.200001,8,5502,3,1,8 +"93",84.89996,109.800003,9,5502,3,1,8 +"94",91,126.800001,10,5502,3,1,8 +"95",96.39996,145.200001,11,5502,3,1,8 +"96",100.19995,157.999996,12,5502,3,1,8 +"97",22.39999,NA,1,5524,2,1,9 +"98",25.2,10,2,5524,2,1,9 
+"99",30,26.699997,3,5524,2,1,9 +"100",36.5,39.799999,4,5524,2,1,9 +"101",43.29999,55.199997,5,5524,2,1,9 +"102",50,74.499996,6,5524,2,1,9 +"103",58.09998,85.999996,7,5524,2,1,9 +"104",63,101.999996,8,5524,2,1,9 +"105",70.69995,121.999996,9,5524,2,1,9 +"106",78.09998,136.099997,10,5524,2,1,9 +"107",83.5,136.099997,11,5524,2,1,9 +"108",24,NA,1,5528,3,1,9 +"109",28.2,10,2,5528,3,1,9 +"110",35.29999,20,3,5528,3,1,9 +"111",42.69998,32.100002,4,5528,3,1,9 +"112",48.19998,44.800003,5,5528,3,1,9 +"113",53.19998,58.5,6,5528,3,1,9 +"114",64.79999,76.1,7,5528,3,1,9 +"115",71.5,94.200001,8,5528,3,1,9 +"116",76.09998,113.200001,9,5528,3,1,9 +"117",86.09998,134.900002,10,5528,3,1,9 +"118",92.5,146.1,11,5528,3,1,9 +"119",27,NA,1,5581,3,1,10 +"120",30,6.800001,2,5581,3,1,10 +"121",37.79999,19.5,3,5581,3,1,10 +"122",43.29999,31.100006,4,5581,3,1,10 +"123",51.5,46.200001,5,5581,3,1,10 +"124",57,59.900002,6,5581,3,1,10 +"125",63.69998,76.1,7,5581,3,1,10 +"126",70.5,97,8,5581,3,1,10 +"127",80.39996,115,9,5581,3,1,10 +"128",92,138.800003,10,5581,3,1,10 +"129",100,158.200001,11,5581,3,1,10 +"130",106.09998,177.900002,12,5581,3,1,10 +"131",22.79999,NA,1,5850,3,1,12 +"132",27,6,2,5850,3,1,12 +"133",33.89999,18.200001,3,5850,3,1,12 +"134",37.69998,27,4,5850,3,1,12 +"135",41.39999,36.300001,5,5850,3,1,12 +"136",48.19998,48,6,5850,3,1,12 +"137",53.69998,60.200005,7,5850,3,1,12 +"138",62.59998,76.200001,8,5850,3,1,12 +"139",70.39996,93,9,5850,3,1,12 +"140",78.5,120,10,5850,3,1,12 +"141",85.89996,127.5,11,5850,3,1,12 +"142",94,146.800001,12,5850,3,1,12 +"143",23.79999,NA,1,5852,1,1,12 +"144",30,9.7,2,5852,1,1,12 +"145",37.09998,20.800003,3,5852,1,1,12 +"146",43.19998,31.200005,4,5852,1,1,12 +"147",50.39999,44.900002,5,5852,1,1,12 +"148",60.19998,61.800003,6,5852,1,1,12 +"149",67,78.200001,7,5852,1,1,12 +"150",75,94.800003,8,5852,1,1,12 +"151",82.39996,112.900002,9,5852,1,1,12 +"152",91.19995,132,10,5852,1,1,12 +"153",101.79999,160,11,5852,1,1,12 +"154",109.59998,180.100006,12,5852,1,1,12 +"155",27.39999,NA,1,6058,2,1,16 +"156",32.19998,8.1,2,6058,2,1,16 +"157",37.89999,18.3,3,6058,2,1,16 +"158",45.89999,29.900002,4,6058,2,1,16 +"159",51.19998,44.100002,5,6058,2,1,16 +"160",57.09998,58.3,6,6058,2,1,16 +"161",69,77.5,7,6058,2,1,16 +"162",76.79999,91.5,8,6058,2,1,16 +"163",83.09998,104.800003,9,6058,2,1,16 +"164",91.59998,124,10,6058,2,1,16 +"165",100.5,148.6,11,6058,2,1,16 +"166",105.29999,166.900002,12,6058,2,1,16 +"167",27.09999,NA,1,6207,3,1,17 +"168",30.29999,5.800003,2,6207,3,1,17 +"169",38.19998,17,3,6207,3,1,17 +"170",43.19998,30,4,6207,3,1,17 +"171",52.5,45,5,6207,3,1,17 +"172",59,57.200001,6,6207,3,1,17 +"173",67.29999,72.100002,7,6207,3,1,17 +"174",74.19995,90,8,6207,3,1,17 +"175",81.19995,103.200001,9,6207,3,1,17 +"176",87.5,119.5,10,6207,3,1,17 +"177",94.39996,135.200001,11,6207,3,1,17 +"178",101.39996,153.800003,12,6207,3,1,17 +"179",24.5,NA,1,6211,2,1,17 +"180",31.59999,8.1,2,6211,2,1,17 +"181",37.39999,18.1,3,6211,2,1,17 +"182",46.39999,30.100006,4,6211,2,1,17 +"183",53.59998,48.1,5,6211,2,1,17 +"184",61.19998,62.600002,6,6211,2,1,17 +"185",70.09998,76.400002,7,6211,2,1,17 +"186",77.5,96.200001,8,6211,2,1,17 +"187",86.19995,114.600002,9,6211,2,1,17 +"188",89.19995,129.800003,10,6211,2,1,17 +"189",96.89996,146.5,11,6211,2,1,17 +"190",104.5,168.3,12,6211,2,1,17 +"191",23.09999,NA,1,6284,2,1,18 +"192",26,5.900002,2,6284,2,1,18 +"193",30.09999,17.6,3,6284,2,1,18 +"194",40.69998,30.600006,4,6284,2,1,18 +"195",46.59998,41.400002,5,6284,2,1,18 +"196",52.09998,53.400002,6,6284,2,1,18 
+"197",60.5,69.2,7,6284,2,1,18 +"198",73.5,90,8,6284,2,1,18 +"199",77.29999,100,9,6284,2,1,18 +"200",86.89996,108,10,6284,2,1,18 +"201",90,126,11,6284,2,1,18 +"202",96.39996,141.100006,12,6284,2,1,18 +"203",21.5,NA,1,6287,1,1,18 +"204",24.59999,5,2,6287,1,1,18 +"205",28.5,14.200001,3,6287,1,1,18 +"206",39.09998,27.800003,4,6287,1,1,18 +"207",45.29999,39.800003,5,6287,1,1,18 +"208",53,53.5,6,6287,1,1,18 +"209",59.59998,68.5,7,6287,1,1,18 +"210",69.5,87.400002,8,6287,1,1,18 +"211",75.19995,107.300003,9,6287,1,1,18 +"212",84.89996,127,10,6287,1,1,18 +"213",93.39996,146.599991,11,6287,1,1,18 +"214",99.29999,167.799992,12,6287,1,1,18 +"215",32.19998,NA,1,6433,2,1,5 +"216",36.39999,6.900002,2,6433,2,1,5 +"217",41.79999,14.300003,3,6433,2,1,5 +"218",48.19998,26.400002,4,6433,2,1,5 +"219",55.89999,41.800003,5,6433,2,1,5 +"220",60.09998,58.6,6,6433,2,1,5 +"221",68.5,71.400002,7,6433,2,1,5 +"222",77.79999,88.900002,8,6433,2,1,5 +"223",86.39996,108.100002,9,6433,2,1,5 +"224",96.69995,130.5,10,6433,2,1,5 +"225",111.09998,163,11,6433,2,1,5 +"226",115.39996,184.5,12,6433,2,1,5 +"227",24.2,NA,1,6910,2,1,13 +"228",29.2,6.200001,2,6910,2,1,13 +"229",36.19998,17.1,3,6910,2,1,13 +"230",43.09998,30,4,6910,2,1,13 +"231",49.19998,44,5,6910,2,1,13 +"232",57,56.900002,6,6910,2,1,13 +"233",63,70,7,6910,2,1,13 +"234",70.89996,85.100002,8,6910,2,1,13 +"235",80,102.300003,9,6910,2,1,13 +"236",87.19995,118.499996,10,6910,2,1,13 +"237",94.69995,135.599998,11,6910,2,1,13 +"238",100.59998,158.499996,12,6910,2,1,13 +"239",24.5,NA,1,6912,3,1,13 +"240",28.39999,7.1,2,6912,3,1,13 +"241",35,17.300001,3,6912,3,1,13 +"242",41.89999,29.5,4,6912,3,1,13 +"243",49,40.900002,5,6912,3,1,13 +"244",55.59998,54.200001,6,6912,3,1,13 +"245",64.09998,66.800003,7,6912,3,1,13 +"246",71.19995,84.5,8,6912,3,1,13 +"247",80.5,104.5,9,6912,3,1,13 +"248",90.19995,119.900002,10,6912,3,1,13 +"249",98.89996,143,11,6912,3,1,13 +"250",103.79999,164.5,12,6912,3,1,13 +"251",25.29999,NA,1,8195,3,1,21 +"252",31.59999,9.600006,2,8195,3,1,21 +"253",34,19.600006,3,8195,3,1,21 +"254",35.19998,28,4,8195,3,1,21 +"255",37.29999,41,5,8195,3,1,21 +"256",40.79999,57.400002,6,8195,3,1,21 +"257",47.69998,69,7,8195,3,1,21 +"258",57,86.300001,8,8195,3,1,21 +"259",64.59998,102.200001,9,8195,3,1,21 +"260",71.19995,118.5,10,8195,3,1,21 +"261",80.5,140.5,11,8195,3,1,21 +"262",85.89996,157.8,12,8195,3,1,21 +"263",24,NA,1,8271,2,1,22 +"264",28.89999,8.5,2,8271,2,1,22 +"265",35.79999,19.5,3,8271,2,1,22 +"266",44,30,4,8271,2,1,22 +"267",47.5,43.200001,5,8271,2,1,22 +"268",56.39999,56.6,6,8271,2,1,22 +"269",63.69998,72,7,8271,2,1,22 +"270",70.29999,89.5,8,8271,2,1,22 +"271",78,108,9,8271,2,1,22 +"272",89.19995,127.5,10,8271,2,1,22 +"273",96.29999,144.400002,11,8271,2,1,22 +"274",99.79999,158.1,12,8271,2,1,22 +"275",28.29999,NA,1,4602,1,2,1 +"276",30.09999,3.300003,2,4602,1,2,1 +"277",38.29999,13.200001,3,4602,1,2,1 +"278",44.5,26.1,4,4602,1,2,1 +"279",51.59998,43.600002,5,4602,1,2,1 +"280",57.59998,55.200001,6,4602,1,2,1 +"281",65,72.800003,7,4602,1,2,1 +"282",73,90.800003,8,4602,1,2,1 +"283",82.29999,112.200001,9,4602,1,2,1 +"284",91,132.900002,10,4602,1,2,1 +"285",99.69995,156.1,11,4602,1,2,1 +"286",106.69995,180,12,4602,1,2,1 +"287",31.5,NA,1,4605,2,2,1 +"288",34.79999,6.5,2,4605,2,2,1 +"289",40.69998,20,3,4605,2,2,1 +"290",47.69998,33.100002,4,4605,2,2,1 +"291",55.89999,51.800003,5,4605,2,2,1 +"292",62.19998,65.800003,6,4605,2,2,1 +"293",70.69995,81.700005,7,4605,2,2,1 +"294",78,97,8,4605,2,2,1 +"295",86.19995,115.300003,9,4605,2,2,1 +"296",94.69995,133.100002,10,4605,2,2,1 
+"297",102,151,11,4605,2,2,1 +"298",109.19995,176.400002,12,4605,2,2,1 +"299",27.7,NA,1,4645,3,2,2 +"300",33.59998,10,2,4645,3,2,2 +"301",44,28.5,3,4645,3,2,2 +"302",46.69998,40,4,4645,3,2,2 +"303",54.79999,55.200001,5,4645,3,2,2 +"304",61,69,6,4645,3,2,2 +"305",69.79999,88,7,4645,3,2,2 +"306",76,104.400002,8,4645,3,2,2 +"307",83.59998,121.800003,9,4645,3,2,2 +"308",88.19995,140,10,4645,3,2,2 +"309",96.79999,157.200001,11,4645,3,2,2 +"310",102.29999,180,12,4645,3,2,2 +"311",22.59999,NA,1,4759,3,2,3 +"312",28.5,10.600006,2,4759,3,2,3 +"313",32.69998,21.100006,3,4759,3,2,3 +"314",39.89999,33.300003,4,4759,3,2,3 +"315",47.59998,47.1,5,4759,3,2,3 +"316",53.79999,63.400002,6,4759,3,2,3 +"317",63,80.800003,7,4759,3,2,3 +"318",67.19995,98.100002,8,4759,3,2,3 +"319",74.09998,111.900002,9,4759,3,2,3 +"320",80,128.900002,10,4759,3,2,3 +"321",89.39996,152.100002,11,4759,3,2,3 +"322",93.5,171.700005,12,4759,3,2,3 +"323",27.29999,NA,1,4813,1,2,4 +"324",32.39999,10.300003,2,4813,1,2,4 +"325",38.29999,22.800003,3,4813,1,2,4 +"326",45.79999,40.200005,4,4813,1,2,4 +"327",53.79999,53.700005,5,4813,1,2,4 +"328",60,72.700005,6,4813,1,2,4 +"329",68.69995,92.900005,7,4813,1,2,4 +"330",75.79999,114.700005,8,4813,1,2,4 +"331",80.79999,132.800003,9,4813,1,2,4 +"332",86.79999,138.100006,10,4813,1,2,4 +"333",91.39996,159.900005,11,4813,1,2,4 +"334",99.79999,182.100006,12,4813,1,2,4 +"335",26.2,NA,1,4814,2,2,4 +"336",31.09999,9.900002,2,4814,2,2,4 +"337",34.59998,19.600006,3,4814,2,2,4 +"338",41.19998,30,4,4814,2,2,4 +"339",49.19998,37.400002,5,4814,2,2,4 +"340",55.59998,57.200005,6,4814,2,2,4 +"341",62.59998,76.200005,7,4814,2,2,4 +"342",72.09998,93.000004,8,4814,2,2,4 +"343",77.5,109.800003,9,4814,2,2,4 +"344",83.79999,127.400002,10,4814,2,2,4 +"345",93.39996,148.900002,11,4814,2,2,4 +"346",98.39996,174.400002,12,4814,2,2,4 +"347",28,NA,1,4858,2,2,6 +"348",31.2,8,2,4858,2,2,6 +"349",38.79999,16.700005,3,4858,2,2,6 +"350",44.69998,31.800003,4,4858,2,2,6 +"351",53.59998,50.500004,5,4858,2,2,6 +"352",59.19998,71.100006,6,4858,2,2,6 +"353",67.09998,87.100006,7,4858,2,2,6 +"354",76.39996,107.300003,8,4858,2,2,6 +"355",83.79999,133.200005,9,4858,2,2,6 +"356",71.59998,163.100006,10,4858,2,2,6 +"357",97.39996,190.800003,11,4858,2,2,6 +"358",107.09998,219.800003,12,4858,2,2,6 +"359",24.59999,NA,1,5392,3,2,7 +"360",25.79999,7.700001,2,5392,3,2,7 +"361",28.59999,19.999996,3,5392,3,2,7 +"362",30.39999,29.999996,4,5392,3,2,7 +"363",36.09998,39.999996,5,5392,3,2,7 +"364",36.09998,47.499996,6,5392,3,2,7 +"365",49.29999,62.499996,7,5392,3,2,7 +"366",57.89999,76.799999,8,5392,3,2,7 +"367",63,89.499996,9,5392,3,2,7 +"368",75.69995,104.999996,10,5392,3,2,7 +"369",81.39996,123.999996,11,5392,3,2,7 +"370",88.69995,142.799997,12,5392,3,2,7 +"371",17,NA,1,5500,1,2,8 +"372",19,5.200001,2,5500,1,2,8 +"373",23,18.5,3,5500,1,2,8 +"374",29.39999,32.800003,4,5500,1,2,8 +"375",34.59998,43.800003,5,5500,1,2,8 +"376",40.59998,55.800003,6,5500,1,2,8 +"377",46.59998,71.400002,7,5500,1,2,8 +"378",54,87,8,5500,1,2,8 +"379",62.5,103.800003,9,5500,1,2,8 +"380",70,124.800003,10,5500,1,2,8 +"381",78.29999,147.300001,11,5500,1,2,8 +"382",83.19995,160.800003,12,5500,1,2,8 +"383",22,NA,1,5862,1,2,11 +"384",25.79999,8.5,2,5862,1,2,11 +"385",29.2,15.800003,3,5862,1,2,11 +"386",35.79999,25.900002,4,5862,1,2,11 +"387",41.09998,37.6,5,5862,1,2,11 +"388",50,48.200005,6,5862,1,2,11 +"389",55.19998,62,7,5862,1,2,11 +"390",62.19998,76.5,8,5862,1,2,11 +"391",69.79999,90.100002,9,5862,1,2,11 +"392",76.39996,107.100006,10,5862,1,2,11 
+"393",83.19995,126.2,11,5862,1,2,11 +"394",93.19995,147.700005,12,5862,1,2,11 +"395",22,NA,1,5865,3,2,11 +"396",27,8.8,2,5865,3,2,11 +"397",32.89999,18.900001,3,5865,3,2,11 +"398",38,29.200005,4,5865,3,2,11 +"399",46.19998,43.600002,5,5865,3,2,11 +"400",53.5,58.1,6,5865,3,2,11 +"401",58.59998,73.400002,7,5865,3,2,11 +"402",68.19995,90,8,5865,3,2,11 +"403",71,104.800003,9,5865,3,2,11 +"404",80.79999,120.200005,10,5865,3,2,11 +"405",84.5,134.800003,11,5865,3,2,11 +"406",90,146.300001,12,5865,3,2,11 +"407",26.2,NA,1,6055,3,2,16 +"408",30.39999,6.900002,2,6055,3,2,16 +"409",37.39999,18.200001,3,6055,3,2,16 +"410",43,30.600006,4,6055,3,2,16 +"411",49.5,42.100002,5,6055,3,2,16 +"412",55.09998,62.200001,6,6055,3,2,16 +"413",64.19995,87.200001,7,6055,3,2,16 +"414",75.59998,106.900002,8,6055,3,2,16 +"415",83.29999,122.100002,9,6055,3,2,16 +"416",87.39996,148.4,10,6055,3,2,16 +"417",94.69995,170.100006,11,6055,3,2,16 +"418",98.39996,190.800003,12,6055,3,2,16 +"419",26.79999,NA,1,6208,1,2,17 +"420",31.09999,6.1,2,6208,1,2,17 +"421",37.79999,14.900002,3,6208,1,2,17 +"422",44.59998,28.5,4,6208,1,2,17 +"423",50.5,40.5,5,6208,1,2,17 +"424",58.29999,55.200001,6,6208,1,2,17 +"425",68.29999,73.700001,7,6208,1,2,17 +"426",75.09998,92.5,8,6208,1,2,17 +"427",84.39996,111.200005,9,6208,1,2,17 +"428",87.29999,129,10,6208,1,2,17 +"429",96.39996,145.800003,11,6208,1,2,17 +"430",104.39996,165.5,12,6208,1,2,17 +"431",24.09999,NA,1,6288,3,2,18 +"432",28.09999,6.200001,2,6288,3,2,18 +"433",32.5,15.900002,3,6288,3,2,18 +"434",40.59998,30.800003,4,6288,3,2,18 +"435",46,40.900002,5,6288,3,2,18 +"436",53.79999,52.100002,6,6288,3,2,18 +"437",61.19998,68.5,7,6288,3,2,18 +"438",71.89996,85.400002,8,6288,3,2,18 +"439",79.5,103.100002,9,6288,3,2,18 +"440",87.39996,119.5,10,6288,3,2,18 +"441",93.29999,141,11,6288,3,2,18 +"442",101.59998,161.800003,12,6288,3,2,18 +"443",25.09999,NA,1,6432,2,2,5 +"444",23.2,4,2,6432,2,2,5 +"445",28.89999,11.100006,3,6432,2,2,5 +"446",36.79999,21,4,6432,2,2,5 +"447",44.19998,31,5,6432,2,2,5 +"448",52.79999,56.300003,6,6432,2,2,5 +"449",62.19998,69.100006,7,6432,2,2,5 +"450",69.39996,87.200001,8,6432,2,2,5 +"451",78.09998,106.300003,9,6432,2,2,5 +"452",88.79999,129.5,10,6432,2,2,5 +"453",102.39996,159.5,11,6432,2,2,5 +"454",106.79999,197.400002,12,6432,2,2,5 +"455",24.7,NA,1,6909,3,2,13 +"456",28.89999,5.900002,2,6909,3,2,13 +"457",34.59998,15,3,6909,3,2,13 +"458",42.19998,26.6,4,6909,3,2,13 +"459",48.19998,38,5,6909,3,2,13 +"460",54.39999,50.900002,6,6909,3,2,13 +"461",62.5,65.100002,7,6909,3,2,13 +"462",69.79999,80,8,6909,3,2,13 +"463",78.59998,100.5,9,6909,3,2,13 +"464",87,116.900002,10,6909,3,2,13 +"465",96.39996,137.900002,11,6909,3,2,13 +"466",103.19995,158.5,12,6909,3,2,13 +"467",22,NA,1,8049,1,2,20 +"468",26.59999,9.5,2,8049,1,2,20 +"469",33.09998,19.900002,3,8049,1,2,20 +"470",38.39999,31,4,8049,1,2,20 +"471",46.59998,44.900002,5,8049,1,2,20 +"472",52.29999,63.800003,6,8049,1,2,20 +"473",61.5,78,7,8049,1,2,20 +"474",69.29999,96,8,8049,1,2,20 +"475",74.19995,116,9,8049,1,2,20 +"476",80.19995,137.6,10,8049,1,2,20 +"477",87.19995,162.100002,11,8049,1,2,20 +"478",94.79999,198,12,8049,1,2,20 +"479",30.59999,NA,1,8051,2,2,20 +"480",37.59998,11.200005,2,8051,2,2,20 +"481",42.79999,21,3,8051,2,2,20 +"482",47.39999,31.100006,4,8051,2,2,20 +"483",56.59998,47,5,8051,2,2,20 +"484",62.39999,65,6,8051,2,2,20 +"485",71.79999,78,7,8051,2,2,20 +"486",78,95.800003,8,8051,2,2,20 +"487",86.79999,115.800003,9,8051,2,2,20 +"488",94.19995,138,10,8051,2,2,20 +"489",101.89996,166.5,11,8051,2,2,20 
+"490",112.5,196.900002,12,8051,2,2,20 +"491",24.2,NA,1,8141,3,2,23 +"492",25.39999,3.900002,2,8141,3,2,23 +"493",30.7,13.300003,3,8141,3,2,23 +"494",35.59998,24.600002,4,8141,3,2,23 +"495",42.79999,38.5,5,8141,3,2,23 +"496",49.5,53,6,8141,3,2,23 +"497",57.19998,67.6,7,8141,3,2,23 +"498",65.59998,83,8,8141,3,2,23 +"499",72.39996,99,9,8141,3,2,23 +"500",81.19995,116,10,8141,3,2,23 +"501",85.19995,135.400002,11,8141,3,2,23 +"502",90.19995,153,12,8141,3,2,23 +"503",25.2,NA,1,8142,1,2,23 +"504",29.79999,9,2,8142,1,2,23 +"505",35.69998,21,3,8142,1,2,23 +"506",42,31.300003,4,8142,1,2,23 +"507",50,44,5,8142,1,2,23 +"508",55.89999,57.800001,6,8142,1,2,23 +"509",63.5,73.100002,7,8142,1,2,23 +"510",70,89.5,8,8142,1,2,23 +"511",79.5,105.400002,9,8142,1,2,23 +"512",83.59998,122.200001,10,8142,1,2,23 +"513",89.39996,138.5,11,8142,1,2,23 +"514",98.5,156.400002,12,8142,1,2,23 +"515",23.5,NA,1,8144,2,2,23 +"516",25.89999,7.200001,2,8144,2,2,23 +"517",32.19998,19.5,3,8144,2,2,23 +"518",32.39999,25.800003,4,8144,2,2,23 +"519",29.79999,29.5,5,8144,2,2,23 +"520",32.79999,34,6,8144,2,2,23 +"521",41,50,7,8144,2,2,23 +"522",50,66.700001,8,8144,2,2,23 +"523",56.89999,80.700005,9,8144,2,2,23 +"524",64.79999,96,10,8144,2,2,23 +"525",72.09998,111.200005,11,8144,2,2,23 +"526",81.39996,131.400002,12,8144,2,2,23 +"527",26.59999,NA,1,8191,2,2,21 +"528",33,10,2,8191,2,2,21 +"529",38.19998,20,3,8191,2,2,21 +"530",46.39999,30,4,8191,2,2,21 +"531",53.19998,40,5,8191,2,2,21 +"532",59.59998,60,6,8191,2,2,21 +"533",66.19995,82.100002,7,8191,2,2,21 +"534",72.59998,99.5,8,8191,2,2,21 +"535",80.09998,120,9,8191,2,2,21 +"536",89,139,10,8191,2,2,21 +"537",97.69995,162.800003,11,8191,2,2,21 +"538",103,185.800003,12,8191,2,2,21 +"539",29.29999,NA,1,8193,3,2,21 +"540",34,10.5,2,8193,3,2,21 +"541",39,20.200005,3,8193,3,2,21 +"542",45.39999,34.800003,4,8193,3,2,21 +"543",51.79999,51,5,8193,3,2,21 +"544",58.79999,70,6,8193,3,2,21 +"545",62.59998,86.200001,7,8193,3,2,21 +"546",71.59998,103,8,8193,3,2,21 +"547",76.69995,111,9,8193,3,2,21 +"548",81.69995,132,10,8193,3,2,21 +"549",86.39996,157.800001,11,8193,3,2,21 +"550",92.09998,178,12,8193,3,2,21 +"551",25,NA,1,8273,1,2,22 +"552",30.09999,8,2,8273,1,2,22 +"553",37.09998,20,3,8273,1,2,22 +"554",46,33.800003,4,8273,1,2,22 +"555",53.59998,48.5,5,8273,1,2,22 +"556",61.09998,67,6,8273,1,2,22 +"557",69,83,7,8273,1,2,22 +"558",78.19995,102,8,8273,1,2,22 +"559",87.09998,119,9,8273,1,2,22 +"560",94.39996,145.5,10,8273,1,2,22 +"561",102.5,163.400002,11,8273,1,2,22 +"562",107.5,183.800001,12,8273,1,2,22 +"563",27,NA,1,8437,2,2,24 +"564",30,6.800003,2,8437,2,2,24 +"565",35.39999,16.400002,3,8437,2,2,24 +"566",41,28.300001,4,8437,2,2,24 +"567",48.59998,41,5,8437,2,2,24 +"568",56.39999,54.900002,6,8437,2,2,24 +"569",63,70.5,7,8437,2,2,24 +"570",75.5,88.5,8,8437,2,2,24 +"571",77.79999,103.300003,9,8437,2,2,24 +"572",85.79999,119.5,10,8437,2,2,24 +"573",95,124.5,11,8437,2,2,24 +"574",101.39996,143.7,12,8437,2,2,24 +"575",27.59999,NA,1,4603,3,3,1 +"576",30.59999,6.700005,2,4603,3,3,1 +"577",38.69998,17.6,3,4603,3,3,1 +"578",47.19998,32.800003,4,4603,3,3,1 +"579",54.09998,50,5,4603,3,3,1 +"580",61.5,64.600002,6,4603,3,3,1 +"581",68.5,80.5,7,4603,3,3,1 +"582",75.19995,91.800003,8,4603,3,3,1 +"583",81.69995,110.200005,9,4603,3,3,1 +"584",90.19995,125.600002,10,4603,3,3,1 +"585",98.39996,145,11,4603,3,3,1 +"586",105.39996,166,12,4603,3,3,1 +"587",27.09999,NA,1,4641,2,3,2 +"588",33,5.200001,2,4641,2,3,2 +"589",42.5,22.5,3,4641,2,3,2 +"590",50.09998,38.5,4,4641,2,3,2 +"591",56.5,53.300003,5,4641,2,3,2 
+"592",63,69.8,6,4641,2,3,2 +"593",72.5,85.100002,7,4641,2,3,2 +"594",80.5,101.100006,8,4641,2,3,2 +"595",92,121.400002,9,4641,2,3,2 +"596",100,142.200001,10,4641,2,3,2 +"597",108.39996,157,11,4641,2,3,2 +"598",117,180,12,4641,2,3,2 +"599",15,NA,1,4760,1,3,3 +"600",19.39999,7.900002,2,4760,1,3,3 +"601",21.39999,15,3,4760,1,3,3 +"602",25.29999,22.100002,4,4760,1,3,3 +"603",31.89999,29.900002,5,4760,1,3,3 +"604",37.59998,41.800003,6,4760,1,3,3 +"605",43.69998,57.600006,7,4760,1,3,3 +"606",48.89999,69.5,8,4760,1,3,3 +"607",53.69998,83.800003,9,4760,1,3,3 +"608",59.29999,93.200001,10,4760,1,3,3 +"609",64.89996,107.900002,11,4760,1,3,3 +"610",65.79999,117.300003,12,4760,1,3,3 +"611",24.89999,NA,1,4815,2,3,4 +"612",29.7,9.5,2,4815,2,3,4 +"613",33.79999,19,3,4815,2,3,4 +"614",38.79999,32,4,4815,2,3,4 +"615",48.29999,45.5,5,4815,2,3,4 +"616",52.89999,59.100006,6,4815,2,3,4 +"617",65,80.100006,7,4815,2,3,4 +"618",70.39996,104.600002,8,4815,2,3,4 +"619",76.79999,130,9,4815,2,3,4 +"620",83.29999,149,10,4815,2,3,4 +"621",95,175.600006,11,4815,2,3,4 +"622",102.69995,203.500008,12,4815,2,3,4 +"623",24.59999,NA,1,4817,1,3,4 +"624",28,9,2,4817,1,3,4 +"625",34.59998,19.900002,3,4817,1,3,4 +"626",41.5,33.900002,4,4817,1,3,4 +"627",49.69998,47.000004,5,4817,1,3,4 +"628",56.5,65.200001,6,4817,1,3,4 +"629",66.89996,83.300003,7,4817,1,3,4 +"630",75.79999,106.400002,8,4817,1,3,4 +"631",84.59998,125.900002,9,4817,1,3,4 +"632",91,144.000004,10,4817,1,3,4 +"633",98.79999,168.200001,11,4817,1,3,4 +"634",107.19995,178.200001,12,4817,1,3,4 +"635",26,NA,1,4857,1,3,6 +"636",34.5,8.5,2,4857,1,3,6 +"637",40.39999,19.200005,3,4857,1,3,6 +"638",45,29,4,4857,1,3,6 +"639",53.5,45,5,4857,1,3,6 +"640",60.19998,62.300003,6,4857,1,3,6 +"641",68.59998,89.700005,7,4857,1,3,6 +"642",73.39996,101.800003,8,4857,1,3,6 +"643",78.5,112.700003,9,4857,1,3,6 +"644",86,135.300007,10,4857,1,3,6 +"645",90.59998,154.100002,11,4857,1,3,6 +"646",97.19995,168.700005,12,4857,1,3,6 +"647",24.29999,NA,1,5389,3,3,7 +"648",28.39999,5.400002,2,5389,3,3,7 +"649",36,20,3,5389,3,3,7 +"650",40.29999,30.300003,4,5389,3,3,7 +"651",46.19998,41.800003,5,5389,3,3,7 +"652",51.89999,54.800003,6,5389,3,3,7 +"653",58.09998,69.000002,7,5389,3,3,7 +"654",62.59998,83.800005,8,5389,3,3,7 +"655",68.19995,97.200002,9,5389,3,3,7 +"656",78.39996,114.200003,10,5389,3,3,7 +"657",87.5,131.100004,11,5389,3,3,7 +"658",90.39996,147.000002,12,5389,3,3,7 +"659",30,NA,1,5501,2,3,8 +"660",34.69998,10,2,5501,2,3,8 +"661",42.79999,22.5,3,5501,2,3,8 +"662",48.39999,35.200001,4,5501,2,3,8 +"663",53.29999,45.800003,5,5501,2,3,8 +"664",60,60,6,5501,2,3,8 +"665",71.09998,75.200001,7,5501,2,3,8 +"666",77.39996,92.800003,8,5501,2,3,8 +"667",86,110,9,5501,2,3,8 +"668",94.5,128,10,5501,2,3,8 +"669",102,149.5,11,5501,2,3,8 +"670",104.59998,160,12,5501,2,3,8 +"671",26.59999,NA,1,5527,2,3,9 +"672",28.2,10,2,5527,2,3,9 +"673",38.39999,30,3,5527,2,3,9 +"674",45.79999,50,4,5527,2,3,9 +"675",54.89999,60,5,5527,2,3,9 +"676",60.09998,78.5,6,5527,2,3,9 +"677",56.89999,86.200001,7,5527,2,3,9 +"678",64.89996,102.900002,8,5527,2,3,9 +"679",74,121.799988,9,5527,2,3,9 +"680",81.89996,136.999992,10,5527,2,3,9 +"681",90.89996,136.999992,11,5527,2,3,9 +"682",27.09999,NA,1,5578,2,3,10 +"683",34.69998,11.100002,2,5578,2,3,10 +"684",40.69998,29.900002,3,5578,2,3,10 +"685",47.59998,45.300001,4,5578,2,3,10 +"686",56.5,65.6,5,5578,2,3,10 +"687",64,84.800001,6,5578,2,3,10 +"688",72.29999,107,7,5578,2,3,10 +"689",82,135.1,8,5578,2,3,10 +"690",88,155.5,9,5578,2,3,10 +"691",95.09998,177.900002,10,5578,2,3,10 
+"692",104,201.5,11,5578,2,3,10 +"693",105.69995,224.5,12,5578,2,3,10 +"694",29.5,NA,1,5582,3,3,10 +"695",31.39999,6.800001,2,5582,3,3,10 +"696",39.39999,19.6,3,5582,3,3,10 +"697",44.79999,30.300003,4,5582,3,3,10 +"698",50.29999,41.400002,5,5582,3,3,10 +"699",57,55,6,5582,3,3,10 +"700",65.59998,71,7,5582,3,3,10 +"701",72.39996,86.800001,8,5582,3,3,10 +"702",80.19995,105.200001,9,5582,3,3,10 +"703",90.39996,127.800001,10,5582,3,3,10 +"704",95,149.5,11,5582,3,3,10 +"705",99.09998,167.5,12,5582,3,3,10 +"706",22.29999,NA,1,5851,3,3,12 +"707",24.7,5.900002,2,5851,3,3,12 +"708",30.39999,15,3,5851,3,3,12 +"709",36.89999,25.200001,4,5851,3,3,12 +"710",44.09998,39,5,5851,3,3,12 +"711",52.79999,56,6,5851,3,3,12 +"712",60.29999,71.100006,7,5851,3,3,12 +"713",67.79999,86.5,8,5851,3,3,12 +"714",75.39996,106.300001,9,5851,3,3,12 +"715",87.59998,127.599991,10,5851,3,3,12 +"716",93,137.599991,11,5851,3,3,12 +"717",101.39996,157.599991,12,5851,3,3,12 +"718",22,NA,1,5866,2,3,11 +"719",24.79999,7.5,2,5866,2,3,11 +"720",30,16.1,3,5866,2,3,11 +"721",35.79999,26,4,5866,2,3,11 +"722",42.29999,36.800001,5,5866,2,3,11 +"723",50.89999,49.800003,6,5866,2,3,11 +"724",57.79999,65.100002,7,5866,2,3,11 +"725",64.79999,80,8,5866,2,3,11 +"726",71.79999,93.800003,9,5866,2,3,11 +"727",75,109.900002,10,5866,2,3,11 +"728",83.79999,122.800003,11,5866,2,3,11 +"729",92.09998,142.400002,12,5866,2,3,11 +"730",32.5,NA,1,6056,3,3,16 +"731",38.79999,10,2,6056,3,3,16 +"732",47.79999,23.800003,3,6056,3,3,16 +"733",54.5,39,4,6056,3,3,16 +"734",63,53.200001,5,6056,3,3,16 +"735",70.09998,69,6,6056,3,3,16 +"736",76.59998,89,7,6056,3,3,16 +"737",86.19995,105.900002,8,6056,3,3,16 +"738",95.79999,127,9,6056,3,3,16 +"739",101.39996,151.200005,10,6056,3,3,16 +"740",112.29999,174.5,11,6056,3,3,16 +"741",112,191.400002,12,6056,3,3,16 +"742",27,NA,1,6057,2,3,16 +"743",32.39999,7.800001,2,6057,2,3,16 +"744",38.19998,17,3,6057,2,3,16 +"745",44.69998,30.100006,4,6057,2,3,16 +"746",53.19998,44.800003,5,6057,2,3,16 +"747",62.39999,60,6,6057,2,3,16 +"748",70.5,80,7,6057,2,3,16 +"749",72.79999,91.800003,8,6057,2,3,16 +"750",86.59998,111.100006,9,6057,2,3,16 +"751",93.29999,131.800003,10,6057,2,3,16 +"752",103,147.800003,11,6057,2,3,16 +"753",104.19995,165.400002,12,6057,2,3,16 +"754",34.19998,NA,1,6430,2,3,5 +"755",42.09998,8.499996,2,6430,2,3,5 +"756",50.09998,23.099998,3,6430,2,3,5 +"757",57.59998,35,4,6430,2,3,5 +"758",62.29999,54.299995,5,6430,2,3,5 +"759",66,73.399996,6,6430,2,3,5 +"760",76.09998,81.899996,7,6430,2,3,5 +"761",81.5,96.099998,8,6430,2,3,5 +"762",84.29999,114.299995,9,6430,2,3,5 +"763",97.89996,133.799995,10,6430,2,3,5 +"764",108.59998,163.799995,11,6430,2,3,5 +"765",109,198.299995,12,6430,2,3,5 +"766",26.79999,NA,1,8050,1,3,20 +"767",33.59998,10.900002,2,8050,1,3,20 +"768",42.09998,25.100002,3,8050,1,3,20 +"769",47.89999,38,4,8050,1,3,20 +"770",56.19998,54.400002,5,8050,1,3,20 +"771",63.39999,74.200001,6,8050,1,3,20 +"772",71.79999,89.900002,7,8050,1,3,20 +"773",80,111,8,8050,1,3,20 +"774",87.59998,135.800003,9,8050,1,3,20 +"775",94.89996,159.600006,10,8050,1,3,20 +"776",101,188.200001,11,8050,1,3,20 +"777",112,224,12,8050,1,3,20 +"778",21.89999,NA,1,8053,3,3,20 +"779",26.79999,9.6,2,8053,3,3,20 +"780",34.69998,21.800003,3,8053,3,3,20 +"781",41.29999,36.700001,4,8053,3,3,20 +"782",48.69998,51.800003,5,8053,3,3,20 +"783",57.89999,71.400002,6,8053,3,3,20 +"784",65.39996,91.600006,7,8053,3,3,20 +"785",71.29999,115.800003,8,8053,3,3,20 +"786",80.59998,138,9,8053,3,3,20 +"787",88.39996,163,10,8053,3,3,20 +"788",96.29999,188,11,8053,3,3,20 
+"789",103.5,217.800001,12,8053,3,3,20 +"790",24,NA,1,8139,1,3,23 +"791",26.79999,6,2,8139,1,3,23 +"792",33,20,3,8139,1,3,23 +"793",39.09998,30.300003,4,8139,1,3,23 +"794",46.39999,45.100002,5,8139,1,3,23 +"795",54,61.400002,6,8139,1,3,23 +"796",61.59998,74.400002,7,8139,1,3,23 +"797",68.79999,91.800003,8,8139,1,3,23 +"798",72.19995,109.5,9,8139,1,3,23 +"799",77.19995,128,10,8139,1,3,23 +"800",83.79999,144.400002,11,8139,1,3,23 +"801",90.79999,160.300003,12,8139,1,3,23 +"802",35.39999,NA,1,8192,1,3,21 +"803",42.29999,15.100002,2,8192,1,3,21 +"804",49.5,29.800003,3,8192,1,3,21 +"805",58.69998,48.5,4,8192,1,3,21 +"806",66.59998,70,5,8192,1,3,21 +"807",71.39996,87.300001,6,8192,1,3,21 +"808",78.79999,104.200001,7,8192,1,3,21 +"809",85.5,125.5,8,8192,1,3,21 +"810",91.29999,144.900002,9,8192,1,3,21 +"811",97.79999,165,10,8192,1,3,21 +"812",108,197,11,8192,1,3,21 +"813",113,220,12,8192,1,3,21 +"814",22.09999,NA,1,8269,3,3,22 +"815",26.29999,6,2,8269,3,3,22 +"816",33.29999,16.800001,3,8269,3,3,22 +"817",38,27.400002,4,8269,3,3,22 +"818",39.59998,36.200001,5,8269,3,3,22 +"819",41.39999,43.300003,6,8269,3,3,22 +"820",46.29999,54,7,8269,3,3,22 +"821",52.39999,70,8,8269,3,3,22 +"822",59.39999,90,9,8269,3,3,22 +"823",66.69995,112.5,10,8269,3,3,22 +"824",73.89996,128,11,8269,3,3,22 +"825",76.79999,141.5,12,8269,3,3,22 +"826",23.7,NA,1,8270,1,3,22 +"827",28.79999,7.6,2,8270,1,3,22 +"828",35.09998,18.5,3,8270,1,3,22 +"829",43,33.200001,4,8270,1,3,22 +"830",49.09998,44.900002,5,8270,1,3,22 +"831",58.79999,60,6,8270,1,3,22 +"832",65.39996,75.800003,7,8270,1,3,22 +"833",68.5,91,8,8270,1,3,22 +"834",79,108.5,9,8270,1,3,22 +"835",90.69995,133.200001,10,8270,1,3,22 +"836",98,151.200005,11,8270,1,3,22 +"837",104,167.800001,12,8270,1,3,22 +"838",27.29999,NA,1,8439,3,3,24 +"839",31.2,8.8,2,8439,3,3,24 +"840",37,20,3,8439,3,3,24 +"841",43.89999,33.200001,4,8439,3,3,24 +"842",49.79999,44.800003,5,8439,3,3,24 +"843",57.19998,58,6,8439,3,3,24 +"844",65.19995,74.600002,7,8439,3,3,24 +"845",73.19995,94.900002,8,8439,3,3,24 +"846",78.79999,108.8,9,8439,3,3,24 +"847",87.39996,128,10,8439,3,3,24 +"848",95,149,11,8439,3,3,24 +"849",100.5,158.600006,12,8439,3,3,24 +"850",25.7,NA,1,8442,1,3,24 +"851",28.7,6.6,2,8442,1,3,24 +"852",33.39999,15.800003,3,8442,1,3,24 +"853",40,27,4,8442,1,3,24 +"854",46.69998,40,5,8442,1,3,24 +"855",56.59998,52,6,8442,1,3,24 +"856",65.19995,66,7,8442,1,3,24 +"857",73.19995,83.800003,8,8442,1,3,24 +"858",81.69995,99.800003,9,8442,1,3,24 +"859",90.29999,115.200001,10,8442,1,3,24 +"860",96,133.200001,11,8442,1,3,24 +"861",103.5,151.400002,12,8442,1,3,24 From 6e94ff8905e25aa2e7e0b05e929bb7fbb4d08aaa Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 6 Mar 2018 10:33:13 -0500 Subject: [PATCH 076/157] Remove a few unused imports --- statsmodels/regression/mixed_linear_model.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/statsmodels/regression/mixed_linear_model.py b/statsmodels/regression/mixed_linear_model.py index 5209b83cfd6..f0cf084ebce 100644 --- a/statsmodels/regression/mixed_linear_model.py +++ b/statsmodels/regression/mixed_linear_model.py @@ -145,7 +145,6 @@ import numpy as np import statsmodels.base.model as base -from scipy.optimize import fmin_ncg, fmin_cg, fmin_bfgs, fmin from statsmodels.tools.decorators import cache_readonly from statsmodels.tools import data as data_tools from scipy.stats.distributions import norm @@ -159,7 +158,6 @@ from statsmodels.tools.sm_exceptions import ConvergenceWarning from statsmodels.base._penalties import Penalty from 
statsmodels.compat.numpy import np_matrix_rank
-from pandas import DataFrame


 def _dot(x, y):
@@ -1960,8 +1958,6 @@ def fit(self, start_params=None, reml=True, niter_sa=0,
        else:
            hist = None

-        success = False
-
        if start_params is None:
            params = MixedLMParams(self.k_fe, self.k_re, self.k_vc)
            params.fe_params = np.zeros(self.k_fe)
@@ -2495,7 +2491,7 @@ def profile_re(self, re_ix, vtype, num_low=5, dist_low=1., num_high=5,
        k_fe = pmodel.k_fe
        k_re = pmodel.k_re
        k_vc = pmodel.k_vc
-        endog, exog, groups = pmodel.endog, pmodel.exog, pmodel.groups
+        endog, exog = pmodel.endog, pmodel.exog

        # Need to permute the columns of the random effects design
        # matrix so that the profiled variable is in the first column.

From e6f2f84585b5a49bb20a091f9dac3b008381868a Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Wed, 7 Mar 2018 10:01:54 -0500
Subject: [PATCH 077/157] Test cleanup, warn about unused keyword to fit

---
 statsmodels/regression/mixed_linear_model.py | 10 +++
 statsmodels/regression/tests/test_lme.py     | 68 ++++++++++++++------
 2 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/statsmodels/regression/mixed_linear_model.py b/statsmodels/regression/mixed_linear_model.py
index f0cf084ebce..d547de6554e 100644
--- a/statsmodels/regression/mixed_linear_model.py
+++ b/statsmodels/regression/mixed_linear_model.py
@@ -1916,8 +1916,15 @@ def fit(self, start_params=None, reml=True, niter_sa=0,
        reml : bool
            If true, fit according to the REML likelihood, else
            fit the standard likelihood using ML.
+        niter_sa : int
+            Currently this argument is ignored and has no effect
+            on the results.
        cov_pen : CovariancePenalty object
            A penalty for the random effects covariance matrix
+        do_cg : boolean, defaults to True
+            If False, the optimization is skipped and a results
+            object at the given (or default) starting values is
+            returned.
        fe_pen : Penalty object
            A penalty on the fixed effects
        free : MixedLMParams object
@@ -1985,6 +1992,9 @@
            kwargs["disp"] = False
            packed = params.get_packed(use_sqrt=self.use_sqrt, has_fe=False)

+            if niter_sa > 0:
+                warnings.warn("niter_sa is currently ignored")
+
            # It seems that the optimizers sometimes stop too soon, so
            # we run a few times.
            for rep in range(5):

diff --git a/statsmodels/regression/tests/test_lme.py b/statsmodels/regression/tests/test_lme.py
index 201e81e88fb..174d76eb19e 100644
--- a/statsmodels/regression/tests/test_lme.py
+++ b/statsmodels/regression/tests/test_lme.py
@@ -425,8 +425,8 @@ def test_dietox_slopes(self):
        # dietox data from geepack using random intercepts and slopes
        #
        # Fit in R using
        #
        # library(geepack); library(lme4)
-        # rm = lmer(Weight ~ Time + (1 + Time | Pig), data=dietox)
-        # rm = lmer(Weight ~ Time + (1 + Time | Pig), REML=FALSE, data=dietox)
+        # r = lmer(Weight ~ Time + (1 + Time | Pig), data=dietox)
+        # r = lmer(Weight ~ Time + (1 + Time | Pig), REML=FALSE, data=dietox)

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        rdir = os.path.join(cur_dir, 'results')
@@ -438,21 +438,21 @@ def test_dietox_slopes(self):
                                     re_formula="1 + Time", data=data)
        result = model.fit(method='powell')

-        # fixef(rm)
+        # fixef(r)
        assert_allclose(result.fe_params, np.r_[15.738650, 6.939014], rtol=1e-5)

-        # sqrt(diag(vcov(rm)))
+        # sqrt(diag(vcov(r)))
        assert_allclose(result.bse[0:2], np.r_[0.5501253, 0.0798254], rtol=1e-3)

-        # attr(VarCorr(rm), "sc")^2
+        # attr(VarCorr(r), "sc")^2
        assert_allclose(result.scale, 6.03745, rtol=1e-3)

-        # as.numeric(VarCorr(rm)[[1]])
+        # as.numeric(VarCorr(r)[[1]])
        assert_allclose(result.cov_re.values.ravel(),
                        np.r_[19.4934552, 0.2938323, 0.2938323, 0.4160620],
                        rtol=1e-1)

-        # logLik(rm)
+        # logLik(r)
        assert_allclose(model.loglike(result.params_object), -2217.047, rtol=1e-5)

        # ML
@@ -461,70 +461,96 @@ def test_dietox_slopes(self):
                                     re_formula="1 + Time", data=data)
        result = model.fit(method='powell', reml=False)

-        # fixef(rm)
+        # fixef(r)
        assert_allclose(result.fe_params, np.r_[15.73863, 6.93902], rtol=1e-5)

-        # sqrt(diag(vcov(rm)))
+        # sqrt(diag(vcov(r)))
        assert_allclose(result.bse[0:2], np.r_[0.54629282, 0.07926954], rtol=1e-3)

-        # attr(VarCorr(rm), "sc")^2
+        # attr(VarCorr(r), "sc")^2
        assert_allclose(result.scale, 6.037441, rtol=1e-3)

-        # as.numeric(VarCorr(rm)[[1]])
+        # as.numeric(VarCorr(r)[[1]])
        assert_allclose(result.cov_re.values.ravel(),
                        np.r_[19.190922, 0.293568, 0.293568, 0.409695], rtol=1e-2)

-        # logLik(rm)
+        # logLik(r)
        assert_allclose(model.loglike(result.params_object), -2215.753, rtol=1e-5)

     def test_pastes_vcomp(self):
        # pastes data from lme4
        #
-        # Fit in R using formula:
+        # Fit in R using:
        #
-        # strength ~ (1|batch) + (1|batch:cask)
+        # r = lmer(strength ~ (1|batch) + (1|batch:cask), data=data)
+        # r = lmer(strength ~ (1|batch) + (1|batch:cask), data=data, REML=FALSE)

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        rdir = os.path.join(cur_dir, 'results')
        fname = os.path.join(rdir, 'pastes.csv')
-
-        # REML
        data = pd.read_csv(fname)
        vcf = {"cask": "0 + cask"}
+
+        # REML
        model = MixedLM.from_formula("strength ~ 1", groups="batch",
                                     re_formula="1", vc_formula=vcf,
                                     data=data)
        result = model.fit()

+        # fixef(r)
        assert_allclose(result.fe_params.iloc[0], 60.0533, rtol=1e-3)
+
+        # sqrt(diag(vcov(r)))
        assert_allclose(result.bse.iloc[0], 0.6769, rtol=1e-3)
+
+        # VarCorr(r)$batch[[1]]
        assert_allclose(result.cov_re.iloc[0, 0], 1.657, rtol=1e-3)
+
+        # attr(VarCorr(r), "sc")^2
        assert_allclose(result.scale, 0.678, rtol=1e-3)
+
+        # logLik(r)
        assert_allclose(result.llf, -123.49, rtol=1e-1)
-        assert_equal(result.aic, np.nan)  # don't provide aic/bic with REML
+
+        # don't provide aic/bic with REML
+        assert_equal(result.aic, np.nan)
        assert_equal(result.bic, np.nan)

-        resid = np.r_[0.17133538, -0.02866462, -
-                      1.08662875, 1.11337125, -0.12093607]
+        # resid(r)[1:5]
+        resid = np.r_[0.17133538, -0.02866462, -1.08662875, 1.11337125,
+                      -0.12093607]
assert_allclose(result.resid[0:5], resid, rtol=1e-3) + # predict(r)[1:5] fit = np.r_[62.62866, 62.62866, 61.18663, 61.18663, 62.82094] assert_allclose(result.fittedvalues[0:5], fit, rtol=1e-4) # ML - data = pd.read_csv(fname) - vcf = {"cask": "0 + cask"} model = MixedLM.from_formula("strength ~ 1", groups="batch", re_formula="1", vc_formula=vcf, data=data) result = model.fit(reml=False) + + # fixef(r) assert_allclose(result.fe_params.iloc[0], 60.0533, rtol=1e-3) + + # sqrt(diag(vcov(r))) assert_allclose(result.bse.iloc[0], 0.642, rtol=1e-3) + + # VarCorr(r)$batch[[1]] assert_allclose(result.cov_re.iloc[0, 0], 1.199, rtol=1e-3) + + # attr(VarCorr(r), "sc")^2 assert_allclose(result.scale, 0.67799, rtol=1e-3) + + # logLik(r) assert_allclose(result.llf, -123.997, rtol=1e-1) + + # AIC(r) assert_allclose(result.aic, 255.9944, rtol=1e-3) + + # BIC(r) assert_allclose(result.bic, 264.3718, rtol=1e-3) def test_vcomp_formula(self): From 0f4fe80141a219975f65544782064dc0afe933f0 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 19 Mar 2018 21:00:01 -0400 Subject: [PATCH 078/157] Trigger From c1d78dcdc80995408c90bf5c4cb1db0c038963a8 Mon Sep 17 00:00:00 2001 From: Josef Date: Mon, 19 Mar 2018 22:40:09 -0400 Subject: [PATCH 079/157] ENH: add t_test_pairwise to LikelihoodModelResults, minimal unit test --- statsmodels/base/model.py | 9 +- .../base/tests/test_generic_methods.py | 33 +++++ statsmodels/stats/contrast.py | 133 ++++++++++++++++++ 3 files changed, 174 insertions(+), 1 deletion(-) diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py index cb350d7d356..123d3092f78 100644 --- a/statsmodels/base/model.py +++ b/statsmodels/base/model.py @@ -5,7 +5,8 @@ from statsmodels.base.data import handle_data from statsmodels.tools.data import _is_using_pandas from statsmodels.tools.tools import recipr, nan_dot -from statsmodels.stats.contrast import ContrastResults, WaldTestResults +from statsmodels.stats.contrast import (ContrastResults, WaldTestResults, + t_test_pairwise) from statsmodels.tools.decorators import resettable_cache, cache_readonly import statsmodels.base.wrapper as wrap from statsmodels.tools.numdiff import approx_fprime @@ -1645,6 +1646,12 @@ def wald_test_terms(self, skip_single=False, extra_constraints=None, res.temp = constraints + combined_constraints + extra_constraints return res + def t_test_pairwise(self, term_name, method='hs', + factor_labels=None, ignore=False): + res = t_test_pairwise(self, term_name, method=method, + factor_labels=factor_labels, ignore=ignore) + return res + def conf_int(self, alpha=.05, cols=None, method='default'): """ Returns the confidence interval of the fitted parameters. 
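For orientation, a minimal usage sketch of the new method added above (a hedged
example: the toy data frame and the names y and g are illustrative only, not
part of the patch; the unit tests in the next diff exercise the real API):

    import numpy as np
    import pandas as pd
    from statsmodels.formula.api import ols

    # illustrative data: a response and a three-level categorical factor
    np.random.seed(0)
    df = pd.DataFrame({"y": np.random.normal(size=30),
                       "g": list("abc") * 10})

    res = ols("y ~ C(g)", data=df).fit()

    # pairwise comparisons of the levels of C(g); p-values are corrected
    # for multiple testing ('hs' is Holm-Sidak in multipletests)
    pw = res.t_test_pairwise("C(g)", method="hs")
    print(pw.result_frame)

The rows of result_frame are labeled by level pairs ('b-a', 'c-a', 'c-b'), and
the pvalue-hs and reject-hs columns come from
statsmodels.stats.multitest.multipletests, which this patch imports in
contrast.py.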
diff --git a/statsmodels/base/tests/test_generic_methods.py b/statsmodels/base/tests/test_generic_methods.py index 9a862228391..57d3078a118 100644 --- a/statsmodels/base/tests/test_generic_methods.py +++ b/statsmodels/base/tests/test_generic_methods.py @@ -518,5 +518,38 @@ def initialize(cls): cls.res = mod.fit() # default use_t=True +class CheckPairwise(object): + + def test_default(self): + res = self.res + + tt = res.t_test(self.constraints) + + pw = res.t_test_pairwise(self.term_name) + pw_frame = pw.result_frame + assert_allclose(pw_frame.iloc[:, :6].values, + tt.summary_frame().values) + + +class TestTTestPairwiseOLS(CheckPairwise): + + @classmethod + def setup_class(cls): + from statsmodels.formula.api import ols + import statsmodels.stats.tests.test_anova as ttmod + + test = ttmod.TestAnova3() + test.setup_class() + cls.data = test.data.drop([0,1,2]) + + mod = ols("np.log(Days+1) ~ C(Duration) + C(Weight)", cls.data) + cls.res = mod.fit() + cls.term_name = "C(Weight)" + cls.constraints = ['C(Weight)[T.2]', + 'C(Weight)[T.3]', + 'C(Weight)[T.3] - C(Weight)[T.2]'] + + + if __name__ == '__main__': pass diff --git a/statsmodels/stats/contrast.py b/statsmodels/stats/contrast.py index 9dace7f9798..1b850136bd9 100644 --- a/statsmodels/stats/contrast.py +++ b/statsmodels/stats/contrast.py @@ -5,6 +5,7 @@ from scipy import stats from statsmodels.tools.tools import clean0, fullrank from statsmodels.compat.numpy import np_matrix_rank +from statsmodels.stats.multitest import multipletests #TODO: should this be public if it's just a container? @@ -412,3 +413,135 @@ def __str__(self): def __repr__(self): return str(self.__class__) + '\n' + self.__str__() + + +# t_test for pairwise comparison and automatic contrast/restrictions + + +def _get_pairs_labels(k_level, level_names): + idx_pairs_all = np.triu_indices(k_level, 1) + labels = ['%s-%s' % (level_names[name[1]], level_names[name[0]]) for name in zip(*idx_pairs_all)] + return labels + +def contrast_pairs(k_params, k_level, idx_start, level_names=None): + """create pairwise contrast for reference coding + """ + k_level_m1 = k_level - 1 + idx_pairs = np.triu_indices(k_level_m1, 1) + + k = len(idx_pairs[0]) + c_pairs = np.zeros((k, k_level_m1)) + c_pairs[np.arange(k), idx_pairs[0]] = -1 + c_pairs[np.arange(k), idx_pairs[1]] = 1 + c_reference = np.eye(k_level_m1) + c = np.concatenate((c_reference, c_pairs), axis=0) + k_all = c.shape[0] + + contrasts = np.zeros((k_all, k_params)) + contrasts[:, idx_start : idx_start + k_level_m1] = c + + return contrasts + + +def t_test_multi(result, contrasts, method='hs', ci_method=None, contrast_names=None): + tt = result.t_test(contrasts) + res_df = tt.summary_frame(xname=contrast_names) + + if type(method) is not list: + method = [method] + for meth in method: + mt = multipletests(tt.pvalue, method=meth) + res_df['pvalue-%s' % meth] = mt[1] + res_df['reject-%s' % meth] = mt[0] + return res_df + + +class MultiCompResult(object): + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + + +def _embed_constraints(contrasts, k_params, idx_start): + + k_c, k_p = contrasts.shape + c = np.zeros((k_c, k_params)) + if isinstance(idx_start, int): + # no ducks, int_likes supported yet + c[:, idx_start : idx_start + k_p] = contrasts + else: + c[:, idx_start] = contrasts + return c + + +def t_test_pairwise(result, term_name, method='hs', factor_labels=None, ignore=False): + """get pairwise t_test with multiple testing corrected p-values + + This uses the formula design_info encoding contrast matrix and should work 
for
+    all encodings of a main effect.
+
+    Parameters
+    ----------
+    result : result instance
+    term_name : str
+        name of the term for which pairwise comparisons are computed
+    method : str or list of strings
+        multiple testing p-value correction, default is 'hs', see stats.multipletesting
+    factor_labels : None, list of str
+        Labels for the factor levels used for pairwise labels. If not provided,
+        then the labels from the formula design_info are used.
+    ignore : boolean
+        This function tries to detect whether an appropriate factor encoding was
+        used and will raise a ValueError if the factor encoding is not a simple
+        reference coding. These exceptions can be turned off.
+
+    Returns
+    -------
+    results : instance of a simple Results class
+        The results are stored as attributes, the main attributes are the following two. Other
+        attributes are added for debugging purposes or as background information.
+
+        - result_frame : pandas DataFrame with t_test results and multiple testing corrected p-values
+        - contrasts : matrix of constraints of the null hypothesis in the t_test
+
+    Notes
+    -----
+
+    Status: experimental. Currently only checked for treatment coding with and without specified
+    reference level.
+
+    Currently there are no multiple testing corrected confidence intervals available
+
+    """
+
+    desinfo = result.model.data.design_info
+    term_idx = desinfo.term_names.index(term_name)
+    term = desinfo.terms[term_idx]
+    idx_start = desinfo.term_slices[term].start
+    if not ignore and len(term.factors) > 1:
+        raise ValueError('interaction effects not yet supported')
+    factor = term.factors[0]
+    cat = desinfo.factor_infos[factor].categories
+    if factor_labels is not None:
+        if len(factor_labels) == len(cat):
+            cat = factor_labels
+        else:
+            raise ValueError("factor_labels has the wrong length, should be %d" % len(cat))
+
+
+    k_level = len(cat)
+    cm = desinfo.term_codings[term][0].contrast_matrices[factor].matrix
+
+    k_params = len(result.params)
+    labels = _get_pairs_labels(k_level, cat)
+
+    import statsmodels.sandbox.stats.multicomp as mc
+    c_all_pairs = -mc.contrast_allpairs(k_level)
+    contrasts_sub = c_all_pairs.dot(cm)
+    contrasts = _embed_constraints(contrasts_sub, k_params, idx_start)
+    res_df = t_test_multi(result, contrasts, method=method, ci_method=None, contrast_names=labels)
+    res = MultiCompResult(result_frame=res_df,
+                          contrasts=contrasts,
+                          term=term,
+                          contrast_labels=labels,
+                          term_encoding_matrix=cm)
+    return res

From 5142b877a410b4cbf9af560b039e99e93f2b0849 Mon Sep 17 00:00:00 2001
From: Josef
Date: Mon, 19 Mar 2018 23:41:08 -0400
Subject: [PATCH 080/157] TST: t_test_pairwise, unit tests for different exog
 parameterization/encoding

---
 .../base/tests/test_generic_methods.py        | 55 +++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/statsmodels/base/tests/test_generic_methods.py b/statsmodels/base/tests/test_generic_methods.py
index 57d3078a118..a4e357e3d3d 100644
--- a/statsmodels/base/tests/test_generic_methods.py
+++ b/statsmodels/base/tests/test_generic_methods.py
@@ -550,6 +550,61 @@ def setup_class(cls):
                           'C(Weight)[T.3] - C(Weight)[T.2]']


+class TestTTestPairwiseOLS2(CheckPairwise):
+
+    @classmethod
+    def setup_class(cls):
+        from statsmodels.formula.api import ols
+        import statsmodels.stats.tests.test_anova as ttmod
+
+        test = ttmod.TestAnova3()
+        test.setup_class()
+        cls.data = test.data.drop([0,1,2])
+
+        mod = ols("np.log(Days+1) ~ C(Weight) + C(Duration)", cls.data)
+        cls.res = mod.fit()
+        cls.term_name = "C(Weight)"
+        cls.constraints =
['C(Weight)[T.2]', + 'C(Weight)[T.3]', + 'C(Weight)[T.3] - C(Weight)[T.2]'] + + +class TestTTestPairwiseOLS3(CheckPairwise): + + @classmethod + def setup_class(cls): + from statsmodels.formula.api import ols + import statsmodels.stats.tests.test_anova as ttmod + + test = ttmod.TestAnova3() + test.setup_class() + cls.data = test.data.drop([0,1,2]) + + mod = ols("np.log(Days+1) ~ C(Weight) + C(Duration) - 1", cls.data) + cls.res = mod.fit() + cls.term_name = "C(Weight)" + cls.constraints = ['C(Weight)[2] - C(Weight)[1]', + 'C(Weight)[3] - C(Weight)[1]', + 'C(Weight)[3] - C(Weight)[2]'] + +class TestTTestPairwiseOLS4(CheckPairwise): + + @classmethod + def setup_class(cls): + from statsmodels.formula.api import ols + import statsmodels.stats.tests.test_anova as ttmod + + test = ttmod.TestAnova3() + test.setup_class() + cls.data = test.data.drop([0,1,2]) + + mod = ols("np.log(Days+1) ~ C(Weight, Treatment(2)) + C(Duration)", cls.data) + cls.res = mod.fit() + cls.term_name = "C(Weight, Treatment(2))" + cls.constraints = ['-C(Weight, Treatment(2))[T.1]', + 'C(Weight, Treatment(2))[T.3] - C(Weight, Treatment(2))[T.1]', + 'C(Weight, Treatment(2))[T.3]',] + if __name__ == '__main__': pass From 9515fd38b1e2bd0ecb6337a65fb78d6538e5b935 Mon Sep 17 00:00:00 2001 From: Josef Date: Tue, 20 Mar 2018 08:53:39 -0400 Subject: [PATCH 081/157] REF: avoid isinstance int, unit test poisson --- .../base/tests/test_generic_methods.py | 20 +++++++++++++++++++ statsmodels/stats/contrast.py | 7 +++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/statsmodels/base/tests/test_generic_methods.py b/statsmodels/base/tests/test_generic_methods.py index a4e357e3d3d..98f470011ba 100644 --- a/statsmodels/base/tests/test_generic_methods.py +++ b/statsmodels/base/tests/test_generic_methods.py @@ -606,5 +606,25 @@ def setup_class(cls): 'C(Weight, Treatment(2))[T.3]',] +class TestTTestPairwisePoisson(CheckPairwise): + + @classmethod + def setup_class(cls): + from statsmodels.discrete.discrete_model import Poisson + import statsmodels.stats.tests.test_anova as ttmod + + test = ttmod.TestAnova3() + test.setup_class() + cls.data = test.data.drop([0,1,2]) + + mod = Poisson.from_formula("Days ~ C(Duration) + C(Weight)", cls.data) + cls.res = mod.fit(cov_type='HC0') + cls.term_name = "C(Weight)" + cls.constraints = ['C(Weight)[T.2]', + 'C(Weight)[T.3]', + 'C(Weight)[T.3] - C(Weight)[T.2]'] + + + if __name__ == '__main__': pass diff --git a/statsmodels/stats/contrast.py b/statsmodels/stats/contrast.py index 1b850136bd9..754d3ac5a52 100644 --- a/statsmodels/stats/contrast.py +++ b/statsmodels/stats/contrast.py @@ -461,15 +461,14 @@ def __init__(self, **kwargs): self.__dict__.update(kwargs) -def _embed_constraints(contrasts, k_params, idx_start): +def _embed_constraints(contrasts, k_params, idx_start, index=None): k_c, k_p = contrasts.shape c = np.zeros((k_c, k_params)) - if isinstance(idx_start, int): - # no ducks, int_likes supported yet + if index is None: c[:, idx_start : idx_start + k_p] = contrasts else: - c[:, idx_start] = contrasts + c[:, index] = contrasts return c From c51b9d54eef3cc543db34937cde24753e744eca4 Mon Sep 17 00:00:00 2001 From: Josef Date: Tue, 20 Mar 2018 10:12:14 -0400 Subject: [PATCH 082/157] BUG: cast test data to int, avoid patsy long int name --- statsmodels/stats/tests/test_anova.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/stats/tests/test_anova.py b/statsmodels/stats/tests/test_anova.py index d8c6a0b613e..60e96799de2 100644 --- 
a/statsmodels/stats/tests/test_anova.py
+++ b/statsmodels/stats/tests/test_anova.py
@@ -72,7 +72,7 @@
 """)
 kidney_table.seek(0)
-kidney_table = read_table(kidney_table, sep="\s+")
+kidney_table = read_table(kidney_table, sep="\s+").astype(int)

 class TestAnovaLM(object):
    @classmethod

From bd91e53a479c47a4b696723b6e04cfdf04a7696d Mon Sep 17 00:00:00 2001
From: Josef
Date: Tue, 20 Mar 2018 11:31:22 -0400
Subject: [PATCH 083/157] DOC/Ref: docstrings and argument cleanup

---
 statsmodels/base/model.py     |  61 +++++++++++++++-
 statsmodels/stats/contrast.py | 130 ++++++++++++++++++++++++++++------
 2 files changed, 167 insertions(+), 24 deletions(-)

diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py
index 123d3092f78..ee10dfb4896 100644
--- a/statsmodels/base/model.py
+++ b/statsmodels/base/model.py
@@ -1647,9 +1647,66 @@ def wald_test_terms(self, skip_single=False, extra_constraints=None,
        return res

     def t_test_pairwise(self, term_name, method='hs',
-                        factor_labels=None, ignore=False):
+                        factor_labels=None):
+        """perform pairwise t_test with multiple testing corrected p-values
+
+        This uses the formula design_info encoding contrast matrix and should
+        work for all encodings of a main effect.
+
+        Parameters
+        ----------
+        term_name : str
+            name of the term for which pairwise comparisons are computed.
+            Term names for categorical effects are created by patsy and
+            correspond to the main part of the exog names.
+        method : str or list of strings
+            multiple testing p-value correction, default is 'hs',
+            see statsmodels.stats.multitest.multipletests
+        factor_labels : None, list of str
+            Labels for the factor levels used for pairwise labels. If not
+            provided, then the labels from the formula design_info are used.
+
+        Returns
+        -------
+        results : instance of a simple Results class
+            The results are stored as attributes, the main attributes are the
+            following two. Other attributes are added for debugging purposes
+            or as background information.
+
+            - result_frame : pandas DataFrame with t_test results and multiple
+              testing corrected p-values.
+            - contrasts : matrix of constraints of the null hypothesis in the
+              t_test.
+
+        Notes
+        -----
+
+        Status: experimental. Currently only checked for treatment coding with
+        and without specified reference level.
+
+        Currently there are no multiple testing corrected confidence intervals
+        available.
+
+        Examples
+        --------
+        >>> res = ols("np.log(Days+1) ~ C(Weight) + C(Duration)", data).fit()
+        >>> pw = res.t_test_pairwise("C(Weight)")
+        >>> pw.result_frame
+                 coef   std err         t         P>|t|  Conf. Int. Low \
+        2-1  0.632315  0.230003  2.749157  8.028083e-03        0.171563
+        3-1  1.302555  0.230003  5.663201  5.331513e-07        0.841803
+        3-2  0.670240  0.230003  2.914044  5.119126e-03        0.209488
+
+             Conf. Int. Upp.
pvalue-hs  reject-hs
+        2-1         1.093067   0.010212       True
+        3-1         1.763307   0.000002       True
+        3-2         1.130992   0.010212       True
+
+        """
        res = t_test_pairwise(self, term_name, method=method,
-                              factor_labels=factor_labels, ignore=ignore)
+                              factor_labels=factor_labels)
        return res

     def conf_int(self, alpha=.05, cols=None, method='default'):
diff --git a/statsmodels/stats/contrast.py b/statsmodels/stats/contrast.py
index 9dace7f9798..58441ff2770 100644
--- a/statsmodels/stats/contrast.py
+++ b/statsmodels/stats/contrast.py
@@ -419,12 +419,38 @@ def __repr__(self):


 def _get_pairs_labels(k_level, level_names):
+    """helper function for labels for pairwise comparisons
+    """
    idx_pairs_all = np.triu_indices(k_level, 1)
-    labels = ['%s-%s' % (level_names[name[1]], level_names[name[0]]) for name in zip(*idx_pairs_all)]
+    labels = ['%s-%s' % (level_names[name[1]], level_names[name[0]])
+              for name in zip(*idx_pairs_all)]
    return labels

-def contrast_pairs(k_params, k_level, idx_start, level_names=None):
+
+def _contrast_pairs(k_params, k_level, idx_start):
    """create pairwise contrast for reference coding
+
+    currently not used,
+    using encoding contrast matrix is more general, but requires
+    factor information from patsy design_info.
+
+
+    Parameters
+    ----------
+    k_params : int
+        number of parameters
+    k_level : int
+        number of levels or categories (including reference case)
+    idx_start : int
+        Index of the first parameter of this factor. The restrictions on the
+        factor are inserted as a block in the full restriction matrix starting
+        at column with index `idx_start`.
+
+    Returns
+    -------
+    contrasts : ndarray
+        restriction matrix with k_params columns and number of rows equal to
+        the number of restrictions.
    """
    k_level_m1 = k_level - 1
    idx_pairs = np.triu_indices(k_level_m1, 1)
@@ -443,7 +469,31 @@
    return contrasts


-def t_test_multi(result, contrasts, method='hs', ci_method=None, contrast_names=None):
+def t_test_multi(result, contrasts, method='hs', ci_method=None,
+                 contrast_names=None):
+    """perform t_test and add multiplicity correction to results dataframe
+
+    Parameters
+    ----------
+    result : results instance
+        results of an estimated model
+    contrasts : ndarray
+        restriction matrix for t_test
+    method : string or list of strings
+        method for multiple testing p-value correction, default is 'hs'.
+    ci_method : None
+        not used yet, will be for multiplicity corrected confidence intervals
+    contrast_names : list of strings or None
+        If contrast_names are provided, then they are used in the index of the
+        returned dataframe, otherwise some generic default names are created.
+
+    Returns
+    -------
+    res_df : pandas DataFrame
+        The dataframe contains the results of the t_test and additional columns
+        for multiplicity corrected p-values and boolean indicator for whether
+        the Null hypothesis is rejected.
+    """
    tt = result.t_test(contrasts)
    res_df = tt.summary_frame(xname=contrast_names)

@@ -457,11 +507,39 @@ def t_test_multi(result, contrasts, method='hs', ci_method=None,


 class MultiCompResult(object):
+    """class to hold return of t_test_pairwise
+
+    currently just a minimal class to hold attributes.
+    """
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)


 def _embed_constraints(contrasts, k_params, idx_start, index=None):
+    """helper function to expand constraints to a full restriction matrix
+
+    Parameters
+    ----------
+    contrasts : ndarray
+        restriction matrix for t_test
+    k_params : int
+        number of parameters
+    idx_start : int
+        Index of the first parameter of this factor. The restrictions on the
+        factor are inserted as a block in the full restriction matrix starting
+        at column with index `idx_start`.
+    index : slice or ndarray
+        Column index if constraints do not form a block in the full restriction
+        matrix, i.e. if parameters that are subject to restrictions are not
+        consecutive in the list of parameters.
+        If index is not None, then idx_start is ignored.
+
+    Returns
+    -------
+    contrasts : ndarray
+        restriction matrix with k_params columns and number of rows equal to
+        the number of restrictions.
+    """

    k_c, k_p = contrasts.shape
    c = np.zeros((k_c, k_params))
@@ -472,43 +550,50 @@
    return c


-def t_test_pairwise(result, term_name, method='hs', factor_labels=None, ignore=False):
-    """get pairwise t_test with multiple testing corrected p-values
+def t_test_pairwise(result, term_name, method='hs', factor_labels=None,
+                    ignore=False):
+    """perform pairwise t_test with multiple testing corrected p-values

-    This uses the formula design_info encoding contrast matrix and should work for
-    all encodings of a main effect.
+    This uses the formula design_info encoding contrast matrix and should
+    work for all encodings of a main effect.

     Parameters
     ----------
     result : result instance
+        The results of an estimated model with a categorical main effect.
     term_name : str
-        name of the term for which pairwise comparisons are computed
+        name of the term for which pairwise comparisons are computed.
+        Term names for categorical effects are created by patsy and
+        correspond to the main part of the exog names.
     method : str or list of strings
-        multiple testing p-value correction, default is 'hs', see stats.multipletesting
+        multiple testing p-value correction, default is 'hs',
+        see statsmodels.stats.multitest.multipletests
     factor_labels : None, list of str
-        Labels for the factor levels used for pairwise labels. If not provided,
-        then the labels from the formula design_info are used.
+        Labels for the factor levels used for pairwise labels. If not
+        provided, then the labels from the formula design_info are used.
     ignore : boolean
-        This function tries to detect whether an appropriate factor encoding was
-        used and will raise a ValueError if the factor encoding is not a simple
-        reference coding. These exceptions can be turned off.
+        Turn off some of the exceptions raised by input checks.

     Returns
     -------
     results : instance of a simple Results class
-        The results are stored as attributes, the main attributes are the following two. Other
-        attributes are added for debugging purposes or as background information.
+        The results are stored as attributes, the main attributes are the
+        following two. Other attributes are added for debugging purposes
+        or as background information.

-        - result_frame : pandas DataFrame with t_test results and multiple testing corrected p-values
-        - contrasts : matrix of constraints of the null hypothesis in the t_test
+        - result_frame : pandas DataFrame with t_test results and multiple
+          testing corrected p-values.
+        - contrasts : matrix of constraints of the null hypothesis in the
+          t_test.
Notes ----- - Status: experimental. Currently only checked for treatment coding with and without specified - reference level. + Status: experimental. Currently only checked for treatment coding with + and without specified reference level. - Currently there are no multiple testing corrected confidence intervals available + Currently there are no multiple testing corrected confidence intervals + available. """ @@ -537,7 +622,8 @@ def t_test_pairwise(result, term_name, method='hs', factor_labels=None, ignore=F c_all_pairs = -mc.contrast_allpairs(k_level) contrasts_sub = c_all_pairs.dot(cm) contrasts = _embed_constraints(contrasts_sub, k_params, idx_start) - res_df = t_test_multi(result, contrasts, method=method, ci_method=None, contrast_names=labels) + res_df = t_test_multi(result, contrasts, method=method, ci_method=None, + contrast_names=labels) res = MultiCompResult(result_frame=res_df, contrasts=contrasts, term=term, From ea51b59ee8ce0fe10a3fa6ac499740ab4a0ea01a Mon Sep 17 00:00:00 2001 From: Josef Date: Tue, 20 Mar 2018 15:47:11 -0400 Subject: [PATCH 084/157] ENH/TST: add _constraints_factor to contrast.py --- statsmodels/stats/contrast.py | 47 ++++++++++++++++++++++++ statsmodels/stats/tests/test_contrast.py | 30 +++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/statsmodels/stats/contrast.py b/statsmodels/stats/contrast.py index 58441ff2770..ff3c7c81f09 100644 --- a/statsmodels/stats/contrast.py +++ b/statsmodels/stats/contrast.py @@ -550,6 +550,53 @@ def _embed_constraints(contrasts, k_params, idx_start, index=None): return c +def _constraints_factor(encoding_matrix, comparison='pairwise', k_params=None, + idx_start=None): + """helper function to create constraints based on encoding matrix + + Parameters + ---------- + encoding_matrix : ndarray + contrast matrix for the encoding of a factor as defined by patsy. + The number of rows should be equal to the number of levels or categories + of the factor, the number of columns should be equal to the number + of parameters for this factor. + comparison : str + Currently only 'pairwise' is implemented. The restriction matrix + can be used for testing the hypothesis that all pairwise differences + are zero. + k_params : int + number of parameters + idx_start : int + Index of the first parameter of this factor. The restrictions on the + factor are inserted as a block in the full restriction matrix starting + at column with index `idx_start`. + + Returns + ------- + contrast : ndarray + Contrast or restriction matrix that can be used in hypothesis test + of model results. The number of columns is k_params. 
+    """
+
+    cm = encoding_matrix
+    k_level, k_p = cm.shape
+
+    import statsmodels.sandbox.stats.multicomp as mc
+    if comparison in ['pairwise', 'pw', 'pairs']:
+        c_all = -mc.contrast_allpairs(k_level)
+    else:
+        raise NotImplementedError('currently only pairwise comparison')
+
+    contrasts = c_all.dot(cm)
+    if k_params is not None:
+        if idx_start is None:
+            raise ValueError("if k_params is not None, then idx_start is "
+                             "required")
+        contrasts = _embed_constraints(contrasts, k_params, idx_start)
+    return contrasts
+
+
 def t_test_pairwise(result, term_name, method='hs', factor_labels=None,
                     ignore=False):
     """perform pairwise t_test with multiple testing corrected p-values
diff --git a/statsmodels/stats/tests/test_contrast.py b/statsmodels/stats/tests/test_contrast.py
index 4054759a619..01a537c7a20 100644
--- a/statsmodels/stats/tests/test_contrast.py
+++ b/statsmodels/stats/tests/test_contrast.py
@@ -2,6 +2,8 @@
 import numpy.random as R
 from numpy.testing import assert_almost_equal, assert_equal
 from statsmodels.stats.contrast import Contrast
+import statsmodels.stats.contrast as smc
+
 
 class TestContrast(object):
     @classmethod
@@ -36,3 +38,31 @@ def test_estimable(self):
         c = Contrast(self.X[:,5],X2)
         #TODO: I don't think this should be estimable?  isestimable correct?
+
+
+def test_constraints():
+    cm_ = np.eye(4, 3, k=-1)
+    cpairs = np.array([[ 1.,  0.,  0.],
+                       [ 0.,  1.,  0.],
+                       [ 0.,  0.,  1.],
+                       [-1.,  1.,  0.],
+                       [-1.,  0.,  1.],
+                       [ 0., -1.,  1.]])
+    c0 = smc._constraints_factor(cm_)
+    assert_equal(c0, cpairs)
+
+    c1 = smc._contrast_pairs(3, 4, 0)
+    assert_equal(c1, cpairs)
+
+    # embedded
+    cpairs2 = np.array([[ 0.,  1.,  0.,  0.,  0.,  0.],
+                        [ 0.,  0.,  1.,  0.,  0.,  0.],
+                        [ 0.,  0.,  0.,  1.,  0.,  0.],
+                        [ 0., -1.,  1.,  0.,  0.,  0.],
+                        [ 0., -1.,  0.,  1.,  0.,  0.],
+                        [ 0.,  0., -1.,  1.,  0.,  0.]])
+
+    c0 = smc._constraints_factor(cm_, k_params=6, idx_start=1)
+    assert_equal(c0, cpairs2)
+
+    c1 = smc._contrast_pairs(6, 4, 1)  # k_params, k_level, idx_start
+    assert_equal(c1, cpairs2)

From ba959b6af0d4262237db4f8d7d49a2fbd5e780fb Mon Sep 17 00:00:00 2001
From: Josef
Date: Tue, 20 Mar 2018 20:57:07 -0400
Subject: [PATCH 085/157] ENH: add alpha option to t_test_pairwise, more test coverage

---
 statsmodels/base/model.py                      |  6 +++--
 .../base/tests/test_generic_methods.py         | 16 +++++++++++++
 statsmodels/stats/contrast.py                  | 24 +++++++++++--------
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py
index ee10dfb4896..8fd06754425 100644
--- a/statsmodels/base/model.py
+++ b/statsmodels/base/model.py
@@ -1646,7 +1646,7 @@ def wald_test_terms(self, skip_single=False, extra_constraints=None,
         res.temp = constraints + combined_constraints + extra_constraints
         return res
 
-    def t_test_pairwise(self, term_name, method='hs',
+    def t_test_pairwise(self, term_name, method='hs', alpha=0.05,
                         factor_labels=None):
         """perform pairwise t_test with multiple testing corrected p-values
 
@@ -1664,6 +1664,8 @@ def t_test_pairwise(self, term_name, method='hs',
         method : str or list of strings
             multiple testing p-value correction, default is 'hs',
             see stats.multipletesting
+        alpha : float
+            significance level for multiple testing reject decision.
         factor_labels : None, list of str
             Labels for the factor levels used for pairwise labels. If not
             provided, then the labels from the formula design_info are used.
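
For orientation, a minimal sketch of how the pairwise helper extended in this
patch might be called from a fitted formula model; the example data and
variable names below are made up for illustration and are not part of the
patch:

    import numpy as np
    import pandas as pd
    import statsmodels.formula.api as smf

    # hypothetical data with one three-level categorical factor
    df = pd.DataFrame({'y': np.random.randn(90),
                       'g': np.repeat(['a', 'b', 'c'], 30)})
    res = smf.ols('y ~ C(g)', data=df).fit()
    # pairwise comparisons of the levels of C(g); the new `alpha` keyword
    # sets the significance level for the multiplicity-corrected reject
    # decision
    pw = res.t_test_pairwise('C(g)', method='hs', alpha=0.05)
    print(pw.result_frame)
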
@@ -1705,7 +1707,7 @@
                  3-2     1.130992  0.010212    True
 
         """
-        res = t_test_pairwise(self, term_name, method=method,
+        res = t_test_pairwise(self, term_name, method=method, alpha=alpha,
                               factor_labels=factor_labels)
         return res
 
diff --git a/statsmodels/base/tests/test_generic_methods.py b/statsmodels/base/tests/test_generic_methods.py
index 98f470011ba..50434b5a743 100644
--- a/statsmodels/base/tests/test_generic_methods.py
+++ b/statsmodels/base/tests/test_generic_methods.py
@@ -550,6 +550,22 @@ def setup_class(cls):
                      'C(Weight)[T.3] - C(Weight)[T.2]']
 
 
+    def test_alpha(self):
+        pw1 = self.res.t_test_pairwise(self.term_name, method='hommel',
+                                       factor_labels='A B C'.split())
+        pw2 = self.res.t_test_pairwise(self.term_name, method='hommel',
+                                       alpha=0.01)
+        assert_allclose(pw1.result_frame.iloc[:, :7].values,
+                        pw2.result_frame.iloc[:, :7].values, rtol=1e-10)
+        assert_equal(pw1.result_frame.iloc[:, -1].values,
+                     [True]*3)
+        assert_equal(pw2.result_frame.iloc[:, -1].values,
+                     [False, True, False])
+
+        assert_equal(pw1.result_frame.index.values,
+                     np.array(['B-A', 'C-A', 'C-B'], dtype=object))
+
+
 class TestTTestPairwiseOLS2(CheckPairwise):
 
     @classmethod
diff --git a/statsmodels/stats/contrast.py b/statsmodels/stats/contrast.py
index ff3c7c81f09..30990e19ec9 100644
--- a/statsmodels/stats/contrast.py
+++ b/statsmodels/stats/contrast.py
@@ -469,23 +469,25 @@ def _contrast_pairs(k_params, k_level, idx_start):
     return contrasts
 
 
-def t_test_multi(result, contrasts, method='hs', ci_method=None,
+def t_test_multi(result, contrasts, method='hs', alpha=0.05, ci_method=None,
                  contrast_names=None):
     """perform t_test and add multiplicity correction to results dataframe
 
     Parameters
     ----------
     result : results instance
-        results of an estimated model
+        results of an estimated model
     contrasts : ndarray
         restriction matrix for t_test
     method : string or list of strings
-        method for multiple testing p-value correction, default is 'hs'.
+        method for multiple testing p-value correction, default is 'hs'.
+    alpha : float
+        significance level for multiple testing reject decision.
     ci_method : None
-        not used yet, will be for multiplicity corrected confidence intervals
+        not used yet, will be for multiplicity corrected confidence intervals
     contrast_names : list of strings or None
-        If contrast_names are provided, then they are used in the index of the
-        returned dataframe, otherwise some generic default names are created.
+        If contrast_names are provided, then they are used in the index of the
+        returned dataframe, otherwise some generic default names are created.
Returns ------- @@ -500,7 +502,7 @@ def t_test_multi(result, contrasts, method='hs', ci_method=None, if type(method) is not list: method = [method] for meth in method: - mt = multipletests(tt.pvalue, method=meth) + mt = multipletests(tt.pvalue, method=meth, alpha=alpha) res_df['pvalue-%s' % meth] = mt[1] res_df['reject-%s' % meth] = mt[0] return res_df @@ -597,8 +599,8 @@ def _constraints_factor(encoding_matrix, comparison='pairwise', k_params=None, return contrasts -def t_test_pairwise(result, term_name, method='hs', factor_labels=None, - ignore=False): +def t_test_pairwise(result, term_name, method='hs', alpha=0.05, + factor_labels=None, ignore=False): """perform pairwise t_test with multiple testing corrected p-values This uses the formula design_info encoding contrast matrix and should @@ -615,6 +617,8 @@ def t_test_pairwise(result, term_name, method='hs', factor_labels=None, method : str or list of strings multiple testing p-value correction, default is 'hs', see stats.multipletesting + alpha : float + significance level for multiple testing reject decision. factor_labels : None, list of str Labels for the factor levels used for pairwise labels. If not provided, then the labels from the formula design_info are used. @@ -670,7 +674,7 @@ def t_test_pairwise(result, term_name, method='hs', factor_labels=None, contrasts_sub = c_all_pairs.dot(cm) contrasts = _embed_constraints(contrasts_sub, k_params, idx_start) res_df = t_test_multi(result, contrasts, method=method, ci_method=None, - contrast_names=labels) + alpha=alpha, contrast_names=labels) res = MultiCompResult(result_frame=res_df, contrasts=contrasts, term=term, From 3d977d9d3b049add51858445fa253658f5b1006f Mon Sep 17 00:00:00 2001 From: Rob Klooster Date: Wed, 21 Mar 2018 16:38:18 +0100 Subject: [PATCH 086/157] BUG: Prevent warnings from scipy.stats --- statsmodels/stats/anova.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/statsmodels/stats/anova.py b/statsmodels/stats/anova.py index db71a46e4d3..1348671af25 100644 --- a/statsmodels/stats/anova.py +++ b/statsmodels/stats/anova.py @@ -132,16 +132,14 @@ def anova1_lm_single(model, endog, exog, nobs, design_info, table, n_rows, test, index = term_names.tolist() table.index = Index(index + ['Residual']) table.loc[index, ['df', 'sum_sq']] = np.c_[arr[~idx].sum(1), sum_sq] - if test == 'F': - table.loc[table.index[:n_rows], test] = ((table['sum_sq']/table['df']) / - (model.ssr/model.df_resid)) - table.loc[table.index[:n_rows], pr_test] = stats.f.sf(table["F"], table["df"], - model.df_resid) - # fill in residual - table.loc['Residual', ['sum_sq','df', test, pr_test]] = (model.ssr, - model.df_resid, - np.nan, np.nan) + table.loc['Residual', ['sum_sq','df']] = model.ssr, model.df_resid + if test == 'F': + table[test] = ((table['sum_sq'] / table['df']) / + (model.ssr / model.df_resid)) + table[pr_test] = stats.f.sf(table["F"], table["df"], + model.df_resid) + table.loc['Residual', [test, pr_test]] = np.nan, np.nan table['mean_sq'] = table['sum_sq'] / table['df'] return table @@ -333,19 +331,9 @@ def anova_lm(*args, **kwargs): raise ValueError("Multiple models only supported for type I. 
" "Got type %s" % str(typ)) - ### COMPUTE Anova TYPE I ### - - # if given a single model - if len(args) == 1: - return anova_single(*args, **kwargs) - - # received multiple fitted models - test = kwargs.get("test", "F") scale = kwargs.get("scale", None) n_models = len(args) - - model_formula = [] pr_test = "Pr(>%s)" % test names = ['df_resid', 'ssr', 'df_diff', 'ss_diff', test, pr_test] table = DataFrame(np.zeros((n_models, 6)), columns = names) From 0871eba510e7ebe8aa9323f4f3ee50226576ed29 Mon Sep 17 00:00:00 2001 From: Josef Date: Wed, 21 Mar 2018 14:15:51 -0400 Subject: [PATCH 087/157] BUG: fix MICEData if column with one missing value closes #4375 --- statsmodels/imputation/mice.py | 2 +- statsmodels/imputation/tests/test_mice.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/statsmodels/imputation/mice.py b/statsmodels/imputation/mice.py index 13b1bf974cf..233a7512371 100644 --- a/statsmodels/imputation/mice.py +++ b/statsmodels/imputation/mice.py @@ -392,7 +392,7 @@ def _store_changes(self, col, vals): ix = self.ix_miss[col] if len(ix) > 0: - self.data[col].iloc[ix] = vals + self.data[col].iloc[ix] = np.atleast_1d(vals) def update_all(self, n_iter=1): diff --git a/statsmodels/imputation/tests/test_mice.py b/statsmodels/imputation/tests/test_mice.py index 69a3621499f..7b5874ba505 100644 --- a/statsmodels/imputation/tests/test_mice.py +++ b/statsmodels/imputation/tests/test_mice.py @@ -339,6 +339,29 @@ def test_combine(self): assert_allclose(result.tvalues, tvalues, atol=1e-5) +def test_micedata_miss1(): + # test for #4375 + np.random.seed(0) + data = pd.DataFrame(np.random.rand(50, 4)) + data.columns = ['var1', 'var2', 'var3', 'var4'] + # one column with a single missing value + data.iloc[1, 1] = np.nan + data.iloc[[1, 3], 2] = np.nan + + data_imp = mice.MICEData(data) + data_imp.update_all() + + assert_equal(data_imp.data.isnull().values.sum(), 0) + + ix_miss = {'var1': np.array([], dtype=np.int64), + 'var2': np.array([1], dtype=np.int64), + 'var3': np.array([1, 3], dtype=np.int64), + 'var4': np.array([], dtype=np.int64)} + + for k in ix_miss: + assert_equal(data_imp.ix_miss[k], ix_miss[k]) + + if __name__=="__main__": import pytest pytest.main([__file__, '-vvs', '-x', '--pdb']) From b3c1e8a29799de129d336061c9949ccf0d4c0b0c Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 21 Mar 2018 15:08:48 -0700 Subject: [PATCH 088/157] make arithmetic more readable --- statsmodels/discrete/discrete_model.py | 55 +++++++++++++++++--------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/statsmodels/discrete/discrete_model.py b/statsmodels/discrete/discrete_model.py index b04dd2c2c96..4f8b1f7e590 100644 --- a/statsmodels/discrete/discrete_model.py +++ b/statsmodels/discrete/discrete_model.py @@ -1468,8 +1468,10 @@ def score_obs(self, params): dmudb = mu * exog dalpha = (mu_p * (y * ((y - 1) / a2 - 2 / a1) + a2 / a1**2)) - dparams = dmudb * (-a4 / a1 + a3 * a2 / (a1 ** 2) + (1 + a4) * - ((y - 1) / a2 - 1 / a1) + 1 / mu) + dparams = dmudb * (-a4 / a1 + + a3 * a2 / (a1 ** 2) + + (1 + a4) * ((y - 1) / a2 - 1 / a1) + + 1 / mu) return np.concatenate((dparams, np.atleast_2d(dalpha)), axis=1) @@ -1510,8 +1512,8 @@ def _score_p(self, params): a1 = 1 + alpha * mu_p a2 = mu + alpha * mu_p * y - dp = np.sum((np.log(mu) * ((a2 - mu) * ((y - 1) / a2 - 2 / a1) + (a1 - 1) * - a2 / a1 ** 2))) + dp = np.sum((np.log(mu) * ((a2 - mu) * ((y - 1) / a2 - 2 / a1) + + (a1 - 1) * a2 / a1 ** 2))) return dp def hessian(self, params): @@ -1554,29 +1556,42 @@ def 
hessian(self, params): for i in range(dim): for j in range(i + 1): hess_arr[i,j] = np.sum(mu * exog[:,i,None] * exog[:,j,None] * - (mu * (a3 * a4 / a1**2 - 2 * a3**2 * a2 / a1**3 + 2 * a3 * - (a4 + 1) / a1**2 - a4 * p / (mu * a1) + a3 * p * a2 / - (mu * a1**2) + a4 / (mu * a1) - a3 * a2 / (mu * a1**2) + - (y - 1) * a4 * (p - 1) / (a2 * mu) - (y - 1) * - (1 + a4)**2 / a2**2 - a4 * (p - 1) / (a1 * mu) - 1 / - mu**2) + (-a4 / a1 + a3 * a2 / a1**2 + (y - 1) * - (1 + a4) / a2 - (1 + a4) / a1 + 1 / mu)), axis=0) + (mu * (a3 * a4 / a1**2 - + 2 * a3**2 * a2 / a1**3 + + 2 * a3 * (a4 + 1) / a1**2 - + a4 * p / (mu * a1) + + a3 * p * a2 / (mu * a1**2) + + a4 / (mu * a1) - + a3 * a2 / (mu * a1**2) + + (y - 1) * a4 * (p - 1) / (a2 * mu) - + (y - 1) * (1 + a4)**2 / a2**2 - + a4 * (p - 1) / (a1 * mu) - + 1 / mu**2) + + (-a4 / a1 + + a3 * a2 / a1**2 + + (y - 1) * (1 + a4) / a2 - + (1 + a4) / a1 + + 1 / mu)), axis=0) tri_idx = np.triu_indices(dim, k=1) hess_arr[tri_idx] = hess_arr.T[tri_idx] # for dl/dparams dalpha - dldpda = np.sum((2 * a4 * mu_p / a1**2 - 2 * a3 * mu_p * a2 / a1**3 - - mu_p * y * (y - 1) * (1 + a4) / a2**2 + mu_p * - (1 + a4) / a1**2 + a5 * y * (y - 1) / a2 - 2 * - a5 * y / a1 + a5 * a2 / a1**2) * dmudb, + dldpda = np.sum((2 * a4 * mu_p / a1**2 - + 2 * a3 * mu_p * a2 / a1**3 - + mu_p * y * (y - 1) * (1 + a4) / a2**2 + + mu_p * (1 + a4) / a1**2 + + a5 * y * (y - 1) / a2 - + 2 * a5 * y / a1 + + a5 * a2 / a1**2) * dmudb, axis=0) hess_arr[-1,:-1] = dldpda hess_arr[:-1,-1] = dldpda # for dl/dalpha dalpha - dldada = mu_p**2 * (3 * y / a1**2 - (y / a2)**2. * (y - 1) - 2 * a2 / - a1**3) + dldada = mu_p**2 * (3 * y / a1**2 - + (y / a2)**2. * (y - 1) - + 2 * a2 / a1**3) hess_arr[-1,-1] = dldada.sum() @@ -3017,14 +3032,16 @@ def hessian(self, params): hess_arr = np.zeros((dim + 1, dim + 1)) coeff = mu**2 * (((1 + a4)**2 * a3 / a2**2 - - a3 * (a5 - a4 / mu) / a2 - y / mu**2 - + a3 * (a5 - a4 / mu) / a2 - + y / mu**2 - 2 * a4 * (1 + a4) / a2 + a5 * (np.log(a1) - np.log(a2) - digamma(a1) + digamma(a3) + 2) - a4 * (np.log(a1) - np.log(a2) - digamma(a1) + digamma(a3) + 1) / mu - a4**2 * (polygamma(1, a1) - polygamma(1, a3))) + - (-(1 + a4) * a3 / a2 + y / mu + + (-(1 + a4) * a3 / a2 + + y / mu + a4 * (np.log(a1) - np.log(a2) - digamma(a1) + digamma(a3) + 1)) / mu) From 6d0619120b9576d23c0613068c47b53514313f04 Mon Sep 17 00:00:00 2001 From: Josef Date: Wed, 21 Mar 2018 23:46:25 -0400 Subject: [PATCH 089/157] TST: tukeyhsd: add smoke test for plot_simultaneous --- statsmodels/stats/tests/test_pairwise.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/statsmodels/stats/tests/test_pairwise.py b/statsmodels/stats/tests/test_pairwise.py index 371dcf67d00..7c46b0caba0 100644 --- a/statsmodels/stats/tests/test_pairwise.py +++ b/statsmodels/stats/tests/test_pairwise.py @@ -9,6 +9,12 @@ import warnings +try: + import matplotlib.pyplot as plt + has_maplotlib = True +except ImportError: + has_maplotlib = False + import numpy as np import pandas as pd from numpy.testing import assert_, assert_allclose, assert_almost_equal, assert_equal, \ @@ -186,6 +192,14 @@ def test_shortcut_function(self): res = pairwise_tukeyhsd(self.endog, self.groups, alpha=self.alpha) assert_almost_equal(res.confint, self.res.confint, decimal=14) + def test_plot_simultaneous_ci(self): + # smoke tests + self.res._simultaneous_ci() + if has_maplotlib: + reference = self.res.groupsunique[1] + fig = self.res.plot_simultaneous(comparison_name=reference) + plt.close('all') + class TestTuckeyHSD2(CheckTuckeyHSDMixin): From 
6746910faa4a929e66fe9241a15da58f32b7d35d Mon Sep 17 00:00:00 2001 From: Josef Date: Thu, 22 Mar 2018 09:40:04 -0400 Subject: [PATCH 090/157] BUG/MAINT numpy compat, scalar index --- statsmodels/sandbox/stats/multicomp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/sandbox/stats/multicomp.py b/statsmodels/sandbox/stats/multicomp.py index 98f77dc3222..cb53c3aa52d 100644 --- a/statsmodels/sandbox/stats/multicomp.py +++ b/statsmodels/sandbox/stats/multicomp.py @@ -728,7 +728,7 @@ def plot_simultaneous(self, comparison_name=None, ax=None, figsize=(10,6), else: if comparison_name not in self.groupsunique: raise ValueError('comparison_name not found in group names.') - midx = np.where(self.groupsunique==comparison_name)[0] + midx = np.where(self.groupsunique==comparison_name)[0][0] for i in range(len(means)): if self.groupsunique[i] == comparison_name: continue From f59661a48b478942b4a37bc9e8a0a87d3d6035f0 Mon Sep 17 00:00:00 2001 From: Josef Date: Sat, 27 Jan 2018 16:13:04 -0500 Subject: [PATCH 091/157] TST: correct super call xfailing test in TestGlmPoissonPwNr --- statsmodels/genmod/tests/test_glm_weights.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/statsmodels/genmod/tests/test_glm_weights.py b/statsmodels/genmod/tests/test_glm_weights.py index 737791a6a33..4d5c36f2960 100644 --- a/statsmodels/genmod/tests/test_glm_weights.py +++ b/statsmodels/genmod/tests/test_glm_weights.py @@ -197,12 +197,12 @@ def setup_class(cls): cls.res2 = res_stata.results_poisson_pweight_nonrobust @pytest.mark.xfail(reason='Known to fail') - def test_basic(cls): - super(cls, TestGlmPoissonPwNr).test_basic(cls) + def test_basic(self): + super(TestGlmPoissonPwNr, self).test_basic() @pytest.mark.xfail(reason='Known to fail') - def test_compare_optimizers(cls): - super(cls, TestGlmPoissonPwNr).test_compare_optimizers(cls) + def test_compare_optimizers(self): + super(TestGlmPoissonPwNr, self).test_compare_optimizers() class TestGlmPoissonFwHC(CheckWeight): From 02731c7e5d2b654b67e4099b835c4323ba334171 Mon Sep 17 00:00:00 2001 From: Josef Date: Sat, 27 Jan 2018 17:22:37 -0500 Subject: [PATCH 092/157] REF/TST: FutureWarning for resid_anscombe, silence unit tests --- .../genmod/generalized_linear_model.py | 2 +- statsmodels/genmod/tests/test_gee_glm.py | 8 ++++++-- statsmodels/genmod/tests/test_glm.py | 8 ++++++-- statsmodels/genmod/tests/test_glm_weights.py | 19 +++++++++++++------ 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/statsmodels/genmod/generalized_linear_model.py b/statsmodels/genmod/generalized_linear_model.py index 2cb1bd49b52..8851ab9c225 100644 --- a/statsmodels/genmod/generalized_linear_model.py +++ b/statsmodels/genmod/generalized_linear_model.py @@ -1484,7 +1484,7 @@ def resid_working(self): def resid_anscombe(self): import warnings warnings.warn('Anscombe residuals currently unscaled. In a future ' - 'release, they will be scaled.') + 'release, they will be scaled.', category=FutureWarning) return self.family.resid_anscombe(self._endog, self.fittedvalues, var_weights=self._var_weights, scale=1.) 
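
Because `resid_anscombe` now emits a FutureWarning, calling code that wants
the current unscaled values without the noise can silence the warning, as the
test updates below do, or use `resid_anscombe_unscaled` where it is
available. A minimal sketch, using an arbitrary GLM purely for illustration:

    import warnings
    import statsmodels.api as sm

    data = sm.datasets.scotland.load()
    exog = sm.add_constant(data.exog, prepend=False)
    res = sm.GLM(data.endog, exog, family=sm.families.Gamma()).fit()

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=FutureWarning)
        # unscaled for now; scaled in a future release
        resid_a = res.resid_anscombe
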
diff --git a/statsmodels/genmod/tests/test_gee_glm.py b/statsmodels/genmod/tests/test_gee_glm.py index 7f3cdf8a602..7ef0cbb5717 100644 --- a/statsmodels/genmod/tests/test_gee_glm.py +++ b/statsmodels/genmod/tests/test_gee_glm.py @@ -1,3 +1,5 @@ + +import warnings import numpy as np import pandas as pd @@ -29,8 +31,10 @@ def test_resid(self): rtol=1e-6, atol=1e-10) assert_allclose(res1.resid_deviance, res2.resid_deviance, rtol=1e-6, atol=1e-10) - assert_allclose(res1.resid_anscombe, res2.resid_anscombe, - rtol=1e-6, atol=1e-10) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + assert_allclose(res1.resid_anscombe, res2.resid_anscombe, + rtol=1e-6, atol=1e-10) assert_allclose(res1.resid_working, res2.resid_working, rtol=1e-6, atol=1e-10) diff --git a/statsmodels/genmod/tests/test_glm.py b/statsmodels/genmod/tests/test_glm.py index 8f138cc7644..22de59f54b8 100644 --- a/statsmodels/genmod/tests/test_glm.py +++ b/statsmodels/genmod/tests/test_glm.py @@ -75,9 +75,12 @@ def test_residuals(self): resid2[:, 2] *= self.res1.family.link.deriv(self.res1.mu)**2 atol = 10**(-self.decimal_resids) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + resid_a = self.res1.resid_anscombe resids = np.column_stack((self.res1.resid_pearson, self.res1.resid_deviance, self.res1.resid_working, - self.res1.resid_anscombe, self.res1.resid_response)) + resid_a, self.res1.resid_response)) assert_allclose(resids, resid2, rtol=1e-6, atol=atol) decimal_aic_R = DECIMAL_4 @@ -1750,7 +1753,8 @@ def test_resid(self): rtol=1e-5, atol=1e-5) assert_allclose(self.res1.resid_working, self.res2.resid_working, rtol=1e-5, atol=1e-5) - assert_allclose(self.res1.resid_anscombe, self.res2.resid_anscombe, + assert_allclose(self.res1.resid_anscombe_unscaled, + self.res2.resid_anscombe_unscaled, rtol=1e-5, atol=1e-5) diff --git a/statsmodels/genmod/tests/test_glm_weights.py b/statsmodels/genmod/tests/test_glm_weights.py index 4d5c36f2960..cdf0bdaba10 100644 --- a/statsmodels/genmod/tests/test_glm_weights.py +++ b/statsmodels/genmod/tests/test_glm_weights.py @@ -48,6 +48,7 @@ # load data into module namespace from statsmodels.datasets.cpunish import load +from warnings import catch_warnings cpunish_data = load() cpunish_data.exog[:, 3] = np.log(cpunish_data.exog[:, 3]) cpunish_data.exog = add_constant(cpunish_data.exog, prepend=False) @@ -104,9 +105,13 @@ def test_residuals(self): assert_allclose(res1.resid_working, resid_all['resid_working'], atol= 1e-6, rtol=2e-6) if resid_all.get('resid_anscombe') is None: return None - # Stata doesn't use var_weights in anscombe residuals, it seems. + # Stata doesn't use var_weights in anscombe residuals, it seems. # Adjust residuals to match our approach. 
- assert_allclose(res1.resid_anscombe, resid_all['resid_anscombe'] * np.sqrt(res1._var_weights), atol= 1e-6, rtol=2e-6) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + resid_a = res1.resid_anscombe + + assert_allclose(resid_a, resid_all['resid_anscombe'] * np.sqrt(res1._var_weights), atol= 1e-6, rtol=2e-6) def test_compare_optimizers(self): res1 = self.res1 @@ -169,7 +174,7 @@ def setup_class(cls): family=sm.families.Poisson(), var_weights=aweights).fit() # compare with discrete, start close to save time modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog) - + # Need to copy to avoid inplace adjustment from copy import copy cls.res2 = copy(res_stata.results_poisson_aweight_nonrobust) @@ -237,7 +242,7 @@ def setup_class(cls): # This is really close when corr_fact = (wsum - 1.) / wsum, but to # avoid having loosen precision of the assert_allclose, I'm doing this # manually. Its *possible* lowering the IRLS convergence criterion - # in stata and here will make this less sketchy. + # in stata and here will make this less sketchy. cls.corr_fact = np.sqrt((wsum - 1.) / wsum) * 0.98518473599905609 cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog, family=sm.families.Poisson(), var_weights=aweights @@ -308,7 +313,7 @@ def setup_class(cls): aweights = np.repeat(1, len(endog)) aweights[::5] = 5 aweights[::13] = 3 - model = sm.GLM(endog, exog, + model = sm.GLM(endog, exog, family=sm.families.Gamma(link=sm.families.links.log()), var_weights=aweights) cls.res1 = model.fit(rtol=1e-25, atol=0) @@ -902,7 +907,9 @@ def test_poisson_residuals(): res_poi_w.resid_response) assert_allclose(res_poi_e.resid_pearson, res_poi_w.resid_pearson) assert_allclose(res_poi_e.resid_deviance, res_poi_w.resid_deviance) - assert_allclose(res_poi_e.resid_anscombe, res_poi_w.resid_anscombe) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + assert_allclose(res_poi_e.resid_anscombe, res_poi_w.resid_anscombe) assert_allclose(res_poi_e.resid_anscombe_unscaled, res_poi_w.resid_anscombe) From ea966e7a104ff666affe81acbc1e5df608d82a72 Mon Sep 17 00:00:00 2001 From: Josef Date: Sat, 27 Jan 2018 18:41:20 -0500 Subject: [PATCH 093/157] TST: silence warning in test_glm.py --- statsmodels/genmod/tests/test_glm.py | 40 +++++++++++++++++++--------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/statsmodels/genmod/tests/test_glm.py b/statsmodels/genmod/tests/test_glm.py index 22de59f54b8..706949771e0 100644 --- a/statsmodels/genmod/tests/test_glm.py +++ b/statsmodels/genmod/tests/test_glm.py @@ -15,6 +15,7 @@ from statsmodels.tools.tools import add_constant from statsmodels.tools.sm_exceptions import PerfectSeparationError from statsmodels.discrete import discrete_model as discrete +from statsmodels.tools.sm_exceptions import DomainWarning import pytest import warnings @@ -649,8 +650,12 @@ def setup_class(cls): interaction = cls.data.exog[:,2]*cls.data.exog[:,1] cls.data.exog = np.column_stack((cls.data.exog,interaction)) cls.data.exog = add_constant(cls.data.exog, prepend=False) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=DomainWarning) + fam = sm.families.NegativeBinomial() + cls.res1 = GLM(cls.data.endog, cls.data.exog, - family=sm.families.NegativeBinomial()).fit(scale='x2') + family=fam).fit(scale='x2') from .results.results_glm import Committee res2 = Committee() res2.aic_R += 2 # They don't count a degree of freedom for the scale @@ -773,7 +778,9 @@ def test_perfect_pred(): y = y[y != 
2] X = add_constant(X, prepend=True) glm = GLM(y, X, family=sm.families.Binomial()) - assert_raises(PerfectSeparationError, glm.fit) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=RuntimeWarning) + assert_raises(PerfectSeparationError, glm.fit) def test_score_test_OLS(): @@ -1422,14 +1429,17 @@ def setup_class(cls): ''' super(TestWtdGlmNegativeBinomial, cls).setup_class() alpha = 1. - family_link = sm.families.NegativeBinomial( - link=sm.families.links.nbinom(alpha=alpha), - alpha=alpha) - cls.res1 = GLM(cls.endog, cls.exog, - freq_weights=cls.weight, - family=family_link).fit() - cls.res2 = GLM(cls.endog_big, cls.exog_big, - family=family_link).fit() + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=DomainWarning) + family_link = sm.families.NegativeBinomial( + link=sm.families.links.nbinom(alpha=alpha), + alpha=alpha) + cls.res1 = GLM(cls.endog, cls.exog, + freq_weights=cls.weight, + family=family_link).fit() + cls.res2 = GLM(cls.endog_big, cls.exog_big, + family=family_link).fit() class TestWtdGlmGamma(CheckWtdDuplicationMixin): @@ -2067,9 +2077,13 @@ def test_non_invertible_hessian_fails_summary(): data = sm.datasets.cpunish.load_pandas() data.endog[:] = 1 - mod = sm.GLM(data.endog, data.exog, family=sm.families.Gamma()) - res = mod.fit(maxiter=1, method='bfgs', max_start_irls=0) - res.summary() + with warnings.catch_warnings(): + # we filter DomainWarning, the convergence problems + # and warnings in summary + warnings.simplefilter("ignore") + mod = sm.GLM(data.endog, data.exog, family=sm.families.Gamma()) + res = mod.fit(maxiter=1, method='bfgs', max_start_irls=0) + res.summary() if __name__ == "__main__": From b55649e42a935d4d2a6b2e90e4558f225d75e562 Mon Sep 17 00:00:00 2001 From: Josef Date: Sat, 27 Jan 2018 19:17:29 -0500 Subject: [PATCH 094/157] TST: remove most warnings, cheating with start_params --- statsmodels/genmod/tests/test_glm_weights.py | 24 ++++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/statsmodels/genmod/tests/test_glm_weights.py b/statsmodels/genmod/tests/test_glm_weights.py index cdf0bdaba10..a6ba72f615d 100644 --- a/statsmodels/genmod/tests/test_glm_weights.py +++ b/statsmodels/genmod/tests/test_glm_weights.py @@ -30,6 +30,7 @@ from statsmodels.compat.testing import SkipTest import warnings +from warnings import catch_warnings import sys import nose @@ -42,13 +43,14 @@ from statsmodels.genmod.generalized_linear_model import GLM from statsmodels.tools.tools import add_constant from statsmodels.discrete import discrete_model as discrete +from statsmodels.tools.sm_exceptions import SpecificationWarning from .results import results_glm_poisson_weights as res_stata from .results import res_R_var_weight as res_r # load data into module namespace from statsmodels.datasets.cpunish import load -from warnings import catch_warnings + cpunish_data = load() cpunish_data.exog[:, 3] = np.log(cpunish_data.exog[:, 3]) cpunish_data.exog = add_constant(cpunish_data.exog, prepend=False) @@ -125,7 +127,10 @@ def test_compare_optimizers(self): TestGlmPoissonFwClu, TestBinomial0RepeatedvsAverage)): return None - res2 = self.res1.model.fit(method=method, optim_hessian=optim_hessian) + + start_params = res1.params + res2 = self.res1.model.fit(start_params=start_params, method=method, + optim_hessian=optim_hessian) assert_allclose(res1.params, res2.params, atol=1e-3, rtol=2e-3) H = res2.model.hessian(res2.params, observed=False) res2_bse = np.sqrt(-np.diag(np.linalg.inv(H))) @@ -220,6 +225,7 @@ 
def setup_class(cls): nobs = len(cpunish_data.endog) aweights = fweights / wsum * nobs cls.corr_fact = np.sqrt((wsum - 1.) / wsum) + cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog, family=sm.families.Poisson(), freq_weights=fweights ).fit(cov_type='HC0') #, cov_kwds={'use_correction':False}) @@ -268,9 +274,11 @@ def setup_class(cls): # no wnobs yet in sandwich covariance calcualtion cls.corr_fact = 1 / np.sqrt(n_groups / (n_groups - 1)) #np.sqrt((wsum - 1.) / wsum) cov_kwds = {'groups': gid, 'use_correction':False} - cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog, - family=sm.families.Poisson(), freq_weights=fweights - ).fit(cov_type='cluster', cov_kwds=cov_kwds) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=SpecificationWarning) + cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog, + family=sm.families.Poisson(), freq_weights=fweights + ).fit(cov_type='cluster', cov_kwds=cov_kwds) # compare with discrete, start close to save time #modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog) cls.res2 = res_stata.results_poisson_fweight_clu1 @@ -715,7 +723,7 @@ def setup_class(cls): beta = np.array([-1, 0.1, -0.05, .2, 0.35]) lin_pred = (exog * beta).sum(axis=1) family = sm.families.Binomial - link = sm.families.links.log + link = sm.families.links.logit endog = gen_endog(lin_pred, family, link, binom_version=0) mod1 = sm.GLM(endog, exog, family=family(link=link())) cls.res1 = mod1.fit(rtol=1e-10, atol=0, tol_criterion='params', @@ -748,7 +756,7 @@ def setup_class(cls): beta = np.array([-1, 0.1, -0.05, .2, 0.35]) lin_pred = (exog * beta).sum(axis=1) family = sm.families.Binomial - link = sm.families.links.log + link = sm.families.links.logit endog = gen_endog(lin_pred, family, link, binom_version=0) wt = np.random.randint(1, 5, n) mod1 = sm.GLM(endog, exog, family=family(link=link), freq_weights=wt) @@ -824,7 +832,9 @@ class TestBinomialVsVarWeights(CheckWeight): def setup_class(cls): from statsmodels.datasets.star98 import load data = load() + data.exog /= data.exog.std(0) data.exog = add_constant(data.exog, prepend=False) + cls.res1 = GLM(data.endog, data.exog, family=sm.families.Binomial()).fit() weights = data.endog.sum(axis=1) From 9094b726493352e37eebb42b739f234f0804ce98 Mon Sep 17 00:00:00 2001 From: Josef Date: Sat, 27 Jan 2018 21:54:33 -0500 Subject: [PATCH 095/157] TST: remove print, module level filter array_split FutureWarning --- .../vector_ar/tests/JMulTi_results/parse_jmulti_var_output.py | 3 ++- statsmodels/tsa/vector_ar/tests/test_vecm.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/statsmodels/tsa/vector_ar/tests/JMulTi_results/parse_jmulti_var_output.py b/statsmodels/tsa/vector_ar/tests/JMulTi_results/parse_jmulti_var_output.py index e02ef808a83..b0f962dcc3e 100644 --- a/statsmodels/tsa/vector_ar/tests/JMulTi_results/parse_jmulti_var_output.py +++ b/statsmodels/tsa/vector_ar/tests/JMulTi_results/parse_jmulti_var_output.py @@ -245,7 +245,8 @@ def load_results_jmulti(dataset, dt_s_list): # all possible combinations of potentially causing variables # (at least 1 variable and not all variables together): var_combs = sublists(vn, 1, len(vn)-1) - print("\n\n\n" + dt_string) + if debug_mode: + print("\n\n\n" + dt_string) for causing in var_combs: caused = tuple(name for name in vn if name not in causing) causality_file = dataset.__str__() + "_" + source + "_" \ diff --git a/statsmodels/tsa/vector_ar/tests/test_vecm.py b/statsmodels/tsa/vector_ar/tests/test_vecm.py index dcda96f5698..2e9412a1599 100644 --- 
a/statsmodels/tsa/vector_ar/tests/test_vecm.py +++ b/statsmodels/tsa/vector_ar/tests/test_vecm.py @@ -15,6 +15,8 @@ from statsmodels.tsa.vector_ar.var_model import VARProcess from statsmodels.tsa.vector_ar.vecm import VECM, select_order, select_coint_rank +import pytest +pytestmark = pytest.mark.filterwarnings('ignore:in the future np.array_split') class DataSet(object): """ From 2a3dd6d508ad95a7293948359206511ed02ebcd9 Mon Sep 17 00:00:00 2001 From: Josef Date: Sat, 27 Jan 2018 21:58:00 -0500 Subject: [PATCH 096/157] TST: silence warnings in vector_ar/tests, except one convergend in SVAR --- statsmodels/tsa/vector_ar/tests/test_coint.py | 6 +++++- statsmodels/tsa/vector_ar/tests/test_var.py | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/statsmodels/tsa/vector_ar/tests/test_coint.py b/statsmodels/tsa/vector_ar/tests/test_coint.py index e3001511174..7ad0bb404fc 100644 --- a/statsmodels/tsa/vector_ar/tests/test_coint.py +++ b/statsmodels/tsa/vector_ar/tests/test_coint.py @@ -6,11 +6,13 @@ """ import os +import warnings import numpy as np from numpy.testing import assert_almost_equal, assert_equal from statsmodels.tsa.vector_ar.vecm import coint_johansen +from statsmodels.tools.sm_exceptions import HypothesisTestWarning current_path = os.path.dirname(os.path.abspath(__file__)) dta_path = os.path.join(current_path, "Matlab_results", "test_coint.csv") @@ -113,7 +115,9 @@ class TestCointJoh25(CheckCointJoh): @classmethod def setup_class(cls): - cls.res = coint_johansen(dta, 2, 5) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=HypothesisTestWarning) + cls.res = coint_johansen(dta, 2, 5) cls.nobs_r = 173 - 1 - 5 #Note: critical values not available if trend>1 diff --git a/statsmodels/tsa/vector_ar/tests/test_var.py b/statsmodels/tsa/vector_ar/tests/test_var.py index c3d024ccc0f..6b877e01441 100644 --- a/statsmodels/tsa/vector_ar/tests/test_var.py +++ b/statsmodels/tsa/vector_ar/tests/test_var.py @@ -3,6 +3,7 @@ Test VAR Model """ from __future__ import print_function +import warnings # pylint: disable=W0612,W0231 from statsmodels.compat.python import (iteritems, StringIO, lrange, BytesIO, range) @@ -18,6 +19,7 @@ import statsmodels.tsa.vector_ar.util as util import statsmodels.tools.data as data_util from statsmodels.tsa.vector_ar.var_model import VAR +from statsmodels.tools.sm_exceptions import ValueWarning from numpy.testing import (assert_almost_equal, assert_equal, assert_, @@ -584,7 +586,10 @@ def test_var_constant(): data.index = DatetimeIndex(index) - model = VAR(data) + #with pytest.warns(ValueWarning): #does not silence warning in test output + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=ValueWarning) + model = VAR(data) with pytest.raises(ValueError): model.fit(1) From 0f79ff89e139f78acfab033b082d5cc7ebff1616 Mon Sep 17 00:00:00 2001 From: Josef Date: Sat, 27 Jan 2018 23:08:40 -0500 Subject: [PATCH 097/157] REF/TST: count_models catch convergence warning in start_params fit, reduce test noise --- statsmodels/discrete/count_model.py | 13 +++++++--- .../discrete/tests/test_count_model.py | 26 +++++++++++-------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/statsmodels/discrete/count_model.py b/statsmodels/discrete/count_model.py index 1f2134920ec..1180a606da5 100644 --- a/statsmodels/discrete/count_model.py +++ b/statsmodels/discrete/count_model.py @@ -3,7 +3,7 @@ __all__ = ["ZeroInflatedPoisson", "ZeroInflatedGeneralizedPoisson", "ZeroInflatedNegativeBinomialP"] - +import warnings 
import numpy as np import statsmodels.base.model as base import statsmodels.base.wrapper as wrap @@ -18,6 +18,7 @@ from statsmodels.tools.numdiff import (approx_fprime, approx_hess, approx_hess_cs, approx_fprime_cs) from statsmodels.tools.decorators import (resettable_cache, cache_readonly) +from statsmodels.tools.sm_exceptions import ConvergenceWarning _doc_zi_params = """ @@ -618,8 +619,10 @@ def _predict_prob(self, params, exog, exog_infl, exposure, offset): return result[0] if transform else result def _get_start_params(self): - start_params = ZeroInflatedPoisson(self.endog, self.exog, - exog_infl=self.exog_infl).fit(disp=0).params + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=ConvergenceWarning) + start_params = ZeroInflatedPoisson(self.endog, self.exog, + exog_infl=self.exog_infl).fit(disp=0).params start_params = np.append(start_params, 0.1) return start_params @@ -695,7 +698,9 @@ def _predict_prob(self, params, exog, exog_infl, exposure, offset): return result[0] if transform else result def _get_start_params(self): - start_params = self.model_main.fit(disp=0, method='nm').params + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=ConvergenceWarning) + start_params = self.model_main.fit(disp=0, method='nm').params start_params = np.append(np.zeros(self.k_inflate), start_params) return start_params diff --git a/statsmodels/discrete/tests/test_count_model.py b/statsmodels/discrete/tests/test_count_model.py index 90c53f5cbd7..79240cb9a58 100644 --- a/statsmodels/discrete/tests/test_count_model.py +++ b/statsmodels/discrete/tests/test_count_model.py @@ -74,7 +74,8 @@ def setup_class(cls): exog = sm.add_constant(data.exog[:,1:4], prepend=False) exog_infl = sm.add_constant(data.exog[:,0], prepend=False) cls.res1 = sm.ZeroInflatedPoisson(data.endog, exog, - exog_infl=exog_infl, inflation='logit').fit(method='newton', maxiter=500) + exog_infl=exog_infl, inflation='logit').fit(method='newton', maxiter=500, + disp=0) # for llnull test cls.res1._results._attach_nullmodel = True cls.init_keys = ['exog_infl', 'exposure', 'inflation', 'offset'] @@ -91,7 +92,8 @@ def setup_class(cls): exog = sm.add_constant(data.exog[:,1:4], prepend=False) exog_infl = sm.add_constant(data.exog[:,0], prepend=False) cls.res1 = sm.ZeroInflatedPoisson(data.endog, exog, - exog_infl=exog_infl, inflation='probit').fit(method='newton', maxiter=500) + exog_infl=exog_infl, inflation='probit').fit(method='newton', maxiter=500, + disp=0) # for llnull test cls.res1._results._attach_nullmodel = True cls.init_keys = ['exog_infl', 'exposure', 'inflation', 'offset'] @@ -108,7 +110,9 @@ def setup_class(cls): exog = sm.add_constant(data.exog[:,1:4], prepend=False) exog_infl = sm.add_constant(data.exog[:,0], prepend=False) cls.res1 = sm.ZeroInflatedPoisson(data.endog, exog, - exog_infl=exog_infl, offset=data.exog[:,7]).fit(method='newton', maxiter=500) + exog_infl=exog_infl, offset=data.exog[:,7]).fit(method='newton', + maxiter=500, + disp=0) # for llnull test cls.res1._results._attach_nullmodel = True cls.init_keys = ['exog_infl', 'exposure', 'inflation', 'offset'] @@ -125,7 +129,7 @@ def test_exposure(self): model3 = sm.ZeroInflatedPoisson(model1.endog, model1.exog, exog_infl=model1.exog_infl, exposure=np.exp(offset)) res3 = model3.fit(start_params=self.res1.params, - method='newton', maxiter=500) + method='newton', maxiter=500, disp=0) assert_allclose(res3.params, self.res1.params, atol=1e-6, rtol=1e-6) fitted1 = self.res1.predict() @@ -176,7 +180,7 @@ def setup_class(cls): 
model = sm.ZeroInflatedPoisson(data.endog, exog, exog_infl=exog_infl, inflation='logit') cls.res1 = model.fit(start_params=start_params, method='newton', - maxiter=500) + maxiter=500, disp=0) # for llnull test cls.res1._results._attach_nullmodel = True cls.init_keys = ['exog_infl', 'exposure', 'inflation', 'offset'] @@ -213,7 +217,7 @@ def setup_class(cls): cls.endog = sm.distributions.zipoisson.rvs(mu_true, 0.05, size=mu_true.shape) model = sm.ZeroInflatedPoisson(cls.endog, exog) - cls.res = model.fit(method='bfgs', maxiter=5000, maxfun=5000) + cls.res = model.fit(method='bfgs', maxiter=5000, maxfun=5000, disp=0) def test_mean(self): assert_allclose(self.res.predict().mean(), self.endog.mean(), @@ -241,7 +245,7 @@ def setup_class(cls): exog = sm.add_constant(data.exog[:,1:4], prepend=False) exog_infl = sm.add_constant(data.exog[:,0], prepend=False) cls.res1 = sm.ZeroInflatedGeneralizedPoisson(data.endog, exog, - exog_infl=exog_infl, p=1).fit(method='newton', maxiter=500) + exog_infl=exog_infl, p=1).fit(method='newton', maxiter=500, disp=0) # for llnull test cls.res1._results._attach_nullmodel = True cls.init_keys = ['exog_infl', 'exposure', 'inflation', 'offset', 'p'] @@ -313,7 +317,7 @@ def setup_class(cls): cls.endog = sm.distributions.zigenpoisson.rvs(mu_true, expected_params[-1], 2, 0.5, size=mu_true.shape) model = sm.ZeroInflatedGeneralizedPoisson(cls.endog, exog, p=2) - cls.res = model.fit(method='bfgs', maxiter=5000, maxfun=5000) + cls.res = model.fit(method='bfgs', maxiter=5000, maxfun=5000, disp=0) def test_mean(self): assert_allclose(self.res.predict().mean(), self.endog.mean(), @@ -345,7 +349,7 @@ def setup_class(cls): sp = np.array([1.88, -10.28, -0.20, 1.14, 1.34]) cls.res1 = sm.ZeroInflatedNegativeBinomialP(data.endog, exog, exog_infl=exog_infl, p=2).fit(start_params=sp, method='nm', - xtol=1e-6, maxiter=5000) + xtol=1e-6, maxiter=5000, disp=0) # for llnull test cls.res1._results._attach_nullmodel = True cls.init_keys = ['exog_infl', 'exposure', 'inflation', 'offset', 'p'] @@ -428,7 +432,7 @@ def setup_class(cls): cls.endog = sm.distributions.zinegbin.rvs(mu_true, expected_params[-1], 2, prob_infl, size=mu_true.shape) model = sm.ZeroInflatedNegativeBinomialP(cls.endog, exog, p=2) - cls.res = model.fit(method='bfgs', maxiter=5000, maxfun=5000) + cls.res = model.fit(method='bfgs', maxiter=5000, maxfun=5000, disp=0) # attach others cls.prob_infl = prob_infl @@ -539,7 +543,7 @@ def setup_class(cls): mod = sm.ZeroInflatedNegativeBinomialP( cls.endog, exog, exog_infl=exog, p=2) res = mod.fit(start_params=start_params, method="bfgs", - maxiter=1000) + maxiter=1000, disp=0) cls.res = res From 9f3e32b64c57826dd278a775141fe07f7eebfe12 Mon Sep 17 00:00:00 2001 From: Josef Date: Sun, 28 Jan 2018 00:50:21 -0500 Subject: [PATCH 098/157] TST: use pytest.warns --- statsmodels/discrete/tests/test_discrete.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/statsmodels/discrete/tests/test_discrete.py b/statsmodels/discrete/tests/test_discrete.py index b7074809d30..614fb0f4e6a 100644 --- a/statsmodels/discrete/tests/test_discrete.py +++ b/statsmodels/discrete/tests/test_discrete.py @@ -28,7 +28,9 @@ import statsmodels.api as sm import statsmodels.formula.api as smf from .results.results_discrete import Spector, DiscreteL1, RandHIE, Anes -from statsmodels.tools.sm_exceptions import PerfectSeparationError +from statsmodels.tools.sm_exceptions import (PerfectSeparationError, + ConvergenceWarning) + #PerfectSeparationWarning) from scipy.stats import nbinom try: @@ -1458,7 
+1460,10 @@ def test_perfect_prediction(): # this will raise if you set maxiter high enough with a singular matrix from pandas.util.testing import assert_produces_warning # this is not thread-safe - with assert_produces_warning(): + mod.fit(disp=False, maxiter=50) + #with assert_produces_warning(): + import pytest + with pytest.warns(ConvergenceWarning): warnings.simplefilter('always') mod.fit(disp=False, maxiter=50) # should not raise but does warn From 2c0a583d2ed69a6ee6c86f51862a9bd4c23005e3 Mon Sep 17 00:00:00 2001 From: Josef Date: Sun, 28 Jan 2018 09:06:18 -0500 Subject: [PATCH 099/157] REF: set warnings filter explicitly to always for UserWarning --- statsmodels/tools/sm_exceptions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/statsmodels/tools/sm_exceptions.py b/statsmodels/tools/sm_exceptions.py index 8030e13becd..5666754ee89 100644 --- a/statsmodels/tools/sm_exceptions.py +++ b/statsmodels/tools/sm_exceptions.py @@ -11,6 +11,8 @@ raised. """ +import warnings + # Errors class PerfectSeparationError(Exception): pass @@ -119,3 +121,5 @@ class HessianInversionWarning(UserWarning): class ColinearityWarning(UserWarning): pass + +warnings.simplefilter('always', category=UserWarning) From 07ae00d282d769720d3e71455493202c9e5bc910 Mon Sep 17 00:00:00 2001 From: Josef Date: Sun, 28 Jan 2018 11:20:55 -0500 Subject: [PATCH 100/157] TST: assert warns only if py > 3.3, assert converged flag --- statsmodels/discrete/tests/test_discrete.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/statsmodels/discrete/tests/test_discrete.py b/statsmodels/discrete/tests/test_discrete.py index 614fb0f4e6a..556dcb21d3f 100644 --- a/statsmodels/discrete/tests/test_discrete.py +++ b/statsmodels/discrete/tests/test_discrete.py @@ -1460,12 +1460,16 @@ def test_perfect_prediction(): # this will raise if you set maxiter high enough with a singular matrix from pandas.util.testing import assert_produces_warning # this is not thread-safe - mod.fit(disp=False, maxiter=50) - #with assert_produces_warning(): - import pytest - with pytest.warns(ConvergenceWarning): - warnings.simplefilter('always') - mod.fit(disp=False, maxiter=50) # should not raise but does warn + # py 2.7 and 3.3 don't raise here anymore #4235 + import sys + PY3_g3 = sys.version_info[:2] > (3, 3) + if PY3_g3: + with assert_produces_warning(): + warnings.simplefilter('always') + res = mod.fit(disp=False, maxiter=50) # should not raise but does warn + else: + res = mod.fit(disp=False, maxiter=50) + assert_(not res.mle_retvals['converged']) def test_poisson_predict(): From e07539cce9cc09cdaec703c90c56f05bb40f309f Mon Sep 17 00:00:00 2001 From: Josef Date: Thu, 22 Mar 2018 10:17:01 -0400 Subject: [PATCH 101/157] MAINT: numpy compat, keep old rcond in lstsq, see #3994 --- statsmodels/nonparametric/smoothers_lowess_old.py | 4 ++-- statsmodels/regression/_tools.py | 3 ++- statsmodels/sandbox/nonparametric/smoothers.py | 2 +- statsmodels/sandbox/panel/mixed.py | 6 +++--- statsmodels/sandbox/stats/contrast_tools.py | 8 ++++---- statsmodels/sandbox/tsa/diffusion.py | 4 ++-- statsmodels/tsa/seasonal.py | 4 ++-- statsmodels/tsa/varma_process.py | 4 ++-- statsmodels/tsa/vector_ar/svar_model.py | 2 +- statsmodels/tsa/vector_ar/var_model.py | 2 +- 10 files changed, 20 insertions(+), 19 deletions(-) diff --git a/statsmodels/nonparametric/smoothers_lowess_old.py b/statsmodels/nonparametric/smoothers_lowess_old.py index 31c0ca6a0e8..5c57f8e47dd 100644 --- a/statsmodels/nonparametric/smoothers_lowess_old.py +++ 
b/statsmodels/nonparametric/smoothers_lowess_old.py @@ -169,7 +169,7 @@ def _lowess_initial_fit(x_copy, y_copy, k, n): X[:,1] = x_copy[nn_indices[0]:nn_indices[1]] y_i = weights[i,:] * y_copy[nn_indices[0]:nn_indices[1]] - beta = lstsq(weights[i,:].reshape(k,1) * X, y_i)[0] + beta = lstsq(weights[i,:].reshape(k,1) * X, y_i, rcond=-1)[0] fitted[i] = beta[0] + beta[1]*x_copy[i] _lowess_update_nn(x_copy, nn_indices, i+1) @@ -255,7 +255,7 @@ def _lowess_robustify_fit(x_copy, y_copy, fitted, weights, k, n): y_i = total_weights * y_copy[nn_indices[0]:nn_indices[1]] total_weights.shape = (k,1) - beta = lstsq(total_weights * X, y_i)[0] + beta = lstsq(total_weights * X, y_i, rcond=-1)[0] fitted[i] = beta[0] + beta[1] * x_copy[i] diff --git a/statsmodels/regression/_tools.py b/statsmodels/regression/_tools.py index c4493126dc8..3526bcfd1db 100644 --- a/statsmodels/regression/_tools.py +++ b/statsmodels/regression/_tools.py @@ -89,7 +89,8 @@ def fit(self, method='pinv'): Q, R = np.linalg.qr(self.wexog) params = np.linalg.solve(R, np.dot(Q.T, self.wendog)) else: - params, _, _, _ = np.linalg.lstsq(self.wexog, self.wendog) + params, _, _, _ = np.linalg.lstsq(self.wexog, self.wendog, + rcond=-1) fitted_values = self.exog.dot(params) resid = self.endog - fitted_values diff --git a/statsmodels/sandbox/nonparametric/smoothers.py b/statsmodels/sandbox/nonparametric/smoothers.py index 519fbf57bc1..119f65c0ad3 100644 --- a/statsmodels/sandbox/nonparametric/smoothers.py +++ b/statsmodels/sandbox/nonparametric/smoothers.py @@ -195,7 +195,7 @@ def fit(self, y, x=None, weights=None): _y = y * _w#[:,None] #self.coef = np.dot(L.pinv(X).T, _y[:,None]) #self.coef = np.dot(L.pinv(X), _y) - self.coef = np.linalg.lstsq(X, _y)[0] + self.coef = np.linalg.lstsq(X, _y, rcond=-1)[0] self.params = np.squeeze(self.coef) diff --git a/statsmodels/sandbox/panel/mixed.py b/statsmodels/sandbox/panel/mixed.py index 1b5590e4cff..39f739729c2 100644 --- a/statsmodels/sandbox/panel/mixed.py +++ b/statsmodels/sandbox/panel/mixed.py @@ -415,7 +415,7 @@ def logL(self, ML=False): def initialize(self): S = sum([np.dot(unit.X.T, unit.X) for unit in self.units]) Y = sum([np.dot(unit.X.T, unit.Y) for unit in self.units]) - self.a = L.lstsq(S, Y)[0] + self.a = L.lstsq(S, Y, rcond=-1)[0] D = 0 t = 0 @@ -423,10 +423,10 @@ def initialize(self): for unit in self.units: unit.r = unit.Y - np.dot(unit.X, self.a) if self.q > 1: - unit.b = L.lstsq(unit.Z, unit.r)[0] + unit.b = L.lstsq(unit.Z, unit.r, rcond=-1)[0] else: Z = unit.Z.reshape((unit.Z.shape[0], 1)) - unit.b = L.lstsq(Z, unit.r)[0] + unit.b = L.lstsq(Z, unit.r, rcond=-1)[0] sigmasq += (np.power(unit.Y, 2).sum() - (self.a * np.dot(unit.X.T, unit.Y)).sum() - diff --git a/statsmodels/sandbox/stats/contrast_tools.py b/statsmodels/sandbox/stats/contrast_tools.py index 67d74407982..af7110e33f7 100644 --- a/statsmodels/sandbox/stats/contrast_tools.py +++ b/statsmodels/sandbox/stats/contrast_tools.py @@ -388,8 +388,8 @@ def __init__(self, d1, d2): should be (x, z) in arguments ? 
''' - self.transf_matrix = np.linalg.lstsq(d1, d2)[0] - self.invtransf_matrix = np.linalg.lstsq(d2, d1)[0] + self.transf_matrix = np.linalg.lstsq(d1, d2, rcond=-1)[0] + self.invtransf_matrix = np.linalg.lstsq(d2, d1, rcond=-1)[0] def dot_left(self, a): ''' b = C a @@ -684,8 +684,8 @@ def test_dummy_limits(self): params_df_df = resols_dropf_dropf.params - tr_of = np.linalg.lstsq(dd_dropf, dd_full)[0] - tr_fo = np.linalg.lstsq(dd_full, dd_dropf)[0] + tr_of = np.linalg.lstsq(dd_dropf, dd_full, rcond=-1)[0] + tr_fo = np.linalg.lstsq(dd_full, dd_dropf, rcond=-1)[0] print(np.dot(tr_fo, params_df_df) - params_df_f) print(np.dot(tr_of, params_f_f) - params_f_df) diff --git a/statsmodels/sandbox/tsa/diffusion.py b/statsmodels/sandbox/tsa/diffusion.py index a2138cb0112..8a108a5e3d4 100644 --- a/statsmodels/sandbox/tsa/diffusion.py +++ b/statsmodels/sandbox/tsa/diffusion.py @@ -319,7 +319,7 @@ def fitls(self, data, dt): # brute force, no parameter estimation errors nobs = len(data)-1 exog = np.column_stack((np.ones(nobs), data[:-1])) - parest, res, rank, sing = np.linalg.lstsq(exog, data[1:]) + parest, res, rank, sing = np.linalg.lstsq(exog, data[1:], rcond=-1) const, slope = parest errvar = res/(nobs-2.) lambd = -np.log(slope)/dt @@ -373,7 +373,7 @@ def fitls(self, data, dt): # brute force, no parameter estimation errors nobs = len(data)-1 exog = np.column_stack((np.ones(nobs),np.log(data[:-1]))) - parest, res, rank, sing = np.linalg.lstsq(exog, np.log(data[1:])) + parest, res, rank, sing = np.linalg.lstsq(exog, np.log(data[1:]), rcond=-1) const, slope = parest errvar = res/(nobs-2.) #check denominator estimate, of sigma too low kappa = -np.log(slope)/dt diff --git a/statsmodels/tsa/seasonal.py b/statsmodels/tsa/seasonal.py index ca2707d3d30..2e6beafc33a 100644 --- a/statsmodels/tsa/seasonal.py +++ b/statsmodels/tsa/seasonal.py @@ -33,7 +33,7 @@ def _extrapolate_trend(trend, npoints): k, n = np.linalg.lstsq( np.c_[np.arange(front, front_last), np.ones(front_last - front)], - trend[front:front_last])[0] + trend[front:front_last], rcond=-1)[0] extra = (np.arange(0, front) * np.c_[k] + np.c_[n]).T if trend.ndim == 1: extra = extra.squeeze() @@ -41,7 +41,7 @@ def _extrapolate_trend(trend, npoints): k, n = np.linalg.lstsq( np.c_[np.arange(back_first, back), np.ones(back - back_first)], - trend[back_first:back])[0] + trend[back_first:back], rcond=-1)[0] extra = (np.arange(back + 1, trend.shape[0]) * np.c_[k] + np.c_[n]).T if trend.ndim == 1: extra = extra.squeeze() diff --git a/statsmodels/tsa/varma_process.py b/statsmodels/tsa/varma_process.py index 68c62c510a2..635e056f1e6 100644 --- a/statsmodels/tsa/varma_process.py +++ b/statsmodels/tsa/varma_process.py @@ -381,7 +381,7 @@ def fit(self, nlags): lmat = lagmat(self.y, nlags, trim='both', original='in') self.yred = lmat[:,:nvars] self.xred = lmat[:,nvars:] - res = np.linalg.lstsq(self.xred, self.yred) + res = np.linalg.lstsq(self.xred, self.yred, rcond=-1) self.estresults = res self.arlhs = res[0].reshape(nlags, nvars, nvars) self.arhat = ar2full(self.arlhs) @@ -677,7 +677,7 @@ def reduceform(self, apoly): ut = np.random.randn(1000,2) ar2s = vargenerate(a22,ut) #res = np.linalg.lstsq(lagmat(ar2s,1)[:,1:], ar2s) - res = np.linalg.lstsq(lagmat(ar2s,1), ar2s) + res = np.linalg.lstsq(lagmat(ar2s,1), ar2s, rcond=-1) bhat = res[0].reshape(1,2,2) arhat = ar2full(bhat) #print(maxabs(arhat - a22) diff --git a/statsmodels/tsa/vector_ar/svar_model.py b/statsmodels/tsa/vector_ar/svar_model.py index e2ac48aec82..868e6806c56 100644 --- 
a/statsmodels/tsa/vector_ar/svar_model.py +++ b/statsmodels/tsa/vector_ar/svar_model.py @@ -229,7 +229,7 @@ def _estimate_svar(self, start_params, lags, maxiter, maxfun, y_sample = y[lags:] # Lutkepohl p75, about 5x faster than stated formula - var_params = np.linalg.lstsq(z, y_sample)[0] + var_params = np.linalg.lstsq(z, y_sample, rcond=-1)[0] resid = y_sample - np.dot(z, var_params) # Unbiased estimate of covariance matrix $\Sigma_u$ of the white noise diff --git a/statsmodels/tsa/vector_ar/var_model.py b/statsmodels/tsa/vector_ar/var_model.py index 637996995fb..72eb09d2154 100644 --- a/statsmodels/tsa/vector_ar/var_model.py +++ b/statsmodels/tsa/vector_ar/var_model.py @@ -686,7 +686,7 @@ def _estimate_var(self, lags, offset=0, trend='c'): y_sample = endog[lags:] # Lütkepohl p75, about 5x faster than stated formula - params = np.linalg.lstsq(z, y_sample)[0] + params = np.linalg.lstsq(z, y_sample, rcond=-1)[0] resid = y_sample - np.dot(z, params) # Unbiased estimate of covariance matrix $\Sigma_u$ of the white noise From 22fceee5589b7813bb8cb3ac1c66e476afba9dd1 Mon Sep 17 00:00:00 2001 From: Josef Date: Thu, 22 Mar 2018 12:07:56 -0400 Subject: [PATCH 102/157] TST/MAINT test_discrete skip assert_produces_warning on python <= 3.3 --- statsmodels/discrete/tests/test_discrete.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/statsmodels/discrete/tests/test_discrete.py b/statsmodels/discrete/tests/test_discrete.py index 556dcb21d3f..2792984f1ce 100644 --- a/statsmodels/discrete/tests/test_discrete.py +++ b/statsmodels/discrete/tests/test_discrete.py @@ -1499,9 +1499,16 @@ def test_poisson_newton(): mod = sm.Poisson(y_count, x) from pandas.util.testing import assert_produces_warning # this is not thread-safe - with assert_produces_warning(): - warnings.simplefilter('always') + # py 2.7 and 3.3 don't raise here anymore #4235 + import sys + PY3_g3 = sys.version_info[:2] > (3, 3) + if PY3_g3: + with assert_produces_warning(): + warnings.simplefilter('always') + res = mod.fit(start_params=-np.ones(4), method='newton', disp=0) + else: res = mod.fit(start_params=-np.ones(4), method='newton', disp=0) + assert_(not res.mle_retvals['converged']) From 197f4903d81f947ea4bbef46e967b7d7919a61d3 Mon Sep 17 00:00:00 2001 From: Josef Date: Thu, 22 Mar 2018 12:08:51 -0400 Subject: [PATCH 103/157] TST: test_discrete add all disp=0 and one start_params --- statsmodels/discrete/tests/test_discrete.py | 40 +++++++++++---------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/statsmodels/discrete/tests/test_discrete.py b/statsmodels/discrete/tests/test_discrete.py index 2792984f1ce..c62c785bed8 100644 --- a/statsmodels/discrete/tests/test_discrete.py +++ b/statsmodels/discrete/tests/test_discrete.py @@ -1661,7 +1661,7 @@ def setup_class(cls): data = sm.datasets.randhie.load() data.exog = sm.add_constant(data.exog, prepend=False) mod = GeneralizedPoisson(data.endog, data.exog, p=2) - cls.res1 = mod.fit(method='newton') + cls.res1 = mod.fit(method='newton', disp=0) res2 = RandHIE() res2.generalizedpoisson_gp2() cls.res2 = res2 @@ -1712,7 +1712,7 @@ def setup_class(cls): data = sm.datasets.randhie.load() data.exog = sm.add_constant(data.exog, prepend=False) cls.res1 = GeneralizedPoisson(data.endog, data.exog, p=2).fit( - method='newton', use_transparams=True) + method='newton', use_transparams=True, disp=0) res2 = RandHIE() res2.generalizedpoisson_gp2() cls.res2 = res2 @@ -1753,7 +1753,7 @@ def setup_class(cls): cls.data = sm.datasets.randhie.load() cls.data.exog = 
sm.add_constant(cls.data.exog, prepend=False) cls.res1 = GeneralizedPoisson( - cls.data.endog, cls.data.exog, p=1).fit(method='newton') + cls.data.endog, cls.data.exog, p=1).fit(method='newton', disp=0) def test_llf(self): poisson_llf = sm.Poisson( @@ -1827,7 +1827,7 @@ def setup_class(cls): cls.expected_params[-1], 1, size=len(mu_true)) model_gp = sm.GeneralizedPoisson(cls.endog, exog, p=1) cls.res = model_gp.fit(method='nm', xtol=1e-6, maxiter=5000, - maxfun=5000) + maxfun=5000, disp=0) def test_basic(self): res = self.res @@ -1844,7 +1844,7 @@ def test_basic(self): def test_newton(self): # check newton optimization with start_params res = self.res - res2 = res.model.fit(start_params=res.params, method='newton') + res2 = res.model.fit(start_params=res.params, method='newton', disp=0) assert_allclose(res.model.score(res.params), np.zeros(len(res2.params)), atol=0.01) assert_allclose(res.model.score(res2.params), @@ -2104,7 +2104,7 @@ def setup_class(cls): # Drop some columns and do an unregularized fit exog_no_PSI = rand_exog[:, :cls.m] mod_unreg = sm.NegativeBinomialP(rand_data.endog, exog_no_PSI) - cls.res_unreg = mod_unreg.fit(method="newton", disp=False) + cls.res_unreg = mod_unreg.fit(method="newton", disp=0) # Do a regularized fit with alpha, effectively dropping the last column alpha = 10 * len(rand_data.endog) * np.ones(cls.kvars + 1) alpha[:cls.m] = 0 @@ -2131,7 +2131,7 @@ def test_predict_prob_p1(self): prob = size / (size + mu_true) endog = nbinom.rvs(size, prob, size=len(mu_true)) - res = sm.NegativeBinomialP(endog, exog).fit() + res = sm.NegativeBinomialP(endog, exog).fit(disp=0) mu = res.predict() size = 1. / alpha * mu @@ -2157,7 +2157,7 @@ def test_predict_prob_p2(self): prob = size / (size + mu_true) endog = nbinom.rvs(size, prob, size=len(mu_true)) - res = sm.NegativeBinomialP(endog, exog, p=2).fit() + res = sm.NegativeBinomialP(endog, exog, p=2).fit(disp=0) mu = res.predict() size = 1. 
/ alpha @@ -2179,7 +2179,7 @@ def _get_data(cls): return endog, exog def test_llnull(self): - res = self.model.fit(start_params=self.start_params) + res = self.model.fit(start_params=self.start_params, disp=0) res._results._attach_nullmodel = True llf0 = res.llnull res_null0 = res.res_null @@ -2198,7 +2198,7 @@ class TestPoissonNull(CheckNull): def setup_class(cls): endog, exog = cls._get_data() cls.model = Poisson(endog, exog) - cls.res_null = Poisson(endog, exog[:, 0]).fit(start_params=[8.5]) + cls.res_null = Poisson(endog, exog[:, 0]).fit(start_params=[8.5], disp=0) # use start params to avoid warnings cls.start_params = [8.5, 0] @@ -2213,7 +2213,7 @@ def setup_class(cls): loglike_method='nb1') cls.res_null = cls.model_null.fit(start_params=[8, 1000], method='bfgs', gtol=1e-08, - maxiter=300) + maxiter=300, disp=0) # for convergence with bfgs, I needed to round down alpha start_params cls.start_params = np.array([7.730452, 2.01633068e-02, 1763.0]) @@ -2228,7 +2228,7 @@ def setup_class(cls): loglike_method='nb2') cls.res_null = cls.model_null.fit(start_params=[8, 0.5], method='bfgs', gtol=1e-06, - maxiter=300) + maxiter=300, disp=0) cls.start_params = np.array([8.07216448, 0.01087238, 0.44024134]) @@ -2241,7 +2241,7 @@ def setup_class(cls): cls.model_null = NegativeBinomialP(endog, exog[:, 0], p=2) cls.res_null = cls.model_null.fit(start_params=[8, 1], method='bfgs', gtol=1e-06, - maxiter=300) + maxiter=300, disp=0) cls.start_params = np.array([8.07216448, 0.01087238, 0.44024134]) def test_start_null(self): @@ -2261,7 +2261,7 @@ def setup_class(cls): cls.model_null = NegativeBinomialP(endog, exog[:, 0], p=1) cls.res_null = cls.model_null.fit(start_params=[8, 1], method='bfgs', gtol=1e-06, - maxiter=300) + maxiter=300, disp=0) cls.start_params = np.array([7.730452, 2.01633068e-02, 1763.0]) def test_start_null(self): @@ -2281,7 +2281,7 @@ def setup_class(cls): cls.model_null = GeneralizedPoisson(endog, exog[:, 0], p=1.5) cls.res_null = cls.model_null.fit(start_params=[8.4, 1], method='bfgs', gtol=1e-08, - maxiter=300) + maxiter=300, disp=0) cls.start_params = np.array([6.91127148, 0.04501334, 0.88393736]) @@ -2293,7 +2293,7 @@ def test_null_options(): exog[:nobs // 2, 1] = 0 mu = np.exp(exog.sum(1)) endog = np.random.poisson(mu) # Note no size=nobs in np.random - res = Poisson(endog, exog).fit(start_params=np.log([1, 1])) + res = Poisson(endog, exog).fit(start_params=np.log([1, 1]), disp=0) llnull0 = res.llnull assert_(hasattr(res, 'res_llnull') is False) res.set_null_options(attach_results=True) @@ -2374,6 +2374,8 @@ def test_optim_kwds_prelim(): def test_unchanging_degrees_of_freedom(): + import warnings + warnings.simplefilter('error') # see GH3734 data = sm.datasets.randhie.load() model = sm.NegativeBinomial(data.endog, data.exog, loglike_method='nb2') @@ -2381,17 +2383,17 @@ def test_unchanging_degrees_of_freedom(): 0.22902315, 0.06210253, 0.06799444, 0.08406794, 0.18530092, 1.36645186]) - res1 = model.fit(start_params=params) + res1 = model.fit(start_params=params, disp=0) assert_equal(res1.df_model, 8) reg_params = np.array([-0.04854 , -0.15019404, 0.08363671, -0.03032834, 0.17592454, 0.06440753, 0.01584555, 0. , 0. , 1.36984628]) - res2 = model.fit_regularized(alpha=100, start_params=reg_params) + res2 = model.fit_regularized(alpha=100, start_params=reg_params, disp=0) assert_(res2.df_model != 8) # If res2.df_model == res1.df_model, then this test is invalid. 
- res3 = model.fit() + res3 = model.fit(start_params=params, disp=0) # Test that the call to `fit_regularized` didn't modify model.df_model inplace. assert_equal(res3.df_model, res1.df_model) assert_equal(res3.df_resid, res1.df_resid) From 735fe8fe3a7cd41616eacd1791ed0edf3e0d4d33 Mon Sep 17 00:00:00 2001 From: Josef Date: Thu, 22 Mar 2018 16:38:50 -0400 Subject: [PATCH 104/157] BUG: outlier_test fix index with order closes #3971 --- statsmodels/stats/outliers_influence.py | 5 +++-- statsmodels/stats/tests/test_diagnostic.py | 23 +++++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/statsmodels/stats/outliers_influence.py b/statsmodels/stats/outliers_influence.py index a12ca588c70..2ee09e0c8c2 100644 --- a/statsmodels/stats/outliers_influence.py +++ b/statsmodels/stats/outliers_influence.py @@ -56,6 +56,8 @@ def outlier_test(model_results, method='bonf', alpha=.05, labels=None, df = df_resid - 1. """ from scipy import stats # lazy import + if labels is None: + labels = getattr(model_results.model.data, 'row_labels', None) infl = getattr(model_results, 'get_influence', None) if infl is None: results = maybe_unwrap_results(model_results) @@ -72,8 +74,7 @@ def outlier_test(model_results, method='bonf', alpha=.05, labels=None, adj_p = multipletests(unadj_p, alpha=alpha, method=method) data = np.c_[resid, unadj_p, adj_p[1]] - if labels is None: - labels = getattr(model_results.model.data, 'row_labels', None) + if labels is not None: from pandas import DataFrame return DataFrame(data, diff --git a/statsmodels/stats/tests/test_diagnostic.py b/statsmodels/stats/tests/test_diagnostic.py index 1160c0e24e6..b47cce961a0 100644 --- a/statsmodels/stats/tests/test_diagnostic.py +++ b/statsmodels/stats/tests/test_diagnostic.py @@ -18,7 +18,8 @@ import numpy as np from numpy.testing import (assert_, assert_almost_equal, assert_equal, - assert_approx_equal, assert_allclose) + assert_approx_equal, assert_allclose, + assert_array_equal) import pytest from statsmodels.regression.linear_model import OLS, GLSAR @@ -326,7 +327,7 @@ def test_acorr_ljung_box(self): res = self.res #general test - + #> bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box") #> mkhtest(bt, "ljung_box_4", "chi2") ljung_box_4 = dict(statistic=5.23587172795227, pvalue=0.263940335284713, @@ -365,7 +366,7 @@ def test_acorr_ljung_box_big_default(self): def test_acorr_ljung_box_small_default(self): res = self.res #test with small dataset and default lag - + #> bt = Box.test(residuals(fm), type = "Ljung-Box") #> mkhtest(bt, "ljung_box_small", "chi2") ljung_box_small = dict(statistic=9.61503968281915, pvalue=0.72507000996945, @@ -920,6 +921,22 @@ def test_outlier_test(): np.testing.assert_almost_equal(res.values, res2, 7) np.testing.assert_equal(res.index.tolist(), sorted_labels) # pylint: disable-msg=E1103 + import pandas as pd + data = pd.DataFrame(np.column_stack((endog, exog)), + columns='y const var1 var2'.split(), + index=labels) + + # check `order` with pandas bug in #3971 + res_pd = OLS.from_formula('y ~ const + var1 + var2 - 0', data).fit() + res_outl1 = res_pd.outlier_test(method='b') + res_outl1 = res_outl1.sort_values(['unadj_p'], ascending=True) + res_outl2 = oi.outlier_test(res_pd, method='b', order=True) + assert_almost_equal(res_outl1.values, res2, 7) + assert_equal(res_outl1.index.tolist(), sorted_labels) + assert_almost_equal(res_outl2.values, res2, 7) + assert_equal(res_outl2.index.tolist(), sorted_labels) + assert_array_equal(res_outl2.index, res_outl1.index) + if __name__ == '__main__': 
import pytest

From 662a3dc6754146818edc510dad67784cd4103de7 Mon Sep 17 00:00:00 2001
From: Josef 
Date: Thu, 22 Mar 2018 17:25:59 -0400
Subject: [PATCH 105/157] ENH: add more options to outlier_test

---
 statsmodels/regression/linear_model.py     | 16 ++++++++++++++--
 statsmodels/stats/outliers_influence.py    | 19 ++++++++++++++++---
 statsmodels/stats/tests/test_diagnostic.py |  6 ++++++
 3 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/statsmodels/regression/linear_model.py b/statsmodels/regression/linear_model.py
index f57fd0f7d85..d33e6e4b86c 100644
--- a/statsmodels/regression/linear_model.py
+++ b/statsmodels/regression/linear_model.py
@@ -2556,7 +2556,8 @@ def get_influence(self):
         from statsmodels.stats.outliers_influence import OLSInfluence
         return OLSInfluence(self)

-    def outlier_test(self, method='bonf', alpha=.05):
+    def outlier_test(self, method='bonf', alpha=.05, labels=None,
+                     order=False, cutoff=None):
         """
         Test observations for outliers according to method

@@ -2576,6 +2577,16 @@ See `statsmodels.stats.multitest.multipletests` for details.
         alpha : float
             familywise error rate
+        labels : None or array_like
+            If `labels` is not None, then it will be used as index to the
+            returned pandas DataFrame. See also Returns below
+        order : bool
+            Whether or not to order the results by the absolute value of the
+            studentized residuals. If labels are provided they will also be sorted.
+        cutoff : None or float in [0, 1]
+            If cutoff is not None, then the return only includes observations with
+            multiple testing corrected p-values strictly below the cutoff. The
+            returned array or dataframe can be empty if there are no outlier candidates at the specified cutoff.

         Returns
         -------
@@ -2591,7 +2602,8 @@ df = df_resid - 1.
         """
         from statsmodels.stats.outliers_influence import outlier_test
-        return outlier_test(self, method, alpha)
+        return outlier_test(self, method, alpha, labels=labels,
+                            order=order, cutoff=cutoff)

     def el_test(self, b0_vals, param_nums, return_weights=0,
                 ret_params=0, method='nm',
diff --git a/statsmodels/stats/outliers_influence.py b/statsmodels/stats/outliers_influence.py
index 2ee09e0c8c2..be277bb5eb5 100644
--- a/statsmodels/stats/outliers_influence.py
+++ b/statsmodels/stats/outliers_influence.py
@@ -18,7 +18,7 @@
 # outliers test convenience wrapper
 def outlier_test(model_results, method='bonf', alpha=.05, labels=None,
-                 order=False):
+                 order=False, cutoff=None):
     """
     Outlier Tests for RegressionResults instances.

@@ -38,9 +38,17 @@ See `statsmodels.stats.multitest.multipletests` for details.
     alpha : float
         familywise error rate
+    labels : None or array_like
+        If `labels` is not None, then it will be used as index to the
+        returned pandas DataFrame. See also Returns below
     order : bool
         Whether or not to order the results by the absolute value of the
         studentized residuals. If labels are provided they will also be sorted.
+    cutoff : None or float in [0, 1]
+        If cutoff is not None, then the return only includes observations with
+        multiple testing corrected p-values strictly below the cutoff. The
+        returned array or dataframe can be empty if there are no outlier
+        candidates at the specified cutoff.
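+
+        For example, the three keywords combine as in the following
+        minimal sketch (the synthetic data, the 'obs' labels and the
+        0.5 cutoff are illustrative only)::
+
+            import numpy as np
+            import statsmodels.api as sm
+
+            rng = np.random.RandomState(0)
+            x = rng.normal(size=40)
+            y = 2 * x + rng.normal(size=40)
+            y[0] += 8  # plant one outlier
+            res = sm.OLS(y, sm.add_constant(x)).fit()
+            # sort by |studentized residual| and keep only rows whose
+            # Bonferroni-corrected p-value is below 0.5
+            table = res.outlier_test(method='bonf', order=True,
+                                     labels=['obs%d' % i for i in range(40)],
+                                     cutoff=0.5)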
Returns ------- @@ -68,18 +76,23 @@ def outlier_test(model_results, method='bonf', alpha=.05, labels=None, idx = np.abs(resid).argsort()[::-1] resid = resid[idx] if labels is not None: - labels = np.array(labels)[idx].tolist() + labels = np.asarray(labels)[idx] df = model_results.df_resid - 1 unadj_p = stats.t.sf(np.abs(resid), df) * 2 adj_p = multipletests(unadj_p, alpha=alpha, method=method) data = np.c_[resid, unadj_p, adj_p[1]] + if cutoff is not None: + mask = data[:, -1] < cutoff + data = data[mask] + else: + mask = slice(None) if labels is not None: from pandas import DataFrame return DataFrame(data, columns=['student_resid', 'unadj_p', method+"(p)"], - index=labels) + index=np.asarray(labels)[mask]) return data #influence measures diff --git a/statsmodels/stats/tests/test_diagnostic.py b/statsmodels/stats/tests/test_diagnostic.py index b47cce961a0..d8fd03e87fe 100644 --- a/statsmodels/stats/tests/test_diagnostic.py +++ b/statsmodels/stats/tests/test_diagnostic.py @@ -937,6 +937,12 @@ def test_outlier_test(): assert_equal(res_outl2.index.tolist(), sorted_labels) assert_array_equal(res_outl2.index, res_outl1.index) + # additional keywords in method + res_outl3 = res_pd.outlier_test(method='b', order=True) + assert_equal(res_outl3.index.tolist(), sorted_labels) + res_outl4 = res_pd.outlier_test(method='b', order=True, cutoff=0.15) + assert_equal(res_outl4.index.tolist(), sorted_labels[:1]) + if __name__ == '__main__': import pytest From d1fceb73ae01f269ee359ee495186e9928bcbd0a Mon Sep 17 00:00:00 2001 From: Josef Date: Fri, 23 Mar 2018 09:06:59 -0400 Subject: [PATCH 106/157] TST: skip parts if pandas < 0.17 --- statsmodels/stats/tests/test_diagnostic.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/statsmodels/stats/tests/test_diagnostic.py b/statsmodels/stats/tests/test_diagnostic.py index d8fd03e87fe..ff1e8b2afc2 100644 --- a/statsmodels/stats/tests/test_diagnostic.py +++ b/statsmodels/stats/tests/test_diagnostic.py @@ -16,6 +16,11 @@ import os import numpy as np +import pandas as pd + +# skipping some parts +from distutils.version import LooseVersion +PD_GE_17 = LooseVersion(pd.__version__) >= '0.17' from numpy.testing import (assert_, assert_almost_equal, assert_equal, assert_approx_equal, assert_allclose, @@ -921,21 +926,25 @@ def test_outlier_test(): np.testing.assert_almost_equal(res.values, res2, 7) np.testing.assert_equal(res.index.tolist(), sorted_labels) # pylint: disable-msg=E1103 - import pandas as pd data = pd.DataFrame(np.column_stack((endog, exog)), columns='y const var1 var2'.split(), index=labels) # check `order` with pandas bug in #3971 res_pd = OLS.from_formula('y ~ const + var1 + var2 - 0', data).fit() - res_outl1 = res_pd.outlier_test(method='b') - res_outl1 = res_outl1.sort_values(['unadj_p'], ascending=True) + res_outl2 = oi.outlier_test(res_pd, method='b', order=True) - assert_almost_equal(res_outl1.values, res2, 7) - assert_equal(res_outl1.index.tolist(), sorted_labels) assert_almost_equal(res_outl2.values, res2, 7) assert_equal(res_outl2.index.tolist(), sorted_labels) - assert_array_equal(res_outl2.index, res_outl1.index) + + if PD_GE_17: + # pandas < 0.17 does not have sort_values method + res_outl1 = res_pd.outlier_test(method='b') + res_outl1 = res_outl1.sort_values(['unadj_p'], ascending=True) + assert_almost_equal(res_outl1.values, res2, 7) + assert_equal(res_outl1.index.tolist(), sorted_labels) + assert_array_equal(res_outl2.index, res_outl1.index) + # additional keywords in method res_outl3 = 
res_pd.outlier_test(method='b', order=True) From f43cccaff2626eb026011f5314be77534986be4c Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 23 Mar 2018 08:25:17 -0700 Subject: [PATCH 107/157] flake8 cleanup in regime_switching --- statsmodels/tsa/regime_switching/__init__.py | 1 - .../regime_switching/markov_autoregression.py | 2 +- .../tsa/regime_switching/markov_regression.py | 3 +- .../tsa/regime_switching/markov_switching.py | 5 +- .../tests/test_markov_autoregression.py | 73 ++++++++++--------- .../tests/test_markov_regression.py | 51 +++++++------ .../tests/test_markov_switching.py | 22 +++--- 7 files changed, 78 insertions(+), 79 deletions(-) diff --git a/statsmodels/tsa/regime_switching/__init__.py b/statsmodels/tsa/regime_switching/__init__.py index 4e5d251def6..6e2f0243459 100644 --- a/statsmodels/tsa/regime_switching/__init__.py +++ b/statsmodels/tsa/regime_switching/__init__.py @@ -1,3 +1,2 @@ from statsmodels import PytestTester test = PytestTester() - diff --git a/statsmodels/tsa/regime_switching/markov_autoregression.py b/statsmodels/tsa/regime_switching/markov_autoregression.py index 5aaccba93a0..a38d250e8a2 100644 --- a/statsmodels/tsa/regime_switching/markov_autoregression.py +++ b/statsmodels/tsa/regime_switching/markov_autoregression.py @@ -488,5 +488,5 @@ class MarkovAutoregressionResults(markov_regression.MarkovRegressionResults): class MarkovAutoregressionResultsWrapper( markov_regression.MarkovRegressionResultsWrapper): pass -wrap.populate_wrapper(MarkovAutoregressionResultsWrapper, +wrap.populate_wrapper(MarkovAutoregressionResultsWrapper, # noqa:E305 MarkovAutoregressionResults) diff --git a/statsmodels/tsa/regime_switching/markov_regression.py b/statsmodels/tsa/regime_switching/markov_regression.py index ef11b49627e..f16724a91a5 100644 --- a/statsmodels/tsa/regime_switching/markov_regression.py +++ b/statsmodels/tsa/regime_switching/markov_regression.py @@ -452,4 +452,5 @@ class MarkovRegressionResults(markov_switching.MarkovSwitchingResults): class MarkovRegressionResultsWrapper( markov_switching.MarkovSwitchingResultsWrapper): pass -wrap.populate_wrapper(MarkovRegressionResultsWrapper, MarkovRegressionResults) +wrap.populate_wrapper(MarkovRegressionResultsWrapper, # noqa:E305 + MarkovRegressionResults) diff --git a/statsmodels/tsa/regime_switching/markov_switching.py b/statsmodels/tsa/regime_switching/markov_switching.py index 068153e9601..1a55330ee90 100644 --- a/statsmodels/tsa/regime_switching/markov_switching.py +++ b/statsmodels/tsa/regime_switching/markov_switching.py @@ -42,8 +42,6 @@ } - - def _logistic(x): """ Note that this is not a vectorized function @@ -2272,4 +2270,5 @@ class MarkovSwitchingResultsWrapper(wrap.ResultsWrapper): } _wrap_methods = wrap.union_dicts( tsbase.TimeSeriesResultsWrapper._wrap_methods, _methods) -wrap.populate_wrapper(MarkovSwitchingResultsWrapper, MarkovSwitchingResults) +wrap.populate_wrapper(MarkovSwitchingResultsWrapper, # noqa:E305 + MarkovSwitchingResults) diff --git a/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py b/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py index 96204fcd883..ec55a97e2ba 100644 --- a/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py +++ b/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py @@ -13,7 +13,7 @@ import pandas as pd from statsmodels.tools import add_constant from statsmodels.tsa.regime_switching import markov_autoregression -from numpy.testing import assert_equal, assert_allclose, assert_raises +from 
numpy.testing import assert_equal, assert_allclose current_path = os.path.dirname(os.path.abspath(__file__)) @@ -239,126 +239,126 @@ def test_fit_em(self, **kwargs): rtol=self.rtol) -hamilton_ar2_short_filtered_joint_probabilities = np.array([[[[ - 4.99506987e-02, 6.44048275e-04, 6.22227140e-05, +hamilton_ar2_short_filtered_joint_probabilities = np.array([ + [[[4.99506987e-02, 6.44048275e-04, 6.22227140e-05, 4.45756755e-06, 5.26645567e-07, 7.99846146e-07, 1.19425705e-05, 6.87762063e-03], - [ 1.95930395e-02, 3.25884335e-04, 1.12955091e-04, + [1.95930395e-02, 3.25884335e-04, 1.12955091e-04, 3.38537103e-04, 9.81927968e-06, 2.71696750e-05, 5.83828290e-03, 7.64261509e-02]], - [[ 1.97113193e-03, 9.50372207e-05, 1.98390978e-04, + [[1.97113193e-03, 9.50372207e-05, 1.98390978e-04, 1.88188953e-06, 4.83449400e-07, 1.14872860e-05, 4.02918239e-06, 4.35015431e-04], - [ 2.24870443e-02, 1.27331172e-03, 9.62155856e-03, + [2.24870443e-02, 1.27331172e-03, 9.62155856e-03, 4.04178695e-03, 2.75516282e-04, 1.18179572e-02, 5.99778157e-02, 1.48149567e-01]]], - [[[ 6.70912859e-02, 1.84223872e-02, 2.55621792e-04, + [[[6.70912859e-02, 1.84223872e-02, 2.55621792e-04, 4.48500688e-05, 7.80481515e-05, 2.73734559e-06, 7.59835896e-06, 1.42930726e-03], - [ 2.10053328e-02, 7.44036383e-03, 3.70388879e-04, + [2.10053328e-02, 7.44036383e-03, 3.70388879e-04, 2.71878370e-03, 1.16152088e-03, 7.42182691e-05, 2.96490192e-03, 1.26774695e-02]], - [[ 8.09335679e-02, 8.31016518e-02, 2.49149080e-02, + [[8.09335679e-02, 8.31016518e-02, 2.49149080e-02, 5.78825626e-04, 2.19019941e-03, 1.20179130e-03, 7.83659430e-05, 2.76363377e-03], - [ 7.36967899e-01, 8.88697316e-01, 9.64463954e-01, + [7.36967899e-01, 8.88697316e-01, 9.64463954e-01, 9.92270877e-01, 9.96283886e-01, 9.86863839e-01, 9.31117063e-01, 7.51241236e-01]]]]) -hamilton_ar2_short_predicted_joint_probabilities = np.array([[[[[ - 1.20809334e-01, 3.76964436e-02, 4.86045844e-04, +hamilton_ar2_short_predicted_joint_probabilities = np.array([[ + [[[1.20809334e-01, 3.76964436e-02, 4.86045844e-04, 4.69578023e-05, 3.36400588e-06, 3.97445190e-07, 6.03622290e-07, 9.01273552e-06], - [ 3.92723623e-02, 1.47863379e-02, 2.45936108e-04, + [3.92723623e-02, 1.47863379e-02, 2.45936108e-04, 8.52441571e-05, 2.55484811e-04, 7.41034525e-06, 2.05042201e-05, 4.40599447e-03]], - [[ 4.99131230e-03, 1.48756005e-03, 7.17220245e-05, + [[4.99131230e-03, 1.48756005e-03, 7.17220245e-05, 1.49720314e-04, 1.42021122e-06, 3.64846209e-07, 8.66914462e-06, 3.04071516e-06], - [ 4.70476003e-02, 1.69703652e-02, 9.60933974e-04, + [4.70476003e-02, 1.69703652e-02, 9.60933974e-04, 7.26113047e-03, 3.05022748e-03, 2.07924699e-04, 8.91869322e-03, 4.52636381e-02]]], - [[[ 4.99131230e-03, 6.43506069e-03, 1.76698327e-03, + [[[4.99131230e-03, 6.43506069e-03, 1.76698327e-03, 2.45179642e-05, 4.30179435e-06, 7.48598845e-06, 2.62552503e-07, 7.28796600e-07], - [ 1.62256192e-03, 2.01472650e-03, 7.13642497e-04, + [1.62256192e-03, 2.01472650e-03, 7.13642497e-04, 3.55258493e-05, 2.60772139e-04, 1.11407276e-04, 7.11864528e-06, 2.84378568e-04]], - [[ 5.97950448e-03, 7.76274317e-03, 7.97069493e-03, + [[5.97950448e-03, 7.76274317e-03, 7.97069493e-03, 2.38971340e-03, 5.55180599e-05, 2.10072977e-04, 1.15269812e-04, 7.51646942e-06], - [ 5.63621989e-02, 7.06862760e-02, 8.52394030e-02, + [5.63621989e-02, 7.06862760e-02, 8.52394030e-02, 9.25065601e-02, 9.51736612e-02, 9.55585689e-02, 9.46550451e-02, 8.93080931e-02]]]], - [[[[ 3.92723623e-02, 1.22542551e-02, 1.58002431e-04, + [[[[3.92723623e-02, 1.22542551e-02, 1.58002431e-04, 1.52649118e-05, 1.09356167e-06, 
1.29200377e-07, 1.96223855e-07, 2.92983500e-06], - [ 1.27665503e-02, 4.80670161e-03, 7.99482261e-05, + [1.27665503e-02, 4.80670161e-03, 7.99482261e-05, 2.77109335e-05, 8.30522919e-05, 2.40893443e-06, 6.66545485e-06, 1.43228843e-03]], - [[ 1.62256192e-03, 4.83571884e-04, 2.33151963e-05, + [[1.62256192e-03, 4.83571884e-04, 2.33151963e-05, 4.86706634e-05, 4.61678312e-07, 1.18603191e-07, 2.81814142e-06, 9.88467229e-07], - [ 1.52941031e-02, 5.51667911e-03, 3.12377744e-04, + [1.52941031e-02, 5.51667911e-03, 3.12377744e-04, 2.36042810e-03, 9.91559466e-04, 6.75915830e-05, 2.89926399e-03, 1.47141776e-02]]], - [[[ 4.70476003e-02, 6.06562252e-02, 1.66554040e-02, + [[[4.70476003e-02, 6.06562252e-02, 1.66554040e-02, 2.31103828e-04, 4.05482745e-05, 7.05621631e-05, 2.47479309e-06, 6.86956236e-06], - [ 1.52941031e-02, 1.89906063e-02, 6.72672133e-03, + [1.52941031e-02, 1.89906063e-02, 6.72672133e-03, 3.34863029e-04, 2.45801156e-03, 1.05011361e-03, 6.70996238e-05, 2.68052335e-03]], - [[ 5.63621989e-02, 7.31708248e-02, 7.51309569e-02, + [[5.63621989e-02, 7.31708248e-02, 7.51309569e-02, 2.25251946e-02, 5.23307566e-04, 1.98012644e-03, 1.08652148e-03, 7.08494735e-05], - [ 5.31264334e-01, 6.66281623e-01, 8.03457913e-01, + [5.31264334e-01, 6.66281623e-01, 8.03457913e-01, 8.71957394e-01, 8.97097216e-01, 9.00725317e-01, 8.92208794e-01, 8.41808970e-01]]]]]) -hamilton_ar2_short_smoothed_joint_probabilities = np.array([[[[ - 1.29898189e-02, 1.66298475e-04, 1.29822987e-05, +hamilton_ar2_short_smoothed_joint_probabilities = np.array([ + [[[1.29898189e-02, 1.66298475e-04, 1.29822987e-05, 9.95268382e-07, 1.84473346e-07, 7.18761267e-07, 1.69576494e-05, 6.87762063e-03], - [ 5.09522472e-03, 8.41459714e-05, 2.35672254e-05, + [5.09522472e-03, 8.41459714e-05, 2.35672254e-05, 7.55872505e-05, 3.43949612e-06, 2.44153330e-05, 8.28997024e-03, 7.64261509e-02]], - [[ 5.90021731e-04, 2.55342733e-05, 4.50698224e-05, + [[5.90021731e-04, 2.55342733e-05, 4.50698224e-05, 5.30734135e-07, 1.80741761e-07, 1.11483792e-05, 5.98539007e-06, 4.35015431e-04], - [ 6.73107901e-03, 3.42109009e-04, 2.18579464e-03, + [6.73107901e-03, 3.42109009e-04, 2.18579464e-03, 1.13987259e-03, 1.03004157e-04, 1.14692946e-02, 8.90976350e-02, 1.48149567e-01]]], - [[[ 6.34648123e-02, 1.79187451e-02, 2.37462147e-04, + [[[6.34648123e-02, 1.79187451e-02, 2.37462147e-04, 3.55542558e-05, 7.63980455e-05, 2.90520820e-06, 8.17644492e-06, 1.42930726e-03], - [ 1.98699352e-02, 7.23695477e-03, 3.44076057e-04, + [1.98699352e-02, 7.23695477e-03, 3.44076057e-04, 2.15527721e-03, 1.13696383e-03, 7.87695658e-05, 3.19047276e-03, 1.26774695e-02]], - [[ 8.81925054e-02, 8.33092133e-02, 2.51106301e-02, + [[8.81925054e-02, 8.33092133e-02, 2.51106301e-02, 5.81007470e-04, 2.19065072e-03, 1.20221350e-03, 7.56893839e-05, 2.76363377e-03], - [ 8.03066603e-01, 8.90916999e-01, 9.72040418e-01, + [8.03066603e-01, 8.90916999e-01, 9.72040418e-01, 9.96011175e-01, 9.96489179e-01, 9.87210535e-01, 8.99315113e-01, 7.51241236e-01]]]]) @@ -422,6 +422,7 @@ def test_smoother_output(self, **kwargs): res.smoothed_joint_probabilities[..., :-3], hamilton_ar2_short_smoothed_joint_probabilities[..., :-3]) + hamilton_ar4_filtered = [ 0.776712, 0.949192, 0.996320, 0.990258, 0.940111, 0.537442, 0.140001, 0.008942, 0.048480, 0.614097, 0.910889, 0.995463, diff --git a/statsmodels/tsa/regime_switching/tests/test_markov_regression.py b/statsmodels/tsa/regime_switching/tests/test_markov_regression.py index 266ff6cccc1..f98b9fbe0c6 100644 --- a/statsmodels/tsa/regime_switching/tests/test_markov_regression.py +++ 
b/statsmodels/tsa/regime_switching/tests/test_markov_regression.py @@ -12,7 +12,7 @@ import pandas as pd from statsmodels.tsa.regime_switching import (markov_switching, markov_regression) -from numpy.testing import assert_equal, assert_allclose, assert_raises +from numpy.testing import assert_allclose current_path = os.path.dirname(os.path.abspath(__file__)) @@ -436,8 +436,8 @@ def test_fit_em(self, **kwargs): rtol=self.rtol) -fedfunds_const_filtered_joint_probabilities = np.array([[[ - 9.81875427e-01, 9.99977639e-01, 9.99982269e-01, +fedfunds_const_filtered_joint_probabilities = np.array([ + [[9.81875427e-01, 9.99977639e-01, 9.99982269e-01, 9.99977917e-01, 9.99961064e-01, 9.99932206e-01, 9.99919386e-01, 9.99894144e-01, 9.99875287e-01, 9.99853807e-01, 9.99852600e-01, 9.99839056e-01, @@ -513,7 +513,7 @@ def test_fit_em(self, **kwargs): 9.99996082e-01, 9.99996179e-01, 9.99996370e-01, 9.99996334e-01, 9.99996045e-01, 9.99996030e-01, 9.99996030e-01], - [ 1.79021167e-02, 1.14091306e-05, 5.61557959e-07, + [1.79021167e-02, 1.14091306e-05, 5.61557959e-07, 8.80398735e-07, 1.08717798e-06, 1.94073468e-06, 3.37670187e-06, 3.96039606e-06, 5.22475895e-06, 6.12683114e-06, 7.18211108e-06, 7.18979687e-06, @@ -590,7 +590,7 @@ def test_fit_em(self, **kwargs): 1.76579974e-07, 1.78918351e-07, 1.93625709e-07, 1.93628651e-07]], - [[ 1.12025955e-05, 1.08238349e-05, 1.71596282e-05, + [[1.12025955e-05, 1.08238349e-05, 1.71596282e-05, 2.11831999e-05, 3.78067714e-05, 6.57213848e-05, 7.69689076e-05, 1.01479702e-04, 1.18846167e-04, 1.39184231e-04, 1.39184063e-04, 1.52618987e-04, @@ -666,7 +666,7 @@ def test_fit_em(self, **kwargs): 3.72600486e-06, 3.62917375e-06, 3.44299546e-06, 3.48862495e-06, 3.77538503e-06, 3.77538498e-06, 3.77538498e-06], - [ 2.11253415e-04, 1.27726353e-07, 9.96660517e-09, + [2.11253415e-04, 1.27726353e-07, 9.96660517e-09, 1.92893528e-08, 4.25132434e-08, 1.31928738e-07, 2.68831493e-07, 4.15719953e-07, 6.42307057e-07, 8.82117691e-07, 1.03405286e-06, 1.13509493e-06, @@ -817,80 +817,79 @@ def test_bse(self): assert_allclose(bse[:-1], self.true['bse_oim'][:-1], atol=1e-7) - -fedfunds_const_short_filtered_joint_probabilities = np.array([[[ - 9.81370301e-01, 9.99956215e-01, 9.99995966e-01, +fedfunds_const_short_filtered_joint_probabilities = np.array([ + [[9.81370301e-01, 9.99956215e-01, 9.99995966e-01, 9.99996082e-01, 9.99996179e-01, 9.99996370e-01, 9.99996334e-01, 9.99996045e-01, 9.99996030e-01, 9.99996030e-01], - [ 1.78929069e-02, 3.78065881e-05, 3.06546640e-07, + [1.78929069e-02, 3.78065881e-05, 3.06546640e-07, 1.91118379e-07, 1.91095611e-07, 1.86129447e-07, 1.76579974e-07, 1.78918351e-07, 1.93625709e-07, 1.93628651e-07]], - [[ 3.71038873e-05, 5.75327472e-06, 3.72600443e-06, + [[3.71038873e-05, 5.75327472e-06, 3.72600443e-06, 3.72600486e-06, 3.62917375e-06, 3.44299546e-06, 3.48862495e-06, 3.77538503e-06, 3.77538498e-06, 3.77538498e-06], - [ 6.99688113e-04, 2.24977302e-07, 1.18135050e-09, + [6.99688113e-04, 2.24977302e-07, 1.18135050e-09, 7.36520203e-10, 7.17294043e-10, 6.62811758e-10, 6.37139329e-10, 6.98642410e-10, 7.56071871e-10, 7.56083358e-10]]]) -fedfunds_const_short_predicted_joint_probabilities = np.array([[[[ - 7.11514435e-01, 9.63797786e-01, 9.82050899e-01, +fedfunds_const_short_predicted_joint_probabilities = np.array([ + [[[7.11514435e-01, 9.63797786e-01, 9.82050899e-01, 9.82089938e-01, 9.82090052e-01, 9.82090147e-01, 9.82090335e-01, 9.82090300e-01, 9.82090016e-01, 9.82090001e-01], - [ 1.29727398e-02, 1.75725147e-02, 3.71296195e-05, + [1.29727398e-02, 1.75725147e-02, 3.71296195e-05, 
3.01057585e-07, 1.87696195e-07, 1.87673833e-07, 1.82796594e-07, 1.73418115e-07, 1.75714621e-07, 1.90158628e-07]], - [[ 6.65201476e-04, 1.86850353e-06, 2.89727435e-07, + [[6.65201476e-04, 1.86850353e-06, 2.89727435e-07, 1.87636739e-07, 1.87636761e-07, 1.82760472e-07, 1.73384775e-07, 1.75682617e-07, 1.90123482e-07, 1.90123479e-07], - [ 1.25440648e-02, 3.52353838e-05, 1.13295645e-08, + [1.25440648e-02, 3.52353838e-05, 1.13295645e-08, 5.94912755e-11, 3.70902000e-11, 3.61219955e-11, 3.33783385e-11, 3.20855083e-11, 3.51827235e-11, 3.80747965e-11]]], - [[[ 1.29727398e-02, 1.75725147e-02, 1.79053160e-02, + [[[1.29727398e-02, 1.75725147e-02, 1.79053160e-02, 1.79060278e-02, 1.79060298e-02, 1.79060316e-02, 1.79060350e-02, 1.79060344e-02, 1.79060292e-02, 1.79060289e-02], - [ 2.36526442e-04, 3.20392181e-04, 6.76968547e-07, + [2.36526442e-04, 3.20392181e-04, 6.76968547e-07, 5.48905479e-09, 3.42218481e-09, 3.42177711e-09, 3.33285249e-09, 3.16185867e-09, 3.20372988e-09, 3.46708131e-09]], - [[ 1.25440648e-02, 3.52353838e-05, 5.46354728e-06, + [[1.25440648e-02, 3.52353838e-05, 5.46354728e-06, 3.53836769e-06, 3.53836810e-06, 3.44641328e-06, 3.26961068e-06, 3.31294233e-06, 3.58526155e-06, 3.58526150e-06], - [ 2.36550228e-01, 6.64452729e-04, 2.13647738e-07, + [2.36550228e-01, 6.64452729e-04, 2.13647738e-07, 1.12185923e-09, 6.99430003e-10, 6.81172047e-10, 6.29433420e-10, 6.05053821e-10, 6.63459686e-10, 7.17997074e-10]]]]) -fedfunds_const_short_smoothed_joint_probabilities = np.array([[[ - 9.82056759e-01, 9.99961887e-01, 9.99999502e-01, +fedfunds_const_short_smoothed_joint_probabilities = np.array([ + [[9.82056759e-01, 9.99961887e-01, 9.99999502e-01, 9.99999618e-01, 9.99999623e-01, 9.99999637e-01, 9.99999644e-01, 9.99999627e-01, 9.99999612e-01, 9.99996030e-01], - [ 1.79054228e-02, 3.78068025e-05, 3.06547724e-07, + [1.79054228e-02, 3.78068025e-05, 3.06547724e-07, 1.91119055e-07, 1.91096269e-07, 1.86130055e-07, 1.76580558e-07, 1.78918992e-07, 1.93626403e-07, 1.93628651e-07]], - [[ 1.90448249e-06, 2.95069837e-07, 1.91096241e-07, + [[1.90448249e-06, 2.95069837e-07, 1.91096241e-07, 1.91095282e-07, 1.86127261e-07, 1.76579242e-07, 1.78922146e-07, 1.93629492e-07, 1.94345814e-07, 3.77538498e-06], - [ 3.59138585e-05, 1.15384749e-08, 6.05881299e-11, + [3.59138585e-05, 1.15384749e-08, 6.05881299e-11, 3.77738466e-11, 3.67874300e-11, 3.39933060e-11, 3.26771544e-11, 3.58315175e-11, 3.89203762e-11, 7.56083358e-10]]]) diff --git a/statsmodels/tsa/regime_switching/tests/test_markov_switching.py b/statsmodels/tsa/regime_switching/tests/test_markov_switching.py index f24c349d116..62fd6ed8d3c 100644 --- a/statsmodels/tsa/regime_switching/tests/test_markov_switching.py +++ b/statsmodels/tsa/regime_switching/tests/test_markov_switching.py @@ -216,34 +216,34 @@ def test_logistic(): # For a vector, logistic(x) returns # np.exp(x[i]) / (1 + np.sum(np.exp(x[:]))) for each i # but squeezed - cases = [[1.], [0,1.], [-2,3.,1.2,-30.]] + cases = [[1.], [0, 1.], [-2, 3., 1.2, -30.]] for x in cases: actual = logistic(x) desired = [np.exp(i) / (1 + np.sum(np.exp(x))) for i in x] assert_allclose(actual, desired) # For a 2-dim, logistic(x) returns - # np.exp(x[i,t]) / (1 + np.sum(np.exp(x[:,t]))) for each i, each t + # np.exp(x[i, t]) / (1 + np.sum(np.exp(x[:, t]))) for each i, each t # but squeezed case = [[1.]] actual = logistic(case) - assert_equal(actual.shape, (1,1)) + assert_equal(actual.shape, (1, 1)) assert_allclose(actual, np.exp(1) / (1 + np.exp(1))) - # Here, np.array(case) is 2x1, so it is interpreted as i=0,1 and t=0 + # Here, 
np.array(case) is 2x1, so it is interpreted as i=0, 1 and t=0 case = [[0], [1.]] actual = logistic(case) desired = [np.exp(i) / (1 + np.sum(np.exp(case))) for i in case] assert_allclose(actual, desired) - # Here, np.array(case) is 1x2, so it is interpreted as i=0 and t=0,1 + # Here, np.array(case) is 1x2, so it is interpreted as i=0 and t=0, 1 case = [[0, 1.]] actual = logistic(case) desired = np.exp(case) / (1 + np.exp(case)) assert_allclose(actual, desired) # For a 3-dim, logistic(x) returns - # np.exp(x[i,j,t]) / (1 + np.sum(np.exp(x[:,j,t]))) + # np.exp(x[i, j, t]) / (1 + np.sum(np.exp(x[:, j, t]))) # for each i, each j, each t case = np.arange(2*3*4).reshape(2, 3, 4) actual = logistic(case) @@ -268,7 +268,7 @@ def test_partials_logistic(): # np.exp(x[i]) / (1 + np.sum(np.exp(x[:]))) for each i # Then d logistic(x[i]) / dx[i] = (logistix(x) - logistic(x)**2)[i] # And d logistic(x[i]) / dx[j] = -(logistic(x[i]) * logistic[x[j]]) - cases = [[1.], [0,1.], [-2,3.,1.2,-30.]] + cases = [[1.], [0, 1.], [-2, 3., 1.2, -30.]] for x in cases: evaluated = np.atleast_1d(logistic(x)) partials = np.diag(evaluated - evaluated**2) @@ -279,7 +279,7 @@ def test_partials_logistic(): assert_allclose(partials_logistic(x), approx_fprime_cs(x, logistic)) # For a 2-dim, logistic(x) returns - # np.exp(x[i,t]) / (1 + np.sum(np.exp(x[:,t]))) for each i, each t + # np.exp(x[i, t]) / (1 + np.sum(np.exp(x[:, t]))) for each i, each t # but squeezed case = [[1.]] evaluated = logistic(case) @@ -287,7 +287,7 @@ def test_partials_logistic(): assert_allclose(partials_logistic(case), partial) assert_allclose(partials_logistic(case), approx_fprime_cs(case, logistic)) - # # Here, np.array(case) is 2x1, so it is interpreted as i=0,1 and t=0 + # # Here, np.array(case) is 2x1, so it is interpreted as i=0, 1 and t=0 case = [[0], [1.]] evaluated = logistic(case)[:, 0] partials = np.diag(evaluated - evaluated**2) @@ -296,7 +296,7 @@ def test_partials_logistic(): assert_allclose(partials_logistic(case), approx_fprime_cs(np.squeeze(case), logistic)[..., None]) - # Here, np.array(case) is 1x2, so it is interpreted as i=0 and t=0,1 + # Here, np.array(case) is 1x2, so it is interpreted as i=0 and t=0, 1 case = [[0, 1.]] evaluated = logistic(case) partials = (evaluated - evaluated**2)[None, ...] @@ -305,7 +305,7 @@ def test_partials_logistic(): approx_fprime_cs(case, logistic).T) # For a 3-dim, logistic(x) returns - # np.exp(x[i,j,t]) / (1 + np.sum(np.exp(x[:,j,t]))) + # np.exp(x[i, j, t]) / (1 + np.sum(np.exp(x[:, j, t]))) # for each i, each j, each t case = np.arange(2*3*4).reshape(2, 3, 4) evaluated = logistic(case) From 904d0596a6dc4e10836e4d551a623c9265d5e6d6 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 23 Mar 2018 08:30:54 -0700 Subject: [PATCH 108/157] cleanup in statespace --- statsmodels/tsa/statespace/api.py | 1 + statsmodels/tsa/statespace/dynamic_factor.py | 6 +++--- statsmodels/tsa/statespace/sarimax.py | 1 - statsmodels/tsa/statespace/structural.py | 4 +--- statsmodels/tsa/statespace/tools.py | 3 +-- 5 files changed, 6 insertions(+), 9 deletions(-) diff --git a/statsmodels/tsa/statespace/api.py b/statsmodels/tsa/statespace/api.py index 9c1322c398f..f5a6bfbf64e 100644 --- a/statsmodels/tsa/statespace/api.py +++ b/statsmodels/tsa/statespace/api.py @@ -1,3 +1,4 @@ +__all__ = ["SARIMAX", "MLEModel", "MLEResults", "tools"] from .sarimax import SARIMAX from .mlemodel import MLEModel, MLEResults from . 
import tools diff --git a/statsmodels/tsa/statespace/dynamic_factor.py b/statsmodels/tsa/statespace/dynamic_factor.py index f98f4342ef9..e2afb6bed1e 100644 --- a/statsmodels/tsa/statespace/dynamic_factor.py +++ b/statsmodels/tsa/statespace/dynamic_factor.py @@ -11,7 +11,6 @@ from statsmodels.compat.collections import OrderedDict import numpy as np -import pandas as pd from .mlemodel import MLEModel, MLEResults, MLEResultsWrapper from .tools import ( is_invertible, prepare_exog, @@ -161,7 +160,7 @@ def __init__(self, endog, k_factors, factor_order, exog=None, # Exogenous data (self.k_exog, exog) = prepare_exog(exog) - + # Note: at some point in the future might add state regression, as in # SARIMAX. self.mle_regression = self.k_exog > 0 @@ -1354,4 +1353,5 @@ class DynamicFactorResultsWrapper(MLEResultsWrapper): _methods = {} _wrap_methods = wrap.union_dicts(MLEResultsWrapper._wrap_methods, _methods) -wrap.populate_wrapper(DynamicFactorResultsWrapper, DynamicFactorResults) +wrap.populate_wrapper(DynamicFactorResultsWrapper, # noqa:E305 + DynamicFactorResults) diff --git a/statsmodels/tsa/statespace/sarimax.py b/statsmodels/tsa/statespace/sarimax.py index 7e176bb6911..7953a5650bf 100644 --- a/statsmodels/tsa/statespace/sarimax.py +++ b/statsmodels/tsa/statespace/sarimax.py @@ -10,7 +10,6 @@ from warnings import warn import numpy as np -import pandas as pd from .kalman_filter import KalmanFilter from .mlemodel import MLEModel, MLEResults, MLEResultsWrapper from .tools import ( diff --git a/statsmodels/tsa/statespace/structural.py b/statsmodels/tsa/statespace/structural.py index 2783a805180..00dd05d2809 100644 --- a/statsmodels/tsa/statespace/structural.py +++ b/statsmodels/tsa/statespace/structural.py @@ -10,7 +10,6 @@ from statsmodels.compat.collections import OrderedDict import numpy as np -import pandas as pd from statsmodels.tsa.filters.hp_filter import hpfilter from statsmodels.tsa.tsatools import lagmat from .mlemodel import MLEModel, MLEResults, MLEResultsWrapper @@ -294,11 +293,10 @@ class UnobservedComponents(MLEModel): References ---------- - .. [1] Durbin, James, and Siem Jan Koopman. 2012. Time Series Analysis by State Space Methods: Second Edition. Oxford University Press. - """ + """ # noqa:E501 def __init__(self, endog, level=False, trend=False, seasonal=None, cycle=False, autoregressive=None, exog=None, irregular=False, diff --git a/statsmodels/tsa/statespace/tools.py b/statsmodels/tsa/statespace/tools.py index 236c9068450..7ddab4e8c98 100644 --- a/statsmodels/tsa/statespace/tools.py +++ b/statsmodels/tsa/statespace/tools.py @@ -246,7 +246,7 @@ def companion_matrix(polynomial): Given coefficients of a lag polynomial of the form: .. 
math::
-
+
     c(L) = c_0 + c_1 L + \dots + c_p L^p

     returns a matrix of the form
@@ -1896,7 +1896,6 @@ def copy_index_vector(a, b, index, inplace=False, prefix=None):
     return b

-
 def prepare_exog(exog):
     k_exog = 0
     if exog is not None:

From dcb90b7a416f8c4acefbdd795dd5496c995e4cbc Mon Sep 17 00:00:00 2001
From: Kerby Shedden 
Date: Mon, 5 Mar 2018 23:31:49 -0500
Subject: [PATCH 109/157] ENH: Allow MICE to work with regularized regression

---
 statsmodels/imputation/mice.py            | 19 +++++++++++++++----
 statsmodels/imputation/tests/test_mice.py |  8 ++++++++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/statsmodels/imputation/mice.py b/statsmodels/imputation/mice.py
index 233a7512371..9a70e5defba 100644
--- a/statsmodels/imputation/mice.py
+++ b/statsmodels/imputation/mice.py
@@ -200,6 +200,8 @@ def __init__(self, data, perturbation_method='gaussian',
         if data.columns.dtype != np.dtype('O'):
             raise ValueError("MICEData data column names should be string type")

+        self.regularized = dict()
+
         # Drop observations where all variables are missing.  This
         # also has the effect of copying the data frame.
         self.data = data.dropna(how='all').reset_index(drop=True)
@@ -257,7 +259,6 @@ def __init__(self, data, perturbation_method='gaussian',

         self.k_pmm = k_pmm

-
     def next_sample(self):
         """
         Returns the next imputed dataset in the imputation process.
@@ -308,7 +309,7 @@ def _split_indices(self, vec):

     def set_imputer(self, endog_name, formula=None, model_class=None,
                     init_kwds=None, fit_kwds=None, predict_kwds=None,
-                    k_pmm=20, perturbation_method=None):
+                    k_pmm=20, perturbation_method=None, regularized=False):
         """
         Specify the imputation process for a single variable.

@@ -337,6 +338,11 @@ def set_imputer(self, endog_name, formula=None, model_class=None,
             Either 'gaussian' or 'bootstrap'. Determines the method
             for perturbing parameters in the imputation model.  If
             None, uses the default specified at class initialization.
+        regularized : bool
+            If True, `fit_regularized` rather than `fit` is called
+            when fitting the imputation model for this variable.
+            When True for any variable, perturbation_method must be
+            set to 'boot'.
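+
+            For example, a minimal sketch (the DataFrame, the formula
+            and the penalty weight alpha are illustrative only)::
+
+                import numpy as np
+                import pandas as pd
+                from statsmodels.imputation import mice
+
+                rng = np.random.RandomState(34)
+                df = pd.DataFrame(rng.normal(size=(200, 3)),
+                                  columns=['x1', 'x2', 'y'])
+                df.loc[rng.rand(200) < 0.1, 'x1'] = np.nan
+
+                imp = mice.MICEData(df, perturbation_method='boot')
+                # alpha=1 with L1_wt=0 requests a ridge fit from
+                # OLS.fit_regularized (OLS is the default imputation
+                # model)
+                imp.set_imputer('x1', 'x2 + y', regularized=True,
+                                fit_kwds={'alpha': 1, 'L1_wt': 0})
+                imp.update_all()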
Notes ----- @@ -376,7 +382,7 @@ def set_imputer(self, endog_name, formula=None, model_class=None, self.perturbation_method[endog_name] = perturbation_method self.k_pmm = k_pmm - + self.regularized[endog_name] = regularized def _store_changes(self, col, vals): """ @@ -948,7 +954,12 @@ def _perturb_bootstrap(self, vname): klass = self.model_class[vname] self.models[vname] = klass(endog, exog, **init_kwds) - self.results[vname] = self.models[vname].fit(**fit_kwds) + + if vname in self.regularized and self.regularized[vname]: + self.results[vname] = self.models[vname].fit_regularized(**fit_kwds) + else: + self.results[vname] = self.models[vname].fit(**fit_kwds) + self.params[vname] = self.results[vname].params diff --git a/statsmodels/imputation/tests/test_mice.py b/statsmodels/imputation/tests/test_mice.py index 7b5874ba505..ecfc52dbe5c 100644 --- a/statsmodels/imputation/tests/test_mice.py +++ b/statsmodels/imputation/tests/test_mice.py @@ -301,6 +301,14 @@ def test_MICE1(self): assert(issubclass(x.__class__, RegressionResultsWrapper)) + def test_MICE1_regularized(self): + + df = gendat() + imp = mice.MICEData(df, perturbation_method='boot') + imp.set_imputer('x1', 'x2 + y', fit_kwds={'alpha': 1, 'L1_wt': 0}) + imp.update_all() + + def test_MICE2(self): from statsmodels.genmod.generalized_linear_model import GLMResultsWrapper From 42e7826399de6a7244e86ac5b9706b80408b5e06 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 5 Mar 2018 23:41:24 -0500 Subject: [PATCH 110/157] pep8 fixes to mice.py --- statsmodels/imputation/mice.py | 71 +++++++++++++--------------------- 1 file changed, 27 insertions(+), 44 deletions(-) diff --git a/statsmodels/imputation/mice.py b/statsmodels/imputation/mice.py index 9a70e5defba..3e22fe97394 100644 --- a/statsmodels/imputation/mice.py +++ b/statsmodels/imputation/mice.py @@ -118,7 +118,6 @@ import pandas as pd import numpy as np import patsy -import statsmodels from statsmodels.base.model import LikelihoodModelResults from statsmodels.regression.linear_model import OLS from collections import defaultdict @@ -198,7 +197,8 @@ def __init__(self, data, perturbation_method='gaussian', k_pmm=20, history_callback=None): if data.columns.dtype != np.dtype('O'): - raise ValueError("MICEData data column names should be string type") + msg = "MICEData data column names should be string type" + raise ValueError(msg) self.regularized = dict() @@ -212,7 +212,7 @@ def __init__(self, data, perturbation_method='gaussian', # Assign the same perturbation method for all variables. # Can be overriden when calling 'set_imputer'. - self.perturbation_method = defaultdict(lambda : + self.perturbation_method = defaultdict(lambda: perturbation_method) # Map from variable name to indices of observed/missing @@ -233,8 +233,8 @@ def __init__(self, data, perturbation_method='gaussian', # Map from variable names to init/fit args of the conditional # models. - self.init_kwds = defaultdict(lambda : dict()) - self.fit_kwds = defaultdict(lambda : dict()) + self.init_kwds = defaultdict(lambda: dict()) + self.fit_kwds = defaultdict(lambda: dict()) # Map from variable names to the model class. self.model_class = {} @@ -248,7 +248,7 @@ def __init__(self, data, perturbation_method='gaussian', # The order in which variables are imputed in each cycle. # Impute variables with the fewest missing values first. 
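        # (Illustration: with 3 missing values in x1, 40 in x2 and 12
        # in x3, the per-cycle update order produced here is x1, x3,
        # x2.)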
- vnames =list(data.columns) + vnames = list(data.columns) nmiss = [len(self.ix_miss[v]) for v in vnames] nmiss = np.asarray(nmiss) ii = np.argsort(nmiss) @@ -281,7 +281,6 @@ def next_sample(self): self.update_all(1) return self.data - def _initial_imputation(self): """ Use a PMM-like procedure for initial imputed values. @@ -297,7 +296,6 @@ def _initial_imputation(self): imp = di.loc[ix] self.data[col].fillna(imp, inplace=True) - def _split_indices(self, vec): null = pd.isnull(vec) ix_obs = np.flatnonzero(~null) @@ -306,7 +304,6 @@ def _split_indices(self, vec): raise ValueError("variable to be imputed has no observed values") return ix_obs, ix_miss - def set_imputer(self, endog_name, formula=None, model_class=None, init_kwds=None, fit_kwds=None, predict_kwds=None, k_pmm=20, perturbation_method=None, regularized=False): @@ -400,7 +397,6 @@ def _store_changes(self, col, vals): if len(ix) > 0: self.data[col].iloc[ix] = np.atleast_1d(vals) - def update_all(self, n_iter=1): """ Perform a specified number of MICE iterations. @@ -424,7 +420,6 @@ def update_all(self, n_iter=1): hv = self.history_callback(self) self.history.append(hv) - def get_split_data(self, vname): """ Return endog and exog for imputation of a given variable. @@ -475,8 +470,8 @@ def get_split_data(self, vname): kwds = self.predict_kwds[vname] predict_miss_kwds = self._process_kwds(kwds, ixo) - return endog_obs, exog_obs, exog_miss, predict_obs_kwds, predict_miss_kwds - + return (endog_obs, exog_obs, exog_miss, predict_obs_kwds, + predict_miss_kwds) def _process_kwds(self, kwds, ix): kwds = kwds.copy() @@ -491,7 +486,6 @@ def _process_kwds(self, kwds, ix): kwds[k] = mat return kwds - def get_fitting_data(self, vname): """ Return the data needed to fit a model for imputation. @@ -537,7 +531,6 @@ def get_fitting_data(self, vname): return endog, exog, init_kwds, fit_kwds - def plot_missing_pattern(self, ax=None, row_order="pattern", column_order="pattern", hide_complete_rows=False, @@ -586,7 +579,8 @@ def plot_missing_pattern(self, ax=None, row_order="pattern", elif column_order == "raw": ix = np.arange(len(cols)) else: - raise ValueError(column_order + " is not an allowed value for `column_order`.") + raise ValueError( + column_order + " is not an allowed value for `column_order`.") miss = miss[:, ix] cols = [cols[i] for i in ix] @@ -600,7 +594,8 @@ def plot_missing_pattern(self, ax=None, row_order="pattern", elif row_order == "raw": ix = np.arange(miss.shape[0]) else: - raise ValueError(row_order + " is not an allowed value for `row_order`.") + raise ValueError( + row_order + " is not an allowed value for `row_order`.") miss = miss[ix, :] if hide_complete_rows: @@ -629,7 +624,7 @@ def plot_missing_pattern(self, ax=None, row_order="pattern", cmap='gist_ncar_r') else: cmap = LinearSegmentedColormap.from_list("_", - ["white", "darkgrey"]) + ["white", "darkgrey"]) ax.imshow(miss, aspect="auto", interpolation="nearest", cmap=cmap) @@ -639,7 +634,6 @@ def plot_missing_pattern(self, ax=None, row_order="pattern", return fig - def plot_bivariate(self, col1_name, col2_name, lowess_args=None, lowess_min_n=40, jitter=None, plot_points=True, ax=None): @@ -751,7 +745,6 @@ def plot_bivariate(self, col1_name, col2_name, return fig - def plot_fit_obs(self, col_name, lowess_args=None, lowess_min_n=40, jitter=None, plot_points=True, ax=None): @@ -784,7 +777,6 @@ def plot_fit_obs(self, col_name, lowess_args=None, from statsmodels.graphics import utils as gutils from statsmodels.nonparametric.smoothers_lowess import lowess - import pandas as pd if 
lowess_args is None: lowess_args = {} @@ -849,7 +841,6 @@ def plot_fit_obs(self, col_name, lowess_args=None, return fig - def plot_imputed_hist(self, col_name, ax=None, imp_hist_args=None, obs_hist_args=None, all_hist_args=None): """ @@ -878,7 +869,6 @@ def plot_imputed_hist(self, col_name, ax=None, imp_hist_args=None, """ from statsmodels.graphics import utils as gutils - from matplotlib.colors import LinearSegmentedColormap if imp_hist_args is None: imp_hist_args = {} @@ -922,7 +912,6 @@ def plot_imputed_hist(self, col_name, ax=None, imp_hist_args=None, return fig - def _boot_kwds(self, kwds, rix): for k in kwds: @@ -936,7 +925,6 @@ def _boot_kwds(self, kwds, rix): return kwds - def _perturb_bootstrap(self, vname): """ Perturbs the model's parameters using a bootstrap. @@ -956,13 +944,13 @@ def _perturb_bootstrap(self, vname): self.models[vname] = klass(endog, exog, **init_kwds) if vname in self.regularized and self.regularized[vname]: - self.results[vname] = self.models[vname].fit_regularized(**fit_kwds) + self.results[vname] = ( + self.models[vname].fit_regularized(**fit_kwds)) else: self.results[vname] = self.models[vname].fit(**fit_kwds) self.params[vname] = self.results[vname].params - def _perturb_gaussian(self, vname): """ Gaussian perturbation of model parameters. @@ -982,7 +970,6 @@ def _perturb_gaussian(self, vname): mu = self.results[vname].params self.params[vname] = np.random.multivariate_normal(mean=mu, cov=cov) - def perturb_params(self, vname): if self.perturbation_method[vname] == "gaussian": @@ -992,13 +979,11 @@ def perturb_params(self, vname): else: raise ValueError("unknown perturbation method") - def impute(self, vname): # Wrap this in case we later add additional imputation # methods. self.impute_pmm(vname) - def update(self, vname): """ Impute missing values for a single variable. @@ -1015,7 +1000,6 @@ def update(self, vname): self.perturb_params(vname) self.impute(vname) - # work-around for inconsistent predict return values def _get_predicted(self, obj): @@ -1026,8 +1010,8 @@ def _get_predicted(self, obj): elif hasattr(obj, 'predicted_values'): return obj.predicted_values else: - raise ValueError("cannot obtain predicted values from %s" % obj.__class__) - + raise ValueError( + "cannot obtain predicted values from %s" % obj.__class__) def impute_pmm(self, vname): """ @@ -1041,14 +1025,16 @@ def impute_pmm(self, vname): k_pmm = self.k_pmm - endog_obs, exog_obs, exog_miss, predict_obs_kwds, predict_miss_kwds =\ - self.get_split_data(vname) + endog_obs, exog_obs, exog_miss, predict_obs_kwds, predict_miss_kwds = ( + self.get_split_data(vname)) # Predict imputed variable for both missing and non-missing # observations model = self.models[vname] - pendog_obs = model.predict(self.params[vname], exog_obs, **predict_obs_kwds) - pendog_miss = model.predict(self.params[vname], exog_miss, **predict_miss_kwds) + pendog_obs = model.predict(self.params[vname], exog_obs, + **predict_obs_kwds) + pendog_miss = model.predict(self.params[vname], exog_miss, + **predict_miss_kwds) pendog_obs = self._get_predicted(pendog_obs) pendog_miss = self._get_predicted(pendog_miss) @@ -1065,7 +1051,7 @@ def impute_pmm(self, vname): # Get the indices for the closest k_pmm values on # either side of the closest index. 
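        # (Illustration: with k_pmm = 3 and the closest match at sorted
        # position ix = 5, ixm enumerates candidate donor positions 2
        # through 7; one of the corresponding observed endog values is
        # then drawn as the imputed value.)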
- ixm = ix[:, None] + np.arange(-k_pmm, k_pmm)[None, :] + ixm = ix[:, None] + np.arange(-k_pmm, k_pmm)[None, :] # Account for boundary effects msk = np.nonzero((ixm < 0) | (ixm > len(endog_obs) - 1)) @@ -1126,6 +1112,7 @@ def impute_pmm(self, vname): >>> results.append(x) """ + class MICE(object): __doc__ = """\ @@ -1164,9 +1151,8 @@ class if for the 'analysis model'. Obtain a sequence of fitted analysis models without combining to obtain summary: %(mice_example_2)s - """ % {'mice_example_1' : _mice_example_1, - 'mice_example_2' : _mice_example_2} - + """ % {'mice_example_1': _mice_example_1, + 'mice_example_2': _mice_example_2} def __init__(self, model_formula, model_class, data, n_skip=3, init_kwds=None, fit_kwds=None): @@ -1180,7 +1166,6 @@ def __init__(self, model_formula, model_class, data, n_skip=3, self.init_kwds = init_kwds if init_kwds is not None else {} self.fit_kwds = fit_kwds if fit_kwds is not None else {} - def next_sample(self): """ Perform one complete MICE iteration. @@ -1222,7 +1207,6 @@ class and is not used in any subsequent calls to `combine`. return result - def fit(self, n_burnin=10, n_imputations=10): """ Fit a model using MICE. @@ -1247,7 +1231,6 @@ def fit(self, n_burnin=10, n_imputations=10): return self.combine() - def combine(self): """ Pools MICE imputation results. From 66e040e17a1dba53bb14e4eae525ab409c5e806a Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 6 Mar 2018 08:44:47 -0500 Subject: [PATCH 111/157] add release note --- docs/source/release/version0.9.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/release/version0.9.rst b/docs/source/release/version0.9.rst index e380d059f40..cdf0651164f 100644 --- a/docs/source/release/version0.9.rst +++ b/docs/source/release/version0.9.rst @@ -44,6 +44,8 @@ Documentation Other important improvements ---------------------------- +* MICE (multiple imputation) can use regularized model fitters in the + imputation step. From 4ebca76438c1d0e44f5b0482d44055d25cf1eb81 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 6 Mar 2018 11:04:12 -0500 Subject: [PATCH 112/157] try to improve test coverage metrics --- statsmodels/imputation/mice.py | 8 ++++++++ statsmodels/imputation/tests/test_mice.py | 16 +++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/statsmodels/imputation/mice.py b/statsmodels/imputation/mice.py index 3e22fe97394..4f57c1927a2 100644 --- a/statsmodels/imputation/mice.py +++ b/statsmodels/imputation/mice.py @@ -912,14 +912,22 @@ def plot_imputed_hist(self, col_name, ax=None, imp_hist_args=None, return fig + # Try to identify any auxiliary arrays (e.g. status vector in + # PHReg) that need to be bootstrapped along with exog and endog. 
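+    # (For example, when init_kwds carries a status vector of shape
+    # (n,), the same bootstrap row index rix used for endog and exog
+    # is applied to it below, keeping all three aligned.)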
def _boot_kwds(self, kwds, rix): for k in kwds: v = kwds[k] + + # This is only relevant for ndarrays if not isinstance(v, np.ndarray): continue + + # Handle 1d vectors if (v.ndim == 1) and (v.shape[0] == len(rix)): kwds[k] = v[rix] + + # Handle 2d arrays if (v.ndim == 2) and (v.shape[0] == len(rix)): kwds[k] = v[rix, :] diff --git a/statsmodels/imputation/tests/test_mice.py b/statsmodels/imputation/tests/test_mice.py index ecfc52dbe5c..695f7346f1c 100644 --- a/statsmodels/imputation/tests/test_mice.py +++ b/statsmodels/imputation/tests/test_mice.py @@ -166,13 +166,15 @@ def test_phreg(self): from statsmodels.duration.hazard_regression import PHReg - idata = mice.MICEData(df) - idata.set_imputer("time", "0 + x1 + x2", model_class=PHReg, - init_kwds={"status": mice.PatsyFormula("status")}, - predict_kwds={"pred_type": "hr"}) - - x = idata.next_sample() - assert(isinstance(x, pd.DataFrame)) + for pm in "gaussian", "boot": + idata = mice.MICEData(df, perturbation_method=pm) + idata.set_imputer("time", "0 + x1 + x2", model_class=PHReg, + init_kwds={"status": mice.PatsyFormula("status")}, + predict_kwds={"pred_type": "hr"}, + perturbation_method=pm) + + x = idata.next_sample() + assert(isinstance(x, pd.DataFrame)) def test_set_imputer(self): From 45c83df353c04721a670aa98bbcb63d1b830d449 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 6 Mar 2018 11:16:25 -0500 Subject: [PATCH 113/157] Further code coverage improvements --- statsmodels/imputation/tests/test_mice.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/statsmodels/imputation/tests/test_mice.py b/statsmodels/imputation/tests/test_mice.py index 695f7346f1c..25da07d6075 100644 --- a/statsmodels/imputation/tests/test_mice.py +++ b/statsmodels/imputation/tests/test_mice.py @@ -166,8 +166,13 @@ def test_phreg(self): from statsmodels.duration.hazard_regression import PHReg + # Save the dataset size at each iteration. + hist = [] + def cb(imp): + hist.append(imp.data.shape) + for pm in "gaussian", "boot": - idata = mice.MICEData(df, perturbation_method=pm) + idata = mice.MICEData(df, perturbation_method=pm, history_callback=cb) idata.set_imputer("time", "0 + x1 + x2", model_class=PHReg, init_kwds={"status": mice.PatsyFormula("status")}, predict_kwds={"pred_type": "hr"}, @@ -176,6 +181,7 @@ def test_phreg(self): x = idata.next_sample() assert(isinstance(x, pd.DataFrame)) + assert(all([x == (299, 4) for x in hist])) def test_set_imputer(self): # Test with specified perturbation method. 
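The looped PHReg test above can be reproduced outside the test suite roughly as follows (a minimal stand-alone sketch: the synthetic data, the censoring rate and the column names are illustrative, and the shapes recorded by the callback depend on the generated missingness):

import numpy as np
import pandas as pd
from statsmodels.imputation import mice
from statsmodels.duration.hazard_regression import PHReg

rng = np.random.RandomState(0)
n = 200
df = pd.DataFrame({'x1': rng.normal(size=n), 'x2': rng.normal(size=n)})
df['time'] = np.exp(df['x1'] + rng.normal(size=n))
df['status'] = rng.randint(0, 2, n)  # 1 = event observed, 0 = censored
df.loc[rng.rand(n) < 0.1, 'time'] = np.nan  # missing survival times

hist = []
def cb(imp):
    hist.append(imp.data.shape)  # called once per update cycle

imp = mice.MICEData(df, perturbation_method='boot', history_callback=cb)
# PatsyFormula defers evaluation of `status`, so the bootstrap in
# _boot_kwds resamples it together with endog and exog.
imp.set_imputer('time', '0 + x1 + x2', model_class=PHReg,
                init_kwds={'status': mice.PatsyFormula('status')},
                predict_kwds={'pred_type': 'hr'})
imputed = imp.next_sample()
print(hist)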
From a3f05a0a3effbbb7e3abc40fc423210c73b1210f Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 22 Mar 2018 16:11:36 -0700 Subject: [PATCH 114/157] remove unused imports of assert_raises --- statsmodels/discrete/tests/test_count_model.py | 2 +- statsmodels/distributions/tests/test_discrete.py | 2 +- .../tsa/regime_switching/tests/test_markov_autoregression.py | 2 +- .../tsa/regime_switching/tests/test_markov_regression.py | 2 +- statsmodels/tsa/statespace/tests/test_impulse_responses.py | 3 +-- statsmodels/tsa/statespace/tests/test_prediction.py | 2 +- statsmodels/tsa/statespace/tests/test_simulate.py | 3 +-- statsmodels/tsa/statespace/tests/test_simulation_smoothing.py | 2 +- statsmodels/tsa/statespace/tests/test_smoothing.py | 2 +- statsmodels/tsa/tests/test_bds.py | 2 +- statsmodels/tsa/tests/test_holtwinters.py | 2 +- 11 files changed, 11 insertions(+), 13 deletions(-) diff --git a/statsmodels/discrete/tests/test_count_model.py b/statsmodels/discrete/tests/test_count_model.py index 79240cb9a58..d517f067380 100644 --- a/statsmodels/discrete/tests/test_count_model.py +++ b/statsmodels/discrete/tests/test_count_model.py @@ -2,7 +2,7 @@ from __future__ import division import os import numpy as np -from numpy.testing import (assert_, assert_raises, assert_almost_equal, +from numpy.testing import (assert_, assert_almost_equal, assert_equal, assert_array_equal, assert_allclose, assert_array_less) diff --git a/statsmodels/distributions/tests/test_discrete.py b/statsmodels/distributions/tests/test_discrete.py index c98697514e9..341fd478647 100644 --- a/statsmodels/distributions/tests/test_discrete.py +++ b/statsmodels/distributions/tests/test_discrete.py @@ -2,7 +2,7 @@ import numpy as np import statsmodels.api as sm from scipy.stats import poisson, nbinom -from numpy.testing import (assert_, assert_raises, assert_almost_equal, +from numpy.testing import (assert_, assert_almost_equal, assert_equal, assert_array_equal, assert_allclose, assert_array_less) diff --git a/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py b/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py index 96204fcd883..a0de0d48a8d 100644 --- a/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py +++ b/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py @@ -13,7 +13,7 @@ import pandas as pd from statsmodels.tools import add_constant from statsmodels.tsa.regime_switching import markov_autoregression -from numpy.testing import assert_equal, assert_allclose, assert_raises +from numpy.testing import assert_equal, assert_allclose current_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/statsmodels/tsa/regime_switching/tests/test_markov_regression.py b/statsmodels/tsa/regime_switching/tests/test_markov_regression.py index 266ff6cccc1..378c4ffd239 100644 --- a/statsmodels/tsa/regime_switching/tests/test_markov_regression.py +++ b/statsmodels/tsa/regime_switching/tests/test_markov_regression.py @@ -12,7 +12,7 @@ import pandas as pd from statsmodels.tsa.regime_switching import (markov_switching, markov_regression) -from numpy.testing import assert_equal, assert_allclose, assert_raises +from numpy.testing import assert_equal, assert_allclose current_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/statsmodels/tsa/statespace/tests/test_impulse_responses.py b/statsmodels/tsa/statespace/tests/test_impulse_responses.py index d66962e5d35..0846e709ee0 100644 --- a/statsmodels/tsa/statespace/tests/test_impulse_responses.py +++ 
b/statsmodels/tsa/statespace/tests/test_impulse_responses.py @@ -15,8 +15,7 @@ from statsmodels.tsa.statespace import (sarimax, structural, varmax, dynamic_factor) -from numpy.testing import (assert_allclose, assert_almost_equal, assert_equal, - assert_raises) +from numpy.testing import (assert_allclose, assert_almost_equal, assert_equal) def test_sarimax(): diff --git a/statsmodels/tsa/statespace/tests/test_prediction.py b/statsmodels/tsa/statespace/tests/test_prediction.py index 8f2d9ee4146..8b25caa3422 100644 --- a/statsmodels/tsa/statespace/tests/test_prediction.py +++ b/statsmodels/tsa/statespace/tests/test_prediction.py @@ -12,7 +12,7 @@ import warnings from statsmodels.tsa.statespace import sarimax -from numpy.testing import assert_equal, assert_allclose, assert_raises +from numpy.testing import assert_equal, assert_allclose def test_predict_dates(): diff --git a/statsmodels/tsa/statespace/tests/test_simulate.py b/statsmodels/tsa/statespace/tests/test_simulate.py index e165453c3a4..5436bf46bc5 100644 --- a/statsmodels/tsa/statespace/tests/test_simulate.py +++ b/statsmodels/tsa/statespace/tests/test_simulate.py @@ -16,8 +16,7 @@ from statsmodels.tsa.statespace import (sarimax, structural, varmax, dynamic_factor) from statsmodels.tsa.statespace.tools import compatibility_mode -from numpy.testing import (assert_allclose, assert_almost_equal, assert_equal, - assert_raises) +from numpy.testing import (assert_allclose, assert_almost_equal, assert_equal) def test_arma_lfilter(): diff --git a/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py b/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py index 32f496f97bf..e46137ed5eb 100644 --- a/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py +++ b/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py @@ -21,7 +21,7 @@ SMOOTH_UNIVARIATE) from statsmodels.tsa.statespace.simulation_smoother import ( SIMULATION_STATE, SIMULATION_DISTURBANCE, SIMULATION_ALL) -from numpy.testing import assert_allclose, assert_almost_equal, assert_equal, assert_raises +from numpy.testing import assert_allclose, assert_almost_equal, assert_equal import pytest current_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/statsmodels/tsa/statespace/tests/test_smoothing.py b/statsmodels/tsa/statespace/tests/test_smoothing.py index 1cd8df845ff..cc4dad979cf 100644 --- a/statsmodels/tsa/statespace/tests/test_smoothing.py +++ b/statsmodels/tsa/statespace/tests/test_smoothing.py @@ -28,7 +28,7 @@ from statsmodels.tsa.statespace.kalman_smoother import ( SMOOTH_CONVENTIONAL, SMOOTH_CLASSICAL, SMOOTH_ALTERNATIVE, SMOOTH_UNIVARIATE) -from numpy.testing import assert_allclose, assert_almost_equal, assert_equal, assert_raises +from numpy.testing import assert_allclose, assert_almost_equal, assert_equal current_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/statsmodels/tsa/tests/test_bds.py b/statsmodels/tsa/tests/test_bds.py index ea9aaab2208..85f52b3b774 100644 --- a/statsmodels/tsa/tests/test_bds.py +++ b/statsmodels/tsa/tests/test_bds.py @@ -16,7 +16,7 @@ import numpy as np import pandas as pd from statsmodels.tsa.stattools import bds -from numpy.testing import assert_almost_equal, assert_equal, assert_raises +from numpy.testing import assert_almost_equal, assert_equal from numpy import genfromtxt DECIMAL_8 = 8 diff --git a/statsmodels/tsa/tests/test_holtwinters.py b/statsmodels/tsa/tests/test_holtwinters.py index c6e732479d4..80bff0a1e4f 100644 --- a/statsmodels/tsa/tests/test_holtwinters.py +++ 
b/statsmodels/tsa/tests/test_holtwinters.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd -from numpy.testing import assert_almost_equal, assert_equal, assert_raises +from numpy.testing import assert_almost_equal, assert_equal from statsmodels.tsa.holtwinters import (ExponentialSmoothing, SimpleExpSmoothing, Holt) from pandas import DataFrame, DatetimeIndex From 14573e56bd566b2df21312efe47530f9ad0f21e8 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 22 Mar 2018 16:18:10 -0700 Subject: [PATCH 115/157] replace @dec with pytest.mark --- .../tests/test_kernel_density.py | 24 +++++++++---------- .../tests/test_kernel_regression.py | 16 ++++++------- statsmodels/regression/tests/test_lme.py | 13 +++++----- .../stats/libqsturng/tests/test_qsturng.py | 9 +++---- statsmodels/tsa/tests/test_arima.py | 6 ++--- statsmodels/tsa/tests/test_arima_process.py | 5 ++-- statsmodels/tsa/tests/test_stattools.py | 5 ++-- 7 files changed, 40 insertions(+), 38 deletions(-) diff --git a/statsmodels/nonparametric/tests/test_kernel_density.py b/statsmodels/nonparametric/tests/test_kernel_density.py index 7d16151d4d4..81bd436fcbb 100644 --- a/statsmodels/nonparametric/tests/test_kernel_density.py +++ b/statsmodels/nonparametric/tests/test_kernel_density.py @@ -1,6 +1,6 @@ import numpy as np import numpy.testing as npt -import numpy.testing.decorators as dec +import pytest import statsmodels.api as sm nparam = sm.nonparametric @@ -116,7 +116,7 @@ def test_weighted_pdf_non_fft(self): class TestKDEMultivariate(KDETestBase): - @dec.slow + @pytest.mark.slow def test_pdf_mixeddata_CV_LS(self): dens_u = nparam.KDEMultivariate(data=[self.c1, self.o, self.o2], var_type='coo', bw='cv_ls') @@ -150,7 +150,7 @@ def test_pdf_mixeddata_CV_ML(self): R_bw = [1.021563, 2.806409e-14, 0.5142077] npt.assert_allclose(dens_ml.bw, R_bw, atol=0.1, rtol=0.1) - @dec.slow + @pytest.mark.slow def test_pdf_continuous(self): # Test for only continuous data dens = nparam.KDEMultivariate(data=[self.growth, self.Italy_gdp], @@ -177,7 +177,7 @@ def test_pdf_ordered(self): # lower tol here. 
only 2nd decimal npt.assert_allclose(sm_result, R_result, atol=1e-1) - @dec.slow + @pytest.mark.slow def test_unordered_CV_LS(self): dens = nparam.KDEMultivariate(data=[self.growth, self.oecd], var_type='cu', bw='cv_ls') @@ -199,7 +199,7 @@ def test_mixeddata_cdf(self, data_predict=None): R_result = [0.54700010, 0.65907039, 0.89676865, 0.74132941, 0.25291361] npt.assert_allclose(sm_result, R_result, atol=1e-3) - @dec.slow + @pytest.mark.slow def test_continuous_cvls_efficient(self): nobs = 400 np.random.seed(12345) @@ -215,7 +215,7 @@ def test_continuous_cvls_efficient(self): bw = np.array([0.3404, 0.1666]) npt.assert_allclose(bw, dens_efficient.bw, atol=0.1, rtol=0.2) - @dec.slow + @pytest.mark.slow def test_continuous_cvml_efficient(self): nobs = 400 np.random.seed(12345) @@ -232,7 +232,7 @@ def test_continuous_cvml_efficient(self): bw = np.array([0.4471, 0.2861]) npt.assert_allclose(bw, dens_efficient.bw, atol=0.1, rtol = 0.2) - @dec.slow + @pytest.mark.slow def test_efficient_notrandom(self): nobs = 400 np.random.seed(12345) @@ -262,7 +262,7 @@ def test_efficient_user_specified_bw(self): class TestKDEMultivariateConditional(KDETestBase): - @dec.slow + @pytest.mark.slow def test_mixeddata_CV_LS(self): dens_ls = nparam.KDEMultivariateConditional(endog=[self.Italy_gdp], exog=[self.Italy_year], @@ -279,7 +279,7 @@ def test_continuous_CV_ML(self): # Results from R npt.assert_allclose(dens_ml.bw, [0.5341164, 0.04510836], atol=1e-3) - @dec.slow + @pytest.mark.slow def test_unordered_CV_LS(self): dens_ls = nparam.KDEMultivariateConditional(endog=[self.oecd], exog=[self.growth], @@ -299,7 +299,7 @@ def test_pdf_continuous(self): R_result = [11.97964, 12.73290, 13.23037, 13.46438, 12.22779] npt.assert_allclose(sm_result, R_result, atol=1e-3) - @dec.slow + @pytest.mark.slow def test_pdf_mixeddata(self): dens = nparam.KDEMultivariateConditional(endog=[self.Italy_gdp], exog=[self.Italy_year], @@ -340,7 +340,7 @@ def test_continuous_cdf(self): R_result = [0.81304920, 0.95046942, 0.86878727, 0.71961748, 0.38685423] npt.assert_allclose(sm_result, R_result, atol=1e-3) - @dec.slow + @pytest.mark.slow def test_mixeddata_cdf(self): dens = nparam.KDEMultivariateConditional(endog=[self.Italy_gdp], exog=[self.Italy_year], @@ -352,7 +352,7 @@ def test_mixeddata_cdf(self): expected = [0.83378885, 0.97684477, 0.90655143, 0.79393161, 0.43629083] npt.assert_allclose(sm_result, expected, atol=0, rtol=1e-5) - @dec.slow + @pytest.mark.slow def test_continuous_cvml_efficient(self): nobs = 500 np.random.seed(12345) diff --git a/statsmodels/nonparametric/tests/test_kernel_regression.py b/statsmodels/nonparametric/tests/test_kernel_regression.py index 15c4189fd28..38ade366d9e 100644 --- a/statsmodels/nonparametric/tests/test_kernel_regression.py +++ b/statsmodels/nonparametric/tests/test_kernel_regression.py @@ -1,8 +1,7 @@ -from statsmodels.compat.testing import skip +import pytest import numpy as np import numpy.testing as npt -import numpy.testing.decorators as dec import statsmodels.api as sm nparam = sm.nonparametric @@ -174,7 +173,8 @@ def test_mixed_mfx_ll_cvls(self, file_name='RegData.csv'): sm_R2 = model.r_squared() # TODO: add expected result npt.assert_allclose(sm_mfx[0,:], [b1,b2,b3], rtol=2e-1) - @skip("Test doesn't make much sense - always passes with very small bw.") + @pytest.mark.skip(reason="Test doesn't make much sense - always passes " + "with very small bw.") def test_mfx_nonlinear_ll_cvls(self, file_name='RegData.csv'): #FIXME nobs = 200 @@ -199,7 +199,7 @@ def test_mfx_nonlinear_ll_cvls(self, 
file_name='RegData.csv'): #npt.assert_allclose(sm_mfx[0:10,1], mfx2[0:10], rtol=2e-1) npt.assert_allclose(sm_mean, Y, rtol = 2e-1) - @dec.slow + @pytest.mark.slow def test_continuous_cvls_efficient(self): nobs = 500 np.random.seed(12345) @@ -219,7 +219,7 @@ def test_continuous_cvls_efficient(self): var_type='c', bw='cv_ls') npt.assert_allclose(model.bw, model_efficient.bw, atol=5e-2, rtol=1e-1) - @dec.slow + @pytest.mark.slow def test_censored_ll_cvls(self): nobs = 200 np.random.seed(1234) @@ -234,7 +234,7 @@ def test_censored_ll_cvls(self): sm_mean, sm_mfx = model.fit() npt.assert_allclose(sm_mfx[0,:], [1.2, -0.9], rtol = 2e-1) - @dec.slow + @pytest.mark.slow def test_continuous_lc_aic(self): nobs = 200 np.random.seed(1234) @@ -255,7 +255,7 @@ def test_continuous_lc_aic(self): bw_expected = [0.3987821, 0.50933458] npt.assert_allclose(model.bw, bw_expected, rtol=1e-3) - @dec.slow + @pytest.mark.slow def test_significance_continuous(self): nobs = 250 np.random.seed(12345) @@ -279,7 +279,7 @@ def test_significance_continuous(self): sig_var2 = model.sig_test([1], nboot=nboot) # H0: b2 = 0 npt.assert_equal(sig_var2 == 'Not Significant', True) - @dec.slow + @pytest.mark.slow def test_significance_discrete(self): nobs = 200 np.random.seed(12345) diff --git a/statsmodels/regression/tests/test_lme.py b/statsmodels/regression/tests/test_lme.py index 174d76eb19e..8a732c09123 100644 --- a/statsmodels/regression/tests/test_lme.py +++ b/statsmodels/regression/tests/test_lme.py @@ -1,4 +1,3 @@ -from statsmodels.compat.testing import skipif import warnings import numpy as np @@ -8,7 +7,7 @@ assert_) from . import lme_r_results from statsmodels.base import _penalties as penalties -from numpy.testing import dec +import pytest import statsmodels.tools.numdiff as nd import os import csv @@ -88,7 +87,7 @@ def f(x): class TestMixedLM(object): # Test analytic scores and Hessian using numeric differentiation - @dec.slow + @pytest.mark.slow def test_compare_numdiff(self): n_grp = 200 @@ -215,7 +214,7 @@ def test_profile_inference(self): dist_high=0.5, num_high=3) # Fails on old versions of scipy/numpy - @skipif(old_scipy, 'SciPy too old') + @pytest.mark.skipif(old_scipy, reason='SciPy too old') def test_vcomp_1(self): # Fit the same model using constrained random effects and # variance components. @@ -313,7 +312,7 @@ def test_vcomp_2(self): assert_allclose(result1.bse.iloc[0:3], [ 0.12610, 0.03938, 0.03848], rtol=1e-3) - @skipif(old_scipy, 'SciPy too old') + @pytest.mark.skipif(old_scipy, reason='SciPy too old') def test_vcomp_3(self): # Test a model with vcomp but no other random effects, using formulas. @@ -339,7 +338,7 @@ def test_vcomp_3(self): np.r_[-0.101549, 0.028613, -0.224621, -0.126295], rtol=1e-3) - @skipif(old_scipy, 'SciPy too old') + @pytest.mark.skipif(old_scipy, reason='SciPy too old') def test_sparse(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) @@ -663,7 +662,7 @@ def test_formulas(self): rslt5 = mod5.fit() assert_almost_equal(rslt4.params, rslt5.params) - @skipif(old_scipy, 'SciPy too old') + @pytest.mark.skipif(old_scipy, reason='SciPy too old') def test_regularized(self): np.random.seed(3453) diff --git a/statsmodels/stats/libqsturng/tests/test_qsturng.py b/statsmodels/stats/libqsturng/tests/test_qsturng.py index de3fba7f56c..50ec2be54dc 100644 --- a/statsmodels/stats/libqsturng/tests/test_qsturng.py +++ b/statsmodels/stats/libqsturng/tests/test_qsturng.py @@ -2,7 +2,7 @@ # This software is funded in part by NIH Grant P20 RR016454. 
"""The 'handful' tests are intended to aid refactoring. The tests with the -@dec.slow are empirical (test within error limits) and intended to more +@pytest.mark..slow are empirical (test within error limits) and intended to more extensively ensure the stability and accuracy of the functions""" from statsmodels.compat.python import iterkeys, lzip, lmap @@ -10,9 +10,10 @@ from numpy.testing import rand, assert_, assert_equal, \ assert_almost_equal, assert_array_almost_equal, assert_array_equal, \ - assert_approx_equal, assert_raises, run_module_suite, dec + assert_approx_equal, assert_raises, run_module_suite import numpy as np +import pytest from statsmodels.stats.libqsturng import qsturng, psturng,p_keys,v_keys @@ -119,7 +120,7 @@ def test_handful_to_ch(self): for p,r,v,q in cases: assert_almost_equal(q, qsturng(p,r,v), 5) - @dec.slow + @pytest.mark.slow def test_10000_to_ch(self): import os curdir = os.path.dirname(os.path.abspath(__file__)) @@ -182,7 +183,7 @@ def test_handful_to_known_values(self): for p,r,v,q in cases: assert_almost_equal(1.-p, psturng(q,r,v), 5) - @dec.slow + @pytest.mark.slow def test_100_random_values(self): n = 100 ps = np.random.random(n)*(.999 - .1) + .1 diff --git a/statsmodels/tsa/tests/test_arima.py b/statsmodels/tsa/tests/test_arima.py index 8bb740a426d..a55277df97b 100644 --- a/statsmodels/tsa/tests/test_arima.py +++ b/statsmodels/tsa/tests/test_arima.py @@ -7,7 +7,7 @@ import numpy as np from numpy.testing import (assert_almost_equal, assert_, assert_allclose, - assert_raises, dec) + assert_raises) import pandas as pd from pandas import PeriodIndex, DatetimeIndex import pytest @@ -222,7 +222,7 @@ def setup_class(cls): cls.res2 = results_arma.Y_arma14() -@dec.slow +@pytest.mark.slow class Test_Y_ARMA41_NoConst(CheckArmaResultsMixin, CheckForecastMixin): @classmethod def setup_class(cls): @@ -475,7 +475,7 @@ def test_reset_trend(): assert_equal(len(res1.params), len(res2.params)+1) -@dec.slow +@pytest.mark.slow def test_start_params_bug(): data = np.array([1368., 1187, 1090, 1439, 2362, 2783, 2869, 2512, 1804, 1544, 1028, 869, 1737, 2055, 1947, 1618, 1196, 867, 997, 1862, 2525, diff --git a/statsmodels/tsa/tests/test_arima_process.py b/statsmodels/tsa/tests/test_arima_process.py index 25d88444489..8427cd263c7 100644 --- a/statsmodels/tsa/tests/test_arima_process.py +++ b/statsmodels/tsa/tests/test_arima_process.py @@ -5,10 +5,11 @@ from statsmodels.tsa.arima_model import ARMA from unittest import TestCase +import pytest import numpy as np from numpy.testing import (assert_array_almost_equal, assert_almost_equal, assert_allclose, - assert_equal, assert_raises, assert_, dec) + assert_equal, assert_raises, assert_) from statsmodels.tsa.arima_process import (arma_generate_sample, arma_acovf, arma_acf, arma_impulse_response, lpol_fiar, lpol_fima, @@ -238,7 +239,7 @@ def test_process_multiplication(self): assert_raises(TypeError, process1.__mul__, [3]) - @dec.skipif(NP16) + @pytest.mark.skipif(NP16, reason='numpy<1.7') def test_str_repr(self): process1 = ArmaProcess.from_coeffs([.9], [.2]) out = process1.__str__() diff --git a/statsmodels/tsa/tests/test_stattools.py b/statsmodels/tsa/tests/test_stattools.py index c87446eadc6..410cbdddd85 100644 --- a/statsmodels/tsa/tests/test_stattools.py +++ b/statsmodels/tsa/tests/test_stattools.py @@ -8,8 +8,9 @@ arma_order_select_ic) import numpy as np import pandas as pd +import pytest from numpy.testing import (assert_almost_equal, assert_equal, assert_warns, - assert_raises, dec, assert_, assert_allclose) + assert_raises, 
assert_, assert_allclose) from statsmodels.datasets import macrodata, sunspots from pandas import Series, DatetimeIndex, DataFrame import os @@ -491,7 +492,7 @@ def test_acovf_fft_vs_convolution(): F2 = acovf(q, demean=demean, unbiased=unbiased, fft=False) assert_almost_equal(F1, F2, decimal=7) -@dec.slow +@pytest.mark.slow def test_arma_order_select_ic(): # smoke test, assumes info-criteria are right from statsmodels.tsa.arima_process import arma_generate_sample From 2db4457c3abae198465468f1e0ad7746fb1c2a6f Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 24 Mar 2018 10:20:15 -0700 Subject: [PATCH 116/157] deletion reverted by rebase --- .../tsa/regime_switching/tests/test_markov_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/tsa/regime_switching/tests/test_markov_regression.py b/statsmodels/tsa/regime_switching/tests/test_markov_regression.py index 378c4ffd239..81ca40aefe9 100644 --- a/statsmodels/tsa/regime_switching/tests/test_markov_regression.py +++ b/statsmodels/tsa/regime_switching/tests/test_markov_regression.py @@ -12,7 +12,7 @@ import pandas as pd from statsmodels.tsa.regime_switching import (markov_switching, markov_regression) -from numpy.testing import assert_equal, assert_allclose +from numpy.testing import assert_allclose current_path = os.path.dirname(os.path.abspath(__file__)) From 786cfce42e79e806b7f3c8e7fc90ef14f06b55e6 Mon Sep 17 00:00:00 2001 From: Josef Date: Sat, 24 Mar 2018 15:25:28 -0400 Subject: [PATCH 117/157] ENH: inter_rater add randolph's kappa to fleiss_kappa, see #4387 --- statsmodels/stats/inter_rater.py | 51 +++++++++++++++++---- statsmodels/stats/tests/test_inter_rater.py | 43 ++++++++++++++--- 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/statsmodels/stats/inter_rater.py b/statsmodels/stats/inter_rater.py index ed1110688fa..9a83967acf1 100644 --- a/statsmodels/stats/inter_rater.py +++ b/statsmodels/stats/inter_rater.py @@ -35,7 +35,7 @@ """ - +from __future__ import division import numpy as np from scipy import stats #get rid of this? need only norm.sf @@ -192,27 +192,57 @@ def to_table(data, bins=None): return tt[0], bins_ -def fleiss_kappa(table): - '''Fleiss' kappa multi-rater agreement measure +def fleiss_kappa(table, method='fleiss'): + """Fleiss' and Randolph's kappa multi-rater agreement measure Parameters ---------- table : array_like, 2-D assumes subjects in rows, and categories in columns + method : string + Method 'fleiss' returns Fleiss' kappa which uses the sample margin + to define the chance outcome. + Method 'randolph' or 'uniform' (only first 4 letters are needed) + returns Randolph's (2005) multirater kappa which assumes a uniform + distribution of the categories to define the chance outcome. Returns ------- kappa : float - Fleiss's kappa statistic for inter rater agreement + Fleiss's or Randolph's kappa statistic for inter rater agreement Notes ----- - coded from Wikipedia page - http://en.wikipedia.org/wiki/Fleiss%27_kappa + no variance or hypothesis tests yet - no variance or tests yet + Interrater agreement measures like Fleiss's kappa measure agreement relative + to chance agreement. Different authors have proposed ways of defining + these chance agreements. Fleiss' is based on the marginal sample distribution + of categories, while Randolph uses a uniform distribution of categories as + benchmark. Warrens (2010) showed that Randolph's kappa is always larger or + equal to Fleiss' kappa. 
Under some commonly observed conditions, Fleiss' and
+    Randolph's kappa provide lower and upper bounds for two similar kappa-like
+    measures by Light (1971) and Hubert (1977).
+
+    References
+    ----------
+    Wikipedia http://en.wikipedia.org/wiki/Fleiss%27_kappa
+
+    Fleiss, Joseph L. 1971. "Measuring Nominal Scale Agreement among Many
+    Raters." Psychological Bulletin 76 (5): 378-82.
+    https://doi.org/10.1037/h0031619.
+
+    Randolph, Justus J. 2005. "Free-Marginal Multirater Kappa (multirater
+    K [free]): An Alternative to Fleiss' Fixed-Marginal Multirater Kappa."
+    Presented at the Joensuu Learning and Instruction Symposium, vol. 2005
+    https://eric.ed.gov/?id=ED490661
+
+    Warrens, Matthijs J. 2010. "Inequalities between Multi-Rater Kappas."
+    Advances in Data Analysis and Classification 4 (4): 271-86.
+    https://doi.org/10.1007/s11634-010-0073-4.
+
+    """
-    '''
     table = 1.0 * np.asarray(table)   #avoid integer division
     n_sub, n_cat = table.shape
     n_total = table.sum()
@@ -228,7 +258,10 @@ def fleiss_kappa(table):
     p_rat = (table2.sum(1) - n_rat) / (n_rat * (n_rat - 1.))
     p_mean = p_rat.mean()

-    p_mean_exp = (p_cat*p_cat).sum()
+    if method == 'fleiss':
+        p_mean_exp = (p_cat*p_cat).sum()
+    elif method.startswith('rand') or method.startswith('unif'):
+        p_mean_exp = 1 / n_cat

     kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)
     return kappa
diff --git a/statsmodels/stats/tests/test_inter_rater.py b/statsmodels/stats/tests/test_inter_rater.py
index 9019e0dca58..f4058d1b501 100644
--- a/statsmodels/stats/tests/test_inter_rater.py
+++ b/statsmodels/stats/tests/test_inter_rater.py
@@ -7,7 +7,7 @@
 """
 import numpy as np
-from numpy.testing import assert_almost_equal, assert_equal
+from numpy.testing import assert_almost_equal, assert_equal, assert_allclose

 from statsmodels.stats.inter_rater import (fleiss_kappa, cohens_kappa,
                                            to_table, aggregate_raters)
@@ -76,6 +76,41 @@ def test_fleiss_kappa():
     assert_almost_equal(fleiss_kappa(table1), kappa_wp, decimal=3)


+def test_fleis_randolph():
+    # reference numbers from online calculator
+    # http://justusrandolph.net/kappa/#dInfo
+    table = [[7, 0], [7, 0]]
+    assert_equal(fleiss_kappa(table, method='unif'), 1)
+
+    table = [[6.99, 0.01], [6.99, 0.01]]
+    # % Overall Agreement 0.996671
+    # Fixed Marginal Kappa: -0.166667
+    # Free Marginal Kappa: 0.993343
+    assert_allclose(fleiss_kappa(table), -0.166667, atol=6e-6)
+    assert_allclose(fleiss_kappa(table, method='unif'), 0.993343, atol=6e-6)
+
+    table = [[7, 1], [3, 5]]
+    # % Overall Agreement 0.607143
+    # Fixed Marginal Kappa: 0.161905
+    # Free Marginal Kappa: 0.214286
+    assert_allclose(fleiss_kappa(table, method='fleiss'), 0.161905, atol=6e-6)
+    assert_allclose(fleiss_kappa(table, method='randolph'), 0.214286, atol=6e-6)
+
+    table = [[7, 0], [0, 7]]
+    # % Overall Agreement 1.000000
+    # Fixed Marginal Kappa: 1.000000
+    # Free Marginal Kappa: 1.000000
+    assert_allclose(fleiss_kappa(table), 1)
+    assert_allclose(fleiss_kappa(table, method='uniform'), 1)
+
+    table = [[6, 1, 0], [0, 7, 0]]
+    # % Overall Agreement 0.857143
+    # Fixed Marginal Kappa: 0.708333
+    # Free Marginal Kappa: 0.785714
+    assert_allclose(fleiss_kappa(table), 0.708333, atol=6e-6)
+    assert_allclose(fleiss_kappa(table, method='rand'), 0.785714, atol=6e-6)
+
+
 class CheckCohens(object):

     def test_results(self):
@@ -316,9 +351,3 @@ def test_aggregate_raters():
     resf = aggregate_raters(data)
     colsum = np.array([26, 26, 30, 55, 43])
     assert_equal(resf[0].sum(0), colsum)
-
-
-if __name__ == '__main__':
-    import pytest
-    pytest.main([__file__, '-vvs', '-x', '--pdb'])
-
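
For a quick sense of the two chance corrections, a minimal sketch using the
2x2 rating table from the tests above (plain numpy plus the function changed
in this patch; the expected values are the reference numbers quoted in the
test comments):

    import numpy as np
    from statsmodels.stats.inter_rater import fleiss_kappa

    # rows are subjects, columns are categories; each entry counts the
    # raters (7 per subject here) who chose that category for that subject
    table = np.asarray([[7, 1], [3, 5]])
    fleiss_kappa(table, method='fleiss')    # approx. 0.161905 (sample margins)
    fleiss_kappa(table, method='randolph')  # approx. 0.214286 (uniform margins)

From 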
8a7d0fc882639ace881be9bf688d479697b20071 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sat, 10 Feb 2018 13:31:35 -0500 Subject: [PATCH 118/157] Initial commit to fix #4213 --- docs/source/release/version0.9.rst | 3 +++ statsmodels/base/elastic_net.py | 7 +++++++ statsmodels/regression/tests/test_regression.py | 12 +++++++----- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/docs/source/release/version0.9.rst b/docs/source/release/version0.9.rst index e380d059f40..91f6f467cad 100644 --- a/docs/source/release/version0.9.rst +++ b/docs/source/release/version0.9.rst @@ -56,6 +56,9 @@ While most bugs are usability problems, there is now a new label `type-bug-wrong for bugs that cause that silently incorrect numbers are returned. https://github.com/statsmodels/statsmodels/issues?q=label%3Atype-bug-wrong+is%3Aclosed +* Refitting elastic net regularized models using the `refit=True` + option now returns the unregularized parameters for the coefficients + selected by the regularized fitter, as documented. #4213 Backwards incompatible changes and deprecations diff --git a/statsmodels/base/elastic_net.py b/statsmodels/base/elastic_net.py index 70b575f79ee..b5935c51589 100644 --- a/statsmodels/base/elastic_net.py +++ b/statsmodels/base/elastic_net.py @@ -216,6 +216,7 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, model1 = model.__class__(model.endog, model.exog[:, ii], **init_args) rslt = model1.fit() + params[ii] = rslt.params cov[np.ix_(ii, ii)] = rslt.normalized_cov_params else: # Hack: no variables were selected but we need to run fit in @@ -236,6 +237,12 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, else: scale = 1. + # The degrees of freedom should reflect the number of parameters + # in the refit model, not including the zeros that we present to + # indicate what was dropped. + model.df_model = len(ii) + model.df_resid = model.nobs - model.df_model + # Assuming a standard signature for creating results classes. 
refit = klass(model, params, cov, scale=scale) refit.regularized = True diff --git a/statsmodels/regression/tests/test_regression.py b/statsmodels/regression/tests/test_regression.py index 6e8ea8492d1..456d7c31292 100644 --- a/statsmodels/regression/tests/test_regression.py +++ b/statsmodels/regression/tests/test_regression.py @@ -1201,13 +1201,15 @@ def test_regularized_refit(): p = 5 np.random.seed(3132) xmat = np.random.normal(size=(n, p)) - yvec = xmat.sum(1) + np.random.normal(size=n) + # covariates 0 and 2 matter + yvec = xmat[:, 0] + xmat[:, 2] + np.random.normal(size=n) model1 = OLS(yvec, xmat) result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True) - model2 = OLS(yvec, xmat) - result2 = model2.fit_regularized(alpha=2., L1_wt=0.5, refit=True) - assert_allclose(result1.params, result2.params) - assert_allclose(result1.bse, result2.bse) + model2 = OLS(yvec, xmat[:, [0, 2]]) + result2 = model2.fit() + ii = [0, 2] + assert_allclose(result1.params[ii], result2.params) + assert_allclose(result1.bse[ii], result2.bse) def test_regularized_predict(): From 06c87c059c13c069df8fd2ac0c86249a7843bf73 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sat, 10 Feb 2018 13:34:19 -0500 Subject: [PATCH 119/157] wordsmith comment --- statsmodels/base/elastic_net.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/statsmodels/base/elastic_net.py b/statsmodels/base/elastic_net.py index b5935c51589..1f44665b3f7 100644 --- a/statsmodels/base/elastic_net.py +++ b/statsmodels/base/elastic_net.py @@ -238,8 +238,8 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, scale = 1. # The degrees of freedom should reflect the number of parameters - # in the refit model, not including the zeros that we present to - # indicate what was dropped. + # in the refit model, not including the zeros that are displayed + # to indicate which variables were dropped. model.df_model = len(ii) model.df_resid = model.nobs - model.df_model From 413c0c5032260f15bfe218b9f37ceac6d8597108 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sat, 24 Mar 2018 19:38:26 -0400 Subject: [PATCH 120/157] don't change model df --- statsmodels/base/elastic_net.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/statsmodels/base/elastic_net.py b/statsmodels/base/elastic_net.py index 1f44665b3f7..30f3b87f700 100644 --- a/statsmodels/base/elastic_net.py +++ b/statsmodels/base/elastic_net.py @@ -240,6 +240,7 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, # The degrees of freedom should reflect the number of parameters # in the refit model, not including the zeros that are displayed # to indicate which variables were dropped. 
+ p, q = model.df_model, model.df_resid model.df_model = len(ii) model.df_resid = model.nobs - model.df_model @@ -249,6 +250,8 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, refit.method = method refit.fit_history = {'iteration' : itr + 1} + model.df_model, model.df_resid = p, q + return refit From cb99dcc4d0ace7bd0bf6ec6dd30531634e363b32 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 25 Mar 2018 16:57:37 -0400 Subject: [PATCH 121/157] Add comment pointing to issue 1723 for future reference --- statsmodels/base/elastic_net.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/statsmodels/base/elastic_net.py b/statsmodels/base/elastic_net.py index 30f3b87f700..df1f0e4207f 100644 --- a/statsmodels/base/elastic_net.py +++ b/statsmodels/base/elastic_net.py @@ -239,7 +239,9 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, # The degrees of freedom should reflect the number of parameters # in the refit model, not including the zeros that are displayed - # to indicate which variables were dropped. + # to indicate which variables were dropped. See issue #1723 for + # discussion about setting df parameters in model and results + # classes. p, q = model.df_model, model.df_resid model.df_model = len(ii) model.df_resid = model.nobs - model.df_model @@ -250,6 +252,7 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, refit.method = method refit.fit_history = {'iteration' : itr + 1} + # Restore df in model class, see issue #1723 for discussion. model.df_model, model.df_resid = p, q return refit From 1d0c058baf423168c4ceb114da5e6a2493805389 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 25 Mar 2018 17:14:51 -0400 Subject: [PATCH 122/157] Minor formatting changes --- statsmodels/base/elastic_net.py | 35 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/statsmodels/base/elastic_net.py b/statsmodels/base/elastic_net.py index df1f0e4207f..a594837749c 100644 --- a/statsmodels/base/elastic_net.py +++ b/statsmodels/base/elastic_net.py @@ -9,7 +9,8 @@ Routines for fitting regression models using elastic net regularization. The elastic net minimizes the objective function --llf / nobs + alpha((1 - L1_wt) * sum(params**2) / 2 + L1_wt * sum(abs(params))) +-llf / nobs + alpha((1 - L1_wt) * sum(params**2) / 2 + + L1_wt * sum(abs(params))) The algorithm implemented here closely follows the implementation in the R glmnet package, documented here: @@ -52,17 +53,16 @@ def npscore(params, model): def nphess(params, model): nobs = model.nobs pen_hess = alpha[k] * (1 - L1_wt) - h = -model.hessian(np.r_[params], **hess_kwds)[0,0] / nobs + pen_hess + h = -model.hessian(np.r_[params], **hess_kwds)[0, 0] / nobs + pen_hess return h return nploglike, npscore, nphess - def fit_elasticnet(model, method="coord_descent", maxiter=100, - alpha=0., L1_wt=1., start_params=None, cnvrg_tol=1e-7, - zero_tol=1e-8, refit=False, check_step=True, - loglike_kwds=None, score_kwds=None, hess_kwds=None): + alpha=0., L1_wt=1., start_params=None, cnvrg_tol=1e-7, + zero_tol=1e-8, refit=False, check_step=True, + loglike_kwds=None, score_kwds=None, hess_kwds=None): """ Return an elastic net regularized fit to a regression model. 
@@ -133,7 +133,6 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, """ k_exog = model.exog.shape[1] - n_exog = model.exog.shape[0] loglike_kwds = {} if loglike_kwds is None else loglike_kwds score_kwds = {} if score_kwds is None else score_kwds @@ -148,7 +147,6 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, else: params = start_params.copy() - converged = False btol = 1e-4 params_zero = np.zeros(len(params), dtype=bool) @@ -156,8 +154,9 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, if k != "offset" and hasattr(model, k)]) init_args['hasconst'] = False - fgh_list = [_gen_npfuncs(k, L1_wt, alpha, loglike_kwds, score_kwds, hess_kwds) - for k in range(k_exog)] + fgh_list = [ + _gen_npfuncs(k, L1_wt, alpha, loglike_kwds, score_kwds, hess_kwds) + for k in range(k_exog)] for itr in range(maxiter): @@ -181,13 +180,14 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, offset += model.offset # Create a one-variable model for optimization. - model_1var = model.__class__(model.endog, model.exog[:, k], offset=offset, - **init_args) + model_1var = model.__class__( + model.endog, model.exog[:, k], offset=offset, **init_args) # Do the one-dimensional optimization. func, grad, hess = fgh_list[k] - params[k] = _opt_1d(func, grad, hess, model_1var, params[k], alpha[k]*L1_wt, - tol=btol, check_step=check_step) + params[k] = _opt_1d( + func, grad, hess, model_1var, params[k], alpha[k]*L1_wt, + tol=btol, check_step=check_step) # Update the active set if itr > 0 and np.abs(params[k]) < zero_tol: @@ -197,7 +197,6 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, # Check for convergence pchange = np.max(np.abs(params - params_save)) if pchange < cnvrg_tol: - converged = True break # Set approximate zero coefficients to be exactly zero @@ -213,8 +212,8 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, cov = np.zeros((k_exog, k_exog)) init_args = dict([(k, getattr(model, k, None)) for k in model._init_keys]) if len(ii) > 0: - model1 = model.__class__(model.endog, model.exog[:, ii], - **init_args) + model1 = model.__class__( + model.endog, model.exog[:, ii], **init_args) rslt = model1.fit() params[ii] = rslt.params cov[np.ix_(ii, ii)] = rslt.normalized_cov_params @@ -250,7 +249,7 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, refit = klass(model, params, cov, scale=scale) refit.regularized = True refit.method = method - refit.fit_history = {'iteration' : itr + 1} + refit.fit_history = {'iteration': itr + 1} # Restore df in model class, see issue #1723 for discussion. model.df_model, model.df_resid = p, q From 1dfdf52bd627c214522839cab600c92a0d3133b5 Mon Sep 17 00:00:00 2001 From: Josef Date: Thu, 29 Mar 2018 16:17:36 -0400 Subject: [PATCH 123/157] BUG/ENH: proportion_confint: impose bounds, support asarray and pandas closes #2742 --- statsmodels/stats/proportion.py | 23 +++++++++++++++++++ .../stats/tests/results/results_proportion.py | 4 +++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/statsmodels/stats/proportion.py b/statsmodels/stats/proportion.py index 704266daa68..6969a8fb19a 100644 --- a/statsmodels/stats/proportion.py +++ b/statsmodels/stats/proportion.py @@ -67,6 +67,10 @@ def proportion_confint(count, nobs, alpha=0.05, method='normal'): ''' + pd_index = getattr(count, 'index', None) + count = np.asarray(count) + nobs = np.asarray(nobs) + q_ = count * 1. 
/ nobs alpha_2 = 0.5 * alpha @@ -93,6 +97,13 @@ def func(qi): ci_low = stats.beta.ppf(alpha_2, count, nobs - count + 1) ci_upp = stats.beta.isf(alpha_2, count + 1, nobs - count) + if np.ndim(ci_low) > 0: + ci_low[q_ == 0] = 0 + ci_upp[q_ == 1] = 1 + else: + ci_low = ci_low if (q_ != 0) else 0 + ci_upp = ci_upp if (q_ != 1) else 1 + elif method == 'agresti_coull': crit = stats.norm.isf(alpha / 2.) nobs_c = nobs + crit**2 @@ -119,6 +130,18 @@ def func(qi): else: raise NotImplementedError('method "%s" is not available' % method) + + if method in ['normal', 'agresti_coull']: + ci_low = np.clip(ci_low, 0, 1) + ci_upp = np.clip(ci_upp, 0, 1) + if pd_index is not None and np.ndim(ci_low) > 0: + import pandas as pd + if np.ndim(ci_low) == 1: + ci_low = pd.Series(ci_low, index=pd_index) + ci_upp = pd.Series(ci_upp, index=pd_index) + if np.ndim(ci_low) == 2: + ci_low = pd.DataFrame(ci_low, index=pd_index) + ci_upp = pd.DataFrame(ci_upp, index=pd_index) return ci_low, ci_upp diff --git a/statsmodels/stats/tests/results/results_proportion.py b/statsmodels/stats/tests/results/results_proportion.py index 00f41381cdb..48add2ecf6f 100644 --- a/statsmodels/stats/tests/results/results_proportion.py +++ b/statsmodels/stats/tests/results/results_proportion.py @@ -30,7 +30,9 @@ class Holder(object): ]) #> mkarray2(bci$upper, "res_binom[(18, 20)].ci_upp") res_binom[(18, 20)].ci_upp = np.array([ - 0.984343760998137, 1.031478381086487, 0.97862751197755, + # changing 1.03 by manual edit because we impose <=1, see #2742 + # 0.984343760998137, 1.031478381086487, 0.97862751197755, + 0.984343760998137, 1.0 , 0.97862751197755, 0.974010174395775, 0.9876514728297052, 0.974866415649319, 0.978858461808406, 0.982318186566456, 0.982639913376776, 0.982487361226571, 0.972133518786232 From 9cc4146e3958476f3c4021f40c9e0dcd63187da1 Mon Sep 17 00:00:00 2001 From: Josef Date: Thu, 29 Mar 2018 19:53:34 -0400 Subject: [PATCH 124/157] TST: proportion_confint, unit test, corner case, 2-D, pandas --- .../stats/tests/results/results_proportion.py | 41 +++++++++++++-- statsmodels/stats/tests/test_proportion.py | 52 +++++++++++++++---- 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/statsmodels/stats/tests/results/results_proportion.py b/statsmodels/stats/tests/results/results_proportion.py index 48add2ecf6f..5c371a3230f 100644 --- a/statsmodels/stats/tests/results/results_proportion.py +++ b/statsmodels/stats/tests/results/results_proportion.py @@ -30,9 +30,7 @@ class Holder(object): ]) #> mkarray2(bci$upper, "res_binom[(18, 20)].ci_upp") res_binom[(18, 20)].ci_upp = np.array([ - # changing 1.03 by manual edit because we impose <=1, see #2742 - # 0.984343760998137, 1.031478381086487, 0.97862751197755, - 0.984343760998137, 1.0 , 0.97862751197755, + 0.984343760998137, 1.031478381086487, 0.97862751197755, 0.974010174395775, 0.9876514728297052, 0.974866415649319, 0.978858461808406, 0.982318186566456, 0.982639913376776, 0.982487361226571, 0.972133518786232 @@ -89,3 +87,40 @@ class Holder(object): 0.974623779100809, 0.974626983311416, 0.974392083257476, 0.972617354399236 ]) + +# > bci = binom.confint(x = c(1), n = 30, tol = 1e-8) +res_binom[(1, 30)].ci_low = np.array([ + -8.305484e-03, -3.090070e-02, 6.903016e-05, 2.494567e-03, + 8.435709e-04, 4.675346e-03, 3.475014e-03, 3.012987e-03, + 1.932430e-03, 1.742467e-03, 5.908590e-03]) + +res_binom[(1, 30)].ci_upp = np.array([ + 0.18091798, 0.09756737, 0.12314380, 0.14513807, + 0.17216946, 0.20200244, 0.16637241, 0.13868254, + 0.13868375, 0.19053022, 0.16670391]) + +# > bci = 
binom.confint(x = c(29), n = 30, tol = 1e-8) +res_binom[(29, 30)].ci_low = np.array([ + 0.8190820, 0.9024326, 0.8768562, 0.7860836, + 0.8278305, 0.7979976, 0.8336276, 0.8613175, + 0.8613162, 0.8094698, 0.8332961]) +res_binom[(29, 30)].ci_upp = np.array([ + 1.0083055, 1.0309007, 0.9999310, 0.9952363, + 0.9991564, 0.9953247, 0.9965250, 0.9969870, + 0.9980676, 0.9982575, 0.9940914]) + +# > bci = binom.confint(x = c(0), n = 30, tol = 1e-8) + # this ci_low clips one negative value to 0 +res_binom[(0, 30)].ci_low = np.zeros(11) +res_binom[(0, 30)].ci_upp = np.array([ + 0.13471170, 0.00000000, 0.06151672, 0.11570331, + 0.11570331, 0.11570331, 0.11570331, 0.10402893, + 0.06201781, 0.14132048, 0.11351339]) + +# > bci = binom.confint(x = c(30), n = 30, tol = 1e-8) +res_binom[(30, 30)].ci_low = np.array([ + 0.8652883, 1.0000000, 0.9384833, 0.8842967, + 0.8842967, 0.8842967, 0.8842967, 0.8959711, + 0.9379822, 0.8586795, 0.8864866]) + # this ci_upp clips one value > 1 +res_binom[(30, 30)].ci_upp = np.ones(11) diff --git a/statsmodels/stats/tests/test_proportion.py b/statsmodels/stats/tests/test_proportion.py index b5ba2a457ed..f354909a5c4 100644 --- a/statsmodels/stats/tests/test_proportion.py +++ b/statsmodels/stats/tests/test_proportion.py @@ -8,6 +8,7 @@ import warnings import numpy as np +import pandas as pd from numpy.testing import (assert_almost_equal, assert_equal, assert_array_less, assert_raises, assert_allclose) @@ -20,30 +21,61 @@ class Holder(object): pass - +probci_methods = {'agresti_coull' : 'agresti-coull', + 'normal' : 'asymptotic', + 'beta' : 'exact', + 'wilson' : 'wilson', + 'jeffreys' : 'bayes' + } def test_confint_proportion(): from .results.results_proportion import res_binom, res_binom_methods - methods = {'agresti_coull' : 'agresti-coull', - 'normal' : 'asymptotic', - 'beta' : 'exact', - 'wilson' : 'wilson', - 'jeffrey' : 'bayes' - } + for case in res_binom: count, nobs = case - for method in methods: - idx = res_binom_methods.index(methods[method]) + for method in probci_methods: + idx = res_binom_methods.index(probci_methods[method]) res_low = res_binom[case].ci_low[idx] res_upp = res_binom[case].ci_upp[idx] if np.isnan(res_low) or np.isnan(res_upp): continue + if (count == 0 or count == nobs) and method == 'jeffreys': + # maybe a bug or different corner case definition + continue + if method == 'jeffreys' and nobs == 30: + # something is strange in extreme case e.g 0/30 or 1/30 + continue ci = proportion_confint(count, nobs, alpha=0.05, method=method) - + # we impose that confint is in [0, 1] + res_low = max(res_low, 0) + res_upp = min(res_upp, 1) assert_almost_equal(ci, [res_low, res_upp], decimal=6, err_msg=repr(case) + method) + +def test_confint_proportion_ndim(): + # check that 2-D, works including pandas + + count = np.arange(6).reshape(2, 3) + nobs = 10 * np.ones((2, 3)) + + count_pd = pd.DataFrame(count) + nobs_pd = pd.DataFrame(nobs) + + for method in probci_methods: + ci_arr = proportion_confint(count, nobs, alpha=0.05, method=method) + ci_pd = proportion_confint(count_pd, nobs_pd, alpha=0.05, + method=method) + assert_allclose(ci_arr, (ci_pd[0].values, ci_pd[1].values), rtol=1e-13) + # spot checking one value + ci12 = proportion_confint(count[1, 2], nobs[1, 2], alpha=0.05, + method=method) + assert_allclose((ci_pd[0].values[1, 2], ci_pd[1].values[1, 2]), ci12, + rtol=1e-13) + assert_allclose((ci_arr[0][1, 2], ci_arr[1][1, 2]), ci12, rtol=1e-13) + + def test_samplesize_confidenceinterval_prop(): #consistency test for samplesize to achieve confidence_interval nobs = 
20
From 33016f2cc082804b2084a9646021a7cdb41dd9ea Mon Sep 17 00:00:00 2001
From: Josef
Date: Thu, 29 Mar 2018 21:28:25 -0400
Subject: [PATCH 125/157] BUG: make list work (count.index not callable)

---
 statsmodels/stats/proportion.py            | 3 +++
 statsmodels/stats/tests/test_proportion.py | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/statsmodels/stats/proportion.py b/statsmodels/stats/proportion.py
index 6969a8fb19a..6235346da58 100644
--- a/statsmodels/stats/proportion.py
+++ b/statsmodels/stats/proportion.py
@@ -68,6 +68,9 @@ def proportion_confint(count, nobs, alpha=0.05, method='normal'):
     '''
     pd_index = getattr(count, 'index', None)
+    if pd_index is not None and hasattr(pd_index, '__call__'):
+        # this rules out lists; list.index is a method and therefore callable
+        pd_index = None
     count = np.asarray(count)
     nobs = np.asarray(nobs)

diff --git a/statsmodels/stats/tests/test_proportion.py b/statsmodels/stats/tests/test_proportion.py
index f354909a5c4..04a711720de 100644
--- a/statsmodels/stats/tests/test_proportion.py
+++ b/statsmodels/stats/tests/test_proportion.py
@@ -75,6 +75,11 @@ def test_confint_proportion_ndim():
                     rtol=1e-13)
         assert_allclose((ci_arr[0][1, 2], ci_arr[1][1, 2]), ci12, rtol=1e-13)

+        # check that lists work as input
+        ci_li = proportion_confint(count.tolist(), nobs.tolist(), alpha=0.05,
+                                   method=method)
+        assert_allclose(ci_arr, (ci_li[0], ci_li[1]), rtol=1e-13)
+

 def test_samplesize_confidenceinterval_prop():
     #consistency test for samplesize to achieve confidence_interval

From c16ef02708f76fa4bc915ab5866ad13ef75145b9 Mon Sep 17 00:00:00 2001
From: Josef
Date: Fri, 30 Mar 2018 10:21:53 -0400
Subject: [PATCH 126/157] TST/DOC update docstring, test for 1-D counts

---
 statsmodels/stats/proportion.py            | 25 +++++++++++++++-------
 statsmodels/stats/tests/test_proportion.py | 15 +++++++++++--
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/statsmodels/stats/proportion.py b/statsmodels/stats/proportion.py
index 6235346da58..ab817ca546a 100644
--- a/statsmodels/stats/proportion.py
+++ b/statsmodels/stats/proportion.py
@@ -20,8 +20,8 @@ def proportion_confint(count, nobs, alpha=0.05, method='normal'):

     Parameters
     ----------
-    count : int or array
-        number of successes
+    count : int or array_like
+        number of successes, can be pandas Series or DataFrame
     nobs : int
         total number of trials
     alpha : float in (0, 1)
@@ -39,16 +39,24 @@ def proportion_confint(count, nobs, alpha=0.05, method='normal'):

     Returns
     -------
-    ci_low, ci_upp : float
+    ci_low, ci_upp : float, ndarray, or pandas Series or DataFrame
         lower and upper confidence level with coverage (approximately) 1-alpha.
-        Note: Beta has coverage
-        coverage is only 1-alpha on average for some other methods.)
+        When a pandas object is returned, then the index is taken from
+        `count`.

     Notes
     -----
-    Beta, the Clopper-Pearson interval has coverage at least 1-alpha, but is
-    in general conservative. Most of the other methods have average coverage
-    equal to 1-alpha, but will have smaller coverage in some cases.
+    Beta, the Clopper-Pearson exact interval has coverage at least 1-alpha,
+    but is in general conservative. Most of the other methods have average
+    coverage equal to 1-alpha, but will have smaller coverage in some cases.
+
+    The 'beta' and 'jeffreys' intervals are central; they use alpha/2 in each
+    tail, and alpha is not adjusted at the boundaries. In the extreme case
+    when `count` is zero or equal to `nobs`, then the coverage will be only
+    1 - alpha/2 in the case of 'beta'. 
+ + The confidence intervals are clipped to be in the [0, 1] interval in the + case of 'normal' and 'agresti_coull'. Method "binom_test" directly inverts the binomial test in scipy.stats. which has discrete steps. @@ -145,6 +153,7 @@ def func(qi): if np.ndim(ci_low) == 2: ci_low = pd.DataFrame(ci_low, index=pd_index) ci_upp = pd.DataFrame(ci_upp, index=pd_index) + return ci_low, ci_upp diff --git a/statsmodels/stats/tests/test_proportion.py b/statsmodels/stats/tests/test_proportion.py index 04a711720de..ae1c09e61e7 100644 --- a/statsmodels/stats/tests/test_proportion.py +++ b/statsmodels/stats/tests/test_proportion.py @@ -55,7 +55,7 @@ def test_confint_proportion(): def test_confint_proportion_ndim(): - # check that 2-D, works including pandas + # check that it works with 1-D, 2-D and pandas count = np.arange(6).reshape(2, 3) nobs = 10 * np.ones((2, 3)) @@ -76,10 +76,21 @@ def test_confint_proportion_ndim(): assert_allclose((ci_arr[0][1, 2], ci_arr[1][1, 2]), ci12, rtol=1e-13) # check that lists work as input - ci_li = proportion_confint(count.tolist(), nobs.tolist(), alpha=0.05, + ci_li = proportion_confint(count.tolist(), nobs.tolist(), alpha=0.05, method=method) assert_allclose(ci_arr, (ci_li[0], ci_li[1]), rtol=1e-13) + # check pandas Series, 1-D + ci_pds = proportion_confint(count_pd.iloc[0], nobs_pd.iloc[0], + alpha=0.05, method=method) + assert_allclose((ci_pds[0].values, ci_pds[1].values), + (ci_pd[0].values[0], ci_pd[1].values[0]), rtol=1e-13) + + # check scalar nobs, verifying one value + ci_arr2 = proportion_confint(count, nobs[1, 2], alpha=0.05, + method=method) + assert_allclose((ci_arr2[0][1, 2], ci_arr[1][1, 2]), ci12, rtol=1e-13) + def test_samplesize_confidenceinterval_prop(): #consistency test for samplesize to achieve confidence_interval From 52465a6471f7ddbe2d00e7f5f4bb4d48aaa52214 Mon Sep 17 00:00:00 2001 From: Josef Date: Thu, 5 Apr 2018 12:12:21 -0400 Subject: [PATCH 127/157] BUG/REF: GMM fix exog_names, summary closes #4340 --- statsmodels/iolib/summary.py | 3 ++ statsmodels/sandbox/regression/gmm.py | 28 +++++++++- .../sandbox/regression/tests/test_gmm.py | 52 ++++++++++++++++++- .../regression/tests/test_gmm_poisson.py | 3 +- 4 files changed, 81 insertions(+), 5 deletions(-) diff --git a/statsmodels/iolib/summary.py b/statsmodels/iolib/summary.py index 2b5fdf70174..70b1fb69a14 100644 --- a/statsmodels/iolib/summary.py +++ b/statsmodels/iolib/summary.py @@ -463,6 +463,9 @@ def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True, _, xname = _getnames(results, yname=yname, xname=xname) + if len(xname) != len(params): + raise ValueError('xnames and params do not have the same length') + params_stubs = xname exog_idx = lrange(len(xname)) diff --git a/statsmodels/sandbox/regression/gmm.py b/statsmodels/sandbox/regression/gmm.py index 156617267f8..c3a400f26a1 100644 --- a/statsmodels/sandbox/regression/gmm.py +++ b/statsmodels/sandbox/regression/gmm.py @@ -528,8 +528,32 @@ def _fix_param_names(self, params, param_names=None): # cut in front for poisson multiplicative self.data.xnames = xnames[-len(params):] elif len(params) > len(xnames): - # cut at the end - self.data.xnames = xnames[:len(params)] + # use generic names + self.data.xnames = ['p%2d' % i for i in range(len(params))] + + def set_param_names(self, param_names, k_params=None): + """set the parameter names in the model + + Parameters + ---------- + param_names : list of strings + param_names should have the same length as the number of params + k_params : None or int + If k_params is None, 
then the k_params attribute is used, unless + it is None. + If k_params is not None, then it will also set the k_params + attribute. + """ + if k_params is not None: + self.k_params = k_params + else: + k_params = self.k_params + + if k_params == len(param_names): + self.data.xnames = param_names + else: + raise ValueError('param_names has the wrong length') + def fit(self, start_params=None, maxiter=10, inv_weights=None, weights_method='cov', wargs=(), diff --git a/statsmodels/sandbox/regression/tests/test_gmm.py b/statsmodels/sandbox/regression/tests/test_gmm.py index 04b26ff2340..de036becce2 100644 --- a/statsmodels/sandbox/regression/tests/test_gmm.py +++ b/statsmodels/sandbox/regression/tests/test_gmm.py @@ -237,6 +237,12 @@ def test_hypothesis(self): # Smoke test for Wald res_wald = res1.wald_test(restriction[:-1]) + def test_smoke(self): + res1 = self.res1 + summ = res1.summary() + # len + 1 is for header line + assert_equal(len(summ.tables[1]), len(res1.params) + 1) + class TestGMMSt1(CheckGMM): @@ -678,7 +684,8 @@ def test_hausman(self): def test_smoke(self): res1 = self.res1 - res1.summary() + summ = res1.summary() + assert_equal(len(summ.tables[1]), len(res1.params) + 1) @@ -752,4 +759,45 @@ def test_noconstant(): assert_equal(res.fvalue, np.nan) # smoke test - res.summary() + summ = res.summary() + assert_equal(len(summ.tables[1]), len(res.params) + 1) + + +def test_gmm_basic(): + # this currently tests mainly the param names, exog_names + # see #4340 + cd = np.array([1.5, 1.5, 1.7, 2.2, 2.0, 1.8, 1.8, 2.2, 1.9, 1.6, 1.8, 2.2, + 2.0, 1.5, 1.1, 1.5, 1.4, 1.7, 1.42, 1.9]) + dcd = np.array([0, 0.2 ,0.5, -0.2, -0.2, 0, 0.4, -0.3, -0.3, 0.2, 0.4, + -0.2, -0.5, -0.4, 0.4, -0.1, 0.3, -0.28, 0.48, 0.2]) + inst = np.column_stack((np.ones(len(cd)), cd)) + + class GMMbase(gmm.GMM): + def momcond(self, params): + p0, p1, p2, p3 = params + endog = self.endog[:, None] + exog = self.exog + inst = self.instrument + + mom0 = (endog - p0 - p1 * exog) * inst + mom1 = ((endog - p0 - p1 * exog)**2 - + p2 * (exog**(2 * p3)) / 12) * inst + g = np.column_stack((mom0, mom1)) + return g + + beta0 = np.array([0.1, 0.1, 0.01, 1]) + res = GMMbase(endog=dcd, exog=cd, instrument=inst, k_moms=4, + k_params=4).fit(beta0, optim_args={'disp': 0}) + summ = res.summary() + assert_equal(len(summ.tables[1]), len(res.params) + 1) + pnames = ['p%2d' % i for i in range(len(res.params))] + assert_equal(res.model.exog_names, pnames) + + # check set_param_names method + mod = GMMbase(endog=dcd, exog=cd, instrument=inst, k_moms=4, + k_params=4) + # use arbitrary names + pnames = ['beta', 'gamma', 'psi', 'phi'] + mod.set_param_names(pnames) + res1 = mod.fit(beta0, optim_args={'disp': 0}) + assert_equal(res1.model.exog_names, pnames) diff --git a/statsmodels/sandbox/regression/tests/test_gmm_poisson.py b/statsmodels/sandbox/regression/tests/test_gmm_poisson.py index 45ae1c79a06..358c1d9eea8 100644 --- a/statsmodels/sandbox/regression/tests/test_gmm_poisson.py +++ b/statsmodels/sandbox/regression/tests/test_gmm_poisson.py @@ -119,7 +119,8 @@ def test_other(self): def test_smoke(self): res1 = self.res1 - res1.summary() + summ = res1.summary() + assert_equal(len(summ.tables[1]), len(res1.params) + 1) class TestGMMAddOnestep(CheckGMM): From c5120c5d779e330571eca686f70001a4a22f73c8 Mon Sep 17 00:00:00 2001 From: Josef Date: Thu, 5 Apr 2018 15:58:10 -0400 Subject: [PATCH 128/157] BUG: tsa coint, nearly perfect colinearity, return (-inf, 0, ...) 
closes #4237

---
 statsmodels/tsa/stattools.py            | 14 ++++++++++----
 statsmodels/tsa/tests/test_stattools.py | 13 ++++++-------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/statsmodels/tsa/stattools.py b/statsmodels/tsa/stattools.py
index 2fc4a1af1fd..cce05f98d18 100644
--- a/statsmodels/tsa/stattools.py
+++ b/statsmodels/tsa/stattools.py
@@ -22,6 +22,7 @@
            'periodogram', 'q_stat', 'coint', 'arma_order_select_ic',
            'adfuller', 'kpss', 'bds']

+SQRTEPS = np.sqrt(np.finfo(np.double).eps)

 #NOTE: now in two places to avoid circular import
 #TODO: I like the bunch pattern for this too.
@@ -979,6 +980,11 @@ def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic',
     P-values and critical values are obtained through regression surface
     approximation from MacKinnon 1994 and 2010.

+    If the two series are almost perfectly collinear, then computing the
+    test is numerically unstable. However, the two series will be cointegrated
+    under the maintained assumption that they are integrated. In this case
+    the t-statistic will be set to -inf and the pvalue to zero.
+
     TODO: We could handle gaps in data by dropping rows with nans in the
     auxiliary regressions. Not implemented yet, currently assumes no nans
     and no gaps in time series.
@@ -1010,15 +1016,15 @@ def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic',

     res_co = OLS(y0, xx).fit()

-    if res_co.rsquared < 1 - np.sqrt(np.finfo(np.double).eps):
+    if res_co.rsquared < 1 - 100 * SQRTEPS:
         res_adf = adfuller(res_co.resid, maxlag=maxlag, autolag=None,
                            regression='nc')
     else:
         import warnings
-        warnings.warn("y0 and y1 are perfectly colinear. Cointegration test "
-                      "is not reliable in this case.")
+        warnings.warn("y0 and y1 are (almost) perfectly collinear. "
+                      "Cointegration test is not reliable in this case.")
         # Edge case where series are too similar
-        res_adf = (0,)
+        res_adf = (-np.inf,)

     # no constant or trend, see egranger in Stata and MacKinnon
     if trend == 'nc':
diff --git a/statsmodels/tsa/tests/test_stattools.py b/statsmodels/tsa/tests/test_stattools.py
index 410cbdddd85..42aff3f96e3 100644
--- a/statsmodels/tsa/tests/test_stattools.py
+++ b/statsmodels/tsa/tests/test_stattools.py
@@ -359,23 +359,22 @@ def test_coint_identical_series():
     with warnings.catch_warnings(record=True) as w:
         c = coint(y, y, trend="c", maxlag=0, autolag=None)
     assert_equal(len(w), 1)
-    assert_equal(c[0], 0.0)
-    # Limit of table
-    assert_(c[1] > .98)
+    assert_equal(c[1], 0.0)
+    assert_(np.isneginf(c[0]))


 def test_coint_perfect_collinearity():
+    # test uses nearly perfect collinearity
     nobs = 200
     scale_e = 1
     np.random.seed(123)
     x = scale_e * np.random.randn(nobs, 2)
-    y = 1 + x.sum(axis=1)
+    y = 1 + x.sum(axis=1) + 1e-7 * np.random.randn(nobs)
     warnings.simplefilter('always', ColinearityWarning)
     with warnings.catch_warnings(record=True) as w:
         c = coint(y, x, trend="c", maxlag=0, autolag=None)
-    assert_equal(c[0], 0.0)
-    # Limit of table
-    assert_(c[1] > .98)
+    assert_equal(c[1], 0.0)
+    assert_(np.isneginf(c[0]))


 class TestGrangerCausality(object):

From ab0231a4f375cef48472d828439372d421ff160f Mon Sep 17 00:00:00 2001
From: Josef
Date: Thu, 5 Apr 2018 21:02:31 -0400
Subject: [PATCH 129/157] MAINT: ignore deprecation warning histogram normed

---
 statsmodels/tsa/statespace/mlemodel.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/statsmodels/tsa/statespace/mlemodel.py b/statsmodels/tsa/statespace/mlemodel.py
index c227b960690..e554369eab7 100644
--- a/statsmodels/tsa/statespace/mlemodel.py
+++ 
b/statsmodels/tsa/statespace/mlemodel.py @@ -2607,7 +2607,9 @@ def plot_diagnostics(self, variable=0, lags=10, fig=None, figsize=None): # elements resid_nonmissing = resid[~(np.isnan(resid))] ax = fig.add_subplot(222) - ax.hist(resid_nonmissing, normed=True, label='Hist') + # temporarily disable Deprecation warning, normed -> density + with warnings.catch_warnings(record=True) as w: + ax.hist(resid_nonmissing, normed=True, label='Hist') from scipy.stats import gaussian_kde, norm kde = gaussian_kde(resid_nonmissing) xlim = (-1.96*2, 1.96*2) From 23831fcbeb2dc4ecb95ec1ef381aa84284d664f8 Mon Sep 17 00:00:00 2001 From: Josef Date: Fri, 6 Apr 2018 10:05:58 -0400 Subject: [PATCH 130/157] MAINT: test_glm, use pandas instead of genfromtxt --- statsmodels/genmod/tests/results/results_glm.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/statsmodels/genmod/tests/results/results_glm.py b/statsmodels/genmod/tests/results/results_glm.py index 0b2f47d68c5..9183fd85482 100644 --- a/statsmodels/genmod/tests/results/results_glm.py +++ b/statsmodels/genmod/tests/results/results_glm.py @@ -6,6 +6,7 @@ IRLS. """ import numpy as np +import pandas as pd from statsmodels.compat.python import asbytes from . import glm_test_resids import os @@ -684,9 +685,8 @@ def __init__(self): # data set up for data not in datasets filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "stata_lbw_glm.csv") - data=np.recfromcsv(open(filename, 'rb')) - vfunc = np.vectorize(lambda x: x.strip(asbytes("\""))) - data['race'] = vfunc(data['race']) + data = pd.read_csv(filename).to_records() + # categorical does not work with pandas data = categorical(data, col='race', drop=True) self.endog = data.low design = np.column_stack((data['age'], data['lwt'], @@ -2193,9 +2193,7 @@ class Medpar1(object): def __init__(self): filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "stata_medpar1_glm.csv") - data = np.recfromcsv(open(filename, 'rb')) - vfunc = np.vectorize(lambda x: x.strip(asbytes('\"'))) - data['admitype'] = vfunc(data['admitype']) + data = pd.read_csv(filename).to_records() self.endog = data.los design = np.column_stack((data.admitype, data.codes)) design = categorical(design, col=0, drop=True) @@ -2210,7 +2208,7 @@ def __init__(self): super(InvGaussLog, self).__init__() filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "medparlogresids.csv") - self.resids = np.genfromtxt(open(filename, 'rb'), delimiter=",") + self.resids = pd.read_csv(filename, sep=',', header=None).values self.null_deviance = 335.1539777981053 # from R, Rpy bug self.params = np.array([ 0.09927544, -0.19161722, 1.05712336]) self.bse = np.array([ 0.00600728, 0.02632126, 0.04915765]) @@ -2973,7 +2971,7 @@ def __init__(self): self.bse = np.array([ 0.02586783, 0.13830023, 0.20834864]) filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "igaussident_resids.csv") - self.resids = np.genfromtxt(open(filename, 'rb'), delimiter=",") + self.resids = pd.read_csv(filename, sep=',', header=None).values self.null_deviance = 335.1539777981053 # from R, Rpy bug self.df_null = 3675 self.deviance = 305.33661191013988 From bc8c9c7aa262382e6dcc0fe54ea74020431f5c76 Mon Sep 17 00:00:00 2001 From: Josef Date: Fri, 6 Apr 2018 11:41:06 -0400 Subject: [PATCH 131/157] TST: make genfromtxt usage depend on numpy version (plus whitespace) --- .../genmod/tests/results/results_glm.py | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git 
a/statsmodels/genmod/tests/results/results_glm.py b/statsmodels/genmod/tests/results/results_glm.py index 9183fd85482..0ebba38dcb8 100644 --- a/statsmodels/genmod/tests/results/results_glm.py +++ b/statsmodels/genmod/tests/results/results_glm.py @@ -12,6 +12,10 @@ import os from statsmodels.api import add_constant, categorical +# for genfromtxt changes +from distutils.version import LooseVersion +NUMPY_LT_113 = LooseVersion(np.__version__) < '1.13.0' + # Test Precisions DECIMAL_4 = 4 DECIMAL_3 = 3 @@ -685,7 +689,14 @@ def __init__(self): # data set up for data not in datasets filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "stata_lbw_glm.csv") - data = pd.read_csv(filename).to_records() + + # https://github.com/statsmodels/statsmodels/pull/4432#issuecomment-379279617 + if NUMPY_LT_113: + data=np.recfromcsv(open(filename, 'rb')) + vfunc = np.vectorize(lambda x: x.strip(asbytes("\""))) + data['race'] = vfunc(data['race']) + else: + data = pd.read_csv(filename).to_records() # categorical does not work with pandas data = categorical(data, col='race', drop=True) self.endog = data.low @@ -3834,11 +3845,11 @@ class CpunishTweediePower15(object): # From R setwd('c:/workspace') data <- read.csv('cpunish.csv', sep=",") - + library(statmod) library(tweedie) - - summary(glm(EXECUTIONS ~ INCOME + SOUTH - 1, + + summary(glm(EXECUTIONS ~ INCOME + SOUTH - 1, family=tweedie(var.power=1.5, link.power=1), data=data)) """ @@ -3901,11 +3912,11 @@ class CpunishTweediePower2(object): # From R setwd('c:/workspace') data <- read.csv('cpunish.csv', sep=",") - + library(statmod) library(tweedie) - - summary(glm(EXECUTIONS ~ INCOME + SOUTH - 1, + + summary(glm(EXECUTIONS ~ INCOME + SOUTH - 1, family=tweedie(var.power=2, link.power=1), data=data)) """ @@ -3969,11 +3980,11 @@ class CpunishTweedieLog1(object): # From R setwd('c:/workspace') data <- read.csv('cpunish.csv', sep=",") - + library(statmod) library(tweedie) - - summary(glm(EXECUTIONS ~ INCOME + SOUTH - 1, + + summary(glm(EXECUTIONS ~ INCOME + SOUTH - 1, family=tweedie(var.power=1, link.power=0), data=data)) """ From bdb243900ee92e8d555efc029ba87946836555b9 Mon Sep 17 00:00:00 2001 From: Josef Date: Fri, 6 Apr 2018 11:42:35 -0400 Subject: [PATCH 132/157] TST/MAINT : test_theil, add fittedvalues test, use pandas instead of genfromtxt --- statsmodels/regression/tests/test_theil.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/statsmodels/regression/tests/test_theil.py b/statsmodels/regression/tests/test_theil.py index 1b85ddb88b6..375719ed90b 100644 --- a/statsmodels/regression/tests/test_theil.py +++ b/statsmodels/regression/tests/test_theil.py @@ -7,6 +7,7 @@ import os import numpy as np +import pandas as pd from scipy import stats from numpy.testing import assert_allclose, assert_equal, assert_warns @@ -24,8 +25,9 @@ class TestTheilTextile(object): def setup_class(cls): cur_dir = os.path.dirname(os.path.abspath(__file__)) - filepath = os.path.join(cur_dir, "results", "theil_textile_predict.csv") - cls.res_predict = np.recfromtxt(filepath, delimiter=",") + filepath = os.path.join(cur_dir, "results", + "theil_textile_predict.csv") + cls.res_predict = pd.read_csv(filepath, sep=",") names = "year lconsump lincome lprice".split() @@ -90,7 +92,11 @@ def test_basic(self): # Note: tgmixed is using k_exog for df_resid corr_fact = self.res1.df_resid / self.res2.df_r - assert_allclose(np.sqrt(self.res1.mse_resid * corr_fact), self.res2.rmse, rtol=2e-6) + assert_allclose(np.sqrt(self.res1.mse_resid * corr_fact), + 
self.res2.rmse, rtol=2e-6)
+
+        assert_allclose(self.res1.fittedvalues,
+                        self.res_predict['fittedvalues'], atol=5e-7)
 
     def test_other(self):
         tc = self.res1.test_compatibility()

From 70415a88683e9014d2453440b4d0f31268f9b501 Mon Sep 17 00:00:00 2001
From: Josef
Date: Fri, 6 Apr 2018 13:23:12 -0400
Subject: [PATCH 133/157] MAINT: test_glm Lbw, old genfromtxt path for python2

---
 statsmodels/genmod/tests/results/results_glm.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/statsmodels/genmod/tests/results/results_glm.py b/statsmodels/genmod/tests/results/results_glm.py
index 0ebba38dcb8..fb4b175fb00 100644
--- a/statsmodels/genmod/tests/results/results_glm.py
+++ b/statsmodels/genmod/tests/results/results_glm.py
@@ -13,6 +13,8 @@
 from statsmodels.api import add_constant, categorical
 
 # for genfromtxt changes
+import sys
+PY2 = (sys.version_info[0] < 3)
 from distutils.version import LooseVersion
 NUMPY_LT_113 = LooseVersion(np.__version__) < '1.13.0'
 
@@ -691,7 +693,7 @@ def __init__(self):
                                 "stata_lbw_glm.csv")
 
         # https://github.com/statsmodels/statsmodels/pull/4432#issuecomment-379279617
-        if NUMPY_LT_113:
+        if NUMPY_LT_113 or PY2:
             data=np.recfromcsv(open(filename, 'rb'))
             vfunc = np.vectorize(lambda x: x.strip(asbytes("\"")))
             data['race'] = vfunc(data['race'])

From cee0039368973ab30029f53adad0d71b19adbf7a Mon Sep 17 00:00:00 2001
From: Josef
Date: Fri, 6 Apr 2018 13:24:46 -0400
Subject: [PATCH 134/157] REF: boxplot: use asarray in jitter

---
 statsmodels/graphics/boxplots.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/statsmodels/graphics/boxplots.py b/statsmodels/graphics/boxplots.py
index d9f0f4bccc7..bd89b095cc1 100644
--- a/statsmodels/graphics/boxplots.py
+++ b/statsmodels/graphics/boxplots.py
@@ -383,6 +383,7 @@ def beanplot(data, ax=None, labels=None, positions=None, side='both',
 
 def _jitter_envelope(pos_data, xvals, violin, side):
     """Determine envelope for jitter markers."""
+    pos_data = np.asarray(pos_data)
     if side == 'both':
         low, high = (-1., 1.)
elif side == 'right': From e213d072d3bb2d05eb8a0c73416d06fa73886075 Mon Sep 17 00:00:00 2001 From: Josef Date: Fri, 6 Apr 2018 13:37:56 -0400 Subject: [PATCH 135/157] MAINT: test_ros pandas idxmax instead of argmax, fix intend --- statsmodels/imputation/tests/test_ros.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/statsmodels/imputation/tests/test_ros.py b/statsmodels/imputation/tests/test_ros.py index cc319b8bcd4..f8f6c102865 100644 --- a/statsmodels/imputation/tests/test_ros.py +++ b/statsmodels/imputation/tests/test_ros.py @@ -204,7 +204,7 @@ def test_baseline(self): def test_censored_greater_than_max(self): df = self.df.copy() - max_row = df['conc'].argmax() + max_row = df['conc'].idxmax() df.loc[max_row, 'censored'] = True result = ros._ros_sort(df, 'conc', 'censored') pdtest.assert_frame_equal(result, self.expected_with_warning) @@ -254,9 +254,9 @@ def test_empty(self): assert_equal(ros._detection_limit_index(None, self.empty_cohn), 0) def test_populated(self): - assert_equal(ros._detection_limit_index(3.5, self.cohn), 0) - assert_equal(ros._detection_limit_index(6.0, self.cohn), 3) - assert_equal(ros._detection_limit_index(12.0, self.cohn), 5) + assert_equal(ros._detection_limit_index(3.5, self.cohn), 0) + assert_equal(ros._detection_limit_index(6.0, self.cohn), 3) + assert_equal(ros._detection_limit_index(12.0, self.cohn), 5) def test_out_of_bounds(self): with pytest.raises(IndexError): From bf8e9be7444b799def82e3bd1c89aa0c8c5dd24b Mon Sep 17 00:00:00 2001 From: Josef Date: Fri, 6 Apr 2018 16:14:57 -0400 Subject: [PATCH 136/157] REF/BUG boxplots use asarray for data --- statsmodels/graphics/boxplots.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/statsmodels/graphics/boxplots.py b/statsmodels/graphics/boxplots.py index bd89b095cc1..19acbc22ef7 100644 --- a/statsmodels/graphics/boxplots.py +++ b/statsmodels/graphics/boxplots.py @@ -125,6 +125,7 @@ def violinplot(data, ax=None, labels=None, positions=None, side='both', """ fig, ax = utils.create_mpl_ax(ax) + data = list(map(np.asarray, data)) if positions is None: positions = np.arange(len(data)) + 1 @@ -325,6 +326,7 @@ def beanplot(data, ax=None, labels=None, positions=None, side='both', """ fig, ax = utils.create_mpl_ax(ax) + data = list(map(np.asarray, data)) if positions is None: positions = np.arange(len(data)) + 1 @@ -383,7 +385,6 @@ def beanplot(data, ax=None, labels=None, positions=None, side='both', def _jitter_envelope(pos_data, xvals, violin, side): """Determine envelope for jitter markers.""" - pos_data = np.asarray(pos_data) if side == 'both': low, high = (-1., 1.) 
elif side == 'right': From 7afa3ddced85f1485d5d929a8b42795182985060 Mon Sep 17 00:00:00 2001 From: Josef Date: Fri, 6 Apr 2018 18:32:14 -0400 Subject: [PATCH 137/157] TST: improve code in tests --- statsmodels/genmod/tests/results/results_glm.py | 5 +++-- statsmodels/tsa/statespace/mlemodel.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/statsmodels/genmod/tests/results/results_glm.py b/statsmodels/genmod/tests/results/results_glm.py index fb4b175fb00..52902190571 100644 --- a/statsmodels/genmod/tests/results/results_glm.py +++ b/statsmodels/genmod/tests/results/results_glm.py @@ -694,11 +694,12 @@ def __init__(self): # https://github.com/statsmodels/statsmodels/pull/4432#issuecomment-379279617 if NUMPY_LT_113 or PY2: - data=np.recfromcsv(open(filename, 'rb')) + with open(filename, 'rb') as datafile: + data=np.recfromcsv(datafile) vfunc = np.vectorize(lambda x: x.strip(asbytes("\""))) data['race'] = vfunc(data['race']) else: - data = pd.read_csv(filename).to_records() + data = pd.read_csv(filename).to_records(index=False) # categorical does not work with pandas data = categorical(data, col='race', drop=True) self.endog = data.low diff --git a/statsmodels/tsa/statespace/mlemodel.py b/statsmodels/tsa/statespace/mlemodel.py index e554369eab7..14f18574f27 100644 --- a/statsmodels/tsa/statespace/mlemodel.py +++ b/statsmodels/tsa/statespace/mlemodel.py @@ -2608,6 +2608,7 @@ def plot_diagnostics(self, variable=0, lags=10, fig=None, figsize=None): resid_nonmissing = resid[~(np.isnan(resid))] ax = fig.add_subplot(222) # temporarily disable Deprecation warning, normed -> density + # hist needs to use `density` in future when minimum matplotlib has it with warnings.catch_warnings(record=True) as w: ax.hist(resid_nonmissing, normed=True, label='Hist') from scipy.stats import gaussian_kde, norm From f9f8c92874e97c36dd0cf143715845ec5509e317 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Mon, 9 Apr 2018 20:37:20 -0400 Subject: [PATCH 138/157] ENH: Add support for RangeIndexes --- statsmodels/tsa/base/tsa_model.py | 50 +++++++-- statsmodels/tsa/tests/test_tsa_indexes.py | 128 ++++++++++++++++++++++ 2 files changed, 171 insertions(+), 7 deletions(-) diff --git a/statsmodels/tsa/base/tsa_model.py b/statsmodels/tsa/base/tsa_model.py index 883cfde5ceb..d48ade4b4e5 100644 --- a/statsmodels/tsa/base/tsa_model.py +++ b/statsmodels/tsa/base/tsa_model.py @@ -7,12 +7,18 @@ import numpy as np from pandas import (to_datetime, Int64Index, DatetimeIndex, Period, PeriodIndex, Timestamp, Series, Index) +# RangeIndex only introduced in Pandas 0.18, so we include a shim until +# Statsmodels requires that version. 
+try: + from pandas import RangeIndex +except ImportError: + class RangeIndex(object): + pass from pandas.tseries.frequencies import to_offset from statsmodels.base import data import statsmodels.base.model as base import statsmodels.base.wrapper as wrap -from statsmodels.tsa.base import datetools from statsmodels.tools.sm_exceptions import ValueWarning _tsa_doc = """ @@ -204,12 +210,13 @@ def _init_dates(self, dates=None, freq=None): has_index = index is not None date_index = isinstance(index, (DatetimeIndex, PeriodIndex)) int_index = isinstance(index, Int64Index) + range_index = isinstance(index, RangeIndex) has_freq = index.freq is not None if date_index else None increment = Int64Index(np.arange(self.endog.shape[0])) is_increment = index.equals(increment) if int_index else None # Issue warnings for unsupported indexes - if has_index and not (date_index or is_increment): + if has_index and not (date_index or range_index or is_increment): warnings.warn('An unsupported index was provided and will be' ' ignored when e.g. forecasting.', ValueWarning) if date_index and not has_freq: @@ -220,7 +227,8 @@ def _init_dates(self, dates=None, freq=None): # Construct the internal index index_generated = False - if (date_index and has_freq) or (int_index and is_increment): + if ((date_index and has_freq) or (int_index and is_increment) or + range_index): _index = index.copy() else: _index = increment @@ -270,11 +278,25 @@ def _get_index_loc(self, key, base_index=None): index = base_index date_index = isinstance(base_index, (PeriodIndex, DatetimeIndex)) + int_index = isinstance(base_index, Int64Index) + range_index = isinstance(base_index, RangeIndex) index_class = type(base_index) nobs = len(index) + # Special handling for RangeIndex + if range_index and isinstance(key, (int, long, np.integer)): + # Negative indices (that lie in the Index) + if key < 0 and -key <= nobs: + key = nobs + key + # Out-of-sample (note that we include key itself in the new index) + elif key > nobs - 1: + stop = base_index._start + (key + 1) * base_index._step + index = RangeIndex(start=base_index._start, + stop=stop, + step=base_index._step) + # Special handling for Int64Index - if (isinstance(index, Int64Index) and not date_index and + if (not range_index and int_index and not date_index and isinstance(key, (int, long, np.integer))): # Negative indices (that lie in the Index) if key < 0 and -key <= nobs: @@ -319,9 +341,23 @@ def _get_index_loc(self, key, base_index=None): periods=len(index) + 1, freq=base_index.freq) - # Get the location (note that get_loc will throw a KeyError if key is - # invalid) - loc = index.get_loc(key) + # Get the location + if date_index: + # (note that get_loc will throw a KeyError if key is invalid) + loc = index.get_loc(key) + elif int_index or range_index: + # For Int64Index and RangeIndex, key is assumed to be the location + # and not an index value (this assumption is required to support + # RangeIndex) + try: + index[key] + # We want to raise a KeyError in this case, to keep the exception + # consistent across index types + except IndexError as e: + raise KeyError(str(e)) + loc = key + else: + loc = index.get_loc(key) # Check if we now have a modified index index_was_expanded = index is not base_index diff --git a/statsmodels/tsa/tests/test_tsa_indexes.py b/statsmodels/tsa/tests/test_tsa_indexes.py index bcecc8c23de..44a8b5217e1 100644 --- a/statsmodels/tsa/tests/test_tsa_indexes.py +++ b/statsmodels/tsa/tests/test_tsa_indexes.py @@ -17,6 +17,16 @@ import pandas as pd import pytest +# 
RangeIndex only introduced in Pandas 0.18, so we include a shim until +# Statsmodels requires that version. +has_range_index = False +try: + from pandas import RangeIndex + has_range_index = True +except ImportError: + class RangeIndex(object): + pass + from numpy.testing import (assert_allclose, assert_almost_equal, assert_equal, assert_raises) @@ -87,6 +97,11 @@ # Supported increment indexes supported_increment_indexes = [(pd.Int64Index(np.arange(nobs)), None)] +if has_range_index: + supported_increment_indexes += [ + (pd.RangeIndex(start=0, stop=nobs, step=1), None), + (pd.RangeIndex(start=-5, stop=nobs - 5, step=1), None), + (pd.RangeIndex(start=0, stop=nobs * 6, step=6), None)] # Supported date indexes # Only the Int64Index and the `date_indexes` are valid without @@ -307,6 +322,20 @@ def test_instantiation_valid(): assert_equal(mod.data.dates, None) assert_equal(mod.data.freq, None) + if has_range_index: + # RangeIndex (start=0, end=nobs, so equivalent to increment index) + endog = base_endog.copy() + endog.index = supported_increment_indexes[1][0] + + mod = tsa_model.TimeSeriesModel(endog) + assert_equal(type(mod._index) == pd.RangeIndex, True) + assert_equal(mod._index_none, False) + assert_equal(mod._index_dates, False) + assert_equal(mod._index_generated, False) + assert_equal(mod._index_freq, None) + assert_equal(mod.data.dates, None) + assert_equal(mod.data.freq, None) + # Supported indexes *when a freq is given*, should not raise a warning with warnings.catch_warnings(): warnings.simplefilter('error') @@ -710,6 +739,105 @@ def test_prediction_increment_pandas_dates_nanosecond(): assert_equal(prediction_index.equals(desired_index), True) +@pytest.mark.skipif(not has_range_index, reason='No RangeIndex') +def test_range_index(): + tsa_model.__warningregistry__ = {} + + endog = pd.Series(np.random.normal(size=5)) + assert_equal(isinstance(endog.index, pd.RangeIndex), True) + # Warning should not be given + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + mod = tsa_model.TimeSeriesModel(endog) + assert_equal(len(w), 0) + + +@pytest.mark.skipif(not has_range_index, reason='No RangeIndex') +def test_prediction_rangeindex(): + index = supported_increment_indexes[2][0] + endog = pd.Series(dta[0], index=index) + mod = tsa_model.TimeSeriesModel(endog) + + # Basic prediction: [0, end] + start_key = 0 + end_key = None + start, end, out_of_sample, prediction_index = ( + mod._get_prediction_index(start_key, end_key)) + + assert_equal(start, 0) + assert_equal(end, nobs - 1) + assert_equal(out_of_sample, 0) + desired_index = pd.RangeIndex(start=-5, stop=0, step=1) + assert_equal(prediction_index.equals(desired_index), True) + + # Negative index: [-2, end] + start_key = -2 + end_key = -1 + start, end, out_of_sample, prediction_index = ( + mod._get_prediction_index(start_key, end_key)) + + assert_equal(start, 3) + assert_equal(end, 4) + assert_equal(out_of_sample, 0) + desired_index = pd.RangeIndex(start=-2, stop=0, step=1) + assert_equal(prediction_index.equals(desired_index), True) + + # Forecasting: [1, 5] + start_key = 1 + end_key = nobs + start, end, out_of_sample, prediction_index = ( + mod._get_prediction_index(start_key, end_key)) + + assert_equal(start, 1) + assert_equal(end, 4) + assert_equal(out_of_sample, 1) + desired_index = pd.RangeIndex(start=-4, stop=1, step=1) + assert_equal(prediction_index.equals(desired_index), True) + + +@pytest.mark.skipif(not has_range_index, reason='No RangeIndex') +def test_prediction_rangeindex_withstep(): + index = 
supported_increment_indexes[3][0] + endog = pd.Series(dta[0], index=index) + mod = tsa_model.TimeSeriesModel(endog) + + # Basic prediction: [0, end] + start_key = 0 + end_key = None + start, end, out_of_sample, prediction_index = ( + mod._get_prediction_index(start_key, end_key)) + + assert_equal(start, 0) + assert_equal(end, nobs - 1) + assert_equal(out_of_sample, 0) + desired_index = pd.RangeIndex(start=0, stop=nobs * 6, step=6) + assert_equal(prediction_index.equals(desired_index), True) + + # Negative index: [-2, end] + start_key = -2 + end_key = -1 + start, end, out_of_sample, prediction_index = ( + mod._get_prediction_index(start_key, end_key)) + + assert_equal(start, 3) + assert_equal(end, 4) + assert_equal(out_of_sample, 0) + desired_index = pd.RangeIndex(start=3 * 6, stop=nobs * 6, step=6) + assert_equal(prediction_index.equals(desired_index), True) + + # Forecasting: [1, 5] + start_key = 1 + end_key = nobs + start, end, out_of_sample, prediction_index = ( + mod._get_prediction_index(start_key, end_key)) + + assert_equal(start, 1) + assert_equal(end, 4) + assert_equal(out_of_sample, 1) + desired_index = pd.RangeIndex(start=1 * 6, stop=(nobs + 1) * 6, step=6) + assert_equal(prediction_index.equals(desired_index), True) + + def test_custom_index(): tsa_model.__warningregistry__ = {} From 9381a58cde7d010e193fb26dfd836657edbc81ae Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Wed, 11 Apr 2018 21:42:54 -0400 Subject: [PATCH 139/157] TST: Test (fails) sim. smth. w/ state intercept --- .../tests/test_simulation_smoothing.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py b/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py index e46137ed5eb..8e24e9b5dd6 100644 --- a/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py +++ b/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py @@ -623,3 +623,19 @@ def test_simulation_smoothing_obs_intercept(): sim.simulate(disturbance_variates=np.zeros(mod.nobs * 2), initial_state_variates=np.zeros(1)) assert_equal(sim.simulated_state[0], 0) + + +def test_simulation_smoothing_state_intercept(): + nobs = 10 + intercept = 100 + endog = np.ones(nobs) * intercept + + mod = sarimax.SARIMAX(endog, order=(0, 0, 0), trend='c', + measurement_error=True) + mod.initialize_known([100], [[0]]) + mod.update([intercept, 1., 1.]) + + sim = mod.simulation_smoother() + sim.simulate(disturbance_variates=np.zeros(mod.nobs * 2), + initial_state_variates=np.zeros(1)) + assert_equal(sim.simulated_state[0], intercept) From f9569988195b687032501efb9e024fe2edd75321 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Wed, 11 Apr 2018 21:46:32 -0400 Subject: [PATCH 140/157] BUG: Fix sim. smoothing w/ state intercept. 
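
The simulated model that the simulation smoother draws from zeroed out the
observation intercept but not the state intercept, so for a model with a
nonzero state intercept the intercept's effect could be double counted and
the simulated states came out offset. A minimal sketch of the symptom,
adapted from the test added in the previous commit (the constant value 100
is illustrative only):

    mod = sarimax.SARIMAX(np.ones(10) * 100, order=(0, 0, 0), trend='c',
                          measurement_error=True)
    mod.initialize_known([100], [[0]])
    mod.update([100., 1., 1.])
    sim = mod.simulation_smoother()
    sim.simulate(disturbance_variates=np.zeros(mod.nobs * 2),
                 initial_state_variates=np.zeros(1))
    # with all variates fixed at zero, the smoothed simulation should
    # reproduce the intercept; before this fix sim.simulated_state[0]
    # was not equal to 100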
--- statsmodels/tsa/statespace/_simulation_smoother.pyx.in | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/statsmodels/tsa/statespace/_simulation_smoother.pyx.in b/statsmodels/tsa/statespace/_simulation_smoother.pyx.in index 9777908d1e2..ea63cbfff01 100644 --- a/statsmodels/tsa/statespace/_simulation_smoother.pyx.in +++ b/statsmodels/tsa/statespace/_simulation_smoother.pyx.in @@ -212,6 +212,8 @@ cdef class {{prefix}}SimulationSmoother(object): else: dim2[0] = self.model.k_endog; dim2[1] = self.model.obs_intercept.shape[1]; self.simulated_model.obs_intercept = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN) + dim2[0] = self.model.k_states; dim2[1] = self.model.state_intercept.shape[1]; + self.simulated_model.state_intercept = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN) # Initialize the simulated model memoryviews @@ -388,7 +390,9 @@ cdef class {{prefix}}SimulationSmoother(object): self.cholesky(&self.model.initial_state_cov[0,0], self._tmp0, k_states) if not self.pretransformed_initial_state_variates: self.transform_variates(&self.generated_state[0,0], self._tmp0, k_states) - blas.{{prefix}}axpy(&k_states, &alpha, &self.model.initial_state[0], &inc, &self.generated_state[0,0], &inc) + # In the case of no missing data, we want to keep the initial state at zero + if self.has_missing: + blas.{{prefix}}axpy(&k_states, &alpha, &self.model.initial_state[0], &inc, &self.generated_state[0,0], &inc) self.simulated_kfilter.seek(0) # reset the filter From eb8f076a90ffb48424cf298c7af4520c8f67129d Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 12 Apr 2018 18:35:22 -0700 Subject: [PATCH 141/157] substitutions --- statsmodels/discrete/discrete_model.py | 52 ++++++++++++++------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/statsmodels/discrete/discrete_model.py b/statsmodels/discrete/discrete_model.py index 4f8b1f7e590..97beaf10be7 100644 --- a/statsmodels/discrete/discrete_model.py +++ b/statsmodels/discrete/discrete_model.py @@ -2543,14 +2543,16 @@ def _score_nbin(self, params, Q=0): y = self.endog[:,None] mu = self.predict(params)[:,None] a1 = 1/alpha * mu**Q + prob = a1 / (a1 + mu) # a1 aka "size" in _ll_nbin if Q: # nb1 - dparams = exog*mu/alpha*(np.log(1/(alpha + 1)) + + # Q == 1 --> a1 = mu / alpha --> prob = 1 / (alpha + 1) + dparams = exog * a1 * (np.log(prob) + special.digamma(y + mu/alpha) - special.digamma(mu/alpha)) - dalpha = ((alpha*(y - mu*np.log(1/(alpha + 1)) - + dalpha = ((alpha * (y - mu * np.log(prob) - mu*(special.digamma(y + mu/alpha) - special.digamma(mu/alpha) + 1)) - - mu*(np.log(1/(alpha + 1)) + + mu * (np.log(prob) + special.digamma(y + mu/alpha) - special.digamma(mu/alpha)))/ (alpha**2*(alpha + 1))).sum() @@ -2559,7 +2561,7 @@ def _score_nbin(self, params, Q=0): dparams = exog*a1 * (y-mu)/(mu+a1) da1 = -alpha**-2 dalpha = (special.digamma(a1+y) - special.digamma(a1) + np.log(a1) - - np.log(a1+mu) - (a1+y)/(a1+mu) + 1).sum()*da1 + - np.log(a1+mu) - (y-mu)/(a1+mu)).sum() * da1 #multiply above by constant outside sum to reduce rounding error if self._transparams: @@ -2605,20 +2607,21 @@ def _hessian_nb1(self, params): mu = self.predict(params)[:,None] a1 = mu/alpha + prob = 1 / (1 + alpha) # equiv: a1 / (a1 + mu) # for dl/dparams dparams dim = exog.shape[1] hess_arr = np.empty((dim+1,dim+1)) #const_arr = a1*mu*(a1+y)/(mu+a1)**2 # not all of dparams - dparams = exog/alpha*(np.log(1/(alpha + 1)) + - special.digamma(y + mu/alpha) - - special.digamma(mu/alpha)) + dparams = exog / alpha * (np.log(prob) + + special.digamma(y + 
a1) - + special.digamma(a1)) dmudb = exog*mu - xmu_alpha = exog*mu/alpha - trigamma = (special.polygamma(1, mu/alpha + y) - - special.polygamma(1, mu/alpha)) + xmu_alpha = exog * a1 + trigamma = (special.polygamma(1, a1 + y) - + special.polygamma(1, a1)) for i in range(dim): for j in range(dim): if j > i: @@ -2631,27 +2634,27 @@ def _hessian_nb1(self, params): # for dl/dparams dalpha da1 = -alpha**-2 - dldpda = np.sum(-mu/alpha * dparams + exog*mu/alpha * - (-trigamma*mu/alpha**2 - 1/(alpha+1)), axis=0) + dldpda = np.sum(-a1 * dparams + exog * a1 * + (-trigamma*mu/alpha**2 - prob), axis=0) hess_arr[-1,:-1] = dldpda hess_arr[:-1,-1] = dldpda # for dl/dalpha dalpha - digamma_part = (special.digamma(y + mu/alpha) - - special.digamma(mu/alpha)) + digamma_part = (special.digamma(y + a1) - + special.digamma(a1)) - log_alpha = np.log(1/(alpha+1)) + log_alpha = np.log(prob) alpha3 = alpha**3 alpha2 = alpha**2 mu2 = mu**2 dada = ((alpha3*mu*(2*log_alpha + 2*digamma_part + 3) - - 2*alpha3*y + alpha2*mu2*trigamma + + 2*alpha3*y + 4*alpha2*mu*(log_alpha + digamma_part) + alpha2 * (2*mu - y) + - 2*alpha*mu2*trigamma + - 2*alpha*mu*(log_alpha + digamma_part) + - mu2*trigamma)/(alpha**4*(alpha2 + 2*alpha + 1))) + 2*alpha*mu2*trigamma + mu2 * trigamma + alpha2 * mu2 * trigamma + 2*alpha*mu*(log_alpha + digamma_part) + )/(alpha**4*(alpha2 + 2*alpha + 1))) hess_arr[-1,-1] = dada.sum() return hess_arr @@ -2670,6 +2673,7 @@ def _hessian_nb2(self, params): exog = self.exog y = self.endog[:,None] mu = self.predict(params)[:,None] + prob = a1 / (a1 + mu) # for dl/dparams dparams dim = exog.shape[1] @@ -2686,7 +2690,7 @@ def _hessian_nb2(self, params): # for dl/dparams dalpha da1 = -alpha**-2 - dldpda = np.sum(mu*exog*(y-mu)*da1/(mu+a1)**2 , axis=0) + dldpda = -np.sum(mu*exog*(y-mu)*a1**2/(mu+a1)**2 , axis=0) hess_arr[-1,:-1] = dldpda hess_arr[:-1,-1] = dldpda @@ -2694,7 +2698,7 @@ def _hessian_nb2(self, params): #NOTE: polygamma(1,x) is the trigamma function da2 = 2*alpha**-3 dalpha = da1 * (special.digamma(a1+y) - special.digamma(a1) + - np.log(a1) - np.log(a1+mu) - (a1+y)/(a1+mu) + 1) + np.log(prob) - (y - mu)/(a1+mu)) dada = (da2 * dalpha/da1 + da1**2 * (special.polygamma(1, a1+y) - special.polygamma(1, a1) + 1/a1 - 1/(a1 + mu) + (y - mu)/(mu + a1)**2)).sum() @@ -2963,8 +2967,8 @@ def score_obs(self, params): a4 = p * a1 / mu dparams = ((a4 * (digamma(a3) - digamma(a1)) - - (1 + a4) * a3 / a2) + - y / mu + a4 * (1 + np.log(a1) - np.log(a2))) + a3 / a2) + + y / mu + a4 * (1 - a3 / a2 + np.log(a1 / a2))) dparams = (self.exog.T * mu * dparams).T dalpha = (-a1 / alpha * (digamma(a3) - digamma(a1) + @@ -3055,7 +3059,7 @@ def hessian(self, params): p * (a3 / mu + a4) / a2 + a4 * (polygamma(1, a1) - polygamma(1, a3))) / alpha).sum(axis=1) - da2 = (a1 * (2 * np.log(a1) - 2 * np.log(a2) - + da2 = (a1 * (2 * np.log(a1 / a2) - 2 * digamma(a1) + 2 *digamma(a3) + 3 - 2 * a3 / a2 - a1 * polygamma(1, a1) + a1 * polygamma(1, a3) - 2 * a1 / a2 + From 33330d5c091579f1f8c1dd57afb10603aa32ad5d Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 12 Apr 2018 19:17:04 -0700 Subject: [PATCH 142/157] typo fixup --- statsmodels/discrete/discrete_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/discrete/discrete_model.py b/statsmodels/discrete/discrete_model.py index 97beaf10be7..7e1b08ba782 100644 --- a/statsmodels/discrete/discrete_model.py +++ b/statsmodels/discrete/discrete_model.py @@ -2652,7 +2652,7 @@ def _hessian_nb1(self, params): 2*alpha3*y + 4*alpha2*mu*(log_alpha + digamma_part) + 
alpha2 * (2*mu - y) + - 2*alpha*mu2*trigamma + mu2 * trigamma + alpha2 * mu2 * trigamma + 2*alpha*mu2*trigamma + mu2*trigamma + alpha2*mu2*trigamma + 2*alpha*mu*(log_alpha + digamma_part) )/(alpha**4*(alpha2 + 2*alpha + 1))) hess_arr[-1,-1] = dada.sum() From f6020e49d6eec3e404e791bc15ffa22246f17611 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 12 Apr 2018 20:18:40 -0700 Subject: [PATCH 143/157] check for Q == 1 explicitly --- statsmodels/discrete/discrete_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/statsmodels/discrete/discrete_model.py b/statsmodels/discrete/discrete_model.py index 7e1b08ba782..51868085572 100644 --- a/statsmodels/discrete/discrete_model.py +++ b/statsmodels/discrete/discrete_model.py @@ -2544,7 +2544,7 @@ def _score_nbin(self, params, Q=0): mu = self.predict(params)[:,None] a1 = 1/alpha * mu**Q prob = a1 / (a1 + mu) # a1 aka "size" in _ll_nbin - if Q: # nb1 + if Q == 1: # nb1 # Q == 1 --> a1 = mu / alpha --> prob = 1 / (alpha + 1) dparams = exog * a1 * (np.log(prob) + special.digamma(y + mu/alpha) - @@ -2557,7 +2557,7 @@ def _score_nbin(self, params, Q=0): special.digamma(mu/alpha)))/ (alpha**2*(alpha + 1))).sum() - else: # nb2 + elif Q == 0: # nb2 dparams = exog*a1 * (y-mu)/(mu+a1) da1 = -alpha**-2 dalpha = (special.digamma(a1+y) - special.digamma(a1) + np.log(a1) From c7930c7c8ebbd369eff6eda719d6fcf1c63fcaa5 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Thu, 12 Apr 2018 23:47:05 -0400 Subject: [PATCH 144/157] REF: Let Pandas select Int64Index vs RangeIndex --- statsmodels/tsa/base/tsa_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/tsa/base/tsa_model.py b/statsmodels/tsa/base/tsa_model.py index d48ade4b4e5..8e9cf8aab2a 100644 --- a/statsmodels/tsa/base/tsa_model.py +++ b/statsmodels/tsa/base/tsa_model.py @@ -212,7 +212,7 @@ def _init_dates(self, dates=None, freq=None): int_index = isinstance(index, Int64Index) range_index = isinstance(index, RangeIndex) has_freq = index.freq is not None if date_index else None - increment = Int64Index(np.arange(self.endog.shape[0])) + increment = Index(range(self.endog.shape[0])) is_increment = index.equals(increment) if int_index else None # Issue warnings for unsupported indexes From 20849c028e7f644bf1a015e716e62cd5b344406c Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Thu, 12 Apr 2018 23:47:19 -0400 Subject: [PATCH 145/157] =?UTF-8?q?REF:=20Don=E2=80=99t=20copy=20indexes.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- statsmodels/tsa/base/tsa_model.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/statsmodels/tsa/base/tsa_model.py b/statsmodels/tsa/base/tsa_model.py index 8e9cf8aab2a..0cd7d6efeba 100644 --- a/statsmodels/tsa/base/tsa_model.py +++ b/statsmodels/tsa/base/tsa_model.py @@ -229,7 +229,7 @@ def _init_dates(self, dates=None, freq=None): if ((date_index and has_freq) or (int_index and is_increment) or range_index): - _index = index.copy() + _index = index else: _index = increment index_generated = True @@ -362,10 +362,6 @@ def _get_index_loc(self, key, base_index=None): # Check if we now have a modified index index_was_expanded = index is not base_index - # (Never return the actual index object) - if not index_was_expanded: - index = index.copy() - # Return the index through the end of the loc / slice if isinstance(loc, slice): end = loc.stop From eab81b6591dbb0b434f0694153a18ae4ded87fae Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Fri, 13 Apr 2018 19:45:58 
-0400
Subject: [PATCH 146/157] REF: No need to create shim class in test.

---
 statsmodels/tsa/tests/test_tsa_indexes.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/statsmodels/tsa/tests/test_tsa_indexes.py b/statsmodels/tsa/tests/test_tsa_indexes.py
index 44a8b5217e1..6f7f42d1dc7 100644
--- a/statsmodels/tsa/tests/test_tsa_indexes.py
+++ b/statsmodels/tsa/tests/test_tsa_indexes.py
@@ -24,8 +24,7 @@
     from pandas import RangeIndex
     has_range_index = True
 except ImportError:
-    class RangeIndex(object):
-        pass
+    pass
 
 from numpy.testing import (assert_allclose, assert_almost_equal,
                            assert_equal, assert_raises)

From 2169f865bc95bd55e2e002aa00748d42e6d93a27 Mon Sep 17 00:00:00 2001
From: Chad Fulton
Date: Fri, 13 Apr 2018 19:47:39 -0400
Subject: [PATCH 147/157] BUG: Catch all RangeIndex indexing exceptions.

---
 statsmodels/tsa/base/tsa_model.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/statsmodels/tsa/base/tsa_model.py b/statsmodels/tsa/base/tsa_model.py
index 0cd7d6efeba..c15df4f3348 100644
--- a/statsmodels/tsa/base/tsa_model.py
+++ b/statsmodels/tsa/base/tsa_model.py
@@ -352,8 +352,15 @@ def _get_index_loc(self, key, base_index=None):
             try:
                 index[key]
             # We want to raise a KeyError in this case, to keep the exception
-            # consistent across index types
-            except IndexError as e:
+            # consistent across index types.
+            # - Attempting to index with an out-of-bound location (e.g.
+            #   index[10] on an index of length 9) will raise an IndexError
+            #   (as of Pandas 0.22)
+            # - Attempting to index with a type that cannot be cast to integer
+            #   (e.g. a non-numeric string) will raise a ValueError if the
+            #   index is RangeIndex (otherwise will raise an IndexError)
+            #   (as of Pandas 0.22)
+            except IndexError, ValueError as e:
                 raise KeyError(str(e))
             loc = key
         else:

From a759e5783cef1dc9331492254a50f331166eb39a Mon Sep 17 00:00:00 2001
From: Chad Fulton
Date: Fri, 13 Apr 2018 19:47:53 -0400
Subject: [PATCH 148/157] =?UTF-8?q?TST/DOC:=20Include=20RangeIndex=20in=20?=
 =?UTF-8?q?list=20of=20valid=20ix=E2=80=99s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 statsmodels/tsa/tests/test_tsa_indexes.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/statsmodels/tsa/tests/test_tsa_indexes.py b/statsmodels/tsa/tests/test_tsa_indexes.py
index 6f7f42d1dc7..92b252fc6ce 100644
--- a/statsmodels/tsa/tests/test_tsa_indexes.py
+++ b/statsmodels/tsa/tests/test_tsa_indexes.py
@@ -150,10 +150,11 @@ def test_instantiation_valid():
     #
     # Each pandas index (of `endog`, `exog`, or passed to `dates`) can be:
     #   0. None
-    #   1. Int64Index with values exactly equal to 0, 1, ..., nobs-1
-    #   2. DatetimeIndex with frequency
-    #   3. PeriodIndex with frequency
-    #   4. Anything that doesn't fall into the above categories also should
+    #   1. RangeIndex (if applicable; i.e. if Pandas >= 0.18)
+    #   2. Int64Index with values exactly equal to 0, 1, ..., nobs-1
+    #   3. DatetimeIndex with frequency
+    #   4. PeriodIndex with frequency
+    #   5. Anything that doesn't fall into the above categories also should
     #      only raise an exception if it was passed to dates, and may trigger
     #      a warning otherwise.
# From 28c3c5f2d6a9651a7d764f789767d3d46f1015d1 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Fri, 13 Apr 2018 19:48:49 -0400 Subject: [PATCH 149/157] TST: Allow default index to be Int64 or Range (necessary to handle all of the Py2 and Py3 / Pandas < 0.18 and Pandas >= 0.18 combinations) --- statsmodels/tsa/tests/test_tsa_indexes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/statsmodels/tsa/tests/test_tsa_indexes.py b/statsmodels/tsa/tests/test_tsa_indexes.py index 92b252fc6ce..bd126976da6 100644 --- a/statsmodels/tsa/tests/test_tsa_indexes.py +++ b/statsmodels/tsa/tests/test_tsa_indexes.py @@ -208,7 +208,8 @@ def test_instantiation_valid(): warnings.simplefilter('error') mod = tsa_model.TimeSeriesModel(endog) - assert_equal(type(mod._index) == pd.Int64Index, True) + assert_equal(isinstance(mod._index, + (pd.Int64Index, pd.RangeIndex)), True) assert_equal(mod._index_none, True) assert_equal(mod._index_dates, False) assert_equal(mod._index_generated, True) From 708d5f771644806cd7da69a389223e353a381d52 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Fri, 13 Apr 2018 19:52:01 -0400 Subject: [PATCH 150/157] TST/DOC: Add descriptive note on index test cases --- statsmodels/tsa/tests/test_tsa_indexes.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/statsmodels/tsa/tests/test_tsa_indexes.py b/statsmodels/tsa/tests/test_tsa_indexes.py index bd126976da6..7a7b778f64a 100644 --- a/statsmodels/tsa/tests/test_tsa_indexes.py +++ b/statsmodels/tsa/tests/test_tsa_indexes.py @@ -492,6 +492,9 @@ def test_prediction_increment_unsupported(): warnings.simplefilter('ignore') mod = tsa_model.TimeSeriesModel(endog) + # Tests three common use cases: basic prediction, negative indexes, and + # out-of-sample indexes. + # Basic prediction: [0, end]; notice that since this is an in-sample # prediction, the index returned is the (unsupported) original index start_key = 0 @@ -542,6 +545,9 @@ def test_prediction_increment_nonpandas(): endog = dta[0] mod = tsa_model.TimeSeriesModel(endog) + # Tests three common use cases: basic prediction, negative indexes, and + # out-of-sample indexes. + # Basic prediction: [0, end]; since there was no index at all and the data # is not Pandas, the returned prediction_index is None start_key = 0 @@ -583,6 +589,9 @@ def test_prediction_increment_pandas_noindex(): endog = dta[2].copy() mod = tsa_model.TimeSeriesModel(endog) + # Tests three common use cases: basic prediction, negative indexes, and + # out-of-sample indexes. + # Basic prediction: [0, end]; since there was no index and the data is # Pandas, the index is the generated incrementing index, and no warning is # issued @@ -629,6 +638,9 @@ def test_prediction_increment_pandas_dates(): endog.index = date_indexes[0][0] # Daily, 1950-01-01, 1950-01-02, ... mod = tsa_model.TimeSeriesModel(endog) + # Tests three common use cases: basic prediction, negative indexes, and + # out-of-sample indexes. + # Basic prediction: [0, end]; the index is the date index start_key = 0 end_key = None @@ -690,6 +702,9 @@ def test_prediction_increment_pandas_dates_nanosecond(): except: raise SkipTest + # Tests three common use cases: basic prediction, negative indexes, and + # out-of-sample indexes. 
+ # Basic prediction: [0, end]; the index is the date index start_key = 0 end_key = None @@ -759,6 +774,9 @@ def test_prediction_rangeindex(): endog = pd.Series(dta[0], index=index) mod = tsa_model.TimeSeriesModel(endog) + # Tests three common use cases: basic prediction, negative indexes, and + # out-of-sample indexes. + # Basic prediction: [0, end] start_key = 0 end_key = None @@ -802,6 +820,9 @@ def test_prediction_rangeindex_withstep(): endog = pd.Series(dta[0], index=index) mod = tsa_model.TimeSeriesModel(endog) + # Tests three common use cases: basic prediction, negative indexes, and + # out-of-sample indexes. + # Basic prediction: [0, end] start_key = 0 end_key = None From e7d19dc8413e465310382635968c1e0bca8eec26 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Fri, 13 Apr 2018 20:23:53 -0400 Subject: [PATCH 151/157] =?UTF-8?q?BUG:=20Syntax=20error=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- statsmodels/tsa/base/tsa_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/tsa/base/tsa_model.py b/statsmodels/tsa/base/tsa_model.py index c15df4f3348..2c00f5f5855 100644 --- a/statsmodels/tsa/base/tsa_model.py +++ b/statsmodels/tsa/base/tsa_model.py @@ -360,7 +360,7 @@ def _get_index_loc(self, key, base_index=None): # (e.g. a non-numeric string) will raise a ValueError if the # index is RangeIndex (otherwise will raise an IndexError) # (as of Pandas 0.22) - except IndexError, ValueError as e: + except (IndexError, ValueError) as e: raise KeyError(str(e)) loc = key else: From 0b1bb281efdf3ccaddea6b1741e2f4e11b68c772 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Fri, 13 Apr 2018 21:01:22 -0400 Subject: [PATCH 152/157] TST: Allow Range or Int64 index w/ unsupported --- statsmodels/tsa/tests/test_tsa_indexes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/statsmodels/tsa/tests/test_tsa_indexes.py b/statsmodels/tsa/tests/test_tsa_indexes.py index 7a7b778f64a..b2d0cfb91b3 100644 --- a/statsmodels/tsa/tests/test_tsa_indexes.py +++ b/statsmodels/tsa/tests/test_tsa_indexes.py @@ -450,7 +450,8 @@ def test_instantiation_valid(): endog = base_endog.copy() endog.index = ix mod = tsa_model.TimeSeriesModel(endog) - assert_equal(type(mod._index) == pd.Int64Index, True) + assert_equal(isinstance(mod._index, + (pd.Int64Index, pd.RangeIndex)), True) assert_equal(mod._index_none, False) assert_equal(mod._index_dates, False) assert_equal(mod._index_generated, True) From 893f1d5c98c354709cd0de8af9f353ffe27d2496 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Fri, 13 Apr 2018 21:27:51 -0400 Subject: [PATCH 153/157] TST: Allow Range or Int64 index w/ unsupported (2) --- statsmodels/tsa/tests/test_tsa_indexes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/statsmodels/tsa/tests/test_tsa_indexes.py b/statsmodels/tsa/tests/test_tsa_indexes.py index b2d0cfb91b3..ff353a69395 100644 --- a/statsmodels/tsa/tests/test_tsa_indexes.py +++ b/statsmodels/tsa/tests/test_tsa_indexes.py @@ -428,7 +428,8 @@ def test_instantiation_valid(): endog = base_endog.copy() endog.index = ix mod = tsa_model.TimeSeriesModel(endog) - assert_equal(type(mod._index) == pd.Int64Index, True) + assert_equal(isinstance(mod._index, + (pd.Int64Index, pd.RangeIndex)), True) assert_equal(mod._index_none, False) assert_equal(mod._index_dates, False) assert_equal(mod._index_generated, True) From ef4a07b8749001a4f73738caa23644696399d909 Mon Sep 17 00:00:00 2001 From: Chad Fulton Date: Fri, 13 Apr 2018 21:55:51 -0400 
Subject: [PATCH 154/157] REF: Put back RangeIndex shim. --- statsmodels/tsa/tests/test_tsa_indexes.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/statsmodels/tsa/tests/test_tsa_indexes.py b/statsmodels/tsa/tests/test_tsa_indexes.py index ff353a69395..e46862554b0 100644 --- a/statsmodels/tsa/tests/test_tsa_indexes.py +++ b/statsmodels/tsa/tests/test_tsa_indexes.py @@ -24,7 +24,8 @@ from pandas import RangeIndex has_range_index = True except ImportError: - pass + class RangeIndex(object): + pass from numpy.testing import (assert_allclose, assert_almost_equal, assert_equal, assert_raises) @@ -209,7 +210,7 @@ def test_instantiation_valid(): mod = tsa_model.TimeSeriesModel(endog) assert_equal(isinstance(mod._index, - (pd.Int64Index, pd.RangeIndex)), True) + (pd.Int64Index, RangeIndex)), True) assert_equal(mod._index_none, True) assert_equal(mod._index_dates, False) assert_equal(mod._index_generated, True) @@ -329,7 +330,7 @@ def test_instantiation_valid(): endog.index = supported_increment_indexes[1][0] mod = tsa_model.TimeSeriesModel(endog) - assert_equal(type(mod._index) == pd.RangeIndex, True) + assert_equal(type(mod._index) == RangeIndex, True) assert_equal(mod._index_none, False) assert_equal(mod._index_dates, False) assert_equal(mod._index_generated, False) @@ -429,7 +430,7 @@ def test_instantiation_valid(): endog.index = ix mod = tsa_model.TimeSeriesModel(endog) assert_equal(isinstance(mod._index, - (pd.Int64Index, pd.RangeIndex)), True) + (pd.Int64Index, RangeIndex)), True) assert_equal(mod._index_none, False) assert_equal(mod._index_dates, False) assert_equal(mod._index_generated, True) @@ -452,7 +453,7 @@ def test_instantiation_valid(): endog.index = ix mod = tsa_model.TimeSeriesModel(endog) assert_equal(isinstance(mod._index, - (pd.Int64Index, pd.RangeIndex)), True) + (pd.Int64Index, RangeIndex)), True) assert_equal(mod._index_none, False) assert_equal(mod._index_dates, False) assert_equal(mod._index_generated, True) From 7677a9afc9c07dfbeda14c493a1ceb511e8c2cb7 Mon Sep 17 00:00:00 2001 From: Josef Date: Mon, 16 Apr 2018 12:20:13 -0400 Subject: [PATCH 155/157] BUG: allow list exog in get_prediction, closes #4437 --- statsmodels/genmod/_prediction.py | 5 +-- statsmodels/regression/_prediction.py | 5 +-- statsmodels/regression/tests/test_predict.py | 43 +++++++++++++++++++- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/statsmodels/genmod/_prediction.py b/statsmodels/genmod/_prediction.py index bad7430a0fc..617389aeb8d 100644 --- a/statsmodels/genmod/_prediction.py +++ b/statsmodels/genmod/_prediction.py @@ -186,9 +186,8 @@ def get_prediction_glm(self, exog=None, transform=True, weights=None, if exog is not None: if row_labels is None: - if hasattr(exog, 'index'): - row_labels = exog.index - else: + row_labels = getattr(exog, 'index', None) + if callable(row_labels): row_labels = None exog = np.asarray(exog) diff --git a/statsmodels/regression/_prediction.py b/statsmodels/regression/_prediction.py index 5cb759f9ba4..26bd90387c6 100644 --- a/statsmodels/regression/_prediction.py +++ b/statsmodels/regression/_prediction.py @@ -130,9 +130,8 @@ def get_prediction(self, exog=None, transform=True, weights=None, if exog is not None: if row_labels is None: - if hasattr(exog, 'index'): - row_labels = exog.index - else: + row_labels = getattr(exog, 'index', None) + if callable(row_labels): row_labels = None exog = np.asarray(exog) diff --git a/statsmodels/regression/tests/test_predict.py b/statsmodels/regression/tests/test_predict.py index 
f8b169a2c7e..f85cd4e5d3a 100644
--- a/statsmodels/regression/tests/test_predict.py
+++ b/statsmodels/regression/tests/test_predict.py
@@ -92,7 +92,6 @@ def test_predict_se():
     np.testing.assert_allclose(iv_l, res3.fittedvalues[:3] - ci_half[:3],
                                rtol=1e-12)
 
-
     #use wrong size for exog
     #prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-1,0], weights=3.)
     np.testing.assert_raises(ValueError, wls_prediction_std, res3, x2[-1,0],
@@ -158,6 +157,27 @@ def test_ci(self):
         sf2 = pred_res2.summary_frame()
         assert_equal(sf2.columns.tolist(), col_names)
 
+        # check that list works, issue 4437
+        x = res_wls.model.exog.mean(0)
+        pred_res3 = res_wls.get_prediction(x)
+        ci3 = pred_res3.conf_int(obs=True)
+        pred_res3b = res_wls.get_prediction(x.tolist())
+        ci3b = pred_res3b.conf_int(obs=True)
+        assert_allclose(pred_res3b.se_obs, pred_res3.se_obs, rtol=1e-13)
+        assert_allclose(ci3b, ci3, rtol=1e-13)
+        res_df = pred_res3b.summary_frame()
+        assert_equal(res_df.index.values, [0])
+
+        x = res_wls.model.exog[-2:]
+        pred_res3 = res_wls.get_prediction(x)
+        ci3 = pred_res3.conf_int(obs=True)
+        pred_res3b = res_wls.get_prediction(x.tolist())
+        ci3b = pred_res3b.conf_int(obs=True)
+        assert_allclose(pred_res3b.se_obs, pred_res3.se_obs, rtol=1e-13)
+        assert_allclose(ci3b, ci3, rtol=1e-13)
+        res_df = pred_res3b.summary_frame()
+        assert_equal(res_df.index.values, [0, 1])
+
     def test_glm(self):
         # prelimnimary, getting started with basic test for GLM.get_prediction
@@ -217,3 +237,24 @@ def test_glm(self):
         # prediction with exog and no weights does not error
         res_glm = mod_glm.fit()
         pred_glm = res_glm.get_prediction(X)
+
+        # check that list works, issue 4437
+        x = res_glm.model.exog.mean(0)
+        pred_res3 = res_glm.get_prediction(x)
+        ci3 = pred_res3.conf_int()
+        pred_res3b = res_glm.get_prediction(x.tolist())
+        ci3b = pred_res3b.conf_int()
+        assert_allclose(pred_res3b.se_mean, pred_res3.se_mean, rtol=1e-13)
+        assert_allclose(ci3b, ci3, rtol=1e-13)
+        res_df = pred_res3b.summary_frame()
+        assert_equal(res_df.index.values, [0])
+
+        x = res_glm.model.exog[-2:]
+        pred_res3 = res_glm.get_prediction(x)
+        ci3 = pred_res3.conf_int()
+        pred_res3b = res_glm.get_prediction(x.tolist())
+        ci3b = pred_res3b.conf_int()
+        assert_allclose(pred_res3b.se_mean, pred_res3.se_mean, rtol=1e-13)
+        assert_allclose(ci3b, ci3, rtol=1e-13)
+        res_df = pred_res3b.summary_frame()
+        assert_equal(res_df.index.values, [0, 1])

From 379e0e785b52c7af3c016b45f27dd7f89e167304 Mon Sep 17 00:00:00 2001
From: Josef
Date: Mon, 16 Apr 2018 21:04:29 -0400
Subject: [PATCH 156/157] BUG: coint use autolag in adfuller call, closes #4490

---
 statsmodels/tsa/stattools.py            | 13 ++++++++++++-
 statsmodels/tsa/tests/test_stattools.py |  7 +++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/statsmodels/tsa/stattools.py b/statsmodels/tsa/stattools.py
index cce05f98d18..791c1614a60 100644
--- a/statsmodels/tsa/stattools.py
+++ b/statsmodels/tsa/stattools.py
@@ -933,6 +933,10 @@ def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic',
     Constant or trend is included in 1st stage regression, i.e. in
     cointegrating equation.
 
+    **Warning:** The autolag default has changed compared to statsmodels 0.8.
+    In 0.8 autolag was always None, now the keyword is used and defaults to
+    'aic'. Use `autolag=None` to avoid the lag search.
+ Parameters ---------- y1 : array_like, 1d @@ -952,6 +956,13 @@ def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic', keyword for `adfuller`, largest or given number of lags autolag : string keyword for `adfuller`, lag selection criterion. + * if None, then maxlag lags are used without lag search + * if 'AIC' (default) or 'BIC', then the number of lags is chosen + to minimize the corresponding information criterion + * 't-stat' based choice of maxlag. Starts with maxlag and drops a + lag until the t-statistic on the last lag length is significant + using a 5%-sized test + return_results : bool for future compatibility, currently only tuple available. If True, then a results instance is returned. Otherwise, a tuple @@ -1017,7 +1028,7 @@ def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic', res_co = OLS(y0, xx).fit() if res_co.rsquared < 1 - 100 * SQRTEPS: - res_adf = adfuller(res_co.resid, maxlag=maxlag, autolag=None, + res_adf = adfuller(res_co.resid, maxlag=maxlag, autolag=autolag, regression='nc') else: import warnings diff --git a/statsmodels/tsa/tests/test_stattools.py b/statsmodels/tsa/tests/test_stattools.py index 42aff3f96e3..a97451eb433 100644 --- a/statsmodels/tsa/tests/test_stattools.py +++ b/statsmodels/tsa/tests/test_stattools.py @@ -349,6 +349,13 @@ def test_coint(): r1 = res1[i][2] assert_allclose(r1, r2, rtol=0, atol=6e-7) + # use default autolag #4490 + res1_0 = coint(y[:, 0], y[:, 1], trend='ct', maxlag=4) + assert_allclose(res1_0[2], res_egranger['ct'][0][1:], rtol=0, atol=6e-7) + # the following is just a regression test + assert_allclose(res1_0[:2], [-13.992946638547112, 2.270898990540678e-27], + rtol=1e-10, atol=1e-27) + def test_coint_identical_series(): nobs = 200 From 24db0ff95b61d47ef134360834acdf4d28e818cb Mon Sep 17 00:00:00 2001 From: Jordan Yoder Date: Wed, 14 Feb 2018 13:02:30 -0500 Subject: [PATCH 157/157] Add frequency domain seasonal components to UnobservedComponents Previously, we had implemented the main time domain seasonal component, which is restrictive when the periodicity is very high or the seasonality can be more parsimoniously represented by trigonometric terms. Also, we were only able to use a single seasonal component. This commit allows for multiple seasonal components in the frequency domain and for mixing of the two types. We add a test checking the state matrices for a somewhat complicated model and update the regression test to be against the results in the test_ucm.R file as generated by KFAS. Finally, we added an ipython notebook as an example showcasing the new functionality. Also - Change offset for regression coefficients when cycle is included. Cycle is two states regardless of if it is stochastic. Therefore, the offset should always be 2 instead of possibly 1 for a deterministic cycle. 
- Update the .R code for the other seasonal test to correctly initialize P1

Signed-off-by: Jordan Yoder
---
 examples/notebooks/statespace_seasonal.ipynb  | 453 ++++++++++++++++++
 statsmodels/tsa/statespace/structural.py      | 369 ++++++++++++--
 .../tests/results/results_structural.py       |  14 +-
 .../tsa/statespace/tests/results/test_ucm.R   |  17 +-
 .../tsa/statespace/tests/test_structural.py   |  92 +++-
 5 files changed, 896 insertions(+), 49 deletions(-)
 create mode 100644 examples/notebooks/statespace_seasonal.ipynb

diff --git a/examples/notebooks/statespace_seasonal.ipynb b/examples/notebooks/statespace_seasonal.ipynb
new file mode 100644
index 00000000000..276012d50e8
--- /dev/null
+++ b/examples/notebooks/statespace_seasonal.ipynb
@@ -0,0 +1,453 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Seasonality in time series data\n",
+    "\n",
+    "Consider the problem of modeling time series data with multiple seasonal components with different periodicities. Let us take the time series $y_t$ and decompose it explicitly to have a level component and two seasonal components.\n",
+    "\n",
+    "$$\n",
+    "y_t = \\mu_t + \\gamma^{(1)}_t + \\gamma^{(2)}_t\n",
+    "$$\n",
+    "\n",
+    "where $\\mu_t$ represents the trend or level, $\\gamma^{(1)}_t$ represents a seasonal component with a relatively short period, and $\\gamma^{(2)}_t$ represents another seasonal component of longer period. We will have a fixed intercept term for our level and consider both $\\gamma^{(1)}_t$ and $\\gamma^{(2)}_t$ to be stochastic so that the seasonal patterns can vary over time.\n",
+    "\n",
+    "In this notebook, we will generate synthetic data conforming to this model and showcase modeling of the seasonal terms in a few different ways under the unobserved components modeling framework."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%matplotlib notebook"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import statsmodels.api as sm\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Synthetic data creation\n",
+    "\n",
+    "We will create data with multiple seasonal patterns by following equations (3.7) and (3.8) in Durbin and Koopman (2012). We will simulate 300 periods and two seasonal terms parameterized in the frequency domain having periods 10 and 100, respectively, with 3 and 2 harmonics, respectively. Further, the variances of their stochastic parts are 4 and 9, respectively."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# First we'll simulate the synthetic data\n",
+ "def simulate_seasonal_term(periodicity, total_cycles, noise_std=1.,\n",
+ " harmonics=None):\n",
+ " duration = periodicity * total_cycles\n",
+ " assert duration == int(duration)\n",
+ " duration = int(duration)\n",
+ " harmonics = harmonics if harmonics else int(np.floor(periodicity / 2))\n",
+ "\n",
+ " lambda_p = 2 * np.pi / float(periodicity)\n",
+ "\n",
+ " gamma_jt = noise_std * np.random.randn(harmonics)\n",
+ " gamma_star_jt = noise_std * np.random.randn(harmonics)\n",
+ "\n",
+ " total_timesteps = 100 * duration # Pad for burn in\n",
+ " series = np.zeros(total_timesteps)\n",
+ " for t in range(total_timesteps):\n",
+ " gamma_jtp1 = np.zeros_like(gamma_jt)\n",
+ " gamma_star_jtp1 = np.zeros_like(gamma_star_jt)\n",
+ " for j in range(1, harmonics + 1):\n",
+ " cos_j = np.cos(lambda_p * j)\n",
+ " sin_j = np.sin(lambda_p * j)\n",
+ " gamma_jtp1[j - 1] = (gamma_jt[j - 1] * cos_j\n",
+ " + gamma_star_jt[j - 1] * sin_j\n",
+ " + noise_std * np.random.randn())\n",
+ " gamma_star_jtp1[j - 1] = (- gamma_jt[j - 1] * sin_j\n",
+ " + gamma_star_jt[j - 1] * cos_j\n",
+ " + noise_std * np.random.randn())\n",
+ " series[t] = np.sum(gamma_jtp1)\n",
+ " gamma_jt = gamma_jtp1\n",
+ " gamma_star_jt = gamma_star_jtp1\n",
+ " wanted_series = series[-duration:] # Discard burn in\n",
+ "\n",
+ " return wanted_series"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "duration = 100 * 3\n",
+ "periodicities = [10, 100]\n",
+ "num_harmonics = [3, 2]\n",
+ "std = np.array([2, 3])\n",
+ "np.random.seed(8678309)\n",
+ "\n",
+ "terms = []\n",
+ "for ix, _ in enumerate(periodicities):\n",
+ " s = simulate_seasonal_term(\n",
+ " periodicities[ix],\n",
+ " duration / periodicities[ix],\n",
+ " harmonics=num_harmonics[ix],\n",
+ " noise_std=std[ix])\n",
+ " terms.append(s)\n",
+ "terms.append(np.ones_like(terms[0]) * 10.)\n",
+ "series = pd.Series(np.sum(terms, axis=0))\n",
+ "df = pd.DataFrame(data={'total': series,\n",
+ " '10(3)': terms[0],\n",
+ " '100(2)': terms[1],\n",
+ " 'level':terms[2]})\n",
+ "h1, = plt.plot(df['total'])\n",
+ "h2, = plt.plot(df['10(3)'])\n",
+ "h3, = plt.plot(df['100(2)'])\n",
+ "h4, = plt.plot(df['level'])\n",
+ "plt.legend(['total','10(3)','100(2)', 'level'])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Unobserved components (frequency domain modeling)\n",
+ "\n",
+ "The first method is an unobserved components model, where the trend is modeled as a fixed intercept and the seasonal components are modeled using trigonometric functions with primary periodicities of 10 and 100 and with 3 and 2 harmonics, respectively. Note that this is the correct, generating model. 
The process for the time series can be written as:\n",
+ "\n",
+ "$$\n",
+ "\\begin{align}\n",
+ "y_t & = \\mu_t + \\gamma^{(1)}_t + \\gamma^{(2)}_t + \\epsilon_t\\\\\n",
+ "\\mu_{t+1} & = \\mu_t \\\\\n",
+ "\\gamma^{(1)}_{t} &= \\sum_{j=1}^3 \\gamma^{(1)}_{j, t} \\\\\n",
+ "\\gamma^{(2)}_{t} &= \\sum_{j=1}^2 \\gamma^{(2)}_{j, t}\\\\\n",
+ "\\gamma^{(1)}_{j, t+1} &= \\gamma^{(1)}_{j, t}\\cos(\\lambda_j) + \\gamma^{*, (1)}_{j, t}\\sin(\\lambda_j) + \\omega^{(1)}_{j,t}, ~j = 1, 2, 3\\\\\n",
+ "\\gamma^{*, (1)}_{j, t+1} &= -\\gamma^{(1)}_{j, t}\\sin(\\lambda_j) + \\gamma^{*, (1)}_{j, t}\\cos(\\lambda_j) + \\omega^{*, (1)}_{j, t}, ~j = 1, 2, 3\\\\\n",
+ "\\gamma^{(2)}_{j, t+1} &= \\gamma^{(2)}_{j, t}\\cos(\\lambda_j) + \\gamma^{*, (2)}_{j, t}\\sin(\\lambda_j) + \\omega^{(2)}_{j,t}, ~j = 1, 2\\\\\n",
+ "\\gamma^{*, (2)}_{j, t+1} &= -\\gamma^{(2)}_{j, t}\\sin(\\lambda_j) + \\gamma^{*, (2)}_{j, t}\\cos(\\lambda_j) + \\omega^{*, (2)}_{j, t}, ~j = 1, 2\\\\\n",
+ "\\end{align}\n",
+ "$$\n",
+ "\n",
+ "where $\\epsilon_t$ is white noise, $\\omega^{(1)}_{j,t}$ are i.i.d. $N(0, \\sigma^2_1)$, and $\\omega^{(2)}_{j,t}$ are i.i.d. $N(0, \\sigma^2_2)$, with $\\sigma_1 = 2$ and $\\sigma_2 = 3$."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "model = sm.tsa.UnobservedComponents(series.values, \n",
+ " level='fixed intercept', \n",
+ " freq_seasonal=[{'period': 10,\n",
+ " 'harmonics': 3},\n",
+ " {'period': 100,\n",
+ " 'harmonics': 2}])\n",
+ "res_f = model.fit(disp=False)\n",
+ "print(res_f.summary())\n",
+ "# The first state variable holds our estimate of the intercept\n",
+ "print(\"fixed intercept estimated as {0:.3f}\".format(res_f.smoother_results.smoothed_state[0,-1:][0]))\n",
+ "\n",
+ "res_f.plot_components()\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.ssm.transition[:, :, 0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true
+ },
+ "source": [
+ "Observe that the fitted variances are close to the true variances of 4 and 9. Further, the individual seasonal components look close to the true seasonal components, and the smoothed level term is reasonably close to the true level of 10. Finally, our diagnostics look solid; the test statistics are small enough that we fail to reject in all three tests."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Unobserved components (mixed time and frequency domain modeling)\n",
+ "\n",
+ "The second method is an unobserved components model, where the trend is modeled as a fixed intercept and the seasonal components are modeled using 10 constants summing to 0 and trigonometric functions with a primary periodicity of 100 and 2 harmonics. Note that this isn't the generating model, as it presupposes that there are more state errors for the shorter seasonal component than in reality. 
The process for the time series can be written as:\n",
+ "\n",
+ "$$\n",
+ "\\begin{align}\n",
+ "y_t & = \\mu_t + \\gamma^{(1)}_t + \\gamma^{(2)}_t + \\epsilon_t\\\\\n",
+ "\\mu_{t+1} & = \\mu_t \\\\\n",
+ "\\gamma^{(1)}_{t + 1} &= - \\sum_{j=1}^9 \\gamma^{(1)}_{t + 1 - j} + \\omega^{(1)}_t\\\\\n",
+ "\\gamma^{(2)}_{j, t+1} &= \\gamma^{(2)}_{j, t}\\cos(\\lambda_j) + \\gamma^{*, (2)}_{j, t}\\sin(\\lambda_j) + \\omega^{(2)}_{j,t}, ~j = 1, 2\\\\\n",
+ "\\gamma^{*, (2)}_{j, t+1} &= -\\gamma^{(2)}_{j, t}\\sin(\\lambda_j) + \\gamma^{*, (2)}_{j, t}\\cos(\\lambda_j) + \\omega^{*, (2)}_{j, t}, ~j = 1, 2\\\\\n",
+ "\\end{align}\n",
+ "$$\n",
+ "\n",
+ "where $\\epsilon_t$ is white noise, $\\omega^{(1)}_{t}$ are i.i.d. $N(0, \\sigma^2_1)$, and $\\omega^{(2)}_{j,t}$ are i.i.d. $N(0, \\sigma^2_2)$."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = sm.tsa.UnobservedComponents(series, \n",
+ " level='fixed intercept', \n",
+ " seasonal=10,\n",
+ " freq_seasonal=[{'period': 100,\n",
+ " 'harmonics': 2}])\n",
+ "res_tf = model.fit()\n",
+ "print(res_tf.summary())\n",
+ "# The first state variable holds our estimate of the intercept\n",
+ "print(\"fixed intercept estimated as {0:.3f}\".format(res_tf.smoother_results.smoothed_state[0,-1:][0]))\n",
+ "\n",
+ "res_tf.plot_components()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The plotted components look good. However, the estimated variance of the second seasonal term is inflated relative to reality. Additionally, the Ljung-Box statistic is rejected, indicating that autocorrelation may remain after accounting for our components."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Unobserved components (lazy frequency domain modeling)\n",
+ "\n",
+ "The third method is an unobserved components model with a fixed intercept and one seasonal component, which is modeled using trigonometric functions with primary periodicity 100 and 50 harmonics. Note that this isn't the generating model, as it presupposes that there are more harmonics than in reality. Because the variances are tied together, we are not able to drive the estimated covariance of the non-existent harmonics to 0. What is lazy about this model specification is that we have not bothered to specify the two different seasonal components and instead chosen to model them using a single component with enough harmonics to cover both. We will not be able to capture any differences in variances between the two true components. The process for the time series can be written as:\n",
+ "\n",
+ "$$\n",
+ "\\begin{align}\n",
+ "y_t & = \\mu_t + \\gamma^{(1)}_t + \\epsilon_t\\\\\n",
+ "\\mu_{t+1} &= \\mu_t\\\\\n",
+ "\\gamma^{(1)}_{t} &= \\sum_{j=1}^{50}\\gamma^{(1)}_{j, t}\\\\\n",
+ "\\gamma^{(1)}_{j, t+1} &= \\gamma^{(1)}_{j, t}\\cos(\\lambda_j) + \\gamma^{*, (1)}_{j, t}\\sin(\\lambda_j) + \\omega^{(1)}_{j,t}, ~j = 1, 2, \\dots, 50\\\\\n",
+ "\\gamma^{*, (1)}_{j, t+1} &= -\\gamma^{(1)}_{j, t}\\sin(\\lambda_j) + \\gamma^{*, (1)}_{j, t}\\cos(\\lambda_j) + \\omega^{*, (1)}_{j, t}, ~j = 1, 2, \\dots, 50\\\\\n",
+ "\\end{align}\n",
+ "$$\n",
+ "\n",
+ "where $\\epsilon_t$ is white noise, $\\omega^{(1)}_{j,t}$ are i.i.d. $N(0, \\sigma^2_1)$.",
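+ "\n",
+ "To make the state-count cost of this lazy specification concrete (simple arithmetic, added here for illustration): with 50 harmonics it uses $2 \\times 50 = 100$ state elements for the seasonal part alone, compared with $2 \\times 3 + 2 \\times 2 = 10$ in the correct generating model."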
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = sm.tsa.UnobservedComponents(series, \n",
+ " level='fixed intercept', \n",
+ " freq_seasonal=[{'period': 100}])\n",
+ "res_lf = model.fit()\n",
+ "print(res_lf.summary())\n",
+ "# The first state variable holds our estimate of the intercept\n",
+ "print(\"fixed intercept estimated as {0:.3f}\".format(res_lf.smoother_results.smoothed_state[0,-1:][0]))\n",
+ "\n",
+ "res_lf.plot_components()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note that one of our diagnostic tests would be rejected at the .05 level."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Unobserved components (lazy time domain seasonal modeling)\n",
+ "\n",
+ "The fourth method is an unobserved components model with a fixed intercept and a single seasonal component modeled using a time-domain seasonal model of 100 constants. The process for the time series can be written as:\n",
+ "\n",
+ "$$\n",
+ "\\begin{align}\n",
+ "y_t & = \\mu_t + \\gamma^{(1)}_t + \\epsilon_t\\\\\n",
+ "\\mu_{t+1} &= \\mu_{t} \\\\\n",
+ "\\gamma^{(1)}_{t + 1} &= - \\sum_{j=1}^{99} \\gamma^{(1)}_{t + 1 - j} + \\omega^{(1)}_t\\\\\n",
+ "\\end{align}\n",
+ "$$\n",
+ "\n",
+ "where $\\epsilon_t$ is white noise, $\\omega^{(1)}_{t}$ are i.i.d. $N(0, \\sigma^2_1)$."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = sm.tsa.UnobservedComponents(series,\n",
+ " level='fixed intercept',\n",
+ " seasonal=100)\n",
+ "res_lt = model.fit(disp=False)\n",
+ "print(res_lt.summary())\n",
+ "# The first state variable holds our estimate of the intercept\n",
+ "print(\"fixed intercept estimated as {0:.3f}\".format(res_lt.smoother_results.smoothed_state[0,-1:][0]))\n",
+ "\n",
+ "res_lt.plot_components()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The seasonal component itself looks good--it is the primary signal. The estimated variance of the seasonal term is very high ($>10^5$), leading to a lot of uncertainty in our one-step-ahead predictions and slow responsiveness to new data, as evidenced by the large gaps between the one-step-ahead predictions and the observations. Finally, all three of our diagnostic tests were rejected. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Comparison of filtered estimates\n",
+ "\n",
+ "The plots below show that explicitly modeling the individual components results in the filtered state being close to the true state within roughly half a period. The lazy models took longer (almost a full period) to do the same on the combined true state."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# Assign better names for our seasonal terms\n",
+ "true_seasonal_10_3 = terms[0]\n",
+ "true_seasonal_100_2 = terms[1]\n",
+ "true_sum = true_seasonal_10_3 + true_seasonal_100_2\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "time_s = np.s_[:50] # After this they basically agree\n",
+ "fig1 = plt.figure()\n",
+ "ax1 = fig1.add_subplot(111)\n",
+ "h1, = ax1.plot(series.index[time_s], res_f.freq_seasonal[0].filtered[time_s], label='Double Freq. 
Seas')\n",
+ "h2, = ax1.plot(series.index[time_s], res_tf.seasonal.filtered[time_s], label='Mixed Domain Seas')\n",
+ "h3, = ax1.plot(series.index[time_s], true_seasonal_10_3[time_s], label='True Seasonal 10(3)')\n",
+ "plt.legend([h1, h2, h3], ['Double Freq. Seasonal','Mixed Domain Seasonal','Truth'], loc=2)\n",
+ "plt.title('Seasonal 10(3) component')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "time_s = np.s_[:50] # After this they basically agree\n",
+ "fig2 = plt.figure()\n",
+ "ax2 = fig2.add_subplot(111)\n",
+ "h21, = ax2.plot(series.index[time_s], res_f.freq_seasonal[1].filtered[time_s], label='Double Freq. Seas')\n",
+ "h22, = ax2.plot(series.index[time_s], res_tf.freq_seasonal[0].filtered[time_s], label='Mixed Domain Seas')\n",
+ "h23, = ax2.plot(series.index[time_s], true_seasonal_100_2[time_s], label='True Seasonal 100(2)')\n",
+ "plt.legend([h21, h22, h23], ['Double Freq. Seasonal','Mixed Domain Seasonal','Truth'], loc=2)\n",
+ "plt.title('Seasonal 100(2) component')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "time_s = np.s_[:100]\n",
+ "\n",
+ "fig3 = plt.figure()\n",
+ "ax3 = fig3.add_subplot(111)\n",
+ "h31, = ax3.plot(series.index[time_s], res_f.freq_seasonal[1].filtered[time_s] + res_f.freq_seasonal[0].filtered[time_s], label='Double Freq. Seas')\n",
+ "h32, = ax3.plot(series.index[time_s], res_tf.freq_seasonal[0].filtered[time_s] + res_tf.seasonal.filtered[time_s], label='Mixed Domain Seas')\n",
+ "h33, = ax3.plot(series.index[time_s], true_sum[time_s], label='True Seasonal Sum')\n",
+ "h34, = ax3.plot(series.index[time_s], res_lf.freq_seasonal[0].filtered[time_s], label='Lazy Freq. Seas')\n",
+ "h35, = ax3.plot(series.index[time_s], res_lt.seasonal.filtered[time_s], label='Lazy Time Seas')\n",
+ "\n",
+ "plt.legend([h31, h32, h33, h34, h35], ['Double Freq. Seasonal','Mixed Domain Seasonal','Truth', 'Lazy Freq. Seas', 'Lazy Time Seas'], loc=1)\n",
+ "plt.title('Seasonal components combined')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Conclusions\n",
+ "\n",
+ "In this notebook, we simulated a time series with two seasonal components of different periods. We modeled them using structural time series models with (a) two frequency domain components of correct periods and numbers of harmonics, (b) a time domain seasonal component for the shorter term and a frequency domain term with correct period and number of harmonics, (c) a single frequency domain term with the longer period and full number of harmonics, and (d) a single time domain term with the longer period. We saw a variety of diagnostic results, with only the correct generating model, (a), failing to reject any of the tests. Thus, more flexible seasonal modeling allowing for multiple components with specifiable harmonics can be a useful tool for time series modeling. Finally, we can represent seasonal components with fewer total states in this way, allowing the user to make the bias-variance tradeoff themselves instead of being forced to choose \"lazy\" models, which use a large number of states and incur additional variance as a result."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/statsmodels/tsa/statespace/structural.py b/statsmodels/tsa/statespace/structural.py
index 00dd05d2809..fac55102f06 100644
--- a/statsmodels/tsa/statespace/structural.py
+++ b/statsmodels/tsa/statespace/structural.py
@@ -60,6 +60,13 @@ class UnobservedComponents(MLEModel):
         `level` must also be True.
     seasonal : int or None, optional
         The period of the seasonal component, if any. Default is None.
+    freq_seasonal : list of dicts or None, optional
+        Whether (and how) to model seasonal component(s) with trig. functions.
+        If specified, there is one dictionary for each frequency-domain
+        seasonal component. Each dictionary must contain the key 'period'
+        (an integer) and may contain the key 'harmonics' (an integer). If
+        'harmonics' is not specified in any of the dictionaries, it defaults
+        to the floor of period/2.
     cycle : bool, optional
         Whether or not to include a cycle component. Default is False.
     ar : int or None, optional
@@ -73,7 +80,11 @@ class UnobservedComponents(MLEModel):
     stochastic_trend : bool, optional
         Whether or not any trend component is stochastic. Default is False.
     stochastic_seasonal : bool, optional
-        Whether or not any seasonal component is stochastic. Default is False.
+        Whether or not any seasonal component is stochastic. Default is True.
+    stochastic_freq_seasonal : list of bools, optional
+        Whether or not each frequency-domain seasonal component is
+        stochastic. Default is True for each component. The list should be
+        of the same length as freq_seasonal.
     stochastic_cycle : bool, optional
         Whether or not any cycle component is stochastic. Default is False.
     damped_cycle : bool, optional
@@ -203,6 +214,39 @@ class UnobservedComponents(MLEModel):
     time series is available in the results class in the `seasonal`
     attribute.

+    **Freq. Seasonal**
+
+    Each frequency-domain seasonal component is modeled as:
+
+    .. math::
+
+        \gamma_t &=& \sum_{j=1}^h \gamma_{j, t} \\
+        \gamma_{j, t+1} &=& \gamma_{j, t}\cos(\lambda_j)
+                        + \gamma^{*}_{j, t}\sin(\lambda_j)
+                        + \omega_{j, t} \\
+        \gamma^{*}_{j, t+1} &=& -\gamma_{j, t}\sin(\lambda_j)
+                        + \gamma^{*}_{j, t}\cos(\lambda_j)
+                        + \omega^{*}_{j, t} \\
+        \omega_{j, t}, \omega^{*}_{j, t} &\sim& N(0, \sigma_{\omega}^2) \\
+        \lambda_j &=& \frac{2 \pi j}{s}
+
+    where j ranges from 1 to h.
+
+    The periodicity (number of "seasons" in a "year") is s and the number of
+    harmonics is h. Note that h is configurable to be less than s/2, but
+    s/2 harmonics are sufficient to fully model all seasonal variations of
+    periodicity s. Like the time domain seasonal term (cf. Seasonal section,
+    above), the inclusion of the error terms allows for the seasonal effects
+    to vary over time. The argument stochastic_freq_seasonal can be used to
+    set one or more of the seasonal components of this type to be non-random,
+    meaning they will not vary over time.
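+
+    As a schematic illustration (a standalone sketch with illustrative names,
+    mirroring the transition construction in `setup`), the transition for a
+    single component is block diagonal with one rotation block per
+    harmonic::
+
+        import numpy as np
+
+        def freq_seasonal_transition(period, harmonics):
+            # Harmonic j rotates through an angle of 2*pi*j/period per step.
+            lam = 2 * np.pi / period
+            transition = np.zeros((2 * harmonics, 2 * harmonics))
+            for j in range(1, harmonics + 1):
+                c, s = np.cos(lam * j), np.sin(lam * j)
+                transition[2*j - 2:2*j, 2*j - 2:2*j] = [[c, s], [-s, c]]
+            return transition
+
+        # period=4 with the full h=2 harmonics yields blocks (up to
+        # floating-point rounding) [[0, 1], [-1, 0]] and [[-1, 0], [0, -1]].
+        print(freq_seasonal_transition(4, 2))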
+
+    This component results in one parameter to be fitted using maximum
+    likelihood: :math:`\sigma_{\omega}^2`, and up to two parameters to be
+    chosen, the number of seasons s and optionally the number of harmonics
+    h, with :math:`1 \leq h \leq \lfloor s/2 \rfloor`.
+
+    After fitting the model, each unobserved seasonal component modeled in
+    the frequency domain is available in the results class in the
+    `freq_seasonal` attribute.
+
     **Cycle**

     The cyclical component is intended to capture cyclical effects at time
@@ -299,9 +343,13 @@ class UnobservedComponents(MLEModel):
     """ # noqa:E501

     def __init__(self, endog, level=False, trend=False, seasonal=None,
-                 cycle=False, autoregressive=None, exog=None, irregular=False,
-                 stochastic_level=False, stochastic_trend=False,
-                 stochastic_seasonal=True, stochastic_cycle=False,
+                 freq_seasonal=None, cycle=False, autoregressive=None,
+                 exog=None, irregular=False,
+                 stochastic_level=False,
+                 stochastic_trend=False,
+                 stochastic_seasonal=True,
+                 stochastic_freq_seasonal=None,
+                 stochastic_cycle=False,
                  damped_cycle=False, cycle_period_bounds=None,
                  mle_regression=True, **kwargs):

@@ -311,6 +359,15 @@ def __init__(self, endog, level=False, trend=False, seasonal=None,
         self.trend = trend
         self.seasonal_periods = seasonal if seasonal is not None else 0
         self.seasonal = self.seasonal_periods > 0
+        if freq_seasonal:
+            self.freq_seasonal_periods = [d['period'] for d in freq_seasonal]
+            self.freq_seasonal_harmonics = [d.get(
+                'harmonics', int(np.floor(d['period'] / 2))) for
+                d in freq_seasonal]
+        else:
+            self.freq_seasonal_periods = []
+            self.freq_seasonal_harmonics = []
+        self.freq_seasonal = any(x > 0 for x in self.freq_seasonal_periods)
         self.cycle = cycle
         self.ar_order = autoregressive if autoregressive is not None else 0
         self.autoregressive = self.ar_order > 0
@@ -319,6 +376,16 @@ def __init__(self, endog, level=False, trend=False, seasonal=None,
         self.stochastic_level = stochastic_level
         self.stochastic_trend = stochastic_trend
         self.stochastic_seasonal = stochastic_seasonal
+        if stochastic_freq_seasonal is None:
+            self.stochastic_freq_seasonal = [True] * len(
+                self.freq_seasonal_periods)
+        else:
+            if len(stochastic_freq_seasonal) != len(freq_seasonal):
+                raise ValueError(
+                    "Length of stochastic_freq_seasonal must equal length"
+                    " of freq_seasonal: {!r} vs {!r}".format(
+                        len(stochastic_freq_seasonal), len(freq_seasonal)))
+            self.stochastic_freq_seasonal = stochastic_freq_seasonal
         self.stochastic_cycle = stochastic_cycle
         self.damped_cycle = damped_cycle

@@ -414,6 +481,8 @@ def __init__(self, endog, level=False, trend=False, seasonal=None,
                 (self.level and self.stochastic_level) or
                 (self.trend and self.stochastic_trend) or
                 (self.seasonal and self.stochastic_seasonal) or
+                (self.freq_seasonal and any(
+                    self.stochastic_freq_seasonal)) or
                 (self.cycle and self.stochastic_cycle) or
                 self.autoregressive):
             warn("Specified model does not contain a stochastic element;"
@@ -424,6 +493,13 @@ def __init__(self, endog, level=False, trend=False, seasonal=None,
             raise ValueError('Seasonal component must have a seasonal period'
                              ' of at least 2.')

+        if self.freq_seasonal:
+            for p in self.freq_seasonal_periods:
+                if p < 2:
+                    raise ValueError(
+                        'Frequency Domain seasonal component must have a '
+                        'seasonal period of at least 2.')
+
         # Create a bitmask holding the level/trend specification
         self.trend_mask = (
             self.irregular * 0x01 |
@@ -445,10 +521,16 @@ def __init__(self, endog, level=False, trend=False, seasonal=None,
         self.regression = self.k_exog > 0

         # Model parameters
+
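+        # State-dimension bookkeeping (summary of the arithmetic below): a
+        # time-domain seasonal of period s contributes s - 1 states, each
+        # frequency-domain component with h harmonics contributes 2 * h
+        # states, and a cycle contributes 2 states.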
self._k_seasonal_states = (self.seasonal_periods - 1) * self.seasonal + self._k_freq_seas_states = ( + sum(2 * h for h in self.freq_seasonal_harmonics) + * self.freq_seasonal) + self._k_cycle_states = self.cycle * 2 k_states = ( self.level + self.trend + - (self.seasonal_periods - 1) * self.seasonal + - self.cycle * 2 + + self._k_seasonal_states + + self._k_freq_seas_states + + self._k_cycle_states + self.ar_order + (not self.mle_regression) * self.k_exog ) @@ -456,14 +538,22 @@ def __init__(self, endog, level=False, trend=False, seasonal=None, self.stochastic_level * self.level + self.stochastic_trend * self.trend + self.stochastic_seasonal * self.seasonal + - self.stochastic_cycle * (self.cycle * 2) + + ((sum(2 * h if self.stochastic_freq_seasonal[ix] else 0 for + ix, h in enumerate(self.freq_seasonal_harmonics))) * + self.freq_seasonal) + + self.stochastic_cycle * (self._k_cycle_states) + self.autoregressive ) + + # The ar states are initialized as stationary, so they don't need to be + # burned. + loglikelihood_burn = kwargs.get('loglikelihood_burn', + k_states + - self.ar_order) + # We can still estimate the model with just the irregular component, # just need to have one state that does nothing. - loglikelihood_burn = kwargs.get('loglikelihood_burn', - k_states - self.ar_order) if k_states == 0: if not self.irregular: raise ValueError('Model has no components specified.') @@ -508,10 +598,11 @@ def __init__(self, endog, level=False, trend=False, seasonal=None, ) # update _init_keys attached by super - self._init_keys += ['level', 'trend', 'seasonal', 'cycle', - 'autoregressive', 'exog', 'irregular', + self._init_keys += ['level', 'trend', 'seasonal', 'freq_seasonal', + 'cycle', 'autoregressive', 'exog', 'irregular', 'stochastic_level', 'stochastic_trend', - 'stochastic_seasonal', 'stochastic_cycle', + 'stochastic_seasonal', 'stochastic_freq_seasonal', + 'stochastic_cycle', 'damped_cycle', 'cycle_period_bounds', 'mle_regression'] + list(kwargs.keys()) # TODO: I think the kwargs or not attached, need to recover from ??? @@ -522,6 +613,10 @@ def _get_init_kwds(self): # Modifications kwds['seasonal'] = self.seasonal_periods + kwds['freq_seasonal'] = [ + {'period': p, + 'harmonics': self.freq_seasonal_harmonics[ix]} for + ix, p in enumerate(self.freq_seasonal_periods)] kwds['autoregressive'] = self.ar_order for key, value in kwds.items(): @@ -575,6 +670,33 @@ def setup(self): self.parameters_state_cov['seasonal_var'] = 1 j += 1 i += n + if self.freq_seasonal: + for ix, h in enumerate(self.freq_seasonal_harmonics): + # These are the \gamma_jt and \gamma^*_jt terms in D&K (3.8) + n = 2 * h + p = self.freq_seasonal_periods[ix] + lambda_p = 2 * np.pi / float(p) + + t = 0 # frequency transition matrix offset + for block in range(1, h + 1): + # ibid. eqn (3.7) + self.ssm['design', 0, i+t] = 1. + + # ibid. eqn (3.8) + cos_lambda_block = np.cos(lambda_p * block) + sin_lambda_block = np.sin(lambda_p * block) + trans = np.array([[cos_lambda_block, sin_lambda_block], + [-sin_lambda_block, cos_lambda_block]]) + trans_s = np.s_[i + t:i + t + 2] + self.ssm['transition', trans_s, trans_s] = trans + t += 2 + + if self.stochastic_freq_seasonal[ix]: + self.ssm['selection', i:i + n, j:j + n] = np.eye(n) + cov_key = 'freq_seasonal_var_{!r}'.format(ix) + self.parameters_state_cov[cov_key] = 1 + j += n + i += n if self.cycle: self.ssm['design', 0, i] = 1. 
self.parameters_transition['cycle_freq'] = 1 @@ -630,6 +752,24 @@ def setup(self): idx = np.diag_indices(self.ssm.k_posdef) self._idx_state_cov = ('state_cov', idx[0], idx[1]) + # Some of the variances may be tied together (repeated parameter usage) + # Use list() for compatibility with python 3.5 + param_keys = list(self.parameters_state_cov.keys()) + self._var_repetitions = np.ones(self.k_state_cov, dtype=np.int) + if self.freq_seasonal: + for ix, is_stochastic in enumerate(self.stochastic_freq_seasonal): + if is_stochastic: + num_harmonics = self.freq_seasonal_harmonics[ix] + repeat_times = 2 * num_harmonics + cov_key = 'freq_seasonal_var_{!r}'.format(ix) + cov_ix = param_keys.index(cov_key) + self._var_repetitions[cov_ix] = repeat_times + + if self.stochastic_cycle and self.cycle: + cov_ix = param_keys.index('cycle_var') + self._var_repetitions[cov_ix] = 2 + self._repeat_any_var = any(self._var_repetitions > 1) + def initialize_state(self): # Initialize the AR component as stationary, the rest as approximately # diffuse @@ -643,8 +783,9 @@ def initialize_state(self): start = ( self.level + self.trend + - (self.seasonal_periods - 1) * self.seasonal + - self.cycle * 2 + self._k_seasonal_states + + self._k_freq_seas_states + + self._k_cycle_states ) end = start + self.ar_order selection_stationary = self.ssm['selection', start:end, :, 0] @@ -730,6 +871,11 @@ def start_params(self): if self.stochastic_seasonal: _start_params['seasonal_var'] = var_resid + # Frequency domain seasonal + for ix, is_stochastic in enumerate(self.stochastic_freq_seasonal): + cov_key = 'freq_seasonal_var_{!r}'.format(ix) + _start_params[cov_key] = var_resid + # Cyclical if self.cycle: _start_params['cycle_var'] = var_resid @@ -784,6 +930,16 @@ def param_names(self): param_names.append('sigma2.trend') elif key == 'seasonal_var': param_names.append('sigma2.seasonal') + elif key.startswith('freq_seasonal_var_'): + # There are potentially multiple frequency domain seasonal terms + idx_fseas_comp = int(key[-1]) + periodicity = self.freq_seasonal_periods[idx_fseas_comp] + harmonics = self.freq_seasonal_harmonics[idx_fseas_comp] + freq_seasonal_name = "{p}({h})".format( + p=repr(periodicity), + h=repr(harmonics)) + param_names.append( + 'sigma2.' 
+ 'freq_seasonal_' + freq_seasonal_name) elif key == 'cycle_var': param_names.append('sigma2.cycle') elif key == 'cycle_freq': @@ -905,11 +1061,8 @@ def update(self, params, **kwargs): # State covariance if self.k_state_cov > 0: variances = params[offset:offset+self.k_state_cov] - if self.stochastic_cycle and self.cycle: - if self.autoregressive: - variances = np.r_[variances[:-1], variances[-2:]] - else: - variances = np.r_[variances, variances[-1]] + if self._repeat_any_var: + variances = np.repeat(variances, self._var_repetitions) self.ssm[self._idx_state_cov] = variances offset += self.k_state_cov @@ -978,6 +1131,12 @@ def __init__(self, model, params, filter_results, cov_type='opg', # Save _init_kwds self._init_kwds = self.model._get_init_kwds() + # Save number of states by type + self._k_states_by_type = { + 'seasonal': self.model._k_seasonal_states, + 'freq_seasonal': self.model._k_freq_seas_states, + 'cycle': self.model._k_cycle_states} + # Save the model specification self.specification = Bunch(**{ # Model options @@ -985,6 +1144,9 @@ def __init__(self, model, params, filter_results, cov_type='opg', 'trend': self.model.trend, 'seasonal_periods': self.model.seasonal_periods, 'seasonal': self.model.seasonal, + 'freq_seasonal': self.model.freq_seasonal, + 'freq_seasonal_periods': self.model.freq_seasonal_periods, + 'freq_seasonal_harmonics': self.model.freq_seasonal_harmonics, 'cycle': self.model.cycle, 'ar_order': self.model.ar_order, 'autoregressive': self.model.autoregressive, @@ -992,6 +1154,7 @@ def __init__(self, model, params, filter_results, cov_type='opg', 'stochastic_level': self.model.stochastic_level, 'stochastic_trend': self.model.stochastic_trend, 'stochastic_seasonal': self.model.stochastic_seasonal, + 'stochastic_freq_seasonal': self.model.stochastic_freq_seasonal, 'stochastic_cycle': self.model.stochastic_cycle, 'damped_cycle': self.model.damped_cycle, @@ -1115,6 +1278,82 @@ def seasonal(self): out.smoothed_cov = self.smoothed_state_cov[offset, offset] return out + @property + def freq_seasonal(self): + """ + Estimates of unobserved frequency domain seasonal component(s) + + Returns + ------- + out: list of Bunch instances + Each item has the following attributes: + + - `filtered`: a time series array with the filtered estimate of + the component + - `filtered_cov`: a time series array with the filtered estimate of + the variance/covariance of the component + - `smoothed`: a time series array with the smoothed estimate of + the component + - `smoothed_cov`: a time series array with the smoothed estimate of + the variance/covariance of the component + - `offset`: an integer giving the offset in the state vector where + this component begins + """ + # If present, freq_seasonal components always follows level/trend + # and seasonal. + + # There are 2 * (harmonics) seasonal states per freq_seasonal + # component. + # The sum of every other state enters the measurement equation. + # Additionally, there can be multiple components of this type. + # These facts make this property messier in implementation than the + # others. + # Fortunately, the states are conditionally mutually independent + # (conditional on previous timestep's states), so that the calculations + # of the variances are simple summations of individual variances and + # the calculation of the returned state is likewise a summation. 
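+        # For example, with offset o and h harmonics, the states that enter
+        # the measurement equation sit at indices o, o + 2, ..., o + 2*(h-1),
+        # and the filtered component variance is the sum of the
+        # corresponding diagonal entries of the state covariance.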
+ out = [] + spec = self.specification + if spec.freq_seasonal: + previous_states_offset = int(spec.trend + spec.level + + self._k_states_by_type['seasonal']) + previous_f_seas_offset = 0 + for ix, h in enumerate(spec.freq_seasonal_harmonics): + offset = previous_states_offset + previous_f_seas_offset + + period = spec.freq_seasonal_periods[ix] + + # Only the gamma_jt terms enter the measurement equation (cf. + # D&K 2012 (3.7)) + states_in_sum = np.arange(0, 2 * h, 2) + + filtered_state = np.sum( + [self.filtered_state[offset + j] for j in states_in_sum], + axis=0) + filtered_cov = np.sum( + [self.filtered_state_cov[offset + j, offset + j] for j in + states_in_sum], axis=0) + + + item = Bunch( + filtered=filtered_state, + filtered_cov=filtered_cov, + smoothed=None, smoothed_cov=None, + offset=offset, + pretty_name='seasonal {p}({h})'.format(p=repr(period), + h=repr(h))) + if self.smoothed_state is not None: + item.smoothed = np.sum( + [self.smoothed_state[offset+j] for j in states_in_sum], + axis=0) + if self.smoothed_state_cov is not None: + item.smoothed_cov = np.sum( + [self.smoothed_state_cov[offset+j, offset+j] + for j in states_in_sum], axis=0) + out.append(item) + previous_f_seas_offset += 2 * h + return out + @property def cycle(self): """ @@ -1136,7 +1375,8 @@ def cycle(self): - `offset`: an integer giving the offset in the state vector where this component begins """ - # If present, cycle always follows level/trend and seasonal + # If present, cycle always follows level/trend, seasonal, and freq + # seasonal. # Note that we return only the first cyclical state, but there are # in fact 2 cyclical states. The second cyclical state is not simply # a lag of the first cyclical state, but the first cyclical state is @@ -1144,8 +1384,9 @@ def cycle(self): out = None spec = self.specification if spec.cycle: - offset = int(spec.trend + spec.level + - spec.seasonal * (spec.seasonal_periods - 1)) + offset = int(spec.trend + spec.level + + self._k_states_by_type['seasonal'] + + self._k_states_by_type['freq_seasonal']) out = Bunch(filtered=self.filtered_state[offset], filtered_cov=self.filtered_state_cov[offset, offset], smoothed=None, smoothed_cov=None, @@ -1177,15 +1418,17 @@ def autoregressive(self): - `offset`: an integer giving the offset in the state vector where this component begins """ - # If present, autoregressive always follows level/trend, seasonal, and - # cyclical. If it is an AR(p) model, then there are p associated + # If present, autoregressive always follows level/trend, seasonal, + # freq seasonal, and cyclical. + # If it is an AR(p) model, then there are p associated # states, but the second - pth states are just lags of the first state. out = None spec = self.specification if spec.autoregressive: - offset = int(spec.trend + spec.level + - spec.seasonal * (spec.seasonal_periods - 1) + - 2 * spec.cycle) + offset = int(spec.trend + spec.level + + self._k_states_by_type['seasonal'] + + self._k_states_by_type['freq_seasonal'] + + self._k_states_by_type['cycle']) out = Bunch(filtered=self.filtered_state[offset], filtered_cov=self.filtered_state_cov[offset, offset], smoothed=None, smoothed_cov=None, @@ -1218,7 +1461,7 @@ def regression_coefficients(self): this component begins """ # If present, state-vector regression coefficients always are last - # (i.e. they follow level/trend, seasonal, cyclical, and + # (i.e. they follow level/trend, seasonal, freq seasonal, cyclical, and # autoregressive states). 
There is one state associated with each
         # regressor, and all are returned here.
         out = None
         spec = self.specification
@@ -1231,10 +1474,11 @@ def regression_coefficients(self):
                           ' available in the parameters list, not as part'
                           ' of the state vector.', OutputWarning)
             else:
-                offset = int(spec.trend + spec.level +
-                             spec.seasonal * (spec.seasonal_periods - 1) +
-                             spec.cycle * (1 + spec.stochastic_cycle) +
-                             spec.ar_order)
+                offset = int(spec.trend + spec.level +
+                             self._k_states_by_type['seasonal'] +
+                             self._k_states_by_type['freq_seasonal'] +
+                             self._k_states_by_type['cycle'] +
+                             spec.ar_order)
                 start = offset
                 end = offset + spec.k_exog
                 out = Bunch(
@@ -1252,7 +1496,8 @@ def regression_coefficients(self):

     def plot_components(self, which=None, alpha=0.05,
                         observed=True, level=True, trend=True,
-                        seasonal=True, cycle=True, autoregressive=True,
+                        seasonal=True, freq_seasonal=True,
+                        cycle=True, autoregressive=True,
                         legend_loc='upper right', fig=None, figsize=None):
         """
         Plot the estimated components of the model.
@@ -1273,6 +1518,9 @@ def plot_components(self, which=None, alpha=0.05,
         seasonal : boolean, optional
             Whether or not to plot the seasonal component, if applicable.
             Default is True.
+        freq_seasonal : boolean, optional
+            Whether or not to plot the frequency domain seasonal
+            component(s), if applicable. Default is True.
         cycle : boolean, optional
             Whether or not to plot the cyclical component, if applicable.
             Default is True.
@@ -1296,8 +1544,9 @@ def plot_components(self, which=None, alpha=0.05,
         1. Level
         2. Trend
         3. Seasonal
-        4. Cycle
-        5. Autoregressive
+        4. Freq Seasonal
+        5. Cycle
+        6. Autoregressive

         Specific subplots will be removed if the component is not present in
         the estimated model or if the corresponding keyword argument is set to
@@ -1316,13 +1565,23 @@ def plot_components(self, which=None, alpha=0.05,

         # Determine which plots we have
         spec = self.specification
-        components = OrderedDict([
+
+        comp = [
             ('level', level and spec.level),
             ('trend', trend and spec.trend),
             ('seasonal', seasonal and spec.seasonal),
-            ('cycle', cycle and spec.cycle),
-            ('autoregressive', autoregressive and spec.autoregressive),
-        ])
+        ]
+
+        if freq_seasonal and spec.freq_seasonal:
+            for ix, _ in enumerate(spec.freq_seasonal_periods):
+                key = 'freq_seasonal_{!r}'.format(ix)
+                comp.append((key, True))
+
+        comp.extend(
+            [('cycle', cycle and spec.cycle),
+             ('autoregressive', autoregressive and spec.autoregressive)])
+
+        components = OrderedDict(comp)

         llb = self.filter_results.loglikelihood_burn

@@ -1383,7 +1642,19 @@ def plot_components(self, which=None, alpha=0.05,
             ax = fig.add_subplot(k_plots, 1, plot_idx)
             plot_idx += 1

-            component_bunch = getattr(self, component)
+            try:
+                component_bunch = getattr(self, component)
+                title = component.title()
+            except AttributeError:
+                # This might be a freq_seasonal component, of which there are
+                # possibly multiple bundled up in the property freq_seasonal
+                if component.startswith('freq_seasonal_'):
+                    ix = int(component.replace('freq_seasonal_', ''))
+                    big_bunch = getattr(self, 'freq_seasonal')
+                    component_bunch = big_bunch[ix]
+                    title = component_bunch.pretty_name
+                else:
+                    raise

             # Check for a valid estimation type
             if which not in component_bunch:
@@ -1395,7 +1666,7 @@ def plot_components(self, which=None, alpha=0.05,
                 value = component_bunch[which]

             # Plot
-            state_label = '%s (%s)' % (component.title(), which)
+            state_label = '%s (%s)' % (title, which)
             ax.plot(dates[llb:], value[llb:], label=state_label)

             # Get confidence intervals
dates[llb:], ci_lower[llb:], ci_upper[llb:], alpha=0.2 ) ci_label = ('$%.3g \\%%$ confidence interval' - % ((1 - alpha)*100)) + % ((1 - alpha) * 100)) # Legend ax.legend(loc=legend_loc) - ax.set_title('%s component' % component.title()) + ax.set_title('%s component' % title) # Add a note if first observations excluded if llb > 0: @@ -1530,6 +1801,18 @@ def summary(self, alpha=.05, start=None): seasonal_name = 'stochastic ' + seasonal_name model_name.append(seasonal_name) + if self.specification.freq_seasonal: + for ix, is_stochastic in enumerate( + self.specification.stochastic_freq_seasonal): + periodicity = self.specification.freq_seasonal_periods[ix] + harmonics = self.specification.freq_seasonal_harmonics[ix] + freq_seasonal_name = "freq_seasonal({p}({h}))".format( + p=repr(periodicity), + h=repr(harmonics)) + if is_stochastic: + freq_seasonal_name = 'stochastic ' + freq_seasonal_name + model_name.append(freq_seasonal_name) + if self.specification.cycle: cycle_name = 'cycle' if self.specification.stochastic_cycle: diff --git a/statsmodels/tsa/statespace/tests/results/results_structural.py b/statsmodels/tsa/statespace/tests/results/results_structural.py index 79ea1bb3387..e60acd31617 100644 --- a/statsmodels/tsa/statespace/tests/results/results_structural.py +++ b/statsmodels/tsa/statespace/tests/results/results_structural.py @@ -164,11 +164,21 @@ seasonal = { 'models': [{'irregular': True, 'seasonal': 4}], 'params': [38.1704278, 0.1], - 'llf': -655.3337155, - 'kwargs': {}, + 'llf': -678.8138005, + 'kwargs': {'loglikelihood_burn': 0}, 'rtol': 1e-6 } +freq_seasonal = { + 'models': [{'irregular': True, + 'freq_seasonal': [{'period': 5}]}], + 'params': [38.95352534, 0.05], + 'llf': -688.9697249, + 'rtol': 1e-6, + 'kwargs': { + 'loglikelihood_burn': 0 # No idea why KFAS includes all data in this + } +} reg = { # Note: The test needs to fill in exog=np.log(dta['realgdp']) 'models': [ diff --git a/statsmodels/tsa/statespace/tests/results/test_ucm.R b/statsmodels/tsa/statespace/tests/results/test_ucm.R index 6a2004a7819..8973667822e 100644 --- a/statsmodels/tsa/statespace/tests/results/test_ucm.R +++ b/statsmodels/tsa/statespace/tests/results/test_ucm.R @@ -94,12 +94,23 @@ print(exp(res_cycle$optim.out$par)) # [37.57197224] print(res_cycle$optim.out$value) # 672.3102588 # Seasonal (with fixed period=4, seasonal variance=0.1) -mod_seasonal <- SSModel(dta$unemp ~ -1 + SSMseasonal(4, Q=matrix(0.1), sea.type='dummy', P1=diag(1)*1e6), H=matrix(NA)) +# Note: matching this requires setting loglikelihood_burn=0 +mod_seasonal <- SSModel(dta$unemp ~ -1 + SSMseasonal(4, Q=matrix(0.1), sea.type='dummy', P1=diag(rep(1e6, 3))), H=matrix(NA)) res_seasonal <- fitSSM(inits=c(log(var(dta$unemp))), model=mod_seasonal, method="BFGS", control=list(REPORT=1, trace=1)) -print(exp(res_seasonal$optim.out$par)) # [38.1704278] -print(res_seasonal$optim.out$value) # 655.3337155 +print(exp(res_seasonal$optim.out$par)) # [38.17043009] +print(res_seasonal$optim.out$value) # 678.8138005 + +# Trig Seasonal (with fixed period=5, full number of harmonics, seasonal variance=.05) +# Note: matching this requires setting loglikelihood_burn=0 +mod_trig_seasonal <- SSModel(dta$unemp ~ -1 + SSMseasonal(5, Q=0.05, sea.type='trigonometric', P1=diag(rep(1e6, 4))), H=matrix(NA)) +res_trig_seasonal <- fitSSM(inits=c(log(var(dta$unemp))), model=mod_trig_seasonal, + method="BFGS", control=list(REPORT=1, trace=1)) + +print(exp(res_trig_seasonal$optim.out$par)) # [38.95352534] +print(res_trig_seasonal$optim.out$value) # 688.9697249 + # 
Regression # Note: matching this requires setting loglikelihood_burn = 0 diff --git a/statsmodels/tsa/statespace/tests/test_structural.py b/statsmodels/tsa/statespace/tests/test_structural.py index 25a1c80ca37..308b24b556a 100644 --- a/statsmodels/tsa/statespace/tests/test_structural.py +++ b/statsmodels/tsa/statespace/tests/test_structural.py @@ -141,7 +141,7 @@ def test_fixed_slope(): run_ucm('fixed_slope') -def test_fixed_slope(): +def test_fixed_slope_warn(): # Clear warnings structural.__warningregistry__ = {} @@ -184,6 +184,10 @@ def test_seasonal(): run_ucm('seasonal') +def test_freq_seasonal(): + run_ucm('freq_seasonal') + + def test_reg(): run_ucm('reg') @@ -355,3 +359,89 @@ def test_predict_custom_index(): res = mod.smooth(mod.start_params) out = res.predict(start=1, end=1, index=['a']) assert_equal(out.index.equals(pd.Index(['a'])), True) + + +def test_matrices_somewhat_complicated_model(): + values = dta.copy() + + model = UnobservedComponents(values['unemp'], + level='lltrend', + freq_seasonal=[{'period': 4}, + {'period': 9, 'harmonics': 3}], + cycle=True, + cycle_period_bounds=[2, 30], + damped_cycle=True, + stochastic_freq_seasonal=[True, False], + stochastic_cycle=True + ) + # Selected parameters + params = [1, # irregular_var + 3, 4, # lltrend parameters: level_var, trend_var + 5, # freq_seasonal parameters: freq_seasonal_var_0 + # cycle parameters: cycle_var, cycle_freq, cycle_damp + 6, 2*np.pi/30., .9 + ] + model.update(params) + + # Check scalar properties + assert_equal(model.k_states, 2 + 4 + 6 + 2) + assert_equal(model.k_state_cov, 2 + 1 + 0 + 1) + assert_equal(model.loglikelihood_burn, 2 + 4 + 6 + 2) + assert_allclose(model.ssm.k_posdef, 2 + 4 + 0 + 2) + assert_equal(model.k_params, len(params)) + + # Check the statespace model matrices against hand-constructed answers + # We group the terms by the component + expected_design = np.r_[[1, 0], + [1, 0, 1, 0], + [1, 0, 1, 0, 1, 0], + [1, 0]].reshape(1, 14) + assert_allclose(model.ssm.design[:, :, 0], expected_design) + + expected_transition = __direct_sum([ + np.array([[1, 1], + [0, 1]]), + np.array([[ 0, 1, 0, 0], + [-1, 0, 0, 0], + [0, 0, -1, 0], + [0, 0, 0, -1]]), + np.array([[ np.cos(2*np.pi*1/9.), np.sin(2*np.pi*1/9.), 0, 0, 0, 0], + [-np.sin(2*np.pi*1/9.), np.cos(2*np.pi*1/9.), 0, 0, 0, 0], + [0, 0, np.cos(2*np.pi*2/9.), np.sin(2*np.pi*2/9.), 0, 0], + [0, 0, -np.sin(2*np.pi*2/9.), np.cos(2*np.pi*2/9.), 0, 0], + [0, 0, 0, 0, np.cos(2*np.pi/3.), np.sin(2*np.pi/3.)], + [0, 0, 0, 0, -np.sin(2*np.pi/3.), np.cos(2*np.pi/3.)]]), + np.array([[.9*np.cos(2*np.pi/30.), .9*np.sin(2*np.pi/30.)], + [-.9*np.sin(2*np.pi/30.), .9*np.cos(2*np.pi/30.)]]) + ]) + assert_allclose( + model.ssm.transition[:, :, 0], expected_transition, atol=1e-7) + + # Since the second seasonal term is not stochastic, + # the dimensionality of the state disturbance is 14 - 6 = 8 + expected_selection = np.zeros((14, 14 - 6)) + expected_selection[0:2, 0:2] = np.eye(2) + expected_selection[2:6, 2:6] = np.eye(4) + expected_selection[-2:, -2:] = np.eye(2) + assert_allclose(model.ssm.selection[:, :, 0], expected_selection) + + expected_state_cov = __direct_sum([ + np.diag(params[1:3]), + np.eye(4)*params[3], + np.eye(2)*params[4] + ]) + assert_allclose(model.ssm.state_cov[:, :, 0], expected_state_cov) + + +def __direct_sum(square_matrices): + """Compute the matrix direct sum of an iterable of square numpy 2-d arrays + """ + new_shape = np.sum([m.shape for m in square_matrices], axis=0) + new_array = np.zeros(new_shape) + offset = 0 + for m in square_matrices: 
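+        # Place each square block on the diagonal; entries off the block
+        # diagonal remain zero.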
+ rows, cols = m.shape + assert rows == cols + new_array[offset:offset + rows, offset:offset + rows] = m + offset += rows + return new_array \ No newline at end of file
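As an end-to-end sketch of the interface added by this patch (synthetic data; the series and settings below are illustrative only, not taken from the tests above):

    import numpy as np
    import statsmodels.api as sm

    rng = np.random.RandomState(1234)
    t = np.arange(300)
    # A level of 10 plus two sinusoidal seasonal patterns and noise
    y = (10
         + 3 * np.sin(2 * np.pi * t / 10)
         + 5 * np.sin(2 * np.pi * t / 100)
         + rng.standard_normal(300))

    mod = sm.tsa.UnobservedComponents(
        y,
        level='fixed intercept',
        freq_seasonal=[{'period': 10, 'harmonics': 3},
                       {'period': 100, 'harmonics': 2}])
    res = mod.fit(disp=False)
    print(res.summary())
    res.plot_components()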