"""Classes and Functions for the ziopcpy module."""
import numpy as np
# ZiOPC model converges extremely
# faster with import * rather than import as np.
import pandas as pd
from numpy import *
from scipy.optimize import minimize
from scipy.stats import mvn, norm
[docs]class OpModel:
"""Store model results from :py:func:`opmod`."""
def __init__(self, llik, coef, aic, vcov, data, xs, ts, x_, yx_, yncat,
xstr, ystr):
"""Store model results, goodness-of-fit tests, and other information.
:param llik: Log-Likelihood.
:param coef: Model coefficients.
:param aic: Model Akaike information criterion.
:param vcov: Variance-Covariance matrix.
(optimized as inverted Hessian matrix)
:param data: Full dataset.
:param ts: Cutpoints for ordered probit.
:param xs: Ordered probit estimates (Betas).
:param yncat: Number of categories in the outcome variable.
:param x_: X (Covariates) Data.
:param yx_: Y (Dependent Variable) data.
:param xstr: List of strings for variable(s) in the outcome stage (x).
:param ystr: List of strings for outcome variable name (y).
"""
self.llik = llik
self.coefs = coef
self.AIC = aic
self.vcov = vcov
self.data = data
self.cutpoints = ts
self.ordered = xs
self.ycat = yncat
self.X = x_
self.Y = yx_
self.xstr = xstr
self.ystr = ystr
[docs]class IopModel:
"""Store model results from :py:func:`iopmod`."""
def __init__(
self,
modeltype,
llik,
coef,
aic,
vcov,
data,
xs,
zs,
ts,
x_,
yx_,
z_,
yncat,
xstr,
ystr,
zstr,
):
"""Store model results, goodness-of-fit tests, and other information.
:param modeltype: Type of Iop Model (ziop or miop).
:param llik: Log-Likelihood.
:param coef: Model coefficients
:param aic: Model Akaike information criterion.
:param vcov: Variance-Covariance matrix.
(optimized as inverted Hessian matrix)
:param data: Full dataset.
:param ts: Cutpoints for ordered probit.
:param zs: Inflation stage estimates (Gammas).
:param xs: Ordered probit estimates (Betas).
:param yncat: Number of categories in the Dependent Variable (DV).
:param x_: X Data.
:param yx_: Y (DV) data.
:param z_: Z Data.
:param xstr: List of strings for x names.
:param ystr: List of strings for y names.
:param zstr: List of strings for z names.
"""
self.modeltype = modeltype
self.llik = llik
self.coefs = coef
self.AIC = aic
self.vcov = vcov
self.data = data
self.cutpoints = ts
self.inflate = zs
self.ordered = xs
self.ycat = yncat
self.X = x_
self.Y = yx_
self.Z = z_
self.xstr = xstr
self.ystr = ystr
self.zstr = zstr
[docs]class IopCModel:
"""Store model results from :py:func:`iopcmod`."""
def __init__(
self,
modeltype,
llik,
coef,
aic,
vcov,
data,
xs,
zs,
ts,
x_,
yx_,
z_,
rho,
yncat,
xstr,
ystr,
zstr,
):
"""Store model results, goodness-of-fit tests, and other information.
:param modeltype: Type of IopC Model (ziopc or miopc).
:param llik: Log-Likelihood.
:param coef: Model coefficients.
:param AIC: Model Akaike information criterion.
:param vcov: Variance-Covariance matrix.
(optimized as inverted Hessian matrix)
:param data: Full dataset.
:param ts: Cutpoints for ordered probit.
:param zs: Inflation stage estimates (Gammas).
:param xs: Ordered probit estimates (Betas).
:param rho: Rho.
:param yncat: Number of categories in Dependent Variable (DV).
:param x_: X Data.
:param yx_: Y (DV) data.
:param z_: Z Data.
:param xstr: List of strings for x names.
:param ystr: List of strings for y names.
:param zstr: List of strings for z names.
"""
self.modeltype = modeltype
self.llik = llik
self.coefs = coef
self.AIC = aic
self.vcov = vcov
self.data = data
self.cutpoints = ts
self.inflate = zs
self.ordered = xs
self.ycat = yncat
self.X = x_
self.Y = yx_
self.Z = z_
self.r = rho
self.xstr = xstr
self.ystr = ystr
self.zstr = zstr
[docs]class FittedVals:
"""Store fitted values for iOP models."""
def __init__(self, responsefull, responseordered, responseinflation,
linear):
"""Store different type of equation in each attribute.
:param responsefull: responsefull
:param responseordered: responseordered
:param responseinflation: responseinflation
:param linear: linear
"""
self.responsefull = responsefull
self.responseordered = responseordered
self.responseinflation = responseinflation
self.linear = linear
[docs]def op(pstart, x, y, data, weights, offsetx):
"""Calculate likelihood function for Ordered Probit Model.
:param pstart: A list of starting values for the estimation.
Length of the number of parameters to be estimated.
:type pstart: list
:param x: Covariates for the ordered stage.
:type x: pandas.DataFrame
:param y: The dependent variable (DV).
:type y: pandas.DataFrame
:param data: Dataset.
:type data: pandas.DataFrame
:param weights: Weights.
:param offsetx: Offset for covariates in the ordered stage.
"""
n = len(data)
ycat = y.astype("category")
ycatu = np.unique(ycat)
yncat = len(ycatu)
y0 = np.sort(ycatu)
v = np.zeros((len(data), yncat))
for j in range(yncat):
v[:, j] = y == y0[j]
tau = np.repeat(1.0, yncat)
for j in range(yncat - 1):
if j == 0:
tau[j] = pstart[j]
else:
tau[j] = tau[j - 1] + np.exp(pstart[j])
beta = pstart[(yncat - 1): len(pstart)]
xb = x.dot(beta) + offsetx
cprobs = np.zeros((n, yncat))
probs = np.zeros((n, yncat))
for i in range(yncat - 1):
cprobs[:, i] = norm.cdf(tau[i] - xb)
probs[:, 0] = cprobs[:, 0]
for i in range(1, yncat - 1):
probs[:, i] = cprobs[:, i] - cprobs[:, (i - 1)]
probs[:, yncat - 1] = 1 - cprobs[:, (yncat - 2)]
lik = np.zeros((n, yncat))
for k in range(n):
for j in range(yncat):
lik[k, j] = v[k, j] * probs[k, j]
likk = np.log(lik[lik != 0])
llik = -1 * sum(likk * weights)
return llik
[docs]def ziop(pstart, x, y, z, data, weights, offsetx, offsetz):
"""Calculate likelihood function for Zero-inflated Model.
:param pstart: A list of starting values for the estimation.
Length of the number of parameters to be estimated.
:type pstart: list
:param x: Covariates for the ordered probit stage.
:type x: pandas.DataFrame
:param y: The ordinal dependent Variable (DV).
:type y: pandas.DataFrame
:param z: Covariates for the inflation stage.
:type z: pandas.DataFrame
:param data: Dataset with missing values listwise deleted.
:type data: pandas.DataFrame
:param weights: weights.
:type weights: float
:param offsetx: Offset for the ordered stage covariates (X).
:type offsetx: float
:param offsetz: Offset for the inflation stage covariates (Z).
:type offsetz: float
"""
n = len(data)
ycat = y.astype("category")
ycatu = np.unique(ycat)
yncat = len(ycatu)
y0 = np.sort(ycatu)
v = np.zeros((len(data), yncat))
for j in range(yncat):
v[:, j] = y == y0[j]
tau = np.repeat(1.0, yncat)
for j in range(yncat - 1):
if j == 0:
tau[j] = pstart[j]
else:
tau[j] = tau[j - 1] + np.exp(pstart[j])
beta = pstart[(yncat + len(z.columns) - 1): len(pstart)]
gamma = pstart[(yncat - 1): (yncat + len(z.columns) - 1)]
zg = z.dot(gamma) + offsetz
xb = x.dot(beta) + offsetx
cprobs = np.zeros((n, yncat))
probs = np.zeros((n, yncat))
for i in range(yncat - 1):
cprobs[:, i] = norm.cdf(tau[i] - xb)
probs[:, 0] = (cprobs[:, 0]) * norm.cdf(zg) + (1 - norm.cdf(zg))
for i in range(1, yncat - 1):
probs[:, i] = (cprobs[:, i] - cprobs[:, (i - 1)]) * norm.cdf(zg)
probs[:, yncat - 1] = (1 - cprobs[:, (yncat - 2)]) * norm.cdf(zg)
lik = np.zeros((n, yncat))
for k in range(n):
for j in range(yncat):
lik[k, j] = v[k, j] * probs[k, j]
likk = np.log(lik[lik != 0])
llik = -1 * sum(likk * weights)
return llik
[docs]def ziopc(pstart, x, y, z, data, weights, offsetx, offsetz):
"""Calculate likelihood function for Zero-inflated Correlated-Errors Model.
:param pstart: A list of starting values for the estimation.
Length of the number of parameters to be estimated.
:type pstart: list
:param x: Covariates for the ordered probit stage.
:type x: pandas.DataFrame
:param y: The dependent variable (DV).
:type y: pandas.DataFrame
:param z: Covariates for the inflation stage.
:type z: pandas.DataFrame
:param data: Dataset with missing values listwise deleted.
:type data: pandas.DataFrame
:param weights: Weights.
:type weights: float
:param offsetx: Offset for the ordered probit stage covariates (X).
:type offsetx: float
:param offsetz: Offset for the inflation stage (Z).
:type offsetz: float
"""
n = len(data)
ycat = y.astype("category")
ycatu = np.unique(ycat)
yncat = len(ycatu)
y0 = np.sort(ycatu)
v = np.zeros((len(data), yncat))
for j in range(yncat):
v[:, j] = y == y0[j]
tau = np.repeat(1.0, yncat)
for j in range(yncat - 1):
if j == 0:
tau[j] = pstart[j]
else:
tau[j] = tau[j - 1] + np.exp(pstart[j])
beta = pstart[(yncat + len(z.columns) - 1): len(pstart) - 1]
gamma = pstart[(yncat - 1): (yncat + len(z.columns) - 1)]
x_beta = x.dot(beta)
# correlation
rho = pstart[len(pstart) - 1]
cprobs = np.zeros((len(x_beta), yncat))
probs = np.zeros((len(x_beta), yncat))
cut = np.zeros((len(x_beta), yncat))
cutb = np.zeros((len(x_beta), yncat))
zg = z.dot(gamma) + offsetz
xb = x.dot(beta) + offsetx
means = np.array([0, 0])
lower = np.array([-inf, -inf])
sigma = np.array([[1, rho], [rho, 1]])
nsigma = np.array([[1, -rho], [-rho, 1]])
for i in range(yncat - 1):
cprobs[:, i] = norm.cdf(tau[i] - xb)
cut[:, i] = tau[i] - xb
cutb[:, i] = xb - tau[i]
upperb = np.zeros((len(x_beta), 2))
upper = np.zeros((len(x_beta), 2))
for j in range(n):
upperb[j, :] = [zg[j], cutb[j, yncat - 2]]
upper[j, :] = [zg[j], cut[j, 0]]
probs[j, yncat - 1] = mvn.mvnun(lower, upperb[j], means, sigma)[0]
probs[j, 0] = (1 - norm.cdf(zg[j])) + mvn.mvnun(lower, upper[j],
means, nsigma)[0]
for j in range(n):
for i in range(1, yncat - 1):
probs[j, i] = (
mvn.mvnun(lower, [zg[j],
cut[j, i]], means, nsigma)[0]
- mvn.mvnun(lower, [zg[j],
cut[j, i - 1]], means, nsigma)[0])
lik = np.zeros((n, yncat))
for k in range(n):
for j in range(yncat):
lik[k, j] = v[k, j] * probs[k, j]
likk = np.log(lik[lik != 0])
llik = -1 * sum(likk * weights)
return llik
[docs]def miop(pstart, x, y, z, data, weights, offsetx, offsetz):
"""
Likelihood function for Middle-inflated Ordered Probit Model
"without" correlated errors.
The number of categories in the dependent
variable must be odd.
:param pstart: A list of starting values for the estimation.
Length of the number of parameters to be estimated.
:type pstart: list
:param x: Covariates in the ordered probit stage.
:type x: pandas.DataFrame
:param y: The dependent variable (DV).
:type y: pandas.DataFrame
:param z: Covariates in the inflation stage.
:type z: pandas.DataFrame
:param data: Dataset with missing values listwise deleted.
:type data: pandas.DataFrame
:param weights: Weights.
:type weights: float
:param offsetx: Offset for covariates in the ordered probit stage (X).
:type offsetx: float
:param offsetz: Offset for covariates in the inflation stage.
:type offsetz: float
"""
n = len(data)
ycat = y.astype("category")
ycatu = np.unique(ycat)
yncat = len(ycatu)
y0 = np.sort(ycatu)
v = np.zeros((len(data), yncat))
for j in range(yncat):
v[:, j] = y == y0[j]
tau = np.repeat(1.0, yncat)
for j in range(yncat - 1):
if j == 0:
tau[j] = pstart[j]
else:
tau[j] = tau[j - 1] + np.exp(pstart[j])
beta = pstart[(yncat + len(z.columns) - 1): len(pstart)]
gamma = pstart[(yncat - 1): (yncat + len(z.columns) - 1)]
zg = z.dot(gamma) + offsetz
xb = x.dot(beta) + offsetx
cprobs = np.zeros((n, yncat))
probs = np.zeros((n, yncat))
for i in range(yncat - 1):
cprobs[:, i] = norm.cdf(tau[i] - xb)
probs[:, 0] = (cprobs[:, 0]) * norm.cdf(zg)
for i in range(1, yncat - 1):
if i == np.median(range(yncat)):
probs[:, i] = (1 - norm.cdf(zg)) + (
norm.cdf(zg) * (cprobs[:, i] - cprobs[:, (i - 1)])
)
else:
probs[:, i] = norm.cdf(zg) * (cprobs[:, i] - cprobs[:, (i - 1)])
probs[:, yncat - 1] = (1 - cprobs[:, (yncat - 2)]) * norm.cdf(zg)
lik = np.zeros((n, yncat))
for k in range(n):
for j in range(yncat):
lik[k, j] = v[k, j] * probs[k, j]
likk = np.log(lik[lik != 0])
llik = -1 * sum(likk * weights)
return llik
[docs]def miopc(pstart, x, y, z, data, weights, offsetx, offsetz):
"""
Likelihood function for Middle-inflated Correlated-Errors Model.
The number of categories in the dependent variable must be odd.
:param pstart: A list of starting values for the estimation.
Length of the number of parameters to be estimated.
:type pstart: list
:param x: Covariates in the ordered probit stage.
:type x: pandas.DataFrame
:param y: The dependent variable (DV).
:type y: pandas.DataFrame
:param z: Covariates in the inflation stage.
:type z: pandas.DataFrame
:param data: Dataset with missing values listwise deleted.
:type data: pandas.DataFrame
:param weights: Weights.
:type weights: float
:param offsetx: Offset for the ordered probit stage covariates (X).
:type offsetx: float
:param offsetz: Offset for the covariates in the inflation stage (Z).
:type offsetz: float
"""
n = len(data)
ycat = y.astype("category")
ycatu = np.unique(ycat)
yncat = len(ycatu)
y0 = np.sort(ycatu)
v = np.zeros((len(data), yncat))
for j in range(yncat):
v[:, j] = y == y0[j]
tau = np.repeat(1.0, yncat)
for i in range(yncat - 1):
if i == 0:
tau[i] = pstart[i]
else:
tau[i] = tau[i - 1] + np.exp(pstart[i])
beta = pstart[(yncat + len(z.columns) - 1): len(pstart) - 1]
gamma = pstart[(yncat - 1): (yncat + len(z.columns) - 1)]
x_beta = x.dot(beta)
rho = pstart[len(pstart) - 1]
cprobs = np.zeros((len(x_beta), yncat))
probs = np.zeros((len(x_beta), yncat))
cutpoint = np.zeros((len(x_beta), yncat))
cutpointb = np.zeros((len(x_beta), yncat))
zg = z.dot(gamma) + offsetz
xb = x.dot(beta) + offsetx
means = np.array([0, 0])
lower = np.array([-inf, -inf])
sigma = np.array([[1, rho], [rho, 1]])
nsigma = np.array([[1, -rho], [-rho, 1]])
for i in range(yncat - 1):
cprobs[:, i] = norm.cdf(tau[i] - xb)
cutpoint[:, i] = tau[i] - xb
cutpointb[:, i] = xb - tau[i]
upperb = np.zeros((len(x_beta), 2))
upper = np.zeros((len(x_beta), 2))
for j in range(n):
upperb[j, :] = [zg[j], cutpointb[j, yncat - 2]]
upper[j, :] = [zg[j], cutpoint[j, 0]]
probs[j, yncat - 1] = mvn.mvnun(lower, upperb[j], means, sigma)[0]
probs[j, 0] = mvn.mvnun(lower, upper[j], means, nsigma)[0]
for i in range(n):
for j in range(1, yncat - 1):
if j == np.median(range(yncat)):
probs[i, j] = (
(1 - norm.cdf(zg[i]))
+ mvn.mvnun(lower, [zg[i], cutpoint[i, j]], means,
nsigma)[0]
- mvn.mvnun(lower, [zg[i], cutpoint[i, j - 1]], means,
nsigma)[0]
)
else:
probs[i, j] = (
mvn.mvnun(lower, [zg[i], cutpoint[i, j]], means,
nsigma)[0]
- mvn.mvnun(lower, [zg[i], cutpoint[i, j - 1]], means,
nsigma)[0]
)
lik = np.zeros((n, yncat))
for k in range(n):
for j in range(yncat):
lik[k, j] = v[k, j] * probs[k, j]
likk = np.log(lik[lik != 0])
llik = -1 * sum(likk * weights)
return llik
[docs]def opresults(model, data, x, y):
"""Produces estimation results, part of :py:func:`opmod`.
:param model: Model estimation results obtained from minimization.
:param data: Dataset.
:type data: pandas.DataFrame
:param x: List of names for independent variables matching column names
in data.
:type x: list of str
:param y: : List of name for dependent Variablematching column names
in data.
"""
varlist = np.unique(y + x)
dataset = data[varlist]
datasetnew = dataset.dropna(how="any")
datasetnew = datasetnew.reset_index(drop=True)
x_ = datasetnew[x]
y_ = datasetnew[y]
yx_ = y_.iloc[:, 0]
yncat = len(np.unique(yx_))
names = list()
for s in range(1, yncat):
names.append("cut" + str(s))
for s in range(x_.shape[1]):
names.append(x_.columns[s])
ts = model.x[0: yncat - 1]
xs = model.x[(yncat - 1): (yncat + x_.shape[1] - 1)]
ses = np.sqrt(np.diag(model.hess_inv))
tscore = model.x / ses
pval = (1 - (norm.cdf(abs(tscore)))) * 2
lci = model.x - 1.96 * ses
uci = model.x + 1.96 * ses
coef = pd.DataFrame(
{
"Coef": model.x,
"SE": ses,
"tscore": tscore,
"p": pval,
"2.5%": lci,
"97.5%": uci,
},
names,
)
aic = -2 * (-model.fun) + 2 * (len(coef))
results = OpModel(
model.fun, coef, aic, model.hess_inv, datasetnew, xs, ts, x_, yx_,
yncat, x, y
)
return results
[docs]def opmod(data, x, y, pstart=None, method="BFGS", weights=1, offsetx=0):
"""Estimates Ordered Probit model and returns :class:`OpModel` class object.
:param pstart: A list of starting values for the estimation.
Length of the number of parameters to be estimated.
:type pstart: list
:param data: Full dataset used for estimation.
:type data: pandas.DataFrame
:type x: list of str
:param y: Dependent Variable (DV).
:type y: list of str
:param method: Method for optimization, default is 'BFGS'. For other
available methods, see scipy.optimize.minimize documentation.
:param weights: Weights.
:param offsetx: Offset for covariates (X).
:return: OpModel
"""
varlist = np.unique(y + x)
dataset = data[varlist]
datasetnew = dataset.dropna(how="any")
x_ = datasetnew[x]
y_ = datasetnew[y]
yx_ = y_.iloc[:, 0]
yncat = len(np.unique(yx_))
if pstart is None:
pstart = np.repeat(0.01, ((yncat - 1) + len(x_.columns)))
model = minimize(
op,
pstart,
args=(x_, yx_, datasetnew, weights, offsetx),
method=method,
options={"disp": True, "maxiter": 500},
)
results = opresults(model, data, x, y)
return results
[docs]def iopresults(model, data, x, y, z, modeltype):
"""Produce estimation results, part of :py:func:`iopmod`.
:param model: Model estimation results obtained from minimization.
:param data: Full dataset used for estimation.
:type data: pandas.DataFrame
:param x: List of names for covariates in the ordered probit stage.
:type x: list of str
:param y: : List of names for dependent cariable.
:type y: list of str
:param z: : List of names for covariates in the inflation stage.
:type z: list of str
:param modeltype: : Type of model. Options are: 'ziop' or 'miop'.
"""
varlist = np.unique(y + z + x)
dataset = data[varlist]
datasetnew = dataset.dropna(how="any")
datasetnew = datasetnew.reset_index(drop=True)
x_ = datasetnew[x]
y_ = datasetnew[y]
yx_ = y_.iloc[:, 0]
yncat = len(np.unique(yx_))
z_ = datasetnew[z]
z_.insert(0, "int", np.repeat(1, len(z_)))
names = list()
for s in range(1, yncat):
names.append("cut" + str(s))
for s in range(z_.shape[1]):
names.append("Inflation: " + z_.columns[s])
for s in range(x_.shape[1]):
names.append("Ordered: " + x_.columns[s])
ts = model.x[0: yncat - 1]
zs = model.x[yncat - 1: (yncat + z_.shape[1] - 1)]
xs = model.x[
(yncat + z_.shape[1] - 1): (yncat + z_.shape[1] + x_.shape[1] - 1)]
ses = np.sqrt(np.diag(model.hess_inv))
tscore = model.x / ses
pval = (1 - (norm.cdf(abs(tscore)))) * 2
lci = model.x - 1.96 * ses
uci = model.x + 1.96 * ses
coef = pd.DataFrame(
{
"Coef": model.x,
"SE": ses,
"tscore": tscore,
"p": pval,
"2.5%": lci,
"97.5%": uci,
},
names,
)
aic = -2 * (-model.fun) + 2 * (len(coef))
results = IopModel(
modeltype,
model.fun,
coef,
aic,
model.hess_inv,
datasetnew,
xs,
zs,
ts,
x_,
yx_,
z_,
yncat,
x,
y,
z,
)
return results
[docs]def iopcresults(model, data, x, y, z, modeltype):
"""Produce estimation results, part of :py:func:`ziopc mod`.
:param model: Model estimation results obtained from minimization.
:param data: Full dataset used for estimation.
:type data: pandas.DataFrame
:param x: List of names for covariates in the ordered probit stage.
:type x: list of str
:param y: : List of names for dependent cariable.
:type y: list of str
:param z: : List of names for covariates in the inflation stage.
:type z: list of str
:param modeltype: : Type of model. Options are: 'ziopc' or 'miopc'
"""
varlist = np.unique(y + z + x)
dataset = data[varlist]
datasetnew = dataset.dropna(how="any")
datasetnew = datasetnew.reset_index(drop=True)
x_ = datasetnew[x]
y_ = datasetnew[y]
yx_ = y_.iloc[:, 0]
yncat = len(np.unique(yx_))
z_ = datasetnew[z]
z_.insert(0, "int", np.repeat(1, len(z_)))
names = list()
for s in range(1, yncat):
names.append("cut" + str(s))
for s in range(z_.shape[1]):
names.append("Inflation: " + z_.columns[s])
for s in range(x_.shape[1]):
names.append("Ordered: " + x_.columns[s])
names.append("rho")
ts = model.x[0: yncat - 1]
zs = model.x[yncat - 1: (yncat + z_.shape[1] - 1)]
xs = model.x[
(yncat + z_.shape[1] - 1): (yncat + z_.shape[1] + x_.shape[1] - 1)]
rho = model.x[-1]
ses = np.sqrt(np.diag(model.hess_inv))
tscore = model.x / ses
pval = (1 - (norm.cdf(abs(tscore)))) * 2
lci = model.x - 1.96 * ses
uci = model.x + 1.96 * ses
coef = pd.DataFrame(
{
"Coef": model.x,
"SE": ses,
"tscore": tscore,
"p": pval,
"2.5%": lci,
"97.5%": uci,
},
names,
)
aic = -2 * (-model.fun) + 2 * (len(coef))
results = IopCModel(
modeltype,
model.fun,
coef,
aic,
model.hess_inv,
datasetnew,
xs,
zs,
ts,
x_,
yx_,
z_,
rho,
yncat,
x,
y,
z,
)
return results
[docs]def iopmod(
modeltype,
data,
x,
y,
z,
pstart=None,
method="BFGS",
weights=1,
offsetx=0,
offsetz=0,
):
"""Estimate ZiOP model and return :class:`IopModel` class object as output.
:param pstart: A list of starting values for the estimation.
Length of the number of parameters to be estimated.
:type pstart: list
:param data: Full dataset used for estimation.
:type data: pandas.DataFrame
:param x: Covariates in the ordered probit stage.
Elements must match column names of ``data``.
:type x: list of str
:param y: Dependent variable (DV). Element must match column
names of ``data``.
:type y: list of str
:param z: Inflation stage variable. Elements must match column
names of ``data``.
:type z: list of str
:param modeltype: Type of model to be estimated.
Options are: "ziop" or 'miop'.
:param method: Method for optimization, default 'BFGS'. For other
available methods, see scipy.optimize.minimize documentation.
:param weights: Weights.
:param offsetx: Offset for ordered probit stage covariates (X).
:param offsetz: Offset for inflation stage covariates (Z).
:return: IopModel
"""
types = ["ziop", "miop"]
if modeltype in types:
varlist = np.unique(y + z + x)
dataset = data[varlist]
datasetnew = dataset.dropna(how="any")
x_ = datasetnew[x]
y_ = datasetnew[y]
yx_ = y_.iloc[:, 0]
yncat = len(np.unique(yx_))
z_ = datasetnew[z]
z_.insert(0, "ones", np.repeat(1, len(z_)))
if pstart is None:
pstart = np.repeat(0.01, (
(yncat - 1) + len(x_.columns) + len(z_.columns)))
if modeltype == "ziop":
model = minimize(
ziop,
pstart,
args=(x_, yx_, z_, datasetnew, weights, offsetx, offsetz),
method=method,
options={"disp": True, "maxiter": 500},
)
elif modeltype == "miop":
if len(np.unique(y_.astype("category").iloc[:, 0])) % 2 == 1:
model = minimize(
miop,
pstart,
args=(x_, yx_, z_, datasetnew, weights, offsetx, offsetz),
method=method,
options={"disp": True, "maxiter": 500},
)
else:
raise Exception("miop requires odd number of categories.")
results = iopresults(model, data, x, y, z, modeltype)
return results
else:
raise Exception("type must be ziop or miop")
[docs]def iopcmod(
modeltype,
data,
x,
y,
z,
pstart=None,
method="BFGS",
weights=1,
offsetx=0,
offsetz=0,
):
"""Estimate an iOP model (ZiOP or MiOP) and return :class:`IopcModel`.
:param pstart: A list of starting values for the estimation.
Length of the number of parameters to be estimated.
:type pstart: list
:param data: Dataset used for estimation.
:type data: pandas.DataFrame
:param x: Covariates for the ordered probit stage.
Elements must match column names of ``data``.
:type x: list of str
:param y: The dependent variable (DV). Element must match column
names of ``data``.
:type y: list of str
:param z: Covariates for the inflation stage. Elements must match column
names of ``data``.
:type z: list of str
:param modeltype: Type of model to be estimated.
Options are: 'ziopc' or 'miopc'.
:param method: Method for optimization, default is 'BFGS'. For other
available methods, see scipy.optimize.minimize documentation.
:param weights: Weights.
:param offsetx: Offset for variables in ordered probit stage (X).
:param offsetz: Offset for variables in inflation stage (Z).
:return: IopCModel
"""
types = ["ziopc", "miopc"]
if modeltype in types:
varlist = np.unique(y + z + x)
dataset = data[varlist]
datasetnew = dataset.dropna(how="any")
datasetnew = datasetnew.reset_index(drop=True)
x_ = datasetnew[x]
y_ = datasetnew[y]
yx_ = y_.iloc[:, 0]
yncat = len(np.unique(yx_))
z_ = datasetnew[z]
z_.insert(0, "ones", np.repeat(1, len(z_)))
if pstart is None:
pstart = np.repeat(
0.01, ((yncat - 1) + len(x_.columns) + len(z_.columns) + 1)
)
if modeltype == "ziopc":
model = minimize(
ziopc,
pstart,
args=(x_, yx_, z_, datasetnew, weights, offsetx, offsetz),
method=method,
options={"disp": True, "maxiter": 500},
)
elif modeltype == "miopc":
if len(np.unique(y_.astype("category").iloc[:, 0])) % 2 == 1:
model = minimize(
miopc,
pstart,
args=(x_, yx_, z_, datasetnew, weights, offsetx, offsetz),
method=method,
options={"disp": True, "maxiter": 500},
)
else:
raise Exception("miopc requires odd number of categories.")
return
results = iopcresults(model, data, x, y, z, modeltype)
return results
else:
raise Exception("type must be ziopc or miopc")
[docs]def iopfit(model):
"""Calculate probabilities from :py:func:`iopmod`.
:param model: :class:IopModel object from :py:func:`iopmod`.
:return: :class:FittedVals object with fitted values.
"""
zg = model.Z.dot(model.inflate)
xb = model.X.dot(model.ordered)
cprobs = np.zeros((model.ycat - 1, 1))
n = len(model.data)
probs = np.zeros((n, model.ycat))
cprobs[0, 0] = model.cutpoints[0]
if model.modeltype == "ziop":
for j in range(1, model.ycat - 1):
cprobs[j, 0] = cprobs[j - 1, 0] + np.exp(model.cutpoints[j])
probs[:, model.ycat - 1] = (norm.cdf(zg)) * (
1 - norm.cdf(cprobs[model.ycat - 2, 0] - xb)
)
probs[:, 0] = (1 - norm.cdf(zg)) + (norm.cdf(zg)) * (
norm.cdf(cprobs[0, 0] - xb)
)
for j in range(1, model.ycat - 1):
probs[:, j] = (norm.cdf(zg)) * ((norm.cdf(cprobs[j, 0] - xb))
- (norm.cdf(cprobs[j - 1, 0]
- xb)))
elif model.modeltype == "miop":
for j in range(1, model.ycat - 1):
cprobs[j, 0] = cprobs[j - 1, 0] + np.exp(model.cutpoints[j])
probs[:, model.ycat - 1] = (norm.cdf(zg)) * (
1 - norm.cdf(cprobs[model.ycat - 2, 0] - xb)
)
probs[:, 0] = norm.cdf(zg) * norm.cdf(cprobs[0, 0] - xb)
for i in range(1, model.ycat - 1):
if i == np.median(range(model.ycat)):
probs[:, i] = (1 - norm.cdf(zg)) + (
norm.cdf(zg)
* (norm.cdf(cprobs[j, 0] - xb)
- norm.cdf(cprobs[j - 1, 0] - xb)))
else:
probs[:, i] = norm.cdf(zg) * (
norm.cdf(cprobs[:, i] - xb)
- norm.cdf(cprobs[j - 1, 0] - xb))
probsordered = np.zeros((n, model.ycat))
probsordered[:, model.ycat - 1] = 1 - norm.cdf(
cprobs[model.ycat - 2, 0] - xb)
probsordered[:, 0] = norm.cdf(cprobs[0, 0] - xb)
for j in range(1, model.ycat - 1):
probsordered[:, j] = (norm.cdf(cprobs[j, 0] - xb)) - (
norm.cdf(cprobs[j - 1, 0] - xb)
)
probsinfl = np.zeros((n, 1))
probsinfl[:, 0] = 1 - norm.cdf(zg)
probslin = pd.DataFrame({"ZG": zg, "XB": xb})
fitted = FittedVals(probs, probsordered, probsinfl, probslin)
return fitted
[docs]def iopcfit(model):
"""Calculate fitted probabilities from :py:func:`iopcmod`.
:param model: :class:`IopCModel` object from :py:func:`iopcmod`.
:return: :class:`FittedVals` object with fitted values.
"""
zg = model.Z.dot(model.inflate)
xb = model.X.dot(model.ordered)
cprobs = np.zeros((model.ycat - 1, 1))
n = len(model.data)
probs = np.zeros((n, model.ycat))
cprobs[0, 0] = model.cutpoints[0]
rho = model.coefs.iloc[-1, 0]
means = np.array([0, 0])
lower = np.array([-inf, -inf])
sigma = np.array([[1, rho], [rho, 1]])
nsigma = np.array([[1, -rho], [-rho, 1]])
if model.modeltype == "ziopc":
for j in range(1, model.ycat - 1):
cprobs[j, 0] = cprobs[j - 1, 0] + np.exp(model.cutpoints[j])
for i in range(n):
probs[i, model.ycat - 1] = mvn.mvnun(
lower, [zg[i], (xb[i] - cprobs[model.ycat - 2][0])], means,
sigma
)[0]
probs[i, 0] = (1 - norm.cdf(zg[i])) + mvn.mvnun(
lower, [zg[i], (cprobs[0][0] - xb[i])], means, nsigma
)[0]
for i in range(n):
for j in range(1, model.ycat - 1):
probs[i, j] = (
mvn.mvnun(lower,
[zg[i], (cprobs[j][0] - xb[i])],
means, nsigma)[0]
) - (
mvn.mvnun(
lower,
[zg[i], (cprobs[j - 1][0] - xb[i])],
means, nsigma
)[0]
)
elif model.modeltype == "miopc":
for j in range(1, model.ycat - 1):
cprobs[j, 0] = cprobs[j - 1, 0] + np.exp(model.cutpoints[j])
for i in range(n):
probs[i, model.ycat - 1] = mvn.mvnun(
lower, [zg[i], (xb[i] - cprobs[model.ycat - 2][0])], means,
sigma
)[0]
probs[i, 0] = mvn.mvnun(
lower, [zg[i], (cprobs[0][0] - xb[i])], means, nsigma
)[0]
for i in range(n):
for j in range(1, model.ycat - 1):
if j == np.median(range(model.ycat)):
probs[i, j] = (1 - norm.cdf(zg[i])) + (
(mvn.mvnun(
lower, [zg[i], (cprobs[j][0] - xb[i])],
means, nsigma)[0])
- (mvn.mvnun(lower,
[zg[i], (cprobs[j - 1][0] -
xb[i])],
means, nsigma)[0]))
else:
probs[i, j] = (mvn.mvnun(lower,
[zg[i], (cprobs[j][0] - xb[i])],
means, nsigma)[0]) - (
mvn.mvnun(lower, [zg[i],
(cprobs[j - 1][0] -
xb[i])],
means, nsigma)[0])
# ordered
probsordered = np.zeros((n, model.ycat))
probsordered[:, model.ycat - 1] = 1 - norm.cdf(
cprobs[model.ycat - 2, 0] - xb)
probsordered[:, 0] = norm.cdf(cprobs[0, 0] - xb)
for j in range(1, model.ycat - 1):
probsordered[:, j] = norm.cdf(cprobs[j, 0] - xb) - (
norm.cdf(cprobs[j - 1, 0] - xb)
)
probsinfl = np.zeros((n, 1))
probsinfl[:, 0] = 1 - norm.cdf(zg)
probslin = pd.DataFrame({"zg": zg, "xb": xb})
fitted = FittedVals(probs, probsordered, probsinfl, probslin)
return fitted
[docs]def vuong_opiop(opmodel, iopmodel):
"""Run the Vuong test to compare the performance of the OP and iOP model.
:param opmodel: The OP model from :class:`OpModel`.
:param iopmodel: The ZiOP model from :class:`IopModel`.
:return: vuongopiop: Result of the Vuong test.
"""
n1 = len(opmodel.data)
y = iopmodel.Y
# can also y = opmodel.Y when 2 models have the same length
x = iopmodel.X
# can also x = opmodel.X when 2 models have the same length
cuts_op = np.repeat(0, len(opmodel.cutpoints)).astype(float)
xop = opmodel.ordered
cuts_op[0] = opmodel.cutpoints[0]
for i in range(1, len(opmodel.cutpoints)):
cuts_op[i] = cuts_op[i - 1] + np.exp(opmodel.cutpoints[i])
xbop = pd.DataFrame(index=np.arange(n1), columns=np.arange(len(x.columns)))
for j in range(len(x.columns)):
xbop.iloc[:, j] = xop[j] * x.iloc[:, j]
xbop_sum = xbop.sum(axis=1)
fitttediop = iopfit(iopmodel).responsefull
ycat = y.astype("category")
ycatu = np.unique(ycat)
yncat = len(ycatu)
y0 = np.sort(ycatu)
v = np.zeros((n1, yncat))
for j in range(yncat):
v[:, j] = y == y0[j]
m = np.zeros(n1)
probs = np.zeros((n1, yncat))
probs[:, 0] = norm.cdf(cuts_op[0] - xbop_sum) / fitttediop[:, 0]
probs[:, yncat - 1] = (1 - norm.cdf(
cuts_op[yncat - 2] - xbop_sum)) / fitttediop[:, yncat - 1]
for i in range(1, yncat - 1):
probs[:, i] = (norm.cdf(cuts_op[i] - xbop_sum)
- norm.cdf(cuts_op[i - 1]
- xbop_sum)) / fitttediop[:, i]
m = np.zeros((n1, yncat))
for k in range(n1):
for j in range(yncat):
m[k, j] = v[k, j] * probs[k, j]
m2 = m[m != 0]
mlog = np.log(m2)
diffmsq = (mlog - np.mean(mlog)) ** 2
sumdms = sum(diffmsq)
vuongopiop = (np.sqrt(n1) * (1 / n1) * sum(mlog)) / (
np.sqrt((1 / n1) * sumdms))
return vuongopiop
[docs]def vuong_opiopc(opmodel, iopcmodel):
"""Run the Vuong test to compare the performance of the OP and iOPC model.
:param opmodel: The OP model from :class:`OpModel`.
:param iopcmodel: The iOPC model from :class:`IopCModel`.
:return: vuongopiopc: Result of the Vuong test.
"""
n1 = len(opmodel.data)
y = iopcmodel.Y
x = iopcmodel.X
cuts_op = np.repeat(0, len(opmodel.cutpoints)).astype(float)
xop = opmodel.ordered
cuts_op[0] = opmodel.cutpoints[0]
for i in range(1, len(opmodel.cutpoints)):
cuts_op[i] = cuts_op[i - 1] + np.exp(opmodel.cutpoints[i])
xbop = pd.DataFrame(index=np.arange(n1), columns=np.arange(len(x.columns)))
for j in range(len(x.columns)):
xbop.iloc[:, j] = xop[j] * x.iloc[:, j]
xbop_sum = xbop.sum(axis=1)
fitttedziopc = iopcfit(iopcmodel).responsefull
ycat = y.astype("category")
ycatu = np.unique(ycat)
yncat = len(ycatu)
y0 = np.sort(ycatu)
v = np.zeros((n1, yncat))
for j in range(yncat):
v[:, j] = y == y0[j]
probs = np.zeros((n1, yncat))
probs[:, 0] = norm.cdf(cuts_op[0] - xbop_sum) / fitttedziopc[:, 0]
probs[:, yncat - 1] = (1 - norm.cdf(
cuts_op[yncat - 2] - xbop_sum)) / fitttedziopc[:, yncat - 1]
for i in range(1, yncat - 1):
probs[:, i] = (norm.cdf(cuts_op[i] - xbop_sum) - norm.cdf(
cuts_op[i - 1] - xbop_sum)) / fitttedziopc[:, i]
m = np.zeros((n1, yncat))
for k in range(n1):
for j in range(yncat):
m[k, j] = v[k, j] * probs[k, j]
m2 = m[m != 0]
mlog = np.log(m2)
diffmsq = (mlog - np.mean(mlog)) ** 2
sumdms = sum(diffmsq)
vuongopiopc = (np.sqrt(n1) * (1 / n1) * sum(mlog)) / (
np.sqrt((1 / n1) * sumdms))
return vuongopiopc
[docs]def split_effects(model, inflvar, nsims=10000):
"""Calculate change in probability of being 0 in the split-probit stage.
This function calculates the predicted probabilities
when there is change in value of a variable in the split-probit equation.
The chosen dummy variable is changed from 0 to 1,
and chosen numerical variable is mean value + 1 standard deviation.
Other variables are kept at 0 or mean value
(Note: the current version of the function
recognizes ordinal variables as numerical).
:param model: :class:`IopModel` or :class:`IopCModel`.
:param inflvar: Number representing the location of variable
in the split-probit equation.
(attribute .inflate of :class:`IopModel` or :class:`IopCModel`)
:type inflvar: int
:param nsims: Number of simulated observations, default is 10000.
:type nsims: int
:return: changeprobs: A dataframe of the predicted
probabilities when there is change in the variable (1)
versus original values (0).
"""
estimate = model.coefs.iloc[:, 0]
vcov = model.vcov
model_z = model.Z
zsim1 = np.zeros(len(model_z.columns))
zsim1[0] = 1
zsima = np.zeros(len(model_z.columns))
zsima[0] = 1
for j in range(1, len(model_z.columns)):
if (
max(model_z.iloc[:, j]) == 1
and min(model_z.iloc[:, j]) == 0
and len(np.unique(model_z.iloc[:, j])) == 2
):
zsim1[j] = 0
else:
zsim1[j] = np.mean(model_z.iloc[:, j])
for j in range(1, len(model_z.columns)):
if (
max(model_z.iloc[:, j]) == 1
and min(model_z.iloc[:, j]) == 0
and len(np.unique(model_z.iloc[:, j])) == 2
):
zsima[j] = 1
else:
zsima[j] = np.mean(model_z.iloc[:, j]) + np.std(model_z.iloc[:, j])
zsim2 = zsim1.copy()
zsim2[inflvar] = zsima[inflvar]
np.random.seed(1)
probs1 = np.zeros(nsims)
probs2 = np.zeros(nsims)
for i in range(nsims):
gsim = np.random.multivariate_normal(estimate, vcov)
gsim2 = gsim[model.ycat - 1: model.ycat - 1 + len(model.inflate)]
zg1 = zsim1.dot(gsim2)
zg2 = zsim2.dot(gsim2)
probs1[i] = norm.cdf(zg1)
probs2[i] = norm.cdf(zg2)
name = model.coefs.index[model.ycat - 1 + inflvar]
changeprobs = pd.DataFrame({name.replace("Inflation: ", "") + "= 0": probs1,
name.replace("Inflation: ", "") + "= 1": probs2}
)
return changeprobs
[docs]def ordered_effects(model, ordvar, nsims=10000):
"""Calculate the changes in probability in each outcome in OP stage.
This function calculates predicted probabilities
when there is a change in the value of a covariate
in the ordered probit equation.
The chosen dummy variable is changed from 0 to 1,
and chosen numerical variable is mean value + 1 standard deviation.
Other variables are kept at 0 or mean value
(Note: the current version of the function
recognizes ordinal variables as numerical).
:param model: :class:`IopModel` or :class:`IopCModel`.
:param ordvar: Number representing the location of variable
in the ordered probit equation.
(attribute .ordered of :class:`IopModel` or :class:`IopCModel`)
:type ordvar: int
:param nsims: Number of simulated observations, default is 10000.
:type nsims: int
:return: changeprobs: A dataframe of the predicted
probabilities when there is change in the variable for each outcome (1)
versus original values (0).
"""
estimate = model.coefs.iloc[:, 0]
vcov = model.vcov
model_x = model.X
xsim1 = np.zeros(len(model_x.columns))
xsima = np.zeros(len(model_x.columns))
for j in range(len(model_x.columns)):
if (
max(model_x.iloc[:, j]) == 1
and min(model_x.iloc[:, j]) == 0
and len(np.unique(model_x.iloc[:, j])) == 2
):
xsim1[j] = 0
else:
xsim1[j] = np.mean(model_x.iloc[:, j])
for j in range(len(model_x.columns)):
if (
max(model_x.iloc[:, j]) == 1
and min(model_x.iloc[:, j]) == 0
and len(np.unique(model_x.iloc[:, j])) == 2
):
xsima[j] = 1
else:
xsima[j] = np.mean(model_x.iloc[:, j]) + np.std(model_x.iloc[:, j])
xsim2 = xsim1.copy()
xsim2[ordvar] = xsima[ordvar]
np.random.seed(1)
probsordered1 = np.zeros(model.ycat)
probsordered2 = np.zeros(model.ycat)
cprobs = np.zeros((model.ycat - 1, 1))
cprobs[0, 0] = model.cutpoints[0]
for j in range(1, model.ycat - 1):
cprobs[j, 0] = cprobs[j - 1, 0] + np.exp(model.cutpoints[j])
probs1 = pd.DataFrame(index=np.arange(nsims),
columns=np.arange(model.ycat))
probs2 = pd.DataFrame(index=np.arange(nsims),
columns=np.arange(model.ycat))
name = model.coefs.index[model.ycat - 1 + len(model.inflate) + ordvar]
probs1 = probs1.add_suffix(": " + name.replace("Ordered: ", "") + " = 0")
probs2 = probs2.add_suffix(": " + name.replace("Ordered: ", "") + " = 1")
for i in range(nsims):
bsim = np.random.multivariate_normal(estimate, vcov)
bsim2 = bsim[model.ycat - 1 + len(model.inflate):
model.ycat - 1 + len(model.inflate) + len(model.ordered)]
xb1 = xsim1.dot(bsim2)
xb2 = xsim2.dot(bsim2)
probsordered1[model.ycat - 1] = 1 - norm.cdf(
cprobs[model.ycat - 2, 0] - xb1)
probsordered1[0] = norm.cdf(cprobs[0, 0] - xb1)
for j in range(1, model.ycat - 1):
probsordered1[j] = norm.cdf(cprobs[j, 0] - xb1) - (
norm.cdf(cprobs[j - 1, 0] - xb1)
)
probsordered2[model.ycat - 1] = 1 - norm.cdf(
cprobs[model.ycat - 2, 0] - xb2)
probsordered2[0] = norm.cdf(cprobs[0, 0] - xb2)
for j in range(1, model.ycat - 1):
probsordered2[j] = norm.cdf(cprobs[j, 0] - xb2) - (
norm.cdf(cprobs[j - 1, 0] - xb2)
)
probs1.iloc[i:, ] = probsordered1
probs2.iloc[i:, ] = probsordered2
changeprobs = pd.DataFrame(index=np.arange(nsims),
columns=np.arange(2 * model.ycat))
newnames = list(np.repeat("", model.ycat * 2))
for j in range(0, 2 * model.ycat, 2):
changeprobs.iloc[:, j] = probs1.iloc[:, round(j / 2)]
for j in range(1, 2 * model.ycat, 2):
changeprobs.iloc[:, j] = probs2.iloc[:, round((j - 1) / 2)]
for j in range(0, 2 * model.ycat, 2):
newnames[j] = list(probs1.columns)[round(j / 2)]
for j in range(1, 2 * model.ycat, 2):
newnames[j] = list(probs2.columns)[round((j - 1) / 2)]
changeprobs.columns = newnames
return changeprobs