"""Classes and Functions for the gimnl module."""
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import norm
[docs]class GimnlModel:
"""Store model results from :py:func:`gimnlmod`."""
def __init__(
self,
modeltype,
reference,
inflatecat,
llik,
coef,
aic,
vcov,
data,
xs,
zs,
x_,
yx_,
z_,
ycatu,
xstr,
ystr,
zstr,
):
"""Store model results, goodness-of-fit tests, and other information
for the Generalized Inflated Multinomial Logit Model.
:param modeltype: Type of GIMNL Model (bimnl3, simnl3, or timnl3),
indicating the inflated category.
:param reference: Order of categories. The order category will be
the first element.
:param llik: Log-Likelihood.
:param coef: Model coefficients.
:param aic: Model Akaike information .
:param vcov: Variance-Covariance matrix.
(optimized as inverted Hessian matrix)
:param data: Model data used for estimation, subsetted to selected
variables and missing values listwise deleted.
:param zs: Inflation stage estimates (Gammas).
:param xs: Multinomial Logit probit estimates (Betas).
:param ycatu: Number of categories in the Dependent variable (DV).
:param x_: X Data.
:param yx_: Y (DV) data.
:param z_: Z Data.
:param xstr: List of string for variable names in the MNL stage.
:param ystr: List of string for dependent variable name .
:param zstr: List of string for variable names in the Logit Split
stage.
"""
self.modeltype = modeltype
self.reference = reference
self.inflatecat = inflatecat
self.llik = llik
self.coefs = coef
self.AIC = aic
self.vcov = vcov
self.data = data
self.split = zs
self.multinom = xs
self.ycat = ycatu
self.X = x_
self.Y = yx_
self.Z = z_
self.xstr = xstr
self.ystr = ystr
self.zstr = zstr
[docs]class MnlModel:
"""Store model results from :py:func:`gimnlmod`."""
def __init__(
self,
modeltype,
reference,
llik,
coef,
aic,
vcov,
data,
xs,
x_,
yx_,
ycatu,
xstr,
ystr,
):
"""Store model results, goodness-of-fit tests, and other information
for the Multinomial Logit Model.
:param modeltype: Type of Model (mnl3).
:param reference: Order of categories. The order category will be
the first element.
:param llik: Log-Likelihood.
:param coef: Model coefficients.
:param aic: Model Akaike information .
:param vcov: Variance-Covariance matrix.
(optimized as inverted Hessian matrix)
:param data: Model data used for estimation, subsetted to selected
variables and missing values listwise deleted.
:param xs: Multinomial Logit estimates (Betas).
:param ycatu: Number of categories in the Dependent variable (DV).
:param x_: X Data.
:param yx_: Y (DV) data.
:param xstr: List of string for x names.
:param ystr: List of string for y names.
"""
self.modeltype = modeltype
self.reference = reference
self.llik = llik
self.coefs = coef
self.AIC = aic
self.vcov = vcov
self.data = data
self.multinom = xs
self.ycat = ycatu
self.X = x_
self.Y = yx_
self.xstr = xstr
self.ystr = ystr
[docs]def mnl3(pstart, x2, x3, y, reference):
"""
Calculate likelihood function for the baseline inflated three-category MNL
model.
:param pstart: A list of starting parameters.
:type pstart: list
:param x2: Multinomial Logit variables. Data subsetted to selected
variables.
:type x2: Pandas dataframe
:param x3: Multinomial Logit variables (should be identical to x2.
:type x3: Pandas dataframe
:param y: Dependent variable (DV). Data subsetted to selected
variable.
:type y: Pandas dataframe
:param reference: List of order of categories.
"""
b2 = pstart[0: len(x2.columns)]
b3 = pstart[len(x2.columns): (len(pstart))]
xb2 = x2.dot(b2)
xb3 = x3.dot(b3)
p1 = 1 / (1 + np.exp(xb2) + np.exp(xb3))
p2 = p1 * np.exp(xb2)
p3 = p1 * np.exp(xb3)
lik = np.sum(
np.log(p1) * (y == reference[0])
+ np.log(p2) * (y == reference[1])
+ np.log(p3) * (y == reference[2])
)
llik = -1 * np.sum(lik)
return llik
[docs]def bimnl3(pstart, x2, x3, y, z, reference):
"""
Calculate likelihood function for the baseline inflated three-category
MNL model.
:param pstart: A list of starting parameters.
:type pstart: list
:param x2: Multinomial Logit variables. Data subsetted to selected
variables.
:type x2: pandas dataframe
:param x3: Multinomial Logit variables (should be identical to x2.
:type x3: pandas dataframe
:param y: Dependent variable (DV). Data subsetted to selected
variable.
:type y: pandas dataframe
:param z: Logit Split stage variables. Data subsetted to selected
variables.
:type z: pandas dataframe
:param reference: List of order of categories (first element will be the
inflated category).
"""
b2 = pstart[len(z.columns): (len(z.columns) + len(x2.columns))]
b3 = pstart[(len(z.columns) + len(x2.columns)): (len(pstart))]
gamma = pstart[0: (len(z.columns))]
xb2 = x2.dot(b2)
xb3 = x3.dot(b3)
zg = z.dot(gamma)
pz = 1 / (1 + np.exp(-zg))
p1 = 1 / (1 + np.exp(xb2) + np.exp(xb3))
p2 = p1 * np.exp(xb2)
p3 = p1 * np.exp(xb3)
lik = np.sum(
np.log((1 - pz) + pz * p1) * (y == reference[0])
+ np.log(pz * p2) * (y == reference[1])
+ np.log(pz * p3) * (y == reference[2])
)
llik = -1 * np.sum(lik)
return llik
[docs]def simnl3(pstart, x2, x3, y, z, reference):
"""
Calculate likelihood function for the second category inflated
three-category MNL model.
:param pstart: A list of starting parameters.
:type pstart: list
:param x2: Multinomial Logit variables. Data subsetted to selected
variables.
:type x2: pandas dataframe
:param x3: Multinomial Logit variables (should be identical to x2.
:type x3: pandas dataframe
:param y: Dependent variable (DV). Data subsetted to selected
variable.
:type y: pandas dataframe
:param z: Logit Split stage variables. Data subsetted to selected
variables.
:type z: pandas dataframe
:param reference: List of order of categories (second element will be the
inflated category).
"""
b2 = pstart[len(z.columns): (len(z.columns) + len(x2.columns))]
b3 = pstart[(len(z.columns) + len(x2.columns)): (len(pstart))]
gamma = pstart[0: (len(z.columns))]
xb2 = x2.dot(b2)
xb3 = x3.dot(b3)
zg = z.dot(gamma)
pz = 1 / (1 + np.exp(-zg))
p1 = 1 / (1 + np.exp(xb2) + np.exp(xb3))
p2 = p1 * np.exp(xb2)
p3 = p1 * np.exp(xb3)
lik = np.sum(
np.log(pz * p1) * (y == reference[0])
+ np.log((1 - pz) + pz * p2) * (y == reference[1])
+ np.log(pz * p3) * (y == reference[2])
)
llik = -1 * np.sum(lik)
return llik
[docs]def timnl3(pstart, x2, x3, y, z, reference):
"""
Calculate likelihood function for the third category inflated
three-category MNL model.
:param pstart: A list of starting parameters.
:type pstart: list
:param x2: Covariates in the MNL.
:type x2: pandas dataframe
:param x3: Multinomial Logit variables (should be identical to x2).
:type x3: pandas dataframe
:param y: Dependent variable (DV).
:type y: pandas dataframe
:param z: Covariates in the Logit model (i.e. split stage).
variables.
:type z: pandas dataframe
:param reference: List of order of categories (third element will be the
inflated category).
"""
b2 = pstart[len(z.columns): (len(z.columns) + len(x2.columns))]
b3 = pstart[(len(z.columns) + len(x2.columns)): (len(pstart))]
gamma = pstart[0: (len(z.columns))]
xb2 = x2.dot(b2)
xb3 = x3.dot(b3)
zg = z.dot(gamma)
pz = 1 / (1 + np.exp(-zg))
p1 = 1 / (1 + np.exp(xb2) + np.exp(xb3))
p2 = p1 * np.exp(xb2)
p3 = p1 * np.exp(xb3)
lik = np.sum(
np.log(pz * p1) * (y == reference[0])
+ np.log(pz * p2) * (y == reference[1])
+ np.log((1 - pz) + pz * p3) * (y == reference[2])
)
llik = -1 * np.sum(lik)
return llik
[docs]def gimnlresults(model, data, x, y, z, modeltype, reference, inflatecat):
"""
Produce estimation results, part of :py:func:`gimnlmod`.
Store estimates, model AIC, and other information to
:py:class:`GimnlModel`.
:param model: Estimation results.
:param data: Model data used for estimation, subsetted to selected
variables and missing values listwise deleted.
:type data: pandas.DataFrame
:param x: List of names for covariates in Multinomial Logit stage.
:type x: list of str
:param y: List of names for dependent variable .
:type y: list of str
:param z: List of names for covariates in split-stage.
:type z: list of str
:param modeltype: Type of inflated MNL model. One of 'bimnl3', 'simnl3',
or 'timnl3'.
:param reference: List of order of categories. First element is the
baseline/ reference category.
:param inflatecat: Inflated category. One of 'baseline', 'second',
or 'third'.
"""
varlist = np.unique(y + z + x)
dataset = data[varlist]
datasetnew = dataset.dropna(how="any")
datasetnew = datasetnew.reset_index(drop=True)
x_ = datasetnew[x]
y_ = datasetnew[y]
yx_ = y_.iloc[:, 0]
yncat = len(np.unique(yx_))
z_ = datasetnew[z]
z_.insert(0, "int", np.repeat(1, len(z_)))
x_.insert(0, "int", np.repeat(1, len(x_)))
names = list()
if modeltype == "bimnl3":
x2 = x_
x3 = x_
for s in range(z_.shape[1]):
names.append("Inflation: " + z_.columns[s])
for s in range(x2.shape[1]):
names.append(str(reference[1]) + ": " + x2.columns[s])
for s in range(x3.shape[1]):
names.append(str(reference[2]) + ": " + x3.columns[s])
xs = model.x[(z_.shape[1]): (z_.shape[1] + x2.shape[1] + x3.shape[1])]
zs = model.x[0: (z_.shape[1])]
ses = np.sqrt(np.diag(model.hess_inv))
tscore = model.x / ses
pval = (1 - (norm.cdf(abs(tscore)))) * 2
lci = model.x - 1.96 * ses
uci = model.x + 1.96 * ses
coef = pd.DataFrame(
{
"Coef": model.x,
"SE": ses,
"tscore": tscore,
"p": pval,
"2.5%": lci,
"97.5%": uci,
},
names,
)
aic = -2 * (-model.fun) + 2 * (len(coef))
llik = -1 * model.fun
model = GimnlModel(
modeltype,
reference,
inflatecat,
llik,
coef,
aic,
model.hess_inv,
datasetnew,
xs,
zs,
x_,
yx_,
z_,
yncat,
x,
y,
z,
)
return model
[docs]def mnlresults(model, data, x, y, modeltype, reference):
"""
Produce estimation results, part of :py:func:`mnlmod`.
Store estimates, model AIC, and other information to :py:class:`MnlModel`.
:param model: Estimation results.
:param data: Model data used for estimation, subsetted to selected
variables and missing values listwise deleted.
:type data: pandas.DataFrame
:param x: List of names for covariates in Multinomial Logit stage.
:type x: list of str
:param y: List of names for dependent variable .
:type y: list of str
:param modeltype: Three-category MNL model ('mnl3').
:param reference: List of order of categories. First element is the
baseline/ reference category.
"""
varlist = np.unique(y + x)
dataset = data[varlist]
datasetnew = dataset.dropna(how="any")
datasetnew = datasetnew.reset_index(drop=True)
x_ = datasetnew[x]
y_ = datasetnew[y]
yx_ = y_.iloc[:, 0]
yncat = len(np.unique(yx_))
x_.insert(0, "int", np.repeat(1, len(x_)))
names = list()
if modeltype == "mnl3":
x2 = x_
x3 = x_
for s in range(x2.shape[1]):
names.append(str(reference[1]) + ": " + x2.columns[s])
for s in range(x3.shape[1]):
names.append(str(reference[2]) + ": " + x3.columns[s])
xs = model.x[0: (x2.shape[1] + x3.shape[1])]
ses = np.sqrt(np.diag(model.hess_inv))
tscore = model.x / ses
pval = (1 - (norm.cdf(abs(tscore)))) * 2
lci = model.x - 1.96 * ses
uci = model.x + 1.96 * ses
coef = pd.DataFrame(
{
"Coef": model.x,
"SE": ses,
"tscore": tscore,
"p": pval,
"2.5%": lci,
"97.5%": uci,
},
names,
)
aic = -2 * (-model.fun) + 2 * (len(coef))
llik = -1 * model.fun
model = MnlModel(
modeltype,
reference,
llik,
coef,
aic,
model.hess_inv,
datasetnew,
xs,
x_,
yx_,
yncat,
x,
y,
)
return model
[docs]def gimnlmod(data, x, y, z, reference, inflatecat, method="BFGS", pstart=None):
"""
Estimate three-category inflated Multinomial Logit model.
:param data: Full dataset.
:type data: pandas.dataframe
:param x: Covariates in Multi Nomial Logit stage. The elements must match
column names of ``data``.
:type x: list of str
:param y: Dependent Variable. Values should be integers,
with a number from 0-2 representing each category. The element
must match column names of ``data``.
:type y: list of str
:param z: Covariates in split-stage. The elements must match column
names of ``data``.
:type z: list of str
:param reference: List of three elements specifying the order of
categories (e.g [0, 1, 2], [2, 1, 0]. etc...). The first element is
the baseline/reference category. The parameter
inflatecat then specifies which category in the list inflated.
:type reference: list of int
:param inflatecat: A string specifying the inflated category. One of
"baseline" for the first, "second" for the second, or "third" for the
third in reference list to specify the inflated category.
:param method: Optimization method. Default is 'BFGS'. For other
available methods, see scipy.optimize.minimize documentation.
:param pstart: A list of starting parameters.
:type pstart: list
"""
varlist = np.unique(y + z + x)
dataset = data[varlist]
datasetnew = dataset.dropna(how="any")
datasetnew = datasetnew.reset_index(drop=True)
x_ = datasetnew[x]
y_ = datasetnew[y]
yx_ = y_.iloc[:, 0]
yncat = len(np.unique(yx_))
if yncat == 3:
modeltype = "bimnl3"
else:
raise Exception(
"Function only supports Dependent Variable with 3 " "categories."
)
z_ = datasetnew[z]
z_.insert(0, "int", np.repeat(1, len(z_)))
x_.insert(0, "int", np.repeat(1, len(x_)))
if modeltype == "bimnl3":
x2 = x_
x3 = x_
if pstart is None:
pstart = np.repeat(
0.01, (len(x2.columns) + len(x3.columns) + len(z_.columns))
)
if inflatecat == "baseline":
model = minimize(
bimnl3,
pstart,
args=(x2, x3, yx_, z_, reference),
method=method,
options={"disp": True, "maxiter": 500},
)
elif inflatecat == "second":
model = minimize(
simnl3,
pstart,
args=(x2, x3, yx_, z_, reference),
method=method,
options={"disp": True, "maxiter": 500},
)
elif inflatecat == "third":
model = minimize(
timnl3,
pstart,
args=(x2, x3, yx_, z_, reference),
method=method,
options={"disp": True, "maxiter": 500},
)
results = gimnlresults(model, data, x, y, z, modeltype, reference,
inflatecat)
return results
[docs]def mnlmod(data, x, y, reference, method="BFGS", pstart=None):
"""
Estimate three-category Multinomial Logit model.
:param data: Full dataset.
:type data: pandas.dataframe
:param x: Covariates in MNL stage. The elements must match column
names of ``data``
:type x: list of str
:param y: Dependent variable. Values should be integers,
with a number from 0-2 representing each category.
:param reference: List of three elements specifying the order of
categories (e.g [0, 1, 2], [2, 1, 0]. etc...). The first element is
the baseline/reference category.
:type reference: list of int
:param method: Optimization method. Default is 'BFGS'. For other
available methods, see scipy.optimize.minimize documentation.
:param pstart: A list of starting parameters.
:type pstart: list
"""
varlist = np.unique(y + x)
dataset = data[varlist]
datasetnew = dataset.dropna(how="any")
datasetnew = datasetnew.reset_index(drop=True)
x_ = datasetnew[x]
y_ = datasetnew[y]
yx_ = y_.iloc[:, 0]
yncat = len(np.unique(yx_))
if yncat == 3:
modeltype = "mnl3"
else:
raise Exception(
"Function only supports Dependent Variable with 3 " "categories."
)
x_.insert(0, "int", np.repeat(1, len(x_)))
if modeltype == "mnl3":
x2 = x_
x3 = x_
if pstart is None:
pstart = np.repeat(
0.01, (len(x2.columns) + len(x3.columns))
)
model = minimize(
mnl3,
pstart,
args=(x2, x3, yx_, reference),
method=method,
options={"disp": True, "maxiter": 500},
)
results = mnlresults(model, data, x, y, modeltype, reference)
return results
[docs]def vuong_gimnl(modelmnl, modelgimnl):
"""
Run the Vuong test to compare the performance of the MNL and GIMNL model.
For the function to run properly, the models need to have the same X
covariates and same number of observations.
:param modelmnl: A :class:`MnlModel` object.
:param modelgimnl: A :class:`GimnlModel` object.
"""
xb2_mnl = modelmnl.X.dot(modelmnl.multinom[0: len(modelmnl.X.columns)])
xb3_mnl = modelmnl.X.dot(modelmnl.multinom[len(modelmnl.X.columns):
len(modelmnl.multinom)])
xb2_gimnl = modelgimnl.X.dot(
modelgimnl.multinom[0: len(modelgimnl.X.columns)])
xb3_gimnl = modelgimnl.X.dot(modelgimnl.multinom[len(modelgimnl.X.columns):
len(modelgimnl.multinom)])
zg_gimnl = modelgimnl.Z.dot(modelgimnl.split)
mnldenom = 1 + np.exp(xb2_mnl) + np.exp(xb3_mnl)
gimnldenom = 1 + np.exp(xb2_gimnl) + np.exp(xb3_gimnl)
p1mnl = 1 / mnldenom
p2mnl = np.exp(xb2_mnl) / mnldenom
p3mnl = np.exp(xb3_mnl) / mnldenom
if modelgimnl.inflatecat == "baseline":
p1gimnl = ((1 / gimnldenom)
* (1 / (1 + np.exp(-zg_gimnl)))
+ (1 - (1 / (1 + np.exp(-zg_gimnl)))))
p2gimnl = (np.exp(xb2_gimnl) / gimnldenom) * (
1 / (1 + np.exp(-zg_gimnl)))
p3gimnl = (np.exp(xb3_gimnl) / gimnldenom) * (
1 / (1 + np.exp(-zg_gimnl)))
elif modelgimnl.inflatecat == "second":
p1gimnl = ((1 / gimnldenom)
* (1 / (1 + np.exp(-zg_gimnl))))
p2gimnl = ((np.exp(xb2_gimnl) / gimnldenom)
* (1 / (1 + np.exp(-zg_gimnl)))
+ (1 - (1 / (1 + np.exp(-zg_gimnl)))))
p3gimnl = ((np.exp(xb3_gimnl) / gimnldenom)
* (1 / (1 + np.exp(-zg_gimnl))))
elif modelgimnl.inflatecat == "third":
p1gimnl = ((1 / gimnldenom)
* (1 / (1 + np.exp(-zg_gimnl))))
p2gimnl = ((np.exp(xb2_gimnl) / gimnldenom)
* (1 / (1 + np.exp(-zg_gimnl))))
p3gimnl = ((np.exp(xb3_gimnl) / gimnldenom)
* (1 / (1 + np.exp(-zg_gimnl)))
+ (1 - (1 / (1 + np.exp(-zg_gimnl)))))
m = np.zeros(len(modelgimnl.X))
reference = modelgimnl.reference
y = modelgimnl.Y
for i in range(len(m)):
if y[i] == reference[0]:
m[i] = np.log(p1mnl[i] / p1gimnl[i])
elif y[i] == reference[0]:
m[i] = np.log(p2mnl[i] / p2gimnl[i])
elif y[i] == reference[2]:
m[i] = np.log(p3mnl[i] / p3gimnl[i])
diffmsq = (m - np.mean(m)) ** 2
sumdms = sum(diffmsq)
vuong = (np.sqrt(len(m)) * (1 / len(m)) * sum(m)) / (
np.sqrt((1 / len(m)) * sumdms))
return vuong