Source code for gimnl

"""Classes and Functions for the gimnl module."""
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import norm


[docs]class GimnlModel:
    """Store model results from :py:func:`gimnlmod`."""

    def __init__(
            self,
            modeltype,
            reference,
            inflatecat,
            llik,
            coef,
            aic,
            vcov,
            data,
            xs,
            zs,
            x_,
            yx_,
            z_,
            ycatu,
            xstr,
            ystr,
            zstr,
    ):
        """Store model results, goodness-of-fit tests, and other information
        for the Generalized Inflated Multinomial Logit Model.

        :param modeltype: Type of GIMNL Model (bimnl3, simnl3, or timnl3),
            indicating the inflated category.
        :param reference: Order of categories. The order category will be
            the first element.
        :param llik: Log-Likelihood.
        :param coef: Model coefficients.
        :param aic: Model Akaike information .
        :param vcov: Variance-Covariance matrix.
            (optimized as inverted Hessian matrix)
        :param data: Model data used for estimation, subsetted to selected
            variables and missing values listwise deleted.
        :param zs: Inflation stage estimates (Gammas).
        :param xs: Multinomial Logit probit estimates (Betas).
        :param ycatu: Number of categories in the Dependent variable (DV).
        :param x_: X Data.
        :param yx_: Y (DV) data.
        :param z_: Z Data.
        :param xstr: List of string for variable names in the MNL stage.
        :param ystr: List of string for dependent variable name .
        :param zstr: List of string for variable names in the Logit Split
        stage.
        """
        self.modeltype = modeltype
        self.reference = reference
        self.inflatecat = inflatecat
        self.llik = llik
        self.coefs = coef
        self.AIC = aic
        self.vcov = vcov
        self.data = data
        self.split = zs
        self.multinom = xs
        self.ycat = ycatu
        self.X = x_
        self.Y = yx_
        self.Z = z_
        self.xstr = xstr
        self.ystr = ystr
        self.zstr = zstr


[docs]class MnlModel:
    """Store model results from :py:func:`gimnlmod`."""

    def __init__(
            self,
            modeltype,
            reference,
            llik,
            coef,
            aic,
            vcov,
            data,
            xs,
            x_,
            yx_,
            ycatu,
            xstr,
            ystr,
    ):
        """Store model results, goodness-of-fit tests, and other information
        for the Multinomial Logit Model.

        :param modeltype: Type of Model (mnl3).
        :param reference: Order of categories. The order category will be
            the first element.
        :param llik: Log-Likelihood.
        :param coef: Model coefficients.
        :param aic: Model Akaike information .
        :param vcov: Variance-Covariance matrix.
            (optimized as inverted Hessian matrix)
        :param data: Model data used for estimation, subsetted to selected
            variables and missing values listwise deleted.
        :param xs: Multinomial Logit estimates (Betas).
        :param ycatu: Number of categories in the Dependent variable (DV).
        :param x_: X Data.
        :param yx_: Y (DV) data.
        :param xstr: List of string for x names.
        :param ystr: List of string for y names.
        """
        self.modeltype = modeltype
        self.reference = reference
        self.llik = llik
        self.coefs = coef
        self.AIC = aic
        self.vcov = vcov
        self.data = data
        self.multinom = xs
        self.ycat = ycatu
        self.X = x_
        self.Y = yx_
        self.xstr = xstr
        self.ystr = ystr


[docs]def mnl3(pstart, x2, x3, y, reference):
    """
    Calculate likelihood function for the baseline inflated three-category MNL
    model.

    :param pstart: A list of starting parameters.
    :type pstart: list
    :param x2: Multinomial Logit variables. Data subsetted to selected
        variables.
    :type x2: Pandas dataframe
    :param x3: Multinomial Logit variables (should be identical to x2.
    :type x3: Pandas dataframe
    :param y: Dependent variable (DV). Data subsetted to selected
        variable.
    :type y: Pandas dataframe
    :param reference: List of order of categories.
    """
    b2 = pstart[0: len(x2.columns)]
    b3 = pstart[len(x2.columns): (len(pstart))]
    xb2 = x2.dot(b2)
    xb3 = x3.dot(b3)
    p1 = 1 / (1 + np.exp(xb2) + np.exp(xb3))
    p2 = p1 * np.exp(xb2)
    p3 = p1 * np.exp(xb3)
    lik = np.sum(
        np.log(p1) * (y == reference[0])
        + np.log(p2) * (y == reference[1])
        + np.log(p3) * (y == reference[2])
    )
    llik = -1 * np.sum(lik)
    return llik


[docs]def bimnl3(pstart, x2, x3, y, z, reference):
    """
    Calculate likelihood function for the baseline inflated three-category
    MNL model.

    :param pstart: A list of starting parameters.
    :type pstart: list
    :param x2: Multinomial Logit variables. Data subsetted to selected
        variables.
    :type x2: pandas dataframe
    :param x3: Multinomial Logit variables (should be identical to x2.
    :type x3: pandas dataframe
    :param y: Dependent variable (DV). Data subsetted to selected
        variable.
    :type y: pandas dataframe
    :param z: Logit Split stage variables. Data subsetted to selected
        variables.
    :type z: pandas dataframe
    :param reference: List of order of categories (first element will be the
        inflated category).
    """
    b2 = pstart[len(z.columns): (len(z.columns) + len(x2.columns))]
    b3 = pstart[(len(z.columns) + len(x2.columns)): (len(pstart))]
    gamma = pstart[0: (len(z.columns))]
    xb2 = x2.dot(b2)
    xb3 = x3.dot(b3)
    zg = z.dot(gamma)
    pz = 1 / (1 + np.exp(-zg))
    p1 = 1 / (1 + np.exp(xb2) + np.exp(xb3))
    p2 = p1 * np.exp(xb2)
    p3 = p1 * np.exp(xb3)
    lik = np.sum(
        np.log((1 - pz) + pz * p1) * (y == reference[0])
        + np.log(pz * p2) * (y == reference[1])
        + np.log(pz * p3) * (y == reference[2])
    )
    llik = -1 * np.sum(lik)
    return llik


[docs]def simnl3(pstart, x2, x3, y, z, reference):
    """
    Calculate likelihood function for the second category inflated
    three-category MNL model.

    :param pstart: A list of starting parameters.
    :type pstart: list
    :param x2: Multinomial Logit variables. Data subsetted to selected
        variables.
    :type x2: pandas dataframe
    :param x3: Multinomial Logit variables (should be identical to x2.
    :type x3: pandas dataframe
    :param y: Dependent variable (DV). Data subsetted to selected
        variable.
    :type y: pandas dataframe
    :param z: Logit Split stage variables. Data subsetted to selected
        variables.
    :type z: pandas dataframe
    :param reference: List of order of categories (second element will be the
        inflated category).
    """
    b2 = pstart[len(z.columns): (len(z.columns) + len(x2.columns))]
    b3 = pstart[(len(z.columns) + len(x2.columns)): (len(pstart))]
    gamma = pstart[0: (len(z.columns))]
    xb2 = x2.dot(b2)
    xb3 = x3.dot(b3)
    zg = z.dot(gamma)
    pz = 1 / (1 + np.exp(-zg))
    p1 = 1 / (1 + np.exp(xb2) + np.exp(xb3))
    p2 = p1 * np.exp(xb2)
    p3 = p1 * np.exp(xb3)
    lik = np.sum(
        np.log(pz * p1) * (y == reference[0])
        + np.log((1 - pz) + pz * p2) * (y == reference[1])
        + np.log(pz * p3) * (y == reference[2])
    )
    llik = -1 * np.sum(lik)
    return llik


[docs]def timnl3(pstart, x2, x3, y, z, reference):
    """
    Calculate likelihood function for the third category inflated
    three-category MNL model.

    :param pstart: A list of starting parameters.
    :type pstart: list
    :param x2: Covariates in the MNL. 
    :type x2: pandas dataframe
    :param x3: Multinomial Logit variables (should be identical to x2).
    :type x3: pandas dataframe
    :param y: Dependent variable (DV). 
    :type y: pandas dataframe
    :param z: Covariates in the Logit model (i.e. split stage). 
        variables.
    :type z: pandas dataframe
    :param reference: List of order of categories (third element will be the
        inflated category).
    """
    b2 = pstart[len(z.columns): (len(z.columns) + len(x2.columns))]
    b3 = pstart[(len(z.columns) + len(x2.columns)): (len(pstart))]
    gamma = pstart[0: (len(z.columns))]
    xb2 = x2.dot(b2)
    xb3 = x3.dot(b3)
    zg = z.dot(gamma)
    pz = 1 / (1 + np.exp(-zg))
    p1 = 1 / (1 + np.exp(xb2) + np.exp(xb3))
    p2 = p1 * np.exp(xb2)
    p3 = p1 * np.exp(xb3)
    lik = np.sum(
        np.log(pz * p1) * (y == reference[0])
        + np.log(pz * p2) * (y == reference[1])
        + np.log((1 - pz) + pz * p3) * (y == reference[2])
    )
    llik = -1 * np.sum(lik)
    return llik


[docs]def gimnlresults(model, data, x, y, z, modeltype, reference, inflatecat):
    """
    Produce estimation results, part of :py:func:`gimnlmod`.

    Store estimates, model AIC, and other information to
    :py:class:`GimnlModel`.

    :param model: Estimation results.
    :param data: Model data used for estimation, subsetted to selected
        variables and missing values listwise deleted.
    :type data: pandas.DataFrame
    :param x: List of names for covariates in Multinomial Logit stage.
    :type x: list of str
    :param y: List of names for dependent variable .
    :type y: list of str
    :param z: List of names for covariates in split-stage.
    :type z: list of str
    :param modeltype: Type of inflated MNL model. One of 'bimnl3', 'simnl3',
        or 'timnl3'.
    :param reference: List of order of categories. First element is the
        baseline/ reference category.
    :param inflatecat: Inflated category. One of 'baseline', 'second',
        or 'third'.
    """
    varlist = np.unique(y + z + x)
    dataset = data[varlist]
    datasetnew = dataset.dropna(how="any")
    datasetnew = datasetnew.reset_index(drop=True)
    x_ = datasetnew[x]
    y_ = datasetnew[y]
    yx_ = y_.iloc[:, 0]
    yncat = len(np.unique(yx_))
    z_ = datasetnew[z]
    z_.insert(0, "int", np.repeat(1, len(z_)))
    x_.insert(0, "int", np.repeat(1, len(x_)))
    names = list()
    if modeltype == "bimnl3":
        x2 = x_
        x3 = x_
        for s in range(z_.shape[1]):
            names.append("Inflation: " + z_.columns[s])
        for s in range(x2.shape[1]):
            names.append(str(reference[1]) + ": " + x2.columns[s])
        for s in range(x3.shape[1]):
            names.append(str(reference[2]) + ": " + x3.columns[s])
        xs = model.x[(z_.shape[1]): (z_.shape[1] + x2.shape[1] + x3.shape[1])]
    zs = model.x[0: (z_.shape[1])]
    ses = np.sqrt(np.diag(model.hess_inv))
    tscore = model.x / ses
    pval = (1 - (norm.cdf(abs(tscore)))) * 2
    lci = model.x - 1.96 * ses
    uci = model.x + 1.96 * ses
    coef = pd.DataFrame(
        {
            "Coef": model.x,
            "SE": ses,
            "tscore": tscore,
            "p": pval,
            "2.5%": lci,
            "97.5%": uci,
        },
        names,
    )
    aic = -2 * (-model.fun) + 2 * (len(coef))
    llik = -1 * model.fun
    model = GimnlModel(
        modeltype,
        reference,
        inflatecat,
        llik,
        coef,
        aic,
        model.hess_inv,
        datasetnew,
        xs,
        zs,
        x_,
        yx_,
        z_,
        yncat,
        x,
        y,
        z,
    )
    return model


[docs]def mnlresults(model, data, x, y, modeltype, reference):
    """
    Produce estimation results, part of :py:func:`mnlmod`.

    Store estimates, model AIC, and other information to :py:class:`MnlModel`.

    :param model: Estimation results.
    :param data: Model data used for estimation, subsetted to selected
        variables and missing values listwise deleted.
    :type data: pandas.DataFrame
    :param x: List of names for covariates in Multinomial Logit stage.
    :type x: list of str
    :param y: List of names for dependent variable .
    :type y: list of str
    :param modeltype: Three-category MNL model ('mnl3').
    :param reference: List of order of categories. First element is the
        baseline/ reference category.
    """
    varlist = np.unique(y + x)
    dataset = data[varlist]
    datasetnew = dataset.dropna(how="any")
    datasetnew = datasetnew.reset_index(drop=True)
    x_ = datasetnew[x]
    y_ = datasetnew[y]
    yx_ = y_.iloc[:, 0]
    yncat = len(np.unique(yx_))
    x_.insert(0, "int", np.repeat(1, len(x_)))
    names = list()
    if modeltype == "mnl3":
        x2 = x_
        x3 = x_
        for s in range(x2.shape[1]):
            names.append(str(reference[1]) + ": " + x2.columns[s])
        for s in range(x3.shape[1]):
            names.append(str(reference[2]) + ": " + x3.columns[s])
        xs = model.x[0: (x2.shape[1] + x3.shape[1])]
    ses = np.sqrt(np.diag(model.hess_inv))
    tscore = model.x / ses
    pval = (1 - (norm.cdf(abs(tscore)))) * 2
    lci = model.x - 1.96 * ses
    uci = model.x + 1.96 * ses
    coef = pd.DataFrame(
        {
            "Coef": model.x,
            "SE": ses,
            "tscore": tscore,
            "p": pval,
            "2.5%": lci,
            "97.5%": uci,
        },
        names,
    )
    aic = -2 * (-model.fun) + 2 * (len(coef))
    llik = -1 * model.fun
    model = MnlModel(
        modeltype,
        reference,
        llik,
        coef,
        aic,
        model.hess_inv,
        datasetnew,
        xs,
        x_,
        yx_,
        yncat,
        x,
        y,
    )
    return model


[docs]def gimnlmod(data, x, y, z, reference, inflatecat, method="BFGS", pstart=None):
    """
    Estimate three-category inflated Multinomial Logit model.

    :param data: Full dataset.
    :type data: pandas.dataframe
    :param x: Covariates in Multi Nomial Logit stage. The elements must match
        column names of ``data``.
    :type x: list of str
    :param y: Dependent Variable. Values should be integers,
        with a number from 0-2 representing each category. The element
        must match column names of ``data``.
    :type y: list of str
    :param z: Covariates in split-stage. The elements must match column
        names of ``data``.
    :type z: list of str
    :param reference:  List of three elements specifying the order of
        categories (e.g [0, 1, 2], [2, 1, 0]. etc...). The first element is
        the baseline/reference category. The parameter
        inflatecat then specifies which category in the list inflated.
    :type reference: list of int
    :param inflatecat: A string specifying the inflated category. One of
        "baseline" for the first, "second" for the second, or "third" for the
        third in reference list to specify the inflated category.
    :param method: Optimization method.  Default is 'BFGS'. For other
        available methods, see scipy.optimize.minimize documentation.
    :param pstart: A list of starting parameters.
    :type pstart: list
    """
    varlist = np.unique(y + z + x)
    dataset = data[varlist]
    datasetnew = dataset.dropna(how="any")
    datasetnew = datasetnew.reset_index(drop=True)
    x_ = datasetnew[x]
    y_ = datasetnew[y]
    yx_ = y_.iloc[:, 0]
    yncat = len(np.unique(yx_))
    if yncat == 3:
        modeltype = "bimnl3"
    else:
        raise Exception(
            "Function only supports Dependent Variable with 3 " "categories."
        )
    z_ = datasetnew[z]
    z_.insert(0, "int", np.repeat(1, len(z_)))
    x_.insert(0, "int", np.repeat(1, len(x_)))
    if modeltype == "bimnl3":
        x2 = x_
        x3 = x_
        if pstart is None:
            pstart = np.repeat(
                0.01, (len(x2.columns) + len(x3.columns) + len(z_.columns))
            )
        if inflatecat == "baseline":
            model = minimize(
                bimnl3,
                pstart,
                args=(x2, x3, yx_, z_, reference),
                method=method,
                options={"disp": True, "maxiter": 500},
            )
        elif inflatecat == "second":
            model = minimize(
                simnl3,
                pstart,
                args=(x2, x3, yx_, z_, reference),
                method=method,
                options={"disp": True, "maxiter": 500},
            )
        elif inflatecat == "third":
            model = minimize(
                timnl3,
                pstart,
                args=(x2, x3, yx_, z_, reference),
                method=method,
                options={"disp": True, "maxiter": 500},
            )
    results = gimnlresults(model, data, x, y, z, modeltype, reference,
                           inflatecat)
    return results


[docs]def mnlmod(data, x, y, reference, method="BFGS", pstart=None):
    """
    Estimate three-category Multinomial Logit model.

    :param data: Full dataset.
    :type data: pandas.dataframe
    :param x: Covariates in MNL stage. The elements must match column
        names of ``data``
    :type x: list of str
    :param y: Dependent variable. Values should be integers,
        with a number from 0-2 representing each category.
    :param reference:  List of three elements specifying the order of
        categories (e.g [0, 1, 2], [2, 1, 0]. etc...). The first element is
        the baseline/reference category.
    :type reference: list of int
    :param method: Optimization method.  Default is 'BFGS'. For other
        available methods, see scipy.optimize.minimize documentation.
    :param pstart: A list of starting parameters.
    :type pstart: list
    """
    varlist = np.unique(y + x)
    dataset = data[varlist]
    datasetnew = dataset.dropna(how="any")
    datasetnew = datasetnew.reset_index(drop=True)
    x_ = datasetnew[x]
    y_ = datasetnew[y]
    yx_ = y_.iloc[:, 0]
    yncat = len(np.unique(yx_))
    if yncat == 3:
        modeltype = "mnl3"
    else:
        raise Exception(
            "Function only supports Dependent Variable with 3 " "categories."
        )
    x_.insert(0, "int", np.repeat(1, len(x_)))
    if modeltype == "mnl3":
        x2 = x_
        x3 = x_
        if pstart is None:
            pstart = np.repeat(
                0.01, (len(x2.columns) + len(x3.columns))
            )
            model = minimize(
                mnl3,
                pstart,
                args=(x2, x3, yx_, reference),
                method=method,
                options={"disp": True, "maxiter": 500},
            )
    results = mnlresults(model, data, x, y, modeltype, reference)
    return results


[docs]def vuong_gimnl(modelmnl, modelgimnl):
    """
    Run the Vuong test to compare the performance of the MNL and GIMNL model.

    For the function to run properly, the models need to have the same X
    covariates and same number of observations.

    :param modelmnl: A :class:`MnlModel` object.
    :param modelgimnl: A :class:`GimnlModel` object.
    """
    xb2_mnl = modelmnl.X.dot(modelmnl.multinom[0: len(modelmnl.X.columns)])
    xb3_mnl = modelmnl.X.dot(modelmnl.multinom[len(modelmnl.X.columns):
                                               len(modelmnl.multinom)])
    xb2_gimnl = modelgimnl.X.dot(
        modelgimnl.multinom[0: len(modelgimnl.X.columns)])
    xb3_gimnl = modelgimnl.X.dot(modelgimnl.multinom[len(modelgimnl.X.columns):
                                                     len(modelgimnl.multinom)])
    zg_gimnl = modelgimnl.Z.dot(modelgimnl.split)
    mnldenom = 1 + np.exp(xb2_mnl) + np.exp(xb3_mnl)
    gimnldenom = 1 + np.exp(xb2_gimnl) + np.exp(xb3_gimnl)
    p1mnl = 1 / mnldenom
    p2mnl = np.exp(xb2_mnl) / mnldenom
    p3mnl = np.exp(xb3_mnl) / mnldenom
    if modelgimnl.inflatecat == "baseline":
        p1gimnl = ((1 / gimnldenom)
                   * (1 / (1 + np.exp(-zg_gimnl)))
                   + (1 - (1 / (1 + np.exp(-zg_gimnl)))))
        p2gimnl = (np.exp(xb2_gimnl) / gimnldenom) * (
                1 / (1 + np.exp(-zg_gimnl)))
        p3gimnl = (np.exp(xb3_gimnl) / gimnldenom) * (
                1 / (1 + np.exp(-zg_gimnl)))
    elif modelgimnl.inflatecat == "second":
        p1gimnl = ((1 / gimnldenom)
                   * (1 / (1 + np.exp(-zg_gimnl))))
        p2gimnl = ((np.exp(xb2_gimnl) / gimnldenom)
                   * (1 / (1 + np.exp(-zg_gimnl)))
                   + (1 - (1 / (1 + np.exp(-zg_gimnl)))))
        p3gimnl = ((np.exp(xb3_gimnl) / gimnldenom)
                   * (1 / (1 + np.exp(-zg_gimnl))))
    elif modelgimnl.inflatecat == "third":
        p1gimnl = ((1 / gimnldenom)
                   * (1 / (1 + np.exp(-zg_gimnl))))
        p2gimnl = ((np.exp(xb2_gimnl) / gimnldenom)
                   * (1 / (1 + np.exp(-zg_gimnl))))
        p3gimnl = ((np.exp(xb3_gimnl) / gimnldenom)
                   * (1 / (1 + np.exp(-zg_gimnl)))
                   + (1 - (1 / (1 + np.exp(-zg_gimnl)))))
    m = np.zeros(len(modelgimnl.X))
    reference = modelgimnl.reference
    y = modelgimnl.Y
    for i in range(len(m)):
        if y[i] == reference[0]:
            m[i] = np.log(p1mnl[i] / p1gimnl[i])
        elif y[i] == reference[0]:
            m[i] = np.log(p2mnl[i] / p2gimnl[i])
        elif y[i] == reference[2]:
            m[i] = np.log(p3mnl[i] / p3gimnl[i])
    diffmsq = (m - np.mean(m)) ** 2
    sumdms = sum(diffmsq)
    vuong = (np.sqrt(len(m)) * (1 / len(m)) * sum(m)) / (
        np.sqrt((1 / len(m)) * sumdms))
    return vuong