#!/usr/bin/env python

import matplotlib
matplotlib.use('TkAgg')

import pandas as pd
import csv
import pprint
from scipy.stats.stats import pearsonr
import numpy

from rpy2.robjects.vectors import FloatVector
from rpy2.robjects.packages import importr

import matplotlib.pyplot as plt
import seaborn as sns

pp = pprint.PrettyPrinter()

df = pd.read_csv('gust-bardon-life-expectancy.csv')
print(df)


def annotate_colname(x, **kws):
  ax = plt.gca()
  ax.annotate(x.name, xy=(0.05, 0.9), xycoords=ax.transAxes,
              fontweight='bold')

def corrdot(*args, **kwargs):
    corr_r = args[0].corr(args[1], 'pearson')
    corr_text = f"{corr_r:2.2f}".replace("0.", ".")
    ax = plt.gca()
    ax.set_axis_off()
    marker_size = abs(corr_r) * 10000
    ax.scatter(.5, .5, marker_size, corr_r, alpha=0.6, cmap="coolwarm",
               vmin=-1, vmax=1, transform=ax.transAxes)
    font_size = abs(corr_r) * 40 + 5
    ax.annotate(corr_text, [.5, .5,],  xycoords="axes fraction",
                ha='center', va='center', fontsize=font_size)


sns.set(style='white', font_scale=1)
g = sns.PairGrid(df, aspect=0.9, diag_sharey=False)
g.map_lower(sns.regplot, order=3, ci=False, line_kws={'color': 'black'})
g.map_diag(sns.distplot, kde_kws={'color': 'black'})
g.map_diag(annotate_colname)
g.map_upper(corrdot)
for ax in g.axes.flatten():
    ax.set_ylabel('')
    ax.set_xlabel('')
g.savefig('output.png')
plt.show()

del df['Forest']
del df['ExportsGoodsServices']
df['GDP'] = numpy.log(df["GDP"])
df['HealthExpen'] = numpy.log(df["HealthExpen"])
df['Obesity'] = numpy.log(df["Obesity"])
df['SocialExpenditure'] = numpy.log(df["SocialExpenditure"])

def annotate_colname(x, **kws):
  ax = plt.gca()
  ax.annotate(x.name, xy=(0.05, 0.9), xycoords=ax.transAxes,
              fontweight='bold')

def corrdot(*args, **kwargs):
    corr_r = args[0].corr(args[1], 'pearson')
    corr_text = f"{corr_r:2.2f}".replace("0.", ".")
    ax = plt.gca()
    ax.set_axis_off()
    marker_size = abs(corr_r) * 10000
    ax.scatter(.5, .5, marker_size, corr_r, alpha=0.6, cmap="coolwarm",
               vmin=-1, vmax=1, transform=ax.transAxes)
    font_size = abs(corr_r) * 40 + 5
    ax.annotate(corr_text, [.5, .5,],  xycoords="axes fraction",
                ha='center', va='center', fontsize=font_size)


sns.set(style='white', font_scale=1)
g = sns.PairGrid(df, aspect=0.9, diag_sharey=False)
g.map_lower(sns.regplot, order=3, ci=False, line_kws={'color': 'black'})
g.map_diag(sns.distplot, kde_kws={'color': 'black'})
g.map_diag(annotate_colname)
g.map_upper(corrdot)
for ax in g.axes.flatten():
    ax.set_ylabel('')
    ax.set_xlabel('')
g.savefig('matrix2.png')
plt.show()


def variance_inflation_factors(exog_df):
    '''
    Parameters
    ----------
    exog_df : dataframe, (nobs, k_vars)
        design matrix with all explanatory variables, as for example used in
        regression.

    Returns
    -------
    vif : Series
        variance inflation factors
    '''
    exog_df = add_constant(exog_df)
    vifs = pd.Series(
        [1 / (1. - OLS(exog_df[col].values,
                       exog_df.loc[:, exog_df.columns != col].values).fit().rsquared)
         for col in exog_df],
        index=exog_df.columns,
        name='VIF'
    )
    return vifs
response = df['LifeExp']
del df['LifeExp']

print(variance_inflation_factors(df))


def stepwise_selection(X, y,
                       initial_list=[],
                       threshold_in=0.01,
                       threshold_out = 0.05,
                       verbose=True):
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.argmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.argmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

result = stepwise_selection(df, response)

print('resulting features:')
print(result)

import statsmodels.formula.api as sm

model = sm.ols(formula = "response ~ GDP + Sanitation + Obesity + SocialExpenditure", data=df).fit()
print(model.params)
print(model.summary())

residuals = model.resid
fited = 47.788446 + 2.019656*df['GDP']+0.168848*df['Sanitation'] -2.916734*df['Obesity']+1.595880*df['SocialExpenditure']

plt.scatter(fited, residuals)
plt.axhline(color="black", ls="--")
plt.xlabel('Fitted Value')
plt.ylabel('Residuals')
plt.title('Residuals Versus Fits')
plt.show()

import statsmodels.api as stm
stm.qqplot(residuals, line='r')
plt.title('Normal Q-Q plot')
plt.show()

import statsmodels.stats.diagnostic as smsdia

bp= smsdia.het_breuschpagan(residuals, model.model.exog)
name = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
print(pd.DataFrame(name, bp))

import statsmodels.formula.api as sm
from statsmodels.formula.api import wls
u2 = residuals**2
wls_model = sm.wls(formula = "response ~ GDP + Sanitation + Obesity + SocialExpenditure", data=df, weights = 1/u2).fit()
print(wls_model.params)
print(wls_model.summary())