import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
import chardet
%matplotlib inline
%config IPCompleter.greedy = False
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import matplotlib.patches as patches
from matplotlib import cm
def add_stars(params, pvals):
    #Format each coefficient to 3 decimals and append significance stars based on its p-value
    ret = []
    for param, pval in zip(params.values, pvals.values):
        if pval < 0.01:
            ret.append(f'{param:0.3f}***')
        elif pval < 0.05:
            ret.append(f'{param:0.3f}**')
        elif pval < 0.10:
            ret.append(f'{param:0.3f}*')
        else:
            ret.append(f'{param:0.3f}')
    return pd.Series(ret)
def save_results(model, name):
    #Collect starred coefficients and the model R² into a named Series for easy tabulation
    out = model.params
    out[:] = add_stars(model.params, model.pvalues)
    out['r2'] = model.rsquared
    out.name = name
    return out
Pandas (a portmanteau of Panel Data!) has robust support for multi-index datasets through the use of Excel-style pivot tables. We construct such a table here to further examine the dataset.
with open('/Users/jessegrabowski/Desktop/burnside2.csv') as data:
df = pd.read_csv(data)
#The table loads with a redundant index variable which can be dropped.
df.drop(columns=df.columns[0], inplace=True)
#Burnside and Dollar don't use the 1966-69 observations, so we drop those as well:
df = df[df['YEAR1'] != '1966-69']
#This is a list of the 56 countries used in the Burnside-Dollar study
Countries = ['DZA', 'ARG', 'BOL', 'BWA', 'BRA', 'CMR', 'CHL', 'COL', 'CRI', 'CIV', 'DOM', 'ECU', 'EGY', 'SLV','ETH',
'GAB', 'GMB', 'GHA', 'GTM', 'GUY', 'HTI', 'HND', 'IND', 'IDN', 'JAM', 'KEN', 'KOR', 'MDG', 'MWI', 'MYS',
'MLI', 'MEX', 'MAR', 'NIC', 'NER', 'NGA', 'PAK', 'PRY', 'PER', 'PHL', 'SEN', 'SLE', 'SOM', 'LKA', 'SYR',
'TZA', 'THA', 'TGO', 'TTO', 'TUN', 'TUR', 'URY', 'VEN', 'ZAR', 'ZMB', 'ZWE']
#We keep only countries in the above list. Drop unused index columns after.
df = df[df['Country3'].isin(Countries)]
df.drop(columns=['CountryYear', 'Country3', 'Ident'], inplace=True)
#Matplotlib will complain later if we keep the years as YYYY-YY, because it interprets this as a datetime format YYYY-MM
#and gets confused (there is no month 76). To solve this, we change the dates to YYYY-YYYY format.
years = df['YEAR1'].unique()
start = [x.split('-')[0] for x in years]
end = ['19'+x.split('-')[1] for x in years]
years = [i + '-' + j for i,j in zip(start, end)]
replace_map = dict(zip(df['YEAR1'].unique(), years))
df['YEAR1'] = df['YEAR1'].map(replace_map)
#Transform the dataframe into a pivot-table, appropriate for panel data investigation
df['Country'] = df['Country'].str.title()
df = df.set_index(['Country', 'YEAR1'])
#Rename columns for readability
df.rename(columns={'GDPG':'GDP Growth', 'BB':'Budget Surplus',
'INFL':'Inflation', 'SACW':'Trade Openness',
'ETHNF':'Ethnic Frac', 'ASSAS':'Assassinations',
'ICRGE':'Institutional Quality', 'M2-1':'L_Money Supply', 'SSA':'Sub-Saharan Africa',
'EASIA': 'East Asia', 'LPOP':'Log Population', 'EGYPT':'Egypt', 'CENTAM':'Central America',
'FRZ':'Franc Zone', 'ARMS-1':'L_Arms NX', 'EDA':'Effective Aid',
'BEDA':'Bilateral Effective Aid'}, inplace=True)
#Inflation has been cast as a string, here we reconvert to a float
df['Inflation'] = df['Inflation'].astype(float)
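As a quick illustration of the resulting panel structure (a sketch only; 'Algeria' and the '1970-1973' label are assumed to be present after the cleaning above), .xs slices a cross-section along either index level:
#Illustrative only: slice the MultiIndex panel along each level
df.xs('Algeria', level='Country').head()      #all periods for one country
df.xs('1970-1973', level='YEAR1').head()      #all countries for one period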
Before going forward, the number of observations per country is verified to match Burnside and Dollar (2000).
#Drop rows with NAs in the key regression columns, creating the 275-row dataframe used by Burnside and Dollar
df.dropna(axis=0, how='any', subset=['GDP Growth', 'GDP', 'Budget Surplus', 'Inflation', 'L_Money Supply'], inplace=True)
print(f'Rows in data set: {df.shape[0]}')
line = ''
print(('Country' + ' '*16 + 'N Obs' + ' '*8) * 3)
print('='*100)
for country, count in zip(df.groupby('Country').size().index, df.groupby('Country').size()):
    text = country + ' '*(25-len(country)) + str(count) + ' '*10
    if len(line) < 100:
        line += text
    else:
        print(line)
        line = text  #start a new row with the current entry instead of dropping it
print(line)  #flush the final partial row
Several variables used in Burnside-Dollar regressions are not included in the raw data, so these are constructed here.
#Create dummy variables for each time period (year 3 to year 7). Year 2 is dropped to avoid perfect collinearity.
df = pd.get_dummies(df, columns=['year'], drop_first=True)
#Several variables (Budget Surplus, Inflation, and Ethnic Fractionalization) have been divided by 100,
#so the same is done here:
df['Ethnic Frac'] = df['Ethnic Frac'] / 100
df['Inflation'] = df['Inflation'] / 100
df['Budget Surplus'] = df['Budget Surplus'] / 100
#Interaction term
df['Frac*Assass'] = df['Ethnic Frac'] * df['Assassinations']
#log GDP
df['log GDP'] = np.log(df['GDP'])
#Add a constant term (not done automatically by statsmodels.api)
df['Constant'] = 1
Burnside and Dollar use a policy variable as a key element in their investigation; we recreate it here before computing descriptive statistics. A first-stage regression excluding Effective Aid is conducted, and the coefficients on Budget Surplus, Inflation, and Trade Openness are recovered.
y = df['GDP Growth']
X = df[['Constant', 'log GDP', 'Ethnic Frac', 'Assassinations', 'Frac*Assass', 'Institutional Quality',
'L_Money Supply', 'Sub-Saharan Africa', 'East Asia', 'Budget Surplus', 'Inflation', 'Trade Openness',
'year_3', 'year_4', 'year_5', 'year_6', 'year_7']]
ols = sm.OLS(y, X, hasconst=True).fit(cov_type='HC0')
policy_β = ols.params[['Budget Surplus', 'Inflation', 'Trade Openness']]
policy_β
Burnside and Dollar then construct a constant term so that the policy index fitted from these coefficients is centered on average GDP Growth, such that:
$\overline{\text{GDP Growth}} - \bar{\hat{y}} = 0$
p_const = df['GDP Growth'].mean() - (df[['Budget Surplus', 'Inflation', 'Trade Openness']].values @ policy_β).mean()
p_const
For each observation, the policy variable is defined as:
$Policy = \alpha_p + \beta_{BS} \cdot \text{Budget Surplus} + \beta_{I} \cdot \text{Inflation} + \beta_{TO} \cdot \text{Trade Openness}$
df['Policy'] = p_const + df[['Budget Surplus', 'Inflation', 'Trade Openness']].values @ policy_β
df['Policy'].head()
We first present sample variance in the country dimension by variable of interest. Values for each country are first averaged across all time periods, then the variance is calculated for each variable using the resulting series:
$\bar{x}_{i} = \sum_{t=1}^{T} \frac{x_{i,t}}{T}$
$\mu = \sum_{i \in I} \frac{\bar{x}_{i}}{I}$
$V(x) = \frac{1}{I}\sum_{i \in I} (\bar{x}_{i} - \mu)^2$
Lagged Money Supply has by far the largest variance, followed by GDP growth, then effective aid.
X_vars = ['Budget Surplus', 'Inflation', 'Trade Openness', 'GDP Growth', 'log GDP', 'Frac*Assass',
'Ethnic Frac', 'Assassinations','Institutional Quality', 'L_Money Supply', 'Effective Aid', 'Policy']
df[X_vars].groupby('Country').agg(np.nanmean).apply(np.nanvar).sort_values(ascending=False)
To visualize these summary statistics, we look at the KDE plots of each variable. SCALES OF THE GRAPHS ARE NOT EQUAL, and thus they must be read with great care. Institutional Quality, for example, appears more variant than Lagged Money Supply, until one looks carefully at the x-axis labels. Taking the scale into account, we see that L_Money Supply is by far the highest-variance variable. Also interesting is Ethnic Fractionalization, which appears to be strongly bimodal. We should keep this in mind moving forward, because summary statistics will fail to capture it. Trade Openness also seems to have two clusters, while Inflation and Assassinations have extremely high kurtosis.
Note also that the policy index is much more normal than its components, to its credit. On the other hand, the interaction term between Assassinations and Ethnic Fractionalization seems to inherit the distribution of Assassinations itself.
fig, ax = plt.subplots(4,3, figsize=(12,8), dpi=100)
for var, axis in zip(X_vars, fig.axes):
g = sns.kdeplot(df[var].groupby('Country').agg(np.nanmean), ax=axis, legend=False, shade=True)
axis.hist(df[var].groupby('Country').agg(np.nanmean), density=True, facecolor='Tab:Blue', edgecolor=None,
alpha=0.3)
axis.hist(df[var].groupby('Country').agg(np.nanmean), density=True, facecolor='None', edgecolor='black',
alpha=0.6)
axis.set_title(var)
fig.tight_layout()
fig.suptitle('KDE Plots for Variables of Interest, Averaged by Country over Time', size=20, y=1.04)
plt.show()
To visualize inter-country variance, the average value over time, per country, is plotted below for each variable of interest. Individual time observations are also shown, as well as the overall sample average and standard deviations.
_df = df[X_vars].groupby('Country')
fig, axes = plt.subplots(4, 3, figsize=(8*3,6*3), dpi=100)
for var, axis in zip(X_vars, fig.axes):
means = []
for i, country in enumerate(df.index.get_level_values(0)):
y = _df.get_group(country)[var]
x = [country] * len(y)
means.append(np.mean(y))
if i == 0:
axis.scatter(x, y, c='steelblue', alpha=0.2, label='Per-Period Observation')
else:
axis.scatter(x, y, c='steelblue', alpha=0.2)
axis.tick_params(axis='x',
rotation=90,
labelsize=8)
axis.set_ylabel(var)
axis.set_title(f'{var} by Country', size=14)
axis.plot(df.index.get_level_values(0), means, color='red', lw=1, label='Per-Country Mean')
axis.axhline(np.mean(means), ls='-', color='black', label='Sample Mean')
axis.axhline(np.mean(means) + 1.96 * np.std(means), ls='--', color='black', label='Mean $\pm 1.96\hat{\sigma}$')
axis.axhline(np.mean(means) - 1.96 * np.std(means), ls='--', color='black')
for spine in axis.spines.values():
spine.set_visible(False)
fig.tight_layout()
axis.legend(bbox_to_anchor=(-.15,6.15), ncol=4)
fig.suptitle('Variation in the Individual Dimension, by Independent Variable', y=1.07, fontsize=32)
plt.show()
These plots illustrate the high variance shown in the summary statistics. Note the scale of the y-axis for Lagged Money Supply: the sample average is ~30, while national averages are as high as 63 (Egypt, see below).
By visual inspection, Budget Surplus, Inflation, and Assassinations seem to be quite invariant between countries, with the majority of observations falling within the margin of error around the overall sample mean. Variance in these variables seems driven by a small number of countries, and then by outlier time periods within those countries.
GDP Growth and Policy seem to have the most variance year-to-year, and are fairly uniformly distributed around the mean. This makes sense, given the distributions seen in the KDE plots above.
Trade Openness and Ethnic Fractionalization are highly clustered (this is also visible in the KDE plots -- both have bimodal distributions), with extremely low inter-temporal variation relative to inter-individual variation.
A more systematic investigation can count, for each country, how many variables' in-country means (the between transformation) lie outside of the margin of error formed by the collection of all in-country means.
While not systematic outlier detection, this table gives an idea of which countries are most dissimilar from the sample mean. Note that this measure is poor for the bimodal variables, Ethnic Fractionalization and Trade Openness, because the overall mean does not capture the correct shape of the distribution.
From the counts below, we see that Botswana, Korea, and Nicaragua have the most variables systematically different from the sample average.
means = df[X_vars].groupby('Country').agg(np.nanmean)
table = means[(means > (means.mean() + 1.96*means.std())) | (means < (means.mean()- 1.96*means.std()))].dropna(how='all')
table.T.count().sort_values(ascending=False)
Counting row-wise, we see which between-transformed variables have the most outliers from the overall mean. Six countries have a between-transformed Trade Openness and Policy value outside of the confidence interval, followed by Assassinations and Inflation at 4.
This methodology detects high-kurtosis variables, but not necessarily high-variance variables. Recall that L_Money Supply is by far the most variant variable, yet it has relatively few outlier values. Similarly, Institutional Quality and Ethnic Fractionalization have none.
table.count().sort_values(ascending=False)
The above exercise is repeated with time averages across all countries. We begin with the variance statistic for each variable, aggregated by time period. This creates a kind of "time between transformation", using the following methodology:
$\bar{x}_{t} = \sum_{i \in I} \frac{x_{i,t}}{I}$
$\mu = \sum_{t=1}^T \frac{\bar{x}_{t}}{T}$
$V(x) = \frac{1}{T}\sum_{t=1}^T (\bar{x}_{t} - \mu)^2$
df[X_vars].groupby('YEAR1').agg(np.nanmean).apply(np.nanvar).sort_values(ascending=False)
Again, Lagged Money Supply is the most variable, followed by GDP Growth and Effective Aid. Index variables are largely invariant over time. Much of this is visible in the blue scatterplot points of the cross-country average plots above. There is very little "jitter" in the points for Budget Surplus, Ethnic Fractionalization, Institutional Quality, Assassinations, Inflation, and Trade Openness, which is reflected in their extremely low variances here.
Again, pay extremely close attention to the scale of the x-axis; the graphs are not directly comparable. Trade Openness once again appears to be bimodal, but Ethnic Fractionalization is not. In general, variables seem much more normally distributed than when aggregated at the country level.
Interestingly, looking at the policy index, one period (1990-1993, visible in the line graphs below) seems systematically different from the other years. This appears to be driven entirely by a shift in Trade Openness in that period. This shift might very well be endogenous to Aid, because the late 80's saw the height of the "Washington Consensus", which tied development assistance to policy reform, notably with respect to trade openness. Given that this sample is made of aid-recipient countries, they are exactly those who would be most pressured into Washington Consensus reforms.
One limitation is data: there are only 6 time observations, so these density estimates are certainly not representative of the data-generating process. This is extremely clear when we compare with the overlaid histograms.
fig, ax = plt.subplots(4,3, figsize=(12,8), dpi=100)
for var, axis in zip(X_vars, fig.axes):
data = df[var].groupby('YEAR1').agg(np.nanmean)
g = sns.kdeplot(df[var].groupby('YEAR1').agg(np.nanmean), ax=axis, shade=True, legend=False)
axis.hist(df[var].groupby('YEAR1').agg(np.nanmean), density=True, facecolor='Tab:Blue', edgecolor=None,
alpha=0.3)
axis.hist(df[var].groupby('YEAR1').agg(np.nanmean), density=True, facecolor='None', edgecolor='black',
alpha=0.6)
axis.set_title(var)
fig.tight_layout()
fig.suptitle('KDE Plots for Variables of Interest, Averaged over Countries by Time Period', size=20, y=1.04)
plt.show()
Inter-temporal variance (red line) is EXTREMELY low when compared to inter-country variance (blue dots). We can observe some trends, especially in Trade Openness, Lagged Money Supply, and Effective Aid, but overall the variables are static and within the one-standard-deviation band.
_df = df.groupby('YEAR1')
fig, axes = plt.subplots(4, 3, figsize=(8*3,6*3), dpi=100)
years = df.index.get_level_values(1).unique()
for i, (var, axis) in enumerate(zip(X_vars, fig.axes)):
means = []
for year in years:
y = _df.get_group(year)[var]
x = [year] * len(y)
means.append(np.mean(y))
if i == 0:
axis.scatter(x, y, c='steelblue', alpha=0.2, label='Per-Country Observation')
else:
axis.scatter(x, y, c='steelblue', alpha=0.2)
axis.tick_params(axis='x',
rotation=30,
labelsize=8)
axis.set_ylabel(var)
axis.set_title(f'{var} by Time Period', size=14)
axis.plot(years, means, color='red', lw=2, label='Per-Year Mean')
axis.axhline(np.mean(means), ls='-', color='black', label='Sample Mean', alpha=0.7)
axis.axhline(np.mean(means) + np.std(means), ls='--', color='black', label='Mean $\pm \hat{\sigma}$', alpha=0.75)
axis.axhline(np.mean(means) - np.std(means), ls='--', color='black', alpha=0.7)
for spine in axis.spines.values():
spine.set_visible(False)
fig.tight_layout()
axis.legend(bbox_to_anchor=(-.4,3.58), ncol=4)
fig.suptitle('Variation in the Time Dimension, by Independent Variable', y=1.07, fontsize=32)
plt.show()
As before, we now present years with aggregated values outside the $\mu \pm \hat{\sigma}$ interval, along with counts by variable and by year, to see where the main sources of variation are.
As noted above, 1990-1993 is much different from the other years in the sample. In addition to the extreme change in Trade Openness in the 1990-1993 period, many variables show a time trend, including Effective Aid, Money Supply, and Inflation, which contributes to this heterogeneity between years.
means = df[X_vars].groupby('YEAR1').agg(np.nanmean)
table = means[(means > (means.mean() + means.std())) | (means < (means.mean()- means.std()))].dropna(how='all')
table.T.count().sort_values(ascending=False)
Looking row-wise, we see that Money Supply has the most 'outlier' years (if such a word can be applied to an n=6 sample). By visual inspection of the graph above, these results seem entirely driven by the time trend.
table.count().sort_values(ascending=False)
While it is already clear at this point that Lagged Money Supply, GDP Growth, and Effective Aid are the three variables with the most variation in both the individual and time dimensions, with variation in the individual dimension being far greater than that in the time dimension, we implement PCA here as an academic exercise in using the technique to determine where the greatest variance comes from.
We compute the eigenvalues of the covariance matrix of aggregated averages for both the individual and time dimensions, and graph the relative size of each variable's component in the first eigenvector to see how the subspace is being re-oriented.
The numpy np.linalg.eig function returns a vector of eigenvalues and a matrix of eigenvectors. The j-th column of the eigenvector matrix v corresponds to the j-th eigenvalue, so the eigenvector corresponding to the largest eigenvalue is the first principal component.
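As a quick sanity check of that claim (synthetic data only, not the Burnside-Dollar frame), the eigenvector paired with the largest eigenvalue of the covariance matrix should match, up to sign, the first right-singular vector of the centered data:
#Synthetic check: the largest-eigenvalue eigenvector of the covariance matrix is the first principal component
rng = np.random.default_rng(0)
Z = rng.normal(size=(200, 4)) @ np.diag([3.0, 1.0, 0.5, 0.1])   #4 features with unequal scales
Zc = Z - Z.mean(axis=0)                                         #center, as in the analysis below
w_chk, v_chk = np.linalg.eig(np.cov(Zc, rowvar=False))
pc1_eig = v_chk[:, w_chk.argsort()[-1]]                         #column paired with the largest eigenvalue
pc1_svd = np.linalg.svd(Zc, full_matrices=False)[2][0]          #first right-singular vector of the centered data
print(np.allclose(np.abs(pc1_eig), np.abs(pc1_svd)))            #same direction up to sign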
PCA_vars = list((set(X_vars) - set(policy_β.index)) - set(['Frac*Assass']))
_df = df[PCA_vars].groupby('Country').agg(np.nanmean)
_df = _df - _df.mean() #center the variables by removing the mean
ind_cov = _df.cov()
w, v = np.linalg.eig(ind_cov)
fig, ax = plt.subplots(1, 2, figsize=(8*2,4*2), dpi=100, sharey=True)
y = v[:, w.argsort()[-1]]
x = ind_cov.columns
vector_frame = pd.Series(y, index=x)
abs_sorted = vector_frame.apply(abs).sort_values(ascending=False).index
ax[0].bar(vector_frame[abs_sorted].index, vector_frame[abs_sorted])
ax[0].tick_params(axis='x',
rotation=70)
for spine in ax[0].spines.values():
spine.set_visible(False)
ax[0].axhline(0, ls='-', lw=1, alpha=0.5, color='black')
ax[0].set_title('Country-Aggregated Covariance Matrix')
_df = df[PCA_vars].groupby('YEAR1').agg(np.nanmean)
_df = _df - _df.mean()
ind_cov = _df.cov()
w, v = np.linalg.eig(ind_cov)
y = v[:, w.argsort()[-1]]
x = ind_cov.columns
vector_frame = pd.Series(y, index=x)
abs_sorted = vector_frame.apply(abs).sort_values(ascending=False).index
ax[1].bar(vector_frame[abs_sorted].index, vector_frame[abs_sorted])
ax[1].tick_params(axis='x',
rotation=70)
for spine in ax[1].spines.values():
spine.set_visible(False)
ax[1].axhline(0, ls='-', lw=1, alpha=0.5, color='black')
ax[1].set_title('Time-Aggregated Covariance Matrix')
fig.suptitle('First Principal Component Weights by Variable, Sorted by Absolute Contribution to Variance',
y = 1.05, fontsize=18)
fig.tight_layout()
plt.show()
Note first that the components of Policy, along with the interaction term between Fractionalization and Assassinations, have been dropped. This is because they would be collinear with their components, and would thus have eigenvector components very close to zero.
In both dimensions, the vast majority of variance comes from Lagged Money Supply, followed by GDP Growth and Effective Aid. The near-zero values of Institutional Quality and Ethnic Fractionalization in the time dimension are not surprising; we know these variables have extremely low time variance.
In all cases, there is more variance in the individual dimension; this is partly a function of sample size (56 countries vs only 6 time periods).
cmap = cm.get_cmap('tab10')
gradient = cmap(np.linspace(0,1,10))
_df = df[PCA_vars] - df[PCA_vars].mean()
ind_cov = _df.cov()
w, v = np.linalg.eig(ind_cov)
sorted_vectors = v[:, w.argsort()[::-1]]  #sort descending, so Component 1 corresponds to the largest eigenvalue
components = pd.DataFrame(sorted_vectors,
columns=[f'Component {i+1}' for i in range(len(w))],
index=PCA_vars)
fig, ax = plt.subplots(figsize=(12,9))
x = _df['Effective Aid']
y = _df['GDP Growth']
ax.scatter(x,y, color=gradient[0].squeeze().tolist(), alpha=0.5)
c1_s = components.loc['GDP Growth','Component 1'] / components.loc['Effective Aid', 'Component 1']
c2_s = components.loc['GDP Growth','Component 2'] / components.loc['Effective Aid', 'Component 2']
c_x = [min(x), max(x)]
c1_y = [xx * c1_s for xx in c_x]
c2_y = [xx * c2_s for xx in c_x]
ax.plot(c_x, c1_y, color=gradient[1].squeeze().tolist(), lw=3, label='Principal Component 1')
ax.plot(c_x, c2_y, color=gradient[2].squeeze().tolist(), lw=3, label='Principal Component 2')
ax.set_ylabel('GDP Growth')
ax.set_xlabel('Effective Aid')
ax.set_title('Visualization of 2 Primary Sources of Variance', fontweight='bold')
ax.tick_params(which='both',
axis='both',
left=False,
right=False,
top=False,
bottom=False)
for spine in ax.spines.values():
spine.set_visible(False)
ax.legend()
plt.show()
There are a total of 6 time periods and 56 countries. 3 countries have only a single time period observation (Cote D'Ivoire, Mali, and Turkey). More than half (34) have all 6 observations, while those without all 6 are split between different patterns.
_df = df.groupby('Country')
line = ''
print(f'Total Number of Countries: {len(_df.groups.keys())}')
print(f'Total Number of Time Periods: {len(df.groupby("YEAR1").groups.keys())}')
print('\n')
for i in range(6, 0, -1):
print(f'Countries with {i} time observations: {len(_df.size()[_df.size() == i])}')
print('\n')
print(('Country' + ' '*15 + 'N Times' + ' '*6) * 3)
print('='*100)
for country, count in zip(_df.size().index, _df.size()):
    text = country + ' '*(25-len(country)) + str(count) + ' '*10
    if len(line) < 100:
        line += text
    else:
        print(line)
        line = text  #start a new row with the current entry instead of dropping it
print(line)  #flush the final partial row
We first investigate the patterns of panel imbalance using a graphical representation.
all_years = set(df.index.get_level_values(1).unique())
space = 0.3
fig_height = len(_df.groups.keys()) * space
fig_length = len(all_years)
cmap = cm.get_cmap('Dark2')
gradient = cmap(np.linspace(0,1,len(_df.groups.keys())))
fig, ax = plt.subplots(figsize=(8*1.5,6*1.5), dpi=100)
ax.set_ylim(0, fig_height)
ax.set_xlim(-0.2, fig_length)
for i, country in enumerate(_df.groups.keys()):
present = list(set(_df.get_group(country).index.get_level_values(1)))
y1 = [1 if xx in present else 0 for xx in all_years]
for j, color in enumerate(y1):
x_cord = 0+j*(fig_length/len(all_years)) - 0.3
y_cord = fig_height-i*space - 0.3
if color == 1:
fc = gradient[i].squeeze().tolist()
else:
fc = 'white'
rect = patches.Rectangle(xy=(x_cord, y_cord),
width=fig_length/len(all_years),
height=0.25,
linewidth=2,
edgecolor='white',
facecolor=fc)
ax.add_patch(rect)
ax.set_yticks([fig_height - 0.2 - i * space for i in range(len(_df.groups.keys()))])
ax.set_yticklabels(_df.groups.keys(), fontsize=6)
ax.set_xticks(np.array([0,1,2,3,4,5]) + 0.2)
ax.set_xticklabels(df.index.get_level_values(1).unique().sort_values().to_list())
for spine in ax.spines.values():
spine.set_visible(False)
ax.tick_params(which='both',
axis='both',
left=False,
right=False,
top=False,
bottom=False)
ax.set_title('Panel Structure by Country by Year', fontsize=18, )
plt.show()
From this graphic we note that the most observations are missing in 1970-1973 and 1990-1993.
Except for Algeria, all countries with more than 1 observation have at least 2 connected observations, so it is possible to compute lags and first differences for all but the 3 countries with only a single observation. Several countries do have disconnected observations, however, including Botswana, Jamaica, Madagascar, and the Syrian Arab Republic (plus the already mentioned Algeria). Computing lags and first differences would mean losing 2 observations from these countries. A gap-aware lag is sketched below.
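A minimal sketch of such a gap-aware lag (an illustration only, not used in the analysis below; the helper gap_aware_lag and the _period column are hypothetical additions): map the period labels to consecutive integers and only accept a lag when the previous period is actually adjacent, so disconnected observations get NaN rather than a value from a non-adjacent period.
#Hypothetical helper: lag a column within each country, respecting gaps in the panel
def gap_aware_lag(group, col):
    shifted = group[col].shift(1)
    adjacent = group['_period'].diff().eq(1)   #True only when the previous row is the immediately preceding period
    return shifted.where(adjacent)

period_map = {p: i for i, p in enumerate(sorted(df.index.get_level_values('YEAR1').unique()))}
tmp = df.sort_index().copy()
tmp['_period'] = tmp.index.get_level_values('YEAR1').map(period_map)
lag_gdp_growth = (tmp.groupby(level='Country', group_keys=False)
                     .apply(gap_aware_lag, col='GDP Growth'))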
To get a better sense of the patterns in the unbalanced data, we re-create the above graph with only countries missing at least 1 time observation:
all_years = set(df.index.get_level_values(1).unique())
_df = df.groupby('Country')
unbalanced_countries = _df.size()[_df.size() < 6].index
space = 0.3
fig_height = len(unbalanced_countries) * space
fig_length = len(all_years)
cmap = cm.get_cmap('Dark2')
gradient = cmap(np.linspace(0,1,len(unbalanced_countries)))
fig, ax = plt.subplots(figsize=(8*1.5,6*1.5), dpi=100)
ax.set_ylim(0, fig_height)
ax.set_xlim(-0.2, fig_length)
for i, country in enumerate(unbalanced_countries):
present = list(set(_df.get_group(country).index.get_level_values(1)))
y1 = [1 if xx in present else 0 for xx in all_years]
for j, color in enumerate(y1):
x_cord = 0+j*(fig_length/len(all_years)) - 0.3
y_cord = fig_height-i*space - 0.3
if color == 1:
fc = gradient[i].squeeze().tolist()
else:
fc = 'white'
rect = patches.Rectangle(xy=(x_cord, y_cord),
width=fig_length/len(all_years),
height=0.25,
linewidth=2,
edgecolor='white',
facecolor=fc)
ax.add_patch(rect)
ax.set_yticks([fig_height - 0.2 - i * space for i in range(len(unbalanced_countries))])
ax.set_yticklabels(unbalanced_countries, fontsize=12)
ax.set_xticks(np.array([0,1,2,3,4,5]) + 0.2)
ax.set_xticklabels(df.index.get_level_values(1).unique().sort_values().to_list())
for spine in ax.spines.values():
spine.set_visible(False)
ax.tick_params(which='both',
axis='both',
left=False,
right=False,
top=False,
bottom=False)
ax.set_title('Panel Structure by Country by Year, \n Unbalanced Panels Only', fontsize=18, )
plt.show()
We do not notice any "second best" pattern in the unbalanced panels: 1990-1993 is missing the most observations (15), followed by 1978-1981 (14); each remaining year is missing between 6 and 9. Even if Cote D'Ivoire, Mali, and Turkey are excluded, there is no obvious choice of pattern that allows for a maximum balanced panel -- all choices lead to many countries being dropped.
Some definitions:
Overall deviation = $x_{i,t} - \bar{x}$
Between deviation = $\bar{x_i} - \bar{x}$
Within deviation = $x_{i,t} - \bar{x_i}$
Note, however, that centering the overall and between transformations has no effect on the variance.
overall = df[X_vars] - df[X_vars].mean()
between = df[X_vars].groupby('Country').agg(np.nanmean)
#The transform method is handy: it takes the subgroups made by groupby, applies a function, and casts the result back
#to the shape of the original dataframe. In this case, it lets us compute each country's mean and subtract it from
#that country's time-dimension values.
within = df[X_vars].groupby('Country').transform(lambda x: x - x.mean())
table = pd.concat([overall.var(), between.var(), within.var()], axis=1)
table.columns = ['Pooled', 'Between', 'Within']
table.loc[table.apply(sum, axis=1).sort_values(ascending=False).index]
The above table has been sorted by the sum of all 3 variances.
We note the within-transformed values are significantly lower than the pooled or between values; this is consistent with the lower inter-temporal variance we saw in the plots above. Specifically, Ethnic Fractionalization and Institutional Quality are essentially invariant over time, within individuals. The results of the PCA seem to hold for all transformed variables, with Money Supply supplying the most variance, followed by GDP Growth and then Effective Aid.
Interestingly, GDP Growth alone has more variance within individuals than between individuals. This suggests idiosyncratic country factors have more influence on growth outcomes than shared macroeconomic trends.
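As a quick check of this decomposition (a sketch only: population variances with ddof=0 and country means weighted by each country's number of observations; the table above uses ddof=1 and unweighted country means, so it matches only approximately), the pooled variance should equal the within variance plus the variance of the country means broadcast back to the observations.
#Sanity check: pooled variance = within variance + observation-weighted between variance (ddof=0)
broadcast_between = df[X_vars].groupby('Country').transform('mean').where(df[X_vars].notna())
decomposition = pd.concat([overall.var(ddof=0),
                           within.var(ddof=0) + broadcast_between.var(ddof=0)],
                          axis=1, keys=['Pooled', 'Within + Between (weighted)'])
decomposition.round(6)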
I know the question asked for everything on one plot, but it was really too much (2 KDEs, 2 normals, 2 bars....) so I split it into 4 with shared x and y axes for easy comparison.
import itertools
cmap = cm.get_cmap('Dark2')
gradient = cmap(np.linspace(0,1,4))
fig, ax = plt.subplots(2, 2, figsize=(12,6), dpi=100, sharex=True, sharey=True)
vars_to_plot = ['GDP Growth', 'Effective Aid']
estimators = [('Within', within), ('Between', between)]
combos = itertools.product(vars_to_plot, estimators)
for (v, (name, estimator)), axis, c in zip(combos, fig.axes, gradient):
kde = stats.gaussian_kde(estimator[v])
xx = np.linspace(min(estimator[v]), max(estimator[v]), 1000)
mean = estimator[v].mean()
std = estimator[v].std()
skew = stats.skew(estimator[v])
kur = stats.kurtosis(estimator[v], fisher=False)
pdf_x = np.linspace(mean - 3*std, mean + 3*std, 1000)
pdf = stats.norm.pdf(pdf_x, mean, std)
c = c.squeeze().tolist()
face = axis.hist(estimator[v], bins=30, density=True, edgecolor=None, color=c, alpha=0.3, label='Histogram')
line = axis.hist(estimator[v], bins=30, density=True, edgecolor='black', facecolor='None', alpha=0.6)
axis.plot(xx, kde(xx), color=c, lw=3, alpha=1, label='Kernel Density Estimation')
axis.plot(pdf_x, pdf, color='black', lw=3, label='Normal Distribution')
axis.set_title(v + ' - ' + name)
axis.text(-10, 1, f'μ = {mean:0.2f}')
axis.text(-10, 0.9, f'σ = {std:0.2f}')
axis.text(-10, 0.8, f'Skew = {skew:0.2f}')
axis.text(-10, 0.7, f'Kurtosis = {kur:0.2f}')
fig.tight_layout()
ax[0][0].set_ylabel('Probability Density')
ax[1][0].set_ylabel('Probability Density')
ax[0][0].legend()
plt.show()
The most obvious feature of these plots is the normality of GDP Growth, in both the within- and between-transformed cases. The within-transformed GDP Growth is actually slightly platykurtic (kurtosis = 2.00, versus 3 for a normal distribution), but is otherwise well distributed around the mean. Effective Aid is much less normal: the between-transformed Effective Aid looks like a gamma distribution, while the within-transformed variable has extremely high kurtosis.
None of the variables appear to be multi-modal, but we note that KDE estimation and histograms are sensitive to bandwidth and binning, respectively, so while unlikely, this cannot be definitively inferred from these plots alone.
We begin by showing observations beyond $\mu \pm i\sigma$, with $i \in \{1,2,3\}$ for all 4 distributions graphed above. The GDP growth distributions are quite wide, so we expect to find these outliers.
We notice that there are no outliers in the between transformed variables beyond $1 \cdot \sigma$, consistent with the low kurtosis statistics computed above. It is also consistent with the graphs of variation along the time dimension shown in Question 1: we found this dimension to be much less variant than the individual dimension.
On the other hand, we observe very high kurtosis for the within-transformed variables, with each having observations up to $\mu \pm 3 \cdot \sigma$. Values for these extreme outliers are shown below.
import re
print('*'*100)
print('Number of σ from the mean for selected variables, for all observations more than 1.96σ from the mean')
print('*'*100)
combos = itertools.product(vars_to_plot, estimators)
for v, (name, estimator) in combos:
μ = estimator[v].mean()
σ = estimator[v].std()
print(v + ' - ' + name)
print('='*110)
if name == 'Within':
header = ('Country' + ' '*(15) + 'Year' + ' '*13 + '(x - μ) / σ' + ' '*7) * 2
else:
header = ('Country' + ' '*(13) + '(x - μ) / σ' + ' '*7) * 3
print(header)
print('='*110)
line = ''
temp = (estimator[v].apply(abs) - abs(μ)) / σ
temp = temp.sort_values(ascending=False)
for s in temp[temp > 1.96].index:
sign = estimator[v][s] / abs(estimator[v][s])
if isinstance(s, tuple):
to_print = s[0].strip() + ' '*(22-len(s[0])) + s[1]
to_print += ' ' * (22 - len(to_print))
if sign < 0:
to_print += ' ' * 9
else: to_print += ' ' * 10
to_print += f'{temp[s] * sign:0.4f}' + ' '*10
elif isinstance(s, str):
to_print = s + ' ' * (22 - len(s))
if sign > 0:
to_print += ' '
to_print += f'{temp[s] * sign:0.4f}' + ' '*10
if len(line + to_print) < 120:
line = line + to_print
else:
print(line)
line = to_print
print(line)
print('\n')
For Within Transformed GDP Growth, Gabon 74, Gabon 78, Cameroon 78, and Cameroon 90 are the most extreme outliers, more than 3 sigma away from the mean. Looking at Within Transformed Effective Aid, Guyana 90 is by far the most extreme outlier, 7.4 sigma from the mean. Guyana 90 is largely the cause of the extremely high kurtosis in this distribution, though Gambia 86 and 70, plus Nicaragua 90 and Zambia 90 are all over 4 sigma from the mean.
Turning to the Between Transformed variables, Botswana and Korea are more than 2 sigma from the mean of the GDP Growth distribution, while Mali, The Gambia, and Tanzania are more than 2 sigma from the mean of the Effective Aid distribution.
To make a single box plot, Seaborn expects the data to be in long form, so we need to do a bit of reshaping:
temp1 = within['GDP Growth'].to_frame()
temp1['Transform'] = 'Within'
temp2 = within['Effective Aid'].to_frame()
temp2['Transform'] = 'Within'
temp3 = between['GDP Growth'].to_frame()
temp3['Transform'] = 'Between'
temp4 = between['Effective Aid'].to_frame()
temp4['Transform'] = 'Between'
long_frame = None
for frame in [temp1, temp2, temp3, temp4]:
frame.columns = [x + ' ' + frame['Transform'][0] for x in frame.columns]
if long_frame is None:
long_frame = frame
else:
long_frame = pd.merge(long_frame, frame, how='outer', left_index=True, right_index=True)
long_frame = long_frame[long_frame.columns[::2]].reset_index().melt(id_vars=['Country', 'YEAR1'])
fig, ax = plt.subplots(figsize=(12,8), dpi=100)
g = sns.boxplot(x='variable', y='value', data=long_frame, ax=ax)
g.set_xlabel('')
g.tick_params(axis='x',
rotation=30)
g.tick_params(which='both',
axis='both',
left=False,
right=False,
top=False,
bottom=False)
combos = [('GDP Growth', within), ('Effective Aid', within),
('GDP Growth', between), ('Effective Aid', between)]
for i, (v, estimator) in enumerate(combos):
if estimator is within:
μ = estimator[v].mean()
σ = estimator[v].std()
outliers = estimator[(estimator[v] < μ - 3*σ) | (estimator[v] > μ + 3*σ)][v]
countries = outliers.index
else:
Q3 = estimator[v].quantile(0.75)
Q1 = estimator[v].quantile(0.25)
iqr = Q3 - Q1
outliers = estimator[(estimator[v] < (Q1 - iqr * 1.4)) | (estimator[v] > (Q3 + iqr * 1.4))][v]
countries = outliers.index
for outlier, country in zip(outliers, countries):
if isinstance(country, tuple):
text = re.findall("(?<=\')(\w+\W? ?\w*\W? ?\w*)(?=\')",str(country))
s = text[0].replace('\'', '').replace(',', '').strip() + ', ' + text[1]
else:
s = country
ax.annotate(s=s, xy=(0.05+i, outlier-0.1), fontsize=6)
for spine in ax.spines.values():
spine.set_visible(False)
ax.set_title('Box and Whiskers Plot for Within- and Between-Transformed Variables', fontsize=18)
plt.show()
The use of the 1.5 interquartile range rule in the box-and-whiskers plot gives a better view of the less extreme outliers in the between-transformed variables, which the $\sigma$-based method didn't pick up (for roughly normal data the $1.5 \cdot IQR$ fence sits at about $2.7\sigma$, so we just missed them).
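As a rough check on that factor, assuming an approximately normal distribution: $Q_3 \approx 0.674\,\sigma$ and $IQR = Q_3 - Q_1 \approx 1.349\,\sigma$, so the upper whisker sits at
$Q_3 + 1.5 \cdot IQR \approx 0.674\sigma + 2.023\sigma \approx 2.70\sigma$
above the mean (and symmetrically below it for the lower whisker).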
The extreme outliers in the within-transformed variables are still visible. The high kurtosis of within-transformed Effective Aid is especially visible; note the large number of observations outside of the IQR * 1.5 bands, and the narrowness of the bands themselves.
Equally visible is the skew of between-transformed Effective Aid. It is interesting that the between transformation causes the distribution to shift so dramatically, but this makes sense. Countries which receive less aid also have less inter-temporal variation in aid receipts, while countries that receive more aid, and thus have more sources of aid, have more potential volatility in aid.
combos = itertools.product(vars_to_plot, estimators)
for i, (v, (name, estimator)) in enumerate(combos):
s = estimator[v].describe()
s.name = s.name + ' ' + name
s = s[1:]
s['median'] = estimator[v].median()
s = np.round(s, 5)
if i == 0:
temp_frame = s.to_frame()
else:
temp_frame = temp_frame.merge(s, left_index=True, right_index=True)
temp_frame.loc[['mean', 'median', 'std', 'min', 'max', '25%', '75%'], :]
We confirm our findings from question 5 with respect to the means and standard deviations. Note that the median for GDP Growth is nearly four times larger than the mean; this is due to the high kurtosis of this distribution (presence of outliers). Effective Aid Between also has a large discrepancy between the mean and the median. Note that the median is negative, while the mean is positive! This is because there is a large cluster of negative values, but many positive outliers.
Effective Aid Within is an interesting case, because we don't see the extremely high kurtosis (and thus large number of outliers) reflected in the median. This is because there are roughly equal numbers of positive and negative outliers, "offsetting" their influence on the median. This is an interesting case study in the limits of using the mean/median difference for outlier detection!
gdp_order = within.groupby('Country').agg(np.std)['GDP Growth'].sort_values(ascending=False).index
aid_order = within.groupby('Country').agg(np.std)['Effective Aid'].sort_values(ascending=False).index
fig, ax = plt.subplots(2, 1, figsize=(12,8), dpi=100, sharex = False)
group_long = long_frame.groupby('variable')
to_plot_outliers = ['GDP Growth', 'Effective Aid']
for axis, v, sort in zip(fig.axes, ['GDP Growth Within', 'Effective Aid Within'], [gdp_order, aid_order]):
g = sns.boxplot(x='Country', y='value',
data=group_long.get_group(v).set_index('Country').loc[sort, :].reset_index(),
ax=axis, palette='tab20')
g.set_xlabel('')
g.tick_params(axis='x',
rotation=90)
g.tick_params(which='both',
axis='both',
left=False,
right=False,
top=False,
bottom=False)
combos = [('GDP Growth', within), ('Effective Aid', within),
('GDP Growth', between), ('Effective Aid', between)]
for spine in axis.spines.values():
spine.set_visible(False)
axis.set_title(f'{v}', fontsize=12)
fig.suptitle('Box Plots of Selected Variables by Country, Sorted by Variance', y=1.05, fontsize=16)
fig.tight_layout()
plt.show()
The means of GDP Growth by country are all very close to 0, but standard errors are quite heterogeneous. In general, there is more within-country variance in GDP Growth than in Effective Aid. The second graph shows that the majority of countries have nearly constant Effective Aid.
Looking at GDP Growth, despite having a narrower IQR than Cameroon or Gabon, Ethiopia has higher variance. This is because of a quirk in Ethiopia's data: it has just 2 observations, and they are quite far apart, creating a "high variance" distribution. This is shown in the KDE plots below.
Important outliers are also readily visible. We note especially when outliers are "one-sided", pulling the country's median up or down, as in the case of (for example) Ecuador, Uruguay, and El Salvador.
In the Effective Aid graph, the outliers (Gambia, Malawi, Nicaragua, Syria, and Zambia) are more readily visible, owing to the overall lower variance. The countries with large IQRs appear to be those with large standard deviations. Guyana stands out as an extreme outlier in this plot, both because it has a negative mean beyond the IQR of most other countries, and because of the extreme positive outlier (1990-1993). This single outlier makes it the highest-variance country, despite an otherwise narrow IQR.
fig, ax = plt.subplots(1,3, figsize=(18,6), dpi=100, sharex=True, sharey=True)
for country, axis in zip(['Ethiopia', 'Cameroon', 'Gabon'], fig.axes):
data = within.loc[country, 'GDP Growth']
μ = data.mean()
σ = data.std()
skew = stats.skew(data)
kurtosis = stats.kurtosis(data, fisher=False)
n = data.shape[0]
sns.kdeplot(data, ax = axis, legend=False, cbar=True)
axis.hist(data, facecolor='None',edgecolor='black', lw=2, density=True, alpha=0.6)
axis.hist(data, facecolor='Tab:Blue', density=True, alpha=0.3)
left_corner_x = axis.get_xlim()[0]
left_corner_y = axis.get_ylim()[1]
u_y = (axis.get_ylim()[1] - axis.get_ylim()[0]) / 100
u_x = (axis.get_xlim()[1] - axis.get_xlim()[0]) / 100
axis.set_title(country)
for i, (annotation, name) in enumerate(zip([n, μ, σ, skew, kurtosis], ['n', 'μ', 'σ', 'Skew', 'Kurtosis'])):
axis.text(x=0.05,
y=.95 - 0.05*i,
s = f'{name} = {annotation:0.3f}',
fontsize=13,
transform=axis.transAxes)
fig.suptitle('Within Transformed GDP Growth for 3 Highest Variance Countries', y=1.05, fontsize=16)
fig.tight_layout()
plt.show()
Before plotting scatter matrices, a time trend is created. Note that the between-transformed data is time invariant, so there is no time trend in that data.
#Create a time trend
year_map = {'1970-1973':1, '1974-1977':2, '1978-1981':3, '1982-1985':4, '1986-1989':5,'1990-1993':6}
df['Year'] = list(map(lambda x: year_map[x], df.index.get_level_values(1)))
within = pd.merge(within, df['Year'], how='inner', left_index=True, right_index=True)
overall = pd.merge(overall, df['Year'], how='inner', left_index=True, right_index=True)
We start old school with some simple printouts of correlation coefficients and p-values. We do not consider the time dimension, and each country/time pair is treated as a different "individual" in a cross-sectional sample (notwithstanding the corrective transformation).
Note that the p-values aren't reliable for many of the variables, because they are not normally distributed. The simple Pearson's correlation test cannot be corrected for non-normality as far as I know, so I report the statistics anyway.
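One possible robustness check, not used in the tables below (a sketch only): Spearman's rank correlation makes no normality assumption, so comparing it with Pearson's r for a skewed pair gives a sense of how sensitive the p-values are.
#Illustrative comparison for one within-transformed pair; scipy's spearmanr is rank-based
r_p, p_p = stats.pearsonr(within['Effective Aid'], within['GDP Growth'])
r_s, p_s = stats.spearmanr(within['Effective Aid'], within['GDP Growth'])
print(f'Pearson r = {r_p:0.3f} (p = {p_p:0.3f}); Spearman rho = {r_s:0.3f} (p = {p_s:0.3f})')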
X_vars_plus =['Year'] + X_vars
n = len(X_vars_plus)
p_values = np.zeros((n,n))
output = np.zeros((n,n))
for i,var1 in enumerate(X_vars_plus):
for j,var2 in enumerate(X_vars_plus):
output[i][j] = stats.pearsonr(within[var1], within[var2])[0]
p_values[i][j] = stats.pearsonr(within[var1], within[var2])[1]
output = pd.DataFrame(output, columns = X_vars_plus, index=X_vars_plus)
p_values = pd.DataFrame(p_values, columns = X_vars_plus, index=X_vars_plus)
new_out = pd.DataFrame()
for col in output.columns:
temp = add_stars(output[col], p_values[col])
temp.name = col
new_out = pd.concat([new_out, temp], axis=1)
new_out.index = [x.replace(' ', '')[:6] for x in output.index]
new_out.columns = [x.replace(' ','')[:6] for x in new_out.columns]
print('Simple Correlation Matrix for Within Transformed Variables')
print('='*110)
print(new_out)
print('\n')
print('*** - p < 0.01, ** - p < 0.05, * - p < 0.10')
print('Note: Pearson\'s correlation coefficient assumes normally distributed variables')
The expected macroeconomic variables are correlated with GDP Growth intertemporally, with the expected signs. The novel variables (Assassinations and Institutional Quality) do not show simple correlation with GDP Growth or Effective Aid when within-transformed. Amusingly, the correlation between Effective Aid and GDP Growth is 0.01, which would typically mark it as a "classical suppressor". In this case, however, it is our variable of interest, so it seems imprudent to drop it. Inflation, Assassinations, and Institutional Quality also have correlations below the 0.1 threshold, and are strong candidates for dropping.
Policy is strongly correlated with its components (obviously), and has a strong time trend, but is not correlated with Effective Aid.
Another interesting note is the presence of a "convergence effect" in this sample: GDP Growth is negatively correlated with log GDP, indicating that growth is slower for countries with higher GDP.
Note that nearly all of the variables, except those known to be time-invariant (see Q1 plots) show a strong time trend.
n = len(X_vars)
p_values = np.zeros((n,n))
output = np.zeros((n,n))
for i,var1 in enumerate(X_vars):
for j,var2 in enumerate(X_vars):
output[i][j] = stats.pearsonr(between[var1], between[var2])[0]
p_values[i][j] = stats.pearsonr(between[var1], between[var2])[1]
output = pd.DataFrame(output, columns = X_vars, index=X_vars)
p_values = pd.DataFrame(p_values, columns = X_vars, index=X_vars)
new_out = pd.DataFrame()
for col in output.columns:
temp = add_stars(output[col], p_values[col])
temp.name = col
new_out = pd.concat([new_out, temp], axis=1)
new_out.index = [x.replace(' ', '')[:6] for x in output.index]
new_out.columns = [x.replace(' ','')[:6] for x in new_out.columns]
print('Simple Correlation Matrix for Between Transformed Variables')
print('='*110)
print(new_out)
print('\n')
print('*** - p < 0.01, ** - p < 0.05, * - p < 0.10')
print('Note: Pearson\'s correlation coefficient assumes normally distributed variables')
The individual dimension is known to have more variance than the time dimension (see Q1), so it is not surprising to see more correlation relationships when the data is between-transformed than within-transformed. Ethnic Fractionalization and Assassinations are correlated (positively and negatively, respectively) with Effective Aid at the 5% level, while the interaction term is significant at the 10% level. Institutional Quality is associated with positive macroeconomic outcomes (though there are, of course, huge correlation/causation problems here!). Perhaps surprisingly, inflation is associated with assassination, which might be a quirk in the data caused by the time period (late 70's and early 80's), when South America was experiencing monetary and political crises at the same time.
Once again, the policy variable is correlated with its components, but now also with Institutional Quality, which is not visible in the within transformation (it is wiped out due to near time-invariance). Policy is once again not associated with Effective Aid.
In terms of correlation with the dependent variable, in the between framework Effective Aid has sufficient correlation to not be on the chopping block, but Assassinations remains largely uncorrelated.
# Sort X and Y axes of the scatter matrix by the sum of p-values (x = row-wise, y = col-wise)
n = len(X_vars_plus)
p_values = np.zeros((n,n))
for i,var1 in enumerate(X_vars_plus):
for j,var2 in enumerate(X_vars_plus):
p_values[i][j] = stats.pearsonr(within[var1], within[var2])[1]
p_values = pd.DataFrame(p_values, columns = X_vars_plus, index=X_vars_plus)
x_vars_sorted = p_values.sum().sort_values().index
y_vars_sorted = p_values.sum(axis=1).sort_values().index
n = len(X_vars_plus)
already_plotted = []
fig, ax = plt.subplots(n, n, figsize=(12,12), dpi=100, sharex=False, sharey=False)
for row, x_var in enumerate(x_vars_sorted):
already_plotted.append(x_var)
for col, y_var in enumerate(y_vars_sorted):
if y_var in already_plotted:
ax[row][col].set_visible(False)
else:
x = within[x_var]
y = within[y_var]
reg = np.polyfit(x, y, 1, full=False)
slope, intercept = reg[0], reg[1]
x_l = [min(x), max(x)]
y_l = [xx * slope + intercept for xx in x_l]
pearsons = stats.pearsonr(x, y)[1]
if pearsons < 0.01: color = 'red'
elif pearsons < 0.05: color = 'orange'
elif pearsons < 0.10: color = 'gold'
else: color = 'black'
ax[row][col].scatter(x, y, alpha=0.2)
ax[row][col].plot(x_l, y_l, ls='--',color=color, lw=2, zorder=10)
ax[row][col].set_xticks([])
ax[row][col].set_yticks([])
if row == col-1:
ax[row][col].set_xlabel(x_var, size=6)
ax[row][col].set_ylabel(y_var, size=6)
fig.legend([patches.Patch(facecolor='red', edgecolor='red'),
patches.Patch(facecolor='orange', edgecolor='orange'),
patches.Patch(facecolor='gold', edgecolor='gold')],
['P < 0.01', 'P < 0.05', 'P < 0.10'], bbox_to_anchor=(.395,.31), ncol=3)
fig.text(x=.18, y=.42, s='Read x-axis labels RIGHT and y-axis labels UP', zorder=3)
fig.add_artist(patches.Rectangle((.14,.39), height=.1, width=.36, facecolor='whitesmoke', edgecolor='black', zorder=0))
fig.suptitle('Scatter Matrix of Correlations Between All Variables, Within-Transformed', size=12, x=0.55, y=.92)
plt.show()
In this scatter matrix, the X and Y axes have been sorted on the sum of p-values generated from the matrices shown above. We can interpret the north-west corner of the matrix as holding the variables with the most statistical significance, while the south-east holds the least. Looking at the slopes of the lines of best fit and their significance levels, it is easy to see that this is indeed the case.
The most "important" variables in this analysis are log GDP, Year, Budget Surplus, and Inflation. This is perhaps notable because it doesn't neatly conform to the results of the PCA, which found Lagged Money Supply to be, by far, the most important axis of variance. Here we see Lagged Money Supply is important (4th in the y-axis sort and 6th in the x-axis sort), but still behind the time trend and the macroeconomic variables.
In fact, we see a strong time trend in the macroeconomic variables: Inflation, Trade Openness, GDP Growth, Money Supply, and Effective Aid all have statistically significant correlation with the time trend. Policy does as well, but recall that it is built from macroeconomic variables. This time-trend correlation confirms what we saw in the Question 1 time-dimension plots. Lagged Money Supply is also significantly correlated with the expected macroeconomic variables, with the expected signs.
Among the "novel" variables, we can see the time-invariance of Institutional Quality and Ethnic Fractionalization very clearly as a single dot in the far south-east. Aside from the interaction term, neither Assassinations nor Ethnic Fractionalization is correlated with any of the covariates, except for a weak (and positive!) correlation between Assassinations and GDP.
One note of caution: p-values are computed using Pearson's test of non-correlation, which assumes the data is normally distributed, which the novel variables are not. It would therefore be inappropriate to interpret these results strongly.
# Sort X and Y axes of the scatter matrix by the sum of p-values (x = row-wise, y = col-wise)
n = len(X_vars)
p_values = np.zeros((n,n))
for i,var1 in enumerate(X_vars):
for j,var2 in enumerate(X_vars):
p_values[i][j] = stats.pearsonr(between[var1], between[var2])[1]
p_values = pd.DataFrame(p_values, columns = X_vars, index=X_vars)
x_vars_sorted = p_values.sum().sort_values().index
y_vars_sorted = p_values.sum(axis=1).sort_values().index
n = len(X_vars)
already_plotted = []
fig, ax = plt.subplots(n, n, figsize=(12,12), dpi=100, sharex=False, sharey=False)
for row, x_var in enumerate(x_vars_sorted):
already_plotted.append(x_var)
for col, y_var in enumerate(y_vars_sorted):
if y_var in already_plotted:
ax[row][col].set_visible(False)
else:
x = between[x_var]
y = between[y_var]
x_jitter = 0
y_jitter = 0
reg = np.polyfit(x, y, 1, full=False)
slope, intercept = reg[0], reg[1]
x_l = [min(x), max(x)]
y_l = [xx * slope + intercept for xx in x_l]
pearsons = stats.pearsonr(x, y)[1]
if pearsons < 0.01: color = 'red'
elif pearsons < 0.05: color = 'orange'
elif pearsons < 0.10: color = 'gold'
else: color = 'black'
ax[row][col].scatter(x+x_jitter, y+y_jitter, alpha=0.2)
ax[row][col].plot(x_l, y_l, ls='--',color=color, lw=2, zorder=10)
ax[row][col].set_xticks([])
ax[row][col].set_yticks([])
if row == col-1:
ax[row][col].set_xlabel(x_var, size=6)
ax[row][col].set_ylabel(y_var, size=6)
fig.legend([patches.Patch(facecolor='red', edgecolor='red'),
patches.Patch(facecolor='orange', edgecolor='orange'),
patches.Patch(facecolor='gold', edgecolor='gold')],
['P < 0.01', 'P < 0.05', 'P < 0.10'], bbox_to_anchor=(.395,.31), ncol=3)
fig.text(x=.18, y=.42, s='Read x-axis labels RIGHT and y-axis labels UP', zorder=3)
fig.add_artist(patches.Rectangle((.14,.39), height=.1, width=.36, facecolor='whitesmoke', edgecolor='black', zorder=0))
fig.suptitle('Scatter Matrix of Correlations Between All Variables, Between-Transformed', size=12, x=0.55, y=.92)
plt.show()
With variance now present in the novel variables, more relationships are visible. Interestingly, increased Assassinations are associated with decreased aid (political violence scaring away donors?), but increased Ethnic Fractionalization is associated with increased Effective Aid, both at a 5% level of significance, while the interaction term is again negative (but at only a 10% level of significance). The interaction term also decreases as Institutional Quality increases, which makes sense. Inflation is also associated with assassination, as noted above.
Ethnic Fractionalization has a strong negative relationship with log GDP, which is interesting: one might expect that, when ethnic tensions are present, more GDP would amplify the problems. Perhaps the inverse, that GDP cannot grow until ethnic fissures are sufficiently healed, is the dominant effect.
The policy index remains uninteresting, with only the expected relationships used in its construction. It is not correlated with any of the "novel" variables.
Between transformations do not have a time dimension, so we consider only the within-transformed and pooled variables. We begin by looking at simple Pearson's r between each variable and its first lag, and between each variable and the time trend, first for the within-transformed data, then for the untransformed data.
To construct a single time dimension, each year is computed as the average of all country observations within that year. As a result, not every year has the same number of observations contributing to its mean value. The number of observations per year is presented first:
df.groupby('YEAR1').size()
To attempt to correct for this, we can use a weighted average, with weights proportional to the number of observations in each period:
$\tilde{w}_t = \frac{n_t}{\max_s n_s}, \qquad w_t = \frac{\tilde{w}_t}{\sum_{s=1}^{6} \tilde{w}_s}$
Weights are normalized by the sum of all weights to ensure they sum to unity.
Weights are not found to significantly alter the results, so results using the weighted values are presented.
n = df.groupby('YEAR1').size().values
w = n / max(n)
w = w / w.sum()
print(f'Weights: {w}')
print(f'Verify weights sum to 1: {w.sum():0.8f}')
print('Correlation for each variable and its first lag, Within Transformed')
print('='*68)
print(' '*25 + 'Lag 1' + ' '*10 + 'Trend' + ' '*10 + 'N Obs')
print(' '*25 + '=====' + ' '*10 + '=====' + ' '*10 + '=====')
for var in X_vars:
n_obs = str(within.groupby('YEAR1').agg(np.nanmean)[var].shape[0])
vals = within.groupby('YEAR1').agg(np.nanmean)[var] * w
line = ''
corr, p = stats.pearsonr(vals.iloc[1:], vals.shift(1).dropna(), )
line = var + ' '*(25 - len(var))
if p < 0.01:
stat = f'{corr:0.2f}***'
elif p < 0.05:
stat = f'{corr:0.2f}**'
elif p < 0.10:
stat = f'{corr:0.2f}*'
else:
stat = f'{corr:0.2f}'
line += (stat + ' ' * (16 - len(stat)))
corr, p = stats.pearsonr(vals, within.groupby('YEAR1').agg(np.nanmean)['Year'])
if p < 0.01:
stat = f'{corr:0.2f}***'
elif p < 0.05:
stat = f'{corr:0.2f}**'
elif p < 0.10:
stat = f'{corr:0.2f}*'
else:
stat = f'{corr:0.2f}'
line += (stat + ' ' * (16 - len(stat)))
line += n_obs
print(line)
print('\n')
print('*** - p < 0.01, ** - p < 0.05, * - p < 0.10')
print('Note: Pearson\'s correlation coefficient assumes normally distributed variables')
What is interesting here is that Institutional Quality is sufficiently invariant for the Pearson's r computation to return a non-finite value, while Ethnic Fractionalization, which we saw was time-invariant in the scatter matrix above, is not. This is most certainly related to the floating-point math used by the software, but note that the within variance of Ethnic Fractionalization (1.465021e-33) is actually lower than that of Institutional Quality (1.261654e-31). Both of these should be considered zero.
Otherwise, we can see that macroeconomic variables, as has been repeatedly noted, have a time trend. We now see that several (Trade Openness, Lagged Money Supply, and Effective Aid) are also autoregressive.
print('Correlation for each variable and its first lag, untransformed')
print('='*68)
print(' '*25 + 'Lag 1' + ' '*10 + 'Trend' + ' '*10 + 'N Obs')
print(' '*25 + '=====' + ' '*10 + '=====' + ' '*10 + '=====')
for var in X_vars:
n_obs = str(overall.groupby('YEAR1').agg(np.nanmean)[var].shape[0])
vals = overall.groupby('YEAR1').agg(np.nanmean)[var] * w
line = ''
corr, p = stats.pearsonr(vals.iloc[1:], vals.shift(1).dropna(), )
line = var + ' '*(25 - len(var))
if p < 0.01:
stat = f'{corr:0.2f}***'
elif p < 0.05:
stat = f'{corr:0.2f}**'
elif p < 0.10:
stat = f'{corr:0.2f}*'
else:
stat = f'{corr:0.2f}'
line += (stat + ' ' * (16 - len(stat)))
corr, p = stats.pearsonr(vals, within.groupby('YEAR1').agg(np.nanmean)['Year'])
if p < 0.01:
stat = f'{corr:0.2f}***'
elif p < 0.05:
stat = f'{corr:0.2f}**'
elif p < 0.10:
stat = f'{corr:0.2f}*'
else:
stat = f'{corr:0.2f}'
line += (stat + ' ' * (16 - len(stat)))
line += n_obs
print(line)
print('\n')
print('*** - p < 0.01, ** - p < 0.05, * - p < 0.10')
print('Note: Pearson\'s correlation coefficient assumes normally distributed variables')
In the pooled setting, macroeconomic variables have strong time trends (this was also observed in the cross-sectional examination) and strong persistence. Persistence appears to be slightly stronger than when the within transformation was applied, but due to the tiny sample size these are extremely underpowered tests and should not be read with that level of fineness.
fig, ax = plt.subplots(1,2, figsize=(16,6), dpi=100)
transforms = [within, between]
cmap = cm.get_cmap('tab10')
gradient = cmap(np.linspace(0,1,10))
for axis, transform in zip(fig.axes, transforms):
if transform is within: title = 'Within Transformation'
else: title = 'Between Transformation'
x = transform['Effective Aid']
y = transform['GDP Growth']
xl_1, xl_0 = np.polyfit(x, y, 1, full=False)
xq_2, xq_1, xq_0 = np.polyfit(x, y, 2, full=False)
xl = [min(x), max(x)]
yl = [xx * xl_1 + xl_0 for xx in xl]
xq = np.linspace(min(x), max(x), 100)
yq = [xx**2*xq_2 + xx*xq_1 + xq_0 for xx in xq]
lowess = sm.nonparametric.lowess(y, x, frac=0.66) #Note that statsmodels stores lowess results column-wise!
axis.scatter(x, y, alpha=0.5, color=gradient[0].squeeze().tolist())
axis.plot(xl, yl, color=gradient[1].squeeze().tolist(), label='Linear')
axis.plot(xq, yq, color=gradient[2].squeeze().tolist(), label='Quadratic')
axis.plot(lowess[:,0], lowess[:,1], color=gradient[3].squeeze().tolist(), label='LOWESS')
for spine in axis.spines.values():
spine.set_visible(False)
axis.tick_params(which='both',
axis='both',
left=False,
right=False,
top=False,
bottom=False)
axis.set_title(title)
axis.set_xlabel('Effective Aid')
axis.set_ylabel('GDP Growth')
ax[0].legend(ncol=3, bbox_to_anchor=(1.4,1.13))
fig.suptitle('Comparison of Model Specifications by Variable Transformation', fontsize=18, y=1.03)
plt.show()
All univariate specifications of Effective Aid have trouble capturing variation in GDP Growth. In the within case, observations are strongly clustered around Effective Aid = 0, leaving variation in GDP Growth essentially orthogonal to Effective Aid. LOWESS captures some of this, but also grossly overfits outliers. Both the linear and quadratic specifications fail to capture variance: in the within case, the line of best fit is y = 0. In the between case, a negative relationship is found by both the linear and quadratic models, but recall that the correlation coefficient of these two variables is not significant at the 10% level, so we cannot be sure this is not fitting noise (or influential outliers).
In terms of regime change detection, LOWESS doesn't give much information about the Within Transformed relationship outside the main cloud due to the presence of outliers. Inside the cloud it is possible there are two regimes (one positive and one negative), but in such a densely clustered column space, it will be possible to generate as many regimes as one pleases by adjusting the bandwidth. I am unfamiliar with best practices for bandwidth tuning -- here we implement LOWESS using package defaults.
Similarly, it is possible that there are 2-3 regimes in Effective Aid ($(-\infty, -1], (-1, 1], [1, \infty)$) but it would be impossible to say without systematic robustness checks on the bandwidth parameter.
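One way to get a feel for this sensitivity is to overlay LOWESS fits across several bandwidths. The sketch below reuses the within-transformed data and statsmodels' `lowess`; the grid of `frac` values is arbitrary and purely illustrative:
#Sensitivity of the LOWESS fit to the bandwidth (frac) parameter
fig, ax = plt.subplots(figsize=(8,5), dpi=100)
aid = within['Effective Aid']
growth = within['GDP Growth']
ax.scatter(aid, growth, alpha=0.3, color='grey')
for frac in [0.2, 0.4, 0.66, 0.9]: #arbitrary grid of bandwidths
    smoothed = sm.nonparametric.lowess(growth, aid, frac=frac) #results are stored column-wise
    ax.plot(smoothed[:,0], smoothed[:,1], label=f'frac={frac}')
ax.set_xlabel('Effective Aid')
ax.set_ylabel('GDP Growth')
ax.set_title('LOWESS Fits Across Bandwidths, Within Transformed')
ax.legend()
plt.show()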
#Some cleanup before regressing: remove GDP growth from the list of variables
Exog_vars = ['log GDP', 'Ethnic Frac', 'Assassinations', 'Frac*Assass', 'Institutional Quality',
'L_Money Supply', 'Policy', 'Sub-Saharan Africa', 'East Asia', 'Effective Aid']
import linearmodels as lm
_df = df.reset_index().set_index(['Country', 'Year'])
#We need a numeric value to associate with every country for clustering, so we'll use the sum of the ASCII values of each
#letter in the country name.
_df['Clusters'] = list(map(lambda x: np.sum([ord(xx) for xx in x]), _df.index.get_level_values(0)))
y = _df['GDP Growth']
x = _df[Exog_vars]
model = lm.BetweenOLS(y, sm.add_constant(x)).fit(cov_type='clustered', clusters= _df['Clusters'])
between_results = save_results(model, 'Between')
model.summary
Remarks: the R2 is high, but the variable of interest (Effective Aid) is not significant. Policy is positive and significant, taking the place of the macroeconomic variables used to construct it. The results of this estimator are consistent with the findings of the descriptive statistics, if we think of Policy as a proxy for the strongly correlated Inflation and Budget Surplus (and, to a lesser extent, Trade Openness).
#Run a restricted model without the insignificant variables
restricted = lm.BetweenOLS(y, sm.add_constant(_df['Policy'])).fit(cov_type='clustered', clusters=_df['Clusters'])
q = len(Exog_vars) - 1 #number of restrictions: every regressor except Policy is excluded
n = _df.index.get_level_values('Country').nunique() #the between estimator has one observation per country
k = len(Exog_vars)
SSR_R = restricted.resid_ss
SSR_UR = model.resid_ss
F = ((SSR_R - SSR_UR) / q) / ((SSR_UR)/(n-k-1))
print('F-test of joint significance of the excluded variables')
print('H0: coefficients on all excluded variables are jointly zero')
print(f'df1 = {q}, df2={n-k-1}')
print('='*40)
print(f'F-stat = {F:0.03f}')
print(f'p = {stats.f.sf(F, q, n-k-1):0.3f}') #f.sf is the survivor function for f, and tells what % is beyond our value
In the between model specification, no variables are significant save Policy. We can, however, test whether the excluded variables are jointly significant using an F-test. A restricted model containing only the Policy index and a constant is found to be significantly different from the full model, so we reject the wholesale exclusion of all individually insignificant variables.
def resid_plot(x, model, title, transform, resids=None, y_hat=None, labels_x = False, labels_y = False, outlier_x=None, outlier_y=None, color_col=None):
from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec
import re #used below to extract country-year labels for outlier points
cmap = cm.get_cmap('tab20')
gradient = cmap(np.linspace(0,1,10))
if transform == 'between':
X = x.groupby('Country').agg(np.mean).values
else: X = x.values.squeeze()
H = X @ np.linalg.inv(X.T @ X) @ X.T
if isinstance(model, sm.regression.linear_model.RegressionResultsWrapper):
student_resid = model.resid.values / (model.resid.std() * np.sqrt(1 - np.diag(H)))
elif model is None:
student_resid = resids.values / (resids.std() * np.sqrt(1 - np.diag(H)))
else:
student_resid = model.resids.values / (model.resids.std() * np.sqrt(1 - np.diag(H)))
if transform == 'between':
avg_y_hat = model.predict(x).groupby('Country').agg(np.mean)
elif model is None:
avg_y_hat = y_hat.to_frame()
avg_y_hat.columns = ['predictions']
else:
avg_y_hat = model.predict(x)
if isinstance(model, sm.regression.linear_model.RegressionResultsWrapper):
avg_y_hat = avg_y_hat.to_frame()
avg_y_hat.columns = ['predictions']
mean_resid = student_resid.mean()
std_resid = student_resid.std()
skew_resid = stats.skew(student_resid)
kur_resid = stats.kurtosis(student_resid, fisher=False)
mean_yhat = avg_y_hat['predictions'].mean()
std_yhat = avg_y_hat['predictions'].std()
skew_yhat = stats.skew(avg_y_hat['predictions'])
kur_yhat = stats.kurtosis(avg_y_hat['predictions'], fisher=False)
gs = GridSpec(4, 4)
fig = plt.figure(figsize=(8,6), dpi=100)
ax_main = fig.add_subplot(gs[1:4, 0:3])
ax_x = fig.add_subplot(gs[0, 0:3], sharex=ax_main)
ax_y = fig.add_subplot(gs[1:4, 3], sharey=ax_main)
if color_col is not None:
color_by = color_col.name
y_hat_plus = pd.merge(avg_y_hat, color_col, left_index=True, right_index=True)
y_hat_plus['resids'] = student_resid
_temp_df = y_hat_plus.groupby(color_by)
for i, group in enumerate(_temp_df.groups.keys()):
yy = _temp_df.get_group(group)['predictions'].values.squeeze()
xx = _temp_df.get_group(group)['resids'].values.squeeze()
ax_main.scatter(yy, xx, alpha=0.6, zorder=2, color=gradient[i], label=group)
ax_main.legend(fontsize=8)
else:
ax_main.scatter(avg_y_hat, student_resid, alpha=0.6, zorder=2)
sns.kdeplot(avg_y_hat['predictions'], shade=True, ax=ax_x, legend=False, zorder=2)
sns.kdeplot(student_resid, shade=True, ax=ax_y, vertical=True, zorder=2)
ax_main.set_xlabel('$\hat{y}_{i,.}$')
ax_main.set_ylabel('Studentized Residuals')
for xx, yy, name in zip(avg_y_hat['predictions'], student_resid, df.index):
if labels_x == True and labels_y == False:
if (xx < outlier_x[0] or xx > outlier_x[1]):
text = re.findall("(?<=\')(\w+\W? ?\w*\W? ?\w*)(?=\')",str(name))
s = text[0] + '\n' + text[-1]
ax_main.text(xx,yy,s, size=5)
elif labels_x == False and labels_y == True:
if (yy > outlier_y[1]) or (yy < outlier_y[0]):
text = re.findall("(?<=\')(\w+\W? ?\w*\W? ?\w*)(?=\')",str(name))
s = text[0] + '\n' + text[-1]
ax_main.text(xx,yy,s, size=5)
elif labels_x == True and labels_y == True:
if (xx < outlier_x[0] or xx > outlier_x[1]) or (yy > outlier_y[1] or yy < outlier_y[0]):
text = re.findall("(?<=\')(\w+\W? ?\w*\W? ?\w*)(?=\')",str(name))
s = text[0] + '\n' + text[-1]
ax_main.text(xx,yy,s, size=5)
for axis in fig.axes:
for spine in axis.spines.values():
spine.set_visible(False)
axis.tick_params(which='both',
axis='both',
left=False,
right=False,
top=False,
bottom=False)
axis.set_facecolor('ghostwhite')
axis.grid(True, color='white', zorder=1, lw=2)
plt.setp(ax_x.get_xticklabels(), visible=False)
plt.setp(ax_x.get_yticklabels(), visible=False)
plt.setp(ax_y.get_xticklabels(), visible=False)
plt.setp(ax_y.get_yticklabels(), visible=False)
left_corner_x = ax_x.get_xlim()[0]
left_corner_y = ax_x.get_ylim()[1]
u_y = (ax_x.get_ylim()[1] - ax_x.get_ylim()[0]) / 100
u_x = (ax_x.get_xlim()[1] - ax_x.get_xlim()[0]) / 100
for i, (name, value) in enumerate(zip(['μ', 'σ', 'skew', 'kurtosis'], [mean_yhat, std_yhat, skew_yhat, kur_yhat])):
ax_x.text(0.95 * left_corner_x, 0.9*left_corner_y-(u_y * (12*i)), s=f'{name}={value:0.2f}', fontsize=8)
right_corner_x = ax_y.get_xlim()[1]
right_corner_y = ax_y.get_ylim()[1]
u_y = (ax_y.get_ylim()[1] - ax_y.get_ylim()[0]) / 100
u_x = (ax_y.get_xlim()[1] - ax_y.get_xlim()[0]) / 100
for i, (name, value) in enumerate(zip(['μ', 'σ', 'skew', 'kurtosis'], [mean_resid, std_resid, skew_resid, kur_resid])):
ax_y.text(0.5*right_corner_x, 0.9*right_corner_y - (u_y * (4*i)), s=f'{name}={value:0.2f}', fontsize=8)
plt.tight_layout()
fig.suptitle('Studentized Residuals and Predicted Average $\hat{y}_{i,.}$, with Distributions \n' + f'{title}',
y=1.065, size=14, fontweight='semibold')
plt.show()
resid_plot(x = sm.add_constant(x), model=model, title='Between Estimator',
transform='between', labels_y=True, outlier_y=(-2, 2))
Investigating the residuals, we see that they are centered on 0 with unit variance (a result of studentization), are not greatly skewed, and contain no extreme outliers.
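As a rough numerical check (a sketch using the raw Between-estimator residuals rather than the studentized version plotted above), a Jarque-Bera test can be run on `model.resids`:
#Jarque-Bera test of normality on the Between-estimator residuals
#H0: residuals are normally distributed (zero skew, kurtosis of 3)
jb_stat, jb_p = stats.jarque_bera(model.resids)
print(f'Jarque-Bera statistic = {jb_stat:0.3f}, p = {jb_p:0.3f}')
print(f'Skew = {stats.skew(model.resids):0.3f}, Kurtosis = {stats.kurtosis(model.resids, fisher=False):0.3f}')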
We begin by using only fixed effects. Recall that Institutional Quality and Ethnic Fractionalization are time-invariant (Question 10) and will be absorbed by the individual fixed effects. The same is true of the regional dummy variables used by Burnside and Dollar (Sub-Saharan Africa and East Asia). These variables are preemptively dropped to avoid an absorption warning.
Exog_vars_within = list(set(Exog_vars) - set(['Institutional Quality', 'Ethnic Frac',
'Sub-Saharan Africa', 'East Asia'])) #use a list so pandas column selection works
x = _df[Exog_vars_within]
within_fe = lm.PanelOLS(y, x, entity_effects=True, time_effects=False, drop_absorbed=True).fit(cov_type='clustered', clusters=_df['Clusters'])
fe_results = save_results(within_fe, 'Fixed Effects')
within_fe.summary
Controlling for country fixed effects, the model loses much explanatory power (R2 down from .60 to .11!), which makes sense given the presence of outliers, which inflate the sum of squared residuals used to calculate that statistic.
In this specification, only Inflation and Money Supply remain statistically significant at the 5% level, while trade openness is significant at the 10% level. The variable of interest, Effective Aid, is not significant at the 10% level.
We next run a model with both time effects and individual effects:
within_fete = lm.PanelOLS(y, x, entity_effects=True, time_effects=True, drop_absorbed=True).fit(cov_type='clustered', clusters=_df['Clusters'])
fete_results = save_results(within_fete, 'Fixed/Time Effects')
within_fete.summary
pd.merge(fe_results, fete_results, left_index=True, right_index=True)
While we saw (way, way back in Question 1) some seemingly exogenous shocks in the time dimension (1982-1985 in Budget Surplus and GDP Growth), most variables instead show time trends and autoregressive persistence. We might want to check the effect of first differences with Fixed Effects and compare the results to adding Time Effects. Nevertheless, adding Time Effects significantly changes the outcome of the model: the R2 drops from .16 to .10, and Lagged Money Supply, which has a strong time trend, becomes insignificant.
Policy, although it too has a strong time trend, remains significant at the 1% level, although the magnitude of the parameter decreases. Additionally, the Assassinations variable becomes significant with the inclusion of Time Effects.
In either specification, Effective Aid is not significantly different from zero.
within_x = x.groupby('Country').apply(lambda x: x - x.mean())
resid_plot(within_x,
model=within_fe, title='Within Estimator with Individual Effects',
transform='within', labels_y=True, outlier_y=(-2,2),
labels_x=True, outlier_x=(-5,5))
The errors of this model are very nearly normally distributed, with a small group of outliers beyond the 2-sigma cutoff. Looking carefully, we notice that the majority of these outliers are Sub-Saharan African countries. While we cannot include the regional dummies in a within estimation, we can color the plot using these dummies to get a sense of how the model performs with respect to these potentially omitted variables.
def make_regions_var(dummy_frame):
for col in dummy_frame.index:
if dummy_frame[col] > 0:
return col
return 'Other'
region_dummies = _df[['East Asia', 'Egypt', 'Sub-Saharan Africa', 'Central America']].copy()
region_dummies = region_dummies.apply(make_regions_var, axis=1)
region_dummies.name = 'Region'
resid_plot(within_x, model=within_fete, transform='within', title='Within Estimator with Individual and Time Effects, by Region',
labels_y=True, outlier_y=(-2,2), color_col = region_dummies)
Coloring the scatterplot shows that the model is correctly dividing the data into two groups: Sub-Saharan Africa and the rest of the world. Correctly, because the main cloud of predictions, where the divide is cleanest, falls within 2 sigma of the true values.
More interestingly, the extreme outliers are all African countries, with the exception of Syria 1986 and Nicaragua 1978.
y = _df['GDP Growth']
time_effects = ['year_3', 'year_4', 'year_5', 'year_6', 'year_7']
x = sm.add_constant(_df[Exog_vars + time_effects])
ref = lm.panel.RandomEffects(y, x).fit(small_sample=True, cov_type='clustered', clusters=_df['Clusters'])
re_results = save_results(ref, 'Random Effects')
ref.summary
It is difficult to evaluate the results of this regression because of the specter of endogeneity arising from the lack of individual fixed effects. I have included time effects because the results of the Time Effects specification, combined with our exploratory investigation, showed clear evidence of a strong trend in the macroeconomic variables (recall how log GDP fell from significant at the 1% level to insignificant with the inclusion of time effects).
To verify that there are individual fixed effects, we use a Hausman test to compare the results of the Random Effects and the Within Estimators.
The test takes the form:
$ H = (\hat{\beta}_{FE} - \hat{\beta}_{RE})^T [Var(\hat{\beta}_{FE}) - Var(\hat{\beta}_{RE})]^{-1} (\hat{\beta}_{FE} - \hat{\beta}_{RE}) $
The statistic follows a $\chi^2$ distribution with $k$ degrees of freedom under $H_0: E[\alpha_i | X] = 0$.
Note that both models must share the same specification to run the test, and several explanatory variables were absorbed by the fixed effects. We therefore re-estimate the Random Effects regression excluding those same variables and perform the Hausman test. If we find the fixed effects are not endogenous, we can re-introduce the variables.
x = _df[Exog_vars_within]
ref = lm.panel.RandomEffects(y, x).fit()
β_re = ref.params.values.squeeze()
β_fe = within_fete.params.values.squeeze()
H = (β_fe - β_re).T @ np.linalg.inv(within_fete.cov.values - ref.cov.values) @ (β_fe - β_re)
print('Hausman Test of Endogeneity')
print('H0: No Endogeneity in RE model (E[c_i | X] = 0)')
print('='*30)
print(f'H = {H:0.4f}, df = {x.shape[1]}')
print(f'p = {stats.chi2.sf(H, x.shape[1]):0.05f}')
With a p-value of 0, we strongly reject the null hypothesis and conclude that the Random Effects model has endogenous unobserved effects. We note, per Wooldridge (2001), that this test makes very strong assumptions about the errors, namely that their conditional variance is $\sigma ^2 \bf{I}_T$. Given that the test result is in line with the intuition that there were no large systematic differences between countries in the time dimension, however, the conclusion seems reasonable.
Mundlak (1978) demonstrated that if the unobserved effects are a linear function of $\bar{x}_i$, adding these averages to the GLS model will cause the GLS estimators to collapse to Fixed Effects estimators, effectively accounting for individual effects.
To accomplish this, we add between-transformed variables to the Random Effects regression and confirm that the estimations are indeed equal to the Fixed Effects estimations.
Exog_vars_no_dummies = list(set(Exog_vars) - set(['Sub-Saharan Africa', 'East Asia']))
x = pd.merge(_df[Exog_vars + time_effects], between[Exog_vars_no_dummies],
how='inner', left_index=True, right_index=True, suffixes=('', '_mean'))
x = sm.add_constant(x)
x.drop(columns=['Institutional Quality_mean', 'Ethnic Frac_mean'], inplace=True)
mundlak = lm.panel.RandomEffects(y, x).fit(small_sample=False, cov_type='clustered',clusters=_df['Clusters'])
mundlak_results = save_results(mundlak, 'Mundlak')
mundlak.summary
from functools import reduce
#Remind ourselves what the FE model results were
to_merge = [fete_results, mundlak_results]
reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), to_merge)
While the estimates are not identical, we see that the Mundlak hypothesis of fixed effects linear in the individual means works quite well here: the coefficients of the Time Fixed Effects and Mundlak regressions are very similar.
The r2 is greatly improved in this model, and nearly all covariates are significant. Interestingly, log GDP, which is significant in the Within Estimator without Time Effects, once again becomes significant at a 10% level in the Mundlak regression.
Nevertheless, the variable of interest, Effective Aid, remains insignificant.
One additional point of note is that all fixed-effect terms (the between-transformed means) are insignificant at the 10% level; this indicates that these variables are, individually, exogenous to the individual unobserved effects. The time effects, however, are significant at the 1% level, especially towards the end of the sample (the 80s and 90s).
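Individual insignificance does not rule out joint significance, so the mean terms can also be tested jointly. Below is a minimal Wald-test sketch computed by hand from the Mundlak estimates (the restriction simply selects every coefficient whose name ends in `_mean`):
#Wald test that all between-transformed mean coefficients are jointly zero
mean_idx = [i for i, name in enumerate(mundlak.params.index) if name.endswith('_mean')]
b = mundlak.params.values[mean_idx]
V = mundlak.cov.values[np.ix_(mean_idx, mean_idx)]
W = b @ np.linalg.inv(V) @ b
print('Wald test, H0: all _mean coefficients are jointly zero')
print(f'W = {W:0.3f}, df = {len(mean_idx)}, p = {stats.chi2.sf(W, len(mean_idx)):0.4f}')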
resid_plot(x=x, model=mundlak, title='Mundlak Regression, Colored by Year',
labels_y=True, transform='Random', outlier_y=(-2,2),
color_col=pd.Series(df.index.get_level_values(1), index=_df.index))
resid_plot(x=x, model=mundlak, title='Mundlak Regression, Colored by Region',
labels_y=True, transform='Random', outlier_y=(-2,2),
color_col=region_dummies
)
Out of curiosity, I plot the errors of the Mundlak regression by both Year and Region. We see that predictions for 1982-1985 are clustered below the average, but do not appear systematically mis-classified one way or the other. The regions seem to be mixed together with no obvious pattern, except to note that, once again, African countries are more likely to be mis-classified by more than 2 sigma (Gabon, Cameroon, and Ethiopia, plus Nicaragua, again).
The last model we consider takes first differences on the level data. A reduced explanatory variable set is needed because Ethnic Fractionalization and Institutional Quality are time-invariant: once first differences are taken, these columns become zero and collinear with other low time-variance variables.
y = _df['GDP Growth']
x = _df[list(set(Exog_vars) - set(['Institutional Quality', 'Ethnic Frac', 'Sub-Saharan Africa', 'East Asia']))]
diffOLS = lm.panel.FirstDifferenceOLS(y, x).fit(cov_type='clustered', clusters=_df['Clusters'])
diff_results = save_results(diffOLS, 'First Diff')
diffOLS.summary
diff_x = x.unstack(0).diff(1).stack().dropna().swaplevel().sort_values(by='Country')
resid_plot(diff_x,
diffOLS, title='First Difference Estimator', transform='Diff',
labels_y=True, outlier_y=(-2,2),
color_col=region_dummies[diff_x.index])
Three countries (Mali, Cote d'Ivoire, and Turkey) are dropped when performing first differences, because they only have a single observation. This, plus dropping the whole first year of observations, results in a significantly smaller dataset, n = 217. Note that this particular implementation of first differences treats NaN as 0 when differencing, so panel patterns with "holes" are effectively lagged rather than differenced.
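The dropped countries can be identified directly from the panel structure (a quick sketch using `_df` as constructed above):
#Countries contributing only a single period are lost entirely when first differencing
obs_per_country = _df.groupby('Country').size()
print(obs_per_country[obs_per_country == 1])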
The r2 of the model is quite low, at 0.13, similar to the Within Models. Like the Within model without time unobserved effects, log GDP is highly significant with a large parameter, as is policy.
One important fact about this model is that it miscategorizes different observations than the Within and Mundlak models. It does not struggle with Gabon, Cameroon, and Ethiopia, instead struggling a lot with Ecuador. This suggests it may be worthwhile to investigate bagging methodologies, where the results of the two regressions are averaged to improve prediction accuracy.
Hausman-Taylor family models allow for the inclusion of otherwise-absorbed time-invariant variables in Fixed Effects models through the use of instrumentation, as well as the inclusion of variables which might be correlated with the unobserved effects. Variables are separated into 4 categories: time-varying variables uncorrelated with the unobserved effects (X1), time-varying variables correlated with them (X2), time-invariant variables uncorrelated with them (Z1), and time-invariant variables correlated with them (Z2).
To decide which variables go where, a rule of thumb is to use the results of the Mundlak regression. Means that are not significant in that regression can be assumed exogenous (class X1), while those that ARE significant can be considered X2. Z2 variables, on the other hand, need to be determined theoretically.
In this case, we found that all means are exogenous individually but jointly significant. As a result, we instrument all of our covariates as X2 variables, using the between-transformed variables themselves, plus Ethnic Fractionalization and Institutional Quality.
Python does not have a dedicated package for Hausman-Taylor estimation, so the regression strategy is as follows. First, create a matrix $\hat{X}$ by regressing the explanatory variables one by one on their transformed counterparts (both between and within transformed), plus the time-invariant variables Institutional Quality and Ethnic Fractionalization (assumed uncorrelated with the unobserved effects), saving the predicted values; an IV estimation is then performed on $\hat{X}$. Next, $\hat{\lambda}$, the random effects transformer, is estimated using an identification strategy specific to unbalanced panels. The data are transformed using this $\hat{\lambda}$, a new transformed $\hat{X}$ matrix is computed, and a final IV regression is run.
between = _df.groupby('Country').transform(np.nanmean)
within = _df.groupby('Country').transform(lambda x: x - x.mean())
to_instrument = ['L_Money Supply', 'Policy', 'Effective Aid', 'Assassinations', 'Frac*Assass', 'log GDP']
instruments = ['Institutional Quality', 'Ethnic Frac']
instruments += ['P'+str(x) for x in between[to_instrument].columns]
instruments += ['Q'+str(x) for x in within[to_instrument].columns]
Pz = pd.merge(between[to_instrument], within[to_instrument], how='outer', left_index=True, right_index=True)
Pz = pd.merge(_df[instruments[:2]], Pz, left_index=True, right_index=True)
Pz.columns = instruments
Pz.index = _df.index
Pz['const'] = 1
for i, var in enumerate(to_instrument):
ols = sm.OLS(_df[var], Pz, hasconst=True).fit(cov_type='cluster', cov_kwds={'groups':_df['Clusters']})
temp_frame = pd.DataFrame(ols.predict(Pz), columns=[var])
if i == 0: hat_frame = temp_frame
else: hat_frame = pd.merge(hat_frame, temp_frame, left_index=True, right_index=True)
# Stage 1 IV Results
hat_frame['const'] = 1
iv = sm.OLS(_df['GDP Growth'], hat_frame, hasconst=True).fit(cov_type='cluster', cov_kwds={'groups':_df['Clusters']})
iv_results = save_results(iv, 'IV')
iv.summary()
We can exploit the imbalanced panel structure to estimate $\hat{\lambda}$, noting that:
$\hat{\lambda} = 1 - \sqrt{\frac{\sigma^{2}_u}{\sigma^2_u + T \cdot \sigma^2_\alpha}}$
And, given a between estimator regression, the errors can be written:
$\bar{\epsilon}_i = \alpha_i + T_i^{-1} \sum_{t=1}^{T_i} u_{it} $
And therefore its expected square is:
$E[\bar{\epsilon}^2_i] = \sigma^2_\alpha + \frac{1}{T_i}\sigma^2_u$
We thus between-transform and square the residuals of the IV regression, which yields the heteroskedastic structure above, and estimate both sigma terms by regressing the squared mean residuals on a constant and $1/T_i$.
r = iv.resid
iT = 1 / r.groupby('Country').size()
r_b = r.groupby('Country').agg('mean')
r_b2 = r_b.pow(2)
iT = sm.add_constant(iT)
iT.columns = ['σ2_α', 'σ2_u']
var_est = sm.OLS(r_b2, iT, hasconst=True).fit(cov_type='HC0')
var_est.summary()
σ2_α, σ2_u = var_est.params
λ = 1 - np.sqrt(σ2_u / (σ2_u + (1 / iT.iloc[:,1]) * σ2_α)) #the formula calls for T_i, the inverse of the 1/T_i column
λ.name = 'lambda'
test = _df.groupby('Country').agg(np.mean)[to_instrument + instruments[:2]]
results = None
for col in test.columns:
if results is None:
results = test[col] * λ
else:
results = pd.concat([results, test[col] * λ], axis=1)
results.columns = test.columns
results = results.reindex(between.index, level=0)
lambda_weighted_y = _df['GDP Growth'].subtract(between['GDP Growth'] * λ, axis=0 )
lambda_weighted_demeaned = _df[to_instrument + instruments[:2]].subtract(results, axis=0)
for i, var in enumerate(to_instrument + instruments[:2]):
ols = sm.OLS(lambda_weighted_demeaned[var], Pz,
hasconst=True).fit(cov_type='cluster', cov_kwds={'groups':_df['Clusters']})
temp_frame = pd.DataFrame(ols.predict(Pz), columns=[var])
if i == 0: hat_frame = temp_frame
else: hat_frame = pd.merge(hat_frame, temp_frame, left_index=True, right_index=True)
lambda_iv = sm.OLS(lambda_weighted_y, hat_frame, hasconst=False).fit(cov_type='cluster', cov_kwds={'groups':_df['Clusters']})
ht_results = save_results(lambda_iv, 'HT')
lambda_iv.summary()
The Random Effects Transformation has made the Effective Aid variable significant at the 5% level, but we find that it is associated with reduced GDP Growth, making one wonder if all of the endogeneity has indeed been scrubbed out by the instrumentation process. It is less negative than in the pre-Random Effects Transformed regression, though.
resid_plot(x=lambda_weighted_demeaned, model=lambda_iv, title='Hausman-Taylor Model', transform='HT',
color_col=region_dummies, labels_y=True, outlier_y=(-2,2))
Region-based clustering is most visible in the residuals of the Hausman-Taylor model. East Asian countries are pulling off into a clearly visible cluster on the right. African countries remain the majority of 2-sigma misclassifications, the usual Cameroon, Gabon, and Ethiopia, plus Nicaragua.
Despite all this instrumentation and transformation, the model consistently makes the same errors; even when the "core" of predictions becomes tighter, the same egregious classification errors on these 3-6 countries destroy the predictive power of the model.
mundlak_results = mundlak_results[[x.find('mean') == -1 for x in mundlak_results.index]]
to_merge = [between_results, fe_results, fete_results, re_results, mundlak_results, diff_results, iv_results, ht_results]
results = reduce(lambda left, right: pd.merge(left, right, how='outer', left_index=True, right_index=True), to_merge)
results.iloc[:12, :]
Across all regressions, Policy remained strongly and consistently significant. This is largely owing to the macroeconomic variables used in its construction. Institutional Quality was also significant in every regression that could include it, and the ability to include it was likely part of the large improvement from the basic Fixed Effects model to the Mundlak and Hausman-Taylor models.
Looking at the more sophisticated models, the Hausman-Taylor model has a slightly lower r2 than the Mundlak regression, but some caveats are necessary. First, Mundlak included nearly 20 variables, which mechanically increases the r2. Second, the worst prediction errors of the Hausman-Taylor model were closer to the truth than those of the Mundlak model -- HT had more normal skew and kurtosis of residuals.
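That comparison can be made concrete by computing the residual moments directly; a small sketch using the fitted result objects (`mundlak.resids` from linearmodels, `lambda_iv.resid` from statsmodels):
#Compare skew and kurtosis of the Mundlak and Hausman-Taylor residuals
for name, resid in [('Mundlak', mundlak.resids), ('Hausman-Taylor', lambda_iv.resid)]:
    print(f'{name}: skew = {stats.skew(resid):0.3f}, kurtosis = {stats.kurtosis(resid, fisher=False):0.3f}')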
Hausman-Taylor was also the only model to show statistical significance on the Effective Aid variable, though the sign was "wrong" relative to expectations.
Many of the signs are "wrong" in these regressions, but novel explainations are of course possible. The interaction term, Frac * Assass, is positive, suggesting that getting ethnic straife and political assassination together mitigates the negative effects of Assassinations (the whole package ends up being positive). The explaination, I suppose, could be related to a "civil war reconstruction" effect: when a country is in such bad shape that there is open ethnic warfare, there's nowhere to go but up.
The magnitude of log GDP fluctuated wildly across the regressions, but remained negative, suggesting a convergence effect.
Policy is strongly positive, but I suspect this is mostly driven by the Trade Openness component, which was specifically linked to Aid during the 80s and 90s as part of the Washington Consensus, creating endogeneity with Aid.
ols = sm.OLS(_df['Institutional Quality'].groupby('Country').transform('mean'),
sm.add_constant(between[to_instrument]), hasconst=True)
results = ols.fit(cov_type='HC0')
results.summary()
Recall that:
$\hat{\beta}_{IV} = \frac{Cov(z,y)}{Cov(z,x)}$
An instrument is considered "bad" when the denominator is very small, because it causes the bias in the estimator to explode, even relative to $\hat{\beta}_{OLS}$. The above regression results show the covariance between the Instrumnet and the covariates. The model has a reasonable r2 at .27, with Budget Surplus significant at the 5% level and Assassinations significant at the 1% level.
Recall that the OLS coefficients in the auxiliary regression above take the form $\hat{\beta} = \frac{Cov(x,z)}{S_{xx}}$, so positive and significant coefficients strongly suggest that $Cov(z,x)$ is not small, i.e., that Institutional Quality is a good instrument. To test whether it is weak or strong, we can regress it on GDP Growth:
sm.OLS(_df['Institutional Quality'].groupby('Country').transform('mean'),
sm.add_constant(between['GDP Growth']), has_const=True).fit().summary()
Indeed, GDP Growth is significant at the 1% level with a positive coefficient. This suggests that Institutional Quality is both strong and good, making it a valid instrument for use in the Hausman-Taylor regression.