Using ARIMA to forecast college enrollment for 2015 and 2016 at state level.
NCES publishes its school enrollment data every year here: https://nces.ed.gov/ipeds/datacenter/
However, the latest you data can find is always 1 year behind, in order to get the latest estimate, below shows my implementation using ARIMA to forecast current and next year's enrollment number.
NCES has forecast on national level and few demographic attribute, however, there's no state level forecast.
# load package
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from statsmodels import api as sm
import numpy as np
# turn off warning
import warnings
warnings.filterwarnings('ignore')
# load data
enrollment = pd.read_excel('DATA_SOURCE\ENROLLMENT_FORECAST_DS.xlsx', sheetname='EnrollmentByInstitution')
inst_state = pd.read_excel('DATA_SOURCE\ENROLLMENT_FORECAST_DS.xlsx', sheetname='Institution')
inst_state.columns = ['UnitId', 'Name', 'City', 'State']
state_enrollment = enrollment.merge(inst_state.drop('Name', axis = 1))
# a preview
state_enrollment.head(2)
Transpose the dataframe, use state as column and convert to dateindex
# some plastic surgery on the df
# group sum by state
to_group = state_enrollment.drop([‘UnitId’, ‘Institution Name’, ‘City’], axis = 1)
to_group = to_group.replace(’-’, 0)
state_enrollment_grp = to_group.groupby(‘State’)
df = state_enrollment_grp.agg(sum).reset_index()
df = df.transpose()
# columns as state
df.columns = df.ix[0,:].values
df = df.drop([‘State’])
# add time series index
df.index = [pd.Period(x, freq = ‘A’) for x in range(1990,2015)]
df.head()
Timeseries Model: ARIMA - Details on ARIMA
Error Metrics: MAPE - Details on MAPE
def produce_forecast(hold_out_n, inog, order, exog = None):
”“”
CV module
input:
hold_out_n (hold out n years from training for cross validation)
inog: endogenous variable (the enrollment number to forecast)
exog: exogenous variable (other variable to forecast, if any)
order: (p, d, q) parameter of ARIMA model
output:
print (average MAPE across all states)
mape by state
“””
<span class="k">def</span> <span class="nf">forecast_state</span><span class="p">(</span><span class="n">inog</span><span class="p">,</span> <span class="n">exog</span><span class="p">,</span> <span class="n">order</span><span class="p">,</span> <span class="n">hold_out_n</span><span class="p">):</span>
<span class="c"># prep exog</span>
<span class="k">if</span> <span class="n">exog</span> <span class="ow">is</span> <span class="ow">not</span> <span class="k">None</span><span class="p">:</span>
<span class="n">train_exog</span> <span class="o">=</span> <span class="n">exog</span><span class="p">[:</span><span class="o">-</span><span class="n">hold_out_n</span><span class="p">]</span>
<span class="n">hold_out_exog</span> <span class="o">=</span> <span class="n">exog</span><span class="p">[</span><span class="o">-</span><span class="n">hold_out_n</span><span class="p">:]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">train_exog</span> <span class="o">=</span> <span class="n">hold_out_exog</span> <span class="o">=</span> <span class="k">None</span>
<span class="c"># forecast wrapper</span>
<span class="n">m</span> <span class="o">=</span> <span class="n">sm</span><span class="o">.</span><span class="n">tsa</span><span class="o">.</span><span class="n">ARIMA</span><span class="p">(</span><span class="n">inog</span><span class="p">,</span> <span class="n">order</span><span class="o">=</span><span class="n">order</span><span class="p">,</span> <span class="n">exog</span><span class="o">=</span><span class="n">train_exog</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">m</span><span class="o">.</span><span class="n">fit</span><span class="p">()</span>
<span class="k">return</span> <span class="n">model</span><span class="o">.</span><span class="n">forecast</span><span class="p">(</span><span class="n">hold_out_n</span><span class="p">,</span> <span class="n">exog</span> <span class="o">=</span> <span class="n">hold_out_exog</span><span class="p">,</span> <span class="n">alpha</span> <span class="o">=</span> <span class="o">.</span><span class="mi">95</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">forecast_y</span> <span class="o">=</span> <span class="p">[]</span>
<span class="c"># data needed</span>
<span class="n">Y</span> <span class="o">=</span> <span class="n">inog</span><span class="o">.</span><span class="n">ix</span><span class="p">[:</span><span class="o">-</span><span class="n">hold_out_n</span><span class="p">,]</span>
<span class="n">yTrue</span> <span class="o">=</span> <span class="n">inog</span><span class="o">.</span><span class="n">ix</span><span class="p">[</span><span class="o">-</span><span class="n">hold_out_n</span><span class="p">:,]</span>
<span class="k">if</span> <span class="n">exog</span> <span class="ow">is</span> <span class="ow">not</span> <span class="k">None</span><span class="p">:</span>
<span class="n">X</span> <span class="o">=</span> <span class="n">exog</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">X</span> <span class="o">=</span> <span class="k">None</span>
<span class="k">for</span> <span class="n">state</span> <span class="ow">in</span> <span class="n">Y</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span>
<span class="n">stateY</span> <span class="o">=</span> <span class="n">Y</span><span class="p">[</span><span class="n">state</span><span class="p">]</span>
<span class="n">yHat</span> <span class="o">=</span> <span class="n">forecast_state</span><span class="p">(</span><span class="n">stateY</span><span class="o">.</span><span class="n">values</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s">'float64'</span><span class="p">),</span> <span class="n">X</span><span class="p">,</span> <span class="n">order</span><span class="p">,</span> <span class="n">hold_out_n</span><span class="p">)</span>
<span class="n">forecast_y</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">yHat</span><span class="p">)</span>
<span class="n">test</span> <span class="o">=</span> <span class="n">yTrue</span>
<span class="n">yHat</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">forecast_y</span><span class="p">)</span><span class="o">.</span><span class="n">transpose</span><span class="p">()</span>
<span class="n">error_matrix</span> <span class="o">=</span> <span class="nb">abs</span><span class="p">(</span><span class="n">yTrue</span><span class="o">-</span><span class="n">yHat</span><span class="p">)</span><span class="o">/</span><span class="n">yHat</span>
<span class="nb">print</span><span class="p">(</span><span class="s">'MAPE for ARIMA %i, %i, %i'</span> <span class="o">%</span> <span class="n">order</span><span class="p">)</span>
<span class="nb">print</span><span class="p">((</span><span class="n">np</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">error_matrix</span><span class="p">,</span> <span class="n">axis</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span><span class="o">/</span><span class="n">Y</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]))</span>
<span class="n">plt</span><span class="o">.</span><span class="n">figure</span><span class="p">(</span><span class="n">figsize</span> <span class="o">=</span> <span class="p">(</span><span class="mi">15</span><span class="p">,</span><span class="mi">3</span><span class="p">))</span>
<span class="n">plt</span><span class="o">.</span><span class="n">title</span><span class="p">(</span><span class="s">'mape'</span><span class="p">);</span>
<span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">error_matrix</span><span class="o">.</span><span class="n">ix</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="s">'-o'</span><span class="p">);</span>
<span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">error_matrix</span><span class="o">.</span><span class="n">ix</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="s">'-o'</span><span class="p">);</span>
<span class="n">plt</span><span class="o">.</span><span class="n">xticks</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">Y</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]),</span> <span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">,</span> <span class="n">rotation</span> <span class="o">=</span> <span class="s">'vertical'</span><span class="p">);</span>
<span class="n">plt</span><span class="o">.</span><span class="n">legend</span><span class="p">(</span><span class="n">error_matrix</span><span class="o">.</span><span class="n">index</span><span class="p">)</span>
<span class="k">return</span> <span class="n">error_matrix</span>
error_100 = produce_forecast(2, df, (1,0,0), exog = None)
Looks good!
# load umemployment data
umemployment = pd.read_excel(‘DATA_SOURCE\ENROLLMENT_FORECAST_DS.xlsx’, sheetname=‘Unemployment Rate’)
umemployment.index = [pd.Period(x, freq = ‘A’) for x in umemployment.Year]
umemployment.drop([‘Year’], axis = 1, inplace=True)
# umemployment lag 1
error_100_ue1 = produce_forecast(2, df, (1,0,0), exog = umemployment.ix[33:-1,0])
# umemployment lag 0 + lag 1
error_100_ue01 = produce_forecast(2, df, (1,0,0), exog = np.vstack([umemployment.ix[33:-1,0], umemployment.ix[34:,0]]).transpose())
Slight improvement.
# load disposable income data
dp = pd.read_excel(‘DATA_SOURCE\ENROLLMENT_FORECAST_DS.xlsx’, sheetname=‘DISPOSABLE_INCOME’)
dp.columns = [‘Year’, ‘DSPI’]
dpg = dp.groupby(‘Year’)
dp = dpg.mean().reset_index()
dp.index = [pd.Period(x, freq = ‘A’) for x in dp.Year]
dp.drop([‘Year’], axis = 1, inplace=True)
# umemployment lag 0 + lag 1 + lag 5 disposable income
# not helping, but lag 5 has the lowest avg mape
exog = np.vstack([umemployment.ix[33:-1,0], umemployment.ix[34:,0], dp.ix[27:-5,0]]).transpose()
error_100_ue01 = produce_forecast(2, df, (1,0,0), exog = exog)
# 2015⁄6’s umpmployment rate forecast from fed = [5.0, 4.8]
# http://www.federalreserve.gov/monetarypolicy/files/fomcprojtabl20150917.pdf
umemployment.ix[pd.Period(2015, freq = ‘A’)] = 5.0
umemployment.ix[pd.Period(2016, freq = ‘A’)] = 4.8
exog = np.vstack([umemployment.ix[34:,0], umemployment.ix[33:-1,0]]).transpose()
order = (1,0,0)
forecast_y = []
for state in df.columns:
stateY = df[state]
# yHat = forecast_state(stateY.values.astype(‘float64’), X, order, hold_out_n)
m = sm.tsa.ARIMA(stateY.values.astype(‘float64’), order=order, exog=exog[:-2])
model = m.fit()
yHat = model.forecast(2, exog = exog[-2:], alpha = .95)[0]
forecast_y.append(yHat)
output = pd.DataFrame(forecast_y)
output = output.transpose()
output.columns = df.columns
output.index = [pd.Period(2015, freq = ‘A’), pd.Period(2016, freq = ‘A’)]
pd.concat([df, output]).transpose().head()
# pd.concat([df, output]).transpose().to_csv(‘output.csv’)