Using ARIMA to forecast college enrollment for 2015 and 2016 at institution level.
NCES publishes its school enrollment data every year here: https://nces.ed.gov/ipeds/datacenter/
However, the latest you data can find is always 1 year behind, in order to get the latest estimate, below shows my implementation using ARIMA to forecast current and next year's enrollment number.
NCES has forecast on national level and few demographic attribute, however, there's no state level forecast.
This is a follow up from the state level forecast. Now we wanted to see if there's a way to forecast institutional level enrollment.
# load package
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from statsmodels import api as sm
import numpy as np
import pickle
# load mpld3 for drawing
import mpld3
mpld3.enable_notebook()
%autoreload 1
%aimport vaildation_mod
# turn off warning
import warnings
warnings.filterwarnings('ignore')
# Load the institution info
intinfo = pd.read_excel('DATA_SOURCE\ENROLLMENT_FORECAST_DS.xlsx', sheetname='INST_CHAR')
lab = pd.read_excel('DATA_SOURCE\ENROLLMENT_FORECAST_DS.xlsx', sheetname='INST_CHAR_COLUMNS')
# Load Data
enrollment = pd.read_excel('DATA_SOURCE\ENROLLMENT_FORECAST_DS.xlsx', sheetname='EnrollmentByInstitution')
inst_state = pd.read_excel('DATA_SOURCE\ENROLLMENT_FORECAST_DS.xlsx', sheetname='Institution')
state_level_forecast = pd.read_excel('DATA_SOURCE\ENROLLMENT_FORECAST_DS.xlsx', sheetname='STATE_FORECAST_1516')
inst_state.columns = ['UnitId', 'Name', 'City', 'State']
enrollment = enrollment.merge(inst_state.drop('Name', axis = 1))
enrollment = enrollment.replace('-', 0)
schools = enrollment[['UnitId', 'Institution Name']]
# overall trend
plt.plot(np.sum(enrollment.iloc[:,2:-2]).values, ’-o’);
# total school enrollment
er1 = enrollment.copy()
er1[‘total_school_enrollment’] = np.sum(er1.drop([‘UnitId’, ‘Institution Name’, ‘City’, ‘State’], axis = 1), axis = 1)
# count per state
state_sum = pd.DataFrame(np.sum(state_level_forecast.drop([2015,2016], axis = 1), axis = 1))
state_sum.columns = [‘state_sum’]
state_sum[‘State’] = state_sum.index
er1 = er1.merge(state_sum)
# calculate wis as defined above
er1[‘wis’] = er1.total_school_enrollment/er1.state_sum
# preview
er1.head()
# MAPE with real 13, 14 enrollment
# Note: more logical step would be forecasting them and validate them, but here’s just for a quick check
# calculate forecast by real data
def times_wis(x, yr):
return state_level_forecast.ix[x.State, yr] * x.wis
er1[‘fall_2013_w’] = er1.apply(lambda x: times_wis(x, 2013), axis = 1)
er1[‘fall_2014_w’] = er1.apply(lambda x: times_wis(x, 2014), axis = 1)
# calculate mape
print(‘MAPE of 2013’, sum(er1[‘fall_2013_w’]-er1[‘Fall 2013’])/er1.shape[0])
print(‘MAPE of 2014’, sum(er1[‘fall_2014_w’]-er1[‘Fall 2014’])/er1.shape[0])
Ok… so probably this is not the best approach.
It’s possible to do more processing and get different result, but we shall see which makes more sense.
# prepare a matrix to cluster
er2 = enrollment.copy()
er2 = er2.transpose()
er2.columns = er2.ix[‘UnitId’]
er2.drop([‘UnitId’, ‘Institution Name’, ‘City’, ‘State’], inplace = True)
er2.index = [pd.Period(x) for x in range(1990,2015)]
er2.tail(5)
def how_many_zeros(x):
return sum(x == 0)
# Filter out those with less than 100 enrollment in the past
er2 = er2.ix[:, np.sum(er2, axis = 0) > 100]
# Filter out those with no data in the past 5 years
er2 = er2.ix[:,er2.tail(3).apply(lambda x: how_many_zeros(x))==0]
# filtered 7682-6970 = 1124 rows
er2.shape
# validation module pipline the process and paramatize important variables.
# demo
demo = vaildation_mod.predict_by_cluster(er2)
demo.draw_cluster_error()
# see what did scaling do
demo.draw_trend(0,1)
Notice that relatively, there’s no change to the trend, just that now it’s possible to compare different trend.
# see what cluster does
demo.draw_cluster_trend(15, 10, schools)
The last n period trend are similar
# change clustering algo (kmeans) with large n?
# conceptually, it should be more predictive, but it’s not
# worse than GMM but similar to kmeans on n_clusters = default
p_km = vaildation_mod.predict_by_cluster(er2, cluster_method = ‘kmeans’)
# different base?
p_n = vaildation_mod.predict_by_cluster(er2, rebase_to_n = 3, cluster_n_period = 20)
p_n.draw_cluster_error()
p_n.draw_cluster_trend(26, 15, schools)
# add umemployment
umemployment = pd.read_excel(‘DATA_SOURCE\ENROLLMENT_FORECAST_DS.xlsx’, sheetname=‘Unemployment Rate’)
umemployment.index = [pd.Period(x) for x in umemployment.Year]
umemployment.drop([‘Year’], axis = 1, inplace = True)
umemployment.columns = [‘umemployment_rate’]
print(‘umemployment lag0’)
exog = umemployment[34:].copy()
p_um0 = vaildation_mod.predict_by_cluster(er2, exog = exog, rebase_to_n = 3, cluster_n_period = 20)
print(‘umemployment lag0 + lag1’)
exog[‘umemployment_rate_lag1’] = umemployment[33:-1].values
p_um01 = vaildation_mod.predict_by_cluster(er2, exog = exog, rebase_to_n = 3, cluster_n_period = 20)
dp = pd.read_excel(‘DATA_SOURCE\ENROLLMENT_FORECAST_DS.xlsx’, sheetname=‘DISPOSABLE_INCOME’)
dp.columns = [‘Year’, ‘DSPI’]
dpg = dp.groupby(‘Year’)
dp = dpg.mean().reset_index()
dp.index = [pd.Period(x, freq = ‘A’) for x in dp.Year]
dp.drop([‘Year’], axis = 1, inplace=True)
print(‘umemployment lag0 + lag1 + dspi lag 5’)
exog = umemployment[34:].copy()
exog[‘umemployment_rate_lag1’] = umemployment[33:-1].values
exog[‘dspi_lag5’] = dp[26:-6].values
p_um0_ds5 = vaildation_mod.predict_by_cluster(er2, exog = exog, rebase_to_n = 3, cluster_n_period = 20)
print (‘umemployment lag0 + lag1 + dspi lag 5 + dspi lag 4’)
exog[‘dspi_lag4’] = dp[27:-5].values
p_um0_ds45 = vaildation_mod.predict_by_cluster(er2, exog = exog, rebase_to_n = 3, cluster_n_period = 20)
exog = umemployment[34:].copy()
exog[‘umemployment_rate_lag1’] = umemployment[33:-1].values
p = vaildation_mod.predict_by_cluster(er2, exog = None, rebase_to_n = 3, cluster_n_period = 20, hold_out_n=2)
# pickle it, takes a while to run
pickle.dump(p, open(‘inst_model.p’, ‘wb’))
# p = pickle.load(open(‘inst_model.p’, ‘rb’))
cluster = pd.DataFrame([p.ds.columns, p.ds_c.tolist()])
cluster = cluster.transpose()
cluster.columns = [‘UnitID’, ‘cluster’]
ds = cluster.merge(intinfo, how = ‘left’)
Where are the errors?
p.draw_cluster_error()
p.draw_cluster_trend(10, 10, schools)
p.look_at_institute(19, inst_name = schools)
# add two more years umemployment projection from FED
umemployment.loc[pd.Period(2015)] = 5.0
umemployment.loc[pd.Period(2016)] = 4.8
# umemployment.tail(2)
# produce the forecast
exog = umemployment[34:].copy()
exog[‘umemployment_rate_lag1’] = umemployment[33:-1].values
p = vaildation_mod.predict_by_cluster(er2, exog = exog, rebase_to_n = 3, cluster_n_period = 15, hold_out_n = 0)
forecast = p.get_forecast()
forecast.to_csv(‘institution_forecast.csv’)
This section will used some institution info such as age, gender, institution type (occupational/academic etc.) downloaded from NCES and try to make sense of cluster (or not?).
# variables to profile
for i in range(len(ds.columns)):
print(i, ds.columns[i])
vaildation_mod.cluster_profiling(ds, lab, ds.columns[9])
vaildation_mod.cluster_profiling(ds, lab, ds.columns[25], ds.columns[35], ds.columns[29], ds.columns[26])
p.draw_cluster_trend(21, 10, schools)
# school lookup
p.ds_c[np.where(p.ds.columns == schools[schools[‘Institution Name’] == “Princeton University”][‘UnitId’].values[0])]
Python module: https://github.com/l1990790120/l1990790120.github.io/blob/master/_includes/nb/COLLEGE_ENROLLMENT_FORECAST_INST_LEVEL_vaildation_mod.py
comments powered by Disqus