I’ve run a couple classification ML algorithm on the dataset. What makes this problem interesting is that most of the students did not pass the course. I’ve re-sampled the positive cases multiple times to make the algorithms punish the false positive cases more severely.
https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/26147
There's definition for each of the variables here.
Now, I simply want to understand, base on the attributes in this dataset, can I succesfully predict whether the student will pass/fail (certified == 1/0
) the course?
In machine learning - classification especially, a common pitfall is to benchmark classification algorithm by accuracy when dealing with unbalanced cases, even if your algorithm don't work, if your data is extremely unbalanced, you could still get 97% accuracy just by returning true to all cases (assuming 3% false cases). It's often advised to compare precision and recall (here, I look at ROC - tpr vs fpr) instead of accuracy. However, what's more complicated is how to improve the performance - Here, I am demonstrating one way of dealing with it - increase the probability (punishment) of drawing minor cases by repeating sampling.
This will be a fun exercise to explore different classification algorithms and do some visualization around.
# import requirement
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import *
%matplotlib inline
import seaborn as sns
# load data
df = pd.read_csv(‘data/HMXPC13_DI_v2_5-14-14.csv’)
print(df.shape)
df.head()
Whoo, not many missing values!
def quick_profile(df):
f, ax = plt.subplots(1,2, figsize=(10,3))
p = sns.barplot(y=np.sum(df.isnull(),axis=0)/df.shape[0], x= df.columns, ax = ax[0])
p.set_xticklabels(df.columns, rotation=90);
p.set_title(‘% of missing values by column’);
sns.barplot(y=np.bincount(df[‘certified’]), x=[0,1], ax = ax[1]);
ax[1].set_title(‘certified freq’);
quick_profile(df)
Ohoh, Less than 3% negative cases! This means, we need to boost negative cases if possible.
Now, I am not interested to classify those that does nothing beyond registration. By definition, that’s nevents
is not null
.
df = df[~df.nevents.isnull()]
quick_profile(df)
Discrete Variables
f, ax = plt.subplots(1,2, figsize=(10,4))
sns.countplot(data=df[[‘LoE_DI’, ‘certified’]], x = ‘LoE_DI’, hue = ‘certified’, ax = ax[0]);
ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=30);
sns.countplot(data=df[[‘gender’, ‘certified’]], x = ‘gender’, hue = ‘certified’, ax = ax[1]);
plt.figure(figsize=(12,4))
p = sns.countplot(data=df[[‘final_cc_cname_DI’, ‘certified’]], x = ‘final_cc_cname_DI’, hue = ‘certified’)
p.set_xticklabels(p.get_xticklabels(), rotation=90);
Numeric Variables
df.grade.replace(‘nan’, np.nan, inplace=True)
df.grade.replace(’ ‘, np.nan, inplace=True)
df.grade = df.grade.astype(‘float64’)
sns.pairplot(
data=df[[‘YoB’, ‘grade’, ‘certified’, ‘nevents’, ‘ndays_act’, ‘nplay_video’, ‘nchapters’, ‘nforum_posts’]].dropna(),
hue=“certified”)
As what we’ve seen, some data processing needs to take place
# drop nplay_videio
since 60% of rows are missing
num = df[[‘certified’, ‘YoB’, ‘nevents’, ‘ndays_act’, ‘nchapters’]]
quick_profile(num)
# dummy code nominal variables
def dict_lookup(df,f):
unique = df[f].unique()
lookup = {}
i = 0
for x in unique:
lookup[x] = i
i += 1
<span class="k">return</span> <span class="n">df</span><span class="p">[</span><span class="n">f</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">lookup</span><span class="p">[</span><span class="n">x</span><span class="p">]),</span> <span class="n">lookup</span>
lookups = {}
for x in [‘final_cc_cname_DI’, ‘LoE_DI’, ‘gender’]:
v, lookups[x] = dict_lookup(df, x)
df[‘%s_coded’%x] = v
data = df[[‘course_id’, ‘userid_DI’, ‘certified’, ‘YoB’, ‘nevents’, ‘ndays_act’, ‘nchapters’, ‘final_cc_cname_DI_coded’, ‘LoE_DI_coded’, ‘gender_coded’]]
# from the profiling, you might notice some poeple are born in the future ???
# clean those out
data.loc[data[‘YoB’] >= 2013, ‘YoB’] = 2013
# use most freq value to replace missing value
for x in [‘YoB’, ‘nevents’, ‘ndays_act’, ‘nchapters’]:
replace_val = (np.bincount(data[x].dropna().astype(‘int64’)).argmax())
data.loc[data[x].isnull(), x] = replace_val
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)
disc = enc.fit_transform(data[[‘final_cc_cname_DI_coded’, ‘LoE_DI_coded’, ‘gender_coded’]])
# stack onehotcoded + numeric values
num = data.drop([‘final_cc_cname_DI_coded’, ‘LoE_DI_coded’, ‘gender_coded’], axis=1)
d3 = np.hstack([num.values, disc])
# generate headers so we don’t lost track of the data transformation afterwards
new = True
for x in [‘final_cc_cname_DI’, ‘LoE_DI’, ‘gender’]:
t = []
for y in lookups[x].keys():
t.append([x, y,lookups[x][y]])
t2 = pd.DataFrame(t).sort_values(2)
if new:
headers = t2.copy()
new = False
else:
headers = pd.concat([headers, t2], axis = 0)
header = [‘course_id’, ‘userid_DI’, ‘certified’, ‘YoB’, ‘nevents’, ‘ndays_act’, ‘nchapters’]
header.extend(headers[1].values.tolist())
header
# best practive: create a hold out set
from sklearn.cross_validation import train_test_split
train, hold_out = train_test_split(d3, test_size=.3, random_state=0)
# balance the classification class in training set
# Repeated sample positive cases 24 times!
n = train[train[:,2] != 1]
y = train[train[:,2] == 1]
y2 = np.vstack([y for i in range(24)])
d4 = np.vstack([y2, n])
# take a look what I did
f, ax = plt.subplots(1,2, figsize=(8,4))
sns.barplot(y=np.bincount(d4[:,2].astype(‘int’)), x = [0,1], ax = ax[0]);
ax[0].set_title(‘train’);
sns.barplot(y = np.bincount(hold_out[:,2].astype(‘int’)), x = [0,1], ax = ax[1]);
ax[1].set_title(‘hold_out’);
Now that we’ve seen enough, let’s try to train the model.
# create train/test within training set
train_tr, train_x = train_test_split(train, test_size=.3, random_state=0)
Decision Tree
# first, let’s try a decision tree
from sklearn.tree import DecisionTreeClassifier
# let’s draw a training curve and see if this works
clss = dict()
test_err = []
train_err = []
for x in tqdm(range(5,50,5)):
clss[x] = DecisionTreeClassifier(max_depth=x)
clss[x].fit(train_tr[:,3:], train_tr[:,2].astype(‘int64’))
test_err.append(1-clss[x].score(train_x[:, 3:], train_x[:, 2].astype(‘int’)))
train_err.append(1-clss[x].score(train_tr[:,3:], train_tr[:,2].astype(‘int’)))
def training_curve(test, train, param, ax):
ax.plot(param, test_err, ’-o’);
ax.plot(param, train_err, ’-o’);
ax.legend([‘test error’, ‘train error’]);
ax.set_title(‘error curve’);
def roc(model, true, data, ax):
from sklearn.metrics import roccurve
fpr, tpr, = roc_curve(true, model.predict(data));
ax.plot(fpr, tpr, ’-o’);
ax.set_xlabel(‘fpr’);
ax.set_ylabel(‘tpr’);
ax.set_title(‘hold_out roc curve’);
f, ax = plt.subplots(1,2, figsize=(10,4))
training_curve(test_err, train_err, range(5,50,5), ax[0])
roc(clss[10], hold_out[:,2], hold_out[:,3:], ax[1])
Random Forest
# it does an okay job on hold_out, so let’s see how to improvee this even more!
# unfortunately, there’s no existing prune module in sklearn
# let’s try DT’s ensemble version -> RF
from sklearn.ensemble import RandomForestClassifier
# clss2 = dict()
test_err = []
train_err = []
for x in tqdm(range(5,50,5)):
clss2[x] = RandomForestClassifier(max_depth=x)
clss2[x].fit(train_tr[:,3:], train_tr[:,2].astype(‘int64’))
test_err.append(1-clss2[x].score(train_x[:, 3:], train_x[:, 2].astype(‘int’)))
train_err.append(1-clss2[x].score(train_tr[:,3:], train_tr[:,2].astype(‘int’)))
f, ax = plt.subplots(1,2, figsize=(10,4))
training_curve(test_err, train_err, range(5,50,5), ax[0])
roc(clss2[15], hold_out[:,2], hold_out[:,3:], ax[1])
SVM
# alright, not really helping
# let’s try SVM
from sklearn.svm import LinearSVC
clss3 = dict()
test_err = []
train_err = []
for c in tqdm(range(80,120,5)):
clss3[c] = LinearSVC(C=(c*.01))
clss3[c].fit(train_tr[:,3:], train_tr[:,2].astype(‘int64’))
test_err.append(1-clss3[c].score(train_x[:, 3:], train_x[:, 2].astype(‘int’)))
train_err.append(1-clss3[c].score(train_tr[:,3:], train_tr[:,2].astype(‘int’)))
f, ax = plt.subplots(1,2, figsize=(10,4))
training_curve(test_err, train_err, range(80,120,5), ax[0])
roc(clss3[115], hold_out[:,2], hold_out[:,3:], ax[1])