Draw with US College Data

Showcase on what you can do with IPEDS data API. Choropleth with d3. I’ve also tried this on beaker notebook. More details to come!


Now, here are a few cool things you can see with this data

In [4]:
HTML(map_css() + """

""")
Out[4]:

In [2]:
# init
# import requirement
import pandas as pd
import numpy as np
from IPython.display import Javascript, HTML

# visual
import mpld3
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
mpld3.enable_notebook()

# import dependency
from dep import *
api_key = get_api_key()
In [5]:
# pull data
# save data to local hdf
# pull([2008,2009,2010,2011,2012,2013])
# pull([2013], api_key)

# read data from local hdf
yr_2013 = pd.read_hdf('d.h', 'yr_2013')
geo = pd.read_hdf('d.h', 'geo')

# columns
HTML(pd.DataFrame(yr_2013.columns).to_html())
Out[5]:
0
02013.academics.program_percentage.agriculture
12013.academics.program_percentage.architecture
22013.academics.program_percentage.biological
32013.academics.program_percentage.business_mar...
42013.academics.program_percentage.communication
52013.academics.program_percentage.communicatio...
62013.academics.program_percentage.computer
72013.academics.program_percentage.construction
82013.academics.program_percentage.education
92013.academics.program_percentage.engineering
102013.academics.program_percentage.engineering_...
112013.academics.program_percentage.english
122013.academics.program_percentage.ethnic_cultu...
132013.academics.program_percentage.family_consu...
142013.academics.program_percentage.health
152013.academics.program_percentage.history
162013.academics.program_percentage.humanities
172013.academics.program_percentage.language
182013.academics.program_percentage.legal
192013.academics.program_percentage.library
202013.academics.program_percentage.mathematics
212013.academics.program_percentage.mechanic_rep...
222013.academics.program_percentage.military
232013.academics.program_percentage.multidiscipline
242013.academics.program_percentage.parks_recrea...
252013.academics.program_percentage.personal_cul...
262013.academics.program_percentage.philosophy_r...
272013.academics.program_percentage.physical_sci...
282013.academics.program_percentage.precision_pr...
292013.academics.program_percentage.psychology
302013.academics.program_percentage.public_admin...
312013.academics.program_percentage.resources
322013.academics.program_percentage.science_tech...
332013.academics.program_percentage.security_law...
342013.academics.program_percentage.social_science
352013.academics.program_percentage.theology_rel...
362013.academics.program_percentage.transportation
372013.academics.program_percentage.visual_perfo...
382013.admissions.admission_rate.by_ope_id
392013.admissions.admission_rate.overall
402013.admissions.sat_scores.average.by_ope_id
412013.admissions.sat_scores.average.overall
422013.completion.completion_rate_4yr_150_asian
432013.completion.completion_rate_4yr_150_black
442013.completion.completion_rate_4yr_150_hispanic
452013.completion.completion_rate_4yr_150_white
462013.completion.completion_rate_4yr_150nt
472013.completion.completion_rate_4yr_150nt_pooled
482013.completion.completion_rate_less_than_4yr_...
492013.completion.completion_rate_less_than_4yr_...
502013.cost.attendance.academic_year
512013.cost.attendance.program_year
522013.cost.avg_net_price.private
532013.cost.avg_net_price.public
542013.cost.tuition.in_state
552013.cost.tuition.out_of_state
562013.earnings.10_yrs_after_entry.median
572013.earnings.6_yrs_after_entry.median
582013.earnings.8_yrs_after_entry.median_earnings
592013.student.demographics.avg_family_income
602013.student.demographics.avg_family_income_in...
612013.student.demographics.avg_family_income_in...
622013.student.demographics.avg_family_income_log
632013.student.demographics.first_generation
642013.student.demographics.median_family_income
652013.student.demographics.race_ethnicity.asian
662013.student.demographics.race_ethnicity.black
672013.student.demographics.race_ethnicity.hispanic
682013.student.demographics.race_ethnicity.non_r...
692013.student.demographics.race_ethnicity.white
702013.student.demographics.share_asian.home_ZIP
712013.student.demographics.share_bachelors_degr...
722013.student.demographics.share_black.home_ZIP
732013.student.demographics.share_born_US.home_ZIP
742013.student.demographics.share_hispanic.home_ZIP
752013.student.demographics.share_professional_d...
762013.student.demographics.share_white.home_ZIP
772013.student.size
78id
79school.carnegie_basic
80school.carnegie_size_setting
81school.carnegie_undergrad
82school.degrees_awarded.predominant
83school.locale
84school.name
85school.online_only
86school.operating
87school.ownership
88school.zip
In [6]:
# select query
d_2013 = yr_2013[[  'id',
                    'school.name', 'school.operating', 'school.locale', 'school.ownership','school.zip',
                    '2013.student.size',
                    '2013.cost.avg_net_price.private', '2013.cost.avg_net_price.public',
                    '2013.admissions.admission_rate.by_ope_id',
                    '2013.admissions.admission_rate.overall',
                    '2013.admissions.sat_scores.average.by_ope_id',
                    '2013.admissions.sat_scores.average.overall',
                    '2013.cost.attendance.academic_year',
                    '2013.cost.attendance.program_year',
                    '2013.cost.avg_net_price.public',
                    '2013.cost.tuition.in_state',
                    '2013.cost.tuition.out_of_state',
                    '2013.completion.completion_rate_4yr_150nt',
                    '2013.completion.completion_rate_4yr_150nt_pooled',
                    '2013.completion.completion_rate_less_than_4yr_150nt',
                    '2013.completion.completion_rate_less_than_4yr_150nt_pooled',
                    '2013.completion.completion_rate_4yr_150_asian',
                    '2013.completion.completion_rate_4yr_150_black',
                    '2013.completion.completion_rate_4yr_150_hispanic',
                    '2013.completion.completion_rate_4yr_150_white',
                    '2013.student.demographics.race_ethnicity.asian',
                    '2013.student.demographics.race_ethnicity.black',
                    '2013.student.demographics.race_ethnicity.hispanic',
                    '2013.student.demographics.race_ethnicity.non_resident_alien',
                    '2013.student.demographics.race_ethnicity.white',
                    '2013.earnings.10_yrs_after_entry.median',
                    '2013.earnings.6_yrs_after_entry.median',
                    '2013.earnings.8_yrs_after_entry.median_earnings'
                 ]]

# just looking at 'operating school'
d_2013 = d_2013[d_2013['school.operating'] == 1]

# show # null value of each field
HTML(pd.DataFrame(np.sum(d_2013.isnull(), axis=0)/d_2013.shape[0]).to_html())
Out[6]:
0
id0.000000
school.name0.000000
school.operating0.000000
school.locale0.051587
school.ownership0.000000
school.zip0.000000
2013.student.size0.088944
2013.cost.avg_net_price.private0.399015
2013.cost.avg_net_price.public0.739737
2013.admissions.admission_rate.by_ope_id0.669267
2013.admissions.admission_rate.overall0.703202
2013.admissions.sat_scores.average.by_ope_id0.792282
2013.admissions.sat_scores.average.overall0.806924
2013.cost.attendance.academic_year0.456076
2013.cost.attendance.program_year0.682403
2013.cost.avg_net_price.public0.739737
2013.cost.tuition.in_state0.419540
2013.cost.tuition.out_of_state0.446360
2013.completion.completion_rate_4yr_150nt0.674330
2013.completion.completion_rate_4yr_150nt_pooled0.671319
2013.completion.completion_rate_less_than_4yr_150nt0.498495
2013.completion.completion_rate_less_than_4yr_150nt_pooled0.492200
2013.completion.completion_rate_4yr_150_asian0.747126
2013.completion.completion_rate_4yr_150_black0.710044
2013.completion.completion_rate_4yr_150_hispanic0.709770
2013.completion.completion_rate_4yr_150_white0.687329
2013.student.demographics.race_ethnicity.asian0.088944
2013.student.demographics.race_ethnicity.black0.088944
2013.student.demographics.race_ethnicity.hispanic0.088944
2013.student.demographics.race_ethnicity.non_resident_alien0.088944
2013.student.demographics.race_ethnicity.white0.088944
2013.earnings.10_yrs_after_entry.median1.000000
2013.earnings.6_yrs_after_entry.median1.000000
2013.earnings.8_yrs_after_entry.median_earnings1.000000
In [7]:
# take a look at my school
# print(d_2013[d_2013['school.name'].str.contains('Syracuse University', na=False)].transpose())
HTML(d_2013[d_2013['school.name'].str.contains('Georgia Institute of Technology', na=False)].transpose().to_html())
Out[7]:
12
id139755
school.nameGeorgia Institute of Technology-Main Campus
school.operating1
school.locale11
school.ownership1
school.zip30332
2013.student.size13975
2013.cost.avg_net_price.privateNaN
2013.cost.avg_net_price.public11053
2013.admissions.admission_rate.by_ope_id0.5493
2013.admissions.admission_rate.overall0.5493
2013.admissions.sat_scores.average.by_ope_id1352
2013.admissions.sat_scores.average.overall1352
2013.cost.attendance.academic_year21980
2013.cost.attendance.program_yearNaN
2013.cost.avg_net_price.public11053
2013.cost.tuition.in_state10650
2013.cost.tuition.out_of_state29954
2013.completion.completion_rate_4yr_150nt0.8219
2013.completion.completion_rate_4yr_150nt_pooled0.806982
2013.completion.completion_rate_less_than_4yr_150ntNaN
2013.completion.completion_rate_less_than_4yr_150nt_pooledNaN
2013.completion.completion_rate_4yr_150_asian0.8552
2013.completion.completion_rate_4yr_150_black0.7521
2013.completion.completion_rate_4yr_150_hispanic0.7328
2013.completion.completion_rate_4yr_150_white0.8273
2013.student.demographics.race_ethnicity.asian0.175
2013.student.demographics.race_ethnicity.black0.0628
2013.student.demographics.race_ethnicity.hispanic0.065
2013.student.demographics.race_ethnicity.non_resident_alien0.1048
2013.student.demographics.race_ethnicity.white0.553
2013.earnings.10_yrs_after_entry.medianNone
2013.earnings.6_yrs_after_entry.medianNone
2013.earnings.8_yrs_after_entry.median_earningsNone

Geo Visualization on Student Demographics

In [9]:
Javascript(d3_choropleth_generator(d_2013, geo, '2013.student.demographics.race_ethnicity.white', np.mean, '2013 White Student Ratio', 'white_student'))
Javascript(d3_choropleth_generator(d_2013, geo, '2013.completion.completion_rate_4yr_150nt', np.mean, '2013 Mean Completion Rate', 'completion_rate'))
Out[9]:

Now, it would be interesting to see overall completion rate can be correlated with student ratio by different races

In [10]:
f, ax_i = plt.subplots(2,2, figsize=(8,8))

x = d_2013['2013.completion.completion_rate_4yr_150nt']

def draw(ax, x, y):
    sns.regplot(x=x, y=d_2013['2013.student.demographics.race_ethnicity.%s'%y], ax=ax)
    ax.set_xlim(0,1); ax.set_ylim(0,1);
    ax.set_title(y);
    ax.set_xlabel('Student Completion Rate');
    ax.set_ylabel('Student Ratio');

axes = [ax_i[0][0], ax_i[0][1], ax_i[1][0], ax_i[1][1]]
races = ['white', 'asian', 'black', 'hispanic']

for i in range(4):
    draw(axes[i], x, races[i])

f.tight_layout();

Python module: https://github.com/l1990790120/l1990790120.github.io/blob/master/_includes/nb/Draw_with_US_College_Data_dep.py

comments powered by Disqus