import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Load the dataset and print its first row to see every column with a sample value.
recent_grads = pd.read_csv('recent-grads.csv')
first_row = recent_grads.iloc[0]
print(first_row)

Rank                                        1
Major_code                               2419
Major                   PETROLEUM ENGINEERING
Total                                    2339
Men                                      2057
Women                                     282
Major_category                    Engineering
ShareWomen                           0.120564
Sample_size                                36
Employed                                 1976
Full_time                                1849
Part_time                                 270
Full_time_year_round                     1207
Unemployed                                 37
Unemployment_rate                   0.0183805
Median                                 110000
P25th                                   95000
P75th                                  125000
College_jobs                             1534
Non_college_jobs                          364
Low_wage_jobs                             193
Name: 0, dtype: object


# Preview the first five rows of the table.
preview = recent_grads.head(5)
print(preview)

   Rank  Major_code                                      Major    Total  \
0     1        2419                      PETROLEUM ENGINEERING   2339.0   
1     2        2416             MINING AND MINERAL ENGINEERING    756.0   
2     3        2415                  METALLURGICAL ENGINEERING    856.0   
3     4        2417  NAVAL ARCHITECTURE AND MARINE ENGINEERING   1258.0   
4     5        2405                       CHEMICAL ENGINEERING  32260.0   

       Men    Women Major_category  ShareWomen  Sample_size  Employed  \
0   2057.0    282.0    Engineering    0.120564           36      1976   
1    679.0     77.0    Engineering    0.101852            7       640   
2    725.0    131.0    Engineering    0.153037            3       648   
3   1123.0    135.0    Engineering    0.107313           16       758   
4  21239.0  11021.0    Engineering    0.341631          289     25694   

       ...        Part_time  Full_time_year_round  Unemployed  \
0      ...              270                  1207          37   
1      ...              170                   388          85   
2      ...              133                   340          16   
3      ...              150                   692          40   
4      ...             5180                 16697        1672   

   Unemployment_rate  Median  P25th   P75th  College_jobs  Non_college_jobs  \
0           0.018381  110000  95000  125000          1534               364   
1           0.117241   75000  55000   90000           350               257   
2           0.024096   73000  50000  105000           456               176   
3           0.050125   70000  43000   80000           529               102   
4           0.061098   65000  50000   75000         18314              4440   

   Low_wage_jobs  
0            193  
1             50  
2              0  
3              0  
4            972  

[5 rows x 21 columns]


# Summary statistics (count, mean, std, quartiles, extremes) for the numeric columns.
summary_stats = recent_grads.describe()
print(summary_stats)

             Rank   Major_code          Total            Men          Women  \
count  173.000000   173.000000     172.000000     172.000000     172.000000   
mean    87.000000  3879.815029   39370.081395   16723.406977   22646.674419   
std     50.084928  1687.753140   63483.491009   28122.433474   41057.330740   
min      1.000000  1100.000000     124.000000     119.000000       0.000000   
25%     44.000000  2403.000000    4549.750000    2177.500000    1778.250000   
50%     87.000000  3608.000000   15104.000000    5434.000000    8386.500000   
75%    130.000000  5503.000000   38909.750000   14631.000000   22553.750000   
max    173.000000  6403.000000  393735.000000  173809.000000  307087.000000   

       ShareWomen  Sample_size       Employed      Full_time      Part_time  \
count  172.000000   173.000000     173.000000     173.000000     173.000000   
mean     0.522223   356.080925   31192.763006   26029.306358    8832.398844   
std      0.231205   618.361022   50675.002241   42869.655092   14648.179473   
min      0.000000     2.000000       0.000000     111.000000       0.000000   
25%      0.336026    39.000000    3608.000000    3154.000000    1030.000000   
50%      0.534024   130.000000   11797.000000   10048.000000    3299.000000   
75%      0.703299   338.000000   31433.000000   25147.000000    9948.000000   
max      0.968954  4212.000000  307933.000000  251540.000000  115172.000000   

       Full_time_year_round    Unemployed  Unemployment_rate         Median  \
count            173.000000    173.000000         173.000000     173.000000   
mean           19694.427746   2416.329480           0.068191   40151.445087   
std            33160.941514   4112.803148           0.030331   11470.181802   
min              111.000000      0.000000           0.000000   22000.000000   
25%             2453.000000    304.000000           0.050306   33000.000000   
50%             7413.000000    893.000000           0.067961   36000.000000   
75%            16891.000000   2393.000000           0.087557   45000.000000   
max           199897.000000  28169.000000           0.177226  110000.000000   

              P25th          P75th   College_jobs  Non_college_jobs  \
count    173.000000     173.000000     173.000000        173.000000   
mean   29501.445087   51494.219653   12322.635838      13284.497110   
std     9166.005235   14906.279740   21299.868863      23789.655363   
min    18500.000000   22000.000000       0.000000          0.000000   
25%    24000.000000   42000.000000    1675.000000       1591.000000   
50%    27000.000000   47000.000000    4390.000000       4595.000000   
75%    33000.000000   60000.000000   14444.000000      11783.000000   
max    95000.000000  125000.000000  151643.000000     148395.000000   

       Low_wage_jobs  
count     173.000000  
mean     3859.017341  
std      6944.998579  
min         0.000000  
25%       340.000000  
50%      1231.000000  
75%      3466.000000  
max     48207.000000


# Drop every row that contains a missing value, then report the row count
# before and after so the amount of discarded data is visible.
raw_data_count = len(recent_grads)
recent_grads = recent_grads.dropna()
print(raw_data_count)
print(len(recent_grads))

173
172


# Histogram of Sample_size: 30 bins, restricted to values between 0 and 2500.
sample_sizes = recent_grads['Sample_size']
sample_sizes.hist(range=(0, 2500), bins=30)

<matplotlib.axes._subplots.AxesSubplot at 0x7fc602e0e278>


# Histogram of Median, restricted to the 15,000-80,000 range (default bin count).
salary_range = (15000, 80000)
recent_grads['Median'].hist(range=salary_range)

<matplotlib.axes._subplots.AxesSubplot at 0x7fc602d3c780>


# Histogram of the Men column, limited to rows whose ShareWomen is below 0.5.
mostly_male = recent_grads['ShareWomen'] < 0.5
recent_grads.loc[mostly_male, 'Men'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fc602db1518>


# Histogram of ShareWomen across all remaining rows.
share_women = recent_grads['ShareWomen']
share_women.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fc600b23710>


# Scatter plot: Sample_size on the x-axis against Median on the y-axis.
recent_grads.plot.scatter(x='Sample_size', y='Median')

<matplotlib.axes._subplots.AxesSubplot at 0x7fc600c0d898>


# Scatter plot: Full_time on the x-axis against Median on the y-axis.
recent_grads.plot.scatter(x='Full_time', y='Median')

<matplotlib.axes._subplots.AxesSubplot at 0x7fc600ace9b0>


# Two scatter plots of Median: one against Men, one against Women.
# (Disabled alternative: x='ShareWomen', y='Unemployment_rate'.)
recent_grads.plot.scatter(x='Men', y='Median')
recent_grads.plot.scatter(x='Women', y='Median')

<matplotlib.axes._subplots.AxesSubplot at 0x7fc600ac2ba8>


# Fine-grained histogram of Median (60 bins over 0-80,000) with a descriptive title.
median_salaries = recent_grads['Median']
med_hist = median_salaries.hist(range=(0, 80000), bins=60)
med_hist.set_title('Median salary distribution')

<matplotlib.text.Text at 0x7fc6008ffac8>


from pandas.plotting import scatter_matrix


# 2x2 scatter matrix of Sample_size and Median; the diagonal histograms use 20 bins.
size_vs_median = recent_grads[['Sample_size', 'Median']]
scatter_matrix(size_vs_median, figsize=(5, 5), hist_kwds={'bins': 20})

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fc6007eee80>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fc6007d7160>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fc60079f9b0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fc60075a828>]],
      dtype=object)


# 3x3 scatter matrix comparing Men, Women and Median with default settings.
gender_vs_median = recent_grads.loc[:, ['Men', 'Women', 'Median']]
scatter_matrix(gender_vs_median)

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fc6006684e0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fc600653e10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fc6006228d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fc6005e04a8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fc6005276d8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fc6004e3fd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fc6004b1cc0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fc600467e10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fc60043b198>]],
      dtype=object)


import numpy as np
# Bar charts of ShareWomen for the first ten and the last ten rows.
# DataFrame.plot requires `x` to be a column label or position, so the bars
# are labelled with the `Major` column instead of an array of row numbers.
ax1 = recent_grads[:10].plot.bar(x='Major', y='ShareWomen')
ax1.set_title('Percentage of Women from the first ten rows')

ax2 = recent_grads[-10:].plot.bar(x='Major', y='ShareWomen')
ax2.set_title('Percentage of Women from the last ten rows')

<matplotlib.text.Text at 0x7fc6002a04a8>


# Bar charts of Unemployment_rate for the first ten and the last ten rows.
# DataFrame.plot requires `x` to be a column label or position, so the bars
# are labelled with the `Major` column instead of an array of row numbers.
# The explicit set_xlabel calls replace the default axis label.
ax3 = recent_grads[:10].plot.bar(x='Major', y='Unemployment_rate')
ax3.set_title('Unemployment rate from the first ten rows')
ax3.set_xlabel('First 10 rows from the dataset')

ax4 = recent_grads[-10:].plot.bar(x='Major', y='Unemployment_rate')
ax4.set_title('Unemployment rate from the last ten rows')
ax4.set_xlabel('Last 10 rows from the dataset')

<matplotlib.text.Text at 0x7fc600147c88>


# Total every numeric column per major category.
# numeric_only=True keeps the text column `Major` out of the aggregation;
# without it, recent pandas versions concatenate the strings of each group.
# NOTE(review): sums of rank, code, rate and median columns are not meaningful
# totals; only the count columns (e.g. Men, Women) should be read from here.
final_df = (
    recent_grads
    .groupby('Major_category')
    .sum(numeric_only=True)
    .reset_index()
)
final_df.head()


# Grouped bar chart: Men and Women totals side by side for each major category.
gender_columns = ['Men', 'Women']
final_df.plot.bar(x='Major_category', y=gender_columns)

/dataquest/system/env/python3/lib/python3.4/site-packages/pandas/plotting/_core.py:1716: UserWarning:

Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access

<matplotlib.axes._subplots.AxesSubplot at 0x7fc6001cd320>


# Box plot showing the spread of Unemployment_rate.
recent_grads.boxplot(column='Unemployment_rate')

<matplotlib.axes._subplots.AxesSubplot at 0x7fc600003080>


# Box plot showing the spread of Median.
recent_grads.boxplot(column='Median')

<matplotlib.axes._subplots.AxesSubplot at 0x7fc5fff9a390>


# Hexagonal-bin density plot of Sample_size against Median, 10 hexagons across.
recent_grads.plot(kind='hexbin', x='Sample_size', y='Median', gridsize=10)

<matplotlib.axes._subplots.AxesSubplot at 0x7fc5fffcada0>


# Hexagonal-bin density plot of Full_time against Median, 10 hexagons across.
recent_grads.plot(kind='hexbin', x='Full_time', y='Median', gridsize=10)

<matplotlib.axes._subplots.AxesSubplot at 0x7fc5ffe4dc88>

	Major_category	Rank	Major_code	Total	Men	Women	ShareWomen	Sample_size	Employed	Full_time	Part_time	Full_time_year_round	Unemployed	Unemployment_rate	Median	P25th	P75th	College_jobs	Non_college_jobs	Low_wage_jobs
0	Agriculture & Natural Resources	993	10421	75620.0	40357.0	35263.0	3.647407	1068	63794	55585	15470	41891	3486	0.466352	316000	222000	410100	18677	33217	7414
1	Arts	1049	48121	357130.0	134390.0	222740.0	4.829264	3260	288114	207773	114791	153111	28228	0.721382	264500	175700	349300	94785	163720	60116
2	Biology & Life Science	1335	48662	453862.0	184919.0	268943.0	8.220700	2317	302797	240377	116736	165802	22854	0.852849	509900	372600	645200	151233	127182	42742
3	Business	726	80769	1302376.0	667852.0	634524.0	6.281573	15505	1088742	988870	196936	790425	79877	0.923826	566000	435000	713000	148538	496570	126788
4	Communications & Journalism	416	7610	392601.0	131921.0	260680.0	2.633536	4508	330660	273330	89817	214228	26852	0.302151	138000	105000	179900	86556	172992	49595

Visualizing earnings based on majors¶

What percent of majors are predominantly male? Predominantly female?¶

Do students in more popular majors make more money?¶

Is there any link between the number of full-time employees and median salary?¶

Do students that majored in subjects that were majority female make more money?¶

What's the most common median salary range?¶

Using scatter matrix plots to explore previous questions¶

Use bar plots to compare the percentages of women (ShareWomen) from the first ten rows and the last ten rows¶

Use bar plots to compare the unemployment rate (Unemployment_rate) from the first ten rows and the last ten rows.¶

Use a grouped bar plot to compare the number of men with the number of women in each category of majors¶

Use a box plot to explore the distributions of median salaries and unemployment rate.¶

Use a hexagonal bin plot to visualize the columns that had dense scatter plots from earlier in the project.¶

Conclusion¶