In this project, we will be analyzing data on gun deaths in the US. The dataset came from FiveThirtyEight, and can be found here.
The dataset is stored in the guns.csv file. It contains information on gun deaths in the US from 2012 to 2014. Each row in the dataset represents a single fatality. The columns contain demographic and other information about the victim. Here are the first few rows of the dataset:
year | month | intent | police | sex | age | race | hispanic | place | education | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 2012 | 1 | Suicide | 0 | M | 34.0 | Asian/Pacific Islander | 100 | Home | 4.0 |
2 | 2012 | 1 | Suicide | 0 | M | 34.0 | White | 100 | Street | 3.0 |
3 | 2012 | 1 | Suicide | 0 | M | 60.0 | White | 100 | Other Specied | 4.0 |
4 | 2012 | 2 | Suicide | 0 | M | 64.0 | White | 100 | Home | 4.0 |
5 | 2012 | 2 | Suicide | 0 | M | 31.0 | White | 100 | Other Specified | 2.0 |
import csv
# Load every row of guns.csv (header row included) into a list of lists.
# The context manager closes the file handle as soon as the rows are read;
# newline='' is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly.
with open('guns.csv', 'r', newline='') as f:
    csvreader = csv.reader(f)
    data = list(csvreader)

# Peek at the header plus the first four records.
print(data[:5])
[['', 'year', 'month', 'intent', 'police', 'sex', 'age', 'race', 'hispanic', 'place', 'education'], ['1', '2012', '01', 'Suicide', '0', 'M', '34', 'Asian/Pacific Islander', '100', 'Home', '4'], ['2', '2012', '01', 'Suicide', '0', 'F', '21', 'White', '100', 'Street', '3'], ['3', '2012', '01', 'Suicide', '0', 'M', '60', 'White', '100', 'Other specified', '4'], ['4', '2012', '02', 'Suicide', '0', 'M', '64', 'White', '100', 'Home', '4']]
# Separate the column names (first row) from the actual records so that the
# counting code below never sees the header row.
headers, data = data[0], data[1:]
print(headers)
['', 'year', 'month', 'intent', 'police', 'sex', 'age', 'race', 'hispanic', 'place', 'education']
# Sanity check: the first five rows should now be records, not the header.
first_rows = data[:5]
print(first_rows)
[['1', '2012', '01', 'Suicide', '0', 'M', '34', 'Asian/Pacific Islander', '100', 'Home', '4'], ['2', '2012', '01', 'Suicide', '0', 'F', '21', 'White', '100', 'Street', '3'], ['3', '2012', '01', 'Suicide', '0', 'M', '60', 'White', '100', 'Other specified', '4'], ['4', '2012', '02', 'Suicide', '0', 'M', '64', 'White', '100', 'Home', '4'], ['5', '2012', '02', 'Suicide', '0', 'M', '31', 'White', '100', 'Other specified', '2']]
# Tally deaths per calendar year; column 1 of each record is the "year" field.
years = [record[1] for record in data]
year_counts = {}
for year in years:
    # dict.get supplies 0 the first time a year is seen.
    year_counts[year] = year_counts.get(year, 0) + 1
print(year_counts)
{'2013': 33636, '2012': 33563, '2014': 33599}
It looks like gun deaths didn't change much by year from 2012 to 2014. Let's see if gun deaths in the US change by month and year.
import datetime
# Build one datetime per record from its year (column 1) and month (column 2).
# The day is fixed to the 1st, so each value identifies a (year, month) pair.
dates = [
    datetime.datetime(int(record[1]), int(record[2]), 1)
    for record in data
]
print(dates[:5])
[datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 2, 1, 0, 0), datetime.datetime(2012, 2, 1, 0, 0)]
# Count deaths for every distinct (year, month), keyed by the first-of-month
# datetime built above.
date_counts = {}
for month_start in dates:
    date_counts[month_start] = date_counts.get(month_start, 0) + 1
print(date_counts)
{datetime.datetime(2012, 3, 1, 0, 0): 2743, datetime.datetime(2014, 10, 1, 0, 0): 2865, datetime.datetime(2014, 3, 1, 0, 0): 2684, datetime.datetime(2012, 12, 1, 0, 0): 2791, datetime.datetime(2013, 10, 1, 0, 0): 2808, datetime.datetime(2014, 5, 1, 0, 0): 2864, datetime.datetime(2014, 6, 1, 0, 0): 2931, datetime.datetime(2013, 2, 1, 0, 0): 2375, datetime.datetime(2013, 1, 1, 0, 0): 2864, datetime.datetime(2013, 7, 1, 0, 0): 3079, datetime.datetime(2014, 2, 1, 0, 0): 2361, datetime.datetime(2013, 9, 1, 0, 0): 2742, datetime.datetime(2014, 12, 1, 0, 0): 2857, datetime.datetime(2012, 9, 1, 0, 0): 2852, datetime.datetime(2013, 12, 1, 0, 0): 2765, datetime.datetime(2013, 5, 1, 0, 0): 2806, datetime.datetime(2012, 7, 1, 0, 0): 3026, datetime.datetime(2012, 4, 1, 0, 0): 2795, datetime.datetime(2012, 5, 1, 0, 0): 2999, datetime.datetime(2012, 11, 1, 0, 0): 2729, datetime.datetime(2013, 4, 1, 0, 0): 2798, datetime.datetime(2012, 8, 1, 0, 0): 2954, datetime.datetime(2014, 9, 1, 0, 0): 2914, datetime.datetime(2012, 2, 1, 0, 0): 2357, datetime.datetime(2012, 6, 1, 0, 0): 2826, datetime.datetime(2014, 4, 1, 0, 0): 2862, datetime.datetime(2014, 1, 1, 0, 0): 2651, datetime.datetime(2013, 11, 1, 0, 0): 2758, datetime.datetime(2013, 8, 1, 0, 0): 2859, datetime.datetime(2012, 10, 1, 0, 0): 2733, datetime.datetime(2014, 8, 1, 0, 0): 2970, datetime.datetime(2013, 6, 1, 0, 0): 2920, datetime.datetime(2012, 1, 1, 0, 0): 2758, datetime.datetime(2014, 11, 1, 0, 0): 2756, datetime.datetime(2013, 3, 1, 0, 0): 2862, datetime.datetime(2014, 7, 1, 0, 0): 2884}
The sex and race columns contain potentially interesting information on how gun deaths in the US vary by gender and race.
# Count deaths by the victim's sex (column 5 of each record).
sex_counts = {}
for record in data:
    sex = record[5]
    sex_counts[sex] = sex_counts.get(sex, 0) + 1
print(sex_counts)
{'M': 86349, 'F': 14449}
# Count deaths by the victim's race (column 7 of each record).
race_counts = {}
for record in data:
    race = record[7]
    race_counts[race] = race_counts.get(race, 0) + 1
print(race_counts)
{'Native American/Native Alaskan': 917, 'Asian/Pacific Islander': 1326, 'White': 66237, 'Hispanic': 9022, 'Black': 23296}
Based on the analysis so far, male deaths outnumber female deaths by more than 5x. Across the racial groups, minorities seem to be more affected, especially the Hispanic group. To find the actual proportion of each affected racial group, let's read census.csv, which gives the total population of each racial category.
# Read census.csv into a list of rows: census[0] is the header row and
# census[1] is the single row of population figures.
# The context manager guarantees the file handle is closed after reading
# (the bare open() call previously left it open).
with open('census.csv', 'r', newline='') as census_file:
    census = list(csv.reader(census_file))
print(census)
[['Id', 'Year', 'Id', 'Sex', 'Id', 'Hispanic Origin', 'Id', 'Id2', 'Geography', 'Total', 'Race Alone - White', 'Race Alone - Hispanic', 'Race Alone - Black or African American', 'Race Alone - American Indian and Alaska Native', 'Race Alone - Asian', 'Race Alone - Native Hawaiian and Other Pacific Islander', 'Two or More Races'], ['cen42010', 'April 1, 2010 Census', 'totsex', 'Both Sexes', 'tothisp', 'Total', '0100000US', '', 'United States', '308745538', '197318956', '44618105', '40250635', '3739506', '15159516', '674625', '6984195']]
import re
# Map each race label used in guns.csv to its total population from the census.
#
# Each entry pairs a pattern matched against the race label with the census
# column indices whose populations make up that category. "Asian/Pacific
# Islander" spans two census columns (14: Asian, 15: Native Hawaiian and Other
# Pacific Islander), which must be ADDED as numbers -- concatenating the raw
# CSV strings would yield a nonsensical 14-digit population.
census_row = census[1]
race_patterns = [
    (r"Asian|Pacific Islander", (14, 15)),
    (r"Black", (12,)),
    (r"Hispanic", (11,)),
    (r"Alaska", (13,)),
    (r"White", (10,)),
]

mapping = {}
for name in race_counts:
    # Labels that match no pattern keep a population of 0.
    mapping[name] = 0
    for pattern, columns in race_patterns:
        if re.search(pattern, name) is not None:
            # Values are stored as ints; downstream int(mapping[race]) calls
            # keep working unchanged.
            mapping[name] = sum(int(census_row[col]) for col in columns)
print(mapping)
{'Native American/Native Alaskan': '3739506', 'Asian/Pacific Islander': '15159516674625', 'White': '197318956', 'Hispanic': '44618105', 'Black': '40250635'}
# Gun deaths per 100,000 people for each race: the death count divided by that
# race's census population, scaled to 100k.
race_per_hundredk = {
    race: (deaths / int(mapping[race])) * 100000
    for race, deaths in race_counts.items()
}
print(race_per_hundredk)
{'Native American/Native Alaskan': 24.521955573811088, 'Asian/Pacific Islander': 8.746980714890115e-06, 'White': 33.56849303419181, 'Hispanic': 20.220491210910907, 'Black': 57.8773477735196}
So, it can be seen that the Black group was the most affected. This piques my curiosity about the intent behind the gun deaths. Let's explore the gun deaths by filtering on intent.
# Pull the intent (column 3) and race (column 7) of every record into parallel
# lists, then count homicide victims per race.
intents = [record[3] for record in data]
races = [record[7] for record in data]
homicide_race_counts = {}
for race, intent in zip(races, intents):
    if intent != "Homicide":
        continue
    homicide_race_counts[race] = homicide_race_counts.get(race, 0) + 1
# Homicide gun deaths per 100,000 people of each race.
homicide_by_race_per_hundredk = {
    race: (killed / int(mapping[race])) * 100000
    for race, killed in homicide_race_counts.items()
}
print(homicide_by_race_per_hundredk)
{'Native American/Native Alaskan': 8.717729026240365, 'Asian/Pacific Islander': 3.687452654316421e-06, 'White': 4.6356417981453335, 'Hispanic': 12.627161104219914, 'Black': 48.471284987180944}
Thus, for gun deaths caused by homicide, the Black and Hispanic groups were killed at higher rates than any other racial group. Let's see how gun deaths correlate with gender when the cause of death is homicide.
# Sex of every victim (column 5), kept parallel to `intents`, followed by a
# tally of homicide victims per sex.
genders = [record[5] for record in data]
homicide_gender_counts = {}
for gender, intent in zip(genders, intents):
    if intent != "Homicide":
        continue
    homicide_gender_counts[gender] = homicide_gender_counts.get(gender, 0) + 1
# Homicide gun deaths per 100,000 people, split by the victim's sex.
#
# The ratio is scaled by 100,000 so the values actually are "per hundred
# thousand", matching the variable name and the per-race rates; without the
# factor the result is a raw fraction of the population.
#
# NOTE: census.csv only carries a combined "Both Sexes" total (census[1][9]),
# so both sexes are divided by the whole population rather than by a
# sex-specific one.
total_population = int(census[1][9])
homicide_by_sex_per_hundredk = {
    sex: (killed / total_population) * 100000
    for sex, killed in homicide_gender_counts.items()
}
print(homicide_by_sex_per_hundredk)
{'M': 9.652933024735729e-05, 'F': 1.7402680650238256e-05}
# Number of homicides per calendar month, pooled across all years and keyed
# by the abbreviated month name that strftime("%b") produces.
homicide_rate_per_month = {}
for date, intent in zip(dates, intents):
    if intent == "Homicide":
        month_label = date.strftime("%b")
        homicide_rate_per_month[month_label] = (
            homicide_rate_per_month.get(month_label, 0) + 1
        )
print(homicide_rate_per_month)
{'Apr': 2845, 'Jan': 2829, 'Dec': 3191, 'Aug': 3125, 'Feb': 2178, 'Jun': 3130, 'Mar': 2780, 'Jul': 3269, 'Sep': 2966, 'May': 2976, 'Nov': 2919, 'Oct': 2968}
There seems to be a seasonality effect in gun deaths due to homicide. Counts start to increase from March until July, i.e. from spring through mid-summer. Between January and March, and between September and December, the counts alternate between increasing and decreasing.
# Count deaths by the place where they happened (column 9 of each record).
place = [record[9] for record in data]
gun_deaths_place = {}
for location in place:
    gun_deaths_place[location] = gun_deaths_place.get(location, 0) + 1
print(gun_deaths_place)
{'Street': 11151, 'School/instiution': 671, 'Farm': 470, 'NA': 1384, 'Other specified': 13751, 'Residential institution': 203, 'Industrial/construction': 248, 'Home': 60486, 'Other unspecified': 8867, 'Sports': 128, 'Trade/service area': 3439}
From the above results, most of the deaths occurred at home, and the second-highest number occurred at an "Other specified" location.
# Count deaths by the victim's education code (column 10 of each record).
education = [record[10] for record in data]
gun_deaths_education = {}
for level in education:
    gun_deaths_education[level] = gun_deaths_education.get(level, 0) + 1
print(gun_deaths_education)
{'4': 12946, '3': 21680, 'NA': 53, '5': 1369, '2': 42927, '1': 21823}
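It can be seen from the above results that the number of gun deaths is highest among high school graduates (code 2). The second-highest count is among those with less than a high school education (code 1), narrowly ahead of those with some college education (code 3).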
# Tally accidental gun deaths per sex; `genders` and `intents` are parallel
# lists with one entry per record.
accidental_gender_counts = {}
for gender, intent in zip(genders, intents):
    if intent != "Accidental":
        continue
    accidental_gender_counts[gender] = accidental_gender_counts.get(gender, 0) + 1
# Accidental gun deaths per 100,000 people, split by the victim's sex.
#
# The ratio is multiplied by 100,000 so the figures are genuinely "per hundred
# thousand" as the variable name promises; previously they were raw fractions.
#
# NOTE: census.csv only provides a combined "Both Sexes" total (census[1][9]),
# so the whole population is the denominator for both sexes.
total_population = int(census[1][9])
accidental_by_gender_per_hundredk = {
    gender: (deaths / total_population) * 100000
    for gender, deaths in accidental_gender_counts.items()
}
print(accidental_by_gender_per_hundredk)
{'M': 4.602495664245033e-06, 'F': 7.060830786808002e-07}
# Tally accidental gun deaths per race; `races` and `intents` are parallel
# lists with one entry per record.
accidental_race_counts = {}
for race, intent in zip(races, intents):
    if intent != "Accidental":
        continue
    accidental_race_counts[race] = accidental_race_counts.get(race, 0) + 1
# Accidental gun deaths per 100,000 people of each race.
#
# Two fixes relative to the earlier version of this cell:
#   * iterate over accidental_race_counts itself -- looping over the homicide
#     tallies raises KeyError for any race with homicides but no accidents and
#     silently skips any race with accidents but no homicides;
#   * scale by 100,000 so the values match the "per hundredk" name and are
#     comparable with the other per-race rates.
accidental_by_race_per_hundredk = {
    race: (deaths / int(mapping[race])) * 100000
    for race, deaths in accidental_race_counts.items()
}
print(accidental_by_race_per_hundredk)
{'Native American/Native Alaskan': 5.883130017708222e-06, 'Asian/Pacific Islander': 7.915819651484267e-13, 'White': 5.7369044664923125e-06, 'Hispanic': 3.2498018461339853e-06, 'Black': 8.14893976206835e-06}