1.1 LOADING THE LIBRARIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
1.2 IMPORTING THE DATASETS
# Read the dataset from 'master.csv' (relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='master.csv')
1.3 Check the data head()
# Preview the first five rows of the dataset.
data.head(n=5)
| country | year | sex | age | suicides_no | population | suicides/100k pop | country-year | HDI for year | gdp_for_year ($) | gdp_per_capita ($) | generation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Albania | 1987 | male | 15-24 years | 21 | 312900 | 6.71 | Albania1987 | NaN | 2,156,624,900 | 796 | Generation X |
| 1 | Albania | 1987 | male | 35-54 years | 16 | 308000 | 5.19 | Albania1987 | NaN | 2,156,624,900 | 796 | Silent |
| 2 | Albania | 1987 | female | 15-24 years | 14 | 289700 | 4.83 | Albania1987 | NaN | 2,156,624,900 | 796 | Generation X |
| 3 | Albania | 1987 | male | 75+ years | 1 | 21800 | 4.59 | Albania1987 | NaN | 2,156,624,900 | 796 | G.I. Generation |
| 4 | Albania | 1987 | male | 25-34 years | 9 | 274300 | 3.28 | Albania1987 | NaN | 2,156,624,900 | 796 | Boomers |
1.4 Check the data.describe()
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe(percentiles=[0.25, 0.5, 0.75])
| year | suicides_no | population | suicides/100k pop | HDI for year | gdp_per_capita ($) | |
|---|---|---|---|---|---|---|
| count | 27820.000000 | 27820.000000 | 2.782000e+04 | 27820.000000 | 8364.000000 | 27820.000000 |
| mean | 2001.258375 | 242.574407 | 1.844794e+06 | 12.816097 | 0.776601 | 16866.464414 |
| std | 8.469055 | 902.047917 | 3.911779e+06 | 18.961511 | 0.093367 | 18887.576472 |
| min | 1985.000000 | 0.000000 | 2.780000e+02 | 0.000000 | 0.483000 | 251.000000 |
| 25% | 1995.000000 | 3.000000 | 9.749850e+04 | 0.920000 | 0.713000 | 3447.000000 |
| 50% | 2002.000000 | 25.000000 | 4.301500e+05 | 5.990000 | 0.779000 | 9372.000000 |
| 75% | 2008.000000 | 131.000000 | 1.486143e+06 | 16.620000 | 0.855000 | 24874.000000 |
| max | 2016.000000 | 22338.000000 | 4.380521e+07 | 224.970000 | 0.944000 | 126352.000000 |
1.5 Cleaning: check whether there is missing data. First, check the columns
# Show the column labels of the DataFrame.
data.columns
Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
'suicides/100k pop', 'country-year', 'HDI for year',
' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation'],
dtype='object')
1.6 Checking the shape of data
# Show the DataFrame dimensions as a (rows, columns) tuple.
data.shape
(27820, 12)
1.7 Count the datatypes
# Count how many columns there are of each dtype.
dtype_per_column = data.dtypes
dtype_per_column.value_counts()
object 6 int64 4 float64 2 dtype: int64
1.8 Check the data set information
# Print a summary of the DataFrame: index range, per-column non-null counts and dtypes, memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 27820 entries, 0 to 27819 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 country 27820 non-null object 1 year 27820 non-null int64 2 sex 27820 non-null object 3 age 27820 non-null object 4 suicides_no 27820 non-null int64 5 population 27820 non-null int64 6 suicides/100k pop 27820 non-null float64 7 country-year 27820 non-null object 8 HDI for year 8364 non-null float64 9 gdp_for_year ($) 27820 non-null object 10 gdp_per_capita ($) 27820 non-null int64 11 generation 27820 non-null object dtypes: float64(2), int64(4), object(6) memory usage: 2.5+ MB
1.8 Checking the dataset for missing values
# Number of null values in each column, listed from the most to the fewest missing.
null_count_per_column = data.isna().sum()
total_missing_check = null_count_per_column.sort_values(ascending=False)
total_missing_check
HDI for year 19456 generation 0 gdp_per_capita ($) 0 gdp_for_year ($) 0 country-year 0 suicides/100k pop 0 population 0 suicides_no 0 age 0 sex 0 year 0 country 0 dtype: int64
1.9 Create Frequency table for Age
# Frequency table: number of rows in the dataset for each age group.
age_groups = data['age']
my_tab = pd.crosstab(index=age_groups, columns='Freq')
my_tab
| col_0 | Freq |
|---|---|
| age | |
| 15-24 years | 4642 |
| 25-34 years | 4642 |
| 35-54 years | 4642 |
| 5-14 years | 4610 |
| 55-74 years | 4642 |
| 75+ years | 4642 |
1.10 Bar plot to check Number of Suicides by top Countries
# Total suicides per country, sorted ascending so the ten largest totals are the last ten rows.
suicides_per_country = data.groupby(by=['country'])['suicides_no'].sum().reset_index()
top_ten_countries = suicides_per_country.sort_values(['suicides_no']).tail(10)

# Bar chart of those ten countries.
top_ten_countries.plot(x='country', y='suicides_no', kind='bar', figsize=(15, 5))
plt.title('Plot of Number of Suicides by top Countries')
plt.show()
Fig 1.0: Number of suicides by countries
1.11 Bar Plot for Number of suicides by Age
# Age groups listed youngest to oldest so the x-axis reads in age order; without an
# explicit order seaborn draws the groups in the order they first appear in the data.
age_order = ['5-14 years', '15-24 years', '25-34 years',
             '35-54 years', '55-74 years', '75+ years']

plt.figure(figsize=(10, 5))  # setting the figure size
# Each bar is the mean of suicides_no over the rows of that age group (estimator=np.mean),
# drawn with a 90% confidence interval.
ax = sns.barplot(x='age', y='suicides_no', data=data, order=age_order,
                 estimator=np.mean, ci=90, palette='rocket')
plt.title('Plot of Number of Suicides by age')
Text(0.5, 1.0, 'Plot of Number of Suicides by age')
Fig 2.0: Bar Plot of Number of suicides by Age
1.12 Bar Plot of Number of Suicides by Sex
# Bar chart of suicides_no for each sex, drawn without confidence-interval bars (ci=None).
plt.figure(figsize=(8, 4))
ax = sns.barplot(
    data=data,
    x='sex',
    y='suicides_no',
    ci=None,
)
plt.show()
1.13 Bar plot of Number of Suicides vs generation
# Bar chart of suicides_no for each generation, drawn without confidence-interval bars (ci=None).
plt.figure(figsize=(9, 5))
ax = sns.barplot(
    data=data,
    x='generation',
    y='suicides_no',
    ci=None,
)
Interpretation:
1.14 Scatter Plot of Number of Suicides vs Population.
# First figure: scatter plot of suicides_no against population, with marker size
# scaled by suicides_no.
figure = plt.figure(figsize=(10, 4))
ax = sns.scatterplot(
    data=data,
    x='population',
    y='suicides_no',
    size='suicides_no',
    color='g',
)

# Second, larger figure: the same two variables with a fitted regression line.
figure = plt.figure(figsize=(20, 10))
ax = sns.regplot(
    data=data,
    x='population',
    y='suicides_no',
    color='r',
)
Fig 3.0: Scatter Plot of Number of Suicides vs Population
# Line plot of suicides_no against population using only the first five rows (data.head()).
first_rows = data.head()
x = sns.lineplot(data=first_rows, x='population', y='suicides_no', color='m')
1.15 Effects of GDP per capita on the suicide rates of a country. We use a scatter plot: number of suicides per 100k population vs GDP per capita
# Scatter plot of the suicide rate (suicides per 100k population) against GDP per capita.
plt.figure(figsize=(15, 7))
sns.scatterplot(
    data=data,
    x='gdp_per_capita ($)',
    y='suicides/100k pop',
)
plt.show()
Fig 4.0: Suicide rate vs GDP per capita
Correlation among pairs of continuous variables
plt.figure(figsize=(10, 5))
# Correlate the numeric columns only. The dataset also holds string columns (country,
# sex, age, ...), and DataFrame.corr() raises on those in pandas >= 2.0; older versions
# dropped them silently, so selecting the numeric columns first gives the same matrix
# on every version.
numeric_columns = data.select_dtypes(include=np.number)
# Annotated heatmap of the pairwise correlations, values shown to one decimal place.
sns.heatmap(numeric_columns.corr(), annot=True, linewidths=0.5, fmt='0.1f', center=1)
<matplotlib.axes._subplots.AxesSubplot at 0x1ee1fbed208>
1.16 Bar plot to check Number of suicides by sex and age to generate a single plot
# Age groups listed youngest to oldest so the bars and legend within each sex read in
# age order; without an explicit hue_order seaborn uses the order in which the groups
# first appear in the data.
age_order = ['5-14 years', '15-24 years', '25-34 years',
             '35-54 years', '55-74 years', '75+ years']

plt.figure(figsize=(15, 5))
# One group of bars per sex, split by age group through the hue.
sns.barplot(x='sex', y='suicides_no', data=data, hue='age', hue_order=age_order)
plt.show()
Fig 5.0: Bar plot of Number of suicides by sex and age
1.17 Trend of the total number of suicides across all the years
# Sum suicides_no over all rows of each year, then draw the yearly totals as a line plot.
suicides_per_year = data[['year', 'suicides_no']].groupby(['year']).sum()
suicides_per_year.plot(figsize=(15, 5))
<matplotlib.axes._subplots.AxesSubplot at 0x1ee20fd1b08>
Fig 6.0: Trend plot of the total number of suicides across all the years