import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Load the dataset from master.csv (path is relative to the working directory).
data=pd.read_csv('master.csv')


# Preview the first rows of the table.
data.head()


# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()


# Column labels of the loaded frame.
data.columns

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides/100k pop', 'country-year', 'HDI for year',
       ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation'],
      dtype='object')


# Dimensions of the frame as a (rows, columns) tuple.
data.shape

(27820, 12)


# Number of columns of each dtype.
data.dtypes.value_counts()

object     6
int64      4
float64    2
dtype: int64


# Per-column non-null counts and dtypes, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country             27820 non-null  object 
 1   year                27820 non-null  int64  
 2   sex                 27820 non-null  object 
 3   age                 27820 non-null  object 
 4   suicides_no         27820 non-null  int64  
 5   population          27820 non-null  int64  
 6   suicides/100k pop   27820 non-null  float64
 7   country-year        27820 non-null  object 
 8   HDI for year        8364 non-null   float64
 9    gdp_for_year ($)   27820 non-null  object 
 10  gdp_per_capita ($)  27820 non-null  int64  
 11  generation          27820 non-null  object 
dtypes: float64(2), int64(4), object(6)
memory usage: 2.5+ MB


# Count the missing values in every column and list the worst offenders first.
total_missing_check = (
    data.isna()
    .sum()
    .sort_values(ascending=False)
)
total_missing_check

HDI for year          19456
generation                0
gdp_per_capita ($)        0
 gdp_for_year ($)         0
country-year              0
suicides/100k pop         0
population                0
suicides_no               0
age                       0
sex                       0
year                      0
country                   0
dtype: int64


# One-way frequency table: number of rows recorded for each age group.
my_tab = pd.crosstab(
    index=data['age'],
    columns='Freq',
)
my_tab


# Bar chart of the ten countries with the largest total suicide counts.
# The totals are sorted ascending, so the bars run from smallest to largest.
(
    data.groupby('country')['suicides_no']
    .sum()
    .reset_index()
    .sort_values('suicides_no')
    .tail(10)
    .plot(kind='bar', x='country', y='suicides_no', figsize=(15, 5))
)
plt.title('Plot of Number of Suicides by top Countries')

plt.show()


# Mean suicide count per age group, with 90% confidence-interval error bars.
# `order` pins the ordinal age brackets youngest-to-oldest; without it seaborn
# uses the order in which the labels first appear in the data, which scrambles
# the age axis (and the sequential 'rocket' palette along with it).
# NOTE: `ci` is kept for compatibility with older seaborn; releases >= 0.12
# deprecate it in favour of errorbar=('ci', 90).
plt.figure(figsize=(10, 5))
ax = sns.barplot(
    x='age',
    y='suicides_no',
    data=data,
    order=[
        '5-14 years',
        '15-24 years',
        '25-34 years',
        '35-54 years',
        '55-74 years',
        '75+ years',
    ],
    estimator=np.mean,
    ci=90,
    palette='rocket',
)
plt.title('Plot of Number of Suicides by age')
plt.show()

Text(0.5, 1.0, 'Plot of Number of Suicides by age')


# Mean suicide count for each sex; error bars are switched off (ci=None).
plt.figure(figsize=(8, 4))
ax = sns.barplot(
    data=data,
    x='sex',
    y='suicides_no',
    ci=None,
)
plt.show()


# Mean suicide count for each generation label; no error bars.
plt.figure(figsize=(9, 5))
ax = sns.barplot(
    data=data,
    x='generation',
    y='suicides_no',
    ci=None,
)


# Scatter of suicide count against population; marker size also encodes the count.
figure = plt.figure(figsize=(10, 4))
ax = sns.scatterplot(
    data=data,
    x='population',
    y='suicides_no',
    size='suicides_no',
    color='g',
)


# Regression plot: scatter of suicide count against population with a fitted
# regression line drawn over it.
figure = plt.figure(figsize=(20, 10))
ax = sns.regplot(
    data=data,
    x='population',
    y='suicides_no',
    color='r',
)


# Line plot of suicide count against population, using only the rows
# returned by data.head().
x = sns.lineplot(
    data=data.head(),
    x='population',
    y='suicides_no',
    color='m',
)


# Scatter of the suicide rate per 100k population against GDP per capita.
plt.figure(figsize=(15, 7))
sns.scatterplot(
    data=data,
    x='gdp_per_capita ($)',
    y='suicides/100k pop',
)
plt.show()


# Annotated heatmap of pairwise correlations between the numeric columns.
# The frame also holds object-typed columns (country, sex, age, ...). Older
# pandas dropped those silently inside corr(), but pandas >= 2.0 raises on
# them, so select the numeric columns explicitly before correlating.
# NOTE(review): center=1 anchors the colormap midpoint at 1; for correlation
# values 0 is the usual midpoint -- confirm this is intended.
plt.figure(figsize=(10, 5))
sns.heatmap(
    data.select_dtypes(include='number').corr(),
    annot=True,
    linewidths=0.5,
    fmt='0.1f',
    center=1,
)

<matplotlib.axes._subplots.AxesSubplot at 0x1ee1fbed208>


# Mean suicide count for each sex, split into one bar per age group.
# `hue_order` lists the ordinal age brackets youngest-to-oldest; without it the
# bars and legend follow first-appearance order in the data, which is not the
# natural age order.
plt.figure(figsize=(15, 5))
sns.barplot(
    x='sex',
    y='suicides_no',
    data=data,
    hue='age',
    hue_order=[
        '5-14 years',
        '15-24 years',
        '25-34 years',
        '35-54 years',
        '55-74 years',
        '75+ years',
    ],
)
plt.show()


# Line chart of the total suicide count per year, summed over all rows.
data.groupby('year')[['suicides_no']].sum().plot(figsize=(15, 5))

<matplotlib.axes._subplots.AxesSubplot at 0x1ee20fd1b08>

	country	year	sex	age	suicides_no	population	suicides/100k pop	country-year	HDI for year	gdp_for_year ($)	gdp_per_capita ($)	generation
0	Albania	1987	male	15-24 years	21	312900	6.71	Albania1987	NaN	2,156,624,900	796	Generation X
1	Albania	1987	male	35-54 years	16	308000	5.19	Albania1987	NaN	2,156,624,900	796	Silent
2	Albania	1987	female	15-24 years	14	289700	4.83	Albania1987	NaN	2,156,624,900	796	Generation X
3	Albania	1987	male	75+ years	1	21800	4.59	Albania1987	NaN	2,156,624,900	796	G.I. Generation
4	Albania	1987	male	25-34 years	9	274300	3.28	Albania1987	NaN	2,156,624,900	796	Boomers

	year	suicides_no	population	suicides/100k pop	HDI for year	gdp_per_capita ($)
count	27820.000000	27820.000000	2.782000e+04	27820.000000	8364.000000	27820.000000
mean	2001.258375	242.574407	1.844794e+06	12.816097	0.776601	16866.464414
std	8.469055	902.047917	3.911779e+06	18.961511	0.093367	18887.576472
min	1985.000000	0.000000	2.780000e+02	0.000000	0.483000	251.000000
25%	1995.000000	3.000000	9.749850e+04	0.920000	0.713000	3447.000000
50%	2002.000000	25.000000	4.301500e+05	5.990000	0.779000	9372.000000
75%	2008.000000	131.000000	1.486143e+06	16.620000	0.855000	24874.000000
max	2016.000000	22338.000000	4.380521e+07	224.970000	0.944000	126352.000000

col_0	Freq
age
15-24 years	4642
25-34 years	4642
35-54 years	4642
5-14 years	4610
55-74 years	4642
75+ years	4642

TOPIC: EDA - Suicide Rate Analysis

Both variables are numerical, so a scatter plot is a good choice.