In [1]:
import pandas as pd 
from pandas.plotting import scatter_matrix
import numpy as np
import json
from helpers import *
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
sys.path.append("..")
from data.unlabeled.raw import hdro_inicator_values as inicator_values, hdro_country_name as country_name, hdro_indicator_name as indicator_name
../data/unlabeled/raw/__init__.py:52: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
  aquastat_eah = pd.read_csv(aquastat_eah_path, skipfooter=8)
../data/unlabeled/raw/__init__.py:53: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
  aquastat_wr = pd.read_csv(aquastat_wr_path, skipfooter=8)
../data/unlabeled/raw/__init__.py:54: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
  aquastat_wu = pd.read_csv(aquastat_wu_path, skipfooter=8)
In [2]:
#print(inicator_values)
#print(country_name)
#print(indicator_name)

Unwrap values to create good df

In [3]:
df_inicator_values = pd.DataFrame(inicator_values).T
df_inicator_values.columns = [indicator_name[nm] for nm in df_inicator_values.columns]
df_inicator_values = df_inicator_values.applymap(lambda x: x['2019'] if pd.notnull(x) else x)
df_inicator_values
Out[3]:
Population with at least some secondary education (% ages 25 and older) Population with at least some secondary education, female (% ages 25 and older) Population with at least some secondary education, male (% ages 25 and older) Mean years of schooling, female (years) Mean years of schooling, male (years) Share of seats in parliament (% held by women) Adolescent birth rate (births per 1,000 women ages 15-19) Vulnerable employment (% of total employment) Total population (millions) Urban population (%) ... Gross enrolment ratio, pre-primary (% of preschool-age children) Percentage of primary schools with access to the internet Percentage of secondary schools with access to the internet Gross enrolment ratio, tertiary (% of tertiary school-age population) Share of graduates in science, technology, engineering and mathematics programmes at tertiary level, female (%) Share of graduates in science, technology, engineering and mathematics programmes at tertiary level, male (%) Share of graduates from science, technology, engineering and mathematics programmes in tertiary education who are female (%) Share of graduates from science, technology, engineering and mathematics programmes in tertiary education who are male (%) Primary school teachers trained to teach (%) Pupil-teacher ratio, primary school (pupils per teacher)
AFG 26.080 13.220 36.920 1.948 6.006 27.244 68.957 79.726 38.042 25.8 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
AGO 30.232 23.133 38.056 4.023 6.359 30.000 150.526 65.995 31.825 66.2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ALB 93.174 93.700 92.497 9.702 10.614 29.508 19.642 52.852 2.881 61.2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
AND 72.327 71.484 73.327 10.439 10.564 46.429 NaN NaN 0.077 88.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ARG 57.158 59.161 54.828 11.123 10.729 39.877 62.782 21.805 44.781 92.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
WSM 74.942 79.127 71.583 NaN NaN 10.000 23.886 29.983 0.197 18.1 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
YEM 28.020 19.920 36.918 2.880 5.146 0.971 60.352 45.627 29.162 37.3 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ZAF 75.478 74.977 78.207 10.031 10.291 45.333 67.908 10.298 58.558 66.9 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ZMB 44.440 38.488 54.068 6.283 8.176 17.964 120.112 78.134 17.861 44.1 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ZWE 64.935 59.792 70.783 8.066 8.923 34.571 86.135 64.739 14.645 32.2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

195 rows × 98 columns

Display the max, min and mean number of null values per column

In [4]:
print_missing_percentages(df_inicator_values)
Max, min and mean number of missing values for the columns
Max: 98.97435897435898 %
Min: 0.0 %
Mean: 37.566718995290415 %
Out[4]:
(0.0, 98.97435897435898)

There is a high number of missing values, we therefore remove columns where more than 50% of the data is missing

In [5]:
dropColumnHalf(df_inicator_values)

Again see the max, min and mean number of missing values per column

In [6]:
min_missing, max_missing = print_missing_percentages(df_inicator_values)
Max, min and mean number of missing values for the columns
Max: 49.743589743589745 %
Min: 0.0 %
Mean: 10.616150019135096 %
In [7]:
df_inicator_values.describe()
Out[7]:
Population with at least some secondary education (% ages 25 and older) Population with at least some secondary education, female (% ages 25 and older) Population with at least some secondary education, male (% ages 25 and older) Mean years of schooling, female (years) Mean years of schooling, male (years) Share of seats in parliament (% held by women) Adolescent birth rate (births per 1,000 women ages 15-19) Vulnerable employment (% of total employment) Total population (millions) Urban population (%) ... Gender Development Index (GDI) Estimated gross national income per capita, female (2017 PPP $) Estimated gross national income per capita, male (2017 PPP $) Human Development Index (HDI), female Human Development Index (HDI), male Inequality-adjusted income index Overall loss in HDI due to inequality (%) Inequality in income (%) Coefficient of human inequality Inequality-adjusted HDI (IHDI)
count 175.000000 167.000000 167.000000 174.000000 174.000000 193.000000 185.000000 180.000000 195.000000 195.000000 ... 167.000000 178.000000 178.000000 167.000000 167.000000 156.000000 152.000000 156.000000 152.000000 152.000000
mean 61.068074 61.736012 65.823587 8.480017 9.133402 22.981295 48.309346 38.258011 39.391426 59.257436 ... 0.938994 14440.998511 24458.331354 0.702683 0.742186 0.545064 19.389145 23.401385 19.002599 0.595250
std 29.610798 29.284050 26.395358 3.421332 2.802984 11.828784 40.528834 27.774415 146.485855 23.231038 ... 0.074559 15359.935986 23943.482628 0.165867 0.143194 0.172789 9.948718 9.744004 9.777324 0.190002
min 0.000000 1.738000 9.000000 1.070000 2.256000 0.100000 0.283000 0.144000 0.011000 13.200000 ... 0.488000 186.041000 640.105000 0.270000 0.432000 0.176000 4.444000 8.500000 4.424000 0.232000
25% 37.296500 36.925000 45.356500 5.850000 6.733750 14.765000 13.177000 12.538250 2.081000 41.200000 ... 0.908500 2925.614250 6275.933750 0.577000 0.621500 0.404750 10.792250 16.603000 10.594000 0.431500
50% 64.828000 68.067000 70.682000 9.111000 9.252500 21.094000 40.536000 32.533500 8.772000 60.000000 ... 0.965000 8399.442500 16951.357000 0.730000 0.760000 0.527500 17.934500 21.779500 17.526000 0.604000
75% 89.145000 87.990500 90.916500 11.230500 11.551000 30.000000 70.504000 63.345500 28.562500 78.000000 ... 0.986000 22583.779500 35488.476000 0.831500 0.848500 0.691750 27.615250 28.625000 27.012500 0.767250
max 100.000000 100.000000 100.000000 13.882000 14.431000 55.660000 186.538000 94.581000 1433.784000 100.000000 ... 1.036000 71387.276000 107833.029000 0.949000 0.965000 0.858000 45.307000 56.996000 44.167000 0.899000

8 rows × 67 columns

In [11]:
#Find columns that only contain integers or null values
#find_all_integer_columns(df_inicator_values)
In [8]:
# Column values are shown

df_inicator_values.columns.values
Out[8]:
array(['Population with at least some secondary education (% ages 25 and older)',
       'Population with at least some secondary education, female (% ages 25 and older)',
       'Population with at least some secondary education, male (% ages 25 and older)',
       'Mean years of schooling, female (years)',
       'Mean years of schooling, male (years)',
       'Share of seats in parliament (% held by women)',
       'Adolescent birth rate (births per 1,000 women ages 15-19)',
       'Vulnerable employment (% of total employment)',
       'Total population (millions)', 'Urban population (%)',
       'Labour force participation rate (% ages 15 and older), female',
       'Labour force participation rate (% ages 15 and older), male',
       'Sex ratio at birth (male to female births)',
       'Remittances, inflows (% of GDP)',
       'Foreign direct investment, net inflows (% of GDP)',
       'Population ages 15?64 (millions)',
       'Infants lacking immunization, measles (% of one-year-olds)',
       'Infants lacking immunization, DTP (% of one-year-olds)',
       'Gross fixed capital formation (% of GDP)',
       'Gender Inequality Index (GII)',
       'Life expectancy at birth (years)',
       'Expected years of schooling (years)',
       'Inequality-adjusted education index',
       'Inequality-adjusted life expectancy index',
       'Inequality in education (%)', 'Inequality in life expectancy (%)',
       'Mean years of schooling (years)', 'Life expectancy index',
       'Income index', 'Education index',
       'Unemployment, youth (% ages 15?24)',
       'Private capital flows (% of GDP)',
       'Life expectancy at birth, female (years)',
       'Life expectancy at birth, male (years)',
       'Young age (0-14) dependency ratio (per 100 people ages 15-64)',
       'Old-age (65 and older) dependency ratio (per 100 people ages 15-64)',
       'Expected years of schooling, female (years)',
       'Expected years of schooling, male (years)',
       'Population ages 65 and older (millions)',
       'Population under age 5 (millions)',
       'Exports and imports (% of GDP)', 'Human Development Index (HDI)',
       'Unemployment, total (% of labour force)', 'HDI rank',
       'Youth not in school or employment (% ages 15-24)',
       'Labour force participation rate (% ages 15 and older)',
       'Employment to population ratio (% ages 15 and older)',
       'Employment in agriculture (% of total employment)',
       'Employment in services (% of total employment)',
       'Working poor at PPP$3.20 a day (% of total employment)',
       'Total unemployment rate (female to male ratio)',
       'Youth unemployment rate (female to male ratio)',
       'Share of employment in nonagriculture, female (% of total employment in nonagriculture)',
       'Gross capital formation (% of GDP)',
       'Gross domestic product (GDP), total (2017 PPP $ billions)',
       'GDP per capita (2017 PPP $)',
       'Gross national income (GNI) per capita (constant 2017 PPP$)',
       'Gender Development Index (GDI)',
       'Estimated gross national income per capita, female (2017 PPP $)',
       'Estimated gross national income per capita, male (2017 PPP $)',
       'Human Development Index (HDI), female',
       'Human Development Index (HDI), male',
       'Inequality-adjusted income index',
       'Overall loss in HDI due to inequality (%)',
       'Inequality in income (%)', 'Coefficient of human inequality',
       'Inequality-adjusted HDI (IHDI)'], dtype=object)

Initial Correlation Matrix

In [13]:
size = df_inicator_values.shape[1]
corr = df_inicator_values.corr()
plt.subplots(figsize=(20,20))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, xticklabels=range(size), yticklabels=range(size),cmap='mako')
Out[13]:
<AxesSubplot:>
2021-05-12T09:28:20.328956 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/

Division of the columns

In [9]:
percentage_columns = []
money_columns = []
index_columns = []
year_columns = []
millions_columns = []
rest = []

for column in df_inicator_values.columns.values:
    if '%' in column:
        percentage_columns.append(column)
    elif '$' in column:
        money_columns.append(column)
    elif 'years' in column:
        year_columns.append(column)
    elif 'index' in column.lower():
        index_columns.append(column)
    elif 'millions' in column.lower():
        millions_columns.append(column)
    else:
        rest.append(column)

print(percentage_columns,'\n',len(percentage_columns))
print(money_columns,'\n',len(money_columns))
print(index_columns,'\n',len(index_columns))
print(year_columns,'\n',len(year_columns))
print(millions_columns,'\n',len(millions_columns))
print(rest,'\n',len(rest))
['Population with at least some secondary education (% ages 25 and older)', 'Population with at least some secondary education, female (% ages 25 and older)', 'Population with at least some secondary education, male (% ages 25 and older)', 'Share of seats in parliament (% held by women)', 'Vulnerable employment (% of total employment)', 'Urban population (%)', 'Labour force participation rate (% ages 15 and older), female', 'Labour force participation rate (% ages 15 and older), male', 'Remittances, inflows (% of GDP)', 'Foreign direct investment, net inflows (% of GDP)', 'Infants lacking immunization, measles (% of one-year-olds)', 'Infants lacking immunization, DTP (% of one-year-olds)', 'Gross fixed capital formation (% of GDP)', 'Inequality in education (%)', 'Inequality in life expectancy (%)', 'Unemployment, youth (% ages 15?24)', 'Private capital flows (% of GDP)', 'Exports and imports (% of GDP)', 'Unemployment, total (% of labour force)', 'Youth not in school or employment (% ages 15-24)', 'Labour force participation rate (% ages 15 and older)', 'Employment to population ratio (% ages 15 and older)', 'Employment in agriculture (% of total employment)', 'Employment in services (% of total employment)', 'Working poor at PPP$3.20 a day (% of total employment)', 'Share of employment in nonagriculture, female (% of total employment in nonagriculture)', 'Gross capital formation (% of GDP)', 'Overall loss in HDI due to inequality (%)', 'Inequality in income (%)'] 
 29
['Gross domestic product (GDP), total (2017 PPP $ billions)', 'GDP per capita (2017 PPP $)', 'Gross national income (GNI) per capita (constant 2017 PPP$)', 'Estimated gross national income per capita, female (2017 PPP $)', 'Estimated gross national income per capita, male (2017 PPP $)'] 
 5
['Gender Inequality Index (GII)', 'Inequality-adjusted education index', 'Inequality-adjusted life expectancy index', 'Life expectancy index', 'Income index', 'Education index', 'Human Development Index (HDI)', 'Gender Development Index (GDI)', 'Human Development Index (HDI), female', 'Human Development Index (HDI), male', 'Inequality-adjusted income index'] 
 11
['Mean years of schooling, female (years)', 'Mean years of schooling, male (years)', 'Life expectancy at birth (years)', 'Expected years of schooling (years)', 'Mean years of schooling (years)', 'Life expectancy at birth, female (years)', 'Life expectancy at birth, male (years)', 'Expected years of schooling, female (years)', 'Expected years of schooling, male (years)'] 
 9
['Total population (millions)', 'Population ages 15?64 (millions)', 'Population ages 65 and older (millions)', 'Population under age 5 (millions)'] 
 4
['Adolescent birth rate (births per 1,000 women ages 15-19)', 'Sex ratio at birth (male to female births)', 'Young age (0-14) dependency ratio (per 100 people ages 15-64)', 'Old-age (65 and older) dependency ratio (per 100 people ages 15-64)', 'HDI rank', 'Total unemployment rate (female to male ratio)', 'Youth unemployment rate (female to male ratio)', 'Coefficient of human inequality', 'Inequality-adjusted HDI (IHDI)'] 
 9

Dataframe is splitted

In [10]:
split_df1 = df_inicator_values[percentage_columns]
split_df2 = df_inicator_values[money_columns]
split_df3 = df_inicator_values[index_columns]
split_df4 = df_inicator_values[year_columns]
split_df5 = df_inicator_values[millions_columns]
split_df6 = df_inicator_values[rest]
In [11]:
print('- Dataframe 1 -')
min1, max1 = print_missing_percentages(split_df1)
print('- Dataframe 2 -')
min2, max2 = print_missing_percentages(split_df2)
print('- Dataframe 3 -')
min3, max3 = print_missing_percentages(split_df3)
print('- Dataframe 4 -')
min4, max4 = print_missing_percentages(split_df4)
print('- Dataframe 5 -')
min5, max5 = print_missing_percentages(split_df5)
print('- Dataframe 6 -')
min6, max6 = print_missing_percentages(split_df6)
- Dataframe 1 -
Max, min and mean number of missing values for the columns
Max: 49.743589743589745 %
Min: 0.0 %
Mean: 14.624226348364274 %
- Dataframe 2 -
Max, min and mean number of missing values for the columns
Max: 8.717948717948717 %
Min: 2.051282051282051 %
Mean: 6.153846153846153 %
- Dataframe 3 -
Max, min and mean number of missing values for the columns
Max: 20.0 %
Min: 2.051282051282051 %
Mean: 9.790209790209792 %
- Dataframe 4 -
Max, min and mean number of missing values for the columns
Max: 10.76923076923077 %
Min: 1.0256410256410255 %
Mean: 5.584045584045585 %
- Dataframe 5 -
Max, min and mean number of missing values for the columns
Max: 5.128205128205129 %
Min: 0.0 %
Mean: 3.8461538461538467 %
- Dataframe 6 -
Max, min and mean number of missing values for the columns
Max: 22.05128205128205 %
Min: 3.076923076923077 %
Mean: 9.230769230769232 %

Imputation of the individual datasets

idf --> stands for imputed dataframe

In [12]:
if max1 < 10:
    max1 = 10

idf1 = impute_df(split_df1, max_iter= int(max1), verbose=2)
size = idf1.shape[1]
corr = idf1.corr()
plt.subplots(figsize=(20,20))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, xticklabels=range(size), yticklabels=range(size))
[IterativeImputer] Completing matrix with shape (195, 29)
[IterativeImputer] Ending imputation round 1/49, elapsed time 0.22
[IterativeImputer] Ending imputation round 2/49, elapsed time 0.30
[IterativeImputer] Ending imputation round 3/49, elapsed time 0.42
[IterativeImputer] Ending imputation round 4/49, elapsed time 0.49
[IterativeImputer] Ending imputation round 5/49, elapsed time 0.56
[IterativeImputer] Ending imputation round 6/49, elapsed time 0.63
[IterativeImputer] Ending imputation round 7/49, elapsed time 0.73
[IterativeImputer] Ending imputation round 8/49, elapsed time 0.81
[IterativeImputer] Ending imputation round 9/49, elapsed time 0.92
[IterativeImputer] Ending imputation round 10/49, elapsed time 0.97
[IterativeImputer] Ending imputation round 11/49, elapsed time 1.03
[IterativeImputer] Ending imputation round 12/49, elapsed time 1.08
[IterativeImputer] Ending imputation round 13/49, elapsed time 1.13
[IterativeImputer] Ending imputation round 14/49, elapsed time 1.21
[IterativeImputer] Ending imputation round 15/49, elapsed time 1.30
[IterativeImputer] Ending imputation round 16/49, elapsed time 1.41
[IterativeImputer] Ending imputation round 17/49, elapsed time 1.50
[IterativeImputer] Ending imputation round 18/49, elapsed time 1.60
[IterativeImputer] Ending imputation round 19/49, elapsed time 1.69
[IterativeImputer] Ending imputation round 20/49, elapsed time 1.75
[IterativeImputer] Ending imputation round 21/49, elapsed time 1.81
[IterativeImputer] Ending imputation round 22/49, elapsed time 1.86
[IterativeImputer] Ending imputation round 23/49, elapsed time 1.92
[IterativeImputer] Ending imputation round 24/49, elapsed time 1.98
[IterativeImputer] Ending imputation round 25/49, elapsed time 2.06
[IterativeImputer] Ending imputation round 26/49, elapsed time 2.18
[IterativeImputer] Ending imputation round 27/49, elapsed time 2.26
[IterativeImputer] Ending imputation round 28/49, elapsed time 2.35
[IterativeImputer] Ending imputation round 29/49, elapsed time 2.43
[IterativeImputer] Ending imputation round 30/49, elapsed time 2.50
[IterativeImputer] Ending imputation round 31/49, elapsed time 2.57
[IterativeImputer] Ending imputation round 32/49, elapsed time 2.63
[IterativeImputer] Ending imputation round 33/49, elapsed time 2.71
[IterativeImputer] Ending imputation round 34/49, elapsed time 2.77
[IterativeImputer] Ending imputation round 35/49, elapsed time 2.90
[IterativeImputer] Ending imputation round 36/49, elapsed time 2.97
[IterativeImputer] Ending imputation round 37/49, elapsed time 3.09
[IterativeImputer] Ending imputation round 38/49, elapsed time 3.16
[IterativeImputer] Ending imputation round 39/49, elapsed time 3.22
[IterativeImputer] Ending imputation round 40/49, elapsed time 3.27
[IterativeImputer] Ending imputation round 41/49, elapsed time 3.32
[IterativeImputer] Ending imputation round 42/49, elapsed time 3.37
[IterativeImputer] Ending imputation round 43/49, elapsed time 3.44
[IterativeImputer] Ending imputation round 44/49, elapsed time 3.51
[IterativeImputer] Ending imputation round 45/49, elapsed time 3.61
[IterativeImputer] Ending imputation round 46/49, elapsed time 3.68
[IterativeImputer] Ending imputation round 47/49, elapsed time 3.75
[IterativeImputer] Ending imputation round 48/49, elapsed time 3.80
[IterativeImputer] Ending imputation round 49/49, elapsed time 3.86
[IterativeImputer] Completing matrix with shape (195, 29)
[IterativeImputer] Ending imputation round 1/49, elapsed time 0.00
[IterativeImputer] Ending imputation round 2/49, elapsed time 0.01
[IterativeImputer] Ending imputation round 3/49, elapsed time 0.01
[IterativeImputer] Ending imputation round 4/49, elapsed time 0.01
[IterativeImputer] Ending imputation round 5/49, elapsed time 0.02
[IterativeImputer] Ending imputation round 6/49, elapsed time 0.02
[IterativeImputer] Ending imputation round 7/49, elapsed time 0.02
[IterativeImputer] Ending imputation round 8/49, elapsed time 0.03
[IterativeImputer] Ending imputation round 9/49, elapsed time 0.03
[IterativeImputer] Ending imputation round 10/49, elapsed time 0.03
[IterativeImputer] Ending imputation round 11/49, elapsed time 0.04
[IterativeImputer] Ending imputation round 12/49, elapsed time 0.04
[IterativeImputer] Ending imputation round 13/49, elapsed time 0.04
[IterativeImputer] Ending imputation round 14/49, elapsed time 0.05
[IterativeImputer] Ending imputation round 15/49, elapsed time 0.05
[IterativeImputer] Ending imputation round 16/49, elapsed time 0.05
[IterativeImputer] Ending imputation round 17/49, elapsed time 0.06
[IterativeImputer] Ending imputation round 18/49, elapsed time 0.06
[IterativeImputer] Ending imputation round 19/49, elapsed time 0.07
[IterativeImputer] Ending imputation round 20/49, elapsed time 0.07
[IterativeImputer] Ending imputation round 21/49, elapsed time 0.07
[IterativeImputer] Ending imputation round 22/49, elapsed time 0.08
[IterativeImputer] Ending imputation round 23/49, elapsed time 0.08
[IterativeImputer] Ending imputation round 24/49, elapsed time 0.08
[IterativeImputer] Ending imputation round 25/49, elapsed time 0.09
[IterativeImputer] Ending imputation round 26/49, elapsed time 0.09
[IterativeImputer] Ending imputation round 27/49, elapsed time 0.09
[IterativeImputer] Ending imputation round 28/49, elapsed time 0.10
[IterativeImputer] Ending imputation round 29/49, elapsed time 0.10
[IterativeImputer] Ending imputation round 30/49, elapsed time 0.10
[IterativeImputer] Ending imputation round 31/49, elapsed time 0.11
[IterativeImputer] Ending imputation round 32/49, elapsed time 0.11
[IterativeImputer] Ending imputation round 33/49, elapsed time 0.11
[IterativeImputer] Ending imputation round 34/49, elapsed time 0.11
[IterativeImputer] Ending imputation round 35/49, elapsed time 0.12
[IterativeImputer] Ending imputation round 36/49, elapsed time 0.12
[IterativeImputer] Ending imputation round 37/49, elapsed time 0.12
[IterativeImputer] Ending imputation round 38/49, elapsed time 0.13
[IterativeImputer] Ending imputation round 39/49, elapsed time 0.13
[IterativeImputer] Ending imputation round 40/49, elapsed time 0.13
[IterativeImputer] Ending imputation round 41/49, elapsed time 0.14
[IterativeImputer] Ending imputation round 42/49, elapsed time 0.14
[IterativeImputer] Ending imputation round 43/49, elapsed time 0.14
[IterativeImputer] Ending imputation round 44/49, elapsed time 0.15
[IterativeImputer] Ending imputation round 45/49, elapsed time 0.15
[IterativeImputer] Ending imputation round 46/49, elapsed time 0.15
[IterativeImputer] Ending imputation round 47/49, elapsed time 0.16
[IterativeImputer] Ending imputation round 48/49, elapsed time 0.16
[IterativeImputer] Ending imputation round 49/49, elapsed time 0.16
/opt/anaconda3/lib/python3.7/site-packages/sklearn/impute/_iterative.py:603: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
  " reached.", ConvergenceWarning)
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f96dc9daa10>
In [13]:
if max2 < 10:
    max2 = 10

idf2 = impute_df(split_df2, max_iter= int(max2), verbose=2)
size = idf2.shape[1]
corr = idf2.corr()
fig = plt.subplots(figsize=(15,15))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, xticklabels=range(size), yticklabels=range(size))
[IterativeImputer] Completing matrix with shape (195, 5)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.08
[IterativeImputer] Ending imputation round 2/10, elapsed time 0.10
[IterativeImputer] Ending imputation round 3/10, elapsed time 0.11
[IterativeImputer] Ending imputation round 4/10, elapsed time 0.14
[IterativeImputer] Ending imputation round 5/10, elapsed time 0.15
[IterativeImputer] Ending imputation round 6/10, elapsed time 0.16
[IterativeImputer] Ending imputation round 7/10, elapsed time 0.17
[IterativeImputer] Ending imputation round 8/10, elapsed time 0.18
[IterativeImputer] Ending imputation round 9/10, elapsed time 0.21
[IterativeImputer] Ending imputation round 10/10, elapsed time 0.22
[IterativeImputer] Completing matrix with shape (195, 5)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.00
[IterativeImputer] Ending imputation round 2/10, elapsed time 0.00
[IterativeImputer] Ending imputation round 3/10, elapsed time 0.00
[IterativeImputer] Ending imputation round 4/10, elapsed time 0.00
[IterativeImputer] Ending imputation round 5/10, elapsed time 0.01
[IterativeImputer] Ending imputation round 6/10, elapsed time 0.01
[IterativeImputer] Ending imputation round 7/10, elapsed time 0.01
[IterativeImputer] Ending imputation round 8/10, elapsed time 0.02
[IterativeImputer] Ending imputation round 9/10, elapsed time 0.02
[IterativeImputer] Ending imputation round 10/10, elapsed time 0.02
/opt/anaconda3/lib/python3.7/site-packages/sklearn/impute/_iterative.py:603: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
  " reached.", ConvergenceWarning)
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f96dca54bd0>
In [14]:
if max3 < 10:
    max3 = 10

idf3 = impute_df(split_df3, max_iter= int(max3), verbose=2)
size = idf3.shape[1]
corr = idf3.corr()
fig = plt.subplots(figsize=(15,15))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, xticklabels=range(size), yticklabels=range(size))
[IterativeImputer] Completing matrix with shape (195, 11)
[IterativeImputer] Ending imputation round 1/20, elapsed time 0.09
[IterativeImputer] Ending imputation round 2/20, elapsed time 0.18
[IterativeImputer] Ending imputation round 3/20, elapsed time 0.20
[IterativeImputer] Ending imputation round 4/20, elapsed time 0.22
[IterativeImputer] Ending imputation round 5/20, elapsed time 0.24
[IterativeImputer] Ending imputation round 6/20, elapsed time 0.26
[IterativeImputer] Ending imputation round 7/20, elapsed time 0.29
[IterativeImputer] Ending imputation round 8/20, elapsed time 0.37
[IterativeImputer] Ending imputation round 9/20, elapsed time 0.39
[IterativeImputer] Ending imputation round 10/20, elapsed time 0.41
[IterativeImputer] Ending imputation round 11/20, elapsed time 0.43
[IterativeImputer] Ending imputation round 12/20, elapsed time 0.45
[IterativeImputer] Ending imputation round 13/20, elapsed time 0.47
[IterativeImputer] Ending imputation round 14/20, elapsed time 0.49
[IterativeImputer] Ending imputation round 15/20, elapsed time 0.54
[IterativeImputer] Ending imputation round 16/20, elapsed time 0.59
[IterativeImputer] Ending imputation round 17/20, elapsed time 0.62
[IterativeImputer] Ending imputation round 18/20, elapsed time 0.64
[IterativeImputer] Ending imputation round 19/20, elapsed time 0.66
[IterativeImputer] Ending imputation round 20/20, elapsed time 0.68
[IterativeImputer] Completing matrix with shape (195, 11)
[IterativeImputer] Ending imputation round 1/20, elapsed time 0.00
[IterativeImputer] Ending imputation round 2/20, elapsed time 0.01
[IterativeImputer] Ending imputation round 3/20, elapsed time 0.01
[IterativeImputer] Ending imputation round 4/20, elapsed time 0.01
[IterativeImputer] Ending imputation round 5/20, elapsed time 0.01
[IterativeImputer] Ending imputation round 6/20, elapsed time 0.01
[IterativeImputer] Ending imputation round 7/20, elapsed time 0.01
[IterativeImputer] Ending imputation round 8/20, elapsed time 0.02
[IterativeImputer] Ending imputation round 9/20, elapsed time 0.02
[IterativeImputer] Ending imputation round 10/20, elapsed time 0.02
[IterativeImputer] Ending imputation round 11/20, elapsed time 0.02
[IterativeImputer] Ending imputation round 12/20, elapsed time 0.02
[IterativeImputer] Ending imputation round 13/20, elapsed time 0.03
[IterativeImputer] Ending imputation round 14/20, elapsed time 0.03
[IterativeImputer] Ending imputation round 15/20, elapsed time 0.03
[IterativeImputer] Ending imputation round 16/20, elapsed time 0.03
[IterativeImputer] Ending imputation round 17/20, elapsed time 0.03
[IterativeImputer] Ending imputation round 18/20, elapsed time 0.03
[IterativeImputer] Ending imputation round 19/20, elapsed time 0.04
[IterativeImputer] Ending imputation round 20/20, elapsed time 0.04
/opt/anaconda3/lib/python3.7/site-packages/sklearn/impute/_iterative.py:603: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
  " reached.", ConvergenceWarning)
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f96dca79190>
In [15]:
if max4 < 10:
    max4 = 10

idf4 = impute_df(split_df4, max_iter= int(max4), verbose=2)
size = idf4.shape[1]
corr = idf4.corr()
fig = plt.subplots(figsize=(15,15))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, xticklabels=range(size), yticklabels=range(size))
[IterativeImputer] Completing matrix with shape (195, 9)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.06
[IterativeImputer] Ending imputation round 2/10, elapsed time 0.08
[IterativeImputer] Ending imputation round 3/10, elapsed time 0.09
[IterativeImputer] Ending imputation round 4/10, elapsed time 0.11
[IterativeImputer] Ending imputation round 5/10, elapsed time 0.12
[IterativeImputer] Ending imputation round 6/10, elapsed time 0.13
[IterativeImputer] Ending imputation round 7/10, elapsed time 0.14
[IterativeImputer] Ending imputation round 8/10, elapsed time 0.15
[IterativeImputer] Ending imputation round 9/10, elapsed time 0.16
[IterativeImputer] Ending imputation round 10/10, elapsed time 0.18
[IterativeImputer] Completing matrix with shape (195, 9)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.00
[IterativeImputer] Ending imputation round 2/10, elapsed time 0.00
[IterativeImputer] Ending imputation round 3/10, elapsed time 0.00
[IterativeImputer] Ending imputation round 4/10, elapsed time 0.01
[IterativeImputer] Ending imputation round 5/10, elapsed time 0.01
[IterativeImputer] Ending imputation round 6/10, elapsed time 0.01
[IterativeImputer] Ending imputation round 7/10, elapsed time 0.01
[IterativeImputer] Ending imputation round 8/10, elapsed time 0.01
[IterativeImputer] Ending imputation round 9/10, elapsed time 0.01
[IterativeImputer] Ending imputation round 10/10, elapsed time 0.01
/opt/anaconda3/lib/python3.7/site-packages/sklearn/impute/_iterative.py:603: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
  " reached.", ConvergenceWarning)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f96dcaaca90>
In [16]:
if max5 < 10:
    max5 = 10

idf5 = impute_df(split_df5, max_iter= int(max5), verbose=2)
size = idf5.shape[1]
corr = idf5.corr()
fig = plt.subplots(figsize=(15,15))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, xticklabels=range(size), yticklabels=range(size))
[IterativeImputer] Completing matrix with shape (195, 4)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.03
[IterativeImputer] Ending imputation round 2/10, elapsed time 0.04
[IterativeImputer] Ending imputation round 3/10, elapsed time 0.04
[IterativeImputer] Ending imputation round 4/10, elapsed time 0.04
[IterativeImputer] Ending imputation round 5/10, elapsed time 0.05
[IterativeImputer] Ending imputation round 6/10, elapsed time 0.05
[IterativeImputer] Ending imputation round 7/10, elapsed time 0.06
[IterativeImputer] Ending imputation round 8/10, elapsed time 0.08
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (195, 4)
[IterativeImputer] Ending imputation round 1/8, elapsed time 0.00
[IterativeImputer] Ending imputation round 2/8, elapsed time 0.00
[IterativeImputer] Ending imputation round 3/8, elapsed time 0.00
[IterativeImputer] Ending imputation round 4/8, elapsed time 0.00
[IterativeImputer] Ending imputation round 5/8, elapsed time 0.00
[IterativeImputer] Ending imputation round 6/8, elapsed time 0.00
[IterativeImputer] Ending imputation round 7/8, elapsed time 0.00
[IterativeImputer] Ending imputation round 8/8, elapsed time 0.00
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f96dcaaa210>
In [17]:
if max6 < 10:
    max6 = 10

idf6 = impute_df(split_df6, max_iter= int(max6), verbose=2)
size = idf6.shape[1]
corr = idf6.corr()
plt.subplots(figsize=(20,20))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, xticklabels=range(size), yticklabels=range(size))
[IterativeImputer] Completing matrix with shape (195, 9)
[IterativeImputer] Ending imputation round 1/22, elapsed time 0.12
[IterativeImputer] Ending imputation round 2/22, elapsed time 0.14
[IterativeImputer] Ending imputation round 3/22, elapsed time 0.15
[IterativeImputer] Ending imputation round 4/22, elapsed time 0.16
[IterativeImputer] Ending imputation round 5/22, elapsed time 0.17
[IterativeImputer] Ending imputation round 6/22, elapsed time 0.18
[IterativeImputer] Ending imputation round 7/22, elapsed time 0.20
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (195, 9)
[IterativeImputer] Ending imputation round 1/7, elapsed time 0.00
[IterativeImputer] Ending imputation round 2/7, elapsed time 0.01
[IterativeImputer] Ending imputation round 3/7, elapsed time 0.01
[IterativeImputer] Ending imputation round 4/7, elapsed time 0.01
[IterativeImputer] Ending imputation round 5/7, elapsed time 0.01
[IterativeImputer] Ending imputation round 6/7, elapsed time 0.01
[IterativeImputer] Ending imputation round 7/7, elapsed time 0.02
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f96dcad8550>

Dataframe is merged and displayed below

In [18]:
# 29 - 5 - 11 - 9 - 4 - 9
final_df = idf1.merge(idf2, left_index=True, right_index=True)
print(final_df.shape)
final_df = final_df.merge(idf3, left_index=True, right_index=True)
print(final_df.shape)
final_df = final_df.merge(idf4, left_index=True, right_index=True)
print(final_df.shape)
final_df = final_df.merge(idf5, left_index=True, right_index=True)
print(final_df.shape)
final_df = final_df.merge(idf6, left_index=True, right_index=True)
print(final_df.shape)

display(final_df)
(195, 34)
(195, 45)
(195, 54)
(195, 58)
(195, 67)
Population with at least some secondary education (% ages 25 and older) Population with at least some secondary education, female (% ages 25 and older) Population with at least some secondary education, male (% ages 25 and older) Share of seats in parliament (% held by women) Vulnerable employment (% of total employment) Urban population (%) Labour force participation rate (% ages 15 and older), female Labour force participation rate (% ages 15 and older), male Remittances, inflows (% of GDP) Foreign direct investment, net inflows (% of GDP) ... Population under age 5 (millions) Adolescent birth rate (births per 1,000 women ages 15-19) Sex ratio at birth (male to female births) Young age (0-14) dependency ratio (per 100 people ages 15-64) Old-age (65 and older) dependency ratio (per 100 people ages 15-64) HDI rank Total unemployment rate (female to male ratio) Youth unemployment rate (female to male ratio) Coefficient of human inequality Inequality-adjusted HDI (IHDI)
AFG 26.080 13.220 36.920 27.244 79.72600 25.8 21.595000 74.658000 4.542000 0.123000 ... 5.639000 68.957000 1.060000 77.346000 4.764000 169.0 1.356000 1.308000 29.772732 0.360280
AGO 30.232 23.133 38.056 30.000 65.99500 66.2 76.136000 78.913000 0.002000 -4.331000 ... 5.670000 150.526000 1.030000 91.097000 4.297000 148.0 1.016000 0.906000 31.733000 0.397000
ALB 93.174 93.700 92.497 29.508 52.85200 61.2 46.712000 64.568000 9.640000 7.912000 ... 0.169000 19.642000 1.090000 25.439000 20.764000 69.0 0.903000 0.799000 10.893000 0.708000
AND 72.327 71.484 73.327 46.429 15.42884 88.0 54.351364 72.232983 1.703573 3.286625 ... -7.122259 14.976928 1.058547 25.017928 21.992987 36.0 1.788414 1.412073 10.066536 0.786988
ARG 57.158 59.161 54.828 39.877 21.80500 92.0 50.721000 72.730000 0.119000 1.389000 ... 3.742000 62.782000 1.040000 38.334000 17.523000 46.0 1.222000 1.291000 13.238000 0.729000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
WSM 74.942 79.127 71.583 10.000 29.98300 18.1 31.104000 55.456000 17.254000 3.114609 ... 0.027000 23.886000 1.080000 66.194000 8.623000 111.0 1.297000 1.492000 21.104700 0.551524
YEM 28.020 19.920 36.918 0.971 45.62700 37.3 5.834000 70.183000 7.999177 -4.382862 ... 4.099000 60.352000 1.050000 67.773000 5.015000 179.0 2.088000 1.467000 30.867000 0.321000
ZAF 75.478 74.977 78.207 45.333 10.29800 66.9 49.610000 62.749000 0.253000 1.316000 ... 5.786000 67.908000 1.030000 44.148000 8.253000 114.0 1.149000 1.161000 31.163000 0.468000
ZMB 44.440 38.488 54.068 17.964 78.13400 44.1 70.370000 79.076000 0.551000 2.087000 ... 2.902000 120.112000 1.030000 83.229000 3.960000 146.0 1.147000 1.079000 30.592000 0.401000
ZWE 64.935 59.792 70.783 34.571 64.73900 32.2 78.106000 88.993000 8.068000 4.005312 ... 2.138000 86.135000 1.020000 76.845000 5.433000 150.0 1.231000 1.269000 22.525000 0.441000

195 rows × 67 columns

In [19]:
size = final_df.shape[1]
corr = final_df.corr()
plt.subplots(figsize=(20,20))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, xticklabels=range(size), yticklabels=range(size))
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f96dd44db50>

PCA of the Final Dataset

In [20]:
x = final_df.loc[:, final_df.columns].values
print(x)
x = StandardScaler().fit_transform(x)

print(final_df.shape)
print(x.shape)
[[26.08       13.22       36.92       ...  1.308      29.77273216
   0.36028022]
 [30.232      23.133      38.056      ...  0.906      31.733
   0.397     ]
 [93.174      93.7        92.497      ...  0.799      10.893
   0.708     ]
 ...
 [75.478      74.977      78.207      ...  1.161      31.163
   0.468     ]
 [44.44       38.488      54.068      ...  1.079      30.592
   0.401     ]
 [64.935      59.792      70.783      ...  1.269      22.525
   0.441     ]]
(195, 67)
(195, 67)
In [21]:
np.mean(x), np.std(x)
Out[21]:
(-1.713134035701734e-17, 1.0)
In [22]:
feat_cols = final_df.columns.values.tolist()
#print(feat_cols)
normalized_final_df = pd.DataFrame(x, columns=feat_cols)
normalized_final_df.head()
Out[22]:
Population with at least some secondary education (% ages 25 and older) Population with at least some secondary education, female (% ages 25 and older) Population with at least some secondary education, male (% ages 25 and older) Share of seats in parliament (% held by women) Vulnerable employment (% of total employment) Urban population (%) Labour force participation rate (% ages 15 and older), female Labour force participation rate (% ages 15 and older), male Remittances, inflows (% of GDP) Foreign direct investment, net inflows (% of GDP) ... Population under age 5 (millions) Adolescent birth rate (births per 1,000 women ages 15-19) Sex ratio at birth (male to female births) Young age (0-14) dependency ratio (per 100 people ages 15-64) Old-age (65 and older) dependency ratio (per 100 people ages 15-64) HDI rank Total unemployment rate (female to male ratio) Youth unemployment rate (female to male ratio) Coefficient of human inequality Inequality-adjusted HDI (IHDI)
0 -1.242745 -1.537407 -0.972184 0.366696 1.554740 -1.443911 -2.035680 0.233312 -0.027719 -0.441756 ... 0.225085 0.534851 0.462524 1.513894 -0.950705 1.364735 -0.115960 -0.049773 1.192044 -1.315801
1 -1.094493 -1.202607 -0.930083 0.601335 1.044887 0.299618 1.612659 0.733974 -0.796618 -0.915527 ... 0.227833 2.593772 -1.216161 2.161911 -0.999777 0.976870 -0.308338 -0.485075 1.408239 -1.110134
2 1.152920 1.180706 1.087538 0.559447 0.556868 0.083835 -0.355562 -0.953924 0.835683 0.386758 ... -0.259851 -0.709932 2.141209 -0.932226 0.730564 -0.482239 -0.372276 -0.600939 -0.890168 0.631779
3 0.408555 0.430388 0.377085 2.000054 -0.832710 1.240433 0.155448 -0.052027 -0.508438 -0.105242 ... -0.906248 -0.827685 0.381230 -0.952069 0.859705 -1.091741 0.128707 0.062921 -0.981317 1.074191
4 -0.133071 0.014194 -0.308501 1.442235 -0.595953 1.413060 -0.087393 0.006454 -0.776802 -0.307092 ... 0.056909 0.378985 -0.656599 -0.324548 0.390002 -0.907043 -0.191780 -0.068182 -0.631542 0.749400

5 rows × 67 columns

In [23]:
num_components = 3
pca_final = PCA(n_components=num_components)
pComponents_final = pca_final.fit_transform(x)
component_col = ['PC'+str(i+1) for i in range(num_components)]
print(component_col)

percentage_list = [element * 100 for element in pca_final.explained_variance_ratio_]
percentage_list = ['%.2f' % elem for elem in percentage_list]
print(percentage_list)
['PC1', 'PC2', 'PC3']
['49.73', '7.93', '7.18']
In [24]:
pc_final_df = pd.DataFrame(data = pComponents_final, columns = component_col)
print(pc_final_df.shape)
pc_final_df.head()
(195, 3)
Out[24]:
PC1 PC2 PC3
0 8.660925 3.855080 -0.080563
1 7.369314 -2.616149 0.028787
2 -3.287324 2.085875 -0.449373
3 -5.811149 -0.738707 -0.166838
4 -3.917583 0.676241 -0.271113
In [25]:
print('Explained variation percentage per principal component: {}'.format(percentage_list))
total_explained_percentage = (sum(pca_final.explained_variance_ratio_)*100)
print('Total percentage of the explained data by',pca_final.n_components,'components is: %.2f' %total_explained_percentage)
print('Percentage of the information that is lost for using',pca_final.n_components,'components is: %.2f' %(100-total_explained_percentage))
Explained variation percentage per principal component: ['49.73', '7.93', '7.18']
Total percentage of the explained data by 3 components is: 64.84
Percentage of the information that is lost for using 3 components is: 35.16

3 Main Principle Component is presented

In [26]:
l_dict = {}
for i in range(len(percentage_list)):
    l_dict[str(i)] = 'PC'+str(i+1)+' '+str(percentage_list[i])+'%'

print(l_dict)

fig = px.scatter_3d(
    pComponents_final, x=0, y=1, z=2,
    title=f'Total Explained Variance: {total_explained_percentage:.2f}%',
    labels=l_dict
)

fig.show()
{'0': 'PC1 49.73%', '1': 'PC2 7.93%', '2': 'PC3 7.18%'}

Conversion of the Dataset to CSV

In [29]:
final_df.to_csv("../data/unlabeled/preprocessed/hdro_preprocessed.csv")