In [2]:
import pandas as pd
import sklearn
from helpers import *
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
from data.unlabeled.raw import edstats_co as df_c, edstats_da as df_d, edstats_se as df_s
import seaborn as sns
..\data\unlabeled\raw\__init__.py:41: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
  aquastat_eah = pd.read_csv(aquastat_eah_path, skipfooter=8)
..\data\unlabeled\raw\__init__.py:42: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
  aquastat_wr = pd.read_csv(aquastat_wr_path, skipfooter=8)
..\data\unlabeled\raw\__init__.py:43: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
  aquastat_wu = pd.read_csv(aquastat_wu_path, skipfooter=8)

Remember to extract Edstats_csv.zip to Edstats_csv

In [3]:
df_d.columns
Out[3]:
Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978',
       '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987',
       '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996',
       '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '2020', '2025', '2030', '2035', '2040', '2045',
       '2050', '2055', '2060', '2065', '2070', '2075', '2080', '2085', '2090',
       '2095', '2100', 'Unnamed: 69'],
      dtype='object')
In [4]:
years = ['2015', '2016', '2017', '2020']
stripped_df_d = df_d[['Country Code','Indicator Code',*years]]
print("Non na values in col 2020:",stripped_df_d['2020'].count())
stripped_df_d
Non na values in col 2020: 51436
Out[4]:
Country Code Indicator Code 2015 2016 2017 2020
0 ARB UIS.NERA.2 NaN NaN NaN NaN
1 ARB UIS.NERA.2.F NaN NaN NaN NaN
2 ARB UIS.NERA.2.GPI NaN NaN NaN NaN
3 ARB UIS.NERA.2.M NaN NaN NaN NaN
4 ARB SE.PRM.TENR NaN NaN NaN NaN
... ... ... ... ... ... ...
886925 ZWE UIS.LP.AG15T24.M NaN NaN NaN NaN
886926 ZWE SE.ADT.1524.LT.ZS NaN NaN NaN NaN
886927 ZWE SE.ADT.1524.LT.FE.ZS NaN NaN NaN NaN
886928 ZWE SE.ADT.1524.LT.FM.ZS NaN NaN NaN NaN
886929 ZWE SE.ADT.1524.LT.MA.ZS NaN NaN NaN NaN

886930 rows × 6 columns

Merge colums to remove null values

In [5]:
for year in years[:-1]:
    fill_missing_with_column(stripped_df_d, '2020',year)
print("Non na values in col 2020:", stripped_df_d['2020'].count())
stripped_df_d
Non na values in col 2020: 132991
c:\Users\joach\code-projects\WaterSecurity\unlabeled_preprocessing\helpers.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[into] = df[into].combine_first(df[fro])
C:\Users\joach\.conda\envs\wsenv\lib\site-packages\pandas\core\frame.py:4308: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
Out[5]:
Country Code Indicator Code 2020
0 ARB UIS.NERA.2 NaN
1 ARB UIS.NERA.2.F NaN
2 ARB UIS.NERA.2.GPI NaN
3 ARB UIS.NERA.2.M NaN
4 ARB SE.PRM.TENR NaN
... ... ... ...
886925 ZWE UIS.LP.AG15T24.M NaN
886926 ZWE SE.ADT.1524.LT.ZS NaN
886927 ZWE SE.ADT.1524.LT.FE.ZS NaN
886928 ZWE SE.ADT.1524.LT.FM.ZS NaN
886929 ZWE SE.ADT.1524.LT.MA.ZS NaN

886930 rows × 3 columns

In [6]:
df_d_withseries = stripped_df_d.merge(df_s, how='left', left_on='Indicator Code', right_on='Series Code')
df_d_withseries.drop(['Indicator Code'], inplace=True, axis=1)
In [7]:
df_d_withseries.columns
Out[7]:
Index(['Country Code', '2020', 'Series Code', 'Topic', 'Indicator Name',
       'Short definition', 'Long definition', 'Unit of measure', 'Periodicity',
       'Base Period', 'Other notes', 'Aggregation method',
       'Limitations and exceptions', 'Notes from original source',
       'General comments', 'Source', 'Statistical concept and methodology',
       'Development relevance', 'Related source links', 'Other web links',
       'Related indicators', 'License Type', 'Unnamed: 20'],
      dtype='object')

Rearange table to indicator on column

In [8]:
countryXindicator = df_d_withseries.pivot_table('2020',['Country Code'],'Indicator Name')
countryXindicator
Out[8]:
Indicator Name Adjusted net enrolment rate, lower secondary, both sexes (%) Adjusted net enrolment rate, lower secondary, female (%) Adjusted net enrolment rate, lower secondary, gender parity index (GPI) Adjusted net enrolment rate, lower secondary, male (%) Adjusted net enrolment rate, primary, both sexes (%) Adjusted net enrolment rate, primary, female (%) Adjusted net enrolment rate, primary, gender parity index (GPI) Adjusted net enrolment rate, primary, male (%) Adjusted net enrolment rate, upper secondary, both sexes (%) Adjusted net enrolment rate, upper secondary, female (%) ... Under-age enrolment ratio in secondary education, female (%) Under-age enrolment ratio in secondary education, male (%) Unemployment, female (% of female labor force) (modeled ILO estimate) Unemployment, male (% of male labor force) (modeled ILO estimate) Unemployment, total (% of total labor force) (modeled ILO estimate) Youth illiterate population, 15-24 years, % female Youth literacy rate, population 15-24 years, both sexes (%) Youth literacy rate, population 15-24 years, female (%) Youth literacy rate, population 15-24 years, gender parity index (GPI) Youth literacy rate, population 15-24 years, male (%)
Country Code
ABW NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
AFG NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 12.700000 7.7 8.600000 NaN NaN NaN NaN NaN
AGO NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 6.700000 5.7 6.200000 NaN NaN NaN NaN NaN
ALB NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 17.299999 17.0 17.100000 NaN NaN NaN NaN NaN
AND NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN 47.80890 100.00000 100.00000 1.00000 100.00000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
XKX NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
YEM NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 32.700001 11.1 16.700001 NaN NaN NaN NaN NaN
ZAF NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 27.700001 23.1 25.200001 34.34808 98.95578 99.22904 1.00552 98.68459
ZMB NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 8.000000 7.3 7.700000 NaN NaN NaN NaN NaN
ZWE NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 5.100000 5.1 5.100000 NaN NaN NaN NaN NaN

241 rows × 2047 columns

Find columns that contain "number", "Projection" or "$"

In [9]:
projection_col = [colname for colname in countryXindicator.columns if "Projection" in colname]
number = [colname for colname in countryXindicator.columns if "number" in colname]
dollar = [colname for colname in countryXindicator.columns if "$" in colname]

And remove them

In [10]:
countryXindicator.drop(projection_col + number+ dollar,inplace=True,axis=1)
countryXindicator.shape
Out[10]:
(241, 1502)

Remove population data

In [11]:
countryXindicator.drop(['Population, total'], axis=1, inplace=True)

See how much of the data is missing and drop columns where more than 50% of the values are null

In [12]:
print_missing_percentages(countryXindicator)
Max, min and mean number of missing values for the columns
Max: 99.5850622406639 %
Min: 3.7344398340248963 %
Mean: 81.31204369977415 %
Out[12]:
(3.7344398340248963, 99.5850622406639)
In [13]:
dropColumnHalf(countryXindicator)
In [14]:
print_missing_percentages(countryXindicator)
Max, min and mean number of missing values for the columns
Max: 49.79253112033195 %
Min: 3.7344398340248963 %
Mean: 26.60212079299217 %
Out[14]:
(3.7344398340248963, 49.79253112033195)
In [15]:
countryXindicator.shape
Out[15]:
(241, 243)
In [16]:
 countryXindicator
Out[16]:
Indicator Name Adjusted net enrolment rate, primary, both sexes (%) Age population, age 0, female, UNESCO Age population, age 0, total, UNESCO Age population, age 01, female, UNESCO Age population, age 01, total, UNESCO Age population, age 02, female, UNESCO Age population, age 02, total, UNESCO Age population, age 03, female, UNESCO Age population, age 03, total, UNESCO Age population, age 04, female, UNESCO ... Prevalence of HIV, total (% of population ages 15-49) Primary completion rate, both sexes (%) Primary completion rate, female (%) Primary completion rate, male (%) Theoretical duration of primary education (years) Theoretical duration of secondary education (years) Theoretical duration of upper secondary education (years) Unemployment, female (% of female labor force) (modeled ILO estimate) Unemployment, male (% of male labor force) (modeled ILO estimate) Unemployment, total (% of total labor force) (modeled ILO estimate)
Country Code
ABW NaN 542.0 1112.0 572.0 1170.0 600.0 1224.0 623.0 1269.0 643.0 ... NaN NaN NaN NaN 6.0 5.0 3.0 NaN NaN NaN
AFG NaN 682677.0 1403010.0 650389.0 1338500.0 620880.0 1279403.0 593957.0 1225345.0 569425.0 ... 0.1 NaN NaN NaN 6.0 6.0 3.0 12.700000 7.7 8.600000
AGO NaN 414919.0 832093.0 396537.0 793950.0 379705.0 759164.0 364295.0 727455.0 350180.0 ... 1.9 NaN NaN NaN 6.0 6.0 3.0 6.700000 5.7 6.200000
ALB NaN 23697.0 49105.0 24026.0 49709.0 24243.0 50104.0 24360.0 50315.0 24397.0 ... 0.1 106.367561 104.699371 107.900124 5.0 7.0 3.0 17.299999 17.0 17.100000
AND NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 6.0 6.0 2.0 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
XKX NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
YEM NaN 458859.0 936053.0 452601.0 923017.0 444988.0 907252.0 436223.0 889171.0 426514.0 ... 0.1 NaN NaN NaN 6.0 6.0 3.0 32.700001 11.1 16.700001
ZAF NaN 484535.0 980663.0 486901.0 985215.0 489516.0 990194.0 492299.0 995445.0 495165.0 ... 18.9 NaN NaN NaN 7.0 5.0 3.0 27.700001 23.1 25.200001
ZMB NaN 226616.0 457329.0 220950.0 445451.0 215890.0 434876.0 211367.0 425454.0 207311.0 ... 12.6 NaN NaN NaN 7.0 5.0 3.0 8.000000 7.3 7.700000
ZWE NaN 181422.0 364761.0 179761.0 361101.0 177982.0 357261.0 176130.0 353321.0 174240.0 ... 13.9 NaN NaN NaN 7.0 6.0 4.0 5.100000 5.1 5.100000

241 rows × 243 columns

Split dataset into a dataframe containing year-data and a dataframe for the float-precentages-data

This i done for better imputation results

In [17]:
int_cols = find_all_integer_columns(countryXindicator)
In [18]:
countryXindicator_float = countryXindicator.drop(int_cols, axis=1)
print(countryXindicator_float.index.to_list())
['ABW', 'AFG', 'AGO', 'ALB', 'AND', 'ARB', 'ARE', 'ARG', 'ARM', 'ASM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHI', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CUW', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'EAP', 'EAS', 'ECA', 'ECS', 'ECU', 'EGY', 'EMU', 'ERI', 'ESP', 'EST', 'ETH', 'EUU', 'FIN', 'FJI', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO', 'GHA', 'GIB', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUM', 'GUY', 'HIC', 'HKG', 'HND', 'HPC', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAC', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LCN', 'LDC', 'LIC', 'LIE', 'LKA', 'LMC', 'LMY', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV', 'MEA', 'MEX', 'MHL', 'MIC', 'MKD', 'MLI', 'MLT', 'MMR', 'MNA', 'MNE', 'MNG', 'MNP', 'MOZ', 'MRT', 'MUS', 'MWI', 'MYS', 'NAC', 'NAM', 'NCL', 'NER', 'NGA', 'NIC', 'NLD', 'NOR', 'NPL', 'NRU', 'NZL', 'OED', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', 'PSE', 'PYF', 'QAT', 'ROU', 'RUS', 'RWA', 'SAS', 'SAU', 'SDN', 'SEN', 'SGP', 'SLB', 'SLE', 'SLV', 'SMR', 'SOM', 'SRB', 'SSA', 'SSD', 'SSF', 'STP', 'SUR', 'SVK', 'SVN', 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', 'TCD', 'TGO', 'THA', 'TJK', 'TKM', 'TLS', 'TON', 'TTO', 'TUN', 'TUR', 'TUV', 'TZA', 'UGA', 'UKR', 'UMC', 'URY', 'USA', 'UZB', 'VCT', 'VEN', 'VGB', 'VIR', 'VNM', 'VUT', 'WLD', 'WSM', 'XKX', 'YEM', 'ZAF', 'ZMB', 'ZWE']
In [19]:
countryXindicator_year = countryXindicator[int_cols]
countryXindicator_year
Out[19]:
Indicator Name Age population, age 0, female, UNESCO Age population, age 0, total, UNESCO Age population, age 01, female, UNESCO Age population, age 01, total, UNESCO Age population, age 02, female, UNESCO Age population, age 02, total, UNESCO Age population, age 03, female, UNESCO Age population, age 03, total, UNESCO Age population, age 04, female, UNESCO Age population, age 04, total, UNESCO ... Population, ages 7-13, male Population, ages 7-13, total Population, ages 7-9, female Population, ages 7-9, male Population, ages 7-9, total Population, female Population, male Theoretical duration of primary education (years) Theoretical duration of secondary education (years) Theoretical duration of upper secondary education (years)
Country Code
ABW 542.0 1112.0 572.0 1170.0 600.0 1224.0 623.0 1269.0 643.0 1310.0 ... 5152.0 10124.0 2095.0 2169.0 4264.0 54743.0 49598.0 6.0 5.0 3.0
AFG 682677.0 1403010.0 650389.0 1338500.0 620880.0 1279403.0 593957.0 1225345.0 569425.0 1175957.0 ... 3481981.0 6728457.0 1475542.0 1580498.0 3056040.0 16346869.0 17389625.0 6.0 6.0 3.0
AGO 414919.0 832093.0 396537.0 793950.0 379705.0 759164.0 364295.0 727455.0 350180.0 698548.0 ... 1988558.0 3995534.0 913109.0 904577.0 1817686.0 14205741.0 13653564.0 6.0 6.0 3.0
ALB 23697.0 49105.0 24026.0 49709.0 24243.0 50104.0 24360.0 50315.0 24397.0 50378.0 ... 179160.0 346732.0 72246.0 77107.0 149353.0 1426369.0 1454334.0 5.0 7.0 3.0
AND NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 6.0 6.0 2.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
XKX NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
YEM 458859.0 936053.0 452601.0 923017.0 444988.0 907252.0 436223.0 889171.0 426514.0 869195.0 ... 2614331.0 5137180.0 1147011.0 1189006.0 2336017.0 13315678.0 13600529.0 6.0 6.0 3.0
ZAF 484535.0 980663.0 486901.0 985215.0 489516.0 990194.0 492299.0 995445.0 495165.0 1000810.0 ... 3615273.0 7175459.0 1516934.0 1543083.0 3060017.0 27999778.0 27012199.0 7.0 5.0 3.0
ZMB 226616.0 457329.0 220950.0 445451.0 215890.0 434876.0 211367.0 425454.0 207311.0 417041.0 ... 1334582.0 2657396.0 583300.0 588617.0 1171917.0 8112243.0 7988344.0 7.0 5.0 3.0
ZWE 181422.0 364761.0 179761.0 361101.0 177982.0 357261.0 176130.0 353321.0 174240.0 349351.0 ... 1156372.0 2310111.0 501606.0 502943.0 1004549.0 8099354.0 7678097.0 7.0 6.0 4.0

241 rows × 195 columns

In [20]:
countryXindicator_year = countryXindicator_year[[colname for colname in countryXindicator_year.columns if "years" in colname]]
countryXindicator_year
Out[20]:
Indicator Name Duration of compulsory education (years) Official entrance age to lower secondary education (years) Official entrance age to primary education (years) Theoretical duration of primary education (years) Theoretical duration of secondary education (years) Theoretical duration of upper secondary education (years)
Country Code
ABW 13.0 12.0 6.0 6.0 5.0 3.0
AFG 9.0 13.0 7.0 6.0 6.0 3.0
AGO 6.0 12.0 6.0 6.0 6.0 3.0
ALB 9.0 11.0 6.0 5.0 7.0 3.0
AND 10.0 12.0 6.0 6.0 6.0 2.0
... ... ... ... ... ... ...
XKX NaN NaN NaN NaN NaN NaN
YEM 9.0 12.0 6.0 6.0 6.0 3.0
ZAF 9.0 14.0 7.0 7.0 5.0 3.0
ZMB 7.0 14.0 7.0 7.0 5.0 3.0
ZWE 7.0 13.0 6.0 7.0 6.0 4.0

241 rows × 6 columns

Impute both dataframes

In [21]:
imputed_countryXindicator_float = impute_df(countryXindicator_float, max_iter=100, verbose=2)
veImputer] Change: 2.532294563247444, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 19/100, elapsed time 4.17
[IterativeImputer] Change: 2.2119869217502828, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 20/100, elapsed time 4.38
[IterativeImputer] Change: 1.9202443611257713, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 21/100, elapsed time 4.59
[IterativeImputer] Change: 1.6606990496546086, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 22/100, elapsed time 4.82
[IterativeImputer] Change: 1.6559282826221846, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 23/100, elapsed time 5.04
[IterativeImputer] Change: 1.6520065848505072, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 24/100, elapsed time 5.26
[IterativeImputer] Change: 1.648782063252183, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 25/100, elapsed time 5.48
[IterativeImputer] Change: 1.6461831356141814, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 26/100, elapsed time 5.69
[IterativeImputer] Change: 1.644058054001174, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 27/100, elapsed time 5.90
[IterativeImputer] Change: 1.6423013079998188, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 28/100, elapsed time 6.12
[IterativeImputer] Change: 1.6408741216707456, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 29/100, elapsed time 6.33
[IterativeImputer] Change: 1.6396788922926566, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 30/100, elapsed time 6.55
[IterativeImputer] Change: 1.638655100233116, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 31/100, elapsed time 6.77
[IterativeImputer] Change: 1.6377695134752297, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 32/100, elapsed time 6.98
[IterativeImputer] Change: 1.6369872536064, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 33/100, elapsed time 7.19
[IterativeImputer] Change: 1.636221623251295, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 34/100, elapsed time 7.41
[IterativeImputer] Change: 1.6355083766661642, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 35/100, elapsed time 7.63
[IterativeImputer] Change: 1.634821858203819, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 36/100, elapsed time 7.85
[IterativeImputer] Change: 1.6341370631609828, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 37/100, elapsed time 8.07
[IterativeImputer] Change: 1.6334747186241456, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 38/100, elapsed time 8.28
[IterativeImputer] Change: 1.645652531918897, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 39/100, elapsed time 8.50
[IterativeImputer] Change: 1.652182711035818, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 40/100, elapsed time 8.72
[IterativeImputer] Change: 1.6532662904419548, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 41/100, elapsed time 8.94
[IterativeImputer] Change: 1.649451831600862, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 42/100, elapsed time 9.16
[IterativeImputer] Change: 1.6408222980663139, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 43/100, elapsed time 9.38
[IterativeImputer] Change: 1.6294400496414387, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 44/100, elapsed time 9.60
[IterativeImputer] Change: 1.6287678524729357, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 45/100, elapsed time 9.82
[IterativeImputer] Change: 1.6281047469396985, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 46/100, elapsed time 10.05
[IterativeImputer] Change: 1.6274834308429378, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 47/100, elapsed time 10.26
[IterativeImputer] Change: 1.6270130873718072, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 48/100, elapsed time 10.48
[IterativeImputer] Change: 1.6262940758389055, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 49/100, elapsed time 10.70
[IterativeImputer] Change: 1.6257297763508762, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 50/100, elapsed time 10.92
[IterativeImputer] Change: 1.6252298538321177, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 51/100, elapsed time 11.14
[IterativeImputer] Change: 1.624722930666042, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 52/100, elapsed time 11.35
[IterativeImputer] Change: 1.6242577689697895, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 53/100, elapsed time 11.57
[IterativeImputer] Change: 1.6238212014114874, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 54/100, elapsed time 11.79
[IterativeImputer] Change: 1.6234519318135652, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 55/100, elapsed time 12.01
[IterativeImputer] Change: 1.6231115358163457, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 56/100, elapsed time 12.23
[IterativeImputer] Change: 1.6228249713969374, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 57/100, elapsed time 12.44
[IterativeImputer] Change: 1.6225670613424206, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 58/100, elapsed time 12.66
[IterativeImputer] Change: 1.6222797867936543, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 59/100, elapsed time 12.89
[IterativeImputer] Change: 1.6221342960698018, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 60/100, elapsed time 13.10
[IterativeImputer] Change: 1.6219870713467206, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 61/100, elapsed time 13.31
[IterativeImputer] Change: 1.6218646076142422, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 62/100, elapsed time 13.53
[IterativeImputer] Change: 1.6217730363655758, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 63/100, elapsed time 13.75
[IterativeImputer] Change: 1.621725788928061, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 64/100, elapsed time 14.06
[IterativeImputer] Change: 1.6216920357302036, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 65/100, elapsed time 14.27
[IterativeImputer] Change: 1.621698990708198, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 66/100, elapsed time 14.50
[IterativeImputer] Change: 1.6217532944875095, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 67/100, elapsed time 14.72
[IterativeImputer] Change: 1.6218227224444455, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 68/100, elapsed time 14.93
[IterativeImputer] Change: 1.6218972339192113, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 69/100, elapsed time 15.15
[IterativeImputer] Change: 1.6232902366910664, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 70/100, elapsed time 15.37
[IterativeImputer] Change: 1.6249616723710512, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 71/100, elapsed time 15.59
[IterativeImputer] Change: 1.6261259356463242, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 72/100, elapsed time 15.81
[IterativeImputer] Change: 1.6281859862659571, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 73/100, elapsed time 16.03
[IterativeImputer] Change: 1.63006029323919, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 74/100, elapsed time 16.24
[IterativeImputer] Change: 1.6318311593012644, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 75/100, elapsed time 16.46
[IterativeImputer] Change: 1.6336233225534909, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 76/100, elapsed time 16.68
[IterativeImputer] Change: 1.6353960973093213, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 77/100, elapsed time 16.90
[IterativeImputer] Change: 1.637182751172711, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 78/100, elapsed time 17.12
[IterativeImputer] Change: 1.6389758111697363, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 79/100, elapsed time 17.34
[IterativeImputer] Change: 1.6407763042253765, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 80/100, elapsed time 17.55
[IterativeImputer] Change: 1.6425834310407716, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 81/100, elapsed time 17.77
[IterativeImputer] Change: 1.6443968858747002, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 82/100, elapsed time 17.99
[IterativeImputer] Change: 1.64625348689472, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 83/100, elapsed time 18.21
[IterativeImputer] Change: 1.6481429500945106, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 84/100, elapsed time 18.43
[IterativeImputer] Change: 1.6500518120410161, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 85/100, elapsed time 18.65
[IterativeImputer] Change: 1.6519548194561064, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 86/100, elapsed time 18.87
[IterativeImputer] Change: 1.6538680162736423, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 87/100, elapsed time 19.09
[IterativeImputer] Change: 1.6552679864808504, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 88/100, elapsed time 19.31
[IterativeImputer] Change: 1.6574626185722092, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 89/100, elapsed time 19.52
[IterativeImputer] Change: 1.6594748268725177, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 90/100, elapsed time 19.74
[IterativeImputer] Change: 1.6613749089305434, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 91/100, elapsed time 19.96
[IterativeImputer] Change: 1.6632615294758013, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 92/100, elapsed time 20.18
[IterativeImputer] Change: 1.6651267664591594, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 93/100, elapsed time 20.40
[IterativeImputer] Change: 1.6669733028122995, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 94/100, elapsed time 20.62
[IterativeImputer] Change: 1.6688041142649566, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 95/100, elapsed time 20.84
[IterativeImputer] Change: 1.6706225132059456, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 96/100, elapsed time 21.08
[IterativeImputer] Change: 1.672451264542782, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 97/100, elapsed time 21.30
[IterativeImputer] Change: 1.6742766711767634, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 98/100, elapsed time 21.53
[IterativeImputer] Change: 1.6761039010597747, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 99/100, elapsed time 21.74
[IterativeImputer] Change: 1.6778935337913161, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 100/100, elapsed time 21.96
[IterativeImputer] Change: 1.679678914280904, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Completing matrix with shape (241, 48)
[IterativeImputer] Ending imputation round 1/100, elapsed time 0.01
[IterativeImputer] Ending imputation round 2/100, elapsed time 0.02
[IterativeImputer] Ending imputation round 3/100, elapsed time 0.03
[IterativeImputer] Ending imputation round 4/100, elapsed time 0.04
[IterativeImputer] Ending imputation round 5/100, elapsed time 0.05
[IterativeImputer] Ending imputation round 6/100, elapsed time 0.06
[IterativeImputer] Ending imputation round 7/100, elapsed time 0.07
[IterativeImputer] Ending imputation round 8/100, elapsed time 0.07
[IterativeImputer] Ending imputation round 9/100, elapsed time 0.08
[IterativeImputer] Ending imputation round 10/100, elapsed time 0.09
[IterativeImputer] Ending imputation round 11/100, elapsed time 0.10
[IterativeImputer] Ending imputation round 12/100, elapsed time 0.11
[IterativeImputer] Ending imputation round 13/100, elapsed time 0.12
[IterativeImputer] Ending imputation round 14/100, elapsed time 0.12
[IterativeImputer] Ending imputation round 15/100, elapsed time 0.13
[IterativeImputer] Ending imputation round 16/100, elapsed time 0.14
[IterativeImputer] Ending imputation round 17/100, elapsed time 0.15
[IterativeImputer] Ending imputation round 18/100, elapsed time 0.16
[IterativeImputer] Ending imputation round 19/100, elapsed time 0.17
[IterativeImputer] Ending imputation round 20/100, elapsed time 0.18
[IterativeImputer] Ending imputation round 21/100, elapsed time 0.18
C:\Users\joach\.conda\envs\wsenv\lib\site-packages\sklearn\impute\_iterative.py:685: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
  warnings.warn("[IterativeImputer] Early stopping criterion not"
[IterativeImputer] Ending imputation round 22/100, elapsed time 0.20
[IterativeImputer] Ending imputation round 23/100, elapsed time 0.20
[IterativeImputer] Ending imputation round 24/100, elapsed time 0.21
[IterativeImputer] Ending imputation round 25/100, elapsed time 0.22
[IterativeImputer] Ending imputation round 26/100, elapsed time 0.23
[IterativeImputer] Ending imputation round 27/100, elapsed time 0.24
[IterativeImputer] Ending imputation round 28/100, elapsed time 0.24
[IterativeImputer] Ending imputation round 29/100, elapsed time 0.25
[IterativeImputer] Ending imputation round 30/100, elapsed time 0.26
[IterativeImputer] Ending imputation round 31/100, elapsed time 0.27
[IterativeImputer] Ending imputation round 32/100, elapsed time 0.28
[IterativeImputer] Ending imputation round 33/100, elapsed time 0.28
[IterativeImputer] Ending imputation round 34/100, elapsed time 0.29
[IterativeImputer] Ending imputation round 35/100, elapsed time 0.30
[IterativeImputer] Ending imputation round 36/100, elapsed time 0.31
[IterativeImputer] Ending imputation round 37/100, elapsed time 0.31
[IterativeImputer] Ending imputation round 38/100, elapsed time 0.32
[IterativeImputer] Ending imputation round 39/100, elapsed time 0.33
[IterativeImputer] Ending imputation round 40/100, elapsed time 0.34
[IterativeImputer] Ending imputation round 41/100, elapsed time 0.34
[IterativeImputer] Ending imputation round 42/100, elapsed time 0.35
[IterativeImputer] Ending imputation round 43/100, elapsed time 0.36
[IterativeImputer] Ending imputation round 44/100, elapsed time 0.37
[IterativeImputer] Ending imputation round 45/100, elapsed time 0.38
[IterativeImputer] Ending imputation round 46/100, elapsed time 0.38
[IterativeImputer] Ending imputation round 47/100, elapsed time 0.39
[IterativeImputer] Ending imputation round 48/100, elapsed time 0.40
[IterativeImputer] Ending imputation round 49/100, elapsed time 0.41
[IterativeImputer] Ending imputation round 50/100, elapsed time 0.42
[IterativeImputer] Ending imputation round 51/100, elapsed time 0.43
[IterativeImputer] Ending imputation round 52/100, elapsed time 0.43
[IterativeImputer] Ending imputation round 53/100, elapsed time 0.44
[IterativeImputer] Ending imputation round 54/100, elapsed time 0.45
[IterativeImputer] Ending imputation round 55/100, elapsed time 0.46
[IterativeImputer] Ending imputation round 56/100, elapsed time 0.47
[IterativeImputer] Ending imputation round 57/100, elapsed time 0.48
[IterativeImputer] Ending imputation round 58/100, elapsed time 0.49
[IterativeImputer] Ending imputation round 59/100, elapsed time 0.49
[IterativeImputer] Ending imputation round 60/100, elapsed time 0.50
[IterativeImputer] Ending imputation round 61/100, elapsed time 0.51
[IterativeImputer] Ending imputation round 62/100, elapsed time 0.52
[IterativeImputer] Ending imputation round 63/100, elapsed time 0.52
[IterativeImputer] Ending imputation round 64/100, elapsed time 0.53
[IterativeImputer] Ending imputation round 65/100, elapsed time 0.54
[IterativeImputer] Ending imputation round 66/100, elapsed time 0.55
[IterativeImputer] Ending imputation round 67/100, elapsed time 0.55
[IterativeImputer] Ending imputation round 68/100, elapsed time 0.56
[IterativeImputer] Ending imputation round 69/100, elapsed time 0.57
[IterativeImputer] Ending imputation round 70/100, elapsed time 0.58
[IterativeImputer] Ending imputation round 71/100, elapsed time 0.58
[IterativeImputer] Ending imputation round 72/100, elapsed time 0.59
[IterativeImputer] Ending imputation round 73/100, elapsed time 0.60
[IterativeImputer] Ending imputation round 74/100, elapsed time 0.61
[IterativeImputer] Ending imputation round 75/100, elapsed time 0.61
[IterativeImputer] Ending imputation round 76/100, elapsed time 0.62
[IterativeImputer] Ending imputation round 77/100, elapsed time 0.63
[IterativeImputer] Ending imputation round 78/100, elapsed time 0.64
[IterativeImputer] Ending imputation round 79/100, elapsed time 0.65
[IterativeImputer] Ending imputation round 80/100, elapsed time 0.66
[IterativeImputer] Ending imputation round 81/100, elapsed time 0.66
[IterativeImputer] Ending imputation round 82/100, elapsed time 0.67
[IterativeImputer] Ending imputation round 83/100, elapsed time 0.68
[IterativeImputer] Ending imputation round 84/100, elapsed time 0.69
[IterativeImputer] Ending imputation round 85/100, elapsed time 0.70
[IterativeImputer] Ending imputation round 86/100, elapsed time 0.70
[IterativeImputer] Ending imputation round 87/100, elapsed time 0.71
[IterativeImputer] Ending imputation round 88/100, elapsed time 0.72
[IterativeImputer] Ending imputation round 89/100, elapsed time 0.73
[IterativeImputer] Ending imputation round 90/100, elapsed time 0.73
[IterativeImputer] Ending imputation round 91/100, elapsed time 0.74
[IterativeImputer] Ending imputation round 92/100, elapsed time 0.75
[IterativeImputer] Ending imputation round 93/100, elapsed time 0.76
[IterativeImputer] Ending imputation round 94/100, elapsed time 0.76
[IterativeImputer] Ending imputation round 95/100, elapsed time 0.77
[IterativeImputer] Ending imputation round 96/100, elapsed time 0.78
[IterativeImputer] Ending imputation round 97/100, elapsed time 0.79
[IterativeImputer] Ending imputation round 98/100, elapsed time 0.79
[IterativeImputer] Ending imputation round 99/100, elapsed time 0.80
[IterativeImputer] Ending imputation round 100/100, elapsed time 0.81
In [22]:
imputed_countryXindicator_year = impute_df(countryXindicator_year, max_iter=100, verbose=2)
[IterativeImputer] Completing matrix with shape (241, 6)
[IterativeImputer] Ending imputation round 1/100, elapsed time 0.01
[IterativeImputer] Change: 1.6443802657682207, scaled tolerance: 0.015 
[IterativeImputer] Ending imputation round 2/100, elapsed time 0.02
[IterativeImputer] Change: 0.0830316620808409, scaled tolerance: 0.015 
[IterativeImputer] Ending imputation round 3/100, elapsed time 0.04
[IterativeImputer] Change: 0.0289166641679115, scaled tolerance: 0.015 
[IterativeImputer] Ending imputation round 4/100, elapsed time 0.05
[IterativeImputer] Change: 0.009636184792338298, scaled tolerance: 0.015 
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (241, 6)
[IterativeImputer] Ending imputation round 1/4, elapsed time 0.00
[IterativeImputer] Ending imputation round 2/4, elapsed time 0.00
[IterativeImputer] Ending imputation round 3/4, elapsed time 0.00
[IterativeImputer] Ending imputation round 4/4, elapsed time 0.01

Convert years float to int

In [23]:
imputed_countryXindicator_year = imputed_countryXindicator_year.round(0).astype(int)

Comparison correlation plot float-precentages-data

Before Imputation

In [24]:
corr_calc = countryXindicator_float.corr()
sns.heatmap(corr_calc, vmin=-1, vmax=1, center=0, xticklabels=False, yticklabels=False, cmap='mako')
Out[24]:
<AxesSubplot:xlabel='Indicator Name', ylabel='Indicator Name'>
2021-05-12T09:40:53.408432 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/

After imputation

In [25]:
corr_calc = imputed_countryXindicator_float.corr()
sns.heatmap(corr_calc, vmin=-1, vmax=1, center=0, xticklabels=False, yticklabels=False, cmap='mako')
Out[25]:
<AxesSubplot:xlabel='Indicator Name', ylabel='Indicator Name'>
2021-05-12T09:40:53.973519 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/

Comparison correlation plot yeardata

Before imputation

In [26]:
corr_calc = countryXindicator_year.corr()
sns.heatmap(corr_calc, vmin=-1, vmax=1, center=0, cmap='mako')
Out[26]:
<AxesSubplot:xlabel='Indicator Name', ylabel='Indicator Name'>
2021-05-12T09:40:54.639592 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/

After imputation

In [27]:
corr_calc = imputed_countryXindicator_year.corr()
sns.heatmap(corr_calc, vmin=-1, vmax=1, center=0, cmap='mako')
Out[27]:
<AxesSubplot:xlabel='Indicator Name', ylabel='Indicator Name'>
2021-05-12T09:40:55.089503 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/

Merge imputed dataframes

In [28]:
imputed_data = imputed_countryXindicator_year.merge(imputed_countryXindicator_float, how='inner', on="Country Code")
imputed_data
Out[28]:
Indicator Name Duration of compulsory education (years) Official entrance age to lower secondary education (years) Official entrance age to primary education (years) Theoretical duration of primary education (years) Theoretical duration of secondary education (years) Theoretical duration of upper secondary education (years) Adjusted net enrolment rate, primary, both sexes (%) Gross enrolment ratio, lower secondary, both sexes (%) Gross enrolment ratio, lower secondary, female (%) Gross enrolment ratio, lower secondary, male (%) ... Population growth (annual %) Population, female (% of total) Population, male (% of total) Prevalence of HIV, total (% of population ages 15-49) Primary completion rate, both sexes (%) Primary completion rate, female (%) Primary completion rate, male (%) Unemployment, female (% of female labor force) (modeled ILO estimate) Unemployment, male (% of male labor force) (modeled ILO estimate) Unemployment, total (% of total labor force) (modeled ILO estimate)
Country Code
ABW 13 12 6 6 5 3 96.636541 91.072857 90.689076 91.406452 ... 0.524658 52.465521 47.534479 1.416423 91.125743 90.752535 91.425942 10.349564 8.746525 9.324957
AFG 9 13 7 6 6 3 87.416874 67.447617 49.580441 84.329559 ... 2.943234 48.454558 51.545442 0.100000 93.312781 88.498878 97.920359 12.700000 7.700000 8.600000
AGO 6 12 6 6 6 3 87.446703 90.903513 90.527003 91.325392 ... 3.428021 50.991010 49.008990 1.900000 92.486172 92.930030 92.143730 6.700000 5.700000 6.200000
ALB 9 11 6 5 7 3 99.516937 101.488373 99.889503 102.980438 ... -0.291206 49.514599 50.485401 0.100000 106.367561 104.699371 107.900124 17.299999 17.000000 17.100000
AND 10 12 6 6 6 2 94.592438 91.281081 91.067359 91.467822 ... -1.537836 49.937398 50.062602 -0.301099 90.510694 89.175083 91.671974 14.798726 11.023894 11.875731
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
XKX 10 12 6 6 6 3 95.413339 91.084565 90.721799 91.429833 ... -1.103886 49.937397 50.062603 -0.423033 91.464884 91.250304 91.619722 9.662467 9.205525 9.289344
YEM 9 12 6 6 6 3 86.150744 90.902881 90.439904 91.274501 ... 2.520254 49.470856 50.529144 0.100000 92.609481 92.837224 92.326815 32.700001 11.100000 16.700001
ZAF 9 14 7 7 5 3 96.505376 91.062603 90.916364 91.240835 ... 1.585325 50.897604 49.102396 18.900000 91.111444 91.824840 90.441719 27.700001 23.100000 25.200001
ZMB 7 14 7 7 5 3 93.181744 91.012954 90.829133 91.267250 ... 3.024123 50.384766 49.615234 12.600000 91.406170 91.905970 90.976530 8.000000 7.300000 7.700000
ZWE 7 13 6 7 6 4 94.625610 91.047967 90.927321 91.251754 ... 2.345643 51.334997 48.665003 13.900000 91.325104 91.913467 90.806284 5.100000 5.100000 5.100000

241 rows × 54 columns

In [29]:
bigtable = imputed_data.merge(df_c, how='left', on="Country Code")
bigtable.columns
Out[29]:
Index(['Country Code', 'Duration of compulsory education (years)',
       'Official entrance age to lower secondary education (years)',
       'Official entrance age to primary education (years)',
       'Theoretical duration of primary education (years)',
       'Theoretical duration of secondary education (years)',
       'Theoretical duration of upper secondary education (years)',
       'Adjusted net enrolment rate, primary, both sexes (%)',
       'Gross enrolment ratio, lower secondary, both sexes (%)',
       'Gross enrolment ratio, lower secondary, female (%)',
       'Gross enrolment ratio, lower secondary, male (%)',
       'Gross enrolment ratio, pre-primary, both sexes (%)',
       'Gross enrolment ratio, pre-primary, female (%)',
       'Gross enrolment ratio, pre-primary, male (%)',
       'Gross enrolment ratio, primary, both sexes (%)',
       'Gross enrolment ratio, primary, female (%)',
       'Gross enrolment ratio, primary, gender parity index (GPI)',
       'Gross enrolment ratio, primary, male (%)',
       'Gross enrolment ratio, secondary, both sexes (%)',
       'Gross enrolment ratio, secondary, female (%)',
       'Gross enrolment ratio, secondary, gender parity index (GPI)',
       'Gross enrolment ratio, secondary, male (%)',
       'Gross enrolment ratio, upper secondary, both sexes (%)',
       'Gross intake ratio to Grade 1 of primary education, both sexes (%)',
       'Gross intake ratio to Grade 1 of primary education, female (%)',
       'Gross intake ratio to Grade 1 of primary education, male (%)',
       'Internet users (per 100 people)',
       'Labor force, female (% of total labor force)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Net enrolment rate, primary, both sexes (%)',
       'Percentage of enrolment in pre-primary education in private institutions (%)',
       'Percentage of enrolment in primary education in private institutions (%)',
       'Percentage of enrolment in secondary education in private institutions (%)',
       'Percentage of female students enrolled in primary education who are over-age, female (%)',
       'Percentage of male students enrolled in primary education who are over-age, male (%)',
       'Percentage of repeaters in primary education, all grades, both sexes (%)',
       'Percentage of repeaters in primary education, all grades, female (%)',
       'Percentage of repeaters in primary education, all grades, male (%)',
       'Percentage of students enrolled in primary education who are over-age, both sexes (%)',
       'Percentage of students in pre-primary education who are female (%)',
       'Percentage of students in primary education who are female (%)',
       'Percentage of students in secondary education who are female (%)',
       'Percentage of students in secondary general education who are female (%)',
       'Population ages 0-14 (% of total)',
       'Population ages 15-64 (% of total)', 'Population growth (annual %)',
       'Population, female (% of total)', 'Population, male (% of total)',
       'Prevalence of HIV, total (% of population ages 15-49)',
       'Primary completion rate, both sexes (%)',
       'Primary completion rate, female (%)',
       'Primary completion rate, male (%)',
       'Unemployment, female (% of female labor force) (modeled ILO estimate)',
       'Unemployment, male (% of male labor force) (modeled ILO estimate)',
       'Unemployment, total (% of total labor force) (modeled ILO estimate)',
       'Short Name', 'Table Name', 'Long Name', '2-alpha code',
       'Currency Unit', 'Special Notes', 'Region', 'Income Group', 'WB-2 code',
       'National accounts base year', 'National accounts reference year',
       'SNA price valuation', 'Lending category', 'Other groups',
       'System of National Accounts', 'Alternative conversion factor',
       'PPP survey year', 'Balance of Payments Manual in use',
       'External debt Reporting status', 'System of trade',
       'Government Accounting concept', 'IMF data dissemination standard',
       'Latest population census', 'Latest household survey',
       'Source of most recent Income and expenditure data',
       'Vital registration complete', 'Latest agricultural census',
       'Latest industrial data', 'Latest trade data',
       'Latest water withdrawal data', 'Unnamed: 31'],
      dtype='object')
In [33]:
bigtable.set_index('Country Code', inplace=True)
In [34]:
bigtable
Out[34]:
Duration of compulsory education (years) Official entrance age to lower secondary education (years) Official entrance age to primary education (years) Theoretical duration of primary education (years) Theoretical duration of secondary education (years) Theoretical duration of upper secondary education (years) Adjusted net enrolment rate, primary, both sexes (%) Gross enrolment ratio, lower secondary, both sexes (%) Gross enrolment ratio, lower secondary, female (%) Gross enrolment ratio, lower secondary, male (%) ... IMF data dissemination standard Latest population census Latest household survey Source of most recent Income and expenditure data Vital registration complete Latest agricultural census Latest industrial data Latest trade data Latest water withdrawal data Unnamed: 31
Country Code
ABW 13 12 6 6 5 3 96.636541 91.072857 90.689076 91.406452 ... NaN 2010 NaN NaN Yes NaN NaN 2012.0 NaN NaN
AFG 9 13 7 6 6 3 87.416874 67.447617 49.580441 84.329559 ... General Data Dissemination System (GDDS) 1979 Multiple Indicator Cluster Survey (MICS), 2010/11 Integrated household survey (IHS), 2008 NaN 2013/14 NaN 2012.0 2000 NaN
AGO 6 12 6 6 6 3 87.446703 90.903513 90.527003 91.325392 ... General Data Dissemination System (GDDS) 1970 Malaria Indicator Survey (MIS), 2011 Integrated household survey (IHS), 2008 NaN 2015 NaN NaN 2005 NaN
ALB 9 11 6 5 7 3 99.516937 101.488373 99.889503 102.980438 ... General Data Dissemination System (GDDS) 2011 Demographic and Health Survey (DHS), 2008/09 Living Standards Measurement Study Survey (LSM... Yes 2012 2010.0 2012.0 2006 NaN
AND 10 12 6 6 6 2 94.592438 91.281081 91.067359 91.467822 ... NaN 2011. Population figures compiled from adminis... NaN NaN Yes NaN NaN 2006.0 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
XKX 10 12 6 6 6 3 95.413339 91.084565 90.721799 91.429833 ... General Data Dissemination System (GDDS) 2011 NaN Integrated household survey (IHS), 2011 NaN NaN NaN NaN NaN NaN
YEM 9 12 6 6 6 3 86.150744 90.902881 90.439904 91.274501 ... General Data Dissemination System (GDDS) 2004 Demographic and Health Survey (DHS), 2013 Expenditure survey/budget survey (ES/BS), 2005 NaN NaN 2006.0 2012.0 2005 NaN
ZAF 9 14 7 7 5 3 96.505376 91.062603 90.916364 91.240835 ... Special Data Dissemination Standard (SDDS) 2011 Demographic and Health Survey (DHS), 2003; Wor... Expenditure survey/budget survey (ES/BS), 2010 NaN 2007 2010.0 2012.0 2000 NaN
ZMB 7 14 7 7 5 3 93.181744 91.012954 90.829133 91.267250 ... General Data Dissemination System (GDDS) 2010 Demographic and Health Survey (DHS), 2013 Integrated household survey (IHS), 2010 NaN 2010. Population and Housing Census. NaN 2011.0 2002 NaN
ZWE 7 13 6 7 6 4 94.625610 91.047967 90.927321 91.251754 ... General Data Dissemination System (GDDS) 2012 Demographic and Health Survey (DHS), 2010/11 Integrated household survey (IHS), 2011/12 NaN NaN NaN 2012.0 2002 NaN

241 rows × 85 columns

In [35]:
bigtable.to_csv("../data/unlabeled/preprocessed/edstats_preprocessed.csv")
In [ ]: