import pandas as pd
import sklearn
from helpers import *
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
from data.unlabeled.raw import edstats_co as df_c, edstats_da as df_d, edstats_se as df_s
import seaborn as sns
..\data\unlabeled\raw\__init__.py:41: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'. aquastat_eah = pd.read_csv(aquastat_eah_path, skipfooter=8) ..\data\unlabeled\raw\__init__.py:42: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'. aquastat_wr = pd.read_csv(aquastat_wr_path, skipfooter=8) ..\data\unlabeled\raw\__init__.py:43: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'. aquastat_wu = pd.read_csv(aquastat_wu_path, skipfooter=8)
# List df_d's columns to see which year columns are available.
df_d.columns
Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2020', '2025', '2030', '2035', '2040', '2045', '2050', '2055', '2060', '2065', '2070', '2075', '2080', '2085', '2090', '2095', '2100', 'Unnamed: 69'], dtype='object')
# The last three annual columns plus the 2020 column of the data table.
years = ['2015', '2016', '2017', '2020']
# Take an explicit copy: the helpers applied to this frame afterwards assign
# into it and drop columns from it, and doing that on a bare column selection
# of df_d triggers pandas' SettingWithCopyWarning (and may not reliably
# modify the frame that is displayed).
stripped_df_d = df_d[['Country Code', 'Indicator Code', *years]].copy()
print("Non na values in col 2020:", stripped_df_d['2020'].count())
stripped_df_d
Non na values in col 2020: 51436
Country Code | Indicator Code | 2015 | 2016 | 2017 | 2020 | |
---|---|---|---|---|---|---|
0 | ARB | UIS.NERA.2 | NaN | NaN | NaN | NaN |
1 | ARB | UIS.NERA.2.F | NaN | NaN | NaN | NaN |
2 | ARB | UIS.NERA.2.GPI | NaN | NaN | NaN | NaN |
3 | ARB | UIS.NERA.2.M | NaN | NaN | NaN | NaN |
4 | ARB | SE.PRM.TENR | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... |
886925 | ZWE | UIS.LP.AG15T24.M | NaN | NaN | NaN | NaN |
886926 | ZWE | SE.ADT.1524.LT.ZS | NaN | NaN | NaN | NaN |
886927 | ZWE | SE.ADT.1524.LT.FE.ZS | NaN | NaN | NaN | NaN |
886928 | ZWE | SE.ADT.1524.LT.FM.ZS | NaN | NaN | NaN | NaN |
886929 | ZWE | SE.ADT.1524.LT.MA.ZS | NaN | NaN | NaN | NaN |
886930 rows × 6 columns
# Fill the gaps in the 2020 column from the earlier year columns.
# The helper only fills cells that are still missing, so whichever year is
# applied first takes precedence. Walk from the most recent year backwards so
# that the newest available observation is preferred over older ones.
for year in reversed(years[:-1]):
    fill_missing_with_column(stripped_df_d, '2020', year)
# Report once, after all years have been applied.
print("Non na values in col 2020:", stripped_df_d['2020'].count())
stripped_df_d
Non na values in col 2020: 132991 c:\Users\joach\code-projects\WaterSecurity\unlabeled_preprocessing\helpers.py:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[into] = df[into].combine_first(df[fro]) C:\Users\joach\.conda\envs\wsenv\lib\site-packages\pandas\core\frame.py:4308: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
Country Code | Indicator Code | 2020 | |
---|---|---|---|
0 | ARB | UIS.NERA.2 | NaN |
1 | ARB | UIS.NERA.2.F | NaN |
2 | ARB | UIS.NERA.2.GPI | NaN |
3 | ARB | UIS.NERA.2.M | NaN |
4 | ARB | SE.PRM.TENR | NaN |
... | ... | ... | ... |
886925 | ZWE | UIS.LP.AG15T24.M | NaN |
886926 | ZWE | SE.ADT.1524.LT.ZS | NaN |
886927 | ZWE | SE.ADT.1524.LT.FE.ZS | NaN |
886928 | ZWE | SE.ADT.1524.LT.FM.ZS | NaN |
886929 | ZWE | SE.ADT.1524.LT.MA.ZS | NaN |
886930 rows × 3 columns
# Attach the series metadata to every data row (a left join keeps all data
# rows), then discard the data-side join key.
df_d_withseries = stripped_df_d.merge(
    df_s,
    left_on='Indicator Code',
    right_on='Series Code',
    how='left',
).drop(columns=['Indicator Code'])
df_d_withseries.columns
Index(['Country Code', '2020', 'Series Code', 'Topic', 'Indicator Name', 'Short definition', 'Long definition', 'Unit of measure', 'Periodicity', 'Base Period', 'Other notes', 'Aggregation method', 'Limitations and exceptions', 'Notes from original source', 'General comments', 'Source', 'Statistical concept and methodology', 'Development relevance', 'Related source links', 'Other web links', 'Related indicators', 'License Type', 'Unnamed: 20'], dtype='object')
# Reshape to one row per country code and one column per indicator name,
# with the 2020 value in each cell (pivot_table's default aggregation, the
# mean, is applied where a country/indicator pair occurs more than once).
countryXindicator = df_d_withseries.pivot_table(
    values='2020',
    index=['Country Code'],
    columns='Indicator Name',
)
countryXindicator
Indicator Name | Adjusted net enrolment rate, lower secondary, both sexes (%) | Adjusted net enrolment rate, lower secondary, female (%) | Adjusted net enrolment rate, lower secondary, gender parity index (GPI) | Adjusted net enrolment rate, lower secondary, male (%) | Adjusted net enrolment rate, primary, both sexes (%) | Adjusted net enrolment rate, primary, female (%) | Adjusted net enrolment rate, primary, gender parity index (GPI) | Adjusted net enrolment rate, primary, male (%) | Adjusted net enrolment rate, upper secondary, both sexes (%) | Adjusted net enrolment rate, upper secondary, female (%) | ... | Under-age enrolment ratio in secondary education, female (%) | Under-age enrolment ratio in secondary education, male (%) | Unemployment, female (% of female labor force) (modeled ILO estimate) | Unemployment, male (% of male labor force) (modeled ILO estimate) | Unemployment, total (% of total labor force) (modeled ILO estimate) | Youth illiterate population, 15-24 years, % female | Youth literacy rate, population 15-24 years, both sexes (%) | Youth literacy rate, population 15-24 years, female (%) | Youth literacy rate, population 15-24 years, gender parity index (GPI) | Youth literacy rate, population 15-24 years, male (%) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Country Code | |||||||||||||||||||||
ABW | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
AFG | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 12.700000 | 7.7 | 8.600000 | NaN | NaN | NaN | NaN | NaN |
AGO | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 6.700000 | 5.7 | 6.200000 | NaN | NaN | NaN | NaN | NaN |
ALB | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 17.299999 | 17.0 | 17.100000 | NaN | NaN | NaN | NaN | NaN |
AND | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 47.80890 | 100.00000 | 100.00000 | 1.00000 | 100.00000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
XKX | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
YEM | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 32.700001 | 11.1 | 16.700001 | NaN | NaN | NaN | NaN | NaN |
ZAF | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 27.700001 | 23.1 | 25.200001 | 34.34808 | 98.95578 | 99.22904 | 1.00552 | 98.68459 |
ZMB | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 8.000000 | 7.3 | 7.700000 | NaN | NaN | NaN | NaN | NaN |
ZWE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 5.100000 | 5.1 | 5.100000 | NaN | NaN | NaN | NaN | NaN |
241 rows × 2047 columns
# Collect the indicator columns whose name contains "Projection", "number"
# or "$" (case-sensitive substring match) and remove them in place.
indicator_names = list(countryXindicator.columns)
projection_col = [name for name in indicator_names if "Projection" in name]
number = [name for name in indicator_names if "number" in name]
dollar = [name for name in indicator_names if "$" in name]
countryXindicator.drop(
    columns=[*projection_col, *number, *dollar],
    inplace=True,
)
countryXindicator.shape
(241, 1502)
# Remove the total-population column in place, then report how much data is
# missing per column.
countryXindicator.drop(columns=['Population, total'], inplace=True)
print_missing_percentages(countryXindicator)
Max, min and mean number of missing values for the columns Max: 99.5850622406639 % Min: 3.7344398340248963 % Mean: 81.31204369977415 %
(3.7344398340248963, 99.5850622406639)
# Prune sparse columns in place, then re-report the missing-value statistics.
# NOTE(review): judging by the reported maximum falling just below 50 %, the
# helper presumably removes columns with at least half of their values
# missing — confirm against helpers.dropColumnHalf.
dropColumnHalf(countryXindicator)
print_missing_percentages(countryXindicator)
Max, min and mean number of missing values for the columns Max: 49.79253112033195 % Min: 3.7344398340248963 % Mean: 26.60212079299217 %
(3.7344398340248963, 49.79253112033195)
# Check how many rows and indicator columns remain after pruning.
countryXindicator.shape
(241, 243)
# Display the pruned country-by-indicator table.
countryXindicator
Indicator Name | Adjusted net enrolment rate, primary, both sexes (%) | Age population, age 0, female, UNESCO | Age population, age 0, total, UNESCO | Age population, age 01, female, UNESCO | Age population, age 01, total, UNESCO | Age population, age 02, female, UNESCO | Age population, age 02, total, UNESCO | Age population, age 03, female, UNESCO | Age population, age 03, total, UNESCO | Age population, age 04, female, UNESCO | ... | Prevalence of HIV, total (% of population ages 15-49) | Primary completion rate, both sexes (%) | Primary completion rate, female (%) | Primary completion rate, male (%) | Theoretical duration of primary education (years) | Theoretical duration of secondary education (years) | Theoretical duration of upper secondary education (years) | Unemployment, female (% of female labor force) (modeled ILO estimate) | Unemployment, male (% of male labor force) (modeled ILO estimate) | Unemployment, total (% of total labor force) (modeled ILO estimate) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Country Code | |||||||||||||||||||||
ABW | NaN | 542.0 | 1112.0 | 572.0 | 1170.0 | 600.0 | 1224.0 | 623.0 | 1269.0 | 643.0 | ... | NaN | NaN | NaN | NaN | 6.0 | 5.0 | 3.0 | NaN | NaN | NaN |
AFG | NaN | 682677.0 | 1403010.0 | 650389.0 | 1338500.0 | 620880.0 | 1279403.0 | 593957.0 | 1225345.0 | 569425.0 | ... | 0.1 | NaN | NaN | NaN | 6.0 | 6.0 | 3.0 | 12.700000 | 7.7 | 8.600000 |
AGO | NaN | 414919.0 | 832093.0 | 396537.0 | 793950.0 | 379705.0 | 759164.0 | 364295.0 | 727455.0 | 350180.0 | ... | 1.9 | NaN | NaN | NaN | 6.0 | 6.0 | 3.0 | 6.700000 | 5.7 | 6.200000 |
ALB | NaN | 23697.0 | 49105.0 | 24026.0 | 49709.0 | 24243.0 | 50104.0 | 24360.0 | 50315.0 | 24397.0 | ... | 0.1 | 106.367561 | 104.699371 | 107.900124 | 5.0 | 7.0 | 3.0 | 17.299999 | 17.0 | 17.100000 |
AND | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | 6.0 | 6.0 | 2.0 | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
XKX | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
YEM | NaN | 458859.0 | 936053.0 | 452601.0 | 923017.0 | 444988.0 | 907252.0 | 436223.0 | 889171.0 | 426514.0 | ... | 0.1 | NaN | NaN | NaN | 6.0 | 6.0 | 3.0 | 32.700001 | 11.1 | 16.700001 |
ZAF | NaN | 484535.0 | 980663.0 | 486901.0 | 985215.0 | 489516.0 | 990194.0 | 492299.0 | 995445.0 | 495165.0 | ... | 18.9 | NaN | NaN | NaN | 7.0 | 5.0 | 3.0 | 27.700001 | 23.1 | 25.200001 |
ZMB | NaN | 226616.0 | 457329.0 | 220950.0 | 445451.0 | 215890.0 | 434876.0 | 211367.0 | 425454.0 | 207311.0 | ... | 12.6 | NaN | NaN | NaN | 7.0 | 5.0 | 3.0 | 8.000000 | 7.3 | 7.700000 |
ZWE | NaN | 181422.0 | 364761.0 | 179761.0 | 361101.0 | 177982.0 | 357261.0 | 176130.0 | 353321.0 | 174240.0 | ... | 13.9 | NaN | NaN | NaN | 7.0 | 6.0 | 4.0 | 5.100000 | 5.1 | 5.100000 |
241 rows × 243 columns
The integer-valued columns are split off from the float columns; this is done for better imputation results.
# Ask the helper which columns are integer-valued and keep only the
# remaining columns in the "float" frame.
int_cols = find_all_integer_columns(countryXindicator)
countryXindicator_float = countryXindicator.drop(columns=int_cols)
# Show the row labels (country codes) that the frame contains.
print(countryXindicator_float.index.tolist())
['ABW', 'AFG', 'AGO', 'ALB', 'AND', 'ARB', 'ARE', 'ARG', 'ARM', 'ASM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHI', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CUW', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'EAP', 'EAS', 'ECA', 'ECS', 'ECU', 'EGY', 'EMU', 'ERI', 'ESP', 'EST', 'ETH', 'EUU', 'FIN', 'FJI', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO', 'GHA', 'GIB', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUM', 'GUY', 'HIC', 'HKG', 'HND', 'HPC', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAC', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LCN', 'LDC', 'LIC', 'LIE', 'LKA', 'LMC', 'LMY', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV', 'MEA', 'MEX', 'MHL', 'MIC', 'MKD', 'MLI', 'MLT', 'MMR', 'MNA', 'MNE', 'MNG', 'MNP', 'MOZ', 'MRT', 'MUS', 'MWI', 'MYS', 'NAC', 'NAM', 'NCL', 'NER', 'NGA', 'NIC', 'NLD', 'NOR', 'NPL', 'NRU', 'NZL', 'OED', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', 'PSE', 'PYF', 'QAT', 'ROU', 'RUS', 'RWA', 'SAS', 'SAU', 'SDN', 'SEN', 'SGP', 'SLB', 'SLE', 'SLV', 'SMR', 'SOM', 'SRB', 'SSA', 'SSD', 'SSF', 'STP', 'SUR', 'SVK', 'SVN', 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', 'TCD', 'TGO', 'THA', 'TJK', 'TKM', 'TLS', 'TON', 'TTO', 'TUN', 'TUR', 'TUV', 'TZA', 'UGA', 'UKR', 'UMC', 'URY', 'USA', 'UZB', 'VCT', 'VEN', 'VGB', 'VIR', 'VNM', 'VUT', 'WLD', 'WSM', 'XKX', 'YEM', 'ZAF', 'ZMB', 'ZWE']
# Select exactly the columns that were reported as integer-valued.
countryXindicator_year = countryXindicator.loc[:, int_cols]
countryXindicator_year
Indicator Name | Age population, age 0, female, UNESCO | Age population, age 0, total, UNESCO | Age population, age 01, female, UNESCO | Age population, age 01, total, UNESCO | Age population, age 02, female, UNESCO | Age population, age 02, total, UNESCO | Age population, age 03, female, UNESCO | Age population, age 03, total, UNESCO | Age population, age 04, female, UNESCO | Age population, age 04, total, UNESCO | ... | Population, ages 7-13, male | Population, ages 7-13, total | Population, ages 7-9, female | Population, ages 7-9, male | Population, ages 7-9, total | Population, female | Population, male | Theoretical duration of primary education (years) | Theoretical duration of secondary education (years) | Theoretical duration of upper secondary education (years) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Country Code | |||||||||||||||||||||
ABW | 542.0 | 1112.0 | 572.0 | 1170.0 | 600.0 | 1224.0 | 623.0 | 1269.0 | 643.0 | 1310.0 | ... | 5152.0 | 10124.0 | 2095.0 | 2169.0 | 4264.0 | 54743.0 | 49598.0 | 6.0 | 5.0 | 3.0 |
AFG | 682677.0 | 1403010.0 | 650389.0 | 1338500.0 | 620880.0 | 1279403.0 | 593957.0 | 1225345.0 | 569425.0 | 1175957.0 | ... | 3481981.0 | 6728457.0 | 1475542.0 | 1580498.0 | 3056040.0 | 16346869.0 | 17389625.0 | 6.0 | 6.0 | 3.0 |
AGO | 414919.0 | 832093.0 | 396537.0 | 793950.0 | 379705.0 | 759164.0 | 364295.0 | 727455.0 | 350180.0 | 698548.0 | ... | 1988558.0 | 3995534.0 | 913109.0 | 904577.0 | 1817686.0 | 14205741.0 | 13653564.0 | 6.0 | 6.0 | 3.0 |
ALB | 23697.0 | 49105.0 | 24026.0 | 49709.0 | 24243.0 | 50104.0 | 24360.0 | 50315.0 | 24397.0 | 50378.0 | ... | 179160.0 | 346732.0 | 72246.0 | 77107.0 | 149353.0 | 1426369.0 | 1454334.0 | 5.0 | 7.0 | 3.0 |
AND | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.0 | 6.0 | 2.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
XKX | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
YEM | 458859.0 | 936053.0 | 452601.0 | 923017.0 | 444988.0 | 907252.0 | 436223.0 | 889171.0 | 426514.0 | 869195.0 | ... | 2614331.0 | 5137180.0 | 1147011.0 | 1189006.0 | 2336017.0 | 13315678.0 | 13600529.0 | 6.0 | 6.0 | 3.0 |
ZAF | 484535.0 | 980663.0 | 486901.0 | 985215.0 | 489516.0 | 990194.0 | 492299.0 | 995445.0 | 495165.0 | 1000810.0 | ... | 3615273.0 | 7175459.0 | 1516934.0 | 1543083.0 | 3060017.0 | 27999778.0 | 27012199.0 | 7.0 | 5.0 | 3.0 |
ZMB | 226616.0 | 457329.0 | 220950.0 | 445451.0 | 215890.0 | 434876.0 | 211367.0 | 425454.0 | 207311.0 | 417041.0 | ... | 1334582.0 | 2657396.0 | 583300.0 | 588617.0 | 1171917.0 | 8112243.0 | 7988344.0 | 7.0 | 5.0 | 3.0 |
ZWE | 181422.0 | 364761.0 | 179761.0 | 361101.0 | 177982.0 | 357261.0 | 176130.0 | 353321.0 | 174240.0 | 349351.0 | ... | 1156372.0 | 2310111.0 | 501606.0 | 502943.0 | 1004549.0 | 8099354.0 | 7678097.0 | 7.0 | 6.0 | 4.0 |
241 rows × 195 columns
# Of the integer-valued columns, keep only those whose name contains "years".
year_unit_cols = [
    name for name in countryXindicator_year.columns if "years" in name
]
countryXindicator_year = countryXindicator_year[year_unit_cols]
countryXindicator_year
Indicator Name | Duration of compulsory education (years) | Official entrance age to lower secondary education (years) | Official entrance age to primary education (years) | Theoretical duration of primary education (years) | Theoretical duration of secondary education (years) | Theoretical duration of upper secondary education (years) |
---|---|---|---|---|---|---|
Country Code | ||||||
ABW | 13.0 | 12.0 | 6.0 | 6.0 | 5.0 | 3.0 |
AFG | 9.0 | 13.0 | 7.0 | 6.0 | 6.0 | 3.0 |
AGO | 6.0 | 12.0 | 6.0 | 6.0 | 6.0 | 3.0 |
ALB | 9.0 | 11.0 | 6.0 | 5.0 | 7.0 | 3.0 |
AND | 10.0 | 12.0 | 6.0 | 6.0 | 6.0 | 2.0 |
... | ... | ... | ... | ... | ... | ... |
XKX | NaN | NaN | NaN | NaN | NaN | NaN |
YEM | 9.0 | 12.0 | 6.0 | 6.0 | 6.0 | 3.0 |
ZAF | 9.0 | 14.0 | 7.0 | 7.0 | 5.0 | 3.0 |
ZMB | 7.0 | 14.0 | 7.0 | 7.0 | 5.0 | 3.0 |
ZWE | 7.0 | 13.0 | 6.0 | 7.0 | 6.0 | 4.0 |
241 rows × 6 columns
# Impute the missing values of the float columns with the project helper,
# allowing up to 100 imputation rounds and verbose progress logging.
# NOTE(review): the logged run ends with a ConvergenceWarning ("Early stopping
# criterion not reached") and the per-round change stays well above the scaled
# tolerance, so the imputed values should be treated as unconverged — revisit
# the imputer settings or the input scaling.
imputed_countryXindicator_float = impute_df(countryXindicator_float, max_iter=100, verbose=2)
veImputer] Change: 2.532294563247444, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 19/100, elapsed time 4.17 [IterativeImputer] Change: 2.2119869217502828, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 20/100, elapsed time 4.38 [IterativeImputer] Change: 1.9202443611257713, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 21/100, elapsed time 4.59 [IterativeImputer] Change: 1.6606990496546086, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 22/100, elapsed time 4.82 [IterativeImputer] Change: 1.6559282826221846, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 23/100, elapsed time 5.04 [IterativeImputer] Change: 1.6520065848505072, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 24/100, elapsed time 5.26 [IterativeImputer] Change: 1.648782063252183, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 25/100, elapsed time 5.48 [IterativeImputer] Change: 1.6461831356141814, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 26/100, elapsed time 5.69 [IterativeImputer] Change: 1.644058054001174, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 27/100, elapsed time 5.90 [IterativeImputer] Change: 1.6423013079998188, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 28/100, elapsed time 6.12 [IterativeImputer] Change: 1.6408741216707456, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 29/100, elapsed time 6.33 [IterativeImputer] Change: 1.6396788922926566, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 30/100, elapsed time 6.55 [IterativeImputer] Change: 1.638655100233116, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 31/100, elapsed time 6.77 
[IterativeImputer] Change: 1.6377695134752297, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 32/100, elapsed time 6.98 [IterativeImputer] Change: 1.6369872536064, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 33/100, elapsed time 7.19 [IterativeImputer] Change: 1.636221623251295, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 34/100, elapsed time 7.41 [IterativeImputer] Change: 1.6355083766661642, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 35/100, elapsed time 7.63 [IterativeImputer] Change: 1.634821858203819, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 36/100, elapsed time 7.85 [IterativeImputer] Change: 1.6341370631609828, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 37/100, elapsed time 8.07 [IterativeImputer] Change: 1.6334747186241456, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 38/100, elapsed time 8.28 [IterativeImputer] Change: 1.645652531918897, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 39/100, elapsed time 8.50 [IterativeImputer] Change: 1.652182711035818, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 40/100, elapsed time 8.72 [IterativeImputer] Change: 1.6532662904419548, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 41/100, elapsed time 8.94 [IterativeImputer] Change: 1.649451831600862, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 42/100, elapsed time 9.16 [IterativeImputer] Change: 1.6408222980663139, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 43/100, elapsed time 9.38 [IterativeImputer] Change: 1.6294400496414387, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 44/100, elapsed time 9.60 
[IterativeImputer] Change: 1.6287678524729357, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 45/100, elapsed time 9.82 [IterativeImputer] Change: 1.6281047469396985, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 46/100, elapsed time 10.05 [IterativeImputer] Change: 1.6274834308429378, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 47/100, elapsed time 10.26 [IterativeImputer] Change: 1.6270130873718072, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 48/100, elapsed time 10.48 [IterativeImputer] Change: 1.6262940758389055, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 49/100, elapsed time 10.70 [IterativeImputer] Change: 1.6257297763508762, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 50/100, elapsed time 10.92 [IterativeImputer] Change: 1.6252298538321177, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 51/100, elapsed time 11.14 [IterativeImputer] Change: 1.624722930666042, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 52/100, elapsed time 11.35 [IterativeImputer] Change: 1.6242577689697895, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 53/100, elapsed time 11.57 [IterativeImputer] Change: 1.6238212014114874, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 54/100, elapsed time 11.79 [IterativeImputer] Change: 1.6234519318135652, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 55/100, elapsed time 12.01 [IterativeImputer] Change: 1.6231115358163457, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 56/100, elapsed time 12.23 [IterativeImputer] Change: 1.6228249713969374, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 57/100, 
elapsed time 12.44 [IterativeImputer] Change: 1.6225670613424206, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 58/100, elapsed time 12.66 [IterativeImputer] Change: 1.6222797867936543, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 59/100, elapsed time 12.89 [IterativeImputer] Change: 1.6221342960698018, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 60/100, elapsed time 13.10 [IterativeImputer] Change: 1.6219870713467206, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 61/100, elapsed time 13.31 [IterativeImputer] Change: 1.6218646076142422, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 62/100, elapsed time 13.53 [IterativeImputer] Change: 1.6217730363655758, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 63/100, elapsed time 13.75 [IterativeImputer] Change: 1.621725788928061, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 64/100, elapsed time 14.06 [IterativeImputer] Change: 1.6216920357302036, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 65/100, elapsed time 14.27 [IterativeImputer] Change: 1.621698990708198, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 66/100, elapsed time 14.50 [IterativeImputer] Change: 1.6217532944875095, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 67/100, elapsed time 14.72 [IterativeImputer] Change: 1.6218227224444455, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 68/100, elapsed time 14.93 [IterativeImputer] Change: 1.6218972339192113, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 69/100, elapsed time 15.15 [IterativeImputer] Change: 1.6232902366910664, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation 
round 70/100, elapsed time 15.37 [IterativeImputer] Change: 1.6249616723710512, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 71/100, elapsed time 15.59 [IterativeImputer] Change: 1.6261259356463242, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 72/100, elapsed time 15.81 [IterativeImputer] Change: 1.6281859862659571, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 73/100, elapsed time 16.03 [IterativeImputer] Change: 1.63006029323919, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 74/100, elapsed time 16.24 [IterativeImputer] Change: 1.6318311593012644, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 75/100, elapsed time 16.46 [IterativeImputer] Change: 1.6336233225534909, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 76/100, elapsed time 16.68 [IterativeImputer] Change: 1.6353960973093213, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 77/100, elapsed time 16.90 [IterativeImputer] Change: 1.637182751172711, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 78/100, elapsed time 17.12 [IterativeImputer] Change: 1.6389758111697363, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 79/100, elapsed time 17.34 [IterativeImputer] Change: 1.6407763042253765, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 80/100, elapsed time 17.55 [IterativeImputer] Change: 1.6425834310407716, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 81/100, elapsed time 17.77 [IterativeImputer] Change: 1.6443968858747002, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 82/100, elapsed time 17.99 [IterativeImputer] Change: 1.64625348689472, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending 
imputation round 83/100, elapsed time 18.21 [IterativeImputer] Change: 1.6481429500945106, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 84/100, elapsed time 18.43 [IterativeImputer] Change: 1.6500518120410161, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 85/100, elapsed time 18.65 [IterativeImputer] Change: 1.6519548194561064, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 86/100, elapsed time 18.87 [IterativeImputer] Change: 1.6538680162736423, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 87/100, elapsed time 19.09 [IterativeImputer] Change: 1.6552679864808504, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 88/100, elapsed time 19.31 [IterativeImputer] Change: 1.6574626185722092, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 89/100, elapsed time 19.52 [IterativeImputer] Change: 1.6594748268725177, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 90/100, elapsed time 19.74 [IterativeImputer] Change: 1.6613749089305434, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 91/100, elapsed time 19.96 [IterativeImputer] Change: 1.6632615294758013, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 92/100, elapsed time 20.18 [IterativeImputer] Change: 1.6651267664591594, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 93/100, elapsed time 20.40 [IterativeImputer] Change: 1.6669733028122995, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 94/100, elapsed time 20.62 [IterativeImputer] Change: 1.6688041142649566, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 95/100, elapsed time 20.84 [IterativeImputer] Change: 1.6706225132059456, scaled tolerance: 0.19410198974609402 
[IterativeImputer] Ending imputation round 96/100, elapsed time 21.08 [IterativeImputer] Change: 1.672451264542782, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 97/100, elapsed time 21.30 [IterativeImputer] Change: 1.6742766711767634, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 98/100, elapsed time 21.53 [IterativeImputer] Change: 1.6761039010597747, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 99/100, elapsed time 21.74 [IterativeImputer] Change: 1.6778935337913161, scaled tolerance: 0.19410198974609402 [IterativeImputer] Ending imputation round 100/100, elapsed time 21.96 [IterativeImputer] Change: 1.679678914280904, scaled tolerance: 0.19410198974609402 [IterativeImputer] Completing matrix with shape (241, 48) [IterativeImputer] Ending imputation round 1/100, elapsed time 0.01 [IterativeImputer] Ending imputation round 2/100, elapsed time 0.02 [IterativeImputer] Ending imputation round 3/100, elapsed time 0.03 [IterativeImputer] Ending imputation round 4/100, elapsed time 0.04 [IterativeImputer] Ending imputation round 5/100, elapsed time 0.05 [IterativeImputer] Ending imputation round 6/100, elapsed time 0.06 [IterativeImputer] Ending imputation round 7/100, elapsed time 0.07 [IterativeImputer] Ending imputation round 8/100, elapsed time 0.07 [IterativeImputer] Ending imputation round 9/100, elapsed time 0.08 [IterativeImputer] Ending imputation round 10/100, elapsed time 0.09 [IterativeImputer] Ending imputation round 11/100, elapsed time 0.10 [IterativeImputer] Ending imputation round 12/100, elapsed time 0.11 [IterativeImputer] Ending imputation round 13/100, elapsed time 0.12 [IterativeImputer] Ending imputation round 14/100, elapsed time 0.12 [IterativeImputer] Ending imputation round 15/100, elapsed time 0.13 [IterativeImputer] Ending imputation round 16/100, elapsed time 0.14 [IterativeImputer] Ending imputation round 17/100, elapsed time 0.15 
[IterativeImputer] Ending imputation round 18/100, elapsed time 0.16 [IterativeImputer] Ending imputation round 19/100, elapsed time 0.17 [IterativeImputer] Ending imputation round 20/100, elapsed time 0.18 [IterativeImputer] Ending imputation round 21/100, elapsed time 0.18 C:\Users\joach\.conda\envs\wsenv\lib\site-packages\sklearn\impute\_iterative.py:685: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached. warnings.warn("[IterativeImputer] Early stopping criterion not" [IterativeImputer] Ending imputation round 22/100, elapsed time 0.20 [IterativeImputer] Ending imputation round 23/100, elapsed time 0.20 [IterativeImputer] Ending imputation round 24/100, elapsed time 0.21 [IterativeImputer] Ending imputation round 25/100, elapsed time 0.22 [IterativeImputer] Ending imputation round 26/100, elapsed time 0.23 [IterativeImputer] Ending imputation round 27/100, elapsed time 0.24 [IterativeImputer] Ending imputation round 28/100, elapsed time 0.24 [IterativeImputer] Ending imputation round 29/100, elapsed time 0.25 [IterativeImputer] Ending imputation round 30/100, elapsed time 0.26 [IterativeImputer] Ending imputation round 31/100, elapsed time 0.27 [IterativeImputer] Ending imputation round 32/100, elapsed time 0.28 [IterativeImputer] Ending imputation round 33/100, elapsed time 0.28 [IterativeImputer] Ending imputation round 34/100, elapsed time 0.29 [IterativeImputer] Ending imputation round 35/100, elapsed time 0.30 [IterativeImputer] Ending imputation round 36/100, elapsed time 0.31 [IterativeImputer] Ending imputation round 37/100, elapsed time 0.31 [IterativeImputer] Ending imputation round 38/100, elapsed time 0.32 [IterativeImputer] Ending imputation round 39/100, elapsed time 0.33 [IterativeImputer] Ending imputation round 40/100, elapsed time 0.34 [IterativeImputer] Ending imputation round 41/100, elapsed time 0.34 [IterativeImputer] Ending imputation round 42/100, elapsed time 0.35 [IterativeImputer] Ending imputation round 
43/100, elapsed time 0.36 [IterativeImputer] Ending imputation round 44/100, elapsed time 0.37 [IterativeImputer] Ending imputation round 45/100, elapsed time 0.38 [IterativeImputer] Ending imputation round 46/100, elapsed time 0.38 [IterativeImputer] Ending imputation round 47/100, elapsed time 0.39 [IterativeImputer] Ending imputation round 48/100, elapsed time 0.40 [IterativeImputer] Ending imputation round 49/100, elapsed time 0.41 [IterativeImputer] Ending imputation round 50/100, elapsed time 0.42 [IterativeImputer] Ending imputation round 51/100, elapsed time 0.43 [IterativeImputer] Ending imputation round 52/100, elapsed time 0.43 [IterativeImputer] Ending imputation round 53/100, elapsed time 0.44 [IterativeImputer] Ending imputation round 54/100, elapsed time 0.45 [IterativeImputer] Ending imputation round 55/100, elapsed time 0.46 [IterativeImputer] Ending imputation round 56/100, elapsed time 0.47 [IterativeImputer] Ending imputation round 57/100, elapsed time 0.48 [IterativeImputer] Ending imputation round 58/100, elapsed time 0.49 [IterativeImputer] Ending imputation round 59/100, elapsed time 0.49 [IterativeImputer] Ending imputation round 60/100, elapsed time 0.50 [IterativeImputer] Ending imputation round 61/100, elapsed time 0.51 [IterativeImputer] Ending imputation round 62/100, elapsed time 0.52 [IterativeImputer] Ending imputation round 63/100, elapsed time 0.52 [IterativeImputer] Ending imputation round 64/100, elapsed time 0.53 [IterativeImputer] Ending imputation round 65/100, elapsed time 0.54 [IterativeImputer] Ending imputation round 66/100, elapsed time 0.55 [IterativeImputer] Ending imputation round 67/100, elapsed time 0.55 [IterativeImputer] Ending imputation round 68/100, elapsed time 0.56 [IterativeImputer] Ending imputation round 69/100, elapsed time 0.57 [IterativeImputer] Ending imputation round 70/100, elapsed time 0.58 [IterativeImputer] Ending imputation round 71/100, elapsed time 0.58 [IterativeImputer] Ending imputation 
round 72/100, elapsed time 0.59 [IterativeImputer] Ending imputation round 73/100, elapsed time 0.60 [IterativeImputer] Ending imputation round 74/100, elapsed time 0.61 [IterativeImputer] Ending imputation round 75/100, elapsed time 0.61 [IterativeImputer] Ending imputation round 76/100, elapsed time 0.62 [IterativeImputer] Ending imputation round 77/100, elapsed time 0.63 [IterativeImputer] Ending imputation round 78/100, elapsed time 0.64 [IterativeImputer] Ending imputation round 79/100, elapsed time 0.65 [IterativeImputer] Ending imputation round 80/100, elapsed time 0.66 [IterativeImputer] Ending imputation round 81/100, elapsed time 0.66 [IterativeImputer] Ending imputation round 82/100, elapsed time 0.67 [IterativeImputer] Ending imputation round 83/100, elapsed time 0.68 [IterativeImputer] Ending imputation round 84/100, elapsed time 0.69 [IterativeImputer] Ending imputation round 85/100, elapsed time 0.70 [IterativeImputer] Ending imputation round 86/100, elapsed time 0.70 [IterativeImputer] Ending imputation round 87/100, elapsed time 0.71 [IterativeImputer] Ending imputation round 88/100, elapsed time 0.72 [IterativeImputer] Ending imputation round 89/100, elapsed time 0.73 [IterativeImputer] Ending imputation round 90/100, elapsed time 0.73 [IterativeImputer] Ending imputation round 91/100, elapsed time 0.74 [IterativeImputer] Ending imputation round 92/100, elapsed time 0.75 [IterativeImputer] Ending imputation round 93/100, elapsed time 0.76 [IterativeImputer] Ending imputation round 94/100, elapsed time 0.76 [IterativeImputer] Ending imputation round 95/100, elapsed time 0.77 [IterativeImputer] Ending imputation round 96/100, elapsed time 0.78 [IterativeImputer] Ending imputation round 97/100, elapsed time 0.79 [IterativeImputer] Ending imputation round 98/100, elapsed time 0.79 [IterativeImputer] Ending imputation round 99/100, elapsed time 0.80 [IterativeImputer] Ending imputation round 100/100, elapsed time 0.81
# Fill the gaps in the year-valued indicator table with the `impute_df` helper
# (presumably star-imported from `helpers`; its internals are not visible here).
# max_iter caps the number of imputation rounds and verbose=2 requests the
# per-round progress log.
imputed_countryXindicator_year = impute_df(
    countryXindicator_year,
    max_iter=100,
    verbose=2,
)
[IterativeImputer] Completing matrix with shape (241, 6) [IterativeImputer] Ending imputation round 1/100, elapsed time 0.01 [IterativeImputer] Change: 1.6443802657682207, scaled tolerance: 0.015 [IterativeImputer] Ending imputation round 2/100, elapsed time 0.02 [IterativeImputer] Change: 0.0830316620808409, scaled tolerance: 0.015 [IterativeImputer] Ending imputation round 3/100, elapsed time 0.04 [IterativeImputer] Change: 0.0289166641679115, scaled tolerance: 0.015 [IterativeImputer] Ending imputation round 4/100, elapsed time 0.05 [IterativeImputer] Change: 0.009636184792338298, scaled tolerance: 0.015 [IterativeImputer] Early stopping criterion reached. [IterativeImputer] Completing matrix with shape (241, 6) [IterativeImputer] Ending imputation round 1/4, elapsed time 0.00 [IterativeImputer] Ending imputation round 2/4, elapsed time 0.00 [IterativeImputer] Ending imputation round 3/4, elapsed time 0.00 [IterativeImputer] Ending imputation round 4/4, elapsed time 0.01
# The indicators in this table are whole numbers of years (durations, entrance
# ages), while the imputer produces fractional estimates: snap every value to
# the nearest whole number and store it as an integer.
imputed_countryXindicator_year = (
    imputed_countryXindicator_year
    .round(0)
    .astype(int)
)
Before imputation (float-valued indicators)
# Pairwise Pearson correlation between the float-valued indicators, computed
# on the table as it is before imputation.
corr_calc = countryXindicator_float.corr(method='pearson')
# The colour scale is pinned to [-1, 1] so this heatmap can be compared
# directly with the post-imputation one; axis tick labels are switched off.
sns.heatmap(
    corr_calc,
    cmap='mako',
    vmin=-1,
    vmax=1,
    center=0,
    xticklabels=False,
    yticklabels=False,
)
<AxesSubplot:xlabel='Indicator Name', ylabel='Indicator Name'>
After imputation (float-valued indicators)
# Same correlation matrix as for the raw float-valued table, this time on the
# imputed data, to see how imputation changed the relationships between
# indicators.
corr_calc = imputed_countryXindicator_float.corr(method='pearson')
# Identical plot settings to the pre-imputation heatmap (fixed [-1, 1] scale,
# no tick labels) so the two figures are directly comparable.
sns.heatmap(
    corr_calc,
    cmap='mako',
    vmin=-1,
    vmax=1,
    center=0,
    xticklabels=False,
    yticklabels=False,
)
<AxesSubplot:xlabel='Indicator Name', ylabel='Indicator Name'>
Before imputation (year-valued indicators)
# Pairwise Pearson correlation between the year-valued indicators, computed on
# the table as it is before imputation.
corr_calc = countryXindicator_year.corr(method='pearson')
# Fixed [-1, 1] colour scale for comparability with the post-imputation plot.
# Tick labels are left at seaborn's default here (not disabled).
sns.heatmap(
    corr_calc,
    cmap='mako',
    vmin=-1,
    vmax=1,
    center=0,
)
<AxesSubplot:xlabel='Indicator Name', ylabel='Indicator Name'>
After imputation (year-valued indicators)
# Correlation between the year-valued indicators after imputation; at this
# point the table has already been rounded and cast to integers.
corr_calc = imputed_countryXindicator_year.corr(method='pearson')
# Same plot settings as the pre-imputation heatmap of the year-valued table.
sns.heatmap(
    corr_calc,
    cmap='mako',
    vmin=-1,
    vmax=1,
    center=0,
)
<AxesSubplot:xlabel='Indicator Name', ylabel='Indicator Name'>
# Combine the two imputed tables into one wide table with one row per country:
# the year-valued indicators come first, followed by the float-valued ones.
# Both sides are matched on "Country Code"; the inner join keeps only the
# countries present in both tables.
imputed_data = pd.merge(
    imputed_countryXindicator_year,
    imputed_countryXindicator_float,
    how='inner',
    on="Country Code",
)
imputed_data
Indicator Name | Duration of compulsory education (years) | Official entrance age to lower secondary education (years) | Official entrance age to primary education (years) | Theoretical duration of primary education (years) | Theoretical duration of secondary education (years) | Theoretical duration of upper secondary education (years) | Adjusted net enrolment rate, primary, both sexes (%) | Gross enrolment ratio, lower secondary, both sexes (%) | Gross enrolment ratio, lower secondary, female (%) | Gross enrolment ratio, lower secondary, male (%) | ... | Population growth (annual %) | Population, female (% of total) | Population, male (% of total) | Prevalence of HIV, total (% of population ages 15-49) | Primary completion rate, both sexes (%) | Primary completion rate, female (%) | Primary completion rate, male (%) | Unemployment, female (% of female labor force) (modeled ILO estimate) | Unemployment, male (% of male labor force) (modeled ILO estimate) | Unemployment, total (% of total labor force) (modeled ILO estimate) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Country Code | |||||||||||||||||||||
ABW | 13 | 12 | 6 | 6 | 5 | 3 | 96.636541 | 91.072857 | 90.689076 | 91.406452 | ... | 0.524658 | 52.465521 | 47.534479 | 1.416423 | 91.125743 | 90.752535 | 91.425942 | 10.349564 | 8.746525 | 9.324957 |
AFG | 9 | 13 | 7 | 6 | 6 | 3 | 87.416874 | 67.447617 | 49.580441 | 84.329559 | ... | 2.943234 | 48.454558 | 51.545442 | 0.100000 | 93.312781 | 88.498878 | 97.920359 | 12.700000 | 7.700000 | 8.600000 |
AGO | 6 | 12 | 6 | 6 | 6 | 3 | 87.446703 | 90.903513 | 90.527003 | 91.325392 | ... | 3.428021 | 50.991010 | 49.008990 | 1.900000 | 92.486172 | 92.930030 | 92.143730 | 6.700000 | 5.700000 | 6.200000 |
ALB | 9 | 11 | 6 | 5 | 7 | 3 | 99.516937 | 101.488373 | 99.889503 | 102.980438 | ... | -0.291206 | 49.514599 | 50.485401 | 0.100000 | 106.367561 | 104.699371 | 107.900124 | 17.299999 | 17.000000 | 17.100000 |
AND | 10 | 12 | 6 | 6 | 6 | 2 | 94.592438 | 91.281081 | 91.067359 | 91.467822 | ... | -1.537836 | 49.937398 | 50.062602 | -0.301099 | 90.510694 | 89.175083 | 91.671974 | 14.798726 | 11.023894 | 11.875731 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
XKX | 10 | 12 | 6 | 6 | 6 | 3 | 95.413339 | 91.084565 | 90.721799 | 91.429833 | ... | -1.103886 | 49.937397 | 50.062603 | -0.423033 | 91.464884 | 91.250304 | 91.619722 | 9.662467 | 9.205525 | 9.289344 |
YEM | 9 | 12 | 6 | 6 | 6 | 3 | 86.150744 | 90.902881 | 90.439904 | 91.274501 | ... | 2.520254 | 49.470856 | 50.529144 | 0.100000 | 92.609481 | 92.837224 | 92.326815 | 32.700001 | 11.100000 | 16.700001 |
ZAF | 9 | 14 | 7 | 7 | 5 | 3 | 96.505376 | 91.062603 | 90.916364 | 91.240835 | ... | 1.585325 | 50.897604 | 49.102396 | 18.900000 | 91.111444 | 91.824840 | 90.441719 | 27.700001 | 23.100000 | 25.200001 |
ZMB | 7 | 14 | 7 | 7 | 5 | 3 | 93.181744 | 91.012954 | 90.829133 | 91.267250 | ... | 3.024123 | 50.384766 | 49.615234 | 12.600000 | 91.406170 | 91.905970 | 90.976530 | 8.000000 | 7.300000 | 7.700000 |
ZWE | 7 | 13 | 6 | 7 | 6 | 4 | 94.625610 | 91.047967 | 90.927321 | 91.251754 | ... | 2.345643 | 51.334997 | 48.665003 | 13.900000 | 91.325104 | 91.913467 | 90.806284 | 5.100000 | 5.100000 | 5.100000 |
241 rows × 54 columns
# Attach the per-country metadata from df_c (the `edstats_co` table: region,
# income group, census information, ...) to the imputed indicators. A left
# join keeps every imputed country row even when no metadata row matches.
bigtable = pd.merge(
    imputed_data,
    df_c,
    how='left',
    on="Country Code",
)
# Inspect the resulting column set (indicators followed by metadata columns).
bigtable.columns
Index(['Country Code', 'Duration of compulsory education (years)', 'Official entrance age to lower secondary education (years)', 'Official entrance age to primary education (years)', 'Theoretical duration of primary education (years)', 'Theoretical duration of secondary education (years)', 'Theoretical duration of upper secondary education (years)', 'Adjusted net enrolment rate, primary, both sexes (%)', 'Gross enrolment ratio, lower secondary, both sexes (%)', 'Gross enrolment ratio, lower secondary, female (%)', 'Gross enrolment ratio, lower secondary, male (%)', 'Gross enrolment ratio, pre-primary, both sexes (%)', 'Gross enrolment ratio, pre-primary, female (%)', 'Gross enrolment ratio, pre-primary, male (%)', 'Gross enrolment ratio, primary, both sexes (%)', 'Gross enrolment ratio, primary, female (%)', 'Gross enrolment ratio, primary, gender parity index (GPI)', 'Gross enrolment ratio, primary, male (%)', 'Gross enrolment ratio, secondary, both sexes (%)', 'Gross enrolment ratio, secondary, female (%)', 'Gross enrolment ratio, secondary, gender parity index (GPI)', 'Gross enrolment ratio, secondary, male (%)', 'Gross enrolment ratio, upper secondary, both sexes (%)', 'Gross intake ratio to Grade 1 of primary education, both sexes (%)', 'Gross intake ratio to Grade 1 of primary education, female (%)', 'Gross intake ratio to Grade 1 of primary education, male (%)', 'Internet users (per 100 people)', 'Labor force, female (% of total labor force)', 'Mortality rate, under-5 (per 1,000 live births)', 'Net enrolment rate, primary, both sexes (%)', 'Percentage of enrolment in pre-primary education in private institutions (%)', 'Percentage of enrolment in primary education in private institutions (%)', 'Percentage of enrolment in secondary education in private institutions (%)', 'Percentage of female students enrolled in primary education who are over-age, female (%)', 'Percentage of male students enrolled in primary education who are over-age, male (%)', 'Percentage 
of repeaters in primary education, all grades, both sexes (%)', 'Percentage of repeaters in primary education, all grades, female (%)', 'Percentage of repeaters in primary education, all grades, male (%)', 'Percentage of students enrolled in primary education who are over-age, both sexes (%)', 'Percentage of students in pre-primary education who are female (%)', 'Percentage of students in primary education who are female (%)', 'Percentage of students in secondary education who are female (%)', 'Percentage of students in secondary general education who are female (%)', 'Population ages 0-14 (% of total)', 'Population ages 15-64 (% of total)', 'Population growth (annual %)', 'Population, female (% of total)', 'Population, male (% of total)', 'Prevalence of HIV, total (% of population ages 15-49)', 'Primary completion rate, both sexes (%)', 'Primary completion rate, female (%)', 'Primary completion rate, male (%)', 'Unemployment, female (% of female labor force) (modeled ILO estimate)', 'Unemployment, male (% of male labor force) (modeled ILO estimate)', 'Unemployment, total (% of total labor force) (modeled ILO estimate)', 'Short Name', 'Table Name', 'Long Name', '2-alpha code', 'Currency Unit', 'Special Notes', 'Region', 'Income Group', 'WB-2 code', 'National accounts base year', 'National accounts reference year', 'SNA price valuation', 'Lending category', 'Other groups', 'System of National Accounts', 'Alternative conversion factor', 'PPP survey year', 'Balance of Payments Manual in use', 'External debt Reporting status', 'System of trade', 'Government Accounting concept', 'IMF data dissemination standard', 'Latest population census', 'Latest household survey', 'Source of most recent Income and expenditure data', 'Vital registration complete', 'Latest agricultural census', 'Latest industrial data', 'Latest trade data', 'Latest water withdrawal data', 'Unnamed: 31'], dtype='object')
# After the merge with the metadata table "Country Code" is an ordinary
# column; turn it back into the row index so each row is addressed by its
# country code.
bigtable = bigtable.set_index('Country Code')
bigtable
Duration of compulsory education (years) | Official entrance age to lower secondary education (years) | Official entrance age to primary education (years) | Theoretical duration of primary education (years) | Theoretical duration of secondary education (years) | Theoretical duration of upper secondary education (years) | Adjusted net enrolment rate, primary, both sexes (%) | Gross enrolment ratio, lower secondary, both sexes (%) | Gross enrolment ratio, lower secondary, female (%) | Gross enrolment ratio, lower secondary, male (%) | ... | IMF data dissemination standard | Latest population census | Latest household survey | Source of most recent Income and expenditure data | Vital registration complete | Latest agricultural census | Latest industrial data | Latest trade data | Latest water withdrawal data | Unnamed: 31 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Country Code | |||||||||||||||||||||
ABW | 13 | 12 | 6 | 6 | 5 | 3 | 96.636541 | 91.072857 | 90.689076 | 91.406452 | ... | NaN | 2010 | NaN | NaN | Yes | NaN | NaN | 2012.0 | NaN | NaN |
AFG | 9 | 13 | 7 | 6 | 6 | 3 | 87.416874 | 67.447617 | 49.580441 | 84.329559 | ... | General Data Dissemination System (GDDS) | 1979 | Multiple Indicator Cluster Survey (MICS), 2010/11 | Integrated household survey (IHS), 2008 | NaN | 2013/14 | NaN | 2012.0 | 2000 | NaN |
AGO | 6 | 12 | 6 | 6 | 6 | 3 | 87.446703 | 90.903513 | 90.527003 | 91.325392 | ... | General Data Dissemination System (GDDS) | 1970 | Malaria Indicator Survey (MIS), 2011 | Integrated household survey (IHS), 2008 | NaN | 2015 | NaN | NaN | 2005 | NaN |
ALB | 9 | 11 | 6 | 5 | 7 | 3 | 99.516937 | 101.488373 | 99.889503 | 102.980438 | ... | General Data Dissemination System (GDDS) | 2011 | Demographic and Health Survey (DHS), 2008/09 | Living Standards Measurement Study Survey (LSM... | Yes | 2012 | 2010.0 | 2012.0 | 2006 | NaN |
AND | 10 | 12 | 6 | 6 | 6 | 2 | 94.592438 | 91.281081 | 91.067359 | 91.467822 | ... | NaN | 2011. Population figures compiled from adminis... | NaN | NaN | Yes | NaN | NaN | 2006.0 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
XKX | 10 | 12 | 6 | 6 | 6 | 3 | 95.413339 | 91.084565 | 90.721799 | 91.429833 | ... | General Data Dissemination System (GDDS) | 2011 | NaN | Integrated household survey (IHS), 2011 | NaN | NaN | NaN | NaN | NaN | NaN |
YEM | 9 | 12 | 6 | 6 | 6 | 3 | 86.150744 | 90.902881 | 90.439904 | 91.274501 | ... | General Data Dissemination System (GDDS) | 2004 | Demographic and Health Survey (DHS), 2013 | Expenditure survey/budget survey (ES/BS), 2005 | NaN | NaN | 2006.0 | 2012.0 | 2005 | NaN |
ZAF | 9 | 14 | 7 | 7 | 5 | 3 | 96.505376 | 91.062603 | 90.916364 | 91.240835 | ... | Special Data Dissemination Standard (SDDS) | 2011 | Demographic and Health Survey (DHS), 2003; Wor... | Expenditure survey/budget survey (ES/BS), 2010 | NaN | 2007 | 2010.0 | 2012.0 | 2000 | NaN |
ZMB | 7 | 14 | 7 | 7 | 5 | 3 | 93.181744 | 91.012954 | 90.829133 | 91.267250 | ... | General Data Dissemination System (GDDS) | 2010 | Demographic and Health Survey (DHS), 2013 | Integrated household survey (IHS), 2010 | NaN | 2010. Population and Housing Census. | NaN | 2011.0 | 2002 | NaN |
ZWE | 7 | 13 | 6 | 7 | 6 | 4 | 94.625610 | 91.047967 | 90.927321 | 91.251754 | ... | General Data Dissemination System (GDDS) | 2012 | Demographic and Health Survey (DHS), 2010/11 | Integrated household survey (IHS), 2011/12 | NaN | NaN | NaN | 2012.0 | 2002 | NaN |
241 rows × 85 columns
# Persist the merged table (imputed indicators + country metadata, indexed by
# country code) as the preprocessed EdStats dataset.
from pathlib import Path

output_path = Path("../data/unlabeled/preprocessed/edstats_preprocessed.csv")
# to_csv does not create missing parent directories, so make sure the target
# folder exists first; otherwise the export fails on a fresh checkout where
# the preprocessed folder has not been created yet.
output_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): the table still carries the 'Unnamed: 31' column inherited
# from df_c, which looks like an empty artefact column; consider dropping it
# before export once that is confirmed.
bigtable.to_csv(output_path)