import pandas as pd
from helpers import *
import sys
sys.path.append("..")
from data.unlabeled.raw import econ_co as df1, econ_da as df2, econ_se as df3
..\data\unlabeled\raw\__init__.py:41: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'. aquastat_eah = pd.read_csv(aquastat_eah_path, skipfooter=8) ..\data\unlabeled\raw\__init__.py:42: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'. aquastat_wr = pd.read_csv(aquastat_wr_path, skipfooter=8) ..\data\unlabeled\raw\__init__.py:43: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'. aquastat_wu = pd.read_csv(aquastat_wu_path, skipfooter=8)
print(df2.columns)
Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', 'Unnamed: 25'], dtype='object')
years = ['2010', '2011', '2012','2013', '2014', '2015']
df2 = df2[['Country Code','Indicator Code',*years]]
df2.shape
(298, 8)
df2.isna().sum()
Country Code 0 Indicator Code 0 2010 0 2011 0 2012 2 2013 2 2014 2 2015 2 dtype: int64
fill_missing_with_column(df2,'2015','2011')
df2.isna().sum()
c:\Users\joach\code-projects\WaterSecurity\unlabeled_preprocessing\helpers.py:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[into] = df[into].combine_first(df[fro]) C:\Users\joach\.conda\envs\wsenv\lib\site-packages\pandas\core\frame.py:4308: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
Country Code 0 Indicator Code 0 2010 0 2012 2 2013 2 2014 2 2015 0 dtype: int64
df2 = df2[['Country Code','Indicator Code','2015']]
df2
Country Code | Indicator Code | 2015 | |
---|---|---|---|
0 | AFG | EF.EFM.OVRL.XD | 5.369534e-02 |
1 | AFG | EF.EFM.RANK.XD | 1.120000e+02 |
2 | AGO | EF.EFM.OVRL.XD | 7.890000e-13 |
3 | AGO | EF.EFM.RANK.XD | 1.460000e+02 |
4 | ALB | EF.EFM.OVRL.XD | 3.746223e-01 |
... | ... | ... | ... |
293 | ZAF | EF.EFM.RANK.XD | 4.500000e+01 |
294 | ZMB | EF.EFM.OVRL.XD | 6.530739e-02 |
295 | ZMB | EF.EFM.RANK.XD | 1.090000e+02 |
296 | ZWE | EF.EFM.OVRL.XD | 1.078169e-01 |
297 | ZWE | EF.EFM.RANK.XD | 1.000000e+02 |
298 rows × 3 columns
comb1 = df2.merge(df3, how='inner', left_on='Indicator Code', right_on='Series Code')
print(comb1.shape)
(298, 24)
comb1
Country Code | Indicator Code | 2015 | Series Code | Topic | Indicator Name | Short definition | Long definition | Unit of measure | Periodicity | ... | Notes from original source | General comments | Source | Statistical concept and methodology | Development relevance | Related source links | Other web links | Related indicators | License Type | Unnamed: 20 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AFG | EF.EFM.OVRL.XD | 5.369534e-02 | EF.EFM.OVRL.XD | Economic Policy & Debt | Economic Fitness Metric | NaN | Economic Fitness (EF) is both a measure of a c... | NaN | Annual | ... | NaN | NaN | World Bank, Economic Fitness project. For more... | The new literature of Economic Fitness uses te... | NaN | NaN | NaN | NaN | NaN | NaN |
1 | AGO | EF.EFM.OVRL.XD | 7.890000e-13 | EF.EFM.OVRL.XD | Economic Policy & Debt | Economic Fitness Metric | NaN | Economic Fitness (EF) is both a measure of a c... | NaN | Annual | ... | NaN | NaN | World Bank, Economic Fitness project. For more... | The new literature of Economic Fitness uses te... | NaN | NaN | NaN | NaN | NaN | NaN |
2 | ALB | EF.EFM.OVRL.XD | 3.746223e-01 | EF.EFM.OVRL.XD | Economic Policy & Debt | Economic Fitness Metric | NaN | Economic Fitness (EF) is both a measure of a c... | NaN | Annual | ... | NaN | NaN | World Bank, Economic Fitness project. For more... | The new literature of Economic Fitness uses te... | NaN | NaN | NaN | NaN | NaN | NaN |
3 | AND | EF.EFM.OVRL.XD | 3.694202e-01 | EF.EFM.OVRL.XD | Economic Policy & Debt | Economic Fitness Metric | NaN | Economic Fitness (EF) is both a measure of a c... | NaN | Annual | ... | NaN | NaN | World Bank, Economic Fitness project. For more... | The new literature of Economic Fitness uses te... | NaN | NaN | NaN | NaN | NaN | NaN |
4 | ARE | EF.EFM.OVRL.XD | 3.438621e-01 | EF.EFM.OVRL.XD | Economic Policy & Debt | Economic Fitness Metric | NaN | Economic Fitness (EF) is both a measure of a c... | NaN | Annual | ... | NaN | NaN | World Bank, Economic Fitness project. For more... | The new literature of Economic Fitness uses te... | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
293 | VNM | EF.EFM.RANK.XD | 3.600000e+01 | EF.EFM.RANK.XD | Economic Policy & Debt | Economic Fitness Ranking (1 = high, 149 = low) | NaN | NaN | NaN | Annual | ... | NaN | NaN | World Bank, Economic Fitness project. For more... | The new literature of Economic Fitness uses te... | NaN | NaN | NaN | NaN | NaN | NaN |
294 | YEM | EF.EFM.RANK.XD | 1.070000e+02 | EF.EFM.RANK.XD | Economic Policy & Debt | Economic Fitness Ranking (1 = high, 149 = low) | NaN | NaN | NaN | Annual | ... | NaN | NaN | World Bank, Economic Fitness project. For more... | The new literature of Economic Fitness uses te... | NaN | NaN | NaN | NaN | NaN | NaN |
295 | ZAF | EF.EFM.RANK.XD | 4.500000e+01 | EF.EFM.RANK.XD | Economic Policy & Debt | Economic Fitness Ranking (1 = high, 149 = low) | NaN | NaN | NaN | Annual | ... | NaN | NaN | World Bank, Economic Fitness project. For more... | The new literature of Economic Fitness uses te... | NaN | NaN | NaN | NaN | NaN | NaN |
296 | ZMB | EF.EFM.RANK.XD | 1.090000e+02 | EF.EFM.RANK.XD | Economic Policy & Debt | Economic Fitness Ranking (1 = high, 149 = low) | NaN | NaN | NaN | Annual | ... | NaN | NaN | World Bank, Economic Fitness project. For more... | The new literature of Economic Fitness uses te... | NaN | NaN | NaN | NaN | NaN | NaN |
297 | ZWE | EF.EFM.RANK.XD | 1.000000e+02 | EF.EFM.RANK.XD | Economic Policy & Debt | Economic Fitness Ranking (1 = high, 149 = low) | NaN | NaN | NaN | Annual | ... | NaN | NaN | World Bank, Economic Fitness project. For more... | The new literature of Economic Fitness uses te... | NaN | NaN | NaN | NaN | NaN | NaN |
298 rows × 24 columns
countryXindicator = comb1.pivot_table('2015',['Country Code'], 'Indicator Name')
countryXindicator
Indicator Name | Economic Fitness Metric | Economic Fitness Ranking (1 = high, 149 = low) |
---|---|---|
Country Code | ||
AFG | 5.369534e-02 | 112.0 |
AGO | 7.890000e-13 | 146.0 |
ALB | 3.746223e-01 | 73.0 |
AND | 3.694202e-01 | 74.0 |
ARE | 3.438621e-01 | 76.0 |
... | ... | ... |
VNM | 1.391330e+00 | 36.0 |
YEM | 7.417100e-02 | 107.0 |
ZAF | 9.784253e-01 | 45.0 |
ZMB | 6.530739e-02 | 109.0 |
ZWE | 1.078169e-01 | 100.0 |
149 rows × 2 columns
print_missing_percentages(countryXindicator)
Max, min and mean number of missing values for the columns Max: 0.0 % Min: 0.0 % Mean: 0.0 %
(0.0, 0.0)
comb2 = df1.merge(countryXindicator, how='inner', left_on='Country Code', right_on='Country Code')
print(comb2.columns)
print(comb2.shape)
Index(['Country Code', 'Short Name', 'Table Name', 'Long Name', '2-alpha code', 'Currency Unit', 'Special Notes', 'Region', 'Income Group', 'WB-2 code', 'National accounts base year', 'National accounts reference year', 'SNA price valuation', 'Lending category', 'Other groups', 'System of National Accounts', 'Alternative conversion factor', 'PPP survey year', 'Balance of Payments Manual in use', 'External debt Reporting status', 'System of trade', 'Government Accounting concept', 'IMF data dissemination standard', 'Latest population census', 'Latest household survey', 'Source of most recent Income and expenditure data', 'Vital registration complete', 'Latest agricultural census', 'Latest industrial data', 'Latest trade data', 'Latest water withdrawal data', 'Unnamed: 31', 'Economic Fitness Metric', 'Economic Fitness Ranking (1 = high, 149 = low)'], dtype='object') (149, 34)
comb2.set_index('Country Code', inplace=True)
display(comb2)
Short Name | Table Name | Long Name | 2-alpha code | Currency Unit | Special Notes | Region | Income Group | WB-2 code | National accounts base year | ... | Latest household survey | Source of most recent Income and expenditure data | Vital registration complete | Latest agricultural census | Latest industrial data | Latest trade data | Latest water withdrawal data | Unnamed: 31 | Economic Fitness Metric | Economic Fitness Ranking (1 = high, 149 = low) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Country Code | |||||||||||||||||||||
AFG | Afghanistan | Afghanistan | Islamic State of Afghanistan | AF | Afghan afghani | Fiscal year end: March 20; reporting period fo... | South Asia | Low income | AF | 2002/03 | ... | Demographic and Health Survey, 2015 | Integrated household survey (IHS), 2011 | NaN | NaN | NaN | 2016.0 | 2000.0 | NaN | 5.369534e-02 | 112.0 |
AGO | Angola | Angola | People's Republic of Angola | AO | Angolan kwanza | NaN | Sub-Saharan Africa | Lower middle income | AO | 2002 | ... | Demographic and Health Survey, 2015/16 | Integrated household survey (IHS), 2008/09 | NaN | NaN | NaN | 2016.0 | 2005.0 | NaN | 7.890000e-13 | 146.0 |
ALB | Albania | Albania | Republic of Albania | AL | Albanian lek | NaN | Europe & Central Asia | Upper middle income | AL | Original chained constant price data are resca... | ... | Demographic and Health Survey, 2008/09 | Living Standards Measurement Study Survey (LSM... | Yes | 2012 | 2013.0 | 2016.0 | 2006.0 | NaN | 3.746223e-01 | 73.0 |
AND | Andorra | Andorra | Principality of Andorra | AD | Euro | WB-3 code changed from ADO to AND to align wit... | Europe & Central Asia | High income | AD | 2000 | ... | NaN | NaN | Yes | NaN | NaN | NaN | NaN | NaN | 3.694202e-01 | 74.0 |
ARE | United Arab Emirates | United Arab Emirates | United Arab Emirates | AE | U.A.E. dirham | NaN | Middle East & North Africa | High income | AE | 2010 | ... | World Health Survey, 2003 | NaN | NaN | 2012 | 1985.0 | 2016.0 | 2005.0 | NaN | 3.438621e-01 | 76.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
VNM | Vietnam | Vietnam | Socialist Republic of Vietnam | VN | Vietnamese dong | NaN | East Asia & Pacific | Lower middle income | VN | 2010 | ... | Multiple Indicator Cluster Survey, 2013/14 | Integrated household survey (IHS), 2014 | Yes | 2011 | 2013.0 | 2016.0 | 2005.0 | NaN | 1.391330e+00 | 36.0 |
YEM | Yemen | Yemen, Rep. | Republic of Yemen | YE | Yemeni rial | Based on official government statistics and In... | Middle East & North Africa | Lower middle income | RY | 1990 | ... | Demographic and Health Survey, 2013 | Expenditure survey/budget survey (ES/BS), 2005 | NaN | NaN | 2012.0 | 2016.0 | 2005.0 | NaN | 7.417100e-02 | 107.0 |
ZAF | South Africa | South Africa | Republic of South Africa | ZA | South African rand | Fiscal year end: March 31; reporting period fo... | Sub-Saharan Africa | Upper middle income | ZA | 2010 | ... | Demographic and Health Survey, 2003; World Hea... | Expenditure survey/budget survey (ES/BS), 2010/11 | NaN | 2007 | 2010.0 | 2016.0 | 2013.0 | NaN | 9.784253e-01 | 45.0 |
ZMB | Zambia | Zambia | Republic of Zambia | ZM | New Zambian kwacha | The base year is 2010. National accounts data ... | Sub-Saharan Africa | Lower middle income | ZM | 2010 | ... | Demographic and Health Survey, 2013/14 | Integrated household survey (IHS), 2010 | NaN | NaN | 1994.0 | 2016.0 | 2002.0 | NaN | 6.530739e-02 | 109.0 |
ZWE | Zimbabwe | Zimbabwe | Republic of Zimbabwe | ZW | U.S. dollar | Fiscal year end: June 30; reporting period for... | Sub-Saharan Africa | Low income | ZW | 2009 | ... | Demographic and Health Survey, 2015 | Integrated household survey (IHS), 2011/12 | NaN | NaN | NaN | 2016.0 | 2007.0 | NaN | 1.078169e-01 | 100.0 |
149 rows × 33 columns
comb2.to_csv("../data/unlabeled/preprocessed/economy_preprocessed.csv")