import pandas as pd
import sklearn
from helpers import *
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
from data.unlabeled.raw import aquastat_eah, aquastat_wr, aquastat_wu, aquastat_cc
import seaborn as sns
import os
# Notebook-style peeks at the raw AQUASTAT tables imported above. As bare
# expressions they only render in an interactive session; in a plain script
# they are evaluated and discarded.
aquastat_wr
aquastat_wu
aquastat_eah
aquastat_cc

# Stack three of the raw tables row-wise into one long frame. The fourth
# table, aquastat_cc, is left out here and only used later in the script.
alldf = pd.concat(
    objs=(aquastat_eah, aquastat_wu, aquastat_wr),
)
alldf
alldf.info()

# Rows whose 'Symbol' column equals "I".
# NOTE(review): presumably the AQUASTAT flag for imputed/estimated values -- confirm.
alldf.loc[alldf['Symbol'].eq("I")]

# Rows that carry no 'Value' at all.
alldf.loc[alldf['Value'].isna()]
# Reshape the long table into a wide matrix: one row per 'Area', one column
# per 'Variable Name', cells filled from 'Value'. Duplicate (Area, Variable)
# pairs are aggregated with pivot_table's default aggregation (mean).
countryXindicator = alldf.pivot_table(
    values='Value',
    index=['Area'],
    columns='Variable Name',
)
countryXindicator

# Remove the three population columns from the matrix, in place.
countryXindicator.drop(
    columns=['Urban population', 'Total population', 'Rural population'],
    inplace=True,
)
countryXindicator.shape

# Report missingness, prune sparse columns, then report again for comparison.
# NOTE(review): dropColumnHalf comes from `helpers`; its return value is
# ignored here, so it presumably mutates the frame in place -- confirm.
print_missing_percentages(countryXindicator)
dropColumnHalf(countryXindicator)
print_missing_percentages(countryXindicator)
countryXindicator.shape
# Fill the remaining gaps in the country x indicator matrix.
# NOTE(review): impute_df comes from `helpers`; the meaning of `verbose` and
# `max_iter` is defined there -- presumably an iterative imputer, confirm.
imputed_countryXindicator = impute_df(countryXindicator, verbose=2, max_iter=20)

# sns.heatmap draws on the currently active Axes when no `ax` is given, so
# each heatmap gets its own figure. Without this, running both calls in one
# cell or as a script paints the second heatmap (and a second colorbar) over
# the first.

# Pairwise indicator correlations before imputation.
plt.figure()
corr_calc = countryXindicator.corr()
sns.heatmap(corr_calc, vmin=-1, vmax=1, center=0, xticklabels=False, yticklabels=False, cmap='mako')
plt.title("Indicator correlations (before imputation)")

# Same view after imputation, to see how imputation shifted the correlations.
plt.figure()
corr_calc = imputed_countryXindicator.corr()
sns.heatmap(corr_calc, vmin=-1, vmax=1, center=0, xticklabels=False, yticklabels=False, cmap='mako')
plt.title("Indicator correlations (after imputation)")
# Attach the rows of aquastat_cc by matching "Area" on the left against
# "Country" on the right, then index the result by "Code". The inner join
# silently drops any area without a matching "Country" entry.
merged = imputed_countryXindicator.merge(aquastat_cc, how='inner', left_on="Area", right_on="Country").set_index("Code")
merged

# Show the working directory contents. This replaces the IPython-only,
# Windows-only `!dir` shell escape, which is a SyntaxError in plain Python.
print(os.listdir())

# Create the output directory first so to_csv does not fail on a fresh checkout.
output_path = "../data/unlabeled/preprocessed/aquastat_preprocessed.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
merged.to_csv(output_path)