Module water_security.unlabeled_preprocessing.helpers
Expand source code
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pandas as pd
def dropColumnHalf(df):
    """
    Remove, in place, every column in which 50% or more of the values are missing.

    A column is kept only when strictly more than half of its rows hold a
    non-null value. The DataFrame is modified in place and nothing is
    returned. A DataFrame without rows is left untouched, because a missing
    percentage is undefined for it.
    """
    n_rows = len(df.index)
    if n_rows == 0:
        return
    # `thresh` is the minimum number of non-null values a column needs in
    # order to survive. Using `n_rows / 2` kept columns that were exactly half
    # empty (and passed a float where pandas documents an int); requiring a
    # strict majority matches the documented "50% or more" rule.
    min_present = n_rows // 2 + 1
    df.dropna(thresh=min_present, axis=1, inplace=True)
def fill_missing_with_column(df, into, fro):
    """
    Fill the gaps of one column from another, then discard the donor column.

    Null entries of column `into` are replaced by the values found at the
    same index in column `fro`; non-null entries of `into` are kept. Column
    `fro` is then dropped. The DataFrame is modified in place and nothing is
    returned.
    """
    target = df[into]
    donor = df[fro]
    df[into] = target.combine_first(donor)
    df.drop(columns=[fro], inplace=True)
def impute_df(df, verbose=0, **kwargs):
    """
    Impute the missing values of a DataFrame with scikit-learn's IterativeImputer.

    Parameters:
        df: DataFrame to impute; it is handed to the imputer unchanged.
        verbose: verbosity level forwarded to IterativeImputer.
        **kwargs: any further keyword arguments forwarded to IterativeImputer.

    Returns a new DataFrame built from the imputer's output, carrying the
    same columns and index as `df`.
    """
    imputer = IterativeImputer(verbose=verbose, **kwargs)
    # fit_transform already returns the imputed matrix; use it directly rather
    # than discarding it and running a second transform pass over the same data.
    imputed_values = imputer.fit_transform(df)
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)
def print_missing_percentages(df):
    """
    Print the max, min and mean percentage of missing values across the columns.

    The percentage of null entries is computed per column; the largest,
    smallest and average of those percentages are printed.

    Returns a tuple `(min_missing, max_missing)` - note the order differs
    from the order in which the values are printed.
    """
    null_counts = df.isnull().sum()
    percent_per_column = null_counts * 100 / len(df)
    summary = {
        "Max:": percent_per_column.max(),
        "Min:": percent_per_column.min(),
        "Mean:": percent_per_column.mean(),
    }
    print("Max, min and mean number of missing values for the columns")
    for label, value in summary.items():
        print(label, value, '%')
    return summary["Min:"], summary["Max:"]
def find_all_integer_columns(df):
    """
    Return an array with the labels of all columns that contain only
    integer-valued or null entries.

    A value counts as integer-valued when `int(value) == value`, so floats
    such as 3.0 qualify. Null values are ignored, which means a column made
    up solely of nulls (or a column without rows) is reported as well. Values
    that cannot be converted with `int()` (text, infinities, ...) simply make
    their column non-integer instead of raising.
    """

    def _is_integer_or_null(value):
        # Nulls never disqualify a column.
        if pd.isnull(value):
            return True
        try:
            return bool(int(value) == value)
        except (TypeError, ValueError, OverflowError):
            # int() rejected the value, so it is not an integer.
            return False

    # Check column by column instead of DataFrame.applymap, which pandas has
    # deprecated in favour of DataFrame.map.
    flags = [
        all(_is_integer_or_null(value) for value in column)
        for _, column in df.items()
    ]
    mask = pd.Series(flags, dtype=bool).to_numpy()
    return df.columns[mask].values
Functions
def dropColumnHalf(df)
-
Removes columns where the number of missing values is 50% or more
Expand source code
def dropColumnHalf(df):
    """
    Remove, in place, every column in which 50% or more of the values are missing.

    A column is kept only when strictly more than half of its rows hold a
    non-null value. The DataFrame is modified in place and nothing is
    returned. A DataFrame without rows is left untouched, because a missing
    percentage is undefined for it.
    """
    n_rows = len(df.index)
    if n_rows == 0:
        return
    # `thresh` is the minimum number of non-null values a column needs in
    # order to survive. Using `n_rows / 2` kept columns that were exactly half
    # empty (and passed a float where pandas documents an int); requiring a
    # strict majority matches the documented "50% or more" rule.
    min_present = n_rows // 2 + 1
    df.dropna(thresh=min_present, axis=1, inplace=True)
def fill_missing_with_column(df, into, fro)
-
Merges one column into the other, filling null values of the `into` column and removing the `fro` column
Expand source code
def fill_missing_with_column(df, into, fro):
    """
    Fill the gaps of one column from another, then discard the donor column.

    Null entries of column `into` are replaced by the values found at the
    same index in column `fro`; non-null entries of `into` are kept. Column
    `fro` is then dropped. The DataFrame is modified in place and nothing is
    returned.
    """
    target = df[into]
    donor = df[fro]
    df[into] = target.combine_first(donor)
    df.drop(columns=[fro], inplace=True)
def find_all_integer_columns(df)
-
Returns an array of all columns that contain only integers or null values
Expand source code
def find_all_integer_columns(df):
    """
    Return an array with the labels of all columns that contain only
    integer-valued or null entries.

    A value counts as integer-valued when `int(value) == value`, so floats
    such as 3.0 qualify. Null values are ignored, which means a column made
    up solely of nulls (or a column without rows) is reported as well. Values
    that cannot be converted with `int()` (text, infinities, ...) simply make
    their column non-integer instead of raising.
    """

    def _is_integer_or_null(value):
        # Nulls never disqualify a column.
        if pd.isnull(value):
            return True
        try:
            return bool(int(value) == value)
        except (TypeError, ValueError, OverflowError):
            # int() rejected the value, so it is not an integer.
            return False

    # Check column by column instead of DataFrame.applymap, which pandas has
    # deprecated in favour of DataFrame.map.
    flags = [
        all(_is_integer_or_null(value) for value in column)
        for _, column in df.items()
    ]
    mask = pd.Series(flags, dtype=bool).to_numpy()
    return df.columns[mask].values
def impute_df(df, verbose=0, **kwargs)
-
Imputes a df and returns a dataframe with the original and imputed values
Expand source code
def impute_df(df, verbose=0, **kwargs):
    """
    Impute the missing values of a DataFrame with scikit-learn's IterativeImputer.

    Parameters:
        df: DataFrame to impute; it is handed to the imputer unchanged.
        verbose: verbosity level forwarded to IterativeImputer.
        **kwargs: any further keyword arguments forwarded to IterativeImputer.

    Returns a new DataFrame built from the imputer's output, carrying the
    same columns and index as `df`.
    """
    imputer = IterativeImputer(verbose=verbose, **kwargs)
    # fit_transform already returns the imputed matrix; use it directly rather
    # than discarding it and running a second transform pass over the same data.
    imputed_values = imputer.fit_transform(df)
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)
def print_missing_percentages(df)
-
Prints the max, min and mean percentage of missing values across the columns
Expand source code
def print_missing_percentages(df):
    """
    Print the max, min and mean percentage of missing values across the columns.

    The percentage of null entries is computed per column; the largest,
    smallest and average of those percentages are printed.

    Returns a tuple `(min_missing, max_missing)` - note the order differs
    from the order in which the values are printed.
    """
    null_counts = df.isnull().sum()
    percent_per_column = null_counts * 100 / len(df)
    summary = {
        "Max:": percent_per_column.max(),
        "Min:": percent_per_column.min(),
        "Mean:": percent_per_column.mean(),
    }
    print("Max, min and mean number of missing values for the columns")
    for label, value in summary.items():
        print(label, value, '%')
    return summary["Min:"], summary["Max:"]