Module water_security.classification.model_handler
Expand source code
import importlib
import os
import pickle
from typing import Generator
import numpy as np
import pandas as pd
import shap
from data.model import MODEL_PATH
from data.model.metrics import (
TRAINING_METRICS_PATH,
VALIDATION_METRICS_PATH,
FEATURES_IMPORTANCES_PATH,
)
from data.model.predictions import FILLED_DATASET_PATH, PREDICTION_MASK_PATH
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from sklearn.metrics import (
classification_report,
confusion_matrix,
explained_variance_score,
mean_absolute_error,
mean_squared_error,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from utils.geo import (
get_average_1k_population_density,
get_elevation,
get_place,
is_close,
)
from classification import RANDOM_SEED
from classification.classifier import Classifier
from classification.feature_selection import FeatureSelectionAndGeneration
def regression_report(y_true, y_pred):
"""
Returns a regression report, including Mean Absolute and Squared Errors and Explained Variance
"""
return {
"MAE": mean_absolute_error(y_true, y_pred),
"MSE": mean_squared_error(y_true, y_pred),
"Explained Variance": explained_variance_score(y_true, y_pred),
}
class TrainingRequired(NotFittedError):
def __init__(self, obj):
super().__init__(f"{obj} could not be loaded. Training model is required")
class InvalidCoordinates(BaseException):
pass
class ModelHandler:
"""
Trains and tests the model, while also computing metrics.
During training, the model is first fitted and then produces predictions for any unlabeled points in the dataset.
During testing, it receives a latitude and longitude, computes the required features for the city, merges them with the country features,
uses the model to predict the output, and returns the SHAP values associated with the prediction.
"""
def __init__(self):
self._model = None
self._explainers = None
self._dataset = None
self._valid_metrics = None
self._train_metrics = None
self._filled_dataset = None
self.train_mask = None
self.feat_names = None
self.lab_names = None
# The id columns to remain in the filled dataset
self.id_columns = [
"city",
"country",
"country_code",
"c40",
"latitude",
"longitude",
"population_1k_density",
"elevation",
]
# The id columns to consider also as features
self.feat_id_columns = [
"latitude",
"longitude",
"population_1k_density",
"elevation",
]
@property
def model(self) -> Pipeline:
"""
If the model is not defined, try to load it from disk
"""
if self._model is None:
try:
from data.model import MODEL, MODEL_PATH
print(f"Loaded model from {MODEL_PATH}.")
self._model = MODEL
except ImportError:
raise TrainingRequired("Model")
return self._model
@model.setter
def model(self, model: Pipeline):
self._model = model
def save_model(self) -> None:
"""
Saves the model to disk
"""
with open(MODEL_PATH, "wb") as out:
pickle.dump(self.model, out)
import data.model
importlib.reload(data.model)
@property
def dataset(self) -> pd.DataFrame:
"""
The dataset for the training step.
When it is loaded the first time, several variables are defined:
- lab_names: the label names/columns of the dataset
- unique_labs: the unique label values
- feat_names: the feature names/columns of the dataset
- train_mask: the mask marking cities that are labeled for at least one risk
"""
if self._dataset is None:
from data.dataset import DATASET as dataset
from data.labeled.preprocessed import LABELED_CITIES, RISKS_MAPPING
self.lab_names = sorted(RISKS_MAPPING.keys())
self.unique_labs = np.unique(dataset[self.lab_names].T.stack().values)
self.feat_names = [
x
for x in dataset.columns
if x not in self.lab_names
and (x in self.feat_id_columns or x not in self.id_columns)
]
self.train_mask = dataset[self.lab_names].apply(
lambda x: not all(pd.isnull(x)), axis=1
)
self._dataset = dataset
return self._dataset
@property
def filled_dataset(self) -> pd.DataFrame:
"""
The dataset whose missing labels have been filled with the predictions
"""
if self._filled_dataset is None:
try:
self._filled_dataset = pd.read_csv(FILLED_DATASET_PATH)
except IOError:
raise TrainingRequired("Filled Dataset")
return self._filled_dataset
@filled_dataset.setter
def filled_dataset(self, dataset: pd.DataFrame):
self._filled_dataset = dataset
def compute_metrics(self, y_true, y_pred):
"""
Compute metrics for regression labels of size nx1
"""
metrics = {}
# Snap each prediction to the nearest label value, e.g. convert 0.2 to 0, 0.7 to 1, etc.
y_pred_interp = self.unique_labs[
np.abs(np.reshape(self.unique_labs, (-1, 1)) - y_pred).argmin(axis=0)
]
metrics["confusion_matrix"] = confusion_matrix(y_true, y_pred_interp)
metrics["classification_report"] = classification_report(
y_true, y_pred_interp, output_dict=True
)
metrics["regression_report"] = regression_report(y_true, y_pred)
return metrics
@property
def is_fitted(self) -> bool:
"""
Tries to load the model from memory/disk; returns False if it fails, True otherwise
"""
try:
self.model
except TrainingRequired:
return False
return True
def get_total_train_val_set_per_risk(self) -> Generator:
dataset = self.dataset
labeled = dataset[self.train_mask]
for label in self.lab_names:
train_mask = ~pd.isnull(dataset[label])
labeled = dataset.loc[train_mask, :]
try:
train_set, valid_set = train_test_split(
labeled,
test_size=0.3,
random_state=RANDOM_SEED,
stratify=labeled[label],
)
except ValueError:
print(
f"Using normal split for label {label} due to underrepresented levels."
f" The level counts for that label are:\n {labeled[label].value_counts()}"
)
train_set, valid_set = train_test_split(
labeled,
test_size=0.3,
random_state=RANDOM_SEED,
)
yield (label, labeled, [train_set, valid_set])
@property
def explainers(self):
"""
The SHAP explainers per model
"""
if self._explainers is None:
self._explainers = {
label: shap.Explainer(
self.model[label].named_steps["Classification"].regressor,
)
for label in self.model
}
return self._explainers
def train(self) -> None:
"""
- Trains 7 different models, one per water security risk.
- Applies feature selection and generation separately for each model.
- Holds out 30% of the data for validation, computes classification metrics, saves them, then refits each model on the whole available dataset for its risk.
- Creates the filled dataset and saves it to disk.
- Creates the prediction mask (which labels in the filled dataset were predicted) and saves it to disk.
"""
model = {}
train_metrics = {}
valid_metrics = {}
filled_dataset: pd.DataFrame = None
importances = {}
for (
label,
labeled,
[train_set, valid_set],
) in self.get_total_train_val_set_per_risk():
model[label] = Pipeline(
[
("FeatureSelection", FeatureSelectionAndGeneration(feats_num=500)),
("Classification", Classifier(label)),
]
)
model[label].fit(train_set[self.feat_names], train_set[label])
train_preds = model[label].predict(train_set[self.feat_names])
valid_preds = model[label].predict(valid_set[self.feat_names])
train_metrics[label] = self.compute_metrics(train_set[label], train_preds)
valid_metrics[label] = self.compute_metrics(valid_set[label], valid_preds)
model[label].fit(labeled[self.feat_names], labeled[label])
importances[label] = (
model[label].named_steps["Classification"].feature_importances_
)
if filled_dataset is None:
filled_dataset = self.dataset[self.id_columns + self.lab_names].copy()
filled_dataset.loc[~self.train_mask, label] = model[label].predict(
self.dataset.loc[~self.train_mask, self.feat_names]
)
self.model = model
self.save_model()
with open(VALIDATION_METRICS_PATH, "wb") as out:
pickle.dump(valid_metrics, out)
with open(TRAINING_METRICS_PATH, "wb") as out:
pickle.dump(train_metrics, out)
with open(FEATURES_IMPORTANCES_PATH, "wb") as out:
pickle.dump(importances, out)
self.filled_dataset = filled_dataset
self.filled_dataset.to_csv(FILLED_DATASET_PATH, index=False)
prediction_mask = self.filled_dataset[self.id_columns + self.lab_names].copy()
prediction_mask[self.lab_names] = pd.isnull(self.dataset[self.lab_names])
prediction_mask.to_csv(PREDICTION_MASK_PATH, index=False)
import data.model.metrics
importlib.reload(data.model.metrics)
import data.model.predictions
importlib.reload(data.model.predictions)
def test(self, latitude: float, longitude: float):
"""
Given a latitude and longitude, either returns saved predictions from the filled dataset, if the point is close to one
that has already been predicted, or uses a REST API to resolve the country to which the coordinates refer, uses the country data
to create the feature vector, and computes the prediction with the trained models.
Returns the series of found labels, which also contains city and country,
and a series of booleans indicating which labels were predicted and which were not.
If it is an online prediction, it also returns the SHAP values associated with the prediction.
"""
try:
from data.model.predictions import FILLED_DATASET, PREDICTION_MASK
except ImportError:
raise TrainingRequired("Filled Dataset")
check_existing = FILLED_DATASET.apply(
lambda x: is_close((latitude, longitude), (x["latitude"], x["longitude"])),
axis=1,
)
if np.any(check_existing):
labs = list(sorted(self.model.keys()))
return (
FILLED_DATASET.loc[check_existing, labs + ["city", "country"]].iloc[0],
PREDICTION_MASK.loc[check_existing, labs].iloc[0],
)
return self._test_online_prediction(latitude, longitude)
def _test_online_prediction(self, latitude, longitude):
try:
place = get_place(latitude, longitude)
except AttributeError:
raise InvalidCoordinates
population_density = get_average_1k_population_density(latitude, longitude)
elevation = get_elevation(latitude, longitude)
from data.unlabeled import COUNTRIES_DATASET
feats = COUNTRIES_DATASET.loc[place["code"]].copy()
feats["latitude"] = latitude
feats["longitude"] = longitude
feats["population_1k_density"] = population_density
feats["elevation"] = elevation
feats["population"] = None
preds = {}
mask = {}
shap_values = {}
for label in self.model:
preds[label] = self.model[label].predict(feats)[0]
mask[label] = True
transformed = (
self.model[label].named_steps["FeatureSelection"].transform(feats)
)
shap_values[label] = self.explainers[label](transformed)
preds["city"] = place["city"]
preds["country"] = place["country"]
return pd.Series(preds), pd.Series(mask), shap_values
Functions
def regression_report(y_true, y_pred)
-
Returns a regression report, including Mean Absolute and Squared Errors and Explained Variance
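A minimal usage sketch (the numeric values are illustrative, and the import path is assumed from the module name in the header):

import numpy as np
from water_security.classification.model_handler import regression_report  # assumed import path

y_true = np.array([0, 1, 2, 2])
y_pred = np.array([0.1, 0.9, 1.8, 2.6])
report = regression_report(y_true, y_pred)
print(report["MAE"], report["MSE"], report["Explained Variance"])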
Classes
class InvalidCoordinates (*args, **kwargs)
-
Common base class for all exceptions
Ancestors
- builtins.BaseException
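Note that InvalidCoordinates derives from BaseException rather than Exception, so a bare except Exception will not catch it. A minimal sketch of handling it (the coordinates are arbitrary):

handler = ModelHandler()
try:
    handler.test(0.0, 0.0)  # raised when the reverse-geocoding lookup cannot resolve the point
except InvalidCoordinates:
    print("Coordinates could not be resolved to a known place")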
class ModelHandler
-
Trains and tests the model, while also computing metrics. During training, the model is first fitted and then produces predictions for any unlabeled points in the dataset. During testing, it receives a latitude and longitude, computes the required features for the city, merges them with the country features, uses the model to predict the output, and returns the SHAP values associated with the prediction.
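A minimal end-to-end sketch, assuming the data packages and any saved artifacts are available (the coordinates are arbitrary):

handler = ModelHandler()
if not handler.is_fitted:
    handler.train()                        # fits one pipeline per risk and writes artifacts to disk
preds, mask, *shap_values = handler.test(48.85, 2.35)   # SHAP values are returned only for online predictions
print(preds["city"], preds["country"])
print(mask)                                # which labels were predicted rather than observed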
Instance variables
var dataset : pandas.core.frame.DataFrame
-
The dataset for the training step.
When it is loaded the first time, several variables are defined:
- lab_names: the label names/columns of the dataset
- unique_labs: the unique label values
- feat_names: the feature names/columns of the dataset
- train_mask: the mask marking cities that are labeled for at least one risk
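The first access loads the dataset lazily and populates the derived attributes; a short sketch, assuming data.dataset is importable:

handler = ModelHandler()
df = handler.dataset                 # triggers the lazy load
print(handler.lab_names)             # label columns, sorted
print(len(handler.feat_names))       # feature columns used for training
print(handler.train_mask.sum())      # cities labeled for at least one risk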
var explainers
-
The SHAP explainers per model
var filled_dataset : pandas.core.frame.DataFrame
-
The dataset whose missing labels have been filled with the predictions
var is_fitted : bool
-
Tries to load the model from memory/disk; returns False if it fails, True otherwise
var model : sklearn.pipeline.Pipeline
-
If the model is not defined, try to load it from disk
Methods
def compute_metrics(self, y_true, y_pred)
-
Compute metrics for regression labels of size nx1
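Classification metrics are computed after snapping each continuous prediction to the nearest label value; a standalone sketch of that snapping step (values are illustrative):

import numpy as np

unique_labs = np.array([0.0, 1.0, 2.0, 3.0])   # the discrete risk levels
y_pred = np.array([0.2, 0.7, 2.4])             # raw regressor output
y_pred_interp = unique_labs[
    np.abs(np.reshape(unique_labs, (-1, 1)) - y_pred).argmin(axis=0)
]
print(y_pred_interp)                           # [0. 1. 2.]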
def get_total_train_val_set_per_risk(self) ‑> Generator
-
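Each iteration yields one risk label together with its labeled subset and a 70/30 train/validation split; a brief sketch, assuming a prepared dataset:

handler = ModelHandler()
for label, labeled, (train_set, valid_set) in handler.get_total_train_val_set_per_risk():
    print(label, len(labeled), len(train_set), len(valid_set))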
def save_model(self) ‑> NoneType
-
Saves the model to disk
def test(self, latitude: float, longitude: float)
-
Given a latitude and longitude, either returns saved predictions from the filled dataset, if the point is close to one that has already been predicted, or uses a REST API to resolve the country to which the coordinates refer, uses the country data to create the feature vector, and computes the prediction with the trained models. Returns the series of found labels, which also contains city and country, and a series of booleans indicating which labels were predicted and which were not. If it is an online prediction, it also returns the SHAP values associated with the prediction.
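Because the return value has two elements for cached predictions and three for online ones, callers can unpack it defensively; an illustrative sketch (the coordinates are arbitrary):

handler = ModelHandler()
result = handler.test(51.5, -0.12)
if len(result) == 3:
    preds, mask, shap_values = result   # online prediction, SHAP values included
else:
    preds, mask = result                # served from the filled dataset
print(preds["city"], preds["country"])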
def train(self) ‑> NoneType
-
- Trains 7 different models, one per water security risk.
- Applies feature selection and generation separately for each model.
- Holds out 30% of the data for validation, computes classification metrics, saves them, then refits each model on the whole available dataset for its risk.
- Creates the filled dataset and saves it to disk.
- Creates the prediction mask (which labels in the filled dataset were predicted) and saves it to disk.
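After training, the metric pickles written by this method can be inspected directly; a sketch, assuming train() has been run, using the path constants imported at the top of the module:

import pickle
from data.model.metrics import VALIDATION_METRICS_PATH

with open(VALIDATION_METRICS_PATH, "rb") as fh:
    valid_metrics = pickle.load(fh)
for label, metrics in valid_metrics.items():
    print(label, metrics["regression_report"]["MAE"])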
class TrainingRequired (obj)
-
Exception class to raise if estimator is used before fitting.
This class inherits from both ValueError and AttributeError to help with exception handling and backward compatibility.
Examples
>>> from sklearn.svm import LinearSVC
>>> from sklearn.exceptions import NotFittedError
>>> try:
...     LinearSVC().predict([[1, 2], [2, 3], [3, 4]])
... except NotFittedError as e:
...     print(repr(e))
NotFittedError("This LinearSVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."...)
Changed in version: 0.18
Moved from sklearn.utils.validation.
Ancestors
- sklearn.exceptions.NotFittedError
- builtins.ValueError
- builtins.AttributeError
- builtins.Exception
- builtins.BaseException
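Because TrainingRequired subclasses sklearn's NotFittedError, callers can catch either; a minimal sketch:

handler = ModelHandler()
try:
    _ = handler.model        # raises TrainingRequired if no saved model can be loaded
except TrainingRequired:
    handler.train()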