import sys
sys.path.append('..')
import numpy as np
import pandas as pd
import country_converter as coco
from haversine import haversine
Population figures will be filled in from the world-cities dataset loaded below.
from data.unlabeled import WORLD_CITIES
from data.labeled.preprocessed import LABELED_CITIES
WORLD_CITIES.head()
# map every country name appearing in either dataset to its ISO3 code
code_dict = {x: coco.convert(x)
             for x in set(WORLD_CITIES['country'].unique()).union(LABELED_CITIES['country'].unique())}
WORLD_CITIES['country_code'] = WORLD_CITIES['country'].map(code_dict)
LABELED_CITIES['country_code'] = LABELED_CITIES['country'].map(code_dict)
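As a quick sanity check: country_converter maps free-form names to ISO3 codes by default (the inputs below are just illustrations).
print(coco.convert('Germany'))        # 'DEU' (ISO3 is the default target)
print(coco.convert(['US', 'China']))  # ['USA', 'CHN'] (lists convert element-wise)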
# for every labeled city, compute haversine distances (km) to all world cities in the same country
mapping_dists = {
    x['city']: {
        y['city']: (haversine((x['latitude'], x['longitude']), (y['lat'], y['lng']))
                    if x['city'] != y['city'] else 0)
        for (_, y) in WORLD_CITIES[WORLD_CITIES['country_code'] == x['country_code']].iterrows()
    }
    for (_, x) in LABELED_CITIES.iterrows()
}
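haversine expects (latitude, longitude) tuples and returns kilometres by default; a quick illustrative check with two well-known cities:
berlin, paris = (52.5200, 13.4050), (48.8566, 2.3522)
haversine(berlin, paris)  # roughly 878 km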
WORLD_CITIES.set_index('city', inplace=True)
LABELED_CITIES.set_index('city', inplace=True)
# some labeled cities appear to have latitude and longitude swapped: if the nearest
# in-country match is 2000 km or more away, recompute with the labeled coordinates swapped
mapping_dists_fixed = {
    x: (dict(d) if min(d.values()) < 2000 else
        {y: haversine((LABELED_CITIES.loc[[x], 'longitude'].values[0],
                       LABELED_CITIES.loc[[x], 'latitude'].values[0]),
                      (WORLD_CITIES.loc[[y], 'lat'].values[0],
                       WORLD_CITIES.loc[[y], 'lng'].values[0]))
         for y in d})
    for x, d in mapping_dists.items()
}
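To see why swapped coordinates trip the 2000 km guard, compare the two orderings for an illustrative point:
madrid = (40.4168, -3.7038)                        # correct (latitude, longitude)
haversine(madrid, (40.42, -3.70))                  # well under 1 km
haversine((madrid[1], madrid[0]), (40.42, -3.70))  # thousands of km with the axes swapped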
mapping_df = pd.DataFrame(mapping_dists_fixed).T
mapping_df.head()
# for every labeled city, find the nearest world city and the distance to it
closest_match_df = pd.DataFrame(
    mapping_df.fillna(100000000)
              .apply(lambda x: {'match': mapping_df.columns[np.argmin(x)], 'distance': np.min(x)}, axis=1)
              .tolist()
)
closest_match_df.index = mapping_df.index
closest_match_df
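An equivalent and more idiomatic way to build the same table uses idxmin/min; a sketch that matches the result above under the same fillna:
filled = mapping_df.fillna(100000000)
closest_match_alt = pd.DataFrame({'match': filled.idxmin(axis=1), 'distance': filled.min(axis=1)})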
dist_thres = 10 # the distance threshold for a city to be considered mapped correctly, in km
unmapped = closest_match_df[closest_match_df['distance']>dist_thres]
to_add_whole = LABELED_CITIES.loc[unmapped.index].copy() # add those rows as a whole to the augmented dataset
to_add_labels = LABELED_CITIES.loc[closest_match_df[closest_match_df['distance']<=dist_thres].index].copy() # only add the labels from these rows
to_add_labels.index = closest_match_df.loc[closest_match_df['distance']<=dist_thres,'match']
Creating an augmented dataset with all the cities, annotated or not
from data.labeled.preprocessed import RISKS_MAPPING
risks_cols = list(RISKS_MAPPING.keys())
# start from the world cities, attach labels for matched cities, then append the unmatched labeled cities
augmented_cities_dataset = (WORLD_CITIES[['lat', 'lng', 'country', 'population', 'country_code']]
                            .rename(columns={'lat': 'latitude', 'lng': 'longitude'})
                            .copy())
augmented_cities_dataset = augmented_cities_dataset.merge(to_add_labels[['c40'] + risks_cols],
                                                          left_index=True, right_index=True, how='left')
augmented_cities_dataset = pd.concat([augmented_cities_dataset, to_add_whole], axis=0)
augmented_cities_dataset['c40'] = augmented_cities_dataset['c40'].fillna(False)
# drop cities with no population figure
augmented_cities_dataset = augmented_cities_dataset[~pd.isna(augmented_cities_dataset['population'])]
augmented_cities_dataset
Some population values have a fractional part, probably because '.' was used as a thousands separator; treat those values as thousands and scale them up.
# fractional populations were recorded in thousands; scale them to absolute counts
augmented_cities_dataset['population'] = augmented_cities_dataset['population'].apply(lambda x: x * 1000 if int(x) != x else x)
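The heuristic on illustrative values: a fractional population was recorded in thousands, an integer one is already an absolute count.
scale = lambda x: x * 1000 if int(x) != x else x  # same rule as the line above
scale(2.5)     # 2500.0: '.' was a thousands separator
scale(2500.0)  # 2500.0: already an absolute count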
from data.unlabeled import BIG_CITIES_ALL_COUNTRIES
# keep a city if it has labels or is among its country's biggest cities
unlab_mask = augmented_cities_dataset[risks_cols].isnull().all(axis=1)
not_in_big_cities_mask = ~augmented_cities_dataset.index.isin(BIG_CITIES_ALL_COUNTRIES.city.values)
augmented_cities_dataset = augmented_cities_dataset[~(unlab_mask & not_in_big_cities_mask)]
augmented_cities_dataset
from data.unlabeled import COUNTRIES_DATASET
set(augmented_cities_dataset.columns).intersection(COUNTRIES_DATASET.columns)
augmented_cities_dataset = augmented_cities_dataset.merge(COUNTRIES_DATASET, left_on='country_code', right_index=True)
all(augmented_cities_dataset.Country == augmented_cities_dataset.country)  # sanity check: both name sources agree
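A stricter variant of this check would fail loudly rather than just print a boolean; a sketch:
assert (augmented_cities_dataset.Country == augmented_cities_dataset.country).all(), \
    'country names disagree after the merge'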
augmented_cities_dataset.drop(columns='country',inplace=True)
augmented_cities_dataset.rename(columns={'Country':'country'},inplace=True)
augmented_cities_dataset = (augmented_cities_dataset.reset_index()
                            .rename(columns={'index': 'city'})
                            .drop_duplicates(subset=['city', 'country']))
augmented_cities_dataset
from data.dataset import DATASET_PATH
augmented_cities_dataset.to_csv(DATASET_PATH,index=False)
(~(augmented_cities_dataset[risks_cols].isnull()).all(axis=1)).sum() # the labeled samples