In [1]:
import sys
sys.path.append('..')
from data.dataset import DATASET, DATASET_PATH
from utils.geo import get_average_1k_population_density,get_place
In [2]:
# Add a `population_1k_density` column by calling
# get_average_1k_population_density(latitude, longitude) once for every row.
# NOTE(review): the recorded output of this cell shows NumPy "Mean of empty slice"
# warnings, and at least one row (Roma, Italy) ends up with NaN — presumably the
# helper found no data around those coordinates; confirm in utils.geo.
DATASET['population_1k_density'] = DATASET.apply(
    lambda row: get_average_1k_population_density(row['latitude'], row['longitude']),
    axis=1,
)
/home/vaslem/code/bioinformatics_ms/modern_data_analytics/.venv/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3419: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/home/vaslem/code/bioinformatics_ms/modern_data_analytics/.venv/lib/python3.8/site-packages/numpy/core/_methods.py:188: RuntimeWarning: invalid value encountered in true_divide
  ret = ret.dtype.type(ret / rcount)
In [3]:
# Last ten rows when ordered by ascending density, i.e. the densest cities.
# sort_values puts NaN last by default, so rows whose density could not be
# computed also show up at the end of this view.
DATASET.sort_values('population_1k_density')[
    ['city', 'country', 'latitude', 'longitude', 'population_1k_density']
].tail(10)
Out[3]:
city country latitude longitude population_1k_density
181 Kano Nigeria 12.0000 8.5167 32776.671875
680 Shenzhen China 22.5350 114.0540 32942.902344
1081 Hanoi Viet Nam 21.0245 105.8412 33384.570312
936 Dhaka Bangladesh 23.7161 90.3961 34085.695312
677 Guangzhou China 23.1288 113.2590 38069.554688
1105 Mbuji-Mayi Democratic Republic of the Congo -6.1209 23.5967 38148.828125
742 Kathmandu Nepal 27.7167 85.3667 38521.742188
587 Bangalore India 12.9699 77.5980 40835.621094
716 Luanda Angola -8.8383 13.2344 42064.023438
793 Roma Italy 41.5300 12.2858 NaN
In [4]:
# First ten rows when ordered by ascending density, i.e. the sparsest locations.
# These are the candidates for manual curation below.
DATASET.sort_values('population_1k_density')[
    ['city', 'country', 'latitude', 'longitude', 'population_1k_density']
].head(10)
Out[4]:
city country latitude longitude population_1k_density
401 Natal Brazil -6.9838 -60.2699 0.003718
725 Vhembe South Africa -22.7695 29.9740 0.004092
16 Anchorage United States of America 61.1508 -149.1091 0.020515
230 Douglas Canada 46.2819 -66.9420 0.076147
792 Oristano Italy 39.7207 8.8980 0.193612
199 Georgetown Australia -18.3000 143.5500 0.341027
515 JICOSUR Mexico 19.3400 -104.3800 0.483595
724 Sekhukhune District Municipality South Africa -24.8335 29.9740 0.825482
271 Victoria Romania 45.6466 24.7059 1.035097
975 Santa Cruz Ecuador -0.5333 -90.3500 1.056872

The population density results can be used to manually curate this rather large dataset,

as a density that is NaN or lower than 2 people/km² is probably the result of a bad measurement (e.g. incorrect coordinates).

In [5]:
# Roma, Italy was stored at (41.5300, 12.2858), which is incorrect.
# Manual curation: overwrite the coordinates, then recompute the density for that row.
flag = DATASET['city'].eq('Roma') & DATASET['country'].eq('Italy')
DATASET.loc[flag, 'latitude'] = 41.9028
DATASET.loc[flag, 'longitude'] = 12.4964
DATASET.loc[flag, 'population_1k_density'] = DATASET.loc[flag].apply(
    lambda row: get_average_1k_population_density(row['latitude'], row['longitude']),
    axis=1,
)
# Display the recomputed density as the cell output.
DATASET.loc[flag, 'population_1k_density']
Out[5]:
793    4297.904297
Name: population_1k_density, dtype: float64
In [6]:
# Natal, Brazil was stored at (-6.9838, -60.2699), which is incorrect.
# Manual curation: overwrite the coordinates, then recompute the density for that row.
# The row mask is built once and reused (as in the other curation cells) so that
# all three assignments and the final display are guaranteed to target the same rows.
flag = (DATASET.city == 'Natal') & (DATASET.country == 'Brazil')
DATASET.loc[flag, 'latitude'] = -5.7793
DATASET.loc[flag, 'longitude'] = -35.2009
DATASET.loc[flag, 'population_1k_density'] = DATASET.loc[flag].apply(
    lambda city: get_average_1k_population_density(city['latitude'], city['longitude']),
    axis=1,
)
# Display the recomputed density as the cell output.
DATASET.loc[flag, 'population_1k_density']
Out[6]:
401    1053.169678
Name: population_1k_density, dtype: float64
In [7]:
# Sanity check: look up the place for the corrected Natal coordinates used above.
get_place(-5.7793, -35.2009)
Out[7]:
{'city': 'Natal', 'country': 'Brazil', 'code': 'BRA'}
In [8]:
# Anchorage, United States of America was stored at (61.1508, -149.1091), which is incorrect.
# Manual curation: overwrite the coordinates, then recompute the density for that row.
flag = DATASET['city'].eq('Anchorage') & DATASET['country'].eq('United States of America')
DATASET.loc[flag, 'latitude'] = 61.2181
DATASET.loc[flag, 'longitude'] = -149.9003
DATASET.loc[flag, 'population_1k_density'] = DATASET.loc[flag].apply(
    lambda row: get_average_1k_population_density(row['latitude'], row['longitude']),
    axis=1,
)
# Display the recomputed density as the cell output.
DATASET.loc[flag, 'population_1k_density']
Out[8]:
16    319.156555
Name: population_1k_density, dtype: float64
In [9]:
# Douglas, Canada was stored at (46.2819, -66.9420), which is incorrect.
# Manual curation: overwrite the coordinates, then recompute the density for that row.
flag = DATASET['city'].eq('Douglas') & DATASET['country'].eq('Canada')
DATASET.loc[flag, 'latitude'] = 49.0056
DATASET.loc[flag, 'longitude'] = -122.7452
DATASET.loc[flag, 'population_1k_density'] = DATASET.loc[flag].apply(
    lambda row: get_average_1k_population_density(row['latitude'], row['longitude']),
    axis=1,
)
# Display the recomputed density as the cell output.
DATASET.loc[flag, 'population_1k_density']
Out[9]:
230    63.305809
Name: population_1k_density, dtype: float64
In [10]:
# Santa Cruz, Ecuador was stored at (-0.5333, -90.3500), which is incorrect.
# Manual curation: overwrite the coordinates, then recompute the density for that row.
flag = DATASET['city'].eq('Santa Cruz') & DATASET['country'].eq('Ecuador')
DATASET.loc[flag, 'latitude'] = -0.6394
DATASET.loc[flag, 'longitude'] = -90.3372
DATASET.loc[flag, 'population_1k_density'] = DATASET.loc[flag].apply(
    lambda row: get_average_1k_population_density(row['latitude'], row['longitude']),
    axis=1,
)
# Display the recomputed density as the cell output.
DATASET.loc[flag, 'population_1k_density']
Out[10]:
975    1.603762
Name: population_1k_density, dtype: float64
In [11]:
# Drop Vhembe, South Africa: it is a municipality of 25597 sqkm.
# Manual curation: keep every row that is NOT (city == 'Vhembe' and country == 'South Africa'),
# written here as "city differs OR country differs".
flag = (DATASET['city'] != 'Vhembe') | (DATASET['country'] != 'South Africa')
# .copy() so that later .loc assignments act on an independent frame, not a view.
DATASET = DATASET.loc[flag].copy()
In [12]:
# "JICOSUR", Mexico at (19.3400, -104.3800) is an unknown location name.
# Manual curation: relabel the row as Cihuatlán; coordinates and density are left as they are.
flag = DATASET['city'].eq('JICOSUR') & DATASET['country'].eq('Mexico')
DATASET.loc[flag, 'city'] = 'Cihuatlán'
In [13]:
# Oristano, Italy was stored at (39.7207, 8.8980), which is incorrect.
# Manual curation: overwrite the coordinates, then recompute the density for that row.
flag = DATASET['city'].eq('Oristano') & DATASET['country'].eq('Italy')
DATASET.loc[flag, 'latitude'] = 39.9062
DATASET.loc[flag, 'longitude'] = 8.5884
DATASET.loc[flag, 'population_1k_density'] = DATASET.loc[flag].apply(
    lambda row: get_average_1k_population_density(row['latitude'], row['longitude']),
    axis=1,
)
# Display the recomputed density as the cell output.
DATASET.loc[flag, 'population_1k_density']
Out[13]:
792    25.601376
Name: population_1k_density, dtype: float64
In [14]:
# Re-inspect the ten sparsest rows after the curation steps above,
# this time also showing the `c40` column.
DATASET.sort_values('population_1k_density')[
    ['city', 'country', 'latitude', 'longitude', 'population_1k_density', 'c40']
].head(10)
Out[14]:
city country latitude longitude population_1k_density c40
199 Georgetown Australia -18.3000 143.5500 0.341027 False
515 Cihuatlán Mexico 19.3400 -104.3800 0.483595 False
724 Sekhukhune District Municipality South Africa -24.8335 29.9740 0.825482 False
271 Victoria Romania 45.6466 24.7059 1.035097 False
838 Cottica Suriname 3.8547 -54.2289 1.318348 False
975 Santa Cruz Ecuador -0.6394 -90.3372 1.603762 False
488 New Plymouth District New Zealand -39.1011 174.3540 1.658758 False
806 Parma Russian Federation 65.9230 57.4030 1.797995 False
1069 Lata Solomon Islands -10.7380 165.8567 2.206394 False
837 Brokopondo Suriname 5.0404 -55.0200 2.214054 False
In [15]:
# Georgetown, Australia was stored at (-18.3000, 143.5500), flagged here as incorrect.
# Manual curation: overwrite the coordinates, then recompute the density for that row.
# NOTE(review): in the recorded output the density DROPPED after this change
# (0.341027 before, 0.011311 after) and NumPy emitted "Mean of empty slice"
# warnings — the replacement coordinates may be worse than the originals; verify.
flag = DATASET['city'].eq('Georgetown') & DATASET['country'].eq('Australia')
DATASET.loc[flag, 'latitude'] = -18.2470
DATASET.loc[flag, 'longitude'] = 143.0360
DATASET.loc[flag, 'population_1k_density'] = DATASET.loc[flag].apply(
    lambda row: get_average_1k_population_density(row['latitude'], row['longitude']),
    axis=1,
)
# Display the recomputed density as the cell output.
DATASET.loc[flag, 'population_1k_density']
/home/vaslem/code/bioinformatics_ms/modern_data_analytics/.venv/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3419: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/home/vaslem/code/bioinformatics_ms/modern_data_analytics/.venv/lib/python3.8/site-packages/numpy/core/_methods.py:188: RuntimeWarning: invalid value encountered in true_divide
  ret = ret.dtype.type(ret / rcount)
Out[15]:
199    0.011311
Name: population_1k_density, dtype: float64
In [16]:
# Persist the curated frame (including the new `population_1k_density` column)
# as CSV at DATASET_PATH; an existing file at that path is overwritten.
# NOTE(review): presumably this is the same file DATASET was loaded from, which
# would make the curation above irreversible on disk — confirm in data.dataset.
DATASET.to_csv(path_or_buf=DATASET_PATH)