In [1]:
import pandas as pd
import numpy as np
from langdetect import detect as langdetect
import random
seed = 42
np.random.seed(seed)
random.seed(seed)
import sys
sys.path.append("..")
from data.labeled.raw import CWA as cwa, CWR as cwr
from data.labeled.preprocessed import LABELED_CITIES_PATH, RISKS_MAPPING_PATH, SEVERITY_MAPPING_PATH, IMPUTATION_REPORT_PATH

Preprocess whole dataset

In [2]:
cwa.head()
Out[2]:
Account No Organisation City Country Region Access C40 Reporting Year Risks to city's water supply Adaptation action Action description City Location
0 1093 City of Atlanta Atlanta United States of America North America public NaN 2018 Increased water stress or scarcity Investment in existing water supply infrastruc... The city is building a new reservoir, fixing t... (33.749, -84.388)
1 1184 City of Austin Austin United States of America North America public C40 2018 Increased water stress or scarcity Conservation awareness and education NaN (30.2672, -97.7431)
2 1184 City of Austin Austin United States of America North America public C40 2018 Increased water stress or scarcity Conservation incentives NaN (30.2672, -97.7431)
3 1184 City of Austin Austin United States of America North America public C40 2018 Increased water stress or scarcity Water use restrictions NaN (30.2672, -97.7431)
4 1184 City of Austin Austin United States of America North America public C40 2018 Increased water stress or scarcity Efficiency regulations or standards NaN (30.2672, -97.7431)
In [3]:
cwr.head()
Out[3]:
Account No Organisation City Country Region Access C40 Reporting year Risks to city's water supply Timescale Magnitude Risk description Current population Population year City Location
0 1093 City of Atlanta Atlanta United States of America North America public NaN 2018 Increased water stress or scarcity Short-term Serious The city depends on one river for water supply... 450182.0 2017 (33.749, -84.388)
1 1093 City of Atlanta Atlanta United States of America North America public NaN 2018 Higher water prices Current Serious The water cost burden is felt across all ZIP c... 450182.0 2017 (33.749, -84.388)
2 1184 City of Austin Austin United States of America North America public C40 2018 Increased water stress or scarcity Medium-term Extremely serious NaN 1265974.0 2017 (30.2672, -97.7431)
3 1499 Ajuntament de Barcelona Barcelona Spain Europe public C40 2018 Increased water stress or scarcity Current Extremely serious The downscaling of the climate projections at ... 1604555.0 2015 (41.3823, 2.1775)
4 1499 Ajuntament de Barcelona Barcelona Spain Europe public C40 2018 Declining water quality Medium-term Serious A reduction in water resources availability c... 1604555.0 2015 (41.3823, 2.1775)

Normalize columns names

In [4]:
# Convert columns names to lowercase
cwa.columns = cwa.columns.str.lower().str.replace(" ", "_").str.replace("'","")
cwr.columns = cwr.columns.str.lower().str.replace(" ", "_").str.replace("'","")

Find counts of nulls per column

In [5]:
cwr.isnull().sum()
Out[5]:
account_no                       0
organisation                     0
city                             0
country                          0
region                           0
access                           0
c40                            504
reporting_year                   0
risks_to_citys_water_supply     40
timescale                       11
magnitude                      159
risk_description               107
current_population               0
population_year                  0
city_location                    0
dtype: int64
In [6]:
cwa.isnull().sum()
Out[6]:
account_no                       0
organisation                     0
city                             0
country                          0
region                           0
access                           0
c40                            536
reporting_year                   0
risks_to_citys_water_supply     75
adaptation_action               19
action_description             146
city_location                    0
dtype: int64

Find columns that exist in both datasets

In [7]:
id_columns = list(set(cwr.columns) & set(cwa.columns))
id_columns
Out[7]:
['region',
 'city',
 'access',
 'country',
 'risks_to_citys_water_supply',
 'c40',
 'organisation',
 'city_location',
 'account_no',
 'reporting_year']

Check how many nulls exist in these columns

In [8]:
cwr[id_columns].isnull().sum()
Out[8]:
region                           0
city                             0
access                           0
country                          0
risks_to_citys_water_supply     40
c40                            504
organisation                     0
city_location                    0
account_no                       0
reporting_year                   0
dtype: int64
In [9]:
cwa[id_columns].isnull().sum()
Out[9]:
region                           0
city                             0
access                           0
country                          0
risks_to_citys_water_supply     75
c40                            536
organisation                     0
city_location                    0
account_no                       0
reporting_year                   0
dtype: int64

Replace nulls there with "nan" strings, so that merging is done correctly

In [10]:
cwr['c40'] = cwr['c40'] == 'C40'
cwr[id_columns] = cwr[id_columns].fillna('nan')
In [11]:
cwa['c40'] = cwa['c40'] == 'C40'
cwa[id_columns] = cwa[id_columns].fillna('nan')

Check id_columns uniqueness

In [12]:
len(cwa[id_columns]), len(cwa[id_columns].drop_duplicates())
Out[12]:
(685, 484)
In [13]:
len(cwr[id_columns]), len(cwr[id_columns].drop_duplicates())
Out[13]:
(608, 578)

Aggregate by grouping by id_columns and applying set operations to other colums

In [14]:
cwa_agg = cwa.groupby(id_columns).agg(set).reset_index()
cwr_agg = cwr.groupby(id_columns).agg(set).reset_index()
In [15]:
len(cwa_agg[id_columns]), len(cwa_agg[id_columns].drop_duplicates())
Out[15]:
(484, 484)
In [16]:
len(cwr_agg[id_columns]), len(cwr_agg[id_columns].drop_duplicates())
Out[16]:
(578, 578)

Make sure that risks magnitude (which is going to be used as an entry label) has a single value

In [17]:
cwr_agg.magnitude = cwr_agg.magnitude.apply(lambda x: 'Extremely serious' if 'Extremely serious' in x  else ('Serious' if 'Serious' in x else list(x)[0]))
In [18]:
cwr_agg.magnitude.unique()
Out[18]:
array([nan, 'Serious', 'Extremely serious', 'Less Serious'], dtype=object)

Merge (outer) datasets and find the columns that are set (list_columns)

filling the non-merged rows with {np.nan} on list_columns to make sure there is contingency in each column

In [19]:
cw_data = pd.merge(cwa_agg, cwr_agg, on=id_columns, how='outer')
list_columns = cw_data.iloc[0].apply(lambda x: isinstance(x, set))
list_columns = list_columns[list_columns].index
cw_data[list_columns] = cw_data[list_columns].applymap(lambda x: {np.nan} if pd.isna(x) else x)
In [20]:
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
Out[20]:
(677, 677)
In [21]:
cw_data.head()
Out[21]:
region city access country risks_to_citys_water_supply c40 organisation city_location account_no reporting_year adaptation_action action_description timescale magnitude risk_description current_population population_year
0 Africa Abuja public Nigeria Higher water prices False Abuja Federal Capital Territory (9.07647, 7.39857) 36043 2018 {Water metering} {nan} {Short-term} NaN {nan} {2440000.0} {2017}
1 Africa Abuja public Nigeria Inadequate or aging infrastructure False Abuja Federal Capital Territory (9.07647, 7.39857) 36043 2018 {Investment in existing water supply infrastru... {nan} {nan} NaN {nan} {nan} {nan}
2 Africa Abuja public Nigeria Increased water stress or scarcity False Abuja Federal Capital Territory (9.07647, 7.39857) 36043 2018 {Efficiency regulations or standards} {nan} {Short-term} NaN {nan} {2440000.0} {2017}
3 Africa Bouaké public Côte d'Ivoire Declining water quality False Commune de Bouaké (7.4137, -5.0149) 831617 2018 {Diversifying water supply (including new sour... {Approvisionnement en eau potable par la const... {Current} Extremely serious {La ville manque d'eau depuis plus de 5 mois} {715435.0} {2013}
4 Africa Cape Town public South Africa Increased water stress or scarcity True City of Cape Town (-33.9253, 18.4239) 35858 2018 {Water use restrictions, Water metering, Use o... {In terms of diversifying water supply (includ... {Current} Extremely serious {The City has generally been able to successfu... {4174510.0} {2017}

Extract longitude and latitude from city_location

In [22]:
cw_data['city_location'] = cw_data['city_location'].apply(eval)
cw_data['latitude'] = cw_data['city_location'].apply(lambda x: x[0])
cw_data['longitude'] = cw_data['city_location'].apply(lambda x: x[1])
id_columns = [x for x in id_columns] + ['latitude','longitude']
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
Out[22]:
(677, 677)

Find list columns that are singular (every element contains a length 1 list)

In [23]:
cw_value_columns = [ x for x in cw_data.columns if x not in id_columns]
# Values that can be converted back to strings from sets

singular_cols = cw_data.set_index(id_columns)[list_columns].applymap(lambda x: len(x)==1).product() > 0
singular_cols
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
Out[23]:
(677, 677)

Convert singular columns to non list ones

In [24]:
# magnitude is nicely singular per id
cw_data.loc[:, singular_cols[singular_cols].index] = cw_data.loc[:, singular_cols[singular_cols].index].applymap(lambda x: list(x)[0])
list_columns = [x for x in list_columns if x not in singular_cols[singular_cols].index]

Remove entries that are invalid or in different language than English from the list columns

also convert sets to lists

In [25]:
def try_lang_detect_except(x):
    try:
        return (langdetect(x)=='en')
    except:
        return False
cw_data = cw_data.applymap(lambda x: [t for t in x if not pd.isnull(t)  and  (t!='') and (t!='ERROR: #NAME?') and try_lang_detect_except(t)] if isinstance(x,set) else x)
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
Out[25]:
(677, 677)

Fix specific risks annotations by inspection

In [26]:
cw_data.risks_to_citys_water_supply.unique()
Out[26]:
array(['Higher water prices', 'Inadequate or aging infrastructure',
       'Increased water stress or scarcity', 'Declining water quality',
       'nan', 'Increased water demand', 'Regulatory',
       'Energy supply issues', 'Inadequate or ageing infrastructure',
       'Declining water quality: Declining water quality'], dtype=object)
In [27]:
cw_data.risks_to_citys_water_supply = cw_data.risks_to_citys_water_supply.apply(lambda x: x.replace('Inadequate or ageing infrastructure','Inadequate or aging infrastructure'))
cw_data['risks_to_citys_water_supply'] = cw_data['risks_to_citys_water_supply'].apply(lambda x: x.replace('Declining water quality: Declining water quality', 'Declining water quality'))
cw_data.risks_to_citys_water_supply.unique()
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
Out[27]:
(677, 598)
In [28]:
cw_data.drop_duplicates(subset=id_columns,inplace=True)
In [29]:
cw_data.head()
Out[29]:
region city access country risks_to_citys_water_supply c40 organisation city_location account_no reporting_year adaptation_action action_description timescale magnitude risk_description current_population population_year latitude longitude
0 Africa Abuja public Nigeria Higher water prices False Abuja Federal Capital Territory (9.07647, 7.39857) 36043 2018 [] [] [Short-term] NaN [] 2440000.0 2017.0 9.07647 7.39857
1 Africa Abuja public Nigeria Inadequate or aging infrastructure False Abuja Federal Capital Territory (9.07647, 7.39857) 36043 2018 [Investment in existing water supply infrastru... [] [] NaN [] NaN NaN 9.07647 7.39857
2 Africa Abuja public Nigeria Increased water stress or scarcity False Abuja Federal Capital Territory (9.07647, 7.39857) 36043 2018 [Efficiency regulations or standards] [] [Short-term] NaN [] 2440000.0 2017.0 9.07647 7.39857
3 Africa Bouaké public Côte d'Ivoire Declining water quality False Commune de Bouaké (7.4137, -5.0149) 831617 2018 [Diversifying water supply (including new sour... [] [] Extremely serious [] 715435.0 2013.0 7.41370 -5.01490
4 Africa Cape Town public South Africa Increased water stress or scarcity True City of Cape Town (-33.9253, 18.4239) 35858 2018 [Water use restrictions, Use of non-potable wa... [In terms of diversifying water supply (includ... [] Extremely serious [The City has generally been able to successfu... 4174510.0 2017.0 -33.92530 18.42390
In [30]:
to_drop = ['timescale','current_population','population_year']
cw_data['population_year'] = cw_data['population_year'].fillna(0)
t = cw_data.sort_values('population_year' ).groupby([x for x in cw_data.columns if (x !='population_year') and x not in list_columns],dropna=False).last().reset_index()
In [31]:
cw_data
Out[31]:
region city access country risks_to_citys_water_supply c40 organisation city_location account_no reporting_year adaptation_action action_description timescale magnitude risk_description current_population population_year latitude longitude
0 Africa Abuja public Nigeria Higher water prices False Abuja Federal Capital Territory (9.07647, 7.39857) 36043 2018 [] [] [Short-term] NaN [] 2440000.0 2017.0 9.07647 7.39857
1 Africa Abuja public Nigeria Inadequate or aging infrastructure False Abuja Federal Capital Territory (9.07647, 7.39857) 36043 2018 [Investment in existing water supply infrastru... [] [] NaN [] NaN 0.0 9.07647 7.39857
2 Africa Abuja public Nigeria Increased water stress or scarcity False Abuja Federal Capital Territory (9.07647, 7.39857) 36043 2018 [Efficiency regulations or standards] [] [Short-term] NaN [] 2440000.0 2017.0 9.07647 7.39857
3 Africa Bouaké public Côte d'Ivoire Declining water quality False Commune de Bouaké (7.4137, -5.0149) 831617 2018 [Diversifying water supply (including new sour... [] [] Extremely serious [] 715435.0 2013.0 7.41370 -5.01490
4 Africa Cape Town public South Africa Increased water stress or scarcity True City of Cape Town (-33.9253, 18.4239) 35858 2018 [Water use restrictions, Use of non-potable wa... [In terms of diversifying water supply (includ... [] Extremely serious [The City has generally been able to successfu... 4174510.0 2017.0 -33.92530 18.42390
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
669 Southeast Asia and Oceania City of Sydney public Australia Regulatory True City of Sydney (-33.8675, 151.207) 31114 2018 [] [] [] Serious [Regulatory, pricing and institutional arrange... 233219.0 2017.0 -33.86750 151.20700
670 Southeast Asia and Oceania Ho Chi Minh public Viet Nam Declining water quality True Ho Chi Minh City (10.7626, 106.66) 31164 2018 [] [] [] Serious [] 8611000.0 2017.0 10.76260 106.66000
671 Southeast Asia and Oceania Ho Chi Minh public Viet Nam Higher water prices True Ho Chi Minh City (10.7626, 106.66) 31164 2018 [] [] [Short-term] Less Serious [] 8611000.0 2017.0 10.76260 106.66000
672 Southeast Asia and Oceania Ho Chi Minh public Viet Nam Inadequate or aging infrastructure True Ho Chi Minh City (10.7626, 106.66) 31164 2018 [] [] [Short-term] Less Serious [] 8611000.0 2017.0 10.76260 106.66000
673 Southeast Asia and Oceania Ho Chi Minh public Viet Nam Increased water demand True Ho Chi Minh City (10.7626, 106.66) 31164 2018 [] [] [] Less Serious [] 8611000.0 2017.0 10.76260 106.66000

598 rows × 19 columns

Drop metadata columns that will not be used

In [32]:
cw_data.drop(columns=['timescale','population_year'],inplace=True)
cw_data.rename(columns={'current_population':'population'},inplace=True)
In [33]:
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
Out[33]:
(598, 598)

Generate for each of the list columns a new one that contains the number of elements

Create value_columns that contains the names of all the list columns + the created ones

In [34]:
value_columns = set([x for x in cw_value_columns if x not in to_drop] + ['risks_to_citys_water_supply'])
list_columns = cw_data.iloc[0].apply(lambda x: isinstance(x, list))
list_columns = list_columns[list_columns].index
len_columns = set()
for x in value_columns.copy():
    if x in list_columns:
        cw_data[x+'_n'] = cw_data[x].apply(lambda x: len(x) if x else 1)
        len_columns.add(x+'_n')
value_columns = value_columns.union(len_columns)
value_columns
Out[34]:
{'action_description',
 'action_description_n',
 'adaptation_action',
 'adaptation_action_n',
 'magnitude',
 'risk_description',
 'risk_description_n',
 'risks_to_citys_water_supply'}
In [35]:
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
Out[35]:
(598, 598)

Filter by C40

We are making the assumption that, given that those cities participate in a specific program, they will also have better structured data. So this subset will be used as a reference for the imputation

In [36]:
c40_data = cw_data[cw_data['c40']].copy()
c40_data.shape
Out[36]:
(97, 20)

Find the null percentage of list columns

In [37]:
list_nulls = c40_data[list_columns].applymap(lambda x: len(x) == 0)
list_nulls.mean()
Out[37]:
adaptation_action     0.268041
action_description    0.381443
risk_description      0.402062
dtype: float64
In [38]:
c40_data['magnitude'].unique()
Out[38]:
array(['Extremely serious', nan, 'Serious', 'Less Serious'], dtype=object)

Create a report of the values of magnitude

In [39]:
c40_risks = c40_data[id_columns+['magnitude']].copy()
c40_risks['low'] = c40_risks['magnitude'] == 'Less Serious'
c40_risks['medium'] = c40_risks['magnitude'] == 'Serious'
c40_risks['high'] = c40_risks['magnitude'] == 'Extremely serious'
c40_risks['unknown'] = c40_risks['magnitude'].isna()
c40_risks = c40_risks.drop(columns=['magnitude']).groupby([x for x in id_columns if x!="risks_to_citys_water_supply"]).agg(sum).reset_index()
c40_risks
Out[39]:
region city access country c40 organisation city_location account_no reporting_year latitude longitude low medium high unknown
0 Africa Addis Ababa public Ethiopia True Addis Ababa City Administration (9.02887, 38.7544) 31146 2018 9.02887 38.75440 0 3 0 0
1 Africa Cape Town public South Africa True City of Cape Town (-33.9253, 18.4239) 35858 2018 -33.92530 18.42390 0 0 1 0
2 Africa Dar es Salaam public United Republic of Tanzania True City of Dar es Salaam (-6.79235, 39.2083) 35893 2018 -6.79235 39.20830 0 1 0 1
3 Africa Durban public South Africa True City of Durban (-29.8587, 31.0218) 35863 2018 -29.85870 31.02180 0 0 0 1
4 Africa Johannesburg public South Africa True City of Johannesburg (-26.2041, 28.0473) 31115 2018 -26.20410 28.04730 0 0 0 1
5 Africa Lagos public Nigeria True City of Lagos (6.52437, 3.3792) 31167 2018 6.52437 3.37920 0 4 2 1
6 Africa Nairobi public Kenya True City of Nairobi (-1.29206, 36.8219) 35913 2018 -1.29206 36.82190 0 2 0 0
7 East Asia Changwon public Republic of Korea True Changwon City (35.1542, 126.949) 31186 2018 35.15420 126.94900 0 1 0 0
8 East Asia Dalian public China True Dalian Municipal People's Government (38.9389, 121.567) 54298 2018 38.93890 121.56700 0 0 0 2
9 East Asia Hong Kong public China, Hong Kong Special Administrative Region True Government of Hong Kong Special Administrative... (22.2881, 114.14) 31169 2018 22.28810 114.14000 0 1 0 1
10 East Asia Tokyo public Japan True Tokyo Metropolitan Government (35.41, 139.41) 31111 2018 35.41000 139.41000 0 3 0 1
11 East Asia Yokohama public Japan True City of Yokohama (35.4437, 139.638) 31113 2018 35.44370 139.63800 0 1 0 0
12 Europe Barcelona public Spain True Ajuntament de Barcelona (41.3823, 2.1775) 1499 2018 41.38230 2.17750 0 2 1 1
13 Europe London public United Kingdom of Great Britain and Northern I... True Greater London Authority (51.5048, -0.07868) 3422 2018 51.50480 -0.07868 1 2 0 0
14 Europe Oslo public Norway True City of Oslo (59.9139, 10.7522) 14088 2018 59.91390 10.75220 1 0 1 1
15 Europe Paris public France True City of Paris (48.8566, 2.35222) 31175 2018 48.85660 2.35222 0 0 0 2
16 Europe Roma public Italy True Roma Capitale (41.53, 12.2858) 31110 2018 41.53000 12.28580 0 0 1 1
17 Europe Stockholm public Sweden True City of Stockholm (59.3293, 18.0686) 3429 2018 59.32930 18.06860 1 1 0 0
18 Europe Warsaw public Poland True City of Warsaw (52.2297, 21.0122) 31185 2018 52.22970 21.01220 0 1 0 0
19 Latin America Curitiba public Brazil True Municipality of Curitiba (-25.4311, -49.2647) 31156 2018 -25.43110 -49.26470 1 0 0 0
20 Latin America Guadalajara public Mexico True Region Metropolitana de Guadalajara (20.6597, -103.35) 35867 2018 20.65970 -103.35000 0 0 0 1
21 Latin America Medellín public Colombia True Municipality of Medellín (6.22729, -75.5735) 35873 2018 6.22729 -75.57350 0 2 0 0
22 Latin America Mexico City public Mexico True Mexico City (19.4326, -99.1332) 31172 2018 19.43260 -99.13320 0 0 1 1
23 Latin America Quito public Ecuador True Distrito Metropolitano de Quito (-0.2333, -78.5167) 42178 2018 -0.23330 -78.51670 0 1 0 0
24 Latin America Rio de Janeiro public Brazil True Prefeitura do Rio de Janeiro (-22.9054, -43.5614) 31176 2018 -22.90540 -43.56140 0 2 0 1
25 Latin America Santiago public Chile True Región Metropolitana de Santiago (-33.4489, -70.6693) 31180 2018 -33.44890 -70.66930 0 3 0 0
26 Latin America São Paulo public Brazil True Prefeitura de São Paulo (-23.5595, -46.7382) 31184 2018 -23.55950 -46.73820 0 1 0 0
27 Middle East Amman public Jordan True Greater Amman Municipality (31.9454, 35.9284) 54337 2018 31.94540 35.92840 0 0 2 0
28 North America Austin public United States of America True City of Austin (30.2672, -97.7431) 1184 2018 30.26720 -97.74310 0 0 1 0
29 North America Portland, OR public United States of America True City of Portland, OR (45.52, -122.682) 14874 2018 45.52000 -122.68200 0 0 0 4
30 North America San Francisco public United States of America True City of San Francisco (37.7749, -122.419) 31182 2018 37.77490 -122.41900 0 1 0 0
31 North America Seattle public United States of America True City of Seattle (47.6062, -122.332) 16581 2018 47.60620 -122.33200 0 0 0 1
32 North America Vancouver public Canada True City of Vancouver (49.2612, -123.114) 20113 2018 49.26120 -123.11400 1 0 0 0
33 North America Washington, DC public United States of America True District of Columbia (38.9072, -77.0369) 31090 2018 38.90720 -77.03690 0 2 0 1
34 South and West Asia Dhaka public Bangladesh True Dhaka City (23.8103, 90.4125) 50782 2018 23.81030 90.41250 0 0 0 4
35 South and West Asia Karachi public Pakistan True City District Government Karachi (24.9108, 67.1281) 31168 2018 24.91080 67.12810 0 0 1 0
36 South and West Asia Kolkata public India True Kolkata Metropolitan Area (22.5726, 88.3639) 35904 2018 22.57260 88.36390 0 0 0 1
37 Southeast Asia and Oceania Auckland public New Zealand True Auckland Council (-36.8153, 174.742) 43932 2018 -36.81530 174.74200 0 2 0 1
38 Southeast Asia and Oceania Bangkok public Thailand True Bangkok Metropolitan Administration (13.7712, 100.556) 31150 2018 13.77120 100.55600 1 0 0 0
39 Southeast Asia and Oceania City of Melbourne public Australia True City of Melbourne (-37.8141, 144.963) 31109 2018 -37.81410 144.96300 0 2 0 0
40 Southeast Asia and Oceania City of Sydney public Australia True City of Sydney (-33.8675, 151.207) 31114 2018 -33.86750 151.20700 0 4 0 1
41 Southeast Asia and Oceania Ho Chi Minh public Viet Nam True Ho Chi Minh City (10.7626, 106.66) 31164 2018 10.76260 106.66000 3 1 0 0
42 Southeast Asia and Oceania Jakarta public Indonesia True Jakarta City Government (-6.1751, 106.865) 31166 2018 -6.17510 106.86500 0 0 0 5
In [40]:
from sklearn.preprocessing import LabelEncoder
In [41]:
from utils.nlp import SimilarityAnalysis, create_sim_vector
In [42]:
c40_data.risks_to_citys_water_supply = c40_data.risks_to_citys_water_supply.apply(lambda x: np.nan if x=='nan' else x)
In [43]:
c40_data[c40_data.risks_to_citys_water_supply.isnull()]
Out[43]:
region city access country risks_to_citys_water_supply c40 organisation city_location account_no reporting_year adaptation_action action_description magnitude risk_description population latitude longitude adaptation_action_n action_description_n risk_description_n
21 Africa Johannesburg public South Africa NaN True City of Johannesburg (-26.2041, 28.0473) 31115 2018 [Water use restrictions] [A by-law that effects water restrictions is i... NaN [] NaN -26.2041 28.0473 1 1 1
71 East Asia Tokyo public Japan NaN True Tokyo Metropolitan Government (35.41, 139.41) 31111 2018 [] [Implementing measures against flood] Serious [] 13857664.0 35.4100 139.4100 1 1 1
108 Europe Oslo public Norway NaN True City of Oslo (59.9139, 10.7522) 14088 2018 [Stormwater management (natural or man-made in... [Small scale project to implement water meeter... NaN [] NaN 59.9139 10.7522 3 4 1
404 North America Portland, OR public United States of America NaN True City of Portland, OR (45.52, -122.682) 14874 2018 [Watershed preservation] [Portland seeks to manage the watershed in way... NaN [Portland has a secondary, redundant, groundwa... 639863.0 45.5200 -122.6820 1 1 1
454 South and West Asia Dhaka public Bangladesh NaN True Dhaka City (23.8103, 90.4125) 50782 2018 [Stormwater management (natural or man-made in... [Embankment is constructed around Dhaka City t... NaN [Increases water born diseases] 4500000.0 23.8103 90.4125 1 1 1
456 South and West Asia Kolkata public India NaN True Kolkata Metropolitan Area (22.5726, 88.3639) 35904 2018 [Efficiency regulations or standards, Stormwat... [Channelisation of drains Construction of new ... NaN [53% of the boroughs within Kolkata Municipal ... 14030000.0 22.5726 88.3639 2 2 2
476 Southeast Asia and Oceania Jakarta public Indonesia NaN True Jakarta City Government (-6.1751, 106.865) 31166 2018 [Other: Stormwater management (natural or man ... [- Restore damaged and lost water ecosystems (... NaN [Increased frequency of rain affecting the are... 10075310.0 -6.1751 106.8650 1 1 1
In [44]:
c40_data.risks_to_citys_water_supply.unique()
Out[44]:
array(['Increased water stress or scarcity',
       'Inadequate or aging infrastructure', nan,
       'Declining water quality', 'Energy supply issues',
       'Higher water prices', 'Increased water demand', 'Regulatory'],
      dtype=object)

Fit Similarity Analysis

per string list column of the value columns, by concatenating strings together. Vectorize all the string list columns based on their in between similarities

In [45]:
analyses = {x:SimilarityAnalysis() for x in value_columns if x in list_columns}
In [46]:
vectorized = np.hstack([analyses[x].fit_transform(c40_data[x]) for x in analyses])

Encode magnitude and risks into numerical labels

In [47]:
cols_to_encode =  [x for x in value_columns if x not in list_columns and x not in len_columns]
c40_data[cols_to_encode] = c40_data[cols_to_encode].fillna('nan')
encoders = {x:LabelEncoder() for x in cols_to_encode}
In [48]:
encoded = np.array([encoders[x].fit_transform(c40_data[x]) for x in cols_to_encode]).T.astype(float)
for t in range(len(cols_to_encode)):
    encoded[encoded[:,t] == np.where(encoders[cols_to_encode[t]].classes_=='nan')[0],t] = np.nan
In [49]:
lab_freqs = c40_data[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
lab_freqs
Out[49]:
risks_to_citys_water_supply magnitude size
0 Declining water quality Less Serious 2
1 Declining water quality Serious 9
2 Declining water quality nan 3
3 Energy supply issues Extremely serious 1
4 Energy supply issues Serious 1
5 Higher water prices Extremely serious 1
6 Higher water prices Less Serious 1
7 Higher water prices Serious 1
8 Higher water prices nan 3
9 Inadequate or aging infrastructure Less Serious 1
10 Inadequate or aging infrastructure Serious 1
11 Inadequate or aging infrastructure nan 15
12 Increased water demand Extremely serious 1
13 Increased water demand Less Serious 1
14 Increased water demand Serious 6
15 Increased water stress or scarcity Extremely serious 8
16 Increased water stress or scarcity Less Serious 3
17 Increased water stress or scarcity Serious 21
18 Increased water stress or scarcity nan 6
19 Regulatory Less Serious 1
20 Regulatory Serious 3
21 Regulatory nan 1
22 nan Serious 1
23 nan nan 6

Create dataset that contains the encoded labels, the vectorized columns and the columns referring the lists lengths

In [50]:
whole = np.hstack([encoded, vectorized, c40_data[len_columns]])
In [51]:
import sys
sys.path.append('..')
from labeled_preprocessing.imputation import LabeledDatasetImputer

Fit LabeledDatasetImputer and transform dataset

(LabeledDatasetImputer is a union of two Iterative Imputers , one for labels and one for features)

In [52]:
imputer = LabeledDatasetImputer(verbose=2,k_features_per_label=0, seed=seed)
continuous_imputed, y = imputer.fit_transform(whole[:,2:], whole[:,:2], ret_imputed_x=True)
Applying feature selection..
Creating imputed X using KNeighborsRegressor..
[IterativeImputer] Completing matrix with shape (97, 192)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.45
[IterativeImputer] Change: 16.78842081477814, scaled tolerance: 0.011 
[IterativeImputer] Ending imputation round 2/10, elapsed time 0.88
[IterativeImputer] Change: 7.823153835169504, scaled tolerance: 0.011 
[IterativeImputer] Ending imputation round 3/10, elapsed time 1.57
[IterativeImputer] Change: 3.379485724678483, scaled tolerance: 0.011 
[IterativeImputer] Ending imputation round 4/10, elapsed time 2.09
[IterativeImputer] Change: 0.021456167679847482, scaled tolerance: 0.011 
[IterativeImputer] Ending imputation round 5/10, elapsed time 2.51
[IterativeImputer] Change: 3.3306690738754696e-16, scaled tolerance: 0.011 
[IterativeImputer] Early stopping criterion reached.
Creating imputed Y using Pipeline..
[IterativeImputer] Completing matrix with shape (97, 194)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.13
[IterativeImputer] Change: 5.0, scaled tolerance: 0.011 
[IterativeImputer] Ending imputation round 2/10, elapsed time 0.23
[IterativeImputer] Change: 0.0, scaled tolerance: 0.011 
[IterativeImputer] Early stopping criterion reached.

Use the prediction power on the imputed data as measurement of coherent imputation.

In [53]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
simple_model = make_pipeline(StandardScaler(), SVC(kernel='linear', C=0.01, random_state=seed))
scoring = 'accuracy' 
scores = cross_val_score(simple_model, continuous_imputed, y[:,cols_to_encode.index('magnitude')], cv=5,scoring=scoring)
np.mean(scores)
Out[53]:
0.773684210526316

Plot features correlation

In [54]:
import matplotlib.pyplot as plt
flg = np.all(~np.isnan(whole),axis=1), np.hstack([[True, True], imputer.selection_mask[:]])
corr = np.corrcoef(whole[flg[0],:][:,flg[1]].T)
print(corr.shape)
plt.matshow(corr)
(194, 194)
Out[54]:
<matplotlib.image.AxesImage at 0x7f9b93e80190>

Retrieve the categorical imputed labels

In [55]:
imputed = np.array([
    encoders[x].inverse_transform(
    y[:, c].astype(int)
 ) for c,x in enumerate(cols_to_encode)]).T
In [56]:
c40_data_imputed = c40_data.copy()
In [57]:
c40_data_imputed[cols_to_encode] = imputed
In [58]:
c40_data_imputed.head()
Out[58]:
region city access country risks_to_citys_water_supply c40 organisation city_location account_no reporting_year adaptation_action action_description magnitude risk_description population latitude longitude adaptation_action_n action_description_n risk_description_n
4 Africa Cape Town public South Africa Increased water stress or scarcity True City of Cape Town (-33.9253, 18.4239) 35858 2018 [Water use restrictions, Use of non-potable wa... [In terms of diversifying water supply (includ... Extremely serious [The City has generally been able to successfu... 4174510.0 -33.92530 18.4239 9 10 1
7 Africa Dar es Salaam public United Republic of Tanzania Inadequate or aging infrastructure True City of Dar es Salaam (-6.79235, 39.2083) 35893 2018 [Investment in existing water supply infrastru... [City provide some % of the total money correc... Serious [] NaN -6.79235 39.2083 1 1 1
8 Africa Durban public South Africa Inadequate or aging infrastructure True City of Durban (-29.8587, 31.0218) 35863 2018 [Other, Investment in existing water supply in... [improvements to infrastructure are being done... Serious [] NaN -29.85870 31.0218 3 3 1
21 Africa Johannesburg public South Africa Increased water stress or scarcity True City of Johannesburg (-26.2041, 28.0473) 31115 2018 [Water use restrictions] [A by-law that effects water restrictions is i... Serious [] NaN -26.20410 28.0473 1 1 1
24 Africa Lagos public Nigeria Declining water quality True City of Lagos (6.52437, 3.3792) 31167 2018 [Water use restrictions] [Application of water abstraction rules, surch... Serious [Increased rainfall will result to greater amo... 24821418.0 6.52437 3.3792 1 1 1

Find which labels have been updated after imputation

In [59]:
imp_lab_freqs = c40_data_imputed[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
imp_ret = pd.merge(lab_freqs, imp_lab_freqs,suffixes=('','_imp'), how='right',on=cols_to_encode)
imp_ret['increase'] = (imp_ret['size_imp'] - imp_ret['size'])/imp_ret['size']
imp_ret
Out[59]:
risks_to_citys_water_supply magnitude size size_imp increase
0 Declining water quality Extremely serious NaN 1 NaN
1 Declining water quality Less Serious 2.0 2 0.000000
2 Declining water quality Serious 9.0 12 0.333333
3 Energy supply issues Extremely serious 1.0 1 0.000000
4 Energy supply issues Serious 1.0 1 0.000000
5 Higher water prices Extremely serious 1.0 1 0.000000
6 Higher water prices Less Serious 1.0 1 0.000000
7 Higher water prices Serious 1.0 4 3.000000
8 Inadequate or aging infrastructure Less Serious 1.0 1 0.000000
9 Inadequate or aging infrastructure Serious 1.0 17 16.000000
10 Increased water demand Extremely serious 1.0 1 0.000000
11 Increased water demand Less Serious 1.0 1 0.000000
12 Increased water demand Serious 6.0 6 0.000000
13 Increased water stress or scarcity Extremely serious 8.0 9 0.125000
14 Increased water stress or scarcity Less Serious 3.0 3 0.000000
15 Increased water stress or scarcity Serious 21.0 31 0.476190
16 Regulatory Less Serious 1.0 1 0.000000
17 Regulatory Serious 3.0 4 0.333333

Make sure that all risks have been encoded and are imputation outcome candidates

In [60]:
set([x for x in cw_data['risks_to_citys_water_supply'] if x not in encoders['risks_to_citys_water_supply'].classes_])
Out[60]:
set()

Impute whole dataset

In [61]:
cw_data['risks_to_citys_water_supply'].value_counts()
Out[61]:
Increased water stress or scarcity    205
Declining water quality               127
Inadequate or aging infrastructure    101
nan                                    54
Increased water demand                 49
Higher water prices                    38
Regulatory                             17
Energy supply issues                    7
Name: risks_to_citys_water_supply, dtype: int64

Apply encoding and vectorization

based on fitted encoders and SimilarityAnalysis objects

In [62]:
cw_data[cols_to_encode] = cw_data[cols_to_encode].fillna('nan')
encoded = np.array([encoders[x].transform(cw_data[x]) for x in cols_to_encode]).T.astype(float)
for t in range(len(cols_to_encode)):
    encoded[encoded[:,t] == np.where(encoders[cols_to_encode[t]].classes_=='nan')[0],t] = np.nan
In [63]:
all_vectorized = np.hstack([analyses[x].transform(cw_data[x]) for x in value_columns if x in list_columns])

Apply imputation

using the trained imputer

In [64]:
all_imputed_x, all_imputed_y  = imputer.transform(np.hstack([all_vectorized,cw_data[len_columns]]), encoded, ret_imputed_x=True)
[IterativeImputer] Completing matrix with shape (598, 192)
[IterativeImputer] Ending imputation round 1/5, elapsed time 0.48
[IterativeImputer] Ending imputation round 2/5, elapsed time 0.99
[IterativeImputer] Ending imputation round 3/5, elapsed time 1.49
[IterativeImputer] Ending imputation round 4/5, elapsed time 2.11
[IterativeImputer] Ending imputation round 5/5, elapsed time 2.72
[IterativeImputer] Completing matrix with shape (598, 194)
[IterativeImputer] Ending imputation round 1/2, elapsed time 0.02
[IterativeImputer] Ending imputation round 2/2, elapsed time 0.03
In [65]:
all_imputed_y_dec = np.array([
    encoders[x].inverse_transform(
    all_imputed_y[:, c].astype(int)
 ) for c,x in enumerate(cols_to_encode)]).T
In [66]:
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
Out[66]:
(598, 598)
In [67]:
cw_data_imputed = cw_data.copy()
cw_data_imputed[cols_to_encode] = all_imputed_y_dec
cw_data_imputed.drop_duplicates(id_columns,inplace=True)

Plot features correlation

In [68]:
import matplotlib.pyplot as plt
corr = np.corrcoef(np.hstack([all_imputed_y, all_imputed_x]).T)
print(corr.shape)
plt.matshow(corr)
(194, 194)
Out[68]:
<matplotlib.image.AxesImage at 0x7f9b92426c70>
In [69]:
scores = cross_val_score(simple_model, all_imputed_x, all_imputed_y[:,cols_to_encode.index('magnitude')], cv=5,scoring=scoring)
np.mean(scores)
Out[69]:
0.7407983193277311

The score did not have significant drop, so we can assume that the imputation was cohesive across data

Create output dataset

In [205]:
final_labeled_data = cw_data_imputed[['city','latitude','longitude','country','population','c40','magnitude','risks_to_citys_water_supply']].copy()
final_labeled_data.rename(columns={'magnitude': 'risk','risks_to_citys_water_supply': 'description'},inplace=True)
In [206]:
final_labeled_data[['city','latitude','longitude','c40', 'population','description','risk']]
Out[206]:
city latitude longitude c40 population description risk
0 Abuja 9.07647 7.39857 False 2440000.0 Higher water prices Serious
1 Abuja 9.07647 7.39857 False NaN Inadequate or aging infrastructure Serious
2 Abuja 9.07647 7.39857 False 2440000.0 Increased water stress or scarcity Serious
3 Bouaké 7.41370 -5.01490 False 715435.0 Declining water quality Extremely serious
4 Cape Town -33.92530 18.42390 True 4174510.0 Increased water stress or scarcity Extremely serious
... ... ... ... ... ... ... ...
669 City of Sydney -33.86750 151.20700 True 233219.0 Regulatory Serious
670 Ho Chi Minh 10.76260 106.66000 True 8611000.0 Declining water quality Serious
671 Ho Chi Minh 10.76260 106.66000 True 8611000.0 Higher water prices Less Serious
672 Ho Chi Minh 10.76260 106.66000 True 8611000.0 Inadequate or aging infrastructure Less Serious
673 Ho Chi Minh 10.76260 106.66000 True 8611000.0 Increased water demand Less Serious

575 rows × 7 columns

In [207]:
cw_data_imputed[id_columns].drop_duplicates().shape
Out[207]:
(575, 12)

Pivoting risks description, so that each row is unique per city

In [208]:
risks = final_labeled_data.description.unique()
risks_description = {risk: f'risk{c}' for c,risk in enumerate(risks)}
risks_df = pd.Series(risks_description).to_frame()
risks_df.reset_index(inplace=True)
risks_df.columns=['description','code']
final_labeled_data['description'] = final_labeled_data['description'].apply(lambda x: risks_description[x])
risks_df
Out[208]:
description code
0 Higher water prices risk0
1 Inadequate or aging infrastructure risk1
2 Increased water stress or scarcity risk2
3 Declining water quality risk3
4 Increased water demand risk4
5 Regulatory risk5
6 Energy supply issues risk6
In [209]:
main_index=['city','latitude','longitude', 'country']

Check if there are cities that are listed to be both in and not in C40

and drop the ones that are not

In [210]:
c40_check = final_labeled_data.groupby(main_index)['c40'].nunique()
c40_check[c40_check>1]
Out[210]:
city      latitude  longitude  country
Santiago  -33.4489  -70.6693   Chile      2
Name: c40, dtype: int64
In [211]:
final_labeled_data.city.nunique()
Out[211]:
282
In [212]:
final_labeled_data[final_labeled_data.city == 'Santiago']
Out[212]:
city latitude longitude country population c40 risk description
563 Santiago -33.4489 -70.6693 Chile 404495.0 False Extremely serious risk3
564 Santiago -33.4489 -70.6693 Chile 7112808.0 True Serious risk3
565 Santiago -33.4489 -70.6693 Chile 404495.0 False Extremely serious risk4
566 Santiago -33.4489 -70.6693 Chile 404495.0 False Extremely serious risk2
567 Santiago -33.4489 -70.6693 Chile 7112808.0 True Serious risk2
568 Santiago -33.4489 -70.6693 Chile 7112808.0 True Serious risk5
In [221]:
final_labeled_data = final_labeled_data.sort_values(by='c40').groupby(main_index + ['description']).last().reset_index()
final_labeled_data = final_labeled_data.drop(columns=['c40']).merge(final_labeled_data.groupby(main_index)['c40'].max().reset_index(),on=main_index)
final_labeled_data[final_labeled_data.city=='Santiago']
Out[221]:
city latitude longitude country description population risk c40
473 Santiago -33.4489 -70.6693 Chile risk2 7112808.0 Serious True
474 Santiago -33.4489 -70.6693 Chile risk3 7112808.0 Serious True
475 Santiago -33.4489 -70.6693 Chile risk4 404495.0 Extremely serious True
476 Santiago -33.4489 -70.6693 Chile risk5 7112808.0 Serious True
In [222]:
final_labeled_data = final_labeled_data.sort_values(by='c40').groupby(main_index + ['description']).last().reset_index()

Fix population

In [223]:
pop_df = final_labeled_data.groupby(index)[['population']
                                          ].max().reset_index().drop_duplicates()
final_labeled_data = final_labeled_data.drop(columns=['population']).merge(pop_df.reset_index(), on=index)
In [224]:
final_labeled_data = final_labeled_data.pivot(index= index + ['c40','population'], columns='description', values='risk').reset_index()

Encode risks from 0 to 1

In [225]:
severity_mapping = {'Less Serious':1, "Serious":2, 'Extremely serious':3}
pd.DataFrame([severity_mapping]).to_csv(SEVERITY_MAPPING_PATH,index=False)
nrisks = list(risks_description.values())
final_labeled_data[nrisks] = final_labeled_data[nrisks].replace(severity_mapping)

Add 0 to risks belonging to C40, assuming that those have reported no risks because indeed they were none, not because they were unknown

In [226]:
final_labeled_data.loc[final_labeled_data['c40'], nrisks] = final_labeled_data.loc[final_labeled_data['c40'], nrisks].fillna(0)
In [227]:
final_labeled_data.to_csv(LABELED_CITIES_PATH,index=False)
risks_df.to_csv(RISKS_MAPPING_PATH,index=False)
In [228]:
final_labeled_data.head()
Out[228]:
description city latitude longitude country c40 population risk0 risk1 risk2 risk3 risk4 risk5 risk6
0 Abasan Al-Kabira 31.32310 34.34400 State of Palestine False 32000.0 NaN 2.0 2.0 2.0 NaN NaN NaN
1 Abington 40.12410 -75.11950 United States of America False 55310.0 NaN 1.0 1.0 2.0 NaN NaN NaN
2 Abuja 9.07647 7.39857 Nigeria False 2440000.0 2.0 2.0 2.0 NaN NaN NaN NaN
3 Addis Ababa 9.02887 38.75440 Ethiopia True 3103374.0 0.0 2.0 2.0 0.0 2.0 0.0 0.0
4 Adelaide -34.92850 138.60100 Australia False 23916.0 2.0 2.0 2.0 NaN NaN 2.0 NaN

See a report of the filled labels

In [229]:
cw_lab_freqs = cw_data[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
imp_cw_lab_freqs = cw_data_imputed[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
imp_cw_ret = pd.merge(cw_lab_freqs, imp_cw_lab_freqs,suffixes=('','_imp'), how='right',on=cols_to_encode)
imp_cw_ret['increase'] = (imp_cw_ret['size_imp'] - imp_cw_ret['size'])/imp_cw_ret['size']
imp_cw_ret
Out[229]:
risks_to_citys_water_supply magnitude size size_imp increase
0 Declining water quality Extremely serious 23 24 0.043478
1 Declining water quality Less Serious 18 18 0.000000
2 Declining water quality Serious 57 89 0.561404
3 Energy supply issues Extremely serious 1 1 0.000000
4 Energy supply issues Less Serious 1 1 0.000000
5 Energy supply issues Serious 5 5 0.000000
6 Higher water prices Extremely serious 2 2 0.000000
7 Higher water prices Less Serious 10 11 0.100000
8 Higher water prices Serious 14 25 0.785714
9 Inadequate or aging infrastructure Extremely serious 3 4 0.333333
10 Inadequate or aging infrastructure Less Serious 8 8 0.000000
11 Inadequate or aging infrastructure Serious 11 98 7.909091
12 Increased water demand Extremely serious 7 7 0.000000
13 Increased water demand Less Serious 12 12 0.000000
14 Increased water demand Serious 30 32 0.066667
15 Increased water stress or scarcity Extremely serious 39 42 0.076923
16 Increased water stress or scarcity Less Serious 21 20 -0.047619
17 Increased water stress or scarcity Serious 94 159 0.691489
18 Regulatory Less Serious 4 4 0.000000
19 Regulatory Serious 8 13 0.625000
In [230]:
imp_report = pd.merge(imp_cw_ret, imp_ret, suffixes=('','_c40'), on=('magnitude','risks_to_citys_water_supply'))
imp_report.to_csv(IMPUTATION_REPORT_PATH,index=False)
imp_report
Out[230]:
risks_to_citys_water_supply magnitude size size_imp increase size_c40 size_imp_c40 increase_c40
0 Declining water quality Extremely serious 23 24 0.043478 NaN 1 NaN
1 Declining water quality Less Serious 18 18 0.000000 2.0 2 0.000000
2 Declining water quality Serious 57 89 0.561404 9.0 12 0.333333
3 Energy supply issues Extremely serious 1 1 0.000000 1.0 1 0.000000
4 Energy supply issues Serious 5 5 0.000000 1.0 1 0.000000
5 Higher water prices Extremely serious 2 2 0.000000 1.0 1 0.000000
6 Higher water prices Less Serious 10 11 0.100000 1.0 1 0.000000
7 Higher water prices Serious 14 25 0.785714 1.0 4 3.000000
8 Inadequate or aging infrastructure Less Serious 8 8 0.000000 1.0 1 0.000000
9 Inadequate or aging infrastructure Serious 11 98 7.909091 1.0 17 16.000000
10 Increased water demand Extremely serious 7 7 0.000000 1.0 1 0.000000
11 Increased water demand Less Serious 12 12 0.000000 1.0 1 0.000000
12 Increased water demand Serious 30 32 0.066667 6.0 6 0.000000
13 Increased water stress or scarcity Extremely serious 39 42 0.076923 8.0 9 0.125000
14 Increased water stress or scarcity Less Serious 21 20 -0.047619 3.0 3 0.000000
15 Increased water stress or scarcity Serious 94 159 0.691489 21.0 31 0.476190
16 Regulatory Less Serious 4 4 0.000000 1.0 1 0.000000
17 Regulatory Serious 8 13 0.625000 3.0 4 0.333333
In [ ]: