Module water_security.utils.nlp
Expand source code
import numpy as np
import spacy
# Load the medium English spaCy model once at import time; the resulting
# pipeline is shared by the rest of this module through the global ``nlp``.
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Only that case should trigger a download; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit and unrelated failures.
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")
from sklearn.base import BaseEstimator, TransformerMixin
class SimilarityAnalysis(BaseEstimator, TransformerMixin):
    """
    Pairwise spaCy similarity between text entries and a fitted reference set.

    ``fit`` parses every entry of ``description`` with the module-level ``nlp``
    pipeline and keeps the non-empty results as references. ``transform``
    scores each entry of another input against those references.

    Every entry is passed through ``".".join(...)`` before parsing, so entries
    are presumably lists of strings -- TODO confirm against callers (a plain
    string would be joined character by character).
    """

    def __init__(self):
        # Despite the name, this holds the non-empty parsed documents kept by
        # fit / fit_transform; it stays None until the estimator is fitted.
        self.similarity_vectors = None

    @staticmethod
    def _parse(description):
        """Run the ``nlp`` pipeline on the "."-joined text of each entry."""
        return [nlp(".".join(entry)) for entry in description]

    def _similarity_matrix(self, docs):
        """Score ``docs`` against the stored references; falsy docs give NaN rows."""
        references = self.similarity_vectors
        rows = [
            [(doc.similarity(ref) if doc else np.nan) for ref in references]
            for doc in docs
        ]
        # The explicit reshape keeps a 2-D (n, s) result even when n or s is 0,
        # a case in which np.vstack raises instead of returning an empty matrix.
        return np.array(rows).reshape(len(docs), len(references))

    def fit(self, description, y=None):
        """
        Store the non-empty parsed entries of ``description`` as references.

        Parameters
        ----------
        description : iterable (e.g. pandas Series)
            Entries to parse; each one is joined with "." before parsing.
        y : ignored
            Accepted so the estimator can be used inside scikit-learn
            pipelines, which pass ``y`` positionally.

        Returns
        -------
        self
        """
        self.similarity_vectors = [doc for doc in self._parse(description) if doc]
        return self

    def transform(self, description):
        """
        Produce an n x s matrix of similarities to the fitted references.

        n is the number of entries in ``description`` and s the number of
        references stored by ``fit``. Rows of entries that parse to an empty
        document are filled with NaN.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit`` / ``fit_transform``.
        """
        if self.similarity_vectors is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "SimilarityAnalysis is not fitted yet; call fit or "
                "fit_transform before transform."
            )
        return self._similarity_matrix(self._parse(description))

    def fit_transform(self, description, y=None):
        """
        Fit on ``description`` and return its n x s similarity matrix.

        s is the number of non-empty entries, so the result is n x n only when
        no entry parses to an empty document. The input is parsed once and
        reused for both fitting and scoring.
        """
        docs = self._parse(description)
        self.similarity_vectors = [doc for doc in docs if doc]
        return self._similarity_matrix(docs)
def create_sim_vector(description):
    """Fit a fresh SimilarityAnalysis on ``description`` and return its matrix."""
    analyzer = SimilarityAnalysis()
    return analyzer.fit_transform(description)
Functions
def create_sim_vector(description)
-
Expand source code
def create_sim_vector(description):
    """Fit a fresh SimilarityAnalysis on ``description`` and return its matrix."""
    analyzer = SimilarityAnalysis()
    return analyzer.fit_transform(description)
Classes
class SimilarityAnalysis
-
Creates a similarity matrix for the provided pandas Series. Can be fitted to a specific dataset; the non-empty spaCy documents computed during fitting are then used as the reference against which another dataset is compared.
Expand source code
class SimilarityAnalysis(BaseEstimator, TransformerMixin):
    """
    Pairwise spaCy similarity between text entries and a fitted reference set.

    ``fit`` parses every entry of ``description`` with the module-level ``nlp``
    pipeline and keeps the non-empty results as references. ``transform``
    scores each entry of another input against those references.

    Every entry is passed through ``".".join(...)`` before parsing, so entries
    are presumably lists of strings -- TODO confirm against callers (a plain
    string would be joined character by character).
    """

    def __init__(self):
        # Despite the name, this holds the non-empty parsed documents kept by
        # fit / fit_transform; it stays None until the estimator is fitted.
        self.similarity_vectors = None

    @staticmethod
    def _parse(description):
        """Run the ``nlp`` pipeline on the "."-joined text of each entry."""
        return [nlp(".".join(entry)) for entry in description]

    def _similarity_matrix(self, docs):
        """Score ``docs`` against the stored references; falsy docs give NaN rows."""
        references = self.similarity_vectors
        rows = [
            [(doc.similarity(ref) if doc else np.nan) for ref in references]
            for doc in docs
        ]
        # The explicit reshape keeps a 2-D (n, s) result even when n or s is 0,
        # a case in which np.vstack raises instead of returning an empty matrix.
        return np.array(rows).reshape(len(docs), len(references))

    def fit(self, description, y=None):
        """
        Store the non-empty parsed entries of ``description`` as references.

        ``y`` is ignored; it is accepted so the estimator can be used inside
        scikit-learn pipelines, which pass ``y`` positionally. Returns ``self``.
        """
        self.similarity_vectors = [doc for doc in self._parse(description) if doc]
        return self

    def transform(self, description):
        """
        Produce an n x s matrix of similarities to the fitted references.

        Rows of entries that parse to an empty document are filled with NaN.
        Raises ``sklearn.exceptions.NotFittedError`` when called before
        ``fit`` / ``fit_transform``.
        """
        if self.similarity_vectors is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "SimilarityAnalysis is not fitted yet; call fit or "
                "fit_transform before transform."
            )
        return self._similarity_matrix(self._parse(description))

    def fit_transform(self, description, y=None):
        """
        Fit on ``description`` and return its n x s similarity matrix.

        s is the number of non-empty entries, so the result is n x n only when
        no entry parses to an empty document. The input is parsed once and
        reused for both fitting and scoring.
        """
        docs = self._parse(description)
        self.similarity_vectors = [doc for doc in docs if doc]
        return self._similarity_matrix(docs)
Ancestors
- sklearn.base.BaseEstimator
- sklearn.base.TransformerMixin
Methods
def fit(self, description)
-
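Stores the non-empty parsed entries (s of them) as the reference set and returns the estimator itself; no matrix is computed here.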
Expand source code
def fit(self, description, y=None):
    """
    Store the non-empty parsed entries of ``description`` as references.

    Each entry is joined with "." and run through the module-level ``nlp``
    pipeline; results that evaluate as falsy (empty) are dropped. No matrix
    is computed here.

    ``y`` is ignored; it is accepted so the estimator can be used inside
    scikit-learn pipelines, which pass ``y`` positionally. Returns ``self``.
    """
    parsed = [nlp(".".join(entry)) for entry in description]
    self.similarity_vectors = [doc for doc in parsed if doc]
    return self
def fit_transform(self, description)
-
Produces an n×s matrix, where s is the number of non-empty input entries (n×n when no entry is empty).
Expand source code
def fit_transform(self, description, y=None):
    """
    Fit on ``description`` and return its n x s similarity matrix.

    s is the number of non-empty entries, so the result is n x n only when
    no entry parses to an empty document; rows belonging to empty entries
    are filled with NaN. ``y`` is ignored and exists only for scikit-learn
    pipeline compatibility.
    """
    parsed = [nlp(".".join(entry)) for entry in description]
    references = [doc for doc in parsed if doc]
    self.similarity_vectors = references
    rows = [
        [(doc.similarity(ref) if doc else np.nan) for ref in references]
        for doc in parsed
    ]
    # The explicit reshape keeps a 2-D (n, s) result even when n or s is 0,
    # a case in which np.vstack raises instead of returning an empty matrix.
    return np.array(rows).reshape(len(parsed), len(references))
def transform(self, description)
-
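Produces an n×s matrix of similarities between each of the n input entries and the s reference documents stored during fitting.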
Expand source code
def transform(self, description):
    """
    Produce an n x s matrix of similarities to the fitted references.

    n is the number of entries in ``description`` and s the number of
    references held in ``self.similarity_vectors``. Rows of entries that
    parse to an empty document are filled with NaN.

    Raises ``sklearn.exceptions.NotFittedError`` when no references have
    been stored yet.
    """
    references = self.similarity_vectors
    if references is None:
        from sklearn.exceptions import NotFittedError

        raise NotFittedError(
            "SimilarityAnalysis is not fitted yet; call fit or "
            "fit_transform before transform."
        )
    parsed = [nlp(".".join(entry)) for entry in description]
    rows = [
        [(doc.similarity(ref) if doc else np.nan) for ref in references]
        for doc in parsed
    ]
    # The explicit reshape keeps a 2-D (n, s) result even when n or s is 0,
    # a case in which np.vstack raises instead of returning an empty matrix.
    return np.array(rows).reshape(len(parsed), len(references))