# Copyright 2023 Mario Graff Guerrero
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union, List, Set, Callable
from b4msa.textmodel import TextModel
from microtc.weighting import TFIDF
from microtc.utils import tweet_iterator
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator, clone
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import csr_matrix
import numpy as np
from EvoMSA.utils import load_url
from EvoMSA.utils import MODEL_LANG, TAILORED, MICROTC
from EvoMSA.utils import load_bow, load_emoji, dataset_information,\
load_dataset, load_keyword, b4msa_params, Linear
from EvoMSA.model_selection import KruskalFS
def config_regressor(instance):
    """Configure `instance` so that it behaves as a regressor.

    The estimator class is replaced by :py:class:`sklearn.svm.LinearSVR`
    (with no keyword arguments), the decision function becomes the
    estimator's ``predict`` method, and the k-fold procedure is set to
    :py:class:`sklearn.model_selection.KFold`.

    :param instance: Object whose attributes are overwritten in place.
    :return: The same `instance`, to allow chaining.
    """
    # Imported locally, as in the rest of this function's history, so the
    # regressor dependencies are only loaded when actually requested.
    from sklearn.model_selection import KFold
    from sklearn.svm import LinearSVR

    regressor_settings = (
        ('estimator_class', LinearSVR),
        ('estimator_kwargs', dict()),
        ('decision_function_name', 'predict'),
        ('kfold_class', KFold),
    )
    for attribute, setting in regressor_settings:
        setattr(instance, attribute, setting)
    return instance
[docs]
class BoW(BaseEstimator):
    """
    BoW is a bag-of-words text classifier. It is described in
    "A Simple Approach to Multilingual Polarity Classification in Twitter.
    Eric S. Tellez, Sabino Miranda-Jiménez, Mario Graff,
    Daniela Moctezuma, Ranyart R. Suárez, Oscar S. Siordia.
    Pattern Recognition Letters" and
    "An Automated Text Categorization Framework based
    on Hyperparameter Optimization. Eric S. Tellez, Daniela Moctezuma,
    Sabino Miranda-Jiménez, Mario Graff.
    Knowledge-Based Systems Volume 149, 1 June 2018."

    BoW uses, by default, a pre-trained bag-of-words representation. The
    representation was trained on 4,194,304 (:math:`2^{22}`) tweets
    randomly selected. The pre-trained representations are used
    when the parameters :attr:`lang` and :attr:`pretrain` are
    set; :attr:`pretrain` by default is set to True, and the default
    language is Spanish (es). The available languages are:
    Arabic (ar), Catalan (ca), German (de), English (en),
    Spanish (es), French (fr), Hindi (hi), Indonesian (in),
    Italian (it), Japanese (ja), Korean (ko), Dutch (nl),
    Polish (pl), Portuguese (pt), Russian (ru), Tagalog (tl),
    Turkish (tr), and Chinese (zh).

    :param lang: Language. (ar | ca | de | en | es | fr | hi | in | it | ja | ko | nl | pl | pt | ru | tl | tr | zh), default='es'.
    :type lang: str
    :param voc_size_exponent: Vocabulary size. default=17, i.e., :math:`2^{17}`
    :type voc_size_exponent: int
    :param voc_selection: Vocabulary (most_common_by_type | most_common). default=most_common_by_type
    :type voc_selection: str
    :param key: Key where the text is in the dictionary. (default='text')
    :type key: Union[str, List[str]]
    :param label_key: Key where the response is in the dictionary. (default='klass')
    :type label_key: str
    :param mixer_func: Function to combine the output in case of multiple texts
    :type mixer_func: Callable[[List], csr_matrix]
    :param decision_function_name: Name of the decision function (default='decision_function')
    :type decision_function_name: str
    :param estimator_class: Classifier or Regressor
    :type estimator_class: class
    :param estimator_kwargs: Keyword parameters for the estimator
    :type estimator_kwargs: dict
    :param pretrain: Whether to use a pre-trained representation. default=True.
    :type pretrain: bool
    :param b4msa_kwargs: :py:class:`b4msa.textmodel.TextModel` keyword arguments used to train a bag-of-words representation. default=dict().
    :type b4msa_kwargs: dict
    :param kfold_class: Class of the KFold procedure (default=StratifiedKFold)
    :type kfold_class: class
    :param kfold_kwargs: Keyword parameters for the KFold class
    :type kfold_kwargs: dict
    :param v1: Whether to use version 1 of pretrained representations. default=False
    :type v1: bool
    :param n_jobs: Number of jobs. default=1
    :type n_jobs: int

    >>> from microtc.utils import tweet_iterator
    >>> from EvoMSA.tests.test_base import TWEETS
    >>> from EvoMSA import BoW
    >>> bow = BoW(lang='es').fit(list(tweet_iterator(TWEETS)))
    >>> bow.predict(['Buenos dias']).tolist()
    ['P']
    """

    # NOTE: the dict defaults below are created once and shared by every
    # instance that relies on them; they are stored as given (not copied),
    # so they must be treated as read-only.
    def __init__(self, lang: str='es',
                 voc_size_exponent: int=17,
                 voc_selection: str='most_common_by_type',
                 key: Union[str, List[str]]='text',
                 label_key: str='klass',
                 mixer_func: Callable[[List], csr_matrix]=sum,
                 decision_function_name: str='decision_function',
                 estimator_class=LinearSVC,
                 estimator_kwargs=dict(dual=True),
                 pretrain=True,
                 b4msa_kwargs=dict(),
                 kfold_class=StratifiedKFold,
                 kfold_kwargs: dict=dict(random_state=0,
                                         shuffle=True),
                 v1: bool=False,
                 n_jobs: int=1) -> None:
        assert lang is None or lang in MODEL_LANG
        if lang in MODEL_LANG:
            # The vocabulary options only matter for a supported language.
            assert voc_size_exponent >= 13 and voc_size_exponent <= 17
            assert voc_selection in ['most_common_by_type', 'most_common']
        self.voc_size_exponent = voc_size_exponent
        self.voc_selection = voc_selection
        self.n_jobs = n_jobs
        self._lang = lang
        self.key = key
        self.label_key = label_key
        self.mixer_func = mixer_func
        self.decision_function_name = decision_function_name
        self.estimator_class = estimator_class
        self.estimator_kwargs = estimator_kwargs
        self.b4msa_kwargs = b4msa_kwargs
        self._pretrain = pretrain
        self.kfold_class = kfold_class
        self.kfold_kwargs = kfold_kwargs
        # Becomes True once :py:func:`BoW.b4msa_fit` has completed.
        self._b4msa_estimated = False
        self.v1 = v1

    def fit(self, D: List[Union[dict, list]],
            y: Union[np.ndarray, None]=None) -> 'BoW':
        """Estimate the parameters of the BoW (:py:func:`BoW.b4msa_fit`)
        and the classifier or regressor (:py:attr:`BoW.estimator_class` -
        the fitted instance is accessible at :py:attr:`BoW.estimator_instance`)
        using the dataset (`D`, `y`).

        :param D: Texts; in the case, it is a list of dictionaries the text is on the key :py:attr:`BoW.key`
        :type D: List of texts or dictionaries.
        :param y: Response variable. The response variable can also be in `D` on the key :py:attr:`BoW.label_key`.
        :type y: Array or None

        >>> from microtc.utils import tweet_iterator
        >>> from EvoMSA.tests.test_base import TWEETS
        >>> from EvoMSA import BoW
        >>> import numpy as np
        >>> D = list(tweet_iterator(TWEETS))
        >>> bow = BoW(lang='es').fit(D)
        """
        # The text representation is only estimated when it is neither
        # pre-trained nor already estimated by a previous call.
        if not self.pretrain and not self._b4msa_estimated:
            self.b4msa_fit(D)
        y = self.dependent_variable(D, y=y)
        X = self.transform(D, y=y)
        estimator = self.estimator_class(**self.estimator_kwargs)
        self.estimator_instance = estimator.fit(X, y)
        return self

    def predict(self, D: List[Union[dict, list]]) -> np.ndarray:
        """Predict the response variable on the dataset `D`.

        :param D: Texts; in the case, it is a list of dictionaries the text is on the key :py:attr:`BoW.key`
        :type D: List of texts or dictionaries.

        >>> from microtc.utils import tweet_iterator
        >>> from EvoMSA.tests.test_base import TWEETS
        >>> from EvoMSA import BoW
        >>> bow = BoW(lang='es').fit(list(tweet_iterator(TWEETS)))
        >>> bow.predict(['Buenos dias']).tolist()
        ['P']
        """
        X = self.transform(D)
        return self.estimator_instance.predict(X)

    def decision_function(self, D: List[Union[dict, list]]) -> Union[list, np.ndarray]:
        """Decision function of the estimate response variable in `D`.

        The estimator's method named :py:attr:`BoW.decision_function_name`
        is called; a one-dimensional output is returned as a column.

        :param D: Texts to be transformed. In the case, it is a list of dictionaries the text is on the key :py:attr:`BoW.key`
        :type D: List of texts or dictionaries.

        >>> from microtc.utils import tweet_iterator
        >>> from EvoMSA.tests.test_base import TWEETS
        >>> from EvoMSA import BoW
        >>> bow = BoW(lang='es').fit(list(tweet_iterator(TWEETS)))
        >>> bow.decision_function(['Buenos dias'])
        array([[-1.40547754, -1.01340503, -0.57912244, 0.90450322]])
        """
        X = self.transform(D)
        hy = getattr(self.estimator_instance, self.decision_function_name)(X)
        if hy.ndim == 1:
            return np.atleast_2d(hy).T
        return hy

    @property
    def bow(self):
        """Bag of Word text representation.

        The following example tokenizes *hi*.
        The notation is the following, the first 'hi' corresponds to the word *hi*.
        Then, there come the q-grams of characters, the token 'q:hi' represents
        the q-gram *hi*. All the q-grams start with the prefix 'q:'. Finally,
        the character ~ represents a space.

        >>> bow = BoW(lang='en')
        >>> bow.bow.tokenize(['hi'])
        ['hi', 'q:~h', 'q:hi', 'q:i~', 'q:~hi', 'q:hi~', 'q:~hi~']
        """
        try:
            bow = self._bow
        except AttributeError:
            # First access: build (or load) the model and cache it.
            if self.pretrain:
                if self.v1:
                    self._bow = load_bow(lang=self.lang, v1=self.v1)
                else:
                    freq = load_bow(lang=self.lang,
                                    d=self.voc_size_exponent,
                                    func=self.voc_selection)
                    params = b4msa_params(lang=self.lang,
                                          dim=self._voc_size_exponent)
                    params.update(self.b4msa_kwargs)
                    bow = TextModel(**params)
                    # The TF-IDF weights are derived from the loaded
                    # frequencies instead of being estimated from text.
                    tfidf = TFIDF()
                    tfidf.N = freq.update_calls
                    tfidf.word2id, tfidf.wordWeight = tfidf.counter2weight(freq)
                    bow.model = tfidf
                    self._bow = bow
            else:
                self._bow = TextModel(lang=self.lang,
                                      **self.b4msa_kwargs)
            bow = self._bow
        return bow

    @bow.setter
    def bow(self, value):
        self._bow = value

    @property
    def names(self):
        """Vector space components"""
        id2token = self.bow.id2token
        _names = [None] * len(id2token)
        for k, v in id2token.items():
            _names[k] = v
        return _names

    @property
    def weights(self):
        """Vector space weights"""
        try:
            return self._weights
        except AttributeError:
            token_weight = self.bow.token_weight
            w = [None] * len(token_weight)
            for k, v in token_weight.items():
                w[k] = v
            self._weights = w
        return self._weights

    @property
    def estimator_instance(self):
        """Estimator - Classifier or Regressor fitted (:py:attr:`BoW.fit`) on the dataset"""
        return self._m

    @estimator_instance.setter
    def estimator_instance(self, m):
        self._m = m

    @property
    def pretrain(self):
        """Whether to use pre-trained text representations

        The parameters of the BoW text representation
        are :py:attr:`BoW.lang`, :py:attr:`BoW.voc_selection`,
        and :py:attr:`BoW.voc_size_exponent`. The aforementioned parameters are
        not available on Version 1.0 (:py:attr:`BoW.v1`).
        """
        return self._pretrain

    @property
    def lang(self):
        """Language of the pre-trained text representations"""
        return self._lang

    @property
    def voc_selection(self):
        """Method used to select the vocabulary"""
        return self._voc_selection

    @voc_selection.setter
    def voc_selection(self, value):
        self._voc_selection = value

    @property
    def voc_size_exponent(self):
        """Vocabulary size :math:`2^v`; where :math:`v` is :py:attr:`voc_size_exponent` """
        return self._voc_size_exponent

    @voc_size_exponent.setter
    def voc_size_exponent(self, value):
        self._voc_size_exponent = value

    @property
    def v1(self):
        """Whether to use the Version 1.0 text representations.

        This version is only available for Arabic (ar), English (en), and Spanish (es).
        """
        return self._v1

    @v1.setter
    def v1(self, value):
        self._v1 = value

    def b4msa_fit(self, D: List[Union[List, dict]]):
        """Estimate the parameters of the BoW (:py:class:`BoW.bow`)
        in case it is not pretrained (:py:attr:`BoW.pretrain`)

        :param D: Dataset
        :type D: List of texts or dictionaries.
        :return: The value returned by the ``fit`` method of :py:attr:`BoW.bow`.

        >>> from microtc.utils import tweet_iterator
        >>> from EvoMSA.tests.test_base import TWEETS
        >>> from EvoMSA import BoW
        >>> bow = BoW(pretrain=False)
        >>> bow.b4msa_fit(list(tweet_iterator(TWEETS)))
        >>> X = bow.transform(['Hola'])
        >>> X.shape
        (1, 84802)
        """
        assert len(D)
        if self.key == 'text' or isinstance(D[0], str):
            # Plain strings, or dictionaries using the default key, are
            # handed to the text model unchanged.
            texts = D
        else:
            assert isinstance(D[0], dict)
            if isinstance(self.key, str):
                key = self.key
                texts = [x[key] for x in D]
            else:
                # Several keys: each element becomes the list of its texts.
                texts = [[x[key] for key in self.key] for x in D]
        output = self.bow.fit(texts)
        # Flag the representation as estimated only after a successful fit;
        # otherwise a failed attempt would make BoW.fit skip the estimation.
        self._b4msa_estimated = True
        return output

    def train_predict_decision_function(self, D: List[Union[dict, list]],
                                        y: Union[np.ndarray, None]=None) -> np.ndarray:
        """
        Method to compute the kfold predictions on dataset `D` with response `y`

        :param D: Texts to be transformed. In the case, it is a list of dictionaries the text is on the key :py:attr:`BoW.key`
        :type D: List of texts or dictionaries.
        :param y: Response variable
        :type y: Array or None

        For example, the following code computes the accuracy using k-fold cross-validation on the dataset found on `TWEETS`

        >>> from microtc.utils import tweet_iterator
        >>> from EvoMSA.tests.test_base import TWEETS
        >>> from EvoMSA import BoW
        >>> import numpy as np
        >>> D = list(tweet_iterator(TWEETS))
        >>> bow = BoW(lang='es')
        >>> df = bow.train_predict_decision_function(D)
        >>> df.shape
        (1000, 4)
        >>> hy = df.argmax(axis=1)
        >>> y = np.array([x['klass'] for x in D])
        >>> labels = np.unique(y)
        >>> accuracy = (y == labels[hy]).mean()
        """
        def train_predict(tr, vs):
            # Fit on the training fold and score the validation fold.
            m = self.estimator_class(**self.estimator_kwargs).fit(X[tr], y[tr])
            return getattr(m, self.decision_function_name)(X[vs])

        y = self.dependent_variable(D, y=y)
        kf = self.kfold_class(**self.kfold_kwargs)
        kfolds = list(kf.split(D, y))
        X = self.transform(D, y=y)
        hys = Parallel(n_jobs=self.n_jobs)(delayed(train_predict)(tr, vs)
                                           for tr, vs in kfolds)
        if hys[0].ndim == 1:
            # One-dimensional scores are stored as a single column.
            hy = np.empty((y.shape[0], 1))
            hys = [np.atleast_2d(x).T for x in hys]
        else:
            # One column per distinct value of the response variable.
            K = np.unique(y).shape[0]
            hy = np.empty((y.shape[0], K))
        for (_, vs), pr in zip(kfolds, hys):
            hy[vs] = pr
        return hy

    def dependent_variable(self, D: List[Union[dict, list]],
                           y: Union[np.ndarray, None]=None) -> np.ndarray:
        """Obtain the response variable

        When `y` is None the response is read from each dictionary in `D`
        using the key :py:attr:`BoW.label_key`.

        :param D: Dataset
        :type D: List of texts or dictionaries
        :param y: Response variable
        :type y: Array or None
        """
        assert isinstance(D, list) and len(D)
        label_key = self.label_key
        if y is None:
            assert isinstance(D[0], dict)
            y = np.array([x[label_key] for x in D])
        assert isinstance(y, np.ndarray)
        return y

    @property
    def cache(self):
        """If the cache is set, it is returned when calling :py:attr:`BoW.transform`; afterward, it is unset."""
        try:
            return self._cache
        except AttributeError:
            return None

    @cache.setter
    def cache(self, value):
        self._cache = value

    @property
    def label_key(self):
        """Key where the response is in the dictionary."""
        return self._label_key

    @label_key.setter
    def label_key(self, value):
        self._label_key = value

    @property
    def key(self):
        """Key where the text(s) is(are) in the dictionary."""
        return self._key

    @key.setter
    def key(self, value):
        self._key = value

    @property
    def decision_function_name(self):
        """Name of the estimator's decision function"""
        return self._decision_function

    @decision_function_name.setter
    def decision_function_name(self, value):
        self._decision_function = value

    @property
    def kfold_class(self):
        """Class to produce the kfolds"""
        return self._kfold_instance

    @kfold_class.setter
    def kfold_class(self, value):
        self._kfold_instance = value

    @property
    def kfold_kwargs(self):
        """Keyword arguments of the kfold class"""
        return self._kfold_kwargs

    @kfold_kwargs.setter
    def kfold_kwargs(self, value):
        self._kfold_kwargs = value

    @property
    def estimator_class(self):
        """Class of the classifier or regressor"""
        return self._estimator_class

    @estimator_class.setter
    def estimator_class(self, value):
        self._estimator_class = value

    @property
    def estimator_kwargs(self):
        """Keyword arguments of the estimator :py:class:`BoW.estimator_class`"""
        return self._estimator_kwargs

    @estimator_kwargs.setter
    def estimator_kwargs(self, value):
        self._estimator_kwargs = value

    @property
    def b4msa_kwargs(self):
        """Keyword arguments of B4MSA"""
        return self._b4msa_kwargs

    @b4msa_kwargs.setter
    def b4msa_kwargs(self, value):
        self._b4msa_kwargs = value

    @property
    def mixer_func(self):
        """The function is used to combine the output of the text's representations."""
        return self._mixer_func

    @mixer_func.setter
    def mixer_func(self, value):
        self._mixer_func = value

    @property
    def n_jobs(self):
        """Number of jobs used in multiprocessing."""
        return self._n_jobs

    @n_jobs.setter
    def n_jobs(self, value):
        self._n_jobs = value

    def __sklearn_clone__(self):
        """Create a new instance of the same class using the constructor parameters."""
        klass = self.__class__
        params = self.get_params()
        return klass(**params)
[docs]
class DenseBoW(BoW):
    """
    DenseBoW is a text classifier in fact it is
    a subclass of :py:class:`BoW` being the difference the process
    to represent the text in a vector space. This process is described in
    "`EvoMSA: A Multilingual Evolutionary Approach
    for Sentiment Analysis <https://ieeexplore.ieee.org/document/8956106>`_,
    Mario Graff, Sabino Miranda-Jimenez, Eric Sadit Tellez, Daniela Moctezuma.
    Computational Intelligence Magazine, vol 15 no. 1, pp. 76-88, Feb. 2020."
    Particularly, in the section where the Emoji Space is described.

    :param emoji: Whether to use emoji text representations. default=True.
    :type emoji: bool
    :param dataset: Whether to use labeled dataset text representations (only available in 'ar', 'en', 'es', and 'zh'). default=True
    :type dataset: bool
    :param keyword: Whether to use keyword text representations. default=True.
    :type keyword: bool
    :param skip_dataset: Set of discarded dataset.
    :type skip_dataset: set
    :param unit_vector: Normalize vectors to have length 1. default=True
    :type unit_vector: bool
    :param distance_hyperplane: Compute the distance to the hyperplane in :py:func:`~EvoMSA.text_repr.DenseBoW.transform`
    :type distance_hyperplane: bool

    >>> from EvoMSA import DenseBoW
    >>> from microtc.utils import tweet_iterator
    >>> from EvoMSA.tests.test_base import TWEETS
    >>> D = list(tweet_iterator(TWEETS))
    >>> dense = DenseBoW(lang='es').fit(D)
    >>> dense.predict(['Buenos dias']).tolist()
    ['P']
    """

    # Attributes caching values derived from `text_representations`.
    _DERIVED_CACHES = ('_weights', '_bias', '_norm_weights')

    def __init__(self,
                 emoji: bool=True,
                 dataset: bool=True,
                 keyword: bool=True,
                 skip_dataset: Set[str]=set(),
                 estimator_kwargs=dict(dual=False),
                 unit_vector=True,
                 distance_hyperplane=False,
                 **kwargs) -> None:
        super(DenseBoW, self).__init__(estimator_kwargs=estimator_kwargs, **kwargs)
        # `skip_dataset`, `_names` and `_text_representations` must exist
        # before the emoji/dataset/keyword setters run, because those
        # setters load the models immediately.
        self.skip_dataset = skip_dataset
        self._names = []
        self._text_representations = []
        self.unit_vector = unit_vector
        # The order of the next three assignments fixes the order of the
        # components of the vector space.
        self.emoji = emoji
        self.dataset = dataset
        self.keyword = keyword
        self.distance_hyperplane = distance_hyperplane

    def _drop_derived_caches(self) -> None:
        """Forget the cached weights, bias and norms so they are rebuilt
        from the current :py:attr:`DenseBoW.text_representations`."""
        for name in self._DERIVED_CACHES:
            if hasattr(self, name):
                delattr(self, name)

    def fit(self, *args, **kwargs) -> 'DenseBoW':
        """Estimate the parameters of the classifier or regressor
        (:py:attr:`DenseBoW.estimator_class` - the fitted instance is accessible
        at :py:attr:`DenseBoW.estimator_instance`) using the dataset (`D`, `y`).

        >>> from EvoMSA import DenseBoW
        >>> from microtc.utils import tweet_iterator
        >>> from EvoMSA.tests.test_base import TWEETS
        >>> D = list(tweet_iterator(TWEETS))
        >>> dense = DenseBoW(lang='es').fit(D)
        """
        return super(DenseBoW, self).fit(*args, **kwargs)

    def predict(self, *args, **kwargs) -> np.ndarray:
        """Predict the response variable on the dataset `D`.

        >>> from EvoMSA import DenseBoW
        >>> from microtc.utils import tweet_iterator
        >>> from EvoMSA.tests.test_base import TWEETS
        >>> D = list(tweet_iterator(TWEETS))
        >>> dense = DenseBoW(lang='es').fit(D)
        >>> dense.predict(['Buenos dias']).tolist()
        ['P']
        """
        return super(DenseBoW, self).predict(*args, **kwargs)

    @property
    def text_representations(self):
        """Classifiers that define the text representation."""
        return self._text_representations

    @text_representations.setter
    def text_representations(self, value):
        self._text_representations = value
        # Weights, bias and norms are computed from the models just replaced.
        self._drop_derived_caches()

    def select(self, subset: Union[list, None]=None,
               D: List[Union[dict, list, None]]=None,
               y: Union[np.ndarray, None]=None,
               feature_selection: Callable=KruskalFS,
               feature_selection_kwargs: dict=dict()) -> 'DenseBoW':
        """Procedure to perform feature selection or indices of the features to be selected.

        Either `subset` or `D` must be given. With `subset`, the
        representations at those indices are kept (an empty `subset` leaves
        the instance untouched). Otherwise `feature_selection` is fitted on
        the transformed `D` and the indices it supports are kept; the fitted
        selector is stored in :py:attr:`DenseBoW.feature_selection`.

        :param subset: Representations to be selected.
        :type subset: List of indices.
        :param D: Texts; in the case, it is a list of dictionaries the text is on the key :py:attr:`BoW.key`. default=None
        :type D: List of texts or dictionaries.
        :param y: Response variable. The response variable can also be in `D` on the key :py:attr:`BoW.label_key`. default=None
        :type y: Array or None
        :param feature_selection: Class of the feature-selection method. default=KruskalFS
        :type feature_selection: Callable
        :param feature_selection_kwargs: Keyword arguments of `feature_selection`.
        :type feature_selection_kwargs: dict

        >>> from EvoMSA import DenseBoW
        >>> from microtc.utils import tweet_iterator
        >>> from EvoMSA.tests.test_base import TWEETS
        >>> T = list(tweet_iterator(TWEETS))
        >>> text_repr = DenseBoW(lang='es').select(D=T)
        >>> n_selected = len(text_repr.names)
        """
        assert subset is not None or D is not None
        # Every value derived from the representations is discarded, not
        # only the norms, so `weights` and `bias` cannot go stale.
        self._drop_derived_caches()
        if subset is not None:
            # `len` is used (not truthiness) because `subset` may be an array.
            if len(subset) == 0:
                return self
            tr = self.text_representations
            self.text_representations = [tr[i] for i in subset]
            names = self.names
            self.names = [names[i] for i in subset]
            return self
        y = self.dependent_variable(D, y=y)
        X = self.transform(D)
        selector = feature_selection(**feature_selection_kwargs).fit(X, y=y)
        self.feature_selection = selector
        index = self.feature_selection.get_support(indices=True)
        return self.select(subset=index)

    @property
    def feature_selection(self):
        """Feature selection used in :py:func:`~EvoMSA.text_repr.DenseBoW.select`"""
        return self._feature_selection

    @feature_selection.setter
    def feature_selection(self, value):
        self._feature_selection = value

    def text_representations_extend(self, value: Union[List, str]):
        """Add dense BoW representations.

        Models whose last label is already in :py:attr:`DenseBoW.names`
        are skipped.

        :param value: List of models or name
        :type value: List of models or string
        """
        if isinstance(value, str):
            if value in TAILORED:
                # Tailored names are expanded to the corresponding filename.
                func = self.voc_selection
                d = self.voc_size_exponent
                if func == 'most_common_by_type' and d == 17:
                    value = f'{value}_{self.lang.capitalize()}.json.gz'
                else:
                    value = f'{self.lang}_{MICROTC}_{value}_{func}_{d}.json.gz'
            value = load_url(value, n_jobs=self.n_jobs)
        names = set(self.names)
        for x in value:
            label = x.labels[-1]
            if label not in names:
                self.text_representations.append(x)
                self.names.append(label)
                names.add(label)
        # The list was modified in place, bypassing the property setter.
        self._drop_derived_caches()
        return self

    @property
    def names(self):
        """Names of the components of the vector space."""
        return self._names

    @names.setter
    def names(self, value):
        self._names = value

    @property
    def norm_weights(self):
        """Euclidean norm of the weights"""
        try:
            return self._norm_weights
        except AttributeError:
            self._norm_weights = np.r_[[np.linalg.norm(w._coef)
                                        for w in self.text_representations]]
        return self._norm_weights

    @property
    def weights(self):
        r"""Weights of the vector space.

        It is matrix, i.e., :math:`\mathbf W \in \mathbb R^{M \times d}`, where
        :math:`M` is the dimension of the vector space (see :py:attr:`DenseBoW.names`)
        and :math:`d` is the vocabulary size.

        >>> from EvoMSA import DenseBoW
        >>> text_repr = DenseBoW(lang='es')
        >>> text_repr.weights.shape
        (2672, 131072)
        """
        try:
            return self._weights
        except AttributeError:
            w = np.array([x._coef for x in self.text_representations])
            self._weights = w
        return self._weights

    @property
    def bias(self):
        """Bias."""
        try:
            return self._bias
        except AttributeError:
            w = np.array([x._intercept for x in self.text_representations])
            self._bias = w
        return self._bias

    @property
    def dataset(self):
        """Dense Representation based on human-annotated datasets"""
        return self._dataset

    @dataset.setter
    def dataset(self, value):
        self._dataset = value
        if value:
            self.load_dataset()

    @property
    def emoji(self):
        """Dense Representation based on emojis"""
        return self._emoji

    @emoji.setter
    def emoji(self, value):
        self._emoji = value
        if value:
            self.load_emoji()

    @property
    def keyword(self):
        """Dense Representation based on keywords"""
        return self._keyword

    @keyword.setter
    def keyword(self, value):
        self._keyword = value
        if value:
            self.load_keyword()

    @property
    def unit_vector(self):
        """Normalize representation to have one length"""
        return self._unit_vector

    @unit_vector.setter
    def unit_vector(self, value):
        self._unit_vector = value

    @property
    def distance_hyperplane(self):
        """Compute the distance to the hyperplane in :py:func:`~EvoMSA.text_repr.DenseBoW.transform`"""
        return self._distance_hyperplane

    @distance_hyperplane.setter
    def distance_hyperplane(self, value):
        self._distance_hyperplane = value

    def fromjson(self, filename: str) -> 'DenseBoW':
        """Load the text representations from a json file.

        :param filename: Path
        :type filename: str
        """
        models = [Linear(**kwargs)
                  for kwargs in tweet_iterator(filename)]
        self.text_representations_extend(models)
        return self

    def get_params(self, deep=True):
        """Obtain the parameters of the class"""
        dense_params = self._get_param_names()
        bow_params = BoW._get_param_names()
        params = dict()
        # The BoW parameters are only received through **kwargs, hence
        # they are collected explicitly.
        for k in bow_params + dense_params:
            params[k] = getattr(self, k)
        return params

    @property
    def skip_dataset(self):
        """Datasets discarded from the text representations"""
        return self._skip_dataset

    @skip_dataset.setter
    def skip_dataset(self, value):
        self._skip_dataset = value

    def load_emoji(self) -> None:
        """Append the emoji text representations and their names."""
        if self.v1:
            models = load_emoji(lang=self.lang, v1=self.v1,
                                n_jobs=self.n_jobs)
        else:
            models = load_emoji(lang=self.lang,
                                d=self.voc_size_exponent,
                                func=self.voc_selection,
                                n_jobs=self.n_jobs)
        self.text_representations.extend(models)
        self.names.extend([x.labels[-1] for x in models])
        self._drop_derived_caches()

    def load_keyword(self) -> None:
        """Append the keyword text representations and their names."""
        if self.v1:
            models = load_keyword(lang=self.lang, v1=self.v1,
                                  n_jobs=self.n_jobs)
        else:
            models = load_keyword(lang=self.lang,
                                  d=self.voc_size_exponent,
                                  func=self.voc_selection,
                                  n_jobs=self.n_jobs)
        self.text_representations.extend(models)
        self.names.extend([x.labels[-1] for x in models])
        self._drop_derived_caches()

    def load_dataset(self) -> None:
        """Append the dataset text representations and their names.

        Nothing is loaded unless the language is 'ar', 'zh', 'en', or 'es';
        the datasets in :py:attr:`DenseBoW.skip_dataset` are discarded.
        """
        if self.lang not in ['ar', 'zh', 'en', 'es']:
            return
        if self.v1:
            names = [name for name in dataset_information(lang=self.lang)
                     if name not in self.skip_dataset]
            loaded = Parallel(n_jobs=self.n_jobs)(delayed(load_dataset)(lang=self.lang,
                                                                         name=name,
                                                                         v1=self.v1)
                                                   for name in names)
            for models in loaded:
                self.text_representations.extend(models)
            for models, name in zip(loaded, names):
                if len(models) == 1:
                    self.names.append(name)
                else:
                    # Several models per dataset: qualify with each label.
                    self.names.extend([f'{name}({m.labels[-1]})' for m in models])
        else:
            data = load_dataset(lang=self.lang, name='datasets',
                                d=self.voc_size_exponent,
                                func=self.voc_selection)
            models = [x for x in data if x.labels[-1] not in self.skip_dataset]
            self.text_representations.extend(models)
            self.names.extend([x.labels[-1] for x in models])
        self._drop_derived_caches()

    def __sklearn_clone__(self):
        """Create a new instance sharing the current text representations.

        The instance is built with the emoji, keyword and dataset flags
        disabled, so nothing is loaded again; the current representations
        are then added and the original flag values restored.
        """
        klass = self.__class__
        params = self.get_params()
        models = ['emoji', 'keyword', 'dataset']
        args = {k: params[k] for k in models}
        params.update({k: False for k in models})
        ins = klass(**params)
        ins.text_representations_extend(self.text_representations)
        for k, v in args.items():
            # Written to the private attribute to avoid triggering the
            # loading performed by the property setters.
            setattr(ins, f'_{k}', v)
        return ins
TextRepresentations = DenseBoW
[docs]
class StackGeneralization(BoW):
    """The idea behind stack generalization is to train an estimator on the predictions made by the base classifiers or regressors.

    :param decision_function_models: Represent the text by calling the decision function
    :type decision_function_models: List of :py:class:`BoW` or :py:class:`DenseBoW`
    :param transform_models: Represent the text by calling the transform
    :type transform_models: List of :py:class:`BoW` or :py:class:`DenseBoW`

    >>> from EvoMSA import DenseBoW, BoW, StackGeneralization
    >>> from microtc.utils import tweet_iterator
    >>> from EvoMSA.tests.test_base import TWEETS
    >>> emoji = DenseBoW(lang='es', dataset=False, keyword=False)
    >>> dataset = DenseBoW(lang='es', emoji=False, keyword=False)
    >>> bow = BoW(lang='es')
    >>> stacking = StackGeneralization(decision_function_models=[bow],
    ...                                transform_models=[dataset, emoji])
    >>> stacking = stacking.fit(list(tweet_iterator(TWEETS)))
    >>> stacking.predict(['Buenos dias']).tolist()
    ['P']
    """

    # NOTE: the list defaults are shared between calls and stored as
    # given; they must not be modified in place.
    def __init__(self, decision_function_models: list=[],
                 transform_models: list=[],
                 decision_function_name: str='predict_proba',
                 estimator_class=GaussianNB,
                 estimator_kwargs=dict(),
                 n_jobs: int=1,
                 **kwargs) -> None:
        # At least one base model is required.
        assert len(decision_function_models) or len(transform_models)
        assert n_jobs == 1
        super(StackGeneralization, self).__init__(n_jobs=n_jobs,
                                                  decision_function_name=decision_function_name,
                                                  estimator_class=estimator_class,
                                                  estimator_kwargs=estimator_kwargs,
                                                  **kwargs)
        self._decision_function_models = decision_function_models
        self._transform_models = transform_models
        self.estimated = False

    def fit(self, *args, **kwargs) -> 'StackGeneralization':
        """Estimate the parameters of the stacked estimator and flag the
        instance as estimated (:py:attr:`StackGeneralization.estimated`).

        >>> from EvoMSA import DenseBoW, BoW, StackGeneralization
        >>> from microtc.utils import tweet_iterator
        >>> from EvoMSA.tests.test_base import TWEETS
        >>> D = list(tweet_iterator(TWEETS))
        >>> emoji = DenseBoW(lang='es', dataset=False, keyword=False)
        >>> dataset = DenseBoW(lang='es', emoji=False, keyword=False)
        >>> bow = BoW(lang='es')
        >>> stacking = StackGeneralization(decision_function_models=[bow],
        ...                                transform_models=[dataset, emoji]).fit(D)
        """
        super(StackGeneralization, self).fit(*args, **kwargs)
        self._estimated = True
        return self

    @property
    def estimated(self):
        """Whether :py:func:`StackGeneralization.fit` has been called."""
        return self._estimated

    @estimated.setter
    def estimated(self, value):
        self._estimated = value

    def train_predict_decision_function(self, *args, **kwargs) -> np.ndarray:
        """Compute the k-fold predictions; only allowed before the instance is fitted."""
        assert not self.estimated
        return super(StackGeneralization, self).train_predict_decision_function(*args, **kwargs)

    @property
    def decision_function_models(self):
        """These models create the vector space by calling the decision function."""
        return self._decision_function_models

    @property
    def transform_models(self):
        """These models create the vector space by calling the transform."""
        return self._transform_models

    def get_params(self, deep=True):
        """Obtain the parameters of the class

        The :py:class:`BoW` parameters (e.g., `lang` or `key`) are only
        received through ``**kwargs``; they are collected explicitly, in the
        same way as in :py:func:`DenseBoW.get_params`, so that they are not
        lost when the instance is cloned.
        """
        names = BoW._get_param_names() + self._get_param_names()
        return {k: getattr(self, k) for k in names}

    def __sklearn_clone__(self):
        """Create a new, unfitted instance whose base models are clones of the current ones."""
        klass = self.__class__
        params = self.get_params()
        params['decision_function_models'] = [clone(x)
                                              for x in self.decision_function_models]
        params['transform_models'] = [clone(x)
                                      for x in self.transform_models]
        return klass(**params)