from collections.abc import Callable
from pathlib import Path
from time import time
from typing import Literal
import geopandas as gpd
import numpy as np
import pandas as pd
from libpysal import graph
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from .base import BaseClassifier
class GWRandomForestClassifier(BaseClassifier):
    """Geographically weighted random forest classifier.

    Fits one :class:`sklearn.ensemble.RandomForestClassifier` per focal observation
    using spatially varying sample weights.

    The spatial interaction is defined either by (a) ``geometry`` + bandwidth/kernel
    settings or (b) a precomputed :class:`libpysal.graph.Graph` passed via ``graph``.

    Notes
    -----
    - ``y`` must be binary (``{0, 1}`` or boolean).
    - To enable prediction on new data via :meth:`predict`/:meth:`predict_proba`, you
      must set ``keep_models=True`` (store in memory) or ``keep_models=Path(...)``
      (serialize to disk).
    - Only point geometries are supported.

    Parameters
    ----------
    bandwidth : float | int | None
        Bandwidth for defining neighborhoods.

        - If ``fixed=True``, this is a distance threshold.
        - If ``fixed=False``, this is the number of nearest neighbors used to form the
          local neighborhood.

        If ``graph`` is provided, ``bandwidth`` is ignored.
    fixed : bool, optional
        True for distance based bandwidth and False for adaptive (nearest neighbor)
        bandwidth, by default False
    kernel : str | Callable, optional
        type of kernel function used to weight observations, by default "bisquare"
    include_focal : bool, optional
        Include focal in the local model training. Excluding it allows assessment of
        geographically weighted metrics on unseen data without a need for train/test
        split, hence providing value for all samples. This is needed for further
        spatial analysis of the model performance (and generalises to models that do
        not support OOB scoring). However, it leaves out the most representative
        sample. By default False
    geometry : gpd.GeoSeries, optional
        Geographic location of the observations in the sample. Used to determine the
        spatial interaction weight based on specification by ``bandwidth``, ``fixed``,
        ``kernel``, and ``include_focal`` keywords. Either ``geometry`` or ``graph``
        need to be specified. To allow prediction, it is required to specify
        ``geometry``.
    graph : Graph, optional
        Custom libpysal.graph.Graph object encoding the spatial interaction between
        observations in the sample. If given, it is used directly and ``bandwidth``,
        ``fixed``, ``kernel``, and ``include_focal`` keywords are ignored. Either
        ``geometry`` or ``graph`` need to be specified. To allow prediction, it is
        required to specify ``geometry``. Potentially, both can be specified where
        ``graph`` encodes spatial interaction between observations in ``geometry``.
    n_jobs : int, optional
        The number of jobs to run in parallel. ``-1`` means using all processors,
        by default ``-1``
    fit_global_model : bool, optional
        Determines if the global baseline model shall be fitted alongside the
        geographically weighted, by default True
    strict : bool | None, optional
        Do not fit any models if at least one neighborhood has invariant ``y``, by
        default False. None is treated as False but provides a warning if there are
        invariant models.
    keep_models : bool | str | Path, optional
        Keep all local models (required for prediction), by default False. Note that
        for some models, like random forests, the objects can be large. If string or
        Path is provided, the local models are not held in memory but serialized to
        the disk from which they are loaded in prediction.
    temp_folder : str | None, optional
        Folder to be used by the pool for memmapping large arrays for sharing memory
        with worker processes, e.g., ``/tmp``. Passed to ``joblib.Parallel``, by
        default None
    batch_size : int | None, optional
        Number of models to process in each batch. Specify batch_size if your models
        do not fit into memory. By default None
    min_proportion : float, optional
        Minimum proportion of minority class for a model to be fitted, by default 0.2
    undersample : bool | float, optional
        Whether to apply random undersampling to balance classes.
        If ``True``, undersample the majority class to match the minority class
        (i.e., minority/majority ratio = 1.0).
        If a float ``alpha > 0``, target a minority/majority ratio of ``alpha`` after
        resampling, i.e. ``alpha = N_min / N_resampled_majority``.
        By default False
    leave_out : float | int, optional
        Leave out a fraction (when float) or a set number (when int) of random
        observations from each local model to be used to measure out-of-sample log
        loss based on pooled samples from all the models. This is useful for bandwidth
        selection for cases where some local models are not fitted due to local
        invariance and resulting information criteria are not comparable.
        By default None
    random_state : int | None, optional
        Random seed for reproducibility, by default None
    verbose : bool, optional
        Whether to print progress information, by default False
    **kwargs
        Additional keyword arguments passed to ``model`` initialisation

    Attributes
    ----------
    proba_ : pd.DataFrame
        Probability predictions for focal locations based on a local model trained
        around the point itself.
    pred_ : pd.Series
        Binary predictions for focal locations based on a local model trained around
        the location itself.
    hat_values_ : pd.Series
        Hat values for each location (diagonal elements of hat matrix)
    effective_df_ : float
        Effective degrees of freedom (sum of hat values)
    log_likelihood_ : float
        Global log likelihood of the model
    aic_ : float
        Akaike information criterion of the model
    aicc_ : float
        Corrected Akaike information criterion to account for model
        complexity (smaller bandwidths)
    bic_ : float
        Bayesian information criterion
    feature_importances_ : pd.DataFrame
        Feature importance values for each local model
    prediction_rate_ : float
        Proportion of models that are fitted, where the rest are skipped due to not
        fulfilling ``min_proportion``.
    left_out_y_ : numpy.ndarray
        Array of ``y`` values left out when ``leave_out`` is set.
    left_out_proba_ : numpy.ndarray
        Array of probabilities on left out observations in local models when
        ``leave_out`` is set.
    left_out_w_ : numpy.ndarray
        Array of weights on left out observations in local models when
        ``leave_out`` is set.
    oob_y_pooled_ : numpy.ndarray
        Pooled out-of-bag (OOB) true labels across all fitted local models.
    oob_pred_pooled_ : numpy.ndarray
        Pooled out-of-bag (OOB) predictions/scores across all fitted local models.

    Examples
    --------
    >>> import geopandas as gpd
    >>> from geodatasets import get_path
    >>> from gwlearn.ensemble import GWRandomForestClassifier

    >>> gdf = gpd.read_file(get_path('geoda.guerry'))
    >>> X = gdf[['Crm_prp', 'Litercy', 'Donatns', 'Lottery']]
    >>> y = gdf["Region"] == 'E'

    >>> gw = GWRandomForestClassifier(
    ...     bandwidth=30,
    ...     fixed=False,
    ...     geometry=gdf.representative_point(),
    ...     random_state=0,
    ... ).fit(X, y)
    >>> gw.pred_.head()
    0    False
    1    False
    2    False
    3     True
    4     True
    dtype: boolean
    """

    def __init__(
        self,
        *,
        bandwidth: int | float | None = None,
        fixed: bool = False,
        kernel: Literal[
            "triangular",
            "parabolic",
            # "gaussian",
            "bisquare",
            "tricube",
            "cosine",
            "boxcar",
            # "exponential",
        ]
        | Callable = "bisquare",
        include_focal: bool = False,
        geometry: gpd.GeoSeries | None = None,
        graph: graph.Graph | None = None,
        n_jobs: int = -1,
        fit_global_model: bool = True,
        strict: bool | None = False,
        keep_models: bool | str | Path = False,
        temp_folder: str | None = None,
        batch_size: int | None = None,
        min_proportion: float = 0.2,
        undersample: bool | float = False,
        leave_out: float | int | None = None,
        random_state: int | None = None,
        verbose: bool = False,
        **kwargs,
    ):
        super().__init__(
            model=RandomForestClassifier,
            bandwidth=bandwidth,
            fixed=fixed,
            kernel=kernel,
            include_focal=include_focal,
            geometry=geometry,
            graph=graph,
            n_jobs=n_jobs,
            fit_global_model=fit_global_model,
            strict=strict,
            keep_models=keep_models,
            temp_folder=temp_folder,
            batch_size=batch_size,
            min_proportion=min_proportion,
            undersample=undersample,
            leave_out=leave_out,
            random_state=random_state,
            verbose=verbose,
            **kwargs,
        )

        self._model_type = "random_forest"
        # A callable ``oob_score`` makes each forest hand its OOB targets and
        # predictions to ``_get_oob_score_data``; whatever that returns is what
        # ``_get_score_data`` later reads back from ``oob_score_``.
        # NOTE(review): ``_model_kwargs`` is assumed to be created by the base
        # initialiser; this assignment overrides any user-supplied ``oob_score``.
        self._model_kwargs["oob_score"] = self._get_oob_score_data
        # Empty ``(true, pred)`` placeholder; ``fit`` filters out empty entries
        # before pooling. Presumably used by the base class for skipped models.
        self._empty_score_data = (np.array([]).reshape(-1, 1), np.array([]))

    def _get_oob_score_data(self, true, pred):
        """Callback used by scikit-learn to collect OOB targets/predictions.

        Returns the ``(true, pred)`` pair unchanged instead of reducing it to a
        scalar score, so the raw values can be pooled across local models.
        """
        return true, pred

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "GWRandomForestClassifier":
        """Fit geographically weighted random forests.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Binary target encoded as boolean or ``{0, 1}``.

        Returns
        -------
        GWRandomForestClassifier
            Fitted estimator.

        Notes
        -----
        In addition to the base classifier outputs, this method also populates
        ``oob_y_pooled_`` and ``oob_pred_pooled_`` by pooling OOB values across all
        fitted local models. If there is nothing to pool, both are empty arrays.
        """
        # NaN row with one entry per feature, set before the base ``fit`` runs.
        self._empty_feature_imp = np.array([np.nan] * (X.shape[1]))

        super().fit(X=X, y=y)

        # Each entry of ``_score_data`` is a ``(true, pred)`` pair.
        self._y_local = [x[0] for x in self._score_data]
        self._pred_local = [x[1] for x in self._score_data]
        del self._score_data

        # Filter out empty arrays before concatenation
        non_empty_y = [arr for arr in self._y_local if arr.size > 0]
        non_empty_pred = [arr for arr in self._pred_local if arr.size > 0]

        if non_empty_y:
            self.oob_y_pooled_ = np.concatenate(non_empty_y)
        else:
            # ``y.dtype`` may be a pandas extension dtype (e.g. nullable
            # ``boolean``) that NumPy cannot interpret, so take the dtype of the
            # NumPy representation of ``y`` instead of ``y.dtype`` directly.
            self.oob_y_pooled_ = np.array([], dtype=np.asarray(y).dtype)

        if non_empty_pred:
            self.oob_pred_pooled_ = np.concatenate(non_empty_pred)
        else:
            # Set to empty array with float dtype (typical for predictions)
            self.oob_pred_pooled_ = np.array([], dtype=float)

        # feature importances
        self.feature_importances_ = pd.DataFrame(
            self._feature_importances, index=self._names, columns=X.columns
        )

        if self.verbose:
            print(f"{(time() - self._start):.2f}s: Finished")

        return self

    def _get_score_data(self, local_model, X, y):  # noqa: ARG002
        """Return the OOB ``(true, pred)`` pair stored on a fitted local model.

        ``X`` and ``y`` are accepted but not used.
        """
        return local_model.oob_score_
class GWGradientBoostingClassifier(BaseClassifier):
    """Geographically weighted gradient boosting classifier.

    Fits one :class:`sklearn.ensemble.GradientBoostingClassifier` per focal
    observation using spatially varying sample weights.

    The spatial interaction is defined either by (a) ``geometry`` + bandwidth/kernel
    settings or (b) a precomputed :class:`libpysal.graph.Graph` passed via ``graph``.

    Notes
    -----
    - ``y`` must be binary (``{0, 1}`` or boolean).
    - To enable prediction on new data via :meth:`predict`/:meth:`predict_proba`, you
      must set ``keep_models=True`` (store in memory) or ``keep_models=Path(...)``
      (serialize to disk).
    - Only point geometries are supported.

    Parameters
    ----------
    bandwidth : float | int | None
        Bandwidth for defining neighborhoods.

        - If ``fixed=True``, this is a distance threshold.
        - If ``fixed=False``, this is the number of nearest neighbors used to form the
          local neighborhood.

        If ``graph`` is provided, ``bandwidth`` is ignored.
    fixed : bool, optional
        True for distance based bandwidth and False for adaptive (nearest neighbor)
        bandwidth, by default False
    kernel : str | Callable, optional
        type of kernel function used to weight observations, by default "bisquare"
    include_focal : bool, optional
        Include focal in the local model training. Excluding it allows
        assessment of geographically weighted metrics on unseen data without a need
        for train/test split, hence providing value for all samples. This is needed
        for further spatial analysis of the model performance (and generalises to
        models that do not support OOB scoring). However, it leaves out the most
        representative sample. By default False
    geometry : gpd.GeoSeries, optional
        Geographic location of the observations in the sample. Used to determine the
        spatial interaction weight based on specification by ``bandwidth``, ``fixed``,
        ``kernel``, and ``include_focal`` keywords. Either ``geometry`` or ``graph``
        need to be specified. To allow prediction, it is required to specify
        ``geometry``.
    graph : Graph, optional
        Custom libpysal.graph.Graph object encoding the spatial interaction between
        observations in the sample. If given, it is used directly and ``bandwidth``,
        ``fixed``, ``kernel``, and ``include_focal`` keywords are ignored. Either
        ``geometry`` or ``graph`` need to be specified. To allow prediction, it is
        required to specify ``geometry``. Potentially, both can be specified where
        ``graph`` encodes spatial interaction between observations in ``geometry``.
    n_jobs : int, optional
        The number of jobs to run in parallel. ``-1`` means using all processors,
        by default ``-1``
    fit_global_model : bool, optional
        Determines if the global baseline model shall be fitted alongside
        the geographically weighted, by default True
    strict : bool | None, optional
        Do not fit any models if at least one neighborhood has invariant ``y``,
        by default False. None is treated as False but provides a warning if there
        are invariant models.
    keep_models : bool | str | Path, optional
        Keep all local models (required for prediction), by default False. Note that
        for some models, like random forests, the objects can be large. If string or
        Path is provided, the local models are not held in memory but serialized to
        the disk from which they are loaded in prediction.
    temp_folder : str | None, optional
        Folder to be used by the pool for memmapping large arrays for sharing memory
        with worker processes, e.g., ``/tmp``. Passed to ``joblib.Parallel``, by
        default None
    batch_size : int | None, optional
        Number of models to process in each batch. Specify batch_size if your models
        do not fit into memory. By default None
    min_proportion : float, optional
        Minimum proportion of minority class for a model to be fitted, by default 0.2
    undersample : bool | float, optional
        Whether to apply random undersampling to balance classes.
        If ``True``, undersample the majority class to match the minority class
        (i.e., minority/majority ratio = 1.0).
        If a float ``alpha > 0``, target a minority/majority ratio of ``alpha`` after
        resampling, i.e. ``alpha = N_min / N_resampled_majority``.
        By default False
    random_state : int | None, optional
        Random seed for reproducibility, by default None
    verbose : bool, optional
        Whether to print progress information, by default False
    **kwargs
        Additional keyword arguments passed to ``model`` initialisation

    Attributes
    ----------
    proba_ : pd.DataFrame
        Probability predictions for focal locations based on a local model trained
        around the point itself.
    pred_ : pd.Series
        Binary predictions for focal locations based on a local model trained around
        the location itself.
    hat_values_ : pd.Series
        Hat values for each location (diagonal elements of hat matrix)
    effective_df_ : float
        Effective degrees of freedom (sum of hat values)
    log_likelihood_ : float
        Global log likelihood of the model
    aic_ : float
        Akaike information criterion of the model
    aicc_ : float
        Corrected Akaike information criterion to account for model
        complexity (smaller bandwidths)
    bic_ : float
        Bayesian information criterion
    feature_importances_ : pd.DataFrame
        Feature importance values for each local model
    prediction_rate_ : float
        Proportion of models that are fitted, where the rest are skipped due to not
        fulfilling ``min_proportion``.

    Examples
    --------
    >>> import geopandas as gpd
    >>> from geodatasets import get_path
    >>> from gwlearn.ensemble import GWGradientBoostingClassifier

    >>> gdf = gpd.read_file(get_path('geoda.guerry'))
    >>> X = gdf[['Crm_prp', 'Litercy', 'Donatns', 'Lottery']]
    >>> y = gdf["Region"] == 'E'

    >>> gw = GWGradientBoostingClassifier(
    ...     bandwidth=30,
    ...     fixed=False,
    ...     geometry=gdf.representative_point(),
    ...     random_state=0,
    ... ).fit(X, y)
    >>> gw.pred_.head()
    0    False
    1    False
    2    False
    3     True
    4     True
    dtype: boolean
    """

    def __init__(
        self,
        *,
        bandwidth: int | float | None = None,
        fixed: bool = False,
        kernel: Literal[
            "triangular",
            "parabolic",
            # "gaussian",
            "bisquare",
            "tricube",
            "cosine",
            "boxcar",
            # "exponential",
        ]
        | Callable = "bisquare",
        include_focal: bool = False,
        geometry: gpd.GeoSeries | None = None,
        graph: graph.Graph | None = None,
        n_jobs: int = -1,
        fit_global_model: bool = True,
        strict: bool | None = False,
        keep_models: bool | str | Path = False,
        temp_folder: str | None = None,
        batch_size: int | None = None,
        min_proportion: float = 0.2,
        undersample: bool | float = False,
        random_state: int | None = None,
        verbose: bool = False,
        **kwargs,
    ):
        super().__init__(
            # the estimator class fitted for every local neighbourhood
            model=GradientBoostingClassifier,
            # spatial interaction
            geometry=geometry,
            graph=graph,
            bandwidth=bandwidth,
            fixed=fixed,
            kernel=kernel,
            include_focal=include_focal,
            # which models get fitted and how samples are balanced
            fit_global_model=fit_global_model,
            strict=strict,
            min_proportion=min_proportion,
            undersample=undersample,
            # execution and storage
            n_jobs=n_jobs,
            temp_folder=temp_folder,
            batch_size=batch_size,
            keep_models=keep_models,
            random_state=random_state,
            verbose=verbose,
            # everything else is forwarded untouched
            **kwargs,
        )

        # No per-model score data is collected for this estimator; NaN is the
        # placeholder value stored for it.
        self._empty_score_data = np.nan
        self._model_type = "gradient_boosting"

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "GWGradientBoostingClassifier":
        """Fit geographically weighted gradient boosting classifiers.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Binary target encoded as boolean or ``{0, 1}``.

        Returns
        -------
        GWGradientBoostingClassifier
            Fitted estimator.

        Notes
        -----
        Populates ``feature_importances_`` from the fitted local models.
        """
        # NaN vector with one slot per feature, prepared before the base ``fit``.
        n_features = X.shape[1]
        self._empty_feature_imp = np.full(n_features, np.nan)

        super().fit(X=X, y=y)

        # One row per local model, one column per feature.
        importances = pd.DataFrame(
            self._feature_importances, index=self._names, columns=X.columns
        )
        self.feature_importances_ = importances

        if self.verbose:
            elapsed = time() - self._start
            print(f"{elapsed:.2f}s: Finished")

        return self