Source code for gwlearn.ensemble

from collections.abc import Callable
from pathlib import Path
from time import time
from typing import Literal

import geopandas as gpd
import numpy as np
import pandas as pd
from libpysal import graph
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from .base import BaseClassifier



[docs]
class GWRandomForestClassifier(BaseClassifier):
    """Geographically weighted random forest classifier.

    Fits one :class:`sklearn.ensemble.RandomForestClassifier` per focal observation
    using spatially varying sample weights.

    The spatial interaction is defined either by (a) ``geometry`` + bandwidth/kernel
    settings or (b) a precomputed :class:`libpysal.graph.Graph` passed via ``graph``.

    Notes
    -----
    - ``y`` must be binary (``{0, 1}`` or boolean).
    - To enable prediction on new data via :meth:`predict`/:meth:`predict_proba`, you
      must set ``keep_models=True`` (store in memory) or ``keep_models=Path(...)``
      (serialize to disk).
    - Only point geometries are supported.

    Parameters
    ----------
    bandwidth : float | int | None
        Bandwidth for defining neighborhoods.

        - If ``fixed=True``, this is a distance threshold.
        - If ``fixed=False``, this is the number of nearest neighbors used to form the
          local neighborhood.

        If ``graph`` is provided, ``bandwidth`` is ignored.
    fixed : bool, optional
        True for distance based bandwidth and False for adaptive (nearest neighbor)
        bandwidth, by default False
    kernel : str | Callable, optional
        type of kernel function used to weight observations, by default "bisquare"
    include_focal : bool, optional
        Include focal in the local model training. Excluding it allows assessment of
        geographically weighted metrics on unseen data without a need for train/test
        split, hence providing value for all samples. This is needed for futher spatial
        analysis of the model performance (and generalises to models that do not support
        OOB scoring). However, it leaves out the most representative sample. By default
        False
    geometry : gpd.GeoSeries, optional
        Geographic location of the observations in the sample. Used to determine the
        spatial interaction weight based on specification by ``bandwidth``, ``fixed``,
        ``kernel``, and ``include_focal`` keywords.  Either ``geometry`` or ``graph``
        need to be specified. To allow prediction, it is required to specify
        ``geometry``.
    graph : Graph, optional
        Custom libpysal.graph.Graph object encoding the spatial interaction between
        observations in the sample. If given, it is used directly and ``bandwidth``,
        ``fixed``, ``kernel``, and ``include_focal`` keywords are ignored. Either
        ``geometry`` or ``graph`` need to be specified. To allow prediction, it is
        required to specify ``geometry``. Potentially, both can be specified where
        ``graph`` encodes spatial interaction between observations in ``geometry``.
    n_jobs : int, optional
        The number of jobs to run in parallel. ``-1`` means using all processors by
        default ``-1``
    fit_global_model : bool, optional
        Determines if the global baseline model shall be fitted alognside the
        geographically weighted, by default True
    strict : bool | None, optional
        Do not fit any models if at least one neighborhood has invariant ``y``, by
        default False. None is treated as False but provides a warning if there are
        invariant models.
    keep_models : bool | str | Path, optional
        Keep all local models (required for prediction), by default False. Note that for
        some models, like random forests, the objects can be large. If string or Path is
        provided, the local models are not held in memory but serialized to the disk
        from which they are loaded in prediction.
    temp_folder : str | None, optional
        Folder to be used by the pool for memmapping large arrays for sharing memory
        with worker processes, e.g., ``/tmp``. Passed to ``joblib.Parallel``, by default
        None
    batch_size : int | None, optional
        Number of models to process in each batch. Specify batch_size if your models do
        not fit into memory. By default None
    min_proportion : float, optional
        Minimum proportion of minority class for a model to be fitted, by default 0.2
    undersample : bool | float, optional
        Whether to apply random undersampling to balance classes.

        If ``True``, undersample the majority class to match the minority class
        (i.e., minority/majority ratio = 1.0).

        If a float ``alpha > 0``, target a minority/majority ratio of ``alpha`` after
        resampling, i.e. ``alpha = N_min / N_resampled_majority``.
        By default False
    leave_out : float | int, optional
        Leave out a fraction (when float) or a set number (when int) of random
        observations from each local model to be used to measure out-of-sample log loss
        based on pooled samples from all the models. This is useful for bandwidth
        selection for cases where some local models are not fitted due to local
        invariance and resulting information criteria are not comparable.
    random_state : int | None, optional
        Random seed for reproducibility, by default None
    verbose : bool, optional
        Whether to print progress information, by default False
    **kwargs
        Additional keyword arguments passed to ``model`` initialisation

    Attributes
    ----------
    proba_ : pd.DataFrame
        Probability predictions for focal locations based on a local model trained
        around the point itself.
    pred_ : pd.Series
        Binary predictions for focal locations based on a local model trained around the
        location itself.
    hat_values_ : pd.Series
        Hat values for each location (diagonal elements of hat matrix)
    effective_df_ : float
        Effective degrees of freedom (sum of hat values)
    log_likelihood_ : float
        Global log likelihood of the model
    aic_ : float
        Akaike information criterion of the model
    aicc_ : float
        Corrected Akaike information criterion to account for model
        complexity (smaller bandwidths)
    bic_ : float
        Bayesian information criterion
    feature_importances_ : pd.DataFrame
        Feature importance values for each local model
    prediction_rate_ : float
        Proportion of models that are fitted, where the rest are skipped due to not
        fulfilling ``min_proportion``.
    left_out_y_ : numpy.ndarray
        Array of ``y`` values left out when ``leave_out`` is set.
    left_out_proba_ : numpy.ndarray
        Array of probabilites on left out observations in local models when
        ``leave_out`` is set.
    left_out_w_ : numpy.ndarray
        Array of weights on left out observations in local models when
        ``leave_out`` is set.
    oob_y_pooled_ : numpy.ndarray
        Pooled out-of-bag (OOB) true labels across all fitted local models.
    oob_pred_pooled_ : numpy.ndarray
        Pooled out-of-bag (OOB) predictions/scores across all fitted local models.

    Examples
    --------
    >>> import geopandas as gpd
    >>> from geodatasets import get_path
    >>> from gwlearn.ensemble import GWRandomForestClassifier

    >>> gdf = gpd.read_file(get_path('geoda.guerry'))
    >>> X = gdf[['Crm_prp', 'Litercy', 'Donatns', 'Lottery']]
    >>> y = gdf["Region"] == 'E'

    >>> gw = GWRandomForestClassifier(
    ...     bandwidth=30,
    ...     fixed=False,
    ...     geometry=gdf.representative_point(),
    ...     random_state=0,
    ... ).fit(X, y)
    >>> gw.pred_.head()
    0    False
    1    False
    2    False
    3     True
    4     True
    dtype: boolean
    """

    def __init__(
        self,
        *,
        bandwidth: float | None = None,
        fixed: bool = False,
        kernel: Literal[
            "triangular",
            "parabolic",
            # "gaussian",
            "bisquare",
            "tricube",
            "cosine",
            "boxcar",
            # "exponential",
        ]
        | Callable = "bisquare",
        include_focal: bool = False,
        geometry: gpd.GeoSeries | None = None,
        graph: graph.Graph | None = None,
        n_jobs: int = -1,
        fit_global_model: bool = True,
        strict: bool | None = False,
        keep_models: bool | str | Path = False,
        temp_folder: str | None = None,
        batch_size: int | None = None,
        min_proportion: float = 0.2,
        undersample: bool | float = False,
        leave_out: float | int | None = None,
        random_state: int | None = None,
        verbose: bool = False,
        **kwargs,
    ):
        super().__init__(
            model=RandomForestClassifier,
            bandwidth=bandwidth,
            fixed=fixed,
            kernel=kernel,
            include_focal=include_focal,
            geometry=geometry,
            graph=graph,
            n_jobs=n_jobs,
            fit_global_model=fit_global_model,
            strict=strict,
            keep_models=keep_models,
            temp_folder=temp_folder,
            batch_size=batch_size,
            min_proportion=min_proportion,
            undersample=undersample,
            leave_out=leave_out,
            random_state=random_state,
            verbose=verbose,
            **kwargs,
        )

        self._model_type = "random_forest"
        self._model_kwargs["oob_score"] = self._get_oob_score_data

        self._empty_score_data = (np.array([]).reshape(-1, 1), np.array([]))

    def _get_oob_score_data(self, true, pred):
        """Callback used by scikit-learn to collect OOB targets/predictions."""
        return true, pred


[docs]
    def fit(self, X: pd.DataFrame, y: pd.Series) -> "GWRandomForestClassifier":
        """Fit geographically weighted random forests.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Binary target encoded as boolean or ``{0, 1}``.

        Returns
        -------
        GWRandomForestClassifier
            Fitted estimator.

        Notes
        -----
        In addition to the base classifier outputs, this method also populates
        ``oob_y_pooled_`` and ``oob_pred_pooled_`` by pooling OOB values across all
        fitted local models.
        """
        self._empty_feature_imp = np.array([np.nan] * (X.shape[1]))
        super().fit(X=X, y=y)

        self._y_local = [x[0] for x in self._score_data]
        self._pred_local = [x[1] for x in self._score_data]

        del self._score_data

        # Filter out empty arrays before concatenation
        non_empty_y = [arr for arr in self._y_local if arr.size > 0]
        non_empty_pred = [arr for arr in self._pred_local if arr.size > 0]
        if non_empty_y:
            self.oob_y_pooled_ = np.concatenate(non_empty_y)
        else:
            # Set to empty array with same dtype as y
            self.oob_y_pooled_ = np.array([], dtype=y.dtype)  # ty:ignore[no-matching-overload]
        if non_empty_pred:
            self.oob_pred_pooled_ = np.concatenate(non_empty_pred)
        else:
            # Set to empty array with float dtype (typical for predictions)
            self.oob_pred_pooled_ = np.array([], dtype=float)

        # feature importances
        self.feature_importances_ = pd.DataFrame(
            self._feature_importances, index=self._names, columns=X.columns
        )

        if self.verbose:
            print(f"{(time() - self._start):.2f}s: Finished")

        return self


    def _get_score_data(self, local_model, X, y):  # noqa: ARG002
        return local_model.oob_score_




[docs]
class GWGradientBoostingClassifier(BaseClassifier):
    """Geographically weighted gradient boosting classifier.

    Fits one :class:`sklearn.ensemble.GradientBoostingClassifier` per focal observation
    using spatially varying sample weights.

    The spatial interaction is defined either by (a) ``geometry`` + bandwidth/kernel
    settings or (b) a precomputed :class:`libpysal.graph.Graph` passed via ``graph``.

    Notes
    -----
    - ``y`` must be binary (``{0, 1}`` or boolean).
    - To enable prediction on new data via :meth:`predict`/:meth:`predict_proba`, you
      must set ``keep_models=True`` (store in memory) or ``keep_models=Path(...)``
      (serialize to disk).
    - Only point geometries are supported.

    Parameters
    ----------
    bandwidth : float | int | None
        Bandwidth for defining neighborhoods.

        - If ``fixed=True``, this is a distance threshold.
        - If ``fixed=False``, this is the number of nearest neighbors used to form the
          local neighborhood.

        If ``graph`` is provided, ``bandwidth`` is ignored.
    fixed : bool, optional
        True for distance based bandwidth and False for adaptive (nearest neighbor)
        bandwidth, by default False
    kernel : str | Callable, optional
        type of kernel function used to weight observations, by default "bisquare"
    include_focal : bool, optional
        Include focal in the local model training. Excluding it allows
        assessment of geographically weighted metrics on unseen data without a need for
        train/test split, hence providing value for all samples. This is needed for
        futher spatial analysis of the model performance (and generalises to models
        that do not support OOB scoring). However, it leaves out the most representative
        sample. By default False
    geometry : gpd.GeoSeries, optional
        Geographic location of the observations in the sample. Used to determine the
        spatial interaction weight based on specification by ``bandwidth``, ``fixed``,
        ``kernel``, and ``include_focal`` keywords.  Either ``geometry`` or ``graph``
        need to be specified. To allow prediction, it is required to specify
        ``geometry``.
    graph : Graph, optional
        Custom libpysal.graph.Graph object encoding the spatial interaction between
        observations in the sample. If given, it is used directly and ``bandwidth``,
        ``fixed``, ``kernel``, and ``include_focal`` keywords are ignored. Either
        ``geometry`` or ``graph`` need to be specified. To allow prediction, it is
        required to specify ``geometry``. Potentially, both can be specified where
        ``graph`` encodes spatial interaction between observations in ``geometry``.
    n_jobs : int, optional
        The number of jobs to run in parallel. ``-1`` means using all processors
        by default ``-1``
    fit_global_model : bool, optional
        Determines if the global baseline model shall be fitted alognside
        the geographically weighted, by default True
    strict : bool | None, optional
        Do not fit any models if at least one neighborhood has invariant ``y``,
        by default False. None is treated as False but provides a warning if there are
        invariant models.
    keep_models : bool | str | Path, optional
        Keep all local models (required for prediction), by default False. Note that
        for some models, like random forests, the objects can be large. If string or
        Path is provided, the local models are not held in memory but serialized to
        the disk from which they are loaded in prediction.
    temp_folder : str | None, optional
        Folder to be used by the pool for memmapping large arrays for sharing memory
        with worker processes, e.g., ``/tmp``. Passed to ``joblib.Parallel``, by default
        None
    batch_size : int | None, optional
        Number of models to process in each batch. Specify batch_size fi your models do
        not fit into memory. By default None
    min_proportion : float, optional
        Minimum proportion of minority class for a model to be fitted, by default 0.2
    undersample : bool | float, optional
        Whether to apply random undersampling to balance classes.

        If ``True``, undersample the majority class to match the minority class
        (i.e., minority/majority ratio = 1.0).

        If a float ``alpha > 0``, target a minority/majority ratio of ``alpha`` after
        resampling, i.e. ``alpha = N_min / N_resampled_majority``.
        By default False
    random_state : int | None, optional
        Random seed for reproducibility, by default None
    verbose : bool, optional
        Whether to print progress information, by default False
    **kwargs
        Additional keyword arguments passed to ``model`` initialisation

    Attributes
    ----------
    proba_ : pd.DataFrame
        Probability predictions for focal locations based on a local model trained
        around the point itself.
    pred_ : pd.Series
        Binary predictions for focal locations based on a local model trained around the
        location itself.
    hat_values_ : pd.Series
        Hat values for each location (diagonal elements of hat matrix)
    effective_df_ : float
        Effective degrees of freedom (sum of hat values)
    log_likelihood_ : float
        Global log likelihood of the model
    aic_ : float
        Akaike inofrmation criterion of the model
    aicc_ : float
        Corrected Akaike information criterion to account to account for model
        complexity (smaller bandwidths)
    bic_ : float
        Bayesian information criterion
    feature_importances_ : pd.DataFrame
        Feature importance values for each local model
    prediction_rate_ : float
        Proportion of models that are fitted, where the rest are skipped due to not
        fulfilling ``min_proportion``.

    Examples
    --------
    >>> import geopandas as gpd
    >>> from geodatasets import get_path
    >>> from gwlearn.ensemble import GWGradientBoostingClassifier

    >>> gdf = gpd.read_file(get_path('geoda.guerry'))
    >>> X = gdf[['Crm_prp', 'Litercy', 'Donatns', 'Lottery']]
    >>> y = gdf["Region"] == 'E'

    >>> gw = GWGradientBoostingClassifier(
    ...     bandwidth=30,
    ...     fixed=False,
    ...     geometry=gdf.representative_point(),
    ...     random_state=0,
    ... ).fit(X, y)
    >>> gw.pred_.head()
    0    False
    1    False
    2    False
    3     True
    4     True
    dtype: boolean
    """

    def __init__(
        self,
        *,
        bandwidth: int | float | None = None,
        fixed: bool = False,
        kernel: Literal[
            "triangular",
            "parabolic",
            # "gaussian",
            "bisquare",
            "tricube",
            "cosine",
            "boxcar",
            # "exponential",
        ]
        | Callable = "bisquare",
        include_focal: bool = False,
        geometry: gpd.GeoSeries | None = None,
        graph: graph.Graph | None = None,
        n_jobs: int = -1,
        fit_global_model: bool = True,
        strict: bool | None = False,
        keep_models: bool | str | Path = False,
        temp_folder: str | None = None,
        batch_size: int | None = None,
        min_proportion: float = 0.2,
        undersample: bool | float = False,
        random_state: int | None = None,
        verbose: bool = False,
        **kwargs,
    ):
        super().__init__(
            model=GradientBoostingClassifier,
            bandwidth=bandwidth,
            fixed=fixed,
            kernel=kernel,
            include_focal=include_focal,
            geometry=geometry,
            graph=graph,
            n_jobs=n_jobs,
            fit_global_model=fit_global_model,
            strict=strict,
            keep_models=keep_models,
            temp_folder=temp_folder,
            batch_size=batch_size,
            min_proportion=min_proportion,
            undersample=undersample,
            random_state=random_state,
            verbose=verbose,
            **kwargs,
        )

        self._model_type = "gradient_boosting"
        self._empty_score_data = np.nan


[docs]
    def fit(self, X: pd.DataFrame, y: pd.Series) -> "GWGradientBoostingClassifier":
        """Fit geographically weighted gradient boosting classifiers.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Binary target encoded as boolean or ``{0, 1}``.

        Returns
        -------
        GWGradientBoostingClassifier
            Fitted estimator.

        Notes
        -----
        Populates ``feature_importances_`` from the fitted local models.
        """
        self._empty_feature_imp = np.array([np.nan] * (X.shape[1]))
        super().fit(X=X, y=y)

        # feature importances
        self.feature_importances_ = pd.DataFrame(
            self._feature_importances, index=self._names, columns=X.columns
        )

        if self.verbose:
            print(f"{(time() - self._start):.2f}s: Finished")

        return self