Skip to content

Utils

Validation

hgp_lib.utils.validation.ComplexityCheck

Create a validity predicate that rejects rules exceeding max_complexity nodes.

Intended for use as the check_valid argument of BooleanGPConfig.

Parameters:

Name Type Description Default
max_complexity int

Maximum allowed node count. Default: 100.

100

Examples:

>>> from hgp_lib.rules import Literal, And
>>> from hgp_lib.utils.validation import ComplexityCheck
>>> check = ComplexityCheck(3)
>>> check(Literal(value=0))
True
>>> check(And([Literal(value=0), Literal(value=1)]))
True
>>> check(And([Literal(value=0), And([Literal(value=1), Literal(value=2)])]))
False

Source code in hgp_lib\utils\validation.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
class ComplexityCheck:
    """
    Callable predicate that accepts only rules whose node count stays at or
    below ``max_complexity``.

    Designed to be passed as the ``check_valid`` argument of
    ``BooleanGPConfig``.

    Args:
        max_complexity (int):
            Largest node count a rule may have and still be considered
            valid. Default: `100`.

    Examples:
    >>> from hgp_lib.rules import Literal, And
    >>> from hgp_lib.utils.validation import ComplexityCheck
    >>> check = ComplexityCheck(3)
    >>> check(Literal(value=0))
    True
    >>> check(And([Literal(value=0), Literal(value=1)]))
    True
    >>> check(And([Literal(value=0), And([Literal(value=1), Literal(value=2)])]))
    False
    """

    def __init__(self, max_complexity: int = 100):
        # Upper bound on the number of nodes a rule may contain.
        self.max_complexity = max_complexity

    def __call__(self, rule: Rule) -> bool:
        """
        Report whether the rule's complexity (node count) is within the limit.

        Args:
            rule (Rule): The rule to check.

        Returns:
            bool: ``True`` if ``len(rule) <= self.max_complexity``.

        Examples:
            >>> from hgp_lib.rules import Literal, And
            >>> from hgp_lib.utils.validation import ComplexityCheck
            >>> ComplexityCheck(5)(Literal(value=0))
            True
            >>> ComplexityCheck(2)(And([Literal(value=0), Literal(value=1)]))
            False
        """
        node_count = len(rule)
        return node_count <= self.max_complexity

__call__(rule)

Check if rule complexity (node count) is within a limit.

Parameters:

Name Type Description Default
rule Rule

The rule to check.

required

Returns:

Name Type Description
bool bool

True if len(rule) <= self.max_complexity.

Examples:

>>> from hgp_lib.rules import Literal, And
>>> from hgp_lib.utils.validation import ComplexityCheck
>>> ComplexityCheck(5)(Literal(value=0))
True
>>> ComplexityCheck(2)(And([Literal(value=0), Literal(value=1)]))
False
Source code in hgp_lib\utils\validation.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def __call__(self, rule: Rule) -> bool:
    """
    Report whether the rule's complexity (node count) is within the limit.

    Args:
        rule (Rule): The rule to check.

    Returns:
        bool: ``True`` if ``len(rule) <= self.max_complexity``.

    Examples:
        >>> from hgp_lib.rules import Literal, And
        >>> from hgp_lib.utils.validation import ComplexityCheck
        >>> ComplexityCheck(5)(Literal(value=0))
        True
        >>> ComplexityCheck(2)(And([Literal(value=0), Literal(value=1)]))
        False
    """
    # A rule is valid when its node count does not exceed the stored bound.
    return self.max_complexity >= len(rule)

hgp_lib.utils.validation.validate_callable(maybe_callable, error_message=None)

Validate that a value is callable.

Parameters:

Name Type Description Default
maybe_callable Callable

Value to check.

required
error_message str | None

Optional custom error message. Default: None.

None

Raises:

Type Description
TypeError

If value is not callable.

Examples:

>>> from hgp_lib.utils.validation import validate_callable
>>> validate_callable(len)  # no error
>>> validate_callable(42)
Traceback (most recent call last):
...
TypeError: score_fn must be callable, is <class 'int'>
Source code in hgp_lib\utils\validation.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def validate_callable(maybe_callable: Callable, error_message: str | None = None):
    """
    Validate that a value is callable.

    Args:
        maybe_callable (Callable): Value to check.
        error_message (str | None): Optional custom error message. Default: `None`.

    Raises:
        TypeError: If value is not callable.

    Examples:
        >>> from hgp_lib.utils.validation import validate_callable
        >>> validate_callable(len)  # no error
        >>> validate_callable(42)
        Traceback (most recent call last):
        ...
        TypeError: score_fn must be callable, is <class 'int'>
    """
    if not callable(maybe_callable):
        if error_message is None:
            error_message = f"score_fn must be callable, is {type(maybe_callable)}"
        raise TypeError(error_message)

hgp_lib.utils.validation.check_isinstance(value, expected_type)

Check that a value is an instance of expected type(s).

Parameters:

Name Type Description Default
value Any

Value to check.

required
expected_type Type | Tuple[Type, ...]

Expected type or tuple of types.

required

Raises:

Type Description
TypeError

If value is not an instance of expected type.

Source code in hgp_lib\utils\validation.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def check_isinstance(value: Any, expected_type: Type | Tuple[Type, ...]):
    """
    Check that a value is an instance of expected type(s).

    Args:
        value (Any): Value to check.
        expected_type (Type | Tuple[Type, ...]): Expected type or tuple of types.

    Raises:
        TypeError: If value is not an instance of expected type.
    """
    if not isinstance(value, expected_type):
        name = "<unknown value>"
        # Search the name in the caller
        frame = inspect.currentframe()
        if frame is not None:
            frame = frame.f_back
            for var_name, var_val in {**frame.f_locals, **frame.f_globals}.items():
                if var_val is value:
                    name = var_name
                    break
        if isinstance(expected_type, tuple):
            expected_type = " or ".join([str(t) for t in expected_type])
        else:
            expected_type = str(expected_type)
        raise TypeError(
            f"{name} should be of type {expected_type}, but is {type(value)}"
        )

hgp_lib.utils.validation.validate_num_literals(num_literals)

Validate num_literals parameter.

Parameters:

Name Type Description Default
num_literals int

Number of literals (must be > 1).

required

Raises:

Type Description
TypeError

If not an integer.

ValueError

If <= 1.

Examples:

>>> from hgp_lib.utils.validation import validate_num_literals
>>> validate_num_literals(5)  # no error
>>> validate_num_literals(1)
Traceback (most recent call last):
...
ValueError: Number of literals must be greater than 1, is '1'
Source code in hgp_lib\utils\validation.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def validate_num_literals(num_literals: int):
    """
    Validate the ``num_literals`` parameter.

    Args:
        num_literals (int): Number of literals (must be > 1).

    Raises:
        TypeError: If ``num_literals`` is not an integer.
        ValueError: If ``num_literals`` is <= 1.

    Examples:
        >>> from hgp_lib.utils.validation import validate_num_literals
        >>> validate_num_literals(5)  # no error
        >>> validate_num_literals(1)
        Traceback (most recent call last):
        ...
        ValueError: Number of literals must be greater than 1, is '1'
    """
    # Type check first so non-ints raise TypeError, not ValueError.
    check_isinstance(num_literals, int)
    if num_literals > 1:
        return
    raise ValueError(f"Number of literals must be greater than 1, is '{num_literals}'")

hgp_lib.utils.validation.validate_operator_types(operator_types)

Validate operator_types parameter.

Parameters:

Name Type Description Default
operator_types Sequence[Type[Rule]]

Sequence of Rule subclasses.

required

Raises:

Type Description
TypeError

If not a sequence or contains non-Rule types.

ValueError

If fewer than 2 types.

Examples:

>>> from hgp_lib.rules import And, Or
>>> from hgp_lib.utils.validation import validate_operator_types
>>> validate_operator_types((And, Or))  # no error
>>> validate_operator_types((And,))
Traceback (most recent call last):
...
ValueError: operator_types must have at least two operator types
Source code in hgp_lib\utils\validation.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def validate_operator_types(operator_types: Sequence[Type[Rule]]):
    """
    Validate the ``operator_types`` parameter.

    Args:
        operator_types (Sequence[Type[Rule]]): Sequence of Rule subclasses.

    Raises:
        TypeError: If not a sequence or if it contains non-Rule types.
        ValueError: If fewer than two types are given.

    Examples:
        >>> from hgp_lib.rules import And, Or
        >>> from hgp_lib.utils.validation import validate_operator_types
        >>> validate_operator_types((And, Or))  # no error
        >>> validate_operator_types((And,))
        Traceback (most recent call last):
        ...
        ValueError: operator_types must have at least two operator types
    """
    check_isinstance(operator_types, Sequence)
    if len(operator_types) < 2:
        raise ValueError("operator_types must have at least two operator types")
    for candidate in operator_types:
        if issubclass(candidate, Rule):
            continue
        raise TypeError(
            f"All operator types must be subclassing Rule. Found '{type(candidate)}'"
        )

hgp_lib.utils.validation.check_X_y(X, y, x_type=np.ndarray)

Validate input data and labels.

Checks that X is an instance of x_type, y is a numpy array, both are non-None, non-empty, 2-D/1-D respectively, and have the same number of samples.

Parameters:

Name Type Description Default
X ndarray | DataFrame

Input data.

required
y ndarray

Target labels (1-D).

required
x_type Type[ndarray] | Type[DataFrame]

Expected type for X. Default: np.ndarray.

ndarray

Raises:

Type Description
ValueError

If X or y is None, empty, or have mismatched lengths.

TypeError

If X is not an instance of x_type or y is not an ndarray.

Source code in hgp_lib\utils\validation.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def check_X_y(
    X: np.ndarray | pd.DataFrame,
    y: np.ndarray,
    x_type: Type[np.ndarray] | Type[pd.DataFrame] = np.ndarray,
):
    """
    Validate input data and labels.

    Verifies that X is an instance of `x_type`, y is a numpy array, both are
    non-None and non-empty, X is 2-D, y is 1-D, and both hold the same number
    of samples.

    Args:
        X (np.ndarray | pd.DataFrame): Input data.
        y (np.ndarray): Target labels (1-D).
        x_type (Type[np.ndarray] | Type[pd.DataFrame]): Expected type for X.
            Default: `np.ndarray`.

    Raises:
        ValueError: If X or y is None, empty, or they have mismatched lengths.
        TypeError: If X is not an instance of `x_type` or y is not an ndarray.
    """
    # None checks come first; everything below assumes real objects.
    if X is None:
        raise ValueError("X (data) cannot be None")
    if y is None:
        raise ValueError("y (labels) cannot be None")

    check_isinstance(X, x_type)
    check_isinstance(y, np.ndarray)

    n_samples, n_labels = len(X), len(y)
    if n_samples != n_labels:
        raise ValueError(
            f"X and y must have the same length. Got X={n_samples}, y={n_labels}"
        )
    if n_samples == 0:
        raise ValueError("X and y cannot be empty")
    if X.ndim != 2:
        raise ValueError(f"X must be 2D array (samples, features), got shape {X.shape}")
    if y.ndim != 1:
        raise ValueError(f"y must be 1D array (samples), got shape {y.shape}")

Metrics

hgp_lib.utils.metrics.confusion_matrix(y_pred, y_true, sample_weight=None)

Compute confusion matrix values from boolean prediction and label arrays.

Parameters:

Name Type Description Default
y_pred ndarray

Boolean predictions.

required
y_true ndarray

Boolean ground-truth labels.

required
sample_weight ndarray | None

Optional per-sample weights. Default: None.

None

Returns:

Type Description
Tuple[int, int, int, int]

Tuple[int, int, int, int]: (tp, fp, fn, tn).

Examples:

>>> import numpy as np
>>> from hgp_lib.utils.metrics import confusion_matrix
>>> y_pred = np.array([True, True, False, False])
>>> y_true = np.array([True, False, True, False])
>>> confusion_matrix(y_pred, y_true)
(1, 1, 1, 1)
Source code in hgp_lib\utils\metrics.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def confusion_matrix(
    y_pred: np.ndarray, y_true: np.ndarray, sample_weight: np.ndarray | None = None
) -> Tuple[int, int, int, int]:
    """
    Compute confusion matrix values from boolean prediction and label arrays.

    Args:
        y_pred (np.ndarray):
            Boolean predictions.
        y_true (np.ndarray):
            Boolean ground-truth labels.
        sample_weight (np.ndarray | None):
            Optional per-sample weights. Default: `None`.

    Returns:
        Tuple[int, int, int, int]: ``(tp, fp, fn, tn)``.

    Examples:
        >>> import numpy as np
        >>> from hgp_lib.utils.metrics import confusion_matrix
        >>> y_pred = np.array([True, True, False, False])
        >>> y_true = np.array([True, False, True, False])
        >>> confusion_matrix(y_pred, y_true)
        (1, 1, 1, 1)
    """
    if sample_weight is None:
        tp = (y_pred & y_true).sum()
        fp = (y_pred & ~y_true).sum()
        total_true = y_true.sum()
        fn = total_true - tp
        tn = len(y_pred) - total_true - fp
    else:
        tp = ((y_pred & y_true) * sample_weight).sum()
        fp = ((y_pred & ~y_true) * sample_weight).sum()
        total_true = (y_true * sample_weight).sum()
        fn = total_true - tp
        tn = sample_weight.sum() - total_true - fp
    return int(tp), int(fp), int(fn), int(tn)

hgp_lib.utils.metrics.fast_f1_score(y_pred, y_true, sample_weight=None)

Compute F1 score with optional sample weights.

This function supports the optimize_scorer feature of BooleanGP by accepting a sample_weight parameter. It is optimized for boolean arrays.

Parameters:

Name Type Description Default
y_pred ndarray

Boolean predictions array.

required
y_true ndarray

True labels array.

required
sample_weight ndarray | None

Optional sample weights for weighted F1.

None

Returns:

Type Description
float

F1 score as float in [0, 1].

Examples:

>>> import numpy as np
>>> from hgp_lib.utils.metrics import fast_f1_score
>>> y_pred = np.array([True, True, False, False])
>>> y_true = np.array([True, False, False, True])
>>> fast_f1_score(y_pred, y_true)
0.5
Source code in hgp_lib\utils\metrics.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def fast_f1_score(
    y_pred: ndarray,
    y_true: ndarray,
    sample_weight: ndarray | None = None,
) -> float:
    """
    Compute F1 score with optional sample weights.

    This function supports the optimize_scorer feature of BooleanGP
    by accepting sample_weight parameter. It's optimized for boolean arrays.

    Args:
        y_pred: Boolean predictions array.
        y_true: True labels array.
        sample_weight: Optional sample weights for weighted F1.

    Returns:
        F1 score as float in [0, 1].

    Examples:
        >>> import numpy as np
        >>> from hgp_lib.utils.metrics import fast_f1_score
        >>> y_pred = np.array([True, True, False, False])
        >>> y_true = np.array([True, False, False, True])
        >>> fast_f1_score(y_pred, y_true)
        0.5
    """
    if sample_weight is None:
        y_pred_sum = y_pred.sum()
        y_true_sum = y_true.sum()
        if y_pred_sum == 0 or y_true_sum == 0:
            return 1.0 if y_pred_sum == 0 and y_true_sum == 0 else 0.0
        return float(2 * (y_pred & y_true).sum() / (y_pred_sum + y_true_sum))

    y_pred_sum = np.dot(y_pred, sample_weight)
    y_true_sum = np.dot(y_true, sample_weight)
    if y_pred_sum == 0 or y_true_sum == 0:
        return 1.0 if y_pred_sum == 0 and y_true_sum == 0 else 0.0
    return float(2 * np.dot(y_pred & y_true, sample_weight) / (y_pred_sum + y_true_sum))

hgp_lib.utils.metrics.accepts_sample_weight(scorer)

Check if a scorer function accepts a sample_weight parameter.

Inspects the function signature first; falls back to a runtime probe if signature inspection fails.

Parameters:

Name Type Description Default
scorer Callable

The scoring function to check.

required

Returns:

Name Type Description
bool bool

True if the scorer accepts sample_weight.

Examples:

>>> from hgp_lib.utils.metrics import accepts_sample_weight
>>> def with_sw(p, l, sample_weight=None): return 0.0
>>> accepts_sample_weight(with_sw)
True
>>> def without_sw(p, l): return 0.0
>>> accepts_sample_weight(without_sw)
False
Source code in hgp_lib\utils\metrics.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def accepts_sample_weight(scorer: Callable) -> bool:
    """
    Check if a scorer function accepts a ``sample_weight`` parameter.

    Inspects the function signature; only when signature inspection is
    unavailable (some builtins and C extensions) does it fall back to a
    runtime probe call with dummy data. This avoids executing arbitrary,
    possibly side-effectful scorers whenever the signature already answers
    the question, and it prevents non-``TypeError`` exceptions raised inside
    a probed scorer from escaping this predicate.

    Args:
        scorer (Callable):
            The scoring function to check.

    Returns:
        bool: ``True`` if the scorer accepts ``sample_weight``.

    Examples:
        >>> from hgp_lib.utils.metrics import accepts_sample_weight
        >>> def with_sw(p, l, sample_weight=None): return 0.0
        >>> accepts_sample_weight(with_sw)
        True
        >>> def without_sw(p, l): return 0.0
        >>> accepts_sample_weight(without_sw)
        False
    """
    try:
        sig = inspect.signature(scorer)
    except (TypeError, ValueError):
        # Signature unavailable: probe with a tiny dummy dataset. A scorer
        # that rejects the keyword raises TypeError at call time.
        try:
            labels = np.array([1, 0, 1], dtype=bool)
            count = np.array([2, 1, 1])
            scorer(labels, labels, sample_weight=count)
            return True
        except TypeError:
            return False

    for param in sig.parameters.values():
        if param.name == "sample_weight":
            return True
        # A **kwargs catch-all also accepts sample_weight as a keyword.
        if param.kind is inspect.Parameter.VAR_KEYWORD:
            return True
    return False

hgp_lib.utils.metrics.transform_duplicates_to_sample_weight(data, labels)

Remove duplicate rows from (data, labels) and return sample weights.

Rows that appear multiple times are collapsed into a single row with a weight equal to the original count.

Parameters:

Name Type Description Default
data ndarray

2-D input data.

required
labels ndarray

1-D label array (same length as data).

required

Returns:

Type Description

Tuple[ndarray, ndarray, ndarray]: (unique_data, unique_labels, sample_weights).

Examples:

>>> import numpy as np
>>> from hgp_lib.utils.metrics import transform_duplicates_to_sample_weight
>>> data = np.array([[1, 0], [1, 0], [0, 1]])
>>> labels = np.array([1, 1, 0])
>>> ud, ul, sw = transform_duplicates_to_sample_weight(data, labels)
>>> len(ud) < len(data)
True
>>> bool(sw.sum() == len(data))
True
Source code in hgp_lib\utils\metrics.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def transform_duplicates_to_sample_weight(data: ndarray, labels: ndarray):
    """
    Remove duplicate rows from ``(data, labels)`` and return sample weights.

    Rows that appear multiple times are collapsed into a single row with a
    weight equal to the original count.

    Args:
        data (ndarray):
            2-D input data.
        labels (ndarray):
            1-D label array (same length as ``data``).

    Returns:
        Tuple[ndarray, ndarray, ndarray]: ``(unique_data, unique_labels, sample_weights)``.

    Examples:
        >>> import numpy as np
        >>> from hgp_lib.utils.metrics import transform_duplicates_to_sample_weight
        >>> data = np.array([[1, 0], [1, 0], [0, 1]])
        >>> labels = np.array([1, 1, 0])
        >>> ud, ul, sw = transform_duplicates_to_sample_weight(data, labels)
        >>> len(ud) < len(data)
        True
        >>> bool(sw.sum() == len(data))
        True
    """
    # Append the label as a trailing column so identical (row, label) pairs
    # collapse together; np.unique then counts each distinct combined row.
    stacked = np.hstack((data, labels[:, None]))
    unique_rows, sample_weight = np.unique(stacked, axis=0, return_counts=True)
    unique_data = unique_rows[:, :-1]
    unique_labels = unique_rows[:, -1]
    return unique_data, unique_labels, sample_weight

hgp_lib.utils.metrics.optimize_scorers_for_data(*scorers, data, labels)

Optimise scorers by deduplicating data and binding sample_weight.

If every scorer accepts sample_weight, duplicate rows are removed and each scorer is wrapped with SampleWeightScorer to inject the computed weights. Otherwise a warning is issued (once per scorer) and the original data is returned unchanged.

Parameters:

Name Type Description Default
*scorers Callable[[ndarray, ndarray], Any]

One or more scoring functions.

()
data ndarray

2-D input data.

required
labels ndarray

1-D label array.

required

Returns:

Name Type Description
Tuple

(*optimised_scorers, data, labels).

Examples:

>>> import numpy as np
>>> from hgp_lib.utils.metrics import optimize_scorers_for_data
>>> def acc(p, l, sample_weight=None): return float((p == l).mean())
>>> data = np.array([[1, 0], [1, 0], [0, 1]])
>>> labels = np.array([1, 1, 0])
>>> opt_acc, opt_data, opt_labels = optimize_scorers_for_data(acc, data=data, labels=labels)
>>> len(opt_data) <= len(data)
True
Source code in hgp_lib\utils\metrics.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
def optimize_scorers_for_data(
    *scorers: Callable[[ndarray, ndarray], Any], data: ndarray, labels: ndarray
):
    """
    Optimise scorers by deduplicating data and binding ``sample_weight``.

    When every scorer accepts ``sample_weight``, duplicate rows are collapsed
    and each scorer is wrapped in ``SampleWeightScorer`` so the computed
    weights are injected automatically. If any scorer lacks support, a
    warning is emitted (at most once per scorer) and the inputs are returned
    untouched.

    Args:
        *scorers (Callable[[ndarray, ndarray], Any]):
            One or more scoring functions.
        data (ndarray):
            2-D input data.
        labels (ndarray):
            1-D label array.

    Returns:
        Tuple: ``(*optimised_scorers, data, labels)``.

    Examples:
        >>> import numpy as np
        >>> from hgp_lib.utils.metrics import optimize_scorers_for_data
        >>> def acc(p, l, sample_weight=None): return float((p == l).mean())
        >>> data = np.array([[1, 0], [1, 0], [0, 1]])
        >>> labels = np.array([1, 1, 0])
        >>> opt_acc, opt_data, opt_labels = optimize_scorers_for_data(acc, data=data, labels=labels)
        >>> len(opt_data) <= len(data)
        True
    """
    all_support_weights = True
    for scorer in scorers:
        validate_callable(scorer)
        if accepts_sample_weight(scorer):
            continue
        all_support_weights = False
        # Warn at most once per scorer function to avoid repeated warnings.
        scorer_id = id(scorer)
        if scorer_id in _warned_scorers:
            continue
        _warned_scorers.add(scorer_id)
        warnings.warn(
            'The scorer must accept "sample_weight" to be optimized by '
            "removing duplicates in the data. Scorer optimization is disabled "
            "for this scorer.",
            stacklevel=2,
        )
    if all_support_weights:
        data, labels, sample_weight = transform_duplicates_to_sample_weight(
            data, labels
        )
        scorers = [SampleWeightScorer(scorer, sample_weight) for scorer in scorers]
    return *scorers, data, labels