Skip to content

Utils

Validation

hgp_lib.utils.validation.ComplexityCheck

Create a validity predicate that rejects rules exceeding max_complexity nodes.

Intended for use as the check_valid argument of BooleanGPConfig.

Parameters:

Name Type Description Default
max_complexity int

Maximum allowed node count. Default: 100.

100

Examples:

>>> from hgp_lib.rules import Literal, And
>>> from hgp_lib.utils.validation import ComplexityCheck
>>> check = ComplexityCheck(3)
>>> check(Literal(value=0))
True
>>> check(And([Literal(value=0), Literal(value=1)]))
True
>>> check(And([Literal(value=0), And([Literal(value=1), Literal(value=2)])]))
False

Source code in hgp_lib\utils\validation.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
class ComplexityCheck:
    """
    Callable predicate that accepts only rules whose node count stays at or
    below ``max_complexity``.

    Designed to be passed as the ``check_valid`` argument of
    ``BooleanGPConfig``.

    Args:
        max_complexity (int):
            Largest node count a rule may have and still be considered
            valid. Default: `100`.

    Examples:
    >>> from hgp_lib.rules import Literal, And
    >>> from hgp_lib.utils.validation import ComplexityCheck
    >>> check = ComplexityCheck(3)
    >>> check(Literal(value=0))
    True
    >>> check(And([Literal(value=0), Literal(value=1)]))
    True
    >>> check(And([Literal(value=0), And([Literal(value=1), Literal(value=2)])]))
    False
    """

    def __init__(self, max_complexity: int = 100):
        # Upper bound on the number of nodes a rule may contain.
        self.max_complexity = max_complexity

    def __call__(self, rule: Rule) -> bool:
        """
        Report whether the rule's complexity (node count) is within the limit.

        Args:
            rule (Rule): The rule to check.

        Returns:
            bool: ``True`` if ``len(rule) <= self.max_complexity``.

        Examples:
            >>> from hgp_lib.rules import Literal, And
            >>> from hgp_lib.utils.validation import ComplexityCheck
            >>> ComplexityCheck(5)(Literal(value=0))
            True
            >>> ComplexityCheck(2)(And([Literal(value=0), Literal(value=1)]))
            False
        """
        node_count = len(rule)
        return node_count <= self.max_complexity

__call__(rule)

Check if rule complexity (node count) is within a limit.

Parameters:

Name Type Description Default
rule Rule

The rule to check.

required

Returns:

Name Type Description
bool bool

True if len(rule) <= self.max_complexity.

Examples:

>>> from hgp_lib.rules import Literal, And
>>> from hgp_lib.utils.validation import ComplexityCheck
>>> ComplexityCheck(5)(Literal(value=0))
True
>>> ComplexityCheck(2)(And([Literal(value=0), Literal(value=1)]))
False
Source code in hgp_lib\utils\validation.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def __call__(self, rule: Rule) -> bool:
    """
    Report whether the rule's complexity (node count) is within the limit.

    Args:
        rule (Rule): The rule to check.

    Returns:
        bool: ``True`` if ``len(rule) <= self.max_complexity``.

    Examples:
        >>> from hgp_lib.rules import Literal, And
        >>> from hgp_lib.utils.validation import ComplexityCheck
        >>> ComplexityCheck(5)(Literal(value=0))
        True
        >>> ComplexityCheck(2)(And([Literal(value=0), Literal(value=1)]))
        False
    """
    # A rule is valid when its node count does not exceed the stored bound.
    return self.max_complexity >= len(rule)

hgp_lib.utils.validation.validate_callable(maybe_callable, error_message=None)

Validate that a value is callable.

Parameters:

Name Type Description Default
maybe_callable Callable

Value to check.

required
error_message str | None

Optional custom error message. Default: None.

None

Raises:

Type Description
TypeError

If value is not callable.

Examples:

>>> from hgp_lib.utils.validation import validate_callable
>>> validate_callable(len)  # no error
>>> validate_callable(42)
Traceback (most recent call last):
...
TypeError: score_fn must be callable, is <class 'int'>
Source code in hgp_lib\utils\validation.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def validate_callable(maybe_callable: Callable, error_message: str | None = None):
    """
    Validate that a value is callable.

    Args:
        maybe_callable (Callable): Value to check.
        error_message (str | None): Optional custom error message. Default: `None`.

    Raises:
        TypeError: If value is not callable.

    Examples:
        >>> from hgp_lib.utils.validation import validate_callable
        >>> validate_callable(len)  # no error
        >>> validate_callable(42)
        Traceback (most recent call last):
        ...
        TypeError: score_fn must be callable, is <class 'int'>
    """
    if not callable(maybe_callable):
        if error_message is None:
            error_message = f"score_fn must be callable, is {type(maybe_callable)}"
        raise TypeError(error_message)

hgp_lib.utils.validation.check_isinstance(value, expected_type)

Check that a value is an instance of expected type(s).

Parameters:

Name Type Description Default
value Any

Value to check.

required
expected_type Type | Tuple[Type, ...]

Expected type or tuple of types.

required

Raises:

Type Description
TypeError

If value is not an instance of expected type.

Source code in hgp_lib\utils\validation.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def check_isinstance(value: Any, expected_type: Type | Tuple[Type, ...]):
    """
    Check that a value is an instance of expected type(s).

    Args:
        value (Any): Value to check.
        expected_type (Type | Tuple[Type, ...]): Expected type or tuple of types.

    Raises:
        TypeError: If value is not an instance of expected type.
    """
    if not isinstance(value, expected_type):
        name = "<unknown value>"
        # Search the name in the caller
        frame = inspect.currentframe()
        if frame is not None:
            frame = frame.f_back
            for var_name, var_val in {**frame.f_locals, **frame.f_globals}.items():
                if var_val is value:
                    name = var_name
                    break
        if isinstance(expected_type, tuple):
            expected_type = " or ".join([str(t) for t in expected_type])
        else:
            expected_type = str(expected_type)
        raise TypeError(
            f"{name} should be of type {expected_type}, but is {type(value)}"
        )

hgp_lib.utils.validation.validate_num_literals(num_literals)

Validate num_literals parameter.

Parameters:

Name Type Description Default
num_literals int

Number of literals (must be > 1).

required

Raises:

Type Description
TypeError

If not an integer.

ValueError

If <= 1.

Examples:

>>> from hgp_lib.utils.validation import validate_num_literals
>>> validate_num_literals(5)  # no error
>>> validate_num_literals(1)
Traceback (most recent call last):
...
ValueError: Number of literals must be greater than 1, is '1'
Source code in hgp_lib\utils\validation.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def validate_num_literals(num_literals: int):
    """
    Validate the ``num_literals`` parameter.

    Args:
        num_literals (int): Number of literals (must be > 1).

    Raises:
        TypeError: If ``num_literals`` is not an integer.
        ValueError: If ``num_literals`` is <= 1.

    Examples:
        >>> from hgp_lib.utils.validation import validate_num_literals
        >>> validate_num_literals(5)  # no error
        >>> validate_num_literals(1)
        Traceback (most recent call last):
        ...
        ValueError: Number of literals must be greater than 1, is '1'
    """
    # Type check first so non-ints raise TypeError, not ValueError.
    check_isinstance(num_literals, int)
    if num_literals > 1:
        return
    raise ValueError(f"Number of literals must be greater than 1, is '{num_literals}'")

hgp_lib.utils.validation.validate_operator_types(operator_types)

Validate operator_types parameter.

Parameters:

Name Type Description Default
operator_types Sequence[Type[Rule]]

Sequence of Rule subclasses.

required

Raises:

Type Description
TypeError

If not a sequence or contains non-Rule types.

ValueError

If fewer than 2 types.

Examples:

>>> from hgp_lib.rules import And, Or
>>> from hgp_lib.utils.validation import validate_operator_types
>>> validate_operator_types((And, Or))  # no error
>>> validate_operator_types((And,))
Traceback (most recent call last):
...
ValueError: operator_types must have at least two operator types
Source code in hgp_lib\utils\validation.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def validate_operator_types(operator_types: Sequence[Type[Rule]]):
    """
    Validate the ``operator_types`` parameter.

    Args:
        operator_types (Sequence[Type[Rule]]): Sequence of Rule subclasses.

    Raises:
        TypeError: If not a sequence or if it contains non-Rule types.
        ValueError: If fewer than two types are given.

    Examples:
        >>> from hgp_lib.rules import And, Or
        >>> from hgp_lib.utils.validation import validate_operator_types
        >>> validate_operator_types((And, Or))  # no error
        >>> validate_operator_types((And,))
        Traceback (most recent call last):
        ...
        ValueError: operator_types must have at least two operator types
    """
    check_isinstance(operator_types, Sequence)
    if len(operator_types) < 2:
        raise ValueError("operator_types must have at least two operator types")
    for candidate in operator_types:
        if issubclass(candidate, Rule):
            continue
        raise TypeError(
            f"All operator types must be subclassing Rule. Found '{type(candidate)}'"
        )

hgp_lib.utils.validation.check_X_y(X, y, x_type=np.ndarray)

Validate input data and labels.

Checks that X is an instance of x_type, y is a numpy array, both are non-None, non-empty, 2-D/1-D respectively, and have the same number of samples.

Parameters:

Name Type Description Default
X ndarray | DataFrame

Input data.

required
y ndarray

Target labels (1-D).

required
x_type Type[ndarray] | Type[DataFrame]

Expected type for X. Default: np.ndarray.

ndarray

Raises:

Type Description
ValueError

If X or y is None, empty, or have mismatched lengths.

TypeError

If X is not an instance of x_type or y is not an ndarray.

Source code in hgp_lib\utils\validation.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def check_X_y(
    X: np.ndarray | pd.DataFrame,
    y: np.ndarray,
    x_type: Type[np.ndarray] | Type[pd.DataFrame] = np.ndarray,
):
    """
    Validate input data and labels.

    Verifies that X is an instance of `x_type`, y is a numpy array, both are
    non-None and non-empty, X is 2-D, y is 1-D, and both hold the same number
    of samples.

    Args:
        X (np.ndarray | pd.DataFrame): Input data.
        y (np.ndarray): Target labels (1-D).
        x_type (Type[np.ndarray] | Type[pd.DataFrame]): Expected type for X.
            Default: `np.ndarray`.

    Raises:
        ValueError: If X or y is None, empty, or they have mismatched lengths.
        TypeError: If X is not an instance of `x_type` or y is not an ndarray.
    """
    # None checks come first; everything below assumes real objects.
    if X is None:
        raise ValueError("X (data) cannot be None")
    if y is None:
        raise ValueError("y (labels) cannot be None")

    check_isinstance(X, x_type)
    check_isinstance(y, np.ndarray)

    n_samples, n_labels = len(X), len(y)
    if n_samples != n_labels:
        raise ValueError(
            f"X and y must have the same length. Got X={n_samples}, y={n_labels}"
        )
    if n_samples == 0:
        raise ValueError("X and y cannot be empty")
    if X.ndim != 2:
        raise ValueError(f"X must be 2D array (samples, features), got shape {X.shape}")
    if y.ndim != 1:
        raise ValueError(f"y must be 1D array (samples), got shape {y.shape}")

Metrics

hgp_lib.utils.metrics.confusion_matrix(y_pred, y_true, sample_weight=None)

Compute confusion matrix values from boolean prediction and label arrays.

Parameters:

Name Type Description Default
y_pred ndarray

Boolean predictions.

required
y_true ndarray

Boolean ground-truth labels.

required
sample_weight ndarray | None

Optional per-sample weights. Default: None.

None

Returns:

Type Description
Tuple[int, int, int, int]

Tuple[int, int, int, int]: (tp, fp, fn, tn).

Examples:

>>> import numpy as np
>>> from hgp_lib.utils.metrics import confusion_matrix
>>> y_pred = np.array([True, True, False, False])
>>> y_true = np.array([True, False, True, False])
>>> confusion_matrix(y_pred, y_true)
(1, 1, 1, 1)
Source code in hgp_lib\utils\metrics.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def confusion_matrix(
    y_pred: np.ndarray, y_true: np.ndarray, sample_weight: np.ndarray | None = None
) -> Tuple[int, int, int, int]:
    """
    Compute confusion matrix values from boolean prediction and label arrays.

    Args:
        y_pred (np.ndarray):
            Boolean predictions.
        y_true (np.ndarray):
            Boolean ground-truth labels.
        sample_weight (np.ndarray | None):
            Optional per-sample weights. Default: `None`.

    Returns:
        Tuple[int, int, int, int]: ``(tp, fp, fn, tn)``.

    Examples:
        >>> import numpy as np
        >>> from hgp_lib.utils.metrics import confusion_matrix
        >>> y_pred = np.array([True, True, False, False])
        >>> y_true = np.array([True, False, True, False])
        >>> confusion_matrix(y_pred, y_true)
        (1, 1, 1, 1)
    """
    if sample_weight is None:
        tp = (y_pred & y_true).sum()
        fp = (y_pred & ~y_true).sum()
        total_true = y_true.sum()
        fn = total_true - tp
        tn = len(y_pred) - total_true - fp
    else:
        tp = ((y_pred & y_true) * sample_weight).sum()
        fp = ((y_pred & ~y_true) * sample_weight).sum()
        total_true = (y_true * sample_weight).sum()
        fn = total_true - tp
        tn = sample_weight.sum() - total_true - fp
    return int(tp), int(fp), int(fn), int(tn)

hgp_lib.utils.metrics.fast_f1_score(y_pred, y_true, sample_weight=None)

Compute F1 score with optional sample weights.

This function supports the optimize_scorer feature of BooleanGP by accepting a sample_weight parameter. It is optimized for boolean arrays.

Parameters:

Name Type Description Default
y_pred ndarray

Boolean predictions array.

required
y_true ndarray

True labels array.

required
sample_weight ndarray | None

Optional sample weights for weighted F1.

None

Returns:

Type Description
float

F1 score as float in [0, 1].

Examples:

>>> import numpy as np
>>> from hgp_lib.utils.metrics import fast_f1_score
>>> y_pred = np.array([True, True, False, False])
>>> y_true = np.array([True, False, False, True])
>>> fast_f1_score(y_pred, y_true)
0.5
Source code in hgp_lib\utils\metrics.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def fast_f1_score(
    y_pred: ndarray,
    y_true: ndarray,
    sample_weight: ndarray | None = None,
) -> float:
    """
    Compute F1 score with optional sample weights.

    This function supports the optimize_scorer feature of BooleanGP
    by accepting sample_weight parameter. It's optimized for boolean arrays.

    Args:
        y_pred: Boolean predictions array.
        y_true: True labels array.
        sample_weight: Optional sample weights for weighted F1.

    Returns:
        F1 score as float in [0, 1].

    Examples:
        >>> import numpy as np
        >>> from hgp_lib.utils.metrics import fast_f1_score
        >>> y_pred = np.array([True, True, False, False])
        >>> y_true = np.array([True, False, False, True])
        >>> fast_f1_score(y_pred, y_true)
        0.5
    """
    if sample_weight is None:
        y_pred_sum = y_pred.sum()
        y_true_sum = y_true.sum()
        if y_pred_sum == 0 or y_true_sum == 0:
            return 1.0 if y_pred_sum == 0 and y_true_sum == 0 else 0.0
        return float(2 * (y_pred & y_true).sum() / (y_pred_sum + y_true_sum))

    y_pred_sum = np.dot(y_pred, sample_weight)
    y_true_sum = np.dot(y_true, sample_weight)
    if y_pred_sum == 0 or y_true_sum == 0:
        return 1.0 if y_pred_sum == 0 and y_true_sum == 0 else 0.0
    return float(2 * np.dot(y_pred & y_true, sample_weight) / (y_pred_sum + y_true_sum))

hgp_lib.utils.metrics.accepts_sample_weight(scorer)

Check if a scorer function accepts a sample_weight parameter.

Inspects the function signature first; falls back to a runtime probe if signature inspection fails.

Parameters:

Name Type Description Default
scorer Callable

The scoring function to check.

required

Returns:

Name Type Description
bool bool

True if the scorer accepts sample_weight.

Examples:

>>> from hgp_lib.utils.metrics import accepts_sample_weight
>>> def with_sw(p, l, sample_weight=None): return 0.0
>>> accepts_sample_weight(with_sw)
True
>>> def without_sw(p, l): return 0.0
>>> accepts_sample_weight(without_sw)
False
Source code in hgp_lib\utils\metrics.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def accepts_sample_weight(scorer: Callable) -> bool:
    """
    Check if a scorer function accepts a ``sample_weight`` parameter.

    Inspects the function signature; only when signature inspection is
    unavailable (some builtins and C extensions) does it fall back to a
    runtime probe call with dummy data. This avoids executing arbitrary,
    possibly side-effectful scorers whenever the signature already answers
    the question, and it prevents non-``TypeError`` exceptions raised inside
    a probed scorer from escaping this predicate.

    Args:
        scorer (Callable):
            The scoring function to check.

    Returns:
        bool: ``True`` if the scorer accepts ``sample_weight``.

    Examples:
        >>> from hgp_lib.utils.metrics import accepts_sample_weight
        >>> def with_sw(p, l, sample_weight=None): return 0.0
        >>> accepts_sample_weight(with_sw)
        True
        >>> def without_sw(p, l): return 0.0
        >>> accepts_sample_weight(without_sw)
        False
    """
    try:
        sig = inspect.signature(scorer)
    except (TypeError, ValueError):
        # Signature unavailable: probe with a tiny dummy dataset. A scorer
        # that rejects the keyword raises TypeError at call time.
        try:
            labels = np.array([1, 0, 1], dtype=bool)
            count = np.array([2, 1, 1])
            scorer(labels, labels, sample_weight=count)
            return True
        except TypeError:
            return False

    for param in sig.parameters.values():
        if param.name == "sample_weight":
            return True
        # A **kwargs catch-all also accepts sample_weight as a keyword.
        if param.kind is inspect.Parameter.VAR_KEYWORD:
            return True
    return False

hgp_lib.utils.metrics.transform_duplicates_to_sample_weight(data, labels)

Remove duplicate rows from (data, labels) and return sample weights.

Rows that appear multiple times are collapsed into a single row with a weight equal to the original count.

Parameters:

Name Type Description Default
data ndarray

2-D input data.

required
labels ndarray

1-D label array (same length as data).

required

Returns:

Type Description

Tuple[ndarray, ndarray, ndarray]: (unique_data, unique_labels, sample_weights).

Examples:

>>> import numpy as np
>>> from hgp_lib.utils.metrics import transform_duplicates_to_sample_weight
>>> data = np.array([[1, 0], [1, 0], [0, 1]])
>>> labels = np.array([1, 1, 0])
>>> ud, ul, sw = transform_duplicates_to_sample_weight(data, labels)
>>> len(ud) < len(data)
True
>>> bool(sw.sum() == len(data))
True
Source code in hgp_lib\utils\metrics.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def transform_duplicates_to_sample_weight(data: ndarray, labels: ndarray):
    """
    Remove duplicate rows from ``(data, labels)`` and return sample weights.

    Rows that appear multiple times are collapsed into a single row with a
    weight equal to the original count.

    Args:
        data (ndarray):
            2-D input data.
        labels (ndarray):
            1-D label array (same length as ``data``).

    Returns:
        Tuple[ndarray, ndarray, ndarray]: ``(unique_data, unique_labels, sample_weights)``.

    Examples:
        >>> import numpy as np
        >>> from hgp_lib.utils.metrics import transform_duplicates_to_sample_weight
        >>> data = np.array([[1, 0], [1, 0], [0, 1]])
        >>> labels = np.array([1, 1, 0])
        >>> ud, ul, sw = transform_duplicates_to_sample_weight(data, labels)
        >>> len(ud) < len(data)
        True
        >>> bool(sw.sum() == len(data))
        True
    """
    # Append the label as a trailing column so identical (row, label) pairs
    # collapse together; np.unique then counts each distinct combined row.
    stacked = np.hstack((data, labels[:, None]))
    unique_rows, sample_weight = np.unique(stacked, axis=0, return_counts=True)
    unique_data = unique_rows[:, :-1]
    unique_labels = unique_rows[:, -1]
    return unique_data, unique_labels, sample_weight

hgp_lib.utils.metrics.optimize_scorers_for_data(*scorers, data, labels)

Optimise scorers by deduplicating data and binding sample_weight.

If every scorer accepts sample_weight, duplicate rows are removed and each scorer is wrapped with SampleWeightScorer to inject the computed weights. Otherwise a warning is issued (once per scorer) and the original data is returned unchanged.

Parameters:

Name Type Description Default
*scorers Callable[[ndarray, ndarray], Any]

One or more scoring functions.

()
data ndarray

2-D input data.

required
labels ndarray

1-D label array.

required

Returns:

Name Type Description
Tuple

(*optimised_scorers, data, labels).

Examples:

>>> import numpy as np
>>> from hgp_lib.utils.metrics import optimize_scorers_for_data
>>> def acc(p, l, sample_weight=None): return float((p == l).mean())
>>> data = np.array([[1, 0], [1, 0], [0, 1]])
>>> labels = np.array([1, 1, 0])
>>> opt_acc, opt_data, opt_labels = optimize_scorers_for_data(acc, data=data, labels=labels)
>>> len(opt_data) <= len(data)
True
Source code in hgp_lib\utils\metrics.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
def optimize_scorers_for_data(
    *scorers: Callable[[ndarray, ndarray], Any], data: ndarray, labels: ndarray
):
    """
    Optimise scorers by deduplicating data and binding ``sample_weight``.

    When every scorer accepts ``sample_weight``, duplicate rows are collapsed
    and each scorer is wrapped in ``SampleWeightScorer`` so the computed
    weights are injected automatically. If any scorer lacks support, a
    warning is emitted (at most once per scorer) and the inputs are returned
    untouched.

    Args:
        *scorers (Callable[[ndarray, ndarray], Any]):
            One or more scoring functions.
        data (ndarray):
            2-D input data.
        labels (ndarray):
            1-D label array.

    Returns:
        Tuple: ``(*optimised_scorers, data, labels)``.

    Examples:
        >>> import numpy as np
        >>> from hgp_lib.utils.metrics import optimize_scorers_for_data
        >>> def acc(p, l, sample_weight=None): return float((p == l).mean())
        >>> data = np.array([[1, 0], [1, 0], [0, 1]])
        >>> labels = np.array([1, 1, 0])
        >>> opt_acc, opt_data, opt_labels = optimize_scorers_for_data(acc, data=data, labels=labels)
        >>> len(opt_data) <= len(data)
        True
    """
    all_support_weights = True
    for scorer in scorers:
        validate_callable(scorer)
        if accepts_sample_weight(scorer):
            continue
        all_support_weights = False
        # Warn at most once per scorer function to avoid repeated warnings.
        scorer_id = id(scorer)
        if scorer_id in _warned_scorers:
            continue
        _warned_scorers.add(scorer_id)
        warnings.warn(
            'The scorer must accept "sample_weight" to be optimized by '
            "removing duplicates in the data. Scorer optimization is disabled "
            "for this scorer.",
            stacklevel=2,
        )
    if all_support_weights:
        data, labels, sample_weight = transform_duplicates_to_sample_weight(
            data, labels
        )
        scorers = [SampleWeightScorer(scorer, sample_weight) for scorer in scorers]
    return *scorers, data, labels