Skip to content

Populations

hgp_lib.populations.generator.PopulationGenerator

Generates a population of rules using one or more strategies with weighted probability.

Attributes:

Name Type Description
strategies Sequence[PopulationStrategy]

The list of strategies to use.

population_size int

The total number of rules to generate. Default: 100.

weights Sequence[float] | ndarray | None

Weights for random selection of strategies. If None, all strategies are selected with equal probability. Default: None.

Examples:

>>> from hgp_lib.populations import PopulationGenerator, RandomStrategy
>>> strategy = RandomStrategy(num_literals=5)
>>> generator = PopulationGenerator(strategies=[strategy], population_size=10)
>>> population = generator.generate()
>>> len(population)
10
Source code in hgp_lib\populations\generator.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
class PopulationGenerator:
    """
    Generates a population of rules using one or more strategies with weighted probability.

    The number of rules requested from each strategy is drawn once, when the
    generator is constructed, from a multinomial distribution over the normalised
    weights. Every call to `generate` reuses those per-strategy counts.

    Attributes:
        strategies (Sequence[PopulationStrategy]): The list of strategies to use.
        population_size (int): The total number of rules to generate. Default: `100`.
        weights (Sequence[float] | np.ndarray | None): Weights for random selection of strategies,
            kept exactly as passed to the constructor. If `None`, all strategies are selected
            with equal probability. Default: `None`.
        counts (np.ndarray): Number of rules requested from each strategy, in the same
            order as `strategies`.

    Examples:
        >>> from hgp_lib.populations import PopulationGenerator, RandomStrategy
        >>> strategy = RandomStrategy(num_literals=5)
        >>> generator = PopulationGenerator(strategies=[strategy], population_size=10)
        >>> population = generator.generate()
        >>> len(population)
        10
    """

    def __init__(
        self,
        strategies: Sequence[PopulationStrategy],
        population_size: int = 100,
        weights: Sequence[float] | np.ndarray | None = None,
    ):
        """
        Initialize the PopulationGenerator.

        Args:
            strategies (Sequence[PopulationStrategy]): A non-empty sequence of PopulationStrategy instances.
            population_size (int): The number of rules to generate. Must be greater than `0`.
                Default: `100`.
            weights (Sequence[float] | np.ndarray | None): Optional weights for each strategy.
                Must sum to > `0` and be non-negative. Default: `None`.

        Raises:
            ValueError: If `strategies` is empty, `population_size` is not positive,
                or `weights` has the wrong length, a negative entry, or a non-positive sum.
        """
        check_isinstance(population_size, int)
        check_isinstance(strategies, Sequence)

        if len(strategies) == 0:
            raise ValueError("Strategies must be a non-empty Sequence")
        for strategy in strategies:
            check_isinstance(strategy, PopulationStrategy)

        if population_size <= 0:
            raise ValueError(
                f"population_size must be a positive integer, got {population_size}"
            )

        self.strategies = strategies
        self.population_size = population_size

        # _init_counts validates the weights; only keep them once they passed.
        self.counts = self._init_counts(weights)
        # The class documents `weights` as an attribute, so it must actually exist.
        self.weights = weights

    def _init_counts(self, weights: Sequence[float] | np.ndarray | None) -> np.ndarray:
        """
        Validate `weights` and draw how many rules each strategy contributes.

        Args:
            weights (Sequence[float] | np.ndarray | None): Per-strategy weights, or `None`
                for equal probability.

        Returns:
            np.ndarray: One count per strategy, drawn with `np.random.multinomial`
            for `population_size` trials.

        Raises:
            ValueError: If `weights` does not match `strategies` in length, contains a
                negative value, or does not sum to a positive number.
        """
        if weights is not None:
            # Arrays are converted to plain lists so they pass the Sequence check below.
            if isinstance(weights, np.ndarray):
                weights = weights.tolist()
            check_isinstance(weights, Sequence)
            if len(weights) != len(self.strategies):
                raise ValueError(
                    f"weights length ({len(weights)}) must match strategies length ({len(self.strategies)})"
                )
            if any(w < 0 for w in weights):
                raise ValueError("weights must be non-negative")
            if sum(weights) <= 0:
                raise ValueError("Sum of weights must be positive")

        pvals = (
            weights
            if weights is not None
            else [1.0 / len(self.strategies)] * len(self.strategies)
        )
        # Normalise so the probabilities handed to multinomial sum to one.
        sum_weights = sum(pvals)
        pvals = [w / sum_weights for w in pvals]
        return np.random.multinomial(self.population_size, pvals)

    def generate(self) -> List[Rule]:
        """
        Generates the full population of rules.

        Each strategy with a positive count is asked for that many rules; the
        results are concatenated in the order of `strategies`.

        Returns:
            List[Rule]: A list containing `population_size` generated rules.
        """
        population = []
        for strategy, count in zip(self.strategies, self.counts):
            # Strategies that drew a zero count are skipped entirely.
            if count > 0:
                population.extend(strategy.generate(count))
        return population

__init__(strategies, population_size=100, weights=None)

Initialize the PopulationGenerator.

Parameters:

Name Type Description Default
strategies Sequence[PopulationStrategy]

A non-empty sequence of PopulationStrategy instances.

required
population_size int

The number of rules to generate. Must be greater than 0. Default: 100.

100
weights Sequence[float] | ndarray | None

Optional weights for each strategy. Must sum to > 0 and be non-negative. Default: None.

None
Source code in hgp_lib\populations\generator.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def __init__(
    self,
    strategies: Sequence[PopulationStrategy],
    population_size: int = 100,
    weights: Sequence[float] | np.ndarray | None = None,
):
    """
    Initialize the PopulationGenerator.

    Args:
        strategies (Sequence[PopulationStrategy]): A non-empty sequence of PopulationStrategy instances.
        population_size (int): The number of rules to generate. Must be greater than `0`.
            Default: `100`.
        weights (Sequence[float] | np.ndarray | None): Optional weights for each strategy.
            Must sum to > `0` and be non-negative. Default: `None`.

    Raises:
        ValueError: If `strategies` is empty or `population_size` is not positive.
    """
    check_isinstance(population_size, int)
    check_isinstance(strategies, Sequence)

    if len(strategies) == 0:
        raise ValueError("Strategies must be a non-empty Sequence")
    for strategy in strategies:
        check_isinstance(strategy, PopulationStrategy)

    if population_size <= 0:
        raise ValueError(
            f"population_size must be a positive integer, got {population_size}"
        )

    self.strategies = strategies
    self.population_size = population_size

    # Per-strategy counts are drawn once here and reused by generate().
    self.counts = self._init_counts(weights)
    # Keep the weights as passed so the `weights` attribute is available to callers.
    self.weights = weights

generate()

Generates the full population of rules.

Returns:

Type Description
List[Rule]

A list containing population_size generated rules.

Source code in hgp_lib\populations\generator.py
86
87
88
89
90
91
92
93
94
95
96
97
def generate(self) -> List[Rule]:
    """
    Build the whole population by querying every strategy in turn.

    Strategies are visited in the order of `self.strategies`, paired with the
    matching entry of `self.counts`. A strategy whose count is not positive is
    skipped; every other strategy is asked for `count` rules.

    Returns:
        List[Rule]: The generated rules, concatenated in strategy order.
    """
    rules = []
    for source, quota in zip(self.strategies, self.counts):
        if not quota > 0:
            continue
        rules += source.generate(quota)
    return rules

hgp_lib.populations.populations_factory.PopulationGeneratorFactory

Factory for creating PopulationGenerator instances.

Stores configuration-time parameters (population_size) and defers data-dependent construction to create. Override create_strategies to customise which strategies are instantiated.

Attributes:

Name Type Description
population_size int

Number of rules the generator will produce. Default: 100.

Examples:

>>> from hgp_lib.populations import PopulationGeneratorFactory
>>> factory = PopulationGeneratorFactory(population_size=50)
>>> factory.population_size
50

Subclass to use custom strategies:

>>> import numpy as np
>>> from hgp_lib.populations import PopulationGeneratorFactory, BestLiteralStrategy
>>> class MyFactory(PopulationGeneratorFactory):
...     def create_strategies(self, num_literals, score_fn, train_data, train_labels):
...         return [BestLiteralStrategy(
...             num_literals=num_literals, score_fn=score_fn,
...             train_data=train_data, train_labels=train_labels,
...         )]
>>> factory = MyFactory(population_size=20)
>>> data = np.array([[True, False], [False, True]])
>>> labels = np.array([1, 0])
>>> def acc(p, l): return float((p == l).mean())
>>> gen = factory.create(2, acc, data, labels)
>>> len(gen.generate())
20
Source code in hgp_lib\populations\populations_factory.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
class PopulationGeneratorFactory:
    """
    Builds `PopulationGenerator` objects once the training data is known.

    Only `population_size` is fixed when the factory is configured; everything
    that depends on the data is resolved later, inside `create`. To change which
    strategies end up in the generator, subclass and override `create_strategies`.

    Attributes:
        population_size (int): Number of rules the generator will produce.
            Default: `100`.

    Examples:
        >>> from hgp_lib.populations import PopulationGeneratorFactory
        >>> factory = PopulationGeneratorFactory(population_size=50)
        >>> factory.population_size
        50

        Subclass to use custom strategies:

        >>> import numpy as np
        >>> from hgp_lib.populations import PopulationGeneratorFactory, BestLiteralStrategy
        >>> class MyFactory(PopulationGeneratorFactory):
        ...     def create_strategies(self, num_literals, score_fn, train_data, train_labels):
        ...         return [BestLiteralStrategy(
        ...             num_literals=num_literals, score_fn=score_fn,
        ...             train_data=train_data, train_labels=train_labels,
        ...         )]
        >>> factory = MyFactory(population_size=20)
        >>> data = np.array([[True, False], [False, True]])
        >>> labels = np.array([1, 0])
        >>> def acc(p, l): return float((p == l).mean())
        >>> gen = factory.create(2, acc, data, labels)
        >>> len(gen.generate())
        20
    """

    def __init__(self, population_size: int = 100):
        """
        Store the population size after validating it.

        Args:
            population_size (int): Number of rules to produce. Must be greater
                than `0`. Default: `100`.

        Raises:
            ValueError: If `population_size` is zero or negative.
        """
        check_isinstance(population_size, int)
        if population_size <= 0:
            message = (
                f"population_size must be a positive integer, got {population_size}"
            )
            raise ValueError(message)
        self.population_size = population_size

    def create_strategies(
        self,
        num_literals: int,
        score_fn: Callable[[np.ndarray, np.ndarray], float],
        train_data: np.ndarray,
        train_labels: np.ndarray,
    ) -> List[PopulationStrategy]:
        """
        Return the strategies the generator should draw from.

        This is the extension point for subclasses. The base implementation
        ignores everything except `num_literals` and yields one
        `RandomStrategy(num_literals=num_literals)`.

        Args:
            num_literals (int): Number of boolean features (columns in train_data).
            score_fn (Callable): Fitness function `(predictions, labels) -> float`.
            train_data (np.ndarray): Training data (2-D boolean array).
            train_labels (np.ndarray): Training labels (1-D array).

        Returns:
            List[PopulationStrategy]: Strategies to pass to `PopulationGenerator`.
        """
        default_strategy = RandomStrategy(num_literals=num_literals)
        return [default_strategy]

    def create(
        self,
        num_literals: int,
        score_fn: Callable[[np.ndarray, np.ndarray], float],
        train_data: np.ndarray,
        train_labels: np.ndarray,
    ) -> PopulationGenerator:
        """
        Assemble a `PopulationGenerator` for the given training data.

        The strategies come from `create_strategies`; the population size is the
        one stored on the factory.

        Args:
            num_literals (int): Number of boolean features (columns in train_data).
            score_fn (Callable): Fitness function `(predictions, labels) -> float`.
            train_data (np.ndarray): Training data (2-D boolean array).
            train_labels (np.ndarray): Training labels (1-D array).

        Returns:
            PopulationGenerator: A generator ready to produce the initial population.
        """
        # Strategies are resolved first, then the stored size is read.
        chosen = self.create_strategies(num_literals, score_fn, train_data, train_labels)
        size = self.population_size
        return PopulationGenerator(strategies=chosen, population_size=size)

create_strategies(num_literals, score_fn, train_data, train_labels)

Create the list of strategies for the generator.

Override this method to use custom strategies. The default creates a single RandomStrategy(num_literals=num_literals).

Parameters:

Name Type Description Default
num_literals int

Number of boolean features (columns in train_data).

required
score_fn Callable

Fitness function (predictions, labels) -> float.

required
train_data ndarray

Training data (2-D boolean array).

required
train_labels ndarray

Training labels (1-D array).

required

Returns:

Type Description
List[PopulationStrategy]

Strategies to pass to PopulationGenerator.

Source code in hgp_lib\populations\populations_factory.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def create_strategies(
    self,
    num_literals: int,
    score_fn: Callable[[np.ndarray, np.ndarray], float],
    train_data: np.ndarray,
    train_labels: np.ndarray,
) -> List[PopulationStrategy]:
    """
    Return the strategies the generator should draw from.

    This is the extension point for subclasses. The base implementation
    ignores everything except `num_literals` and yields one
    `RandomStrategy(num_literals=num_literals)`.

    Args:
        num_literals (int): Number of boolean features (columns in train_data).
        score_fn (Callable): Fitness function `(predictions, labels) -> float`.
        train_data (np.ndarray): Training data (2-D boolean array).
        train_labels (np.ndarray): Training labels (1-D array).

    Returns:
        List[PopulationStrategy]: Strategies to pass to `PopulationGenerator`.
    """
    default_strategy = RandomStrategy(num_literals=num_literals)
    return [default_strategy]

create(num_literals, score_fn, train_data, train_labels)

Create a PopulationGenerator with data-dependent strategies.

Parameters:

Name Type Description Default
num_literals int

Number of boolean features (columns in train_data).

required
score_fn Callable

Fitness function (predictions, labels) -> float.

required
train_data ndarray

Training data (2-D boolean array).

required
train_labels ndarray

Training labels (1-D array).

required

Returns:

Name Type Description
PopulationGenerator PopulationGenerator

A generator ready to produce the initial population.

Source code in hgp_lib\populations\populations_factory.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def create(
    self,
    num_literals: int,
    score_fn: Callable[[np.ndarray, np.ndarray], float],
    train_data: np.ndarray,
    train_labels: np.ndarray,
) -> PopulationGenerator:
    """
    Assemble a `PopulationGenerator` for the given training data.

    The strategies come from `create_strategies`; the population size is the
    one stored on the factory.

    Args:
        num_literals (int): Number of boolean features (columns in train_data).
        score_fn (Callable): Fitness function `(predictions, labels) -> float`.
        train_data (np.ndarray): Training data (2-D boolean array).
        train_labels (np.ndarray): Training labels (1-D array).

    Returns:
        PopulationGenerator: A generator ready to produce the initial population.
    """
    # Strategies are resolved first, then the stored size is read.
    chosen = self.create_strategies(num_literals, score_fn, train_data, train_labels)
    size = self.population_size
    return PopulationGenerator(strategies=chosen, population_size=size)

Sampling Strategies

hgp_lib.populations.sampling.FeatureSamplingStrategy

Bases: SamplingStrategy

Samples a subset of features from the training data.

Each child population receives a subset of the parent's feature columns. The number of features per child is ceil(num_features * feature_fraction).

Overlap behavior (controlled by the replace parameter):

- replace=False: No overlap between children (partitioning) — each feature appears in at most one child population.
- replace=True: Overlap allowed — features can appear in multiple children.

When feature_fraction=1.0, all children receive all features regardless of replace.

Within each child, features are always unique (no duplicates within a single child).

Attributes:

Name Type Description
feature_fraction float

Fraction of features per child. Default: 1.0.

replace bool

Allow feature overlap between children. Default: False.

Examples:

>>> import numpy as np
>>> np.random.seed(42)
>>> strategy = FeatureSamplingStrategy(feature_fraction=0.5)
>>> data = np.random.rand(100, 10) > 0.5
>>> labels = np.random.randint(0, 2, 100)
>>> results = strategy.sample(data, labels, num_children=3)
>>> len(results)
3
>>> len(results[0].feature_indices)
5
Source code in hgp_lib\populations\sampling.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
class FeatureSamplingStrategy(SamplingStrategy):
    """Gives every child population a subset of the feature columns.

    All rows are kept; only columns are sampled. Each child gets
    `ceil(num_features * feature_fraction)` features.

    The `replace` parameter controls overlap between children:
        - `replace=False`: children partition the features — a feature is
          handed to at most one child.
        - `replace=True`: the same feature may be handed to several children.

    With `feature_fraction=1.0` every child receives every feature, whatever
    `replace` is.

    A single child never contains the same feature twice.

    Attributes:
        feature_fraction (float): Fraction of features per child. Default: `1.0`.
        replace (bool): Allow feature overlap between children. Default: `False`.

    Examples:
        >>> import numpy as np
        >>> np.random.seed(42)
        >>> strategy = FeatureSamplingStrategy(feature_fraction=0.5)
        >>> data = np.random.rand(100, 10) > 0.5
        >>> labels = np.random.randint(0, 2, 100)
        >>> results = strategy.sample(data, labels, num_children=3)
        >>> len(results)
        3
        >>> len(results[0].feature_indices)
        5
    """

    def __init__(self, feature_fraction: float = 1.0, replace: bool = False):
        # Both settings are handed to SamplingStrategy unchanged.
        super().__init__(feature_fraction=feature_fraction, replace=replace)

    def sample(
        self,
        data: ndarray,
        labels: ndarray,
        num_children: int,
    ) -> List[SamplingResult]:
        """Pick feature columns for each child population.

        Args:
            data: Training data as 2D boolean array (instances x features).
            labels: Training labels as 1D integer array.
            num_children: Number of child populations to create.

        Returns:
            One SamplingResult per child, built from the child's feature indices
            with no instance indices (`None`), so all rows are kept.

        Raises:
            ValueError: If the per-child feature count falls below `MIN_FEATURES`.
        """
        total_features = data.shape[1]
        per_child = ceil(total_features * self.feature_fraction)
        if per_child < self.MIN_FEATURES:
            message = (
                f"Cannot sample less than {self.MIN_FEATURES} features. "
                f"There are only {total_features} features and feature_fraction is {self.feature_fraction}!"
            )
            raise ValueError(message)

        allocation = self.allocate_indices_to_children(
            per_child, total_features, num_children
        )
        children = []
        for chosen_features in allocation:
            children.append(
                self.create_sampling_result(data, labels, chosen_features, None)
            )
        return children

sample(data, labels, num_children)

Sample features for child populations.

Parameters:

Name Type Description Default
data ndarray

Training data as 2D boolean array (instances x features).

required
labels ndarray

Training labels as 1D integer array.

required
num_children int

Number of child populations to create.

required

Returns:

Type Description
List[SamplingResult]

List of SamplingResult, one per child, with sampled feature columns, all instances preserved, and instance_indices set to None.

Source code in hgp_lib\populations\sampling.py
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
def sample(
    self,
    data: ndarray,
    labels: ndarray,
    num_children: int,
) -> List[SamplingResult]:
    """Pick feature columns for each child population.

    Args:
        data: Training data as 2D boolean array (instances x features).
        labels: Training labels as 1D integer array.
        num_children: Number of child populations to create.

    Returns:
        One SamplingResult per child, built from the child's feature indices
        with no instance indices (`None`), so all rows are kept.

    Raises:
        ValueError: If the per-child feature count falls below `MIN_FEATURES`.
    """
    total_features = data.shape[1]
    per_child = ceil(total_features * self.feature_fraction)
    if per_child < self.MIN_FEATURES:
        message = (
            f"Cannot sample less than {self.MIN_FEATURES} features. "
            f"There are only {total_features} features and feature_fraction is {self.feature_fraction}!"
        )
        raise ValueError(message)

    allocation = self.allocate_indices_to_children(
        per_child, total_features, num_children
    )
    children = []
    for chosen_features in allocation:
        children.append(
            self.create_sampling_result(data, labels, chosen_features, None)
        )
    return children

hgp_lib.populations.sampling.InstanceSamplingStrategy

Bases: SamplingStrategy

Samples a subset of instances from the training data.

Each child population receives a subset of the parent's rows. All features are preserved. The number of instances per child is ceil(num_instances * sample_fraction).

Overlap behavior (controlled by the replace parameter):

- replace=False: No overlap between children (partitioning).
- replace=True: Overlap allowed.

When sample_fraction=1.0, all children receive all instances regardless of replace.

Examples:

>>> import numpy as np
>>> np.random.seed(42)
>>> strategy = InstanceSamplingStrategy(sample_fraction=0.8)
>>> data = np.random.rand(100, 10) > 0.5
>>> labels = np.random.randint(0, 2, 100)
>>> results = strategy.sample(data, labels, num_children=3)
>>> len(results)
3
>>> len(results[0].instance_indices)
80
Source code in hgp_lib\populations\sampling.py
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
class InstanceSamplingStrategy(SamplingStrategy):
    """Gives every child population a subset of the training rows.

    All feature columns are kept; only rows are sampled. Each child gets
    `ceil(num_instances * sample_fraction)` instances.

    The `replace` parameter controls overlap between children:
        - `replace=False`: No overlap between children (partitioning).
        - `replace=True`: Overlap allowed.

    With `sample_fraction=1.0` every child receives every instance, whatever
    `replace` is.

    Attributes:
        sample_fraction (float): Fraction of instances per child. Default: `1.0`.
        replace (bool): Allow instance overlap between children. Default: `False`.

    Examples:
        >>> import numpy as np
        >>> np.random.seed(42)
        >>> strategy = InstanceSamplingStrategy(sample_fraction=0.8)
        >>> data = np.random.rand(100, 10) > 0.5
        >>> labels = np.random.randint(0, 2, 100)
        >>> results = strategy.sample(data, labels, num_children=3)
        >>> len(results)
        3
        >>> len(results[0].instance_indices)
        80
    """

    def __init__(self, sample_fraction: float = 1.0, replace: bool = False):
        # Both settings are handed to SamplingStrategy unchanged.
        super().__init__(sample_fraction=sample_fraction, replace=replace)

    def sample(
        self,
        data: ndarray,
        labels: ndarray,
        num_children: int,
    ) -> List[SamplingResult]:
        """Pick training rows for each child population.

        Args:
            data: Training data as 2D boolean array (instances x features).
            labels: Training labels as 1D integer array.
            num_children: Number of child populations to create.

        Returns:
            One SamplingResult per child, built from the child's instance indices
            with no feature indices (`None`), so all columns are kept.

        Raises:
            ValueError: If the per-child instance count falls below `MIN_INSTANCES`.
        """
        total_instances = len(data)
        per_child = ceil(total_instances * self.sample_fraction)
        if per_child < self.MIN_INSTANCES:
            message = (
                f"Cannot sample less than {self.MIN_INSTANCES} instances. "
                f"There are only {total_instances} instances and sample_fraction is {self.sample_fraction}!"
            )
            raise ValueError(message)

        allocation = self.allocate_indices_to_children(
            per_child, total_instances, num_children
        )
        children = []
        for chosen_rows in allocation:
            children.append(
                self.create_sampling_result(data, labels, None, chosen_rows)
            )
        return children

sample(data, labels, num_children)

Sample instances for child populations.

Parameters:

Name Type Description Default
data ndarray

Training data as 2D boolean array (instances x features).

required
labels ndarray

Training labels as 1D integer array.

required
num_children int

Number of child populations to create.

required

Returns:

Type Description
List[SamplingResult]

List of SamplingResult, one per child, with sampled instance rows, all features preserved, and feature_indices set to None.

Source code in hgp_lib\populations\sampling.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
def sample(
    self,
    data: ndarray,
    labels: ndarray,
    num_children: int,
) -> List[SamplingResult]:
    """Pick training rows for each child population.

    Args:
        data: Training data as 2D boolean array (instances x features).
        labels: Training labels as 1D integer array.
        num_children: Number of child populations to create.

    Returns:
        One SamplingResult per child, built from the child's instance indices
        with no feature indices (`None`), so all columns are kept.

    Raises:
        ValueError: If the per-child instance count falls below `MIN_INSTANCES`.
    """
    total_instances = len(data)
    per_child = ceil(total_instances * self.sample_fraction)
    if per_child < self.MIN_INSTANCES:
        message = (
            f"Cannot sample less than {self.MIN_INSTANCES} instances. "
            f"There are only {total_instances} instances and sample_fraction is {self.sample_fraction}!"
        )
        raise ValueError(message)

    allocation = self.allocate_indices_to_children(
        per_child, total_instances, num_children
    )
    children = []
    for chosen_rows in allocation:
        children.append(
            self.create_sampling_result(data, labels, None, chosen_rows)
        )
    return children

hgp_lib.populations.sampling.CombinedSamplingStrategy

Bases: SamplingStrategy

Combines feature and instance sampling.

Applies both feature sampling and instance sampling to create child populations with reduced feature and instance sets.

Attributes:

Name Type Description
feature_fraction float

Fraction of features per child. Default: 1.0.

sample_fraction float

Fraction of instances per child. Default: 1.0.

replace bool

Whether to allow overlap between children. Default: False.

Examples:

>>> import numpy as np
>>> np.random.seed(42)
>>> strategy = CombinedSamplingStrategy(
...     feature_fraction=0.5,
...     sample_fraction=0.5,
...     replace=False
... )
>>> data = np.random.rand(100, 10) > 0.5
>>> labels = np.random.randint(0, 2, 100)
>>> results = strategy.sample(data, labels, num_children=3)
>>> len(results)
3
>>> results[0].data.shape
(50, 5)
Source code in hgp_lib\populations\sampling.py
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
class CombinedSamplingStrategy(SamplingStrategy):
    """Samples rows and columns together.

    Every child population receives both a subset of the instances and a
    subset of the features.

    Attributes:
        feature_fraction (float): Fraction of features per child. Default: `1.0`.
        sample_fraction (float): Fraction of instances per child. Default: `1.0`.
        replace (bool): Whether to allow overlap between children. Default: `False`.

    Examples:
        >>> import numpy as np
        >>> np.random.seed(42)
        >>> strategy = CombinedSamplingStrategy(
        ...     feature_fraction=0.5,
        ...     sample_fraction=0.5,
        ...     replace=False
        ... )
        >>> data = np.random.rand(100, 10) > 0.5
        >>> labels = np.random.randint(0, 2, 100)
        >>> results = strategy.sample(data, labels, num_children=3)
        >>> len(results)
        3
        >>> results[0].data.shape
        (50, 5)
    """

    def __init__(
        self,
        feature_fraction: float = 1.0,
        sample_fraction: float = 1.0,
        replace: bool = False,
    ):
        # All three settings are handed to SamplingStrategy unchanged.
        super().__init__(
            feature_fraction=feature_fraction,
            sample_fraction=sample_fraction,
            replace=replace,
        )

    def sample(
        self,
        data: ndarray,
        labels: ndarray,
        num_children: int,
    ) -> List[SamplingResult]:
        """Pick rows and feature columns for every child in one pass.

        Args:
            data: Training data as 2D boolean array (instances x features).
            labels: Training labels as 1D integer array.
            num_children: Number of child populations to create.

        Returns:
            One SamplingResult per child, built from that child's feature
            indices and instance indices.

        Raises:
            ValueError: If the per-child instance count falls below `MIN_INSTANCES`
                or the per-child feature count falls below `MIN_FEATURES`.
        """
        total_instances, total_features = data.shape
        rows_per_child = ceil(total_instances * self.sample_fraction)
        cols_per_child = ceil(total_features * self.feature_fraction)

        # The instance limit is checked before the feature limit.
        if rows_per_child < self.MIN_INSTANCES:
            message = (
                f"Cannot sample less than {self.MIN_INSTANCES} instances. "
                f"There are only {total_instances} instances and sample_fraction is {self.sample_fraction}!"
            )
            raise ValueError(message)
        if cols_per_child < self.MIN_FEATURES:
            message = (
                f"Cannot sample less than {self.MIN_FEATURES} features. "
                f"There are only {total_features} features and feature_fraction is {self.feature_fraction}!"
            )
            raise ValueError(message)

        # Rows are allocated before columns; keep this order.
        row_allocation = self.allocate_indices_to_children(
            rows_per_child, total_instances, num_children
        )
        col_allocation = self.allocate_indices_to_children(
            cols_per_child, total_features, num_children
        )

        children = []
        for chosen_rows, chosen_cols in zip(row_allocation, col_allocation):
            children.append(
                self.create_sampling_result(data, labels, chosen_cols, chosen_rows)
            )
        return children

sample(data, labels, num_children)

Sample both features and instances for all children at once.

Parameters:

Name Type Description Default
data ndarray

Training data as 2D boolean array (instances x features).

required
labels ndarray

Training labels as 1D integer array.

required
num_children int

Number of child populations to create.

required

Returns:

Type Description
List[SamplingResult]

List of SamplingResult, one per child, with both feature and instance subsets applied, containing both feature_indices and instance_indices.

Source code in hgp_lib\populations\sampling.py
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
def sample(
    self,
    data: ndarray,
    labels: ndarray,
    num_children: int,
) -> List[SamplingResult]:
    """Pick rows and feature columns for every child in one pass.

    Args:
        data: Training data as 2D boolean array (instances x features).
        labels: Training labels as 1D integer array.
        num_children: Number of child populations to create.

    Returns:
        One SamplingResult per child, built from that child's feature
        indices and instance indices.

    Raises:
        ValueError: If the per-child instance count falls below `MIN_INSTANCES`
            or the per-child feature count falls below `MIN_FEATURES`.
    """
    total_instances, total_features = data.shape
    rows_per_child = ceil(total_instances * self.sample_fraction)
    cols_per_child = ceil(total_features * self.feature_fraction)

    # The instance limit is checked before the feature limit.
    if rows_per_child < self.MIN_INSTANCES:
        message = (
            f"Cannot sample less than {self.MIN_INSTANCES} instances. "
            f"There are only {total_instances} instances and sample_fraction is {self.sample_fraction}!"
        )
        raise ValueError(message)
    if cols_per_child < self.MIN_FEATURES:
        message = (
            f"Cannot sample less than {self.MIN_FEATURES} features. "
            f"There are only {total_features} features and feature_fraction is {self.feature_fraction}!"
        )
        raise ValueError(message)

    # Rows are allocated before columns; keep this order.
    row_allocation = self.allocate_indices_to_children(
        rows_per_child, total_instances, num_children
    )
    col_allocation = self.allocate_indices_to_children(
        cols_per_child, total_features, num_children
    )

    children = []
    for chosen_rows, chosen_cols in zip(row_allocation, col_allocation):
        children.append(
            self.create_sampling_result(data, labels, chosen_cols, chosen_rows)
        )
    return children