# Source code for derdava.data_source

import numpy as np


def generate_random_data_sources(X: np.ndarray, y: np.ndarray, num_of_data_sources: int = 10) -> dict:
    """Splits a given dataset into a specified number of equally sized data sources.

    The dataset is cut into consecutive, non-overlapping chunks of
    ``len(X) // num_of_data_sources`` samples each, in the order in which the samples are given.
    No shuffling is performed by this function, so shuffle ``X`` and ``y`` together beforehand
    if a random assignment of samples to data sources is required. Samples left over when
    ``len(X)`` is not divisible by ``num_of_data_sources`` are not assigned to any data source.

    :param X: Feature set of the given dataset.
    :param y: Label set of the given dataset.
    :param num_of_data_sources: Number of data sources to be generated (default: ``10``).
    :return: A dictionary containing mappings between data source indices and their data ``(X, y)``.
        The arrays are copies, so modifying them does not affect the inputs.
    :raises ValueError: If ``num_of_data_sources`` is not positive.
    """

    if num_of_data_sources <= 0:
        raise ValueError("Number of data sources must be positive.")

    chunk_size = len(X) // num_of_data_sources
    data_sources = {}
    for i in range(num_of_data_sources):
        start = chunk_size * i
        stop = start + chunk_size
        # Slicing only the first axis keeps the 2-D behaviour and also accepts 1-D feature arrays.
        data_sources[i] = (X[start:stop].copy(), y[start:stop].copy())

    return data_sources


def add_classification_noise(y: np.ndarray, noise_level: float = 0.2) -> np.ndarray:
    """Adds noise to the classification labels by randomly choosing one from the remaining label set.

    Each label is selected for corruption independently with probability ``noise_level``. A
    selected label is replaced by a label drawn uniformly from the other distinct labels that
    were present in ``y`` when the function was called.

    ``y`` is modified **in place**; the same array is also returned for convenience. If ``y``
    contains fewer than two distinct labels there is no alternative label to switch to, and
    ``y`` is returned unchanged.

    :param y: Labels of target dataset.
    :param noise_level: Amount of noise to be added (default: ``0.2``).
    :return: The array ``y`` itself, with noisy labels written into it.
    :raises ValueError: If ``noise_level`` is not in the range ``[0, 1]``.
    """

    if not 0 <= noise_level <= 1:
        raise ValueError("Noise level must be between 0 and 1.")

    # Snapshot the label set once, before any label is overwritten.
    label_pool = list(set(y.tolist()))
    if len(label_pool) < 2:
        # No other label exists to flip to; drawing from an empty candidate list would fail.
        return y

    has_noises = np.random.binomial(1, noise_level, len(y))
    # Visit only the selected positions, in increasing order, so the sequence of random draws
    # is the same as when scanning every index.
    for i in np.flatnonzero(has_noises):
        label_candidates = [label for label in label_pool if label != y[i]]
        y[i] = label_candidates[np.random.randint(len(label_candidates))]

    return y