"""Source code for gridgene.get_arrays."""

import pandas as pd
import numpy as np
import timeit


[docs]
def transform_df_to_array(df: pd.DataFrame, target_dict: dict, array_shape: tuple,
                          target_col: str = 'target', col_x: str = 'X', col_y: str = 'Y') -> np.ndarray:
    """
    Transform a DataFrame of (x, y, target) rows into a 3D binary indicator array.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing the coordinate and target columns.
    target_dict : dict
        A dictionary mapping target values to their index along the third axis.
    array_shape : tuple
        The shape of the output array (max(X)+1, max(Y)+1, number of targets).
    target_col : str, optional
        Column name holding the target values, by default 'target'.
    col_x : str, optional
        Column name holding the X-coordinate, by default 'X'.
    col_y : str, optional
        Column name holding the Y-coordinate, by default 'Y'.

    Returns
    -------
    np.ndarray
        An int8 array of shape `array_shape` where position [x, y, target_index] is 1 if the DataFrame
        has a row with coordinates (x, y) and that target, and 0 otherwise. Rows whose target is not a
        key of `target_dict` are ignored. An empty DataFrame yields an all-zero array.

    Raises
    ------
    IndexError
        If a coordinate or target index falls outside `array_shape`.
    """

    # Create a numpy array of zeros with the specified shape
    output_array = np.zeros(array_shape, dtype=np.int8)

    # Series.map yields NaN for targets missing from target_dict; those rows must be dropped
    # before indexing, otherwise the index array is float-typed and numpy rejects it.
    mapped_targets = df[target_col].map(target_dict)
    keep = mapped_targets.notna().to_numpy()
    if not keep.any():
        # Nothing to mark (empty input or no known targets).
        return output_array

    target_indices = mapped_targets.to_numpy()[keep].astype(np.intp)

    # Coordinates are cast to integers (floats are truncated), as array indices must be integral
    x_coords = df[col_x].astype(int).to_numpy()[keep]
    y_coords = df[col_y].astype(int).to_numpy()[keep]

    # Set the appropriate positions in the output array to 1 using advanced indexing
    output_array[x_coords, y_coords, target_indices] = 1

    return output_array







[docs]
def get_subset_arrays_V1(df_total: pd.DataFrame, target_list: list, target_col: str = 'target',
               col_x: str = 'X', col_y: str = 'Y') -> tuple:
    """
    PROBABLY LESS EFFICIENT !

    Filters the DataFrame based on target_list and rebuilds the 3D indicator array for the
    selected targets from the filtered rows.

    Parameters
    ----------
    df_total : pd.DataFrame
        The input DataFrame containing the data.
    target_list : list
        List of target values to filter the DataFrame.
    target_col : str, optional
        Column name in the DataFrame containing target values, by default 'target'.
    col_x : str, optional
        Column name in the DataFrame representing the X-coordinate, by default 'X'.
    col_y : str, optional
        Column name in the DataFrame representing the Y-coordinate, by default 'Y'.

    Returns
    -------
    tuple
        A tuple containing, in this order:
        - df_subset (pd.DataFrame): The filtered DataFrame (original column names preserved).
        - array_subset (np.ndarray): A 3D int8 array of shape (max(X)+1, max(Y)+1, len(target_list)),
          filled based on the filtered DataFrame. The extent is taken from `df_total`, not from the subset.
        - target_dict_subset (dict): A dictionary mapping each target in `target_list` to its
          position in the list, which is also its index along the third axis of `array_subset`.

        The 2D sum along the third axis is not returned; callers can compute it with
        `array_subset.sum(axis=2)`.
    """

    # Filter the DataFrame based on target_list
    df_subset = df_total.loc[df_total[target_col].isin(target_list)]

    # Index targets by their position in target_list (not by order of appearance in the data),
    # so that channel i always corresponds to target_list[i] and the mapping matches the
    # len(target_list) channels allocated below.
    target_dict_subset = {target: index for index, target in enumerate(target_list)}

    # Define the shape of the 3D array; int() keeps the shape valid when coordinates are stored as floats
    array_shape_subset = (int(df_total[col_x].max()) + 1, int(df_total[col_y].max()) + 1, len(target_list))

    # transform_df_to_array reads the columns 'X', 'Y' and 'target', so hand it a frame using
    # those names regardless of what the caller's columns are called.
    points = pd.DataFrame({
        'X': df_subset[col_x].to_numpy(),
        'Y': df_subset[col_y].to_numpy(),
        'target': df_subset[target_col].to_numpy(),
    })

    # Create the 3D array using the shared helper
    array_subset = transform_df_to_array(df=points, target_dict=target_dict_subset,
                                         array_shape=array_shape_subset).astype(np.int8, copy=False)

    return df_subset, array_subset, target_dict_subset




[docs]
def get_subset_arrays(df_total: pd.DataFrame, array_total: np.ndarray, target_dict_total: dict,
                      target_list: list, target_col: str = 'target') -> tuple:
    """
    Get a subset of the DataFrame, the corresponding slices from the total array, and the subset target dictionary.

    Parameters
    ----------
    df_total : pd.DataFrame
        The input DataFrame containing the data.
    array_total : np.ndarray
        The 3D array representing the entire dataset.
    target_dict_total : dict
        A dictionary mapping each target in the total dataset to its index along the third axis of `array_total`.
    target_list : list
        List of target values to filter the DataFrame and array.
    target_col : str, optional
        Column name in the DataFrame containing target values, by default 'target'.

    Returns
    -------
    tuple
        A tuple containing:
        - df_subset (pd.DataFrame): The filtered DataFrame.
        - array_subset (np.ndarray): An array with the first two dimensions of `array_total`, one channel per
          entry of `target_list`, and the same dtype as `array_total`. Channels for targets that are not
          keys of `target_dict_total` are left as zeros.
        - target_dict_subset (dict): The subset dictionary mapping each target in `target_list` to its
          position in the list.
    """

    # Filter the DataFrame based on target_list
    df_subset = df_total.loc[df_total[target_col].isin(target_list)]

    # Allocate the output with array_total's dtype so that slicing does not silently upcast
    # (np.zeros defaults to float64, eight times the size of an int8 array).
    array_subset = np.zeros(array_total.shape[:2] + (len(target_list),), dtype=array_total.dtype)

    # Copy the channel of every requested target that exists in the total array
    for position, target in enumerate(target_list):
        if target in target_dict_total:
            array_subset[:, :, position] = array_total[:, :, target_dict_total[target]]

    # Create the subset target dictionary
    target_dict_subset = {target: index for index, target in enumerate(target_list)}

    return df_subset, array_subset, target_dict_subset


if __name__ == "__main__":

    def compare_functions(df_total, array_total, target_dict_total, target_list):
        """
        Time both subset implementations on the given inputs and compare their results.

        Returns
        -------
        tuple
            (time for get_subset_arrays, time for get_subset_arrays_V1, whether the DataFrames are equal,
            whether the target dictionaries are equal, whether the arrays are equal). Each time is the
            total for 100 calls, in seconds.
        """

        # Time callables that close over the arguments, so the benchmark measures the data
        # passed to this function rather than whatever happens to live in __main__.
        def run_sliced():
            return get_subset_arrays(df_total, array_total, target_dict_total, target_list)

        def run_rebuilt():
            return get_subset_arrays_V1(df_total, target_list)

        time_V1 = timeit.timeit(run_sliced, number=100)
        time_V2 = timeit.timeit(run_rebuilt, number=100)

        result_V1 = run_sliced()
        result_V2 = run_rebuilt()

        # Both functions return (df_subset, array_subset, target_dict_subset)
        df_equal = result_V1[0].equals(result_V2[0])
        target_dict_equal = result_V1[2] == result_V2[2]
        arrays_equal = np.array_equal(result_V1[1], result_V2[1])

        return time_V1, time_V2, df_equal, target_dict_equal, arrays_equal

    # Sample data for testing
    data = {'X': np.random.randint(0, 10, size=1000),
            'Y': np.random.randint(0, 10, size=1000),
            'target': np.random.choice(['target1', 'target2', 'target3'], size=1000)}
    df_total = pd.DataFrame(data)

    target_dict_total = {target: index for index, target in enumerate(df_total['target'].unique())}
    height, width = df_total['X'].max() + 1, df_total['Y'].max() + 1
    array_total = transform_df_to_array(df=df_total, target_dict=target_dict_total,
                                        array_shape=(height, width, len(target_dict_total))).astype(np.int8)

    target_list = ['target1', 'target2']

    time_V1, time_V2, df_equal, target_dict_equal, arrays_equal = compare_functions(df_total, array_total,
                                                                                    target_dict_total, target_list)

    print(f"Execution time for get_subset_arrays: {time_V1:.6f} seconds")
    print(f"Execution time for get_subset_arrays_V1: {time_V2:.6f} seconds")
    print(f"DataFrames are equal: {df_equal}")
    print(f"Target dictionaries are equal: {target_dict_equal}")
    print(f"Arrays are equal: {arrays_equal}")

    # Sample output recorded from an earlier run (data is random, so timings will differ):
    #
    #   Execution time for get_subset_arrays: 0.028953 seconds        -----  !!!!!!!
    #   Execution time for get_subset_arrays_V1: 0.087730 seconds
    #   DataFrames are equal: True
    #   Target dictionaries are equal: True
    #   Arrays are equal: True