Module `brevettiai.utils.pandas_utils`

Expand source code

import pandas as pd
def _is_empty_list_like(value):
    """Return True when ``value`` is a sized list-like (list, tuple, set, array, ...) without items."""
    return pd.api.types.is_list_like(value) and hasattr(value, "__len__") and len(value) == 0


def _elementwise(frame, func):
    """Apply the boolean predicate ``func`` to every cell of ``frame``; returns a boolean frame."""
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1. The method is looked up on
    # the class so that a column which happens to be named "map" cannot shadow it.
    cell_map = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    # Casting to object first sends extension dtypes (e.g. categoricals) through the plain
    # per-cell path, so the result is always a simple boolean column.
    return cell_map(frame.astype(object), func).astype(bool)


def explode(df, on=None, fillna="N/A", duplicate_id="id", keep_empty=True):
    """
    Explode all explodable columns in dataframe, see: pd.DataFrame.explode

    A column is explodable when it is selected by ``on``, all of its values are hashable and at
    least one value is list-like (e.g. tuples). Columns holding unhashable values such as lists
    are left untouched.

    Count unique items by grouping on all columns, counting each group size, then dropping duplicate ids
    df.groupby(df.columns.tolist()).size().reset_index(name="count").drop_duplicates("id")["count"].sum()

    The input dataframe is never modified; a new dataframe is returned whenever anything changes.

    :param df: dataframe to explode
    :param on: column label or list of column labels to explode and group on, all columns if None or empty
    :param fillna: fill NA's with the following value (only applied when duplicate_id is not None)
    :param duplicate_id: column on return df to set group duplication id or None to avoid grouping
    :param keep_empty: keep empty lists as NAN rows
    :return: see: pd.DataFrame.explode
    """
    # Normalize `on` to a list of labels without relying on the truth value of array-likes
    if pd.api.types.is_list_like(on):
        on = list(on)
    elif on:
        on = [on]  # single column label
    else:
        on = []  # None (or another falsy scalar) selects every column
    on = on or df.columns.tolist()

    if keep_empty:
        # Empty list-likes are replaced by NA so their rows are kept during the explosion.
        # The emptiness test is done per cell, as DataFrame.fillna with a dummy value is rejected
        # by categorical columns. DataFrame.mask returns a new frame, so the input stays untouched.
        df = df.mask(_elementwise(df, _is_empty_list_like))
    elif duplicate_id is not None:
        # The grouping step below writes to the frame; work on a copy to protect the caller's data
        df = df.copy()

    subset = df[on]
    all_hashable = _elementwise(subset, pd.api.types.is_hashable).all(axis=0)
    any_list_like = _elementwise(subset, pd.api.types.is_list_like).any(axis=0)
    mask = all_hashable & any_list_like
    explodable = mask[mask].index.tolist()

    if duplicate_id is not None:
        # nan must be filled to be grouped
        for x in df.select_dtypes("category"):
            if fillna not in df[x].cat.categories:
                # add_categories returns a new Series (the inplace option was removed in pandas 2.0)
                df[x] = df[x].cat.add_categories(fillna)
        df = df.fillna(fillna)
        df[duplicate_id] = df.groupby(on).ngroup()

    for c in explodable:
        df = df.explode(c)

    return df

Functions

def explode(df, on=None, fillna='N/A', duplicate_id='id', keep_empty=True)

Explode all explodable columns in dataframe, see: pd.DataFrame.explode

Count unique items by grouping on all columns, counting each group size, then dropping duplicate ids:

    df.groupby(df.columns.tolist()).size().reset_index(name="count").drop_duplicates("id")["count"].sum()

:param df:
:param on: explode on columns
:param fillna: fill NA's with the following value
:param duplicate_id: column on return df to set group duplication id or None to avoid grouping
:param keep_empty: keep empty lists as NAN rows
:return: see: pd.DataFrame.explode

Expand source code

def _is_empty_list_like(value):
    """Return True when ``value`` is a sized list-like (list, tuple, set, array, ...) without items."""
    return pd.api.types.is_list_like(value) and hasattr(value, "__len__") and len(value) == 0


def _elementwise(frame, func):
    """Apply the boolean predicate ``func`` to every cell of ``frame``; returns a boolean frame."""
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1. The method is looked up on
    # the class so that a column which happens to be named "map" cannot shadow it.
    cell_map = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    # Casting to object first sends extension dtypes (e.g. categoricals) through the plain
    # per-cell path, so the result is always a simple boolean column.
    return cell_map(frame.astype(object), func).astype(bool)


def explode(df, on=None, fillna="N/A", duplicate_id="id", keep_empty=True):
    """
    Explode all explodable columns in dataframe, see: pd.DataFrame.explode

    A column is explodable when it is selected by ``on``, all of its values are hashable and at
    least one value is list-like (e.g. tuples). Columns holding unhashable values such as lists
    are left untouched.

    Count unique items by grouping on all columns, counting each group size, then dropping duplicate ids
    df.groupby(df.columns.tolist()).size().reset_index(name="count").drop_duplicates("id")["count"].sum()

    The input dataframe is never modified; a new dataframe is returned whenever anything changes.

    :param df: dataframe to explode
    :param on: column label or list of column labels to explode and group on, all columns if None or empty
    :param fillna: fill NA's with the following value (only applied when duplicate_id is not None)
    :param duplicate_id: column on return df to set group duplication id or None to avoid grouping
    :param keep_empty: keep empty lists as NAN rows
    :return: see: pd.DataFrame.explode
    """
    # Normalize `on` to a list of labels without relying on the truth value of array-likes
    if pd.api.types.is_list_like(on):
        on = list(on)
    elif on:
        on = [on]  # single column label
    else:
        on = []  # None (or another falsy scalar) selects every column
    on = on or df.columns.tolist()

    if keep_empty:
        # Empty list-likes are replaced by NA so their rows are kept during the explosion.
        # The emptiness test is done per cell, as DataFrame.fillna with a dummy value is rejected
        # by categorical columns. DataFrame.mask returns a new frame, so the input stays untouched.
        df = df.mask(_elementwise(df, _is_empty_list_like))
    elif duplicate_id is not None:
        # The grouping step below writes to the frame; work on a copy to protect the caller's data
        df = df.copy()

    subset = df[on]
    all_hashable = _elementwise(subset, pd.api.types.is_hashable).all(axis=0)
    any_list_like = _elementwise(subset, pd.api.types.is_list_like).any(axis=0)
    mask = all_hashable & any_list_like
    explodable = mask[mask].index.tolist()

    if duplicate_id is not None:
        # nan must be filled to be grouped
        for x in df.select_dtypes("category"):
            if fillna not in df[x].cat.categories:
                # add_categories returns a new Series (the inplace option was removed in pandas 2.0)
                df[x] = df[x].cat.add_categories(fillna)
        df = df.fillna(fillna)
        df[duplicate_id] = df.groupby(on).ngroup()

    for c in explodable:
        df = df.explode(c)

    return df