Module brevettiai.utils.pandas_utils
Expand source code
import pandas as pd
def explode(df, on=None, fillna="N/A", duplicate_id="id", keep_empty=True):
    """
    Explode all explodable columns in a dataframe, see: pd.DataFrame.explode

    A column is explodable when it is selected by ``on``, every cell in it is
    hashable and at least one cell is list-like (e.g. a tuple).

    To count unique items afterwards, group on all columns, count each group
    size, then drop duplicate ids:
    df.groupby(df.columns.tolist()).size().reset_index(name="count").drop_duplicates("id")["count"].sum()

    The dataframe passed in is not modified.

    :param df: dataframe to explode
    :param on: column label or list of column labels to group and explode on,
        defaults to all columns
    :param fillna: value used to fill NAs before grouping (only applied when
        duplicate_id is not None)
    :param duplicate_id: column on the returned df holding the group
        duplication id, or None to avoid grouping
    :param keep_empty: convert empty list-likes to NA before exploding so they
        are kept as NA rows
    :return: see: pd.DataFrame.explode
    """
    is_list_like = pd.api.types.is_list_like

    def cellwise(frame, func):
        # Cast to object first so categorical columns are evaluated per cell
        # instead of per category; avoids the deprecated DataFrame.applymap.
        return frame.apply(lambda column: column.astype(object).map(func))

    def is_empty_list_like(value):
        if not is_list_like(value):
            return False
        try:
            return len(value) == 0
        except TypeError:
            # Unsized iterables (e.g. generators) cannot be shown to be empty
            return False

    if on is None or (is_list_like(on) and len(on) == 0):
        on = df.columns.tolist()
    elif is_list_like(on):
        on = list(on)
    else:
        # A single column label
        on = [on]

    # Ensure empty list-likes are converted to NA to keep them during explosion
    if keep_empty:
        df = df.mask(cellwise(df, is_empty_list_like))

    selected = df[on]
    mask = (
        cellwise(selected, pd.api.types.is_hashable).all(axis=0)
        & cellwise(selected, is_list_like).any(axis=0)
    )
    explodable = mask[mask].index.tolist()

    if duplicate_id is not None:
        # NA must be filled to be grouped; categorical columns can only be
        # filled with a value that is one of their categories.
        lacking = [
            column
            for column in df.select_dtypes("category").columns
            if fillna not in df[column].cat.categories
        ]
        if lacking:
            if not keep_empty:
                # df is still the caller's object here; do not write into it
                df = df.copy()
            for column in lacking:
                df[column] = df[column].cat.add_categories(fillna)
        df = df.fillna(fillna)
        df[duplicate_id] = df.groupby(on).ngroup()

    for column in explodable:
        df = df.explode(column)
    return df
Functions
def explode(df, on=None, fillna='N/A', duplicate_id='id', keep_empty=True)
-
Explode all explodable columns in a dataframe; see pd.DataFrame.explode.
To count unique items, group on all columns, count each group's size, then drop duplicate ids:
df.groupby(df.columns.tolist()).size().reset_index(name="count").drop_duplicates("id")["count"].sum()
:param df: dataframe to explode
:param on: columns to explode on
:param fillna: value used to fill NAs
:param duplicate_id: column on the returned df that holds the group duplication id, or None to avoid grouping
:param keep_empty: keep empty lists as NaN rows
:return: see pd.DataFrame.explode
Expand source code
def explode(df, on=None, fillna="N/A", duplicate_id="id", keep_empty=True):
    """
    Explode all explodable columns in a dataframe, see: pd.DataFrame.explode

    A column is explodable when it is selected by ``on``, every cell in it is
    hashable and at least one cell is list-like (e.g. a tuple).

    To count unique items afterwards, group on all columns, count each group
    size, then drop duplicate ids:
    df.groupby(df.columns.tolist()).size().reset_index(name="count").drop_duplicates("id")["count"].sum()

    The dataframe passed in is not modified.

    :param df: dataframe to explode
    :param on: column label or list of column labels to group and explode on,
        defaults to all columns
    :param fillna: value used to fill NAs before grouping (only applied when
        duplicate_id is not None)
    :param duplicate_id: column on the returned df holding the group
        duplication id, or None to avoid grouping
    :param keep_empty: convert empty list-likes to NA before exploding so they
        are kept as NA rows
    :return: see: pd.DataFrame.explode
    """
    is_list_like = pd.api.types.is_list_like

    def cellwise(frame, func):
        # Cast to object first so categorical columns are evaluated per cell
        # instead of per category; avoids the deprecated DataFrame.applymap.
        return frame.apply(lambda column: column.astype(object).map(func))

    def is_empty_list_like(value):
        if not is_list_like(value):
            return False
        try:
            return len(value) == 0
        except TypeError:
            # Unsized iterables (e.g. generators) cannot be shown to be empty
            return False

    if on is None or (is_list_like(on) and len(on) == 0):
        on = df.columns.tolist()
    elif is_list_like(on):
        on = list(on)
    else:
        # A single column label
        on = [on]

    # Ensure empty list-likes are converted to NA to keep them during explosion
    if keep_empty:
        df = df.mask(cellwise(df, is_empty_list_like))

    selected = df[on]
    mask = (
        cellwise(selected, pd.api.types.is_hashable).all(axis=0)
        & cellwise(selected, is_list_like).any(axis=0)
    )
    explodable = mask[mask].index.tolist()

    if duplicate_id is not None:
        # NA must be filled to be grouped; categorical columns can only be
        # filled with a value that is one of their categories.
        lacking = [
            column
            for column in df.select_dtypes("category").columns
            if fillna not in df[column].cat.categories
        ]
        if lacking:
            if not keep_empty:
                # df is still the caller's object here; do not write into it
                df = df.copy()
            for column in lacking:
                df[column] = df[column].cat.add_categories(fillna)
        df = df.fillna(fillna)
        df[duplicate_id] = df.groupby(on).ngroup()

    for column in explodable:
        df = df.explode(column)
    return df