Module brevettiai.data.tf_utils
Expand source code
import json
import tensorflow as tf
import numpy as np
from tensorflow.python.data.util import nest
class TfEncoder(json.JSONEncoder):
    def default(self, obj):
        if tf.is_tensor(obj):
            return obj.numpy()  # re-encoded on the next pass (ndarray, bytes, ...)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return obj.decode()
        elif isinstance(obj, np.generic):
            return obj.item()
        return json.JSONEncoder.default(self, obj)
def unpack(obj):
    # Check RaggedTensor first: tf.is_tensor() also matches ragged tensors,
    # so the ragged branch would otherwise be unreachable.
    if isinstance(obj, tf.RaggedTensor):
        return unpack(obj.to_list())
    elif tf.is_tensor(obj):
        return unpack(obj.numpy())
    elif isinstance(obj, (np.ndarray, list)):
        return tuple(unpack(x) for x in obj)
    elif isinstance(obj, bytes):
        return unpack(obj.decode())
    elif isinstance(obj, np.generic):
        return unpack(obj.item())
    return obj
class NumpyStringIterator:
    """Iterator over a dataset with elements converted to numpy and strings decoded"""

    def __init__(self, dataset):
        self._iterator = iter(dataset)

    def __iter__(self):
        return self

    @staticmethod
    def parser(x):
        try:
            v = x.numpy()
            if x.dtype == tf.string:
                try:
                    return v.astype(str, copy=False)  # ndarray of strings
                except AttributeError:  # scalar tensor: numpy() gave bytes
                    return v.decode()
                except UnicodeDecodeError:  # binary payload, keep raw bytes
                    return v
            else:
                return v
        except Exception:  # not a plain tensor (e.g. ragged); fall back to unpack
            return unpack(x)

    def __next__(self):
        return nest.map_structure(self.parser, next(self._iterator))
def dataset_from_pandas(df):
    """
    Build a tensorflow generator dataset from a pandas dataframe, allowing tuples of different sizes in each sample
    :param df: source dataframe; copied so later mutations do not affect the dataset
    :return: tf.data.Dataset yielding one dict per row
    """
    df = df.copy()

    def sampler():
        for k, row in df.iterrows():
            yield {**row}

    ds = tf.data.Dataset.from_generator(sampler, **tf_dataset_metadata(df))
    return ds
def tf_dataset_metadata(df):
    """
    Generate tf dataset metadata-object from pandas dataframe
    :param df: source dataframe
    :return: dict with output_types and output_shapes, as accepted by tf.data.Dataset.from_generator
    """
    tftypes = (tf.float16, tf.float32, tf.float64,
               tf.int8, tf.int16, tf.int32, tf.int64,
               tf.uint8, tf.uint16, tf.uint32, tf.uint64)
    dtypes = {}
    shapes = {}
    for c in df:
        col = df[c]

        # Direct dtype/shape assignment via pandas
        dtype = col.dtype.type
        for tftype in tftypes:
            if dtype == tftype:
                dtypes[c] = tftype
                shapes[c] = ()

        first = col.values[0]

        # Numpy ndarray dtype/shape inference
        if isinstance(first, np.ndarray):
            tpold = None
            for tp in set(v.dtype for v in col.values):
                tpold = tpold or tp
                if np.can_cast(tp, tpold):
                    continue
                elif np.can_cast(tpold, tp):
                    tpold = tp
                else:
                    raise AssertionError("dtypes do not match %s; %s, %s" % (c, tp, tpold))
            for tftype in tftypes:
                if tpold == tftype.as_numpy_dtype:
                    dtypes[c] = tftype
            for shape in set(v.shape for v in col.values):
                shold = shapes.setdefault(c, shape)
                if shold != shape:
                    assert len(shape) == len(shold), \
                        "Shapes in input generator [%s] do not match; %s, %s" % (c, shape, shold)
                    # Dimensions that differ across rows become None (unknown size)
                    shapes[c] = tuple(x if x == y else None for x, y in zip(shold, shape))

        # Tuple shape/dtype inference
        if isinstance(first, tuple):
            for length in set(len(v) for v in col.values):
                shold = shapes.setdefault(c, (length,))
                # Tuples of differing lengths get an unknown (None) dimension
                shapes[c] = shold if shold == (length,) else (None,)
            tpold = None
            for tp in set(np.dtype(type(v[0])) if len(v) != 0 else None for v in col.values):
                tpold = tpold or tp
                if tpold is not None and tp is not None:
                    if np.can_cast(tp, tpold):
                        continue
                    elif np.can_cast(tpold, tp):
                        tpold = tp
                    else:
                        raise AssertionError("dtypes do not match %s; %s, %s" % (c, tp, tpold))
            for tftype in tftypes:
                if tpold == tftype.as_numpy_dtype:
                    dtypes[c] = tftype

        # Fallback: anything unrecognized is treated as a scalar string
        if c not in dtypes:
            dtypes[c] = tf.string
        if c not in shapes:
            shapes[c] = ()
    return dict(output_types=dtypes, output_shapes=shapes)
def fill_empty(df):
    # Placeholder values standing in for missing or ill-typed entries
    set_empty_list = {
        bool: -1,
        int: -1,
        str: "N/A",
        float: np.nan,
        np.int32: -1,
        np.int64: -1,
        np.float64: np.float64(np.nan),
        np.float32: np.float32(np.nan)
    }
    df_out = df.copy()
    replace_summary = df.copy()
    for col_name in df_out.columns:
        col = df_out[col_name]
        valid_col = col[~col.isna()]
        first = valid_col.values[0] if len(valid_col) else col.values[0]
        dtype = type(first)
        if dtype in (tuple, list):
            # Element type taken from the last truthy element of the first sample
            elem_dtype = str
            for elem in first:
                if elem:
                    elem_dtype = type(elem)
            shape = len(first)
            prototype = dtype([set_empty_list[elem_dtype]] * shape)
        elif dtype is np.ndarray:
            fill_val = set_empty_list[first.dtype.type]
            prototype = (np.ones_like(first) * fill_val).astype(first.dtype)
        else:
            elem_dtype = valid_col.apply(type).iloc[0] if len(valid_col) else col.apply(type).iloc[0]
            prototype = set_empty_list.get(elem_dtype, "Unknown Type")

        def mapper(c):
            """Map invalid objects to prototype"""
            if isinstance(c, type(prototype)) or prototype == "Unknown Type":
                return c
            return prototype

        df_out[col_name] = col.apply(mapper)
        replace_summary[col_name] = col.apply(lambda c: not isinstance(c, type(prototype)))
        if replace_summary[col_name].sum() > 0:
            print("Replacing: ", col_name, replace_summary[col_name].sum())
    return df_out, replace_summary
Functions
def dataset_from_pandas(df)
-
Build a tensorflow generator dataset from a pandas dataframe, allowing tuples of different sizes in each sample. Returns a tf.data.Dataset yielding one dict per row.
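For example (a hedged sketch; the column names and values are illustrative, not from the library):

    import pandas as pd
    from brevettiai.data.tf_utils import dataset_from_pandas

    df = pd.DataFrame({
        "path": ["a.png", "b.png"],        # object column of str -> tf.string
        "label": [0, 1],                   # int64 column -> tf.int64
        "bbox": [(0, 0, 10, 10), (5, 5)],  # tuples of differing length -> shape (None,)
    })
    ds = dataset_from_pandas(df)
    for sample in ds.take(1):
        print(sample["label"])  # tf.Tensor(0, shape=(), dtype=int64)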
def fill_empty(df)
-
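The function has no docstring; reading the source, it replaces missing or ill-typed entries in each column with a type-appropriate placeholder (-1 for ints, "N/A" for strings, NaN for floats) and returns the filled frame together with a boolean summary of what was replaced. A minimal sketch with illustrative data:

    import numpy as np
    import pandas as pd
    from brevettiai.data.tf_utils import fill_empty

    df = pd.DataFrame({"name": ["a", np.nan, "c"]})
    filled, summary = fill_empty(df)   # prints "Replacing:  name 1"
    print(filled["name"].tolist())     # ['a', 'N/A', 'c']
    print(summary["name"].tolist())    # [False, True, False]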
def tf_dataset_metadata(df)
-
Generate a tf dataset metadata object (output_types and output_shapes) from a pandas dataframe.
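The returned dict can be splatted straight into tf.data.Dataset.from_generator. A sketch with illustrative columns:

    import pandas as pd
    from brevettiai.data.tf_utils import tf_dataset_metadata

    df = pd.DataFrame({"x": [1.0, 2.0], "name": ["a", "b"]})
    meta = tf_dataset_metadata(df)
    # Roughly: {'output_types': {'x': tf.float64, 'name': tf.string},
    #           'output_shapes': {'x': (), 'name': ()}}
    print(meta)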
def unpack(obj)
-
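unpack recursively converts tensors, ragged tensors, numpy arrays, lists, bytes and numpy scalars into plain Python values (tuples, strings, numbers). For instance:

    import numpy as np
    import tensorflow as tf
    from brevettiai.data.tf_utils import unpack

    unpack(tf.constant([b"a", b"b"]))  # ('a', 'b')
    unpack(np.float32(1.5))            # 1.5
    unpack([np.array([1, 2]), b"x"])   # ((1, 2), 'x')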
Classes
class NumpyStringIterator (dataset)
-
Iterator over a dataset with elements converted to numpy and strings decoded
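Typical use is wrapping an eager tf.data.Dataset so that iteration yields numpy values with strings already decoded (the dataset contents below are illustrative):

    import tensorflow as tf
    from brevettiai.data.tf_utils import NumpyStringIterator

    ds = tf.data.Dataset.from_tensor_slices({"path": ["a.png", "b.png"], "y": [0, 1]})
    for sample in NumpyStringIterator(ds):
        print(sample["path"], sample["y"])  # a.png 0 / b.png 1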
Static methods
def parser(x)
-
class TfEncoder (*, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, default=None)
-
Extensible JSON http://json.org encoder for Python data structures.
Supports the following objects and types by default:
+-------------------+---------------+
| Python            | JSON          |
+===================+===============+
| dict              | object        |
+-------------------+---------------+
| list, tuple       | array         |
+-------------------+---------------+
| str               | string        |
+-------------------+---------------+
| int, float        | number        |
+-------------------+---------------+
| True              | true          |
+-------------------+---------------+
| False             | false         |
+-------------------+---------------+
| None              | null          |
+-------------------+---------------+
To extend this to recognize other objects, subclass and implement a .default() method with another method that returns a serializable object for o if possible, otherwise it should call the superclass implementation (to raise TypeError).

Constructor for JSONEncoder, with sensible defaults.
If skipkeys is false, then it is a TypeError to attempt encoding of keys that are not str, int, float or None. If skipkeys is True, such items are simply skipped.
If ensure_ascii is true, the output is guaranteed to be str objects with all incoming non-ASCII characters escaped. If ensure_ascii is false, the output can contain non-ASCII characters.
If check_circular is true, then lists, dicts, and custom encoded objects will be checked for circular references during encoding to prevent an infinite recursion (which would cause an OverflowError). Otherwise, no such check takes place.
If allow_nan is true, then NaN, Infinity, and -Infinity will be encoded as such. This behavior is not JSON specification compliant, but is consistent with most JavaScript based encoders and decoders. Otherwise, it will be a ValueError to encode such floats.
If sort_keys is true, then the output of dictionaries will be sorted by key; this is useful for regression tests to ensure that JSON serializations can be compared on a day-to-day basis.
If indent is a non-negative integer, then JSON array elements and object members will be pretty-printed with that indent level. An indent level of 0 will only insert newlines. None is the most compact representation.
If specified, separators should be an (item_separator, key_separator) tuple. The default is (', ', ': ') if indent is None and (',', ': ') otherwise. To get the most compact JSON representation, you should specify (',', ':') to eliminate whitespace.

If specified, default is a function that gets called for objects that can't otherwise be serialized. It should return a JSON encodable version of the object or raise a TypeError.
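The encoder plugs into the standard json API through the cls argument; a small sketch:

    import json
    import numpy as np
    import tensorflow as tf
    from brevettiai.data.tf_utils import TfEncoder

    json.dumps({"t": tf.constant([1, 2]), "s": np.float32(0.5)}, cls=TfEncoder)
    # '{"t": [1, 2], "s": 0.5}'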
Ancestors
- json.encoder.JSONEncoder
Methods
def default(self, obj)
-
Implement this method in a subclass such that it returns a serializable object for o, or calls the base implementation (to raise a TypeError).

For example, to support arbitrary iterators, you could implement default like this::
    def default(self, o):
        try:
            iterable = iter(o)
        except TypeError:
            pass
        else:
            return list(iterable)
        # Let the base class default method raise the TypeError
        return JSONEncoder.default(self, o)