Module brevettiai.data.tf_utils
Expand source code
import json
import tensorflow as tf
import numpy as np
from tensorflow.python.data.util import nest
class TfEncoder(json.JSONEncoder):
    def default(self, obj):
        if tf.is_tensor(obj):
            return obj.numpy()  # re-encoded on the next pass (ndarray, bytes, ...)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return obj.decode()
        elif isinstance(obj, np.generic):
            return obj.item()
        return json.JSONEncoder.default(self, obj)
def unpack(obj):
    # Check RaggedTensor first: tf.is_tensor() also matches ragged tensors,
    # so the ragged branch would otherwise be unreachable.
    if isinstance(obj, tf.RaggedTensor):
        return unpack(obj.to_list())
    elif tf.is_tensor(obj):
        return unpack(obj.numpy())
    elif isinstance(obj, (np.ndarray, list)):
        return tuple(unpack(x) for x in obj)
    elif isinstance(obj, bytes):
        return unpack(obj.decode())
    elif isinstance(obj, np.generic):
        return unpack(obj.item())
    return obj
class NumpyStringIterator:
    """Iterator over a dataset with elements converted to numpy and strings decoded"""

    def __init__(self, dataset):
        self._iterator = iter(dataset)

    def __iter__(self):
        return self

    @staticmethod
    def parser(x):
        try:
            v = x.numpy()
            if x.dtype == tf.string:
                try:
                    return v.astype(str, copy=False)  # ndarray of strings
                except AttributeError:  # scalar tensor: numpy() gave bytes
                    return v.decode()
                except UnicodeDecodeError:  # binary payload, keep raw bytes
                    return v
            else:
                return v
        except Exception:  # not a plain tensor (e.g. ragged); fall back to unpack
            return unpack(x)

    def __next__(self):
        return nest.map_structure(self.parser, next(self._iterator))
def dataset_from_pandas(df):
    """
    Build a tensorflow generator dataset from a pandas dataframe, allowing tuples of different sizes in each sample
    :param df: source dataframe; copied so later mutations do not affect the dataset
    :return: tf.data.Dataset yielding one dict per row
    """
    df = df.copy()

    def sampler():
        for k, row in df.iterrows():
            yield {**row}

    ds = tf.data.Dataset.from_generator(sampler, **tf_dataset_metadata(df))
    return ds
def tf_dataset_metadata(df):
    """
    Generate tf dataset metadata-object from pandas dataframe
    :param df: source dataframe
    :return: dict with output_types and output_shapes, as accepted by tf.data.Dataset.from_generator
    """
    tftypes = (tf.float16, tf.float32, tf.float64,
               tf.int8, tf.int16, tf.int32, tf.int64,
               tf.uint8, tf.uint16, tf.uint32, tf.uint64)
    dtypes = {}
    shapes = {}
    for c in df:
        col = df[c]

        # Direct dtype/shape assignment via pandas
        dtype = col.dtype.type
        for tftype in tftypes:
            if dtype == tftype:
                dtypes[c] = tftype
                shapes[c] = ()

        first = col.values[0]

        # Numpy ndarray dtype/shape inference
        if isinstance(first, np.ndarray):
            tpold = None
            for tp in set(v.dtype for v in col.values):
                tpold = tpold or tp
                if np.can_cast(tp, tpold):
                    continue
                elif np.can_cast(tpold, tp):
                    tpold = tp
                else:
                    raise AssertionError("dtypes do not match %s; %s, %s" % (c, tp, tpold))
            for tftype in tftypes:
                if tpold == tftype.as_numpy_dtype:
                    dtypes[c] = tftype
            for shape in set(v.shape for v in col.values):
                shold = shapes.setdefault(c, shape)
                if shold != shape:
                    assert len(shape) == len(shold), \
                        "Shapes in input generator [%s] do not match; %s, %s" % (c, shape, shold)
                    # Dimensions that differ across rows become None (unknown size)
                    shapes[c] = tuple(x if x == y else None for x, y in zip(shold, shape))

        # Tuple shape/dtype inference
        if isinstance(first, tuple):
            for length in set(len(v) for v in col.values):
                shold = shapes.setdefault(c, (length,))
                # Tuples of differing lengths get an unknown (None) dimension
                shapes[c] = shold if shold == (length,) else (None,)
            tpold = None
            for tp in set(np.dtype(type(v[0])) if len(v) != 0 else None for v in col.values):
                tpold = tpold or tp
                if tpold is not None and tp is not None:
                    if np.can_cast(tp, tpold):
                        continue
                    elif np.can_cast(tpold, tp):
                        tpold = tp
                    else:
                        raise AssertionError("dtypes do not match %s; %s, %s" % (c, tp, tpold))
            for tftype in tftypes:
                if tpold == tftype.as_numpy_dtype:
                    dtypes[c] = tftype

        # Fallback: anything unrecognized is treated as a scalar string
        if c not in dtypes:
            dtypes[c] = tf.string
        if c not in shapes:
            shapes[c] = ()
    return dict(output_types=dtypes, output_shapes=shapes)
def fill_empty(df):
    # Placeholder values standing in for missing or ill-typed entries
    set_empty_list = {
        bool: -1,
        int: -1,
        str: "N/A",
        float: np.nan,
        np.int32: -1,
        np.int64: -1,
        np.float64: np.float64(np.nan),
        np.float32: np.float32(np.nan)
    }
    df_out = df.copy()
    replace_summary = df.copy()
    for col_name in df_out.columns:
        col = df_out[col_name]
        valid_col = col[~col.isna()]
        first = valid_col.values[0] if len(valid_col) else col.values[0]
        dtype = type(first)
        if dtype in (tuple, list):
            # Element type taken from the last truthy element of the first sample
            elem_dtype = str
            for elem in first:
                if elem:
                    elem_dtype = type(elem)
            shape = len(first)
            prototype = dtype([set_empty_list[elem_dtype]] * shape)
        elif dtype is np.ndarray:
            fill_val = set_empty_list[first.dtype.type]
            prototype = (np.ones_like(first) * fill_val).astype(first.dtype)
        else:
            elem_dtype = valid_col.apply(type).iloc[0] if len(valid_col) else col.apply(type).iloc[0]
            prototype = set_empty_list.get(elem_dtype, "Unknown Type")

        def mapper(c):
            """Map invalid objects to prototype"""
            if isinstance(c, type(prototype)) or prototype == "Unknown Type":
                return c
            return prototype

        df_out[col_name] = col.apply(mapper)
        replace_summary[col_name] = col.apply(lambda c: not isinstance(c, type(prototype)))
        if replace_summary[col_name].sum() > 0:
            print("Replacing: ", col_name, replace_summary[col_name].sum())
    return df_out, replace_summary
Functions
def dataset_from_pandas(df)
-
Build a tensorflow generator dataset from a pandas dataframe, allowing tuples of different sizes in each sample. Returns a tf.data.Dataset yielding one dict per row.
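For example (a hedged sketch; the column names and values are illustrative, not from the library):

    import pandas as pd
    from brevettiai.data.tf_utils import dataset_from_pandas

    df = pd.DataFrame({
        "path": ["a.png", "b.png"],        # object column of str -> tf.string
        "label": [0, 1],                   # int64 column -> tf.int64
        "bbox": [(0, 0, 10, 10), (5, 5)],  # tuples of differing length -> shape (None,)
    })
    ds = dataset_from_pandas(df)
    for sample in ds.take(1):
        print(sample["label"])  # tf.Tensor(0, shape=(), dtype=int64)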
def fill_empty(df)
-
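The function has no docstring; reading the source, it replaces missing or ill-typed entries in each column with a type-appropriate placeholder (-1 for ints, "N/A" for strings, NaN for floats) and returns the filled frame together with a boolean summary of what was replaced. A minimal sketch with illustrative data:

    import numpy as np
    import pandas as pd
    from brevettiai.data.tf_utils import fill_empty

    df = pd.DataFrame({"name": ["a", np.nan, "c"]})
    filled, summary = fill_empty(df)   # prints "Replacing:  name 1"
    print(filled["name"].tolist())     # ['a', 'N/A', 'c']
    print(summary["name"].tolist())    # [False, True, False]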
def tf_dataset_metadata(df)
-
Generate a tf dataset metadata object (output_types and output_shapes) from a pandas dataframe.
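The returned dict can be splatted straight into tf.data.Dataset.from_generator. A sketch with illustrative columns:

    import pandas as pd
    from brevettiai.data.tf_utils import tf_dataset_metadata

    df = pd.DataFrame({"x": [1.0, 2.0], "name": ["a", "b"]})
    meta = tf_dataset_metadata(df)
    # Roughly: {'output_types': {'x': tf.float64, 'name': tf.string},
    #           'output_shapes': {'x': (), 'name': ()}}
    print(meta)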
def unpack(obj)
-
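unpack recursively converts tensors, ragged tensors, numpy arrays, lists, bytes and numpy scalars into plain Python values (tuples, strings, numbers). For instance:

    import numpy as np
    import tensorflow as tf
    from brevettiai.data.tf_utils import unpack

    unpack(tf.constant([b"a", b"b"]))  # ('a', 'b')
    unpack(np.float32(1.5))            # 1.5
    unpack([np.array([1, 2]), b"x"])   # ((1, 2), 'x')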
Classes
class NumpyStringIterator (dataset)
-
Iterator over a dataset with elements converted to numpy and strings decoded
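Typical use is wrapping an eager tf.data.Dataset so that iteration yields numpy values with strings already decoded (the dataset contents below are illustrative):

    import tensorflow as tf
    from brevettiai.data.tf_utils import NumpyStringIterator

    ds = tf.data.Dataset.from_tensor_slices({"path": ["a.png", "b.png"], "y": [0, 1]})
    for sample in NumpyStringIterator(ds):
        print(sample["path"], sample["y"])  # a.png 0 / b.png 1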
Static methods
def parser(x)
-
class TfEncoder (*, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, default=None)
-
Extensible JSON http://json.org encoder for Python data structures.
Supports the following objects and types by default:
+-------------------+---------------+
| Python            | JSON          |
+===================+===============+
| dict              | object        |
+-------------------+---------------+
| list, tuple       | array         |
+-------------------+---------------+
| str               | string        |
+-------------------+---------------+
| int, float        | number        |
+-------------------+---------------+
| True              | true          |
+-------------------+---------------+
| False             | false         |
+-------------------+---------------+
| None              | null          |
+-------------------+---------------+
To extend this to recognize other objects, subclass and implement a .default() method with another method that returns a serializable object for o if possible, otherwise it should call the superclass implementation (to raise TypeError).

Constructor for JSONEncoder, with sensible defaults.
If skipkeys is false, then it is a TypeError to attempt encoding of keys that are not str, int, float or None. If skipkeys is True, such items are simply skipped.
If ensure_ascii is true, the output is guaranteed to be str objects with all incoming non-ASCII characters escaped. If ensure_ascii is false, the output can contain non-ASCII characters.
If check_circular is true, then lists, dicts, and custom encoded objects will be checked for circular references during encoding to prevent an infinite recursion (which would cause an OverflowError). Otherwise, no such check takes place.
If allow_nan is true, then NaN, Infinity, and -Infinity will be encoded as such. This behavior is not JSON specification compliant, but is consistent with most JavaScript based encoders and decoders. Otherwise, it will be a ValueError to encode such floats.
If sort_keys is true, then the output of dictionaries will be sorted by key; this is useful for regression tests to ensure that JSON serializations can be compared on a day-to-day basis.
If indent is a non-negative integer, then JSON array elements and object members will be pretty-printed with that indent level. An indent level of 0 will only insert newlines. None is the most compact representation.
If specified, separators should be an (item_separator, key_separator) tuple. The default is (', ', ': ') if indent is None and (',', ': ') otherwise. To get the most compact JSON representation, you should specify (',', ':') to eliminate whitespace.

If specified, default is a function that gets called for objects that can't otherwise be serialized. It should return a JSON encodable version of the object or raise a TypeError.
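The encoder plugs into the standard json API through the cls argument; a small sketch:

    import json
    import numpy as np
    import tensorflow as tf
    from brevettiai.data.tf_utils import TfEncoder

    json.dumps({"t": tf.constant([1, 2]), "s": np.float32(0.5)}, cls=TfEncoder)
    # '{"t": [1, 2], "s": 0.5}'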
Ancestors
- json.encoder.JSONEncoder
Methods
def default(self, obj)
-
Implement this method in a subclass such that it returns a serializable object for o, or calls the base implementation (to raise a TypeError).

For example, to support arbitrary iterators, you could implement default like this::
    def default(self, o):
        try:
            iterable = iter(o)
        except TypeError:
            pass
        else:
            return list(iterable)
        # Let the base class default method raise the TypeError
        return JSONEncoder.default(self, o)