Module brevettiai.data.sample_tools
Tools for filtering samples and associating meta-data and tags
Source code
"""
Tools for filtering samples and associating meta-data and tags
"""
import logging
import concurrent.futures
import pandas as pd
from brevettiai.platform.models import Tag
import brevettiai.interfaces.vue_schema_utils as vue
import numpy as np
log = logging.getLogger(__name__)
class BrevettiDatasetSamples(vue.VueSettingsModule):
    def __init__(self, classes: list = None, class_mapping: dict = None, annotations: dict = None,
                 calculate_md5: bool = False, walk: bool = True, samples_file_name: str = None,
                 contains_column: str = None, contains_regex: str = None):
        """
        :param classes: Force samples to be of the categories in this list
        :param class_mapping: dict of mapping from path to (category) class. See example for description
        :param annotations: boolean or dict; load annotation paths
        :param calculate_md5: Calculate the md5 checksum of each sample file
        :param walk: Walk the dataset folders to find samples; if False, load them from samples_file_name
        :param samples_file_name: Name of the samples file to load when walk is False
        :param contains_column: Column that must match contains_regex for a sample to be kept
        :param contains_regex: Regex applied to contains_column to filter samples
        """
        self.classes = classes or []
        self.class_mapping = class_mapping or {}
        self.annotations = annotations or {}
        self.calculate_md5 = calculate_md5
        self.walk = walk
        self.samples_file_name = samples_file_name or ""
        self.contains_column = contains_column or ""
        self.contains_regex = contains_regex or ""
    def get_image_samples(self, datasets, *args, **kwargs):
        """
        :param datasets: Datasets to collect image samples from
        :param sample_filter: Filter samples by regex
        :param annotations: boolean or dict; load annotation paths
        :param kwargs: Forwarded to each dataset's get_image_samples, overriding this object's settings
        :return: Dataframe of samples

        Mapping from path to category:
        Start from the leaf folder and work towards the dataset root folder. If a folder is in class_mapping,
        apply its key as the category. If no match is found, apply the leaf folder name.
        Example:
            class_mapping={
                "A": ["Tilted"],
                "B": ["Tilted"],
                "1": ["Crimp"]
            }
        If classes is set to a list/set of categories, the filter is applied after the mapping.
        """
        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
            futures = [executor.submit(ds.get_image_samples, *args, **{**self.__dict__, **kwargs})
                       for ds in sorted(datasets, key=lambda x: x.id)]
            return pd.concat([f.result() for f in futures]).reset_index(drop=True)
    def get_samples(self, datasets, walk=None, *args, **kwargs):
        """
        Utility function for getting samples across multiple datasets, either by walking the
        datasets or by loading the configured samples file
        :param datasets: Datasets to collect samples from
        :param walk: Overrides self.walk if not None
        :param args: Forwarded to get_image_samples when walking
        :param kwargs: Forwarded to get_image_samples when walking
        :return: Dataframe of samples
        """
        walk = walk if walk is not None else self.walk
        if walk:
            df = self.get_image_samples(datasets, *args, **kwargs)
        else:
            df = get_samples(datasets, self.samples_file_name)
        if self.contains_column:
            df = df[df[self.contains_column].str.contains(self.contains_regex, regex=True, na=False)]
        assert not df.empty, "No samples found"
        return df
def get_samples(datasets, target, *args, **kwargs):
    """
    Utility function for getting samples across multiple datasets by sample files
    :param datasets: Datasets to collect samples from
    :param target: Name of the samples file in each dataset
    :param args: Unused
    :param kwargs: Unused
    :return: Dataframe of samples
    """
    samples = (d.get_samples(target) for d in sorted(datasets, key=lambda x: x.id))
    return pd.concat(samples).reset_index(drop=True)
def save_samples(datasets, target, df):
    """Save the samples dataframe to each dataset under the given target name"""
    for d in datasets:
        d.save_samples(target, df)
def dataset_meta(datasets, tags):
    """
    Build dataset meta dataframe from datasets and tags tree
    :param datasets: Datasets with tags to collect metadata from
    :param tags: Root tag tree, used to find the full path of each tag
    :return: Dataframe of metadata indexed by dataset_id
    """
    # Collect all tags
    meta = []
    for dataset in datasets:
        for tag in dataset.tags:
            for path in Tag.find_path(tags, "id", tag.id):
                name = path[0].name
                key = "tag_" + path[0].id.replace(" ", "").replace('-', '_')
                value = path[1].name if len(path) > 1 else path[0].name
                meta.append({"dataset_id": dataset.id, "name": name, "key": key, "value": value})
    meta = pd.DataFrame.from_records(meta)

    # Pivot keys into columns and multivalues to tuples
    if len(meta):
        meta = meta.pivot_table(index="dataset_id", columns=["key", "name"], values="value", aggfunc=pd.unique)
        meta = meta.applymap(lambda x: tuple(x) if pd.api.types.is_list_like(x) else x)
    return meta
def join_dataset_meta(df, datasets, tags):
    """
    Join dataset metadata onto a sample dataframe
    :param df: sample dataframe with dataset_id to join on
    :param datasets: Dataset objects with metadata
    :param tags: tag root tree, to find parent tags
    :return: df, name/column_id dictionary
    """
    assert "dataset_id" in df.columns, "df must contain dataset_id to join dataset metadata"
    meta = dataset_meta(datasets, tags)
    if len(meta):
        df = df.join(meta.droplevel(1, axis=1), on="dataset_id", how="left")
    return df, dict(meta.columns)
def get_grid_bboxes(bbox, size, tile_size=(1024, 1024), overlap=128, num_tile_steps: int = 1, max_steps: int = -1):
    """
    Get tiled bounding boxes with overlaps; the last row/column will have a larger overlap to fit the image
    :param bbox: Bounding box (x1, y1, x2, y2) of the area to tile, clipped to size
    :param size: Size (width, height) of the full image
    :param tile_size: Size of each tile, scalar or (width, height)
    :param overlap: Overlap between neighbouring tiles, scalar or (width, height)
    :param num_tile_steps: Must be >= 1; only used by the commented-out alternative implementation below
    :param max_steps: Only used by the commented-out alternative implementation below
    :return: pd.Series with the grid shape (rows, cols), the tile bboxes, and the per-tile
             left/upper and right/lower crop offsets
    """
    assert num_tile_steps >= 1

    # Normalize scalar overlap and tile_size to (width, height) pairs
    if not hasattr(overlap, "__len__"):
        overlap = (overlap, overlap)
    if not hasattr(tile_size, "__len__"):
        tile_size = (tile_size, tile_size)
    tile_size = np.array(tile_size)

    # Ensure the used "bbox" is valid
    bbox = bbox.clip(min=(0, 0, 0, 0), max=(size[0], size[1], size[0], size[1]))

    w_offset = np.floor(tile_size[0] - overlap[0]).astype(int)
    h_offset = np.floor(tile_size[1] - overlap[1]).astype(int)
    img_width = bbox[2] - bbox[0]
    img_height = bbox[3] - bbox[1]
    assert img_width > 0
    assert img_height > 0

    # Extract width indexes
    if img_width > tile_size[0]:
        # Indexes should cover at least width - padding
        width = np.arange(0, img_width - overlap[0], w_offset)
        width = np.clip(width, 0, img_width - tile_size[0])
        lr_offset = np.pad((tile_size[0] - np.diff(width)) / 2, 1)
    else:
        width = np.array((0,))
        lr_offset = np.array((0, tile_size[0] - img_width))

    # Extract height indexes
    if img_height > tile_size[1]:
        height = np.arange(0, img_height - overlap[1], h_offset)
        height = np.clip(height, 0, img_height - tile_size[1])
        ud_offset = np.pad((tile_size[1] - np.diff(height)) / 2, 1)
    else:
        height = np.array((0,))
        ud_offset = np.array((0, tile_size[1] - img_height))

    # Find start coordinates of tiles, and move to bbox offset
    start_ix = np.stack([x.flatten() for x in np.meshgrid(width, height)], -1)
    start_ix = start_ix + bbox[:2]
    bboxes = np.concatenate((start_ix, start_ix + tile_size), 1)

    l_offset = np.ceil(lr_offset[:-1]).astype(int)
    u_offset = np.ceil(ud_offset[:-1]).astype(int)
    left_offsets = np.stack([x.flatten() for x in np.meshgrid(l_offset, u_offset)], -1)

    r_offset = tile_size[0] - np.floor(lr_offset[1:]).astype(int)
    d_offset = tile_size[1] - np.floor(ud_offset[1:]).astype(int)
    right_offsets = np.stack([x.flatten() for x in np.meshgrid(r_offset, d_offset)], -1)

    #
    # tile_fractional_step = tile_size // num_tile_steps
    # step_size = tile_size - 2 * overlap
    # if max_steps > 0:
    #     bbox[2] = min(bbox[2], bbox[0] - 2 * overlap + max_steps * step_size[0] - 1)
    #     bbox[3] = min(bbox[3], bbox[1] - 2 * overlap + max_steps * step_size[1] - 1)
    #
    # init_tile = np.array(bbox[:2]).clip(min=overlap)
    # end_tile = np.array([(vv + 1).clip(max=sz - overlap) for sz, vv in zip(size, bbox[2:])])
    #
    # tile_area = (end_tile - init_tile)
    #
    # full_tiles = (tile_area // step_size).clip(min=0)
    # fractional_tile = (tile_area - full_tiles * step_size).clip(min=0)
    # fractional_tile_apply = ((fractional_tile > 0) | (full_tiles == 0)).astype(int)
    # fractional_size = np.ceil((fractional_tile + 2 * overlap) / tile_fractional_step).astype(int) * tile_fractional_step
    #
    # # Build grid
    # grid_steps_full = [np.arange(ft + 1) * ss + iv for iv, ft, ss in zip(init_tile, full_tiles, step_size)]
    # grid_steps_fractional = [np.arange(frt) + fiv[-1] for fiv, frt in zip(grid_steps_full, fractional_tile_apply)]
    #
    # tiles_full = [np.array((gsf[:-1] - overlap, gsf[:-1] - overlap + ts)).T for gsf, ts in zip(grid_steps_full, tile_size)]
    # tiles_fractional = [np.hstack((gsf - overlap, gsf - overlap + fsize)).reshape(-1, 2)
    #                     for gsf, fsize in zip(grid_steps_fractional, fractional_size)]
    #
    # tiles_vectors = [np.vstack((v_fu, v_fr)) for v_fu, v_fr in zip(tiles_full, tiles_fractional)]
    # left_offsets = [np.ones(len(vv), dtype=int) * overlap for vv in tiles_vectors]
    # right_offsets = [vv[:, 1] - vv[:, 0] - overlap for vv in tiles_vectors]
    # for ii in range(2):
    #     left_offsets[ii][0] = bbox[ii] - tiles_vectors[ii][0, 0]
    #     right_offsets[ii][-1] = (tiles_vectors[ii][-1, 1] - tiles_vectors[ii][-1, 0]) - \
    #                             (tiles_vectors[ii][-1, 1] - (bbox[2 + ii] + 1)).clip(min=0)
    # bboxes = np.array([(v0[0], v1[0], v0[1], v1[1]) for v1 in tiles_vectors[1] for v0 in tiles_vectors[0]])
    # left_offsets = np.array([(lo0, lo1) for lo1 in left_offsets[1] for lo0 in left_offsets[0]])
    # right_offsets = np.array([(ro0, ro1) for ro1 in right_offsets[1] for ro0 in right_offsets[0]])

    return pd.Series([(len(height), len(width)), bboxes, left_offsets, right_offsets])
Functions
def dataset_meta(datasets, tags)
Build a dataset meta dataframe from datasets and a tags tree.
:param datasets: Datasets with tags to collect metadata from
:param tags: Root tag tree, used to find the full path of each tag
:return: Dataframe of metadata indexed by dataset_id
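A minimal sketch of the call, assuming Tag is a tree model with id, name and children fields (check brevettiai.platform.models for the exact schema) and using a SimpleNamespace stand-in for a platform dataset:

from types import SimpleNamespace
from brevettiai.platform.models import Tag
from brevettiai.data.sample_tools import dataset_meta

# Assumed Tag schema: id, name, children (verify against the platform models)
tags = [Tag(id="cam", name="Camera", children=[Tag(id="cam-1", name="Cam 1")])]

# Stand-in for a platform dataset; only .id and .tags are used here
ds = SimpleNamespace(id="dataset-a", tags=[Tag(id="cam-1", name="Cam 1")])

meta = dataset_meta([ds], tags)
# One row per dataset_id; a ("tag_cam", "Camera") column holding the child tag value(s)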
def get_grid_bboxes(bbox, size, tile_size=(1024, 1024), overlap=128, num_tile_steps: int = 1, max_steps: int = -1)
Get tiled bounding boxes with overlaps; the last row/column will have a larger overlap to fit the image.
:param bbox: Bounding box (x1, y1, x2, y2) of the area to tile, clipped to size
:param size: Size (width, height) of the full image
:param tile_size: Size of each tile, scalar or (width, height)
:param overlap: Overlap between neighbouring tiles, scalar or (width, height)
:return: pd.Series with the grid shape (rows, cols), the tile bboxes, and the per-tile left/upper and right/lower crop offsets
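A self-contained example of the tiling, with values worked out from the source above: a 2500x1800 region covered by 1024x1024 tiles with 128 px overlap yields a 2x3 grid, and the last row/column is shifted back to fit the image.

import numpy as np
from brevettiai.data.sample_tools import get_grid_bboxes

bbox = np.array([0, 0, 2500, 1800])  # x1, y1, x2, y2
grid_shape, bboxes, left_offsets, right_offsets = get_grid_bboxes(
    bbox, size=(2500, 1800), tile_size=(1024, 1024), overlap=128)

print(grid_shape)  # (2, 3): two rows of three tiles
print(bboxes[0])   # [   0    0 1024 1024] - first tile
print(bboxes[-1])  # [1476  776 2500 1800] - last tile shifted to stay inside the image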
def get_samples(datasets, target, *args, **kwargs)
Utility function for getting samples across multiple datasets by sample files.
:param datasets: Datasets to collect samples from
:param target: Name of the samples file in each dataset
:return: Dataframe of samples
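A hedged usage sketch with SimpleNamespace stand-ins; real platform datasets implement get_samples(target) themselves:

from types import SimpleNamespace
import pandas as pd
from brevettiai.data.sample_tools import get_samples

# Stand-ins for platform datasets; only .id and .get_samples(target) are used
ds_a = SimpleNamespace(id="a", get_samples=lambda target: pd.DataFrame({"path": ["a/1.png"]}))
ds_b = SimpleNamespace(id="b", get_samples=lambda target: pd.DataFrame({"path": ["b/1.png"]}))

df = get_samples([ds_b, ds_a], "samples.csv")
# Datasets are visited sorted by id, and the index is reset across them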
def join_dataset_meta(df, datasets, tags)
Join dataset metadata onto a sample dataframe.
:param df: sample dataframe with dataset_id to join on
:param datasets: Dataset objects with metadata
:param tags: tag root tree, to find parent tags
:return: df, name/column_id dictionary
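Continuing the dataset_meta sketch above (same assumed Tag schema and dataset stand-in), the join adds one column per root tag and returns a column-id to display-name mapping:

import pandas as pd
from types import SimpleNamespace
from brevettiai.platform.models import Tag
from brevettiai.data.sample_tools import join_dataset_meta

tags = [Tag(id="cam", name="Camera", children=[Tag(id="cam-1", name="Cam 1")])]
ds = SimpleNamespace(id="dataset-a", tags=[Tag(id="cam-1", name="Cam 1")])

df = pd.DataFrame({"dataset_id": ["dataset-a"], "path": ["dataset-a/img.png"]})
df, name_map = join_dataset_meta(df, [ds], tags)
# df gains a "tag_cam" column with the tag value(s); name_map == {"tag_cam": "Camera"}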
def save_samples(datasets, target, df)
Save the samples dataframe to each dataset under the given target name.
Classes
class BrevettiDatasetSamples (classes: list = None, class_mapping: dict = None, annotations: dict = None, calculate_md5: bool = False, walk: bool = True, samples_file_name: str = None, contains_column: str = None, contains_regex: str = None)
Base class for serializable modules.
:param classes: Force samples to be of the categories in this list
:param class_mapping: dict of mapping from path to (category) class. See example for description
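A hedged end-to-end sketch; the dataset stand-in only mimics the get_image_samples interface that real platform datasets provide:

from types import SimpleNamespace
import pandas as pd
from brevettiai.data.sample_tools import BrevettiDatasetSamples

settings = BrevettiDatasetSamples(contains_column="path", contains_regex=r"\.png$")

# Stand-in dataset accepting the settings forwarded as keyword arguments
ds = SimpleNamespace(
    id="a",
    get_image_samples=lambda **kw: pd.DataFrame({"path": ["a/Tilted/img.png", "a/notes.txt"]}))

df = settings.get_samples([ds])  # walk=True by default, so folders are walked
# Only "a/Tilted/img.png" survives the contains_regex filter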
Ancestors
brevettiai.interfaces.vue_schema_utils.VueSettingsModule
Methods
def get_image_samples(self, datasets, *args, **kwargs)
:param sample_filter: Filter samples by regex
:param annotations: boolean or dict; load annotation paths
:return: Dataframe of samples

Mapping from path to category: start from the leaf folder and work towards the dataset root folder. If a folder is in class_mapping, apply its key as the category. If no match is found, apply the leaf folder name. For example:

class_mapping={
    "A": ["Tilted"],
    "B": ["Tilted"],
    "1": ["Crimp"]
}

If classes is set to a list/set of categories, the filter is applied after the mapping.
def get_samples(self, datasets, walk=None, *args, **kwargs)
Utility function for getting samples across multiple datasets, either by walking the datasets (walk=True) or by loading the configured samples file.
:param datasets: Datasets to collect samples from
:param walk: Overrides self.walk if not None
:return: Dataframe of samples