Module brevettiai.tests.test_data

Source code
import unittest
from itertools import islice

import numpy as np
import pandas as pd

from brevettiai.data.data_generator import DataGenerator, OneHotEncoder
from brevettiai.data.sample_integrity import merge_sample_identification, SampleSplit
from brevettiai.utils.pandas_utils import explode


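# Tests covering DataGenerator ordering, batching, shuffling, repetition and weighted group sampling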
class TestDataGenerator(unittest.TestCase):
    n = 10
    samples = pd.DataFrame({
        "category": pd.Series(np.random.RandomState(20).randint(0, 3, size=n)).apply(lambda x: (x,)).astype("category"),
        "id": pd.Series(np.arange(n))
    })
    samples.category.cat.categories = [("A",), ("B",), ("A", "B")]
    samples["path"] = samples.id.map(str)

    def test_unshuffled_unbatched(self):
        ds = DataGenerator(self.samples)

        df = pd.DataFrame(ds.get_samples_numpy(batch=False))
        self.assertTrue((self.samples.id.values == df.id).all())

        df = pd.DataFrame(ds.get_samples_numpy(batch=False))
        self.assertTrue((self.samples.id.values == df.id).all())

    def test_unshuffled_batched(self, batch_size=3):
        ds = DataGenerator(self.samples[["id", "path"]], batch_size=batch_size)

        self.assertTrue(len(ds) == np.ceil(len(self.samples) / batch_size), "Epoch length mismatch")

        df = pd.DataFrame(ds.get_samples_numpy(batch=True))
        self.assertTrue(len(ds) == len(df))
        self.assertTrue(all(len(b) == batch_size for b in df.id[:-1]))

        residual = len(self.samples) % batch_size
        residual = batch_size if residual == 0 else residual
        self.assertTrue(len(df.id.iloc[-1]) == residual)

        df = pd.DataFrame(ds.get_samples_numpy(batch=False))
        self.assertTrue((self.samples.id.values == df.id).all())

    def test_shuffled(self, batch_size=3):
        ds = DataGenerator(self.samples[["id", "path"]], batch_size=batch_size, shuffle=True)

        self.assertTrue(len(ds) == np.ceil(len(self.samples) / batch_size), "Epoch length mismatch")

        df = pd.DataFrame(ds.get_samples_numpy(batch=False))
        self.assertFalse((self.samples.id == df.id).all(), "Samples out of order")
        self.assertTrue((df.id.value_counts() == self.samples.id.value_counts()).all(),
                        "All samples in set same amount of times")

        self.assertTrue(
            all(a["id"] == b["id"] for a, b in
                zip(ds.get_samples_numpy(batch=False), ds.get_dataset_numpy(batch=False))),
            "Shuffled samples must match across instances"
        )

        df1 = pd.DataFrame(DataGenerator(self.samples, shuffle=True, seed=120).get_samples_numpy(batch=False))
        df2 = pd.DataFrame(DataGenerator(self.samples, shuffle=True, seed=120).get_samples_numpy(batch=False))
        self.assertTrue((df1.id.values == df2.id.values).all(), "Seed should make sequence equal")

        df3 = pd.DataFrame(DataGenerator(self.samples, shuffle=True, seed=121).get_samples_numpy(batch=False))
        self.assertTrue((df1.id.values != df3.id.values).any(), "Different seeds should make sequence different")

    def test_repeated(self, repeat=2):
        ds = DataGenerator(self.samples, batch_size=1, repeat=repeat)

        self.assertTrue(len(ds) == len(self.samples), "Epoch length mismatch")

        df = pd.DataFrame(ds.get_samples_numpy(batch=False))
        self.assertTrue((np.tile(self.samples.id.values, repeat) == df.id).all(), "Samples repeated in order")
        self.assertTrue((df.id.value_counts() == (self.samples.id.value_counts() * repeat)).all(),
                        "All samples in set repeat times")

    def test_shuffle_repeated(self, repeat=2):
        ds = DataGenerator(self.samples, batch_size=1, repeat=repeat, shuffle=True)

        self.assertTrue(len(ds) == len(self.samples), "Epoch length mismatch")

        df = pd.DataFrame(ds.get_samples_numpy(batch=False))
        self.assertFalse((np.tile(self.samples.id.values, repeat) == df.id).all(), "Samples repeated in order")
        self.assertTrue((df.id.value_counts() == (self.samples.id.value_counts() * repeat)).all(),
                        "All samples in set repeat times")

    def test_sample_weighing_unshuffled(self):
        dfs = [[g[1]] * cnt for g, cnt in zip(self.samples.groupby("category"), [10, 3, 1])]
        dfs = pd.concat([item for sublist in dfs for item in sublist])
        dfs = dfs.sort_values("id")

        df = self._get_df_through_dataset(dfs, repeat=-1, shuffle=False, sampling_groupby=["category"])
        counts = df.category.value_counts()
        self.assertAlmostEqual(0, counts.std() / counts.mean(), delta=0.01, msg="Counts must match")
        for k, g in df.groupby("category"):
            self.assertTrue(all(g.id.iloc[0] == g.id[g.id.diff() < 0]), "All groups must be sampled in input order")

        df2 = self._get_df_through_dataset(dfs, repeat=-1, shuffle=False, sampling_groupby=["category"])
        self.assertTrue((df2.values == df.values).all(),
                        "Repeated unshuffled dataset should match, even when oversampling")

        self._compare_weighting(np.sqrt, dfs, shuffle=False)

    def test_sample_weighing(self):
        dfs = [[g[1]] * cnt for g, cnt in zip(self.samples.groupby("category"), [10, 3, 1])]
        dfs = pd.concat([item for sublist in dfs for item in sublist])
        dfs = dfs.sort_values("id")
        self._compare_weighting(np.sqrt, dfs, shuffle=True)
        self._compare_weighting(lambda x: 1, dfs, shuffle=True)
        self._compare_weighting(lambda x: x, dfs, shuffle=True)

    @staticmethod
    def _get_df_through_dataset(samples, **kwargs):
        ds = DataGenerator(samples, **kwargs)
        df = pd.DataFrame(islice(ds.get_samples_numpy(batch=False), 1000))
        df.category = df.category.apply(tuple)
        return df

    def _compare_weighting(self, func, samples, delta=0.08, **kwargs):
        df = self._get_df_through_dataset(samples, sampling_groupby=["category"], sampling_group_weighing=func,
                                          repeat=-1, **kwargs)
        target = samples.category.value_counts().apply(func)
        target = target / target.sum()
        error = ((df.category.value_counts() / len(df))[target.index] - target).abs()
        print(func, error.max())
        self.assertAlmostEqual(0, error.max(), delta=delta,
                               msg=f"All groups must be sampled in correct amount for func {func}")


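# Tests covering merge_sample_identification: transferring purposes onto a sample frame by etag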
class TestSampleIdentification(unittest.TestCase):
    samples = pd.DataFrame({
        "etag": ["1", "2", "3", "4", "5"],
    })

    sample_id = pd.DataFrame({
        "purpose": ["test", "test", "train", "train", pd.NA]
    }, index=["5", "2", "1", "4", "3"])

    def test_merge_sample_identification(self):
        df, known_ids = merge_sample_identification(self.samples, self.sample_id)

        pd.testing.assert_series_equal(df.etag, self.samples.etag)
        pd.testing.assert_series_equal(self.sample_id.purpose[df.etag].reset_index(drop=True),
                                       df.purpose.reset_index(drop=True))

        dfid = self.sample_id
        dfid.purpose = ["test", pd.NA, "train", "train", "new"]
        df2, known_ids = merge_sample_identification(df.copy(), self.sample_id)
        reord = self.sample_id.purpose[df.etag].reset_index(drop=True)

        # Check that NA values in the identification frame are not changed
        pd.testing.assert_frame_equal(df2.set_index("etag").loc[dfid[dfid.purpose.isna()].index],
                                      df.set_index("etag").loc[dfid[dfid.purpose.isna()].index])

        # Check that non-NA values are transferred
        pd.testing.assert_series_equal(reord[~reord.isna()],
                                       df2.purpose.reset_index(drop=True)[~reord.isna()])


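# Tests covering SampleSplit purpose assignment: seeding, stratification, uniqueness and murmurhash3 mode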
class TestDataPurposeAssignment(unittest.TestCase):
    samples = pd.DataFrame({
        "etag": np.arange(10000),
        "path": [str(i) for i in range(10000)],
        "group1": np.random.randint(0, 5, 10000),
        "group2": np.random.randint(0, 5, 10000),
        "group3": np.random.randint(0, 5, 10000),
    })

    def test_basic_assignment(self):
        split = np.random.rand()
        df = SampleSplit(split=split).assign(self.samples.copy(), remainder="test")
        df2 = SampleSplit(split=split, seed=1234).assign(self.samples.copy(), remainder="test")
        df3 = SampleSplit(split=split, seed=1234).assign(self.samples.copy(), remainder="test")
        self.assertAlmostEqual((df.purpose == "train").mean(), split, delta=0.001, msg="Split should match")

        self.assertFalse((df.purpose == df2.purpose).all(), "Unseeded should result in different assignment")
        self.assertTrue((df3.purpose == df2.purpose).all(), "Seeding should result in equal assignment")

    def test_basic_assignment_mmh3(self):
        split = np.random.rand()
        mmhmode = SampleSplit.MODE_MURMURHASH3
        df = SampleSplit(split=split, mode=mmhmode).assign(self.samples.copy(), remainder="test")
        df2 = SampleSplit(split=split, seed=1234, mode=mmhmode).assign(self.samples.copy(), remainder="test")
        df3 = SampleSplit(split=split, seed=1234, mode=mmhmode).assign(self.samples.copy(), remainder="test")
        self.assertAlmostEqual((df.purpose == "train").mean(), split, delta=0.04, msg="Split should match")

        self.assertFalse((df.purpose == df2.purpose).all(), "Unseeded should result in different assignment")
        self.assertTrue((df3.purpose == df2.purpose).all(), "Seeding should result in equal assignment")

    def test_no_data(self):
        df = SampleSplit().assign(self.samples.iloc[:0].copy())
        self.assertTrue(df.empty, "Empty in -> empty out")

    def test_stratification(self):
        split = np.random.rand()
        with self.assertRaises(AssertionError):
            SampleSplit(stratification=["column_not_in_samples"], split=split).assign(self.samples.copy())

        df = SampleSplit(stratification=r"(\d)$", split=split).assign(self.samples.copy())

        self.assertAlmostEqual((df.purpose == "train").mean(), split, delta=0.01, msg="Split should match")
        self.assertAlmostEqual(
            (df.groupby(df.path.str[-1]).purpose.apply(lambda x: (x == 'train').mean()) - split).abs().max(), 0,
            delta=0.01, msg="All shards must be split accordingly")

        grouping = ["group1"]
        df = SampleSplit(stratification=grouping, split=split).assign(self.samples.copy())
        self.assertAlmostEqual((df.purpose == "train").mean(), split, delta=0.01, msg="Split should match")
        self.assertAlmostEqual(
            (df.groupby(grouping).purpose.apply(lambda x: (x == 'train').mean()) - split).abs().max(), 0,
            delta=0.01, msg="All shards must be split accordingly")

        grouping = ["group1", "group2"]
        df = SampleSplit(stratification=grouping, split=split).assign(self.samples.copy())
        self.assertAlmostEqual((df.purpose == "train").mean(), split, delta=0.03, msg="Split should match")
        self.assertAlmostEqual(
            (df.groupby(grouping).purpose.apply(lambda x: (x == 'train').mean()) - split).abs().max(), 0,
            delta=0.04, msg="All shards must be split accordingly")

    def assertAlmostEqual(self, first: float, second: float, delta, *args, **kwargs) -> None:
        print(first - second, "<=", delta )
        super().assertAlmostEqual(first, second, delta=delta, *args, **kwargs)

    def test_uniqueness(self, split=0.5):
        with self.assertRaises(AssertionError):
            SampleSplit(uniqueness=["column_not_in_samples"], split=split).assign(self.samples.copy())

        df = SampleSplit(uniqueness=r"^(\d)", split=split).assign(self.samples.copy())
        self.assertTrue(df.groupby(df.path.str[0]).purpose.apply(lambda x: (x == "train").mean()).isin((0,1)).all(),
                        "must be split according to uniqueness")
        self.assertAlmostEqual((df.purpose == "train").mean(), split, delta=0.7, msg="Split should match")

        grouping = ["group1"]
        df = SampleSplit(uniqueness=grouping, split=split).assign(self.samples.copy())
        self.assertTrue(df.groupby(grouping).purpose.apply(lambda x: (x == "train").mean()).isin((0, 1)).all(),
                        "must be split according to uniqueness")
        self.assertAlmostEqual((df.purpose == "train").mean(), split, delta=0.13, msg="Split should match")

        grouping = ["group1", "group2"]
        df = SampleSplit(uniqueness=grouping, split=split).assign(self.samples.copy())
        self.assertTrue(df.groupby(grouping).purpose.apply(lambda x: (x == "train").mean()).isin((0, 1)).all(),
                        "must be split according to uniqueness")
        self.assertAlmostEqual((df.purpose == "train").mean(), split, delta=0.05, msg="Split should match")

    def test_uniqueness_mmh3(self, split=0.5):
        mmhmode = SampleSplit.MODE_MURMURHASH3
        with self.assertRaises(AssertionError):
            SampleSplit(uniqueness=["column_not_in_samples"], split=split, mode=mmhmode).assign(self.samples.copy())

        df = SampleSplit(uniqueness=r"^(\d)", split=split, mode=mmhmode).assign(self.samples.copy())
        self.assertTrue(df.groupby(df.path.str[0]).purpose.apply(lambda x: (x == "train").mean()).isin((0, 1)).all(),
                        "must be split according to uniqueness")
        self.assertAlmostEqual((df.purpose == "train").mean(), split, delta=0.7, msg="Split should match")

        grouping = ["group1"]
        df = SampleSplit(uniqueness=grouping, split=split, mode=mmhmode).assign(self.samples.copy())
        self.assertTrue(df.groupby(grouping).purpose.apply(lambda x: (x == "train").mean()).isin((0, 1)).all(),
                        "must be split according to uniqueness")
        self.assertAlmostEqual((df.purpose == "train").mean(), split, delta=0.6, msg="Split should match")

        grouping = ["group1", "group2"]
        df = SampleSplit(uniqueness=grouping, split=split, mode=mmhmode).assign(self.samples.copy())
        self.assertTrue(df.groupby(grouping).purpose.apply(lambda x: (x == "train").mean()).isin((0, 1)).all(),
                        "must be split according to uniqueness")
        self.assertAlmostEqual((df.purpose == "train").mean(), split, delta=0.6, msg="Split should match")


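# Tests covering the explode utility for columns holding tuples of values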
class TestSamplesExplosion(unittest.TestCase):
    samples = pd.DataFrame({
        "cat": [("A", "B"), ("A",), "B", ("A", "B", "C"), "D", ("A", "B"), pd.NA],
        "idx": np.arange(7)
    })

    def test_sample_explode(self):
        df = explode(self.samples)
        self.assertTrue((self.samples.cat.explode().fillna("N/A") == df.cat).all(),
                        "exploded column matches pd.DataFrame.explode")

        self.assertEqual(df.groupby(df.columns.tolist()).size().reset_index(name="c_").drop_duplicates("id").c_.sum(),
                         self.samples.shape[0],
                         "Duplicates excluded by dropping duplicate id after grouping")


if __name__ == '__main__':
    unittest.main()
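
The unittest.main() guard above means the suite can be run directly, for example with python -m unittest brevettiai.tests.test_data from the repository root (assuming the package is importable). For orientation, a minimal usage sketch of the two utilities exercised by these tests follows; it uses only constructor arguments that appear in the source above, and the sample frame itself is illustrative:

import numpy as np
import pandas as pd

from brevettiai.data.data_generator import DataGenerator
from brevettiai.data.sample_integrity import SampleSplit

# Illustrative sample table mirroring the columns used in the tests above
samples = pd.DataFrame({
    "etag": np.arange(20),
    "path": [str(i) for i in range(20)],
    "id": np.arange(20),
})

# Deterministic shuffled batching, as in TestDataGenerator.test_shuffled
generator = DataGenerator(samples[["id", "path"]], batch_size=3, shuffle=True, seed=120)
for batch in generator.get_samples_numpy(batch=True):
    print(batch["id"])

# Train/test purpose assignment, as in TestDataPurposeAssignment.test_basic_assignment
split_samples = SampleSplit(split=0.8, seed=1234).assign(samples.copy(), remainder="test")
print(split_samples.purpose.value_counts())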

Classes

class TestDataGenerator (methodName='runTest')

A class whose instances are single test cases.

By default, the test code itself should be placed in a method named 'runTest'.

If the fixture may be used for many test cases, create as many test methods as are needed. When instantiating such a TestCase subclass, specify in the constructor arguments the name of the test method that the instance is to execute.

Test authors should subclass TestCase for their own tests. Construction and deconstruction of the test's environment ('fixture') can be implemented by overriding the 'setUp' and 'tearDown' methods respectively.

If it is necessary to override the init method, the base class init method must always be called. It is important that subclasses should not change the signature of their init method, since instances of the classes are instantiated automatically by parts of the framework in order to be run.

When subclassing TestCase, you can set these attributes:

  • failureException: determines which exception will be raised when the instance's assertion methods fail; test methods raising this exception will be deemed to have 'failed' rather than 'errored'.
  • longMessage: determines whether long messages (including repr of objects used in assert methods) will be printed on failure in addition to any explicit message passed.
  • maxDiff: sets the maximum length of a diff in failure messages by assert methods using difflib. It is looked up as an instance attribute so can be configured by individual tests if required.

Create an instance of the class that will use the named test method when executed. Raises a ValueError if the instance does not have a method with the specified name.
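
A minimal sketch of the subclassing pattern described above (the names are illustrative and not part of this module):

import unittest


class ExampleCase(unittest.TestCase):
    def setUp(self):
        # Build the fixture before every test method
        self.values = [1, 2, 3]

    def tearDown(self):
        # Tear the fixture down after every test method
        self.values = None

    def test_sum(self):
        self.assertEqual(sum(self.values), 6)


if __name__ == "__main__":
    unittest.main()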

Ancestors

  • unittest.case.TestCase

Class variables

var n
var samples

Methods

def test_repeated(self, repeat=2)
def test_sample_weighing(self)
def test_sample_weighing_unshuffled(self)
def test_shuffle_repeated(self, repeat=2)
def test_shuffled(self, batch_size=3)
def test_unshuffled_batched(self, batch_size=3)
def test_unshuffled_unbatched(self)
class TestDataPurposeAssignment (methodName='runTest')

Ancestors

  • unittest.case.TestCase

Class variables

var samples

Methods

def assertAlmostEqual(self, first: float, second: float, delta, *args, **kwargs) -> None

Fail if the two objects are unequal as determined by their difference rounded to the given number of decimal places (default 7) and comparing to zero, or by comparing that the difference between the two objects is more than the given delta.

Note that decimal places (from zero) are usually not the same as significant digits (measured from the most significant digit).

If the two objects compare equal then they will automatically compare almost equal.
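
For reference, a small illustrative sketch (plain unittest, not part of this module) of the delta form used throughout these tests:

import unittest


class DeltaExample(unittest.TestCase):
    def test_delta(self):
        # Passes: |0.52 - 0.5| = 0.02 is within delta=0.05
        self.assertAlmostEqual(0.52, 0.5, delta=0.05)
        # |0.7 - 0.5| = 0.2 exceeds delta=0.05, so this raises AssertionError
        with self.assertRaises(AssertionError):
            self.assertAlmostEqual(0.7, 0.5, delta=0.05)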

def test_basic_assignment(self)
def test_basic_assignment_mmh3(self)
def test_no_data(self)
def test_stratification(self)
def test_uniqueness(self, split=0.5)
def test_uniqueness_mmh3(self, split=0.5)
class TestSampleIdentification (methodName='runTest')

Ancestors

  • unittest.case.TestCase

Class variables

var sample_id
var samples

Methods

def test_merge_sample_identification(self)
class TestSamplesExplosion (methodName='runTest')

Ancestors

  • unittest.case.TestCase

Class variables

var samples

Methods

def test_sample_explode(self)