Patches - enhancing fastai’s data classes

from fastai_datasets.mnist import TinyMNIST, MNIST
Lazy Subsets
TfmdLists.sublist
TfmdLists.sublist (indices:Iterable[int])
a sublist that maintains laziness
l = TfmdLists(list('abcd'), 'f({})'.format, splits=[[0, 1], [2, 3]])
sub_l = l.sublist([1, 3])
test_eq(sub_l, L('f(b)', 'f(d)'))
Each split is also intersected with the requested indices:
test_eq(sub_l.train, L('f(b)'))
test_eq(sub_l.valid, L('f(d)'))
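To make the laziness claim concrete: sublist only restricts the index set, so transforms still run per access rather than being applied up front. Below is a rough probe with a hypothetical counting transform (my sketch, not from the library's docs; note that fastai's TfmdLists setup may invoke a transform once to infer types, hence the loose bound):

```python
n_calls = 0
def counted(o):
    # hypothetical transform that counts how often it runs
    global n_calls
    n_calls += 1
    return f'f({o})'

tl = TfmdLists(list('abcd'), counted)
before = n_calls
sub = tl.sublist([1, 3])          # expected: no per-item transform work
assert n_calls <= before + 1      # an eager copy would add a call per element
_ = sub[0]
assert n_calls > before           # transforms run when items are accessed
```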
Datasets.sub_dsets
Datasets.sub_dsets (indices:Iterable[int])
ds = Datasets(list('abcd'), ['f({})'.format, 'g({})'.format], splits=[[0, 1], [2, 3]])
sub_ds = ds.sub_dsets([1, 3])
test_eq(sub_ds, L(('f(b)', 'g(b)'), ('f(d)', 'g(d)')))
Each split is also intersected with the requested indices:
test_eq(sub_ds.train, L([('f(b)', 'g(b)')]))
test_eq(sub_ds.valid, L([('f(d)', 'g(d)')]))
DataLoader.sub_dl
DataLoader.sub_dl (indices:Iterable[int])
dl = ds.dl()
sub_dl = dl.sub_dl([1, 3])
test_eq(sub_dl.dataset, sub_ds)
SubDataLoaders inherit their parent’s parameters:
dl = ds.dl(shuffle=True, bs=10, after_item=lambda o: o[1])
sub_dl = dl.sub_dl([1, 3])
test_eq(sub_dl.shuffle, dl.shuffle)
test_eq(sub_dl.bs, dl.bs)
test_eq(sub_dl.after_item, dl.after_item)
Random Subsets
Datasets.random_sub_dsets
Datasets.random_sub_dsets (size, with_replacement=False, less_ok=False)
test_eq(len(ds.random_sub_dsets(2)), 2)
test_fail(partial(ds.random_sub_dsets, size=6, less_ok=False))
test_eq(len(ds.random_sub_dsets(6, less_ok=True)), len(ds))
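Presumably, with_replacement=True samples indices with replacement, which would also allow a subset larger than its source; this is my reading of the signature, not a behavior documented above:

```python
# assumption: with replacement, size may exceed len(ds)
test_eq(len(ds.random_sub_dsets(6, with_replacement=True)), 6)
```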
DataLoader.random_sub_dl
DataLoader.random_sub_dl (*args, with_replacement=False, less_ok=False)
dl = ds.dl()
test_eq(dl.random_sub_dl(2).n, 2)
Arithmetics
Concatenating TfmdLists
l1 = TfmdLists(list('abc'), 'f({})'.format, splits=[[0, 1], [2]])
l2 = TfmdLists(list('bcd'), 'g({})'.format, splits=[[0], [1, 2]])
test_eq(l1 + l2, L('f(a)', 'f(b)', 'f(c)', 'g(b)', 'g(c)', 'g(d)'))
Also concatenates each split separately:
test_eq((l1+l2).train, l1.train + l2.train)
test_eq((l1+l2).valid, l1.valid + l2.valid)
The concatenation shares the operands’ common transform postfix, which allows showing:
mnist = TinyMNIST()
concat_l = mnist.tls[0] + mnist.tls[0]
show_at(concat_l, 0)
<AxesSubplot:>
Concatenating Datasets
ds1 = Datasets(list('abc'), ['f1({})'.format, 'f2({})'.format], splits=[[0, 1], [2]])
ds2 = Datasets(list('bcd'), ['g1({})'.format, 'g2({})'.format], splits=[[0], [1, 2]])
test_eq(ds1 + ds2, L(('f1(a)', 'f2(a)'), ('f1(b)', 'f2(b)'), ('f1(c)', 'f2(c)'),
                     ('g1(b)', 'g2(b)'), ('g1(c)', 'g2(c)'), ('g1(d)', 'g2(d)')))
Also concatenates each split separately:
test_eq((ds1+ds2).train, ds1.train + ds2.train)
test_eq((ds1+ds2).valid, ds1.valid + ds2.valid)
Subtracting SubDatasets
test_eq(ds-sub_ds, L(('f(a)', 'g(a)'), ('f(c)', 'g(c)')))
test_eq((ds-sub_ds).train, L([('f(a)', 'g(a)')]))
test_eq((ds-sub_ds).valid, L([('f(c)', 'g(c)')]))
Concatenating DataLoaders
dl1 = ds1.dl()
dl2 = ds2.dl()
test_eq((dl1+dl2).dataset, ds1+ds2)
The DataLoaders have to have identical parameters:
test_fail(lambda: ds1.dl(shuffle=False) + ds2.dl(shuffle=True))
test_fail(lambda: ds1.dl(bs=16) + ds2.dl(bs=32))
test_fail(lambda: ds1.dl() + ds2.dl(after_item=lambda o: o[0]))
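Conversely, loaders built with identical parameters should concatenate cleanly. A quick sanity sketch, assuming the result inherits the shared parameters (analogous to sub_dl above):

```python
dl = ds1.dl(bs=16) + ds2.dl(bs=16)   # identical parameters: no failure
test_eq(dl.bs, 16)                   # assumed: shared params carry over
```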
Targets
Datasets.i2t
Datasets.i2t ()
ds = Datasets(list('abcd'), ['f({})'.format, 'g({})'.format, 'h({})'.format])
test_eq(ds.i2t, L('h(a)', 'h(b)', 'h(c)', 'h(d)'))
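Reading off the example: i2t appears to collect each item's target, i.e. the last element of its tuple (my interpretation of the name as "index to target"):

```python
# equivalent view: the last tuple element of every item
test_eq(ds.i2t, L(ds[i][-1] for i in range(len(ds))))
```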
Datasets.by_target
Datasets.by_target ()
ds = Datasets(range(10), [noop, [lambda o: ['Even', 'Odd'][o%2], Categorize()]])
test_eq(ds.by_target.keys(), ['Even', 'Odd'])
test_eq(ds.by_target['Even'], L((i, ds.vocab.o2i['Even']) for i in [0, 2, 4, 6, 8]))
test_eq(ds.by_target['Odd'], L((i, ds.vocab.o2i['Odd']) for i in [1, 3, 5, 7, 9]))
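Each group behaves like a sub-Datasets, so the other patches compose with it, e.g. sampling within a single class (the usage examples below rely on this as well):

```python
evens = ds.by_target['Even']
test_eq(len(evens.random_sub_dsets(2)), 2)
```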
We can also partition DataLoaders by targets:
DataLoader.by_target
DataLoader.by_target ()
dl = ds.dl()
test_eq(dl.by_target.keys(), ds.by_target.keys())
for k in ds.by_target.keys():
    test_eq(dl.by_target[k].dataset, ds.by_target[k])
Datasets.plot_class_distribution
Datasets.plot_class_distribution ()
MNIST().plot_class_distribution()
Loading
Common default parameters for dataloaders:
ListToTuple
ListToTuple (enc=None, dec=None, split_idx=None, order=None)
Transforms lists to tuples; useful for working around a PyTorch bug (pin_memory turns inner tuples into lists)
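A minimal behavioral sketch, assuming ListToTuple is a plain fastcore Transform whose encodes converts list inputs into tuples:

```python
t = ListToTuple()
out = t([1, 2, 3])
assert isinstance(out, tuple) and out == (1, 2, 3)
```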
dl_defaults = {'pin_memory': default_device() != torch.device('cpu'), 'device': default_device(),
               'after_item': [ToTensor], 'after_batch': [ListToTuple, IntToFloatTensor]}
Convenience methods for creating loaders with dl_defaults:
Datasets.dl
Datasets.dl (**kwargs)
Creates a DataLoader (ignoring splits) with defaults from dl_defaults
Datasets.dls
Datasets.dls (**kwargs)
Calls Datasets.dataloaders with defaults from dl_defaults
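For instance (a hedged sketch; the bs value and the use of TinyMNIST here are just for illustration):

```python
dls = TinyMNIST().dls(bs=8)       # train/valid loaders with dl_defaults applied
xb, yb = dls.train.one_batch()
test_eq(xb.shape[0], 8)
```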
For small enough datasets, we might want to load all of the data into memory:
Datasets.load
Datasets.load (**kwargs)
mnist = TinyMNIST()
x, y = mnist.random_sub_dsets(10).load()
test_eq(x.shape, [10, 1, 28, 28])
test_eq(y.shape, [10])
Misc
Datasets.subsets
Datasets.subsets ()
Lazy list of a Datasets’s subsets
ds = Datasets(list('abcd'), ['f({})'.format, 'g({})'.format], splits=[[0, 2], [1, 3]])
test_eq(ds.subsets, L(ds.train, ds.valid))
Datasets.resplit
Datasets.resplit (splits:Union[Callable,List[List[int]]])
Sets the splits of a Datasets
|  | Type | Details |
|---|---|---|
| splits | typing.Union[typing.Callable, typing.List[typing.List[int]]] | a splitter function or a list of splits |
ds = Datasets(list('abcd'), ['f({})'.format, 'g({})'.format], splits=[[0, 2], [1, 3]])
ds.resplit(EndSplitter(.75))
test_eq(ds.splits, [[0], [1, 2, 3]])
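Per the signature, resplit also accepts an explicit list of splits in place of a splitter function:

```python
ds.resplit([[0, 1], [2, 3]])
test_eq(ds.splits, [[0, 1], [2, 3]])
```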
Datasets.repr
Datasets.repr shows all splits:
ds = Datasets(list('abcd'), ['f({})'.format, 'g({})'.format], splits=[[0, 2], [1, 3]])
for split in ds.subsets:
    assert repr(split) in repr(ds)
Usage Examples
from fastai_datasets.all import *
mnist = MNIST()
Show dataset structure:
mnist
[(#60000) [(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7))...]
(#10000) [(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7)),(PILImageBW mode=L size=28x28, TensorCategory(7))...]]
Let’s sample a random subset:
mnist = mnist.random_sub_dsets(1000)
Show its class distribution:
mnist.plot_class_distribution()
Use only the even digits:
evens = mnist.by_target['0'] + mnist.by_target['2'] + mnist.by_target['4'] + mnist.by_target['6'] + mnist.by_target['8']
evens.dls().show_batch()
Drop specific classes:
less_than_7 = mnist - mnist.by_target['9'] - mnist.by_target['8'] - mnist.by_target['7']
less_than_7.dl().show_batch(max_n=25)
Estimate the mean sample from a specific class:
threes_sample = mnist.by_target['3'].random_sub_dsets(20)
threes_sample.load()[0].mean(0).show()
<AxesSubplot:>