Source code for bayesgm.datasets.base_sampler

import numpy as np
import math
from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler,StandardScaler


class Base_sampler(object):
    """Mini-batch sampler for causal inference datasets.

    Stores treatment :math:`X`, outcome :math:`Y`, and covariates :math:`V`
    and provides an infinite mini-batch iterator that cycles through the data.

    Parameters
    ----------
    x : array-like
        Treatment variable with shape ``(n,)`` or ``(n, 1)``.
    y : array-like
        Outcome variable with shape ``(n,)`` or ``(n, 1)``.
    v : array-like
        Covariates with shape ``(n, v_dim)``.
    batch_size : int, default=32
        Number of samples per mini-batch.
    normalize : bool, default=False
        If ``True``, covariates :math:`V` are standardised (zero mean, unit
        variance) before storage.
    random_seed : int, default=123
        Random seed used for shuffling.

    Notes
    -----
    Construction seeds NumPy's *global* random state via ``np.random.seed``;
    all later reshuffles draw from that same global state.
    """

    def __init__(self, x, y, v, batch_size=32, normalize=False, random_seed=123):
        # Kept as an assertion so existing callers still see AssertionError.
        # NOTE: assertions are skipped when Python runs with -O.
        assert len(x) == len(y) == len(v), \
            "x, y and v must contain the same number of samples"
        np.random.seed(random_seed)

        self.data_x = np.array(x, dtype='float32')
        self.data_y = np.array(y, dtype='float32')
        self.data_v = np.array(v, dtype='float32')

        # Treatment and outcome are always stored as column vectors.
        if self.data_x.ndim == 1:
            self.data_x = self.data_x.reshape(-1, 1)
        if self.data_y.ndim == 1:
            self.data_y = self.data_y.reshape(-1, 1)

        self.batch_size = batch_size
        if normalize:
            self.data_v = StandardScaler().fit_transform(self.data_v)

        self.sample_size = len(x)
        # Shuffled in place at construction and again after every epoch.
        self.full_index = np.arange(self.sample_size)
        np.random.shuffle(self.full_index)
        self.idx_gen = self.create_idx_generator(sample_size=self.sample_size)

    def create_idx_generator(self, sample_size, random_seed=123):
        """Yield index arrays of length ``batch_size`` forever.

        Each epoch walks ``self.full_index`` in consecutive windows. The last
        window of an epoch wraps around to the start of the index array, so
        every batch has exactly ``batch_size`` entries, including when
        ``batch_size`` exceeds ``sample_size`` (indices then repeat
        cyclically). ``self.full_index`` is reshuffled in place after each
        epoch.

        Parameters
        ----------
        sample_size : int
            Number of samples to cycle through.
        random_seed : int, default=123
            Unused; kept for backward compatibility. Shuffling uses NumPy's
            global random state.

        Yields
        ------
        np.ndarray
            Integer indices into the stored data, with shape
            ``(batch_size,)``.

        Raises
        ------
        ValueError
            On the first draw (the generator is lazy) if ``sample_size`` or
            ``batch_size`` is not positive. Without this check the loop
            below would spin forever without yielding.
        """
        if sample_size <= 0:
            raise ValueError(
                "Cannot draw mini-batches from an empty dataset "
                "(sample_size=%r)." % (sample_size,)
            )
        while True:
            # batch_size is re-read at the start of every epoch.
            batch_size = self.batch_size
            if batch_size <= 0:
                raise ValueError(
                    "batch_size must be a positive integer, got %r." % (batch_size,)
                )
            for step in range(math.ceil(sample_size / batch_size)):
                start = step * batch_size
                # Modular positions reproduce the plain slice for full windows
                # and wrap to the front for the final, partial window.
                positions = np.arange(start, start + batch_size) % sample_size
                yield self.full_index[positions]
            np.random.shuffle(self.full_index)

    def next_batch(self):
        """Return the next mini-batch of ``(x, y, v)``.

        Returns
        -------
        data_x : np.ndarray
            Treatment batch with shape ``(batch_size, 1)``.
        data_y : np.ndarray
            Outcome batch with shape ``(batch_size, 1)``.
        data_v : np.ndarray
            Covariates batch with shape ``(batch_size, v_dim)``.
        """
        indx = next(self.idx_gen)
        return self.data_x[indx, :], self.data_y[indx, :], self.data_v[indx, :]

    def load_all(self):
        """Return the full dataset.

        Returns
        -------
        data_x : np.ndarray
            Treatment variable with shape ``(n, 1)``.
        data_y : np.ndarray
            Outcome variable with shape ``(n, 1)``.
        data_v : np.ndarray
            Covariates with shape ``(n, v_dim)``.
        """
        return self.data_x, self.data_y, self.data_v