Source code for bayesgm.datasets.causal_samplers

from .base_sampler import Base_sampler
import numpy as np
import math
from scipy.sparse import diags
from scipy.stats import norm
import pandas as pd



class Semi_acic_sampler(Base_sampler):
    """ACIC 2018 competition dataset (binary treatment) sampler (inherited from Base_sampler).

    The covariate table ``<path>/x.csv`` and the factual table
    ``<path>/scaling/factuals/<ufid>.csv`` are both read with ``sample_id``
    as index and inner-joined. The ``z`` column is used as treatment, the
    ``y`` column as outcome, and every joined column except the last two as
    covariates.

    Parameters
    ----------
    batch_size
        Int object denoting the batch size for mini-batch training. Default: ``32``.
    path
        Str object denoting the path to the original dataset.
    ufid
        Str object denoting the unique id of a specific semi-synthetic setting.

    Attributes
    ----------
    df_covariants
        DataFrame read from ``x.csv``.
    df_sim
        DataFrame read from the factual file selected by ``ufid``.

    Examples
    --------
    >>> from bayesgm.datasets.causal_samplers import Semi_acic_sampler
    >>> ds = Semi_acic_sampler(path='../data/ACIC_2018',ufid='d5bd8e4814904c58a79d7cdcd7c2a1bb')
    """

    def __init__(self, batch_size=32, path='../data/ACIC_2018',
                 ufid='d5bd8e4814904c58a79d7cdcd7c2a1bb'):
        covariate_file = '%s/x.csv' % path
        factual_file = '%s/scaling/factuals/%s.csv' % (path, ufid)

        self.df_covariants = pd.read_csv(
            covariate_file, sep=',', header=0, index_col='sample_id')
        self.df_sim = pd.read_csv(
            factual_file, sep=',', header=0, index_col='sample_id')

        # Keep only the samples present in both tables.
        merged = self.df_covariants.join(self.df_sim, how='inner')

        treatment = merged['z'].values.reshape(-1, 1)
        outcome = merged['y'].values.reshape(-1, 1)
        # The trailing two columns are excluded from the covariates
        # (presumably the joined 'z' and 'y' columns -- confirm against the data).
        covariates = merged.values[:, :-2]

        super().__init__(treatment, outcome, covariates,
                         batch_size=batch_size, normalize=True)
class Sim_Hirano_Imbens_sampler(Base_sampler):
    """Hirano Imbens simulation dataset (continuous treatment) sampler (inherited from Base_sampler).

    Covariates are i.i.d. unit-scale exponential draws. The treatment is
    exponential with rate ``v[:,0] + v[:,1]``, and the outcome is normal with
    unit standard deviation around
    ``x + (v[:,0] + v[:,2]) * exp(-x * (v[:,0] + v[:,2]))``.
    The covariate columns 0, 1 and 2 are indexed, so ``v_dim`` must be at
    least 3.

    Parameters
    ----------
    batch_size
        Int object denoting the batch size for mini-batch training. Default: ``32``.
    N
        Sample size. Default: ``20000``.
    v_dim
        Int object denoting the dimension for covariates. Default: ``200``.
    seed
        Int object denoting the random seed. Default: ``0``.

    Examples
    --------
    >>> from bayesgm.datasets.causal_samplers import Sim_Hirano_Imbens_sampler
    >>> ds = Sim_Hirano_Imbens_sampler(batch_size=32, N=20000, v_dim=200, seed=0)
    """

    def __init__(self, batch_size=32, N=20000, v_dim=200, seed=0):
        # Seeds the global NumPy generator; draw order below is significant.
        np.random.seed(seed)

        covariates = np.random.exponential(scale=1.0, size=(N, v_dim))

        # Treatment: exponential whose rate is the sum of the first two covariates.
        treatment_rate = covariates[:, 0] + covariates[:, 1]
        treatment = np.random.exponential(scale=1 / treatment_rate)

        # Outcome: unit-variance normal around a treatment/covariate interaction.
        mix = covariates[:, 0] + covariates[:, 2]
        outcome_mean = treatment + mix * np.exp(-treatment * mix)
        outcome = np.random.normal(outcome_mean, 1)

        super().__init__(treatment.reshape(-1, 1), outcome.reshape(-1, 1),
                         covariates, batch_size=batch_size, normalize=True)
class Sim_Sun_sampler(Base_sampler):
    """Sun simulation dataset (continuous treatment) sampler (inherited from Base_sampler).

    Covariates are i.i.d. standard normal. Treatment and outcome are both
    normal with unit standard deviation; the treatment mean depends on
    covariate columns 0-3 and the outcome mean on columns 0, 1, 4, 5 plus the
    treatment itself, so ``v_dim`` must be at least 6.

    Parameters
    ----------
    batch_size
        Int object denoting the batch size for mini-batch training. Default: ``32``.
    N
        Sample size. Default: ``20000``.
    v_dim
        Int object denoting the dimension for covariates. Default: ``200``.
    seed
        Int object denoting the random seed. Default: ``0``.

    Examples
    --------
    >>> from bayesgm.datasets.causal_samplers import Sim_Sun_sampler
    >>> ds = Sim_Sun_sampler(batch_size=32, N=20000, v_dim=200, seed=0)
    """

    def __init__(self, batch_size=32, N=20000, v_dim=200, seed=0):
        # Seeds the global NumPy generator; draw order below is significant.
        np.random.seed(seed)

        covariates = np.random.normal(0, 1, size=(N, v_dim))
        c0, c1, c2, c3, c4, c5 = (covariates[:, j] for j in range(6))

        # Treatment mean, summed left to right in the original term order.
        t_sine = -2 * np.sin(2 * c0)
        t_quad = c1 ** 2 - 1 / 3
        t_shift = c2 - 1 / 2
        treatment_mean = t_sine + t_quad + t_shift + np.cos(c3)
        treatment = np.random.normal(treatment_mean, 1)

        # Outcome mean: covariate part first, then the additive treatment effect.
        covariate_part = (c0 - 1 / 2) + np.cos(c1) + c4 ** 2 + c5
        outcome = np.random.normal(covariate_part + treatment, 1)

        super().__init__(treatment.reshape(-1, 1), outcome.reshape(-1, 1),
                         covariates, batch_size=batch_size, normalize=True)
class Sim_Colangelo_sampler(Base_sampler):
    """Colangelo simulation dataset (continuous treatment) sampler (inherited from Base_sampler).

    Covariates are drawn from a zero-mean multivariate normal whose covariance
    is a banded matrix (ones on the main diagonal, ``rho`` on the two
    neighbouring diagonals with the default ``offset``). With
    ``theta[j] = 1 / (j + 1)**2`` the data are generated as::

        x = d * norm.cdf((a * v) @ theta) + b * nu - 0.5
        y = 1.2 * x + x**3 + x * v[:, 0] + 1.2 * (v @ theta) + epsilon

    where ``nu`` and ``epsilon`` are independent standard normal noise vectors.

    Parameters
    ----------
    batch_size
        Int object denoting the batch size for mini-batch training. Default: ``32``.
    N
        Sample size. Default: ``20000``.
    v_dim
        Int object denoting the dimension for covariates. Default: ``100``.
    seed
        Int object denoting the random seed. Default: ``0``.
    rho
        Value placed on the two off-diagonal bands of the covariance matrix.
        Default: ``0.5``.
    offset
        Sequence of three diagonal offsets handed to ``scipy.sparse.diags`` for
        the (``rho``, ``1``, ``rho``) bands. ``None`` (the default) means
        ``[-1, 0, 1]``. The band lengths are fixed at ``v_dim-1``, ``v_dim``,
        ``v_dim-1``, so other offsets must be compatible with those lengths.
    d
        Multiplier of the normal-CDF term in the treatment. Default: ``1``.
    a
        Multiplier applied to the covariates inside the normal CDF. Default: ``3``.
    b
        Multiplier of the treatment noise ``nu``. Default: ``0.75``.

    Examples
    --------
    >>> from bayesgm.datasets.causal_samplers import Sim_Colangelo_sampler
    >>> ds = Sim_Colangelo_sampler(batch_size=32, N=20000, v_dim=100, seed=0)
    """

    def __init__(self, batch_size=32, N=20000, v_dim=100, seed=0, rho=0.5,
                 offset=None, d=1, a=3, b=0.75):
        # A None sentinel replaces the former mutable list default.
        if offset is None:
            offset = [-1, 0, 1]

        # Seeds the global NumPy generator; draw order below is significant.
        np.random.seed(seed)

        # diags() accepts a plain sequence of diagonals; no object array needed.
        bands = [rho * np.ones(v_dim - 1), np.ones(v_dim), rho * np.ones(v_dim - 1)]
        sigma = diags(bands, offset).toarray()

        # Quadratically decaying coefficients 1, 1/4, 1/9, ...
        theta = np.array([1 / (j ** 2) for j in range(1, v_dim + 1)])

        epsilon = np.random.normal(0, 1, N)
        nu = np.random.normal(0, 1, N)
        v = np.random.multivariate_normal(np.zeros(v_dim), sigma, size=[N, ])

        # (a * v) @ theta keeps the original left-to-right evaluation order.
        x = d * norm.cdf((a * v) @ theta) + b * nu - 0.5
        y = 1.2 * x + (x ** 3) + (x * v[:, 0]) + 1.2 * (v @ theta) + epsilon

        x = x.reshape(-1, 1)
        y = y.reshape(-1, 1)
        super().__init__(x, y, v, batch_size=batch_size, normalize=True)
class Semi_Twins_sampler(Base_sampler):
    """Twins semi synthetic dataset sampler (inherited from Base_sampler).

    Loads the covariate, treatment and outcome CSV files from ``path``, drops
    rows with any missing covariate, and keeps only rows whose ``dbirwt_1``
    is not ``>= 2000``. The first two treatment columns are stacked (and
    divided by 1000) to form the treatment, with the covariates duplicated to
    match. The outcome is simulated as::

        y = -2 / (1 + exp(-3 * x)) + v @ gamma + eps

    with ``eps ~ N(0, 0.25)`` per sample and ``gamma ~ N(0, 0.025)`` per
    covariate (second argument is the standard deviation).

    Parameters
    ----------
    batch_size
        Int object denoting the batch size for mini-batch training. Default: ``32``.
    seed
        Int object denoting the random seed. Default: ``0``.
    path
        Str object denoting the path to the original data.

    Attributes
    ----------
    auxiliary_constant
        Mean of ``v @ gamma`` over all stacked samples.

    Examples
    --------
    >>> from bayesgm.datasets.causal_samplers import Semi_Twins_sampler
    >>> ds = Semi_Twins_sampler(batch_size=32, path='../data/Twins')
    """

    def __init__(self, batch_size=32, seed=0, path='../data/Twins'):
        covariate_df = pd.read_csv('%s/twin_pairs_X_3years_samesex.csv' % path)
        covariate_df = covariate_df.iloc[:, 2:].drop(['infant_id_0', 'infant_id_1'], axis=1)
        treatment_df_ = pd.read_csv('%s/twin_pairs_T_3years_samesex.csv' % path).iloc[:, 1:]
        # The outcome table is loaded and filtered for parity with the other
        # tables, but the outcome handed to Base_sampler is simulated below.
        outcome_df = pd.read_csv('%s/twin_pairs_Y_3years_samesex.csv' % path).iloc[:, 1:]

        # Discard rows with any missing covariate (vectorized, no iterrows()).
        rows_with_nan = covariate_df.index[covariate_df.isnull().any(axis=1)]
        covariate_df = covariate_df.drop(rows_with_nan)
        treatment_df_ = treatment_df_.drop(rows_with_nan)
        outcome_df = outcome_df.drop(rows_with_nan)

        # Keep only the pairs below 2 kg: drop rows with dbirwt_1 >= 2000.
        rows_ge_2kg = treatment_df_.index[treatment_df_['dbirwt_1'] >= 2000]
        covariate_df = covariate_df.drop(rows_ge_2kg)
        treatment_df_ = treatment_df_.drop(rows_ge_2kg)
        outcome_df = outcome_df.drop(rows_ge_2kg)

        # Stack both treatment columns into one sample axis, scaled by 1/1000
        # (presumably grams to kilograms -- confirm), duplicating covariates.
        treatment_values = treatment_df_.values
        x = np.concatenate([treatment_values[:, 0], treatment_values[:, 1]]) / 1000
        v = np.concatenate([covariate_df.values, covariate_df.values])

        # Seeds the global NumPy generator; eps is drawn before gamma.
        np.random.seed(seed)
        eps = np.random.normal(0, 0.25, size=(v.shape[0],))
        gamma = np.random.normal(0, 0.025, size=(v.shape[1],))

        linear_term = np.dot(v, gamma)
        y = -2 * 1 / (1 + np.exp(-3 * x)) + linear_term + eps
        self.auxiliary_constant = np.mean(linear_term)

        x = x.reshape(-1, 1)
        y = y.reshape(-1, 1)
        super().__init__(x, y, v, batch_size=batch_size, normalize=True)