Source code for bayesgm.datasets.simulators

import numpy as np
from sklearn.datasets import make_low_rank_matrix


[docs] def simulate_regression(n_samples, n_features, n_targets, effective_rank=None, variance=None, random_state=123): """Simulate a linear regression dataset with optional low-rank design. Generates :math:`X` (optionally low-rank) and :math:`Y = X_{\\text{aug}} \\beta + \\varepsilon` where :math:`X_{\\text{aug}}` includes an intercept column. Parameters ---------- n_samples : int Number of samples. n_features : int Number of features (columns of :math:`X`). n_targets : int Number of target (response) variables. effective_rank : int or None, optional If provided, the design matrix :math:`X` is generated as a low-rank matrix with this effective rank. Otherwise :math:`X` is i.i.d. standard normal. variance : np.ndarray or None, optional Per-sample noise variance. If ``None``, defaults to ``0.01 * mean(X^2)`` per sample. random_state : int, default=123 Random seed for reproducibility. Returns ------- X : np.ndarray Feature matrix with shape ``(n_samples, n_features)``. Y : np.ndarray Response matrix with shape ``(n_samples, n_targets)``. """ np.random.seed(random_state) if effective_rank is None: X = np.random.normal(size=(n_samples, n_features)) else: X = 100*make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=effective_rank, random_state=random_state) X_aug = np.c_[np.ones(n_samples), X] # n x (p+1) matrix beta = 0.1 * np.random.uniform(low=0.0, high=1.0, size=(1+n_features, n_targets)) # Coefficients for the mean (includes intercept) mu = np.dot(X_aug, beta) if variance is None: variance = 0.01*np.mean(X**2, axis=1) variance = np.tile(variance,(n_targets,1)).T Y = np.random.normal(loc=mu, scale=np.sqrt(variance)) return X, Y
[docs] def simulate_low_rank_data(n_samples=10000, z_dim=2, x_dim=4, rank=2, sigma_z=False, random_state=123): """ Simulate low-rank observed data with latent variables. The generator is: - ``Z ~ N(0, I)`` - ``X | Z ~ N(mu(Z), Sigma(Z))`` Parameters ---------- n_samples : int, default=10000 Number of samples. z_dim : int, default=2 Dimension of latent variable ``Z``. x_dim : int, default=4 Dimension of observed variable ``X``. rank : int, default=2 Rank for the low-rank covariance component. sigma_z : bool, default=False If ``True``, covariance ``Sigma`` depends on ``Z`` (scaled by ``z[0]``). If ``False``, ``Sigma`` is constant across samples. random_state : int, default=123 Random seed for reproducibility. Returns ------- X : np.ndarray Observed data with shape ``(n_samples, x_dim)``. Z : np.ndarray Latent variables with shape ``(n_samples, z_dim)``. """ np.random.seed(random_state) # Generate Z ~ N(0, I) Z = np.random.randn(n_samples, z_dim).astype(np.float32) # Set fixed parameters for the simulation # Randomly generate A and b, or choose them to illustrate particular structure. A = np.array([[ 1.0, -0.5], [ 0.3, 0.8], [-0.7, 0.2], [ 0.5, 1.0]]) b = np.array([0.0, 0.5, 1.0, 2.0]) # Compute the mean for X given Z: μ(Z) = A Z + b mu = Z.dot(A.T) + b # Shape: (n_samples, x_dim) W = np.array([[ 0.25, 0.], [ 0.25, 0.], [ 0., 0.25], [ 0., 0.25]]) diag_values = np.array([0.1, 0.1, 0.2, 0.2]) D = np.diag(diag_values) X = np.zeros((n_samples, x_dim), dtype=np.float32) for i in range(n_samples): if sigma_z: scale_factor = Z[i, 0] W_scaled = W * scale_factor # Element-wise multiplication Sigma = D * scale_factor**2 + np.dot(W_scaled, W_scaled.T) else: # Use constant covariance: Σ = D + W W^T Sigma = D + np.dot(W, W.T) # Sample X_i ~ N(μ_i, Σ) X[i] = np.random.multivariate_normal(mean=mu[i], cov=Sigma) return X, Z
[docs] def simulate_heteroskedastic_data(n=1000, d=5, seed=42): """Simulate a heteroskedastic regression dataset. The noise standard deviation depends on the second feature :math:`X_2`: ``sigma = 0.5 + 0.5 * sin(2 * pi * X_2)`` (clipped to [0.1, 2.0] outside :math:`|X_2| > 2`). Parameters ---------- n : int, default=1000 Number of samples. d : int, default=5 Number of features. seed : int, default=42 Random seed. Returns ------- X : np.ndarray Feature matrix with shape ``(n, d)``. Y : np.ndarray Response vector with shape ``(n,)``. sigma : np.ndarray True noise standard deviation with shape ``(n,)``. """ np.random.seed(seed) X = np.random.randn(n, d) X1 = X[:, 0] X2 = X[:, 1] # Define heteroskedastic noise sigma = np.where( X2 < -2, 0.1, np.where(X2 > 2, 2.0, 0.5 + 0.5 * np.sin(2 * np.pi * X2)) ) epsilon = np.random.randn(n) * sigma Y = X1 + epsilon return X, Y, sigma
[docs] def simulate_z_hetero(n=20000, k=3, d=20-1, seed=42): """Simulate a latent-factor heteroskedastic regression dataset. Observed features :math:`X` are a noisy low-rank projection of a :math:`k`-dimensional latent variable :math:`Z`. The response :math:`Y` depends nonlinearly on :math:`Z` with heteroskedastic noise. Parameters ---------- n : int, default=20000 Number of samples. k : int, default=3 Dimension of the latent variable :math:`Z`. d : int, default=19 Number of observed features. seed : int, default=42 Random seed. Returns ------- X : np.ndarray Observed feature matrix with shape ``(n, d)``. Y : np.ndarray Response vector with shape ``(n,)``. """ np.random.seed(seed) Z = np.random.randn(n, k) # Low-rank X from latent Z A = np.random.randn(d, k) X = 0.2*Z @ A.T + 0.1 * np.random.randn(n, d) # Define nonlinear mean and heteroskedastic std w = np.random.randn(k) u = np.random.randn(k) mean_Y = np.sin(Z @ w) std_Y = 0.1 + 0.5 * 1 / (1 + np.exp(-(Z @ u))) Y = mean_Y + std_Y * np.random.randn(n) return X, Y