import numpy as np
from sklearn.datasets import make_low_rank_matrix
[docs]
def simulate_regression(n_samples, n_features, n_targets, effective_rank=None, variance=None, random_state=123):
"""Simulate a linear regression dataset with optional low-rank design.
Generates :math:`X` (optionally low-rank) and
:math:`Y = X_{\\text{aug}} \\beta + \\varepsilon` where
:math:`X_{\\text{aug}}` includes an intercept column.
Parameters
----------
n_samples : int
Number of samples.
n_features : int
Number of features (columns of :math:`X`).
n_targets : int
Number of target (response) variables.
effective_rank : int or None, optional
If provided, the design matrix :math:`X` is generated as a low-rank
matrix with this effective rank. Otherwise :math:`X` is i.i.d.
standard normal.
variance : np.ndarray or None, optional
Per-sample noise variance. If ``None``, defaults to
``0.01 * mean(X^2)`` per sample.
random_state : int, default=123
Random seed for reproducibility.
Returns
-------
X : np.ndarray
Feature matrix with shape ``(n_samples, n_features)``.
Y : np.ndarray
Response matrix with shape ``(n_samples, n_targets)``.
"""
np.random.seed(random_state)
if effective_rank is None:
X = np.random.normal(size=(n_samples, n_features))
else:
X = 100*make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=effective_rank, random_state=random_state)
X_aug = np.c_[np.ones(n_samples), X] # n x (p+1) matrix
beta = 0.1 * np.random.uniform(low=0.0, high=1.0, size=(1+n_features, n_targets)) # Coefficients for the mean (includes intercept)
mu = np.dot(X_aug, beta)
if variance is None:
variance = 0.01*np.mean(X**2, axis=1)
variance = np.tile(variance,(n_targets,1)).T
Y = np.random.normal(loc=mu, scale=np.sqrt(variance))
return X, Y
[docs]
def simulate_low_rank_data(n_samples=10000, z_dim=2, x_dim=4, rank=2, sigma_z=False, random_state=123):
"""
Simulate low-rank observed data with latent variables.
The generator is:
- ``Z ~ N(0, I)``
- ``X | Z ~ N(mu(Z), Sigma(Z))``
Parameters
----------
n_samples : int, default=10000
Number of samples.
z_dim : int, default=2
Dimension of latent variable ``Z``.
x_dim : int, default=4
Dimension of observed variable ``X``.
rank : int, default=2
Rank for the low-rank covariance component.
sigma_z : bool, default=False
If ``True``, covariance ``Sigma`` depends on ``Z`` (scaled by ``z[0]``).
If ``False``, ``Sigma`` is constant across samples.
random_state : int, default=123
Random seed for reproducibility.
Returns
-------
X : np.ndarray
Observed data with shape ``(n_samples, x_dim)``.
Z : np.ndarray
Latent variables with shape ``(n_samples, z_dim)``.
"""
np.random.seed(random_state)
# Generate Z ~ N(0, I)
Z = np.random.randn(n_samples, z_dim).astype(np.float32)
# Set fixed parameters for the simulation
# Randomly generate A and b, or choose them to illustrate particular structure.
A = np.array([[ 1.0, -0.5],
[ 0.3, 0.8],
[-0.7, 0.2],
[ 0.5, 1.0]])
b = np.array([0.0, 0.5, 1.0, 2.0])
# Compute the mean for X given Z: μ(Z) = A Z + b
mu = Z.dot(A.T) + b # Shape: (n_samples, x_dim)
W = np.array([[ 0.25, 0.],
[ 0.25, 0.],
[ 0., 0.25],
[ 0., 0.25]])
diag_values = np.array([0.1, 0.1, 0.2, 0.2])
D = np.diag(diag_values)
X = np.zeros((n_samples, x_dim), dtype=np.float32)
for i in range(n_samples):
if sigma_z:
scale_factor = Z[i, 0]
W_scaled = W * scale_factor # Element-wise multiplication
Sigma = D * scale_factor**2 + np.dot(W_scaled, W_scaled.T)
else:
# Use constant covariance: Σ = D + W W^T
Sigma = D + np.dot(W, W.T)
# Sample X_i ~ N(μ_i, Σ)
X[i] = np.random.multivariate_normal(mean=mu[i], cov=Sigma)
return X, Z
[docs]
def simulate_heteroskedastic_data(n=1000, d=5, seed=42):
"""Simulate a heteroskedastic regression dataset.
The noise standard deviation depends on the second feature :math:`X_2`:
``sigma = 0.5 + 0.5 * sin(2 * pi * X_2)`` (clipped to [0.1, 2.0]
outside :math:`|X_2| > 2`).
Parameters
----------
n : int, default=1000
Number of samples.
d : int, default=5
Number of features.
seed : int, default=42
Random seed.
Returns
-------
X : np.ndarray
Feature matrix with shape ``(n, d)``.
Y : np.ndarray
Response vector with shape ``(n,)``.
sigma : np.ndarray
True noise standard deviation with shape ``(n,)``.
"""
np.random.seed(seed)
X = np.random.randn(n, d)
X1 = X[:, 0]
X2 = X[:, 1]
# Define heteroskedastic noise
sigma = np.where(
X2 < -2, 0.1,
np.where(X2 > 2, 2.0, 0.5 + 0.5 * np.sin(2 * np.pi * X2))
)
epsilon = np.random.randn(n) * sigma
Y = X1 + epsilon
return X, Y, sigma
[docs]
def simulate_z_hetero(n=20000, k=3, d=20-1, seed=42):
"""Simulate a latent-factor heteroskedastic regression dataset.
Observed features :math:`X` are a noisy low-rank projection of a
:math:`k`-dimensional latent variable :math:`Z`. The response
:math:`Y` depends nonlinearly on :math:`Z` with heteroskedastic noise.
Parameters
----------
n : int, default=20000
Number of samples.
k : int, default=3
Dimension of the latent variable :math:`Z`.
d : int, default=19
Number of observed features.
seed : int, default=42
Random seed.
Returns
-------
X : np.ndarray
Observed feature matrix with shape ``(n, d)``.
Y : np.ndarray
Response vector with shape ``(n,)``.
"""
np.random.seed(seed)
Z = np.random.randn(n, k)
# Low-rank X from latent Z
A = np.random.randn(d, k)
X = 0.2*Z @ A.T + 0.1 * np.random.randn(n, d)
# Define nonlinear mean and heteroskedastic std
w = np.random.randn(k)
u = np.random.randn(k)
mean_Y = np.sin(Z @ w)
std_Y = 0.1 + 0.5 * 1 / (1 + np.exp(-(Z @ u)))
Y = mean_Y + std_Y * np.random.randn(n)
return X, Y