The SIMPLE model

Author

Šimon Kucharský

Published

May 6, 2025

Code
import os

if "KERAS_BACKEND" not in os.environ:
    # set this to "torch", "tensorflow", or "jax"
    os.environ["KERAS_BACKEND"] = "jax"

import matplotlib.pyplot as plt
import numpy as np
import bayesflow as bf
import keras

\[\begin{equation}
\begin{aligned}
c & \sim \text{Uniform}(0, 100) \\
s & \sim \text{Uniform}(0, 100) \\
t & \sim \text{Uniform}(0, 1) \\
\eta_{ij} & \leftarrow \exp\left(-c \lvert \log T_{i} - \log T_{j} \rvert \right) \\
d_{ij} & \leftarrow \frac{\eta_{ij}}{\sum_k \eta_{ik}} \\
r_{ij} & \leftarrow \frac{1}{1 + \exp\left(-s (d_{ij} - t)\right)} \\
\theta_i & \leftarrow \begin{cases} \min\left(1, \sum_j r_{ij}\right) & \text{if version = 0} \\ 1-\prod_j (1-r_{ij}) & \text{if version = 1} \end{cases}\\
y_i & \sim \text{Binomial}(\theta_i, n)
\end{aligned}
\end{equation}\]
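To make these equations concrete, the sketch below traces them once for a single, hypothetical three-word list. The parameter values and presentation times are made up purely for illustration; the full simulator follows in the next section.

Code
# A minimal numeric trace of the SIMPLE equations for a hypothetical 3-word list;
# the parameter values and presentation times are illustrative only
c, s, t = 10.0, 12.0, 0.5
T = np.array([19.0, 17.0, 15.0])  # presentation times of the three words

log_T = np.log(T)
eta = np.exp(-c * np.abs(log_T[:, None] - log_T[None, :]))  # eta_ij
d = eta / eta.sum(axis=1, keepdims=True)                    # d_ij
r = 1 / (1 + np.exp(-s * (d - t)))                          # r_ij

theta_v0 = np.minimum(1, r.sum(axis=1))  # version = 0
theta_v1 = 1 - np.prod(1 - r, axis=1)    # version = 1
print(theta_v0, theta_v1)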

Simulator

Code
max_n_words = 40

def context(version=None, max_n_words=max_n_words):
    # Experimental design variables for one simulated data set
    if version is None:
        version = np.random.randint(0, 2)  # pick one of the two model versions at random
    return dict(
        n_obs = np.random.randint(1200, 1521),             # binomial sample size per serial position
        n_words = np.random.randint(10, max_n_words + 1),  # list length
        offset = np.random.randint(10, 26),                # offset of the time scale T
        lag = np.random.randint(1, 3),                     # lag between successive items on T
        version = version
    )
Code
def prior():
    # Prior draws for the model parameters
    return dict(
        c = np.random.gamma(shape=5, scale=2.5),
        s = np.random.gamma(shape=5, scale=2.5),
        t = np.random.beta(a=5, b=3)
    )
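To see what these priors imply, we can draw a batch of samples from the prior function above and plot their histograms (a quick sketch, not part of the model itself):

Code
# Histograms of the marginal priors of c, s, and t
draws = [prior() for _ in range(5_000)]
prior_draws = {k: np.array([d[k] for d in draws]) for k in ["c", "s", "t"]}

fig, axes = plt.subplots(1, 3, figsize=(12, 3))
for ax, (name, values) in zip(axes, prior_draws.items()):
    ax.hist(values, bins=50)
    ax.set_title(name)
fig.tight_layout()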
Code
def calculate_log_T(offset, lag, n_words):
    # Log presentation times: the last item has T = offset, earlier items are spaced by lag
    x = np.arange(n_words, 0, -1) - 1
    return np.log(offset + lag * x)

def prob_correct(c, s, t, n_words, offset, lag, version):
    log_T = calculate_log_T(offset, lag, n_words)

    eta = np.zeros((n_words, n_words))
    d = np.zeros((n_words, n_words))

    for i in range(n_words):
        for j in range(n_words):
            # similarity: eta_ij = exp(-c * |log T_i - log T_j|)
            abs_diff = np.abs(log_T[i] - log_T[j])
            eta[i, j] = np.exp(-c * abs_diff)
        # discriminability: d_ij = eta_ij / sum_k eta_ik
        d[i, :] = eta[i, :] / np.sum(eta[i, :])

    # recall probability per pair: r_ij = logistic(s * (d_ij - t))
    r = 1 / (1 + np.exp(-s * (d - t)))

    # probability correct per serial position, padded to max_n_words
    theta = np.ones(max_n_words)

    for i in range(n_words):
        if version < 0.5:
            theta[i] = np.min([1, np.sum(r[i, :])])
        else:
            theta[i] = 1 - np.prod(1 - r[i, :])

    return theta

def likelihood(c, s, t, n_obs, n_words, offset, lag, version, max_n_words=max_n_words):
    theta = prob_correct(c, s, t, n_words, offset, lag, version)

    # Pad all outputs to max_n_words so every simulated data set has the same shape;
    # the binary `observed` mask indicates which positions carry real data
    y = np.zeros(max_n_words)
    observed = np.zeros(max_n_words)
    trial = np.arange(max_n_words)

    y[:n_words] = np.random.binomial(n=n_obs, p=theta[:n_words], size=n_words)
    observed[:n_words] = 1

    return dict(y=y, observed=observed, trial=trial)
Code
simulator = bf.make_simulator([context, prior, likelihood])
Code
df = simulator.sample(100)

for key, value in df.items():
    print(key, "shape: ", value.shape)
n_obs shape:  (100, 1)
n_words shape:  (100, 1)
offset shape:  (100, 1)
lag shape:  (100, 1)
version shape:  (100, 1)
c shape:  (100, 1)
s shape:  (100, 1)
t shape:  (100, 1)
y shape:  (100, 40)
observed shape:  (100, 40)
trial shape:  (100, 40)
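To get a feel for what the simulator produces, we can plot the proportion correct across serial positions for a few simulated data sets (a quick visual check using the sample drawn above):

Code
# Proportion correct across serial positions for a few simulated data sets
fig, ax = plt.subplots(figsize=(6, 4))
for i in range(5):
    n_words = int(df["n_words"][i, 0])
    prop_correct = df["y"][i, :n_words] / df["n_obs"][i, 0]
    ax.plot(range(1, n_words + 1), prop_correct, marker="o", alpha=0.7)
ax.set_xlabel("Serial position")
ax.set_ylabel("Proportion correct")
fig.tight_layout()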

Approximator

We pass the observed numbers of successful trials (together with the observed mask and the trial index) into a summary network (a DeepSet operating on the set of per-position observations), and feed the number of observations, number of words, offset, lag, and model version directly into the inference network as conditions. The inference network learns to approximate the posterior distribution of the parameters \(c\), \(s\), and \(t\). Using the adapter, we also make sure that the natural constraints of the parameters are respected.

Code
adapter = (bf.Adapter()
    .constrain(["c", "s"], lower=0)
    .constrain("t", lower=0, upper=1)
    .standardize(include=["c", "s", "t"])
    .as_set(["y", "observed", "trial"])
    .concatenate(["y", "observed", "trial"], into="summary_variables")
    .concatenate(["n_obs", "n_words", "offset", "lag", "version"], into="inference_conditions")
    .concatenate(["c", "s", "t"], into="inference_variables")
)
Code
df = adapter(simulator.sample(100), stage="inference")

for key, value in df.items():
    print(key, "shape: ", value.shape)
summary_variables shape:  (100, 40, 3)
inference_conditions shape:  (100, 5)
inference_variables shape:  (100, 3)
Code
workflow = bf.BasicWorkflow(
    simulator=simulator,
    adapter=adapter,
    inference_network=bf.networks.CouplingFlow(),
    summary_network=bf.networks.DeepSet(),
    initial_learning_rate=1e-3
)

Training

Since the simulation is relatively involved, we will presimulate data and fit the networks in an offline regime to speed up training.

Code
train_data = simulator.sample(20_000)
Code
val_data = simulator.sample(5_000)
Code
history = workflow.fit_offline(
    data=train_data, 
    validation_data=val_data,
    epochs=200,
    batch_size=500)
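After training it is useful to confirm that the losses have converged. Assuming the history object returned by `fit_offline` works with BayesFlow's loss plot (as in the library's other examples), a quick check is:

Code
# Visualize the training and validation loss trajectories
f = bf.diagnostics.plots.loss(history)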

Validation

Code
test_data = simulator.sample(1000, version=0)
figs = workflow.plot_default_diagnostics(test_data=test_data, num_samples=500)
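The default diagnostics give a visual impression of parameter recovery and calibration. If the installed BayesFlow version also provides the `compute_default_diagnostics` convenience method on `BasicWorkflow` (an assumption here), the same test set can be summarized numerically:

Code
# Numeric summary of recovery and calibration on the test set
# (assumes this convenience method is available in the installed BayesFlow version)
metrics = workflow.compute_default_diagnostics(test_data=test_data)
print(metrics)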

Inference

Code
y = np.array([
    [994,806,691,634,634,835,965,1008,1181,1382,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [794,640,576,512,486,486,512,512,538,640,742,794,1024,1126,1254,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [836,623,562,517,441,471,441,395,350,441,456,486,456,593,608,684,897,1110,1353,1459,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [699,410,304,243,258,243,228,213,304,350,395,319,365,410,456,608,958,1170,1277,1474,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [576,360,240,228,240,240,228,216,240,240,240,240,228,240,216,204,240,228,240,240,264,252,288,324,444,480,660,912,1080,1188,0,0,0,0,0,0,0,0,0,0],
    [384,256,154,154,141,128,154,154,154,128,179,141,128,141,154,179,154,141,179,128,154,128,154,192,179,166,154,218,218,218,256,256,230,307,384,512,691,922,1114,1254]
])[...,np.newaxis]

observed = np.zeros_like(y)
observed[np.where(y != 0)] = 1

trial = np.arange(max_n_words)[np.newaxis]
trial = np.tile(trial, [6, 1])
trial.shape
(6, 40)
Code
inference_data = dict(
    y = y,
    observed = observed,
    trial = trial,
    n_obs = np.array([1440, 1280, 1520, 1520, 1200, 1280])[...,np.newaxis],
    n_words = np.array([10, 15, 20, 20, 30, 40])[...,np.newaxis],
    offset = np.array([15, 20, 25, 10, 15, 20])[...,np.newaxis],
    lag = np.array([2, 2, 2, 1, 1, 1])[...,np.newaxis],
    version = np.zeros(6)[...,np.newaxis]
)
Code
posterior_samples = workflow.sample(num_samples=2000, conditions=inference_data)
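The result is a dictionary with one array of posterior draws per parameter. A quick sketch for inspecting its shape and the posterior means for each of the six data sets:

Code
# Inspect posterior draws: one array of shape (n_datasets, num_samples, 1) per parameter
for key, value in posterior_samples.items():
    print(key, "shape:", value.shape, "posterior means:", np.round(value.mean(axis=1).squeeze(), 2))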
Code
def plot_data(c, s, t, data, max_n_words=max_n_words):
    # Posterior predictive check: simulate new data for each posterior draw and
    # summarize the predicted proportion correct per serial position
    n_datasets, n_samples, _ = c.shape
    y_pred = np.zeros((n_datasets, n_samples, max_n_words))
    for d in range(n_datasets):
        for i in range(n_samples):
            n_obs = int(data['n_obs'][d, 0])
            n_words = int(data['n_words'][d, 0])
            offset = int(data['offset'][d, 0])
            lag = int(data['lag'][d, 0])
            version = int(data['version'][d, 0])
            theta = prob_correct(c=c[d, i, 0], s=s[d, i, 0], t=t[d, i, 0],
                                 n_words=n_words, offset=offset,
                                 lag=lag, version=version)
            y_pred[d, i, :n_words] = np.random.binomial(n=n_obs, p=theta[:n_words], size=n_words)

    # Convert counts to proportions: observed data, posterior predictive mean, and 95% interval
    y = np.squeeze(data['y']) / data['n_obs']
    y_hat = np.mean(y_pred, axis=1) / data['n_obs']
    lower = np.quantile(y_pred, q=0.025, axis=1) / data['n_obs']
    upper = np.quantile(y_pred, q=0.975, axis=1) / data['n_obs']

    return y, y_hat, lower, upper
Code
y, y_hat, lower, upper = plot_data(**posterior_samples, data=inference_data)
Code
fig, axes = plt.subplots(2, 3, figsize=(12, 10))
axes = axes.flatten()

for d, ax in enumerate(axes):
    ax.set_ylim([0, 1])
    ax.set_xlim([0, max_n_words])
    ax.set_xlabel("Serial position")
    ax.set_ylabel("Probability correct")

    n_words = int(inference_data['n_words'][d, 0])
    idx = range(n_words)       # positions that carry data for this list length
    x = range(1, n_words + 1)  # serial positions (1-based) for plotting

    # Posterior predictive 95% interval, predictive mean, and observed proportions correct
    ax.fill_between(x, lower[d, idx], upper[d, idx], alpha=0.3)
    ax.plot(x, y_hat[d, idx])
    ax.scatter(x, y[d, idx])

fig.tight_layout()

References

Lee, M. D., & Wagenmakers, E.-J. (2013). Bayesian Cognitive Modeling: A Practical Course. Cambridge University Press.
