import os

if "KERAS_BACKEND" not in os.environ:
    # set this to "torch", "tensorflow", or "jax"
    os.environ["KERAS_BACKEND"] = "jax"

import matplotlib.pyplot as plt
import numpy as np
import bayesflow as bf
import json
The basic model has three parameters: generalization (c), attention weight (w), and bias (b). In this example, we assume no bias between the categories, so b is fixed to \(\frac{1}{2}\).
\[\begin{equation} \begin{aligned} c & \sim \text{Exponential}(0.5) \\ w & \sim \text{Uniform}(0, 1) \\ b & \leftarrow \frac{1}{2} \\ s_{ij} & \leftarrow \exp\left[- c \left(w d_{ij}^{(1)} + (1-w) d_{ij}^{(2)}\right)\right] \\ r_i & \leftarrow \frac{b \sum_j a_j s_{ij}}{b \sum_j a_j s_{ij} + (1-b) \sum_j (1-a_j) s_{ij}} \\ y_i & \sim \text{Binomial}(r_i, t) \end{aligned} \end{equation}\]
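To make the mapping from parameters to response probabilities concrete, here is a small standalone sketch that evaluates the similarity and response-probability equations for made-up distances and category assignments. The numbers are purely illustrative and are not taken from the Kruschke data.

import numpy as np

# illustrative values only (not the actual experiment)
c, w, b = 1.5, 0.7, 0.5
d1_toy = np.array([0.0, 1.0, 2.0])   # distances from stimulus i to stimuli j on dimension 1
d2_toy = np.array([0.0, 0.5, 1.5])   # distances on dimension 2
a_toy = np.array([1, 1, 0])          # 1 = Category A, otherwise Category B

# similarity of stimulus i to each stimulus j
s_toy = np.exp(-c * (w * d1_toy + (1 - w) * d2_toy))

# probability of a Category A response for stimulus i
r_toy = b * s_toy[a_toy == 1].sum() / (
    b * s_toy[a_toy == 1].sum() + (1 - b) * s_toy[a_toy != 1].sum()
)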
We use the category learning data from the “Condensation B” condition reported by Kruschke (1993).
with open(os.path.join("data", "KruschkeData.json")) as f:
    data = json.load(f)

data.keys()
dict_keys(['a', 'd1', 'd2', 'n', 'nstim', 'nsubj', 'y'])
The element y contains the array of responses \(y_{ik}\) for stimulus \(i \in \{1, \dots, 8\}\) and participant \(k \in \{1, \dots, 40\}\).
np.array(data['y']).shape
(8, 40)
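As a quick sanity check on the data layout, one can look at the observed proportion of Category A responses per stimulus, averaged over participants. This sketch assumes, as in the model above, that y counts Category A responses out of n presentations.

y_obs = np.array(data['y'])   # shape (nstim, nsubj) = (8, 40)
prop_A = y_obs / data['n']    # per-participant proportion of Category A responses
prop_A.mean(axis=1)           # average proportion for each of the 8 stimuli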
d1 and d2 contain the physical distances between the 8 stimuli along the two dimensions, such that
\[\begin{equation} d_{ij}^{(k)} = | p_{ik} - p_{jk} | \end{equation}\]
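The JSON file already stores these pairwise distances, but for intuition, a matrix of this form could be built from stimulus positions on one dimension as follows; p_toy below is hypothetical and not taken from the experiment.

p_toy = np.array([0.0, 1.0, 2.5, 4.0])           # hypothetical positions on one dimension
d_toy = np.abs(p_toy[:, None] - p_toy[None, :])  # d_ij = |p_i - p_j|, shape (4, 4)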
The indicator vector a encodes whether stimulus \(i\) belongs to Category A or Category B (with \(a_i = 1\) marking Category A).
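Following the coding used in the likelihood below (a value of 1 marks Category A), boolean masks for the two categories can be derived directly from a:

a_arr = np.array(data['a'])
isA = a_arr == 1   # stimuli assigned to Category A
isB = a_arr != 1   # all remaining stimuli (Category B)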
Here we will assume that the design of the experiment (i.e., number of trials, number of stimuli, number of participants, nature of the stimuli) is fixed. That is, we will only amortize over the observation outcomes.
def prior():
    return dict(
        c=np.random.exponential(scale=1),
        w=np.random.uniform(low=0, high=1)
    )
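A single call to prior returns one draw from the joint prior as a dictionary. A quick check like the following (illustrative, not part of the original workflow) confirms that the samples respect the parameter supports:

draws = [prior() for _ in range(1000)]
cs = np.array([d["c"] for d in draws])
ws = np.array([d["w"] for d in draws])
assert (cs >= 0).all() and ((ws >= 0) & (ws <= 1)).all()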
def likelihood(c, w, n=data['n'], nstim=data['nstim'], nsubj=data['nsubj'],
               d1=np.array(data['d1']), d2=np.array(data['d2']), a=np.array(data['a'])):
    b = 0.5

    # similarity between stimuli, weighted across the two dimensions
    s = np.exp(-c * (w * d1 + (1 - w) * d2))

    # probability of a Category A response for each stimulus (shared across participants)
    r = np.zeros((nsubj, nstim))
    for i in range(nstim):
        isA = a == 1
        isB = a != 1

        numerator = b * np.sum(s[i, isA])
        denominator = numerator + (1 - b) * np.sum(s[i, isB])

        r[:, i] = numerator / denominator

    # n binomial trials per stimulus and participant
    y = np.random.binomial(n=n, p=r, size=(nsubj, nstim))

    return dict(y=y)
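Chaining the two functions manually (a sketch outside the BayesFlow machinery) produces one synthetic data set with the same shape as the transposed empirical data:

theta = prior()
fake = likelihood(**theta)
fake["y"].shape   # (nsubj, nstim) = (40, 8)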
simulator = bf.make_simulator([prior, likelihood])
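If needed, the resulting simulator can be sampled directly; in current BayesFlow versions this should return a dictionary of batched arrays for c, w, and y, though the exact return format is version-dependent, so treat this as a sketch.

sims = simulator.sample(5)
{k: np.asarray(v).shape for k, v in sims.items()}   # e.g. y should have shape (5, 40, 8)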
adapter = (bf.Adapter()
    .constrain("c", lower=0)
    .constrain("w", lower=0, upper=1)
    .concatenate(["c", "w"], into="inference_variables")
    .rename("y", "summary_variables")
)
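To see what the adapter does, it can be applied to a simulated batch; in recent BayesFlow releases the adapter is callable and should return a dictionary keyed by inference_variables and summary_variables. Again, this is version-dependent and meant only as a sketch.

adapted = adapter(simulator.sample(4))
list(adapted.keys())   # expected: ['inference_variables', 'summary_variables']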
workflow = bf.BasicWorkflow(
    simulator=simulator,
    adapter=adapter,
    inference_network=bf.networks.CouplingFlow(),
    summary_network=bf.networks.DeepSet(),
    initial_learning_rate=1e-3
)
history = workflow.fit_online(batch_size=256)
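Training progress can be inspected via the returned history object. The sketch below assumes a Keras-style History (or plain dictionary) with a "loss" entry; the exact structure may differ across BayesFlow versions.

losses = getattr(history, "history", history).get("loss", [])
plt.plot(losses)
plt.xlabel("Epoch")
plt.ylabel("Training loss")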
test_data = simulator.sample(1000)
figs = workflow.plot_default_diagnostics(test_data=test_data, num_samples=500)
Here we apply the networks to the data from Kruschke (1993) as reported by Lee & Wagenmakers (2013). The quantity of interest is the joint posterior distribution of parameters \(c\) and \(w\).
inference_data = dict(y=np.array(data['y']).transpose()[np.newaxis, ...])
posterior_samples = workflow.sample(num_samples=2000, conditions=inference_data)
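Before plotting, simple numerical summaries of the joint posterior can be computed directly from the returned samples, assuming posterior_samples is a dictionary of arrays keyed by parameter name, as used in the plotting code below.

for name in ("c", "w"):
    draws = np.asarray(posterior_samples[name]).flatten()
    print(name, draws.mean().round(3), np.quantile(draws, [0.025, 0.975]).round(3))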
plt.scatter(
    x=posterior_samples['c'],
    y=posterior_samples['w'],
    alpha=0.05
)
plt.xlim([0, 5])
plt.ylim([0, 1])
plt.xlabel("Generalization (c)")
f = plt.ylabel("Attention Weight (w)")