Appendix A: Derivations related to the Little model
1.1 A.1 Derivation of Eq. 12
Fix an \(x^0\) and a direction \(v \in \mathbb {R}^{n\times n}\times \mathbb {R}^n\). Then
$$\begin{aligned}&\nabla _{\lambda }P_{\theta + \lambda v}(x^{0},x^{1})\\&\quad = P_{\theta + \lambda v}(x^0,x^1) \nabla _{\lambda }\log P_{\theta + \lambda v}(x^0,x^1) \\&\quad = P_{\theta + \lambda v}(x^0,x^1) \sum \limits _{i=1}^{n}\nabla _{\lambda } \log \left( \sigma ( (x_{i}^{1})^{\dag }u_{i}(x^{0},\theta +\lambda v))\right) \\&\quad = P_{\theta + \lambda v}(x^0,x^1) \sum \limits _{i=1}^{n} \left( 1 -\sigma ( (x_{i}^{1})^{\dag }u_{i}(x^{0},\theta +\lambda v))\right) (x_{i}^{1})^{\dag } \nabla _{\lambda }u_{i}(x^{0},\theta +\lambda v) \\&\quad = P_{\theta + \lambda v}(x^0,x^1) \sum \limits _{i=1}^{n} \left( 1 -\sigma ( (x_{i}^{1})^{\dag }u_{i}(x^{0},\theta +\lambda v))\right) (x_{i}^{1})^{\dag } \left( \sum \limits _{j=1}^{n}v_{i,j}x^{0}_{j} + v_{i}\right) . \end{aligned}$$
Evaluating this at \(\lambda =0\) we find that
$$\begin{aligned} \nabla _{\theta }P_{\theta }(x^{0},x^{1})v = P_{\theta }(x^0,x^1) \sum \limits _{i=1}^{n}(1 -\sigma ( (x_{i}^{1})^{\dag }u_{i}(x^{0},\theta ))) (x_{i}^{1})^{\dag } \left( \sum \limits _{j=1}^{n}v_{i,j}x^{0}_{j} + v_{i}\right) . \end{aligned}$$
(24)
Note also that
$$\begin{aligned} (1- \sigma ( x^{\dag }u))x^{\dag } = {\left\{ \begin{array}{ll} (1-\sigma (u)) &{}\text { if } x = 1 \\ -\sigma (u)&{}\text { if } x = 0 \end{array}\right. } \end{aligned}$$
which means
$$\begin{aligned} (1-\sigma (x^{\dag }u))x^{\dag } = x - \sigma (u). \end{aligned}$$
(25)
Combining (24) and (25),
$$\begin{aligned}&\nabla _{\theta }\textstyle \sum \limits _{x^{1}}e(x^{1})P_{\theta }(x^{0},x^{1})v\nonumber \\&\quad = \textstyle \sum \limits _{x^{1}}e(x^{1})P_{\theta }(x^0,x^1) \textstyle \sum \limits _{i=1}^{n}(1 -\sigma ( (x_{i}^{1})^{\dag }u_{i}(x^{0},\theta )))(x_{i}^{1})^{\dag } \left( \textstyle \sum \limits _{j=1}^{n}v_{i,j}x^{0}_{j} + v_i\right) \nonumber \\&\quad = \textstyle \sum \limits _{x^{1}}e(x^{1})P_{\theta }(x^0,x^1) \textstyle \sum \limits _{i=1}^{n}(x_{i}^{1} -\sigma (u_{i}(x^{0},\theta ))) \left( \textstyle \sum \limits _{j=1}^{n}v_{i,j}x^{0}_{j} + v_i\right) \nonumber \\&\quad = \textstyle \sum \limits _{x^{1}}e(x^{1})P_{\theta }(x^0,x^1)\left[ \textstyle \sum \limits _{i=1}^{n}x_{i}^{1} \left( \textstyle \sum \limits _{j=1}^{n}v_{i,j}x^{0}_{j} + v_i\right) \right. \nonumber \\&\qquad \left. - \textstyle \sum \limits _{i=1}^{n}\sigma (u_{i}(x^{0},\theta )) \left( \textstyle \sum \limits _{j=1}^{n}v_{i,j}x^{0}_{j} + v_i\right) \right] . \end{aligned}$$
(26)
Splitting each \(v_{i,j}\) and \(v_i\) into positive and negative parts,
$$\begin{aligned}&= \textstyle \sum \limits _{x^{1}}e(x^{1})P_{\theta }(x^0,x^1)\left[ \textstyle \sum \limits _{i=1}^{n}x_{i}^{1} \left( \textstyle \sum \limits _{j=1}^{n}(v_{i,j})_{+}x^{0}_{j} + (v_{i})_+\right) \right. \nonumber \\&\qquad \left. - \textstyle \sum \limits _{i=1}^{n}x_{i}^{1} \left( \textstyle \sum \limits _{j=1}^{n}(v_{i,j})_{-}x^{0}_{j} + (v_{i})_{-}\right) \right] \nonumber \\&- \textstyle \sum \limits _{x^{1}}e(x^{1})P_{\theta }(x^0,x^1)\left[ \textstyle \sum \limits _{i=1}^{n}\sigma (u_{i}(x^{0},\theta )) \left( \textstyle \sum \limits _{j=1}^{n}(v_{i,j})_{+}x^{0}_{j} + (v_{i})_{+}\right) \right. \nonumber \\&\qquad \left. - \textstyle \sum \limits _{i=1}^{n}\sigma (u_{i}(x^{0},\theta )) \left( \textstyle \sum \limits _{j=1}^{n}(v_{i,j})_{-}x^{0}_{j} + (v_{i})_{-}\right) \right] \nonumber \\&= \left( \textstyle \sum \limits _{x^{1}}e(x^{1})P_{\theta }(x^0,x^1)\left[ \textstyle \sum \limits _{i=1}^{n}x_{i}^{1} \left( \textstyle \sum \limits _{j=1}^{n}(v_{i,j})_{+}x^{0}_{j} + (v_i)_{+}\right) \right. \right. \nonumber \\&\qquad \left. \left. + \textstyle \sum \limits _{i=1}^{n}\sigma (u_{i}(x^{0},\theta )) \left( \textstyle \sum \limits _{j=1}^{n}(v_{i,j})_{-}x^{0}_{j} + (v_{i})_{-}\right) \right] \right) \nonumber \\&\quad - \left( \textstyle \sum \limits _{x^{1}}e(x^{1})P_{\theta }(x^0,x^1)\left[ \textstyle \sum \limits _{i=1}^{n}x_{i}^{1} \left( \sum \limits _{j=1}^{n}(v_{i,j})_{-}x^{0}_{j} + (v_i)_{-}\right) \right. \right. \nonumber \\&\qquad \left. \left. + \textstyle \sum \limits _{i=1}^{n}\sigma (u_{i}(x^{0},\theta )) \left( \textstyle \sum \limits _{j=1}^{n}(v_{i,j})_{+}x^{0}_{j} + (v_{i})_{+}\right) \right] \right) \nonumber \\&= \textstyle \sum \limits _{x^{1}}e(x^{1})P_{\theta }(x^0,x^1)\sum \limits _{i=1}^{n} \left[ x_{i}^{1}\left( \textstyle \sum \limits _{j=1}^{n}(v_{i,j})_{+}x^{0}_{j} + (v_{i})_{+}\right) \right. \nonumber \\&\qquad \left. 
+ \sigma (u_{i}(x^{0},\theta )) \left( \textstyle \sum \limits _{j=1}^{n}(v_{i,j})_{-}x^{0}_{j} + (v_i)_{-} \right) \right] \nonumber \\&\quad - \textstyle \sum \limits _{x^{1}}e(x^{1})P_{\theta }(x^0,x^1)\textstyle \sum \limits _{i=1}^{n} \left[ x_{i}^{1}\left( \sum \limits _{j=1}^{n}(v_{i,j})_{-}x^{0}_{j} + (v_i)_-\right) \right. \nonumber \\&\qquad \left. + \sigma (u_{i}(x^{0},\theta ))\left( \textstyle \sum \limits _{j=1}^{n}(v_{i,j})_{+}x^{0}_{j} + (v_i)_{+}\right) \right] .\nonumber \\ \end{aligned}$$
(27)
Note that
$$\begin{aligned}&\sum \limits _{x^{1}}P_{\theta }(x^0,x^1)\sum \limits _{i=1}^{n} \left[ x_{i}^{1}\left( \sum \limits _{j=1}^{n}(v_{i,j})_{+}x^{0}_{j} + (v_{i})_{+}\right) \right. \nonumber \\&\qquad \left. + \sigma (u_{i}(x^{0},\theta )) \left( \sum \limits _{j=1}^{n}(v_{i,j})_{-}x^{0}_{j} + (v_{i})_{-}\right) \right] \nonumber \\&\quad =\sum \limits _{i=1}^{n}\left( \sum \limits _{x^{1}}P_{\theta }(x^0,x^1)x_{i}^{1}\right) \left( \sum \limits _{j=1}^{n}(v_{i,j})_{+}x^{0}_{j} + (v_{i})_{+}\right) \nonumber \\&\qquad + \sum \limits _{i=1}^{n}\sigma (u_{i}(x^{0},\theta )) \left( \sum \limits _{j=1}^{n}(v_{i,j})_{-}x^{0}_{j} + (v_i)_{-}\right) \nonumber \\&\quad =\sum \limits _{i=1}^{n}\sigma (u_{i}(x^{0},\theta )) \left( \sum \limits _{j=1}^{n}(v_{i,j})_{+}x^{0}_{j} + (v_i)_{+}\right) \nonumber \\&\qquad + \sum \limits _{i=1}^{n}\sigma (u_{i}(x^{0},\theta )) \left( \sum \limits _{j=1}^{n}(v_{i,j})_{-}x^{0}_{j} + (v_{i})_{-}\right) \nonumber \\&\quad = \sum \limits _{i=1}^{n}\sigma (u_{i}(x^{0},\theta ))|v_i| + \sum \limits _{i=1}^{n}\sum \limits _{j=1}^{n} \sigma (u_{i}(x^{0},\theta ))|v_{i,j}|x^{0}_{j}. \end{aligned}$$
(28)
Combining (27) with (28) and the definitions (13) and (14) we obtain (12).
1.2 A.2 Derivation of Eqs. 17 and 18
We have
$$\begin{aligned} Q(x_1)= & {} \sum \limits _{x_2 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} Q(x_1,x_2,\ldots ,x_n) \nonumber \\= & {} \sum \limits _{x_2 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \frac{1}{c} \prod _{i=1}^{n} \beta _i^{x_i} (1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=1}^{n}\alpha _{i}x_i \right) \nonumber \\= & {} \sum \limits _{x_2 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \frac{1}{c} \prod _{i=2}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \beta _1^{x_1}(1-\beta _1)^{1-x_{1}} \left( d + \alpha _1x_1 + \sum \limits _{i=2}^{n}\alpha _{i}x_i \right) \nonumber \\= & {} \beta _1^{x_1}(1-\beta _1)^{1-x_1} \sum \limits _{x_2 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \frac{1}{c} \prod _{i=2}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \alpha _1x_1 + \sum \limits _{i=2}^{n}\alpha _{i}x_i \right) \nonumber \\= & {} \beta _1^{x_1}(1-\beta _1)^{1-x_1} \sum \limits _{x_2 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \frac{1}{c}\prod _{i=2}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=2}^{n}\alpha _{i}x_i \right) \nonumber \\&\qquad + \beta _1^{x_1}(1-\beta _1)^{1-x_1} \sum \limits _{x_2 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \frac{1}{c}\prod _{i=2}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i}\alpha _1x_1 \nonumber \\= & {} \beta _1^{x_1}(1-\beta _1)^{1-x_1} \alpha _1x_1\frac{1}{c}\nonumber \\&\qquad + \beta _1^{x_1}(1-\beta _1)^{1-x_1} \frac{1}{c} \sum \limits _{x_2 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \prod _{i=2}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i}\left( d + \sum \limits _{i=2}^{n}\alpha _{i}x_i \right) .\nonumber \\ \end{aligned}$$
(29)
To simplify this equation, note that for \(n>1\),
$$\begin{aligned}&\sum \limits _{x_1 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \prod _{i=1}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=1}^{n}a_ix_i\right) \nonumber \\&\quad = \sum \limits _{x_2 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \left[ \beta _1\prod _{i=2}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=2}^na_ix_i + a_1\right) \right. \nonumber \\&\qquad \left. + (1-\beta _1)\prod _{i=2}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=2}^na_ix_i\right) \right] \nonumber \\&\quad =\sum \limits _{x_2 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}}\left[ \beta _1a_1\prod _{i=2}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} + \prod _{i=2}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=2}^na_ix_i\right) \right] \nonumber \\&\quad = \beta _1a_1 + \sum \limits _{x_2 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \prod _{i=2}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=2}^na_ix_i\right) ,\nonumber \\ \end{aligned}$$
(30)
and if \(n=1\) then
$$\begin{aligned} \begin{aligned} \sum \limits _{x_1 \in \{0,1\}}\prod _{i=1}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=1}^{n}a_ix_i\right)&= \beta _1(d + a_1) + (1-\beta _1)d \\&= \beta _1d + \beta _1a_1 + d - \beta _1d = \beta _1a_1 +d. \end{aligned} \end{aligned}$$
(31)
Applying Eq. 30 repeatedly to remove one variable at a time, and using Eq. 31 for the last remaining variable, we see that for any \(n\ge 1\),
$$\begin{aligned} \sum \limits _{x_1 \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \prod _{i=1}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=1}^{n}a_ix_i \right) = d + \sum \limits _{i=1}^{n}\beta _ia_i. \end{aligned}$$
(32)
Combining Eq. 29 with Eq. 32 (applied to the variables \(x_2,\ldots ,x_n\) with \(a_i=\alpha _i\)),
$$\begin{aligned} Q(x_1)&= \beta _1^{x_1}(1-\beta _1)^{1-x_1}\frac{\alpha _1x_1}{c} + \beta _1^{x_1}(1-\beta _1)^{1-x_1}\frac{1}{c} \left( d + \sum \limits _{i=2}^{n}\beta _i\alpha _i\right) \\&= \beta _1^{x_1}(1-\beta _1)^{1-x_1}\frac{1}{c} \left[ d + \alpha _1x_1 + \sum \limits _{i=2}^{n}\beta _i\alpha _i\right] . \end{aligned}$$
In general,
$$\begin{aligned}&Q(x_k, x_{k-1},\ldots , x_1) \\&\quad = \textstyle \sum \limits _{x_{k+1} \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} Q(x_1,\ldots ,x_k,x_{k+1},\ldots ,x_n) \\&\quad = \textstyle \sum \limits _{x_{k+1} \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \frac{1}{c}\prod _{i=1}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=1}^{n}\alpha _{i}x_i\right) \\&\quad = \textstyle \sum \limits _{x_{k+1} \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \frac{1}{c} \prod _{i=k+1}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \\&\quad \quad \times \beta _k^{x_k}(1-\beta _k)^{1-x_k} \prod _{i=1}^{k-1}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( \sum \limits _{i=1}^{k-1}\alpha _ix_i + \alpha _kx_k + d + \sum \limits _{i=k+1}^{n}\alpha _{i}x_i \right) \\&\quad = \textstyle \sum \limits _{x_{k+1} \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \frac{1}{c} \prod _{i=k+1}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \beta _k^{x_k}(1-\beta _k)^{1-x_k}\\&\qquad \prod _{i=1}^{k-1}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( \alpha _kx_k + \sum \limits _{i=1}^{k-1}\alpha _ix_i\right) \\&\qquad + \textstyle \sum \limits _{x_{k+1} \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}} \frac{1}{c} \prod _{i=k+1}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \beta _k^{x_k}(1-\beta _k)^{1-x_k}\\&\qquad \prod _{i=1}^{k-1}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=k+1}^{n}\alpha _{i}x_i \right) \\&\quad = \textstyle \beta _k^{x_k}(1-\beta _k)^{1-x_k}\frac{1}{c} \prod _{i=1}^{k-1}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( \alpha _kx_k + \sum \limits _{i=1}^{k-1}\alpha _ix_i\right) \\&\qquad + \textstyle \beta _k^{x_k}(1-\beta _k)^{1-x_k}\frac{1}{c} \prod _{i=1}^{k-1}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \sum \limits _{x_{k+1} \in \{0,1\}}\ldots \sum \limits _{x_n \in \{0,1\}}\\&\qquad \prod _{i=k+1}^{n}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d + \sum \limits _{i=k+1}^{n}\alpha _{i}x_i \right) \\&\quad = \textstyle \beta _k^{x_k}(1-\beta _k)^{1-x_k}\frac{1}{c} \prod _{i=1}^{k-1}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( \alpha _kx_k + 
\sum \limits _{i=1}^{k-1}\alpha _ix_i\right) \\&\qquad + \textstyle \beta _k^{x_k}(1-\beta _k)^{1-x_k}\frac{1}{c} \prod _{i=1}^{k-1}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d +\sum \limits _{i=k+1}^{n}\beta _i\alpha _i\right) \\&\quad = \textstyle \frac{1}{c} \prod _{i=1}^{k}\beta _i^{x_i}(1-\beta _i)^{1-x_i} \left( d+ \sum \limits _{i=1}^{k}\alpha _ix_i + \sum \limits _{i=k+1}^{n}\beta _i\alpha _i\right) . \end{aligned}$$