Diffusion ELBO

$$

\newcommand{\E}{\mathbb{E}} \newcommand{\Dt}{\Delta t} \begin{align}-\log p_\theta(x_0) &= -\log \int p_\theta(x_{0:T}) dx_{1:T} \\ &=-\log \int p_\theta(x_{0:T}) \frac{q(x_{1:T}|x_0) }{q(x_{1:T}|x_0)}dx_{1:T} \\ &= -\log \E_{q(x_{1:T}|x_0)} \frac{p_\theta(x_{0:T})}{q(x_{1:T}|x_0)} \\ &\le \E_{q(x_{1:T}|x_0)} -\log \frac{p_\theta(x_{0:T})}{q(x_{1:T}|x_0)} \\&= \E_{q(x_{1:T}|x_0)} -\log \frac{p(x_T)\prod_{t=1}^T p_\theta(x_{t-1}|x_t)}{\prod_{t=1}^T q(x_{t}|x_{t-1})} \\&= \E_{q(x_{1:T}|x_0)} -\log \frac{p(x_T)\prod_{t=1}^T p_\theta(x_{t-1}|x_t)}{\prod_{t=1}^T q(x_{t}|x_{t-1}, x_0)} \\&= \E_{q(x_{1:T}|x_0)} -\log \frac{p(x_T)\prod_{t=1}^T p_\theta(x_{t-1}|x_t)}{\prod_{t=1}^T q(x_{t-1}|x_{t}, x_0) q(x_t|x_0) / q(x_{t-1}|x_0)} \\&= \E_{q(x_{1:T}|x_0)} -\log \frac{p(x_T)\prod_{t=1}^T p_\theta(x_{t-1}|x_t)}{ q(x_T|x_0)\prod_{t=1}^T q(x_{t-1}|x_{t}, x_0)} \\&= \E_{q(x_{1:T}|x_0)} -\log \frac{ \prod_{t=1}^T p_\theta(x_{t-1}|x_t)}{ \prod_{t=1}^T q(x_{t-1}|x_{t}, x_0)} \\&= \E_{q(x_{1:T}|x_0)} \sum_{t=1}^T -\log \frac{ p_\theta(x_{t-1}|x_t)}{ q(x_{t-1}|x_{t}, x_0)} \\&= \int q(x_{1:T}|x_0) \sum_{t=1}^T -\log \frac{ p_\theta(x_{t-1}|x_t)}{ q(x_{t-1}|x_{t}, x_0)} dx_{1:T} \\ &= \sum_{t=1}^T \int -q(x_{t-1}, x_t|x_0) \log \frac{ p_\theta(x_{t-1}|x_t)}{ q(x_{t-1}|x_{t}, x_0)} dx_{t-1} dx_t \\ &= \sum_{t=1}^T \int -q(x_{t-1}|x_t,x_0) q(x_t|x_0)\log \frac{ p_\theta(x_{t-1}|x_t)}{ q(x_{t-1}|x_{t}, x_0)} dx_{t-1} dx_t \\ &= \sum_{t=1}^T \E_{q(x_t|x_0)} \text{KL}(q(x_{t-1}|x_{t}, x_0) || p_\theta(x_{t-1}|x_t) )\end{align} $$

Discrete Diffusion

We omit $o(\Delta t)$ in the following derivations.

Forward and Backward Processes

$$

\newcommand{\E}{\mathbb{E}} \newcommand{\Dt}{\Delta t}

\begin{align} q(x_t|x_{t-\Dt}) &= \delta_{x_t,x_{t-\Dt}} + Q_t(x_{t-\Dt}, x_t)\Dt, &Q_t(x, x) = -\sum_{y\ne x}Q_t(x,y) \\ p(x_{t-\Dt}|x_t) &= \delta_{x_t,x_{t-\Dt}} + \tilde{Q}_t(x_t,x_{t-\Dt})\Dt, &\tilde{Q}_t(x, x) = -\sum_{y\ne x}\tilde{Q}_t(x,y) \end{align} $$

Concrete Score

$$

\newcommand{\E}{\mathbb{E}} \newcommand{\Dt}{\Delta t}

\begin{align} \tilde{Q}_t(x_t,x_{t-\Dt}) = Q_t(x_{t-\Dt}, x_t) s(x_t, t)_{x_{t-\Dt}}, && s(x_t, t)_{x_{t-\Dt}} = \frac{p(x_{t-\Dt})}{p(x_t)}\end{align} $$

Denoising Score Entropy [1]

$$

\newcommand{\E}{\mathbb{E}} \newcommand{\Dt}{\Delta t}

\begin{align} &\text{KL}(q(x_{t-\Dt}|x_{t}, x_0) || p_\theta(x_{t-\Dt}|x_t) ) \\ =& \sum_{x_{t-\Dt}} q(x_{t-\Dt}|x_{t}, x_0) \log \frac{q(x_{t-\Dt}|x_{t}, x_0)}{p_\theta(x_{t-\Dt}|x_t)} \\ =& \sum_{x_{t-\Dt}} q(x_{t}|x_{t-\Dt})\frac{q(x_{t-\Dt}|x_0)}{q(x_{t}| x_0)} \log \frac{q(x_{t}|x_{t-\Dt})\frac{q(x_{t-\Dt}|x_0)}{q(x_{t}| x_0)}}{p_\theta(x_{t-\Dt}|x_t)} \end{align} $$

Case 1: $x_{t-\Delta t} = x_t$,

$$ \newcommand{\E}{\mathbb{E}} \newcommand{\Dt}{\Delta t} \begin{align} &q(x_{t}|x_{t-\Dt})\frac{q(x_{t-\Dt}|x_0)}{q(x_{t}| x_0)} \log \frac{q(x_{t}|x_{t-\Dt})\frac{q(x_{t-\Dt}|x_0)}{q(x_{t}| x_0)}}{p_\theta(x_{t-\Dt}|x_t)} \\ =& q_{t|t-\Dt}(x_{t}|x_t) \log \frac{q_{t|t-\Dt}(x_{t}|x_t)}{p^\theta_{t-\Dt|t}(x_{t}|x_t)} \\ =& (1-\sum_{z \ne x_t} Q_t(x_t, z)\Dt)\log\frac{1-\sum_{z \ne x_t} Q_t(x_t, z)\Dt }{1-\sum_{z \ne x_t} \tilde{Q}_t^\theta(x_t, z)\Dt } \\ =& -\sum_{z \ne x_t} Q_t(x_t, z)\Dt + \sum_{z \ne x_t} \tilde{Q}_t^\theta(x_t, z)\Dt \\ =& -\sum_{z \ne x_t} Q_t(x_t, z)\Dt + \sum_{y \ne x_t} Q_t(y, x_t) s_\theta(x_t, t)_y \Dt \end{align} $$

where

$$ \newcommand{\E}{\mathbb{E}} \newcommand{\Dt}{\Delta t} \begin{align} &\E_{q(x_t|x_0)} \sum_{z\ne x_t} Q_t(x_t, z) \Dt \\ =& \sum_{x_t}\sum_{z\ne x_t} q_{t+\Dt |t}(z |x_t) q_{t|0}(x_t|x_0)\\ =& \sum_{x_t}\sum_{z\ne x_t} q_{t+\Dt ,t|0}(z ,x_t|x_0) \\ =& \sum_{x_t}\left[ q_{t|0}(x_t|x_0) - q_{t+\Dt,t|0}(x_t,x_t|x_0) \right] \\ =& \sum_{x_t}\left[ q_{t+\Dt|0}(x_t|x_0) - q_{t+\Dt,t|0}(x_t,x_t|x_0) \right] \\ =& \sum_{x_t}\sum_{y\ne x_t} q_{t+\Dt,t|0}(x_t,y|x_0) \\ =& \sum_{x_t}\sum_{y\ne x_t} q_{t+\Dt|t}(x_t|y)q_{t|0}(y|x_0) \\ =& \E_{q_{t|0}(x_t |x_0)} \sum_{y\ne x_t} Q_t(y, x_t)\Dt\frac{q_{t|0}(y|x_0)}{q_{t|0}(x_t|x_0)} \end{align} $$

Case 2: $x_{t-\Delta t} \ne x_t$,

$$ \newcommand{\E}{\mathbb{E}} \newcommand{\Dt}{\Delta t} \begin{align} &q(x_{t}|x_{t-\Dt})\frac{q(x_{t-\Dt}|x_0)}{q(x_{t}| x_0)} \log \frac{q(x_{t}|x_{t-\Dt})\frac{q(x_{t-\Dt}|x_0)}{q(x_{t}| x_0)}}{p_\theta(x_{t-\Dt}|x_t)} \\ =& \sum_{y \ne x_t} Q_t(y, x_t)\Dt \frac{q(y|x_0)}{q(x_{t}| x_0)} \log \frac{Q_t(y, x_t)\Dt \frac{q(y|x_0)}{q(x_{t}| x_0)}}{\tilde{Q}_t^\theta(x_t, y)\Dt} \\ =&\sum_{y \ne x_t} Q_t(y, x_t)\Dt \frac{q(y|x_0)}{q(x_{t}| x_0)} [\log \frac{q(y|x_0)}{q(x_{t}| x_0)} - \log s_\theta(x_t, t)_y] \end{align} $$

Therefore, combining Eq. 14, Eq. 20, Eq. 25, Eq. 33, and Eq. 36, we have:

$$ \newcommand{\E}{\mathbb{E}} \newcommand{\Dt}{\Delta t} \begin{align} &\E_{q(x_t|x_0)}\text{KL}(q(x_{t-\Dt}|x_{t}, x_0) || p_\theta(x_{t-\Dt}|x_t) ) \\ =&\E_{q(x_t|x_0)} \{ -\sum_{y \ne x_t} Q_t(y, x_t)\Dt \frac{q_{t|0}(y|x_0)}{q_{t|0}(x_{t}| x_0)} + \sum_{y \ne x_t} Q_t(y, x_t) s_\theta(x_t, t)_y \Dt \\&+ \sum_{y \ne x_t} Q_t(y, x_t)\Dt \frac{q(y|x_0)}{q(x_{t}| x_0)} [\log \frac{q(y|x_0)}{q(x_{t}| x_0)} - \log s_\theta(x_t, t)_y] \}\\ =& \Dt\, \E_{q(x_t|x_0)} \sum_{y \ne x_t} Q_t(y, x_t)[s_\theta(x_t, t)_y - \frac{q(y|x_0)}{q(x_{t}| x_0)} \log s_\theta(x_t, t)_y + K(\frac{q(y|x_0)}{q(x_{t}| x_0)})] \quad\quad \end{align} $$

where $K(a) = a(\log a - 1)$.

Then we have the Denoising Score Entropy (DSE) loss: