Link

Back Translation Review

Equations

\[\begin{gathered} \mathcal{B}=\{(x^n,y^n)\}_{n=1}^N \\ \mathcal{M}=\{y^s\}_{s=1}^S \end{gathered}\] \[\mathcal{L}(\theta_{x\rightarrow{y}})=-\sum_{n=1}^N{ \log{P(y^n|x^n;\theta_{x\rightarrow{y}})} }-\sum_{s=1}^S{ \log{P(y^s)} }\] \[\begin{aligned} \log{P(y)}&=\log{\sum_{x\in\mathcal{X}}{P(x,y)}} \\ &=\log{\sum_{x\in\mathcal{X}}{P(y|x)P(x)}} \\ &=\log{\sum_{x\in\mathcal{X}}{\frac{P(y|x)P(x)}{P(x|y)}P(x|y)}} \end{aligned}\]

Jensen’s Inequality

\[\begin{aligned} \log{P(y)}&=\log{\sum_{x\in\mathcal{X}}{P(x,y)}} \\ &=\log{\sum_{x\in\mathcal{X}}{P(y|x)P(x)}} \\ &=\log{\sum_{x\in\mathcal{X}}{\frac{P(y|x)P(x)}{P(x|y)}P(x|y)}} \\ &\ge\sum_{x\in\mathcal{X}}{ P(x|y)\log{\frac{P(y|x)P(x)}{P(x|y)}} } \\ &=\mathbb{E}_{x\sim{P(\text{x}|y)}}\Big[ \log{\frac{P(y|x)P(x)}{P(x|y)}} \Big] \\ &=\mathbb{E}_{x\sim{P(\text{x}|y)}}\big[ \log{P(y|x)} \big]-\text{KL}\big(P(\text{x}|y)\|P(\text{x})\big) \end{aligned}\] \[\begin{aligned} \mathcal{L}(\theta_{x\rightarrow{y}}) &\le-\sum_{n=1}^N{ \log{P(y^n|x^n;\theta_{x\rightarrow{y}})} }-\sum_{s=1}^S{ \Big( \mathbb{E}_{x\sim{P(\text{x}|y^s;\theta_{y\rightarrow{x}})}}\big[ \log{P(y^s|x;\theta_{x\rightarrow{y}})} \big]-\text{KL}\big( P(\text{x}|y^s;\theta_{y\rightarrow{x}})\|P(\text{x}) \big) \Big) } \\ &\approx-\sum_{n=1}^N{ \log{P(y^n|x^n;\theta_{x\rightarrow{y}})} }-\sum_{s=1}^S{\Bigg( \frac{1}{K}\sum_{k=1}^K{ \log{P(y^s|x_k^s;\theta_{x\rightarrow{y}})} }-\text{KL}\big( P(\text{x}|y^s;\theta_{y\rightarrow{x}})\|P(\text{x}) \big) \Bigg)}\text{, where }x_k^s\sim{P(\text{x}|y^s;\theta_{y\rightarrow{x}})} \\ &=\tilde{\mathcal{L}}(\theta_{x\rightarrow{y}}) \end{aligned}\] \[\begin{aligned} \nabla_{\theta_{x\rightarrow{y}}}\tilde{\mathcal{L}}(\theta_{x\rightarrow{y}}) &=-\nabla_{\theta_{x\rightarrow{y}}}\sum_{n=1}^N{ \log{P(y^n|x^n;\theta_{x\rightarrow{y}})} }-\nabla_{\theta_{x\rightarrow{y}}}\frac{1}{K}\sum_{s=1}^S{ \sum_{k=1}^K{ \log{P(y^s|x_k^s;\theta_{x\rightarrow{y}})} } } \\ &\approx-\nabla_{\theta_{x\rightarrow{y}}}\sum_{n=1}^N{ \log{P(y^n|x^n;\theta_{x\rightarrow{y}})} }-\nabla_{\theta_{x\rightarrow{y}}}\sum_{s=1}^S{ \log{P(y^s|\hat{x}^s;\theta_{x\rightarrow{y}})} }\text{, where }\hat{x}^s\sim{P(\text{x}|y^s;\theta_{y\rightarrow{x}})}\text{ and }K=1. \end{aligned}\] \[\theta_{x\rightarrow{y}}\leftarrow\theta_{x\rightarrow{y}}-\eta\nabla_{\theta_{x\rightarrow{y}}}\tilde{\mathcal{L}}(\theta_{x\rightarrow{y}})\]

Evaluations

\[\begin{aligned} \log{P(y)}&=\log{\sum_{c}{\sum_{x}{P(x,y,c)}}} \\ &=\log{\sum_{c}{\sum_{x}{P(y|x,c)P(x|c)P(c)}}} \\ &=\log{\sum_{c}{\sum_{x}{ \frac{P(y|x,c)P(x|c)}{P(x|y,c)}P(x|y,c)P(c) }}} \\ &\ge\sum_{c}{\sum_{x}}P(c)P(x|y,c)\log{\frac{P(y|x,c)P(x|c)}{P(x|y,c)}} \\ &=\mathbb{E}_{c\sim{P(\text{c})}}\Big[ \mathbb{E}_{x\sim{P(\text{x}|y,c)}}\big[ \log{P(y|x,c)} \big] -\text{KL}(P(\text{x}|y,c)\|P(\text{x}|c)) \Big] \end{aligned}\] \[\begin{aligned} \log{P(y)}&=\log{\sum_{c}{\sum_{x}{P(x,y,c)}}} \\ &=\log{\sum_{c}{\sum_{x}{P(y|x,c)P(c|x)P(x)}}} \\ &=\log{\sum_{c}{\sum_{x}{ \frac{P(y|x,c)P(c|x)P(x)}{P(c)P(x|y,c)}P(x|y,c)P(c) }}} \\ &\ge\sum_{c}{\sum_{x}}P(c)P(x|y,c)\log{\frac{P(y|x,c)P(c|x)P(x)}{P(c)P(x|y,c)}} \\ &=\mathbb{E}_{c\sim{P(\text{c})},x\sim{P(\text{x}|y,c)}}\Big[\log{P(y|x,c)}+\log{\frac{P(c|x)}{P(c)}}+\log{\frac{P(x)}{P(x|y,c)}}\Big] \end{aligned}\]