\[\begin{gathered}
\mathcal{L}_\text{PT}=\sum_{i=1}^N{
\sum_{j=1}^\ell{
\bigg\|
\frac{h_{i,j}^S}{\|h_{i,j}^S\|_2}-\frac{h_{i,g(j)}^T}{\|h_{i,g(j)}^T\|_2}
\bigg\|_2^2
}
}, \\
\text{where }N\text{ is the number of training examples, }\ell\text{ is the number of student layers,} \\
\text{and }g(j)\text{ maps student layer }j\text{ to its aligned teacher layer.}
\end{gathered}\]
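As a minimal PyTorch sketch (names hypothetical), the patient loss can be computed from per-layer hidden states, with the batch standing in for the dataset-level sum over $i$. In PKD these hidden states are the per-layer `[CLS]` vectors; `layer_map` implements $g(j)$.

```python
import torch
import torch.nn.functional as F

def patient_loss(student_hidden, teacher_hidden, layer_map):
    """L_PT: squared distance between L2-normalized hidden states.

    student_hidden[j], teacher_hidden[k]: [batch, d] tensors, one per layer;
    layer_map[j] = g(j) picks the teacher layer matched to student layer j.
    """
    loss = torch.zeros((), device=student_hidden[0].device)
    for j, h_s in enumerate(student_hidden):
        h_t = teacher_hidden[layer_map[j]]
        # normalize each hidden state to unit L2 norm before comparing
        diff = F.normalize(h_s, dim=-1) - F.normalize(h_t, dim=-1)
        loss = loss + diff.pow(2).sum(dim=-1).sum()  # sum over the batch (index i)
    return loss
```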
\[\begin{gathered}
\mathcal{L}_\text{PKD}=(1-\alpha)\mathcal{L}_\text{CE}+\alpha\mathcal{L}_\text{KD}+\beta\mathcal{L}_\text{PT}, \\
\text{where }\mathcal{L}_\text{CE}\text{ is the cross-entropy on ground-truth labels, }\mathcal{L}_\text{KD}\text{ is the standard} \\
\text{soft-label distillation loss, and }\alpha,\beta\text{ are balancing hyperparameters.}
\end{gathered}\]
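A sketch of the combined objective, assuming the usual temperature-scaled soft-label loss for $\mathcal{L}_\text{KD}$; the temperature `T` and the default hyperparameter values are illustrative, not prescribed by the formula.

```python
import torch.nn.functional as F

def pkd_objective(logits_s, logits_t, labels, loss_pt,
                  alpha=0.5, beta=10.0, T=2.0):  # illustrative hyperparameters
    ce = F.cross_entropy(logits_s, labels)
    # soft-label KD: KL(teacher || student) at temperature T, scaled by T^2
    kd = F.kl_div(F.log_softmax(logits_s / T, dim=-1),
                  F.softmax(logits_t / T, dim=-1),
                  reduction="batchmean") * T * T
    return (1 - alpha) * ce + alpha * kd + beta * loss_pt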
\[\begin{gathered}
\mathcal{L}_\text{TKD}=\sum_{x\in\mathcal{X}}{
\sum_{i=0}^{\ell_S+1}{
\lambda_i\mathcal{L}_i\big(
f_i^S(x),f_{g(i)}^T(x)
\big)
}
}, \\
\text{where }g(i)\text{ maps student layer }i\text{ to its aligned teacher layer, }\lambda_i\text{ weights the} \\
\text{loss of the }i\text{-th layer, and }\ell_S\text{ is the number of student layers.}
\end{gathered}\]
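Schematically (all names hypothetical), the objective is just a weighted sum of per-layer losses, where the $\mathcal{L}_i$ are defined by the equations that follow:

```python
def tkd_objective(feats_s, feats_t, layer_losses, lambdas, layer_map):
    """feats_*[i]: behaviour of layer i (embeddings, attention, hidden, logits);
    layer_losses[i] implements L_i; lambdas has one weight per index 0..l_S+1;
    layer_map implements g(i)."""
    total = 0.0
    for i, lam in enumerate(lambdas):
        total = total + lam * layer_losses[i](feats_s[i], feats_t[layer_map[i]])
    return total
```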
\[\begin{gathered}
\mathcal{L}_\text{attn}=\frac{1}{h}\sum_{i=1}^h{
\text{MSE}(A_i^S,A_i^T)
}, \\
A=\frac{QK^\intercal}{\sqrt{d_k}}, \\
\text{Attention}(Q,K,V)=\text{softmax}(A)V, \\
\text{where }h\text{ is the number of attention heads and }A\text{ is the unnormalized (pre-softmax) attention score matrix.}
\end{gathered}\]
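A sketch assuming teacher and student share the head count $h$ and sequence length; `F.mse_loss` takes the element-wise mean, which already includes the $1/h$ average over heads.

```python
import torch.nn.functional as F

def attn_loss(scores_s, scores_t):
    # scores_*: [batch, h, seq, seq] unnormalized attention scores QK^T / sqrt(d_k)
    return F.mse_loss(scores_s, scores_t)
```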
\[\begin{gathered}
\mathcal{L}_\text{hidn}=\text{MSE}(H^S\cdot{W_h},H^T), \\
\text{where }H^S\in\mathbb{R}^{\text{length}\times\tilde{d}}\text{ and }H^T\in\mathbb{R}^{\text{length}\times d}\text{ are the hidden states of the} \\
\text{student and teacher networks, respectively, and }W_h\in\mathbb{R}^{\tilde{d}\times d}\text{ is a learnable} \\
\text{projection matrix mapping the student's hidden size }\tilde{d}\text{ to the teacher's }d.
\end{gathered}\]
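Since $\tilde{d}\ne d$ in general, $W_h$ has to be a learnable map trained jointly with the student. A minimal module (names hypothetical):

```python
import torch.nn as nn
import torch.nn.functional as F

class HiddenLoss(nn.Module):
    def __init__(self, d_student, d_teacher):
        super().__init__()
        # W_h: projects student hidden states into the teacher's space
        self.proj = nn.Linear(d_student, d_teacher, bias=False)

    def forward(self, h_s, h_t):
        # h_s: [batch, length, d_student], h_t: [batch, length, d_teacher]
        return F.mse_loss(self.proj(h_s), h_t)
```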
\[\begin{gathered}
\mathcal{L}_\text{embd}=\text{MSE}(E^S\cdot{W_e},E^T), \\
\text{where }E^S\text{ and }E^T\text{ are the token embedding matrices of the student and teacher} \\
\text{networks, and }W_e\text{ is a learnable projection matrix analogous to }W_h.
\end{gathered}\]
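The embedding loss has the same form as the hidden-state loss, so the sketch above can be reused with embeddings in place of hidden states:

```python
embd_loss = HiddenLoss(d_student, d_teacher)  # W_e takes the role of W_h
loss_embd = embd_loss(emb_s, emb_t)           # emb_*: [batch, length, d]
```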
\[\begin{aligned}
\mathcal{L}_\text{pred}&=-\sum_{c\in\mathcal{C}}{
P^T(\text{y}=c|x)\cdot\log{P^S(\text{y}=c|x)}
} \\
&=-\mathbb{E}_{\text{y}\sim{P^T(\cdot|x)}}\Big[
\log{P^S(\text{y}|x)}
\Big]
\end{aligned}\]
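In code, this is the cross-entropy of the student's log-probabilities against the teacher's soft distribution; implementations often also divide both logits by a temperature, which is omitted in this sketch.

```python
import torch.nn.functional as F

def pred_loss(logits_s, logits_t):
    p_t = F.softmax(logits_t, dim=-1)          # teacher's soft targets P^T(y|x)
    log_p_s = F.log_softmax(logits_s, dim=-1)  # student log-probabilities
    return -(p_t * log_p_s).sum(dim=-1).mean() # -E_{y ~ P^T(.|x)}[log P^S(y|x)]
```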
\[\begin{gathered}
\mathcal{L}_i=\begin{cases}
\mathcal{L}_\text{embd} &\text{if } i=0, \\
\mathcal{L}_\text{attn}+\mathcal{L}_\text{hidn} &\text{if } 0<i\le\ell_S, \\
\mathcal{L}_\text{pred} &\text{if }i=\ell_S+1.
\end{cases}
\end{gathered}\]
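Putting the pieces together, a hypothetical dispatch over the layer index, composing the helper sketches above and indexing the teacher side through $g(i)$ via `layer_map`:

```python
def layer_loss(i, l_s, feats_s, feats_t, hidden_mod, embd_mod, layer_map):
    """Implements the case analysis for L_i; feats_* are dicts with the
    hypothetical keys "embd", "attn", "hidn", "logits"."""
    if i == 0:
        return embd_mod(feats_s["embd"], feats_t["embd"])              # L_embd
    elif i <= l_s:
        j = layer_map[i]                                               # g(i)
        return (attn_loss(feats_s["attn"][i], feats_t["attn"][j])      # L_attn
                + hidden_mod(feats_s["hidn"][i], feats_t["hidn"][j]))  # L_hidn
    else:  # i == l_s + 1: the prediction layer
        return pred_loss(feats_s["logits"], feats_t["logits"])         # L_pred
```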