}, \\
\text{where }g(j)\text{ is a layer index mapping function from student to teacher.}
}, \\
\text{where }g(i)\text{ is a layer index mapping function from student to teacher.}
}, \\
A=\frac{QK^\intercal}{\sqrt{d_k}}, \\
\text{Attention}(Q,K,V)=\text{softmax}(A)V, \\
\text{where }h\text{ is the number of attention heads and }A\text{ is the attention matrix.}
\mathcal{L}_\text{hidn}=\text{MSE}(H^S\cdot{W_h},H^T), \\
\text{where }H^S\in\mathbb{R}^{\text{length}\times{\tilde{d}}}\text{ and }H^T\in\mathbb{R}^{\text{length}\times{d}} \\
\text{ refer to the hidden states of the student and teacher, respectively,} \\
\text{and }W_h\in\mathbb{R}^{\tilde{d}\times{d}}\text{ is a projection matrix.}
\mathcal{L}_\text{embd}=\text{MSE}(E^S\cdot{W_e},E^T), \\
\text{where }E^S,E^T,W_e\text{ refer to the token embeddings of the student} \\
\text{and teacher networks and the projection matrix, respectively.}
} \\
\mathcal{L}_\text{embd} &\text{if } i=0, \\
\mathcal{L}_\text{attn}+\mathcal{L}_\text{hidn} &\text{if } 0<i\le\ell_S, \\
\mathcal{L}_\text{pred} &\text{if }i=\ell_S+1.