\[\begin{gathered}
\mathcal{D}_\text{train}=\{(x_i, y_i)\}_{i=1}^N \\
|\mathcal{D}_\text{train}|=|\mathcal{D}_\text{dev}|=N
\end{gathered}\]
\[\begin{gathered}
x_\text{in}=\begin{cases}
x^1, &\text{if a single sentence} \\
(x^1,x^2), &\text{if a pair of sentences}
\end{cases}
\end{gathered}\]
\[\begin{gathered}
\mathcal{M}:\mathcal{Y}\rightarrow\mathcal{V}\text{, where }\mathcal{M}\text{ is a mapping function from class labels to words in the vocabulary }\mathcal{V}. \\
x_\text{prompt}=\mathcal{T}(x_\text{in})\text{, where }x_\text{prompt}\text{ contains exactly one [MASK] token.}
\end{gathered}\]
\[\begin{aligned}
P(y|x_\text{in})&=P(\text{[MASK]}=\mathcal{M}(y)|x_\text{prompt}) \\
&=\frac{\exp(\text{w}_{\mathcal{M}(y)}\cdot\text{h}_\text{[MASK]})}{\sum_{y'\in\mathcal{Y}}{\exp(\text{w}_{\mathcal{M}(y')}\cdot\text{h}_\text{[MASK]})}}
\end{aligned}\]