Appendix
Proof of Theorem 1
For \(y_{i} \sim NB(\mu _{i}, \alpha )\), the following basic results are repeatedly used:
$$\begin{aligned} E(y_{i} - \mu _{i})^2= & {} \mu _{i}(1 + \alpha \mu _{i}) \\ E(y_{i} - \mu _{i})^3= & {} \mu _{i}(1 + \alpha \mu _{i})(1 + 2\alpha \mu _{i}) \\ E(y_{i} - \mu _{i})^4= & {} \mu _{i}(1 + \alpha \mu _{i})(1 + 3\mu _{i} + 6\alpha \mu _{i} + 3\alpha \mu _{i}^{2} + 6\alpha ^2 \mu _{i}^2) \end{aligned}$$
From these expectations, we have
$$\begin{aligned} E \left( \frac{\partial \ell _{i}}{\partial \sigma _{v}^{2}} \right) ^{2}= & {} E \left\{ \frac{1}{2} \cdot \frac{(y_{i} - \mu _{i})^{2} - (\mu _{i} + \alpha \mu _{i} y_{i})}{(1 + \alpha \mu _{i})^{2}} \right\} ^{2} \\= & {} \frac{1}{4} \cdot \frac{E(y_{i} - \mu _{i})^{4} - 2 \alpha \mu _{i} E(y_{i} - \mu _{i})^{3} + [\alpha ^{2} \mu _{i}^2 - 2(\mu _{i} + \alpha \mu _{i}^{2})] E(y_{i} - \mu _{i})^{2} + (\mu _{i} + \alpha \mu _{i}^{2})^{2}}{(1 + \alpha \mu _{i})^{4}} \\= & {} \frac{1}{4} \cdot \frac{(1 + 2 \mu _{i} + 3 \alpha \mu _{i}) \mu _{i}}{(1 + \alpha \mu _{i})^{2}}, \\ E \left( \frac{\partial \ell _{i}}{\partial \sigma _{v}^{2}} \frac{\partial \ell _{i}}{\partial \beta ^{T}} \right)= & {} E \left\{ \frac{1}{2} \cdot \frac{(y_{i} - \mu _{i})^{2} - (\mu _{i} + \alpha \mu _{i} y_{i})}{(1 + \alpha \mu _{i})^{2}} \right\} \left\{ \frac{y_{i} - \mu _{i}}{1 + \alpha \mu _{i}} x_{i}^{T} \right\} \\= & {} \frac{1}{2} \cdot \frac{E(y_{i} - \mu _{i})^{3} - \mu _{i}E(y_{i} - \mu _{i}) - \alpha \mu _{i} (E(y_{i}^2) - \mu _{i}^2)}{(1 + \alpha \mu _{i})^{3}} x_{i}^{T} \\= & {} \frac{1}{2} \cdot \frac{\mu _{i}}{(1 + \alpha \mu _{i})} x_{i}^{T}, \\ E \left( \frac{\partial \ell _{i}}{\partial \sigma _{v}^{2}} \frac{\partial \ell _{i}}{\partial \alpha } \right)= & {} E \left( - \frac{\partial ^{2} \ell _{i}}{\partial \sigma _{v}^{2} \partial \alpha } \right) = E \left( \frac{\partial }{\partial \alpha } \left\{ - \frac{1}{2} \cdot \frac{(y_{i} - \mu _{i})^{2} - (\mu _{i} + \alpha \mu _{i} y_{i})}{(1 + \alpha \mu _{i})^{2}} \right\} \right) \\= & {} \frac{1}{2} \cdot \frac{\mu _{i} (1 + \alpha \mu _{i})^2 E(y_{i}) - [E(y_{i} - \mu _{i})^2 - \mu _{i} -\alpha \mu _{i} E(y_{i})] \cdot 2 (1 + \alpha \mu _{i}) \mu _{i}}{(1 + \alpha \mu _{i})^{4}} \\= & {} \frac{1}{2} \cdot \frac{\mu _{i}^2}{(1 + \alpha \mu _{i})^2}, \\ E \left( \frac{\partial \ell _{i}}{\partial \beta } \frac{\partial \ell _{i}}{\partial \beta ^{T}} \right)= & {} E \left\{ \frac{y_{i} - \mu _{i}}{1 
+ \alpha \mu _{i}} x_{i} \right\} \left\{ \frac{y_{i} - \mu _{i}}{1 + \alpha \mu _{i}} x_{i}^{T} \right\} = \frac{E(y_{i} - \mu _{i})^2}{(1 + \alpha \mu _{i})^2} x_{i} x_{i}^{T} \\= & {} \frac{\mu _{i}}{(1 + \alpha \mu _{i})} x_{i} x_{i}^{T}, \\ E \left( \frac{\partial \ell _{i}}{\partial \beta } \frac{\partial \ell _{i}}{\partial \alpha } \right)= & {} E \left( - \frac{\partial ^{2} \ell _{i}}{\partial \beta \partial \alpha } \right) = E \left( \frac{\partial }{\partial \alpha } \left\{ - \frac{y_{i} - \mu _{i}}{1 + \alpha \mu _{i}} x_{i}^{T} \right\} \right) \\= & {} - \frac{E( y_{i} - \mu _{i}) \mu _{i}}{(1 + \alpha \mu _{i})^{2}} x_{i}^{T} = 0. \end{aligned}$$
Following Lawless (1987), we have
$$\begin{aligned} E \left( \frac{\partial \ell _{i}}{\partial \alpha } \right) ^{2}= & {} E \left( - \frac{\partial ^{2} \ell _{i}}{\partial \alpha ^{2}} \right) = \alpha ^{-4} \left\{ E \left( \sum _{j=0}^{y_{i}-1}(\alpha ^{-1} + j)^{-2} \right) - \frac{\alpha \mu _{i}}{\mu _{i} + \alpha ^{-1}} \right\} \\= & {} \alpha ^{-4} \left( \sum _{j=0}^{\infty }(\alpha ^{-1} + j)^{-2} \Pr (y_{i} \ge j+1) - \frac{\alpha \mu _{i}}{\mu _{i} + \alpha ^{-1}} \right) . \end{aligned}$$
Under the regularity conditions for the central limit theorem applied to the score components and standard maximum likelihood theory (see Chesher 1984), asymptotic normality follows for \(T_A\). \(\square \)
Proof of Theorem 2
Using the basic results established in the proof of Theorem 1, we have
$$\begin{aligned} E \left( \frac{\partial \ell _{i}}{\partial \sigma _{v}^{2}} \right) ^{2}= & {} E \left\{ \frac{1}{2} \cdot \frac{(1 + \alpha )(y_{i} - \mu _{i})^{2} - \alpha y_{i}^{2} - y_{i}}{(1 + \alpha \mu _{i})^{2}} \right\} ^{2} \\= & {} \frac{1}{4} \cdot \frac{E(y_{i} - \mu _{i})^{4} - 2 (1 + 2 \alpha \mu _{i}) E(y_{i} - \mu _{i})^{3} + [(1 + 2 \alpha \mu _{i})^2 - 2(\mu _{i} + \alpha \mu _{i}^{2})] E(y_{i} - \mu _{i})^{2} + (\mu _{i} + \alpha \mu _{i}^{2})^{2}}{(1 + \alpha \mu _{i})^{4}} \\= & {} \frac{1}{4} \cdot \frac{2 \mu _{i}^{2} (1 + \alpha )}{(1 + \alpha \mu _{i})^{2}}, \\ E \left( \frac{\partial \ell _{i}}{\partial \sigma _{v}^{2}} \frac{\partial \ell _{i}}{\partial \beta ^{T}} \right)= & {} E \left\{ \frac{1}{2} \cdot \frac{(1 + \alpha )(y_{i} - \mu _{i})^{2} - \alpha y_{i}^{2} - y_{i}}{(1 + \alpha \mu _{i})^{2}} \right\} \left\{ \frac{y_{i} - \mu _{i}}{1 + \alpha \mu _{i}} x_{i}^{T} \right\} \\= & {} \frac{1}{2} \cdot \frac{(1 + \alpha ) E(y_{i} - \mu _{i})^{3} - \alpha E(y_{i}^{3}) + \alpha \mu _{i} E(y_{i}^{2}) - E(y_{i}^{2}) + \mu _{i} E(y_{i})}{(1 + \alpha \mu _{i})^{3}} x_{i}^{T} \\= & {} 0, \\ E \left( \frac{\partial \ell _{i}}{\partial \sigma _{v}^{2}} \frac{\partial \ell _{i}}{\partial \alpha } \right)= & {} E \left( - \frac{\partial ^{2} \ell _{i}}{\partial \sigma _{v}^{2} \partial \alpha } \right) = E \left( \frac{\partial }{\partial \alpha } \left\{ - \frac{1}{2} \cdot \frac{(1 + \alpha )(y_{i} - \mu _{i})^{2} - \alpha y_{i}^{2} - y_{i}}{(1 + \alpha \mu _{i})^{2}} \right\} \right) \\= & {} - \frac{1}{2} \cdot \frac{[E(y_{i} - \mu _{i})^2 - E(y_{i}^2)](1 + \alpha \mu _{i})^{2} - [(1 + \alpha ) E(y_{i} - \mu _{i})^{2} - \alpha E(y_{i}^{2}) - E(y_{i})] \cdot 2 \mu _{i} (1 + \alpha \mu _{i})}{(1 + \alpha \mu _{i})^{4}} \\= & {} \frac{1}{2} \cdot \frac{\mu _{i}^2}{(1 + \alpha \mu _{i})^2}, \\ E \left( \frac{\partial \ell _{i}}{\partial \beta } \frac{\partial \ell _{i}}{\partial \beta ^{T}} \right)= & {} E \left\{ \frac{y_{i} - \mu _{i}}{1 + \alpha 
\mu _{i}} x_{i} \right\} \left\{ \frac{y_{i} - \mu _{i}}{1 + \alpha \mu _{i}} x_{i}^{T} \right\} = \frac{E(y_{i} - \mu _{i})^2}{(1 + \alpha \mu _{i})^2} x_{i} x_{i}^{T} \\= & {} \frac{\mu _{i}}{(1 + \alpha \mu _{i})} x_{i} x_{i}^{T}, \\ E \left( \frac{\partial \ell _{i}}{\partial \beta } \frac{\partial \ell _{i}}{\partial \alpha } \right)= & {} E \left( - \frac{\partial ^{2} \ell _{i}}{\partial \beta \partial \alpha } \right) = E \left( \frac{\partial }{\partial \alpha } \left\{ - \frac{y_{i} - \mu _{i}}{1 + \alpha \mu _{i}} x_{i}^{T} \right\} \right) \\= & {} - \frac{E( y_{i} - \mu _{i}) \mu _{i}}{(1 + \alpha \mu _{i})^{2}} x_{i}^{T} = 0.\\ E \left( \frac{\partial \ell _{i}}{\partial \alpha } \right) ^{2}= & {} E \left( - \frac{\partial ^{2} \ell _{i}}{\partial \alpha ^{2}} \right) = \alpha ^{-4} \left( \sum _{j=0}^{\infty }(\alpha ^{-1} + j)^{-2} {\Pr (y_{i} \ge j + 1)} - \frac{\alpha \mu _{i}}{\mu _{i} + \alpha ^{-1}} \right) . \end{aligned}$$
As in the proof of Theorem 1, asymptotic normality follows for \(T\) under the same regularity conditions. \(\square \)
Proof of Theorem 3
Consider first the numerators of the two score statistics:
$$\begin{aligned}&S(\beta ,\alpha ) = \dfrac{1}{2} \displaystyle \sum \limits _{i} \frac{(1 + \alpha )(y_{i} - \mu _{i})^{2} - \alpha y_{i}^2 - y_{i}}{(1 + \alpha \mu _{i})^{2}}, \quad S_{A}(\beta ,\alpha ) = \dfrac{1}{2} \displaystyle \sum \limits _{i} \frac{(y_{i} - \mu _{i})^{2} - (\mu _{i} + \alpha \mu _{i} y_{i})}{(1 + \alpha \mu _{i})^{2}} \end{aligned}$$
Their difference becomes:
$$\begin{aligned} S(\beta ,\alpha ) - S_{A}(\beta ,\alpha )= & {} \frac{1}{2} \sum _{i} \frac{\alpha (y_{i} - \mu _{i})^{2} - \alpha y_{i}^2 - y_{i} + \mu _{i} + \alpha \mu _{i} y_{i}}{(1 + \alpha \mu _{i})^{2}} \\= & {} - \frac{1}{2} \sum _{i} \frac{(y_{i} - \mu _{i}) (1 + \alpha \mu _{i}) }{(1 + \alpha \mu _{i})^{2}} = - \frac{1}{2} \sum _{i} \frac{(y_{i} - \mu _{i}) }{(1 + \alpha \mu _{i})}. \end{aligned}$$
Suppose that the NB regression model includes the intercept. Then, the MLE for \(\beta _{0}\) satisfies
$$\begin{aligned} \sum _{i} \frac{\partial \ell _{i}}{\partial \beta _{0}}= & {} \sum _{i} \frac{(y_{i} - \widehat{\mu }_{i})}{(1 + \alpha \widehat{\mu }_{i})} = 0, \end{aligned}$$
which implies \(S(\widehat{\beta },\widehat{\alpha })=S_{A}(\widehat{\beta },\widehat{\alpha })\).
Now consider the denominator parts of the score statistics. Note that
$$\begin{aligned} V= & {} \frac{1}{4} \sum _{i} \frac{2 \widehat{\mu }_{i}^2 (1 + \widehat{\alpha })}{(1 + \widehat{\alpha } \widehat{\mu }_{i})^2} - \begin{pmatrix} 0&\frac{1}{2} \sum _{i} \frac{\widehat{\mu }_{i}^2}{(1 + \widehat{\alpha } \widehat{\mu }_{i})^{2}} \end{pmatrix} \begin{pmatrix} X^{T} W X&{} 0 \\ 0 &{} i(\widehat{\beta }, \widehat{\alpha }) \\ \end{pmatrix}^{-1}\\&\begin{pmatrix} 0 \\ \frac{1}{2} \sum _{i} \frac{\widehat{\mu }_{i}^2}{(1 + \widehat{\alpha } \widehat{\mu }_{i})^{2}} \end{pmatrix} \text {and} \\ V_{A}= & {} \frac{1}{4} \sum _{i} \frac{(1 + 2\widehat{\mu }_{i} + 3\widehat{\alpha }\widehat{\mu }_{i}) \widehat{\mu }_{i}}{(1 + \widehat{\alpha } \widehat{\mu }_{i})^2} - \begin{pmatrix} \frac{1}{2} \sum _{i} \frac{\widehat{\mu }_{i}}{1 + \widehat{\alpha } \widehat{\mu }_{i}} x^{T}_{i}&\frac{1}{2} \sum _{i} \frac{\widehat{\mu }_{i}^2}{(1 + \widehat{\alpha } \widehat{\mu }_{i})^{2}} \end{pmatrix}\\&\begin{pmatrix} X^{T} W X &{} 0 \\ 0 &{} i(\widehat{\beta }, \widehat{\alpha }) \\ \end{pmatrix}^{-1} \begin{pmatrix} \frac{1}{2} \sum _{i} \frac{\widehat{\mu }_{i}}{1 + \widehat{\alpha } \widehat{\mu }_{i}} x_{i} \\ \frac{1}{2} \sum _{i} \frac{\widehat{\mu }_{i}^2}{(1 + \widehat{\alpha } \widehat{\mu }_{i})^{2}} \end{pmatrix}. \end{aligned}$$
By simplifying the second terms in \(V\) and \(V_{A}\), we have
$$\begin{aligned} V= & {} \frac{1}{4} \sum _{i} \frac{2 \widehat{\mu }_{i}^2 (1 + \widehat{\alpha })}{(1 + \widehat{\alpha } \widehat{\mu }_{i})^2} - \frac{1}{4} \left( \sum _{i} \frac{\widehat{\mu }_{i}^2}{(1 + \widehat{\alpha } \widehat{\mu }_{i})^2} \right) ^{2} i(\widehat{\beta }, \widehat{\alpha })^{-1} \end{aligned}$$
and
$$\begin{aligned} V_{A}= & {} \frac{1}{4} \sum _{i} \frac{(1 + 2\widehat{\mu }_{i} + 3\widehat{\alpha }\widehat{\mu }_{i}) \widehat{\mu }_{i}}{(1 + \widehat{\alpha } \widehat{\mu }_{i})^2} - \frac{1}{4} \left( \sum _{i} \frac{\widehat{\mu }_{i}}{1 + \widehat{\alpha } \widehat{\mu }_{i}} x^{T}_{i} \right) \left( X^{T} W X \right) ^{-1} \\&\left( \sum _{i} \frac{\widehat{\mu }_{i}}{1 + \widehat{\alpha } \widehat{\mu }_{i}} x_{i} \right) - \frac{1}{4} \left( \sum _{i} \frac{\widehat{\mu }_{i}^2}{(1 + \widehat{\alpha } \widehat{\mu }_{i})^2} \right) ^{2} i(\widehat{\beta }, \widehat{\alpha })^{-1} \end{aligned}$$
Since
$$\begin{aligned}&\left( \sum _{i} \frac{\widehat{\mu }_{i}}{1 + \widehat{\alpha } \widehat{\mu }_{i}} x^{T}_{i} \right) \left( X^{T} W X \right) ^{-1} \left( \sum _{i} \frac{\widehat{\mu }_{i}}{1 + \widehat{\alpha } \widehat{\mu }_{i}} x_{i} \right) =1^{T}WX(X^{T}WX)^{-1}X^{T}W1\\&\quad =(X^{T}WX(X^{T}WX)^{-1}X^{T}WX)_{11}=(X^{T}WX)_{11}\\&\quad =1^{T}W1=\sum _{i} \frac{\widehat{\mu }_{i}}{(1 + \widehat{\alpha } \widehat{\mu }_{i})}, \end{aligned}$$
we have \(V=V_{A}\). Consequently, the two score statistics are equivalent. \(\square \)