% \documentclass[mathserif]{beamer} % Get Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usetheme{Berlin} % Displays sections on top
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols at bottom
% \usetheme{Berlin} % Displays sections on top
% \usetheme{Warsaw} % Displays sections on top
% \usetheme{Frankfurt} % Displays sections on top: Fairly thin but swallows some material at bottom of crowded slides
\usepackage[english]{babel}
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode{\setbeamercolor{background canvas}{bg=black!5}}

\title{Likelihood Part Two\footnote{See last slide for copyright information.}}
\subtitle{STA2101 Fall 2019}
\date{} % To suppress date
% Cut out a lot of detail in 2014: see 2013 version

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Background Reading}
%\framesubtitle{It may be a little bit helpful.}
Appendix A, Section 6
\end{frame}

\begin{frame}{Vector of MLEs is Asymptotically Normal}{That is, Multivariate Normal}
\pause
This yields
\pause
\begin{itemize}
\item Confidence intervals for the parameters.
\item $Z$-tests of $H_0: \theta_j=\theta_0$.
\item Wald tests.
\item Score tests.
\item Indirectly, the likelihood ratio tests.
\end{itemize}
\end{frame}

\begin{frame}{Under Regularity Conditions}{(Thank you, Mr. Wald)}
\pause
\begin{itemize}
\item $\widehat{\boldsymbol{\theta}}_n \stackrel{a.s.}{\rightarrow} \boldsymbol{\theta}$
\pause
\item $\sqrt{n}(\widehat{\boldsymbol{\theta}}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow}
      \mathbf{T} \sim N_k\left(\mathbf{0}, \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}\right)$
\pause
\item So we say that $\widehat{\boldsymbol{\theta}}_n$ is asymptotically
      $N_k\left(\boldsymbol{\theta}, \frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}\right)$.
\pause
\item $\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$ is the Fisher Information in one observation.
\pause
\item A $k \times k$ matrix
\begin{displaymath}
\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}) =
\left[E[-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})]\right]
\end{displaymath}
\pause
\item The Fisher Information in the whole sample is $n\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{$\widehat{\boldsymbol{\theta}}_n$ is asymptotically
$N_k\left(\boldsymbol{\theta}, \frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}\right)$}
\pause
%\framesubtitle{}
\begin{itemize}
\item Asymptotic covariance matrix of $\widehat{\boldsymbol{\theta}}_n$ is
      $\frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}$,
      and of course we don't know $\boldsymbol{\theta}$.
\pause
\item For tests and confidence intervals, we need a good \emph{approximate}
      asymptotic covariance matrix,
\pause
\item based on a consistent estimate of the Fisher information matrix.
\pause
\item $\boldsymbol{\mathcal{I}}(\widehat{\boldsymbol{\theta}}_n)$ would do.
\pause
\item But it's inconvenient\pause: Need to compute partial derivatives and expected values in
\pause
\begin{displaymath}
\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}) =
\left[E[-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})]\right]
\end{displaymath}
\pause
and then substitute $\widehat{\boldsymbol{\theta}}_n$ for $\boldsymbol{\theta}$.
\end{itemize}
\end{frame}
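\begin{frame}{Example: Fisher Information for a Poisson Model}
\framesubtitle{A small worked illustration; the Poisson model is chosen only for concreteness}
\pause
For $Y \sim$ Poisson$(\lambda)$, $\log f(y;\lambda) = y \log\lambda - \lambda - \log y!$, so
\begin{displaymath}
-\frac{\partial^2}{\partial\lambda^2} \log f(Y;\lambda) = \frac{Y}{\lambda^2}
\quad \mbox{ and } \quad
\mathcal{I}(\lambda) = E\left(\frac{Y}{\lambda^2}\right) = \frac{1}{\lambda}.
\end{displaymath}
\pause
The Fisher Information in the whole sample is $n\mathcal{I}(\lambda) = n/\lambda$, so
$\widehat{\lambda}_n = \overline{Y}_n$ is asymptotically normal with variance
$\frac{1}{n}\mathcal{I}(\lambda)^{-1} = \lambda/n$.
\end{frame}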
\begin{frame}
\frametitle{Another approximation of the asymptotic covariance matrix}
\pause
% \framesubtitle{}
Approximate
\begin{displaymath}
\frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1} =
\left[ n \, E[-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})]\right]^{-1}
\end{displaymath}
\pause
with
\begin{displaymath}
\widehat{\mathbf{V}}_n = \left( \left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n} \right)^{-1}
\end{displaymath}

\vspace{4mm}
$\widehat{\mathbf{V}}_n^{-1}$ is called the ``observed Fisher information.''
\end{frame}

\begin{frame}{Observed Fisher Information}
\begin{itemize}
\item To find $\widehat{\boldsymbol{\theta}}_n$, minimize the minus log likelihood.
\item Matrix of mixed partial derivatives of the minus log likelihood is
\begin{displaymath}
\left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y}) \right]
= \left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \sum_{i=1}^n \log f(Y_i;\boldsymbol{\theta}) \right]
\end{displaymath}
\item So by the Strong Law of Large Numbers,
\begin{eqnarray*}
\boldsymbol{\mathcal{J}}_n(\boldsymbol{\theta}) &=&
\left[\frac{1}{n}\sum_{i=1}^n -\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y_i;\boldsymbol{\theta}) \right] \\
&\stackrel{a.s.}{\rightarrow}&
\left[E\left(-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})\right)\right] = \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})
\end{eqnarray*}
\end{itemize}
\end{frame}

\begin{frame}{A Consistent Estimator of $\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$}{Just substitute $\widehat{\boldsymbol{\theta}}_n$ for $\boldsymbol{\theta}$}
\begin{eqnarray*}
\boldsymbol{\mathcal{J}}_n(\widehat{\boldsymbol{\theta}}_n) &=&
\left[\frac{1}{n}\sum_{i=1}^n -\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y_i;\boldsymbol{\theta}) \right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n} \\
&\stackrel{a.s.}{\rightarrow}&
\left[E\left(-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})\right)\right] = \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})
\end{eqnarray*}
\begin{itemize}
\item Convergence is believable but not trivial.
\item Now we have a consistent estimator, more convenient than
      $\boldsymbol{\mathcal{I}}(\widehat{\boldsymbol{\theta}}_n)$:
      Use $\widehat{\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})}_n = \boldsymbol{\mathcal{J}}_n(\widehat{\boldsymbol{\theta}}_n)$.
\end{itemize}
\end{frame}
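\begin{frame}[fragile]{Numerical Sketch: $\boldsymbol{\mathcal{J}}_n(\widehat{\boldsymbol{\theta}}_n)$ Approaching $\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$}
Continuing the Poisson$(\lambda)$ example, for which $\mathcal{I}(\lambda) = 1/\lambda$,
here is a minimal Python sketch of this convergence; the true $\lambda$, sample sizes,
seed, and step size $h$ are illustrative choices.
{\footnotesize
\begin{verbatim}
import numpy as np
from scipy.special import gammaln

rng = np.random.default_rng(2101)
lam_true = 3.0   # illustrative true parameter

def mean_minus_loglike(lam, y):
    # (1/n) times the minus log likelihood for Poisson(lam)
    return -np.mean(y * np.log(lam) - lam - gammaln(y + 1))

def second_diff(f, x, h=1e-4):
    # Central finite-difference approximation to f''(x)
    return (f(x + h) - 2.0 * f(x) + f(x - h)) / h**2

for n in (10, 1000, 100000):
    y = rng.poisson(lam_true, size=n)
    lam_hat = y.mean()   # closed-form MLE of lambda
    J_hat = second_diff(lambda lam: mean_minus_loglike(lam, y), lam_hat)
    print(n, J_hat, 1.0 / lam_true)   # J_n(lam_hat) versus I(lambda)
\end{verbatim}
}
\end{frame}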
\begin{frame}{Approximate the Asymptotic Covariance Matrix}
\begin{itemize}
\item Asymptotic covariance matrix of $\widehat{\boldsymbol{\theta}}_n$ is
      $\frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}$.
\vspace{5mm}
\item Approximate it with
\begin{eqnarray*}
\widehat{\mathbf{V}}_n &=& \frac{1}{n} \boldsymbol{\mathcal{J}}_n(\widehat{\boldsymbol{\theta}}_n)^{-1} \\
&=& \frac{1}{n}\left( \frac{1}{n}\left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n} \right)^{-1} \\
&=& \left( \left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n} \right)^{-1}
\end{eqnarray*}
\end{itemize}
\end{frame}

\begin{frame}{Compare}{Hessian and (Estimated) Asymptotic Covariance Matrix}
\pause
\begin{itemize}
\item $\widehat{\mathbf{V}}_n = \left( \left[-\frac{\partial^2}
      {\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
      \right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n} \right)^{-1}$
\pause
\item Hessian at the MLE is $\mathbf{H} = \left[-\frac{\partial^2}
      {\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
      \right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n}$
\pause
\item So to estimate the asymptotic covariance matrix of $\widehat{\boldsymbol{\theta}}_n$,
      just invert the Hessian.
\pause
\item The Hessian is usually available as a by-product of numerical search for the MLE.
\end{itemize}
\end{frame}

\begin{frame}{Connection to Numerical Optimization}
\pause
\begin{itemize}
\item Suppose we are minimizing the minus log likelihood by a direct search.
\pause
\item We have reached a point where the gradient is close to zero. Is this point a minimum?
\pause
\item The Hessian is a matrix of mixed partial derivatives. If all its eigenvalues are
      positive at a point, the function is concave up there.
\pause
\item Partial derivatives are often approximated by the slopes of secant lines --
      no need to calculate them symbolically.
\pause
\item It's \emph{the} multivariable second derivative test.
\end{itemize}
\end{frame}

\begin{frame}{So to find the estimated asymptotic covariance matrix}
\pause
\begin{itemize}
\item Minimize the minus log likelihood numerically.
\pause
\item The Hessian at the place where the search stops is usually available.
\pause
\item Invert it to get $\widehat{\mathbf{V}}_n$.
\pause
\item This is so handy that sometimes we do it even when a closed-form expression
      for the MLE is available.
\end{itemize}
\end{frame}

\begin{frame}{Estimated Asymptotic Covariance Matrix $\widehat{\mathbf{V}}_n$ is Useful}
\pause
\begin{itemize}
\item Asymptotic standard error of $\widehat{\theta}_j$ is the square root of the
      $j$th diagonal element.
\pause
\item Denote the asymptotic standard error of $\widehat{\theta}_j$ by $S_{\widehat{\theta}_j}$.
\pause
\item Thus
\begin{displaymath}
Z_j = \frac{\widehat{\theta}_j-\theta_j}{S_{\widehat{\theta}_j}}
\end{displaymath}
\pause
is approximately standard normal.
\end{itemize}
\end{frame}
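\begin{frame}[fragile]{Numerical Sketch: Standard Errors from the Hessian}
A minimal sketch in Python of the recipe on the preceding slides: minimize the minus
log likelihood numerically, then invert the Hessian at the stopping point. The gamma
model, simulated data, starting values, and the finite-difference \texttt{hessian}
helper are illustrative choices, standing in for whatever the optimizer reports.
{\footnotesize
\begin{verbatim}
import numpy as np
from scipy import optimize, stats

rng = np.random.default_rng(2101)
y = rng.gamma(shape=2.0, scale=3.0, size=200)   # illustrative data

def minus_loglike(theta):
    a, b = theta   # gamma shape and scale, both > 0
    if a <= 0 or b <= 0:
        return np.inf
    return -np.sum(stats.gamma.logpdf(y, a, scale=b))

# Minimize the minus log likelihood by direct search.
fit = optimize.minimize(minus_loglike, x0=np.array([1.0, 1.0]),
                        method="Nelder-Mead")
theta_hat = fit.x

def hessian(f, x, h=1e-4):
    # Central finite-difference approximation to the Hessian of f at x
    k = len(x)
    H = np.empty((k, k))
    for i in range(k):
        for j in range(k):
            ei, ej = np.eye(k)[i] * h, np.eye(k)[j] * h
            H[i, j] = (f(x + ei + ej) - f(x + ei - ej)
                       - f(x - ei + ej) + f(x - ei - ej)) / (4.0 * h * h)
    return H

V_hat = np.linalg.inv(hessian(minus_loglike, theta_hat))   # estimated asy. covariance
se = np.sqrt(np.diag(V_hat))                               # asymptotic standard errors
ci = np.column_stack([theta_hat - 1.96 * se, theta_hat + 1.96 * se])   # 95% CIs
print(theta_hat, se, ci)
\end{verbatim}
}
\end{frame}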
\begin{frame}{Confidence Intervals and $Z$-tests}
\pause
Have $Z_j = \frac{\widehat{\theta}_j-\theta_j}{S_{\widehat{\theta}_j}}$ approximately
standard normal, yielding
\pause
\begin{itemize}
\item Confidence intervals: $\widehat{\theta}_j \pm S_{\widehat{\theta}_j} z_{\alpha/2}$
\pause
\item Test $H_0: \theta_j=\theta_0$ using
\begin{displaymath}
Z = \frac{\widehat{\theta}_j-\theta_0}{S_{\widehat{\theta}_j}}
\end{displaymath}
\end{itemize}
\end{frame}

\begin{frame}{And Wald Tests for $H_0: \mathbf{L}\boldsymbol{\theta} = \mathbf{h}$}
\framesubtitle{Based on $(\mathbf{X}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{X}-\boldsymbol{\mu}) \sim \chi^2 (p)$}
\pause
{\LARGE
\begin{displaymath}
W_n = (\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\top
\left(\mathbf{L} \widehat{\mathbf{V}}_n \mathbf{L}^\top\right)^{-1}
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})
\end{displaymath}
\pause
} % End size

\vspace{5mm}
$\widehat{\boldsymbol{\theta}}_n \stackrel{\cdot}{\sim} N_k(\boldsymbol{\theta},\mathbf{V}_n)$,
\pause
so if $H_0$ is true,
\pause
$\mathbf{L}\widehat{\boldsymbol{\theta}}_n \stackrel{\cdot}{\sim}
N_r(\mathbf{h},\mathbf{L} \mathbf{V}_n \mathbf{L}^\top)$.
\pause
Thus $(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\top
\left(\mathbf{L} \mathbf{V}_n \mathbf{L}^\top\right)^{-1}
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h}) \stackrel{\cdot}{\sim} \chi^2(r)$.
\pause
And substitute $\widehat{\mathbf{V}}_n$ for $\mathbf{V}_n$.
\pause

\vspace{3mm}
Slutsky arguments omitted.
\end{frame}

\begin{frame}{Score Tests}{Thank you, Mr. Rao}
\pause
\begin{itemize}
\item $\widehat{\boldsymbol{\theta}}$ is the MLE of $\boldsymbol{\theta}$, dimension $k \times 1$.
\pause
\item $\widehat{\boldsymbol{\theta}}_0$ is the MLE under $H_0$, dimension $k \times 1$.
\pause
\item $\mathbf{u}(\boldsymbol{\theta}) = (\frac{\partial \ell}{\partial \theta_1},
      \ldots, \frac{\partial \ell}{\partial \theta_k})^\top$ is the gradient.
\pause
\item $\mathbf{u}(\widehat{\boldsymbol{\theta}})=\mathbf{0}$.
\pause
\item If $H_0$ is true, $\mathbf{u}(\widehat{\boldsymbol{\theta}}_0)$ should also be close to zero.
\pause
\item Under $H_0$ for large $n$,
      $\mathbf{u}(\widehat{\boldsymbol{\theta}}_0) \sim N_k(\mathbf{0},n\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}))$, approximately.
\pause
\item And,
\end{itemize}
\begin{displaymath}
S = \mathbf{u}(\widehat{\boldsymbol{\theta}}_0)^\top
\frac{1}{n}\boldsymbol{\mathcal{I}}(\widehat{\boldsymbol{\theta}}_0)^{-1}
\mathbf{u}(\widehat{\boldsymbol{\theta}}_0) \stackrel{\cdot}{\sim} \chi^2(r),
\end{displaymath}
where $r$ is the number of restrictions imposed by $H_0$.
\pause

Or use the inverse of the Hessian (under $H_0$) instead of
$\frac{1}{n}\boldsymbol{\mathcal{I}}(\widehat{\boldsymbol{\theta}}_0)^{-1}$.
\end{frame}

\begin{frame}{Three Big Tests}
\begin{itemize}
\item Score Tests: Fit just the restricted model
\item Wald Tests: Fit just the unrestricted model
\item Likelihood Ratio Tests: Fit both
\end{itemize}
\end{frame}
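\begin{frame}[fragile]{Numerical Sketch: A Wald Test}
A minimal sketch in Python of $W_n$ for $H_0: \mathbf{L}\boldsymbol{\theta} = \mathbf{h}$.
Here \texttt{theta\_hat} and \texttt{V\_hat} are hypothetical placeholder values standing
in for the output of a fit like the one sketched earlier, and $\mathbf{L}$, $\mathbf{h}$
encode a made-up null hypothesis.
{\footnotesize
\begin{verbatim}
import numpy as np
from scipy import stats

theta_hat = np.array([2.1, 2.8])        # hypothetical MLE
V_hat = np.array([[0.040, -0.030],
                  [-0.030, 0.060]])     # hypothetical estimated asy. covariance
L = np.array([[1.0, 0.0]])              # H0: theta_1 = 2 (one restriction)
h = np.array([2.0])
r = L.shape[0]                          # number of restrictions

diff = L @ theta_hat - h
W = float(diff @ np.linalg.inv(L @ V_hat @ L.T) @ diff)
p_value = stats.chi2.sf(W, df=r)        # upper-tail chi-square(r) p-value
print(W, p_value)
\end{verbatim}
}
\end{frame}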
\begin{frame}{Comparing Likelihood Ratio and Wald tests}
\pause
\begin{itemize}
\item Asymptotically equivalent under $H_0$, meaning \pause
      $(W_n-G^2_n) \stackrel{p}{\rightarrow} 0$
\pause % Score too
\item Under $H_1$,
\pause
\begin{itemize}
\item Both have the same approximate distribution (non-central chi-square).
\pause
\item Both go to infinity as $n \rightarrow \infty$.
\pause
\item But values are not necessarily close.
\pause
\end{itemize}
\item Likelihood ratio test tends to get closer to the right Type I error
      probability for small samples.
\pause
\item Wald can be more convenient when testing lots of hypotheses, because
      you only need to fit the model once.
\pause
\item Wald can be more convenient if it's a lot of work to write the
      restricted likelihood.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner},
Department of Statistical Sciences, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as
you like and share the result freely. The \LaTeX~source code is available from the
course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/2101f19}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/2101f19}}

\end{frame}

\end{document}