% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
% Uncomment next 2 lines instead of the first for article-style handout:
% \documentclass[12pt]{article}
% \usepackage{beamerarticle}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols at bottom
% \usetheme{Berlin} % Displays sections on top
% \usetheme{Warsaw} % Displays sections on top
\usetheme{Frankfurt} % Displays sections on top: Fairly thin but swallows some material at bottom of crowded slides
\usepackage[english]{babel}
\usepackage{tikz} % for tikzpicture
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}}

\title{The Zipper Example\footnote{See last slide for copyright information.}}
\subtitle{STA2101 Fall 2019}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

% \begin{frame}
% \frametitle{Background Reading}
% \framesubtitle{It may be a little bit helpful.}
% Davison Chapter 4, especially Sections 4.3 and 4.4
% \end{frame}

\section{Preparation}

\begin{frame}{Preparation: Indicator functions}
\framesubtitle{Conditional expectation and the Law of Total Probability}
\pause
$I_A(x)$ is the \emph{indicator function} for the set $A$. It is defined by
\pause
\begin{displaymath}
I_A(x) = \left\{ \begin{array}{ll} % ll means left left
1 & \mbox{for } x \in A \\
0 & \mbox{for } x \notin A
\end{array} \right. % Need that crazy invisible right period!
\end{displaymath}
\pause
Also sometimes written $I(x \in A)$.
\pause
\vspace{3mm}

Using $E(g(X)) = \sum_x g(x) p(x)$, \pause we have
\begin{eqnarray*}
E(I_A(X)) &=& \pause \sum_x I_A(x) p(x) \\ \pause
&=& \sum_{x \in A} p(x) \\ \pause
&=& P\{ X \in A \}
\end{eqnarray*}
\pause
So the expected value of an indicator is a probability.
\end{frame}

\begin{frame}
\frametitle{If $X$ is continuous}
%\framesubtitle{}
\begin{eqnarray*}
E(I_A(X)) &=& \int_{-\infty}^\infty I_A(x) f(x) \, dx \\ \pause
&=& \int_A f(x) \, dx \\ \pause
&=& P\{ X \in A \}
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Applies to conditional probabilities too} \pause
%\framesubtitle{}
{\LARGE
\begin{eqnarray*}
E(I_A(X)|Y) \pause &=& \sum_x I_A(x) p(x|Y) \mbox{, or}\\ \pause
& & \int_{-\infty}^\infty I_A(x) f(x|Y) \, dx \\ \pause
&&\\
&=& Pr\{ X \in A|Y\}
\end{eqnarray*}
} % End size
\pause
So the conditional expected value of an indicator is a \emph{conditional} probability.
\end{frame}

\begin{frame}
\frametitle{Double expectation: $E\left( g(X) \right) = E\left( E[g(X)|Y]\right)$}
%\framesubtitle{}
\pause
\begin{displaymath}
E\left(E[I_A(X)|Y]\right) = E[I_A(X)] = Pr\{ X \in A\} \mbox{, so}
\end{displaymath}
\pause
\begin{eqnarray*}
Pr\{ X \in A\} &=& E\left(E[I_A(X)|Y]\right) \pause \\
&=& E\left(Pr\{ X \in A|Y\}\right) \pause \\
&=& \int_{-\infty}^\infty Pr\{ X \in A|Y=y\} f_Y(y) \, dy \pause \\
&\mbox{or}\pause& \sum_y Pr\{ X \in A|Y=y\} p_Y(y)
\end{eqnarray*}
\pause
This is known as the \emph{Law of Total Probability}.
\end{frame}

\section{The Example}

\begin{frame}
\frametitle{The Zipper Example} \pause
%\framesubtitle{}
Members of a Senior Kindergarten class (which we shall treat as a sample) try to zip their coats within one minute. We observe whether each child succeeds.

\vspace{5mm} \pause

How about a model?
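% A quick R sketch checking the two facts above numerically: the expected value of an
% indicator is a probability, and the Law of Total Probability. The distributions used
% (a Poisson Y, a conditionally normal X, and the set A = (2, infinity)) are arbitrary
% choices for illustration.
% set.seed(9999)
% n = 1000000
% y = rpois(n, lambda = 3)            # Y ~ Poisson(3)
% x = rnorm(n, mean = y, sd = 1)      # X | Y=y ~ Normal(y, 1)
% mean(x > 2)                         # E(I_A(X)), estimated by a sample proportion
% sum(pnorm(2, mean = 0:30, sd = 1, lower.tail = FALSE) * dpois(0:30, lambda = 3))
%                                     # sum over y of P(X in A | Y=y) P(Y=y): should agree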
\vspace{5mm} \pause

$Y_1, \ldots, Y_n \stackrel{i.i.d.}{\sim} B(1,\theta)$, where $\theta$ is the probability of success.
\pause
\vspace{5mm}

Do you actually like this model?
\end{frame}

\section{A better model}

\begin{frame}
\frametitle{A better model than $Y_1, \ldots, Y_n \stackrel{i.i.d.}{\sim} B(1,\theta)$}
%\framesubtitle{}
\pause
\begin{itemize}
\item Obviously, the probability of success is not the same for each child. \pause
\item Some kids are better at it than others, and some coats have easier zippers than others. \pause
\item Some children are almost certain to succeed, and others have almost no chance.
\end{itemize}
\vspace{10mm} \pause
\textbf{Alternative Model}: $Y_1, \ldots, Y_n$ are independent random variables, with $Y_i \sim B(1,\theta_i)$.
\end{frame}

\begin{frame}
\frametitle{$Y_1, \ldots, Y_n$ independent $B(1,\theta_i)$}
%\framesubtitle{}
\pause
\begin{itemize}
\item This is a two-stage sampling model. \pause
\item First, sample from a population of children in which each child has a personal probability of success, $\theta_i$. \pause
\item Then for child $i$, use $\theta_i$ to randomly generate success or failure. \pause
\item Note that $\theta_1, \ldots, \theta_n$ are random variables with some probability distribution. \pause
\item This distribution is supported on $[0,1]$. \pause
\item How about a beta distribution? \pause
\end{itemize}
\begin{displaymath}
f(\theta) = \frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)} \theta^{\alpha-1} (1-\theta)^{\beta-1}
\end{displaymath}
\end{frame}

% Five pictures
\begin{frame}
\frametitle{Beta density is flexible}
\begin{center}
\includegraphics[width=3in]{beta1}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Beta density is flexible}
\begin{center}
\includegraphics[width=3in]{beta2}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Beta density is flexible}
\begin{center}
\includegraphics[width=3in]{beta3}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Beta density is flexible}
\begin{center}
\includegraphics[width=3in]{beta4}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Beta density is flexible}
\begin{center}
\includegraphics[width=3in]{beta5}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Law of total probability}
\framesubtitle{Double expectation}
\begin{eqnarray*}
P(Y_i=1) \pause & = & \int_0^1 P(Y_i=1|\theta_i) \, f(\theta_i) \, d\theta_i \\ \pause
& = & \int_0^1 \theta_i \, f(\theta_i) \, d\theta_i \\ \pause
& = & \int_0^1 \theta_i \, \frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)} \theta_i^{\alpha-1} (1-\theta_i)^{\beta-1} \, d\theta_i \\ \pause
& = & \frac{\alpha}{\alpha+\beta}
\end{eqnarray*}
\end{frame}

\section{Identifiability}

\begin{frame}
\frametitle{Distribution of the observable data}
\framesubtitle{Corresponds to the likelihood}
\begin{displaymath}
P(\mathbf{Y}=\mathbf{y}|\alpha,\beta) = \prod_{i=1}^n \left(\frac{\alpha}{\alpha+\beta} \right)^{y_i} \left(1-\frac{\alpha}{\alpha+\beta} \right)^{1-y_i}
\end{displaymath}
\pause
\begin{itemize}
\item Distribution of the observable data depends on the parameters $\alpha$ and $\beta$ only through $\frac{\alpha}{\alpha+\beta}$. \pause
\item Infinitely many $(\alpha,\beta)$ pairs yield the same distribution of the data. \pause
\item How could you use the data to decide which one is right?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Parameter Identifiability}
\framesubtitle{The general idea}
\begin{itemize}
\item The parameters of the Zipper Model are not \emph{identifiable}.
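% A quick R sketch of the double-expectation calculation above: simulate the two-stage
% model (theta from a beta distribution, then a Bernoulli trial given theta) and compare
% the success proportion to alpha/(alpha+beta). The values alpha = 2, beta = 4 are arbitrary.
% set.seed(9999)
% n = 1000000; a = 2; b = 4
% theta = rbeta(n, a, b)                  # Stage 1: each child's personal probability
% y = rbinom(n, size = 1, prob = theta)   # Stage 2: success or failure given theta
% mean(y)                                 # Should be close to ...
% a / (a + b)                             # ... alpha/(alpha+beta) = 1/3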
\pause
\item The model parameters cannot be recovered from the distribution of the sample data. \pause
\item And all you can ever learn from sample data is the distribution from which it comes. \pause
\item So there will be problems using the sample data for estimation and inference about the parameters. \pause
\item This is true \emph{even if the model is completely correct.}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Definitions}
\framesubtitle{Connected to parameter identifiability}
\pause
\begin{itemize}
\item A \emph{Statistical Model} is a set of assertions that partly specify the probability distribution of the observable data. \pause
\item Suppose a statistical model implies $\mathbf{D} \sim P_{\boldsymbol{\theta}}, \boldsymbol{\theta} \in \Theta$. \pause
If no two points in $\Theta$ yield the same probability distribution, then the parameter $\boldsymbol{\theta}$ is said to be \emph{identifiable.} \pause
\item That is, identifiability means that $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2$ implies $P_{\boldsymbol{\theta}_1} \neq P_{\boldsymbol{\theta}_2}$. \pause
\item On the other hand, if there exist distinct $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$ in $\Theta$ with $P_{\boldsymbol{\theta}_1} = P_{\boldsymbol{\theta}_2}$, the parameter $\boldsymbol{\theta}$ is \emph{not identifiable.}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{An equivalent definition}
\framesubtitle{Equivalent to $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2 \Rightarrow P_{\boldsymbol{\theta}_1} \neq P_{\boldsymbol{\theta}_2}$}
\pause
\begin{itemize}
\item The probability distribution is always a function of the parameter vector. \pause
\item If that function is one-to-one, the parameter vector is identifiable, \pause because then $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2$ yielding the same distribution could not happen. \pause
\item[]
\item That is, if the parameter vector can somehow be recovered from the distribution of the data, it is identifiable.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Theorem}

If the parameter vector is not identifiable, consistent estimation for all points in the parameter space is impossible.
\vspace{5mm}
\begin{center}
\includegraphics[width=3in]{consistent}
\end{center}
\pause
\vspace{5mm}
\begin{itemize}
\item Suppose $\theta_1 \neq \theta_2$ but $P_{\theta_1} = P_{\theta_2}$. \pause
\item Suppose also that $T_n = T_n(D_1, \ldots, D_n) \stackrel{p}{\rightarrow} \theta$ for all $\theta \in \Theta$. \pause
\item Distribution of $T_n$ is identical for $\theta_1$ and $\theta_2$. \pause
\item So $T_n$ cannot converge in probability to both $\theta_1$ and $\theta_2$. Contradiction.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Why don't we hear more about identifiability?}
\pause
\begin{itemize}
\item Consistent estimation indirectly proves identifiability. \pause
\item Because without identifiability, consistent estimation would be impossible. \pause
\item Any \emph{function} of the parameter vector that can be estimated consistently is identifiable.
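% A small R sketch of the non-identifiability: (alpha, beta) = (2, 4) and (6, 12) both have
% alpha/(alpha+beta) = 1/3, so data generated under either pair have the same distribution,
% and no estimator can converge to both parameter points. The two pairs are arbitrary.
% set.seed(9999)
% n = 100000
% y1 = rbinom(n, 1, rbeta(n, 2, 4))    # Data generated with (alpha, beta) = (2, 4)
% y2 = rbinom(n, 1, rbeta(n, 6, 12))   # Data generated with (alpha, beta) = (6, 12)
% mean(y1); mean(y2)                   # Both near 1/3: the samples are indistinguishable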
\end{itemize}
\end{frame}

\section{Maximum Likelihood Fails}

\begin{frame}
\frametitle{Maximum likelihood fails for the Zipper Example}
\framesubtitle{It has to fail.}
\pause
\begin{eqnarray*}
L(\alpha,\beta) & = & \left(\frac{\alpha}{\alpha+\beta} \right)^{\sum_{i=1}^n y_i} \left(1-\frac{\alpha}{\alpha+\beta} \right)^{n-\sum_{i=1}^n y_i} \\
&& \\ \pause
\ell(\alpha,\beta) & = & \log \left( \left(\frac{\alpha}{\alpha+\beta} \right)^{\sum_{i=1}^n y_i} \left(1-\frac{\alpha}{\alpha+\beta} \right)^{n-\sum_{i=1}^n y_i} \right)
\end{eqnarray*}
\pause
Partially differentiate with respect to $\alpha$ and $\beta$, set to zero, and solve.
\end{frame}

\begin{frame}
\frametitle{Two equations in two unknowns} \pause
%\framesubtitle{}
\begin{eqnarray*}
\frac{\partial\ell}{\partial\alpha} \stackrel{set}{=} 0 \pause & \Rightarrow & \frac{\alpha}{\alpha+\beta} = \overline{y} \\ \pause
\frac{\partial\ell}{\partial\beta} \stackrel{set}{=} 0 \pause & \Rightarrow & \pause \frac{\alpha}{\alpha+\beta} = \overline{y}
\end{eqnarray*}
\vspace{5mm} \pause

Any pair $(\alpha,\beta)$ with $\frac{\alpha}{\alpha+\beta} = \overline{y}$ will maximize the likelihood. \pause
\vspace{5mm}

The MLE is not unique.
\end{frame}

\begin{frame}
\frametitle{What is happening geometrically?}
% \framesubtitle{}
\begin{columns}
\column{0.5\textwidth}
\begin{displaymath}
\frac{\alpha}{\alpha+\beta} = \overline{y} \pause \Leftrightarrow \beta = \left( \frac{1-\overline{y}}{\overline{y}} \right) \alpha
\end{displaymath}
\pause
\column{0.5\textwidth}
\begin{tikzpicture}[scale=1/2]
% \draw[help lines] (0,0) grid (10,10); % Graph paper
% Draw axes
\draw[->] (0,0) -- (9,0) coordinate (x axis); % From -- To With arrows!
\draw[->] (0,0) -- (0,9) coordinate (y axis);
% Label axes
\draw (9 cm, 0) node[anchor=west] {\scriptsize $\alpha$};
\draw (0, 9 cm) node[anchor=south] {\scriptsize $\beta$};
% Tick marks and numbers
% Draw the line
\draw[thick] (0,0) -- (12,8);
\end{tikzpicture}
\end{columns}
\end{frame}

\begin{frame}
\frametitle{Look what has happened to us.} \pause
%\framesubtitle{}
\begin{itemize}
\item We made an honest attempt to come up with a better model. \pause
\item And it \emph{was} a better model. \pause
\item But the result was disaster.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{There is some good news.} \pause
%\framesubtitle{}
{\small
Remember from earlier that by the Law of Total Probability,
\begin{displaymath}
P(Y_i = 1) = \int_0^1 \theta_i \, f(\theta_i) \, d\theta_i \pause = E(\Theta_i)
\end{displaymath}
\pause
\begin{itemize}
\item Even when the probability distribution of the (random) probability of success is completely unknown, \pause
\item We can estimate its expected value (call it $\mu$) consistently with $\overline{Y}_n$. \pause
\item So that \emph{function} of the unknown probability distribution is identifiable. \pause
\item And often that's all we care about anyway, say for comparing group means. \pause
\item So the usual procedures, based on a model nobody can believe (Bernoulli)\pause, are actually informative about a much more realistic model whose parameter vector is not fully identifiable. \pause
\item We don't often get this lucky.
\end{itemize}
} % End size
\end{frame}

\begin{frame}
\frametitle{One more question about the parametric version}
%\framesubtitle{}
What would it take to estimate $\alpha$ and $\beta$ successfully?
\pause
\begin{itemize}
\item Get the children to try zipping their coats twice, say on two consecutive days.
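% A small R sketch checking that the log likelihood above is constant along the ridge
% alpha/(alpha+beta) = ybar, so the MLE is not unique. The simulated data and the
% particular (alpha, beta) pairs tried are arbitrary.
% set.seed(9999)
% y = rbinom(50, 1, 1/3); ybar = mean(y)
% loglik = function(a, b) sum(y)*log(a/(a+b)) + (length(y) - sum(y))*log(1 - a/(a+b))
% slope = (1 - ybar)/ybar                                        # The line beta = slope * alpha
% loglik(1, 1*slope); loglik(5, 5*slope); loglik(20, 20*slope)   # Same value every time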
\item Assume their ability does not change, and conditionally on their ability, the two tries are independent.
\item That will do it. % \pause There are 4 probabilities that add to one; three are free. Write them as functions of $\alpha$ and $\beta$ and solve for $\alpha$ and $\beta$.
\pause
\item[]
\item This kind of thing often happens. When the parameters of a reasonable model are not identifiable, it is often possible to design a different way of collecting data so that the parameters \emph{can} be identified.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Moral of the story}
%\framesubtitle{}
\pause
\begin{itemize}
\item If you think up a better model for standard kinds of data, the parameters of the model may not be identifiable. You need to check. \pause
\item The problem is not with the model. It's with the data. \pause
\item The solution is better \emph{research design}.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/2101f19}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/2101f19}}

\end{frame}

\end{document}

# R work for beta density plots

# alpha=2, beta=4
x = seq(from = 0.00, to = 1.00, by = 0.01)
y = dbeta(x, 2, 4)
tstring = expression(paste("Beta density with ", alpha, "=2 and ", beta, "=4"))
ystring = expression(paste("f(", theta, ")"))
plot(x, y, type='l', xlab=expression(theta), ylab=ystring, main = tstring)

# alpha=4, beta=2
x = seq(from = 0.00, to = 1.00, by = 0.01)
y = dbeta(x, 4, 2)
tstring = expression(paste("Beta density with ", alpha, "=4 and ", beta, "=2"))
ystring = expression(paste("f(", theta, ")"))
plot(x, y, type='l', xlab=expression(theta), ylab=ystring, main = tstring)

# alpha=3, beta=3
x = seq(from = 0.00, to = 1.00, by = 0.01)
y = dbeta(x, 3, 3)
tstring = expression(paste("Beta density with ", alpha, "=3 and ", beta, "=3"))
ystring = expression(paste("f(", theta, ")"))
plot(x, y, type='l', xlab=expression(theta), ylab=ystring, main = tstring)

# alpha=1/2, beta=1/2
x = seq(from = 0.00, to = 1.00, by = 0.01)
y = dbeta(x, 1/2, 1/2)
tstring = expression(paste("Beta density with ", alpha, "=1/2 and ", beta, "=1/2"))
ystring = expression(paste("f(", theta, ")"))
plot(x, y, type='l', xlab=expression(theta), ylab=ystring, main = tstring)

# alpha=1/2, beta=1/4
x = seq(from = 0.00, to = 1.00, by = 0.01)
y = dbeta(x, 1/2, 1/4)
tstring = expression(paste("Beta density with ", alpha, "=1/2 and ", beta, "=1/4"))
ystring = expression(paste("f(", theta, ")"))
plot(x, y, type='l', xlab=expression(theta), ylab=ystring, main = tstring)

tstring = expression(paste("Beta density with ", alpha, "=3 and ", beta, "=3"))
plot(x, y, type='l', ylab="f(x)", main = tstring) # cat() does not work for plotmath titles; use expression() as above

########################################################
x = seq(from = 0.00, to = 1.00, by = 0.01)
a = 2; b = 4 # alpha and beta
y = dbeta(x, a, b)
tstring = expression(paste("Beta density with ", alpha, "=2 and ", beta, "=4"))
plot(x, y, type='l', ylab="f(x)", main = tstring)
plot(x, y, type='l', ylab="f(x)", main = tstring, asp=1)
,ylab="f(x)", main = tstring, asp=1,xlim=c(0,1),ylim=c(0,3)) plot(x,y,type='l' ,ylab="f(x)", main = tstring,xlim=c(0,1),ylim=c(0,3)) %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%% The following was cut out for 2018 and 2019 because this example was so early. Maybe put it later, as an example. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Fisher Information: $\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}) = \left[E\{-\frac{\partial^2}{\partial\theta_i\partial\theta_j} \log f(Y|\boldsymbol{\theta})\}\right]$} \framesubtitle{The Hessian of the minus log likelihood approximates $n$ times the Fisher Information.} \pause \begin{eqnarray*} \log f(Y|\alpha,\beta) & = & \log \left(\left(\frac{\alpha}{\alpha+\beta} \right)^Y \left(1-\frac{\alpha}{\alpha+\beta} \right)^{1-Y}\right) \\ \pause &&\\ & = & Y\log\alpha + (1-Y)\log\beta - \log(\alpha+\beta) \end{eqnarray*} \end{frame} \begin{frame} \frametitle{$\boldsymbol{\mathcal{I}}(\alpha,\beta) = \left[E\{-\frac{\partial^2}{\partial\alpha\partial\beta} \log f(Y|\alpha,\beta)\}\right]$} \framesubtitle{Where $\log f(Y|\alpha,\beta) = Y\log\alpha + (1-Y)\log\beta - \log(\alpha+\beta)$} \pause \begin{eqnarray*} \boldsymbol{\mathcal{I}}(\alpha,\beta) & = & E\left( \begin{array}{c c} -\frac{\partial^2\log f}{\partial\alpha^2} & -\frac{\partial^2\log f}{\partial\alpha\partial\beta} \\ -\frac{\partial^2\log f}{\partial\beta\partial\alpha} & -\frac{\partial^2\log f}{\partial\beta^2} \end{array} \right) \\ \pause & = & \ldots \\ \pause & = & \frac{1}{(\alpha+\beta)^2} \left( \begin{array}{c c} \frac{\beta}{\alpha} & 1 \\ 1 & \frac{\alpha}{\beta} \end{array} \right) \end{eqnarray*} \vspace{5mm} \pause \begin{itemize} \item Determinant equals zero. \item The inverse does not exist. \item Large sample theory fails. \item Second derivative test fails. \item The likelihood is flat (in a particular direction). \end{itemize} \end{frame}