% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation> % Was a bare \mode, which is incomplete: the mode specification is required

\title{Double Measurement Regression, Part Two\footnote{See last slide for copyright information.}}
\subtitle{STA 2101 Fall 2019}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{The general model}

\begin{frame}
\frametitle{Double measurement}
%\framesubtitle{}
\begin{itemize}
\item We have studied an example where two independent measurements of a latent explanatory variable made all the model parameters identifiable.
\item Extend the model.
\item Double measurement can also help with correlated measurement error.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Correlated measurement error}
%\framesubtitle{}
\begin{itemize}
\item We are ``measuring'' exercise and snack food consumption by self-report.
\item A simple additive model: What people report is the truth, plus a piece of noise that pushes the number up or down by a random amount that is different for each person.
\item Is it reasonable to assume the error term for snack food is independent of the error term for exercise?
\pause
\item This is another case of omitted variables.
\pause
\item[]
\item Acres planted by farmer's report and aerial photograph is a different story.
\pause
\item Double measurement can help with correlated measurement error.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The general double measurement design}
%\framesubtitle{}
\begin{center}
\includegraphics[width=3in]{DoublePath2}
\end{center}
These are all matrices.
\pause
\begin{itemize}
\item The main idea is that $\mathbf{X}$ and $\mathbf{Y}$ are each measured twice, perhaps at different times using different methods.
\pause
\item Measurement errors may be correlated within but not between sets of measurements.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Double Measurement Regression: A Two-Stage Model}
\framesubtitle{Setting up a two-stage proof of identifiability}
\pause
{\LARGE
\begin{eqnarray*}
\mathbf{Y}_i &=& \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \mathbf{X}_i + \boldsymbol{\epsilon}_i \\ \pause
\mathbf{F}_i &=& \left( \begin{array}{c} \mathbf{X}_i \\ \mathbf{Y}_i \end{array} \right) \\ \pause
\mathbf{D}_{i,1} &=& \boldsymbol{\nu}_1 + \mathbf{F}_i + \mathbf{e}_{i,1} \\ \pause
\mathbf{D}_{i,2} &=& \boldsymbol{\nu}_2 + \mathbf{F}_i + \mathbf{e}_{i,2}
\end{eqnarray*} \pause
} % End size
Observable variables are $\mathbf{D}_{i,1}$ and $\mathbf{D}_{i,2}$: both are $(p+q) \times 1$. \pause

\vspace{2mm}
$E(\mathbf{X}_i) = \boldsymbol{\mu}_x$, $cov(\mathbf{X}_i)=\boldsymbol{\Phi}_x$, $cov(\boldsymbol{\epsilon}_i)=\boldsymbol{\Psi}$, $cov(\mathbf{e}_{i,1})=\boldsymbol{\Omega}_1$, $cov(\mathbf{e}_{i,2})=\boldsymbol{\Omega}_2$.
Also, $\mathbf{X}_i$, $\boldsymbol{\epsilon}_i$, $\mathbf{e}_{i,1}$ and $\mathbf{e}_{i,2}$ are independent.
\end{frame}

\begin{frame}
\frametitle{Measurement errors may be correlated}
\framesubtitle{Look at the measurement model}
\pause
\begin{eqnarray*}
\mathbf{F}_i &=& \left( \begin{array}{c} \mathbf{X}_i \\ \mathbf{Y}_i \end{array} \right) \\
\mathbf{D}_{i,1} &=& \boldsymbol{\nu}_1 + \mathbf{F}_i + \mathbf{e}_{i,1} \\
\mathbf{D}_{i,2} &=& \boldsymbol{\nu}_2 + \mathbf{F}_i + \mathbf{e}_{i,2}
\end{eqnarray*}
\pause
\renewcommand{\arraystretch}{1.2}
\begin{eqnarray*}
cov(\mathbf{e}_{i,1}) &=& \boldsymbol{\Omega}_1 = \left( \begin{array}{c|c} \boldsymbol{\Omega}_{11} & \boldsymbol{\Omega}_{12} \\ \hline \boldsymbol{\Omega}_{12}^\top & \boldsymbol{\Omega}_{22} \end{array} \right) \\ \pause
cov(\mathbf{e}_{i,2}) &=& \boldsymbol{\Omega}_2 = \left( \begin{array}{c|c} \boldsymbol{\Omega}_{33} & \boldsymbol{\Omega}_{34} \\ \hline \boldsymbol{\Omega}_{34}^\top & \boldsymbol{\Omega}_{44} \end{array} \right)
\end{eqnarray*}
\renewcommand{\arraystretch}{1.0}
\end{frame}

\begin{frame}
\frametitle{Expected values of the observable variables}
\framesubtitle{$\mathbf{D}_{i,1} = \boldsymbol{\nu}_1 + \mathbf{F}_i + \mathbf{e}_{i,1}$ and $\mathbf{D}_{i,2} = \boldsymbol{\nu}_2 + \mathbf{F}_i + \mathbf{e}_{i,2}$}
\pause
\begin{columns} % Use Beamer's columns to use more of the margins!
\column{1.2\textwidth}
\begin{eqnarray*}
E(\mathbf{D}_{i,1}) &=& \left( \begin{array}{c} \boldsymbol{\mu}_{1,1} \\ \boldsymbol{\mu}_{1,2} \end{array} \right) \pause
= \left( \begin{array}{c} \boldsymbol{\nu}_{1,1} + E(\mathbf{X}_i) \\ \boldsymbol{\nu}_{1,2} + E(\mathbf{Y}_i) \end{array} \right) \pause
= \left( \begin{array}{c} \boldsymbol{\nu}_{1,1} + \boldsymbol{\mu}_x \\ \boldsymbol{\nu}_{1,2} + \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1\boldsymbol{\mu}_x \end{array} \right) \\ \\ \pause
E(\mathbf{D}_{i,2}) &=& \left( \begin{array}{c} \boldsymbol{\mu}_{2,1} \\ \boldsymbol{\mu}_{2,2} \end{array} \right) = \left( \begin{array}{c} \boldsymbol{\nu}_{2,1} + E(\mathbf{X}_i) \\ \boldsymbol{\nu}_{2,2} + E(\mathbf{Y}_i) \end{array} \right) = \left( \begin{array}{c} \boldsymbol{\nu}_{2,1} + \boldsymbol{\mu}_x \\ \boldsymbol{\nu}_{2,2} + \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1\boldsymbol{\mu}_x \end{array} \right)
\end{eqnarray*} \pause
\end{columns}
\vspace{5mm}
\begin{itemize}
\item $\boldsymbol{\nu}_1$, $\boldsymbol{\nu}_2$, $\boldsymbol{\beta}_0$ and $\boldsymbol{\mu}_x$ parameters appear only in expected value, not covariance matrix. \pause
\item $\mathbf{X}_i$ is $p \times 1$ and $\mathbf{Y}_i$ is $q \times 1$. \pause
\item Even with $\boldsymbol{\beta}_1$ identified from the covariance matrix, have $2(p+q)$ equations in $3(p+q)$ unknown parameters. \pause
\item Identifying the expected values and intercepts is impossible. \pause
\item Re-parameterize, \pause absorbing them into $\boldsymbol{\mu} = \pause E\left( \begin{array}{c} \mathbf{D}_{i,1} \\ \mathbf{D}_{i,2} \end{array} \right)$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Losing the intercepts and expected values by re-parameterization}
%\framesubtitle{So routine that it's often not mentioned}
\pause
\begin{itemize}
\item We cannot identify $\boldsymbol{\nu}_1$, $\boldsymbol{\nu}_2$, $\boldsymbol{\beta}_0$ and $\boldsymbol{\mu}_x$ separately. \pause
\item Swallow them into $\boldsymbol{\mu}$.
\pause
\item Estimate $\boldsymbol{\mu}$ with $\overline{\mathbf{D}}$. \pause
\item And it disappears from $L(\boldsymbol{\mu,\Sigma}) = |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{D}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{D}}-\boldsymbol{\mu}) \right\}$. \pause
\item And forget it. It's no great loss. \pause
\item Concentrate on the parameters that appear only in the covariance matrix of the observable data. \pause
\item Try to identify $\boldsymbol{\theta} = (\boldsymbol{\beta}_1, \boldsymbol{\Phi}_x, \boldsymbol{\Psi}, \boldsymbol{\Omega}_1, \boldsymbol{\Omega}_2)$ from $\boldsymbol{\Sigma} = cov\left( \begin{array}{c} \mathbf{D}_{i,1} \\ \mathbf{D}_{i,2} \end{array} \right)$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Stage One: The latent variable model}
\framesubtitle{$\boldsymbol{\theta} = (\boldsymbol{\beta}_1, \boldsymbol{\Phi}_x, \boldsymbol{\Psi}, \boldsymbol{\Omega}_1, \boldsymbol{\Omega}_2)$}
$\mathbf{Y}_i = \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \mathbf{X}_i + \boldsymbol{\epsilon}_i$, where \pause
\begin{itemize}
\item $cov(\mathbf{X}_i)=\boldsymbol{\Phi}_x$
\item $cov(\boldsymbol{\epsilon}_i)=\boldsymbol{\Psi}$
\item $\mathbf{X}_i$ and $\boldsymbol{\epsilon}_i$ are independent.
\end{itemize} \pause
Vector of ``factors'' is $\mathbf{F}_i = \left( \begin{array}{c} \mathbf{X}_i \\ \mathbf{Y}_i \end{array} \right)$. \pause
\begin{itemize}
\item Let $\boldsymbol{\Phi} = cov(\mathbf{F}_i)$. \pause
\item We know that $\boldsymbol{\Phi}_x$, $\boldsymbol{\beta}_1$ and $\boldsymbol{\Psi}$ are functions of $\boldsymbol{\Phi}$. \pause
\item We've already shown it; this is a regression model. \pause
\end{itemize}
That's Stage One. Parameters of the latent variable model are functions of $\boldsymbol{\Phi}$.
\end{frame}

\begin{frame}
\frametitle{Stage Two: The measurement model}
\pause
%\framesubtitle{}
\begin{eqnarray*}
\mathbf{D}_{i,1} &=& \boldsymbol{\nu}_1 + \mathbf{F}_i + \mathbf{e}_{i,1} \\
\mathbf{D}_{i,2} &=& \boldsymbol{\nu}_2 + \mathbf{F}_i + \mathbf{e}_{i,2}
\end{eqnarray*}
\pause
$cov(\mathbf{e}_{i,1})=\boldsymbol{\Omega}_1$, $cov(\mathbf{e}_{i,2})=\boldsymbol{\Omega}_2$. Also, $\mathbf{F}_i$, $\mathbf{e}_{i,1}$ and $\mathbf{e}_{i,2}$ are independent. \pause
\begin{displaymath}
\boldsymbol{\Sigma} = cov\left( \begin{array}{c} \mathbf{D}_{i,1} \\ \mathbf{D}_{i,2} \end{array} \right) \pause
= \left( \begin{array}{c c} \boldsymbol{\Phi}+\boldsymbol{\Omega}_1 & \boldsymbol{\Phi} \\ \boldsymbol{\Phi} & \boldsymbol{\Phi}+\boldsymbol{\Omega}_2 \end{array} \right)
\end{displaymath} \pause
$\boldsymbol{\Phi}$, $\boldsymbol{\Omega}_1$ and $\boldsymbol{\Omega}_2$ can easily be recovered from $\boldsymbol{\Sigma}$.
\end{frame}

\begin{frame}
\frametitle{All the parameters in the covariance matrix are identifiable}
\framesubtitle{$\boldsymbol{\theta} = (\boldsymbol{\beta}_1, \boldsymbol{\Phi}_x, \boldsymbol{\Psi}, \boldsymbol{\Omega}_1, \boldsymbol{\Omega}_2)$}
\pause
\begin{itemize}
\item $\boldsymbol{\Phi}_x$, $\boldsymbol{\beta}_1$ and $\boldsymbol{\Psi}$ are functions of $\boldsymbol{\Phi} = cov(\mathbf{F}_i)$. \pause
\item $\boldsymbol{\Phi}$, $\boldsymbol{\Omega}_1$ and $\boldsymbol{\Omega}_2$ are functions of $\boldsymbol{\Sigma} = cov\left( \begin{array}{c} \mathbf{D}_{i,1} \\ \mathbf{D}_{i,2} \end{array} \right)$. \pause
\item $\boldsymbol{\Sigma}$ is a function of the probability distribution of the observable data. \pause
\item So $\boldsymbol{\beta}_1, \boldsymbol{\Phi}_x, \boldsymbol{\Psi}, \boldsymbol{\Omega}_1, \boldsymbol{\Omega}_2$ are all functions of the probability distribution of the observable data. \pause
\item They are identifiable.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Parameters of the double measurement regression model are identifiable}
\framesubtitle{After re-parameterization}
\begin{center}
\includegraphics[width=3in]{DoublePath2}
\end{center}
\pause
\begin{itemize}
\item Correlated measurement error within sets is allowed. \pause
\item This is a big plus, because omitted variables are a reality. \pause
\item Correlated measurement error between sets must be ruled out by careful data collection. \pause
\item No need to do the calculations ever again.
\end{itemize}
\end{frame}

\section{The BMI study}

\begin{frame}
\frametitle{The BMI Health Study}
\pause
%\framesubtitle{}
\begin{itemize}
\item Body Mass Index: Weight in Kilograms divided by Height in Meters Squared. \pause
\item Under 18 means underweight, over 25 means overweight, over 30 means obese. \pause
\item High BMI is associated with poor health, like high blood pressure and high cholesterol. \pause
\item People with high BMI tend to be older and fatter. \pause
\item \emph{But}, what if you have a high BMI but are in good physical shape (low percent body fat)?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The Question}
%\framesubtitle{}
\begin{itemize}
\item If you control for age and percent body fat, is BMI still associated with indicators for poor health? \pause
\item But percent body fat (and to a lesser extent, age) are measured with error. Standard ways of controlling for them with ordinary regression are highly suspect. \pause
\item Use the double measurement design.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{True variables (all latent)}
%\framesubtitle{}
\begin{itemize}
\item $X_1$ = Age
\item $X_2$ = BMI
\item $X_3$ = Percent body fat
\item $Y_1$ = Cholesterol
\item $Y_2$ = Diastolic blood pressure
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Measure twice with different personnel at different locations and by different methods}
\pause
%\framesubtitle{}
\begin{columns} % Use Beamer's columns to use more of the margins!
\column{1.1\textwidth}
{\footnotesize
\renewcommand{\arraystretch}{1.2}
\begin{tabular}{lll}
\hline
 & Measurement Set One & Measurement Set Two \\ \hline
Age & Self report & Passport or birth certificate \\
BMI & Dr. Office measurements & Lab technician, no shoes, gown \\
\% Body Fat & Tape and calipers, Dr. Office & Submerge in water tank \\
Cholesterol & Lab 1 & Lab 2 \\
Diastolic BP & Blood pressure cuff, Dr. office & Digital readout, mostly automatic \\ \hline
\end{tabular} \pause
\renewcommand{\arraystretch}{1.0}
\vspace{5mm}
\begin{itemize}
\item Set two is of generally higher quality. \pause
\item Correlation of measurement errors is unlikely between sets.
\end{itemize}
} % End size
\end{columns}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US} {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely.
The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/2101f19}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/2101f19}}

\end{frame}

\end{document}

# Simulate some data for a simple example
# Model: Y = 3 + beta1*X + epsilon, with X observed only through two
# error-contaminated measurements W1 = X + e1 and W2 = X + e2.
set.seed(9999)
n = 150; beta1 = 1; phi=1; psi = 9; omega1=1; omega2=1
X = rnorm(n,10,sqrt(phi)); epsilon = rnorm(n,0,sqrt(psi))
e1 = rnorm(n,0,sqrt(omega1)); e2 = rnorm(n,0,sqrt(omega2))
Y = 3 + beta1*X + epsilon
W1 = X + e1; W2 = X + e2          # Two independent measurements of the latent X
datta = round(cbind(W1,W2,Y),2)   # Observable data: the two measurements and Y
cor(datta)
summary(lm(Y~X))                  # Regression on the true (latent) X, for comparison
summary(lm(Y~W1+W2))              # Naive regression on both error-prone measurements
# Removed: a stray leftover call rnorm(n,0,sqrt()) -- sqrt() with no
# argument is an error in R, so sourcing the script would stop there.

########### Nicely ambiguous traditional output ###########

> cor(datta)
          W1        W2         Y
W1 1.0000000 0.5748331 0.1714324
W2 0.5748331 1.0000000 0.1791539
Y  0.1714324 0.1791539 1.0000000

> summary(lm(Y~X))

Call:
lm(formula = Y ~ X)

Residuals:
    Min      1Q  Median      3Q     Max
-6.8778 -2.0571 -0.0718  2.1200  7.4284

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   3.8122     2.5348   1.504 0.134723
X             0.9288     0.2521   3.684 0.000322 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.096 on 148 degrees of freedom
Multiple R-squared:  0.08398,	Adjusted R-squared:  0.07779
F-statistic: 13.57 on 1 and 148 DF,  p-value: 0.0003218

> summary(lm(Y~W1+W2))

Call:
lm(formula = Y ~ W1 + W2)

Residuals:
    Min      1Q  Median      3Q     Max
-7.6711 -2.3889 -0.1303  2.3442  7.6879

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   7.9647     2.1148   3.766 0.000239 ***
W1            0.2369     0.2282   1.038 0.300870
W2            0.2799     0.2300   1.217 0.225615
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.182 on 147 degrees of freedom
Multiple R-squared:  0.03918,	Adjusted R-squared:  0.02611
F-statistic: 2.997 on 2 and 147 DF,  p-value: 0.053

> W = W1+W2
> summary(lm(Y~W))

Call:
lm(formula = Y ~ W)

Residuals:
    Min      1Q  Median      3Q     Max
-7.6787 -2.3707 -0.1431  2.3242  7.6763

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   7.9721     2.1066   3.784 0.000223 ***
W             0.2583     0.1052   2.454 0.015280 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.171 on 148 degrees of freedom
Multiple R-squared:  0.0391,	Adjusted R-squared:  0.03261
F-statistic: 6.023 on 1 and 148 DF,  p-value: 0.01528