% Omitted variables and instrumental variables for Applied Stat I
% Long hairy 2016 section on measurement error omitted in 2017. Graphs are still in this folder.
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}} % Comment this out for handout

\title{Omitted Variables\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2019}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Omitted Variables}

\begin{frame}
\frametitle{A Practical Data Analysis Problem}
When more explanatory variables are added to a regression model \pause
and these additional explanatory variables are correlated with explanatory variables already in the model \pause
(as they usually are in an observational study), \pause
\begin{itemize}
\item Statistical significance can appear when it was not present originally. \pause
\item Statistical significance that was originally present can disappear. \pause
\item Even the signs of the $\widehat{\beta}$s can change, reversing the interpretation of how their variables are related to the response variable.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{An extreme, artificial example}
\framesubtitle{To make a point}
Suppose that in a certain population, the correlation between age and strength is $r=-0.93$. \pause
\begin{center}
\includegraphics[width=2.8in]{Chimps}
\end{center}
\end{frame}

\begin{frame}
\frametitle{The fixed $x$ regression model}
\begin{displaymath}
Y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1} x_{i,p-1} + \epsilon_i, \mbox{ with } \epsilon_i \sim N(0,\sigma^2)
\end{displaymath} \pause
\vspace{5mm}
\begin{itemize}
\item If viewed as conditional on $\mathbf{X}_i = \mathbf{x}_i$, this model implies independence of $\epsilon_i$ and $\mathbf{X}_i$, \pause because the conditional distribution of $\epsilon_i$ given $\mathbf{X}_i = \mathbf{x}_i$ does not depend on $\mathbf{x}_i$. \pause
\item What is $\epsilon_i$? \emph{Everything else} that affects $Y_i$. \pause
\item So the usual model says that if the explanatory variables are random, they have \emph{zero covariance} with all other variables that are related to $Y_i$, but are not included in the model. \pause
\item For observational data, this assumption is almost always violated. \pause
\item Does it matter?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Example}
\framesubtitle{The explanatory variables are random.}
\pause
Suppose that the variables $X_2$ and $X_3$ affect $Y$ and are correlated with $X_1$, but they are not part of the data set.
\pause
The values of the response variable are generated as follows: \pause
\begin{displaymath}
Y_i = \beta_0 + \beta_1 X_{i,1} + \beta_2 X_{i,2} + \beta_3 X_{i,3} + \epsilon_i,
\end{displaymath}
independently for $i= 1, \ldots, n$, where $\epsilon_i \sim N(0,\sigma^2)$. \pause
The explanatory variables are random, with expected value and variance-covariance matrix \pause
\begin{displaymath}
E\left( \begin{array}{c} X_{i,1} \\ X_{i,2} \\ X_{i,3} \end{array} \right) =
\left( \begin{array}{c} \mu_1 \\ \mu_2 \\ \mu_3 \end{array} \right) \mbox{ ~and~ }
cov\left( \begin{array}{c} X_{i,1} \\ X_{i,2} \\ X_{i,3} \end{array} \right) =
\left( \begin{array}{rrr} \phi_{11} & \phi_{12} & \phi_{13} \\
                          & \phi_{22} & \phi_{23} \\
                          &           & \phi_{33} \end{array} \right),
\end{displaymath}
where $\epsilon_i$ is independent of $X_{i,1}$, $X_{i,2}$ and $X_{i,3}$.
\end{frame}

\begin{frame}
\frametitle{Absorb $X_2$ and $X_3$}
\begin{columns} % Use Beamer's columns to make narrower margins!
\column{1.1\textwidth}
Since $X_2$ and $X_3$ are not observed, they are absorbed by the intercept and error term. \pause
{\small
\begin{eqnarray*}
Y_i &=& \beta_0 + \beta_1 X_{i,1} + \beta_2 X_{i,2} + \beta_3 X_{i,3} + \epsilon_i \\ \pause
&=& (\beta_0 + \beta_2\mu_2 + \beta_3\mu_3) + \beta_1 X_{i,1} + (\beta_2 X_{i,2} + \beta_3 X_{i,3} - \beta_2\mu_2 - \beta_3\mu_3 + \epsilon_i) \\ \pause
&=& \beta^\prime_0 + \beta_1 X_{i,1} + \epsilon^\prime_i.
\end{eqnarray*}
} % End size
\pause
And,
\begin{displaymath}
Cov(X_{i,1},\epsilon^\prime_i) = \beta_2\phi_{12} + \beta_3\phi_{13} \neq 0
\end{displaymath}
\end{columns}
\end{frame}

\begin{frame}
\frametitle{The ``True'' Model}
\framesubtitle{Almost always closer to the truth than the usual model, for observational data}
\pause
{\LARGE
\begin{displaymath}
Y_i = \beta_0 + \beta_1 X_i + \epsilon_i,
\end{displaymath} \pause
} % End Size
\vspace{5mm}
where $E(X_i)=\mu_x$, $Var(X_i)=\sigma^2_x$, $E(\epsilon_i)=0$, $Var(\epsilon_i)=\sigma^2_\epsilon$, and $Cov(X_i,\epsilon_i)=c$.
\vspace{5mm} \pause
Under this model,
\begin{displaymath}
\sigma_{xy} = Cov(X_i,Y_i) \pause = Cov(X_i,\beta_0 + \beta_1 X_i + \epsilon_i) \pause = \beta_1 \sigma^2_x + c
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Estimate $\beta_1$ as usual with least squares}
\begin{eqnarray*}
\widehat{\beta}_1 &=& \frac{\sum_{i=1}^n(X_i-\overline{X})(Y_i-\overline{Y})}
                           {\sum_{i=1}^n(X_i-\overline{X})^2} \\ \pause
&=& \frac{\frac{1}{n}\sum_{i=1}^n(X_i-\overline{X})(Y_i-\overline{Y})}
         {\frac{1}{n}\sum_{i=1}^n(X_i-\overline{X})^2} \\ \pause
&=& \frac{\widehat{\sigma}_{xy}}{\widehat{\sigma}^2_x} \\ \pause
&\stackrel{a.s.}{\rightarrow}& \frac{\sigma_{xy}}{\sigma^2_x}\\ \pause
&=& \frac{\beta_1 \sigma^2_x + c}{\sigma^2_x} \\ \pause
&=& \beta_1 + \frac{c}{\sigma^2_x}
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{$\widehat{\beta}_1 \stackrel{a.s.}{\rightarrow} \beta_1 + \frac{c}{\sigma^2_x}$}
\framesubtitle{It converges to the wrong thing.}
\pause
\begin{itemize}
\item $\widehat{\beta}_1$ is inconsistent. \pause
\item For large samples it could be almost anything, depending on the value of $c$, the covariance between $X_i$ and $\epsilon_i$. \pause
\item Small sample estimates could be accurate, but only by chance. \pause
\item The only time $\widehat{\beta}_1$ behaves properly is when $c=0$. \pause
\item Test $H_0: \beta_1=0$: Probability of Type I error goes almost surely to one. (A simulation sketch follows on the next slide.)
% \pause
% \item What if $\beta_1 < 0$ but $\beta_1 + \frac{c}{\sigma^2_x} > 0$, \pause
% and you test $H_0: \beta_1=0$?
\end{itemize}
\end{frame}
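\begin{frame}[fragile]
\frametitle{A small simulation sketch}
\framesubtitle{Illustrating $\widehat{\beta}_1 \stackrel{a.s.}{\rightarrow} \beta_1 + \frac{c}{\sigma^2_x}$}
This slide is only an illustration, not part of the original development. The Python code and the numbers in it are hypothetical choices: one omitted variable $X_2$ with $\beta_2=1$ and $Cov(X_1,X_2)=0.5$, so that $\sigma^2_x=1$ and $c = \beta_2 \, Cov(X_1,X_2) = 0.5$.
{\footnotesize
\begin{verbatim}
# Regress Y on X1 alone when X2 is omitted and correlated with X1.
import numpy as np
rng = np.random.default_rng(442)
n = 100000
beta0, beta1, beta2 = 1.0, 1.0, 1.0
x1 = rng.normal(0, 1, n)              # Var(X1) = sigma^2_x = 1
x2 = 0.5*x1 + rng.normal(0, 1, n)     # omitted variable, Cov(X1,X2) = 0.5
y  = beta0 + beta1*x1 + beta2*x2 + rng.normal(0, 1, n)
b1hat = np.cov(x1, y)[0, 1] / np.var(x1, ddof=1)  # least-squares slope
print(b1hat)  # close to beta1 + c/sigma^2_x = 1.5, not to beta1 = 1
\end{verbatim}
} % End size
\end{frame}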
\begin{frame}
\frametitle{All this applies to multiple regression}
\framesubtitle{Of course}
\pause
\emph{When a regression model fails to include all the explanatory variables that contribute to the response variable, and those omitted explanatory variables have non-zero covariance with variables that are in the model, the least-squares estimates of the regression coefficients are inconsistent. \pause Estimation and inference are almost guaranteed to be misleading, especially for large samples.}
\end{frame}

\begin{frame}
\frametitle{Correlation-Causation}
\begin{itemize}
\item The problem of omitted variables is the technical version of the correlation-causation issue. \pause
\item The omitted variables are ``confounding'' variables. \pause
\item With random assignment and good procedure, $x$ and $\epsilon$ have zero covariance. \pause
\item But random assignment is not always possible. \pause
\item Most applications of regression to observational data provide very poor information about the regression coefficients. \pause
\item Is bad information better than no information at all?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{How about another estimation method?}
\framesubtitle{Other than ordinary least squares}
\pause
\begin{itemize}
\item Can \emph{any} other method be successful? \pause
\item This is a very practical question, \pause because almost all regressions with observational data have the disease.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{For simplicity, assume normality}
\framesubtitle{$Y_i = \beta_0 + \beta_1 X_i + \epsilon_i$}
\pause
\begin{itemize}
\item Assume $(X_i,\epsilon_i)$ are bivariate normal. \pause
\item This makes $(X_i,Y_i)$ bivariate normal. \pause
\item $(X_1,Y_1), \ldots, (X_n,Y_n) \stackrel{i.i.d.}{\sim} N_2(\mathbf{m},\mathbf{V})$, \pause where
\end{itemize}
\begin{displaymath}
\mathbf{m} = \left( \begin{array}{c} m_1 \\ m_2 \end{array} \right)
           = \left( \begin{array}{c} \mu_x \\ \beta_0+\beta_1\mu_x \end{array} \right)
\end{displaymath} \pause
and
\begin{displaymath}
\mathbf{V} = \left( \begin{array}{c c} v_{11} & v_{12} \\ & v_{22} \end{array} \right)
           = \left( \begin{array}{c c} \sigma^2_x & \beta_1\sigma^2_x+c \\
                                       & \beta_1^2\sigma^2_x + 2 \beta_1c + \sigma^2_\epsilon \end{array} \right).
\end{displaymath} \pause
\begin{itemize}
\item All you can ever learn from the data are the approximate values of $\mathbf{m}$ and $\mathbf{V}$. \pause
\item Even if you knew $\mathbf{m}$ and $\mathbf{V}$ exactly, could you know $\beta_1$?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Five equations in six unknowns}
The parameter is $\theta = (\mu_x, \sigma^2_x, \sigma^2_\epsilon, c, \beta_0, \beta_1)$. \pause
The distribution of the data is determined by \pause
\vspace{2mm}
{\footnotesize
\begin{displaymath}
\left( \begin{array}{c} m_1 \\ m_2 \end{array} \right) =
\left( \begin{array}{c} \mu_x \\ \beta_0+\beta_1\mu_x \end{array} \right)
~~\mbox{ and }~~
\left( \begin{array}{c c} v_{11} & v_{12} \\ & v_{22} \end{array} \right) =
\left( \begin{array}{c c} \sigma^2_x & \beta_1\sigma^2_x+c \\
                          & \beta_1^2\sigma^2_x + 2 \beta_1c + \sigma^2_\epsilon \end{array} \right)
\end{displaymath} \pause
} % End size
\begin{itemize}
\item $\mu_x=m_1$ and $\sigma^2_x=v_{11}$. \pause
\item The remaining 3 equations in 4 unknowns have infinitely many solutions (checked symbolically on the next slide). \pause
\item So infinitely many sets of parameter values yield the \emph{same distribution of the sample data}. \pause
\item This is serious trouble -- lack of parameter identifiability. \pause
\item \emph{Definition}: If a parameter is a function of the distribution of the observable data, it is said to be \emph{identifiable}.
\end{itemize}
\end{frame}
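\begin{frame}[fragile]
\frametitle{Checking the algebra by computer}
\framesubtitle{A sketch, not part of the original development}
A symbolic check (using SymPy here, rather than the Sage code kept at the end of this file): solve the three remaining equations for $\beta_0$, $c$ and $\sigma^2_\epsilon$, leaving $\beta_1$ free.
{\footnotesize
\begin{verbatim}
# Solve m2 = beta0 + beta1*m1, v12 = beta1*v11 + c,
#       v22 = beta1^2*v11 + 2*beta1*c + sigma2eps
# for beta0, c, sigma2eps.  There is a solution for every value of beta1.
import sympy as sp
m1, m2, v11, v12, v22 = sp.symbols('m1 m2 v11 v12 v22')
beta0, beta1, c, s2e  = sp.symbols('beta0 beta1 c sigma2eps')
eqs = [sp.Eq(m2,  beta0 + beta1*m1),
       sp.Eq(v12, beta1*v11 + c),
       sp.Eq(v22, beta1**2*v11 + 2*beta1*c + s2e)]
print(sp.solve(eqs, [beta0, c, s2e], dict=True)[0])
# beta0 = m2 - beta1*m1,  c = v12 - beta1*v11,
# sigma2eps = v22 - 2*beta1*v12 + beta1**2*v11
\end{verbatim}
} % End size
Every real $\beta_1$ is compatible with the same $\mathbf{m}$ and $\mathbf{V}$.
\end{frame}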
\begin{frame}
\frametitle{Skipping the High School algebra}
\framesubtitle{$\theta = (\mu_x, \sigma^2_x, \sigma^2_\epsilon, c, \beta_0, \beta_1)$}
\pause
\begin{itemize}
\item For \emph{any} given $\mathbf{m}$ and $\mathbf{V}$, all the points in a one-dimensional subset of the 6-dimensional parameter space yield $\mathbf{m}$ and $\mathbf{V}$, and hence the same distribution of the sample data. \pause
\item In that subset, values of $\beta_1$ range from $-\infty$ to $\infty$, \pause so $\mathbf{m}$ and $\mathbf{V}$ could have been produced by \emph{any} value of $\beta_1$. \pause
\item There is no way to distinguish between the possible values of $\beta_1$ based on sample data. \pause
\item The problem is fatal, \pause if all you can observe is a single $X$ and a single $Y$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Details for the record}
\framesubtitle{$\theta = (\mu_x, \sigma^2_x, \sigma^2_\epsilon, c, \beta_0, \beta_1)$}
%\begin{eqnarray*}
% m_2 & = & \beta_0+\beta_1 m_1 \\
% v_{12} & = & \beta_1 v_{11} + c \\
% v_{22} & = & \beta_1^2 v_{11} + 2 \beta_1c + \sigma^2_\epsilon
%\end{eqnarray*}
For \emph{any} given $\mathbf{m}$ and $\mathbf{V}$, all the points in a one-dimensional subset of the 6-dimensional parameter space yield $\mathbf{m}$ and $\mathbf{V}$, and hence the same distribution of the sample data.
\begin{itemize}
\item $\mu_x=m_1$ and $\sigma^2_x=v_{11}$ remain fixed.
\item $\sigma^2_\epsilon \geq |\mathbf{V}|/v_{11}$
\item When $\sigma^2_\epsilon = |\mathbf{V}|/v_{11}$, $\beta_1 = v_{12}/v_{11}$
\item For $\sigma^2_\epsilon > |\mathbf{V}|/v_{11}$, two values of $\beta_1$ are compatible with $\mathbf{m}$ and $\mathbf{V}$.
\item As $\sigma^2_\epsilon$ increases, the lower $\beta_1$ goes to $-\infty$ and the upper $\beta_1$ goes to $\infty$.
\item $\beta_0$ and $c$ are linear functions of $\beta_1$:
\begin{itemize}
\item $\beta_0 = m_2-\beta_1 m_1$
\item $c = v_{12} - \beta_1 v_{11}$
\end{itemize}
\item This set of parameter values is geometrically interesting.
\end{itemize}
\end{frame}

\section{Instrumental Variables}

\begin{frame}
\frametitle{Instrumental Variables (Wright, 1928)}
\framesubtitle{A partial solution}
\pause
\begin{itemize}
\item An instrumental variable is a variable that is correlated with an explanatory variable, but is not correlated with any error terms and has no direct effect on the response variable. \pause
\item Usually, the instrumental variable \emph{influences} the explanatory variable. \pause
\item An instrumental variable is often not the main focus of attention; it's just a tool.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A Simple Example}
What is the contribution of income to credit card debt? \pause
\begin{displaymath}
Y_i = \beta_0 + \beta_1 X_i + \epsilon_i,
\end{displaymath} \pause
where $E(X_i)=\mu_x$, $Var(X_i)=\sigma^2_x$, $E(\epsilon_i)=0$, $Var(\epsilon_i)=\sigma^2_\epsilon$, and $Cov(X_i,\epsilon_i)=c$.
\end{frame}

\begin{frame}
\frametitle{A path diagram}
\pause
\begin{displaymath}
Y_i = \alpha + \beta X_i + \epsilon_i,
\end{displaymath}
where $E(X_i)=\mu$, $Var(X_i)=\sigma^2_x$, $E(\epsilon_i)=0$, $Var(\epsilon_i)=\sigma^2_\epsilon$, and $Cov(X_i,\epsilon_i)=c$. \pause
\begin{center}
\includegraphics[width=3in]{OmittedPath}
\end{center} \pause
Least squares estimate of $\beta$ is inconsistent, and so is every other possible estimate.
\pause
If the data are normal.
\end{frame}

\begin{frame}
\frametitle{Add an instrumental variable}
\framesubtitle{$X$ is income, $Y$ is credit card debt.}
\pause
Focus the study on real estate agents in many cities. \pause
Include median price of resale home $W_i$. \pause
\begin{eqnarray*}
X_i & = & \alpha_1 + \beta_1W_i +\epsilon_{i1} \\ \pause
Y_i & = & \alpha_2 + \beta_2X_i +\epsilon_{i2} \pause
\end{eqnarray*}
\begin{center}
\includegraphics[width=4in]{InstruVar}
\end{center}
Main interest is in $\beta_2$.
\end{frame}

\begin{frame}
\frametitle{Base estimation and inference on the covariance matrix}
\framesubtitle{of $(W_i,X_i,Y_i)$: Call it $V = [v_{ij}]$}
\pause
From $X_i = \alpha_1 + \beta_1W_i +\epsilon_{i1}$ and $Y_i = \alpha_2 + \beta_2X_i +\epsilon_{i2}$, \pause
\vspace{5mm}
{\LARGE $V =$}
\renewcommand{\arraystretch}{1.5}
\begin{tabular}{|c|ccc|}
\hline
    & $W$ & $X$ & $Y$ \\ \hline
$W$ & $\sigma^2_w$ & $\beta_1\sigma^2_w$ & $\beta_1\beta_2\sigma^2_w$ \\
$X$ &              & $\beta_1^2\sigma^2_w+\sigma^2_1$ & $\beta_2(\beta_1^2\sigma^2_w+\sigma^2_1)+c$ \\
$Y$ &              &                                  & $\beta_1^2\beta_2^2\sigma^2_w + \beta_2^2\sigma^2_1 + 2\beta_2c + \sigma^2_2$ \\ \hline
\end{tabular} \pause
\renewcommand{\arraystretch}{1.0}
\vspace{2mm}
\begin{displaymath}
\beta_2 = \frac{v_{13}}{v_{12}}
\end{displaymath} \pause
And all the other parameters are identifiable too.
\end{frame}

\begin{frame}
\frametitle{A close look}
%\framesubtitle{}
The $v_{ij}$ are elements of the covariance matrix of the observable data. \pause
\begin{displaymath}
\beta_2 = \frac{v_{13}}{v_{12}} \pause
        = \frac{\beta_1\beta_2\sigma^2_w}{\beta_1\sigma^2_w} \pause
        = \frac{Cov(W,Y)}{Cov(W,X)}
\end{displaymath} \pause
\begin{itemize}
\item $\widehat{v}_{ij}$ are sample variances and covariances. \pause
\item $\widehat{v}_{ij} \stackrel{a.s.}{\rightarrow} v_{ij}$. \pause
\item It is safe to assume $\beta_1 \neq 0$. \pause
\item Because $\beta_1$ represents the connection between real estate prices and the income of real estate agents. \pause
\item $\frac{\widehat{v}_{13}}{\widehat{v}_{12}}$ is a (strongly) consistent estimate of $\beta_2$ (illustrated numerically after the Comments slide). \pause
\item $H_0: \beta_2=0$ is true if and only if $v_{13}=0$. \pause
\item Test $H_0: v_{13} = 0$ by standard methods.
\end{itemize}
\end{frame}

% Long hairy 2016 section on measurement error omitted in 2017.

\begin{frame}
\frametitle{Comments}
%\framesubtitle{}
\begin{itemize}
\item Instrumental variables can help with measurement error in the explanatory variables too. \pause
% \item If there is measurement error, regression coefficients of interest are not identifiable and cannot be estimated consistently, but their signs can. \pause
\item Good instrumental variables are not easy to find. \pause
% BMI and popularity (study in adopted children)
% Twins raised apart.
\item They will not just happen to be in the data set, except by a miracle. \pause
\item They really have to come from another universe, but still have a strong and clear effect. \pause
\item Wright's original example was tax policy for cooking oil. \pause
\item Econometricians are good at this. \pause
\item Time series applications are common.
\end{itemize}
\end{frame}
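\begin{frame}[fragile]
\frametitle{A numerical check of the instrumental variable idea}
\framesubtitle{A sketch, not part of the original example}
All the parameter values below are illustrative choices. The ratio of sample covariances recovers $\beta_2$, while the least-squares slope of $Y$ on $X$ does not.
{\footnotesize
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(2101)
n = 100000
beta1, beta2 = 1.0, 1.0
w  = rng.normal(0, 1, n)              # instrumental variable
e1 = rng.normal(0, 1, n)
e2 = 0.5*e1 + rng.normal(0, 1, n)     # so Cov(X, epsilon2) = c = 0.5
x  = 1.0 + beta1*w + e1               # income
y  = 2.0 + beta2*x + e2               # credit card debt
v  = np.cov(np.vstack([w, x, y]))     # sample covariance matrix of (W, X, Y)
print(v[0, 2] / v[0, 1])  # IV ratio, close to beta2 = 1
print(v[1, 2] / v[1, 1])  # OLS slope, close to beta2 + c/Var(X) = 1.25
\end{verbatim}
} % End size
\end{frame}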
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto.
It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}.
Use any part of it as you like and share the result freely.
The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf19}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf19}}
\end{frame}

\end{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Example}

\end{frame}

\begin{frame}
\frametitle{}
% \framesubtitle{}
\begin{itemize}
\item
\item
\item
\end{itemize}
\end{frame}

{\LARGE
\begin{displaymath}

\end{displaymath}
} % End Size

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

sem = 'http://www.utstat.toronto.edu/~brunner/openSEM/sage/sem.sage'
load(sem) # In EqsVar, eta = beta eta + gamma xi, with cov(xi) = Phi

######## First IV model, with no latent variables except errors ########
# xi' = (W,epsilon1,epsilon2) and eta' = (X,Y)
B = ZeroMatrix(2,2) # beta
B[1,0] = var('beta2') ; B
P = ZeroMatrix(3,3) # Phi
P[0,0] = var('sigma00'); P[1,1] = var('sigma11'); P[2,2] = var('sigma22')
P[1,2] = var('c'); P[2,1] = var('c'); P
G = ZeroMatrix(2,3) # gamma
G[0,0] = var('beta1'); G[0,1] = 1; G[1,2] = 1; G
pickout = 3,1,2 # Indices of observable variables, order eta, xi
V = EqsVar(B,G,P,pickout); V
print(latex(V)) # Agrees with hand calculations

######## Second IV model, with true income ########
sem = 'http://www.utstat.toronto.edu/~brunner/openSEM/sage/sem.sage'
load(sem) # In EqsVar, eta = beta eta + gamma xi, with cov(xi) = Phi
# eta' = (T,X,Y) and xi' = (W,epsilon1,epsilon2,epsilon3)
B = ZeroMatrix(3,3) # beta
B[1,0] = var('beta2') ; B[2,0] = var('beta3') ; B
P = ZeroMatrix(4,4) # Phi
P[0,0] = var('sigma00'); P[1,1] = var('sigma11')
P[2,2] = var('sigma22'); P[3,3] = var('sigma33')
P[2,3] = var('c'); P[3,2] = var('c'); P
G = ZeroMatrix(3,4) # gamma
G[0,0] = var('beta1'); G[0,1] = 1; G[1,2] = 1; G[2,3] = 1; G
pickout = 4,2,3 # Indices of observable variables, order eta, xi
V = EqsVar(B,G,P,pickout); V
print(latex(V)) # Search and replace notation

# Play with nasty
P = SymmetricMatrix(4,'sigma'); P[0,0]=var('sigma00')
P[0,1]=0; P[0,2]=0; P[0,3]=0
P[1,0]=0; P[2,0]=0; P[3,0]=0; P
V = EqsVar(B,G,P,pickout); V

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\left(\begin{array}{rrr}
\sigma_{00} & \beta_{1} \beta_{2} \sigma_{00} & \beta_{1} \beta_{3} \sigma_{00} \\
\beta_{1} \beta_{2} \sigma_{00} & \beta_{1}^{2} \beta_{2}^{2} \sigma_{00} + \beta_{2}^{2} \sigma_{11} + \sigma_{22} & \beta_{1}^{2} \beta_{2} \beta_{3} \sigma_{00} + \beta_{2} \beta_{3} \sigma_{11} + c \\
\beta_{1} \beta_{3} \sigma_{00} & \beta_{1}^{2} \beta_{2} \beta_{3} \sigma_{00} + \beta_{2} \beta_{3} \sigma_{11} + c & \beta_{1}^{2} \beta_{3}^{2} \sigma_{00} + \beta_{3}^{2} \sigma_{11} + \sigma_{33}
\end{array}\right)

######## Third IV model, with true income and debt, correlated errors ########
sem = 'http://www.utstat.toronto.edu/~brunner/openSEM/sage/sem.sage'
load(sem) # In EqsVar, eta = beta eta + gamma xi, with cov(xi) = Phi
# eta' = (Tx,Ty,X,Y) and xi' = (epsilon1,epsilon2,epsilon3,epsilon4,W)
B = ZeroMatrix(4,4) # beta
B[1,0] = var('beta3') ; B[2,0] = var('beta2') ; B[3,1] = var('beta4'); B
P = SymmetricMatrix(5,'sigma'); P[4,4]=var('sigma00')
# No correlations between W and the errors
for j in interval(0,3):
    P[j,4] = 0
    P[4,j] = 0
P
G = ZeroMatrix(4,5) # gamma
G[0,0] = 1; G[0,4] = var('beta1'); G[1,1] = 1; G[2,3] = 1; G[3,2] = 1; G
pickout = 9,3,4 # Indices of observable variables, order eta, xi
V = EqsVar(B,G,P,pickout); V
print(latex(V)) # Search and replace sigma00

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\left(\begin{array}{ccc}
\sigma^2_w & \beta_{1} \beta_{2} \sigma^2_w & \beta_{1} \beta_{3} \beta_{4} \sigma^2_w \\
\beta_{1} \beta_{2} \sigma^2_w & \beta_{1}^{2} \beta_{2}^{2} \sigma^2_w + \beta_{2}^{2} \sigma_{11} + 2 \, \beta_{2} \sigma_{14} + \sigma_{44} & \beta_{1}^{2} \beta_{2} \beta_{3} \beta_{4} \sigma^2_w + \beta_{2} \beta_{3} \beta_{4} \sigma_{11} + \beta_{2} \beta_{4} \sigma_{12} + \beta_{3} \beta_{4} \sigma_{14} + \beta_{2} \sigma_{13} + \beta_{4} \sigma_{24} + \sigma_{34} \\
\beta_{1} \beta_{3} \beta_{4} \sigma^2_w & \beta_{1}^{2} \beta_{2} \beta_{3} \beta_{4} \sigma^2_w + \beta_{2} \beta_{3} \beta_{4} \sigma_{11} + \beta_{2} \beta_{4} \sigma_{12} + \beta_{3} \beta_{4} \sigma_{14} + \beta_{2} \sigma_{13} + \beta_{4} \sigma_{24} + \sigma_{34} & \beta_{1}^{2} \beta_{3}^{2} \beta_{4}^{2} \sigma^2_w + \beta_{3}^{2} \beta_{4}^{2} \sigma_{11} + 2 \, \beta_{3} \beta_{4}^{2} \sigma_{12} + 2 \, \beta_{3} \beta_{4} \sigma_{13} + \beta_{4}^{2} \sigma_{22} + 2 \, \beta_{4} \sigma_{23} + \sigma_{33}
\end{array}\right)

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{eqnarray*}
m_2 & = & \beta_0+\beta_1 m_1 \\
v_{12} & = & \beta_1 v_{11} + c \\
v_{22} & = & \beta_1^2 v_{11} + 2 \beta_1c + \sigma^2_\epsilon
\end{eqnarray*}