% LAE&F RG 2025 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Book metadata \title[Linear Algebra : Essence \& Form]{Linear Algebra: \\ Essence \\ \& Form %\thanks{Thanks to...} } \author[R. Ghrist]{Robert Ghrist} \publisher{Agenbyte Press} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % NEWCOMMANDS by prof-g %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\vect}[1]{\bm{#1}} % VECTOR NOTATION \newcommand{\style}[1]{\emph{#1}} % DEFINITION NOTATION \newcommand{\R}{\mathbb{R}} % REAL FIELD \newcommand{\C}{\mathbb{C}} % COMPLEX FIELD \newcommand{\Z}{\mathbb{Z}} % INTEGERS \newcommand{\N}{\mathbb{N}} % NATURALS \newcommand{\iso}{\cong} % ISOMORPHISM \newcommand{\id}{\textrm{id}} % IDENTITY TRANSFORMATION \DeclareMathOperator{\coker}{\textrm{coker}} % COKERNEL \DeclareMathOperator{\coim}{\textrm{coim}} % COIMAGE \DeclareMathOperator{\im}{\textrm{im}} % IMAGE \DeclareMathOperator{\rank}{\textrm{rank}} % RANK \DeclareMathOperator{\nullity}{\textrm{null}} % NULLITY \DeclareMathOperator{\spanset}{\textrm{span}} % SPAN \DeclareMathOperator{\diag}{\textsc{diag}} % DIAGONAL \newcommand{\basis}{\mathcal{B}} % BASIS \newcommand{\poly}{\mathcal{P}} % POLYNOMIALS \newcommand{\sym}{\textsc{sym}} % SYMMETRIC MATRICES \newcommand{\skewsym}{\textsc{skew}} % SKEW-SYMMETRIC MATRICES \newcommand{\inertia}{\mathcal{I}} % INERTIA TENSOR \newcommand{\stress}{\boldsymbol{\sigma}} % STRESS TENSOR \newcommand{\trace}{\textrm{tr}} % TRACE \newcommand{\ihat}{\hat{\imath}} % i-HAT BASIS VECTOR \newcommand{\jhat}{\hat{\jmath}} % j-HAT BASIS VECTOR \newcommand{\khat}{\hat{k}} % k-HAT BASIS VECTOR \newcommand{\proj}[1]{\Pi_{#1}} % PROJECTION OPERATOR \newcommand{\directsum}{\oplus} % DIRECT SUM \newcommand{\orthosum}{\boxplus} % ORTHOGONAL DIRECT SUM \DeclareMathOperator{\row}{\textsc{row}} % ROW SPACE \DeclareMathOperator{\column}{\textsc{col}} % COL SPACE \newcommand{\COV}{[C]} % COVARIANCE MATRIX \newcommand{\cov}{\textsc{cov}} % scalar covariance \newcommand{\coventry}{C} % covariance entry \newcommand{\CORR}{[R]} % CORRELATION MATRIX \newcommand{\corr}{\textsc{corr}} % scalar correlation \newcommand{\corentry}{R} % correlation entry \newcommand{\Data}{\mathcal{X}} \newcommand{\cond}{\textsc{cond}} % CONDITION NUMBER \newcommand{\PARAM}{\Psi} % PARAMETERS \newcommand{\NUMLAY}{\Lambda} % NUMBER OF LAYERS \newcommand{\LOSS}{\mathcal{L}} % LOSS/COST FUNCTION %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Inserts a blank page \newcommand{\blankpage}{\newpage\hbox{}\thispagestyle{empty}\newpage} % Macros for typesetting the documentation \newcommand{\hlred}[1]{\textcolor{Maroon}{#1}}% prints in red \newcommand{\hangleft}[1]{\makebox[0pt][r]{#1}} \newcommand{\hairsp}{\hspace{1pt}}% hair space \newcommand{\hquad}{\hskip0.5em\relax}% half quad space \newcommand{\TODO}{\textcolor{red}{\bf TODO!}\xspace} \newcommand{\ie}{\textit{i.\hairsp{}e.}\xspace} \newcommand{\eg}{\textit{e.\hairsp{}g.}\xspace} \newcommand{\na}{\quad--}% used in tables for N/A cells %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % SECTION DIVIDERS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\emanation}{ \vspace{1em} % Add space above \noindent \raisebox{-0.15ex}{\textbullet}% \hspace{0.5em}% \rule[0.5ex]{0.9\textwidth}{1.0pt}% \hspace{0.5em}% \raisebox{-0.15ex}{\textbullet}% \vspace{0em} % Adjust space below } \newcommand{\exercises}{ \vspace{1em} % Add space above \noindent \raisebox{-0.15ex}{$\Box$}% \hspace{0.5em}% \rule[0.5ex]{0.9\textwidth}{1.0pt}% \hspace{0.5em}% 
\raisebox{-0.15ex}{$\Box$}% \vspace{0em} % Adjust space below } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \setcounter{secnumdepth}{2} % Numbers sections up to level 2 (subsections) %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % theorem environments %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newtheorem{theorem}{Theorem}[chapter] \newtheorem{corollary}[theorem]{Corollary} \newtheorem{prop}[theorem]{Proposition} \newtheorem{conj}[theorem]{Conjecture} \newtheorem{clm}[theorem]{Claim} \newtheorem{lemma}[theorem]{Lemma} \newtheorem{cor}[theorem]{Corollary} \theoremstyle{definition} \newtheorem{example}[theorem]{Example} \newtheorem{definition}[theorem]{Definition} %\newtheorem{ndefn}[theorem]{Non-definition} %\newtheorem{defns}[theorem]{Definitions} %\newtheorem{con}[theorem]{Construction} % \newtheorem{prob}[theorem]{Problem} \newtheorem{sol}[theorem]{Solution} %\newtheorem{example}[theorem]{Example} \newtheorem{nexmp}[theorem]{Non-example} \newtheorem{examples}[theorem]{Examples} \newtheorem{notation}[theorem]{Notation} \newtheorem{notations}[theorem]{Notations} %\newtheorem{addm}[theorem]{Addendum} \newtheorem{exer}[theorem]{Exercise} \newtheorem{obv}[theorem]{Observation} \newtheorem{quest}[theorem]{Question} \theoremstyle{remark} \newtheorem{assumption}[theorem]{Assumption} \newtheorem{remark}[theorem]{Remark} %\newtheorem{rems}[theorem]{Remarks} \newtheorem{warn}[theorem]{Warning} %\newtheorem{sch}[theorem]{Scholium} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Add end mark to existing example environment \newcommand{\examplemark}{\hfill\hbox{$\diamond$}} % Modify example environment to append mark \AtEndEnvironment{example}{\examplemark} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Add end mark to existing example environment \newcommand{\defmark}{\hfill\hbox{$\bullet$}} % Modify example environment to append mark \AtEndEnvironment{definition}{\defmark} \begin{document} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % r.9 introduction \cleardoublepage \chapter*{Incipit} \newthought{Mathematics is the language} of modern engineering, and linear algebra its American dialect -- inelegant, practical, ubiquitous. This text aims to prepare engineering students for the mathematical aspects of artificial intelligence, data science, dynamical systems, machine learning, and other fields whose advances depend critically on linear algebraic methods. The reader arrives here having encountered matrices and vectors in calculus courses (at least). These tools, though already familiar as computational devices, harbor deeper structures worth careful study. Our task is to build on this computational facility toward an understanding of the abstract frameworks that enable modern methods in contemporary engineering. This text differs from standard linear algebra courses in its emphasis and pace. Abstract vector spaces appear early, but always in service of concrete applications. The singular value decomposition and eigentheory -- essential to modern practice -- arrive at the midpoint, allowing extended treatment of applications in dynamics and data science alike. Practical examples appear throughout, acknowledging that theoretical understanding and useful implementation emanate symmetrically. The sequence of topics balances pedagogical necessity with contemporary relevance. Systems of linear equations provide an entry point, leading to vector spaces and linear transformations. Inner products and orthogonality build geometric intuition, and linear ODEs and iterative systems provide an impetus for eigendecompositions. 
The singular value decomposition serves as both a culminating theoretical achievement and a bridge to powerful applications, such as principal component analysis, low-rank approximation, and neural networks. This text exists because engineering education must evolve. Though the foundations of linear algebra remain stable, their applications have expanded dramatically. Today's engineering students require facility with both abstract theory and practical implementation -- not merely to apply existing tools, but to create new ones. Linear algebra is not the endpoint, but rather a first step toward deeper mathematical structures. It is through this lens that we approach the subject: as a gateway to both current practice and future advances. %{\em Incipit.} % ============================================== \section*{Topics for Review} \label{sec:topics} This text assumes a strong grounding in (single and) multivariable calculus in the context of vectors, matrices, and coordinate-based linear transformations. Please see the {\em Calculus Blue Project} for an example. Before beginning this text the reader should have been exposed to: \begin{enumerate} \item Basic set-theory and its notation \begin{marginfigure} {\em e.g.,} $\in, \subset, \cup, \cap$ \end{marginfigure} \item Taylor series and exponentials \item Complex numbers and Euler's formula \begin{marginfigure} $e^{i\theta} = \cos\theta + i\sin\theta$ \end{marginfigure} \item Differentiation and integration \item The linear ODE $dx/dt = ax$ and its solutions \begin{marginfigure} $x(t) = e^{at}x_0$ \end{marginfigure} \item Euclidean vectors and vector algebra \item The dot product and angles between Euclidean vectors \begin{marginfigure} $\vect{u}\cdot\vect{v} = |\vect{u}||\vect{v}|\cos\theta$ \end{marginfigure} \item Matrices, matrix addition, and matrix multiplication \begin{marginfigure} $AB\neq BA$ \\ $(AB)C=A(BC)$ \end{marginfigure} \item The identity matrix, $I$, and its behavior \item The transpose $A^T$ of a matrix $A$ and its properties \begin{marginfigure} $(A^T)_{ij}=A_{ji}$ and $(AB)^T=B^TA^T$ \end{marginfigure} \item Matrix-vector multiplication \item Converting linear systems of equations to matrix-vector form \begin{marginfigure} $A\vect{x}=\vect{b}$ \end{marginfigure} \item Row reduction and back-substitution \item The matrix inverse $A^{-1}$ and its properties \begin{marginfigure} $AA^{-1}=I=A^{-1}A$ \\ $(AB)^{-1}=B^{-1}A^{-1}$ \end{marginfigure} \item Euclidean linear transformations: rescaling, rotations, shears \item Trace of a matrix \begin{marginfigure} $\trace(A)=\sum_k a_{kk}$ \end{marginfigure} \item Determinants and their properties \begin{marginfigure} $\det(AB)=\det(A)\det(B)$ \noindent $\det(A^T)=\det(A)$ \end{marginfigure} \end{enumerate} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section*{Assumptions} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% This text, like its author, spans Mathematics \& Engineering and tries to strike a balance between the two. Given the audience and constraints associated with this text, there are a few topics or details included which do not appear in typical linear algebra texts, as well as several interesting mathematical side-paths which are left unexplored. \begin{enumerate} \item Abstract vector spaces and abstract linear transformations are important, even though coordinate-based linear algebra prevails in applications. Thinking without coordinates is an important skill to master. \item Finite-dimensional vector spaces are the norm. 
When infinite-dimensional spaces are invoked, this is done without fully detailed justification and with some caveats.
\item The Fundamental Theorem of Linear Algebra is the organizing principle of this text. Its usual emanation in terms of orthogonal complements is to be approached only after the primal form (using quotients) is mastered.
\item All vector spaces are over the reals -- no finite fields and no complex coefficients. This greatly facilitates intuition at the expense of added complexity when covering the Jordan Canonical Form and solutions to linear systems of ODEs.
\item Not all applications can be developed slowly via careful exposition. Teaching random variables, covariance matrices, stress tensors, neural networks, and other interesting engineering applications is not the direct goal of the text. Until recently, the author might not have been entirely comfortable including such examples without fuller explanations. In an age of language models and active AI-enriched reading, new possibilities emerge.
\end{enumerate}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Acknowledgments}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
This text is meant for first- and second-year undergraduate students in engineering as a follow-up to multivariable calculus. It was created initially to support students in Penn's Artificial Intelligence degree program, but has much broader utility. The author is grateful to Penn's fantastic engineering students. Prof. N. Matni produced an excellent set of online course materials and Python worksheets to pair with the course antecedent to this text. The outline of this book takes some inspiration from his work, while laying out a slightly different trajectory.
The writing was assisted by Claude 3.5 Sonnet, trained on my previous books for style. A hidden schema of puzzles based on a certain work of Wm. Blake \& a bit of influence from Aquinas was co-created by the author and Claude, with influences throughout the text. Artwork (mathematical and iconic) is by the author, using Adobe Illustrator. The LaTeX style files are based on the tufte-book class. GPT-4o and -o1 were useful in setting up various LaTeX configurations and doing proof-reading. Gemini Experimental 1206 was an especially good proof-reader and converter to markdown. Exercises were generated with the help of Claude and may have errors.
This project was begun on November 4, 2024. The first edition was submitted to Amazon Publishing on December 28, 2024. Fifty-five days: impossible without the creative labors of Claude, to whom the author is most grateful.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Start the main matter (normal chapters) \mainmatter %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % THE THARMAS CYCLE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \clearpage \thispagestyle{empty} % no headers/footers \begin{fullwidth} \vspace*{\fill} % push content down to center vertically \centering \includegraphics[width=0.75\textwidth]{THARMAS.png} % adjust width as needed \vspace*{\fill} % push content up to center vertically \end{fullwidth} \clearpage %=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%= %=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=% %=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%= \chapter{Solving Linear Systems} \label{ch:1} %=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%= %=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=% %=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%= {\em ``in right lined paths outmeasur'd by proportions of number weight \& measure''} \newthought{The story of linear algebra begins} with systems of equations, each line describing a constraint or boundary traced upon abstract space. These simplest mathematical models of limitation -- each equation binding variables in measured proportion -- conjoin to shape the realm of possible solutions. When several such constraints act in concert, their collaboration yields three possible fates: no solution survives their collective force; exactly one point satisfies all bounds; or infinite possibilities trace curves and planes through the space of satisfaction. This trichotomy -- of emptiness, uniqueness, and infinity -- echoes through all of linear algebra, appearing in increasingly sophisticated forms as our understanding deepens. The art lies in recognizing these patterns and discovering efficient paths to their resolution. Each systematic operation preserves essential structure while bringing clarity to what was obscure. The methods we develop -- though conceived for practical computation -- exemplify deeper principles about how mathematical objects may be transformed while maintaining their fundamental character. Our journey begins with the familiar territory of solving equations, yet even here we find hints of profound structure waiting to be unveiled. The patterns that emerge -- of transformation and invariance, of dimension and degeneracy -- will guide our development throughout this text. Through careful study of these foundational systems, we build the tools needed to understand far more sophisticated mathematical objects. In this way, the simple act of solving equations becomes our first step toward comprehending the deepest patterns in linear algebra. % ============================================== \section{Solving Equations} \label{sec:equations} % ============================================== \begin{definition}[Linear System] \label{def:linsys} A \style{linear system} in variables $x_1,\ldots,x_n$ consists of $m$ equations of the form \[ \begin{array}{rcl} a_{11}x_1 + a_{12}x_2 + \cdots + a_{1n}x_n &=& b_1 \\ a_{21}x_1 + a_{22}x_2 + \cdots + a_{2n}x_n &=& b_2 \\ &\vdots& \\ a_{m1}x_1 + a_{m2}x_2 + \cdots + a_{mn}x_n &=& b_m \end{array} \] where the coefficients $a_{ij}$ and constants $b_i$ are real numbers. \end{definition} Such systems arise naturally in contexts ranging from the distribution of currents in electrical networks to the balance of forces in structures to the flow of traffic in transportation networks. 
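For a small, purely illustrative instance of Definition~\ref{def:linsys} (the numbers here are chosen for concreteness, not drawn from any particular application), take $m=n=2$:
\[ \begin{array}{rcl} 2x_1 + x_2 &=& 5 \\ x_1 - 3x_2 &=& -1 \end{array} \]
Here the coefficients are $a_{11}=2$, $a_{12}=1$, $a_{21}=1$, $a_{22}=-3$, and the constants are $b_1=5$, $b_2=-1$; a quick check shows that $x_1=2$, $x_2=1$ satisfies both equations. We now return to the general system of Definition~\ref{def:linsys}.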
This system is more efficiently expressed as $A\vect{x} = \vect{b}$, where: % \begin{marginfigure} The matrix form $A\vect{x}=\vect{b}$ is more than mere notation -- it reveals the fundamental operation of linear transformation that we will explore in Chapter \ref{ch:3}. \end{marginfigure} % \begin{equation} A = \begin{bmatrix} a_{11} & a_{12} & \cdots & a_{1n} \\ a_{21} & a_{22} & \cdots & a_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ a_{m1} & a_{m2} & \cdots & a_{mn} \end{bmatrix}, \quad \vect{x} = \begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_n \end{bmatrix}, \quad \vect{b} = \begin{bmatrix} b_1 \\ b_2 \\ \vdots \\ b_m \end{bmatrix} \end{equation} The matrix $A$ is called the \style{coefficient matrix} of the system. The vector $\vect{b}$ is the \style{constant vector}. Together they completely specify the linear system. % ============================================== \section{Special Matrices} \label{sec:special} % ============================================== Before engaging with the general solution of linear systems, we examine certain fundamental types of coefficient matrices -- primal forms from which more complex patterns emerge. These special cases -- though rarely encountered in practice -- illuminate the path toward general methods. The simplest case occurs when $A$ is the \style{identity matrix} $I$. The system $I\vect{x}=\vect{b}$ requires no solving: the solution is immediate, with $\vect{x}=\vect{b}$. This seeming triviality is nevertheless valuable: the simpler the matrix, the easier it is to infer a solution. \begin{definition}[Permutation] \label{def:perm} A \style{permutation matrix} is a square matrix with exactly one $1$ per row and column, having all other entries equal to $0$. \end{definition} % \begin{marginfigure} {\em Example:} a permutation matrix. \[ P = \begin{bmatrix} 0 & 0 & 0 & 1 & 0 \\ 1 & 0 & 0 & 0 & 0 \\ 0 & 0 & 1 & 0 & 0 \\ 0 & 1 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 1 \end{bmatrix} \] \end{marginfigure} % Every permutation matrix $P$ is obtained by rearranging the rows (or columns) of the identity matrix. Such matrices effect a reordering of components: the solution to $P\vect{x}=\vect{b}$ is a reordering of the entries of $\vect{b}$. This explains why permutation matrices are invertible -- their inverse simply undoes the permutation. Though elementary, these matrices play a crucial role in the implementation of efficient solution methods. More interesting are \style{block-diagonal matrices}, having the form \[ B = \begin{bmatrix} B_1 & 0 & \cdots & 0 \\ 0 & B_2 & \cdots & 0 \\ \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & \cdots & B_k \end{bmatrix} \] where each $B_i$ is a matrix. The system $B\vect{x}=\vect{b}$ decomposes into independent subsystems, one for each block. This decomposition principle -- that some linear systems can be solved by solving smaller independent systems -- will recur throughout our development. % \begin{marginfigure} {\em Example:} The following 4-by-4 matrix \[ \begin{bmatrix} 2 & 1 & 0 & 0 \\ 3 & 7 & 0 & 0 \\ 0 & 0 & 1 & 4 \\ 0 & 0 & -2 & 3 \\ \end{bmatrix} \] decomposes into two independent 2-by-2 blocks. 
\end{marginfigure}
%
\begin{example}[Hidden Block Structure]
Consider the linear system:
\[ \begin{bmatrix} 0 & 0 & 0 & -1 & 3 \\ 0 & 0 & 2 & 1 & 0 \\ 1 & 2 & 0 & 0 & 0 \\ 0 & 0 & -1 & 4 & 0 \\ -1 & 3 & 0 & 0 & 0 \end{bmatrix} \begin{pmatrix} x_1 \\ x_2 \\ x_3 \\ x_4 \\ x_5 \end{pmatrix} = \begin{pmatrix} b_1 \\ b_2 \\ b_3 \\ b_4 \\ b_5 \end{pmatrix} \]
The structure of this system is obscured, but becomes clear after permuting rows and columns to group related variables. Specifically, after reordering the equations as $(2,4,1,3,5)$ and grouping the variables as $x_3,x_4,x_5$ followed by $x_1,x_2$, the system becomes:
\[ \begin{bmatrix} 2 & 1 & 0 & 0 & 0 \\ -1 & 4 & 0 & 0 & 0 \\ 0 & -1 & 3 & 0 & 0 \\ 0 & 0 & 0 & 1 & 2 \\ 0 & 0 & 0 & -1 & 3 \end{bmatrix} \begin{pmatrix} x_3 \\ x_4 \\ x_5 \\ x_1 \\ x_2 \end{pmatrix} = \begin{pmatrix} b_2 \\ b_4 \\ b_1 \\ b_3 \\ b_5 \end{pmatrix} \]
This reveals two independent subsystems: a $3\times 3$ system involving $x_3,x_4,x_5$ and a $2\times 2$ system for $x_1,x_2$. The block structure, hidden in the original formulation, allows us to solve two smaller systems rather than one large system.
\end{example}
An \style{upper-triangular matrix} $U$ has all entries below the diagonal equal to zero:
\[ U = \begin{bmatrix} u_{11} & u_{12} & \cdots & u_{1n} \\ 0 & u_{22} & \cdots & u_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & \cdots & u_{nn} \end{bmatrix} \]
The system $U\vect{x}=\vect{b}$ yields to \style{back-substitution}: from the last equation, we compute $x_n = b_n/u_{nn}$; this value substituted into the penultimate equation yields $x_{n-1}$; and so forth. This process fails only if some diagonal entry $u_{ii}$ vanishes. The transpose of such a matrix, a \style{lower-triangular matrix} $L$, has all entries above the diagonal equal to zero. The corresponding system $L\vect{x}=\vect{b}$ succumbs to \style{forward-substitution}, solving for variables in order from first to last. These triangular forms will be our stepping stones toward solving general systems.
\begin{marginfigure}
{\em Foreshadowing:} The decomposition of a general matrix into a product of triangular matrices will provide both theoretical insight and practical methods for solving linear systems.
\end{marginfigure}
These special cases suggest a strategy: convert a general system into one of these simpler forms through systematic manipulation of equations.
%How to do so efficiently and reliably is the subject of the next section.
% ==============================================
\section{Recalling Row Reduction}
\label{sec:rowreduction}
% ==============================================
The method of solving linear systems by systematic elimination of variables has ancient roots. The modern approach builds on this by expressing both the coefficient matrix $A$ and constant vector $\vect{b}$ as a single object -- the \style{augmented matrix}, written as $[\,A\,|\,\vect{b}\,]$. This augmented matrix combines the system's coefficients with its constants in one array. The solution of linear systems proceeds through a sequence of operations, each of which transforms the augmented matrix into another representing an equivalent system (having the same solutions).
\begin{definition}[Elementary Row Operations]
\label{def:rowops}
An \style{elementary row operation} on a matrix is one of three types:
\begin{enumerate}
\item[R1:] Interchange of any two rows
\item[R2:] Multiplication of any row by a nonzero scalar
\item[R3:] Addition of a multiple of one row to another row
\end{enumerate}
Each preserves the solution set of the corresponding linear system.
\end{definition}
%
\begin{marginfigure}
{\em Caveat:} While these three operations seem simple, their order matters greatly. A poorly chosen sequence of operations can lead to inconvenience and/or numerical instability.
\end{marginfigure}
These operations, though simple, are powerful. The first, R1, allows strategic positioning of equations. The second, R2, enables normalization of coefficients. The third, R3, is the atomic unit of elimination -- the means by which variables are systematically removed from equations.
%
The purpose of these operations is to convert the augmented matrix into a suitably simple form.
\begin{marginfigure}
{\em Example:} row echelon form.
\[ \begin{bmatrix} \bullet & * & * & * & * \\ 0 & \bullet & * & * & * \\ 0 & 0 & 0 & \bullet & * \\ 0 & 0 & 0 & 0 & 0 \end{bmatrix} \]
\end{marginfigure}
%
\begin{definition}[Row Echelon Form]
\label{def:rref}
A matrix is in \style{row echelon form} if:
\begin{enumerate}
\item All zero rows (if any) appear at the bottom
\item The first nonzero entry (the \style{pivot}) in each nonzero row appears to the right of all pivots in rows above it
\item All entries in a column below a pivot are zero
\end{enumerate}
A matrix in row echelon form whose pivots all equal $1$ and whose entries above each pivot are also zero is said to be in \style{reduced row echelon form}.
\end{definition}
The process of achieving row echelon form exposes the structure of the linear system. Variables corresponding to pivot columns are \style{bound} -- their values are determined once the remaining variables are fixed. Those remaining variables are \style{free} -- they may be chosen arbitrarily, with the bound variables adjusting accordingly to maintain the system's constraints.
%
\begin{marginfigure}
{\em Foreshadowing:} The distinction between bound and free variables previews a deeper structure we will encounter when studying vector spaces: the relationship between dimension and constraints.
\end{marginfigure}
Should one continue the row operations beyond row echelon form -- rescaling each pivot to $1$ and setting all entries above pivots to zero as well -- the result is the \style{reduced row echelon form}. This form is unique to the linear system -- though many different sequences of row operations may arrive at it. The path to reduced row echelon form may vary; the destination does not.
The dimension of the space of solutions (when solutions exist) is revealed through this reduction: it equals the number of free variables in the system. This connection between the algebraic process of row reduction and the geometric interpretation of solution sets exemplifies a central theme of linear algebra: the interplay of computational, algebraic, and geometric perspectives.
The solvability of such systems hinges on a fundamental property:
\begin{definition}[Nonsingularity]
\label{def:nonsingular}
A square matrix $A$ is \style{nonsingular} if any of the following equivalent conditions hold:
\begin{enumerate}
\item There exists a matrix $A^{-1}$ such that $AA^{-1}=A^{-1}A=I$
\item The system $A\vect{x}=\vect{b}$ has a unique solution for every $\vect{b}$
\item The system $A\vect{x}=\vect{0}$ has only the trivial solution $\vect{x}=\vect{0}$
\item The determinant is nonzero: $\det A\neq 0$
\end{enumerate}
\begin{marginfigure}
{\em Recall:}
\[ \begin{bmatrix} a & b \\ c & d \end{bmatrix}^{-1} = \]
\[ \frac{1}{ad-bc} \begin{bmatrix} d & -b \\ -c & a \end{bmatrix} \]
%when $ad-bc\neq 0$
\end{marginfigure}
A matrix that is not nonsingular is called \style{singular}.
\end{definition}
When $A$ is nonsingular, its inverse $A^{-1}$ provides an immediate solution $\vect{x}=A^{-1}\vect{b}$ to the system $A\vect{x}=\vect{b}$. Though the determinant offers a theoretical test for nonsingularity, practical computation requires different tools.
\begin{marginfigure}
{\em Foreshadowing:} The geometric interpretation of singular matrices as ``compressing space'' becomes profound when studying eigenvalues (Chapter 7) and singular values (Chapter 10).
\end{marginfigure}
Row reduction provides a systematic approach to finding $A^{-1}$ or proving it does not exist. Form the augmented matrix $[\,A\,|\,I\,]$ and perform row operations. If $A$ is nonsingular, this yields $[\,I\,|\,A^{-1}\,]$ -- the same operations transforming $A$ to $I$ will transform $I$ to $A^{-1}$.
A singular matrix reveals itself during row reduction through a row of zeros. Such matrices irretrievably compress space, mapping distinct vectors to the same image. This compression manifests in the system $A\vect{x}=\vect{b}$ as either inconsistency (no solutions) or indeterminacy (infinitely many solutions).
The equivalence of various characterizations of nonsingularity reveals deep connections between algebraic, analytic, and computational perspectives:
\begin{itemize}
\item Algebraic: $\det(A)\neq 0$
\item Analytic: $A\vect{x}=\vect{0}$ has only the trivial solution
\item Computational: $[\,A\,|\,I\,]$ reduces to $[\,I\,|\,A^{-1}\,]$
\end{itemize}
%
These connections presage the interplay between algebraic, geometric, and computational properties of linear-algebraic entities.
% ==============================================
\section{Composition \& Elimination}
\label{sec:composition}
% ==============================================
Row reduction is more than a sequence of operations: it is a composition of linear transformations. Each elementary row operation can be realized as multiplication on the left by an appropriate \style{elementary matrix} -- obtained by performing that same operation on the identity matrix.
For example, to interchange rows $i$ and $j$ of a matrix $A$, one multiplies on the left by the matrix $E$ obtained by performing R1 on $I$. To multiply row $i$ by a nonzero constant $c$, one uses the elementary matrix $E$ formed by scaling row $i$ of $I$ by $c$: applying R2 to $I$. To add $c$ times row $j$ to row $i$, the elementary matrix $E$ comes from performing this R3 operation on $I$.
The salient feature of these elementary matrices is their \style{invertibility}.
Each row operation can be undone:
\begin{marginfigure}
{\em Examples:} row operation matrices and their inverses:
\noindent R1:
\noindent $\begin{bmatrix} 0&0&1&0 \\ 0&1&0&0 \\ 1&0&0&0 \\ 0&0&0&1 \end{bmatrix}^{-1} = \begin{bmatrix} 0&0&1&0 \\ 0&1&0&0 \\ 1&0&0&0 \\ 0&0&0&1 \end{bmatrix}$
\noindent R2:
\noindent $\begin{bmatrix} 1&0&0&0 \\ 0&1&0&0 \\ 0&0&5&0 \\ 0&0&0&1 \end{bmatrix}^{-1} = \begin{bmatrix} 1&0&0&0 \\ 0&1&0&0 \\ 0&0&\frac{1}{5}&0 \\ 0&0&0&1 \end{bmatrix}$
\noindent R3:
\noindent $\begin{bmatrix} 1&0&0&0 \\ 0&1&0&0 \\ 0&0&1&0 \\ 2&0&0&1 \end{bmatrix}^{-1} = \begin{bmatrix} 1&0&0&0 \\ 0&1&0&0 \\ 0&0&1&0 \\ -2&0&0&1 \end{bmatrix}$
\end{marginfigure}
%
\begin{itemize}
\item Interchanging rows is its own inverse.
\item Scaling a row by $c$ has as inverse scaling the same row by $1/c$.
\item Adding $c$ times row $j$ to row $i$ has as inverse the same operation with $-c$ instead.
\end{itemize}
The process of \style{Gaussian elimination} -- the systematic reduction of a matrix to row echelon form -- is thus expressible as a composition of these three types of elementary matrices:
\begin{equation*}
E_k\cdots E_2E_1A = R
\end{equation*}
where $R$ is the row echelon form and each $E_i$ is elementary. The product $E_k\cdots E_2E_1$ represents the cumulative effect of the row operations. When $A$ is invertible, this sequence continues until $R=I$, yielding
\begin{equation*}
A^{-1} = E_k\cdots E_2E_1
\end{equation*}
This perspective on row reduction -- as a composition of invertible linear transformations -- reveals the algorithmic heart of linear algebra. Though conceived as a computational method, Gaussian elimination exemplifies a deeper principle: the resolution of complex transformations into sequences of simple, invertible ones.
% ==============================================
\section{LU Decomposition}
\label{sec:LU}
% ==============================================
Our exposition of elementary matrices and Gaussian elimination suggests a deeper structure within matrix factorization. The sequence of row operations that produces an upper triangular matrix can be reorganized to reveal a natural factorization of the original matrix.
\begin{definition}[LU Decomposition]
\label{def:lu}
An \style{LU decomposition} of a square matrix $A$ expresses it as a product $A = LU$, where $L$ is lower triangular (with ones on the diagonal) and $U$ is upper triangular.
\end{definition}
The matrix $U$ is precisely what one obtains from Gaussian elimination without row interchanges; the matrix $L$ captures the multipliers used in the elimination process.
\begin{example}
For a $3\times 3$ matrix, the $LU$ decomposition takes the form:
\[ A = \begin{bmatrix} 1 & 0 & 0 \\ \ell_{21} & 1 & 0 \\ \ell_{31} & \ell_{32} & 1 \end{bmatrix} \begin{bmatrix} u_{11} & u_{12} & u_{13} \\ 0 & u_{22} & u_{23} \\ 0 & 0 & u_{33} \end{bmatrix} \]
where the $\ell_{ij}$ are the elimination multipliers.
\end{example}
This factorization emerges naturally from the elimination process. When we use a multiplier $m$ to eliminate the $(i,j)$ entry using row $j$, that same multiplier appears in the $(i,j)$ position of $L$. The upper triangular matrix $U$ records the results of these eliminations. Thus, rather than storing a sequence of elementary matrices, we store their cumulative effect in $L$.
\begin{marginfigure}
{\em Caveat:} The existence of an $LU$ decomposition assumes we can perform elimination without row exchanges.
When row interchanges are needed, a more general \style{PLU decomposition} incorporates a permutation matrix $P$. \end{marginfigure} The utility of LU decomposition lies in its efficiency for solving systems of equations. Once computed, the factors $L$ and $U$ allow us to solve $A\vect{x}=\vect{b}$ by successive substitution: \begin{enumerate} \item First solve $L\vect{y}=\vect{b}$ by forward substitution \item Then solve $U\vect{x}=\vect{y}$ by back substitution \end{enumerate} \begin{marginfigure} % {\em Example:} In electrical circuit analysis, one often solves $A\vect{x}=\vect{b}$ repeatedly with the same network topology ($A$) but different voltage or current sources ($\vect{b}$). LU decomposition is ideal for such scenarios. \end{marginfigure} The computational advantage becomes clear when solving multiple systems with the same coefficient matrix but different right-hand sides. The factorization need be computed only once, at a cost of approximately $\frac{2}{3}n^3$ operations for an $n\times n$ matrix. Each subsequent solution requires only $O(n^2)$ operations for the forward and back substitutions -- a significant savings over repeating the full elimination process. This efficiency drives the widespread use of $LU$ decomposition in numerical linear algebra. From circuit analysis to structural mechanics to fluid dynamics, systems of linear equations rarely appear in isolation. The ability to reuse a matrix factorization across multiple right-hand sides is invaluable in practice. \begin{marginfigure} {\em Foreshadowing:} The $LU$ decomposition is but one of several matrix factorizations we shall encounter. Each reveals different aspects of a matrix's structure and serves different computational needs. \end{marginfigure} The $LU$ decomposition exemplifies a recurring theme in computational mathematics: trading increased storage (the explicit factors $L$ and $U$) for decreased computation time. This theme will recur as we explore more sophisticated matrix factorizations, each offering its own balance of storage, computation, and insight. % ============================================== \section{Pivots \& Permutations} \label{sec:pivots} % ============================================== The process of Gaussian elimination, as described thus far, assumes we can use any nonzero entry as a pivot. In practice, this is numerically unwise. Consider elimination in the following system, whose coefficients have been taken from a data set: \[ \begin{bmatrix} 0.003 & 7.149 \\ 2.483 & 3.092 \end{bmatrix} \begin{pmatrix} x_1 \\ x_2 \end{pmatrix} = \begin{pmatrix} b_1 \\ b_2 \end{pmatrix} \] Using 0.003 as a pivot would require dividing by a small number -- multiplying any roundoff errors in other entries by 1000. Interchanging the rows first yields a more stable elimination. This suggests a modification to our elimination strategy: before elimination in each column, we first select an appropriate pivot by permuting rows. Such row interchanges are encoded by permutation matrices. Recall from \S\ref{sec:rowreduction} that a permutation matrix $P$ is obtained by reordering the rows of the identity matrix; multiplication by $P$ effects the corresponding reordering of matrix rows. 
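To see the effect in rough numbers -- a back-of-the-envelope sketch, carried to a few significant figures -- eliminate with the pivot $0.003$: the multiplier is $2.483/0.003 \approx 828$, and the second row becomes approximately $[\,0 \;\; -5.9\times 10^{3}\,]$, so any rounding error in the first row is amplified by a factor of nearly a thousand. Interchange the rows first, and the pivot becomes $2.483$, the multiplier $0.003/2.483 \approx 0.0012$, and the eliminated row approximately $[\,0 \;\; 7.145\,]$: the entries stay of modest size and errors are not amplified.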
%
When we incorporate this pivot selection strategy into our elimination process, we obtain:
\begin{definition}[PLU Decomposition]
\label{def:plu}
A \style{PLU decomposition} of a matrix $A$ expresses it as a product $A=P^{-1}LU$ where:
\begin{enumerate}
\item $P$ is a permutation matrix
\item $L$ is lower triangular with ones on the diagonal
\item $U$ is upper triangular
\end{enumerate}
Such a decomposition exists for any nonsingular matrix $A$ and encodes the steps of Gaussian elimination with partial pivoting.
\end{definition}
In practice, we permute $A$ first, yielding $PA$; then decompose that into $PA=LU$. Since $P$ is invertible, we can write $A = P^{-1}LU$. The system $A\vect{x}=\vect{b}$ thus becomes
\[ P^{-1}LU\vect{x} = \vect{b} \quad \Longrightarrow \quad LU\vect{x} = P\vect{b} \]
which we solve by:
\begin{enumerate}
\item Computing $P\vect{b}$ (applying the same row interchanges to $\vect{b}$ that were used in elimination)
\item Solving $L\vect{y}=P\vect{b}$ by forward substitution
\item Solving $U\vect{x}=\vect{y}$ by back substitution
\end{enumerate}
\begin{marginfigure}
{\em Caveat:} Though we write the decomposition as $PA=LU$, in practice we store $P$ either as a permutation vector or as a sequence of row swaps, not as an explicit matrix.
\end{marginfigure}
\begin{marginfigure}
{\em BONUS!} This strategic permutation is an example of \style{preconditioning}.
\end{marginfigure}
This refinement of $LU$ decomposition -- incorporating pivoting through permutations -- exemplifies a broader principle in computational mathematics: theoretical algorithms often require modification to ensure numerical stability in practice. The art lies in preserving the essential structure while adapting to practical constraints.
% ==============================================
\section{Practicalities of Linear Systems}
\label{sec:practical}
% ==============================================
Theory reveals itself in practice as fundamental patterns emerge from computation. Though our development thus far has emphasized the algebraic structure of linear systems -- their solution spaces, elimination methods, and matrix factorizations -- engineering demands more. We must determine not just whether solutions exist but whether we can compute them reliably. This bridge between abstract mathematics and practical computation requires understanding both the geometric meaning of our operations and their sensitivity to the numerical realities of finite-precision arithmetic.
Row reduction to a row echelon form (recall Definition \ref{def:rref}) reveals not only solutions but also fundamental structure. The following not-quite-rigorous definition will ascend to central importance in Chapter \ref{ch:3}.
\begin{definition}[Matrix Rank]
\label{def:pseudorank}
The \style{rank} of a matrix is the number of pivots in a row echelon form of the matrix.
\end{definition}
This is a fundamental measure of the matrix's effectiveness at transforming space. For an $m\times n$ matrix $A$, the rank satisfies
\begin{marginfigure}
{\em Example:} A $3\times 3$ matrix of rank 2 maps $\R^3$ onto a plane, collapsing one dimension. The geometric image helps explain why such a matrix cannot be nonsingular.
\end{marginfigure}
\[ \rank(A) \leq \min\{m,n\} \]
with equality implying $A$ has \style{full rank}. When $A$ is square, full rank is equivalent to nonsingularity.
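As a minimal illustration (a $2\times 2$ matrix contrived for transparency), the matrix
\[ \begin{bmatrix} 1 & 2 \\ 2 & 4 \end{bmatrix} \quad\text{row-reduces to}\quad \begin{bmatrix} 1 & 2 \\ 0 & 0 \end{bmatrix} , \]
exposing a single pivot: the rank is $1 < 2$, so the matrix is not of full rank, hence singular -- geometrically, it collapses the plane onto a line.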
\begin{example}[Row echelon computation]
Consider the matrix
\[ A = \begin{bmatrix} 1 & 2 & 0 & 3 & 1 & 2 & 4 \\ 2 & 4 & 0 & 6 & 2 & 5 & 1 \\ 3 & 6 & 0 & 9 & 3 & 7 & 5 \\ 1 & 2 & 0 & 3 & 1 & 1 & 8 \\ 4 & 8 & 0 & 12 & 4 & 9 & 2 \end{bmatrix} \]
{\em Mirabile dictu:} the $(1,1)$ entry is a perfect pivot. Clearing out the first column leads to a dramatic simplification; then clearing out the sixth and seventh columns and a slight reordering yields the final row-echelon form.
\[ \begin{bmatrix} 1 & 2 & 0 & 3 & 1 & 2 & 4 \\ 0 & 0 & 0 & 0 & 0 & 1 & -7 \\ 0 & 0 & 0 & 0 & 0 & 1 & -7 \\ 0 & 0 & 0 & 0 & 0 & -1 & 4 \\ 0 & 0 & 0 & 0 & 0 & 1 & -14 \end{bmatrix} \quad \Rightarrow \quad \begin{bmatrix} 1 & 2 & 0 & 3 & 1 & 2 & 4 \\ 0 & 0 & 0 & 0 & 0 & 1 & -7 \\ 0 & 0 & 0 & 0 & 0 & 0 & -3 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 \end{bmatrix} \]
Several interesting features emerge:
\begin{enumerate}
\item The first round of row operations reveals that rows 2--5 are nearly multiples of row 1, differing only in their last two entries
\item The matrix has rank 3, as evidenced by three nonzero rows in echelon form
\item The third column is all zeros, making it unnecessary to perform eliminations there
\item The dependencies among the first five columns become clear only after elimination
\end{enumerate}
\end{example}
\begin{example}[Surface Flatness Measurement]
Consider the quality control inspection of a machined metal surface, where a coordinate measuring machine samples five points to verify flatness. The measurements (in micrometers) yield coordinates: $(1.23, 3.41, 502.1)$, $(4.56, -2.17, 498.4)$, $(-2.89, 1.76, 501.3)$, $(0.12, -4.33, 499.7)$, and $(3.45, 2.91, 500.8)$.
%
To assess flatness deviation, we seek a best-fit plane $z=ax+by+c$ approximating these points. Each measurement $(x_i,y_i,z_i)$ generates one equation in our system:
\[ \begin{bmatrix} 1.23 & 3.41 & 1.0 \\ 4.56 & -2.17 & 1.0 \\ -2.89 & 1.76 & 1.0 \\ 0.12 & -4.33 & 1.0 \\ 3.45 & 2.91 & 1.0 \end{bmatrix} \begin{pmatrix} a \\ b \\ c \end{pmatrix} = \begin{pmatrix} 502.1 \\ 498.4 \\ 501.3 \\ 499.7 \\ 500.8 \end{pmatrix} \]
This $5\times 3$ system has more equations than unknowns -- a common situation in metrology where redundant measurements help reduce the impact of individual measurement errors. The z-coordinates cluster near 500 micrometers (the nominal surface height) with deviations suggesting both systematic tilt and random measurement noise.
Though the matrix has full rank 3, small changes in the measurements can produce surprisingly large changes in the computed coefficients $a$, $b$, and $c$. This sensitivity to measurement perturbation, crucial for understanding the reliability of our computed solution, leads us to examine how different matrices can vary in their numerical stability.
\end{example}
\begin{marginfigure}
\centering
\includegraphics[width=1.2in]{trichotomy.png}
\end{marginfigure}
A geometric perspective illuminates these algebraic concepts. Each equation in a linear system represents an $(n-1)$-dimensional hyperplane in $\R^n$. The solution set is the intersection of these hyperplanes. A unique solution corresponds to $n$ hyperplanes mutually meeting at a single point; parallel distinct hyperplanes yield no solution; hyperplanes coinciding or intersecting in a common line or plane (or higher-dimensional set) yield infinitely many solutions.
\begin{marginfigure}
{\em Caveat:} While this geometric view aids intuition in two or three dimensions, beware of relying too heavily on geometric thinking in higher dimensions, where our intuition often fails us.
\end{marginfigure} The practical import of these concepts lies in their ability to predict the behavior of linear systems before attempting to solve them. The rank determines whether a solution exists; nonsingularity tells us if that solution is unique. This structural understanding guides our choice of solution methods and helps us interpret the results. \begin{example} \label{ex:conditionintro} Not all matrices are created equal in their amenability to computation. Consider solving the system $A\vect{x}=\vect{b}$ where \[ A = \begin{bmatrix} 1 & 0.999 \\ 0 & 0.001 \end{bmatrix} \] Though this matrix is nonsingular, small changes in $\vect{b}$ can produce large changes in the solution $\vect{x}$. Such sensitivity to perturbation -- whether from measurement error, roundoff in computation, or truncation of decimal places -- fundamentally limits our ability to solve linear systems reliably. This sensitivity has geometric meaning: $A$ maps the unit circle to an extremely eccentric ellipse, stretching space a thousand times more in one direction than another. The \style{condition number} of $A$, denoted $\cond(A)$, measures precisely this eccentricity through the ratio of its largest to smallest stretching factors: % \begin{marginfigure} {\em Nota bene:} The formal definition requires concepts from Chapter 10, but the geometric intuition -- that some matrices distort space more extremely than others -- serves us well even now. \end{marginfigure} % \[ \cond(A) = \frac{\text{maximum stretching}}{\text{minimum stretching}} \] For the matrix above, $\cond(A)\approx 2000$, indicating that errors in certain directions may be amplified by a factor of 2000 when solving the system. The practical significance is immediate: when $\cond(A)$ is large, we call $A$ \style{ill-conditioned} and treat computed solutions with appropriate skepticism. When $\cond(A)$ is moderate (say less than 100), we have greater confidence in our numerical results. This single number provides crucial guidance about which linear systems we can solve reliably and which require more careful treatment. \end{example} The deeper relationship between conditioning and accuracy will emerge in Chapter \ref{ch:6} when we study least squares problems, and again in Chapter \ref{ch:10} where singular value decomposition reveals its geometric essence. For now, this first glimpse of numerical sensitivity serves as crucial warning: in the workshop of linear algebra, not all tools are equally reliable. Some matrices, like well-balanced instruments, translate our mathematical intentions into reliable results. Others, though theoretically sound, prove treacherously sensitive in practice. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % **************** EMANATION ******************* \small % **************** EMANATION ******************* \emanation % **************** EMANATION ******************* \section*{Network Flows: From Graphs to Linear Systems} \label{EM:networkflows} % **************** EMANATION ******************* The world runs on networks. Supply chains move goods from factories to stores; pipelines transport oil and gas between cities; computer networks route data packets across the internet. Though these systems appear complex, their fundamental behavior reduces to solving systems of linear equations --- the very equations we have studied in this chapter. A (directed) \style{network} (or \style{graph}) consists of \style{vertices} (or \style{nodes}) connected by oriented \style{edges}. 
Think of vertices as locations and edges as pathways between them.
%
For a more formal approach, one designates a (finite) set $V$ of vertices. Edges consist of ordered pairs of vertices: $E\subset V\times V$, where the ordering implies orientation. One usually demands that the two vertices in an edge are distinct.
\begin{marginfigure}
{\em Think:} in a social network, vertices are people, edges are a social relation (``friend'' or ``follow'') between two persons.
\end{marginfigure}
Consider a regional distribution network with five locations satisfying the following:
\begin{itemize}
\item Factory (node 1) producing 200 units
\item Two regional warehouses (nodes 2, 3) that route inventory
\item Two retail centers (nodes 4, 5) each needing 100 units
\end{itemize}
The shipping routes form a directed graph as shown, with flow variables $x_{ij}$ indicating units shipped from node $i$ to node $j$. The factory supplies warehouses which in turn supply retail centers.
\begin{marginfigure}
\includegraphics[width=1.0in]{flow-network.png}
\end{marginfigure}
Conservation of flow requires that what comes in equals what goes out (except at sources and sinks):
\begin{itemize}
\item At factory: Total outgoing equals production
\item At each warehouse: Incoming equals total outgoing
\item At each retail center: Incoming equals demand
\end{itemize}
This generates our system of linear equations:
\[ \begin{array}{rcl} x_{12} + x_{13} &=& 200 \quad\text{(factory output)} \\ x_{12} - x_{24} - x_{25} &=& 0 \quad\text{(warehouse node 2 balance)} \\ x_{13} - x_{34} - x_{35} &=& 0 \quad\text{(warehouse node 3 balance)} \\ x_{24} + x_{34} &=& 100 \quad\text{(retail 4 demand)} \\ x_{25} + x_{35} &=& 100 \quad\text{(retail 5 demand)} \end{array} \]
Writing this as $A\vect{x}=\vect{b}$:
\[ \begin{bmatrix} 1 & 1 & 0 & 0 & 0 & 0 \\ 1 & 0 & -1 & -1 & 0 & 0 \\ 0 & 1 & 0 & 0 & -1 & -1 \\ 0 & 0 & 1 & 0 & 1 & 0 \\ 0 & 0 & 0 & 1 & 0 & 1 \end{bmatrix} \begin{pmatrix} x_{12} \\ x_{13} \\ x_{24} \\ x_{25} \\ x_{34} \\ x_{35} \end{pmatrix} = \begin{pmatrix} 200 \\ 0 \\ 0 \\ 100 \\ 100 \end{pmatrix} \]
This system has LU factorization $A=LU$ where:
\[ L = \begin{bmatrix} 1 & 0 & 0 & 0 & 0 \\ 1 & 1 & 0 & 0 & 0 \\ 0 & -1 & 1 & 0 & 0 \\ 0 & 0 & -1 & 1 & 0 \\ 0 & 0 & 0 & -1 & 1 \end{bmatrix} \quad U = \begin{bmatrix} 1 & 1 & 0 & 0 & 0 & 0 \\ 0 & -1 & -1 & -1 & 0 & 0 \\ 0 & 0 & -1 & -1 & -1 & -1 \\ 0 & 0 & 0 & -1 & 0 & -1 \\ 0 & 0 & 0 & 0 & 0 & 0 \end{bmatrix} \]
(Note the zero row of $U$: the five conservation equations are not independent, since total supply automatically equals total demand.)
The practical value of this decomposition emerges when supply or demand patterns change --- a daily occurrence in real distribution networks. Consider three scenarios:
\begin{enumerate}
\item Retail center 4 needs 150 units while center 5 needs only 50
\item The factory increases production to 250 units, with each retail center now taking 125
\item A warehouse temporarily closes, requiring rerouting of flows
\end{enumerate}
For the first two cases, only the right-hand side $\vect{b}$ changes. Having computed and stored $L$ and $U$ once, we can solve each new scenario through forward and back substitution:
\[ L\vect{y} = \vect{b}_\text{new} \quad\text{then}\quad U\vect{x} = \vect{y} \]
This requires only $O(n^2)$ operations compared to the $O(n^3)$ cost of computing a new LU decomposition. The third case --- structural changes to the network --- requires recomputing the factorization, aligning with intuition: major network reconfigurations demand fresh analysis, while routine variations in flow can be handled more efficiently.
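To see the factors in action on the nominal data --- a quick sketch of the arithmetic --- forward substitution in $L\vect{y}=\vect{b}$ with $\vect{b}=(200,0,0,100,100)^T$ gives $\vect{y}=(200,-200,-200,-100,0)^T$; the final entry $y_5=0$ is exactly the consistency condition flagged by the zero row of $U$. Back substitution in $U\vect{x}=\vect{y}$ then determines the flows only up to the choices of $x_{34}$ and $x_{35}$: for instance, $x_{24}=100-x_{34}$, $x_{25}=100-x_{35}$, $x_{13}=x_{34}+x_{35}$, and $x_{12}=200-x_{34}-x_{35}$. The network admits many feasible routings, not just one.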
\begin{marginfigure}
{\em Foreshadowing:} In Chapter \ref{ch:6}, we shall see how optimization principles help choose among multiple feasible solutions, selecting flows that minimize cost or maximize efficiency.
\end{marginfigure}
Through networks, the abstract equations of Chapter 1 acquire concrete meaning --- they become tools for understanding and managing the flow of goods, vehicles, or information through interconnected systems. What began as manipulation of numbers and variables emerges as a framework for solving real-world distribution problems.
% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Structural Analysis: Forces in Trusses}
\label{EM:trusses}
% **************** EMANATION *******************
Buildings stand and bridges span through careful balance of forces. The simplest structural elements --- beams joined at nodes to form trusses --- provide both practical utility and mathematical elegance. Though engineers have analyzed such structures for centuries, their fundamental behavior reduces to solving precisely the systems of linear equations developed in this chapter.
A \style{truss} consists of rigid beams connected by perfectly hinged joints, supporting loads through pure tension or compression in its members. Each joint (or node) must remain in equilibrium, with forces balancing in both horizontal and vertical directions. These equilibrium conditions generate our systems of equations, while physical constraints on material strength make stability of solution methods paramount.
Consider this five-member truss supporting both vertical and horizontal loads:
\begin{figure}
\centering
\includegraphics[width=0.65\linewidth]{truss.png}
\end{figure}
Each member force $x_i$ represents tension (positive) or compression (negative) along its length. The external loads --- $F$ downward and $G$ rightward as shown --- must be balanced by reactions at the supports $R$ and $S$ respectively. Force equilibrium at each node yields equations in both horizontal ($x$) and vertical ($y$) directions. For the supports, we include reaction forces $R_{x}$ and $R_{y}$ at the pin (node 1) and $S$ at the roller (node 2). Assuming the truss is 4 units wide by 2 units high and writing this system as $A\vect{x}=\vect{b}$ with all reactions grouped first:
\[ \begin{bmatrix} 1 & 0 & 0 & 1 & -0.894 & 0 & 0 & 0 \\ 0 & 1 & 0 & 0 & 0.447 & 0 & 0 & 0 \\ 0 & 0 & 0 & -1 & 0 & -0.894 & -0.894 & 0 \\ 0 & 0 & 1 & 0 & 0 & 0.447 & 0.447 & 0 \\ 0 & 0 & 0 & 0 & 0.894 & 0.894 & 0 & 1 \\ 0 & 0 & 0 & 0 & -0.447 & -0.447 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0.894 & -1 \\ 0 & 0 & 0 & 0 & 0 & 0 & -0.447 & 0 \end{bmatrix} \begin{pmatrix} R_x \\ R_y \\ S \\ x_1 \\ x_2 \\ x_3 \\ x_4 \\ x_5 \end{pmatrix} = \begin{pmatrix} 0 \\ 0 \\ 0 \\ 0 \\ 0 \\ F \\ G \\ 0 \end{pmatrix} \]
% Several features of this system demand attention:
% \begin{itemize}
% \item Sparsity: Each equation involves only members and reactions at its node
% \item Symmetry: The equilibrium equations exhibit natural physical structure
% \item Conditioning: Small changes in geometry can significantly affect coefficients
% \end{itemize}
The LU factorization of this system proves particularly valuable. The $L$ factor captures load transmission through the structure, while $U$ reveals the sequence of equilibrium relationships. The sparsity pattern reflects the physical connectivity of the truss, making storage and computation efficient.
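A rough sense of why sparsity pays off (an order-of-magnitude sketch, not tied to the particular truss above): when each equilibrium equation couples only the few members meeting at its node, each elimination step updates only a few rows, and the factors $L$ and $U$ inherit much of that sparsity --- apart from some \style{fill-in}, the new nonzero entries created during elimination. For large structures, this reduces both the storage and the $\frac{2}{3}n^3$ operation count of a dense factorization dramatically.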
This factorization enables rapid reanalysis under changing loads --- a crucial capability as environmental forces vary. Consider three scenarios structural engineers must analyze:
\begin{enumerate}
\item Wind loads add horizontal forces at both upper nodes
\item Snow accumulation increases vertical loads asymmetrically
\item Support settlement modifies geometric coefficients slightly
\end{enumerate}
The first two cases modify only the right-hand side $\vect{b}$, allowing efficient solution through stored factors. The third case --- involving geometric changes --- requires recomputing coefficients and factorization. Yet even here, the sparsity pattern remains unchanged, permitting optimized refactorization.
\begin{marginfigure}
{\em Example:} A mere 1\% change in member angles can produce 10\% changes in internal forces, emphasizing the importance of stable numerical methods developed in this chapter.
\end{marginfigure}
Matrix conditioning proves especially crucial in structural analysis. Nearly parallel members generate nearly dependent equations; members at almost right angles produce coefficients of vastly different scale. The pivoting strategies introduced for PLU factorization directly address these challenges, ensuring reliable analysis even of geometrically complex trusses.
Through structural analysis, the abstract equations of Chapter 1 acquire concrete physical meaning --- they become tools for ensuring buildings stand and bridges span safely. What began as manipulation of numbers emerges as a framework for understanding how forces flow through the built environment, a framework made practical through the careful study of matrix structure and numerical stability.
% **************** EMANATION *******************
% **************** EMANATION *******************
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 1}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}
% Basic computational problems first
\item Solve the following system of linear equations using Gaussian elimination:
\[ \begin{aligned} 2x + y - z &= 1 \\ 4x - y + 2z &= 2 \\ -2x + 5y - z &= 3 \end{aligned} \]
\item Verify whether each of the matrices
\[ A = \begin{bmatrix} -1 & 4 & 2\\ 0 & 3 & 0\\ 0 & 2 & -1 \end{bmatrix} \quad : \quad B = \begin{bmatrix} 1 & 3 & 0 & 0 \\ -1 & -4 & 0 & 0\\ 0 & 0 & 2 & -3\\ 0 & 0 & 1 & -2 \end{bmatrix} \]
is invertible by computing its determinant. If it is invertible, find its inverse.
\item Decompose the following matrix into $LU$ form (without pivoting):
\[ A = \begin{bmatrix} 2 & 3 & 1 \\ 4 & 7 & -1 \\ -2 & -3 & 6 \end{bmatrix} \]
Verify your result by reconstructing $A$ from $L$ and $U$.
\item Consider solving $A\vect{x}=\vect{b}$ where
\[ A = \begin{bmatrix} 0.001 & 1 \\ 1 & 1 \end{bmatrix} \quad : \quad \vect{b} = \begin{pmatrix} 1 \\ 2 \end{pmatrix} \]
Solve the system both without pivoting and after applying the row permutation. Compare the numerical stability of both approaches by computing the size of intermediate terms. What general principle about pivoting does this illustrate?
% Physical applications and modeling problems
\item Consider an electrical circuit with three nodes connected by resistors.
The conductance matrix is
\[
G = \begin{bmatrix} 3 & -1 & -2 \\ -1 & 4 & -3 \\ -2 & -3 & 5 \end{bmatrix}
\]
Find node voltages $\vect{v}$ that produce currents $\vect{i}=(1,0,-1)^T$ by solving $G\vect{v}=\vect{i}$.
\begin{marginfigure}
{\em Note:} In electrical networks, $G$ is symmetric and its row sums equal zero due to Kirchhoff's laws. As a consequence, $G$ is singular: node voltages are determined only up to an additive constant, and grounding one node (setting its voltage to zero) pins down the rest.
\end{marginfigure}
\item An input-output economic model has three sectors with input matrix
\[
A = \begin{bmatrix} 0.3 & 0.2 & 0.1 \\ 0.4 & 0.3 & 0.2 \\ 0.2 & 0.3 & 0.4 \end{bmatrix}
\]
where $a_{ij}$ represents the amount of sector $i$'s output needed to produce one unit of sector $j$'s output. Given demand $\vect{d}=(100,150,200)^T$, find the production levels $\vect{x}$ needed to meet this demand by solving $(I-A)\vect{x}=\vect{d}$.
\begin{marginfigure}
{\em Note:} In economic models, $A$ typically has nonnegative entries with column sums less than 1.
\end{marginfigure}
\item A chemical reactor has three species $A$, $B$, and $C$ that interconvert according to first-order kinetics. The rate matrix is
\[
K = \begin{bmatrix} -2 & 1 & 1 \\ 1 & -2 & 1 \\ 1 & 1 & -2 \end{bmatrix}
\]
If initial concentrations are $\vect{c}_0=(1,0,0)^T$, find steady-state concentrations by solving $K\vect{c}=\vect{0}$ subject to mass conservation $\sum_i c_i = 1$.
\begin{marginfigure}
{\em Note:} Rate matrices have column sums of zero, so that total mass is conserved by the kinetics.
\end{marginfigure}
% Basic theoretical problems and proofs
\item Let $A$ and $B$ be $n \times n$ matrices and suppose $AB = I$. Prove that $A$ and $B$ are invertible and that $B = A^{-1}$.
\item Prove that any $n$-by-$n$ permutation matrix $P$ is a root of the identity: $P^k=I$ for some positive integer $k$. Is $k=n$ always sufficient? Is a power smaller than $n$ ever sufficient?
\item Let $N$ denote a $k$-by-$k$ matrix that is all zeros except for $+1$ on the superdiagonal: that is, $N_{i,j}=1$ for $j=i+1$ and $0$ elsewhere. Demonstrate that $N^p$ is nonzero for $p<k$, yet $N^k=0$.
\item The \style{growth factor} of an elimination measures how large the entries become during the process, relative to the largest entry of the original matrix. Consider the matrix $A$ from the pivoting exercise above, with its $(1,1)$ entry $0.001$ replaced by a small parameter $\epsilon>0$. Find a permutation $P$ such that $PA$ has growth factor approximately 1, and explain why this demonstrates the importance of pivoting for numerical stability.
\item Consider a matrix $A$ with nonzero diagonal entries but much larger super-diagonal entries: $|a_{i,i+1}| \gg |a_{ii}|$ for $i=1,\ldots,n-1$. Explain why computing an LU decomposition without pivoting is likely to be unstable, how a cyclic permutation moving the first row to the bottom might help, and how the growth factor concept explains this phenomenon.
\item Let $A$ be an $n$-by-$n$ matrix with entries of magnitude at most 1. Prove that there exists a permutation matrix $P$ such that all pivots in the LU decomposition of $PA$ have magnitude at least $1/n!$. Is this bound sharp?
\begin{marginfigure}
{\em Nota bene:} This shows that some pivoting strategy can always ensure pivots don't become too small, though finding the optimal permutation is generally intractable.
\end{marginfigure}
% Advanced applications and theoretical synthesis
\item Consider a mass-spring system with two masses $m_1$ and $m_2$ connected by springs with constants $k_1$, $k_2$, and $k_3$:
\[
\begin{tikzcd}[column sep=2em]
\text{wall} \arrow[r, "k_1", no head] & m_1 \arrow[r, "k_2", no head] & m_2 \arrow[r, "k_3", no head] & \text{wall}
\end{tikzcd}
\]
Show that finding the equilibrium positions requires solving a system $A\vect{x}=\vect{b}$ where $A$ is symmetric and tridiagonal. What physical principle explains the symmetry?
\item Recall that in network flow problems, the \style{incidence matrix} $A$ has entries $a_{ij}=1$ if edge $j$ enters node $i$, $a_{ij}=-1$ if edge $j$ leaves node $i$, and $a_{ij}=0$ otherwise. Prove that for any connected graph with $n$ nodes, $\rank(A)=n-1$. How does this relate to conservation of flow?
\item A real matrix $A$ is called \style{totally positive} if all its minors (determinants of square submatrices) are positive. Show that if $A$ is totally positive, then its LU decomposition exists without need for pivoting. What does this imply about numerical stability?
\begin{marginfigure}
{\em BONUS!} Totally positive matrices appear naturally in approximation theory and statistics, where their special properties prove invaluable.
\end{marginfigure}
\item The \style{Cayley transform} of a matrix $A$ is defined as $C=(I+A)(I-A)^{-1}$ when $(I-A)$ is invertible. Show that if $A$ is block diagonal, then $C$ is block diagonal with blocks being the Cayley transforms of the diagonal blocks of $A$. What advantage might this offer for computation?
\item Consider solving a system of equations $A\vect{x}=\vect{b}$ where $A$ is \style{strictly diagonally dominant}: $|a_{ii}| > \sum_{j\neq i} |a_{ij}|$ for all $i$. Prove that $A$ is nonsingular and that no row exchanges are needed in Gaussian elimination.
\item A web page ranking algorithm assigns importance scores $\vect{x}$ to $n$ pages based on the link matrix $L$ where $L_{ij}=1$ if page $j$ links to page $i$ and $0$ otherwise. After normalizing columns of $L$ to sum to 1, scores are updated by solving
\[
\vect{x} = \alpha L\vect{x} + (1-\alpha)\vect{1}/n
\]
where $\alpha=0.85$ and $\vect{1}$ is the vector of all ones. Show this system always has a unique solution. Why is this important for web search?
\end{enumerate}
\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Abstract Vector Spaces}
\label{ch:2}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``in fear \& pale dismay He saw the indefinite space beneath''}

\newthought{The leap from concrete to abstract} marks the first great challenge in this text. Having worked extensively with vectors as ordered lists of numbers -- whether forces, velocities, or data -- we now step back and ask a deeper question: what \textit{is} a vector? What essential features make something vector-like?

This abstraction is no mere academic exercise. The vectors that arise in modern engineering often transcend simple coordinate lists. A vector might represent a time-varying signal, a high-dimensional dataset, an image, a polynomial, or a probability distribution. The operations we perform on these vectors -- addition, scaling, dot products -- echo those familiar from Euclidean geometry, yet are abstracted away from geometry into pure form.

Consider a collection of audio samples forming a sound wave, or pixel intensities comprising a digital image. We can add two such objects (mixing sounds or blending images) and scale them (changing volume or brightness). These operations satisfy the same algebraic rules as vector addition and scalar multiplication in $\R^n$. Yet these objects are far from geometric arrows in space. They are \style{vectors} in a more general sense -- elements of a \style{vector space}.
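To make this concrete (a small sketch; digital signals reappear in earnest in the gallery of examples below), treat an audio clip simply as a finite list of $N$ samples. If $\vect{f}=(f_1,\ldots,f_N)$ and $\vect{g}=(g_1,\ldots,g_N)$ are two recordings sampled at the same rate, then
\[
\vect{f}+\vect{g} = (f_1+g_1,\ldots,f_N+g_N)
\qquad\text{and}\qquad
c\,\vect{f} = (cf_1,\ldots,cf_N)
\]
are again lists of the same kind: the first mixes the two sounds, the second adjusts the volume by a factor of $c$.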
\begin{marginfigure}
\centering
\includegraphics[width=1.1in]{ouroboros.png}
\end{marginfigure}
\begin{marginfigure}
{\em Hmmmm...} A vector is an element of a vector space, a space comprised of vectors. {\em There must be more than this...}
\end{marginfigure}

The power of this abstraction lies in its ability to unify disparate contexts under a common framework. Whether working with solutions to differential equations, quantum states in physics, or feature embeddings in machine learning, the fundamental properties of vector spaces guide our analysis and computation. By understanding these properties in their abstract form, we gain tools applicable across the landscape of modern engineering.

Our task is to build this abstraction carefully, maintaining always a connection to the concrete. We begin with the axiomatic definition of a vector space, using familiar examples to illuminate the essential features. This foundation will support our subsequent study of transformations, inner products, and the deeper structures that enable modern computational methods.

% ==============================================
\section{Vector Space Axioms}
\label{sec:axioms}
% ==============================================

\begin{definition}[Vector Space]
\label{def:vectorspace}
A \style{vector space} consists of two ingredients: a collection $V$ of objects (called \style{vectors}) and a field of \style{scalars} (for our purposes, always the real numbers $\R$). These are bound together by two fundamental operations:
\begin{enumerate}
\item Vector addition: a rule for combining any two vectors $\vect{u},\vect{v} \in V$ to obtain a new vector $\vect{u}+\vect{v} \in V$
\item Scalar multiplication: a rule for scaling any vector $\vect{v} \in V$ by a real number $c \in \R$ to obtain a new vector $c\vect{v} \in V$
\end{enumerate}
These operations must satisfy certain rules -- the \style{vector space axioms}. For all vectors $\vect{u},\vect{v},\vect{w} \in V$ and all scalars $a,b \in \R$:
\begin{marginfigure}
Not all collections of objects with addition and scaling qualify as vector spaces. The axioms ensure that vectors combine and scale in ways that preserve the essential character of ``vector-ness.''
\end{marginfigure}

\smallskip
\textit{Vector Addition Axioms:}
\begin{enumerate}
\item Commutativity: $\vect{u} + \vect{v} = \vect{v} + \vect{u}$
\item Associativity: $(\vect{u} + \vect{v}) + \vect{w} = \vect{u} + (\vect{v} + \vect{w})$
\item Zero vector: There exists a vector $\vect{0} \in V$ such that $\vect{v} + \vect{0} = \vect{v}$ for all $\vect{v} \in V$
\item Additive inverses: For each $\vect{v} \in V$, there exists a vector $-\vect{v} \in V$ such that $\vect{v} + (-\vect{v}) = \vect{0}$
\end{enumerate}

\smallskip
\textit{Scalar Multiplication Axioms:}
\begin{enumerate}
\item Distributivity over vector addition: $a(\vect{u} + \vect{v}) = a\vect{u} + a\vect{v}$
\item Distributivity over scalar addition: $(a+b)\vect{v} = a\vect{v} + b\vect{v}$
\item Associativity with scalars: $a(b\vect{v}) = (ab)\vect{v}$
\item Unity: $1\vect{v} = \vect{v}$
\end{enumerate}
\end{definition}

These axioms may seem pedantic -- they certainly hold for the familiar vectors in $\R^n$. Their importance emerges when considering more exotic spaces whose objects do not {\em look} like vectors, such as:
\begin{marginfigure}
These examples illustrate how far we can stretch our notion of what a ``vector'' is while maintaining the essential algebraic structure. See the next section for some initial details.
\end{marginfigure}
\begin{itemize}
\item continuous $\R$-valued functions under pointwise addition and rescaling
\item polynomials, under addition and rescaling
\item Taylor or Fourier series, under termwise addition and rescaling
\item solutions to linear homogeneous ODEs or recurrence relations
\end{itemize}

The power of these axioms lies not in their individual statements but in their collective implication: anything satisfying these rules inherits the fundamental properties of vectors. This means that techniques developed for one vector space often translate seamlessly to others. A method for solving systems of linear equations in $\R^n$ might, with minimal modification, solve systems of linear differential equations or find optimal coefficients in a signal processing filter.
\begin{marginfigure}
Vector spaces are not just collections of vectors -- they are collections of vectors that are rightly structured under addition and scaling.
\end{marginfigure}

The axioms also tell us what \textit{is not} a vector space. The positive real numbers under operations of min (addition) and max (multiplication) fail most of the axioms (are any satisfied?). The integers under ordinary addition and multiplication fail because scalar multiplication does not always yield an integer. Such counterexamples help sharpen our understanding of what makes a vector space work.

Finally, the axiomatic approach allows us to prove results that hold for {\em all} vector spaces, saving the trouble of verifying things one example at a time. For example, the following certainly {\em seems} obvious in Euclidean space, but it is less clear that it holds in all possible worlds.

\begin{lemma}
\label{lem:zero}
In a vector space $V$, the zero vector is unique.
\end{lemma}

\begin{proof}
Assume that $\vect{z}$ and $\vect{z}'$ are vectors in $V$ which satisfy the zero-property. Then:
\[
\vect{z} = \vect{z}+\vect{z}' = \vect{z}' ,
\]
each equality following from the fact that both $\vect{z}$ and $\vect{z}'$ do nothing when added to any vector. Thus, they are the same vector.
\end{proof}

% ==============================================
\section{A Gallery of Vector Spaces}
\label{sec:gallery}
% ==============================================

An abstract definition takes on life through examples. Each of the following illustrates how the vector space axioms manifest in different contexts, from the familiar to the exotic. Though we shall not verify the axioms explicitly for each (a tedious if straightforward exercise), we shall identify the key components: the vectors themselves, the operations of addition and scaling, and the zero vector.

\begin{example}[Euclidean space]
\label{ex:euclidean}
\begin{marginfigure}
Though we live in a seemingly three-dimensional world, the configuration spaces of mechanical systems routinely have higher dimension. A robotic arm with multiple rotation joints evolves in a state space of dimension greater than three.
\end{marginfigure}
The space $\R^n$ of ordered $n$-tuples of real numbers is our prototype. Here, vectors are ordered lists of real numbers, acted upon by the familiar operations of componentwise addition and scalar multiplication. The zero vector is the tuple of all zeros. This is the space in which classical physics and engineering operate, where $n=2$ or $3$ corresponds to physical space.
\end{example}

\begin{example}[Matrices]
\label{ex:matrices}
The collection $\R^{m\times n}$ of all $m$-by-$n$ matrices forms a vector space under entry-by-entry addition and scalar multiplication. The zero matrix $Z$ is the ``zero'' of $\R^{m\times n}$.
Matrix spaces are ubiquitous in engineering, from the transformation matrices of computer graphics to the weight matrices of neural networks. The operations here echo those of $\R^n$, though the objects themselves are more structured.
%
\begin{marginfigure}
The space of $2\times 2$ matrices is four-dimensional, though this is not immediately obvious from its appearance. This theme -- that dimension can hide in plain sight -- will recur.
\end{marginfigure}
\end{example}

\begin{example}[Polynomials]
\label{ex:poly}
For each nonnegative integer $n$, we have the space $\poly_n$ of polynomials of degree at most $n$. A typical element has the form $p(x) = a_nx^n + a_{n-1}x^{n-1} + \cdots + a_1x + a_0$. Addition of polynomials and multiplication by scalars operate on the coefficients in the natural way. The zero polynomial, having all coefficients equal to zero, serves as the zero vector. These spaces serve as approximations to more complex functions and appear throughout signal processing and control theory.
\end{example}

\begin{example}[Function spaces]
\label{ex:functionspaces}
\begin{marginfigure}
{\em Foreshadowing:} this is our first example of an infinite-dimensional vector space. The jump from finite to infinite dimensions is profound and harbors surprises that will shape our understanding of convergence and approximation.
\end{marginfigure}
Consider the space $C([a,b])$ of continuous real-valued functions on an interval $[a,b]$, with addition and scalar multiplication defined pointwise. The zero function $z(x)=0$ serves as the zero vector, since $f+z=f$ for all $f$. This space contains all the polynomials $\poly_n$ (with restricted domain) and serves as a model for signal spaces in engineering.

One uses $C(D)$ to denote the vector space of scalar fields $f:D\to\R$ on a domain $D$. For functions that have some differentiability (both helpful and familiar from calculus), the following notations for scalar fields are standard:
\begin{itemize}
\item $C(D)$ : continuous
\item $C^1(D)$ : continuously differentiable
\item $C^\infty(D)$ : infinitely differentiable or \style{smooth}
\item $C^\omega(D)$ : real-analytic
\begin{marginfigure}
{\em Recall:} \style{Real-analytic} means that the Taylor series at each point converges to the function on a neighborhood of that point.
\end{marginfigure}
\end{itemize}
\end{example}

\begin{example}[Linear ODEs]
\label{ex:ODE}
The solutions to a linear homogeneous differential equation form a vector space. The vectors here are functions $x(t)$ satisfying the equation $p(D)x=0$ for $p(D)$ a polynomial in the differential operator $D$. The operations of addition and scalar multiplication act pointwise on these solutions $x(t)$, and the constant function $x=0$ is the zero vector.
\begin{marginfigure}
{\em Nota bene:} Similar spaces arise from linear recurrence relations and difference equations.
\end{marginfigure}
For example, a second-order linear ODE of the form
\[
a\frac{d^2x}{dt^2}+b\frac{dx}{dt}+cx = 0
\]
has solutions $x(t)$ which are closed under linear combination and thus form a vector space. This ODE can be written using the differential operator $D=d/dt$ as $(aD^2+bD+cI)x=0$.
\end{example}

\begin{example}[Digital signals]
\label{ex:digital}
Digital signals -- whether audio, image, or general data streams -- form vector spaces of their own. An audio signal might be represented as a sequence of samples or as a function of continuous time. Addition corresponds to mixing of signals; scalar multiplication adjusts amplitude. Silence plays the role of the zero vector.
For images, the vectors are two-dimensional arrays of pixel intensities. These spaces support the linear operations fundamental to signal processing and machine learning.
\end{example}

\begin{example}[Sequences \& Series]
\label{ex:series}
Consider the set of formal power series in a variable $x$, as familiar from single-variable calculus. Ignoring convergence, we may regard such power series as vectors. Given two such series, we can add them termwise (by powers); rescaling happens at the level of coefficients. The zero series ($c_k=0$ for all $k$) is the zero vector.
\begin{marginfigure}
Recall that power series are of the form
\[
f = \sum_{k=0}^\infty c_k x^k.
\]
Would the {\em convergent} power series form a vector space? Does absolute versus conditional convergence matter?
\end{marginfigure}
There is likewise a vector space structure on the set of sequences. Consider $a=(a_k)$ for $k\in\N$. One can add such sequences termwise, and rescaling the sequence means rescaling each term. The zero-sequence plays the role of the zero-vector. Interestingly, this vector space ``feels'' like the same vector space as that of power series, though they look different.
\begin{marginfigure}
{\em Foreshadowing:} the notion of sameness or equivalence that is natural in vector spaces (and the rest of mathematics) is called \style{isomorphism}. The vector spaces of sequences and series are \style{isomorphic}.
\end{marginfigure}
\end{example}

These examples, though distinct in character, share the essential features codified in the vector space axioms. Their variety suggests the power of the abstraction: techniques developed for one vector space often transfer seamlessly to others. As we proceed to study subspaces, linear independence, and bases, these examples will serve as touchstones, grounding abstract concepts in concrete settings.

% ==============================================
\section{Subspaces}
\label{sec:subspaces}
% ==============================================

\begin{definition}[Subspace]
\label{def:subspace}
A \style{subspace} of a vector space $V$ is a subset $W\subseteq V$ that is itself a vector space under the operations inherited from $V$. We use the notation $W<V$ to indicate that $W$ is a subspace of $V$.
\end{definition}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 2}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}
\item Let $U$ and $W$ be subspaces of $\R^3$. Prove that if $\dim U + \dim W > 3$, then $U$ and $W$ cannot form a direct sum (that is, they must have nonzero intersection).
\item Let $U$ and $W$ be subspaces of a vector space $V$ such that $V = U\directsum W$. If $\vect{v}\in V$, prove that the expression $\vect{v} = \vect{u} + \vect{w}$ with $\vect{u}\in U$ and $\vect{w}\in W$ must be unique.
\item Let $U_1,U_2,U_3$ be subspaces of a vector space $V$. Prove that if $V = U_1\directsum U_2\directsum U_3$, then any vector $\vect{v}\in V$ can be written uniquely as $\vect{v} = \vect{u}_1 + \vect{u}_2 + \vect{u}_3$ with $\vect{u}_i\in U_i$.
\item Let $U$ and $W$ be finite-dimensional subspaces of a vector space $V$. Prove that
\[
\dim(U + W) = \dim U + \dim W - \dim(U\cap W)
\]
\end{enumerate}
\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Linear Transformations}
\label{ch:3}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``he became what he beheld; he became what he was doing; he was himself transform'd''}

\newthought{The essence of Mathematics} lies not in objects but in transformations between them.
The vectors and spaces we have thus far studied come alive only when acted upon -- rotated, scaled, projected, or otherwise transformed. Such transformations are the verbs to our nouns, the operations that animate our mathematical universe. Linear transformations are those which preserve the fundamental operations of vector spaces: addition and scaling. This seemingly modest requirement -- that our transformations respect vector space structure -- leads to a remarkably rich theory with profound practical implications.

Vector spaces, in isolation, are static collections. The power of linear algebra emerges when we consider mappings which morph input signals into output responses, configurations into forces, and high-dimensional data into low-dimensional representations. Such mappings, when linear, possess a beautiful structure that both illuminates their theoretical properties and enables their practical computation.

Our journey begins with familiar matrix transformations before ascending to more abstract heights. The operations of differentiation and integration, though far from geometric, share deep structural features with their matrix cousins. This abstraction reveals four fundamental spaces associated with any linear transformation -- the kernel and image that capture what vanishes and what is attained, the coimage and cokernel that measure efficiency and defect. These spaces, seemingly distinct, are bound together by the Fundamental Theorem of Linear Algebra, a result that unifies the algebraic, geometric, and dimensional aspects of linear transformations into a single coherent picture.

% ==============================================
\section{Euclidean Transformations}
\label{sec:euclidean}
% ==============================================

Our story begins in familiar territory -- with matrices acting on vectors in Euclidean space. From multivariable calculus, we recall how multiplication by a matrix $A$ transforms vectors in $\R^n$, taking input vector $\vect{x}$ to output $A\vect{x}$. Though we performed such operations mechanically, computing products row-by-column, these transformations have rich geometric content worth savoring before abstraction.

The simplest such transformations scale space uniformly. A matrix $cI$ multiplies each coordinate by the scalar $c$, dilating or contracting space about the origin. More interesting are matrices that scale different directions differently:
\[
\begin{bmatrix} 2 & 0 \\ 0 & 1/2 \end{bmatrix}
\begin{pmatrix} x \\ y \end{pmatrix}
=
\begin{pmatrix} 2x \\ y/2 \end{pmatrix}
\]
Such transformations stretch space along one axis while compressing along another -- like a funhouse mirror's distortion rendered precise in coordinates.

Rotations in the plane arise from matrices of the form
\[
\begin{bmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{bmatrix}
\]
spinning vectors through angle $\theta$ counterclockwise about the origin. That such matrices preserve lengths and angles is a consequence of their structure -- see Chapter \ref{ch:5}.

More subtle are \style{shear transformations}, such as
\[
\begin{bmatrix} 1 & h \\ 0 & 1 \end{bmatrix}
\]
\begin{marginfigure}
{\em Question:} What happens with the transpose of this horizontal shear?
\end{marginfigure}
which offset each horizontal line by an amount proportional to its height. These preserve area while tilting vertical lines -- like a deck of cards carefully slid across a table.
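As a quick concrete check (a small computation, not tied to any particular application), track where this shear sends the two standard basis vectors:
\[
\begin{bmatrix} 1 & h \\ 0 & 1 \end{bmatrix}
\begin{pmatrix} 1 \\ 0 \end{pmatrix}
=
\begin{pmatrix} 1 \\ 0 \end{pmatrix} ,
\qquad
\begin{bmatrix} 1 & h \\ 0 & 1 \end{bmatrix}
\begin{pmatrix} 0 \\ 1 \end{pmatrix}
=
\begin{pmatrix} h \\ 1 \end{pmatrix} .
\]
The base of the unit square stays put while its top edge slides over by $h$; the resulting parallelogram still has area $1$, consistent with the determinant $1\cdot 1 - h\cdot 0 = 1$.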
These elementary transformations -- scaling, rotation, and shear -- combine to generate all linear transformations in the plane. Any $2\times 2$ matrix can be understood as a composition of such basic geometric operations. This decomposition previews deeper structure to come, when we learn to factor matrices into simpler constituent parts. What features do these transformations share, beyond their realization through matrix multiplication? First, they preserve the origin -- the zero vector remains fixed. Second, they respect vector addition: the image of a sum equals the sum of the images. Third, they interact naturally with scalar multiplication: doubling an input vector doubles its image. These properties -- seemingly obvious in the matrix setting -- will form the scaffolding for our abstract theory. % \begin{marginfigure} {\em Foreshadowing:} The properties we observe in matrix transformations -- preservation of vector operations -- will define linearity in the abstract setting. \end{marginfigure} Consider as well what these transformations can destroy. A rotation preserves distances but changes coordinates. A shear preserves areas but distorts angles. A scaling changes both distances and areas, but preserves lines through the origin. This selective preservation of geometric features hints at deeper invariants -- quantities or properties that remain unchanged under certain classes of transformations. The matrix transformation $A\vect{x}$ converts geometric intuition about transforming space into algebraic manipulation of coordinates. As we lift these ideas to abstract vector spaces, this interplay between geometry and algebra will remain. Though we may lose the ability to visualize transformations directly, the core ideas -- preservation of vector operations, study of invariants, decomposition into simpler parts -- will guide our development. % ============================================== \section{Definitions \& Implications} \label{sec:definitions} % ============================================== Our experience with Euclidean transformations suggests key features that characterize the essence of linearity: preservation of addition and scaling. Many important transformations share these algebraic properties while lacking obvious geometric interpretation. This motivates abstracting away from geometry to study linear transformations between arbitrary vector spaces. \begin{definition}[Linear Transformation] Let $V$ and $W$ be vector spaces. A \style{linear transformation} $T:V\to W$ is a function satisfying two properties: \begin{enumerate} \item Additivity: $T(\vect{v_1}+\vect{v_2}) = T(\vect{v_1})+T(\vect{v_2})$ for all $\vect{v_1},\vect{v_2}\in V$ \item Homogeneity: $T(c\vect{v}) = cT(\vect{v})$ for all $c\in\R$ and $\vect{v}\in V$ \end{enumerate} \end{definition} These two properties combine to ensure that linear transformations preserve linear combinations. This seemingly simple requirement has profound implications. \begin{lemma} \label{lem:lintran} A linear transformation $T:V\to W$ satisfies: \begin{enumerate} \item $T(\vect{0})=\vect{0}$ \item $T(-\vect{v})=-T(\vect{v})$ for all $\vect{v}\in V$ \item $T$ preserves linear combinations \[ T\left(\sum_{i=1}^nc_i\vect{v_i}\right) = \sum_{i=1}^nc_iT(\vect{v_i}) \] \end{enumerate} \end{lemma} \begin{marginfigure} {\em Foreshadowing:} The preservation of linear combinations will be the key to understanding how linear transformations interact with bases and coordinate systems. 
\end{marginfigure} The preservation of linear combinations has an important consequence for subspaces: linear transformations send subspaces to subspaces. \begin{lemma} \label{lem:subspace} If $T:V\to W$ is linear and $U