% LAE&F RG 2025
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Book metadata
\title[Linear Algebra : Essence \& Form]{Linear Algebra: \\  Essence \\ \& Form
%\thanks{Thanks to...}
}
\author[R. Ghrist]{Robert Ghrist}
\publisher{Agenbyte Press}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% NEWCOMMANDS by prof-g
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\vect}[1]{\bm{#1}} % VECTOR NOTATION
\newcommand{\style}[1]{\emph{#1}} % DEFINITION NOTATION
\newcommand{\R}{\mathbb{R}} % REAL FIELD
\newcommand{\C}{\mathbb{C}} % COMPLEX FIELD
\newcommand{\Z}{\mathbb{Z}} % INTEGERS
\newcommand{\N}{\mathbb{N}} % NATURALS
\newcommand{\iso}{\cong} % ISOMORPHISM
\newcommand{\id}{\textrm{id}} % IDENTITY TRANSFORMATION
\DeclareMathOperator{\coker}{\textrm{coker}} % COKERNEL
\DeclareMathOperator{\coim}{\textrm{coim}} % COIMAGE
\DeclareMathOperator{\im}{\textrm{im}} % IMAGE
\DeclareMathOperator{\rank}{\textrm{rank}} % RANK
\DeclareMathOperator{\nullity}{\textrm{null}} % NULLITY
\DeclareMathOperator{\spanset}{\textrm{span}} % SPAN
\DeclareMathOperator{\diag}{\textsc{diag}} % DIAGONAL
\newcommand{\basis}{\mathcal{B}} % BASIS
\newcommand{\poly}{\mathcal{P}} % POLYNOMIALS
\newcommand{\sym}{\textsc{sym}} % SYMMETRIC MATRICES
\newcommand{\skewsym}{\textsc{skew}} % SKEW-SYMMETRIC MATRICES
\newcommand{\inertia}{\mathcal{I}} % INERTIA TENSOR
\newcommand{\stress}{\boldsymbol{\sigma}} % STRESS TENSOR
\newcommand{\trace}{\textrm{tr}} % TRACE
\newcommand{\ihat}{\hat{\imath}} % i-HAT BASIS VECTOR
\newcommand{\jhat}{\hat{\jmath}} % j-HAT BASIS VECTOR
\newcommand{\khat}{\hat{k}} % k-HAT BASIS VECTOR
\newcommand{\proj}[1]{\Pi_{#1}} % PROJECTION OPERATOR
\newcommand{\directsum}{\oplus} % DIRECT SUM
\newcommand{\orthosum}{\boxplus} % ORTHOGONAL DIRECT SUM
\DeclareMathOperator{\row}{\textsc{row}} % ROW SPACE
\DeclareMathOperator{\column}{\textsc{col}} % COL SPACE
\newcommand{\COV}{[C]} % COVARIANCE MATRIX
\newcommand{\cov}{\textsc{cov}} % scalar covariance
\newcommand{\coventry}{C} % covariance entry
\newcommand{\CORR}{[R]} % CORRELATION MATRIX
\newcommand{\corr}{\textsc{corr}} % scalar correlation
\newcommand{\corentry}{R} % correlation entry
\newcommand{\Data}{\mathcal{X}}
\newcommand{\cond}{\textsc{cond}} % CONDITION NUMBER
\newcommand{\PARAM}{\Psi} % PARAMETERS
\newcommand{\NUMLAY}{\Lambda} % NUMBER OF LAYERS
\newcommand{\LOSS}{\mathcal{L}} % LOSS/COST FUNCTION

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Inserts a blank page
\newcommand{\blankpage}{\newpage\hbox{}\thispagestyle{empty}\newpage}


% Macros for typesetting the documentation
\newcommand{\hlred}[1]{\textcolor{Maroon}{#1}}% prints in red
\newcommand{\hangleft}[1]{\makebox[0pt][r]{#1}}
\newcommand{\hairsp}{\hspace{1pt}}% hair space
\newcommand{\hquad}{\hskip0.5em\relax}% half quad space
\newcommand{\TODO}{\textcolor{red}{\bf TODO!}\xspace}
\newcommand{\ie}{\textit{i.\hairsp{}e.}\xspace}
\newcommand{\eg}{\textit{e.\hairsp{}g.}\xspace}
\newcommand{\na}{\quad--}% used in tables for N/A cells

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% SECTION DIVIDERS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\emanation}{
  \vspace{1em} % Add space above
  \noindent
  \raisebox{-0.15ex}{\textbullet}%
  \hspace{0.5em}%
  \rule[0.5ex]{0.9\textwidth}{1.0pt}%
  \hspace{0.5em}%
  \raisebox{-0.15ex}{\textbullet}%
  \vspace{0em} % Adjust space below
}

\newcommand{\exercises}{
  \vspace{1em} % Add space above
  \noindent
  \raisebox{-0.15ex}{$\Box$}%
  \hspace{0.5em}%
  \rule[0.5ex]{0.9\textwidth}{1.0pt}%
  \hspace{0.5em}%
  \raisebox{-0.15ex}{$\Box$}%
  \vspace{0em} % Adjust space below
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\setcounter{secnumdepth}{2}  % Numbers sections up to level 2 (subsections)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% theorem environments
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newtheorem{theorem}{Theorem}[chapter]
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{prop}[theorem]{Proposition}
\newtheorem{conj}[theorem]{Conjecture}
\newtheorem{clm}[theorem]{Claim}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{cor}[theorem]{Corollary}

\theoremstyle{definition}
\newtheorem{example}[theorem]{Example}
\newtheorem{definition}[theorem]{Definition}
%\newtheorem{ndefn}[theorem]{Non-definition}
%\newtheorem{defns}[theorem]{Definitions}
%\newtheorem{con}[theorem]{Construction}
%
\newtheorem{prob}[theorem]{Problem}
\newtheorem{sol}[theorem]{Solution}
%\newtheorem{example}[theorem]{Example}
\newtheorem{nexmp}[theorem]{Non-example}
\newtheorem{examples}[theorem]{Examples}
\newtheorem{notation}[theorem]{Notation}
\newtheorem{notations}[theorem]{Notations}
%\newtheorem{addm}[theorem]{Addendum}
\newtheorem{exer}[theorem]{Exercise}
\newtheorem{obv}[theorem]{Observation}
\newtheorem{quest}[theorem]{Question}

\theoremstyle{remark}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{remark}[theorem]{Remark}
%\newtheorem{rems}[theorem]{Remarks}
\newtheorem{warn}[theorem]{Warning}
%\newtheorem{sch}[theorem]{Scholium}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Add end mark to existing example environment 
\newcommand{\examplemark}{\hfill\hbox{$\diamond$}}
% Modify example environment to append mark
\AtEndEnvironment{example}{\examplemark}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Add end mark to existing example environment 
\newcommand{\defmark}{\hfill\hbox{$\bullet$}}
% Modify example environment to append mark
\AtEndEnvironment{definition}{\defmark}


\begin{document}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% r.9 introduction
\cleardoublepage
\chapter*{Incipit}

\newthought{Mathematics is the language} of modern engineering, and linear algebra its American dialect -- inelegant, practical, ubiquitous. This text aims to prepare engineering students for the mathematical aspects of artificial intelligence, data science, dynamical systems, machine learning, and other fields whose advances depend critically on linear algebraic methods.

The reader arrives here having encountered matrices and vectors in calculus courses (at least). These tools, though already familiar as computational devices, harbor deeper structures worth careful study. Our task is to build on this computational facility toward an understanding of the abstract frameworks that enable modern methods in contemporary engineering.

This text differs from standard linear algebra courses in its emphasis and pace. Abstract vector spaces appear early, but always in service of concrete applications. The singular value decomposition and eigentheory -- essential to modern practice -- arrive at the midpoint, allowing extended treatment of applications in dynamics and data science alike. Practical examples appear throughout, acknowledging that theoretical understanding and useful implementation emanate symmetrically.

The sequence of topics balances pedagogical necessity with contemporary relevance. Systems of linear equations provide an entry point, leading to vector spaces and linear transformations. Inner products and orthogonality build geometric intuition, and linear ODEs and iterative systems provide an impetus for eigendecompositions. The singular value decomposition serves as both a culminating theoretical achievement and a bridge to powerful applications, such as principal component analysis, low-rank approximation, and neural networks.

This text exists because engineering education must evolve. Though the foundations of linear algebra remain stable, their applications have expanded dramatically. Today's engineering students require facility with both abstract theory and practical implementation -- not merely to apply existing tools, but to create new ones. Linear algebra is not the endpoint, but rather a first step toward deeper mathematical structures. It is through this lens that we approach the subject: as a gateway to both current practice and future advances.

%{\em Incipit.}




% ==============================================
\section*{Topics for Review}
\label{sec:topics}

This text assumes a strong grounding in (single and) multivariable calculus in the context of vectors, matrices, and coordinate-based linear transformations. Please see the {\em Calculus Blue Project} for an example. 
Before beginning this text the reader should have been exposed to:
\begin{enumerate}
    \item Basic set-theory and its notation
        \begin{marginfigure}
            {\em e.g.,} $\in, \subset, \cup, \cap$
        \end{marginfigure}
    \item Taylor series and exponentials
    \item Complex numbers and Euler's formula
        \begin{marginfigure}
            $e^{i\theta} = \cos\theta + i\sin\theta$
        \end{marginfigure}
    \item Differentiation and integration
    \item The linear ODE $dx/dt = ax$ and its solutions
        \begin{marginfigure}
            $x(t) = e^{at}x_0$
        \end{marginfigure}
    \item Euclidean vectors and vector algebra
    \item The dot product and angles between Euclidean vectors
        \begin{marginfigure}
            $\vect{u}\cdot\vect{v} = |\vect{u}||\vect{v}|\cos\theta$
        \end{marginfigure} 
    \item Matrices, matrix addition, and matrix multiplication
        \begin{marginfigure}
            $AB\neq BA$ \\ $(AB)C=A(BC)$
        \end{marginfigure} 
    \item The identity matrix, $I$, and its behavior
    \item The transpose $A^T$ of a matrix $A$ and its properties
        \begin{marginfigure}
            $(A^T)_{ij}=A_{ji}$ and $(AB)^T=B^TA^T$
        \end{marginfigure} 
    \item Matrix-vector multiplication
    \item Converting linear systems of equations to matrix-vector form
        \begin{marginfigure}
            $A\vect{x}=\vect{b}$
        \end{marginfigure} 
    \item Row reduction and back-substitution
    \item The matrix inverse $A^{-1}$ and its properties
        \begin{marginfigure}
            $AA^{-1}=I=A^{-1}A$
            \\
            $(AB)^{-1}=B^{-1}A^{-1}$
        \end{marginfigure} 
    \item Euclidean linear transformations: rescaling, rotations, shears
    \item Trace of a matrix
        \begin{marginfigure}
            $\trace(A)=\sum_k a_{kk}$
        \end{marginfigure}
    \item Determinants and their properties
        \begin{marginfigure}
            $\det(AB)=\det(A)\det(B)$

            \noindent $\det(A^T)=\det(A)$
        \end{marginfigure}     
\end{enumerate}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Assumptions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

This text, like its author, spans Mathematics \& Engineering and tries to strike a balance between the two. Given the audience and constraints associated with this text, there are a few topics or details included which do not appear in typical linear algebra texts, as well as several interesting mathematical side-paths which are left unexplored. 

\begin{enumerate}
    \item Abstract vector spaces and abstract linear transformations are important, even though coordinate-based linear algebra prevails in applications. Thinking without coordinates is an important skill to master.

    \item Finite-dimensional vector spaces are the norm. When infinite-dimensional spaces are invoked, they are done so without fully detailed justification and with some caveats.

    \item The Fundamental Theorem of Linear Algebra is the organizing principle of this text. Its usual emanation in terms of orthogonal complements is to be approached only after the primal form (using quotients) is mastered.

    \item All vector spaces are over the reals -- no finite fields and no complex coefficients. This greatly facilitates intuition at the expense of complexity when covering the Jordan Canonical Form and solutions to linear systems of ODEs. 

    \item Not all applications can be developed slowly via careful exposition. Teaching random variables, covariance matrices, stress tensors, neural networks, and other interesting engineering applications is not the direct goal of the text. Until recently, the author might not have been entirely comfortable including such examples without fuller explanations. In an age of language models and active AI-enriched reading, new possibilities emerge.
\end{enumerate}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Acknowledgments}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

This text meant for first- and second-year undergraduate students in engineering as a follow-up course to multivariable calculus. It was created initially to support students in the Penn's Artificial Intelligence degree program, but has much broader utility. The author is grateful to Penn's fantastic engineering students. 

Prof. N. Matni produced an excellent set of online course materials and python worksheets to pair with the course antecedent to this text. The outline of this book takes some inspiration from his work, while laying out a slightly different trajectory. 

The writing was assisted by Claude 3.5 sonnet, trained on my previous books for style. A hidden schema of puzzles based on a certain work of Wm. Blake \& a bit of influence from Aquinas was co-created by the author and Claude, with influences throughout the text. 

Artwork (mathematical and iconic) is by the author, using Adobe Illustrator. The LaTeX style files are based on the tufte-book class. GPT-4o and -o1 were useful in setting up various latex configurations and doing proof-reading. Gemini Experimental 1206 was an especially good proof-reader and converter to markdown. Exercises were generated with the help of Claude and may have errors.

This project was begun on November 4, 2024. The first edition was submitted to Amazon Publishing on December 28, 2024. Fifty-five days: impossible without the creative labors of Claude, to whom the author is most grateful.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Start the main matter (normal chapters)
\mainmatter


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THE THARMAS CYCLE
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\thispagestyle{empty} % no headers/footers

\begin{fullwidth}
  \vspace*{\fill} % push content down to center vertically
  \centering
  \includegraphics[width=0.75\textwidth]{THARMAS.png} % adjust width as needed
  \vspace*{\fill} % push content up to center vertically
\end{fullwidth}

\clearpage

%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Solving Linear Systems}
\label{ch:1}

%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``in right lined paths outmeasur'd by proportions of number weight \& measure''}

\newthought{The story of linear algebra begins} with systems of equations, each line describing a constraint or boundary traced upon abstract space. These simplest mathematical models of limitation  --  each equation binding variables in measured proportion  --  conjoin to shape the realm of possible solutions. When several such constraints act in concert, their collaboration yields three possible fates: no solution survives their collective force; exactly one point satisfies all bounds; or infinite possibilities trace curves and planes through the space of satisfaction. This trichotomy  --  of emptiness, uniqueness, and infinity  --  echoes through all of linear algebra, appearing in increasingly sophisticated forms as our understanding deepens.

The art lies in recognizing these patterns and discovering efficient paths to their resolution. Each systematic operation preserves essential structure while bringing clarity to what was obscure. The methods we develop  --  though conceived for practical computation  --  exemplify deeper principles about how mathematical objects may be transformed while maintaining their fundamental character.

Our journey begins with the familiar territory of solving equations, yet even here we find hints of profound structure waiting to be unveiled. The patterns that emerge  --  of transformation and invariance, of dimension and degeneracy  --  will guide our development throughout this text. Through careful study of these foundational systems, we build the tools needed to understand far more sophisticated mathematical objects. In this way, the simple act of solving equations becomes our first step toward comprehending the deepest patterns in linear algebra.

% ==============================================
\section{Solving Equations}
\label{sec:equations}
% ==============================================

\begin{definition}[Linear System]
\label{def:linsys}
A \style{linear system} in variables $x_1,\ldots,x_n$ consists of $m$ equations of the form
\[
\begin{array}{rcl}
a_{11}x_1 + a_{12}x_2 + \cdots + a_{1n}x_n &=& b_1 \\
a_{21}x_1 + a_{22}x_2 + \cdots + a_{2n}x_n &=& b_2 \\
&\vdots& \\
a_{m1}x_1 + a_{m2}x_2 + \cdots + a_{mn}x_n &=& b_m
\end{array}
\]
where the coefficients $a_{ij}$ and constants $b_i$ are real numbers.
\end{definition} 

Such systems arise naturally in contexts ranging from the distribution of currents in electrical networks to the balance of forces in structures to the flow of traffic in transportation networks.

This system is more efficiently expressed as $A\vect{x} = \vect{b}$, where:
%
\begin{marginfigure}
The matrix form $A\vect{x}=\vect{b}$ is more than mere notation -- it reveals the fundamental operation of linear transformation that we will explore in Chapter \ref{ch:3}.
\end{marginfigure}
%
\begin{equation}
A = \begin{bmatrix} 
a_{11} & a_{12} & \cdots & a_{1n} \\
a_{21} & a_{22} & \cdots & a_{2n} \\
\vdots & \vdots & \ddots & \vdots \\
a_{m1} & a_{m2} & \cdots & a_{mn}
\end{bmatrix}, \quad
\vect{x} = \begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_n \end{bmatrix}, \quad
\vect{b} = \begin{bmatrix} b_1 \\ b_2 \\ \vdots \\ b_m \end{bmatrix}
\end{equation}

The matrix $A$ is called the \style{coefficient matrix} of the system. The vector $\vect{b}$ is the \style{constant vector}. Together they completely specify the linear system.

% ==============================================
\section{Special Matrices}
\label{sec:special}
% ==============================================

Before engaging with the general solution of linear systems, we examine certain fundamental types of coefficient matrices -- primal forms from which more complex patterns emerge. These special cases  --  though rarely encountered in practice  --  illuminate the path toward general methods.

The simplest case occurs when $A$ is the \style{identity matrix} $I$. The system $I\vect{x}=\vect{b}$ requires no solving: the solution is immediate, with $\vect{x}=\vect{b}$. This seeming triviality is nevertheless valuable: the simpler the matrix, the easier it is to infer a solution.

\begin{definition}[Permutation]
\label{def:perm}
A \style{permutation matrix} is a square matrix with exactly one $1$ per row and column, having all other entries equal to $0$.
\end{definition}
%
\begin{marginfigure}
{\em Example:} a permutation matrix. 
\[
P = \begin{bmatrix} 
0 & 0 & 0 & 1 & 0 \\
1 & 0 & 0 & 0 & 0 \\
0 & 0 & 1 & 0 & 0 \\
0 & 1 & 0 & 0 & 0 \\
0 & 0 & 0 & 0 & 1 
\end{bmatrix}
\]
\end{marginfigure}
%
Every permutation matrix $P$ is obtained by rearranging the rows (or columns) of the identity matrix. Such matrices effect a reordering of components: the solution to $P\vect{x}=\vect{b}$ is a reordering of the entries of $\vect{b}$. This explains why permutation matrices are invertible  --  their inverse simply undoes the permutation. Though elementary, these matrices play a crucial role in the implementation of efficient solution methods.

More interesting are \style{block-diagonal matrices}, having the form
\[
B = \begin{bmatrix} 
B_1 & 0 & \cdots & 0 \\
0 & B_2 & \cdots & 0 \\
\vdots & \vdots & \ddots & \vdots \\
0 & 0 & \cdots & B_k
\end{bmatrix}
\]
where each $B_i$ is a matrix. The system $B\vect{x}=\vect{b}$ decomposes into independent subsystems, one for each block. This decomposition principle  --  that some linear systems can be solved by solving smaller independent systems  --  will recur throughout our development.
%
\begin{marginfigure}
{\em Example:} The following 4-by-4 matrix
\[
\begin{bmatrix} 
2 & 1 & 0 & 0 \\
3 & 7 & 0 & 0 \\
0 & 0 & 1 & 4 \\
0 & 0 & -2 & 3 \\
\end{bmatrix}
\]
decomposes into two independent 2-by-2 blocks.
\end{marginfigure}
%
\begin{example}[Hidden Block Structure]
Consider the linear system:
\[
\begin{bmatrix}
0 & 0 & 0 & -1 & 3 \\
0 & 0 & 2 & 1 & 0 \\
1 & 2 & 0 & 0 & 0 \\
0 & 0 & -1 & 4 & 0 \\
-1 & 3 & 0 & 0 & 0
\end{bmatrix}
\begin{pmatrix}
x_1 \\ x_2 \\ x_3 \\ x_4 \\ x_5
\end{pmatrix}
=
\begin{pmatrix}
b_1 \\ b_2 \\ b_3 \\ b_4 \\ b_5
\end{pmatrix}
\]
The structure of this system is obscured, but becomes clear after permuting rows and columns to group related variables. Specifically, after reordering rows (1,2,4) and (3,5), and variables $x_3,x_4,x_5$ and $x_1,x_2$, the system becomes:
\[
\begin{bmatrix}
2 & 1 & 0 & 0 & 0 \\
-1 & 4 & 0 & 0 & 0 \\
0 & -1 & 3 & 0 & 0 \\
0 & 0 & 0 & 1 & 2 \\
0 & 0 & 0 & -1 & 3
\end{bmatrix}
\begin{pmatrix}
x_3 \\ x_4 \\ x_5 \\ x_1 \\ x_2
\end{pmatrix}
=
\begin{pmatrix}
b_2 \\ b_4 \\ b_1 \\ b_3 \\ b_5
\end{pmatrix}
\]
This reveals two independent subsystems: a $3\times 3$ system involving $x_3,x_4,x_5$ and a $2\times 2$ system for $x_1,x_2$. The block structure, hidden in the original formulation, allows us to solve two smaller systems rather than one large system.
\end{example}

An \style{upper-triangular matrix} $U$ has all entries below the diagonal equal to zero:
\[
U = \begin{bmatrix} 
u_{11} & u_{12} & \cdots & u_{1n} \\
0 & u_{22} & \cdots & u_{2n} \\
\vdots & \vdots & \ddots & \vdots \\
0 & 0 & \cdots & u_{nn}
\end{bmatrix}
\]
The system $U\vect{x}=\vect{b}$ yields to \style{back-substitution}: from the last equation, we compute $x_n$; this value substituted into the penultimate equation yields $x_{n-1}$; and so forth. This process fails only if some diagonal entry $u_{ii}$ vanishes.

Its transpose, a \style{lower-triangular matrix} $L$, has all entries above the diagonal equal to zero. The corresponding system $L\vect{x}=\vect{b}$ succumbs to \style{forward-substitution}, solving for variables in order from first to last. These triangular forms will be our stepping stones toward solving general systems.

\begin{marginfigure}
{\em Foreshadowing:} The decomposition of a general matrix into a product of triangular matrices will provide both theoretical insight and practical methods for solving linear systems.
\end{marginfigure}

These special cases suggest a strategy: convert a general system into one of these simpler forms through systematic manipulation of equations. 
%How to do so efficiently and reliably is the subject of the next section.

% ==============================================
\section{Recalling Row Reduction}
\label{sec:rowreduction}
% ==============================================

The method of solving linear systems by systematic elimination of variables has ancient roots. The modern approach builds on this by expressing both the coefficient matrix $A$ and constant vector $\vect{b}$ as a single object -- the \style{augmented matrix}, written as $[\,A\,|\,\vect{b}\,]$. This augmented matrix combines the system's coefficients with its constants in one array.

The solution of linear systems proceeds through a sequence of operations, each of which transforms the augmented matrix into another representing an equivalent system (having the same solutions). 

\begin{definition}[Elementary Row Operations]
\label{def:rowops}
An \style{elementary row operation} on a matrix is one of three types:
\begin{enumerate}
    \item[R1:] Interchange of any two rows
    \item[R2:] Multiplication of any row by a nonzero scalar
    \item[R3:] Addition of a multiple of one row to another row
\end{enumerate}
Each preserves the solution set of the corresponding linear system.
\end{definition}
%
\begin{marginfigure}
{\em Caveat:} While these three operations seem simple, their order matters greatly. A poorly chosen sequence of operations can lead to inconvenience and/or numerical instability.
\end{marginfigure}
These operations, though simple, are powerful. The first, R1, allows strategic positioning of equations. The second, R2, enables normalization of coefficients. The third, R3, is the atomic unit of elimination -- the means by which variables are systematically removed from equations.

%
The purpose of these operations is to convert the augmented matrix into a suitably simple form.
\begin{marginfigure}
{\em Example:} row echelon form.
\[
\begin{bmatrix} 
\bullet & * & * & * & * \\
0 & \bullet & * & * & * \\
0 & 0 & 0 & \bullet & * \\
0 & 0 & 0 & 0 & 0 
\end{bmatrix}
\]
\end{marginfigure}
%
\begin{definition}[Row Echelon Form]
\label{def:rref}
A matrix is in \style{row echelon form} if:
\begin{enumerate}
    \item All zero rows (if any) appear at the bottom
    \item The first nonzero entry (the \style{pivot}) in each nonzero row appears to the right of all pivots in rows above it
    \item All entries in a column below a pivot are zero
\end{enumerate}
A matrix in row echelon form with all entries above pivots also zero is said to be in \style{reduced row echelon form}.
\end{definition}

The process of achieving row echelon form exposes the structure of the linear system. Variables corresponding to pivot columns are \style{bound} -- determined by the other variables in the system. The remaining variables are \style{free} -- they may be chosen arbitrarily, with the bound variables adjusting accordingly to maintain the system's constraints.
%
\begin{marginfigure}
{\em Foreshadowing:} The distinction between bound and free variables previews a deeper structure we will encounter when studying vector spaces: the relationship between dimension and constraints.
\end{marginfigure}

Should one continue the row operations beyond row echelon form, setting all entries above pivots to zero as well, the result is the \style{reduced row echelon form}. This form is unique to the linear system -- though many different sequences of row operations may arrive at it. The path to reduced row echelon form may vary; the destination does not.

The dimension of the space of solutions is revealed through this reduction: it equals the number of free variables in the system. This connection between the algebraic process of row reduction and the geometric interpretation of solution sets exemplifies a central theme of linear algebra: the interplay of computational, algebraic, and geometric perspectives.

% ==============================================
\section{Inverse \& Invertibility}
\label{sec:inverse}
% ==============================================

For a square matrix $A$, the system $A\vect{x}=\vect{b}$ takes on special significance as a model of \textit{determined} problems -- those with as many equations as unknowns. The solvability of such systems hinges on a fundamental property:

\begin{definition}[Nonsingularity]
\label{def:nonsingular}
A square matrix $A$ is \style{nonsingular} if any of the following equivalent conditions hold:
\begin{enumerate}
    \item There exists a matrix $A^{-1}$ such that $AA^{-1}=A^{-1}A=I$
    \item The system $A\vect{x}=\vect{b}$ has a unique solution for every $\vect{b}$
    \item The system $A\vect{x}=\vect{0}$ has only the trivial solution $\vect{x}=\vect{0}$
    \item The determinant is nonzero: $\det A\neq 0$
\end{enumerate}
\begin{marginfigure}
{\em Recall:}
\[
    \begin{bmatrix}
        a & b \\ c & d
    \end{bmatrix}^{-1}
    =
\]
\[
    \frac{1}{\det}
    \begin{bmatrix}
        d & -b \\ -c & a
    \end{bmatrix}
\]
%when $ad-bc\neq 0$
\end{marginfigure}
A matrix that is not nonsingular is called \style{singular}.
\end{definition}

When $A$ is nonsingular, its inverse $A^{-1}$ provides an immediate solution $\vect{x}=A^{-1}\vect{b}$ to the system $A\vect{x}=\vect{b}$. Though the determinant offers a theoretical test for nonsingularity, practical computation requires different tools.

\begin{marginfigure}
{\em Foreshadowing:} The geometric interpretation of singular matrices as ``compressing space'' becomes profound when studying eigenvalues (Chapter 7) and singular values (Chapter 10).
\end{marginfigure}
Row reduction provides a systematic approach to finding $A^{-1}$ or proving it does not exist. Form the augmented matrix $[\,A\,|\,I\,]$ and perform row operations. If $A$ is nonsingular, this yields $[\,I\,|\,A^{-1}\,]$ -- the same operations transforming $A$ to $I$ will transform $I$ to $A^{-1}$.

A singular matrix reveals itself during row reduction through a row of zeros. Such matrices irretrievably compress space, mapping distinct vectors to the same image. This compression manifests in the system $A\vect{x}=\vect{b}$ as either inconsistency (no solutions) or indeterminacy (infinitely many solutions).

The equivalence of various characterizations of nonsingularity reveals deep connections between algebraic, geometric, and computational perspectives:
\begin{itemize}
    \item Algebraic: $\det(A)\neq 0$
    \item Analytic: $A\vect{x}=\vect{0}$ has only the trivial solution
    \item Computational: $[\,A\,|\,I\,]$ reduces to $[\,I\,|\,A^{-1}\,]$
\end{itemize}
%
These connections presage the interplay between algebraic, geometric, and computational properties of linear-algebraic entities.

% ==============================================
\section{Composition \& Elimination}
\label{sec:composition}
% ==============================================

Row reduction is more than a sequence of operations: it is a composition of linear transformations. Each elementary row operation can be realized as multiplication on the left by an appropriate \style{elementary matrix} -- obtained by performing that same operation on the identity matrix.

For example, to interchange rows $i$ and $j$ of a matrix $A$, one multiplies on the left by the matrix $E$ obtained by performing R1 on $I$. To multiply row $i$ by a nonzero constant $c$, one uses the elementary matrix $E$ formed by scaling row $i$ of $I$ by $c$: applying R2 to $I$. To add $c$ times row $j$ to row $i$, the elementary matrix $E$ comes from performing this R3 operation on $I$.

The salient feature of these elementary matrices is their \style{invertibility}. Each row operation can be undone:
\begin{marginfigure}
{\em Examples:} row operation matrices and their inverses:

\noindent R1:

\noindent
$\begin{bmatrix}
    0&0&1&0 \\
    0&1&0&0 \\
    1&0&0&0 \\
    0&0&0&1
\end{bmatrix}^{-1} 
=
\begin{bmatrix}
    0&0&1&0 \\
    0&1&0&0 \\
    1&0&0&0 \\
    0&0&0&1
\end{bmatrix}$

\noindent R2:

\noindent
$\begin{bmatrix}
    1&0&0&0 \\
    0&1&0&0 \\
    0&0&5&0 \\
    0&0&0&1
\end{bmatrix}^{-1} 
=
\begin{bmatrix}
    1&0&0&0 \\
    0&1&0&0 \\
    0&0&\frac{1}{5}&0 \\
    0&0&0&1
\end{bmatrix}$

\noindent R3:

\noindent
$\begin{bmatrix}
    1&0&0&0 \\
    0&1&0&0 \\
    0&0&1&0 \\
    2&0&0&1
\end{bmatrix}^{-1} 
=
\begin{bmatrix}
    1&0&0&0 \\
    0&1&0&0 \\
    0&0&1&0 \\
    -2&0&0&1
\end{bmatrix}$
\end{marginfigure}
%
\begin{itemize}
    \item Interchanging rows is its own inverse.
    \item Scaling a row by $c$ and has inverse scaling the same row by $1/c$.
    \item Adding $c$ times row $j$ to row $i$ has as inverse the same operation with $-c$ instead.
\end{itemize}

The process of \style{Gaussian elimination} -- the systematic reduction of a matrix to row echelon form -- is thus expressible as a composition of these three types of elementary matrices:
\begin{equation*}
E_k\cdots E_2E_1A = R
\end{equation*}
where $R$ is the row echelon form and each $E_i$ is elementary. The product $E_k\cdots E_2E_1$ represents the cumulative effect of the row operations. When $A$ is invertible, this sequence continues until $R=I$, yielding
\begin{equation*}
A^{-1} = E_k\cdots E_2E_1
\end{equation*}

This perspective on row reduction -- as a composition of invertible linear transformations -- reveals the algorithmic heart of linear algebra. Though conceived as a computational method, Gaussian elimination exemplifies a deeper principle: the resolution of complex transformations into sequences of simple, invertible ones.

% ==============================================
\section{LU Decomposition}
\label{sec:LU}
% ==============================================

Our exposition of elementary matrices and Gaussian elimination suggests a deeper structure within matrix factorization. The sequence of row operations that produces an upper triangular matrix can be reorganized to reveal a natural factorization of the original matrix.

\begin{definition}[LU Decomposition]
\label{def:lu}
An \style{LU decomposition} of a square matrix $A$ expresses it as a product $A = LU$, where $L$ is lower triangular (with ones on the diagonal) and $U$ is upper triangular. 
\end{definition}

The matrix $U$ is precisely what one obtains from Gaussian elimination without row interchanges; the matrix $L$ captures the multipliers used in the elimination process.

\begin{example}
For a $3\times 3$ matrix, the $LU$ decomposition takes the form:
\[
A = 
\begin{bmatrix}
1 & 0 & 0 \\
\ell_{21} & 1 & 0 \\
\ell_{31} & \ell_{32} & 1
\end{bmatrix}
\begin{bmatrix}
u_{11} & u_{12} & u_{13} \\
0 & u_{22} & u_{23} \\
0 & 0 & u_{33}
\end{bmatrix}
\]
where the $\ell_{ij}$ are the elimination multipliers.
\end{example}

This factorization emerges naturally from the elimination process. When we use a multiplier $m$ to eliminate the $(i,j)$ entry using row $j$, that same multiplier appears in the $(i,j)$ position of $L$. The upper triangular matrix $U$ records the results of these eliminations. Thus, rather than storing a sequence of elementary matrices, we store their cumulative effect in $L$.

\begin{marginfigure}
{\em Caveat:} The existence of an $LU$ decomposition assumes we can perform elimination without row exchanges. When row interchanges are needed, a more general \style{PLU decomposition} incorporates a permutation matrix $P$.
\end{marginfigure}

The utility of LU decomposition lies in its efficiency for solving systems of equations. Once computed, the factors $L$ and $U$ allow us to solve $A\vect{x}=\vect{b}$ by successive substitution:
\begin{enumerate}
    \item First solve $L\vect{y}=\vect{b}$ by forward substitution
    \item Then solve $U\vect{x}=\vect{y}$ by back substitution
\end{enumerate}
\begin{marginfigure}
%
{\em Example:} In electrical circuit analysis, one often solves $A\vect{x}=\vect{b}$ repeatedly with the same network topology ($A$) but different voltage or current sources ($\vect{b}$). LU decomposition is ideal for such scenarios.
\end{marginfigure}

The computational advantage becomes clear when solving multiple systems with the same coefficient matrix but different right-hand sides. The factorization need be computed only once, at a cost of approximately $\frac{2}{3}n^3$ operations for an $n\times n$ matrix. Each subsequent solution requires only $O(n^2)$ operations for the forward and back substitutions  --  a significant savings over repeating the full elimination process.

This efficiency drives the widespread use of $LU$ decomposition in numerical linear algebra. From circuit analysis to structural mechanics to fluid dynamics, systems of linear equations rarely appear in isolation. The ability to reuse a matrix factorization across multiple right-hand sides is invaluable in practice.

\begin{marginfigure}
{\em Foreshadowing:} The $LU$ decomposition is but one of several matrix factorizations we shall encounter. Each reveals different aspects of a matrix's structure and serves different computational needs.
\end{marginfigure}

The $LU$ decomposition exemplifies a recurring theme in computational mathematics: trading increased storage (the explicit factors $L$ and $U$) for decreased computation time. This theme will recur as we explore more sophisticated matrix factorizations, each offering its own balance of storage, computation, and insight.

% ==============================================
\section{Pivots \& Permutations}
\label{sec:pivots}
% ==============================================

The process of Gaussian elimination, as described thus far, assumes we can use any nonzero entry as a pivot. In practice, this is numerically unwise. Consider elimination in the following system, whose coefficients have been taken from a data set:
\[
\begin{bmatrix}
0.003 & 7.149 \\
2.483 & 3.092
\end{bmatrix}
\begin{pmatrix}
x_1 \\ x_2
\end{pmatrix}
=
\begin{pmatrix}
b_1 \\ b_2
\end{pmatrix}
\]
Using 0.003 as a pivot would require dividing by a small number  --  multiplying any roundoff errors in other entries by 1000. Interchanging the rows first yields a more stable elimination.

This suggests a modification to our elimination strategy: before elimination in each column, we first select an appropriate pivot by permuting rows. Such row interchanges are encoded by permutation matrices. Recall from \S\ref{sec:rowreduction} that a  permutation matrix $P$ is obtained by reordering the rows of the identity matrix; multiplication by $P$ effects the corresponding reordering of matrix rows.
%
When we incorporate this pivot selection strategy into our elimination process, we obtain:
\begin{definition}[PLU Decomposition]
\label{def:plu}
A \style{PLU decomposition} of a matrix $A$ expresses it as a product $A=P^{-1}LU$ where:
\begin{enumerate}
    \item $P$ is a permutation matrix
    \item $L$ is lower triangular with ones on the diagonal
    \item $U$ is upper triangular
\end{enumerate}
Such a decomposition exists for any nonsingular matrix $A$ and encodes the steps of Gaussian elimination with partial pivoting.
\end{definition}

In practice, we permute $A$ first, yielding $PA$; then decompose that into $PA=LU$. Since $P$ is invertible, we can write $A = P^{-1}LU$. The system $A\vect{x}=\vect{b}$ thus becomes
\[
P^{-1}LU\vect{x} = \vect{b}
\quad \Longrightarrow \quad
LU\vect{x} = P\vect{b}
\]
which we solve by:
\begin{enumerate}
    \item Computing $P\vect{b}$ (applying the same row interchanges to $\vect{b}$ that were used in elimination)
    \item Solving $L\vect{y}=P\vect{b}$ by forward substitution
    \item Solving $U\vect{x}=\vect{y}$ by back substitution
\end{enumerate}

\begin{marginfigure}
{\em Caveat:} Though we write the decomposition as $PA=LU$, in practice we store $P$ either as a permutation vector or as a sequence of row swaps, not as an explicit matrix.
\end{marginfigure}

\begin{marginfigure}
{\em BONUS!} This strategic permutation is an example of \style{preconditioning}.
\end{marginfigure}

This refinement of $LU$ decomposition  --  incorporating pivoting through permutations  --  exemplifies a broader principle in computational mathematics: theoretical algorithms often require modification to ensure numerical stability in practice. The art lies in preserving the essential structure while adapting to practical constraints.

% ==============================================
\section{Practicalities of Linear Systems}
\label{sec:practical}
% ==============================================

Theory reveals itself in practice as fundamental patterns emerge from computation. Though our development thus far has emphasized the algebraic structure of linear systems  --  their solution spaces, elimination methods, and matrix factorizations  --  engineering demands more. We must determine not just whether solutions exist but whether we can compute them reliably. This bridge between abstract mathematics and practical computation requires understanding both the geometric meaning of our operations and their sensitivity to the numerical realities of finite-precision arithmetic.

Row reduction to a row echelon form (recall Definition \ref{def:rref}) reveals not only solutions but also fundamental structure. The following not-quite-rigorous definition will ascend to central importance Chapter \ref{ch:3}. 

\begin{definition}[Matrix Rank]
\label{def:pseudorank}
    The \style{rank} of a matrix is the number of pivots in a row-reduced echelon form.
\end{definition}

This is a fundamental measure of the matrix's effectiveness at transforming space. For an $m\times n$ matrix $A$, the rank satisfies
\begin{marginfigure}
{\em Example:} A $3\times 3$ matrix of rank 2 maps $\R^3$ onto a plane, collapsing one dimension. The geometric image helps explain why such a matrix cannot be nonsingular.
\end{marginfigure}
\[
\text{rank}(A) \leq \min\{m,n\}
\]
with equality implying $A$ has \style{full rank}. When $A$ is square, full rank is equivalent to nonsingularity.

\begin{example}[Row echelon computation]
Consider the matrix 
\[
A = \begin{bmatrix}
1 & 2 & 0 & 3 & 1 & 2 & 4 \\
2 & 4 & 0 & 6 & 2 & 5 & 1 \\
3 & 6 & 0 & 9 & 3 & 7 & 5 \\
1 & 2 & 0 & 3 & 1 & 1 & 8 \\
4 & 8 & 0 & 12 & 4 & 9 & 2
\end{bmatrix}
\]
{\em Mirabile dictu:} the $(1,1)$ entry is a perfect pivot. Clearing out the first column leads to a dramatic simplification; then clearing out the sixth column and a slight reordering yields the final row-echelon form.
\[
\begin{bmatrix}
1 & 2 & 0 & 3 & 1 & 2 & 4 \\
0 & 0 & 0 & 0 & 0 & 1 & -7 \\
0 & 0 & 0 & 0 & 0 & 1 & -7 \\
0 & 0 & 0 & 0 & 0 & -1 & 4 \\
0 & 0 & 0 & 0 & 0 & 1 & -14
\end{bmatrix}
\quad \Rightarrow \quad 
\begin{bmatrix}
1 & 2 & 0 & 3 & 1 & 2 & 4 \\
0 & 0 & 0 & 0 & 0 & 1 & -7 \\
0 & 0 & 0 & 0 & 0 & 0 & -3 \\
0 & 0 & 0 & 0 & 0 & 0 & 0 \\
0 & 0 & 0 & 0 & 0 & 0 & 0
\end{bmatrix}
\]

Several interesting features emerge:
\begin{enumerate}
\item The first row operation reveals that rows 2-5 were nearly linearly dependent, differing only in their last two entries
\item The matrix has rank 3, as evidenced by three nonzero rows in echelon form
\item The third column is all zeros, making it unnecessary to perform eliminations there
\item The dependencies among the first five columns become clear only after elimination
\end{enumerate}
\end{example}

\begin{example}[Surface Flatness Measurement]
Consider the quality control inspection of a machined metal surface, where a coordinate measuring machine samples five points to verify flatness. The measurements (in micrometers) yield coordinates:
$(1.23, 3.41, 502.1)$, $(4.56, -2.17, 498.4)$, $(-2.89, 1.76, 501.3)$, $(0.12, -4.33, 499.7)$, and $(3.45, 2.91, 500.8)$.
%
To assess flatness deviation, we seek a best-fit plane $z=ax+by+c$ approximating these points. Each measurement $(x_i,y_i,z_i)$ generates one equation in our system:
\[
\begin{bmatrix}
1.23 & 3.41 & 1.0 \\
4.56 & -2.17 & 1.0 \\
-2.89 & 1.76 & 1.0 \\
0.12 & -4.33 & 1.0 \\
3.45 & 2.91 & 1.0
\end{bmatrix}
\begin{pmatrix}
a \\ b \\ c
\end{pmatrix}
=
\begin{pmatrix}
502.1 \\ 498.4 \\ 501.3 \\ 499.7 \\ 500.8
\end{pmatrix}
\]
This $5\times 3$ system has more equations than unknowns -- a common situation in metrology where redundant measurements help reduce the impact of individual measurement errors. The z-coordinates cluster near 500 micrometers (the nominal surface height) with deviations suggesting both systematic tilt and random measurement noise. Though the matrix has full rank 3, small changes in the measurements can produce surprisingly large changes in the computed coefficients $a$, $b$, and $c$. This sensitivity to measurement perturbation, crucial for understanding the reliability of our computed solution, leads us to examine how different matrices can vary in their numerical stability.
\end{example}

\begin{marginfigure}
    \centering
    \includegraphics[width=1.2in]{trichotomy.png}
\end{marginfigure}
A geometric perspective illuminates these algebraic concepts. Each equation in a linear system represents an $(n-1)$-dimensional  hyperplane in $\R^n$. The solution set is the intersection of these hyperplanes. A unique solution corresponds to $n$ hyperplanes mutually meeting at a single point; parallel distinct hyperplanes yield no solution; hyperplanes coinciding or intersecting in a line yield infinitely many solutions.

\begin{marginfigure}
{\em Caveat:} While this geometric view aids intuition in two or three dimensions, beware of relying too heavily on geometric thinking in higher dimensions, where our intuition often fails us.
\end{marginfigure}
The practical import of these concepts lies in their ability to predict the behavior of linear systems before attempting to solve them. The rank determines whether a solution exists; nonsingularity tells us if that solution is unique. This structural understanding guides our choice of solution methods and helps us interpret the results.

\begin{example}
\label{ex:conditionintro}
Not all matrices are created equal in their amenability to computation. Consider solving the system $A\vect{x}=\vect{b}$ where
\[
    A = \begin{bmatrix}
    1 & 0.999 \\
    0 & 0.001
    \end{bmatrix}
\]
Though this matrix is nonsingular, small changes in $\vect{b}$ can produce large changes in the solution $\vect{x}$. Such sensitivity to perturbation  --  whether from measurement error, roundoff in computation, or truncation of decimal places  --  fundamentally limits our ability to solve linear systems reliably.

This sensitivity has geometric meaning: $A$ maps the unit circle to an extremely eccentric ellipse, stretching space a thousand times more in one direction than another. The \style{condition number} of $A$, denoted $\cond(A)$, measures precisely this eccentricity through the ratio of its largest to smallest stretching factors:
%
\begin{marginfigure}
{\em Nota bene:} The formal definition requires concepts from Chapter 10, but the geometric intuition  --  that some matrices distort space more extremely than others  --  serves us well even now.
\end{marginfigure}
%
\[
    \cond(A) = \frac{\text{maximum stretching}}{\text{minimum stretching}}
\]
For the matrix above, $\cond(A)\approx 2000$, indicating that errors in certain directions may be amplified by a factor of 2000 when solving the system.


The practical significance is immediate: when $\cond(A)$ is large, we call $A$ \style{ill-conditioned} and treat computed solutions with appropriate skepticism. When $\cond(A)$ is moderate (say less than 100), we have greater confidence in our numerical results. This single number provides crucial guidance about which linear systems we can solve reliably and which require more careful treatment.
\end{example}

The deeper relationship between conditioning and accuracy will emerge in Chapter \ref{ch:6} when we study least squares problems, and again in Chapter \ref{ch:10} where singular value decomposition reveals its geometric essence. For now, this first glimpse of numerical sensitivity serves as crucial warning: in the workshop of linear algebra, not all tools are equally reliable. Some matrices, like well-balanced instruments, translate our mathematical intentions into reliable results. Others, though theoretically sound, prove treacherously sensitive in practice. 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small
% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Network Flows: From Graphs to Linear Systems}
\label{EM:networkflows}
% **************** EMANATION *******************

The world runs on networks. Supply chains move goods from factories to stores; pipelines transport oil and gas between cities; computer networks route data packets across the internet. Though these systems appear complex, their fundamental behavior reduces to solving systems of linear equations --- the very equations we have studied in this chapter.

A (directed) \style{network} (or \style{graph}) consists of \style{vertices} (or \style{nodes}) connected by oriented \style{edges}. Think of vertices as locations and edges as pathways between them. 
% 

For a more formal approach, one designates a (finite) set $V$ of vertices. Edges consist of ordered pairs of vertices: $E\subset V\times V$, where the ordering implies orientation. One usually demands that the two vertices in a edge are distinct. 
\begin{marginfigure}
    {\em Think:} in a social network, vertices are people, edges are a social relation (``friend'' or ``follow'') between two persons.  
\end{marginfigure}

Consider a regional distribution network with five locations satisfying the following:
\begin{itemize}
    \item Factory (node 1) producing 200 units
    \item Two regional warehouses (nodes 2, 3) that route inventory
    \item Two retail centers (nodes 4, 5) each needing 100 units
\end{itemize}
The shipping routes form a directed graph as shown, with flow variables $x_{ij}$ indicating units shipped from node $i$ to node $j$. The factory supplies warehouses which in turn supply retail centers.
\begin{marginfigure}
        \includegraphics[width=1.0in]{flow-network.png}
\end{marginfigure}

Conservation of flow requires that what comes in equals what goes out (except at sources and sinks):
\begin{itemize}
    \item At factory: Total outgoing equals production
    \item At each warehouse: Incoming equals total outgoing
    \item At each retail center: Incoming equals demand
\end{itemize}

This generates our system of linear equations:
\[
\begin{array}{rcl}
x_{12} + x_{13} &=& 200 \quad\text{(factory output)} \\
x_{12} - x_{24} - x_{25} &=& 0 \quad\text{(warehouse 1 balance)} \\
x_{13} - x_{34} - x_{35} &=& 0 \quad\text{(warehouse 2 balance)} \\
x_{24} + x_{34} &=& 100 \quad\text{(retail 4 demand)} \\
x_{25} + x_{35} &=& 100 \quad\text{(retail 5 demand)}
\end{array}
\]

Writing this as $A\vect{x}=\vect{b}$:
\[
\begin{bmatrix}
1 & 1 & 0 & 0 & 0 & 0 \\
1 & 0 & -1 & -1 & 0 & 0 \\
0 & 1 & 0 & 0 & -1 & -1 \\
0 & 0 & 1 & 0 & 1 & 0 \\
0 & 0 & 0 & 1 & 0 & 1
\end{bmatrix}
\begin{pmatrix}
x_{12} \\ x_{13} \\ x_{24} \\ x_{25} \\ x_{34} \\ x_{35}
\end{pmatrix}
=
\begin{pmatrix}
200 \\ 0 \\ 0 \\ 100 \\ 100
\end{pmatrix}
\]

This system has LU factorization $A=LU$ where:
\[
L = \begin{bmatrix}
1 & 0 & 0 & 0 & 0 \\
1 & 1 & 0 & 0 & 0 \\
0 & 1 & 1 & 0 & 0 \\
0 & 0 & 1 & 1 & 0 \\
0 & 0 & 0 & 1 & 1
\end{bmatrix}
\quad
U = \begin{bmatrix}
1 & 1 & 0 & 0 & 0 & 0 \\
0 & -1 & -1 & -1 & 0 & 0 \\
0 & 0 & 1 & 0 & -1 & -1 \\
0 & 0 & 0 & 1 & 1 & 0 \\
0 & 0 & 0 & 0 & -1 & 1
\end{bmatrix}
\]

The practical value of this decomposition emerges when supply or demand patterns change --- a daily occurrence in real distribution networks. Consider three scenarios:
\begin{enumerate}
    \item Retail center 4 needs 150 units while center 5 needs only 50
    \item The factory increases production to 250 units
    \item A warehouse temporarily closes, requiring rerouting of flows
\end{enumerate}

For the first two cases, only the right-hand side $\vect{b}$ changes. Having computed and stored $L$ and $U$ once, we can solve each new scenario through forward and back substitution:
\[
    L\vect{y} = \vect{b}_\text{new} \quad\text{then}\quad U\vect{x} = \vect{y}
\]
This requires only $O(n^2)$ operations compared to the $O(n^3)$ cost of computing a new LU decomposition. The third case --- structural changes to the network --- requires recomputing the factorization, aligning with intuition: major network reconfigurations demand fresh analysis, while routine variations in flow can be handled more efficiently.

\begin{marginfigure}
{\em Foreshadowing:} In Chapter \ref{ch:6}, we shall see how optimization principles help choose among multiple feasible solutions, selecting flows that minimize cost or maximize efficiency.
\end{marginfigure}

Through networks, the abstract equations of Chapter 1 acquire concrete meaning --- they become tools for understanding and managing the flow of goods, vehicles, or information through interconnected systems. What began as manipulation of numbers and variables emerges as a framework for solving real-world distribution problems.


% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Structural Analysis: Forces in Trusses}
\label{EM:trusses}
% **************** EMANATION *******************

Buildings stand and bridges span through careful balance of forces. The simplest structural elements --- beams joined at nodes to form trusses --- provide both practical utility and mathematical elegance. Though engineers have analyzed such structures for centuries, their fundamental behavior reduces to solving precisely the systems of linear equations developed in this chapter.

A \style{truss} consists of rigid beams connected by perfectly hinged joints, supporting loads through pure tension or compression in its members. Each joint (or node) must remain in equilibrium, with forces balancing in both horizontal and vertical directions. These equilibrium conditions generate our systems of equations, while physical constraints on material strength make stability of solution methods paramount.

Consider this five-member truss supporting both vertical and horizontal loads:
\begin{figure}
    \centering
    \includegraphics[width=0.65\linewidth]{truss.png}
\end{figure}
Each member force $x_i$ represents tension (positive) or compression (negative) along its length. The external loads --- $F$ downward and $G$ rightward as shown --- must be balanced by reactions at the supports $R$ and $S$ respectively.

Force equilibrium at each node yields equations in both horizontal ($x$) and vertical ($y$) directions. For the supports, we include reaction forces $R_{x}$ and $R_{y}$ at the pin (node 1) and $R_{2}$ at the roller (node 2). Assuming the truss is 4-units wide by 2-units high and writing this system as $A\vect{x}=\vect{b}$ with all reactions grouped first:
\[
\begin{bmatrix}
1 & 0 & 0 & 1 & -0.894 & 0 & 0 & 0 \\
0 & 1 & 0 & 0 & 0.447 & 0 & 0 & 0 \\
0 & 0 & 0 & -1 & 0 & -0.894 & -0.894 & 0 \\
0 & 0 & 1 & 0 & 0 & 0.447 & 0.447 & 0 \\
0 & 0 & 0 & 0 & 0.894 & 0.894 & 0 & 1 \\
0 & 0 & 0 & 0 & -0.447 & -0.447 & 0 & 0 \\
0 & 0 & 0 & 0 & 0 & 0 & 0.894 & -1 \\
0 & 0 & 0 & 0 & 0 & 0 & -0.447 & 0
\end{bmatrix}
\begin{pmatrix}
R_x \\ R_y \\ S \\ x_1 \\ x_2 \\ x_3 \\ x_4 \\ x_5
\end{pmatrix}
=
\begin{pmatrix}
0 \\ 0 \\ 0 \\ 0 \\ 0 \\ F \\ G \\ 0
\end{pmatrix}
\]

% Several features of this system demand attention:
% \begin{itemize}
%     \item Sparsity: Each equation involves only members and reactions at its node
%     \item Symmetry: The equilibrium equations exhibit natural physical structure
%     \item Conditioning: Small changes in geometry can significantly affect coefficients
% \end{itemize}

The LU factorization of this system proves particularly valuable. The $L$ factor captures load transmission through the structure, while $U$ reveals the sequence of equilibrium relationships. The sparsity pattern reflects the physical connectivity of the truss, making storage and computation efficient.

This factorization enables rapid reanalysis under changing loads --- a crucial capability as environmental forces vary. Consider three scenarios structural engineers must analyze:
\begin{enumerate}
    \item Wind loads add horizontal forces at both upper nodes
    \item Snow accumulation increases vertical loads asymmetrically
    \item Support settlement modifies geometric coefficients slightly
\end{enumerate}

The first two cases modify only the right-hand side $\vect{b}$, allowing efficient solution through stored factors. The third case --- involving geometric changes --- requires recomputing coefficients and factorization. Yet even here, the sparsity pattern remains unchanged, permitting optimized refactorization.

\begin{marginfigure}
{\em Example:} A mere 1\% change in member angles can produce 10\% changes in internal forces, emphasizing the importance of stable numerical methods developed in this chapter.
\end{marginfigure}

Matrix conditioning proves especially crucial in structural analysis. Nearly parallel members generate nearly dependent equations; members at almost right angles produce coefficients of vastly different scale. The pivoting strategies introduced for PLU factorization directly address these challenges, ensuring reliable analysis even of geometrically complex trusses.

Through structural analysis, the abstract equations of Chapter 1 acquire concrete physical meaning --- they become tools for ensuring buildings stand and bridges span safely. What began as manipulation of numbers emerges as a framework for understanding how forces flow through the built environment, a framework made practical through the careful study of matrix structure and numerical stability.
% **************** EMANATION *******************
% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 1}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}

% Basic computational problems first
\item Solve the following systems of linear equations using Gaussian elimination:
    \[
    \begin{aligned}
        2x + y - z &= 1 \\
        4x - y + 2z &= 2 \\
        -2x + 5y - z &= 3
    \end{aligned}
    \quad : \quad
    \begin{aligned}
        2x + y - z &= 1 \\
        4x - y + 2z &= 2 \\
        -2x + 5y - z &= 3
    \end{aligned}
    \]

\item Verify whether each of the matrices
    \[
    A = \begin{bmatrix} 
        -1 & 4 & 2\\ 
         0 & 3 & 0\\
         0 & 2 & -1
    \end{bmatrix}
    \quad : \quad
    B = \begin{bmatrix} 
        1 & 3 & 0 & 0 \\ 
        -1 & -4 & 0 & 0\\ 
        0 & 0 & 2 & -3\\
        0 & 0 & 1 & -2
    \end{bmatrix}
    \]
    is invertible by computing its determinant. If it is invertible, find its inverse. 

\item Decompose the following matrix into $LU$ form (without pivoting):
    \[
    A = \begin{bmatrix}
        2 & 3 & 1 \\
        4 & 7 & -1 \\
        -2 & -3 & 6
    \end{bmatrix}
    \]
    Verify your result by reconstructing $A$ from $L$ and $U$.

\item Consider solving $A\vect{x}=\vect{b}$ where
    \[
    A = \begin{bmatrix}
        0.001 & 1 \\
        1 & 1
    \end{bmatrix}
    \quad : \quad
    \vect{b} = \begin{pmatrix}
        1 \\ 2
    \end{pmatrix}
    \]
    Solve the system both without pivoting and after applying the row permutation. Compare the numerical stability of both approaches by computing the size of intermediate terms. What general principle about pivoting does this illustrate?

% Physical applications and modeling problems
\item Consider an electrical circuit with three nodes connected by resistors. The conductance matrix is
    \[
    G = \begin{bmatrix}
        3 & -1 & -2 \\
        -1 & 4 & -3 \\
        -2 & -3 & 5
    \end{bmatrix}
    \]
    Find node voltages $\vect{v}$ that produce currents $\vect{i}=(1,0,-1)^T$ by solving $G\vect{v}=\vect{i}$.
    \begin{marginfigure}
        {\em Note:} In electrical networks, $G$ is symmetric and its row sums equal zero due to Kirchhoff's laws.
    \end{marginfigure}

\item An input-output economic model has three sectors with input matrix
    \[
    A = \begin{bmatrix}
        0.3 & 0.2 & 0.1 \\
        0.4 & 0.3 & 0.2 \\
        0.2 & 0.3 & 0.4
    \end{bmatrix}
    \]
    where $a_{ij}$ represents the amount of sector $i$'s output needed to produce one unit of sector $j$'s output. Given demand $\vect{d}=(100,150,200)^T$, find the production levels $\vect{x}$ needed to meet this demand by solving $(I-A)\vect{x}=\vect{d}$.
    \begin{marginfigure}
        {\em Note:} In economic models, $A$ typically has nonnegative entries with column sums less than 1.
    \end{marginfigure}

\item A chemical reactor has three species $A$, $B$, and $C$ that interconvert according to first-order kinetics. The rate matrix is
    \[
    K = \begin{bmatrix}
        -2 & 1 & 1 \\
        1 & -2 & 1 \\
        1 & 1 & -2
    \end{bmatrix}
    \]
    If initial concentrations are $\vect{c}_0=(1,0,0)^T$, find steady-state concentrations by solving $K\vect{c}=\vect{0}$ subject to mass conservation $\sum_i c_i = 1$.
    \begin{marginfigure}
        {\em Note:} Rate matrices have row sums of zero due to mass conservation.
    \end{marginfigure}

% Basic theoretical problems and proofs
\item 
    Let $A$ and $B$ be $n \times n$ matrices and suppose $AB = I$. Prove that $A$ and $B$ are invertible and that $B = A^{-1}$.

\item 
    Prove that any $n$-by-$n$ permutation matrix $P$ is a root of the identity, specifically $P^n=I$. Is this ever the case for smaller powers than $n$?

\item  
    Let $N$ denote a $k$-by-$k$ matrix that is all zeros except for $+1$ on the superdiagonal: that is, $N_{i,j}=1$ for $j=i+1$ and $0$ elsewhere. Demonstrate that $N^p$ is nonzero for $p<k$ and zero for $p\geq k$. 
    \begin{marginfigure}
        Such a matrix is called \style{nilpotent}, as it vanishes (becomes nil) after sufficiently many powers.
    \end{marginfigure}

% Problems involving matrix conditioning and numerical analysis
\item 
    Consider the matrix
    \[
    A = \begin{bmatrix}
    1 & \alpha \\
    0 & 1-\alpha
    \end{bmatrix}
    \]
    where $\alpha$ is a parameter. For what values of $\alpha$ is $A$ well-conditioned (condition number less than 10)? For what values is it ill-conditioned (condition number greater than 100)?

\item  
    Let 
    \[
    A = \begin{bmatrix}
    \cos\theta & -\sin\theta \\
    \sin\theta & \cos\theta
    \end{bmatrix}
    \]
    be a rotation matrix. What is its condition number? Explain geometrically why this result makes sense.

% More advanced theoretical problems
\item 
    In a structural analysis problem, the stiffness matrix relating forces $\vect{f}$ to displacements $\vect{u}$ has block form
    \[
    K = \begin{bmatrix}
        K_{11} & K_{12} \\
        K_{21} & K_{22}
    \end{bmatrix}
    \]
    If $K_{11}$ is invertible, show how to solve $K\vect{u}=\vect{f}$ efficiently using block elimination. What condition ensures this method is numerically stable?

\item The \style{growth factor} in Gaussian elimination measures how much entries can grow during the process. For a matrix $A$, it is defined as $\rho(A) = \max_{i,j,k} |a_{ij}^{(k)}|/\max_{i,j} |a_{ij}|$ where $a_{ij}^{(k)}$ denotes the $(i,j)$ entry after $k$ steps of elimination. For the matrix
    \[
    A = \begin{bmatrix}
        1 & 1 & 1 \\
        1 & 1+\epsilon & 1 \\
        1 & 1 & 1+\epsilon
    \end{bmatrix}
    \]
    show that the growth factor without pivoting is approximately $1/\epsilon$ for small $\epsilon>0$. Find a permutation $P$ such that $PA$ has growth factor approximately 1, and explain why this demonstrates the importance of pivoting for numerical stability.

\item Consider a matrix $A$ with nonzero diagonal entries but much larger super-diagonal entries: $|a_{i,i+1}| \gg |a_{ii}|$ for $i=1,\ldots,n-1$. Explain why computing an LU decomposition without pivoting is likely to be unstable, how a cyclic permutation moving the first row to the bottom might help, and how the growth factor concept explains this phenomenon.

\item Let $A$ be an $n$-by-$n$ matrix with entries of magnitude at most 1. Prove that there exists a permutation matrix $P$ such that all pivots in the LU decomposition of $PA$ have magnitude at least $1/n!$. Is this bound sharp?
    \begin{marginfigure}
        {\em Nota bene:} This shows that some pivoting strategy can always ensure pivots don't become too small, though finding the optimal permutation is generally intractable.
    \end{marginfigure}

% Advanced applications and theoretical synthesis
\item Consider a mass-spring system with two masses $m_1$ and $m_2$ connected by springs with constants $k_1$, $k_2$, and $k_3$:
    \[
    \begin{tikzcd}[column sep=2em]
    \text{wall} \arrow[r, "k_1", no head] & m_1 \arrow[r, "k_2", no head] & m_2 \arrow[r, "k_3", no head] & \text{wall}
    \end{tikzcd}
    \]
    Show that finding the equilibrium positions requires solving a system $A\vect{x}=\vect{b}$ where $A$ is symmetric and tridiagonal. What physical principle explains the symmetry?

\item Recall that in network flow problems, the \style{incidence matrix} $A$ has entries $a_{ij}=1$ if edge $j$ enters node $i$, $a_{ij}=-1$ if edge $j$ leaves node $i$, and $a_{ij}=0$ otherwise. Prove that for any connected graph with $n$ nodes, $\rank(A)=n-1$. How does this relate to conservation of flow?

\item A real matrix $A$ is called \style{totally positive} if all its minors (determinants of square submatrices) are positive. Show that if $A$ is totally positive, then its LU decomposition exists without need for pivoting. What does this imply about numerical stability?
    \begin{marginfigure}
        {\em BONUS!} Totally positive matrices appear naturally in approximation theory and statistics, where their special properties prove invaluable.
    \end{marginfigure}

\item The \style{Cayley transform} of a matrix $A$ is defined as $C=(I+A)(I-A)^{-1}$ when $(I-A)$ is invertible. Show that if $A$ is block diagonal, then $C$ is block diagonal with blocks being the Cayley transforms of the diagonal blocks of $A$. What advantage might this offer for computation?

\item Consider solving a system of equations $A\vect{x}=\vect{b}$ where $A$ is \style{strictly diagonally dominant}: $|a_{ii}| > \sum_{j\neq i} |a_{ij}|$ for all $i$. Prove that $A$ is nonsingular ad that no row exchanges are needed in Gaussian elimination.

\item
    A web page ranking algorithm assigns importance scores $\vect{x}$ to $n$ pages based on the link matrix $L$ where $L_{ij}=1$ if page $j$ links to page $i$ and $0$ otherwise. After normalizing columns of $L$ to sum to 1, scores are updated by solving
    \[
    \vect{x} = \alpha L\vect{x} + (1-\alpha)\vect{1}/n
    \]
    where $\alpha=0.85$ and $\vect{1}$ is the vector of all ones. Show this system always has a unique solution. Why is this important for web search?

\end{enumerate}
\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Abstract Vector Spaces}
\label{ch:2}

%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``in fear \& pale dismay 
He saw the indefinite space beneath''}

\newthought{The leap from concrete to abstract} marks the first great challenge in this text. Having worked extensively with vectors as ordered lists of numbers  --  whether forces, velocities, or data  --  we now step back and ask a deeper question: what \textit{is} a vector? What essential features make something vector-like?

This abstraction is not mere academic exercise. The vectors that arise in modern engineering often transcend simple coordinate lists. A vector might represent a time-varying signal, a high-dimensional dataset, an image, a polynomial, or a probability distribution. The operations we perform on these vectors  --  addition, scaling, dot products  --  echo those familiar from Euclidean geometry, yet are abstracted away from geometry into pure form.

Consider a collection of audio samples forming a sound wave, or pixel intensities comprising a digital image. We can add two such objects (mixing sounds or blending images) and scale them (changing volume or brightness). These operations satisfy the same algebraic rules as vector addition and scalar multiplication in $\R^n$. Yet these objects are far from geometric arrows in space. They are \style{vectors} in a more general sense  --  elements of a \style{vector space}.
\begin{marginfigure}
    \centering
    \includegraphics[width=1.1in]{ouroboros.png}
\end{marginfigure}
\begin{marginfigure}
    {\em Hmmmm...} A vector is an element of a vector space, a space comprised of vectors. {\em There must be more than this...} 
\end{marginfigure}

The power of this abstraction lies in its ability to unify disparate contexts under a common framework. Whether working with solutions to differential equations, quantum states in physics, or feature embeddings in machine learning, the fundamental properties of vector spaces guide our analysis and computation. By understanding these properties in their abstract form, we gain tools applicable across the landscape of modern engineering.

Our task is to build this abstraction carefully, maintaining always a connection to the concrete. We begin with the axiomatic definition of a vector space, using familiar examples to illuminate the essential features. This foundation will support our subsequent study of transformations, inner products, and the deeper structures that enable modern computational methods.

% ==============================================
\section{Vector Space Axioms}
\label{sec:axioms}
% ==============================================

\begin{definition}[Vector Space]
\label{def:vectorspace}
A \style{vector space} consists of two ingredients: a collection $V$ of objects (called \style{vectors}) and a field of \style{scalars} (for our purposes, always the real numbers $\R$). These are bound together by two fundamental operations:

\begin{enumerate}
    \item Vector addition: a rule for combining any two vectors $\vect{u},\vect{v} \in V$ to obtain a new vector $\vect{u}+\vect{v} \in V$
    \item Scalar multiplication: a rule for scaling any vector $\vect{v} \in V$ by a real number $c \in \R$ to obtain a new vector $c\vect{v} \in V$
\end{enumerate}

These operations must satisfy certain rules  --  the \style{vector space axioms}. For all vectors $\vect{u},\vect{v},\vect{w} \in V$ and all scalars $a,b \in \R$:

\begin{marginfigure}
Not all collections of objects with addition and scaling qualify as vector spaces. The axioms ensure that vectors combine and scale in ways that preserve the essential character of ``vector-ness.''
\end{marginfigure}

\smallskip
\textit{Vector Addition Axioms:}
\begin{enumerate}
    \item Commutativity: $\vect{u} + \vect{v} = \vect{v} + \vect{u}$
    \item Associativity: $(\vect{u} + \vect{v}) + \vect{w} = \vect{u} + (\vect{v} + \vect{w})$
    \item Zero vector: There exists a vector $\vect{0} \in V$ such that $\vect{v} + \vect{0} = \vect{v}$ for all $\vect{v} \in V$
    \item Additive inverses: For each $\vect{v} \in V$, there exists a vector $-\vect{v} \in V$ such that $\vect{v} + (-\vect{v}) = \vect{0}$
\end{enumerate}

\smallskip
\textit{Scalar Multiplication Axioms:}
\begin{enumerate}
    \item Distributivity over vector addition: $a(\vect{u} + \vect{v}) = a\vect{u} + a\vect{v}$
    \item Distributivity over scalar addition: $(a+b)\vect{v} = a\vect{v} + b\vect{v}$
    \item Associativity with scalars: $a(b\vect{v}) = (ab)\vect{v}$
    \item Unity: $1\vect{v} = \vect{v}$
\end{enumerate}
\end{definition}

These axioms may seem pedantic  --  they certainly hold for the familiar vectors in $\R^n$. Their importance emerges when considering more exotic spaces whose objects do not {\em look} like vectors, such as: 
\begin{marginfigure}
These examples illustrate how far we can stretch our notion of what a ``vector'' is while maintaining the essential algebraic structure. See the next section for some initialy details.
\end{marginfigure}
\begin{itemize}
    \item continuous $\R$-valued functions under pointwise addition and rescaling
    \item polynomials, under addition and rescaling
    \item Taylor or Fourier series, under termwise addition and rescaling
    \item solutions to linear homogeneous ODEs or recurrence relations
\end{itemize}

The power of these axioms lies not in their individual statements but in their collective implication: anything satisfying these rules inherits the fundamental properties of vectors. This means that techniques developed for one vector space often translate seamlessly to others. A method for solving systems of linear equations in $\R^n$ might, with minimal modification, solve systems of linear differential equations or find optimal coefficients in a signal processing filter.

\begin{marginfigure}
Vector spaces are not just collections of vectors  --  they are collections of vectors that are rightly structured under addition and scaling.
\end{marginfigure}

The axioms also tell us what \textit{is not} a vector space. The positive real numbers under operations of min (addition) and max (multiplication) fail most of the axioms (are any satisfied?). The integers under ordinary addition and multiplication fail because scalar multiplication does not always yield an integer. Such counterexamples help sharpen our understanding of what makes a vector space work.

Finally, the axiomatic approach allows us to prove results that hold for {\em all} vector spaces, saving the trouble of verifying things one example at a time. For example, the following certainly {\em seems} obvious in Euclidean space, but it is less clear that it holds in all possible worlds. 

\begin{lemma}
\label{lem:zero}
    In a vector space $V$, the zero vector is unique.
\end{lemma}
\begin{proof}
    Assume that $z$ and $z'$ are vectors in $V$ which satisfy the zero-property. Then:
\[
    z = z+z' = z' ,
\]
    each equality following from the fact that both $z$ and $z'$ do nothing when added to any vector. Thus, they are the same vector. 
\end{proof}

% ==============================================
\section{A Gallery of Vector Spaces}
\label{sec:gallery}
% ==============================================

An abstract definition takes on life through examples. Each of the following illustrates how the vector space axioms manifest in different contexts, from the familiar to the exotic. Though we shall not verify the axioms explicitly for each (a tedious if straightforward exercise), we shall identify the key components: the vectors themselves, the operations of addition and scaling, and the zero vector.

\begin{example}[Euclidean space]
\label{ex:euclidean}
\begin{marginfigure}
Though we live in a seemingly three-dimensional world, the configuration spaces of mechanical systems routinely have higher dimension. A robotic arm with multiple rotation joints evolves in a state space of dimension greater than three.
\end{marginfigure}
The space $\R^n$ of ordered $n$-tuples of real numbers is our prototype. Here, vectors are ordered lists of real numbers, acted upon by the familiar operations of componentwise addition and scalar multiplication. The zero vector is the tuple of all zeros. This is the space in which classical physics and engineering operate, where $n=2$ or $3$ correspond to physical space.
\end{example}

\begin{example}[Matrices]
\label{ex:matrices}
The collection $\R^{m\times n}$ of all $m$-by-$n$ matrices forms a vector space under entry-by-entry addition and scalar multiplication. The zero matriz $Z$ is the ``zero'' of $\R^{m\times n}$. Matrix spaces are ubiquitous in engineering, from the transformation matrices of computer graphics to the weight matrices of neural networks. The operations here echo those of $\R^n$, though the objects themselves are more structured. 
%
\begin{marginfigure}
The space of $2\times 2$ matrices is four-dimensional, though this is not immediately obvious from its appearance. This theme  --  that dimension can hide in plain sight  --  will recur.
\end{marginfigure}
\end{example}

\begin{example}[Polynomials]
\label{ex:poly}
For each nonnegative integer $n$, we have the space $\poly_n$ of polynomials of degree at most $n$. A typical element has the form $p(x) = a_nx^n + a_{n-1}x^{n-1} + \cdots + a_1x + a_0$. Addition of polynomials and multiplication by scalars operate on the coefficients in the natural way. The zero polynomial, having all coefficients equal to zero, serves as the zero vector. These spaces serve as approximations to more complex functions and appear throughout signal processing and control theory.
\end{example}

\begin{example}[Function spaces]
\label{ex:functionspaces}
\begin{marginfigure}
{\em Foreshadowing:} this is our first example of an infinite-dimensional vector space. The jump from finite to infinite dimensions is profound and harbors surprises that will shape our understanding of convergence and approximation. 
\end{marginfigure}
Consider the space $C([a,b])$ of continuous real-valued functions on an interval $[a,b]$, with addition and scalar multiplication defined pointwise. The zero function $z(x)=0$ serves as zero vector, since $f+z=f$ for all $f$. This space contains all the polynomials $\poly_n$ (with restricted domain) and serves as a model for signal spaces in engineering. One uses $C(D)$ to denote the vector space of scalar fields $f:D\to\R$ on a domain $D$. For functions that have some differentiability (both helpful and familiar from calculus), the following notations for scalar fields are standard:
\begin{itemize}
    \item $C(D)$ : continuous
    \item $C^1(D)$ : continuously differentiable
    \item $C^\infty(D)$ : infinitely differentiable or \style{smooth}
    \item $C^\omega(D)$ : real-analytic 
    \begin{marginfigure}
        {\em Recall:} \style{Real-analytic} means that all Taylor series converge and are equal to the function at all points.
    \end{marginfigure}
\end{itemize}
\end{example}

\begin{example}[Linear ODEs]
\label{ex:ODE}
The solutions to a linear homogeneous differential equation form a vector space. The vectors here are functions $x(t)$ satisfying the equation $p(D)x=0$ for $p(D)$ a polynomial of the differential operator $D$. The operations of addition and scalar multiplication act pointwise on these solutions $x(t)$, and the constant function $x=0$ is the zero vector. 
\begin{marginfigure}
    {\em Nota bene:} Similar spaces arise from linear recurrence relations and difference equations.
\end{marginfigure}

For example, a second-order linear ODE of the form
\[
    a\frac{d^2x}{dt^2}+b\frac{dx}{dt}+cx = 0
\]
has solutions $x(t)$ which are closed under linear combination and thus form a vector space. This ODE can be written using the differential operator $D=d/dt$ as $(aD^2+bD+cI)x=0$.
\end{example}

\begin{example}[Digital signals]
\label{ex:digital}
Digital signals  --  whether audio, image, or general data streams  --  form vector spaces of their own. An audio signal might be represented as a sequence of samples or as a function of continuous time. Addition corresponds to mixing of signals; scalar multiplication adjusts amplitude. Silence plays the role of zero vector. For images, the vectors are two-dimensional arrays of pixel intensities. These spaces support the linear operations fundamental to signal processing and machine learning.
\end{example}

\begin{example}[Sequences \& Series]
\label{ex:series}

Consider the set of formal power series in a variable $x$, as familiar from single-variable calculus. Ignoring convergence, we may regard such power series as vectors. Given two such series, we can add them termwise (by powers); rescaling happens at the level of coefficients. The zero series ($c_k=0$ for all $k$) is the zero vector. 
\begin{marginfigure}
    Recall that power series are of the form
    \[
        f = \sum_{k=0}^\infty c_k x^k.
    \]
    Would the {\em convergent} power series form a vector space? Does absolute versus conditional convergence matter?
\end{marginfigure}
There is likewise a vector space structure on the set of sequences. Consider $a=(a_k)$ for $k\in\N$. One can add such sequences termwise, and rescaling the sequence means rescaling each term. The zero-sequence plays the role of the zero-vector. Interestingly, this vector space ``feels'' like the same vector space as that of power series, though they look different. 
\begin{marginfigure}
    {\em Foreshadowing:} the notion of sameness or equivalence that is natural in vector spaces (and the rest of mathematics) is called \style{isomorphism}. The vector spaces of sequences and series are \style{isomorphic}. 
\end{marginfigure}
\end{example}

These examples, though distinct in character, share the essential features codified in the vector space axioms. Their variety suggests the power of the abstraction: techniques developed for one vector space often transfer seamlessly to others. As we proceed to study subspaces, linear independence, and bases, these examples will serve as touchstones, grounding abstract concepts in concrete settings.

% ==============================================
\section{Subspaces}
\label{sec:subspaces}
% ==============================================

\begin{definition}[Subspace]
\label{def:subspace}
A \style{subspace} of a vector space $V$ is a subset $W\subseteq V$ that is itself a vector space under the operations inherited from $V$. We use the notation $W<V$ for a subspace.
\end{definition}
\begin{marginfigure}
The key insight is that subspaces must be closed under the operations that make vector spaces work. You cannot escape a subspace through addition or scaling.
\end{marginfigure}

This seemingly abstract notion has immediate practical import. The quadratic polynomial space $\poly_2$ contains within it the simpler space $\poly_1$ of affine functions  --  a natural subspace. The matrix space $\R^{n\times n}$ contains a subspace of diagonal matrices. In digital signal processing, the collection of even-symmetric signals (those satisfying $f(t)=f(-t)$) forms a subspace of all signals. These are not merely random subsets, but structured parts that preserve the essential vector operations of their parent spaces.

The test for whether a subset $W\subseteq V$ is a subspace reduces to checking three simple properties:
\begin{enumerate}
    \item The zero vector is in $W$
    \item $W$ is closed under addition: if $\vect{u},\vect{v}\in W$ then $\vect{u}+\vect{v}\in W$
    \item $W$ is closed under scalar multiplication: if $\vect{v}\in W$ and $c\in\R$ then $c\vect{v}\in W$
\end{enumerate}

\begin{marginfigure}
{\em Foreshadowing:} These three properties are not independent. The first is actually redundant given the third, as $0\vect{v}=\vect{0}$ for any vector $\vect{v}$. This hint of redundancy in our description of subspaces previews deeper structural results to come.
\end{marginfigure}

\begin{example}[Coordinate subspaces]
In $\R^n$, the coordinate planes (and their higher-dimensional analogues) provide natural examples of subspaces. For instance, in $\R^3$, the $xy$-plane is the subspace $\{(x,y,0): x,y\in\R\}$. More generally, any plane or line through the origin forms a subspace. The requirement that subspaces contain $\vect{0}$ forces them to pass through the origin  --  a shifted plane, no matter how close to the origin, is not a subspace.
\end{example}

\begin{example}[Null space]
The solutions to a linear homogeneous system $A\vect{x}=\vect{0}$ form a subspace called the \style{null space} of $A$. This is not merely a convenient fact but a consequence of linearity: if $\vect{x}_1$ and $\vect{x}_2$ satisfy the equation, then
\begin{marginfigure}
    {\em Foreshadow:} we will use the more general term of \style{kernel} in place of null space when we introduce linear transformations in Chapter \ref{ch:3}.
\end{marginfigure}
\[
A(c_1\vect{x}_1 + c_2\vect{x}_2) = c_1A\vect{x}_1 + c_2A\vect{x}_2 = \vect{0}
\]
for any scalars $c_1,c_2$. This subspace captures the essential structure of the system's solutions. Note that for $\vect{b}\neq\vect{0}$, the solutions to $A\vect{x}=\vect{b}$ do {\em not} form a subspace.
\end{example}

\begin{example}[Column space]
Given a matrix $A$, the set of all possible linear combinations of its columns forms a subspace $\column(A)$ of $\R^m$ (where $m$ is the number of rows). This \style{column space} represents all possible outputs of the linear transformation $A\vect{x}=\vect{b}$. There is likewise a \style{row space}, $\row(A)$  --  combinations of the rows  --  that forms a subspace of $\R^n$ (where $n$ is the number of columns).
%
\begin{marginfigure}
{\em Foreshadowing:} taking all possible combinations of a set of vectors will be known to us soon as a \style{span}.
\end{marginfigure}
\end{example}

\begin{example}[Matrix Subspaces]
Consider the space $\R^{n\times n}$ of $n\times n$ matrices under matrix addition. The following are subspaces:
\begin{itemize}
    \item Upper triangular matrices
    \item Diagonal matrices
    \item Symmetric matrices
    \item Matrices with trace zero
\end{itemize}
Verifying subspace properties for each case provides excellent practice with the axioms.
\end{example}

The operation of intersection preserves the subspace property: if $W_1$ and $W_2$ are subspaces of $V$, then $W_1\cap W_2$ is also a subspace. This allows us to build new subspaces by finding the common elements of known ones. The same is true for arbitrary intersections of subspaces  --  a fact that becomes important when studying systems of linear constraints.

\begin{marginfigure}
{\em Caveat:} The union of subspaces is rarely a subspace. Consider two lines through the origin in $\R^2$  --  their union fails to be closed under addition.
\end{marginfigure}

The sum of two subspaces $W_1$ and $W_2$, defined as
\[
W_1 + W_2 = \{\vect{w}_1 + \vect{w}_2: \vect{w}_1\in W_1,\ \vect{w}_2\in W_2\}
\]
is always a subspace. When the subspaces have only the zero vector in common, that is, when $W_1\cap W_2=\{\vect{0}\}$, we call this a \style{direct sum}, denoted $W_1\directsum W_2$. The direct sum has the key property that every vector in $W_1\directsum W_2$ has a unique representation as a sum $\vect{w}_1+\vect{w}_2$ with $\vect{w}_1\in W_1$ and $\vect{w}_2\in W_2$. For the ordinary sum $W_1+W_2$, such representations need not be unique when the subspaces overlap.

\begin{marginfigure}
Think of a direct sum as combining subspaces that point in ``independent directions''  --  like combining the completely distinct real and imaginary axes to build the complex plane $\C$.
\end{marginfigure}

The concept of a subspace threads through all of linear algebra, from the practical problem of solving linear systems to the theoretical machinery of eigenspaces and singular value decompositions to come. Understanding how vector spaces decompose into simpler subspaces is key to both computational efficiency and theoretical insight.

% ==============================================
\section{Span \& Linear Independence}
\label{sec:span}
% ==============================================

The simplest subspaces arise from the most elementary vector operation: scaling. A single nonzero vector $\vect{v}$ in a vector space $V$ generates a line through the origin  --  the collection of all scalar multiples $\{c\vect{v}: c\in\R\}$. This is a one-dimensional subspace of $V$. When we allow addition as well as scaling, a finite collection of vectors generates a larger subspace. 

\begin{definition}[Span]
\label{def:span}
The \style{span} of vectors $\vect{v}_1,\ldots,\vect{v}_k$ in a vector space $V$ is the collection of all their linear combinations:
\[
    \spanset(\vect{v}_1,\ldots,\vect{v}_k) 
    = 
    \left\{
        c_1\vect{v}_1 + \cdots + c_k\vect{v}_k : c_i\in\R
    \right\}
\]
A set of vectors \style{spans} $V$ if every vector in $V$ can be written as a linear combination of vectors in the set.
\end{definition}
%
\begin{marginfigure}
The span of a set of vectors is the smallest subspace containing them. It contains all vectors that can be built from the given ones using the operations permitted in a vector space.
\end{marginfigure}

\begin{example}[Spanning in $\R^2$]
In the plane, two nonzero vectors $\vect{v}_1,\vect{v}_2$ that point in different directions span all of $\R^2$. Any point in the plane can be reached through an appropriate linear combination. If the vectors point in the same (or opposite) directions, their span is merely a line through the origin.
\end{example}

\begin{example}[Spanning polynomials]
The polynomials $1, x,$ and $x^2$ span the space $\poly_2$  --  any quadratic polynomial $ax^2 + bx + c$ is a linear combination of these basic building blocks. The same polynomials do not span $\poly_3$, as no linear combination can produce a cubic term.
\end{example}

This leads to a fundamental question: when are vectors truly independent of one another? 
%
\begin{definition}[Linear Independence]
\label{def:linind}
A set of vectors $\{\vect{v}_1,\ldots,\vect{v}_k\}$ in a vector space $V$ is \style{linearly independent} if the equation
\[
    c_1\vect{v}_1 + c_2\vect{v}_2 + \cdots + c_k\vect{v}_k = \vect{0}
\]
has only the trivial solution $c_1=c_2=\cdots=c_k=0$. Otherwise, the vectors are \style{linearly dependent}.
\end{definition}
%
\begin{marginfigure}
Linear dependence means redundancy  --  one or more vectors could be removed without reducing the span. Independence means each vector contributes something genuinely new.
\end{marginfigure}

If no such relation exists  --  if the only way to obtain $\vect{0}$ through a linear combination is to take all coefficients equal to zero  --  then the vectors are \style{linearly independent}.

\begin{example}[Dependence in $\R^n$]
Three vectors in $\R^2$ are always linearly dependent. This is intuitively clear: the plane can be spanned by two vectors, so a third must be redundant. More generally, if we have more vectors than dimensions, they must be linearly dependent.
\end{example}

\begin{example}[Polynomial independence]
The polynomials $1, x,$ and $x^2$ are linearly independent in $\poly_2$. If $a + bx + cx^2 = 0$ for all $x$, then each coefficient $a,b,c$ must be zero. However, adding the polynomial $x^2+1$ to this collection creates linear dependence, as it can be written as a combination of $1$ and $x^2$.
\end{example}

\begin{marginfigure}
{\em Foreshadowing:} The interplay between spanning and independence leads to the notion of a \style{basis}  --  a linearly independent set of vectors that spans the space. This fundamental concept will organize our understanding of vector spaces.
\end{marginfigure}

The concepts of span and linear independence are complementary. The span tells us what vectors we can build; linear independence tells us when we are building efficiently, without redundancy. Together, they provide the foundation for understanding the structure of vector spaces and their subspaces.

Testing for linear independence is straightforward in principle: one must determine whether a homogeneous system of equations has only the trivial solution. In practice, this means investigating whether certain collections of scalars must all be zero. The following examples illustrate this process.

\begin{example}[Testing independence]
Consider the Euclidean vectors $(1,2)^T$ and $(2,4)^T$ in $\R^2$. To test for linear independence, we examine
\[
c_1\begin{pmatrix}1\\2\end{pmatrix} 
+ 
c_2\begin{pmatrix}2\\4\end{pmatrix} 
= 
\vect{0} 
\quad
\Rightarrow
\quad
\begin{array}{rcl}
c_1 + 2c_2 &=& 0 \\
2c_1 + 4c_2 &=& 0
\end{array}
\]
The second equation is twice the first, yielding $c_1=-2c_2$ for any $c_2$. Thus, these vectors are linearly dependent  --  $\vect{v}_2$ is twice $\vect{v}_1$.
\end{example}

\begin{marginfigure}
When testing independence, follow the zero vector. The key question is always: what coefficients yield the zero vector, and are they necessarily all zero?
\end{marginfigure}

The notions of span and linear independence provide the language needed to describe the essential structure of vector spaces. They tell us both what we can build (through spanning) and what we need to build it (through independence). This dual perspective  --  what we can make versus what we need to make it  --  will guide our development of the theory.

% ==============================================
\section{Towards Dimension}
\label{sec:dimension}
% ==============================================

The concept of dimension pervades our physical and mathematical worlds. We speak of three-dimensional space, two-dimensional surfaces, one-dimensional lines. Engineers routinely work in higher dimensions: a robotic arm with six joints traces paths in a six-dimensional configuration space; a neural network with billions of weights operates in a space beyond plain imagination. 
\begin{marginfigure}
{\em Foreshadowing:} In Chapter \ref{ch:11}, principal component analysis discovers low-dimensional structure in data; in Chapter \ref{ch:12}, we approximate high-dimensional matrices through lower-dimensional factors; and in Chapter \ref{ch:13}, neural networks compress massively large dimensions into essential features. The art lies not in counting dimension but in understanding when and how it can be reduced without losing crucial information.
\end{marginfigure}
%

Our task is to extract from these examples a definition of dimension that captures the essential feature: how many independent parameters are needed to specify a vector uniquely? In $\R^n$, this is clear  --  we need exactly $n$ coordinates. For other vector spaces, we must look to spanning sets and linear independence for guidance.

A spanning set for a vector space may be inefficient, containing redundant vectors. A natural measure of dimension would count the {\em minimal} number of vectors needed to span the space. Fortunately, this minimal number is well-defined.

\begin{lemma}[Minimal Spanning Sets]
\label{lem:dim}
Any two minimal spanning sets of a vector space $V$ have the same size.
\end{lemma}

\begin{proof}
Let $S=\{\vect{v}_1,\ldots,\vect{v}_m\}$ and $T=\{\vect{w}_1,\ldots,\vect{w}_n\}$ be minimal spanning sets for $V$. Since $S$ spans $V$, each $\vect{w}_j$ can be written as a linear combination of vectors in $S$:
\[
    \vect{w}_j = \sum_{i=1}^m c_{ij}\vect{v}_i
\]
for some scalars $c_{ij}$. 

We claim that $m\geq n$. If not, then $m<n$, and we can write $n$ vectors ($\vect{w}_1,\ldots,\vect{w}_n$) as linear combinations of $m$ vectors ($\vect{v}_1,\ldots,\vect{v}_m$). This would imply that $\{\vect{w}_1,\ldots,\vect{w}_n\}$ is linearly dependent.

To see this, consider the homogeneous system
\[
    \sum_{j=1}^n x_j\vect{w}_j = \vect{0}
\]
Substituting the expressions for $\vect{w}_j$:
\[
    \sum_{j=1}^n x_j\left(\sum_{i=1}^m c_{ij}\vect{v}_i\right) = \sum_{i=1}^m\left(\sum_{j=1}^n x_jc_{ij}\right)\vect{v}_i = \vect{0}
\]
This is a homogeneous system of $m$ equations in $n$ unknowns. When $m<n$, such a system must have a nontrivial solution, implying that $\{\vect{w}_1,\ldots,\vect{w}_n\}$ is linearly dependent. This contradicts the minimality of $T$ as a spanning set.

A symmetric argument, expressing each $\vect{v}_i$ in terms of the $\vect{w}_j$, shows that $n\geq m$.
%
Therefore $m=n$.
\end{proof}

This remarkable result  --  that all minimal spanning sets of a vector space have the same size  --  allows us to define dimension without ambiguity.

\begin{definition}[Dimension]
\label{def:dim}
The \style{dimension} of a vector space $V$, denoted $\dim V$, is the size of any minimal spanning set for $V$. If no finite spanning set exists, we say $V$ is \style{infinite-dimensional}.
\end{definition}

The power of this definition lies in its abstraction from specific coordinate systems or geometric intuitions. It applies equally well to spaces of polynomials, matrices, signals, or solutions to differential equations. Let us examine some illuminating examples.

\begin{example}[Polynomial dimension]
Consider the space $\poly_n$ of polynomials of degree at most $n$. Any such polynomial has the form
\[
    p(x) = a_nx^n + a_{n-1}x^{n-1} + \cdots + a_1x + a_0
\]
A minimal spanning set consists of the monomials $\{1,x,x^2,\ldots,x^n\}$. Thus $\dim\poly_n = n+1$, not $n$  --  we must count the constant term! This subtle distinction reminds us that dimension counts parameters, not highest degree.
\end{example}

\begin{example}[Matrix spaces]
The space $\R^{m\times n}$ of real $m$-by-$n$ matrices has dimension $mn$. Though we arrange these numbers in a rectangular array, the dimension counts total entries. Thus $\R^{2\times 2}$ has dimension 4, explaining why the general $2\times 2$ matrix requires four parameters to specify completely.
\end{example}

% \begin{example}[Differential equations]
% The space of solutions to a linear homogeneous differential equation 
% \[
%     a_n\frac{d^nx}{dt^n} + a_{n-1}\frac{d^{n-1}x}{dt^{n-1}} + \cdots + a_1\frac{dx}{dt} + a_0x = 0
% \]
% has dimension $n$. Each solution is determined by $n$ initial conditions  --  position, velocity, acceleration, and so forth up to the $(n-1)$st derivative.
% \end{example}

\begin{example}[Function spaces]
The space $C([a,b])$ of continuous functions on a closed interval has no finite spanning set. Consider the collection of functions $\{1,x,x^2,\ldots\}$  --  no finite subset spans the space, as demonstrated by the transcendental function $e^x$. Such spaces, lacking finite spanning sets, are called infinite-dimensional.
\end{example}

The distinction between finite and infinite dimension is profound. Finite-dimensional spaces admit complete description through a finite set of parameters; infinite-dimensional spaces resist such reduction. This dichotomy shapes how we approach problems: finite-dimensional spaces yield to computational methods, while infinite-dimensional spaces often require approximation by finite-dimensional subspaces.

Our development of dimension completes the foundation of vector space theory. We have progressed from the concrete operations of addition and scaling to the abstract notion of a vector space, then to subspaces and spanning sets, and finally to dimension  --  an intrinsic measure of a space's complexity. This abstraction from the familiar territory of $\R^n$ prepares us for the study of linear transformations, where dimension will play a crucial role in understanding how spaces map to one another.

In the chapters ahead, we shall return to explicit coordinates and bases, enriching our perspective with computational tools. Yet the coordinate-free viewpoint developed here  --  especially the fundamental nature of dimension  --  will remain essential to our understanding. The interplay between abstract structure and concrete computation lies at the heart of linear algebra's power in modern engineering.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Engineering Signals as Vector Spaces}
\label{EM:signals}
% **************** EMANATION *******************

Engineering rests upon the measurement, analysis, and control of signals --- time-varying quantities that encode information about physical systems. The collection of all possible signals on a time interval forms a natural vector space, though one far removed from the familiar coordinate geometry of $\R^n$. Understanding signals through vector spaces illuminates both their mathematical structure and practical manipulation.

Consider the collection $S[0,T]$ of all continuous signals $f:[0,T]\to\R$ defined on a fixed time interval $[0,T]$. Two signals add through pointwise combination, while scalar multiplication scales a signal's amplitude:
\[
    (f+g)(t) = f(t) + g(t) \quad : \quad (cf)(t) = c\cdot f(t)
\]
These operations satisfy our vector space axioms not through coordinate manipulation but through the fundamental nature of signal combination.

\begin{marginfigure}
{\em Think:} The zero vector here is the signal that is identically zero at all times --- the absence of any signal. Its role as additive identity mirrors its physical meaning as silence or darkness.
\end{marginfigure}

This infinite-dimensional space contains a natural sequence of finite-dimensional subspaces. For each positive integer $n$, consider $V_n$, the space spanned by $1$ and the first $n$ pairs of periodic signals:
\[
    \left\{1,\, \cos\left(\frac{2\pi t}{T}\right), \sin\left(\frac{2\pi t}{T}\right),\, \cos\left(\frac{4\pi t}{T}\right), \sin\left(\frac{4\pi t}{T}\right),\, \ldots\right\}
\]
These subspaces form a \style{filtration} --- a sequence $V_0 < V_1 < V_2 < \cdots < S[0,T]$, each containing more complex periodic patterns than the last. A signal in $V_n$ combines at most $n$ different periodic components.

\begin{marginfigure}
{\em Historical Note:} Fourier's radical insight that periodic functions form a vector space with trigonometric functions as basis transformed both mathematics and engineering. The abstract structure of vector spaces illuminated concrete problems in heat flow and vibration.
\end{marginfigure}

Linear independence takes on special meaning for signals. Consider the signals $\sin(2\pi t/T)$ and $\cos(2\pi t/T)$ oscillating once over $[0,T]$. No linear combination
\[
    c_1\sin(2\pi t/T) + c_2\cos(2\pi t/T) = 0
\]
exists except the trivial one $c_1=c_2=0$, demonstrating their independence. Similar independence holds between signals oscillating at different rates, allowing our sequence of subspaces to grow without redundancy.

\begin{marginfigure}
{\em Nota bene:} Though $S[0,T]$ itself is infinite-dimensional, any practical signal can be approximated arbitrarily well by elements from some finite-dimensional $V_n$. This principle underlies much of signal processing.
\end{marginfigure}

Different engineering contexts reveal other natural subspaces. Signals that vanish at $t=0$ form a subspace modeling systems starting from rest. Signals symmetric about $T/2$ form another subspace reflecting temporal symmetry. Each such subspace captures both mathematical structure and physical meaning.

Signal processing itself becomes the study of transformations between signal spaces. Filters map input signals to output signals while preserving vector space structure. That linear combinations of inputs map to the same linear combinations of outputs reflects both mathematical elegance and engineering necessity. The abstract properties of vector spaces guide the design and analysis of practical signal processing systems.

This perspective --- of signals as vectors in an abstract space rather than mere functions of time --- reveals structure that coordinates obscure. Though we may compute with sampled values or trigonometric coefficients, the essential properties of signals transcend any particular representation. The vector space framework provides not merely formal mathematics but genuine insight into the nature of signals and their manipulation.


% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Linear Differential Equations}
\label{EM:linearODE}
% **************** EMANATION *******************

The marriage of calculus and linear algebra reveals itself beautifully in the theory of linear differential equations. Consider an equation governing some physical quantity $x(t)$, where multiple derivatives appear linearly:
\[
    \frac{d^nx}{dt^n} + a_{n-1}\frac{d^{n-1}x}{dt^{n-1}} + \cdots + a_1\frac{dx}{dt} + a_0x = 0
\]
Though seemingly far from the vector spaces studied in this chapter, a remarkable structure emerges when we view this through the right lens.

Let us adopt the concise notation $D = d/dt$ for the differentiation operator. Our equation becomes
\[
    (D^n + a_{n-1}D^{n-1} + \cdots + a_1D + a_0)x = 0
\]
or more simply $p(D)x = 0$, where $p \in \poly_n$ is a polynomial of degree $n$. This operator notation transforms differential equations into algebraic objects --- a first hint of deeper patterns.

A remarkable fact, whose proof must wait for later chapters, is that the solutions to this equation form a vector space of dimension exactly $n$. That is, there exist $n$ special solutions that form a basis, from which all other solutions arise through linear combination. This abstract fact has profound practical implications for solving such equations.

The structure becomes clearest when $p$ factors completely:
\[
    p(D) = (D-\lambda_1)(D-\lambda_2)\cdots(D-\lambda_n)
\]
where $\lambda_1,\ldots,\lambda_n$ are distinct numbers whose meaning will become clear in Chapter 7. For now, observe that the simplest case $n=1$ yields the equation:
\[
    (D-\lambda)x = 0 \quad\Rightarrow\quad \frac{dx}{dt} = \lambda x
\]
whose solution $x(t) = ce^{\lambda t}$ you certainly recall from calculus. 
\begin{marginfigure}
{\em Historical Note:} The connection between polynomial roots and exponential solutions was first observed by Euler, though the full vector space structure emerged only later.
\end{marginfigure}

When all $\lambda_i$ are distinct, the functions $\{e^{\lambda_1 t}, e^{\lambda_2 t}, \ldots, e^{\lambda_n t}\}$ provide a basis for the solution space. Any solution takes the form:
\[
    x(t) = c_1e^{\lambda_1 t} + c_2e^{\lambda_2 t} + \cdots + c_ne^{\lambda_n t}
\]
where the coefficients $c_i$ are determined by initial conditions. The verification that these exponentials are linearly independent (when the $\lambda_i$ are distinct) provides an excellent exercise in the concepts of this chapter.

\begin{example}[Mass-Spring System]
The classic second-order equation for an undamped mass-spring system,
\[
    m\frac{d^2x}{dt^2} + kx = 0
\]
takes the form $p(D)x = 0$ with $p(D) = mD^2 + k$. Writing $\omega = \sqrt{k/m}$, this factors as:
\[
    p(D) = m(D+i\omega)(D-i\omega)
\]
leading to basis solutions $e^{i\omega t}$ and $e^{-i\omega t}$. Their linear combinations yield the familiar sinusoidal motion $x(t) = A\cos(\omega t) + B\sin(\omega t)$ through Euler's formula.
\end{example}

\begin{marginfigure}
{\em Foreshadowing:} Chapter 7 will reveal the deeper meaning of the numbers $\lambda_i$ and provide systematic methods for finding basis solutions even when $p(D)$ does not factor so nicely.
\end{marginfigure}

This example illustrates a profound principle: abstract mathematical structures often reveal themselves in seemingly unrelated contexts. The vector spaces introduced in this chapter are not mere formal constructions but natural languages for describing physical systems. The interplay between differential equations and linear algebra, merely hinted at here, will deepen dramatically in Chapters 7 and 8.
% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 2}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}
%
\item Consider the following vectors in $\R^3$:
\[
    \vect{v}_1 = \begin{pmatrix}1\\2\\1\end{pmatrix}, \quad
    \vect{v}_2 = \begin{pmatrix}2\\4\\-1\end{pmatrix}, \quad
    \vect{v}_3 = \begin{pmatrix}3\\6\\0\end{pmatrix}
\]
(a) Show they are linearly dependent by finding specific scalars $c_1,c_2,c_3$, not all zero, such that $c_1\vect{v}_1 + c_2\vect{v}_2 + c_3\vect{v}_3 = \vect{0}$
(b) Find the dimension of $\text{span}\{\vect{v}_1,\vect{v}_2,\vect{v}_3\}$
(c) Find a linearly independent subset that spans the same space

\item Consider the space $\R^{2\times 2}$ of $2\times 2$ matrices. Show that the following set of matrices is linearly dependent:
\[
    \begin{bmatrix}1&0\\0&1\end{bmatrix}, 
    \begin{bmatrix}0&1\\1&0\end{bmatrix},
    \begin{bmatrix}1&1\\1&1\end{bmatrix},
    \begin{bmatrix}1&-1\\-1&1\end{bmatrix}
\]
Find a linear relation between them.

\item Consider the following matrices in $\R^{2\times 2}$:
\[
    A_1 = \begin{bmatrix}1&1\\0&1\end{bmatrix}, \quad
    A_2 = \begin{bmatrix}2&1\\1&0\end{bmatrix}, \quad
    A_3 = \begin{bmatrix}4&3\\1&2\end{bmatrix}
\]
(a) Show that $A_3$ lies in $\text{span}\{A_1,A_2\}$ by finding specific scalars $c_1,c_2$ such that $A_3=c_1A_1+c_2A_2$
(b) Find the matrix $\begin{bmatrix}a&b\\c&d\end{bmatrix}$ that lies in $\text{span}\{A_1,A_2\}$ and satisfies $a+d=1$ and $b+c=3$

\item Consider the following vectors in $\R^4$:
\[
    \vect{v}_1 = \begin{pmatrix}1\\0\\1\\1\end{pmatrix}, \quad
    \vect{v}_2 = \begin{pmatrix}2\\1\\0\\-1\end{pmatrix}, \quad
    \vect{v}_3 = \begin{pmatrix}1\\2\\-1\\0\end{pmatrix}, \quad
    \vect{v}_4 = \begin{pmatrix}8\\7\\-1\\-3\end{pmatrix}
\]
(a) Express $\vect{v}_4$ as a linear combination of $\vect{v}_1$, $\vect{v}_2$, and $\vect{v}_3$
(b) Show that $\{\vect{v}_1,\vect{v}_2,\vect{v}_3\}$ is linearly independent
(c) Is $\{\vect{v}_1,\vect{v}_2,\vect{v}_3\}$ a spanning set for $\R^4$? If not, find a vector in $\R^4$ that is not in their span

\item In the space $\poly_2$ of polynomials of degree at most 2:
(a) Express $p(x)=2x^2-x+3$ as a linear combination of $q_1(x)=1+x^2$, $q_2(x)=x-x^2$, and $q_3(x)=1-x$
(b) Determine whether $\{q_1(x),q_2(x),q_3(x)\}$ spans $\poly_2$
(c) If they do not span $\poly_2$, find a polynomial in $\poly_2$ that is not in their span

\item Consider the vector space $\poly_2$ of polynomials of degree at most 2. Show that the polynomials $1$, $1+x$, and $1+x+x^2$ span $\poly_2$. Are they linearly independent?

\item Let $V$ be the vector space of $3\times 3$ matrices $A$ satisfying $A^T=A$ (symmetric matrices).
(a) Write down a spanning set for $V$
(b) Prove your spanning set is linearly independent
(c) Determine $\dim V$
(d) Find the coordinates of $\begin{bmatrix}2&1&0\\1&-1&2\\0&2&3\end{bmatrix}$ with respect to your spanning set

\item Let $V$ be the vector space of $2\times 2$ matrices. Show that the set of matrices of the form
\[
    \begin{bmatrix}a&b\\b&a\end{bmatrix}
\]
is a subspace of $V$. What is its dimension?

\item Determine whether the following is a subspace of the vector space of continuous functions on $[0,1]$: the set of all continuous functions $f$ satisfying $f(0)=2f(1)$. Justify your answer.

\item Prove that, for fixed constant $c$, the set of all polynomials $p(x)$ satisfying $p(c)=0$ forms a vector space. 

\item For the matrix
\[
    A = \begin{bmatrix}
    1 & 2 & 1 \\
    2 & 4 & -1 \\
    -1 & -2 & 3
    \end{bmatrix}
\]
(a) Find a basis for the row space $\row(A)$
(b) Find a basis for the column space $\column(A)$
(c) Show that $\dim(\row(A)) = \dim(\column(A))$ in this case
(d) Find a vector in $\R^3$ that is not in the column space of $A$

\item Consider the matrices
\[
    A = \begin{bmatrix}
    1 & 1 & 0 \\
    -1 & 2 & 1 \\
    0 & 3 & 1
    \end{bmatrix}
    \quad\text{and}\quad
    B = \begin{bmatrix}
    2 & -1 & -1 \\
    -2 & 4 & 2 \\
    0 & 3 & 1
    \end{bmatrix}
\]
Show that $\row(A) = \row(B)$ by finding explicit linear combinations relating their rows. What does this tell you about the relationship between $\column(A)$ and $\column(B)$?

\item Show that for any vectors $\vect{v}_1,\ldots,\vect{v}_k$ in a vector space $V$, the span of $\{\vect{v}_1,\ldots,\vect{v}_k\}$ is the smallest subspace of $V$ containing all the vectors $\vect{v}_1,\ldots,\vect{v}_k$. (Hint: prove it is a subspace, contains the vectors, and is contained in any other subspace containing the vectors.)

\item Let $V$ be a vector space and $U<V$ a subspace. Prove that if $\vect{v}\not\in U$, then $\{\vect{v}\}\cup U$ is linearly dependent if and only if $\vect{v}\in U$.

\item Show that in any vector space $V$, if 
$\{\vect{v_1},\ldots,\vect{v}_k\}$ spans $V$ and 
$\{\vect{w}_1,\ldots,\vect{w}_m\}$ is linearly independent in $V$, then $m\leq k$. What does this tell us about different spanning sets?

\item For vectors $\vect{v}_1,\vect{v}_2,\vect{v}_3$ in a vector space $V$, prove or disprove: if $\vect{v}_1$ and $\vect{v}_2$ are linearly independent, and $\{\vect{v}_1,\vect{v}_2,\vect{v}_3\}$ is linearly dependent, then $\vect{v}_3$ must lie in $\text{span}\{\vect{v}_1,\vect{v}_2\}$.

\item Let $U = \{(x,x,0): x\in\R\}$ and $W = \{(0,y,z): y,z\in\R\}$ be subspaces of $\R^3$. Prove that $\R^3 = U\directsum W$ by showing: (i) every vector in $\R^3$ can be written as a sum of vectors from $U$ and $W$, and (ii) the only vector in both $U$ and $W$ is $\vect{0}$.

\item In the space $\poly_2$ of polynomials of degree at most 2, let $U$ be the subspace of even polynomials (where $p(-x)=p(x)$) and $W$ the subspace of odd polynomials (where $p(-x)=-p(x)$). Show that $\poly_2 = U\directsum W$.

\item Let $V = \R^{2\times 2}$ be the space of $2\times 2$ matrices. Let $U$ be the subspace of upper triangular matrices and $W$ the subspace of strictly lower triangular matrices. Show that $V\neq U\directsum W$ by finding a nonzero matrix in both $U$ and $W$.

\item Let $V = \poly_3$ and define $U = \{p\in\poly_3: p(0)=0\}$ and $W = \{p\in\poly_3: p(x)=c \text{ for some constant }c\}$. Show that $V = U\directsum W$.

\item Let $U$ and $W$ be subspaces of $\R^3$. Prove that if $\dim U + \dim W > 3$, then $U$ and $W$ cannot form a direct sum (that is, they must have nonzero intersection).

\item Let $U$ and $W$ be subspaces of a vector space $V$ such that $V = U\directsum W$. If $\vect{v}\in V$, prove that the expression $\vect{v} = \vect{u} + \vect{w}$ with $\vect{u}\in U$ and $\vect{w}\in W$ must be unique.

\item Let $U_1,U_2,U_3$ be subspaces of a vector space $V$. Prove that if $V = U_1\directsum U_2\directsum U_3$, then any vector $\vect{v}\in V$ can be written uniquely as $\vect{v} = \vect{u}_1 + \vect{u}_2 + \vect{u}_3$ with $\vect{u}_i\in U_i$.

\item Let $U$ and $W$ be finite-dimensional subspaces of a vector space $V$. Prove that
\[
    \dim(U + W) = \dim U + \dim W - \dim(U\cap W)
\]

\end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Linear Transformations}
\label{ch:3}

%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``he became what he beheld; he became what he was doing; he was himself transform'd''}

\newthought{The essence of Mathematics} lies not in objects but in transformations between them. The vectors and spaces we have thus far studied come alive only when acted upon  --  rotated, scaled, projected, or otherwise transformed. Such transformations are the verbs to our nouns, the operations that animate our mathematical universe. Linear transformations are those which preserve the fundamental operations of vector spaces: addition and scaling. This seemingly modest requirement  --  that our transformations respect vector space structure  --  leads to a remarkably rich theory with profound practical implications.

Vector spaces, in isolation, are static collections. The power of linear algebra emerges when we consider mappings which morph  from input signals to output responses, from configurations to forces, from high-dimensional data to low-dimensional representations. Such mappings, when linear, possess a beautiful structure that both illuminates their theoretical properties and enables their practical computation.

Our journey begins with familiar matrix transformations before ascending to more abstract heights. The operations of differentiation and integration, though far from geometric, share deep structural features with their matrix cousins. This abstraction reveals four fundamental spaces associated with any linear transformation  --  the kernel and image that capture what vanishes and what is attained, the coimage and cokernel that measure efficiency and defect. These spaces, seemingly distinct, are bound together by the Fundamental Theorem of Linear Algebra, a result that unifies the algebraic, geometric, and dimensional aspects of linear transformations into a single coherent picture.


% ==============================================
\section{Euclidean Transformations}
\label{sec:euclidean}
% ==============================================

Our story begins in familiar territory  --  with matrices acting on vectors in Euclidean space. From multivariable calculus, we recall how multiplication by a matrix $A$ transforms vectors in $\R^n$, taking input vector $\vect{x}$ to output $A\vect{x}$. Though we performed such operations mechanically, computing products column-by-row, these transformations have rich geometric content worth savoring before abstraction.

The simplest such transformations scale space uniformly. A matrix $cI$ multiplies each coordinate by the scalar $c$, dilating or contracting space about the origin. More interesting are matrices that scale different directions differently:
\[
\begin{bmatrix}
2 & 0 \\
0 & 1/2
\end{bmatrix}
\begin{pmatrix}
x \\ y
\end{pmatrix}
=
\begin{pmatrix}
2x \\ y/2
\end{pmatrix}
\]
Such transformations stretch space along one axis while compressing along another  --  like a funhouse mirror's distortion rendered precise in coordinates.

Rotations in the plane arise from matrices of the form
\[
\begin{bmatrix}
\cos\theta & -\sin\theta \\
\sin\theta & \cos\theta
\end{bmatrix}
\]
spinning vectors through angle $\theta$ counterclockwise about the origin. That such matrices preserve lengths and angles is a consequence of their structure  --  see Chapter \ref{ch:5}. 

More subtle are \style{shear transformations}, such as
\[
\begin{bmatrix}
1 & h \\
0 & 1
\end{bmatrix}
\]
\begin{marginfigure}
{\em Question:}    What happens with the transpose of this horizontal shear? 
\end{marginfigure}
which offset each horizontal line by an amount proportional to its height. These preserve area while tilting vertical lines  --  like a deck of cards carefully slid across a table. 

These elementary transformations  --  scaling, rotation, and shear  --  combine to generate all linear transformations in the plane. Any $2\times 2$ matrix can be understood as a composition of such basic geometric operations. This decomposition previews deeper structure to come, when we learn to factor matrices into simpler constituent parts.

What features do these transformations share, beyond their realization through matrix multiplication? First, they preserve the origin  --  the zero vector remains fixed. Second, they respect vector addition: the image of a sum equals the sum of the images. Third, they interact naturally with scalar multiplication: doubling an input vector doubles its image. These properties  --  seemingly obvious in the matrix setting  --  will form the scaffolding for our abstract theory.
%
\begin{marginfigure}
{\em Foreshadowing:} The properties we observe in matrix transformations  --  preservation of vector operations  --  will define linearity in the abstract setting.
\end{marginfigure}

Consider as well what these transformations can destroy. A rotation preserves distances but changes coordinates. A shear preserves areas but distorts angles. A scaling changes both distances and areas, but preserves lines through the origin. This selective preservation of geometric features hints at deeper invariants  --  quantities or properties that remain unchanged under certain classes of transformations.

The matrix transformation $A\vect{x}$ converts geometric intuition about transforming space into algebraic manipulation of coordinates. As we lift these ideas to abstract vector spaces, this interplay between geometry and algebra will remain. Though we may lose the ability to visualize transformations directly, the core ideas  --  preservation of vector operations, study of invariants, decomposition into simpler parts  --  will guide our development.

% ==============================================
\section{Definitions \& Implications}
\label{sec:definitions}
% ==============================================

Our experience with Euclidean transformations suggests key features that characterize the essence of linearity: preservation of addition and scaling. Many important transformations share these algebraic properties while lacking obvious geometric interpretation. This motivates abstracting away from geometry to study linear transformations between arbitrary vector spaces.

\begin{definition}[Linear Transformation]
Let $V$ and $W$ be vector spaces. A \style{linear transformation} $T:V\to W$ is a function satisfying two properties:
\begin{enumerate}
    \item Additivity: $T(\vect{v_1}+\vect{v_2}) = T(\vect{v_1})+T(\vect{v_2})$ for all $\vect{v_1},\vect{v_2}\in V$
    \item Homogeneity: $T(c\vect{v}) = cT(\vect{v})$ for all $c\in\R$ and $\vect{v}\in V$
\end{enumerate}
\end{definition}

These two properties combine to ensure that linear transformations preserve linear combinations. This seemingly simple requirement has profound implications.

\begin{lemma}
\label{lem:lintran}
A linear transformation $T:V\to W$ satisfies:
\begin{enumerate}
    \item $T(\vect{0})=\vect{0}$
    \item $T(-\vect{v})=-T(\vect{v})$ for all $\vect{v}\in V$
    \item $T$ preserves linear combinations
    \[
    T\left(\sum_{i=1}^nc_i\vect{v_i}\right)
    =
    \sum_{i=1}^nc_iT(\vect{v_i})
    \]
\end{enumerate}
\end{lemma}

\begin{marginfigure}
{\em Foreshadowing:} The preservation of linear combinations will be the key to understanding how linear transformations interact with bases and coordinate systems.
\end{marginfigure}

The preservation of linear combinations has an important consequence for subspaces: linear transformations send subspaces to subspaces.

\begin{lemma}
\label{lem:subspace}
If $T:V\to W$ is linear and $U<V$, then $T(U)<W$.
\end{lemma}

% ==============================================
\subsection*{Examples of Linear Transformations}
\label{sec:examples-lintrans}
% ==============================================

The simplest examples of linear transformations are those of the previous section -- any matrix $A$ acts on Euclidean vectors via the linear transformation $T_A(\vect{x}) = A\vect{x}$. This is so matural that it is hardly worth calling out as anything different than the matrix itself. However, not all linear transformations are so explicit in coordinates. Consider the following examples of a less geometric nature.

\begin{example}[Differentiation]
Consider the differentiation operator $D=d/dx$ from calculus. This satisfies linearity in that $D(f+g)=Df+Dg$ for differentiable functions $f$ and $g$; and $D(cf) = c\,Df$ for $c$ a scalar. As such, it defines a linear transformation from $C^\infty(\R)$ to itself. If finite-dimensional vector spaces are preferred, one can restrict to polynomials, in which case $D:\poly_n\to\poly_{n-1}$:
\begin{marginfigure}
    Note that the subspace of constant polynomials is sent to zero. What happens to other subspaces of $\poly_n$?
\end{marginfigure} 
\[
    D\left(\sum_{i=0}^n c_ix^i\right) 
    = 
    \left(\sum_{j=1}^{n} jc_jx^{j-1}\right) .
\]
Here we see linearity without geometry -- derivatives of sums are sums of derivatives, and constants factor out of derivatives. 
\end{example}

\begin{example}[Integration]
The definite integration operator ${\rm I}:C([a,b])\to\R$ defined by
\[
    {\rm I}(f) = \int_a^b f(x)dx
\]
is, like differentiation, linear. What happens when we restrict attention to polynomials and to subspaces of polynomials?
\end{example}

The relationship between linear transformations and linear independence cuts to the heart of their structure. 

\begin{definition}[Injective and Surjective]
\label{def:inj-surj}
A linear transformation $T:V\to W$ is:
\begin{enumerate}
    \item \style{injective} (or \style{one-to-one}) if distinct inputs yield distinct outputs: $T(\vect{v_1})=T(\vect{v_2})$ implies $\vect{v_1}=\vect{v_2}$
    \item \style{surjective} (or \style{onto}) if every vector in $W$ is the image of some vector in $V$: for each $\vect{w}\in W$ there exists $\vect{v}\in V$ such that $T(\vect{v})=\vect{w}$
\end{enumerate}
\end{definition}
%
\begin{lemma}
\label{lem:inj-surj}
For a linear transformation $T:V\to W$:
\begin{enumerate}
    \item $T$ is injective if and only if it preserves linear independence
    \item $T$ is surjective if and only if $T(V)=W$
    \item $T$ is invertible if and only if it is both injective and surjective
\end{enumerate}
\end{lemma}
\begin{marginfigure}
{\em Caveat:} A linear transformation can fail to be invertible in two distinct ways: by mapping different vectors to the same image (non-injective) or by missing vectors in the target space (non-surjective).
\end{marginfigure}
%
The power of these abstractions lies in their breadth of application. Whether transforming geometric vectors, polynomials, or functions, the same principles govern their behavior.

% ==============================================
\section{Isomorphisms}
\label{sec:isomorphisms}
% ==============================================

When are two vector spaces fundamentally the same? The geometric vectors in $\R^2$ seem quite different from the linear polynomials $ax+b$, yet both allow the same operations and satisfy the same rules. Such observations lead us to examine what it means for vector spaces to be indistinguishable from the perspective of linear algebra.

Linear transformations are directed mappings between to vector spaces. Given $T:V\to W$, we call $V$ the \style{domain} and $W$ the \style{codomain}. To serve as a perfect dictionary between spaces, $T$ must possess both properties introduced in Definition \ref{def:inj-surj}: it must be both injective and surjective. Such transformations are called isomorphisms:
%
\begin{marginfigure}
{\em Nota bene:} The use of \style{codomain} may be unfamiliar, as it stems from category theory. Such is also the case with the terms \style{monomorphism} and \style{epimorphism} for injective and surjective maps respectively, though we shall not use those particular terms.
\end{marginfigure}

\begin{definition}[Isomorphism]
\label{def:isomorphism}
Vector spaces $V$ and $W$ are \style{isomorphic}, denoted $V\iso W$, if there exists an \style{isomorphism} between them  --  a linear transformation that is both injective and surjective.
\end{definition}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{example}[Coordinate vectors]
The transformation $T:\R^2\to\poly_1$ sending vectors to linear polynomials via
\[
    \begin{pmatrix}a\\b\end{pmatrix} \mapsto a + bx
\]
is an isomorphism. It provides a perfect dictionary between geometric vectors and linear polynomials, preserving all vector space operations. Addition of vectors corresponds to addition of polynomials; scaling vectors means scaling polynomials.
\end{example}

\begin{example}[Matrix representations]
The space $\R^{2\times 2}$ of $2$-by-$2$ matrices is isomorphic to $\R^4$ via the transformation
\[
    \begin{bmatrix}
    a & b \\ c & d
    \end{bmatrix}
    \mapsto
    \begin{pmatrix}
    a \\ b \\ c \\ d
    \end{pmatrix}
\]
Though we typically write matrices in square array, they are fundamentally no different from vectors  --  they simply package the same information differently.
\end{example}
\begin{marginfigure}
{\em Nota bene:} Matrix multiplication is invisible to the vector space structure. 
\end{marginfigure}

Not all linear transformations achieve this perfect correspondence. The projection $\proj{}:\R^3\to\R^2$ given by
\[
    \proj{}\begin{pmatrix}x\\y\\z\end{pmatrix} = \begin{pmatrix}x\\y\end{pmatrix}
\]
is surjective but not injective  --  it reaches every point in the plane but collapses all points differing only in their $z$-coordinate. Conversely, the embedding $\iota:\R^2\to\R^3$ given by
\[
    \iota\begin{pmatrix}x\\y\end{pmatrix} = \begin{pmatrix}x\\y\\0\end{pmatrix}
\]
is injective but not surjective  --  it preserves all information about vectors in the plane but misses most of $\R^3$.

\begin{marginfigure}
{\em Question:} Are any two vector spaces with the same dimension in fact isomorphic? 
\end{marginfigure}

These examples suggest a deep truth: vector spaces of different dimensions cannot be isomorphic. The projection above shows that a larger space cannot inject into a smaller one without collapsing; the embedding shows a smaller space cannot surject onto a larger one without missing vectors. This observation  --  though as yet unproven  --  hints at the fundamental nature of dimension in linear algebra.

The language of isomorphisms provides more than mere classification  --  it offers a perspective on what features of vector spaces truly matter. When spaces are isomorphic, we may freely translate problems between them, choosing whichever representation is most convenient. The geometric intuition of $\R^n$ becomes available to spaces of polynomials, matrices, or signals, provided we have constructed the right dictionary between them.

\begin{example}[Polynomial derivatives]
The differentiation operator $D:\poly_2\to\poly_1$ given by
\[
    D(ax^2 + bx + c) = 2ax + b
\]
is surjective but not injective. Every linear polynomial is a derivative (surjective), but constants vanish under differentiation (non-injective). 
\end{example}

% ==============================================
\section{Image \& Kernel}
\label{sec:imagekernel}
% ==============================================

The subspaces we encountered in Chapter \ref{ch:2} arose from operations within a single vector space. Linear transformations generate their own characteristic subspaces  --   in both domain and codomain. These subspaces capture the essential features of how the transformation acts, measuring both its effectiveness and its defects.

Fix throughout a linear transformation $T:V\rightarrow W$ between vector spaces. The first subspace of interest lies in the codomain. 

\begin{definition}[Image]
\label{def:image}    
The \style{image} of $T$, denoted $\im T$, is the subspace of the codomain consisting of all possible outputs:
\begin{equation}
    \im T = \{T(\vect{v}): \vect{v}\in V\} < W
\end{equation}
\end{definition}
%
That this is indeed a subspace of $W$ follows readily: the zero vector is certainly in the image (as $T(\vect{0})=\vect{0}$), and if $T(\vect{v_1})$ and $T(\vect{v_2})$ are any vectors in the image, then their sum $T(\vect{v_1})+T(\vect{v_2})=T(\vect{v_1}+\vect{v_2})$ is also in the image, as is any scalar multiple. The image measures the ``reach'' of the transformation  --  how much of $W$ can be attained as output.

\begin{definition}[Kernel]
\label{def:kernel}
The \style{kernel} (or \style{nullspace}) of $T$, denoted $\ker T$, is the subspace of the domain consisting of all vectors that vanish under $T$:
\begin{equation}
    \ker T = \{\vect{v}\in V: T(\vect{v})=\vect{0}\} < V
\end{equation}
\end{definition}
%
That this too forms a subspace of $V$ is again straightforward: the zero vector is certainly in the kernel; and if $\vect{v_1}$ and $\vect{v_2}$ are in the kernel, then $T(\vect{v_1}+\vect{v_2})=T(\vect{v_1})+T(\vect{v_2})=\vect{0}$, with scalar multiples following similarly. The kernel captures what the transformation cannot ``see''  --  the vectors that disappear under its action.

These abstract definitions crystallize our earlier work with linear systems. Consider the matrix equation $A\vect{x}=\vect{b}$ from Chapter \ref{ch:1}. This defines a linear transformation $T_A:\R^n\rightarrow\R^m$ via $T_A(\vect{x})=A\vect{x}$. The standard questions about this system now have geometric meaning:
\begin{enumerate}
    \item Does a solution exist? Yes if and only if $\vect{b}\in\im T_A$.
    \item Is the solution unique? Yes if and only if $\ker T_A=\{\vect{0}\}$.
    \item If more than one solution exists, how are they related? They differ by elements of $\ker T_A$.
\end{enumerate}

\begin{example}[Calculus operators]
The differentiation operator $D:C^1([a,b])\rightarrow C([a,b])$ has kernel consisting of all constant functions on $[a,b]$  --  these are precisely the functions that vanish under differentiation. Its image consists of all continuous functions that arise as derivatives, a proper subspace of $C([a,b])$ (not every continuous function is a derivative).

The definite integration operator ${\rm I}:C([a,b])\rightarrow\R$ defined by ${\rm I}(f)=\int_a^b f(x)dx$ has (very large) kernel consisting of all functions whose integral over $[a,b]$ vanishes. Its image is all of $\R$  --  any real number can be realized as the integral of some continuous function.
\end{example}
%
\begin{marginfigure}
{\em Foreshadowing:} The relationship between the dimensions of kernel and image will prove fundamental to understanding linear transformations. This balance between what vanishes and what is attained is captured in the Fundamental Theorem at the end of this chapter.
\end{marginfigure}

These subspaces provide the first tools for analyzing the structure of linear transformations. A transformation is one-to-one precisely when its kernel contains only the zero vector; it is onto when its image is the entire codomain. The interplay between these subspaces  --  how their dimensions balance, how they decompose the spaces involved  --  leads to the deeper theory ahead.

% ==============================================
\section{Rank \& Nullity}
\label{sec:ranknullity}
% ==============================================

The dimension of a vector space captures its size and complexity. For a linear transformation, we seek similar measures of size and complexity  --  not of a single space, but of how the transformation acts between spaces. These measures arise naturally from the dimensions of image and kernel.

\begin{definition}[Rank \& Nullity]
\label{def:rank+nullity}
The \style{rank} of a linear transformation $T:V\rightarrow W$ is the dimension of its image:
\begin{equation}
\rank T = \dim(\im T)
\end{equation}
The \style{nullity} of $T$ is the dimension of its kernel:
\begin{equation}
\nullity T = \dim(\ker T)
\end{equation}
\end{definition}

This abstracts the pedestrian notion of matrix rank from Definition \ref{def:pseudorank}. When $T$ is represented by a matrix $A$, the abstract and concrete ranks coincide. 

\begin{marginfigure}
{\em Think:} rank measures the transformation's effective dimension  --  like light passing through crystal, part transmitted and part absorbed, it counts the independent directions that survive. The nullity complements this measure by counting dimensions lost, those directions that, like perfectly absorbed light, vanish entirely in transformation.
\end{marginfigure}

\begin{example}[Matrix Rank and Nullity]
For a $3\times 4$ matrix $A$ of rank 2, the transformation $T_A:\R^4\rightarrow\R^3$ has:
\begin{enumerate}
    \item $\rank T_A = 2$, meaning $\im T_A$ is a plane in $\R^3$
    \item $\nullity T_A = 2$, as solving $A\vect{x}=\vect{0}$ yields a two-dimensional solution space
    \item $\dim(\ker T_A) + \dim(\im T_A) = \dim(\R^4) = 4$
\end{enumerate}
This last observation hints at a deeper relationship between rank and nullity.
\end{example}

\begin{example}[Calculus operators]
The differentiation operator $D:\poly_n\rightarrow\poly_{n-1}$ has:
\begin{enumerate}
    \item $\rank D = n$, as every polynomial in $\poly_{n-1}$ is a derivative
    \item $\nullity D = 1$, as only constant functions vanish under differentiation
    \item $\dim(\ker D) + \dim(\im D) = \dim(\poly_n) = n+1$
\end{enumerate}
Again we see the dimensions balance.
\end{example}

\begin{example}[Integration]
Consider the definite integration operator ${\rm I}:C[0,1]\rightarrow\R$ defined by ${\rm I}(f)=\int_0^1 f(x)dx$. Though $C[0,1]$ is infinite-dimensional:
\begin{enumerate}
    \item $\rank {\rm I} = 1$, as the image is all of $\R$
    \item $\nullity {\rm I}$ is infinite-dimensional, containing all functions whose integral vanishes
    \item The dimensional balance breaks down in the infinite-dimensional setting
\end{enumerate}
\end{example}
%
\begin{marginfigure}
{\em Caveat:} The relationship between rank and nullity becomes more subtle in infinite dimensions. The examples here are meant to build intuition in the finite-dimensional case.
\end{marginfigure}

These examples suggest deep connections between rank, nullity, and the dimensions of domain and codomain. 

% ==============================================
\section{Quotients}
\label{sec:quotients}
% ==============================================

Linear transformations reveal structure not only through what they preserve, but through what they collapse. The manner in which different vectors map to identical outputs suggests a natural organization  --  grouping vectors that transform identically. This insight leads to one of the most profound constructions in linear algebra: the quotient space.

\begin{definition}[Quotient Space]
\label{def:quotient}
Let $V$ be a vector space and $U<V$ a subspace. The \style{quotient space} $V/U$ is the vector space whose elements are equivalence classes of vectors in $V$, where vectors $\vect{v}_1,\vect{v}_2\in V$ are equivalent if and only if their difference lies in $U$:
\[
    \vect{v}_1 \sim \vect{v}_2 \iff \vect{v}_1-\vect{v}_2 \in U
\]
The equivalence class of $\vect{v}\in V$, denoted $[\vect{v}]$, consists of all vectors equivalent to $\vect{v}$:
\[
    [\vect{v}] = \{\vect{w}\in V : \vect{w}-\vect{v}\in U\} = \vect{v}+U
\]
Vector operations on $V/U$ are defined through representatives: $[\vect{v}_1]+[\vect{v}_2]=[\vect{v}_1+\vect{v}_2]$ and $c[\vect{v}]=[c\vect{v}]$ for scalar $c$.
\end{definition}

\begin{marginfigure}
The properties of subspaces ensure $\sim$ defines a proper equivalence relation: reflexive ($\vect{v}\sim\vect{v}$), symmetric ($\vect{v}_1\sim\vect{v}_2$ implies $\vect{v}_2\sim\vect{v}_1$), and transitive ($\vect{v}_1\sim\vect{v}_2$ and $\vect{v}_2\sim\vect{v}_3$ implies $\vect{v}_1\sim\vect{v}_3$).
\end{marginfigure}

The physical intuition for quotient spaces emerges naturally in electrical networks. Consider a circuit with $n$ nodes, where we measure voltage differences between pairs of nodes. Though each node has its own voltage potential, the physically meaningful measurements are always differences  --  adding a constant voltage to every node leaves all measurements unchanged. This observation reveals the fundamental role of quotients in physics.

Let $V=\R^n$ be the vector space of voltage assignments to nodes. The projection $\proj{}$ that sends each voltage configuration to its equivalence class under constant shifts:
\[
    \proj{}:V\to Q \quad : \quad \vect{v} \mapsto [\vect{v}]
\]
has kernel consisting precisely of constant voltage shifts $(c,c,\ldots,c)^T$. The quotient space $V/\ker(\proj{})$ then captures the physically meaningful voltage states, stripped of their artificial dependence on reference potential.

\begin{example}[Kernel Quotients]
\label{ex:kernelquotient}
Let $T:V\rightarrow W$ be a linear transformation. Two vectors that differ by an element of $\ker T$ are sent to the same output:
\[
    T(\vect{v_1}) = T(\vect{v_2}) \iff \vect{v_1}-\vect{v_2}\in\ker T
\]
The quotient space $V/\ker T$ naturally represents the ``effective'' input space of $T$  --  it identifies inputs that $T$ cannot distinguish.
\end{example}

\begin{marginfigure}
{\em Foreshadowing:} When we introduce inner products, each equivalence class will have a unique representative orthogonal to the subspace being quotiented. For now, we work with the classes themselves.
\end{marginfigure}

The quotient space inherits a vector space structure from $V$. Addition and scalar multiplication are defined on equivalence classes:
\[
    [\vect{v_1}] + [\vect{v_2}] = [\vect{v_1}+\vect{v_2}] \quad\text{and}\quad c[\vect{v}] = [c\vect{v}]
\]
These operations are well-defined  --  independent of which representatives we choose from the equivalence classes.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{example}[Geometric quotients]
Consider first quotienting $\R^3$ by a line $L$ through the origin. Two points $\vect{p},\vect{q}\in\R^3$ belong to the same equivalence class precisely when their difference $\vect{p}-\vect{q}$ lies in $L$  --  that is, when they differ by some vector parallel to $L$. Each equivalence class thus forms a plane parallel to $L$, as shifting along $L$ keeps us within the same class. The quotient space $\R^3/L$ can be visualized as the collection of all such parallel planes, naturally parametrized by a two-dimensional plane perpendicular to $L$. Though abstract, this quotient has dimension exactly 2, as specifying a plane requires two coordinates once we fix its direction.

Now consider instead quotienting $\R^3$ by a plane $P$ through the origin. The equivalence classes are lines perpendicular to $P$  --  each point equivalent to all others lying directly above or below it relative to $P$. The quotient space $\R^3/P$ becomes naturally one-dimensional, parametrized by the signed distance along the normal vector to $P$. This distance provides a concrete realization of the abstract quotient: two points are equivalent precisely when they have the same projection onto the normal vector to $P$, or equivalently, when they lie the same distance from $P$ along parallel perpendicular lines.
\end{example}

The dimension of a quotient space reflects both the dimension of the original space and the dimension of the subspace being quotiented:
\[
    \dim(V/U) = \dim V - \dim U
\]
This dimensional relationship will prove crucial in understanding how linear transformations decompose spaces.

\begin{example}[Integration quotients]
Consider the indefinite integral operator (or {\em antidifferentiation}) $D^{-1}$ acting on continuous functions $C([a,b])$ on an interval. An antiderivative always exists (thanks to the FTIC) but is well-defined only up to a constant. 
\begin{marginfigure}
    You did not forget the $+C$ did you?
\end{marginfigure}
To make this a linear transformation of vector spaces requires using a quotient space for the codomain. Let $U<C([a.b])$ denote the subspace of constant functions on the interval. Then the quotient $C([a,b])/U$ consists of classes of functions whose differences are constants. Indefinite integration is now a linear transformation
\[
    D^{-1} : C([a,b]) \to C([a,b])/U .
\]
\end{example}

\begin{example}[Translation Invariance]
Consider a dataset of $n$ points in $\R^d$, represented as columns of a matrix $X\in\R^{d\times n}$. In many applications, such as clustering or pattern recognition, we care about the relative positions of points rather than their absolute positions in space.

Let $V=\R^{d\times n}$ be the vector space of all possible datasets. The subspace of uniform translations $U<V$ consists of all matrices whose columns are identical:
\[
    U = \{\begin{bmatrix}\vect{v} & \vect{v} & \cdots & \vect{v}\end{bmatrix} : \vect{v}\in\R^d\} = \{\vect{v}\cdot\vect{1}^T : \vect{v}\in\R^d\}
\]
where $\vect{1}\in\R^n$ is the vector of all ones. Each matrix in $U$ represents a uniform translation of the zero matrix by some vector $\vect{v}\in\R^d$.

Two datasets $X,Y\in V$ are equivalent modulo translation if their difference $X-Y$ lies in $U$  --  that is, if there exists some vector $\vect{v}\in\R^d$ such that each column of $X$ is translated by $\vect{v}$ to obtain the corresponding column of $Y$:
\[
    X \sim Y \iff X-Y \in U \iff X-Y = \vect{v}\cdot\vect{1}^T \text{ for some } \vect{v}\in\R^d
\]

The quotient space $V/U$ then represents datasets modulo translation  --  it captures the intrinsic shape of point configurations while ignoring their absolute position in space. A dimension count reveals the structure:
\begin{marginfigure}
    This gives an explicit isomorphism between $V/U$ and the subspace of mean-zero datasets.
\end{marginfigure}
\begin{itemize}
    \item $\dim(V) = dn$ (coordinates of $n$ points in $\R^d$)
    \item $\dim(U) = d$ (coordinates of the translation vector)
    \item $\dim(V/U) = dn-d$ (data sets up to translation)
\end{itemize}
\end{example}

Quotient spaces provide a formal way to identify vectors that behave similarly under certain operations. When we quotient a domain by the kernel of a transformation, we obtain a space that faithfully represents how the transformation acts, stripped of redundancy. This perspective will prove invaluable as we develop more sophisticated tools for analyzing linear transformations.

% ==============================================
\section{Coimage \& Cokernel}
\label{sec:coimagecokernel}
% ==============================================

The image and kernel of a linear transformation tell only half the story. Just as quotient spaces reveal structure by identifying vectors that behave similarly, we can illuminate the action of a linear transformation by examining quotients in both domain and codomain. This leads to two additional spaces that complete our structural understanding.

\begin{definition}[Coimage]
\label{def:coimage}
Given a linear transformation $T:V\rightarrow W$, the \style{coimage} of $T$ is the quotient space
\begin{equation}
        \coim T = V/\ker T
\end{equation}
\end{definition}
The coimage represents the ``effective'' input space of $T$  --  it identifies inputs that $T$ cannot distinguish. The projection $\proj{}:V\rightarrow \coim T$ sends each vector to its equivalence class modulo $\ker T$. 

\begin{marginfigure}
{\em Example:} For a rank-2 matrix $A:\R^4\rightarrow\R^3$, the coimage is 2-dimensional, representing the two independent input directions that affect the output.
\end{marginfigure}

The coimage has a natural interpretation: it measures how many independent input directions actually influence the output. Two vectors in $V$ produce the same output precisely when they differ by an element of $\ker T$  --  thus $\coim T$ parametrizes the truly distinct inputs as seen by $T$.

Our fourth fundamental space is the most hidden and obscure:
%
\begin{definition}[Cokernel]
\label{def:cokernel}
Given a linear transformation $T:V\rightarrow W$, the \style{cokernel} of $T$ is the quotient space
\begin{equation}
        \coker T = W/\im T
\end{equation}
\end{definition}
%
The cokernel measures the failure of $T$ to reach all of $W$. When $T$ is surjective, $\coker T$ is trivial; otherwise, it captures the ``invisible'' space in the codomain.

\begin{example}[Differential operators]
For the derivative operator $D:\poly_n\rightarrow\poly_{n-1}$:
\begin{enumerate}
    \item The coimage is $(n)$-dimensional, as only constants vanish under $D$
    \item The cokernel is trivial, as every polynomial in $\poly_{n-1}$ is a derivative
\end{enumerate}
For integration ${\rm I}:C[0,1]\rightarrow\R$:
\begin{enumerate}
    \item The coimage captures functions differing by more than their average
    \item The cokernel is trivial, as every real number is an integral
\end{enumerate}
\end{example}

\begin{marginfigure}
{\em Foreshadowing:} The dimensions of these spaces are not independent  --  they satisfy a beautiful relationship that the next section will reveal.
\end{marginfigure}

These four fundamental spaces  --  kernel, image, cokernel, and coimage  --  form a complete set of structural invariants for a linear transformation. Their relationships, seemingly complex, crystallize in the Fundamental Theorem ahead. Each measures a different aspect of how $T$ transforms space:
\begin{itemize}
    \item $\ker T$ captures what vanishes
    \item $\im T$ shows what is attainable
    \item $\coim T$ reveals independent inputs
    \item $\coker T$ measures missing outputs
\end{itemize}

The stage is now set for a deeper understanding of how these spaces fit together  --  a unification that explains not just what these spaces are, but why they must exist and how they relate.

% ==============================================
\section{The Fundamental Theorem}
\label{sec:fundamental}
% ==============================================

Mathematics achieves clarity through unification. The various structures we have encountered  --  images and kernels, quotients, cokernels and coimages  --  are not merely related but fundamentally interwoven. The tapestry they form is one of the most beautiful results in linear algebra, revealing deep connections between algebraic and dimensional properties of linear transformations.

Consider a linear transformation $T:V\rightarrow W$ between finite-dimensional vector spaces. From our explorations, we have uncovered four fundamental subspaces:
\begin{enumerate}
    \item The image $\im T$, capturing all possible output
    \item The kernel $\ker T$, containing all inputs that vanish
    \item The coimage $\coim T$, encoding independent input directions
    \item The cokernel $\coker T$, measuring the failure to surject
\end{enumerate}
%
\begin{marginfigure}
{\em Example:} For the projection $T:\R^3\rightarrow\R^2$ onto the $xy$-plane, the kernel is the $z$-axis, the image is $\R^2$, the coimage is the $xy$-plane in the domain, and the cokernel is trivial.
\end{marginfigure}

These spaces, seemingly distinct, are bound together by a profound theorem that explains not only how they relate but why they must so relate. This is the Fundamental Theorem of Linear Algebra:

\begin{theorem}[Fundamental Theorem of Linear Algebra]
\label{thm:FTLA}
For any linear transformation $T:V\rightarrow W$ between finite-dimensional vector spaces, the following relationships hold and are equivalent:
\begin{enumerate}
    \item The domain and codomain decompose as direct sums:
    \[
        V \iso \ker T \directsum \coim T
        \quad\text{and}\quad
        W \iso \im T \directsum \coker T
    \]
    \item The coimage and image are naturally isomorphic: $\coim T \iso \im T$
\end{enumerate}
\end{theorem}
\begin{marginfigure}
    {\em ``Four mighty ones there are in every Man; a Perfect Unity''}
\end{marginfigure}

% ****************************************************
\begin{figure}
    \centering
    \includegraphics[width=0.75\linewidth]{FTLA-geom.png}
    \label{fig:FTLA}
\end{figure}
% ****************************************************
When translated to dimensions, the Fundamental Theorem is sometimes called the {\em Rank-Nullity Theorem}:
%
\begin{cor}[Rank-Nullity]
\label{cor:FTLA}
For a linear transformation between finite-dimensional vector spaces, the dimensions balance in complementary pairs:
    \[
        \dim V = \dim(\ker T) + \dim(\coim T)
        \quad\text{and}\quad
        \dim W = \dim(\im T) + \dim(\coker T)
    \]
Furthermore, the rank connects domain and codomain:
    \[
        \dim(\coim T) = \rank(T) = \dim(\im T) 
    \]
Otherwise said:
    \[
        \dim V = \nullity(T) + \rank(T)
    \]
\end{cor}

\begin{example}[Matrix rank]
When $T$ is represented by a matrix $A$, these relationships explain why:
\begin{enumerate}
    \item The nullity $\nullity(A)$ plus the rank $\rank(A)$ equals the number of columns of $A$
    \item The \style{row rank} ($=\dim\row(A)$) and the \style{column rank} ($=\dim\column(A)$) are equal to $\rank(A)$
\end{enumerate}
\begin{marginfigure}
{\em Foreshadowing:} These algebraic relations will acquire additional geometric significance when we introduce inner products in Chapter 5.
\end{marginfigure}
These familiar facts from matrix algebra are manifestations of the deeper structural relationships guaranteed by the Fundamental Theorem.
\end{example}

The theorem has immediate practical implications. When solving a linear system $T\vect{x}=\vect{b}$, we now understand that:
\begin{enumerate}
    \item A solution exists if and only if $\vect{b}\in\text{im}(T)$
    \item When a solution exists, others differ by elements of ker$(T)$
    \item Any solution can be uniquely decomposed into parts from coim$(T)$ and ker$(T)$
    \item The obstruction to existence lies in coker$(T)$
\end{enumerate}

\begin{example}[Solving linear systems]
Consider solving $A\vect{x}=\vect{b}$ where $A$ is $3\times 4$ of rank 2. The Fundamental Theorem tells us that:
\begin{enumerate}
    \item The nullspace has dimension 2
    \item The image has dimension 2
    \item Solutions exist only when $\vect{b}$ lies in a 2-dimensional subspace
    \item When solutions exist, they form a 2-dimensional affine space
\end{enumerate}
\end{example}
%
\begin{marginfigure}
{\em Foreshadowing:} When we introduce inner products, these decompositions will provide the foundation for finding optimal approximate solutions when exact solutions do not exist.
\end{marginfigure}

The Fundamental Theorem is more than a collection of relationships  --  it is a lens through which we view linear transformations. Whether analyzing electrical networks, processing signals, or fitting models to data, these structural relationships guide our understanding and inform our computations. The decompositions it guarantees will prove even more powerful when enhanced with geometric structure in subsequent chapters.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Graph Topology \& Network Structure}
\label{EM:topology}
% **************** EMANATION *******************

% EMANATION: NETWORK STRUCTURE THROUGH LINEAR ALGEBRA

When a city's power grid fails, engineers must quickly identify which neighborhoods remain connected and where backup pathways exist. Similar questions arise across networks: Can a signal reach all neurons in a circuit? Will information flow reliably through a social network? Does a computer network contain redundant paths to route around failures? These practical concerns about connectivity and resilience share a deep mathematical structure that emerges through careful application of linear algebra to network topology.

Consider a finite directed graph $G=(V,E)$ with vertex set $V=\{v_1,\ldots,v_n\}$ and edge set $E=\{e_1,\ldots,e_m\}$. Each edge $e$ has a specified orientation, with starting vertex $e^-$ and ending vertex $e^+$. From this discrete structure we construct two fundamental vector spaces:
\begin{itemize}
    \item $C_0(G)$: the vector space with basis elements the vertices $V$
    \item $C_1(G)$: the vector space with basis elements the oriented edges $E$
\end{itemize}
These spaces have dimensions $n$ and $m$ respectively. Though their elements can be interpreted as assignments of numbers to vertices or edges (like voltages or currents), viewing them as abstract vector spaces clarifies their fundamental structure.

The relationship between these spaces emerges through a natural transformation called the \style{boundary operator} $\partial:C_1(G)\to C_0(G)$. On basis elements, this operator acts by:
\[
    \partial(e) = e^+ - e^-
\]
extending linearly to all of $C_1(G)$. Though defined using edge orientations, its fundamental properties --- captured through kernel and cokernel --- prove independent of these choices.

\begin{example}[Ladder Network]
\label{ex:ladder}
Consider a ``ladder'' network with eight vertices and ten edges arranged and labeled as in the figure, right. The boundary operator has matrix representation:
\begin{marginfigure}
%\centering
\includegraphics[width=1.0in]{ladder-graph.png}
\end{marginfigure}
\[
    \partial = 
    \begin{bmatrix}
  -1 & -1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\
    1 & 0 & -1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\
    0 & 1 & 0 & -1 & -1 & 0 & 0 & 0 & 0 & 0 \\
    0 & 0 & 1 & 1 & 0 & -1 & 0 & 0 & 0 & 0 \\
    0 & 0 & 0 & 0 & 1 & 0 & -1 & -1 & 0 & 0 \\
    0 & 0 & 0 & 0 & 0 & 1 & 1 & 0 & -1 & 0 \\
    0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & -1 \\
    0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 1 
    \end{bmatrix}
\]
This network contains many cycles, yet there are but three that can be chosen to be {\em independent}. One simple representative follows the top square $e_2+e_4-e_3-e_1$, where the minus signs indicate traversing an edge opposite its orientation. This, together with the other two obvious squares, forms a basis for $\ker\partial$. 
\end{example}

Such \style{cycles} --- elements of $\ker(\partial)$ --- represent closed paths through the network where the ``flow'' in equals flow out at each vertex. Not every element of $\ker(\partial)$ corresponds to a simple cycle; some represent combinations of cycles. The dimension of this kernel, denoted $\beta_1$, counts the number of \style{independent} cycles --- those that cannot be expressed as combinations of smaller cycles. A power grid with larger $\beta_1$ offers more backup paths; a neural circuit with independent cycles can sustain more complex recurrent patterns.
\begin{marginfigure}
    {\em Nota bene:} the symbol $\beta$ stands for {\em Betti number}, a fundamental object of study in algebraic topology.
\end{marginfigure}

The cokernel of $\partial$ reveals complementary structure through its quotient space $C_0(G)/\im(\partial)$. This space effectively identifies vertices that can be reached from each other through network paths. Its dimension $\beta_0$ counts the network's connected components. A power grid with $\beta_0 > 1$ has disconnected regions requiring immediate attention; a neural network with multiple components represents independent processing modules.

These numbers satisfy a remarkable relationship:
\begin{equation}
    \beta_1 - \beta_0 = m - n
\end{equation}
\begin{marginfigure}
    {\em Question:} what happens if you subdivide each edge, adding a new vertex in the midst of each edge, and splitting the edge in two?
\end{marginfigure}
That is, the number of independent cycles minus the number of connected components equals the excess between edge count and vertex count. This equation --- simultaneously the rank-nullity theorem for $\partial$ and a combinatorial invariant of the graph --- expresses a fundamental balance between cycles and components. Adding edges tends to create cycles ($\beta_1$ increases) while joining components ($\beta_0$ decreases).

\begin{example}[Point Cloud Data]
\label{ex:pointcloud}
Modern data analysis often begins with points sampled from some underlying shape or manifold. Given points $\{x_1,\ldots,x_N\}$ in $\R^d$, we can construct a graph by connecting points within distance $\varepsilon$ of each other. The resulting network's topology --- measured through $\beta_0$ and $\beta_1$ --- reveals fundamental features of the underlying data:
\begin{itemize}
    \item $\beta_0$ counts clusters in the data
    \item $\beta_1$ detects holes and cycles
    \item The dependence of these numbers on $\varepsilon$ characterizes scale
\end{itemize}
This forms the foundation of modern \style{Topological Data Analysis}, where persistence of these features across scales indicates genuine structure rather than noise.
\end{example}

The practical value of this framework lies in its reduction of qualitative questions about network structure to systematic linear algebra. 
What began as abstract study of kernels and images emerges as practical tools for understanding networks. The fundamental theorem, revealing deep relationships between these spaces, provides the mathematical foundation for analyzing complex networks.
% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 3}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}

\item Let $T:\R^2\to\R^2$ be the linear transformation defined by $T(x,y) = (2x+y, x-y)$. Find the kernel and image of $T$. Is $T$ injective? Is $T$ surjective? Find the rank and nullity of $T$.

\item Consider the differentiation operator $D:\poly_2\to\poly_1$ defined by $D(ax^2 + bx + c) = 2ax + b$. Compute dimensions for $\ker D$ and $\im D$. Verify the rank-nullity theorem for this transformation. 

\item For a fixed vector $\vect{v}\in\R^n$, define $T_{\vect{v}}:\R^n\to\R$ by $T_{\vect{v}}(\vect{x}) = \vect{v}\cdot\vect{x}$. Prove that this defines a linear transformation and find its kernel and image.

\item Let $V$ be the vector space of $2\times 2$ matrices. Define $T:V\to V$ by $T(A) = A^T$. Prove that $T$ is linear. Find the kernel of $T$. Show that $T\circ T = I$ (where $I$ is the identity transformation). Is $T$ an isomorphism? Justify your answer.

\item Consider the mass operator $M:C([0,L])\to\R$ defined by $M(f) = \int_0^L f(x)\,dx$, where $f$ is interpreted as a linear density. Prove that $M$ is a linear transformation. Is $M$ surjective? Is the operation which computes center-of-mass a linear transformation?

\item Let $T:\R^3\to\R^2$ be a linear transformation with $\ker T$ spanned by $(1,0,-1)^T$. What is the dimension of $\im T$? Can $T$ be onto? Find the dimension of $\coim T$.

\item Let $V = \R^2$ and define vectors $\vect{u},\vect{v}\in V$ to be equivalent if they differ by a multiple of $\vect{a}=(1,1)^T$. Prove this is an equivalence relation. Describe geometrically what one equivalence class looks like. Show this equivalence relation corresponds to the quotient space $V/U$ where $U=\spanset(\vect{a})$.

\item In $\R^3$, we say two vectors are equivalent if their first coordinates are equal. Show this defines an equivalence relation. Find a concrete description of the quotient space (hint: what is its dimension?). Describe the subspace that generates this equivalence relation.

\item For linear transformations $S,T:V\to W$, prove that $\rank(S+T)\leq\rank(S) + \rank(T)$. When does equality hold?

\item Let $T:V\to W$ be linear. Prove that if $\{\vect{v}_1,\ldots,\vect{v}_k\}$ is linearly independent in $V$ and $T(\vect{v}_i)\neq\vect{0}$ for all $i$, then $\{T(\vect{v}_1),\ldots,T(\vect{v}_k)\}$ is linearly independent in $W$.

\item Let $T:V\to W$ be a linear transformation. Prove that any subspace of $V$ containing $\ker T$ is mapped to a subspace of $W$ of dimension at most $\rank T$.

\item Let $V$ be finite-dimensional and $T:V\to V$ linear. Prove that $T$ is invertible if and only if $T(\ker T) = \{\vect{0}\}$.

\item For linear transformations $S,T:V\to W$, prove that $\ker(S+T)$ contains $\ker S\cap\ker T$. Give an example where this containment is strict.

\item Let $C([0,1])$ be the vector space of continuous functions on $[0,1]$. We say two functions $f,g\in C([0,1])$ are equivalent if $f(0)=g(0)$. Prove this is an equivalence relation. Find the subspace that generates this equivalence relation. What is a simple way to describe the quotient space geometrically?

\item Let $\vect{v}\in\R^n$ be nonzero. Two vectors $\vect{x},\vect{y}\in\R^n$ are called equivalent if $\vect{v}\cdot\vect{x}=\vect{v}\cdot\vect{y}$. Prove this is an equivalence relation. Show that the quotient space defined by this equivalence relation is isomorphic to $\R$.

\item Let $V$ be a vector space and $U<V$ a subspace. For $\vect{v}\in V$, prove that the equivalence class $[\vect{v}]$ in $V/U$ equals the set $\vect{v}+U = \{\vect{v}+\vect{u}:\vect{u}\in U\}$. Use this to explain why vector addition in $V/U$ is well-defined.

\item Let $T:V\to W$ be a linear transformation. Prove that the quotient space $V/\ker T$ is isomorphic to $\im T$ by explicitly constructing an isomorphism and verifying it is well-defined.

\item For a linear transformation $T:V\to W$, prove that $\dim(\ker T + \im T) = \dim V$ if and only if $T\circ T = T$.

\item If $S,T:V\to W$ are linear transformations and $\ker S = \ker T$, then there exists an isomorphism $\varphi:\im S\to\im T$.

\item Let $V$ be a finite-dimensional vector space and $U<V$ a subspace. Prove that the quotient map $\pi:V\to V/U$ defined by $\pi(\vect{v}) = [\vect{v}]$ is a linear transformation. What is its kernel?

\item Let $V$ be the vector space of $2\times 2$ matrices. Define two matrices $A,B\in V$ to be equivalent if their traces are equal. Show this is an equivalence relation and identify the quotient space with a familiar vector space.

\item Consider the differentiation operator $D:\poly_2\to\poly_1$ defined by $D(ax^2 + bx + c) = 2ax + b$. Find $\ker D$ and $\im D$. Verify the rank-nullity theorem for this transformation. Find an explicit formula for a linear transformation $S:\poly_1\to\poly_2$ such that $D\circ S = I_{\poly_1}$.

\end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THE URIZEN CYCLE
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\clearpage
\thispagestyle{empty} % no headers/footers

\begin{fullwidth}
  \vspace*{\fill} % push content down to center vertically
  \centering
  \includegraphics[width=0.75\textwidth]{URIZEN.png} % adjust width as needed
  \vspace*{\fill} % push content up to center vertically
\end{fullwidth}

\clearpage

%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Bases \& Coordinates}
\label{ch:4}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``fixing them firm on their base, the bellows began to blowe''}

\newthought{The passage from abstract to concrete} is as important as its inverse. Having dwelt in the realm of abstract vector spaces and transformations, we now seek to make these concepts precise and measurable through coordinates and computations. This chapter bridges the gap between the theoretical structures we have built and the practical tools needed to work with them.

The concepts are familiar from geometric intuition: we routinely describe points in space using coordinates relative to chosen axes. These coordinates transform an abstract point into a concrete list of numbers that we can manipulate. Yet this simple idea -- that we can systematically assign numbers to abstract vectors -- contains surprising depth. The choice of coordinate system, seemingly arbitrary, can dramatically affect how easily we solve problems or understand structures.

This tension between intrinsic properties and their coordinate representations lies at the heart of linear algebra. A vector space exists independent of any particular way we choose to measure it, yet we cannot compute without making such choices. A linear transformation acts geometrically, yet we encode it as a matrix only after choosing bases for its domain and codomain. These choices -- of bases and the coordinates they induce -- form the bridge between abstract understanding and concrete computation.

Our task is to build this bridge carefully, ensuring it carries both theoretical insight and practical utility across the gap. We begin with the notion of a basis -- a set of vectors that both spans a space and does so efficiently. These bases provide coordinate systems, allowing us to translate abstract vectors into concrete lists of numbers. The interplay between different bases leads us to change of coordinates formulas, revealing how geometric objects appear from different perspectives.

This systematic encoding of abstract structures into numerical form enables the algorithmic heart of linear algebra. The price we pay for such computational power is a proliferation of coordinate-dependent calculations. Our challenge is to maintain sight of coordinate-independent truths while wielding coordinates as practical tools. This interplay between abstract understanding and concrete computation will guide our development throughout this text.

% ==============================================
\section{Bases \& Spanning Sets}
\label{sec:bases}
% ==============================================

\begin{marginfigure}
{\em Example:} Even for something as simple as $\R^2$, there are infinitely many choices of basis. The standard basis $\{\ihat, \jhat\}$ is merely one convenient choice among many.
\end{marginfigure}

From Chapter 2, recall that a spanning set for a vector space may contain redundant vectors, while a linearly independent set may fail to reach all vectors in the space. The notion of a basis synthesizes these concepts, providing a set of vectors that spans efficiently  --  without redundancy and without gaps.

\begin{definition}[Basis]
A \style{basis} for a vector space $V$ is a set $\basis$ of vectors that spans $V$ yet is linearly independent.
\end{definition}

The economy of this definition belies its power. A basis provides a minimal spanning set  --  minimal in the sense that removing any vector from the basis leaves a set that no longer spans $V$. Equivalently, it provides a maximal linearly independent set  --  maximal in that adding any vector creates linear dependence.

\begin{example}[Polynomial bases]
The space $\poly_2$ of quadratic polynomials admits several natural choices of basis, each providing different advantages:
\begin{marginfigure}
    {\em BONUS!} The monomial basis reveals the degree structure; the Lagrange basis simplifies interpolation by constructing polynomials that equal 1 at one interpolation point and 0 at the others; the Newton basis facilitates recursive computation through its nested structure.
\end{marginfigure}
\begin{enumerate}
    \item The monomial basis $\{1, x, x^2\}$
    \item The Lagrange basis for interpolation points $\{-1,0,1\}$:
    $$\left\{\frac{x(x+1)}{2}, -x^2+1, \frac{x(x-1)}{2}\right\}$$
    \item The Newton basis with nodes $d$ and $e$ $(d\neq e)$:
    $$\{1, (x-d), (x-d)(x-e)\}$$
\end{enumerate}

To illustrate, consider the polynomial $p(x) = x^2 + x + 1$. In the monomial basis, it's already in standard form:
$$p(x) = 1\cdot 1 + 1\cdot x + 1\cdot x^2$$
In the Lagrange basis at points $\{-1,0,1\}$, writing out the expansion:
$$p(x) = 2\cdot\frac{x(x+1)}{2} + 1\cdot(-x^2+1) + 2\cdot\frac{x(x-1)}{2} = x^2 + x + 1$$
This demonstrates how different bases can represent the same polynomial in ways that are advantageous for different computational purposes.
\end{example}

\begin{marginfigure}
{\em Foreshadowing:} Different bases reveal different aspects of a space's structure. In Chapter 7, we shall discover bases that illuminate the action of linear differential equations.
\end{marginfigure}

The existence of a basis for any vector space is guaranteed by the following result, whose proof illuminates the relationship between spanning and independence:

\begin{theorem}[Basis Extension]
\label{thm:basisext}
Let $V$ be a finite-dimensional vector space and $S \subseteq V$ be a linearly independent set. Then $S$ can be extended to a basis of $V$ by adding finitely many vectors. Moreover, if $\dim V = n$ and $|S| = k \leq n$, then exactly $n-k$ vectors need to be added.
\end{theorem}

\begin{marginfigure}
    The theorem generalizes to infinite-dimensional spaces using Zorn's Lemma, though the extension process becomes non-constructive.
\end{marginfigure}

\begin{proof}
Let $\{v_1,\ldots,v_n\}$ be any basis of $V$. We can extend $S$ by adding vectors from this basis one at a time until we obtain a basis, stopping when we achieve a spanning set. The process must terminate after at most $n-k$ steps since any linearly independent set in $V$ has size at most $n$.
\end{proof}

\begin{marginfigure}
{\em Caveat:} The process of extending to a basis or extracting one from a spanning set is not unique  --  different choices yield different bases.
\end{marginfigure}

This constructive proof reveals a fundamental principle: we can build bases either by extension (adding vectors until we span) or by reduction (removing vectors until independence). Both processes terminate because of finite-dimensionality  --  a crucial hypothesis that fails in infinite-dimensional spaces.

\begin{example}[Matrix bases]
The space $\R^{2\times 2}$ of $2\times 2$ matrices has the standard basis
\[
E_{11}=\begin{bmatrix}1&0\\0&0\end{bmatrix},\ 
E_{12}=\begin{bmatrix}0&1\\0&0\end{bmatrix},\ 
E_{21}=\begin{bmatrix}0&0\\1&0\end{bmatrix},\ 
E_{22}=\begin{bmatrix}0&0\\0&1\end{bmatrix}
\]
This basis makes the coordinate structure transparent but obscures other properties. For example, the basis
\[
\begin{bmatrix}1&0\\0&1\end{bmatrix},\ 
\begin{bmatrix}0&1\\1&0\end{bmatrix},\ 
\begin{bmatrix}0&-1\\1&0\end{bmatrix},\ 
\begin{bmatrix}1&0\\0&-1\end{bmatrix}
\]
better reveals the decomposition into symmetric and skew-symmetric parts.
\end{example}

A key property of bases is that they all have the same size, thanks to Lemma \ref{lem:dim}:
%
\begin{cor}
\label{cor:dim}
Any two bases of a vector space have the same number of vectors.
\end{cor}

This reveals dimension as an intrinsic property of the space, independent of choice of basis.

\begin{example}[Dimension counting]
The following dimensions arise naturally:
\begin{enumerate}
    \item $\dim(\R^n) = n$
    \item $\dim(\poly_n) = n+1$
    \item $\dim(\R^{m\times n}) = mn$
    \item $\dim(\sym_n) = \frac{1}{2}n(n+1)$
\end{enumerate}
\begin{marginfigure}
    Recall, $\poly_n$ is the space of polynomials of degree $\leq n$ and $\sym_n$ denotes the space of symmetric $n$-by-$n$ matrices. 
\end{marginfigure}
Each counts the minimal number of parameters needed to specify an element of the space.
\end{example}

Bases provide our first systematic way to measure vector spaces. The choice of basis -- which is always somewhat arbitrary -- trades the intrinsic nature of the space for concrete computability. This tension between coordinate-free properties and coordinate-dependent calculations will be a recurring theme as we develop the machinery of linear algebra.

% ==============================================
\section{Coordinates \& Components}
\label{sec:coords}
% ==============================================

The existence of a basis provides more than a spanning set for a vector space  --  it enables a systematic translation of abstract vectors into concrete lists of numbers. This translation, seemingly mundane, is the keystone of computational linear algebra. It bridges the gap between geometric intuition and algorithmic manipulation.

The crucial observation is that any vector in a space can be written uniquely as a linear combination of basis vectors. Given a basis $\{\vect{b}_1,\ldots,\vect{b}_n\}$ for a vector space $V$, each vector $\vect{v}\in V$ has a unique expression:
\[
    \vect{v} = c_1\vect{b}_1 + c_2\vect{b}_2 + \cdots + c_n\vect{b}_n
\]
The scalars $c_1,\ldots,c_n$ are called the \style{coordinates} of $\vect{v}$ relative to this basis. The ordered list of these coordinates, written as a column vector
%
\begin{marginfigure}
{\em Caveat:} The notation $[\vect{v}]_{\basis}$ emphasizes that coordinates depend on choice of basis. A vector has different coordinates in different bases, though the vector itself remains unchanged.
\end{marginfigure}
\[
    [\vect{v}]_{\basis} = \begin{pmatrix} c_1 \\ c_2 \\ \vdots \\ c_n \end{pmatrix}
\]
is the \style{coordinate vector} of $\vect{v}$ with respect to the basis $\basis=\{\vect{b}_1,\ldots,\vect{b}_n\}$.


\begin{example}[Polynomial coordinates]
Consider the polynomial $p(x)=6+2x-3x^2$ in $\poly_2$. In the monomial basis $\mathcal{M}=\{1,x,x^2\}$, its coordinate vector is
\[
    [p]_{\mathcal{M}} = \begin{pmatrix} 6 \\ 2 \\ -3 \end{pmatrix}
\]
In the Lagrange basis $\mathcal{L}=\{1,x-1,(x-1)(x+1)\}$, the same polynomial has different coordinates:
\[
    [p]_{\mathcal{L}} = \begin{pmatrix} 6 \\ 2 \\ -3/2 \end{pmatrix}
\]
The polynomial remains unchanged; only its description varies.
\end{example}

The passage from vector to coordinates preserves the vector space operations. If $\vect{v}$ and $\vect{w}$ have coordinate vectors $[\vect{v}]_{\basis}$ and $[\vect{w}]_{\basis}$ respectively, along with scalar $a$, then:
\begin{enumerate}
    \item $[\vect{v}+\vect{w}]_{\basis} = [\vect{v}]_{\basis} + [\vect{w}]_{\basis}$
    \item $[a\vect{v}]_{\basis} = a[\vect{v}]_{\basis}$
\end{enumerate}
This preservation of structure means that coordinate vectors themselves form a vector space isomorphic to the original space.
%
\begin{marginfigure}
{\em Foreshadowing:} The preservation of vector space operations under the passage to coordinates explains why matrix multiplication encodes composition of linear transformations.
\end{marginfigure}

\begin{example}[Matrix coordinates]
Consider the space $\R^{2\times 2}$ with its standard basis of elementary matrices $\mathcal{E}=\{E_{11},E_{12},E_{21},E_{22}\}$. The matrix
\[
    A = \begin{bmatrix} 2 & -1 \\ 3 & 4 \end{bmatrix}
\]
has coordinate vector
\[
    [A]_{\mathcal{E}} = \begin{pmatrix} 2 \\ -1 \\ 3 \\ 4 \end{pmatrix}
\]
relative to this basis. The same matrix written in the alternative basis
\[
    \basis=\left\{
    \begin{bmatrix}1&0\\0&1\end{bmatrix}, 
    \begin{bmatrix}0&1\\1&0\end{bmatrix}, 
    \begin{bmatrix}0&-1\\1&0\end{bmatrix}, 
    \begin{bmatrix}1&0\\0&-1\end{bmatrix}
    \right\}
\]
requires different coordinates to express the same transformation.
\end{example}

The uniqueness of coordinate representations  --  guaranteed by the linear independence of basis vectors  --  allows us to test equality through coordinates. Two vectors are equal if and only if their coordinate vectors relative to any basis are equal. This reduces abstract vector equality to numerical comparison.
%
\begin{marginfigure}
This reduction of geometric or algebraic properties to numerical tests exemplifies how coordinates enable computation. The art lies in choosing coordinates that make the desired computations simple.
\end{marginfigure}

We have thus established a dictionary between abstract vectors and concrete lists of numbers. This dictionary depends critically on our choice of basis  --  a choice we are free to make and change as computational needs dictate. The interplay between different choices of basis, and how vectors appear from different coordinate perspectives, leads naturally to our next topic: change of basis transformations.

% ==============================================
\section{Change of Basis}
\label{sec:cob}
% ==============================================

The right coordinate system can transform a complex problem into a simple one. An oscillating spring-mass system, described by coupled equations in Cartesian coordinates, reduces to independent motions when viewed in its natural modes. A robotic arm's motion, intricate to specify in workspace coordinates, might follow elementary paths in joint angles. The acceleration of a particle, complicated in rectangular coordinates, could simplify dramatically in polar coordinates. These transformations of perspective  --  these changes of basis  --  are not mere mathematical conveniences but essential tools for understanding and controlling physical systems.

Consider a vector space $V$ with two different bases, $\basis$ and $\basis'$. A vector $\vect{v}\in V$ exists independently of how we describe it, just as climbing a mountain is equally difficult whether we measure its height in meters or feet. Yet to work with a vector  --  to compute with it, to transform it, to understand its relationship to other vectors  --  we must choose coordinates. The same vector has different coordinate representations in different bases:
\[
    \vect{v} = \sum_{i=1}^n c_i\vect{b}_i = \sum_{i=1}^n c'_i\vect{b}'_i
\]
where $[\vect{v}]_{\basis} = (c_1,\ldots,c_n)^T$ and $[\vect{v}]_{\basis'} = (c'_1,\ldots,c'_n)^T$ are its coordinate vectors in bases $\basis$ and $\basis'$ respectively.

\begin{example}[Electric Field Components]
The electric field $\vec{E}$ from a point charge can be measured in different coordinate systems. Near a charge $q$ at the origin, we might express $\vec{E}$ in Cartesian coordinates:
\[
    \vec{E} = E_x\ihat + E_y\jhat + E_z\khat
\]
or in spherical coordinates:
\[
    \vec{E} = E_\rho\hat{\vect{e}}_\rho + E_\theta\hat{\vect{e}}_\theta + E_\phi\hat{\vect{e}}_\phi
\]
The transformation between these descriptions at any point $(x,y,z)$ with spherical coordinates $(\rho,\theta,\phi)$ is given by:
\[
    \begin{pmatrix}
    E_\rho \\ E_\theta \\ E_\phi
    \end{pmatrix}
    =
    \begin{bmatrix}
    \sin\phi\cos\theta & \sin\phi\sin\theta & \cos\phi \\
    -\sin\theta & \cos\theta & 0 \\
    \cos\phi\cos\theta & \cos\phi\sin\theta & -\sin\phi
    \end{bmatrix}
    \begin{pmatrix}
    E_x \\ E_y \\ E_z
    \end{pmatrix}
\]
\begin{marginfigure}
This orthogonal transformation represents a change of basis that proves especially useful in analyzing radially symmetric fields, where the spherical components often reveal patterns obscured in Cartesian coordinates. The transformation matrix follows the mathematicians' convention where $\theta$ represents the azimuthal angle in the $xy$-plane from the $x$-axis (0 to $2\pi$) and $\phi$ denotes the polar angle from the $z$-axis (0 to $\pi$).
\end{marginfigure}
\end{example}

% \begin{example}[Robot Kinematics]
% Consider a robotic arm with two rotational joints, whose configuration can be described either by joint angles $(\theta_1,\theta_2)$ or by the end-effector position $(x,y)$ in the plane. The transformation between these descriptions  --  between joint space and task space  --  is a change of basis that fundamentally affects how we plan and control motion:
% \[
%     \begin{pmatrix} x \\ y \end{pmatrix} = 
%     \begin{pmatrix} 
%     L_1\cos\theta_1 + L_2\cos(\theta_1+\theta_2) \\
%     L_1\sin\theta_1 + L_2\sin(\theta_1+\theta_2)
%     \end{pmatrix}
% \]
% where $L_1$ and $L_2$ are the lengths of the arm segments. Though this particular transformation is nonlinear, its derivative  --  crucial for control  --  is linear and represents a change of basis for velocity vectors.
% \end{example}

The key to understanding basis changes lies in expressing the new basis vectors in terms of the old, using a matrix to do so.

\begin{definition}[Change of Basis Matrix]
\label{def:change-basis}
Let $\basis=\{\vect{b}_1,\ldots,\vect{b}_n\}$ and $\basis'=\{\vect{b}'_1,\ldots,\vect{b}'_n\}$ be bases for a vector space $V$. The \style{change of basis matrix} from $\basis$ to $\basis'$ is the matrix $P=[p_{ij}]$ whose entries are determined by the unique representations:
\begin{equation}
\label{eq:change-basis}
   P = [p_{ij}] \quad : \quad 
   \vect{b}'_j = \sum_{i=1}^n p_{ij}\vect{b}_i
\end{equation}
The $j$th column of $P$ contains the $\basis$-coordinates of $\vect{b}'_j$, encoding how to express each new basis vector in terms of the old basis.
\end{definition}


\begin{example}[Signal Processing]
In audio processing, a sound signal naturally begins in the time domain  --  amplitudes measured at discrete time points. For analysis and filtering, we often transform to the frequency domain using the Discrete Fourier Transform (DFT). This is precisely a change of basis, where our new basis vectors are complex exponentials:
\[
    \vect{b}'_k = \frac{1}{\sqrt{n}}\begin{pmatrix}
    1 \\ e^{-2\pi i k/n} \\ e^{-4\pi i k/n} \\ \vdots \\ e^{-2\pi i k(n-1)/n}
    \end{pmatrix}
\]
\begin{marginfigure}
    {\em Foreshadowing:} the change of basis matrix $P$ in this case is \style{unitary} (complex analogues of orthogonal matrices), reflecting the conservation of energy between time and frequency domains.
\end{marginfigure}
\end{example}

Given the change of basis matrix $P$, we can convert coordinates systematically:
\[
    [\vect{v}]_{\basis} = P[\vect{v}]_{\basis'}
\]
This formula reveals that $P$ converts from $\basis'$-coordinates to $\basis$-coordinates  --  it ``undoes'' the basis change. Conversely, to find $\basis'$-coordinates given $\basis$-coordinates, we solve:
\[
    [\vect{v}]_{\basis'} = P^{-1}[\vect{v}]_{\basis}
\]

\begin{example}[Principal Stress]
In analyzing the mechanics of a thin planar material, the \style{stress tensor} recording stress and strain at a point is a symmetric $2\times 2$ matrix $\sigma\in\sym_2$ relative to chosen coordinate axes:
\[
    \stress = \begin{bmatrix}
    \sigma_{xx} & \tau_{xy} \\
    \tau_{xy} & \sigma_{yy}
    \end{bmatrix}
\]
There always exists a convenient basis  --  the principal stress directions  --  in which the stress tensor is diagonal:
\[
    \stress' = \begin{bmatrix}
    \sigma_1 & 0 \\
    0 & \sigma_2
    \end{bmatrix}
\]
Finding this basis, achieved through eigendecomposition (Chapter \ref{ch:7}), is crucial for predicting material failure. The change of basis matrix $P$ here consists of unit vectors along the principal stress directions.
\end{example}

The change of basis matrix $P$ satisfies several properties reflecting the nature of coordinate transformations:
\begin{enumerate}
    \item $P$ is invertible (else some vectors would lose information in translation)
    \item The change from $\basis$ to $\basis'$ is given by $P^{-1}$ (reversing perspective)
    \item For a third basis $\basis''$, the matrices compose naturally (reflecting transitivity)
\end{enumerate}

These properties ensure our coordinate transformations are reversible and coherent  --  changing perspective neither creates nor destroys information about the vectors being described. This preservation of structure lies at the heart of linear algebra's power in engineering: it allows us to work in whatever coordinate system best suits our current needs, confident that we can translate our results back to any other perspective.

The true significance of basis changes emerges when we consider linear transformations, whose matrix representations depend critically on our choice of coordinates. This relationship  --  between bases, transformations, and their matrix representations  --  leads us to the fundamental notion of similarity, see Definition\ref{def:similarity} to follow.

% ==============================================
\section{Matrix Representations}
\label{sec:reps}
% ==============================================

A geometric transformation exists independently of how we measure it: a rotation by 90 degrees clockwise remains the same rotation whether we describe it in Cartesian or polar coordinates. Yet to compute with transformations  --  to combine them, to apply them to vectors, to analyze their effects  --  we must express them in coordinates through matrices. The relationship between the abstract transformation and its various matrix representations reveals both the power and limitations of coordinate-based computation.

Let $T:V\rightarrow W$ be a linear transformation between vector spaces with chosen bases $\basis=\{\vect{b}_1,\ldots,\vect{b}_n\}$ for $V$ and $\basis'=\{\vect{b}'_1,\ldots,\vect{b}'_m\}$ for $W$. To construct a matrix representation of $T$, we need only record how it acts on basis vectors:
\[
    T(\vect{b}_j) = \sum_{i=1}^m a_{ij}\vect{b}'_i
\]
%
\begin{marginfigure}
{\em Think:} In $[T]_{\basis'}^{\basis}$, the bottom basis $\basis'$ is where we measure outputs (codomain), while the top basis $\basis$ is where we measure inputs (domain). The matrix converts $\basis$-coordinates to $\basis'$-coordinates, reading from right to left just like function composition.
\end{marginfigure}
%
The coefficients $a_{ij}$ form an $m\times n$ matrix $[T]_{\basis'}^{\basis}$ called the \style{matrix representation} of $T$ relative to bases $\basis$ and $\basis'$. The entry $a_{ij}$ gives the $i$th coordinate of $T(\vect{b}_j)$ in basis $\basis'$.

\begin{example}[Rotation in Different Bases]
Consider the counterclockwise rotation by $\pi/2$ in $\R^2$. In the standard basis $\basis=\{\ihat,\jhat\}$, this transformation has the familiar matrix representation:
\[
    [T]_{\basis}^{\basis} = \begin{bmatrix}
    0 & -1 \\
    1 & 0
    \end{bmatrix}
\]
Let $\basis'$ be the basis consisting of vectors $\vect{v}_1=(1,1)^T$ and $\vect{v}_2=(-1,2)^T$. The change of basis matrix from $\basis'$ to $\basis$ is:
\[
    P = \begin{bmatrix}
    1 & -1 \\
    1 & 2
    \end{bmatrix}
\]
In this new basis, the same rotation transformation appears as:
\[
    [T]_{\basis'}^{\basis'} = P^{-1}\begin{bmatrix}
    0 & -1 \\
    1 & 0
    \end{bmatrix}P = \frac{1}{3}\begin{bmatrix}
    -1 & -5 \\
    2 & 1
    \end{bmatrix}
\]
Though the matrices appear quite different, they represent the identical geometric transformation of rotating vectors counterclockwise by $\pi/2$.
\end{example}

The matrix $[T]_{\basis'}^{\basis}$ converts input coordinates to output coordinates through standard matrix multiplication:
\[
    [T(\vect{v})]_{\basis'} = [T]_{\basis'}^{\basis}[\vect{v}]_{\basis}
\]
This formula encapsulates how linear transformations interact with coordinates: first express the input in $\basis$-coordinates, then multiply by the matrix representation to obtain $\basis'$-coordinates of the output.

\begin{example}[Projection onto a Line]
Consider the orthogonal projection onto the $x$-axis in $\R^2$. In standard coordinates, this has matrix representation:
\[
    [P]_{\basis}^{\basis} = \begin{bmatrix}
    1 & 0 \\
    0 & 0
    \end{bmatrix}
\]
If we rotate our coordinate system by angle $\theta$, obtaining a new basis $\basis'=\{(\cos\theta,\sin\theta)^T,(-\sin\theta,\cos\theta)^T\}$, the same projection appears more complicated:
%
\begin{marginfigure}
{\em Example:} When $\theta=\pi/4$, the basis vectors of $\basis'$ are $(\frac{\sqrt{2}}{2},\frac{\sqrt{2}}{2})^T$ and $(-\frac{\sqrt{2}}{2},\frac{\sqrt{2}}{2})^T$. The projection matrix in these coordinates becomes
\[
    [P]_{\basis'}^{\basis'} = \frac{1}{2}\begin{bmatrix}
    1 & 1 \\
    1 & 1
    \end{bmatrix}
\]
\end{marginfigure}
\[
    [P]_{\basis'}^{\basis'} = \begin{bmatrix}
    \cos^2\theta & \cos\theta\sin\theta \\
    \cos\theta\sin\theta & \sin^2\theta
    \end{bmatrix}
\]
The geometric action remains the same  --  we simply view it through different coordinate lenses.
\end{example}
%

When bases change, matrix representations transform systematically. If $P$ is the change of basis matrix from $\basis'$ to $\basis$ in the domain and $Q$ is the change of basis matrix from $\mathcal{D}'$ to $\mathcal{D}$ in the codomain, then:
\[
    [T]_{\mathcal{D}}^{\basis} = Q[T]_{\mathcal{D}'}^{\basis'}P^{-1}
\]
This relationship reveals how different matrix representations of the same transformation relate through similarity transformations  --  the subject of our next section.

\begin{example}[Scaling and Reflection]
Consider the linear transformation $T:\R^2\rightarrow\R^2$ that scales by $2$ in the $x$-direction and reflects across the $x$-axis. In standard coordinates:
\[
    [T]_{\basis}^{\basis} = \begin{bmatrix}
    2 & 0 \\
    0 & -1
    \end{bmatrix}
\]
In coordinates rotated by $\pi/4$, this transformation appears to mix scaling and reflection:
\[
    [T]_{\basis'}^{\basis'} = \begin{bmatrix}
    1/2 & 3/2 \\
    3/2 & 1/2
    \end{bmatrix}
\]
The apparent complexity arises not from the transformation itself but from our choice of measurement system.
\end{example}

The practical value of matrix representations lies in their computability  --  they reduce abstract transformations to concrete arrays of numbers. Yet their coordinate dependence reminds us that no single matrix tells the whole story. A transformation may appear simple in one basis and complicated in another, just as a quadratic curve appears differently in different coordinate systems.

Our task in subsequent chapters will be to find bases that reveal the essential features of linear transformations. Some bases will diagonalize certain transformations (Chapter \ref{ch:7}); others will respect geometric structure (Chapter \ref{ch:5}) or optimize approximations (Chapter \ref{ch:10}). Each provides a different lens through which to view and understand linear transformations.



% ==============================================
\section{Coordinate-Free Thought}
\label{sec:free}
% ==============================================

Our development of coordinates and matrix representations presents a fundamental paradox. We study linear transformations first as abstract mappings between vector spaces, understanding their properties independent of any particular measuring system. Yet to compute with these transformations  --  to apply them to vectors, to compose them, to analyze their effects  --  we must choose coordinates and work with matrices. This tension between the intrinsic nature of transformations and their coordinate-dependent representations lies at the heart of linear algebra.
%
\begin{marginfigure}
{\em Foreshadowing:} The relationship between isomorphic vector spaces and similar transformations hints at deeper categorical structures in mathematics.
\end{marginfigure}
%
Consider data drawn from some high-dimensional measurement process  --  perhaps gene expression levels across thousands of cells, or activation patterns across layers of a neural network. The underlying biological or computational structure exists independent of how we choose to measure it. Different experimental protocols or network architectures may yield different representations of the same fundamental patterns. This suggests a deeper question: when are two apparently different representations truly equivalent?
\begin{marginfigure}
    Some find the equivalent expression
\[ AX = XB \]
    to be more memorable and evocative.
\end{marginfigure}

\begin{definition}[Similarity]
\label{def:similarity}
Two matrices $A$ and $B$ are \style{similar} if there exists an invertible matrix $X$ such that:
\[
    B = X^{-1}AX
\]
\noindent
We write $A \sim B$ to denote similar matrices.
\end{definition}

This algebraic relationship captures precisely when two matrices represent the same linear transformation viewed through different coordinate systems. The matrix $P$ encodes the change of basis that transforms one view into another. More fundamentally, we say two linear transformations $S,T:V\rightarrow V$ are \style{similar} if there exists an isomorphism $\varphi:V\rightarrow V$ such that $S = \varphi^{-1}T\varphi$. When expressed in coordinates, this abstract notion manifests as matrix similarity.
\begin{marginfigure}
    The term {\em conjugate} is more common in mathematics, but {\em similar} will do nicely.
    
    \begin{center}
    \begin{tikzcd}[column sep=2.5em, row sep=2.5em]
    V \arrow[r, "T"] \arrow[d, "\varphi"'] & V \arrow[d, "\varphi"] \\
    V \arrow[r, "S"'] & V
    \end{tikzcd}
    \end{center}
    This diagram commutes when $S = \varphi^{-1}T\varphi$, illustrating similarity as conjugation by the isomorphism $\varphi$.
\end{marginfigure}

\begin{example}[Geometric Similarity]
Consider a rotation by $90^\circ$ counterclockwise in the plane. In standard coordinates, this appears as:
\[
    A = \begin{bmatrix}
    0 & -1 \\
    1 & 0
    \end{bmatrix}
\]
If we measure vectors instead using the basis $\{\vect{v}_1,\vect{v}_2\}$ where:
\[
    \vect{v}_1 = \begin{pmatrix}1\\1\end{pmatrix}, \quad
    \vect{v}_2 = \begin{pmatrix}-1\\2\end{pmatrix}
    \quad
    \Rightarrow
    \quad
    X = \begin{bmatrix}
    1 & -1 \\
    1 & 2
    \end{bmatrix}
\]
then the same rotation has matrix:
\[
    B = X^{-1}AX
    =
    \begin{bmatrix}
    1/3 & -1 \\
    1/3 & 0
    \end{bmatrix}
\]
Though these matrices look quite different, they represent identical geometric actions.
\end{example}


\begin{example}[Shape Analysis]
Consider analyzing the shape of a complex three-dimensional object represented as a cloud of points in $\R^3$. The intrinsic shape exists independent of coordinate choice  --  rotating or translating the object does not change its fundamental geometry. Two different coordinate representations of the same shape are related through similarity transformations. Geometric approaches to data analysis build on this insight, studying properties of shapes that remain invariant under such transformations.
\end{example}

Similar matrices share certain properties that are intrinsic to the transformation they represent:
\begin{enumerate}
    \item They have the same determinant
    \item They have the same rank
    \item They have the same trace (sum of diagonal entries)
\end{enumerate}
%
\begin{marginfigure}
{\em Foreshadowing:} In Chapter \ref{ch:7}, we will discover that similar matrices also share eigenvalues  --  another intrinsic property of the transformation they represent.
\end{marginfigure}
%
These \style{coordinate invariants} belong to the transformation itself rather than to any particular matrix representation. They form a fingerprint that distinguishes genuinely different transformations from merely different coordinate views of the same map.

\begin{marginfigure}
{\em Foreshadowing:} In Chapter \ref{ch:11}, we will see how Principal Component Analysis discovers coordinate systems that reveal intrinsic low-dimensional structure in high-dimensional data.
\end{marginfigure}

The distinction between intrinsic and coordinate-dependent properties guides much of modern data analysis. When studying high-dimensional data, we ask:
\begin{enumerate}
    \item What features are invariant under reasonable transformations?
    \item Which coordinate systems best reveal these features?
    \item How do different representations illuminate different aspects?
\end{enumerate}

The power of this perspective emerges in the chapters ahead. We will discover:
\begin{itemize}
    \item Coordinates that respect geometric structure (Chapter \ref{ch:5})
    \item Representations that reveal dynamical behavior (Chapter \ref{ch:7})
    \item Bases optimal for dimension reduction (Chapter \ref{ch:10})
\end{itemize}
Each provides a different lens through which to view our objects of study, highlighting different features while preserving their essential character.

\begin{example}[Data Visualization]
A high-dimensional dataset can be visualized in countless ways through different projections onto two or three dimensions. While some projections may obscure the data's structure, others reveal meaningful patterns. The art lies in finding coordinates that expose the features of interest while respecting the intrinsic relationships in the data. Though the choice of visualization coordinates is arbitrary, the patterns they reveal  --  clusters, outliers, nonlinear relationships  --  reflect genuine structure in the data.
\end{example}

Yet we must remain mindful of the difference between a transformation and its various representations. Matrices are tools for computation  --  powerful and necessary tools, but not the whole story. The true objects of study are the transformations themselves, existing independent of how we choose to measure them. This tension between coordinate-free concepts and coordinate-based calculations runs throughout linear algebra, from the most theoretical ideas to the most practical applications.

Our journey through linear algebra will constantly navigate between these perspectives  --  between the abstract and the concrete, between the geometric and the algebraic, between the intrinsic and the coordinate-dependent. The art lies in knowing when to reason coordinate-free and when to harness the computational power of well-chosen coordinates. In the chapters ahead, we will see this interplay illuminate everything from the structure of differential equations to the architecture of neural networks.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Robotic Arm Kinematics}
\label{EM:kinematics}
% **************** EMANATION *******************

The mathematics of robotic manipulation provides a compelling demonstration of how different coordinate systems illuminate different aspects of the same physical system. A robotic arm's motion can be described through multiple bases, each revealing different aspects of its behavior. These coordinate choices --- and the transformations between them --- exemplify the fundamental principles developed throughout this chapter.

Consider a planar robotic arm with two revolute joints connecting two rigid links of lengths $L_1$ and $L_2$. The configuration of this arm admits two natural coordinate systems:
\begin{enumerate}
    \item Joint space coordinates $(\theta_1,\theta_2)$, measuring the angles of each joint
    \item Task space coordinates $(x,y)$, giving the position of the end-effector
\end{enumerate}
These spaces come equipped with natural bases: joint space has basis vectors corresponding to infinitesimal rotations of each joint, while task space inherits the standard Cartesian basis of the plane. The transformation between these coordinates is given by:
\[
    F(\theta_1,\theta_2) = 
    \begin{pmatrix} 
    L_1\cos\theta_1 + L_2\cos(\theta_1+\theta_2) \\
    L_1\sin\theta_1 + L_2\sin(\theta_1+\theta_2)
    \end{pmatrix}
\]

Though this transformation $F$ is nonlinear, its derivative $[DF]$ at any configuration provides a linear map between the tangent spaces --- a change of basis matrix relating infinitesimal motions:
\[
    \begin{pmatrix} \dot{x} \\ \dot{y} \end{pmatrix} = 
    [DF]_{(\theta_1,\theta_2)}
    \begin{pmatrix} \dot{\theta}_1 \\ \dot{\theta}_2 \end{pmatrix}
\]
where:
\[
    [DF]_{(\theta_1,\theta_2)} = \begin{bmatrix}
    -L_1\sin\theta_1 - L_2\sin(\theta_1+\theta_2) & -L_2\sin(\theta_1+\theta_2) \\
    L_1\cos\theta_1 + L_2\cos(\theta_1+\theta_2) & L_2\cos(\theta_1+\theta_2)
    \end{bmatrix}
\]
The columns of this matrix express the task-space velocities generated by unit joint velocities --- they form a configuration-dependent basis for achievable end-effector motions.

\begin{marginfigure}
{\em Recall:} The Inverse Function Theorem from calculus guarantees that when $[DF]$ is invertible at a configuration (i.e., when $\det[DF]\neq 0$), $F$ has a local inverse --- we can solve uniquely for small changes in joint angles needed to achieve desired end-effector motions. When $[DF]$ fails to be invertible, as happens when the arm is fully extended, certain instantaneous motions become impossible.
\end{marginfigure}

More complex manipulators illuminate additional aspects of coordinate relationships. Consider extending our arm to three joints while maintaining planar end-effector motion. Now joint space has basis vectors $\{\partial/\partial\theta_1, \partial/\partial\theta_2, \partial/\partial\theta_3\}$ while task space remains two-dimensional with basis $\{\partial/\partial x, \partial/\partial y\}$. The derivative becomes a linear transformation $[DF]:\R^3\to\R^2$ between these spaces, with:
\[
    [DF]_{(\theta_1,\theta_2,\theta_3)} = 
    \begin{bmatrix}
    \displaystyle\frac{\partial x}{\partial \theta_1} & 
    \displaystyle\frac{\partial x}{\partial \theta_2} & 
    \displaystyle\frac{\partial x}{\partial \theta_3} \\
    \displaystyle\frac{\partial y}{\partial \theta_1} & 
    \displaystyle\frac{\partial y}{\partial \theta_2} & 
    \displaystyle\frac{\partial y}{\partial \theta_3}
    \end{bmatrix}
\]
This matrix has a nontrivial kernel --- reflecting joint velocities that instantaneously leave the end-effector fixed. Such self-motions exemplify the fundamental kernel-image relationship studied in Chapter~3, now emerging naturally in a concrete mechanical system.

The practical significance of these coordinate relationships manifests in trajectory planning. A straight-line motion of the end-effector, though elegant in task coordinates, may demand intricate joint-space choreography. Conversely, simple joint trajectories can trace complex paths through task space. For the three-joint arm, redundancy enriches this relationship further --- the same end-effector trajectory admits infinitely many joint space realizations, corresponding to different paths through the kernel of $[DF]$.

This duality between representations reflects a deeper truth: no single coordinate system captures all aspects of a complex system with equal clarity. The art of engineering lies not merely in choosing appropriate coordinates but in moving fluently between different representations as the problem demands. Joint coordinates render questions of dynamics and joint limits transparent, while task coordinates simplify motion specification. The mathematical framework developed in this chapter transforms this art from intuitive craft to systematic science, providing the tools needed to work effectively with multiple coordinate representations of the same underlying reality.

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Computer Graphics \& Coordinate Systems}
\label{EM:graphics}
% **************** EMANATION *******************

Modern computer graphics illuminates the practical power of coordinate transformations. A virtual object -- perhaps a spacecraft in a flight simulator -- exists simultaneously in multiple coordinate systems, each chosen to simplify particular aspects of the simulation. Understanding how these bases relate through the transformations of Section \ref{sec:cob} converts complex geometric problems into systematic matrix computations.

Consider our virtual spacecraft. Its geometry begins life in \style{body coordinates}, where the natural basis $\basis_b=\{\vect{b}_1,\vect{b}_2,\vect{b}_3\}$ aligns with the craft's structure: $\vect{b}_1$ points through the nose, $\vect{b}_2$ along the right wing, and $\vect{b}_3$ downward. In these coordinates, the craft's symmetries become apparent and control surfaces align with coordinate planes. A point $\vect{p}$ on the spacecraft has coordinate vector $[\vect{p}]_{\basis_b}$ relative to this body basis.

Yet our spacecraft moves through a virtual world with its own coordinate system. The \style{world basis} $\basis_w=\{\vect{w}_1,\vect{w}_2,\vect{w}_3\}$ typically aligns $\vect{w}_3$ with vertical, while $\vect{w}_1$ and $\vect{w}_2$ span the ground plane. Following Section \ref{sec:coords}, the coordinate transformation from body to world basis follows from the change of basis matrix:
\[
    [\vect{p}]_{\basis_w} = P[\vect{p}]_{\basis_b}
\]

This transformation matrix $P$ has columns expressing body basis vectors in world coordinates, exactly as constructed in Example \ref{ex:2112}:
\[
    P = \begin{bmatrix}
    | & | & | \\
    [\vect{b}_1]_{\basis_w} & [\vect{b}_2]_{\basis_w} & [\vect{b}_3]_{\basis_w} \\
    | & | & |
    \end{bmatrix}
\]
Each column shows how one body basis vector decomposes in world coordinates. Like the rotation matrices studied in Section \ref{sec:euclidean}, this change of basis preserves lengths and angles -- a crucial property for rigid objects.

A virtual camera introduces yet another basis. The \style{camera basis} $\basis_c=\{\vect{c}_1,\vect{c}_2,\vect{c}_3\}$ places the virtual lens at the origin with $\vect{c}_3$ pointing along the viewing direction and $\vect{c}_2$ aligned with the image's vertical axis. Points transform to these coordinates through composition with another change of basis matrix $Q$:
\[
    [\vect{p}]_{\basis_c} = Q[\vect{p}]_{\basis_w}
\]

The composition of these transformations -- from body to world to camera coordinates -- embodies the core algebraic insight of Section \ref{sec:cob}: changes of basis compose through matrix multiplication. A point's coordinates transform as:
\[
    [\vect{p}]_{\basis_c} = QP[\vect{p}]_{\basis_b}
\]
This matrix product captures the complete change of coordinates, though we often maintain separate transformations for clarity and efficiency.

\begin{marginfigure}
{\em Example:} When a spacecraft pitches upward 30°, its body basis vectors expressed in world coordinates become columns of the change of basis matrix $P$:
\[
    \begin{bmatrix}
    \sqrt{3}/2 & 0 & -1/2 \\
    0 & 1 & 0 \\
    1/2 & 0 & \sqrt{3}/2
    \end{bmatrix}
\]
These columns are mutually orthogonal since $P$ represents rigid rotation.
\end{marginfigure}

Each basis in this sequence serves a specific purpose: body coordinates for physics simulation, world coordinates for scene composition, camera coordinates for visibility and rendering. The transformations between them, though apparently complex, follow directly from our precise understanding of coordinates and bases developed in Section \ref{sec:coords}. This exemplifies a broader principle: challenging problems often become tractable when viewed in appropriate coordinates.

The practical significance extends far beyond graphics. In robotics, similar coordinate changes relate joint angles to end-effector position through the transformation matrices studied in Section \ref{sec:reps}. In computer vision, camera and world bases must align to enable augmented reality. In spacecraft guidance, body and inertial coordinates interplay in navigation algorithms. Each application builds on the same mathematical foundation: the careful construction of bases and transformations between them.

This cascade of coordinate systems illustrates a final key insight from Section \ref{sec:free}: bases should be chosen to match the natural structure of our problems. Body coordinates respect vehicle symmetries; world coordinates align with gravity and terrain; camera coordinates match viewing geometry. No single basis captures all aspects elegantly -- the art lies in choosing appropriate bases for each subtask while understanding how they relate through the careful matrix algebra developed in this chapter.
% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 4}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}

\item For the basis $\basis = \{\vect{v}_1,\vect{v}_2\}$ of $\R^2$ where $\vect{v}_1 = (1,2)^T$ and $\vect{v}_2 = (1,-1)^T$, find the change of basis matrix $P$ from $\basis$ to the standard basis.

\item Consider $\vect{v} = (2,1,-1)^T$ in $\R^3$ and basis $\basis = \{(1,1,0)^T, (0,1,1)^T, (1,0,1)^T\}$. Find $[\vect{v}]_\basis$.

\item Find a basis for the space $\sym_2$ of symmetric $2\times 2$ matrices, then find the coordinate vector of $\begin{bmatrix}3 & 1\\1 & 2\end{bmatrix}$ in your basis.

\item Find the coordinate vector of $p(x)=x^2-2x+1$ relative to the basis $\{1+x^2, x-x^2, 1-x\}$ of $\poly_2$.

\item For $\poly_2$, show that $\{1,1+x,1+x+x^2\}$ is a basis by proving it is linearly independent and spans $\poly_2$. Then express $\{1,x,x^2\}$ in terms of this basis.

\item Given a basis $\basis = \{\vect{v}_1,\vect{v}_2\}$ for $\R^2$, prove that the change of basis matrix from $\basis$ to the standard basis has columns equal to $\vect{v}_1$ and $\vect{v}_2$.

\item Let $V$ be the space of $2\times 2$ matrices with basis $\basis = \{E,S,R,D\}$ where $E=\begin{bmatrix}1&0\\0&1\end{bmatrix}$, $S=\begin{bmatrix}0&1\\1&0\end{bmatrix}$, $R=\begin{bmatrix}0&-1\\1&0\end{bmatrix}$, $D=\begin{bmatrix}1&0\\0&-1\end{bmatrix}$. Describe a method for finding the coordinate vector of any $2\times 2$ matrix in this basis.

\item Let $T:\R^2\to\R^2$ rotate vectors counterclockwise by $\pi/4$. Find its matrix in: (a) the standard basis, (b) the basis $\{(1,1)^T, (-1,1)^T\}$.

\item Let $T:\poly_2\to\poly_2$ be differentiation $T(p)=p'$. Find its matrix in the standard basis $\{1,x,x^2\}$, then find a basis giving it a simpler matrix representation.

\item Consider matrices $A=\begin{bmatrix}1 & 2\\3 & 4\end{bmatrix}$ and $B=\begin{bmatrix}5 & -2\\-3 & 0\end{bmatrix}$. Either find a matrix $P$ such that $B=P^{-1}AP$, or prove no such $P$ exists. What invariants could you check first?

\item For $T:\R^2\to\R^2$ with matrix $[T]_{\basis_1} = \begin{bmatrix}2 & 1\\1 & 2\end{bmatrix}$ in basis $\basis_1$ and $[T]_{\basis_2} = \begin{bmatrix}3 & 0\\0 & 1\end{bmatrix}$ in basis $\basis_2$, find the change of basis matrix from $\basis_2$ to $\basis_1$.

\item The matrices $A=\begin{bmatrix}3&1\\1&3\end{bmatrix}$ and $B=\begin{bmatrix}2&2\\2&4\end{bmatrix}$ are similar. How can one prove this directly? Try finding $P$ such that $B=P^{-1}AP$. Can this be done by solving a linear system?

\item Consider the operator $T:\poly_2\to\poly_2$ defined by $T(p)(x)=p(x+1)$. Find its matrix in the standard basis $\{1,x,x^2\}$. Then find its matrix in basis $\{1,x+1,(x+1)^2\}$. What pattern do you observe? Explain why this pattern occurs geometrically.

\item Consider the basis $\{\vect{v}_1,\vect{v}_2,\vect{v}_3\}$ for $\R^3$ where $\vect{v}_1=(1,1,1)^T$, $\vect{v}_2=(1,1,-2)^T$, $\vect{v}_3=(1,-2,1)^T$. For $T:\R^3\to\R^3$ given by the cross product $T(\vect{x})=\vect{x}\times(1,0,0)^T$, find the matrix of $T$ in this basis. First find how $T$ acts on each basis vector, then combine these into the matrix representation.

\item For a linear transformation $T:V\to V$: (a) if $[T]_\basis^\basis$ is diagonal for some basis $\basis$, explain geometrically what this means about how $T$ acts on vectors, (b) prove that finding such a basis is equivalent to finding vectors that $T$ maps to scalar multiples of themselves.

\item For similar matrices $A$ and $B$: (a) prove $\det(A)=\det(B)$ and $\trace(A)=\trace(B)$; (b) show also that $A^n$ and $B^n$ are similar for $n>1$. {\em Hint:} you know how determinants behave under composition; for trace, recall that (or show) $\trace(XY)=\trace(YX)$.

\item For a $2\times 2$ matrix $A$, prove that if $A$ is similar to a diagonal matrix, then it is similar to any matrix with the same diagonal entries (in any order). Give an example showing two matrices with the same diagonal entries need not be similar.

\item Let $\alpha(t)=1+t+t^2$ and $\beta(t)=1-t^2$ be polynomials in $\poly_2$. Find bases $\basis_1$ and $\basis_2$ such that $[\alpha]_{\basis_1}=(1,0,0)^T$ and $[\beta]_{\basis_2}=(1,0,0)^T$. What does this tell you about these polynomials? Find the change of basis matrix from $\basis_1$ to $\basis_2$.

\item For a vector space $V$ with basis $\basis=\{\vect{v}_1,\ldots,\vect{v}_n\}$, prove that any linear transformation $T:V\to V$ is uniquely determined by its action on basis vectors. Then show this implies similar matrices must have the same trace without using matrix multiplication. How does this connect to the geometry of linear transformations?

\end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Inner Products \& Orthogonality}
\label{ch:5}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``others triangular right angled course maintain; others obtuse, acute Scalene, in simple paths''}

\newthought{The spaces we inhabit possess} structure beyond mere addition and scaling. Like a compass measuring celestial arcs, vectors trace paths that stretch and bend through space  --  each with its own magnitude, each forming precise angles with its fellows. These geometric notions  --  of length and perpendicularity, of measure and relation  --  emerge not from arbitrary convention but from careful definition of how vectors interact through inner products.

Our task is to extract the essence of geometric measurement  --  of lengths, angles, and orthogonality  --  and extend it beyond its Euclidean origins. The familiar dot product of vectors served well in calculus, yet it represents merely one instance of a deeper structure. Inner products provide the machinery to impose geometric order on abstract vector spaces, enabling us to measure and compare vectors in ways that respect their intrinsic nature.

This geometric perspective transforms our understanding of linear algebra. Orthogonal vectors, previously understood through coordinate calculations, emerge as a fundamental organizing principle. Orthogonal bases offer optimal frameworks for computation. Orthogonal matrices preserve the geometric structure we construct. Through inner products, the abstract vector spaces of previous chapters acquire shape and substance.

The choice of inner product shapes our view of a vector space, highlighting certain features while obscuring others. Different inner products induce different notions of length and angle, each suited to particular applications. Some arise naturally from physical principles, others from statistical considerations, still others from computational convenience. The art lies in choosing an inner product that illuminates the aspects of greatest interest while preserving essential structure.

Our development proceeds from the concrete to the abstract and back again. The familiar dot product guides our intuition as we ascend axiomatically. Though we shall occasionally glimpse infinite-dimensional spaces through carefully chosen examples, our focus remains on finite-dimensional spaces where the theory achieves its purest form. Each definition builds upon the last with mathematical certainty, yet the structure we construct reflects something fundamental: the deep unity between geometric intuition and algebraic law.

% ==============================================
\section{Dot \& Inner Products}
\label{sec:dotinner}
% ==============================================

The dot product pushes out from its first appearance in calculus. Two vectors $\vect{u},\vect{v}\in\R^n$ combine through coordinate-wise multiplication and addition:
\[
    \vect{u}\cdot\vect{v} = \sum_{i=1}^n u_iv_i
\]
This operation, though defined through coordinates, reveals fundamental geometric features: length through $\|\vect{v}\| = \sqrt{\vect{v}\cdot\vect{v}}$, angle via $\vect{u}\cdot\vect{v} = \|\vect{u}\|\|\vect{v}\|\cos\theta$, and orthogonality when $\vect{u}\cdot\vect{v}=0$. That such a simple formula encodes so much geometric content suggests deeper structure at play.

Consider what properties make the dot product geometrically meaningful. First, it treats vectors symmetrically: $\vect{u}\cdot\vect{v} = \vect{v}\cdot\vect{u}$. Second, it is linear in each factor: $(c\vect{u}+\vect{w})\cdot\vect{v} = c(\vect{u}\cdot\vect{v}) + \vect{w}\cdot\vect{v}$. Third, it ensures positive length: $\vect{v}\cdot\vect{v} \geq 0$, with equality only when $\vect{v}=\vect{0}$. These properties  --  not the specific formula  --  enable geometric measurement.

This insight suggests generalizing beyond $\R^n$. Consider the space $C([0,1])$ of continuous functions on the unit interval. Though these vectors are curves rather than arrows, we might still wish to measure angles between them or test their orthogonality. The integral formula
\[
    \langle f,g\rangle = \int_0^1 f(t)g(t)\,dt
\]
provides exactly such a measurement. It shares the key properties that made the dot product geometric: symmetry, linearity, and positivity. Two functions are now ``orthogonal'' when their product integrates to zero  --  a concept familiar from calculus (and, later, Fourier analysis).
%
\begin{marginfigure}
{\em Example:} The functions $\sin(n\pi x)$ and $\sin(m\pi x)$ are orthogonal for distinct integers $n,m$ under this inner product, explaining the independence of Fourier sine series terms.
\end{marginfigure}

These examples motivate the abstract definition that captures their common essence:

\begin{definition}[Inner Product]
\label{def:innerproduct}
An \style{inner product} on a vector space $V$ is a function $\langle\cdot,\cdot\rangle:V\times V\to\R$ satisfying, for all $\vect{u},\vect{v},\vect{w}\in V$ and $c\in\R$:
\begin{enumerate}
    \item Symmetry: $\langle \vect{u},\vect{v}\rangle = \langle \vect{v},\vect{u}\rangle$
    \item Linearity: $\langle c\vect{u}+\vect{w},\vect{v}\rangle = c\langle \vect{u},\vect{v}\rangle + \langle \vect{w},\vect{v}\rangle$
    \item Positive Definiteness: $\langle \vect{v},\vect{v}\rangle \geq 0$, with equality if and only if $\vect{v}=\vect{0}$
\end{enumerate}
A vector space equipped with an inner product is called an \style{inner product space}.
\end{definition}

This seemingly austere definition distills the essential features that enable geometric measurement. Each property plays a vital role: symmetry ensures angles are well-defined; linearity connects geometry to vector space structure; positive definiteness guarantees meaningful notions of length and distance.

\begin{example}[Weighted Inner Products]
On $\R^n$, we need not weight all coordinates equally. Given positive weights $a_1,\ldots,a_n$, the formula
\[
    \langle \vect{u},\vect{v}\rangle_{\vect{a}} = \sum_{i=1}^n a_iu_iv_i
\]
defines an inner product that emphasizes certain components over others. Such weighted measurements arise naturally in statistics, where the weights might reflect measurement uncertainty, or in mechanics, where they encode mass distribution.
\end{example}

\begin{example}[Matrix Inner Products]
\label{ex:Frobenius}
The space $\R^{m\times n}$ of matrices admits several natural inner products. The \style{Frobenius inner product},
\[
    \langle A,B\rangle_F = \trace(A^TB) = \sum_{i,j} a_{ij}b_{ij}
\]
treats a matrix as a long vector of entries. Other inner products might weight different matrix entries according to their positions or statistical significance.
\end{example}

Each inner product imposes its own geometry on a vector space, determining how angles and lengths are measured. The dot product is merely first among equals  --  the most elementary inner product on $\R^n$. As we shall see, the choice of inner product profoundly affects both theoretical understanding and practical computation.

\begin{marginfigure}
{\em Foreshadowing:} The choice of inner product shapes everything from optimization algorithms to quantum measurements. We shall see its influence grow throughout this text.
\end{marginfigure}

The most fundamental consequence of an inner product is its induced notion of length.

\begin{definition}[Norm]
\label{def:norm}
Given an inner product space $V$, the \style{norm} of a vector $\vect{v}\in V$ is defined by:
\[
    \|\vect{v}\| = \sqrt{\langle \vect{v},\vect{v}\rangle}
\]
This induced norm measures the length of vectors in a way compatible with the inner product structure.
\end{definition}

Though many norms exist on vector spaces, those arising from inner products possess special geometric properties. The subtle interplay between inner products and their induced norms will guide our development of orthogonality in the sections ahead.

% ==============================================
\section{Angles \& Orthogonality}
\label{sec:angles}
% ==============================================

In physical space, vectors meet at angles. This seemingly elementary observation -- that two directions can be more or less aligned -- extends far beyond geometry. Two functions can be more or less correlated; two matrices can be more or less aligned; two quantum states can be more or less independent. In each case, the inner product reveals this angular relationship through a profound formula first glimpsed in calculus.

Our development requires first a fundamental inequality -- one that ensures angles make sense in any inner product space.

\begin{lemma}[Cauchy-Schwarz Inequality]
\label{lem:C-S}
For any vectors $\vect{u},\vect{v}$ in an inner product space,
\[
    |\langle \vect{u},\vect{v}\rangle| \leq \|\vect{u}\|\|\vect{v}\|
\]
with equality if and only if one vector is a scalar multiple of the other.
\end{lemma}

\begin{proof}
For any real number $t$, positive-definiteness of the inner product requires:
\[
    0 \leq \|\vect{u} + t\vect{v}\|^2 = \|\vect{u}\|^2 + 2t\langle \vect{u},\vect{v}\rangle + t^2\|\vect{v}\|^2
\]
This quadratic in $t$ must be nonnegative for all $t$, possible only if its discriminant is nonpositive:
\[
    4\langle \vect{u},\vect{v}\rangle^2 \leq 4\|\vect{u}\|^2\|\vect{v}\|^2
\]
The case of equality follows by examining when this quadratic has exactly one root.
\end{proof}

This seemingly technical result has profound implications. It ensures that the ratio of inner product to product of norms cannot exceed unity in absolute value -- exactly what we need to define angles through the familiar cosine relationship.

The \style{angle} between nonzero vectors $\vect{u}$ and $\vect{v}$ in an inner product space is the unique number $\theta \in [0,\pi]$ satisfying:
\begin{equation}
    \cos\theta = \frac{\langle \vect{u},\vect{v}\rangle}{\|\vect{u}\|\|\vect{v}\|}
\end{equation}
When this angle is $\pi/2$, we say the vectors are \style{orthogonal} and write $\vect{u} \perp \vect{v}$.

That angles exist in all inner product spaces is remarkable. More remarkable still is how the familiar properties of Euclidean angles persist in the abstract setting. Orthogonal vectors have inner product zero; small angles correspond to nearly parallel vectors; obtuse angles arise when the inner product is negative. These properties transcend the specific setting, whether we measure angles between functions, matrices, or quantum states.

\begin{example}[Function Angles]
In $C([a,b])$ with the inner product $\langle f,g\rangle = \int_a^b f(t)g(t)\,dt$, two functions are orthogonal when their product integrates to zero. The angle between functions measures their correlation -- how their variations align over the interval. Functions with small angle vary similarly; orthogonal functions vary independently; functions at angle $\pi$ vary in opposition.
\end{example}
\begin{marginfigure}
{\em Example:} The functions $\sin x$ and $\cos x$ are orthogonal on $[-\pi,\pi]$ under this inner product -- a fact crucial to Fourier analysis. See Example \ref{ex:fourier}.
\end{marginfigure}

\begin{example}[Matrix Alignment]
Under the Frobenius inner product, matrices $A,B\in\R^{m\times n}$ form an angle through:
\[
    \cos\theta = \frac{\trace(A^TB)}{\|A\|_F\|B\|_F}
\]
This measures how aligned their entries are, with orthogonal matrices having zero entry-wise correlation. Such geometric interpretation of matrix relationships reveals structure hidden in algebraic formulas.
\end{example}

Orthogonality proves especially powerful in decomposing vectors. When $\vect{u}\perp\vect{v}$, the Pythagorean theorem generalizes:
\[
    \|\vect{u} + \vect{v}\|^2 = \|\vect{u}\|^2 + \|\vect{v}\|^2
\]
This additivity of squared norms for orthogonal vectors enables decomposition of complex vectors into simpler orthogonal components -- a principle that will guide our study of orthogonal bases and projections.

\begin{lemma}[Orthogonal Decomposition]
\label{lem:orthog-decomp}
Let $\vect{v}_1,\ldots,\vect{v}_k$ be mutually orthogonal nonzero vectors. Then they are linearly independent, and for any scalars $c_1,\ldots,c_k$:
\[
    \left\|\sum_{i=1}^k c_i\vect{v}_i\right\|^2 = \sum_{i=1}^k c_i^2\|\vect{v}_i\|^2
\]
\end{lemma}

\begin{marginfigure}
{\em Foreshadowing:} This decomposition principle will reach its full power when we construct orthonormal bases, enabling optimal approximations and computational methods.
\end{marginfigure}

The proof follows from the distributive property of inner products and the vanishing of cross terms between orthogonal vectors. More significant is the implication: orthogonal vectors combine independently, their contributions to any sum measurable separately without interference. This independence principle -- that orthogonal components can be analyzed separately -- pervades modern applications from signal processing to quantum mechanics.

We close with a subtle observation: while every inner product induces a norm, not every norm arises from an inner product. The $\ell^1$ and $\ell^\infty$ norms on $\R^n$, for instance, lack the geometric structure that inner products provide. The special character of inner product norms lies in how they encode angles -- a capability we shall exploit as we develop the theory of orthogonal bases and transformations.

% ==============================================
\section{Orthogonal \& Orthonormal Bases}
\label{sec:orthogonal}
% ==============================================

The bases we have thus far encountered arose from convenience or custom  --  coordinates chosen more by habit than principle. Yet some bases are intrinsically better than others, measuring vectors in ways that respect the inner product structure we have so carefully constructed. Such bases emerge from the concept of orthogonality, providing optimal frameworks for both theoretical understanding and practical computation.

A set of vectors $\{\vect{v}_1,\ldots,\vect{v}_k\}$ in an inner product space is \style{orthogonal} if each vector is perpendicular to all others:
\[
    \langle \vect{v}_i,\vect{v}_j\rangle = 0 \quad \text{for all }i\neq j
\]
When these vectors also have unit length, so that $\|\vect{v}_i\|=1$ for all $i$, we call the set \style{orthonormal}. Such collections combine the geometric elegance of perpendicularity with the computational convenience of unit vectors.
%
\begin{marginfigure}
{\em Example:} The standard basis $\{\ihat, \jhat, \khat\}$ for $\R^3$ is orthonormal under the dot product  --  a fact so familiar we often forget its significance.
\end{marginfigure}

The power of orthogonal vectors lies in how they decompose the space they span. When $\vect{v}_1,\ldots,\vect{v}_k$ are orthogonal, any vector in their span has a unique representation:
\[
    \vect{x} = \sum_{i=1}^k c_i\vect{v}_i \quad\text{where}\quad c_i = \frac{\langle \vect{x},\vect{v}_i\rangle}{\|\vect{v}_i\|^2}
\]
The coefficients emerge naturally from the inner product  --  no system of equations need be solved. When the vectors are orthonormal, this simplifies further to $c_i = \langle \vect{x},\vect{v}_i\rangle$, the inner product itself revealing the coordinates directly.

\begin{example}[Fourier Series]
\label{ex:fourier}
The functions $\{\sin nx, \cos nx\}_{n=1}^\infty$ form an orthogonal set in $C[-\pi,\pi]$ under the inner product
\[
    \langle f,g\rangle = \int_{-\pi}^\pi f(x)g(x)\,dx
\]
This orthogonality  --  discovered by Euler and exploited by Fourier  --  explains why trigonometric series decompose periodic functions so effectively. The Fourier coefficients arise naturally as inner products, without need for integration by parts or other technical devices.
\end{example}

An orthogonal or orthonormal set that spans a space forms a basis of particular elegance. Every vector has unique coordinates computable through inner products; the Pythagorean theorem holds for all linear combinations; geometric and algebraic properties align perfectly. Yet we cannot simply wish such bases into existence  --  we must construct them systematically.

The \style{Gram-Schmidt process} provides such construction. Beginning with any basis $\{\vect{b}_1,\ldots,\vect{b}_n\}$, we build an orthogonal basis $\{\vect{v}_1,\ldots,\vect{v}_n\}$ spanning the same space:
\[
\begin{array}{rcl}
\vect{v}_1 &=& \vect{b}_1 \\
\vect{v}_2 &=& \vect{b}_2 - \proj{\vect{v}_1}\vect{b}_2 \\
\vect{v}_3 &=& \vect{b}_3 - \proj{\vect{v}_1}\vect{b}_3 - \proj{\vect{v}_2}\vect{b}_3 \\
&\vdots& \\
\vect{v}_k &=& \vect{b}_k - \sum_{i=1}^{k-1}\proj{\vect{v}_i}\vect{b}_k
\end{array}
\]
%
\begin{marginfigure}
{\em Perspective:} While Gram-Schmidt orthogonalization may seem purely theoretical, it underlies key algorithms in modern recommender systems, where orthogonal feature vectors help prevent redundancy in user representations.
\end{marginfigure}
%
where $\proj{\vect{v}}\vect{u} = \frac{\langle \vect{u},\vect{v}\rangle}{\|\vect{v}\|^2}\vect{v}$ denotes orthogonal projection. Each new vector is made orthogonal to all previous ones by subtracting away its components in their directions.

\begin{example}[Polynomial Orthogonalization]
Consider the space $\poly_4$ with inner product $\langle f,g\rangle = \int_{-1}^1 f(x)g(x)\,dx$. Starting with the monomial basis $\{1,x,x^2,x^3,x^4\}$, Gram-Schmidt produces (up to scaling) the \style{Legendre polynomials}:
\[
\begin{array}{rcl}
P_0(x) &=& 1 \\
P_1(x) &=& x \\
P_2(x) &=& \frac{1}{2}(3x^2-1) \\
P_3(x) &=& \frac{1}{2}(5x^3-3x) \\
P_4(x) &=& \frac{1}{8}(35x^4-30x^2+3)
\end{array}
\]
\begin{marginfigure}
    Remarkably, these polynomials play a fundamental role not only in gravitational theory but also in quantum mechanics, where they describe angular momentum states, and in numerical integration, where they provide optimal quadrature points.
\end{marginfigure}
These orthogonal polynomials, discovered by Legendre in studying gravitational potential, arise naturally from imposing orthogonality on the simplest polynomial basis. The increasing complexity of coefficients reflects how each new polynomial must maintain orthogonality to all previous ones  --  a constraint leading to ever more intricate balancing of terms. 
\end{example}

The Gram-Schmidt process, though elegant in theory, can suffer numerical instability in practice. Each projection accumulates computational errors that can destroy orthogonality in the final basis. A more stable approach  --  \style{modified Gram-Schmidt}  --  applies the projections sequentially rather than simultaneously. Though mathematically equivalent, this version better preserves orthogonality in finite-precision arithmetic.
%
\begin{marginfigure}
{\em Foreshadowing:} The interplay between theoretical elegance and computational stability foreshadows deeper connections between matrix factorizations and numerical linear algebra.
\end{marginfigure}

Having constructed orthogonal bases, we might ask which are best suited to particular problems. The answer depends on what structure we wish to preserve or illuminate:
\begin{enumerate}
    \item For differential equations, bases of eigenfunctions reveal dynamical behavior
    \item In signal processing, Fourier bases expose frequency content
    \item For quantum systems, bases of energy eigenstates clarify measurement
    \item In data analysis, principal component bases optimize variance capture
\end{enumerate}
The choice of orthogonal basis shapes our view of the space and its vectors  --  a theme we shall explore more deeply when studying eigenvalues and singular values.
\begin{marginfigure}
    {\em BONUS!} While every finite-dimensional inner product space admits an orthonormal basis, infinite-dimensional spaces may resist such complete orthogonalization. If you go on to learn \style{functional analysis}, remember this!
\end{marginfigure}


% ==============================================
\section{Adjoints \& Transposes}
\label{sec:adjoints}
% ==============================================

The familiar operation of matrix transpose harbors deeper structure than first appears. When we write $A^T$ for a matrix $A$, we do more than reflect entries across the diagonal  --  we encode a fundamental relationship between linear transformations and inner products. This relationship, abstracted from its matrix origins, provides the key to understanding how transformations interact with geometric structure.

Consider first the matrix transpose in $\R^n$ with its standard inner product. For any matrix $A$, its transpose $A^T$ satisfies a crucial property: for all vectors $\vect{x}$ and $\vect{y}$,
\[
    \langle A\vect{x},\vect{y}\rangle = \langle \vect{x},A^T\vect{y}\rangle
\]
This seemingly innocent equation reveals something profound: the transpose $A^T$ is not merely a matrix operation but rather the unique linear transformation that preserves inner product relationships with $A$.

\begin{definition}[Adjoint]
\label{def:adjoint}
Let $V$ and $W$ be inner product spaces and $T:V\rightarrow W$ a linear transformation. The \style{adjoint} of $T$ is the unique linear transformation $T^*:W\rightarrow V$ satisfying:
\begin{equation}
    \langle T\vect{v},\vect{w}\rangle_W = \langle \vect{v},T^*\vect{w}\rangle_V
\end{equation}
%
\begin{marginfigure}
{\em Nota bene:} The existence and uniqueness of the adjoint require proof  --  neither is obvious from the definition. Existence requires the Riesz Representation Theorem (such deep machinery we shall avoid). 
\end{marginfigure}
%
for all $\vect{v}\in V$ and $\vect{w}\in W$.
\end{definition}
%

\begin{example}[Differentiation Adjoint]
Consider the differentiation operator $D:\poly_2\rightarrow\poly_1$ with inner product $\langle f,g\rangle = \int_0^1 f(x)g(x)\,dx$. Its adjoint $D^*:\poly_1\rightarrow\poly_2$ satisfies:
\[
    \int_0^1 (Df)(x)g(x)\,dx = \int_0^1 f(x)(D^*g)(x)\,dx
\]
Integration by parts reveals that $D^*$ involves both integration and boundary terms  --  a relationship fundamental to both differential equations and variational principles in mechanics.
\end{example}

The adjoint operation respects the algebraic structure of linear transformations while reversing their direction:
%
\begin{marginfigure}
{\em Nota bene:} the theory of adjoints extends to infinite-dimensional spaces, but requires additional machinery from functional analysis including completeness and boundedness of operators.
\end{marginfigure}
%
\begin{lemma}
\label{lem:adjoints}
For linear transformations $S$ and $T$ between finite-dimensional inner product spaces, and for any scalar $c$:
\begin{enumerate}
    \item $(S+T)^* = S^* + T^*$
    \item $(cT)^* = cT^*$
    \item $(ST)^* = T^*S^*$
    \item $(T^*)^* = T$
\end{enumerate}
\end{lemma}

\begin{marginfigure}
{\em Example:} For a rotation matrix $R$ in $\R^2$, the adjoint $R^*$ corresponds to rotation in the opposite direction  --  explaining why $R^T R = I$.
\end{marginfigure}

When we choose orthonormal bases for our spaces, the matrix of $T^*$ is the transpose of the matrix of $T$. This explains our notation: the abstract adjoint operation generalizes matrix transpose to arbitrary inner product spaces. The matrix transpose is merely the adjoint operation expressed in coordinates.

\begin{example}[Signal Processing]
In digital signal processing, filtering operations are linear transformations between signal spaces. The adjoint of a filter describes its time-reversed impulse response  --  a relationship crucial to matched filtering and signal detection. When the filter preserves signal energy (an inner product constraint), its adjoint is closely related to its inverse.
\end{example}

The relationship between a transformation and its adjoint illuminates the geometry of linear mappings. Some transformations equal their own adjoints ($T=T^*$); others satisfy $T^*=-T$. Most lie between these extremes, their deviation from self-adjointness measuring how they distort the inner product structure. This interplay between transformation and adjoint will guide our development of orthogonal transformations in the sections ahead.

% ==============================================
\section{Orthogonal Transformations}
\label{sec:orthog}
% ==============================================

The most elegant transformations preserve geometric structure. A rotation changes perspective while maintaining shape; a reflection inverts orientation while preserving angles. Such transformations  --  those respecting the inner product structure we have so carefully built  --  arise throughout engineering, from rigid body mechanics to quantum measurements to data analysis. Their power lies in a perfect fusion of geometric intuition and algebraic precision.
%
\begin{marginfigure}
{\em Note:} The term ``orthogonal'' here connotes preservation of all inner products, not merely orthogonality relations. A more accurate (though less traditional) name might be ``inner-product-preserving'' or ``orthonormal.''
\end{marginfigure}

\begin{definition}[Orthogonal Transformation]
\label{def:orthogonal}
A linear transformation $T:V\rightarrow V$ on an inner product space is \style{orthogonal} if it preserves inner products:
\[
    \langle T\vect{u},T\vect{v}\rangle = \langle \vect{u},\vect{v}\rangle
\]
for all vectors $\vect{u},\vect{v}\in V$.
\end{definition}

An orthogonal transformation preserves lengths and angles: for all $\vect{u}$ and $\vect{v}$,
\[
    \|T\vect{v}\| = \|\vect{v}\|
    \quad : \quad
    \cos\theta = \frac{\langle T\vect{u},T\vect{v}\rangle}{\|T\vect{u}\|\|T\vect{v}\|} = \frac{\langle \vect{u},\vect{v}\rangle}{\|\vect{u}\|\|\vect{v}\|}
\]

These geometric constraints have powerful algebraic consequences, connecting our work here to the theory of adjoints developed in the previous section:

\begin{lemma}
\label{lem:orthog}
For a linear transformation $T$ on a finite-dimensional inner product space, the following are equivalent:
\begin{enumerate}
    \item $T$ is orthogonal
    \item $T^*T = TT^* = I$
    \item $T^* = T^{-1}$
%    \item $T$ maps some (equivalently, any) orthonormal basis to an orthonormal basis
\end{enumerate}
\end{lemma}

% \begin{proof}
% That (1) implies (2) follows from the definition of adjoint: for all $\vect{u},\vect{v}$,
% \[
%     \langle T^*T\vect{u},\vect{v}\rangle = \langle T\vect{u},T\vect{v}\rangle = \langle \vect{u},\vect{v}\rangle = \langle I\vect{u},\vect{v}\rangle
% \]
% The non-degeneracy of the inner product implies $T^*T=I$. Similar reasoning yields $TT^*=I$. The remaining equivalences follow from properties of adjoints and the characterization of orthonormal bases through inner products.
% \end{proof}

When we choose orthonormal bases for our space, orthogonal transformations have particularly elegant matrix representations. A square matrix $Q$ is \style{orthogonal} if 
\begin{equation}
    Q^TQ = I = QQ^T 
\end{equation}
The set of all $n\times n$ orthogonal matrices is denoted $O(n)$.
Such matrices inherit remarkable properties that make them particularly valuable in computation:
%
\begin{marginfigure}
{\em Foreshadowing:} The power of orthogonal matrices in computation will become even clearer when we study the Singular Value Decomposition in Chapter \ref{ch:10}, where they provide optimal coordinate transformations for a variety of purposes.
\end{marginfigure}

\begin{lemma}
\label{lem:orthogmatrix}
An orthogonal matrix $Q$ satisfies:
\begin{enumerate}
    \item Its columns (and rows) form an orthonormal basis
    \item Its inverse equals its transpose: $Q^{-1} = Q^T$
    \item It preserves lengths: $\|Q\vect{x}\| = \|\vect{x}\|$
    \item It preserves angles: $(Q\vect{x})^T(Q\vect{y}) = \vect{x}^T\vect{y}$
    \item Its determinant is $\pm 1$
\end{enumerate}
\end{lemma}

\begin{example}[Rigid Body Motion]
The orientation of a rigid body in three-dimensional space is described by an orthogonal transformation. Its matrix representation $R$ in any orthonormal basis satisfies $R^TR=RR^T=I$, with $\det R = \pm 1$. When $\det R = 1$, the transformation represents a pure rotation; when $\det R = -1$, it includes a reflection.
%
\begin{marginfigure}
These mechanical constraints  --  preservation of distances and angles  --  arise from the physical principle that rigid bodies maintain their shape under motion. The orthogonality of $R$ encodes this fundamental geometric requirement algebraically.
\end{marginfigure}

This geometric structure explains why different physical quantities transform differently under rotation. Position vectors transform by $R$, while angular momentum vectors transform by $R^{-1}=R^T$, preserving their geometric relationships while respecting the physics of rigid motion.
\end{example}

\begin{example}[Signal Transforms]
The Discrete Fourier Transform (DFT), properly normalized, provides an orthogonal transformation on the space of discrete signals. Its orthogonality manifests as Parseval's identity  --  the conservation of signal energy between time and frequency domains:
\[
    \sum_{n=0}^{N-1} |x[n]|^2 = \frac{1}{N}\sum_{k=0}^{N-1} |X[k]|^2
\]
\begin{marginfigure}
    {\em Relax...} If this makes sense, cool. If not, no worries! {\em You are having a dream...}
\end{marginfigure}
where $x[n]$ is the time-domain signal and $X[k]$ its frequency-domain transform. This conservation arises precisely because the DFT basis vectors form an orthonormal set.
\end{example}


\begin{example}[Data Analysis]
In high-dimensional data analysis, orthogonal transformations provide optimal coordinate changes for revealing structure. Principal Component Analysis (PCA)  --  which we shall study in detail in Chapter \ref{ch:11}  --  seeks an orthogonal change of coordinates that aligns the data's axes of maximum variation. The orthogonality ensures that the new coordinates remain uncorrelated, while the inner product preservation guarantees no information is lost in the transformation.
\end{example}

The composition of orthogonal transformations is orthogonal, and the inverse of an orthogonal transformation is orthogonal. This closure under composition and inversion hints at deeper algebraic structure  --  the orthogonal transformations form a group under composition, though we shall not pursue this abstraction further.
%
\begin{marginfigure}
{\em Caveat:} The term ``group'' here has precise mathematical meaning, describing a set closed under an associative operation, with identity and inverses. Though we won't develop this theory, it explains much about how orthogonal transformations combine.
\end{marginfigure}

The power of orthogonal transformations lies in their fusion of geometric and algebraic properties. They provide the mathematical framework for rigid motion in mechanics, preserve energy in wave equations and quantum systems, and offer optimal coordinate changes for data analysis. Their ubiquity in engineering practice  --  from robotics to signal processing to machine learning  --  reflects a deeper truth: the most useful transformations are often those that preserve fundamental structure.

% ==============================================
\section{The QR Decomposition}
\label{sec:qr}
% ==============================================

Our development of orthogonality through Section \ref{sec:orthogonal} provides the foundation for another fundamental matrix factorization. The Gram-Schmidt process transforms any basis into an orthogonal one through systematic projection  --  a process that itself defines a natural decomposition of the original matrix. This factorization, called the QR decomposition, expresses any matrix as a product of an orthogonal matrix and an upper triangular one.

\begin{definition}[QR Decomposition]
\label{def:qr}
The \style{QR decomposition} of a square matrix $A\in\R^{n\times n}$ expresses it as a product
\[
    A = QR
\]
where $Q$ is orthogonal ($Q^TQ=I$) and $R$ is upper triangular. When $A$ has full rank, this decomposition is unique if we require the diagonal entries of $R$ to be positive.
\end{definition}

Consider how this decomposition arises from orthogonalizing $A$'s columns. Writing $A = [\vect{a}_1\;\cdots\;\vect{a}_n]$, the Gram-Schmidt process produces orthonormal vectors $\{\vect{q}_1,\ldots,\vect{q}_n\}$ where each $\vect{q}_k$ emerges from $\vect{a}_k$ by subtracting its projections onto previous vectors. The coefficients of these projections, together with the normalization factors, assemble naturally into an upper triangular matrix $R$. This construction reveals both why $Q$ must be orthogonal and why $R$ assumes triangular form  --  each column of $A$ expresses through the $\vect{q}_i$ only up to its own index.

\begin{example}[Simple QR Form]
Consider the matrix
\[
    A = \begin{bmatrix}
    3 & -4 \\
    4 & 3
    \end{bmatrix}
\]
Direct computation yields
\[
    Q = \begin{bmatrix}
    0.6 & -0.8 \\
    0.8 & 0.6
    \end{bmatrix}
    \quad\text{and}\quad
    R = \begin{bmatrix}
    5 & 0 \\
    0 & 5
    \end{bmatrix}
\]
The orthogonal factor $Q$ captures the rotation inherent in $A$, while the triangular factor $R$ represents scaling. This geometric decomposition  --  into pure rotation followed by scaling  --  exemplifies how matrix factorizations reveal underlying structure.
\end{example}

The QR decomposition provides more than mere factorization  --  it reveals the matrix's action through natural stages. Like the eigendecomposition studied in Chapter \ref{ch:7} and the singular value decomposition to come in Chapter \ref{ch:10}, QR decomposes a transformation into simpler, geometrically meaningful operations:
\begin{enumerate}
    \item The orthogonal factor $Q$ provides a new orthonormal basis
    \item The triangular factor $R$ describes coordinates in this basis
    \item Their product $QR$ reconstructs the original transformation
\end{enumerate}

\begin{lemma}[Existence and Uniqueness]
\label{lem:qrexist}
Every nonsingular matrix $A$ admits a unique QR decomposition with positive diagonal entries in $R$. When $A$ is singular, the decomposition exists but loses uniqueness in ways determined by the matrix's rank.
\end{lemma}

This factorization provides powerful tools for both theoretical analysis and practical computation. Systems of equations yield naturally to solution through QR factors: if $A\vect{x}=\vect{b}$, then $R\vect{x}=Q^T\vect{b}$ reduces to back-substitution. The orthogonal factor $Q$ preserves lengths and angles while $R$ implements a simple triangular transformation. Like a sculptor's fundamental techniques, mastery of the QR decomposition enables increasingly sophisticated applications through careful composition of simple operations.

The power of this decomposition emerges from its fusion of geometric and algebraic perspectives. Though discovered through algebraic orthogonalization, its true power flows from the geometric insight that any linear transformation factors naturally through orthogonal bases. This unity of geometry and algebra  --  a theme that has guided our development throughout  --  achieves particular clarity in the QR decomposition, where abstract orthogonality principles transform into practical computational tools.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% % **************** EMANATION *******************
% \emanation
% % **************** EMANATION *******************
% \section*{Clustering \& $K$-Means}
% \label{EM:clustering}
% % **************** EMANATION *******************

% Clustering is a fundamental task in data analysis: given a collection of points in some high-dimensional vector space $V$, we seek to organize them into natural groupings based on their geometric relationships. The inner product structure of $V$ (recall Definition \ref{def:innerproduct}) provides the mathematical foundation for measuring similarities and discovering these groupings. While many clustering algorithms exist, the $K$-means algorithm offers an elegant demonstration of how the geometric machinery developed in this chapter enables practical computation.

% Given vectors $\{\vect{x}_1,\ldots,\vect{x}_N\}\subset V$ and a specified number of clusters $K$, the $K$-means algorithm seeks cluster centers $\{\vect{c}_1,\ldots,\vect{c}_K\}$ minimizing the total squared distance from points to their nearest center:
% \[
%     \sum_{i=1}^N \min_{j=1,\ldots,K} \|\vect{x}_i - \vect{c}_j\|^2
% \]
% where distances are measured using the norm induced by the inner product (Definition \ref{def:norm}).

% % ++++++++++++++++++++++++++++++++++++++++++++
% \begin{algorithm}[H]
% \SetAlgoLined
% \KwData{Vectors $\{\vect{x}_1,\ldots,\vect{x}_N\}\subset V$, number of clusters $K$}
% \KwResult{Cluster centers $\{\vect{c}_1,\ldots,\vect{c}_K\}$ and assignments}
% Initialize $\vect{c}_1,\ldots,\vect{c}_K$ randomly from the data\;
% \While{not converged}{
%     \For{$i\leftarrow 1$ \KwTo $N$}{
%         Assign $\vect{x}_i$ to nearest center using norm from inner product\;
%     }
%     \For{$j\leftarrow 1$ \KwTo $K$}{
%         Update $\vect{c}_j$ to mean of points assigned to cluster $j$\;
%     }
% }
% \caption{$K$-means Clustering}
% \label{alg:kmeans}
% \end{algorithm}
% % ++++++++++++++++++++++++++++++++++++++++++++

% \begin{marginfigure}
% {\em Question:} How might one determine the optimal number of clusters $K$ when this is not known a priori? This becomes a crucial issue in exploratory data analysis.
% \end{marginfigure}

% The algorithm's effectiveness stems from the inner product geometry guiding both steps. The assignment phase uses distances computed via the inner product structure, while the update phase computes orthogonal projections onto the span of each cluster's points --- geometric operations whose theoretical foundations we established in Section \ref{sec:orthogonal}.

% \begin{example}[Handwritten Digits]
% The MNIST database of handwritten digits provides each image as a $28\times 28$ array of grayscale values, naturally viewed as a vector in $\R^{784}$. Running $K$-means with $K=10$ produces cluster centers that remarkably resemble "prototype" digits --- the algorithm discovers, through purely geometric means, the natural categories in the data.
% \end{example}

% \begin{marginfigure}
% {\em Foreshadowing:} In Chapter \ref{ch:10}, we will see how the Singular Value Decomposition provides optimal low-dimensional representations of data, enabling more efficient clustering in very high dimensions.
% \end{marginfigure}

% \begin{example}[Color Compression]
% A color image consists of points in $\R^3$, each representing an RGB triple. Applying $K$-means clustering to these points with $K=16$ effectively reduces the image to 16 representative colors while preserving its essential features. The algorithm selects these colors based on the geometric distribution in RGB space, where the standard inner product on $\R^3$ captures perceptual similarity.
% \end{example}

% \begin{marginfigure}
% {\em Foreshadowing:} Spectral clustering methods, which we shall encounter in Chapter \ref{ch:9}, use eigenvectors of the graph Laplacian to discover clusters that $K$-means might miss.
% \end{marginfigure}

% The power of this geometric approach extends beyond simple Euclidean clustering through various modifications of the inner product structure. By choosing different inner products or working in transformed feature spaces, we can adapt the clustering to respect different notions of similarity. The principles developed in this chapter --- from the abstract properties of inner products (Definition \ref{def:innerproduct}) to the concrete geometry of orthogonal projections (Lemma \ref{lem:orthog}) --- provide the mathematical foundation for these sophisticated data analysis tools.

% % **************** EMANATION *******************
% \emanation
% % **************** EMANATION *******************
% \section*{Image Segmentation through Inner Products}
% \label{EM:segmentation}
% % **************** EMANATION *******************

% The challenge of separating foreground from background in digital images --- a task fundamental to computer vision --- provides an elegant demonstration of inner product geometry in action. Each image presents itself as a collection of pixels, yet human observers effortlessly perceive distinct objects and regions. The mathematical machinery developed in this chapter enables automatic discovery of such natural divisions through careful application of inner products and orthogonal projections.

% Consider first how images map to our geometric framework. A grayscale image of size $m\times n$ becomes a vector in $\R^{mn}$, with each component representing one pixel's intensity. Color images triple this dimension: each pixel provides three numbers measuring red, green, and blue intensities. The inner product structure of these spaces enables precise measurement of similarity between image regions.

% \begin{marginfigure}
% {\em Nota bene:} Though we treat images as vectors, their spatial relationships remain crucial. Neighboring pixels tend to belong to the same region --- a fact that guides our choice of features.
% \end{marginfigure}

% Rather than work directly with raw pixel values, we compute local feature vectors that capture image structure. For each pixel position $(i,j)$, examine its neighborhood of radius $r$ to form vector $\vect{v}_{ij}\in\R^d$ encoding:
% \begin{itemize}
%     \item Local intensity patterns
%     \item Color distributions (for RGB images)
%     \item Basic geometric features like edges
% \end{itemize}
% The inner product $\langle\vect{v}_{ij},\vect{v}_{k\ell}\rangle$ then measures similarity between image patches centered at $(i,j)$ and $(k,\ell)$.

% To separate foreground from background, we exploit a fundamental observation: patches within the same region typically share similar features. Given a reference patch known to lie in the foreground, we can measure each pixel's likelihood of belonging to the foreground through orthogonal projection. Let $\vect{f}$ denote the feature vector of our reference patch, normalized to unit length. Then for any other patch with feature vector $\vect{v}$, the projection coefficient
% \[
%     \alpha = \langle\vect{v},\vect{f}\rangle
% \]
% measures its alignment with the foreground pattern.

% \begin{example}[Medical Image Analysis]
% Consider segmenting a medical scan to isolate a tumor. Given a small region manually identified as tumor tissue, we compute its characteristic feature vector $\vect{f}$. The projection formula from Section \ref{sec:orthogonal} then provides a principled score for each image location:
% \[
%     \text{tumor-likelihood}(i,j) = \frac{\langle\vect{v}_{ij},\vect{f}\rangle}{\|\vect{f}\|\|\vect{v}_{ij}\|}
% \]
% High scores indicate patches whose features align closely with known tumor tissue.
% \end{example}

% This simple model extends naturally through the orthogonal decomposition principles of Section \ref{sec:orthogcomp}. Rather than compare against a single reference vector, we can project onto the span of multiple examples. Given feature vectors $\{\vect{f}_1,\ldots,\vect{f}_k\}$ from confirmed tumor regions, the Gram-Schmidt process provides an orthonormal basis $\{\vect{u}_1,\ldots,\vect{u}_k\}$ for their span. The total projection
% \[
%     \sum_{i=1}^k |\langle\vect{v},\vect{u}_i\rangle|^2
% \]
% measures how well patch $\vect{v}$ aligns with the subspace of known tumor patterns.

% \begin{marginfigure}
% {\em Foreshadowing:} Chapter \ref{ch:11} will show how Principal Component Analysis discovers optimal bases for such feature spaces automatically from training data.
% \end{marginfigure}

% The orthogonal complement plays an equally important role. Patches very different from our reference patterns have feature vectors nearly orthogonal to the span of $\{\vect{f}_1,\ldots,\vect{f}_k\}$. This geometric insight suggests a robust segmentation criterion: classify patches as foreground only if they have:
% \begin{enumerate}
%     \item Large projection onto the span of reference patterns
%     \item Small projection onto the orthogonal complement
% \end{enumerate}

% \begin{example}[Robust Segmentation]
% Given orthonormal bases $\{\vect{u}_1,\ldots,\vect{u}_k\}$ for the foreground subspace $U$ and $\{\vect{w}_1,\ldots,\vect{w}_\ell\}$ for a background subspace $W$, we can score patches through the ratio:
% \[
%     \text{confidence}(\vect{v}) = \frac{\sum_{i=1}^k |\langle\vect{v},\vect{u}_i\rangle|^2}{\sum_{j=1}^\ell |\langle\vect{v},\vect{w}_j\rangle|^2}
% \]
% This naturally compares evidence for foreground versus background classification, with the orthogonality of the bases ensuring fair comparison.
% \end{example}

% The power of this geometric approach lies in its principled foundation. Rather than rely on heuristic rules, we measure similarity through inner products and decompose features through orthogonal projections. The mathematical framework developed in this chapter transforms the intuitive notion of ``similar image regions'' into precise computational tools.

% When applied to real medical images, this approach enables automatic detection of anomalies, tracking of tumor boundaries, and quantitative measurement of tissue properties. The same geometric principles extend beyond medicine to satellite imagery, industrial inspection, and robotic vision --- any domain where discovering structure in images proves crucial. Through careful application of inner products and orthogonality, we bridge the gap between human perception and computational analysis.

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Text Embeddings \& Semantic Search}
\label{EM:embeddings}
% **************** EMANATION *******************

The representation of text as vectors in high-dimensional spaces provides a striking application of inner product geometry. By encoding words or phrases as vectors in $\R^n$ (typically with $n$ very large), we can measure semantic similarity through the geometric tools developed in this chapter. The inner product structure (Definition \ref{def:innerproduct}) transforms abstract notions of meaning into concrete computational methods, enabling modern language models to process and understand text through geometric operations.

The key insight is that the angle between embedding vectors, measured through their normalized inner product, captures semantic similarity. For embedding vectors $\vect{v},\vect{w}\in\R^n$, their \style{cosine similarity} is defined as:
\[
    \cos\theta = \frac{\langle \vect{v},\vect{w}\rangle}{\|\vect{v}\|\|\vect{w}\|}
\]
This measure, ranging from -1 to 1, emerges naturally from the geometric structure we established in Section \ref{sec:angles}. Words or phrases with similar meanings yield embedding vectors with small angles between them, while unrelated concepts become nearly orthogonal in the embedding space.

% ++++++++++++++++++++++++++++++++++++++++++++
\begin{algorithm}[H]
\SetAlgoLined
\KwData{Query embedding $\vect{q}$, Database of embeddings $\{\vect{d}_1,\ldots,\vect{d}_N\}$}
\KwResult{Indices of $k$ most similar items}
Normalize query: $\vect{q} \leftarrow \vect{q}/\|\vect{q}\|$\;
\For{$i\leftarrow 1$ \KwTo $N$}{
    Normalize database vector: $\vect{d}_i \leftarrow \vect{d}_i/\|\vect{d}_i\|$\;
    Compute similarity: $s_i \leftarrow \langle \vect{q},\vect{d}_i\rangle$\;
}
Return indices of $k$ largest similarities\;
\caption{Semantic Search via Cosine Similarity}
\label{alg:semantic}
\end{algorithm}
% ++++++++++++++++++++++++++++++++++++++++++++

\begin{marginfigure}
{\em Caveat:} The effectiveness of embedding-based similarity depends crucially on the quality of the embedding vectors. Modern language models learn these embeddings through extensive training on large text corpora, optimizing the geometric structure to reflect semantic relationships.
\end{marginfigure}

\begin{example}[Word Analogies]
A remarkable feature of well-trained embedding spaces is their capture of semantic relationships through vector arithmetic. The classic example
\[
\textrm{king} - \textrm{man} + \textrm{woman} \simeq \textrm{queen} 
\]
demonstrates how the geometry of the embedding space encodes meaningful relationships: for a semantically suitable text embedding, the vector difference between {\em king} and {\em man} embeddings, when added to {\em woman}, produces a vector close (in cosine similarity) to the embedding of {\em queen}.
\end{example}

\begin{example}[Document Retrieval]
Given a corpus of documents, each encoded as a vector through averaging or sophisticated pooling of word embeddings, semantic search reduces to finding nearest neighbors under cosine similarity. This geometric approach captures topical similarity far better than traditional keyword matching, as documents using different but semantically related terms still yield vectors with small angular separation.
\end{example}

\begin{marginfigure}
{\em Foreshadowing:} In Chapter \ref{ch:10}, we shall see how the Singular Value Decomposition enables efficient approximate nearest neighbor search in high-dimensional embedding spaces through dimensionality reduction.
\end{marginfigure}

The power of this geometric approach extends beyond simple similarity search. Modern language models construct multiple layers of embedding spaces, each capturing different aspects of linguistic structure. The principles developed in this chapter --- from the abstract properties of inner products (Definition \ref{def:innerproduct}) to the concrete geometry of angles (Lemma \ref{lem:orthog}) --- provide the mathematical foundation for these sophisticated natural language processing systems.

The success of embedding-based methods in natural language processing exemplifies a broader principle: many complex relationships in data can be captured through careful geometric structure. Whether comparing documents, analyzing molecular structures, or processing social networks, the inner product geometry we have developed offers a powerful framework for measuring and exploiting similarity.


% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Quantum Measurement \& Observable Operators}
\label{EM:quantum}
\newcommand{\ket}[1]{\left|#1\right\rangle}
\newcommand{\bra}[1]{\left\langle#1\right|}
% **************** EMANATION *******************

Quantum mechanics provides perhaps the most elegant application of inner product geometry -- not as mere mathematical convenience, but as fundamental physical law. The mathematical structures developed in this chapter -- inner products, orthogonality, and adjoints -- emerge naturally as the language for describing quantum states and their measurement. For engineering students who have encountered quantum mechanics through physics, this section offers a fresh perspective grounded in the clarity of finite-dimensional linear algebra.

\begin{marginfigure}
{\em Historical Note:} The notations $\ket{\psi}$ for vectors (\style{kets}) and $\bra{\phi}$ for dual vectors (\style{bras}) were introduced by Dirac in 1939 to make inner products look like sandwich brackets: $\bra{\phi}\ket{\psi}$. Though initially viewed as merely clever typography, this notation proves remarkably effective for quantum calculations.
\end{marginfigure}
The key insight is that quantum states are simply vectors in an inner product space. A quantum bit or \style{qubit} -- the fundamental unit of quantum information -- lives in a two-dimensional complex inner product space $\C^2$. Any state can be written as a linear combination of two orthonormal basis states, traditionally denoted $\ket{0}$ and $\ket{1}$ (physics notation for vectors which we shall embrace briefly):
\[
    \ket{\psi} = \alpha\ket{0} + \beta\ket{1}, \quad \text{where } |\alpha|^2 + |\beta|^2 = 1
\]
The normalization condition $\|\ket{\psi}\|=1$ reflects a fundamental physical principle: probabilities must sum to one.

\begin{marginfigure}
{\em Nota bene:} Quantum mechanics traditionally uses complex vector spaces, and the key geometric concepts from this chapter -- inner products, orthogonality, and adjoints -- extend naturally from $\R^n$ to $\C^n$. The main modification is that inner products become conjugate-symmetric rather than symmetric.
\end{marginfigure}

When we measure a quantum state, we are effectively computing inner products. The probability of measuring the system in state $\ket{\phi}$ when it is in state $\ket{\psi}$ equals the squared magnitude of their inner product:
\[
    \mathbb{P}(\phi|\psi) = |\bra{\phi}\ket{\psi}|^2
\]
This geometric interpretation transforms quantum measurement from mysterious phenomenon into concrete computation: we simply project our state vector onto various measurement directions and compute probabilities from the resulting inner products.

More general measurements correspond to \style{observable operators} -- linear transformations that are their own adjoints (called \style{Hermitian} in physics when complex). The eigenvalues of these operators give the possible measurement outcomes, while their eigenvectors provide the corresponding measurement bases. This connection between adjoints and physical measurement provides fundamental constraints on what can be observed in quantum systems.

\begin{example}[Quantum Bit Measurement]
Consider measuring a qubit in state
\[
    \ket{\psi} = \frac{1}{\sqrt{2}}\begin{pmatrix}1\\1\end{pmatrix}
\]
in the standard basis. This state assigns equal probability to measuring either 0 or 1:
\[
    \mathbb{P}(0|\psi) = \left|\bra{0}\ket{\psi}\right|^2 = \left|\left\langle\begin{pmatrix}1\\0\end{pmatrix},\frac{1}{\sqrt{2}}\begin{pmatrix}1\\1\end{pmatrix}\right\rangle\right|^2 = \frac{1}{2}
\]
and similarly for $\mathbb{P}(1|\psi)$. The orthonormality of basis states ensures these probabilities sum to one.
\end{example}

A more general measurement corresponds to an observable operator like
\[
    A = \begin{bmatrix}1 & 0\\0 & -1\end{bmatrix}
\]
This operator is clearly self-adjoint ($A^* = A$) and has eigenvalues $\pm 1$ with corresponding normalized eigenvectors:
\[
    \vect{v}_+ = \begin{pmatrix}1\\0\end{pmatrix}, \quad
    \vect{v}_- = \begin{pmatrix}0\\1\end{pmatrix}
\]
When we measure this observable, we will obtain either $+1$ or $-1$, with probabilities determined by projecting our state onto these eigenvectors.

\begin{marginfigure}
{\em Caveat:} The restriction to self-adjoint operators for quantum observables emerges from a physical requirement: measurement outcomes must be real numbers. The eigenvalues of a self-adjoint operator are always real, providing exactly this guarantee.
\end{marginfigure}

This framework extends naturally to higher-dimensional systems. A register of $n$ qubits lives in a $2^n$-dimensional complex vector space -- explaining both the power and fragility of quantum computation. Each additional qubit doubles the dimension of the state space, enabling massive parallelism when manipulating quantum states. Yet this exponential scaling also explains why quantum states are so difficult to simulate classically: representing even 50 qubits requires storing $2^{50}$ complex numbers.

The principles of Section \ref{sec:orthogonal} find direct application in quantum error correction. By encoding logical qubits into higher-dimensional spaces using orthogonal subspaces, we can detect and correct certain types of errors through projection operations. The mathematical machinery of inner products and orthogonal complements provides the theoretical foundation for making quantum computation robust against noise.

\begin{example}[Error Detection]
A simple quantum error detecting code encodes a single logical qubit $\alpha\ket{0} + \beta\ket{1}$ into a two-qubit state:
\[
    \alpha\ket{00} + \beta\ket{11}
\]
This encoding uses orthogonal states $\ket{00}$ and $\ket{11}$, reserving $\ket{01}$ and $\ket{10}$ as error indicators. When an error occurs, it typically moves the state into the orthogonal complement of the code space -- allowing detection through projection measurements.
\end{example}

The study of quantum systems exemplifies how the geometric structures developed in this chapter emerge naturally in physical law. Inner products determine measurement probabilities, adjoints constrain observable operators, and orthogonal subspaces enable error correction. This unity of mathematics and physics provides both practical tools for quantum engineering and deeper insight into the role of geometry in physical law.

For the engineering student, quantum mechanics thus becomes not a collection of mysterious postulates, but a natural application of the linear algebraic concepts developed systematically in this text. The inner product structure first encountered abstractly in Section \ref{sec:dotinner} reveals itself as fundamental to the behavior of physical systems at their smallest scales.
% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 5}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}

\item Consider the inner product on $\poly_2$ given by $\langle f,g\rangle = \int_0^1 f(x)g(x)\,dx$. Find the norm of $p(x)=1+2x+x^2$.

\item Let $V$ be the space of $2\times 2$ matrices with the inner product $\langle A,B\rangle = \trace(A^TB)$. Find the angle between matrices $A=\begin{bmatrix}1 & 0\\0 & 1\end{bmatrix}$ and $B=\begin{bmatrix}1 & -1\\1 & 1\end{bmatrix}$.

\item Show that $\langle f,g\rangle = f(0)g(0) + f(1)g(1)$ defines an inner product on $\poly_1$. Find a vector orthogonal to $p(x)=x$.

\item On $\R^3$, define $\langle \vect{u},\vect{v}\rangle_w = 2u_1v_1 + 3u_2v_2 + u_3v_3$. Prove this is an inner product and find vectors of unit length.

\item For matrices $A$ and $B$, prove that $\trace(A^TB)=\trace(B^TA)$ and use this to show the Frobenius inner product is symmetric.

\item Let $\vect{u}=(1,2,2)^T$ and $\vect{v}=(2,4,-1)^T$. Find an orthonormal basis for $\spanset\{\vect{u},\vect{v}\}$ using the Gram-Schmidt process, then extend it to an orthonormal basis of $\R^3$. Write the resulting orthogonal matrix $Q$ and verify $Q^TQ=I$.

\item For integers $m,n\geq 0$, prove that the functions $\cos(mx)$ and $\sin(nx)$ are orthogonal on $[-\pi,\pi]$ under the standard $L^2$ inner product.

\item Let $C([0,2\pi])$ denote continuous functions on $[0,2\pi]$ with inner product $\langle f,g\rangle = \int_0^{2\pi} f(x)g(x)\,dx$. Show that $\{1,\cos x,\sin x\}$ forms an orthogonal set but not an orthonormal set. Find the appropriate scaling factors to make it orthonormal.

\item Consider the space $\sym_2$ of $2\times 2$ symmetric matrices with the Frobenius inner product. Find an orthonormal basis for this space, and verify it has the correct dimension.

\item Let $A$ be the matrix:
\[
    A = \begin{bmatrix}
    1 & 1 \\
    1 & 0 \\
    0 & 1
    \end{bmatrix}
\]
Find its QR decomposition by applying Gram-Schmidt orthogonalization to its columns. Verify that $Q^TQ=I$ and $A=QR$.

\item Show that if $Q$ is a $2\times 2$ orthogonal matrix with $\det Q>0$, then $Q$ must be a rotation matrix. If instead $\det Q<0$, what can you say about it?

\item For points in the plane, define $\langle \vect{p},\vect{q}\rangle_M = p_1q_1 + p_1q_2 + p_2q_1 + 2p_2q_2$. Write the matrix $M$ such that this inner product equals $\vect{p}^TM\vect{q}$. Is this a valid inner product? Justify your answer. If valid, find two orthogonal vectors under this inner product.

\item Define an inner product on $\R^2$ by $\langle \vect{u},\vect{v}\rangle = 4u_1v_1 + 4u_1v_2 + 4u_2v_1 + 5u_2v_2$. Find the angle between $\vect{u}=(1,0)^T$ and $\vect{v}=(0,1)^T$ under this inner product.

\item Consider a nonzero vector $\vect{v}\in\R^3$ and let $\proj{}=\proj{\vect{v}}$ denote orthogonal projection onto $\spanset\{\vect{v}\}$. Show that $\proj{}$ is symmetric: $\proj{}^T=\proj{}$.

\item For vectors $\vect{u},\vect{v}\in\R^n$, prove that the Cauchy-Schwarz inequality becomes an equality if and only if one vector is a scalar multiple of the other.

\item Prove that if $\vect{u}$ and $\vect{v}$ are orthogonal vectors in an inner product space, then the Pythagorean theorem holds: $\|\vect{u}+\vect{v}\|^2 = \|\vect{u}\|^2 + \|\vect{v}\|^2$.

\item An inner product satisfies $\|\vect{u}+\vect{v}\|^2 + \|\vect{u}-\vect{v}\|^2 = 2(\|\vect{u}\|^2 + \|\vect{v}\|^2)$ for all vectors $\vect{u},\vect{v}$. Prove this using properties of inner products.

\item Let $A$ and $B$ be $2\times 2$ matrices. Using the Frobenius inner product $\langle A,B\rangle = \trace(A^TB)$, show that if $A$ and $B$ are orthogonal as vectors in matrix space, then $\trace(AB^T)=0$. Is there a geometric interpretation of this orthogonality in terms of how $A$ and $B$ act on vectors in $\R^2$?

\item Let $T:V\to V$ be a linear transformation on an inner product space. Prove that if $\langle T\vect{v},\vect{v}\rangle = 0$ for all $\vect{v}\in V$, then $T=0$.

\item Let $V=\R^2$ with the standard inner product. For a fixed nonzero vector $\vect{a}$, define $T:V\to V$ by:
\[
    T\vect{x} = \vect{x} - 2\frac{\langle \vect{x},\vect{a}\rangle}{\|\vect{a}\|^2}\vect{a}
\]
Prove that $T$ preserves inner products: $\langle T\vect{x},T\vect{y}\rangle = \langle \vect{x},\vect{y}\rangle$ for all $\vect{x},\vect{y}\in V$.

\item For a subspace $W<V$ of an inner product space and vector $\vect{v}\in V$, prove that:
\[
    \|\vect{v} - \proj{W}\vect{v}\| \leq \|\vect{v} - \vect{w}\|
\]
for any $\vect{w}\in W$, with equality if and only if $\vect{w}=\proj{W}\vect{v}$. Interpret this result geometrically.

\item The temperature at point $(x,y)$ on a plate is given by $T(x,y)=e^{-x^2-y^2}$. Using the inner product $\langle f,g\rangle = \int_{-\infty}^\infty\int_{-\infty}^\infty f(x,y)g(x,y)\,dx\,dy$, find $\|T\|$.

\item The \style{Gram matrix} $G$ of vectors $\{\vect{v}_1,\ldots,\vect{v}_n\}$ has entries $g_{ij}=\langle \vect{v}_i,\vect{v}_j\rangle$. Prove that $G$ is: symmetric and has determinant zero if and only if the vectors are linearly dependent. 

\item Let $V$ be the space of continuous functions on $[0,1]$ with inner product $\langle f,g\rangle = \int_0^1 f(x)g(x)\,dx$. Find constants $a$ and $b$ such that $f(x)=a+bx$ is orthogonal to both $1$ and $x^2$.

\item For continuous functions on $[0,1]$, consider:
\[
    \langle f,g\rangle = \int_0^1 f(x)g(x)\,dx + f(0)g(0) + f(1)g(1)
\]
Prove this defines an inner product and find the norm of $f(x)=x(1-x)$.

\item On the space $C^1([0,1])$ of continuously differentiable functions, define:
\[
    \langle f,g\rangle = \int_0^1 (f(x)g(x) + f'(x)g'(x))\,dx
\]
Prove this defines an inner product and find the angle between $f(x)=x$ and $g(x)=x^2$.

\item Let $\poly_3[0,1]$ denote polynomials of degree at most 3 with inner product $\langle f,g\rangle = \int_0^1 f'(x)g'(x)\,dx$. Prove this defines a valid inner product on the subspace of polynomials with $f(0)=0$; then, find a polynomial orthogonal to both $x$ and $x^2$ under this inner product.

\end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Orthogonal Decomposition \& Data}
\label{ch:6}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``quadrangular the building rose the heavens squared by a line''}

\newthought{The marriage of geometry and algebra} reaches its first culmination in the notion of orthogonality. The inner product structure we have built transforms our understanding of the fundamental spaces and operations from Chapter 3. What was mere algebra  --  kernels and images, quotients and complements  --  acquires geometric meaning through perpendicularity. This geometric perspective not only illuminates the theory but guides computation, leading to optimal methods for solving systems and fitting data.

Our task is to reexamine the foundations laid in earlier chapters through the lens of orthogonality. The kernel of a transformation is not merely algebraically invisible but geometrically perpendicular to its coimage. Quotient spaces are not only algebraic identifications but induce orthogonal decompositions. The Fundamental Theorem itself expresses not merely dimensional accounting but geometric splitting of domain and codomain into perpendicular factors.

This geometric reinterpretation bears immediate practical fruit. When exact solutions to linear systems do not exist, orthogonal projections provide best approximations under natural measures of error. Data corrupted by noise finds clean representation through projection onto suitable subspaces. The abstract machinery of inner products descends from theory to practice, offering optimal methods for a host of engineering problems.

The path ahead revisits familiar territory with new eyes. We begin by reinterpreting the four fundamental spaces through orthogonality. This geometric understanding leads naturally to projection operators  --  the workhorses of decomposition. These tools in hand, we return to the Fundamental Theorem, as emanation of orthogonal decomposition of spaces. Finally, we apply this geometric machinery to the practical problems of least squares approximation, where theory guides us to optimal solutions of overdetermined systems.

This synthesis of geometry and algebra  --  of inner products and linear transformations  --  provides the foundation for many modern computational methods. The principles we develop here will guide our study of eigenvalues, singular values, and principal components in chapters ahead. 
%
\begin{marginfigure}
{\em Foreshadowing}: The orthogonal projections studied here form the theoretical foundation for dimensionality reduction in machine learning, where high-dimensional data is projected onto lower-dimensional subspaces that capture essential features.
\end{marginfigure}

% ==============================================
\section{Orthogonal Subspaces \& Complements}
\label{sec:orthogcomp}
% ==============================================

The vector spaces we have studied thus far acquire richer structure through the inner product geometry of Chapter \ref{ch:5}. Just as individual vectors may stand perpendicular to one another, entire subspaces can be orthogonal, leading to natural geometric decompositions that both illuminate theory and enable computation.

\begin{definition}[Orthogonal Complement]
\label{def:orthog}
Two subspaces $U,W$ of an inner product space $V$ are \style{orthogonal}, denoted $U\perp W$, if every vector in one is orthogonal to every vector in the other:
\[
    \langle \vect{u},\vect{w}\rangle = 0 \quad\text{for all }\vect{u}\in U,\ \vect{w}\in W
\]
The \style{orthogonal complement} of a subspace $U<V$, denoted $U^\perp$, is the subspace of all vectors orthogonal to $U$:
\[
    U^\perp = \{\vect{v}\in V : \langle \vect{v},\vect{u}\rangle = 0\text{ for all }\vect{u}\in U\}
\]
\end{definition}

That $U^\perp$ is indeed a subspace follows readily from properties of the inner product: the zero vector is certainly orthogonal to all of $U$, and linear combinations of vectors orthogonal to $U$ remain orthogonal to $U$.

\begin{lemma}
\label{lem:orthogcomp}
For $V$ a finite-dimensional inner product space and any $U<V$:
\begin{enumerate}
    \item $(U^\perp)^\perp = U$
    \item $\dim U + \dim U^\perp = \dim V$
    \item $U\cap U^\perp = \{\vect{0}\}$
    \item $V = U\oplus U^\perp$
\end{enumerate}
\end{lemma}

\begin{proof}
That $U\subseteq(U^\perp)^\perp$ is clear from definitions. For the reverse inclusion, choose any orthonormal basis $\{\vect{u}_1,\ldots,\vect{u}_k\}$ for $U$ and extend it to an orthonormal basis of $V$. The complement of this basis spans $U^\perp$, from which the dimensional equality follows. The remaining properties emerge from this explicit construction.
\end{proof}

The last property is especially significant: every vector in $V$ decomposes uniquely into orthogonal components lying in $U$ and $U^\perp$. We will write $V=U\orthosum U^\perp$ to denote an \style{orthogonal direct sum} of $V$ into subspaces $U$ and $U^\perp$ that are both complementary and orthogonal. This $\orthosum$-decomposition will prove fundamental to our development of projection operators in the next section.

\begin{example}[Matrix Subspaces]
\label{ex:matrixorthog}
Consider the space $\R^{n\times n}$ with the Frobenius inner product $\langle A,B\rangle = \trace(A^TB)$ from Example \ref{ex:Frobenius}. The subspace $\sym_n$ of symmetric matrices and the subspace $\skewsym_n$ of skew-symmetric matrices are orthogonal complements:
\[
    \R^{n\times n} = \sym_n \orthosum \skewsym_n
\]
\[
    \sym_n = \{A : A^T = A\} \quad\text{and}\quad \skewsym_n = \{A : A^T = -A\}
\]
This orthogonal decomposition reveals that any matrix $A$ splits uniquely into symmetric and skew-symmetric parts:
\[
    A = \frac{A + A^T}{2} + \frac{A - A^T}{2}
\]
This decomposition has important applications in mechanics, where symmetric matrices often represent stress and strain.
\end{example}

The geometric perspective offered by orthogonal complements will transform our understanding of the fundamental subspaces from Chapter \ref{ch:3}. What appeared there as purely algebraic constructions  --  kernels and images, quotients and duals  --  will acquire new geometric meaning through orthogonality. This reinterpretation through perpendicularity will guide our development of optimal methods for solving systems and approximating data in the sections ahead.

% ==============================================
\section{Projections \& Quotients}
\label{sec:projquot}
% ==============================================

The concept of projection pervades mathematics and engineering. A shadow cast by sunlight projects three-dimensional objects onto the plane; a surveyor's map projects the curved surface of Earth onto flat paper; a statistician projects high-dimensional data onto informative lower-dimensional summaries. These diverse examples share a common mathematical essence: the approximation of complex objects by simpler ones through systematic dimension reduction.

The inner product structure we have built transforms this intuitive notion into precise mathematics. Given a subspace $W < V$, we seek to approximate arbitrary vectors in $V$ by their ``shadows'' in $W$  --  vectors that minimize the distance to the original while lying entirely in $W$. This geometric problem leads directly to orthogonal projection, a concept that unifies the theoretical structure of Chapter 3 with the computational methods of modern data analysis.

\begin{definition}[Orthogonal Projection]
\label{def:projection}
Let $W < V$ be a subspace of an inner product space. The \style{orthogonal projection} onto $W$ is the linear transformation $\proj{W}:V\to V$ satisfying:
\begin{enumerate}
    \item $\proj{W}\vect{v} \in W$ for all $\vect{v}\in V$ (projects onto $W$)
    \item $\vect{v} - \proj{W}\vect{v} \perp W$ for all $\vect{v}\in V$ (projects orthogonally)
\end{enumerate}
\end{definition}

\begin{marginfigure}
{\em Nota bene:} The projection $\proj{W}$ is uniquely determined by these properties. Though other transformations might map vectors into $W$, only orthogonal projection maintains perpendicularity of the error.
\end{marginfigure}

This seemingly abstract definition encodes a powerful optimization principle: $\proj{W}\vect{v}$ provides the best approximation to $\vect{v}$ within $W$ under the natural distance measure induced by the inner product.

\begin{lemma}[Best Approximation]
\label{lem:bestapprox}
For any $\vect{v}\in V$ and $\vect{w}\in W$:
\[
    \|\vect{v} - \proj{W}\vect{v}\| \leq \|\vect{v} - \vect{w}\|
\]
with equality if and only if $\vect{w} = \proj{W}\vect{v}$.
\end{lemma}

\begin{proof}
For any $\vect{w}\in W$, the error vector $\vect{v} - \vect{w}$ decomposes into orthogonal components:
\[
    \vect{v} - \vect{w} = (\vect{v} - \proj{W}\vect{v}) + (\proj{W}\vect{v} - \vect{w})
\]
where the first term is orthogonal to $W$ and the second lies in $W$. By the Pythagorean theorem:
\[
    \|\vect{v} - \vect{w}\|^2 = \|\vect{v} - \proj{W}\vect{v}\|^2 + \|\proj{W}\vect{v} - \vect{w}\|^2
\]
The right term vanishes precisely when $\vect{w} = \proj{W}\vect{v}$.
\end{proof}

When $W$ has an orthonormal basis $\{\vect{w}_1,\ldots,\vect{w}_k\}$, the projection takes an especially elegant form:
\[
    \proj{W}\vect{v} = \sum_{i=1}^k \langle \vect{v},\vect{w}_i\rangle\vect{w}_i
\]
Each coefficient emerges naturally as the inner product with the corresponding basis vector  --  no system of equations need be solved. This formula reveals projection as a type of spectral decomposition, extracting from $\vect{v}$ precisely those components aligned with $W$'s basis vectors.

\begin{example}[Signal Processing]
Consider the space $V = C([-\pi,\pi])$ of continuous functions on $[-\pi,\pi]$ with the $L^2$ inner product of Example \ref{ex:fourier}. The subspace $W$ spanned by $\{1,\cos x,\sin x\}$ captures the DC and first harmonic components of signals. The projection $\proj{W}$ implements a basic low-pass filter, approximating arbitrary signals by their first Fourier components. The error $\vect{v} - \proj{W}\vect{v}$ represents higher-frequency content filtered out by the projection.
\end{example}

Projection operators possess several properties that illuminate their geometric and algebraic character:

\begin{lemma}[Projection Properties]
\label{lem:projprop}
The orthogonal projection $\proj{W}$ satisfies:
\begin{enumerate}
    \item Idempotence: $\proj{W}^2 = \proj{W}$
    \item Self-adjointness: $\proj{W}^* = \proj{W}$
    \item Complementarity: $I - \proj{W} = \proj{W^\perp}$
\end{enumerate}
where $W^\perp$ denotes the orthogonal complement of $W$.
\end{lemma}

These properties reflect the geometric nature of projection  --  applying it twice has no additional effect; it respects the inner product structure; and it decomposes space into orthogonal pieces. The last property provides particular insight: projection onto $W$ and projection onto $W^\perp$ split any vector into complementary components.

\begin{marginfigure}
{\em Foreshadowing:} This decomposition principle will reach its full power in Chapter 10, where the Singular Value Decomposition provides an optimal sequence of orthogonal projections for approximating data.
\end{marginfigure}

This splitting illuminates our earlier study of quotient spaces. When we quotient $V$ by a subspace $U$, each equivalence class consists of vectors differing by elements of $U$. Through the lens of orthogonality, we can represent each class by its projection onto $U^\perp$  --  the unique member minimizing distance to the origin. The quotient space $V/U$ becomes naturally isomorphic to $U^\perp$, with the projection $\proj{U^\perp}$ providing an explicit isomorphism.

%
\begin{marginfigure}
\centering
\begin{tikzcd}[column sep=2.5em, row sep=2.5em]
V \arrow[r, "{\pi}", dashed] \arrow[d, "{\proj{U^\perp}}"] & V/U \arrow[d, "{\varphi}"] \\
U^\perp \arrow[r, "{\psi}"] & W
\end{tikzcd}
\end{marginfigure}

The relationship between projections and quotients can be visualized through a commutative diagram.
%
The natural quotient map $\pi$ (dashed) makes the diagram commute: both paths from $V$ to $W$ yield equivalent results.
\begin{marginfigure}
{\em Nota bene:} The dashed arrow represents the natural quotient map, while solid arrows show explicit linear transformations.
\end{marginfigure}
%
Here $V/U$ denotes the quotient space, $U^\perp$ is the orthogonal complement, and $W$ is any space isomorphic to both (often taken to be the image of a linear transformation). The diagram commutes in that both paths from $V$ to $W$ yield the same result  --  whether we first project onto $U^\perp$ or pass to the quotient space $V/U$. This geometric perspective illuminates why quotient spaces and orthogonal complements provide equivalent ways of understanding the effective domain of a linear transformation.

\begin{example}[Data Centering]
Consider a collection of vectors $\{\vect{x}_1,\ldots,\vect{x}_n\}$ in $\R^d$. The subspace $U$ spanned by $\vect{1} = (1,\ldots,1)^T$ represents uniform translations. Projecting onto $U^\perp$ centers the data by subtracting means  --  a fundamental preprocessing step in data analysis. The quotient $\R^d/U$ captures the intrinsic shape of the data cloud independent of its absolute position.
\end{example}

The power of orthogonal projection extends far beyond these elementary examples. When exact solutions to linear systems do not exist, projection onto appropriate subspaces yields optimal approximations. When data contains noise, projection onto signal subspaces enables filtering and compression. When complex systems require simplified models, projection onto lower-dimensional spaces balances accuracy and complexity. These applications, and many more, spring from the simple geometric principle encoded in Definition \ref{def:projection}.


% ==============================================
\section{The Fundamental Theorem Redux}
\label{sec:fundredux}
% ==============================================

The Fundamental Theorem of Linear Algebra, when viewed in finite-dimensional inner product spaces, reveals deeper structure through the lens of orthogonality. What first appeared as a collection of dimensional relationships now emerges in its true form: a statement about the geometric splitting of spaces. The four fundamental subspaces  --  kernel, image, cokernel, and coimage  --  relate not merely through counting of dimensions but through perpendicularity. This geometric understanding, though restricted to the finite-dimensional setting, transforms our perspective on linear transformations, providing both theoretical insight and practical methods for computation.

 Recall, we write $A\orthosum B$ to denote an orthogonal direct sum: subspaces $A$ and $B$ that are both complementary ($V = A\directsum B$) and orthogonal ($A\perp B$).

\begin{theorem}[Fundamental Theorem of Linear Algebra (Geometric Form)]
\label{thm:FTLAgeom}
Any linear transformation $T:V\rightarrow W$ between finite-dimensional inner product spaces induces orthogonal decompositions of both domain and codomain:
\begin{equation}
\label{eq:orthogonalFTLA}
    V = \ker T \orthosum (\ker T)^\perp \qquad\text{and}\qquad W = \im T \orthosum (\im T)^\perp
\end{equation}
These decompositions are connected by the following:
\begin{marginfigure}
{\em Nota bene:} The term \style{naturally isomorphic} here means that the isomorphisms arise from the geometric structure itself.
\end{marginfigure}
\begin{enumerate}
    \item The restriction of $T$ to $(\ker T)^\perp$ gives an isomorphism $(\ker T)^\perp \iso \im T$
    \item The coimage $V/\ker T$ is naturally isomorphic to $\im T$
    \item The cokernel $W/\im T$ is naturally isomorphic to $(\im T)^\perp$
    \item The orthogonal projections $\proj{(\ker T)^\perp}$ and $\proj{\im T}$ commute with $T$
\end{enumerate}
\end{theorem}
%
These decompositions illuminate the four fundamental subspaces through geometry rather than algebra. The kernel represents vectors invisible to $T$; its orthogonal complement captures the effective inputs. The image contains all possible outputs; its orthogonal complement measures the transformation's deficiency. Each complementary pair provides a complete view of how $T$ acts on its domain and codomain.


\begin{example}[Matrix Transformations]
For a matrix $A\in\R^{m\times n}$, these decompositions have immediate computational significance with respect $\row(A)$ and $\column(A)$, the row and column spaces respectively.

%
\begin{enumerate}
\item $\R^n = \ker(A) \orthosum \row(A)^T$ splits input space
\item $\R^m = \column(A) \orthosum \ker(A^T)$ splits output space
\item The projections $\proj{\row(A)^T}$ and $\proj{\column(A)}$ provide optimal approximate solutions when exact solutions do not exist
\end{enumerate}
%
\end{example}
%
\begin{marginfigure}
{\em Foreshadowing:} The orthogonal complements appearing here foreshadow the notion of orthogonal matrices in Chapter \ref{ch:7}, where entire transformations preserve perpendicularity.
\end{marginfigure}
%

These geometric splittings yield the algebraic relationships of Chapter \ref{ch:3} as corollaries:
\begin{cor}[Rank-Nullity Redux]
\label{cor:fundthm}
For a linear transformation $T:V\rightarrow W$ between finite-dimensional inner product spaces:
\begin{enumerate}
\item $\dim V = \dim\ker T + \dim\im T$
\item $\dim W = \dim\im T + \dim\coker T$
\item $\dim\im T = \dim(\ker T)^\perp$
\item The rank equals both $\dim\im T$ and $\dim\coim T$
\end{enumerate}
\end{cor}
The proof follows from the orthogonal decompositions, with the dimensions of complementary subspaces summing to the dimension of the whole space. What before seemed like mysterious algebraic coincidences now emerge as natural consequences of geometric splitting.

This geometric perspective guides computation. When solving $T\vect{v}=\vect{w}$:
\begin{enumerate}
\item Project $\vect{w}$ onto $\im T$ to test solvability
\item If solvable, find a particular solution in $(\ker T)^\perp$
\item If unsolvable, project onto $\im T$ for best approximation
\end{enumerate}


The Fundamental Theorem thus reveals itself as more than mere algebra  --  it expresses the fundamental geometric structure of linear transformations. This unity of geometry and algebra provides both theoretical insight and practical methods, a theme that will deepen as we proceed to least squares problems and their applications.


% ==============================================
\section{The Pseudoinverse}
\label{sec:pseudoinverse}
% ==============================================

The Fundamental Theorem of Linear Algebra revealed how any linear transformation induces four fundamental subspaces, connected through orthogonal decomposition of domain and codomain. This elegant structure suggests a natural question: can we define a reverse transformation that somehow undoes the action of our original map while respecting these geometric relationships? The answer lies in the \style{pseudoinverse}  --  a generalization of matrix inversion that provides the optimal approximate inverse even for singular or rectangular matrices.

\begin{definition}[Pseudoinverse]
\label{def:pseudoinverse}
For a linear transformation $T:V\rightarrow W$ between finite-dimensional inner product spaces, the \style{pseudoinverse} (or \style{Moore-Penrose inverse}) $T^{\dagger}:W\rightarrow V$ is the unique linear transformation satisfying all of the following conditions:
\begin{enumerate}
    \item $TT^{\dagger}T = T$ \quad (First consistency condition)
    \item $T^{\dagger}TT^{\dagger} = T^{\dagger}$ \quad (Second consistency condition)
    \item $(TT^{\dagger})^* = TT^{\dagger}$ \quad (First adjoint condition)
    \item $(T^{\dagger}T)^* = T^{\dagger}T$ \quad (Second adjoint condition)
\end{enumerate}
where $^*$ denotes the adjoint operation with respect to the inner products on $V$ and $W$.
\end{definition}

This abstract definition, while complete, may obscure the profound geometric meaning of the pseudoinverse. Through the lens of the Fundamental Theorem, the pseudoinverse emerges naturally from orthogonal projections onto the four fundamental subspaces:

% ****************************************************
\begin{figure}
    \centering
    \includegraphics[width=0.75\linewidth]{pseudoinverse.png}
    \caption{The pseudoinverse $T^{\dagger}$ as the reverse of $T$ through the fundamental subspaces, with $T$ first projecting onto $(\ker T)^{\perp}$, then isomorphically mapping to $\im T$; while
    $T^{\dagger}$ first projects onto $\im T$, then isomorphically maps to $(\ker T)^{\perp}$.
}
    \label{fig:pseudoinverse}
\end{figure}
% ****************************************************

This symmetry reveals the pseudoinverse not as an ad hoc construction, but as the natural reverse map that respects the orthogonal structure imposed by our inner products.

For matrices, the pseudoinverse takes a particularly elegant form. When $A$ has full column rank, its pseudoinverse is:
\[
A^{\dagger} = (A^TA)^{-1}A^T
\]
This formula connects directly to the orthogonal projections studied earlier in this chapter: $A^TA$ is invertible precisely because $A$ has full column rank, making $\ker(A) = \{\mathbf{0}\}$. When $A$ has full row rank, the pseudoinverse becomes:
\[
A^{\dagger} = A^T(AA^T)^{-1}
\]
These formulas illustrate how the pseudoinverse generalizes the concept of matrix inversion to rectangular matrices, providing the best possible approximate inverse when an exact inverse doesn't exist.

\begin{example}[Orthogonal Projection]
\label{ex:pseudoinverseprojection}
Consider the orthogonal projection $\proj{U}:V\to V$ onto a subspace $U<V$. Its fundamental spaces are:
\begin{itemize}
    \item $\ker(\proj{U}) = U^{\perp}$
    \item $\im(\proj{U}) = U$
\end{itemize}
The pseudoinverse $\proj{U}^{\dagger}$ equals $\proj{U}$ itself, as projection already satisfies all four pseudoinverse conditions. Indeed, projection is its own pseudoinverse precisely because it is already idempotent ($\proj{U}^2 = \proj{U}$) and self-adjoint ($\proj{U}^* = \proj{U}$).
\end{example}

\begin{example}[Full Column Rank Matrix]
\label{ex:pseudoinverserank}
Consider the matrix
\[
A = \begin{bmatrix}
1 & 0 \\
0 & 1 \\
1 & 0
\end{bmatrix}
\]
which has full column rank. Its pseudoinverse is given by:
\[
A^{\dagger} = (A^TA)^{-1}A^T = \begin{bmatrix}
2 & 0 & 1 \\
0 & 1 & 0
\end{bmatrix}^{-1}
\begin{bmatrix}
1 & 0 & 1 \\
0 & 1 & 0
\end{bmatrix}
= 
\begin{bmatrix}
1/2 & 0 & 1/2 \\
0 & 1 & 0
\end{bmatrix}
\]
This pseudoinverse properly balances the redundant information in the first and third rows of $A$.
\end{example}

\begin{theorem}[Pseudoinverse Properties]
\label{thm:pseudoinverseprops}
The pseudoinverse $T^{\dagger}$ satisfies:
\begin{enumerate}
    \item $T^{\dagger}$ maps $\im T$ isomorphically to $(\ker T)^{\perp}$
    \item $T^{\dagger}$ maps $(\im T)^{\perp}$ to $\ker T$
    \item $TT^{\dagger}$ is the orthogonal projection onto $\im T$
    \item $T^{\dagger}T$ is the orthogonal projection onto $(\ker T)^{\perp}$
    \item If $T$ is invertible, then $T^{\dagger} = T^{-1}$
    \item $(T^{\dagger})^{\dagger} = T$
    \item $(T^*)^{\dagger} = (T^{\dagger})^*$
\end{enumerate}
\end{theorem}

In the general case where $A$ is neither full row nor full column rank, the construction of the pseudoinverse becomes more involved. One approach uses the orthogonal projections onto the fundamental subspaces as illustrated in Figure \ref{fig:pseudoinverse}. First, we project onto the image space $\im(A)$ using the projection operator $P_{\im(A)}$. Then, we apply the isomorphism between $\im(A)$ and $(\ker A)^{\perp}$, followed by the inclusion map back into the domain.

% ==============================================
\section{Least Squares Approximation}
\label{sec:leastsquares}
% ==============================================

The pseudoinverse transforms abstract decompositions into practical computational tools. For systems where exact solutions fail to exist, it navigates the fundamental subspaces to provide optimal approximations. This optimality  --  finding the best possible approximate solution under natural measures of error  --  lies at the heart of least squares approximation. The geometric structure revealed through the Fundamental Theorem provides not just theoretical understanding but practical methods for data fitting and analysis.

Consider the system $A\vect{x}=\vect{b}$ where $A:\R^n\to\R^m$ with $m>n$. Such systems typically arise when fitting models to data: each row represents an observation, each column a parameter to be determined. Though $\vect{b}$ rarely lies in $\im A$, the orthogonal decomposition of the codomain $\R^m$ guides us to the optimal approximation:
\[
    \R^m = \im A \orthosum (\im A)^\perp
\]
The vector $\vect{b}$ thus splits uniquely as $\vect{b} = \vect{b}_1 + \vect{b}_2$ where $\vect{b}_1 \in \im A$ and $\vect{b}_2 \in (\im A)^\perp$. Since $\vect{b}_1$ lies in $\im A$, there exists $\hat{\vect{x}}$ such that $A\hat{\vect{x}} = \vect{b}_1$. This vector $\hat{\vect{x}}$ provides precisely the least squares solution we seek, as the following theorem confirms:

\begin{theorem}[Least Squares Solution]
\label{thm:leastsq}
For a full-rank matrix $A\in\R^{m\times n}$ with $m>n$, the system $A\vect{x}=\vect{b}$ has unique least squares solution:
\[
    \hat{\vect{x}} = A^{\dagger}\vect{b} = (A^TA)^{-1}A^T\vect{b}
\]
This solution minimizes $\|\vect{b}-A\vect{x}\|$ over all $\vect{x}\in\R^n$.
\end{theorem}

\begin{proof}
The error vector $\vect{b}-A\vect{x}$ must be orthogonal to $\im A$ at the minimum, else we could reduce its length through projection. This orthogonality condition means:
\[
    \langle \vect{b}-A\hat{\vect{x}},A\vect{v}\rangle = 0 \quad\text{for all }\vect{v}\in\R^n
\]
Therefore $A^T(\vect{b}-A\hat{\vect{x}})=\vect{0}$, yielding the \style{normal equations}:
\[
    A^TA\hat{\vect{x}} = A^T\vect{b}
\]
When $A$ has full rank, $A^TA$ is positive definite hence invertible, giving the stated solution. This solution equals $A^{\dagger}\vect{b}$ by the definition of the pseudoinverse for full-column rank matrices. The pseudoinverse $A^{\dagger} = (A^TA)^{-1}A^T$ transforms our abstract decomposition into concrete computational methods for finding optimal approximations.

To confirm this minimizes the error, note that for any $\vect{x}$:
\[
    \|\vect{b} - A\vect{x}\|^2 = \|\vect{b} - A\hat{\vect{x}} + A\hat{\vect{x}} - A\vect{x}\|^2 = \|\vect{b} - A\hat{\vect{x}}\|^2 + \|A\hat{\vect{x}} - A\vect{x}\|^2
\]
since $(\vect{b} - A\hat{\vect{x}}) \perp \im A$ by construction. The second term vanishes precisely when $\vect{x} = \hat{\vect{x}}$, confirming optimality.
\end{proof}

\begin{marginfigure}
{\em Nota bene:} The term ``normal equations'' reflects geometric normality  --  the error vector stands perpendicular to the solution space.
\end{marginfigure}

The normal equations emerge naturally from projecting $\vect{b}$ onto $\im A$. Indeed, the matrix product $A(A^TA)^{-1}A^T$ implements precisely this orthogonal projection. Through the pseudoinverse, we connect the geometric structure of the Fundamental Theorem directly to computational methods for data fitting. This connection transforms abstract subspaces into practical tools for approximation.

\begin{example}[Linear Regression]
Consider fitting a line $y=mx+b$ to points $(x_1,y_1),\ldots,(x_n,y_n)$. This leads to the overdetermined system:
\[
    \begin{bmatrix}
    x_1 & 1 \\
    x_2 & 1 \\
    \vdots & \vdots \\
    x_n & 1
    \end{bmatrix}
    \begin{pmatrix}
    m \\ b
    \end{pmatrix}
    =
    \begin{pmatrix}
    y_1 \\ y_2 \\ \vdots \\ y_n
    \end{pmatrix}
\]
The least squares solution minimizes the sum of squared vertical distances from points to the line  --  a criterion that emerges naturally from the Euclidean inner product structure.

For specific values $(x_1,y_1)=(1,2)$, $(x_2,y_2)=(2,3)$, and $(x_3,y_3)=(3,5)$, we form:
\[
    A = \begin{bmatrix}
    1 & 1 \\
    2 & 1 \\
    3 & 1
    \end{bmatrix}
    \quad\text{and}\quad
    \vect{b} = \begin{pmatrix}
    2 \\ 3 \\ 5
    \end{pmatrix}
\]

Computing $A^TA$ and $A^T\vect{b}$:
\[
    A^TA = \begin{bmatrix}
    14 & 6 \\
    6 & 3
    \end{bmatrix}
    \quad\text{and}\quad
    A^T\vect{b} = \begin{pmatrix}
    23 \\ 10
    \end{pmatrix}
\]

The normal equations yield:
\[
    \begin{bmatrix}
    14 & 6 \\
    6 & 3
    \end{bmatrix}
    \begin{pmatrix}
    m \\ b
    \end{pmatrix}
    =
    \begin{pmatrix}
    23 \\ 10
    \end{pmatrix}
\]

Solving, we find $m=1.5$ and $b=0.5$, giving $y=1.5x+0.5$ as our best-fit line.
\end{example}

The least squares method extends naturally beyond simple curve fitting. When the columns of $A$ represent basis functions, we obtain general linear models:
\[
    f(x) = c_1\phi_1(x) + c_2\phi_2(x) + \cdots + c_n\phi_n(x)
\]
The coefficients $c_i$ emerge from the least squares solution, providing optimal approximation in the chosen basis. Different choices of basis functions $\phi_i$ yield different approximation schemes:
\begin{itemize}
    \item Polynomials for smooth functions
    \item Trigonometric functions for periodic data
    \item Wavelets for localized features
    \item Splines for piecewise smooth approximation
\end{itemize}

\begin{example}[Polynomial Fitting]
To fit a quadratic $f(x)=ax^2+bx+c$ to data points $(x_i,y_i)$, we form the Vandermonde matrix:
\[
    A = \begin{bmatrix}
    x_1^2 & x_1 & 1 \\
    x_2^2 & x_2 & 1 \\
    \vdots & \vdots & \vdots \\
    x_n^2 & x_n & 1
    \end{bmatrix}
\]
The least squares solution $\vect{c}=(a,b,c)^T$ minimizes $\sum_{i=1}^n(y_i-f(x_i))^2$. Through the pseudoinverse $A^{\dagger}$, this solution navigates the fundamental subspaces to provide the optimal approximation.
\end{example}

This geometric understanding transforms least squares from a computational method into a theoretical principle. When exact solutions do not exist, orthogonal projection onto the available solution space provides the best possible approximation under natural measures of error. Through the pseudoinverse, the abstract subspaces of the Fundamental Theorem  --  the image $\im A$ and its orthogonal complement $(\im A)^{\perp}$  --  translate directly into optimal approximation methods.

The full power of this approach emerges through the explicit connection to the four fundamental subspaces. The least squares solution lives in $(\ker A)^{\perp}$, making it the unique minimum-norm solution. The residual $\vect{b} - A\hat{\vect{x}}$ lies in $(\im A)^{\perp}$, giving it the geometric interpretation as the component of $\vect{b}$ that cannot be represented in the model space. This orthogonal decomposition:
\[
    \vect{b} = A\hat{\vect{x}} + (\vect{b} - A\hat{\vect{x}})
\]
expresses precisely the projection onto the fundamental subspaces guaranteed by the Fundamental Theorem. The pseudoinverse $A^{\dagger}$ transforms this abstract decomposition into concrete computational methods for finding optimal approximations.


\begin{marginfigure}
{\em Foreshadowing:} The Singular Value Decomposition in Chapter \ref{ch:10} will provide an even more powerful framework for analyzing and solving least squares problems, revealing the full geometric structure of approximate solutions.
\end{marginfigure}

% ==============================================
\section{Regularized Least Squares}
\label{sec:regularized}
% ==============================================

Real data harbors noise. The elegant framework of least squares approximation, though mathematically complete, can prove fragile when confronted with measurement errors and uncertainty. Consider fitting a polynomial to noisy samples  --  increasing the degree improves the fit to our data points but may produce wild oscillations between them. This tension between fidelity to measurements and smoothness of solutions suggests a modification to our geometric framework, one that tames such instabilities while preserving the essential character of orthogonal projection.

The source of trouble lies in our unconstrained pursuit of minimal error. Given noisy measurements, the least squares solution may achieve a deceptively small residual by contorting itself to match the noise rather than the underlying signal. We require some means of favoring simpler, more stable solutions  --  a preference that we can encode through geometry.

\begin{example}[Polynomial Overfitting]
Consider fitting polynomials of increasing degree to samples of $f(x)=\cos(2\pi x)$ on $[0,1]$ with small random errors. The least squares solution tracks the data points with increasing precision as degree grows, but at the cost of violent oscillations between samples. Though each fit minimizes squared error, higher-degree solutions appear increasingly unstable.
\begin{marginfigure}
{\em Foreshadowing:} This balance between fitting data and maintaining simplicity appears throughout mathematics and engineering. We shall encounter it again when studying data analysis in later chapters.
\end{marginfigure}
\end{example}

Ridge regression provides an elegant solution through a simple modification of our least squares framework. Rather than minimizing only the error $\|A\vect{x}-\vect{b}\|^2$, we add a term penalizing large coefficients:
\[
    \min_{\vect{x}}\left(\|A\vect{x}-\vect{b}\|^2 + \lambda\|\vect{x}\|^2\right)
\]
where $\lambda>0$ controls the strength of regularization. This augmented objective retains the geometric character of our previous development  --  it measures not just distance to the data but also distance from the origin in the solution space.

From a theoretical perspective, ridge regression replaces the pseudoinverse $A^{\dagger}$ with a regularized version $(A^TA + \lambda I)^{-1}A^T$, which sacrifices exact optimality for improved stability and conditioning. This modified pseudoinverse balances the minimum-norm solution against regularization constraints, effectively shrinking coefficients toward zero while still respecting the underlying orthogonal structure.

The geometric interpretation proves illuminating. Pure least squares projects $\vect{b}$ onto the column space of $A$. Ridge regression shifts this projection, pulling solutions toward the origin through an additional orthogonality constraint. The parameter $\lambda$ controls this shift: larger values favor smaller coefficients at the cost of larger residuals.

\begin{lemma}[Ridge Solution]
\label{lem:ridge}
The minimizer of the ridge regression objective satisfies the modified normal equations:
\[
    (A^TA + \lambda I)\vect{x} = A^T\vect{b}
\]
This solution has smaller coefficients than the pure least squares solution but generally larger residual error.
\end{lemma}

\begin{proof}
The objective function $f(\vect{x})=\|A\vect{x}-\vect{b}\|^2 + \lambda\|\vect{x}\|^2$ is minimized where its derivative vanishes. Expanding using properties of inner products:
\[
\begin{array}{rcl}
f(\vect{x}) &=& \langle A\vect{x}-\vect{b},A\vect{x}-\vect{b}\rangle + \lambda\langle\vect{x},\vect{x}\rangle \\
&=& \langle A\vect{x},A\vect{x}\rangle - 2\langle A\vect{x},\vect{b}\rangle + \|\vect{b}\|^2 + \lambda\|\vect{x}\|^2
\end{array}
\]
Setting the derivative with respect to $\vect{x}$ to zero:
\[
2A^TA\vect{x} - 2A^T\vect{b} + 2\lambda\vect{x} = \vect{0}
\]
Rearranging yields the modified normal equations:
\[
    (A^TA + \lambda I)\vect{x} = A^T\vect{b}
\]
This regularized system always has a unique solution, even when $A^TA$ is singular, as the added term $\lambda I$ ensures positive definiteness. The solution implements a form of biased pseudoinverse:
\[
\vect{x} = (A^TA + \lambda I)^{-1}A^T\vect{b}
\]
which trades unbiasedness for reduced variance in parameter estimates. The parameter $\lambda$ controls this bias-variance tradeoff, with larger values producing smaller coefficients but potentially larger residual error.

\end{proof}

\begin{example}[Signal Smoothing]
Consider smoothing a noisy time series through local polynomial fitting. Pure least squares produces fits that track noise too closely. Ridge regression, by penalizing large coefficients, yields smoother approximations that better capture underlying trends while suppressing high-frequency noise. The parameter $\lambda$ provides direct control over this smoothing effect.
\end{example}

The choice of regularization parameter $\lambda$ embodies the fundamental tradeoff between fitting data and maintaining stability. Small values yield solutions close to pure least squares; large values force solutions toward zero. No universal choice exists  --  the appropriate balance depends on noise levels, problem structure, and ultimate purpose.
%
\begin{marginfigure}
{\em Example:} In polynomial fitting, larger $\lambda$ values increasingly suppress higher-degree terms, effectively limiting the complexity of the fitted function regardless of formal degree.
\end{marginfigure}

This modification of least squares  --  this regularization through additional geometric constraints  --  exemplifies a broader principle in computational mathematics. When theoretical elegance meets practical complexity, we often find success not by abandoning our framework but by carefully augmenting it. The geometric intuition that guided our development of least squares proves robust enough to accommodate these practical concerns while maintaining its essential character.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Signal Processing: From Data to Information}
\label{EM:signal}
% **************** EMANATION *******************

The passage from raw measurements to meaningful information defines modern signal processing. Every sensor --- whether measuring temperature, pressure, acceleration, or electromagnetic waves --- provides data whose natural representation lies in a quotient space modulo measurement noise. The mathematical framework we have built in this chapter, particularly orthogonal projection, pseudoinverse computation, and their regularized variants, provides the tools to extract signal from noise through systematic decomposition of spaces.

Consider a temperature sensor sampling at regular intervals. We represent its measurements as a vector $\vect{y}\in\R^n$ whose components combine true temperature with random fluctuations:
\[
    \vect{y} = \vect{s} + \vect{n}
\]
where $\vect{s}$ represents the underlying signal and $\vect{n}$ the noise. Our task is to recover $\vect{s}$ from $\vect{y}$ --- a challenge that leads naturally to the pseudoinverse operations studied in Section \ref{sec:pseudoinverse}.

The key insight is that real temperature variations typically occur more slowly than random measurement noise. We can express this mathematically by assuming $\vect{s}$ lies near a subspace spanned by low-degree polynomials. If we represent our basis as columns of a matrix $A$, then the optimal recovery becomes:
\[
    \hat{\vect{s}} = A(A^TA)^{-1}A^T\vect{y} = A A^{\dagger}\vect{y}
\]
This is precisely the orthogonal projection through the pseudoinverse, selecting the optimal point in our model subspace.

For any window of time points $[t_k-w,t_k+w]$, we consider the subspace $\poly_d$ of polynomials of degree at most $d$. The basis polynomials $\{1,t,t^2,\ldots,t^d\}$ provide directions for decomposing our signal, though for numerical stability we often use orthogonal polynomials as discussed in Section \ref{sec:orthogonal}. Each local smoothing operation becomes an application of the pseudoinverse, with ridge regularization providing the stability studied in Section \ref{sec:regularized}.

\begin{example}[Temperature Monitoring]
Consider hourly temperature measurements from a chemical reactor represented as $\vect{y}\in\R^{24}$. Raw sensor data shows rapid fluctuations from measurement noise superimposed on the true temperature trends. The space of quadratic polynomials $\poly_2$ provides a natural subspace for local approximation --- its three-dimensional structure captures constant levels, linear trends, and gentle curvature while excluding higher-frequency noise.

For our design matrix $A$ whose columns represent basis polynomials evaluated at our sampling points, the pseudoinverse $A^{\dagger} = (A^TA)^{-1}A^T$ gives us exactly the optimal recovery operator. When the problem becomes ill-conditioned due to closely spaced samples, the regularized pseudoinverse $(A^TA + \lambda I)^{-1}A^T$ stabilizes our solution while maintaining near-optimality. The regularization parameter $\lambda$ controls the balance between fitting fidelity and coefficient stability, precisely as analyzed in Section \ref{sec:regularized}.
\end{example}

The quotient space perspective proves particularly illuminating. Two temperature signals that differ only by high-frequency noise belong to the same equivalence class in an appropriate quotient space. The pseudoinverse operators we have developed provide a systematic way to select canonical representatives from these equivalence classes --- those representatives minimizing both approximation error and coefficient magnitude.

\begin{example}[Electrocardiogram Processing]
An ECG signal represented as $\vect{y}\in\R^n$ contains both high-frequency noise and sharp features (the QRS complex) that must be preserved. Simple pseudoinverse projection would either retain too much noise or blur important peaks. The solution emerges from adaptive regularization --- by varying the regularization parameter $\lambda$ based on local signal properties, we can tune the pseudoinverse to respect the local structure.

Specifically, in regions with sharp transitions, we reduce regularization to preserve detail, while in flatter regions we increase regularization to suppress noise. This adaptive approach demonstrates how the pseudoinverse framework from Section \ref{sec:pseudoinverse} guides practical algorithm design beyond simple least squares.
\end{example}

The framework extends naturally to multidimensional signals through the tensor product constructions implicit in our treatment of matrix spaces. Consider an array of pressure sensors monitoring structural loads on an aircraft wing. Their readings form a matrix $Y\in\R^{m\times n}$, but we expect the true pressure field to vary smoothly across the wing's surface. Two-dimensional polynomial fitting becomes a pseudoinverse problem with a structured design matrix, with regularization ensuring stability just as in the one-dimensional case.

The fundamental principle remains: meaningful information often lives in a lower-dimensional subspace than raw measurements. By carefully choosing these subspaces and using the pseudoinverse and regularized projections developed in this chapter, we separate signal from noise in a mathematically principled way. The geometric intuition we have built --- orthogonality, pseudoinverses, and regularized projection --- provides the foundation for this essential task of modern engineering.

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Image Processing: Pixels \& Polynomials}
\label{EM:image}
% **************** EMANATION *******************

Digital images present a natural domain for applying the orthogonal decomposition and pseudoinverse methods developed in this chapter. Each grayscale image is represented by a matrix $F\in\R^{m\times n}$ whose entries combine meaningful content with measurement noise. The mathematical framework we have built --- particularly the pseudoinverse operations of Section \ref{sec:pseudoinverse} and regularized least squares of Section \ref{sec:regularized} --- provides systematic tools for image enhancement through optimal approximation.

Consider the fundamental problem of image smoothing. The key insight parallels our development of the pseudoinverse: most natural images are locally well-approximated by elements of low-dimensional subspaces. Just as Section \ref{sec:pseudoinverse} showed how the pseudoinverse provides optimal projection onto compatible subspaces, we can enhance images through careful application of local pseudoinverse operations.

\begin{example}[Local Surface Fitting]
At each pixel location $(i,j)$, a neighborhood of intensities defines a vector that we fit using the pseudoinverse of an appropriate design matrix. The basis polynomials
\[
    \{\vect{b}_{k\ell}(x,y) = (x-i)^k(y-j)^\ell : k+\ell\leq d\}
\]
form the columns of our design matrix $A$. Computing $A^{\dagger} = (A^TA)^{-1}A^T$ gives us the optimal fitting operator for this polynomial space. As in Section \ref{sec:regularized}, the regularized pseudoinverse $(A^TA + \lambda I)^{-1}A^T$ prevents oscillations near edges where the polynomial model becomes strained.
\end{example}

The geometric interpretation proves particularly illuminating. Each local fit implements the pseudoinverse operators developed in Section \ref{sec:pseudoinverse}, mapping patches of image data onto carefully chosen polynomial subspaces. The regularization parameter $\lambda$ shapes this process, pulling solutions toward simpler polynomial coefficients just as ridge regression stabilized the general least squares problems analyzed in Section \ref{sec:regularized}.

\begin{example}[Medical Imaging]
Consider X-ray images used in medical diagnosis, represented as matrices $F\in\R^{m\times n}$ containing quantum noise from photon counting statistics. The space of diagnostic features forms a natural quotient modulo this noise structure. The pseudoinverse computation, regularized appropriately to preserve clinically relevant details, provides a systematic way to select canonical representatives from these equivalence classes. The framework of Section \ref{sec:pseudoinverse} transforms parameter selection from art to principled mathematics.
\end{example}

The pseudoinverse perspective introduced in Section \ref{sec:pseudoinverse} provides particular insight for image inpainting --- reconstructing missing or corrupted pixels. When some pixels are missing, the design matrix $A$ contains rows corresponding only to the observed pixels. The pseudoinverse $A^{\dagger}$ provides the optimal reconstruction that fits observed data while minimizing coefficient complexity. Since the pseudoinverse minimizes both the residual and the solution norm, it naturally balances fidelity to surrounding data against solution simplicity.

Color images introduce additional structure through their multiple channels. An RGB image comprises three matrices $F_R, F_G, F_B\in\R^{m\times n}$ that benefit from joint analysis. The pseudoinverse methods developed in this chapter extend naturally:
\begin{itemize}
    \item Channel coupling through appropriate block-structured design matrices
    \item Joint regularization preserving color consistency
    \item Pseudoinverse computation respecting inter-channel relationships
\end{itemize}

\begin{example}[Satellite Imaging]
Satellite imagery provides a striking example of multi-channel data, with each spectral band yielding a matrix $F_k\in\R^{m\times n}$. The meaningful information often lies in a lower-dimensional subspace than the raw measurements. A properly structured pseudoinverse computation, implementing the framework developed in Section \ref{sec:pseudoinverse}, separates essential features from sensor noise while respecting the coupling between spectral bands.

This multi-channel regularization represents a natural extension of the basic pseudoinverse operations to handle the structured redundancy across different spectral measurements. Just as the pseudoinverse finds the minimum-norm solution to an underdetermined system, these multi-spectral methods find the solution with minimum cross-channel variation.
\end{example}



% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Support Vector Machines: Geometry of Optimal Separation}
\label{EM:svm}
% **************** EMANATION *******************

The principles of orthogonal decomposition, pseudoinverse computation, and optimal approximation illuminate a fundamental challenge in engineering: how can we best separate data into distinct classes? Support Vector Machines [SVMs] approach this question through the geometry of separating hyperplanes, transforming classification into a problem deeply connected to the pseudoinverse operators and optimization methods developed in this chapter.

\begin{marginfigure}
{\em Think:} Like the pseudoinverse operations of Section \ref{sec:pseudoinverse}, the optimal separating hyperplane minimizes a combination of fit error and solution complexity. The balance between these objectives parallels the regularization studied throughout this chapter.
\end{marginfigure}

Consider points $\{\vect{x}_i\}_{i=1}^n$ in $\R^d$ labeled as either positive ($y_i=+1$) or negative ($y_i=-1$). Just as we sought optimal subspaces for approximating data, we now seek an optimal hyperplane for separating classes. This geometric viewpoint transforms classification from abstract pattern matching into concrete optimization of distances and projections.

A separating hyperplane has form $\{\vect{x}: \vect{w}^T\vect{x} + b = 0\}$ where $\vect{w}$ provides the normal direction. This geometric object naturally decomposes space into positive and negative halfspaces through the sign of $\vect{w}^T\vect{x} + b$. The distance from any point $\vect{x}$ to this hyperplane emerges through orthogonal projection:
%
\begin{marginfigure}
{\em Example:} In quality control, sensor measurements of products must be classified as acceptable or defective. The SVM's maximum margin principle provides robustness against measurement noise, similar to how the pseudoinverse provides stability in sensing applications.
\end{marginfigure}
%
\[
    d(\vect{x}) = \frac{|\vect{w}^T\vect{x} + b|}{\|\vect{w}\|}
\]

The geometric insight of SVMs lies in maximizing the \style{margin} --- the minimum distance between the hyperplane and any training point. After rescaling $\vect{w}$ and $b$ to ensure this minimum distance equals $1/\|\vect{w}\|$, we seek to minimize $\|\vect{w}\|^2$ subject to correct classification of all points. The resulting optimization mirrors the trade-offs encountered in pseudoinverse computation: we balance the robustness gained from a large margin (analogous to a small solution norm) against the constraints of fitting our training data (analogous to residual minimization).

When data proves inseparable by any single hyperplane, we introduce slack variables $\xi_i$ measuring the degree of misclassification. The optimization becomes:
\[
    \min_{\vect{w},b,\boldsymbol{\xi}} \frac{1}{2}\|\vect{w}\|^2 + C\sum_{i=1}^n \xi_i 
    \quad\text{subject to}\quad y_i(\vect{w}^T\vect{x}_i + b) \geq 1 - \xi_i
\]
The parameter $C$ controls our balance between margin size and classification errors --- exactly paralleling how regularization balanced fit against stability in the pseudoinverse computations of Section \ref{sec:pseudoinverse}.

Consider how this framework guides medical image classification. Each scan becomes a point in high-dimensional space where coordinates represent intensities, textures, and shapes extracted from the image. The SVM finds an optimal separating hyperplane through principles similar to pseudoinverse computation. The points exactly satisfying the margin constraints --- the support vectors --- identify the most diagnostically challenging cases. Their special role emerges naturally from the geometry: these boundary cases completely determine the optimal hyperplane, while other points could be moved within their halfspace without affecting the solution.

\begin{marginfigure}
{\em Nota bene:} The support vectors that determine the margin reflect a deeper sparsity principle similar to how the pseudoinverse provides minimum-norm solutions. Both approaches identify the essential components that dictate optimal behavior.
\end{marginfigure}

This geometric perspective transforms our approach to structural health monitoring. Vibration sensors on bridges and buildings generate streams of acceleration data requiring rapid classification as either normal or potentially dangerous. Features extracted from these signals become coordinates in a high-dimensional space where the SVM constructs its separating hyperplane. The margin provides crucial tolerance against sensor noise and environmental variation --- we need decisions that remain reliable despite uncertainty in our measurements. Points near the margin identify borderline structural states warranting particular attention, while the slack variables allow for occasional constraint violations without compromising overall robustness.

The solution reflects the Fundamental Theorem's decomposition of space as elaborated through the pseudoinverse in Section \ref{sec:pseudoinverse}. Once we have found our optimal hyperplane, its normal vector $\vect{w}$ naturally splits $\R^d$ into the direct sum of a line (parallel to $\vect{w}$) and a hyperplane (the space of directions parallel to the decision boundary). This geometric decomposition guides both our understanding and our computations: distances to the hyperplane measure confidence in our predictions, while the margin quantifies robustness against input perturbations.
% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 6}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}

\item For vectors $\vect{v}_1 = (1,1,1)^T$ and $\vect{v}_2 = (1,2,-1)^T$ in $\R^3$:
(a) Find a vector $\vect{w}$ that is orthogonal to both $\vect{v}_1$ and $\vect{v}_2$
(b) Verify that $\{\vect{v}_1,\vect{v}_2,\vect{w}\}$ forms a basis for $\R^3$
(c) Express $(2,3,1)^T$ as a linear combination of these basis vectors using orthogonal projections

\item Let $U$ be the subspace of $\R^3$ spanned by $\vect{u}_1=(1,1,0)^T$ and $\vect{u}_2=(0,1,1)^T$. 
(a) Find a basis for $U^\perp$
(b) Compute the orthogonal projection of $\vect{v}=(2,1,3)^T$ onto $U$
(c) Verify that $\vect{v} - \proj{U}\vect{v}$ lies in $U^\perp$

\item In the vector space $\poly_2$ with inner product $\langle f,g\rangle = \int_0^1 f(x)g(x)\,dx$:
(a) Show that $1$ and $x-\frac{1}{2}$ are orthogonal polynomials
(b) Use the Gram-Schmidt process to extend these to an orthogonal basis for $\poly_2$
(c) Find the orthogonal projection of $x^2$ onto $\spanset\{1,x-\frac{1}{2}\}$

\item Let $A = \begin{bmatrix}1 & 2 & 1\\2 & 1 & 0\\1 & 0 & 1\end{bmatrix}$. Find:
(a) A basis for the column space of $A$
(b) A basis for the null space of $A$
(c) Show these spaces are orthogonal complements by computing appropriate inner products

\item Consider fitting the line $y=mx+b$ to points $(0,1)$, $(1,3)$, and $(2,2)$.
(a) Set up the normal equations for this least squares problem
(b) Solve for $m$ and $b$
(c) Find the residual vector and verify it is orthogonal to the column space of the coefficient matrix

\item Consider the data points $(1,0)$, $(2,2)$, $(3,1)$, $(4,4)$. Find:
(a) The least squares line $y=mx+b$
(b) The orthogonal projection of the data vector onto the column space of the coefficient matrix
(c) The residual vector and verify it is orthogonal to both $1$ and $x$

\item Let $P$ be the orthogonal projection onto a subspace $U$ of an inner product space $V$. Prove that:
(a) $P^2 = P$
(b) $P^* = P$
(c) $\|P\vect{v}\| \leq \|\vect{v}\|$ for all $\vect{v}\in V$

\item Prove that if $U$ and $W$ are orthogonal subspaces of an inner product space $V$, then:
\[
    \|\vect{u}+\vect{w}\|^2 = \|\vect{u}\|^2 + \|\vect{w}\|^2
\]
for any $\vect{u}\in U$ and $\vect{w}\in W$. Use this to explain why the Pythagorean theorem holds for orthogonal projections.

\item Let $V$ be a finite-dimensional inner product space and $U<V$ a subspace. Prove that:
(a) $(U^\perp)^\perp = U$
(b) $\dim U + \dim U^\perp = \dim V$
(c) Every vector in $V$ can be uniquely written as a sum of vectors from $U$ and $U^\perp$

\item For vectors $\vect{x},\vect{y}\in\R^n$ and $\alpha>0$, consider the regularized least squares problem:
\[
    \min_{\vect{v}\in\R^n} \|\vect{x}-\vect{v}\|^2 + \alpha\|\vect{v}-\vect{y}\|^2
\]
(a) Show this has a unique solution
(b) Find an explicit formula for the solution in terms of $\vect{x}$, $\vect{y}$, and $\alpha$
(c) Interpret this solution as an interpolation between $\vect{x}$ and $\vect{y}$

\item Consider the regularized least squares problem:
\[
    \min_{\vect{x}} \|A\vect{x}-\vect{b}\|^2 + \lambda\|\vect{x}\|^2
\]
Prove that as $\lambda\to\infty$, the solution approaches $\vect{0}$, while as $\lambda\to 0^+$, the solution approaches the minimum norm solution of the original least squares problem.

\item For a linear transformation $T:V\to W$ between inner product spaces, prove that $\ker T$ and $\im T^*$ are orthogonal complements in $V$. What does this tell you about the relationship between solutions to $T\vect{x}=\vect{b}$ and $T^*T\vect{x}=T^*\vect{b}$?

\item Let $U$ be a subspace of an inner product space $V$. Show that $\vect{v}\in V$ has minimum norm among all vectors in its coset $\vect{v}+U$ if and only if $\vect{v}\perp U$.

\item For a matrix $A$, show that the orthogonal projection onto $\column(A)$ is given by $A(A^TA)^{-1}A^T$ when $A$ has full column rank. Use this to explain why the least squares solution minimizes the residual norm.

\item Let $V$ be the space of continuous functions on $[0,1]$ with inner product $\langle f,g\rangle = \int_0^1 f(x)g(x)\,dx$. Show that:
(a) The subspace $U$ of functions satisfying $f(0)=f(1)$ is closed under addition and scalar multiplication
(b) Find a nonzero function in $U^\perp$
(c) Describe geometrically what it means for a function to be orthogonal to all functions in $U$

\item Consider training data consisting of hourly temperature readings $T$ and energy usage $E$ (in kWh) from a building's HVAC system:
\[
    (T,E) = \{(68,42), (72,45), (75,48), (71,44), (69,43), (74,47)\}
\]
The facilities manager believes that measurement errors in temperature readings are twice as significant as those in energy readings. Set up and solve an appropriate weighted least squares problem to find the best linear relationship $E = aT + b$.

\item Let $A$ be an $m\times n$ matrix with $m > n$ and let $\vect{b}$ be a vector such that $A\vect{x}=\vect{b}$ has no solution. Prove that among all vectors $\vect{y}$ satisfying $A\vect{x}=\vect{y}$, the vector $\proj{\im A}\vect{b}$ is closest to $\vect{b}$ in Euclidean norm.

\item A manufacturer measures the tensile strength $y$ of a material at different temperatures $x$, obtaining data points:
\[
    (x,y) = \{(20,45), (30,42), (40,38), (50,35), (60,30), (70,28)\}
\]
Theory suggests the relationship should be of form $y = ae^{bx}$. Show how to transform this into a linear least squares problem and solve for $a$ and $b$.

\item Let $P$ and $Q$ be orthogonal projections onto subspaces $U$ and $W$ respectively. Show that $PQ$ is itself an orthogonal projection if and only if $PQ=QP$. What does this tell you about the relationship between $U$ and $W$?

\item An engineer measures signals $s(t)$ contaminated by periodic noise of known frequency $\omega$. The measurements at times $t_1,\ldots,t_n$ are:
\[
    y_i = s(t_i) + a\cos(\omega t_i) + b\sin(\omega t_i) + \epsilon_i
\]
where $\epsilon_i$ represents random error. Show how to use orthogonal projections to estimate and remove the periodic components, leaving a cleaner estimate of the original signal $s(t)$.

\end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THE LUVAH CYCLE
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\cleardoublepage
\thispagestyle{empty} % no headers/footers

\begin{fullwidth}
  \vspace*{\fill} % push content down to center vertically
  \centering
  \includegraphics[width=0.75\textwidth]{LUVAH.png} % adjust width as needed
  \vspace*{\fill} % push content up to center vertically
\end{fullwidth}

\cleardoublepage
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Diagonalization \& Dynamics}
\label{ch:7}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``how is it that all things are chang'd even as in ancient times''}

\newthought{A transformation in perpetual action} reveals patterns hidden from static view. A mass-spring system oscillates with characteristic frequencies; a population grows or declines at intrinsic rates; a network's influence flows along preferred channels. These seemingly distinct phenomena share a common mathematical essence: certain directions remain invariant under repeated transformation, while vectors along these special directions experience pure scaling. Such fixed directions and their associated scaling factors  --  \style{eigenvectors} and \style{eigenvalues}  --  provide the key to understanding how linear transformations act over time.

Consider a simple linear recurrence modeling population growth: each generation's size is a fixed multiple of the last. The population either grows exponentially or decays to extinction, depending on whether this multiplier exceeds unity. Though elementary, this example contains the seed of a deeper truth: the long-term behavior of linear systems often reduces to pure scaling along special directions. These particular directions and their scaling factors determine not just whether populations thrive or perish, but how mechanical systems vibrate, how heat diffuses, how quantum states evolve, and how networks transmit influence.

The search for such invariant directions leads us to \style{characteristic polynomials}  --  equations whose roots reveal the natural scaling factors of a transformation. These \style{eigenvalues}, together with their associated \style{eigenvectors}, provide a new perspective on linear transformations. We seek coordinates that aligned with intrinsic scaling; when such exist, the transformation assumes its most primal diagonal form.

This diagonalization  --  this alignment of coordinates with natural directions  --  does more than simplify computation. It reveals the essence of how transformations act, decomposing complex motion into simpler components. The vibration of a drum becomes a superposition of pure tones; the flow of heat resolves into independent decay modes. What appears complicated in one basis becomes elementary when seen through the lens of diagonalization.

% ==============================================
\section{The First Order}
\label{sec:firstorder}
% ==============================================

The simplest differential equation from calculus serves as prototype for all continuous-time linear systems. Consider the equation
\begin{equation}
\label{eq:1stODE}
    \frac{dx}{dt} = \lambda x
\end{equation}
\begin{marginfigure}
    {\em Notation:} Using the differential operator $D=d/dt$ will prove advantageous later. 
\end{marginfigure}
where $\lambda$ is a constant. From calculus, we know the general solution is an exponential:
\[
    x(t) = x_0 e^{\lambda t}
\]
where $x_0$ is a constant interpretable as the \style{initial condition} $x_0 = x(0)$. This elementary equation contains the seeds of deeper structure since the solutions to (\ref{eq:1stODE}) are closed under addition and scalar multiplication. As such, the simple solution $e^{\lambda t}$ serves as a basis for the 1-dimensional solution space to $(\ref{eq:1stODE})$. 

Note the centrality of the constant $\lambda$ in determining the qualitative behavior of all the solutions in the solution space:
\begin{itemize}
    \item When $\lambda > 0$, solutions grow exponentially
    \item When $\lambda < 0$, solutions decay exponentially
    \item When $\lambda = 0$, solutions remain constant
\end{itemize}
%
\begin{marginfigure}
{\em Think:} The solution corresponding to $x_0=0$ is special, in that even when other solutions grow or shrink, it remains constant. Such \style{equilibrium} solutions are central objects of study in dynamical systems.
\end{marginfigure}

This structure  --  exponential solutions parametrized by a characteristic number $\lambda$  --  will recur throughout this and the following two chapters. More complex systems will decompose into collections of such fundamental solutions, each growing or decaying at its own characteristic rate. The art lies in finding these natural modes of behavior.
%

The parameter $\lambda$ appearing here is the characteristic value that determines the natural rate of growth or decay in the ODE. It is not initially obvious, but such $\lambda$ values are connected at the deepest levels to the algebra of matrices and linear transformations. 

% ==============================================
\section{Coupled First-Order Systems}
\label{sec:coupled}
% ==============================================

Real systems rarely evolve in isolation. A predator population depends on its prey; stock prices move with market sectors; neurons fire in vast interconnected networks. Even the simplest model tends to track at least two interacting variables. Consider $x(t)$ and $y(t)$ whose evolution depends linearly on a combination of present states:
\[
\displaystyle\frac{dx}{dt} = ax + by \quad : \quad \displaystyle\frac{dy}{dt} = cx + dy
\]
where $a,b,c,d$ are constants. Unlike the scalar case, no obvious solution presents itself. Writing this linear system in matrix form reveals a familiar pattern:
\[
\displaystyle\frac{d}{dt}\begin{pmatrix}x\\y\end{pmatrix} = 
\begin{bmatrix}a & b \\ c & d\end{bmatrix}
\begin{pmatrix}x\\y\end{pmatrix}
\]
This matrix form suggests a special case worth examining. Suppose the matrix were diagonal:
\[
\displaystyle\frac{d}{dt}\begin{pmatrix}x\\y\end{pmatrix} = 
\begin{bmatrix}\lambda_1 & 0 \\ 0 & \lambda_2\end{bmatrix}
\begin{pmatrix}x\\y\end{pmatrix}
\quad \Leftrightarrow \quad
% \]
% Then our coupled system would \style{decouple} into independent scalar equations:
% \[
\frac{dx}{dt} = \lambda_1 x 
\,\, : \,\, 
\frac{dy}{dt} = \lambda_2 y
\]
each solvable by the methods of the previous section. The solution would be pure exponential growth or decay along each coordinate axis:
\[
\begin{pmatrix}x(t)\\y(t)\end{pmatrix} 
= 
\begin{pmatrix}c_1e^{\lambda_1 t}\\c_2e^{\lambda_2 t}\end{pmatrix}
\]
where $c_1$ and $c_2$ are determined by initial conditions.

This observation  --  that diagonal systems decompose into independent scalar equations  --  suggests a strategy. If we could somehow transform our original system into diagonal form, its solution would reduce to pure exponentials: we seek coordinates that reveal the hidden diagonal structure lurking within coupled systems.

\begin{example}[Chemical Reaction]
Consider two chemical species with concentrations $x_A$ and $x_B$ that interact through a simple reaction network in matrix form:
\[
\frac{d}{dt}\begin{pmatrix}x_A\\x_B\end{pmatrix} 
= 
\begin{bmatrix}-k_1 & k_2 \\ k_1 & -k_2\end{bmatrix}
\begin{pmatrix}x_A\\x_B\end{pmatrix}
\]
where $k_1,k_2 > 0$ are reaction rate constants. A curious change of coordinates reveals hidden simplicity. Consider the invertible transformation
\[
\begin{pmatrix}x_A\\x_B\end{pmatrix} = 
\begin{bmatrix}1 & 1 \\ k_2 & -k_1\end{bmatrix}
\begin{pmatrix}u\\v\end{pmatrix}
\quad\Longleftrightarrow\quad
\begin{pmatrix}u\\v\end{pmatrix} = 
\begin{bmatrix}1 & 1 \\ k_2 & -k_1\end{bmatrix}^{-1}
\begin{pmatrix}x_A\\x_B\end{pmatrix}
\]
We can convert the differential equations by means of a similarity transformation:
\begin{align*}
\frac{d}{dt}
\begin{pmatrix}u\\v\end{pmatrix} 
& = 
\frac{d}{dt}
\left(
\begin{bmatrix}1 & 1 \\ k_2 & -k_1\end{bmatrix}^{-1}
\begin{pmatrix}x_A\\x_B\end{pmatrix}
\right)
= 
\begin{bmatrix}1 & 1 \\ k_2 & -k_1\end{bmatrix}^{-1}
\frac{d}{dt}
\begin{pmatrix}x_A\\x_B\end{pmatrix}
\\
& =
\begin{bmatrix}1 & 1 \\ k_2 & -k_1\end{bmatrix}^{-1}
\begin{bmatrix}-k_1 & k_2 \\ k_1 & -k_2\end{bmatrix}
\begin{pmatrix}x_A\\x_B\end{pmatrix}
\\
& = 
\begin{bmatrix}1 & 1 \\ k_2 & -k_1\end{bmatrix}^{-1}
\begin{bmatrix}-k_1 & k_2 \\ k_1 & -k_2\end{bmatrix}
\begin{bmatrix}1 & 1 \\ k_2 & -k_1\end{bmatrix}
\begin{pmatrix}u\\v\end{pmatrix}
= 
\begin{bmatrix}0 & 0 \\ 0 & -k_1-k_2\end{bmatrix}
\begin{pmatrix}u\\v\end{pmatrix}
\end{align*}
This diagonal matrix has diagonal entries $0$ for $u$ and $-(k_1+k_2)<0$ for $v$. Thus $u$ remains constant while $v$ decays exponentially. The solution in original coordinates is computed as:
\[
\begin{pmatrix}x_A\\x_B\end{pmatrix} 
= 
\begin{bmatrix}1 & 1 \\ k_2 & -k_1\end{bmatrix}
\begin{pmatrix}u\\ v\end{pmatrix} 
=
\begin{pmatrix}
    c_1+c_2e^{-(k_1+k_2)t} \\
    c_1k_2-c_2k_1e^{-(k_1+k_2)t}
\end{pmatrix}
\]
where $c_1$ and $c_2$ depend on initial conditions. This ``magical'' change of coordinates anticipates deeper structure -- the zero on the diagonal hints that there is a conservation principle at work (conservation of mass in this case). 
\end{example}


The search for such diagonal representations will guide our development in the sections ahead. We shall discover that many coupled systems admit transformation to diagonal form, where their behavior becomes transparent. Those that resist such diagonalization will require more subtle analysis, leading us to the Jordan canonical form of Chapter \ref{ch:8}. Throughout, our goal remains to understand how linear systems evolve by finding coordinates that reveal their essential structure.

% ==============================================
\section{Eigenvalues \& Eigenvectors}
\label{sec:eigens}
% ==============================================

The general linear system of ordinary differential equations takes the form
\begin{equation}
\label{eq:linearODE}
    \frac{d\vect{x}}{dt} = A\vect{x}
\end{equation}
where $A$ is a square matrix and $\vect{x}(t)$ is a vector-valued function of time. Such equations arise throughout engineering  --  from mechanical vibrations to chemical kinetics to population dynamics. Their solution, though not immediately apparent, holds the key to understanding how linear systems evolve.

Our experience with scalar equations suggests a strategy. Were $A$ replaced by a scalar $\lambda$, the solution would take the simple exponential form $\vect{x}(t) = e^{\lambda t}\vect{x}_0$. This observation leads to a crucial question: might there exist a 1-d subspace on which the matrix $A$ acts just like scalar multiplication? This would be equivalent to finding a vector $\vect{v}\neq\vect{0}$ such that
\begin{equation}
\label{eq:eigen}
    A\vect{v} = \lambda\vect{v}
\end{equation}
for some scalar $\lambda$. Such a special subspace $\spanset(\vect{v})$, should it exist, would be \style{invariant} under $A$ and, for vectors in this subspace, solving the ODE would reduce to the trivial 1-d case.
%
\begin{marginfigure}
{\em Nota bene:} The restriction $\vect{v}\neq\vect{0}$ is crucial. The zero vector satisfies $A\vect{0}=\lambda\vect{0}$ for any $\lambda$, but tells us nothing about the transformation's behavior.
\end{marginfigure}

Viewing $A$ as a linear transformation illuminates the path forward. Equation (\ref{eq:eigen}) can be rewritten as
\[
    (A-\lambda I)\vect{v} = \vect{0}
\]
where $I$ denotes the identity matrix. For this equation to have a nonzero solution $\vect{v}$, the transformation $(A-\lambda I)$ must have nontrivial kernel. This occurs precisely when $(A-\lambda I)$ fails to be invertible  --  when its determinant vanishes: $\det(A-\lambda I) = 0$. This is the key to the following fundamental definitions.

\begin{definition}[Eigenvalues and Eigenvectors]
\label{def:eigen}
The \style{characteristic polynomial} $p_A(\lambda)$ of a square matrix $A$ is the polynomial
\begin{equation}
\label{eq:charpoly}
    p_A(\lambda) = \det(A-\lambda I)
\end{equation}
A scalar $\lambda$ is called an \style{eigenvalue} of $A$ if it is a root of the characteristic polynomial: $p_A(\lambda)=0$. For each eigenvalue $\lambda$, any nonzero vector $\vect{v}$ satisfying
\begin{equation}
\label{eq:eigen-again}    
    A\vect{v} = \lambda\vect{v}
\end{equation}
is called an \style{eigenvector} corresponding to eigenvalue $\lambda$. 
\end{definition}
\begin{marginfigure}
    {\em Nota bene:} Eigenvalues and eigenvectors are paired -- though uniqueness is not implied.
\end{marginfigure}

\begin{example}
\label{ex:2112}
For a concrete example, consider the matrix
\[
    A = \begin{bmatrix}
    2 & 1 \\
    1 & 2
    \end{bmatrix}
\]
Its characteristic polynomial is
\[
    \det(A-\lambda I) = \begin{vmatrix}
    2-\lambda & 1 \\
    1 & 2-\lambda
    \end{vmatrix}
    = (2-\lambda)^2 - 1 = \lambda^2 - 4\lambda + 3
\]
Setting this equal to zero yields eigenvalues $\lambda_1=3$ and $\lambda_2=1$. For $\lambda_1=3$, we solve $(A-3I)\vect{v}=\vect{0}$:
\[
    \begin{bmatrix}
    -1 & 1 \\
    1 & -1
    \end{bmatrix}
    \begin{pmatrix}
    v_1 \\ v_2
    \end{pmatrix}
    = \vect{0}
    \quad\Rightarrow\quad
    \vect{v}_1 = \begin{pmatrix}
    1 \\ 1
    \end{pmatrix}
\]
Similarly, for $\lambda_2=1$ we find $\vect{v}_2=(1,-1)^T$. Each eigenvector reveals a direction in which $A$ acts by simple scaling: vectors along $\vect{v}_1$ are stretched by a factor of $3$, while those along $\vect{v}_2$ are left unchanged (rescaled by $1$).
\end{example}
%
The search for eigenvalues and eigenvectors thus reduces to:
\begin{enumerate}
    \item Form the characteristic polynomial $\det(A-\lambda I)$
    \item Find its roots (the eigenvalues)
    \item For each eigenvalue $\lambda$, solve $(A-\lambda I)\vect{v}=\vect{0}$ for nonzero $\vect{v}$
\end{enumerate}
%
As to what type and how many eigenvalues a system has, the following is crucial.

\begin{lemma}[Characteristic Polynomials]
\label{lem:charpoly}
For any matrix $A\in\R^{n\times n}$, its characteristic polynomial $p_A(\lambda)=\det(A-\lambda I)$ satisfies:
\begin{enumerate}
    \item The polynomial has degree exactly $n$
    \item It has exactly $n$ complex roots (counted with algebraic multiplicity)
    \item Its coefficients are real, and its complex roots occur in conjugate pairs
\end{enumerate}
\end{lemma}

\begin{proof}
For (1), expand $\det(A-\lambda I)$ using any minor expansion. The term $(-\lambda)^n$ appears uniquely from the product of diagonal entries of $-\lambda I$. All other terms in the determinant expansion contain fewer factors of $\lambda$, as they must include at least one entry from $A$. Thus $p_A(\lambda)$ has degree exactly $n$ with leading coefficient $(-1)^n$.

\begin{marginfigure}
{\em Example:} The matrix $J = \begin{bmatrix}0 & -1 \\ 1 & 0\end{bmatrix}$ has characteristic polynomial $p_J(\lambda)=\lambda^2+1$, whose roots are $\pm i$, illustrating how complex eigenvalues must indeed appear in conjugate pairs. What does the linear transformation with matrix $J$ do?
\end{marginfigure}
%
For (2), the Fundamental Theorem of Algebra ensures that any polynomial of degree $n$ with complex coefficients has exactly $n$ complex roots, counting multiplicity. 

For (3), the entries of $A-\lambda I$ are either real constants or real linear terms in $\lambda$. The determinant of such a matrix must yield a polynomial with real coefficients. For such polynomials, if $\alpha+i\beta$ is a root, then its complex conjugate $\alpha-i\beta$ must also be a root with equal multiplicity. This follows because complex roots of real polynomials occur in conjugate pairs: if $p(a+bi)=0$ then $p(\overline{a+bi})=p(a-bi)=0$.
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The characteristic polynomial encodes more than just eigenvalues  --  its coefficients reveal fundamental invariants of the transformation. Most striking are the relationships between eigenvalues and two elementary matrix measurements: trace and determinant.

\begin{lemma}[Eigenvalue Relations]
\label{lem:eigrelations}
For an $n\times n$ matrix $A$ with eigenvalues $\lambda_1,\ldots,\lambda_n$:
%
\begin{marginfigure}
{\em Example:} For a $2\times 2$ matrix $A$, the characteristic polynomial $p_A(\lambda)=\lambda^2-\trace(A)\lambda+\det(A)$ makes these relationships transparent.
\end{marginfigure}
\begin{enumerate}
    \item The determinant equals their product: $\det(A) = \prod_{i=1}^n \lambda_i$
    \item The trace equals their sum: $\trace(A) = \sum_{i=1}^n \lambda_i$
\end{enumerate}
\end{lemma}


% ==============================================
\section{Simple Diagonalization}
\label{sec:simplediag}
% ==============================================

When a linear transformation possesses $n$ distinct real eigenvalues, a remarkable simplification becomes possible. The eigenvectors  --  those special directions experiencing pure scaling  --  provide a natural basis for viewing the transformation in its simplest form. This process of changing coordinates to align with eigenvectors, called \style{diagonalization}, reveals the essential character of the transformation stripped of coupled complexity.

Consider again the matrix from Example \ref{ex:2112}:
\[
    A = \begin{bmatrix}
    2 & 1 \\
    1 & 2
    \end{bmatrix}
\]
We found eigenvalues $\lambda_1=3$ and $\lambda_2=1$ with corresponding eigenvectors $\vect{v}_1=(1,1)^T$ and $\vect{v}_2=(1,-1)^T$. The individual eigenvalue equations
\[
    A\vect{v}_1 = 3\vect{v}_1 \quad\text{and}\quad A\vect{v}_2 = \vect{v}_2
\]
can be elegantly combined by arranging the eigenvectors as columns of a matrix:
\[
    A[\vect{v}_1\;\vect{v}_2] = [\vect{v}_1\;\vect{v}_2]\begin{bmatrix}3 & 0 \\ 0 & 1\end{bmatrix}
\]
Writing in terms of square matrices $V=[\vect{v}_1\;\vect{v}_2]$ and $\Lambda=\diag(3,1)$, this becomes simply
\begin{equation}
\label{eq:AV=VL}
    AV = V\Lambda
\end{equation}
\begin{marginfigure}
    {\em Think: if you have internalized the equation $A\vect{v}=\lambda\vect{v}$, you may find the matrix form of this equation to be easily memorable. Just be sure to convince yourself as to the ordering of the terms...}
\end{marginfigure}
When $V$ is invertible (as it must be when the eigenvalues are distinct), we obtain the diagonalization
$V^{-1}AV = \Lambda$. This observation generalizes to arbitrary dimension:

\begin{theorem}[Diagonalization]
\label{thm:diagonalization}
Let $A$ be an $n\times n$ matrix with $n$ distinct real eigenvalues $\lambda_1,\ldots,\lambda_n$ and corresponding eigenvectors $\vect{v}_1,\ldots,\vect{v}_n$. Then:
\begin{enumerate}
    \item The eigenvectors form a basis for $\R^n$
    \item The matrix $V=[\vect{v}_1\;\cdots\;\vect{v}_n]$ is invertible
    \item $V^{-1}AV = \Lambda$ where $\Lambda=\diag(\lambda_1,\ldots,\lambda_n)$
\end{enumerate}
\end{theorem}

\begin{proof}
That the eigenvectors are linearly independent follows from their association with distinct eigenvalues. Indeed, suppose $\sum_{i=1}^n c_i\vect{v}_i=\vect{0}$. Then
\[
    \vect{0} = A\left(\sum_{i=1}^n c_i\vect{v}_i\right) = \sum_{i=1}^n c_i\lambda_i\vect{v}_i
\]
Subtracting $\lambda_1$ times the first equation from the second:
\[
    \sum_{i=2}^n c_i(\lambda_i-\lambda_1)\vect{v}_i = \vect{0}
\]
Since $\lambda_i\neq\lambda_1$ for $i\geq 2$, we must have $c_2=\cdots=c_n=0$, and consequently $c_1=0$. Thus $\{\vect{v}_1,\ldots,\vect{v}_n\}$ is linearly independent, making $V$ invertible. The final statement follows from our derivation above.
\end{proof}

The power of diagonalization lies in how it simplifies computation of matrix powers. When $A=V\Lambda V^{-1}$, repeated multiplication becomes mere diagonal scaling:

\begin{lemma}[Matrix Powers]
\label{lem:powers}
If $A=V\Lambda V^{-1}$ is diagonalizable, then for any positive integer $k$:
\[
    A^k = V\Lambda^k V^{-1}
\]
\end{lemma}
\begin{marginfigure}
    {\em Think:} how hard is it to compute powers of the diagonal matrix $\Lambda$? 
\end{marginfigure}
\begin{proof}
The result follows directly from associativity of matrix multiplication and the properties of the inverse.
\end{proof}

This decomposition reveals how a transformation compounds through repeated application: each eigenspace experiences repeated scaling by its eigenvalue, while the change of basis matrices $V$ and $V^{-1}$ translate between our chosen coordinates and these natural eigenspaces. The behavior as $k\to\infty$ becomes transparent  --  it is controlled entirely by the magnitude of eigenvalues: see Chapter \ref{ch:9} for implications.

The relationship between similar matrices acquires new meaning through diagonalization. Matrices are similar precisely when they represent the same linear transformation viewed in different coordinates. When those coordinates align with an eigenbasis, the matrix assumes diagonal form  --  its simplest possible representation. Not every matrix admits such diagonal form (a topic for Chapter \ref{ch:8}), but when it does, we gain both computational advantage and theoretical insight.

% ==============================================
\section{Matrix Exponentials}
\label{sec:matexp}
% ==============================================

Recall from Section \ref{sec:firstorder} the general linear system of differential equations:
\begin{equation}
\label{eq:linearsystem}
    \frac{d\vect{x}}{dt} = A\vect{x}
\end{equation}
where $A$ is a square matrix. The scalar case $A=\lambda I$ yielded solutions of the form $e^{\lambda t}\vect{x}_0$. Such exponentials indeed generate solutions in the general case. The following definition is key.
%
\begin{definition}[Matrix Exponential]
\label{def:matexp}
The \style{matrix exponential} of a square matrix $A$ is defined by the absolutely convergent power series
\begin{equation}
\label{eq:e^A}
    e^A = I + A + \frac{A^2}{2!} + \frac{A^3}{3!} + \cdots = \displaystyle\sum_{k=0}^\infty \frac{A^k}{k!}
\end{equation}
%For a time-dependent system, we write $e^{At}$ for the matrix exponential of $At$.
\end{definition}

This formal series inherits many properties of the scalar exponential, including the crucial fact that it solves our differential equation:

\begin{lemma}[Matrix Exponential Solution]
\label{lem:matexpsol}
The general solution to the initial value problem
\[
    \frac{d\vect{x}}{dt} = A\vect{x}, \quad \vect{x}(0)=\vect{x}_0
\]
is given by $\vect{x}(t) = e^{At}\vect{x}_0$.
\end{lemma}

\begin{marginfigure}
{\em Caveat:} While the formula resembles the scalar case, computing $e^{At}$ directly from the series is rarely practical. The art lies in finding more efficient methods based on the structure of $A$.
\end{marginfigure}

The simplest case occurs when $A$ is diagonal. For $A=\diag(\lambda_1,\ldots,\lambda_n)$, the matrix exponential is simply $e^{At} = \diag(e^{\lambda_1t},\ldots,e^{\lambda_nt})$.
Each component evolves independently according to its diagonal entry.

\begin{example}[Diagonal System]
The system
\[
    \frac{d}{dt}\begin{pmatrix}x\\y\end{pmatrix} = 
    \begin{bmatrix}2 & 0\\0 & -1\end{bmatrix}
    \begin{pmatrix}x\\y\end{pmatrix}
\]
has matrix exponential
\[
    e^{At} = \begin{bmatrix}e^{2t} & 0\\0 & e^{-t}\end{bmatrix}
\]
The first component grows exponentially while the second decays  --  a behavior determined entirely by the eigenvalues.
\end{example}

\begin{lemma}[Diagonalizable Matrix Exponential]
\label{lem:diagexp}
If $A=V\Lambda V^{-1}$ is diagonalizable with $\Lambda=\diag(\lambda_1,\ldots,\lambda_n)$, then
\[
    e^{At} = Ve^{\Lambda t}V^{-1}
\]
where $e^{\Lambda t}=\diag(e^{\lambda_1t},\ldots,e^{\lambda_nt})$.
\end{lemma}

\begin{proof}
By Lemma \ref{lem:powers}, for any positive integer $k$, $A^k = V\Lambda^k V^{-1}$. Therefore in the power series defining $e^{At}$:
\[
    e^{At} = I + At + \frac{(At)^2}{2!} + \cdots = \sum_{k=0}^\infty \frac{t^k}{k!}A^k = \sum_{k=0}^\infty \frac{t^k}{k!}V\Lambda^k V^{-1}
\]
The absolute convergence of the matrix exponential series allows us to factor out $V$ and $V^{-1}$:
\[
    e^{At} = V\left(\sum_{k=0}^\infty \frac{t^k}{k!}\Lambda^k\right)V^{-1} = Ve^{\Lambda t}V^{-1}
\]
where the middle term reduces to $\diag(e^{\lambda_1t},\ldots,e^{\lambda_nt})$ by the scalar exponential series.
\end{proof}

This decomposition reveals how eigenvalues control the long-term behavior of solutions: each eigendirection experiences exponential growth or decay at a rate determined by its eigenvalue.

\begin{example}[Population Growth]
\label{ex:population}
Consider a simple model of two interacting populations where each species' growth rate depends both on its own population and that of the other species:
\[
    \frac{d}{dt}\begin{pmatrix}x\\y\end{pmatrix} = 
    \begin{bmatrix}2 & 1\\1 & 2\end{bmatrix}
    \begin{pmatrix}x\\y\end{pmatrix}
\]
From Example \ref{ex:2112}, we know this matrix has eigenvalues $\lambda_1=3$ and $\lambda_2=1$ with eigenvectors $\vect{v}_1=(1,1)^T$ and $\vect{v}_2=(1,-1)^T$. Therefore
\[
    e^{At} = \begin{bmatrix}1 & 1\\1 & -1\end{bmatrix}
    \begin{bmatrix}e^{3t} & 0\\0 & e^t\end{bmatrix}
    \begin{bmatrix}1/2 & 1/2\\1/2 & -1/2\end{bmatrix}
\]
The solution reveals two fundamental modes: symmetric growth where both populations increase in proportion ($e^{3t}$ term) and asymmetric growth where their difference evolves more slowly ($e^t$ term). Any initial condition excites a combination of these modes, with the symmetric mode eventually dominating due to its larger eigenvalue.
\end{example}

This connection between eigenvalues and the qualitative behavior of solutions  --  growth, decay, or oscillation  --  exemplifies how the algebraic structure of a matrix determines the geometric character of its flow. The matrix exponential transforms our static understanding of eigenvalues into a dynamic picture of how systems evolve in time.

% ==============================================
\section{Higher-Order Equations}
\label{sec:higherorder}
% ==============================================

A simple mass-spring system requires tracking both position and velocity; a circuit with inductance and capacitance needs current and charge; a chemical reaction network may depend on concentrations and their rates of change. Such systems lead naturally to second-order (or higher) differential equations. Though seemingly more complex than the first-order systems studied thus far, these equations succumb to the same eigenvalue methods through a systematic reduction to matrix form.

As foreshadowed at the conclusion of Chapter \ref{ch:2},
consider a linear homogeneous differential equation of order $n$:
\[
    \frac{d^nx}{dt^n} + a_{n-1}\frac{d^{n-1}x}{dt^{n-1}} + \cdots + a_1\frac{dx}{dt} + a_0x 
    = 0
\]
where the coefficients $a_k$ are constants. Using the differential operator $D=d/dt$, we can write this more compactly as
\[
    p(D)x = (D^n + a_{n-1}D^{n-1} + \cdots + a_1D + a_0I)x = 0
\]
where $p\in\poly_n$ is a degree-$n$ polynomial in the differentiation operator $D$ having coefficients $a_i$, $i=0\ldots n-1$ (and top coefficient $1$). This single equation of order $n$ can be transformed into a system of $n$ first-order equations by introducing new variables for the derivatives. Let
\[
    x_1 = x, \quad x_2 = \frac{dx}{dt}, \quad x_3 = \frac{d^2x}{dt^2}, \quad \ldots \quad x_n = \frac{d^{n-1}x}{dt^{n-1}}
\]
Then our equation becomes a first-order system in matrix form:
\[
    \frac{d}{dt}\begin{pmatrix}x_1\\x_2\\x_3\\\vdots\\x_n\end{pmatrix} = 
    \begin{bmatrix}
    0 & 1 & 0 & \cdots & 0 \\
    0 & 0 & 1 & \cdots & 0 \\
    0 & 0 & 0 & \cdots & 0 \\
    \vdots & \vdots & \vdots & \ddots & \vdots \\
    -a_0 & -a_1 & -a_2 & \cdots & -a_{n-1}
    \end{bmatrix}
    \begin{pmatrix}x_1\\x_2\\x_3\\\vdots\\x_n\end{pmatrix}
\]
%
\begin{marginfigure}
{\em Nota bene:} The matrix has a special structure -- zeros everywhere except for $1$'s on the superdiagonal and the coefficients $-a_k$ in the last row. Such matrices are called \style{companion matrices}.
\end{marginfigure}

This transformation reveals a deep connection between the original differential equation and linear algebra: the eigenvalues of the companion matrix are precisely the roots of the \style{characteristic equation}
\[
    \lambda^n + a_{n-1}\lambda^{n-1} + \cdots + a_1\lambda + a_0 = p(\lambda) = 0
\]
obtained by substituting $\lambda$ for $D$ in the polynomial operator. When these eigenvalues are distinct (as we assume throughout this chapter), the solution follows directly from our previous work on matrix exponentials.

\begin{example}[Mass-Spring System]
\label{ex:mass-spring}
Consider a mass $m$ attached to a spring with constant $k$ and dashpot damping coefficient $c$. Newton's second law yields
\[
    m\frac{d^2x}{dt^2} + c\frac{dx}{dt} + kx = 0
\]
where $x(t)$ measures displacement from equilibrium. Dividing by $m$ and setting $\omega_0^2=k/m$ (the natural frequency) and $\gamma=c/m$ (the damping ratio), we obtain
\[
    \frac{d^2x}{dt^2} + \gamma\frac{dx}{dt} + \omega_0^2x = 0
\]
This transforms to first-order form by setting $x_1=x$ and $x_2=\dot{x}$:
\[
    \frac{d}{dt}\begin{pmatrix}x_1\\x_2\end{pmatrix} = 
    \begin{bmatrix}
    0 & 1 \\
    -\omega_0^2 & -\gamma
    \end{bmatrix}
    \begin{pmatrix}x_1\\x_2\end{pmatrix}
\]

For concrete values $\omega_0^2=4$ and $\gamma=3$, the companion matrix
\[
    A = \begin{bmatrix}
    0 & 1 \\
    -4 & -3
    \end{bmatrix}
\]
has characteristic equation $\lambda^2 + 3\lambda + 4 = 0$ with roots $\lambda_1=-1$ and $\lambda_2=-2$. The corresponding eigenvectors are
\[
    \vect{v}_1 = \begin{pmatrix}1\\-1\end{pmatrix} \quad\text{and}\quad
    \vect{v}_2 = \begin{pmatrix}1\\-2\end{pmatrix}
\]
The solution follows from the matrix exponential. Since $A$ is diagonalizable with
\[
V = \begin{bmatrix}1 & 1\\-1 & -2\end{bmatrix} \quad\text{and}\quad
\Lambda = \diag(-1,-2)=\begin{bmatrix}-1 & 0\\0 & -2\end{bmatrix}
\]
we have
\[
e^{At} = Ve^{\Lambda t}V^{-1} =
\begin{bmatrix}1 & 1\\-1 & -2\end{bmatrix}
\begin{bmatrix}e^{-t} & 0\\0 & e^{-2t}\end{bmatrix}
\begin{bmatrix}2 & 1\\-1 & -1\end{bmatrix}
\]
For any initial condition $\vect{x}(0)$, the solution is $\vect{x}(t) = e^{At}\vect{x}(0)$. The position $x(t)=x_1(t)$ decays to equilibrium as $t\to\infty$, with the rate determined by the eigenvalues $-1$ and $-2$.
\end{example}

This example illustrates how eigenvalue analysis illuminates physical behavior. The eigenvalues $-1$ and $-2$ being real and negative indicates pure exponential decay without oscillation  --  characteristic of an overdamped system. Different parameter values might yield underdamped oscillations or critical damping, cases we shall explore in Chapter \ref{ch:8}.

% ==============================================
\section{Basis Solutions}
\label{sec:basissol}
% ==============================================

Linear homogeneous differential equations, whether expressed as first-order systems or higher-order scalar equations, admit elegant solution structures based on fundamental modes. These two perspectives  --  vector-valued solutions to systems and scalar solutions to higher-order equations  --  offer complementary insights into the nature of linear evolution.

\begin{lemma}[Solution Spaces]
\label{lem:solspace}
The solutions to both:
\begin{enumerate}
    \item The first-order system $\displaystyle\frac{d\vect{x}}{dt} = A\vect{x}$ in $\R^n$
    \item The $n$-th order equation $(D^n + a_{n-1}D^{n-1} + \cdots + a_1D + a_0)x = 0$
\end{enumerate}
form vector spaces under their natural operations of addition and scalar multiplication.
\end{lemma}
%
\begin{proof}
For the system, if $\vect{x}_1(t)$ and $\vect{x}_2(t)$ solve $D\vect{x}=A\vect{x}$, then:
\[
    \frac{d}{dt}(\vect{x}_1 + \vect{x}_2) = A\vect{x}_1 + A\vect{x}_2 = A(\vect{x}_1 + \vect{x}_2)
\]
For the scalar equation, if $x_1(t)$ and $x_2(t)$ solve $p(D)x=0$ where $p(D)$ is the polynomial differential operator, then:
\[
    p(D)(x_1 + x_2) = p(D)x_1 + p(D)x_2 = 0
\]
Similar reasoning applies to scalar multiplication in both cases.
\end{proof}

These solution spaces are intimately connected through the eigenstructure of the companion matrix from Section \ref{sec:higherorder}. The following theorem reveals how eigenvalues generate basis solutions in both settings:

\begin{theorem}[Basis Solutions]
\label{thm:fundsol}
Let $A$ be diagonalizable with eigenvalues $\lambda_1,\ldots,\lambda_n$. Then:
\begin{enumerate}
    \item For the system $D\vect{x}=A\vect{x}$ with eigenvectors $\vect{v}_1,\ldots,\vect{v}_n$, a basis for the solution space is given by:
    \[
        \vect{\phi}_i(t) = e^{\lambda_i t}\vect{v}_i, \quad i=1,\ldots,n
    \]
    \item For the scalar equation $P(D)x=0$, where $P(\lambda)$ is the characteristic polynomial, a basis is:
    \[
        \phi_i(t) = e^{\lambda_i t}, \quad i=1,\ldots,n
    \]
\end{enumerate}
\end{theorem}

\begin{proof}
For the system, direct substitution verifies each $\vect{\phi}_i$ is a solution:
\[
    \frac{d}{dt}\vect{\phi}_i = \lambda_i e^{\lambda_i t}\vect{v}_i = e^{\lambda_i t}(A\vect{v}_i) = A\vect{\phi}_i
\]
Their linear independence follows from that of the eigenvectors.

For the scalar equation, note that $p(D)e^{\lambda t} = p(\lambda)e^{\lambda t}$. Thus when $\lambda$ is a root of $p$, the exponential $e^{\lambda t}$ provides a solution. To establish linear independence of these solutions, suppose we have a linear combination that vanishes:
\[
    \sum_{i=1}^n c_ie^{\lambda_i t} = 0 \quad\text{for all }t
\]
We will show all coefficients $c_i$ must be zero. Differentiating this equation $k$ times yields:
\[
    \sum_{i=1}^n c_i\lambda_i^k e^{\lambda_i t} = 0 \quad\text{for }k=0,1,\ldots,n-1
\]

These $n$ equations form a linear system in the coefficients $c_i$. The matrix of this system is a Vandermonde matrix in the distinct values $\lambda_i$, which has nonzero determinant. Therefore $c_i=0$ for all $i$, establishing linear independence of the exponential solutions.
\end{proof}
%
\begin{marginfigure}
{\em Nota bene:} A \style{Vandermonde matrix} has entries of the form $v_{ij} = x_i^{j-1}$ where $x_i$ are distinct numbers. Its determinant is nonzero precisely when the $x_i$ are distinct. See the Exercises for properties of Vandermonde matrices.
\end{marginfigure}
General solutions take parallel forms:
\[
    \vect{x}(t) = \sum_{i=1}^n c_i e^{\lambda_i t}\vect{v}_i \quad\text{and}\quad x(t) = \sum_{i=1}^n c_i e^{\lambda_i t}
\]
The coefficients $c_i$ are determined by initial conditions  --  either $\vect{x}(0)$ for the system or $x(0),x'(0),\ldots,x^{(n-1)}(0)$ for the scalar equation.

\begin{example}[Mass-Spring System Redux]
\label{ex:mass-spring-redux}
The mass-spring equation from Example \ref{ex:mass-spring}:
\[
    \frac{d^2x}{dt^2} + 3\frac{dx}{dt} + 4x = 0
\]
has characteristic equation $\lambda^2 + 3\lambda + 4 = 0$ with roots $\lambda_1=-1$ and $\lambda_2=-2$. Thus:
\begin{enumerate}
    \item As a scalar equation, the general solution is:
    \[
        x(t) = c_1e^{-t} + c_2e^{-2t}
    \]
    \item As a system with eigenvectors $\vect{v}_1=(1,-1)^T$ and $\vect{v}_2=(1,-2)^T$:
    \[
        \begin{pmatrix}x(t)\\x'(t)\end{pmatrix} = 
        c_1e^{-t}\begin{pmatrix}1\\-1\end{pmatrix} +
        c_2e^{-2t}\begin{pmatrix}1\\-2\end{pmatrix}
    \]
\end{enumerate}
\begin{marginfigure}
{\em Nota bene:} In general, the scalar solutions $\phi_i(t)=e^{\lambda_i t}$ correspond precisely to the first components of the vector solutions $\vect{\phi}_i(t)$ when the equations are properly matched.
\end{marginfigure}
The first component of the vector solution matches the scalar solution, as it must.
\end{example}

This parallel development reveals the essential unity of linear differential equations, whether viewed as coupled systems or high-order scalar equations. Each perspective offers advantages: the scalar form often simplifies computation when only one variable interests us, while the system form better reveals geometric structure and generalizes to coupled equations.
%
\begin{marginfigure}
{\em Foreshadowing:} When eigenvalues coincide, both formulations require modification. The solutions acquire polynomial factors multiplying the exponentials  --  a complication we shall explore in Chapter \ref{ch:8}.
\end{marginfigure}

The choice between scalar and system forms often depends on context. Physical systems naturally couple multiple variables, favoring the system approach. Signal processing and control theory traditionally use scalar transfer functions, preferring the high-order form. Yet the underlying mathematical structure  --  exponential solutions built from eigenvalues  --  remains the same, a unity we shall exploit repeatedly in applications.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Multi-Zone Building Temperature Control}
\label{EM:tempcontrol}
% **************** EMANATION *******************

The temperature dynamics of a multi-zone building provide an elegant application of eigenvalue analysis to a practical engineering problem. Consider a building with multiple rooms, each maintained at a controlled temperature through a combination of HVAC input and heat exchange with neighboring spaces. The evolution of temperatures throughout the building reveals the power of linear systems theory in understanding and controlling complex thermal environments.

\begin{marginfigure}
\centering
\includegraphics[width=1.1in]{three-rooms.png}
\end{marginfigure}

Consider three adjacent rooms sharing walls but with different exterior exposures. Let $T_i(t)$ denote the temperature of room $i$ at time $t$. The rate of temperature change in each room depends on heat exchange with neighboring rooms (proportional to temperature differences), heat loss to the exterior (proportional to difference from ambient temperature), and HVAC input (controlled heating or cooling).

Applying Newton's law of cooling and conservation of energy leads to a system of coupled differential equations:
\[
c_1\frac{dT_1}{dt} = k_{12}(T_2-T_1) + k_{13}(T_3-T_1) - h_1(T_1-T_a) + u_1
\]
\[
c_2\frac{dT_2}{dt} = k_{12}(T_1-T_2) + k_{23}(T_3-T_2) - h_2(T_2-T_a) + u_2
\]
\[
c_3\frac{dT_3}{dt} = k_{13}(T_1-T_3) + k_{23}(T_2-T_3) - h_3(T_3-T_a) + u_3
\]
where $c_i$ represents the thermal mass of room $i$, $k_{ij}$ the thermal conductance between rooms $i$ and $j$, $h_i$ the heat transfer coefficient to ambient temperature $T_a$, and $u_i$ the HVAC input power to room $i$.

\begin{marginfigure}
{\em Nota bene:} The modeling approach here exemplifies a broader pattern in engineering: complex physical systems often reduce to coupled linear differential equations through appropriate approximations.
\end{marginfigure}

To analyze this system, first consider the unforced response ($u_i=0$) relative to ambient temperature. Let $x_i = T_i-T_a$ denote the temperature deviation in room $i$. Taking realistic values for a modern office building section with equal thermal masses ($c_1=c_2=c_3=1$), symmetric coupling between adjacent rooms ($k_{12}=k_{23}=1$, $k_{13}=0.5$), and varying exterior exposure ($h_1=2$, $h_2=1$, $h_3=1.5$), we obtain the system matrix:
\[
A = \begin{bmatrix}
-3.5 & 1.0 & 0.5 \\
1.0 & -2.5 & 1.0 \\
0.5 & 1.0 & -3.0
\end{bmatrix}
\]

The eigenvalues of $A$ determine the natural thermal modes of the building. Computing these reveals three distinct real eigenvalues: $\lambda_1 \approx -4.37$, $\lambda_2 \approx -2.83$, and $\lambda_3 \approx -1.80$. The distinctness of these eigenvalues proves particularly fortunate, as it enables a complete modal decomposition of the system's behavior without requiring the more complex Jordan form analysis developed in Section 7.3. Moreover, their negative values guarantee asymptotic stability - the temperature in each room will eventually return to ambient after any disturbance, as expected from physical intuition.

\begin{marginfigure}
{\em Foreshadowing:} The appearance of multiple time scales in thermal systems anticipates the more general theory of singular perturbations, crucial in many areas of engineering.
\end{marginfigure}

These eigenmodes reveal the fundamental thermal behavior of the building. The fastest mode, associated with $\lambda_1$, represents rapid equilibration between all rooms. The intermediate mode shows temperature oscillation between room 2 and its neighbors, while the slowest mode captures the gradual cooling of the entire system.

Using the diagonalization methods developed in Section 7.5, we can compute the complete temperature response through the matrix exponential:
\[
e^{At} = \begin{bmatrix}
-0.65 & -0.24 & 0.72 \\
-0.41 & 0.93 & -0.06 \\
-0.64 & -0.28 & -0.69
\end{bmatrix}
\begin{bmatrix}
e^{-4.37t} & 0 & 0 \\
0 & e^{-2.83t} & 0 \\
0 & 0 & e^{-1.80t}
\end{bmatrix}
\begin{bmatrix}
-0.59 & -0.13 & 0.54 \\
-0.25 & 0.87 & 0.12 \\
-0.52 & -0.38 & -0.83
\end{bmatrix}
\]

This decomposition guides practical HVAC control strategy. The fastest modes naturally equilibrate, so control systems should focus on compensating for the slowest decaying mode. Temperature sensors should be placed to best observe the dominant eigenmodes, and building design can optimize thermal response by adjusting the eigenvalue spectrum through insulation and thermal mass distribution.

\begin{marginfigure}
{\em Example:} A well-designed building might have its slowest eigenvalue around $-2.0$ hr$^{-1}$, meaning the slowest thermal mode decays with a half-life of about 20 minutes.
\end{marginfigure}

The methods developed here extend readily to larger buildings, more complex thermal networks, and other diffusive systems. Whether designing HVAC control systems, optimizing sensor placement, or planning building retrofits, eigenanalysis provides crucial insights into system behavior and guides engineering decisions. 

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Heavy Vehicle Suspension Analysis}
\label{EM:suspension}
% **************** EMANATION *******************

The dynamics of heavily damped suspension systems, such as those found in construction equipment and heavy trucks, provide an instructive application of real eigenvalue analysis. While passenger vehicles typically exhibit oscillatory behavior (a topic for Chapter 8), heavily damped systems demonstrate pure exponential modes that perfectly align with the theory developed in this chapter.

% \begin{marginfigure}
% {\em FIGURE:} Quarter-truck model showing vehicle body mass $M$ connected via spring $k$ and heavy damper $c$ to wheel mass $m$, which in turn connects to the road through tire spring $k_t$. Vertical displacements $x$ and $y$ are measured from equilibrium positions.
% \end{marginfigure}

Consider a ``quarter-truck'' model of one wheel on a heavy vehicle. Let $x(t)$ denote the vertical displacement of the vehicle body (sprung mass) and $y(t)$ the displacement of the wheel assembly (unsprung mass) from their equilibrium positions. The system obeys Newton's second law applied to each mass:
\[
M\ddot{x} = -k(x-y) - c(\dot{x}-\dot{y})
\]
\[
m\ddot{y} = k(x-y) + c(\dot{x}-\dot{y}) - k_t y
\]
where $M$ is the quarter-truck body mass, $m$ the wheel assembly mass, $k$ the suspension spring constant, $c$ the damper coefficient, and $k_t$ the tire spring rate.

To analyze this fourth-order system through eigentheory, we transform it to first-order form by introducing the state vector:
\[
\vect{z} = \begin{pmatrix} x \\ y \\ \dot{x} \\ \dot{y} \end{pmatrix}
\]
This yields the matrix equation $\dot{\vect{z}} = A\vect{z}$ where:
\[
A = \begin{bmatrix}
0 & 0 & 1 & 0 \\
0 & 0 & 0 & 1 \\
-k/M & k/M & -c/M & c/M \\
k/m & -(k+k_t)/m & c/m & -c/m
\end{bmatrix}
\]

Taking values typical for a heavy commercial vehicle ($M=2500$ kg, $m=200$ kg, $k=100000$ N/m, $k_t=800000$ N/m, $c=15000$ Ns/m), the eigenvalues emerge as four distinct real values:
\[
\lambda_1 = -12.3, \quad \lambda_2 = -8.4, \quad \lambda_3 = -3.2, \quad \lambda_4 = -0.9
\]

\begin{marginfigure}
{\em Think:} The wide separation between eigenvalues reflects different physical time scales: rapid tire deflection, intermediate suspension motion, and slow body settling.
\end{marginfigure}

These distinct real eigenvalues, characteristic of overdamped systems, indicate pure exponential decay modes without oscillation. The explicit solution through matrix exponential takes the form developed in Section 7.5:
\[
e^{At} = \sum_{j=1}^4 e^{\lambda_j t}\vect{v}_j\vect{w}_j^T
\]
where $\vect{v}_j$ and $\vect{w}_j^T$ are the right and left eigenvectors respectively. Each mode decays at its own characteristic rate, with physical interpretations:
\begin{enumerate}
    \item $\lambda_1$ mode: rapid tire deflection absorption
    \item $\lambda_2$ mode: primary suspension response
    \item $\lambda_3$ mode: coupled body-suspension motion
    \item $\lambda_4$ mode: slow final settling
\end{enumerate}

This eigenstructure guides heavy vehicle suspension design through several principles:
\begin{enumerate}
    \item All eigenvalues should be real and negative for pure damping
    \item Time scale separation prevents undesirable coupling
    \item Mode shapes (eigenvectors) determine effective sensor placement
    \item Slowest mode ($\lambda_4$) governs overall settling time
\end{enumerate}

The design of construction equipment often deliberately seeks this overdamped behavior to maintain stable operation under varying loads. The eigenvectors determine optimal placement of sensors and actuators by revealing how each decay mode manifests in measurable motions.

\begin{marginfigure}
{\em Example:} Mining trucks use multiple displacement sensors to monitor suspension motion, with sensor locations chosen to best observe the dominant eigenmodes identified through this analysis.
\end{marginfigure}

Extension to full-vehicle dynamics introduces additional degrees of freedom - vertical motion at each corner plus pitch and roll of the body - leading to larger matrices but similar principles. The eigenstructure reveals coupled modes like load-sharing between diagonally opposite corners that prove crucial for stability under shifting loads.

This mechanical application complements our thermal analysis by demonstrating eigentheory's power for a different class of physical system. Where thermal modes showed simple exponential decay from geometric symmetry, suspension modes exhibit a hierarchy of decay rates from mechanical coupling. Together they illustrate how the mathematical framework developed in this chapter illuminates diverse physical phenomena, transforming fourth-order differential equations into clearly understood modal responses through the unifying language of linear algebra.
% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 7}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}

\item Let $A = \begin{bmatrix}3 & 1 \\ 1 & 3\end{bmatrix}$. Find its eigenvalues and eigenvectors. Use these to compute $e^{At}$ and describe the behavior of solutions to the system $\dot{\vect{x}} = A\vect{x}$ as $t\to\infty$.

\item Find the eigenvalues and eigenvectors of $A = \begin{bmatrix}4 & -1 \\ 2 & 1\end{bmatrix}$. Use these to solve the initial value problem $\dot{\vect{x}} = A\vect{x}$ with $\vect{x}(0) = \begin{pmatrix}1\\0\end{pmatrix}$.

\item Compute the eigenvalues and eigenvectors of $B = \begin{bmatrix}3 & -2 & 0 \\ 0 & 3 & 0 \\ -4 & 1 & 1\end{bmatrix}$. Is $B$ diagonalizable?

\item Prove that if $A$ is a triangular matrix (upper or lower) then the diagonal entries coincide with the eigenvalues. 
\begin{marginfigure}
This is an extremely useful result.
\end{marginfigure}

\item Convert the equation $\ddot{x} + 2\dot{x} + 5x = 0$ to a first-order system using the companion matrix. Show the relationship between the eigenvalues of this matrix and the roots of the characteristic equation of the original second-order equation.

\item For the differential equation $\ddot{x} + 3\dot{x} + 2x = 0$, find all linearly independent solutions of the form $e^{\lambda t}$. Write out the general solution using arbitrary constants $c_i$.

\item Let $A = \begin{bmatrix}1 & 1 \\ 0 & 1\end{bmatrix}$ and compute $e^{At}$ by summing the first four terms of its series expansion. Compare this with the result obtained by diagonalization. Why do these differ?

\item A matrix $A$ has characteristic polynomial $p(\lambda) = \lambda^3 - 6\lambda^2 + 11\lambda - 6$. Is $A$ diagonalizable? 

\item A matrix $A$ has characteristic polynomial $p(\lambda) = \lambda^4 - 2\lambda^3 - \lambda^2 + 2\lambda$. Find its eigenvalues and determine whether $A$ is necessarily diagonalizable. What can you say about the long-term behavior of solutions to $\dot{\vect{x}}=A\vect{x}$?

\item The trace of a matrix $A$ is the sum of its diagonal entries. Prove that the trace equals the sum of the eigenvalues (counted with multiplicity). 

\item Consider the third-order equation $\dddot{x} + 4\ddot{x} + 5\dot{x} + 2x = 0$. Find a basis for its solution space and express the general solution. For what values of the initial conditions $x(0)$, $\dot{x}(0)$, and $\ddot{x}(0)$ does the solution remain bounded as $t\to\infty$?

\item Show that for any diagonalizable matrix $A$, the matrices $A$ and $e^A$ share the same eigenvectors. If $\vect{v}$ is an eigenvector of $A$ with eigenvalue $\lambda$, what is the corresponding eigenvalue of $e^A$?

\item Prove that the determinant of a square matrix $A$ equals the product of its eigenvalues: for simplicity, assume that $A$ is diagonalizable, though the result holds in general.

\item Show that if $A$ is diagonalizable with all eigenvalues real and negative, then $\lim_{t\to\infty} e^{At} = \vect{0}$ (the zero matrix). 

\item If $A$ is a diagonalizable $2\times 2$ matrix and $\trace(A)=0$, what can you say about its eigenvalues? What does this tell you about solutions to the system $\dot{\vect{x}} = A\vect{x}$?

\item Consider square $n$-by-$n$ matrices $A$ and $B$. What condition is required to conclude the innocuous-looking result $e^{(A+B)t} = e^{At}e^{Bt}$. Give an example in the 2-by-2 case for which this formula does not hold.

\item Consider the matrix $A = \begin{bmatrix}3 & -1 \\ 2 & 4\end{bmatrix}$. Compute its characteristic polynomial $p(\lambda)$ and verify directly that $p(A) = \vect{0}$ (the zero matrix). What does this suggest about the relationship between a matrix and its characteristic polynomial?

\item The \style{Cayley-Hamilton theorem} states that every square matrix satisfies its own characteristic equation. For $A = \begin{bmatrix}2 & 1 \\ 0 & 2\end{bmatrix}$, verify this by showing that if $p(\lambda) = \det(A-\lambda I)$ then $p(A) = \vect{0}$. What does this tell you about powers of $A-2I$?

\item For any $2\times 2$ matrix $A$, show that $A^2 - \trace(A)A + \det(A)I = \vect{0}$. Explain how this is a special case of Cayley-Hamilton for $2\times 2$ matrices.

\item Suppose $A$ is a $3\times 3$ matrix with characteristic polynomial $p(\lambda) = \lambda^3 - 7\lambda^2 + 14\lambda - 8$. Without computing any matrix operations, determine the trace and determinant of $A$, and give an expression for $A^3$ in terms of lower powers of $A$

\item A rotation matrix $R$ on $\R^3$ is an orthogonal matrix with has determinant $1$. Explain why $R$ must have $1$ as an eigenvalue, thus rotating about some fixed axis. Does this argument extend to rotations in $\R^n$ for all $n$? (Hint: consider the characteristic polynomial and what happens in odd versus even dimensions).

\item Prove that for any square matrix $A$, the eigenvalues of $e^A$ are $\{e^{\lambda}\}$ where $\{\lambda\}$ are the eigenvalues of $A$. Explain why this means $e^A$ is always invertible.

\item The previous exercise shows that for $A$ a square matrix, $e^A$ is invertible. What is the inverse? Was it easier to guess the inverse than it was to show that it is invertible?

\item The Cayley-Hamilton theorem provides a path to computing the inverse of a nonsingular matrix. If $p(\lambda)$ is the characteristic polynomial of $A$, write $p(\lambda) = a_n\lambda^n + \cdots + a_1\lambda + a_0$ and show that when $A$ is invertible:
\[
    A^{-1} = -\frac{1}{a_0}(a_n A^{n-1} + \cdots + a_2A + a_1I)
\]
Use this to find $A^{-1}$ for $A = \begin{bmatrix}3 & 1 \\ 1 & 3\end{bmatrix}$ and compare with other methods.

\item For a $2\times 2$ matrix $A$ with distinct eigenvalues $\lambda_1,\lambda_2$, use the Cayley-Hamilton theorem to prove that:
\[
    A = \frac{\lambda_2 I - A}{\lambda_2-\lambda_1}\lambda_1 + \frac{A - \lambda_1 I}{\lambda_2-\lambda_1}\lambda_2
\]
This expression shows how $A$ decomposes along its eigenspaces even without explicitly computing eigenvectors.

\item For the $3\times 3$ Vandermonde matrix with $x_1=1$, $x_2=2$, and $x_3=3$, find its eigenvalues and verify they are real. Then compute the trace and verify it equals $x_1+x_2+x_3$ plus additional terms you should determine. What does this suggest about the general relationship between the $x_i$ values and the trace of a Vandermonde matrix?

\item For distinct real numbers $x_1,\ldots,x_n$, the Vandermonde matrix $V$ is defined by $v_{ij} = (x_i)^{j-1}$:
\[
V = \begin{bmatrix}
1 & x_1 & x_1^2 & \cdots & x_1^{n-1} \\
1 & x_2 & x_2^2 & \cdots & x_2^{n-1} \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
1 & x_n & x_n^2 & \cdots & x_n^{n-1}
\end{bmatrix}
\]
Show first that when $n=2$, $\det(V) = x_2-x_1$. Then prove for $n=3$ that $\det(V) = (x_2-x_1)(x_3-x_1)(x_3-x_2)$. Based on this pattern, state a conjecture for the general formula $\det(V)$ in terms of differences of the $x_i$. Prove this formula. What does this tell you about the eigenvalues when $x_i=i$?

\end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Eigenvalue Complexities}
\label{ch:8}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``With songs of sweetest cadence to the turning spindle \& reel''}

\newthought{Reality transcends real exponentials.} The elegant theory of the previous chapter  --  with its real eigenvalues generating pure growth and decay  --  must confront a deeper truth: the world pulses with rhythm. Heart cells oscillate in synchronized waves; animal populations surge and collapse in predictable cycles; financial markets breathe between boom and bust. Even our thoughts and social movements follow patterns of ebb and flow, as ideas fade and resurge with generations. This ubiquitous periodicity seems to defy our carefully constructed framework of eigenvalues and exponential solutions.

Yet mathematics bends rather than breaks. The introduction of complex eigenvalues resolves our crisis of description, revealing oscillation as natural as growth or decay. When eigenvalues coincide, still richer patterns emerge  --  polynomial terms multiplying our exponentials in ways that mirror the gradual onset of synchronized behavior in coupled systems. What appears first as complications to our theory reveals itself as essential structure, demanded by the very phenomena we seek to understand.

These complexities  --  imaginary eigenvalues, repeated roots, and their subtle interactions  --  lead us beyond simple diagonalization to the best possible world of the Jordan Canonical Form. This framework unifies our treatment of linear evolution while illuminating the delicate ways in which nearly identical eigenvalues can produce radically different behavior. The mathematics itself seems to pulse between poles of simplicity and intricacy, echoing the rhythms it describes.

% ==============================================
\section{Complex Eigenvalues \& Oscillation}
\label{sec:complexeigs}
% ==============================================

The simple harmonic oscillator  --  a mass on a spring without damping  --  provides our first encounter with behavior that transcends real eigenvalues. The equation of motion
\[
    \frac{d^2x}{dt^2} + \omega^2x = 0
\]
has well-known solutions involving sines and cosines:
\[
    x(t) = c_1\cos(\omega t) + c_2\sin(\omega t)
\]
Yet these solutions seem to conflict with our theory from Chapter \ref{ch:7}, where all solutions emerged as combinations of real exponentials. Converting to first-order form via $x_1=x$ and $x_2=\dot{x}$ yields
\[
    \frac{d}{dt}\begin{pmatrix}x_1\\x_2\end{pmatrix} = 
    \begin{bmatrix}
    0 & 1 \\
    -\omega^2 & 0
    \end{bmatrix}
    \begin{pmatrix}x_1\\x_2\end{pmatrix}
\]
The characteristic polynomial $\det(A-\lambda I) = \lambda^2 + \omega^2$ has roots $\lambda = \pm i\omega$  --  our first encounter with complex eigenvalues.
%
\begin{example}[Complex Eigenvectors]
Consider the rotation matrix for angle $\pi/3$:
\[
    R = \begin{bmatrix}
    \cos(\pi/3) & -\sin(\pi/3) \\
    \sin(\pi/3) & \cos(\pi/3)
    \end{bmatrix}
    =
    \begin{bmatrix}
    1/2 & -\sqrt{3}/2 \\
    \sqrt{3}/2 & 1/2
    \end{bmatrix}
\]
The characteristic equation is $\lambda^2 - \lambda + 1 = 0$, yielding eigenvalues $\lambda = \frac{1}{2} \pm i\frac{\sqrt{3}}{2} = e^{\pm i\pi/3}$. For $\lambda = \frac{1}{2} + i\frac{\sqrt{3}}{2}$, we solve $(R-\lambda I)\vect{v}=\vect{0}$:
\[
    -\frac{\sqrt{3}}{2}
    \begin{bmatrix}
    i & 1 \\
    -1 & i
    \end{bmatrix}
    \begin{pmatrix}
    v_1 \\ v_2
    \end{pmatrix}
    = \vect{0}
\]
yielding eigenvector $\vect{v} = (1,i)^T$. Though this eigenvector is complex, the matrix $R$ maps real vectors to real vectors. The seeming paradox resolves by noting that $\vect{v}$ and its complex conjugate $\overline{\vect{v}}=(1,-i)^T$ span $\R^2$ over the complex numbers, while their real and imaginary parts span $\R^2$ over the reals.
\end{example}


Far from invalidating our theory, these complex eigenvalues illuminate it. Euler's formula $e^{i\theta} = \cos\theta + i\sin\theta$ reveals that our trigonometric solutions are simply complex exponentials in disguise:
\[
    c_1\cos(\omega t) + c_2\sin(\omega t) = 
    \Re\left(z_1e^{i\omega t} + z_2e^{-i\omega t}\right)
\]
for appropriate complex constants $z_1$ and $z_2$. The oscillatory motion emerges from exponential solutions with purely imaginary exponents.

This insight generalizes through study of the fundamental $2\times 2$ matrix
\begin{equation}
\label{eq:J}
    J = \begin{bmatrix}0 & -1 \\ 1 & 0\end{bmatrix}
\end{equation}
\begin{marginfigure}
    {\em Nota bene:} The matrix $J=\sqrt{-I}$ is the matrix analogue of $i$ in complex arithmetic.
\end{marginfigure}
Direct computation reveals that $J^2=-I$, suggesting a deep connection to complex arithmetic. Indeed, the matrix exponential $e^{Jt}$ can be computed directly from its series definition:
\begin{equation}
\label{eq:e^J}
\begin{array}{rcl}
    e^{Jt} &=& I + tJ + \frac{t^2}{2!}J^2 + \frac{t^3}{3!}J^3 + \frac{t^4}{4!}J^4 + \cdots \\[1em]
    &=& \left(I - \frac{t^2}{2!}I + \frac{t^4}{4!}I - \cdots\right) + \left(tJ - \frac{t^3}{3!}J + \frac{t^5}{5!}J - \cdots\right) \\[1em]
    &=& (\cos t)I + (\sin t)J
\end{array}
\end{equation}
This matrix-valued analog of Euler's formula explains the geometric action of $J$  --  pure rotation in the plane.
%

More generally, consider a matrix $A$ with complex conjugate eigenvalues $\alpha\pm i\beta$. This matrix -- like $J$ -- cannot be diagonalized over the reals, but we can reduce it to a simplest form.
%
\begin{marginfigure}
    Such matrices arise frequently in applications  --  the real part $\alpha$ controlling growth or decay while the imaginary part $\beta$ determines oscillation frequency. 
\end{marginfigure}
%
\begin{lemma}[Complex Normal Form]
\label{lem:complexnormal}
Any $2\times 2$ matrix $A$ with complex conjugate eigenvalues $\alpha\pm i\beta$ is similar to $\alpha I + \beta J$. That is, there exists an invertible matrix $P$ such that
\begin{equation}
    P^{-1}AP = \alpha I + \beta J
\end{equation}
\end{lemma}
%
\begin{proof}
Let $\vect{v} = \vect{u} + i\vect{w}$ be an eigenvector for eigenvalue $\alpha + i\beta$. Then
\[
    A\vect{v} = (\alpha + i\beta)\vect{v} \implies A(\vect{u} + i\vect{w}) = (\alpha + i\beta)(\vect{u} + i\vect{w})
\]
Equating real and imaginary parts:
\[
    A\vect{u} = \alpha\vect{u} - \beta\vect{w} \quad\text{and}\quad A\vect{w} = \beta\vect{u} + \alpha\vect{w}
\]
The matrix $P = [\vect{u}\;\vect{w}]$ is invertible (as $\vect{v}$ cannot be purely real or imaginary), and
\[
    AP = [\alpha\vect{u} - \beta\vect{w}\;\;\beta\vect{u} + \alpha\vect{w}] = P(\alpha I + \beta J)
\]
yielding the desired similarity.
\end{proof}

This \style{normal form} is the simplest possible presentation of a 2-by-2 matrix with complex eigenvalues. Based on how similar matrices behave under exponentiation, our strategy for exponentiating $A$ is to change coordinates, exponentiate $\alpha I + \beta J$, then change coordinates back. Euler's Theorem once again emerges.

\begin{lemma}[Euler's Theorem Redux]
\label{lem:euler}
\begin{equation}
\label{eq:Euler2x2}
    e^{(\alpha I + \beta J)t} = e^{\alpha t}(\cos(\beta t)I + \sin(\beta t)J)
\end{equation}
\end{lemma}
\begin{proof}
Since $I$ and $J$ commute, we have:
\[
    e^{(\alpha I + \beta J)t} = e^{\alpha t I + \beta t J} = e^{\alpha t I}e^{\beta t J}
\]
Having computed $e^{Jt}$ in Equation (\ref{eq:e^J}), we conclude:
\[
    e^{(\alpha I + \beta J)t} = e^{\alpha t}I(\cos(\beta t)I + \sin(\beta t)J) = e^{\alpha t}(\cos(\beta t)I + \sin(\beta t)J)
\]
as claimed.
\end{proof}

This decomposition reveals how complex eigenvalues generate spiraling motion  --  exponential growth or decay modulated by rotation.

% ==============================================
\section{Repeated Eigenvalues}
\label{sec:repeated}
% ==============================================

Not all transformations keep their eigenspaces distinct. When eigenvalues coincide, the corresponding eigenvectors may merge or proliferate in subtle ways. This collision of eigenspaces -- this failure of perfect separation -- leads to behavior richer than pure scaling, yet still simpler than complex oscillation. Understanding such behavior requires carefully distinguishing between how often an eigenvalue appears and how many independent directions it controls.

Consider the matrices
\[
    A_1 = \begin{bmatrix}2 & 0\\0 & 2\end{bmatrix}
    \quad\text{and}\quad
    A_2 = \begin{bmatrix}2 & 1\\0 & 2\end{bmatrix}
\]
Both have characteristic polynomial $(\lambda-2)^2$, yielding eigenvalue $\lambda=2$ with \style{algebraic multiplicity} 2. Yet these matrices behave quite differently: $A_1$ acts by pure scaling in all directions, while $A_2$ combines scaling with shear. This distinction emerges from the \style{geometric multiplicity} -- the dimension of the eigenspace $\ker(A-2I)$.

\begin{definition}[Eigenvalue Multiplicities]
\label{def:multiplicity}
The \style{algebraic multiplicity} of an eigenvalue is its multiplicity as a root of the characteristic polynomial.
The \style{geometric multiplicity} is the dimension of its eigenspace $\ker(A-\lambda I)$. 
\end{definition}
\begin{marginfigure}
    The geometric multiplicity never exceeds the algebraic multiplicity.
\end{marginfigure}
%
For $A_1$ above, both multiplicities equal $2$ -- every nonzero vector is an eigenvector with eigenvalue $2$. For $A_2$, the algebraic multiplicity is $2$ but the geometric multiplicity is only $1$, as only multiples of $(1,0)^T$ are eigenvectors. This deficiency of eigenvectors signals deeper structure that pure diagonalization cannot capture.

When geometric and algebraic multiplicities differ, eigenvectors alone cannot span the space. We require \style{generalized eigenvectors} -- vectors $\vect{w}$ satisfying $(A-\lambda I)^k\vect{w}=\vect{0}$ for some $k>1$. These vectors generate solutions involving polynomial terms multiplied by exponentials, as seen in our example above.

\begin{lemma}[Generalized Eigenspaces]
\label{lem:geneigen}
For eigenvalue $\lambda$, the sequence of subspaces
\[
    \ker(A-\lambda I) < \ker(A-\lambda I)^2 < \ker(A-\lambda I)^3 < \cdots
\]
stabilizes at dimension equal to the algebraic multiplicity of $\lambda$.
\end{lemma}
\begin{marginfigure}
{\em BONUS!} This nesting of generalized eigenspaces -- or any sequence of nested subspaces -- is called a \style{filtration}. Filtrations are important in approximating large complicated spaces by graded low-dimensional entities, {\em cf.} Taylor polynomials, Fourier series, etc.
\end{marginfigure}

\begin{example}[Generalized Eigenvectors]
For the matrix
\[
    A = \begin{bmatrix}2 & 1 & 0\\0 & 2 & 1\\0 & 0 & 2\end{bmatrix}
\]
the eigenvalue $\lambda=2$ has algebraic multiplicity $3$ but geometric multiplicity $1$. The chain of generalized eigenvectors
\[
    \vect{w}_1 = \begin{pmatrix}1\\0\\0\end{pmatrix}, \quad
    \vect{w}_2 = \begin{pmatrix}0\\1\\0\end{pmatrix}, \quad
    \vect{w}_3 = \begin{pmatrix}0\\0\\1\end{pmatrix}
\]
satisfies
\[
    (A-2I)\vect{w}_1 = \vect{0}, \quad
    (A-2I)\vect{w}_2 = \vect{w}_1, \quad
    (A-2I)\vect{w}_3 = \vect{w}_2
\]
\end{example}

This richer structure -- generalized eigenvectors forming chains of increasing length -- provides the key to understanding transformations with repeated eigenvalues. Though such transformations resist diagonalization, their behavior remains comprehensible through careful analysis of how the generalized eigenspaces interact. This understanding will prove crucial as we develop the complete theory of linear evolution in subsequent sections.

\begin{example}[Exponentiating a Simple Block]
Consider the matrix
\[
    R = \begin{bmatrix} 
    2 & 1 & 0 \\
    0 & 2 & 1 \\
    0 & 0 & 2
    \end{bmatrix}
\]
This matrix can be written as $R = 2I + N$ where $I$ is the identity matrix and
\[
    N = \begin{bmatrix}
    0 & 1 & 0 \\
    0 & 0 & 1 \\
    0 & 0 & 0
    \end{bmatrix}
\]
is \style{nilpotent}  --  meaning some power of $N$ equals zero. Indeed, $N^3=0$ in this case, though $N^2\neq 0$. To find $e^{Rt}$, we use an exponent rule:
\[
    e^{Rt} = e^{(2I + N)t} = e^{2It}e^{Nt} = e^{2t}e^{Nt}
\]
this being valid since $I$ commutes with any matrix. The challenge of computing $e^{Nt}$ is not so bad, thanks to the exponential series and nilpotency:
\[
\begin{array}{rcl}
    e^{Nt} &=& I + Nt + \frac{N^2t^2}{2!} + \frac{N^3t^3}{3!} + \cdots \\[1em]
    &=& I + Nt + \frac{N^2t^2}{2!}
\end{array}
\]
The series terminates since all higher powers of $N$ vanish. Computing $N^2$ and evaluating yields:
\[
    e^{Rt} 
    = e^{2t}e^{Nt}  
    = e^{2t}
    \begin{bmatrix}
    1 & t & \frac{t^2}{2} \\
    0 & 1 & t \\
    0 & 0 & 1
    \end{bmatrix}
    =
    \begin{bmatrix}
    e^{2t} & te^{2t} & \frac{t^2}{2}e^{2t} \\
    0 & e^{2t} & te^{2t} \\
    0 & 0 & e^{2t}
    \end{bmatrix}
\]
The solution reveals a remarkable structure: the exponential growth $e^{2t}$ is modulated by polynomial terms in $t$. This pattern  --  polynomial-exponential products  --  characterizes solutions to systems with repeated eigenvalues. The nilpotent part $N$ creates a cascade of influence up the superdiagonal, with each level inheriting the dynamics of those below it.
\end{example}

% ==============================================
\section{The Jordan Canonical Form}
\label{sec:jordan}
% ==============================================

The quest for simplest form leads us beyond diagonalization. When eigenvalues coincide or turn complex, pure scaling gives way to richer structure  --  a cascade of influence captured by the \style{Jordan canonical form}. This ultimate decomposition reveals the true anatomy of linear transformations, showing how general evolution emerges from an interplay of scaling, rotation, and generalized eigenvectors.

Our journey through repeated eigenvalues and polynomial-exponential solutions points toward a deeper unity. Recall that matrices with repeated eigenvalues may lack a full set of independent eigenvectors, requiring generalized eigenvectors to span the space. The previous example showed how such structure manifests in solutions  --  pure exponentials gain polynomial coefficients through a cascade of influence along chains of generalized eigenvectors. These patterns, seemingly special cases, in fact reveal the general form all linear transformations must take.

\begin{theorem}[Jordan Canonical Form]
\label{thm:jordan}
Every square matrix $A$ is similar to a block diagonal matrix $J$, called its \style{Jordan canonical form}:
\[
    A = VJV^{-1} = V\begin{bmatrix}
    J_1 & 0 & \cdots & 0 \\
    0 & J_2 & \cdots & 0 \\
    \vdots & \vdots & \ddots & \vdots \\
    0 & 0 & \cdots & J_m
    \end{bmatrix}V^{-1}
\]
where each \style{Jordan block} $J_i$ has one of two forms:
\begin{enumerate}
    \item For a real eigenvalue $\lambda$ of algebraic multiplicity $k$, the Jordan block is the $k$-by-$k$ matrix:
    \[
        J_i = \lambda I + N = \begin{bmatrix}
        \lambda & 1 & 0 & \cdots & 0 \\
        0 & \lambda & 1 & \cdots & 0 \\
        0 & 0 & \lambda & \ddots & \vdots \\
        \vdots & \vdots & \ddots & \ddots & 1 \\
        0 & 0 & \cdots & 0 & \lambda
        \end{bmatrix}
    \]
    \item For complex conjugate eigenvalues $\alpha \pm i\beta$ of algebraic muliplicity $k$, the Jordan block is the $2k$-by-$2k$ block matrix
\begin{marginfigure}
    This block structure mirrors that of the real case, with the superdiagonal $I$ matrices playing the role of the nilpotent matrix of ones, connecting a chain of $2\times 2$ complex blocks $C$. 
\end{marginfigure}
\[
    J_i = 
    \begin{bmatrix}
    C & I & 0 & \cdots & 0 \\
    0 & C & I & \cdots & 0 \\
    0 & 0 & C & \ddots & \vdots \\
    \vdots & \vdots & \ddots & \ddots & I \\
    0 & 0 & \cdots & 0 & C
    \end{bmatrix} 
    \quad : \quad 
    C = 
    \begin{bmatrix}
    \alpha & -\beta \\
    \beta & \alpha
    \end{bmatrix}
    \quad : \quad 
    I = 
    \begin{bmatrix}
    1 & 0 \\
    0 & 1
    \end{bmatrix}
\]
where each entry is a 2-by-2 block. 
\end{enumerate}
The form $J$ is unique up to the ordering of blocks.
\end{theorem}
%
\begin{marginfigure}
    It would have been simpler to use the complex-valued Jordan form; only one type of block is needed. Given our motivation in working with solutions to ODEs, we have chosen to keep things real.
\end{marginfigure}
Each Jordan block corresponds to an eigenvalue (real or complex) and its associated generalized eigenvectors. The size of the block equals the length of the longest chain of generalized eigenvectors for that eigenvalue. When all eigenvalues are real and distinct (or real but with independent eigenvectors), each block is $1\times 1$ and we recover the diagonal form of Chapter \ref{ch:7}.


The structure within each block illuminates the transformation's action. For real eigenvalues, the matrix $N$ having ones on the superdiagonal and zeros elsewhere is \style{nilpotent}  --  some power equals zero. This nilpotence explains the finite polynomial terms we saw emerge in solutions. For complex eigenvalues, the $2\times 2$ blocks encode rotation as seen in Section \ref{sec:complexeigs}.

\begin{example}[Jordan Structure]
The matrix
\[
    A = \begin{bmatrix}
    2 & 1 & 0 & 0 \\
    0 & 2 & 0 & 0 \\
    0 & 0 & 3 & 1 \\
    0 & 0 & 0 & 3
    \end{bmatrix}
\]
is already in Jordan form. It has two Jordan blocks: a $2\times 2$ block for eigenvalue $\lambda=2$ and another for $\lambda=3$. The generalized eigenvector chains have lengths 2 and 2 respectively. Solutions will involve terms like $te^{2t}$ and $te^{3t}$ from each block's nilpotent part.
\end{example}

The Jordan decomposition provides the key to understanding matrix powers and exponentials. Since $J$ is block diagonal, its exponential is also block diagonal:
\[
    e^{At} = Ve^{Jt}V^{-1} = V\begin{bmatrix}
    e^{J_1t} & 0 & \cdots & 0 \\
    0 & e^{J_2t} & \cdots & 0 \\
    \vdots & \vdots & \ddots & \vdots \\
    0 & 0 & \cdots & e^{J_kt}
    \end{bmatrix}V^{-1}
\]
For each real Jordan block $J_i = \lambda I + N$, we have:
\[
    e^{J_it} = e^{\lambda t}e^{Nt} = e^{\lambda t}\left(I + Nt + \frac{N^2t^2}{2!} + \cdots + \frac{N^{m-1}t^{m-1}}{(m-1)!}\right)
\]
where $m$ is the size of the block. The series terminates because $N^m=0$. For complex blocks, similar formulas emerge using the rotation matrices studied earlier.

\begin{example}[Full Jordan Decomposition]
Consider the matrix
\[
    A = \begin{bmatrix}
    7 & -4 & 4 \\
    4 & -1 & 4 \\
    0 & 0 & 3
    \end{bmatrix}
\]
Its characteristic polynomial $(\lambda-3)(\lambda-3)(\lambda-3)$ reveals eigenvalue $\lambda=3$ with algebraic multiplicity $3$. Computing generalized eigenvectors shows geometric multiplicity $1$, yielding Jordan form:
\[
    A = V\begin{bmatrix}
    3 & 1 & 0 \\
    0 & 3 & 1 \\
    0 & 0 & 3
    \end{bmatrix}V^{-1}
\]
where $V$ contains the generalized eigenvectors. The single Jordan block indicates all generalized eigenvectors form one chain. Solutions will involve terms $e^{3t}$, $te^{3t}$, and $t^2e^{3t}/2$.
\end{example}
\begin{marginfigure}
    {\em Nota bene:} computing the generalized eigenvector that effect the coordinate transformation $V$ is not always straightforward. See Example \ref{ex:jordansucks} for details.
\end{marginfigure}
The Jordan form reveals that every linear transformation, viewed in the right coordinates, acts through a combination of:
\begin{enumerate}
    \item Pure scaling (diagonal entries)
    \item Rotation (complex conjugate blocks)
    \item Cascading influence (superdiagonal ones)
\end{enumerate}
This decomposition provides the complete picture of linear evolution, unifying our earlier studies of eigenvalues, repeated roots, and complex behavior into a single coherent structure.

% ==============================================
\section{Finding the Jordan Form}
\label{sec:findjordan}
% ==============================================

The beautiful structure of Jordan form revealed in previous sections now confronts a practical challenge: given a matrix $A$, how does one actually compute the similarity transformation $V$ putting $A$ into Jordan form? Unlike eigenvalues, which emerge cleanly from characteristic polynomials, or eigenvectors found through nullspace computation, the Jordan structure demands careful investigation of how generalized eigenvectors chain together into a basis. Our task is to transform theoretical understanding into practical computation.

The process begins with eigenvalues and their multiplicities. For each eigenvalue $\lambda$:
\begin{enumerate}
    \item The algebraic multiplicity $m$ comes from the characteristic polynomial
    \item The geometric multiplicity $k = \dim\ker(A-\lambda I)$ counts independent eigenvectors
    \item The difference $m-k$ reveals how many generalized eigenvectors we need
\end{enumerate}
Yet these numbers alone tell only part of the story. The real work lies in discovering how generalized eigenvectors form chains that build our transformation matrix $V$.

Recall from Section \ref{sec:repeated} that each eigenvalue $\lambda$ generates a sequence of nested subspaces:
\[
    \ker(A-\lambda I) < \ker(A-\lambda I)^2 < \ker(A-\lambda I)^3 < \cdots
\]
This filtration stabilizes when it reaches dimension equal to $\lambda$'s algebraic multiplicity. The challenge lies in extracting chains of generalized eigenvectors from this nested structure. Each chain begins with an eigenvector $\vect{v}_1$ and builds through solving equations of form:
\[
    (A-\lambda I)\vect{v}_{i+1} = \vect{v}_i
\]
When multiple chains exist, we must carefully track their relationships to construct $V$ properly.

\begin{example}[Jordan Transformation]
\label{ex:jordansucks}
Consider the $5\times 5$ matrix:
\[
    A = \begin{bmatrix}
    3 & 2 & 1 & 0 & 2 \\
    0 & 3 & -1 & 0 & 3 \\
    0 & 0 & 3 & 0 & -4 \\
    0 & 0 & 0 & 2 & 0 \\
    0 & 0 & 0 & 0 & 2
    \end{bmatrix}
\]
The characteristic polynomial $(\lambda-3)^3(\lambda-2)^2$ reveals eigenvalues $\lambda_1=3$ with algebraic multiplicity 3 and $\lambda_2=2$ with algebraic multiplicity 2. Let us systematically construct the Jordan decomposition.

For $\lambda_1=3$, it is clear that $\ker(A-3I) = \spanset(1, 0, 0, 0, 0)^T$, showing geometric multiplicity $1$. Two more vectors are required to complete this chain. Solving
\[
    (A-3I)\vect{v}_2 = \begin{pmatrix}1\\0\\0\\0\\0\end{pmatrix}
    \quad\Rightarrow\quad
    \vect{v}_2 = \begin{pmatrix}a\\1/2\\0\\0\\0\end{pmatrix}
\]
(where $a$ is a free parameter) and then
\[
    (A-3I)\vect{v}_3 = \begin{pmatrix}a\\1/2\\0\\0\\0\end{pmatrix}
    \quad\Rightarrow\quad
    \vect{v}_3 = \begin{pmatrix}b\\a/2+1/4\\-1/2\\0\\0\end{pmatrix}
\]
(where $b$ is another free parameter) provides our first chain $\vect{v}_3\mapsto\vect{v}_2\mapsto\vect{v}_1\mapsto\vect{0}$. 

For $\lambda_2=2$, solving $(A-2I)\vect{v}=\vect{0}$ reveals $\lambda=2$ to have geometric and algebraic multiplicity equal to $2$, with independent eigenvectors
\[
    \vect{v}_4 = \begin{pmatrix}0\\0\\0\\1\\0\end{pmatrix}
    \quad : \quad
    \vect{v}_5 = \begin{pmatrix}-2\\0\\0\\0\\1\end{pmatrix}
\]
The transformation matrix $V$ must assemble these chains as columns in order of decreasing chain length:
\[
    V = [\vect{v}_3\;\vect{v}_2\;\vect{v}_1\;\vect{v}_4\;\vect{v}_5]
\]
One checks that $a=0$ and $b=1/12$ suffice in this example to produce the Jordan form:
\begin{marginfigure}
    {\em Ouch!} The term ``{\em one checks}'' is doing the heavy lifting here...
\end{marginfigure}
\[
    J = V^{-1}AV = \begin{bmatrix}
    3 & 1 & 0 & 0 & 0 \\
    0 & 3 & 1 & 0 & 0 \\
    0 & 0 & 3 & 0 & 0 \\
    0 & 0 & 0 & 2 & 0 \\
    0 & 0 & 0 & 0 & 2
    \end{bmatrix}
\]
The structure of $J$ reflects both the algebraic multiplicities of eigenvalues and the lengths of generalized eigenvector chains discovered in our systematic decomposition.
\end{example}

Several practical principles emerge from this computation:
\begin{enumerate}
    \item Work systematically through eigenvalues in order of decreasing algebraic multiplicity
    \item For each eigenvalue, find all independent eigenvectors first
    \item Build chains starting from eigenvectors not yet used in other chains
    \item Assemble $V$ by arranging chains in order of decreasing length
    \item Verify the result through direct computation of $AV$ and $VJ$
\end{enumerate}

Two common pitfalls deserve special attention. First, when solving $(A-\lambda I)\vect{v}_{i+1}=\vect{v}_i$ for generalized eigenvectors, solutions always exist but contain free parameters. Different choices can produce different-looking but equivalent Jordan bases. Second, constructing $V$ requires careful attention to column order  --  chains must be arranged to produce the correct block structure in $J$.

The practical computation of Jordan form thus unites the theoretical structures developed throughout this chapter. The filtration of generalized eigenspaces guides our search for chains, while careful basis construction transforms these chains into the similarity transformation $V$. Though the process requires more subtlety than mere eigenvalue computation, systematic application of these principles yields this deepest decomposition of linear transformations.

% ==============================================
\section{Computing Eigenvalues}
\label{sec:computing}
% ==============================================

While focusing on the Jordan form and its computation, a preliminary step remains: how do we compute the eigenvalues in practice? The characteristic polynomial is not so simple  --  matrices of even modest size yield equations that resist direct solution. Eigenvalues of large matrices prompt more sophisticated methods of extraction. There is a remarkable algorithm based on a different matrix factorization that transforms our theoretical understanding into practical computation through careful composition of fundamental operations.

Consider first why naive approaches fail. A 100-by-100 matrix yields a degree-100 polynomial, far beyond the reach of standard root-finding methods. Even computing the coefficients proves perilous  --  expanding $\det(A-\lambda I)$ requires astronomical numbers of operations with catastrophic error growth. Yet modern software computes eigenvalues of thousand-by-thousand matrices in seconds. This seeming paradox resolves through iteration rather than direct solution.

The algorithm's elegance emerges from similarity transformations  --  a theme encountered repeatedly in our study of eigenvalues. If matrices $A$ and $B$ are similar, they share eigenvalues while potentially offering different computational advantages. The QR algorithm exploits this principle through careful iteration:

\begin{definition}[QR Algorithm]
\label{def:qr-alg}
Given matrix $A_0=A$, the \style{QR algorithm} generates a sequence through:
\begin{enumerate}
    \item Factor $A_k = Q_kR_k$ where $Q_k$ is orthogonal and $R_k$ is upper triangular
    \item Form $A_{k+1} = R_kQ_k$ (reverse multiply)
\end{enumerate}
Each iterate maintains similarity: $A_{k+1} = Q_k^TA_kQ_k$.
\end{definition}

Though simple in description, this process harbors remarkable properties. The sequence $\{A_k\}$ converges to upper triangular form, with eigenvalues appearing on the diagonal. Complex conjugate pairs emerge naturally as 2-by-2 blocks, while repeated eigenvalues maintain their multiplicities. The algorithm effectively performs simultaneous power iteration on all eigenspaces, yet with crucial numerical stability through orthogonal transformations.

\begin{example}[Simple Iteration]
\label{ex:qriter}
Consider the matrix
\[
    A_0 = \begin{bmatrix}
    2 & 1 & 0 \\
    1 & 2 & 1 \\
    0 & 1 & 2
    \end{bmatrix}
\]
After five iterations we find:
\[
    A_5 \approx \begin{bmatrix}
    3.414 & 0.892 & 0.247 \\
    0.000 & 2.000 & 0.731 \\
    0.000 & 0.000 & 0.586
    \end{bmatrix}
\]
The diagonal entries converge to eigenvalues approximately 3.414, 2.000, and 0.586  --  roots of the characteristic polynomial $\lambda^3-6\lambda^2+11\lambda-6$ that we never explicitly computed.
\end{example}

For matrices with complex eigenvalues, 2-by-2 blocks emerge naturally on the diagonal:
\[
    \begin{bmatrix}
    a & b \\
    -b & a
    \end{bmatrix}
    \quad\text{representing}\quad
    \lambda = a \pm ib
\]
This structure allows computation entirely in real arithmetic while still capturing complex behavior  --  a principle we first encountered when studying rotations in Section \ref{sec:complexeigs}.

Modern implementations enhance this basic iteration through several refinements:
\begin{enumerate}
    \item Shifts accelerate convergence by focusing iteration near likely eigenvalues
    \item Implicit updates maintain structure while reducing operation count
    \item Deflation techniques isolate converged eigenvalues for efficient processing
\end{enumerate}
\begin{marginfigure}
{\em Example:} The shifted QR algorithm subtracts an estimate $\mu$ of an eigenvalue, iterates, then adds $\mu$ back  --  dramatically improving convergence when $\mu$ is well-chosen.
\end{marginfigure}

The algorithm's power emerges from its respect for matrix structure. Unlike characteristic polynomial computation, which discards geometry for pure algebra, QR iteration maintains crucial relationships:
\begin{itemize}
    \item Orthogonal transformations preserve eigenvalues exactly
    \item Upper triangular form reveals eigenvalue approximations directly
    \item Similar matrices retain Jordan structure though it may be obscured
\end{itemize}

Though we lose explicit access to Jordan structure in the numerical process, the eigenvalues themselves emerge with remarkable accuracy. This balance  --  between theoretical understanding and practical computation  --  exemplifies how seemingly abstract concepts guide the development of efficient algorithms. The ideas that illuminated eigenvalue structure theoretically provide the tools for finding them in practice.

% ==============================================
\section{Back to Basis}
\label{sec:basisODEs}
% ==============================================

Our development of Jordan form illuminates the solution structure of higher-order linear differential equations. The abstract machinery of generalized eigenvectors and complex blocks translates directly into concrete solution patterns  --  polynomial terms multiplying exponentials, trigonometric functions emerging from complex pairs. This understanding completes our picture of basis solutions begun in Chapter \ref{ch:7}, revealing why and how such patterns must arise.

Consider the general linear homogeneous differential equation of order $n$ with degree $n$ characteristic polynomial $p\in\poly_n$:
\[
    p(D)x = (D^n + a_{n-1}D^{n-1} + \cdots + a_1D + a_0)x = 0
\]
When the roots of $p(\lambda)=0$ are repeated, the companion matrix falls into Jordan blocks. The basis solutions emerge directly from the first row of the matrix exponential $e^{Jt}$ computed in Section \ref{sec:jordan}. For a Jordan block of size $k$ with eigenvalue $\lambda$, we found:
\[
    e^{Jt} = e^{\lambda t}e^{Nt} = e^{\lambda t}\left(I + Nt + \frac{N^2t^2}{2!} + \cdots + \frac{N^{k-1}t^{k-1}}{(k-1)!}\right)
\]
The first row of this matrix yields our fundamental solutions:

\begin{theorem}[Basis Solutions for Repeated Roots]
\label{thm:repeatedroots}
Let $\lambda$ be a root of the characteristic equation $p(\lambda)=0$ with algebraic multiplicity $k$. Then:
\begin{enumerate}
    \item For real $\lambda$, the functions
    \[
        \phi_j(t) = t^je^{\lambda t}, \quad j=0,1,\ldots,k-1
    \]
    form a basis for the solution space corresponding to $\lambda$.
    \item For complex $\lambda = \alpha \pm i\beta$, the functions
    \[
        \phi_j(t) = t^je^{\alpha t}\cos(\beta t), \quad \psi_j(t) = t^je^{\alpha t}\sin(\beta t), \quad j=0,1,\ldots,k-1
    \]
    form a real basis for the solution space corresponding to the conjugate pair.
\end{enumerate}
\end{theorem}
These solutions arise directly from the first row of the exponentiated Jordan blocks computed in Section \ref{sec:jordan}.

\begin{example}[Repeated Real Root]
The equation
\[
    \frac{d^3x}{dt^3} - 3\frac{d^2x}{dt^2} + 3\frac{dx}{dt} - x = 0
\]
has characteristic polynomial $(\lambda-1)^3=0$. The basis solutions are
\[
    \phi_0(t) = e^t, \quad \phi_1(t) = te^t, \quad \phi_2(t) = \frac{1}{2!}t^2 e^t
\]
matching exactly the patterns seen in the Jordan form of its companion matrix. These emerge from the first row of the matrix exponential computed in our earlier example of a $3\times 3$ Jordan block. The general solution is their linear combination:
\[
    x(t) = c_0\phi_0(t) + c_1\phi_1(t) + c_2\phi_2(t)
\]
\end{example}


\begin{example}[Damped Oscillator]
The equation
\begin{equation}
\label{eq:osc}    
    \frac{d^2x}{dt^2} + 2\gamma\frac{dx}{dt} + \omega^2x = 0
\end{equation}
with $0<\gamma<\omega$ (underdamped case) has characteristic equation
\[
    \lambda^2 + 2\gamma\lambda + \omega^2 = 0
\]
with roots $\lambda = -\gamma \pm i\sqrt{\omega^2-\gamma^2}$. The solution
\[
    x(t) = e^{-\gamma t}
    \left(c_1\cos(\sqrt{\omega^2-\gamma^2}\,t) + c_2\sin(\sqrt{\omega^2-\gamma^2}\,t)\right)
\]
exhibits damped oscillations  --  exponential decay at rate $\gamma$ modulating oscillations of frequency $\sqrt{\omega^2-\gamma^2}$.
\end{example}

\begin{example}[Critical Damping]
In equation (\ref{eq:osc}), if we set the damping to the precise value $\gamma=\omega$, this represents critical damping when the damping coefficient exactly balances the natural frequency. Its characteristic polynomial
\begin{marginfigure}
    {\em Example:} In mechanical systems, critical damping represents the fastest return to equilibrium without oscillation -- all eigenvalues coincide at the negative point maximizing decay.
\end{marginfigure}
\[
    \lambda^2 + 2\omega\lambda + \omega^2 = (\lambda+\omega)^2
\]
has repeated root $\lambda=-\omega$. The basis solutions are now
\[
    \phi_1(t) = e^{-\omega t} 
    \quad\text{and}\quad 
    \phi_2(t) = te^{-\omega t}
\]
The transition between underdamped oscillation and pure decay manifests in this borderline case  --  the polynomial term $t$ replacing trigonometric functions as the roots merge.
\end{example}

\begin{example}[Repeated Complex Roots]
Consider the fourth-order equation
\[
\frac{d^4x}{dt^4} + 2\frac{d^2x}{dt^2} + x = 0
\]
whose characteristic polynomial $\lambda^4 + 2\lambda^2 + 1 = (\lambda^2+1)^2$ has root $\lambda=i$ with multiplicity 2. From the complex solutions $t^je^{it}$, we obtain four real basis solutions:
\[
    \phi_1(t) = \cos(t), \quad \phi_2(t) = \sin(t), \quad \phi_3(t) = t\cos(t), \quad \phi_4(t) = t\sin(t)
\]
These emerge directly from the first row of the exponentiated complex Jordan block, with the polynomial coefficients modulating pure sinusoidal motion to create more intricate oscillatory patterns.
\end{example}

These examples illustrate the perfect correspondence between Jordan structure and scalar solutions. The nilpotent part of each Jordan block manifests as powers of $t$, while complex blocks generate the trigonometric functions essential to modeling oscillation. What appeared first as abstract matrix theory reveals itself as the natural language for describing physical motion.

The complete solution always emerges by combining the contributions from each Jordan block:
\begin{itemize}
    \item Real eigenvalues yield exponential decay or growth
    \item Complex pairs generate oscillatory terms
    \item Repeated roots add polynomial factors
\end{itemize}
This decomposition  --  splitting motion into its fundamental modes  --  provides both theoretical insight and practical methods for understanding linear systems.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Power Grid Stability}
\label{EM:powergrid}
% **************** EMANATION *******************

The stability of electrical power grids provides a compelling demonstration of Jordan canonical form in practice. A modern power network consists of hundreds of generators linked by transmission lines, each machine contributing to a delicate dance of synchronized motion. When this dance falters, blackouts can cascade across continents. The mathematics of Chapter 8 reveals precisely how such instabilities develop through analysis of the network's Jordan structure.

Consider first a simple case: two generators connected by a transmission line. Each generator has an angle $\theta_i$ relative to the grid's nominal frequency. Small deviations from equilibrium follow linearized dynamics:
\[
    \begin{bmatrix}
    m_1 & 0 \\
    0 & m_2
    \end{bmatrix}
    \begin{pmatrix}\ddot{\theta}_1\\\ddot{\theta}_2\end{pmatrix} +
    \begin{bmatrix}
    d_1 & 0 \\
    0 & d_2
    \end{bmatrix}
    \begin{pmatrix}\dot{\theta}_1\\\dot{\theta}_2\end{pmatrix} +
    \begin{bmatrix}
    k & -k \\
    -k & k
    \end{bmatrix}
    \begin{pmatrix}\theta_1\\\theta_2\end{pmatrix} = \vect{0}
\]
where $m_i$ represents generator inertia, $d_i$ damping, and $k$ the transmission line coupling. Converting to first-order form by introducing phase angles and frequencies as state variables yields a $4\times 4$ system matrix $A$.

\begin{marginfigure}
{\em Historical Note:} The 2003 Northeast blackout, affecting 50 million people, began when overloaded transmission lines sagged into trees. The resulting loss of synchronization demonstrated dramatically how local instabilities can propagate through grid structures.
\end{marginfigure}

The crucial feature appears when generators have identical parameters: $m_1=m_2=m$ and $d_1=d_2=d$. Then $A$ develops eigenvalues of algebraic multiplicity two:
\[
    A = \begin{bmatrix}
    0 & 0 & 1 & 0 \\
    0 & 0 & 0 & 1 \\
    -k/m & k/m & -d/m & 0 \\
    k/m & -k/m & 0 & -d/m
    \end{bmatrix}
\]
Direct computation shows these eigenvalues come in pairs:
\[
    \lambda_{1,2} = -\frac{d}{2m} \pm i\sqrt{\frac{2k}{m}-\frac{d^2}{4m^2}}
\]
each with algebraic multiplicity 2 but geometric multiplicity 1. The physical meaning is clear: identical generators can oscillate in phase or anti-phase, with complex eigenvalues indicating oscillatory motion.

Following Section \ref{sec:jordan}, this matrix has Jordan canonical form:
\[
    A = PJP^{-1}
\]
where $J$ takes real Jordan form with two $2\times 2$ blocks:
\[
    J = \begin{bmatrix}
    C & I_2 \\
    0 & C
    \end{bmatrix}, \quad
    C = \begin{bmatrix}
    -d/(2m) & -\omega \\
    \omega & -d/(2m)
    \end{bmatrix}
\]
where $\omega = \sqrt{2k/m-d^2/(4m^2)}$ gives the oscillation frequency. This real form, as developed in Section \ref{sec:jordan}, reveals both the damped oscillation through $C$ and the coupling between modes through $I_2$.

The appearance of nontrivial Jordan structure signals subtle instability: though the system appears symmetric, small perturbations can excite growing oscillations as generators lose synchronization. Grid operators must carefully monitor these near-resonant conditions where eigenvalues coalesce.

\begin{marginfigure}
{\em Example:} Modern grid control systems continuously assess stability margins using a combination of model-based analysis and real-time measurements. When these assessments, which are related to the system's eigenvalues, indicate a reduction in stability margins or a risk of growing oscillations, operators receive alerts to adjust system conditions.
\end{marginfigure}

Real power networks involve hundreds of generators, leading to stability matrices of enormous dimension. Computing their Jordan structure demands the careful numerical methods developed in Section \ref{sec:computing}. The QR algorithms prove essential, as direct eigenvalue computation becomes unstable precisely when Jordan blocks begin to form --- exactly the dangerous regime operators must monitor.

Modern grid analysis employs sophisticated variants of these methods. Rather than compute full Jordan decompositions (which prove numerically delicate), operators track the migration of eigenvalues through the complex plane using the robust iteration schemes of Section \ref{sec:algorithms}. When eigenvalues approach coalescence, indicating incipient Jordan block formation, the network can be reconfigured to maintain stability.

The key insight is that Jordan structure in power networks indicates more than mere mathematical curiosity --- it signals physical conditions requiring immediate attention. Each identity superdiagonal in a Jordan block represents a path for instability to propagate between generators. Understanding this structure proves crucial for preventing cascading failures.

Grid stability thus exemplifies how the mathematics of Jordan canonical form shapes critical infrastructure. The delicate interplay between complex eigenvalues and geometric multiplicity, seemingly abstract in Section \ref{sec:repeated}, manifests physically in the synchronized dance of massive generators. Modern grid reliability depends fundamentally on the careful numerical computation of this structure.

\begin{marginfigure}
{\em Nota bene:} Unlike mechanical resonance, power grid instability through Jordan block formation can occur without external forcing --- a sobering reminder of how mathematical structure alone can precipitate physical disaster.
\end{marginfigure}

The profound lesson is that apparently identical components --- like generators with matching parameters --- create subtle dangers through their identical eigenvalues. The Jordan canonical form reveals precisely how such symmetry enables instability to propagate. Every major power grid control center now employs these mathematical tools, developed in this chapter, to maintain the reliable electricity supply on which modern society depends.



% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Chemical Process Network Analysis}
\label{EM:chemical}
% **************** EMANATION *******************

The identification of weakly coupled subsystems within large chemical processing networks demonstrates another powerful application of Jordan canonical form analysis. Modern chemical plants often combine dozens of reaction vessels, heat exchangers, and separation units into complex networks where material and energy flows create subtle couplings between processes. Understanding these couplings -- particularly which processes can be treated as nearly independent -- proves crucial for both safety and efficient control.

Consider a chemical plant with six coupled reaction vessels. In each vessel $i$, both the concentration $c_i(t)$ of a key reactant and the temperature $T_i(t)$ evolve according to coupled differential equations. Material flows between vessels create concentration coupling, while shared cooling systems introduce thermal coupling. Let $\vect{x}=(c_1,T_1,\ldots,c_6,T_6)^T$ represent the complete state. Under standard assumptions of mass action kinetics and Newtonian cooling, the system takes the form:
\[
\frac{d\vect{x}}{dt} = A\vect{x}
\]

For a typical petrochemical process operating near steady state, we might find coupling matrix:
\[
A = \begin{bmatrix}
A_{11} & A_{12} & \varepsilon_{13} \\
A_{21} & A_{22} & 0 \\
\varepsilon_{31} & 0 & A_{33}
\end{bmatrix}
\]
where each $A_{ii}$ is a $4\times 4$ block describing two coupled vessels, $A_{ij}$ represents stronger coupling between adjacent units, and $\varepsilon_{ij}$ indicates weak coupling through shared utilities. Taking realistic values for an ethylene oxide production unit:
\[
A_{11} = \begin{bmatrix}
-2.1 & 0.8 & 0.5 & 0.1 \\
0.6 & -1.7 & 0.1 & 0.4 \\
0.5 & 0.1 & -1.9 & 0.7 \\
0.1 & 0.4 & 0.5 & -1.6
\end{bmatrix}
\]
with similar structure for $A_{22}$ and $A_{33}$, while the coupling terms have entries of order 0.1 or 0.01 for strong and weak coupling respectively.

\begin{marginfigure}
{\em Example:} In ethylene oxide production, reactor temperatures typically range from 200-300°C, making thermal coupling through shared cooling systems a significant safety consideration.
\end{marginfigure}

Computing eigenvalues through the methods of Section 8.5 reveals three distinct clusters, each parameterized by the coupling strength $\varepsilon$:
\[
\lambda_1 = -2.2 + O(\varepsilon), \quad \lambda_2 = -2.2 + O(\varepsilon)
\]
\[
\lambda_3,\lambda_4,\lambda_5 = -1.8 + O(\varepsilon)
\]
\[
\lambda_6,\ldots,\lambda_{12} = -1.5 + O(\varepsilon)
\]

These clustered eigenvalues suggest near-independence between process units. The Jordan decomposition confirms this structure:
\[
J = \begin{bmatrix}
J_1 & \varepsilon E_{12} & \varepsilon E_{13} \\
\varepsilon E_{21} & J_2 & \varepsilon E_{23} \\
\varepsilon E_{31} & \varepsilon E_{32} & J_3
\end{bmatrix}
\]
where each $J_i$ corresponds to a cluster of eigenvalues and each $E_{ij}$ is a matrix with entries of order unity, so the off-diagonal blocks contain terms of order $\varepsilon$.

This decomposition has profound implications for process control and safety:
\begin{enumerate}
    \item Each process train can be controlled quasi-independently
    \item Disturbances remain largely confined within blocks
    \item Critical failure modes align with Jordan block structure
    \item Sensor placement should respect subsystem boundaries
\end{enumerate}

The generalized eigenvectors associated with each Jordan block reveal exactly how disturbances propagate between units. For example, a temperature excursion in vessel 1 affects its paired vessel 2 strongly but influences the other process trains only weakly through the $O(\varepsilon)$ terms. This understanding proves crucial for safety system design.

\begin{marginfigure}
{\em Historical Note:} Major chemical accidents like the 1974 Flixborough disaster have highlighted the importance of understanding how process disturbances can propagate through weakly coupled systems.
\end{marginfigure}

Modern chemical plant design explicitly considers this block structure. Critical processes are grouped to minimize coupling with other units, while safety systems are designed around identified subsystem boundaries. The Jordan canonical form provides the mathematical framework for quantifying these design decisions.

The computational methods of Section 8.5 prove essential for large plants, where direct eigenvalue calculation becomes numerically unstable precisely when identifying weak coupling is most important. The QR iteration reliably reveals clustered eigenvalues even when direct methods fail.

This chemical engineering application complements our flutter analysis by showing how Jordan structure illuminates weakly coupled systems. Where aircraft exhibit exact repeated eigenvalues from physical symmetries, chemical processes display near-repetition from weak coupling. Together they demonstrate how the mathematical framework developed in this chapter guides both analysis and design of complex engineered systems.
% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 8}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}

\item The matrix
\[
    A = \begin{bmatrix}
    5 & -12 \\
    4 & 5
    \end{bmatrix}
\]
has complex eigenvalues. Find them and write $e^{At}$ in terms of trigonometric functions. Describe geometrically what this matrix does to vectors in $\R^2$ as $t$ increases.

\item Find the eigenvalues and generalized eigenvectors of
\[
    A = \begin{bmatrix}
    3 & 1 & 0 & 0 \\
    0 & 3 & 1 & 0 \\
    0 & 0 & 3 & 1 \\
    0 & 0 & 0 & 3
    \end{bmatrix}
\]
Then use them to compute $e^{At}$. Verify your answer by checking that it satisfies both $(d/dt)e^{At} = Ae^{At}$ and $e^{A0} = I$.

\item Consider the $5\times 5$ matrix
\[
    A = \begin{bmatrix}
    3 & 1 & 0 & 0 & 0 \\
    0 & 3 & 0 & 0 & 0 \\
    0 & 0 & -1 & 1 & 0 \\
    0 & 0 & 0 & -1 & 1 \\
    0 & 0 & 0 & 0 & -1
    \end{bmatrix}
\]
Find its Jordan canonical form and compute $e^{At}$. Which solutions grow as $t\to\infty$?

\item For the system
\[
    \frac{d}{dt}\begin{pmatrix}x\\y\\z\\w\end{pmatrix} = 
    \begin{bmatrix}
    -1 & 1 & 0 & 0 \\
    0 & -1 & 0 & 0 \\
    0 & 0 & -1 & 1 \\
    0 & 0 & 0 & -1
    \end{bmatrix}
    \begin{pmatrix}x\\y\\z\\w\end{pmatrix}
\]
find all solutions that remain bounded as $t\to\infty$. Show that these form a subspace and find its dimension.

\item Consider the matrix
\[
    A = \begin{bmatrix}
    -1 & -2 & 1 & 0 & 0 \\
    2 & -1 & 0 & 1 & 0 \\
    0 & 0 & 4 & 1 & 0 \\
    0 & 0 & 0 & 4 & 1 \\
    0 & 0 & 0 & 0 & 4
    \end{bmatrix}
\]
Find its Jordan canonical form and a matrix $V$ such that $V^{-1}AV$ gives this form. What does $e^{At}$ look like?

\item For the fourth-order equation
\[
    \frac{d^4x}{dt^4} + 4\frac{d^2x}{dt^2} + 4x = 0
\]
find all linearly independent solutions. (Hint: the characteristic polynomial factors as $(\lambda^2+2)^2$).

\item Consider the matrices
\[
    A = \begin{bmatrix}
    1 & 1 & 0 & 0 & 0 & 0 \\
    0 & 1 & 0 & 0 & 0 & 0 \\
    0 & 0 & 1 & 1 & 0 & 0 \\
    0 & 0 & 0 & 1 & 1 & 0 \\
    0 & 0 & 0 & 0 & 1 & 0 \\
    0 & 0 & 0 & 0 & 0 & 1
    \end{bmatrix}
    \quad\text{and}\quad
    B = \begin{bmatrix}
    1 & 1 & 0 & 0 & 0 & 0 \\
    0 & 1 & 1 & 0 & 0 & 0 \\
    0 & 0 & 1 & 0 & 0 & 0 \\
    0 & 0 & 0 & 1 & 1 & 0 \\
    0 & 0 & 0 & -1 & 1 & 0 \\
    0 & 0 & 0 & 0 & 0 & 1
    \end{bmatrix}
\]
For each matrix: list all eigenvalues with their algebraic and geometric multiplicities and identify the Jordan blocks and their sizes; then, compute $e^{At}$ and $e^{Bt}$.

\item Let $A$ be a $4\times 4$ matrix with characteristic polynomial $p(\lambda) = (\lambda-5)^4$. How many different Jordan canonical forms are possible for $A$? Draw diagrams showing all possibilities.

\item For a real $4\times 4$ matrix $A$ with complex conjugate pairs of eigenvalues $1\pm 2i$ and $-3\pm i$, what are the possible Jordan canonical forms? Draw diagrams showing all possibilities.

\item Let
\[
    A = \begin{bmatrix}
    1 & -4 & 2 & 0 \\
    1 & 1 & 0 & 2 \\
    0 & 0 & 3 & 1 \\
    0 & 0 & 0 & 3
    \end{bmatrix}
\]
Find all possible Jordan canonical forms consistent with its eigenvalues. Which is correct? Explain how to determine this without computing $V$.

\item Consider the matrix
\[
    A = \begin{bmatrix}
    3 & -4 & 1 & 0 & 0 \\
    4 & 3 & 0 & 1 & 0 \\
    0 & 0 & -2 & 1 & 0 \\
    0 & 0 & 0 & -2 & 1 \\
    0 & 0 & 0 & 0 & -2
    \end{bmatrix}
\]
Find its Jordan canonical form. What is the minimal polynomial of $A$?

\item Show that for any nilpotent matrix $N$ of size $n\times n$, the series
\[
    I + tN + \frac{t^2}{2!}N^2 + \frac{t^3}{3!}N^3 + \cdots + \frac{t^{n-1}}{(n-1)!}N^{n-1}
\]
gives $e^{Nt}$ exactly. Use this to find $e^{Nt}$ for the $4\times 4$ nilpotent matrix $N$ with ones on the superdiagonal.

\item Let $A$ be a real $n\times n$ matrix. Prove that if $\lambda$ is an eigenvalue of $A$ with algebraic multiplicity $m$, then the solutions to $(A-\lambda I)^m\vect{x}=\vect{0}$ form a subspace of dimension exactly $m$.

\item For a $3\times 3$ matrix $A$ with repeated eigenvalue $\lambda$, prove that there are exactly three possible Jordan canonical forms. Draw diagrams showing the generalized eigenvector chains in each case.

\item A $4\times 4$ matrix $A$ has characteristic polynomial $(\lambda+1)^2(\lambda-3)(\lambda+2)$. If $\ker(A+I)$ is one-dimensional, what must the Jordan canonical form of $A$ be? Justify your answer.

\item For an $n\times n$ matrix $A$ with eigenvalues $\lambda_1,\ldots,\lambda_n$ (counted with multiplicity), prove that $\trace(e^{At}) = \sum_{k=1}^n e^{\lambda_kt}$. Use this to show that if all eigenvalues have negative real part, then $\trace(e^{At})\to 0$ as $t\to\infty$.

\item Let $A$ be a real $2\times 2$ matrix with complex eigenvalues $a\pm bi$ where $b\neq 0$. Prove that $A$ cannot be written as a product of two real matrices with real eigenvalues. (Hint: consider what happens to eigenvalues under matrix multiplication).

\item Let $A$ be a real $4\times 4$ matrix with eigenvalues $3\pm 4i$ and $-2\pm i$. What form must
\[
    \lim_{t\to\infty} \frac{e^{At}}{e^{3t}}
\]
take? Justify your answer.

\item Show that for any $n\times n$ matrix $A$, the nilpotent part $N$ in its Jordan decomposition satisfies $N^n=0$. Give an example of a $5\times 5$ nilpotent matrix $N$ where $N^4\neq 0$ but $N^5=0$.

\item Use Cayley-Hamilton to prove that if $A$ is nilpotent (meaning some power equals zero) then its characteristic polynomial must be $p(\lambda) = \lambda^n$ where $n$ is the size of the matrix. What does this tell you about the eigenvalues of nilpotent matrices?
\begin{marginfigure}
    See the Chapter 7 exercises for the Cayley-Hamilton Theorem.
\end{marginfigure}

\item Let $A$ be an $n\times n$ matrix. Prove that the minimal polynomial of $A$ (the monic polynomial of least degree that annihilates $A$) must divide the characteristic polynomial. How does this relate to the Jordan canonical form?

\end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Linear Iterative Systems}
\label{ch:9}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``the sea of Time \& Space beat round the Rock in mighty waves''}

\newthought{Power begets power} in the iteration of linear transformations. Each application of a matrix to a vector shifts weight toward dominant directions, channeling through preferred pathways until some natural balance emerges. This fundamental process  --  the convergence of repeated transformation toward dominant modes  --  pervades both theory and computation. Though simpler than the continuous flows and Jordan structures of previous chapters, these iterative systems harbor their own subtle beauty in the delicate interplay between matrix structure and asymptotic behavior.

The central idea lies in the connection between matrix powers and eigenvalues. In the cleanest case, iterating a linear transformation leads to one eigenvalue dominating the results, dragging all vectors toward its eigendirection. This process distills a matrix to its essential axis. The power method for finding largest eigenvalues emerges naturally from the iteration itself, transforming a theoretical insight into a computational tool.

Yet not all matrices yield so readily to analysis. Special structures  --  positivity constraints, probability conservation, symmetry  --  all complicate and enlighten our study. The Perron-Frobenius theory reveals how positivity forces uniqueness of dominant directions. Stochastic matrices preserve total measure while driving systems toward equilibrium states. Graph matrices encode discrete topology through their spectrum. Each class brings its own spectral features that shape long-term behavior.

Our development moves from the abstract to the concrete, from general iteration to structured classes of matrices whose special properties illuminate both theory and application. The patterns we uncover extend naturally from finite to infinite dimensions, from discrete steps to continuous limits. Throughout, we maintain the perspective that iteration provides both theoretical insight and practical power  --  each application of a transformation bringing us closer to understanding its fundamental nature.

% ==============================================
\section{At First Iteration}
\label{sec:firstiteration}
% ==============================================

The Fibonacci sequence provides a first glimpse of the mysteries inherent in linear iteration. Each number, the sum of its two predecessors, generates the sequence
\[
    0,1,1,2,3,5,8,13,21,34,55,89,144,\ldots
\]
More striking than the numbers themselves is a hidden pattern: the ratio of consecutive terms approaches a fixed value $\varphi=(1+\sqrt{5})/2\approx 1.618$. This convergence of ratios hints at deeper structure within linear recurrence relations.
\begin{marginfigure}
    Yes, this is the so-called \style{golden ratio}. No, it is not here because it is ubiquitous in art \& nature. It appears here because it is a dominant eigenvalue: keep reading.
\end{marginfigure}
The Fibonacci rule $x_{n+2} = x_{n+1} + x_n$ represents a second-order recurrence. Just as we transformed second-order differential equations to first-order systems in Chapter \ref{ch:7}, we can recast this scalar sequence as a vector iteration. Setting
\[
    \vect{v}(n) = \begin{pmatrix}x_{n}\\x_{n+1}\end{pmatrix}
\]
transforms the recurrence into
\[
    \vect{v}(n+1) = \begin{bmatrix}0 & 1\\1 & 1\end{bmatrix}\vect{v}(n)
\]
This first-order vector system captures the same evolution through matrix multiplication. Its solution is clear if not clearly computable:
\[
    \vect{v}(n) = A^n\vect{v}(0) = A^n\begin{pmatrix}
        1 \\ 1
    \end{pmatrix}
\]

More generally, a discrete-time linear system evolving has form and solution
\begin{equation}
\label{eq:iteration}
    \vect{x}(n+1) = A\vect{x}(n)
    \quad \Rightarrow \quad
    \vect{x}(n) = A^n\vect{x}(0)
\end{equation}
where $A$ encodes the rules of evolution. Just as differential equations $d\vect{x}/dt = A\vect{x}$ generate continuous flow through infinitesimal change, recurrence relations  (\ref{eq:iteration}) create discrete trajectories through stepped iteration.

This discrete framework finds natural application in economic systems, where regular time periods (quarters, years) impose inherent granularity. Consider an economy divided into $n$ sectors, each producing goods consumed partly by other sectors and partly as final output. The \style{input-output matrix} $A=[a_{ij}]$ encodes production requirements: entry $a_{ij}$ represents the amount of sector $i$'s output needed to produce one unit of sector $j$'s output. The evolution of such a system follows equation (\ref{eq:iteration}), where $\vect{x}(n)$ represents sector outputs in period $n$. 
\begin{marginfigure}
{\em Historical Note:} This type of economic model is called a \style{Leontief input-output model}, after W. Leontief who developed it in the 1930s through detailed study of the American economy. His work on structural interdependence in production earned him the 1973 Nobel Prize in Economics. Though originally developed using data laboriously collected by hand, such models now inform economic planning worldwide through automated data collection and computation.
\end{marginfigure}
%
\begin{example}[Six-Sector Economy]
\label{ex:Leontief}
Consider an economy with six primary sectors:
\begin{enumerate}
    \item Agriculture (food production)
    \item Energy (power generation)
    \item Manufacturing (industrial goods)
    \item Transportation (logistics)
    \item Services (business/consumer)
    \item Technology (IT/communications)
\end{enumerate}
The input-output matrix $A$ captures their interdependencies:
\[
    A = \begin{bmatrix}
    0.15 & 0.08 & 0.10 & 0.05 & 0.20 & 0.05 \\
    0.25 & 0.30 & 0.35 & 0.40 & 0.20 & 0.30 \\
    0.10 & 0.15 & 0.20 & 0.25 & 0.10 & 0.20 \\
    0.15 & 0.12 & 0.15 & 0.15 & 0.10 & 0.08 \\
    0.20 & 0.25 & 0.15 & 0.15 & 0.25 & 0.30 \\
    0.15 & 0.15 & 0.20 & 0.15 & 0.25 & 0.20
    \end{bmatrix}
\]
Each column represents input requirements for one unit of sector output. For instance, producing one unit of manufacturing output (column 3) requires 0.10 units of agricultural input, 0.35 units of energy, and so forth. The columns sum to about 0.95 as they exclude labor and other external factors.

A remarkable pattern emerges under iteration. Regardless of initial conditions, the relative sizes of sectors converge to fixed proportions approximately equal to $(0.22, 0.62, 0.34, 0.25, 0.48, 0.39)^T$, while the overall economy grows by roughly $1.09$ (or $9\%$) each period. This means that after sufficient time, the energy sector (2) stabilizes at almost $2.8$ times the size of the agricultural sector (1), the service sector (5) maintains about $1.2$ times the size of the technology sector (6), and so forth  --  despite widely varying initial conditions and complex interconnections.
\end{example}
%
\begin{marginfigure}
{\em Foreshadowing:} The uniform growth rate emerging from Leontief models previews the Perron-Frobenius theory of positive matrices, where special structure ensures uniqueness of a dominant positive eigenvalue.
\end{marginfigure}
%
This tension  --  between the apparent complexity of sector-wise interaction and the simplicity of asymptotic behavior  --  exemplifies a broader principle in linear systems. Complex coupling through multiple variables often reduces to simpler patterns driven by dominant modes. Our task in this chapter is to understand this reduction, revealing how eigenstructure shapes the long-term character of linear iteration.

% ==============================================
\section{Dominance \& Convergence}
\label{sec:dominant}
% ==============================================

The mysterious convergence of Fibonacci ratios hints at deeper structure within matrix powers. Just as the ratio sequence $x_{n+1}/x_n$ approaches the golden mean $\varphi$, general matrix iterations often display similar convergence  --  their behavior dominated by a single eigenvalue that drives long-term evolution. This reduction of complex iteration to simple scaling reflects a fundamental principle: repeated linear transformation distills a matrix to its essential character.

The key to understanding such convergence lies in the representation of matrix powers from Lemma \ref{lem:powers}. For a diagonalizable matrix $A=V\Lambda V^{-1}$, powers take the form
\[
    A^n = V\Lambda^n V^{-1}
\]
where $\Lambda^n$ simply raises each eigenvalue to the $n$th power. When these eigenvalues differ in magnitude, larger ones grow faster than smaller ones, eventually dominating the iteration. This observation leads to crucial definitions:

\begin{definition}[Spectral Radius and Dominance]
\label{def:specrad}
The \style{spectral radius} $\rho_A$ of a matrix $A$ is the maximum magnitude of its eigenvalues:
\[
    \rho_A = \max\{|\lambda| : \lambda \text{ is an eigenvalue of }A\}
\]
An eigenvalue $\lambda_*$ is \style{dominant} if $|\lambda_*|=\rho_A$ and no other eigenvalue has this magnitude. Its corresponding eigenvector $\vect{v}_*$ is called the \style{dominant eigenvector}.
\end{definition}

\begin{lemma}[Dominant Convergence]
\label{lem:domconv}
Let $A$ be diagonalizable with dominant eigenvalue $\lambda_*$ and corresponding eigenvector $\vect{v}_*$. Then for any initial vector $\vect{x}_0$ with nonzero component $C$ along $\vect{v}_*$, 
%
\begin{marginfigure}
{\em Nota bene:} The notion of eigenvalue dominance holds sway in ODEs and continuous-time dynamics as well; however, it is not the spectral radius that matters for continuous-time dominance -- it is the real parts of eigenvalues that count. Why? The differentiation operator $D$ is the natural logarithm of the shift operator $E$. 
\end{marginfigure}
%
\[
    A^n\vect{x}_0 \simeq C\lambda_*^n\vect{v}_*
    \,{\rm as }\, n\to\infty 
\]
\end{lemma}
%
\begin{proof}
Index the eigenvalues $\{\lambda_i\}$ such that $\lambda_*=\lambda_1$. Writing $\vect{x}_0$ in the eigenbasis $\{\vect{v}_i\}$ of $A$ and applying $A^n$ yields
\begin{align}
    A^n\vect{x}_0 
    & = A^n\sum_{k=1}^n c_k\vect{v}_k 
    = \sum_{k=1}^n c_k\lambda_k^n\vect{v}_k 
    \\
    & = c_1\lambda_*^n\left(\vect{v}_* + \sum_{k=2}^n \frac{c_k}{c_1}\left(\frac{\lambda_k}{\lambda_*}\right)^n\vect{v}_k\right)
    \simeq c_1\lambda_*^n\vect{v}_*
\end{align}
Since $|\lambda_k/\lambda_1|<1$ for all $k>1$, these ratios decay exponentially to zero as $n\to\infty$. The constant $c_1=C$. 
\end{proof}
%
This convergence principle underlies the \style{power method} for computing dominant eigenvalues. Starting with a random vector $\vect{x}_0$, repeated multiplication by $A$ followed by normalization yields convergence to the dominant eigendirection. The rate of convergence is governed by the ratio of the second largest eigenvalue magnitude to the dominant one.

\begin{example}[Fibonacci redux]
For the Fibonacci matrix of the previous section,
\[
    A = \begin{bmatrix}0 & 1\\1 & 1\end{bmatrix}
\]
the eigenvalues are $\varphi=(1+\sqrt{5})/2$ and $\psi=(1-\sqrt{5})/2$, with $|\varphi|>1>|\psi|$. As $n\to\infty$, the powers of the dominant eigenvalue $\varphi^n$ grow exponentially faster than $|\psi|^n$, explaining why consecutive Fibonacci numbers approach ratio $\varphi$.
\end{example}
%
\begin{example}[Economic Convergence]
Returning to Example \ref{ex:Leontief}, the six-sector economy's input-output matrix $A$ has dominant eigenvalue $\lambda_*=\rho_A\approx 1.09$ with corresponding dominant eigenvector 
\[
\vect{v}_* \approx (0.22, 0.62, 0.34, 0.25, 0.48, 0.39)^T
\]
normalized to unit length. This explains both the $9\%$ growth rate and the fixed proportions observed empirically  --  the economy's structure forces convergence to these ratios regardless of initial conditions.
\end{example}

When $A$ is not diagonalizable, Jordan blocks complicate but do not fundamentally alter this picture. Powers of a Jordan block 
\[
    J_k = \lambda I + N
\]
involve binomial terms:
\[
    J_k^n = \lambda^n
    \left(I + \frac{n}{\lambda}N + \frac{n(n-1)}{2\lambda^2}N^2 + \cdots\right)
\]
The polynomial factors in $n$ modify but cannot overcome the exponential dominance of the largest $|\lambda|$. Thus spectral radius still controls asymptotic behavior, though convergence may display subtle oscillations from the Jordan structure.

This reduction of matrix iteration to dominant modes exemplifies a broader principle: complex linear systems often simplify dramatically under repeated action. Whether modeling economic growth, population dynamics, or network influence, the long-term behavior typically reflects just one or two characteristic directions. The art lies in recognizing when such reduction occurs and exploiting the resulting simplification.

% ==============================================
\section{Positivity \& Perron-Frobenius Theory}
\label{sec:positive}
% ==============================================

Certain natural quantities like mass, energy, and population are bound below by zero; probabilities and measures alike stubbornly refuse to venture into negative territory. When linear transformations act on such intrinsically positive quantities, their matrices inherit special structure that profoundly shapes their spectral properties. This structure  --  the intrinsic positivity of real-world measurements  --  leads to remarkable conclusions about eigenvalues and asymptotic behavior.

Consider a matrix $A=[a_{ij}]$ whose entries are all positive: $a_{ij}>0$ for all $i,j$. Such matrices arise naturally in modeling coupled growth processes, where each component positively influences all others. The Perron-Frobenius theory reveals that such matrices possess uniquely simple spectral properties:

\begin{theorem}[Perron-Frobenius]
\label{thm:P-F}
Let $A$ be a square matrix with all entries strictly positive. Then $A$ has a dominant eigenvalue $\lambda_*=\rho_A>0$, with corresponding dominant eigenvector $\vect{v}_*>0$ having all positive components. Moreover, any nonnegative eigenvector of $A$ must lie in the dominant eigenspace.
\end{theorem}
%
\begin{marginfigure}
{\em Historical Note:} O. Perron first proved these results for positive matrices in 1907. G. Frobenius extended them to the more general class of nonnegative irreducible matrices in 1912, capturing broader applications in economics and probability theory.
\end{marginfigure}

The proof of this remarkable theorem illuminates how positivity shapes spectral structure. Consider iteration of $A$ on any positive vector $\vect{x}_0$. The sequence of normalized vectors $\vect{y}_n = A^n\vect{x}_0/\|A^n\vect{x}_0\|$ remains positive, and a key inequality emerges:
\[
    \min_i\frac{(A\vect{y})_i}{(\vect{y})_i} 
    \, \leq \, \rho_A \, \leq \, \max_i\frac{(A\vect{y})_i}{(\vect{y})_i}
\]
holding for any positive vector $\vect{y}$. These bounds, forced by positivity, trap the spectral radius. A delicate argument shows the bounds converge as $n\to\infty$, yielding both the dominant eigenvalue and its positive eigenvector. The impossibility of maintaining positivity through oscillation or decay then forces all other eigenvalues to have strictly smaller magnitude.
%
\begin{marginfigure}
    There is a more elegant, incisive proof using algebraic topology, of which the Author is particularly fond. 
\end{marginfigure}
%
\begin{example}[Research Citation Network]
Consider the influence network among six major research areas in computer science, where entry $b_{ij}$ of a matrix $B$ represents the relative citation flow: of all citations \emph{received by} papers in field $j$ during a given year, what fraction came \emph{from} papers in field $i$. The field-level averaging reveals citation patterns and a Perron-Frobenius dominant eigenvector as follows:
\[
    B = \begin{bmatrix}
    0.60 & 0.25 & 0.15 & 0.10 & 0.05 & 0.05 \\
    0.20 & 0.50 & 0.15 & 0.10 & 0.05 & 0.05 \\
    0.10 & 0.10 & 0.50 & 0.15 & 0.05 & 0.05 \\
    0.05 & 0.05 & 0.10 & 0.50 & 0.10 & 0.05 \\
    0.03 & 0.05 & 0.05 & 0.10 & 0.65 & 0.10 \\
    0.02 & 0.05 & 0.05 & 0.05 & 0.10 & 0.70
    \end{bmatrix}
    \quad : \quad
    \vect{v}_* \approx 
    \begin{pmatrix}
        0.220 \\ 0.250 \\ 0.180 \\ 0.148 \\ 0.122 \\ 0.081
    \end{pmatrix}
\]
\begin{marginfigure}
\textit{Nota bene:} These numbers are artificial, but the principle of using matrices to track citations is legit, and genuinely useful in many other contexts.
\end{marginfigure}

These correspond to: (1) Machine Learning, (2) Artificial Intelligence, (3) Computer Vision, (4) Robotics, (5) Systems \& Networks, and (6) Theory. For instance, when examining all citations received by AI papers (column 2), 25\% ($b_{12}=0.25$) come from ML papers, 50\% ($b_{22}=0.50$) come from other AI papers, and smaller fractions come from other fields. The Perron-Frobenius eigenvector $\vect{v}_*$ (the eigenvector corresponding to $\lambda_*=1$) provides a natural ranking of field influence or the stationary distribution of citations, with AI and ML dominating but maintaining significant coupling to application areas. The dominant eigenvalue is $\lambda_*=1$; the next largest eigenvalue magnitude is $|\lambda_2|\approx 0.647$. This reveals the convergence rate to the stationary distribution $\vect{v}_*$; the influence of other eigenmodes decays by a factor of $\approx 0.647$ with each citation cycle. Convergence to within a few percent of the final distribution occurs relatively quickly, roughly within $N$ cycles where $0.647^N$ is small (e.g., $N \approx 5$ cycles for decay to 10\%).
\end{example}


The power of these results extends beyond strictly positive matrices. A matrix $A$ is called \style{nonnegative} if all its entries are $\geq 0$, and \style{irreducible} if no coordinate permutation can transform it into block triangular form. These properties capture the connectivity of the underlying interaction network: irreducibility means every component eventually influences every other component, either directly or through intermediaries.

\begin{theorem}[Frobenius Extension]
\label{thm:frobenius}
The conclusions of the Perron-Frobenius theorem hold for nonnegative irreducible matrices, with the spectral radius $\rho_A>0$ still positive but possibly having complex conjugate eigenvalues of magnitude $\rho_A$.
\end{theorem}

\begin{example}[Catalytic Cycle]
\label{ex:catalysis}
Consider a chemical reaction network where substrates $S_1, S_2, S_3, S_4$ convert cyclically with transition matrix
\[
    \begin{tikzcd}[arrows={Rightarrow}]
    S_1 \arrow[r, "k_1"] & S_2 \arrow[d, "k_2"] \\
    S_4 \arrow[u, "k_4"] & S_3 \arrow[l, "k_3"]
    \end{tikzcd}
    \quad : \quad
    A = \begin{bmatrix}
    0 & 0 & 0 & k_4 \\
    k_1 & 0 & 0 & 0 \\
    0 & k_2 & 0 & 0 \\
    0 & 0 & k_3 & 0
    \end{bmatrix}
\]
with each reaction catalyzed by an enzyme. 
The characteristic polynomial of $A$ is 
$\lambda^4-k_1k_2k_3k_4=0$ and the
eigenvalues lie equally-spaced on the circle in the complex plane of radius $\rho_A = (k_1k_2k_3k_4)^{1/4}$. The cyclic nature of the reaction is reflected in these two complex pairs of eigenvalues, while irreducibility ensures the existence of a positive steady-state distribution among the substrates. 
\end{example}
%
\begin{marginfigure}
    Biochemists call such networks \style{futile cycles} when they appear to serve no purpose beyond energy consumption, though they often play crucial regulatory roles in cellular metabolism.
\end{marginfigure}

The connection between positivity and spectral properties exemplifies a broader principle: constraints on matrix entries often force constraints on eigenstructure. Understanding these relationships  --  how matrix structure shapes spectral behavior  --  provides both theoretical insight and practical tools for analyzing linear systems. The Perron-Frobenius theory stands as perhaps the most elegant example of this principle, where simple positivity yields profound conclusions about asymptotic behavior.


% ==============================================
\section{Stochastic Matrices \& Markov Chains}
\label{sec:stochastic}
% ==============================================

Many real-world processes involve transitions between states where probabilities govern the changes. Examples include weather patterns shifting between sunny and rainy conditions, animal populations moving between territories, and genetic traits passing between generations. Such processes, where future states depend only on the present and not on past history, lead naturally to the theory of Markov chains  --  a framework unifying discrete random processes through linear algebra.

\begin{definition}[Markov Chain]
\label{def:markov}
A \style{Markov chain} is a sequence of random variables $\{X_n\}_{n\geq 0}$ taking values in a set of states $S$, satisfying the \style{Markov property}:
\[
    \mathbb{P}(X_{n+1}=j|X_n=i,X_{n-1}=i_{n-1},\ldots,X_0=i_0) 
    = 
    \mathbb{P}(X_{n+1}=j|X_n=i)
\]
\begin{marginfigure}
    {\em Relax...} If you have not seen conditional probability, you can proceed without worrying too much about it. When you get the chance, read up on conditionals. For now, just work with $P$ as a matrix of probabilities.
\end{marginfigure}
The probability $p_{ij}$ of transitioning from state $i$ to state $j$ in one step defines an entry in the chain's \style{transition matrix} $P=[p_{ij}]$.
\end{definition}

The Markov property  --  that future depends on present but not past  --  transforms temporal evolution into matrix iteration. To see this connection, we must first clarify what our vectors represent:

\begin{definition}[Probability Distribution]
\label{def:probdist}
A vector $\vect{x}=(x_1,\ldots,x_n)^T$ is a \style{probability distribution} if:
\begin{enumerate}
    \item Nonnegativity: $x_i \geq 0$ for all $i$
    \item Total probability: $\sum_{i=1}^n x_i = 1$
\end{enumerate}
The entry $x_i$ represents the probability of being in state $i$.
\end{definition}

The transition matrices of Markov chains must preserve these probability constraints, leading to our central object of study:

\begin{definition}[Stochastic Matrix]
\label{def:stochastic}
A square matrix $P=[p_{ij}]$ is \style{stochastic} if it satisfies:
\begin{enumerate}
    \item Nonnegativity: $p_{ij} \geq 0$ for all $i,j$
    \item Column-stochasticity: $\sum_{i=1}^n p_{ij} = 1$ for all $j$
\end{enumerate}
\begin{marginfigure}
    {\em Caveat:} many authors use row-stochastic matrices, where the rows are probability distributions. If the term \style{stochastic} is used, always double-check which type is meant.
\end{marginfigure}
The entry $p_{ij}$ represents the probability of transitioning from state $j$ to state $i$ in one step.
\end{definition}

The evolution of probabilities in a Markov chain follows our familiar iteration pattern:
\[
    \vect{x}(k+1) = P\vect{x}(k)
\]
where $\vect{x}(k)$ represents the probability distribution across states at step $k$. The stochastic constraints on $P$ ensure that if $\vect{x}(k)$ is a probability distribution, then $\vect{x}(k+1)$ will be as well.

\begin{example}[Weather Patterns]
Consider a simple model of daily weather transitions among three states: Sunny (S), Cloudy (C), and Rainy (R). Historical data suggests transition probabilities:
\[
    P = \begin{bmatrix}
    0.7 & 0.3 & 0.2 \\
    0.2 & 0.4 & 0.3 \\
    0.1 & 0.3 & 0.5
    \end{bmatrix}
\]
Reading down columns: from a sunny day (first column), the weather transitions to sunny with probability 0.7, cloudy with 0.2, and rainy with 0.1; similarly for transitions from cloudy or rainy states. 

Given initial distribution $\vect{x}(0)=(1,0,0)^T$ representing certainty of sun today, tomorrow's distribution becomes:
\[
    \vect{x}(1) = P\vect{x}(0) = (0.7,0.2,0.1)^T
\]
showing how probability disperses across states. After two days:
\[
    \vect{x}(2) = P^2\vect{x}(0) = (0.55,0.27,0.18)^T
\]
suggesting convergence toward some equilibrium distribution.
\end{example}

The spectral properties of stochastic matrices determine their long-term behavior.
A Markov chain (and its transition matrix $P$) is \style{irreducible} if it is possible to reach any state from any other state (not necessarily in one step). It is \style{aperiodic} if the greatest common divisor of the lengths of all possible return paths to any state is 1. An irreducible and aperiodic Markov chain is called \style{ergodic}.
Every column-stochastic matrix has a particularly important eigenvalue:

\begin{lemma}[Stochastic Spectral Radius]
\label{lem:stochspec}
For any stochastic matrix $P$:
\begin{enumerate}
    \item $1$ is an eigenvalue of $P$.
    \item The spectral radius $\rho_P = 1$.
    \item All other eigenvalues satisfy $|\lambda| \leq 1$.
    \item If $P$ is irreducible, the eigenvalue $1$ is simple (algebraic multiplicity one). If $P$ is ergodic (irreducible and aperiodic), then $1$ is the unique eigenvalue of $P$ with magnitude $1$. If $P$ is irreducible but periodic with period $h>1$, there are $h$ distinct eigenvalues on the unit circle.
\end{enumerate}
\end{lemma}

\begin{proof}
First, observe that $\vect{1} = (1, 1, \ldots, 1)^T$ is an eigenvector of $P^T$ with eigenvalue 1, since each row of $P^T$ (column of $P$) sums to 1:
\[
    P^T\vect{1} = \vect{1}
\]
Since the eigenvalues of $P$ and $P^T$ are identical, this proves that 1 is an eigenvalue of $P$ as well.

For the spectral radius bound, consider any eigenvalue $\lambda$ of $P$ with eigenvector $\vect{v}$. Let $i$ be the index where $|v_i|$ is maximal. Then:
\[
    \lambda v_i = \sum_{j=1}^n p_{ij}v_j
\]
Taking absolute values:
\[
    |\lambda||v_i| = \left|\sum_{j=1}^n p_{ij}v_j\right| \leq \sum_{j=1}^n p_{ij}|v_j| \leq |v_i|\sum_{j=1}^n p_{ij} = |v_i|
\]
Since $|v_i| > 0$, we have $|\lambda| \leq 1$, establishing that $\rho_P = 1$.
The properties of eigenvalue 1 stated in point 4 are standard results from Perron-Frobenius theory adapted for stochastic matrices.
\end{proof}

A distribution $\vect{\pi}$ satisfying $P\vect{\pi}=\vect{\pi}$ is called \style{stationary}  --  it remains unchanged under the transition rules. For irreducible chains, such a distribution is unique. For ergodic chains, iteration from any initial distribution converges to this stationary state:

\begin{theorem}[Markov Convergence]
\label{thm:markovconv}
Let $P$ be an ergodic stochastic matrix (i.e., irreducible and aperiodic). Then:
\begin{enumerate}
    \item There exists a unique probability vector $\vect{\pi}$ such that $P\vect{\pi}=\vect{\pi}$. This $\vect{\pi}$ is called the \style{stationary distribution}.
    \item For any initial probability vector $\vect{x}(0)$:
    \[
        \lim_{k\to\infty} P^k\vect{x}(0) = \vect{\pi}
    \]
    \item The rate of convergence is governed by the magnitude of the second-largest eigenvalue (i.e., the largest $|\lambda|$ such that $|\lambda|<1$).
\end{enumerate}
\begin{marginfigure}
    If $P$ is irreducible but periodic, $P^k\vect{x}(0)$ does not converge to a single vector but may exhibit periodic limits or converge in Cesaro mean to $\vect{\pi}$.
\end{marginfigure}
\end{theorem}

\begin{example}[Weather Equilibrium]
Returning to our weather model, the matrix $P$ is irreducible (all entries are positive) and aperiodic (e.g., $p_{11}>0$). Solving $P\vect{\pi}=\vect{\pi}$ subject to $\sum \pi_i = 1$ yields stationary distribution $\vect{\pi}\approx(0.455,0.295,0.250)^T$. Thus in the long run, regardless of initial conditions, we expect about 45.5\% sunny days, 29.5\% cloudy, and 25\% rainy. The eigenvalues of $P$ are $1, \approx 0.422, \approx 0.178$. The second-largest eigenvalue magnitude $\approx 0.422$ indicates rapid convergence to this equilibrium.
\end{example}

The convergence guaranteed by Theorem \ref{thm:markovconv} explains numerous natural phenomena. Population genetics models using stochastic matrices predict allele frequencies in large populations. Economic models of brand loyalty forecast market share evolution. Web page rankings emerge from random walk models of user browsing patterns. In each case, the mathematics of probability conservation shapes long-term behavior through eigenstructure.

\begin{example}[Population Genetics]
Consider a population with two alleles $A$ and $a$ at a single genetic locus. The frequency $p$ of allele $A$ in the parental generation determines offspring genotype probabilities $(AA,Aa,aa)$ under random mating through transition matrix:
\[
    P = \begin{bmatrix}
    p^2 & p^2 & p^2 \\
    2p(1-p) & 2p(1-p) & 2p(1-p) \\
    (1-p)^2 & (1-p)^2 & (1-p)^2
    \end{bmatrix}
\]
where $Aa$ and $aA$ represent the same heterozygous genotype. This matrix has identical columns -- each offspring's genotype depends only on the overall allele frequencies, not its parents' specific genotypes. The unique stationary distribution gives Hardy-Weinberg equilibrium proportions $(p^2,2p(1-p),(1-p)^2)$, achieved in just one generation due to the rank-1 structure of $P$.
\end{example}

Special structures within stochastic matrices reveal additional features. A matrix $P$ is \style{doubly stochastic} if both its columns and rows sum to one -- probability is conserved in both forward and backward time. Such matrices arise naturally in physical systems where transitions conserve some underlying quantity. Their stationary distribution must be uniform: $\pi_i=1/n$ for all $i$.

\begin{example}[Random Walk on a Graph]
Consider a particle moving randomly on an undirected graph with $n$ vertices, where at each step it moves with equal probability to any adjacent vertex. The transition probabilities form a stochastic matrix $P=[p_{ij}]$ where
\[
    p_{ij} = \begin{cases}
    1/\deg(j) & \text{if vertices $i$ and $j$ are adjacent} \\
    0 & \text{otherwise}
    \end{cases}
\]
Here $\deg(j)$ denotes the degree (number of neighbors) of vertex $j$. This choice makes $P$ column-stochastic, though not necessarily symmetric. However, the relationship $\deg(j)p_{ij} = \deg(i)p_{ji}$ holds due to the undirected nature of the graph.

For instance, consider a graph with six vertices as shown. The transition matrix is:
\[
    P = \begin{bmatrix}
    0 & 1/3 & 1 & 1/2 & 1/3 & 0 \\
    1/4 & 0 & 0 & 1/2 & 1/3 & 0 \\
    1/4 & 0 & 0 & 0 & 0 & 0 \\
    1/4 & 1/3 & 0 & 0 & 0 & 0 \\
    1/4 & 1/3 & 0 & 0 & 0 & 1 \\
    0 & 0 & 0 & 0 & 1/3 & 0
    \end{bmatrix}
\]
\begin{marginfigure}
\centering
\includegraphics[width=1.0in]{random-walk.png}
\end{marginfigure}
If this graph is connected and not bipartite (which implies aperiodicity for this type of random walk), the chain is ergodic. The stationary distribution has components proportional to vertex degrees: $\pi_1 = 4/14$, $\pi_2 = 3/14$, $\pi_3 = 1/14$, $\pi_4 = 2/14$, $\pi_5 = 3/14$, $\pi_6 = 1/14$. This non-uniform distribution reflects how the random walk spends more time at higher-degree vertices, a principle that underlies many network centrality measures. In particular, observe how vertex 1, with its high degree of 4, captures the largest share of the stationary probability.
\end{example}

The theory extends naturally to infinite state spaces, though technical care is needed. Random walks on infinite graphs lead to rich theories of recurrence and transience, while scaling limits yield Brownian motion and stochastic differential equations. Throughout, the interplay between probability conservation and matrix structure guides our understanding of how randomness evolves into regularity through linear transformation.


% ==============================================
\section{Symmetric Matrices \& Spectra}
\label{sec:symmetric}
% ==============================================

Mathematics often reveals itself through the relationship between a structure's outer form and inner essence. Like a genome encoding an organism's shape and function, certain morphological features of matrices -- outward patterns in their array of numbers -- indicate hidden spectral properties that determine this fundamental behavior. The simple structural condition of symmetry ($A^T=A$) yields spectral consequences of profound significance.

Consider, for example, the matrices
\[
    \begin{bmatrix}
    4 & 1 & 2 \\
    1 & 3 & -1 \\
    2 & -1 & 5
    \end{bmatrix}
    \quad\text{or}\quad
    \begin{bmatrix}
    3 & -2 & 0 \\
    -2 & 5 & -1 \\
    0 & -1 & 4
    \end{bmatrix}
\]
Though their entries differ markedly, they share the crucial feature of symmetry about their diagonals. This visible structure -- like the bilateral symmetry of a butterfly's wings -- reveals deeper patterns: both matrices share spectral properties that emerge not from the specific numbers but from the symmetric pattern itself.

\begin{theorem}[Spectral Theorem]
\label{thm:spectral}
Let $A$ be a real symmetric matrix. Then:
\begin{enumerate}
    \item All eigenvalues of $A$ are real
    \item Eigenvectors corresponding to distinct eigenvalues are orthogonal
    \item $A$ has an orthonormal basis of eigenvectors
\end{enumerate}
Thus $A$ can be orthogonally diagonalized: $A=Q\Lambda Q^T$ where $Q$ is orthogonal and $\Lambda$ is diagonal with real entries.
\end{theorem}

This remarkable result -- that symmetry forces reality of eigenvalues and orthogonality of eigenvectors -- transforms abstract matrices into concrete geometric objects. Each symmetric matrix acts by stretching or compressing space along perpendicular axes determined by its eigenvectors. The eigenvalues measure the scale of this deformation, providing a coordinate-independent description of the transformation's action.

The quadratic form associated with symmetric $A$ reveals another face of this structure:
\[
    q(\vect{x}) = \vect{x}^TA\vect{x}
\]
This scalar function measures a kind of generalized ``energy'' in direction $\vect{x}$. Though more abstract than physical energy, this measure guides our understanding of how symmetric matrices shape the spaces they act upon. The following examples illuminate this connection between structure and spectrum across diverse applications.

\begin{example}[Correlation Structure]
\label{ex:correlation}
Given $n$ measurements of $d$ variables, their correlation matrix $\CORR=[\corr_{ij}]$ records standardized relationships between pairs of variables:
\[
    \corr_{ij} = \frac{\sum_{k=1}^n (x_{ki}-\overline{x}_i)(x_{kj}-\overline{x}_j)}{\sqrt{\sum_{k=1}^n(x_{ki}-\overline{x}_i)^2\sum_{k=1}^n(x_{kj}-\overline{x}_j)^2}}
\]
This matrix is naturally symmetric ($\corr_{ij}=\corr_{ji}$) with ones on the diagonal ($\corr_{ii}=1$). Its spectral structure reveals fundamental patterns in the data:
\begin{itemize}
    \item Eigenvalues measure strength of correlation patterns
    \item Eigenvectors identify groups of correlated variables
    \item Small eigenvalues indicate redundancy or dependencies
\end{itemize}
For instance, in financial data, eigenvectors often separate market sectors while eigenvalues measure sector-wide versus company-specific variations. In gene expression data, eigenvectors might identify co-regulated genes while eigenvalues quantify the strength of regulatory patterns.
\end{example}

\begin{example}[Solid Body Mechanics]
\label{ex:shape}
The mass of a complex three-dimensional object can be characterized through its inertia matrix (a mass-weighted) covariance matrix:
\begin{marginfigure}
    In mechanics, this is often called the {\em inertia tensor}. The intuitive fact that asymmetric massive bodies have three ``natural'' axes of rotation that are all orthogonal is a result of the spectral theorem. 
\end{marginfigure}
\[
    \inertia = \begin{bmatrix}
    I_{xx} & I_{xy} & I_{xz} \\
    I_{xy} & I_{yy} & I_{yz} \\
    I_{xz} & I_{yz} & I_{zz}
    \end{bmatrix}
\]
where entries measure second moments of the object's mass or point distribution. The eigenstructure of $\inertia$ provides a coordinate-independent description:
\begin{itemize}
    \item Eigenvectors give principal axes of the shape
    \item Eigenvalues measure spatial extent along these axes
    \item Ratios of eigenvalues quantify deviation from spherical symmetry
\end{itemize}
This spectral fingerprint enables shape matching and classification without requiring explicit alignment of objects. A drinking glass has one large eigenvalue (along its length) and two smaller equal ones (across its circular cross-section); a book has three distinct eigenvalues reflecting its rectangular proportions.
\end{example}

\begin{marginfigure}
{\em Think:} Given only pairwise distances between points, can we reconstruct their relative positions? This inverse problem is akin to trying to deduce the shape of a molecule from measurements between its atoms, or inferring a social network's structure from similarities between individuals.
\end{marginfigure}
\begin{example}[Distance Matrices]
\label{ex:distance}
Consider a collection of $n$ abstract points with only their pairwise distances known. The squared distances form a symmetric matrix $D=[d_{ij}]$ where $d_{ij}$ represents the squared distance between points $i$ and $j$. Though we cannot visualize these points directly, their geometric structure hides within $D$.


The centering matrix $H=I-\frac{1}{n}[1]$ (where $[1]$ denotes the matrix of all ones) and derived matrix $B=-\frac{1}{2}HDH$ play crucial roles. While $D$ itself may not be positive definite, $B$ is symmetric positive semidefinite and encodes the point cloud's geometric essence:
\begin{itemize}
    \item Positive eigenvalues reveal embedding dimensions
    \item Eigenvectors reconstruct relative positions
    \item Spectral decay indicates intrinsic dimensionality
\end{itemize}
This process of extracting coordinates from distances, known as \style{classical multidimensional scaling}, demonstrates how eigenstructure can reveal hidden geometric relationships in abstract data.
\end{example}

The extremal properties of quadratic forms associated with symmetric matrices provide another perspective on their structure:

\begin{lemma}[Extreme Values]
\label{lem:extreme}
For symmetric $A$, the quadratic form $q(\vect{x})=\vect{x}^TA\vect{x}$ achieves:
\begin{enumerate}
    \item Maximum value $\lambda_{\max}$ in direction of largest eigenvalue
    \item Minimum value $\lambda_{\min}$ in direction of smallest eigenvalue
    \item Intermediate values between $\lambda_{\min}$ and $\lambda_{\max}$ elsewhere
\end{enumerate}
on the unit sphere $\|\vect{x}\|=1$.
\end{lemma}

This optimization principle explains why many iterative processes converge toward eigenvectors. The power method for computing dominant eigenvalues, for instance, follows naturally: repeated multiplication by $A$ amplifies components along the direction of largest eigenvalue. 
\begin{marginfigure}
    More sophisticated algorithms like Lanczos iteration exploit the connection between symmetry and orthogonality to compute multiple eigenvalues efficiently.
\end{marginfigure}
%
\begin{example}[Portfolio Optimization]
\label{ex:portfolio}
Consider selecting investment weights $\vect{w}=(w_1,\ldots,w_n)^T$ for $n$ assets, where $w_i$ represents the fraction of wealth invested in asset $i$. These weights must satisfy two fundamental constraints: they must sum to one (full investment of available funds) and typically must be non-negative (no short selling): 
\[
    \sum_{i=1}^n w_i = 1 \quad\text{and}\quad w_i \geq 0
\]
The risk of this investment portfolio depends on how the assets' returns move together, captured by their \style{covariance matrix} $\COV=[C_{ij}]$. Each entry $C_{ij}$ measures how returns of assets $i$ and $j$ vary together  --  positive values indicate they tend to rise and fall together, while negative values suggest opposite movements.

This covariance matrix is naturally symmetric ($C_{ij}=C_{ji}$), connecting our financial problem to the theory of symmetric matrices. The total portfolio risk, measured by its variance, takes the form of a quadratic expression $\vect{w}^T\COV\vect{w}$. In the simplest formulation, our optimization problem becomes:
\[
    \begin{array}{rl}
    \text{minimize} & \vect{w}^T\COV\vect{w} \\
    \text{subject to} & \vect{w}^T\vect{1} = 1
    \end{array}
\]
where $\vect{1}$ denotes the vector of all ones. Despite its simple appearance, this constrained minimization reveals surprising depth when analyzed through the Lagrangian:
\[
    \mathcal{L}(\vect{w},\lambda) = \vect{w}^T\COV\vect{w} - \lambda(\vect{w}^T\vect{1}-1)
\]

\begin{marginfigure}
{\em Recall:} For minimizing $F(\vect{x})$ subject to $G(\vect{x})=0$, the Lagrangian $\mathcal{L}(\vect{x},\lambda) = F(\vect{x}) - \lambda G(\vect{x})$ yields equations $\nabla_{\vect{x}}\mathcal{L}=\vect{0}$ and $G(\vect{x})=0$. Here $F(\vect{w})=\vect{w}^T\COV\vect{w}$ and $G(\vect{w})=\vect{w}^T\vect{1}-1$.
\end{marginfigure}

Setting $\nabla_{\vect{w}}\mathcal{L}=\vect{0}$ yields:
\[
    2\COV\vect{w} - \lambda\vect{1} = \vect{0}
\]
Together with the constraint equation, this system determines the optimal weights:
\[
    \begin{array}{rcl}
    \vect{w} &=& \frac{1}{2}\COV^{-1}\vect{1}\lambda \\
    1 &=& \vect{w}^T\vect{1} = \frac{1}{2}\lambda\vect{1}^T\COV^{-1}\vect{1}
    \end{array}
\]
The solution reveals a fundamental relationship:
\[
    \vect{w} = \frac{\COV^{-1}\vect{1}}{\vect{1}^T\COV^{-1}\vect{1}}
\]
%
\begin{marginfigure}
{\em Think:} The constraint $\vect{w}^T\vect{1}=1$ defines a hyperplane in $\R^n$. The solution occurs where level sets of the quadratic form become tangent to this hyperplane  --  a geometric picture explaining why constrained optimization often yields such elegant solutions.
\end{marginfigure}
%
This transformation  --  from minimizing risk under investment constraints to solving a linear system  --  exemplifies a deeper pattern. Many constrained optimization problems involving symmetric matrices reduce to such systems through the machinery of Lagrange multipliers. The eigenstructure of $\COV$ shapes both the optimal portfolio weights and the minimum achievable risk. In practice, this analysis extends naturally to include expected returns through the Markowitz mean-variance framework, where risk minimization balances against return maximization through careful exploitation of covariance structure.
\end{example}
This intimate relationship between structural symmetry and spectral properties exemplifies a deeper pattern in mathematics: morphological constraints often encode spectral essence. Just as an organism's form reflects its genome, a matrix's visible patterns encode hidden properties that determine its fundamental behavior. This theme will deepen as we explore structured matrices in network science and develop the singular value decomposition in Chapter \ref{ch:10}.

% ==============================================
\section{Networked Behavior \& Consensus}
\label{sec:networks}
% ==============================================

The flow of influence through networks shapes everything from opinion formation to economic behavior to artificial intelligence. When agents in a network update their states based on their neighbors' values, complex global patterns can emerge from simple local rules. Understanding such collective behavior requires uniting the spectral theory of Sections \ref{sec:positive}--\ref{sec:symmetric} with the concrete topology of interaction networks.

\begin{definition}[Graph Laplacian]
\label{def:laplacian}
For an undirected graph with $n$ vertices, the \style{graph Laplacian} $L=[L_{ij}]$ is an $n\times n$ matrix whose entries are:
\[
    L_{ij} = \begin{cases}
        d_i & \text{if }i=j \\
        -1 & \text{if vertices }i\text{ and }j\text{ are connected} \\
        0 & \text{otherwise}
    \end{cases}
\]
where $d_i$ denotes the \style{degree} of vertex $i$  --  the number of edges connected to it. Equivalently, if $D=\diag(d_1,\ldots,d_n)$ is the \style{degree matrix} and $A=[a_{ij}]$ is the \style{adjacency matrix} with $a_{ij}=1$ for connected vertices and $0$ otherwise, then $L=D-A$.
\end{definition}

This matrix, though simple to define, carries profound information about network topology through its spectrum. Its action on a vector measures how quantities diffuse across edges, making it fundamental to understanding collective dynamics.

Consider a network of $n$ agents, each holding a real value $x_i(t)$ that evolves in discrete time through local averaging:
\[
    x_i(t+1) = \sum_{j\in N(i)} w_{ij}x_j(t)
\]
where $N(i)$ denotes the neighbors of agent $i$ and weights $w_{ij}$ represent interaction strengths. Writing $\vect{x}(t)$ for the vector of states, this local update rule becomes matrix iteration:
\[
    \vect{x}(t+1) = W\vect{x}(t)
\]
The weight matrix $W=[w_{ij}]$ often takes the form $W=D^{-1}A$ (neighbors weighted equally) or $W=I-\epsilon L$ (gradient descent on disagreement), connecting back to our study of stochastic matrices in Section \ref{sec:stochastic}.

\begin{example}[Opinion Dynamics]
Consider a political discourse network where there are five rough cohorts of people on social media, with connections as shown (these are not individuals, but rather interacting groups with aggregate opinions). Groups 1 and 4 hold the most influential positions, each connected to three others. Each group updates their opinion on a political proposal based on a weighted average of their neighbors' views:
\begin{marginfigure}
\centering
\includegraphics[width=1.0in]{group-opinions.png}
\end{marginfigure}
\[
    W = \begin{bmatrix}
    1/4 & 1/2 & 0 & 1/4 & 1/3 \\
    1/4 & 1/2 & 0 & 0 & 0 \\
    0 & 0 & 1/2 & 1/4 & 0 \\
    1/4 & 0 & 0 & 1/4 & 1/3 \\
    1/4 & 0 & 1/2 & 1/4 & 1/3 
    \end{bmatrix}
\]
The column-stochasticity of $W$ ensures each new opinion distribution remains a proper probability distribution. Starting from polarized initial opinions $\vect{x}(0)=(1,-1,-1,1,-1)^T$, iteration reveals steady convergence toward consensus, with the well-connected groups exercising greater influence on the final value through their multiple pathways of influence.
\end{example}

When the network is connected (the matrix $W$ is irreducible as in Section \ref{sec:positive}) and the weights respect certain balance conditions, a remarkable phenomenon occurs: all agents converge to consensus. This convergence follows directly from the Perron-Frobenius theory of Section \ref{sec:positive}, but the network perspective reveals how topology shapes the path to agreement.

\begin{theorem}[Network Consensus]
\label{thm:consensus}
Let $W$ be an $n \times n$ column-stochastic matrix that is also \style{ergodic} (i.e., irreducible and aperiodic). Let $\vect{\pi}$ be the unique stationary probability distribution for $W$ (the eigenvector corresponding to eigenvalue $\lambda_1=1$, normalized such that $\sum_i \pi_i = 1$). Then for any initial state vector $\vect{x}(0) \in \R^n$:
\[
    \lim_{t\to\infty} W^t\vect{x}(0) = \left(\sum_{i=1}^n x_i(0)\right) \vect{\pi}
\]
\begin{marginfigure}
    The term $\sum x_i(0)$ is the sum of the initial states. If $\vect{x}(0)$ is itself a probability distribution, this sum is 1, and the system converges to $\vect{\pi}$. If $W$ is only irreducible and periodic, $W^t\vect{x}(0)$ does not generally converge to a single vector.
\end{marginfigure}
Moreover, the rate of convergence to this consensus state (or its scaled version) is determined by $|\lambda_2|$, where $\lambda_2$ is an eigenvalue of $W$ with the second-largest magnitude (so $|\lambda_2| < 1$). The convergence is geometric, with the error typically decreasing by a factor of approximately $|\lambda_2|$ at each step.
\end{theorem}

This theorem connects network structure to collective behavior through spectral properties. The entries of $\vect{\pi}$ represent the relative influence of each node in the final consensus state, while $|\lambda_2|$ measures how quickly agreement emerges. Highly connected networks with good mixing properties have large spectral gaps and rapid convergence, as we saw with stochastic matrices in Section \ref{sec:stochastic}.

\begin{example}[Asset Price Formation]
Consider a market network where traders adjust their valuation of an asset based on their trading partners' prices. The weight matrix $W$ reflects both trust relationships and trading volume between parties. The consensus price that emerges represents a form of market efficiency, with well-connected traders (high $\pi_i$ values) having greater influence on price discovery. The spectral gap $1-|\lambda_2|$ determines how quickly the market reaches this equilibrium  --  critical for understanding market stability and reaction to shocks.
\end{example}

\begin{example}[Robotic Flocking]
A swarm of robots moving in the plane can achieve coordinated motion through local averaging of velocities. Each robot $i$ has position $\vect{p}_i$ and velocity $\vect{v}_i$, updating according to:
\[
    \vect{v}_i(t+1) = \vect{v}_i(t) + \epsilon\sum_{j\in N_i(t)}(\vect{v}_j(t)-\vect{v}_i(t))
\]
where $N_i(t)$ contains robots within communication range of robot $i$. This can be written as iteration with a time-varying Laplacian:
\[
    \vect{v}(t+1) = (I-\epsilon L(t))\vect{v}(t)
\]
The spectrum of $L(t)$ determines both stability of the flocking behavior and convergence rate to aligned motion. This example shows how the tools developed for static networks extend naturally to dynamic topologies.
\end{example}

Beyond simple averaging, the graph Laplacian enables more sophisticated analysis through its connection to network structure. For a connected graph:
\begin{enumerate}
    \item The zero eigenvalue is simple, with eigenvector $\vect{1}$
    \item The smallest nonzero eigenvalue measures connectivity
    \item The corresponding eigenvector reveals natural network clusters
\end{enumerate}
These properties, emerging from the symmetry studied in Section \ref{sec:symmetric}, provide both analytical tools and design principles for engineering collective behavior.

\begin{example}[Supply Chain Networks]
Returning to the input-output model from Section \ref{sec:firstiteration}, the Laplacian spectrum reveals vulnerability to disruption. Industries corresponding to components of the Fiedler vector (the eigenvector of the smallest nonzero eigenvalue) tend to split first under stress, identifying natural fault lines in the economic network. The magnitude of this eigenvalue quantifies the network's robustness to such splits.
\end{example}

Modern applications of these principles abound in both technology and nature:
\begin{itemize}
    \item Social networks shape opinion formation and information spread
    \item Financial networks transmit shocks and determine systemic risk
    \item Robot swarms coordinate through local interactions
    \item Supply chains balance efficiency against resilience
\end{itemize}
In each case, the interplay between network structure and spectral properties determines system-level behavior.

The tools we have developed  --  from stochastic matrices to symmetric spectra  --  unite in the study of network dynamics. This synthesis reveals how local rules and global topology combine to produce collective intelligence, pointing toward both deeper mathematics and practical applications in the chapters ahead. The final application of this chapter will explore one particularly elegant application: the PageRank algorithm that grew from these principles to organize the early Web.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Spectral Graph Theory \& Vibrations}
\label{EM:spectral}
% **************** EMANATION *******************

The mathematics of musical instruments emerges through the vibrations of strings and membranes. A plucked string settles into standing waves whose frequencies determine musical pitch, while a drum head vibrates in complex patterns creating its characteristic timbre. Though seemingly far from the matrix theory developed in this chapter, these physical systems find their mathematical essence in the eigenvalues of \style{graph Laplacians} --- a connection that transforms discrete network analysis into continuous harmonics through the deep principles of spectral theory.

Consider first a string fixed at its endpoints, discretized into $n$ points connected by identical springs. The displacement $x_i$ of point $i$ from equilibrium feels restoring forces from its neighbors proportional to the displacement differences:
\[
m\frac{d^2x_i}{dt^2} = \kappa(x_{i+1} - x_i) - \kappa(x_i - x_{i-1})
\]
where $m$ is the mass of each point and $\kappa$ the spring constant. Written in matrix form:
\[
m\frac{d^2\vect{x}}{dt^2} = -\kappa L\vect{x}
\]
where $L$ is the \style{graph Laplacian}:
\[
L = \begin{bmatrix}
1 & -1 & 0 & \cdots & 0 \\
-1 & 2 & -1 & \cdots & 0 \\
0 & -1 & 2 & \cdots & 0 \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
0 & 0 & 0 & \cdots & 1
\end{bmatrix}
\]

\begin{marginfigure}
{\em Note:} The graph Laplacian is symmetric positive semidefinite, with eigenvalues directly determining vibrational frequencies.
\end{marginfigure}

The eigenvectors of $L$ represent standing wave patterns --- modes of vibration where all points move sinusoidally with the same frequency but different amplitudes. The corresponding eigenvalues $\lambda_i$ determine these frequencies through $\omega_i = \sqrt{k\lambda_i/m}$. As $n$ increases, these discrete modes converge to the continuous solutions:
\[
v_k(x) = \sin\left(\frac{k\pi x}{L}\right), \quad \omega_k = \frac{k\pi}{L}\sqrt{\frac{T}{\rho}}
\]
where $L$ is string length, $T$ tension, and $\rho$ linear density.

\begin{marginfigure}
{\em Think:} The convergence of discrete to continuous eigenmodes reveals how graph theory naturally extends to continuous systems.
\end{marginfigure}

This connection between graph spectra and physical vibrations extends to higher dimensions. For a drum head approximated by a triangular mesh, focus on the vertices and edges of the mesh and define the graph Laplacian as a square matrix on the vertex set $V$:
\[
L_{ij} = \begin{cases}
\deg(i) & \text{if }i=j \\
-1 & \text{if }i\sim j \\
0 & \text{otherwise}
\end{cases}
\]
where $\deg(i)$ counts the number of vertices adjacent to vertex $i$, and $i\sim j$ denotes adjacent vertices. The eigenvalues again determine vibrational frequencies, while eigenvectors describe mode shapes --- patterns of displacement that maintain their form while oscillating in time.

\begin{marginfigure}
{\em Example:} A circular drum head's modes form the remarkable \style{Bessel functions}, emerging as limits of mesh eigenvectors.
\end{marginfigure}

A remarkable result connects these spectra to pure graph theory: the multiplicity of the Laplacian's zero eigenvalue equals the number of connected components. This reveals an elegant duality --- just as low eigenvalues describe slow vibrations, they also capture fundamental topological properties. The second-smallest eigenvalue measures how well-connected the graph is, with larger values indicating stronger connectivity, much as dominant eigenvalues determined convergence rates in our study of stochastic matrices.

\begin{marginfigure}
{\em Nota bene:} The connection between eigenvalues and graph connectivity parallels how dominant eigenvalues controlled network convergence earlier in this chapter.
\end{marginfigure}

The spectral approach to graph analysis extends naturally to partitioning problems, where we seek to divide vertices into groups with many internal connections but few connections between groups. The eigenvector corresponding to the second-smallest eigenvalue provides a continuous relaxation of this discrete problem: partition vertices according to the sign of their corresponding entry. This spectral clustering method often reveals natural communities in networks.

For weighted graphs, where edges carry different strengths, define the degree matrix $D=\diag(d_1,\ldots,d_n)$ where $d_i=\sum_j w_{ij}$ sums the weights of edges incident to vertex $i$. The normalized Laplacian $\mathcal{L} = D^{-1/2}LD^{-1/2}$ accounts for these varying connection strengths while maintaining symmetry. Its eigenvalues lie in $[0,2]$, providing a normalized measure of graph structure independent of absolute edge weights.

The deep connection between discrete graphs and continuous vibrations reveals a profound unity in mathematical physics. Eigenvalues and eigenvectors encode both physical oscillations and abstract graph properties, while the Laplacian provides the bridge between structure and dynamics. This synthesis --- of the discrete and continuous through spectral theory --- exemplifies how fundamental mathematical principles illuminate seemingly disparate phenomena.

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{PageRank: The Flow of Web Authority}
\label{EM:pagerank}
% **************** EMANATION *******************

The World Wide Web presents perhaps the largest human-constructed network in history --- billions of pages connected through hyperlinks that channel attention and information across the digital sphere. Like the discrete-time systems studied in Section \ref{sec:firstiteration}, this vast network exhibits intrinsic patterns of information flow that can be understood through careful mathematical analysis. The challenge of organizing this space led to one of the most elegant applications of stochastic matrices and iterative convergence: Google's PageRank algorithm.

Consider a web surfer following links from page to page, modeling their behavior as the type of Markov chain developed in Section \ref{sec:stochastic}. At each step, they either follow a randomly chosen outgoing link (with probability $\alpha$) or jump to a random page anywhere on the web (with probability $1-\alpha$). This process generates a \style{transition matrix} $P = [p_{ij}]$:
\[
p_{ij} = \alpha\frac{a_{ij}}{d_j} + \frac{1-\alpha}{n}
\]
where $a_{ij}=1$ if page $j$ links to page $i$ (and 0 otherwise), $d_j$ is the number of outgoing links from page $j$ (if $d_j=0$, this term is often handled by assuming jumps to all pages equally), and $n$ is the total number of pages. The term $(1-\alpha)/n$ represents the random teleportation probability.

\begin{marginfigure}
{\em Historical Note:} The addition of random teleportation (typically $\alpha \approx 0.85 < 1$) ensures that $P$ is a \style{primitive} matrix if $n>0$: all its entries are strictly positive. A primitive stochastic matrix is necessarily \style{ergodic} (irreducible and aperiodic), guaranteeing the convergence properties studied in Section \ref{sec:stochastic}.
\end{marginfigure}

This construction makes $P$ a column-stochastic matrix. Since $0 < \alpha < 1$ and $n>0$, all entries $p_{ij}$ are strictly positive. A strictly positive stochastic matrix is ergodic. Applying the theory developed in Section \ref{sec:stochastic} (specifically Theorem \ref{thm:markovconv}), we know this matrix $P$ must have a unique stationary probability distribution $\vect{\pi}$ satisfying $\vect{\pi} = P\vect{\pi}$. This \style{PageRank vector} $\vect{\pi}$ measures each page's importance through its long-term visit probability.

Just as with the consensus problems studied in Section \ref{sec:networks}, the computation of $\vect{\pi}$ naturally employs iterative methods (the power method). Starting from any initial probability distribution $\vect{x}_0$ (typically uniform), repeated multiplication by $P$ converges to $\vect{\pi}$:
\[
\vect{x}_{k+1} = P\vect{x}_k \quad \text{and} \quad \lim_{k\to\infty} \vect{x}_k = \vect{\pi}
\]
The properties of $P$ ensure each iterate remains a probability distribution and that convergence to the unique limit $\vect{\pi}$ occurs.

\begin{example}[Simple Web]
Consider a tiny web of four pages with link structure given by adjacency matrix $A$.
% (Matrix A as before)
% \[
% A = \begin{bmatrix}
% 0 & 1 & 0 & 0 \\
% 1 & 0 & 1 & 0 \\
% 0 & 1 & 0 & 1 \\
% 0 & 0 & 1 & 0
% \end{bmatrix}
% \]
With damping factor $\alpha = 0.85$, the iteration converges to:
\[
\vect{\pi} \approx \begin{pmatrix}
0.17 \\ 0.31 \\ 0.35 \\ 0.17
\end{pmatrix}
\]
This distribution reflects both local link structure and global network position. Like the network centrality measures from Section \ref{sec:networks}, pages with more incoming paths tend to receive higher scores.
\end{example}

The rate of convergence, as with all ergodic Markov chains, is determined by the magnitude of the second-largest eigenvalue of $P$, denoted $|\lambda_2(P)|$. The eigenvalues of $P$ are related to those of the normalized adjacency matrix part by $\lambda_k(P) = \alpha \lambda_k(M_{norm}) + (1-\alpha)/n$ for the eigenvector corresponding to $\lambda_k(M_{norm})$ if it's orthogonal to the all-ones vector, and $\lambda_1(P)=1$. Thus, $|\lambda_2(P)| \approx \alpha |\lambda_2(M_{norm})|$. For typical web graphs and $\alpha \approx 0.85$, $|\lambda_2(P)|$ is often close to $\alpha$.

\begin{example}[Convergence Behavior]
For our four-page example, tracking successive iterates reveals geometric convergence:
\[
\|\vect{x}_k - \vect{\pi}\| \approx |\lambda_2(P)|^k\|\vect{x}_0 - \vect{\pi}\|
\]
where $|\lambda_2(P)| < 1$. For $\alpha=0.85$, this value is typically close to $0.85$, leading to reasonably fast convergence. This matches the behavior predicted by our Markov chain analysis.
\end{example}

The web graph provides a perfect example of the network structures studied in Section \ref{sec:networks}, where topology shapes dynamic behavior. PageRank's success stems from how it unites two key principles from our chapter: the convergence of ergodic stochastic matrices and the flow of influence through networks. This synthesis --- of Markov chain theory with network dynamics --- transformed web search through pure mathematics.

The framework extends naturally to other contexts where importance flows along network edges, provided the construction ensures an ergodic transition matrix for unique convergence:
\begin{itemize}
    \item Citation networks ranking academic papers
    \item Social networks measuring user influence 
    \item Economic networks revealing systematic importance
\end{itemize}

PageRank exemplifies how the mathematics developed in this chapter shapes the modern world. The stochastic matrices, network dynamics, and iterative methods we have studied combine to organize the chaotic web through rigorous theory. This transformation of abstract mathematics into practical computation demonstrates the profound utility of careful mathematical analysis in contemporary engineering.

% % **************** EMANATION *******************
% \emanation
% % **************** EMANATION *******************
% \section*{Opinion Dynamics \& The Paradox of Connection}
% \label{EM:opinion}
% % **************** EMANATION *******************

% The mathematical study of opinion dynamics began with an elegant and optimistic vision. Early theorists like J.R.P. French (1956) and M.H. DeGroot (1974) proposed that social beliefs evolve through an averaging process, with each person updating their view based on their social contacts' opinions. Starting from the gradient flow on the graph Laplacian:
% \[
% \frac{d\vect{x}}{dt} = -L\vect{x}
% \]
% where $L=D-A$ is the graph Laplacian, we obtain a natural discrete-time model through forward Euler discretization:
% \[
% \vect{x}(t+1) = (I-\epsilon L)\vect{x}(t)
% \]
% Here $\epsilon>0$ represents the time step, chosen small enough that $I-\epsilon L$ remains positive definite. This iteration matrix $W=I-\epsilon L$ inherits key properties from $L$: it is symmetric, has row sums one, and its eigenvalues lie in $[0,1]$.

% The early theoretical results were striking in their optimism. For connected networks with sufficiently small $\epsilon$, repeated iteration leads inevitably to consensus. The convergence rate depends on the second-largest eigenvalue of $W$, which decreases with network connectivity. This mathematics seemed to tell a compelling story about the march toward social consensus through increased communication.

% Reality proved far more complex. Rather than facilitating consensus, increased connectivity through social media appears to have intensified polarization. Several key phenomena require extending the basic model:

% First consider inflexible nodes that maintain fixed opinions while influencing others --- propaganda sources or zealots. The iteration becomes:
% \[
% \vect{x}(t+1) = W\vect{x}(t) + \vect{b}
% \]
% where $\vect{b}$ represents external forcing from inflexible nodes. Rather than converging to consensus, opinions now stabilize at $\vect{x}^* = (I-W)^{-1}\vect{b}$ --- a state potentially far from uniformity.

% More insidiously, agents may selectively misrepresent their opinions to different neighbors. This introduces negative weights:
% \[
% x_i(t+1) = x_i(t) + \epsilon\sum_{j\sim i} w_{ij}(x_j(t) - x_i(t))
% \]
% where $w_{ij}$ can be negative, representing antagonistic relationships. The loss of positive semi-definiteness means eigenvalues may exit the unit disk, leading to oscillation or divergence rather than convergence.

% A fundamentally different modification comes from bounded confidence models, where agents only influence each other if their opinions are sufficiently close:
% \[
% x_i(t+1) = x_i(t) + \epsilon\sum_{j\sim i} \phi(|x_j(t) - x_i(t)|)(x_j(t) - x_i(t))
% \]
% where $\phi(z)$ drops to zero for large $z$. This nonlinear coupling allows polarized states to persist stably, as agents with very different opinions cease to influence each other.

% Network structure itself may evolve as people preferentially connect with like-minded others:
% \[
% w_{ij}(t+1) = w_{ij}(t) + \alpha(1-|x_i(t)-x_j(t)|) - \beta w_{ij}(t)
% \]
% This coupling between opinion and network dynamics creates a feedback loop potentially amplifying initial differences. The resulting nonlinear system can exhibit complex behaviors including pattern formation and metastable states.

% The mathematics reveals why such modifications disrupt consensus. Zealots create inhomogeneous forcing terms that prevent uniform states. Negative weights push eigenvalues outside the unit disk. Bounded confidence introduces threshold nonlinearities that allow multiple stable equilibria. Each modification breaks a key property that the original theory relied upon.

% Yet even as we acknowledge these complications, the linear framework remains valuable for understanding baseline dynamics and identifying when and how other factors disrupt them. The eigenstructure of the iteration matrix still shapes opinion evolution, even if it no longer ensures consensus. Spectral analysis reveals time scales of opinion change and identifies naturally stable configurations, even if these now include polarized states.

% The story of opinion dynamics thus provides crucial lessons about both the power and limitations of linear models. The basic averaging framework illuminates fundamental mechanisms of collective belief formation. Yet reality often harbors nonlinearities that qualitatively change system behavior. Understanding both the insights and limitations of our mathematical models proves essential for analyzing complex social phenomena.


% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 9}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}

    % First: Basic matrix powers and patterns (computational)
    \item Let $A = \begin{bmatrix}0 & 1 & 0\\1 & 0 & 1\\0 & 1 & 0\end{bmatrix}$. Compute the first five powers of $A$ and describe any patterns you observe. What does this tell you about random walks on the corresponding graph?
    
    % Simple stochastic matrix operations
    \item For the transition matrix $P = \begin{bmatrix}0.5 & 0.3 & 0.2\\0.4 & 0.4 & 0.3\\0.1 & 0.3 & 0.5\end{bmatrix}$, compute $P^2$ and $P^3$. What appears to be happening to the powers of $P$? Explain this behavior using the theory of stochastic matrices.
    
    % Basic Markov chain computation
    \item Consider a stochastic matrix $P = \begin{bmatrix}0.8 & 0.2\\0.3 & 0.7\end{bmatrix}$. Find its stationary distribution. Is this distribution unique? Explain why or why not using the Perron-Frobenius theory.
    
    % Graph Laplacian computation
    \item Let $G$ be an undirected graph with vertices $\{1,2,3,4\}$ and edges 
    \[
    E = \{(1,2),(2,3),(3,4),(4,1),(2,4)\}
    \]
    Write down its adjacency matrix $A$ and degree matrix $D$; then compute the graph Laplacian $L=D-A$ and find its eigenvalues. What do these tell you about the graph's connectivity?
    
    % Applied Markov chain problem
    \item A gambler starts with \$2 and plays a game where they win \$1 with probability 1/3 and lose \$1 with probability 2/3. They stop playing when they either go broke (\$0) or reach their goal of \$3. Model this as a Markov chain with states \{0,1,2,3\}. Find the transition matrix and compute the probability of eventually reaching the goal state.
    
    % Applied toy problem
    \item A child's toy has three buttons: red, blue, and green. When pressed, each button plays a tune and randomly transitions to lighting up another button (including possibly the same one) according to transition matrix
    \[
    P = \begin{bmatrix}
    0.2 & 0.4 & 0.3 \\
    0.5 & 0.3 & 0.4 \\
    0.3 & 0.3 & 0.3
    \end{bmatrix}
    \]
    If the red button is pressed first, what is the probability that the green button will be lit after exactly three presses? After many presses, what fraction of time will each button be lit?
    
    % Economic application
    \item Consider an input-output economic model with three sectors where the input matrix is $A = \begin{bmatrix}0.3 & 0.2 & 0.1\\0.2 & 0.4 & 0.3\\0.1 & 0.2 & 0.4\end{bmatrix}$. Find the equilibrium production levels. What does the Perron-Frobenius theorem tell you about the stability of this economy?
    
    % Basic theorem about adjacency matrices
    \item Let $A$ be the adjacency matrix of a simple undirected graph. Prove that the $(i,j)$ entry of $A^k$ counts the number of walks of length $k$ from vertex $i$ to vertex $j$.
    
    % Proof about doubly stochastic matrices
    \item A matrix $P$ is doubly stochastic if both its rows and columns sum to 1. Prove that if $P$ is doubly stochastic, then the vector $\vect{1}/n$ (where $n$ is the dimension) must be a stationary distribution. Is this distribution necessarily unique?
    
    % Random walk Laplacian proof
    \item For a connected undirected graph, the random walk Laplacian is defined as $L_{rw} = I - D^{-1}A$ where $D$ is the degree matrix and $A$ is the adjacency matrix. Prove that $L_{rw}$ always has eigenvalue 0 with eigenvector $\vect{1}$.
    
    % Stochastic matrix eigenvalue proof
    \item Let $P$ be an irreducible stochastic matrix. Prove that if $P$ has an eigenvalue $\lambda$ with $|\lambda|=1$, then $\lambda$ must be a root of unity (i.e., $\lambda^k=1$ for some positive integer $k$).
    
    % Network consensus problem
    \item Consider a network of 4 agents where each agent updates their opinion based on a weighted average of their neighbors' opinions. If the network is a square (4-cycle), write down the update matrix and determine how quickly opinions will converge to consensus.
    
    % Laplacian multiplicity theorem
    \item Let $G$ be a graph and $L$ its Laplacian matrix. Prove that the multiplicity of the eigenvalue 0 equals the number of connected components in $G$.
    
    % Stochastic matrix norm proof
    \item For an irreducible stochastic matrix $P$, show that $\|P^n\vect{x}\| \leq \|\vect{x}\|$ for any vector $\vect{x}$ perpendicular to $\vect{1}$, where $\|\cdot\|$ is the standard Euclidean norm.
    
    % Mean first passage time problem
    \item Let $P$ be a stochastic matrix representing transition probabilities in a Markov chain. Define the \style{mean first passage time} $m_{ij}$ as the expected number of steps to reach state $j$ starting from state $i$. Show that these times satisfy the equation $m_{ij} = 1 + \sum_{k\neq j} p_{ki}m_{kj}$ for $i\neq j$.
    
    % Absorbing state probability theorem
    \item Consider a Markov chain with transition matrix $P$ and suppose state $j$ is \style{absorbing} (meaning $p_{jj}=1$ and $p_{ij}=0$ for $i\neq j$). Prove that if the chain starts in state $i\neq j$, the probability of eventual absorption in state $j$ equals the $(i,j)$ entry of $(I-Q)^{-1}R$, where $Q$ is $P$ with row and column $j$ removed, and $R$ is column $j$ of $P$ with entry $j$ removed.
    
    % Time-reversed chain proof
    \item Let $P$ be the transition matrix of an irreducible Markov chain, and let $\vect{\pi}$ be its stationary distribution. The \style{time-reversed} chain has transition matrix $\tilde{P}$ where
    \[
    \tilde{p}_{ij} = \frac{\pi_i}{\pi_j}p_{ji}
    \]
    Prove that $\tilde{P}$ is stochastic and shares the same stationary distribution as $P$. What does this tell you about the reversibility of the chain?
    
    % Maximum entropy stationary distribution
    \item The entropy of a probability distribution $\vect{\pi}$ is defined as $H(\vect{\pi})=-\sum_i \pi_i\ln(\pi_i)$. For a stochastic matrix $P$, prove that if $\vect{\pi}$ is a stationary distribution, then $\vect{\pi}$ maximizes the entropy among all distributions $\vect{x}$ satisfying $P\vect{x}=\vect{x}$.
    
    % Cube spectrum problem
    \item Consider the graph of a cube (8 vertices, 12 edges). Write down its adjacency matrix and compute its spectrum. What does the spectrum tell you about random walks on this graph?
    
    % Clustering coefficient
    \item The \style{clustering coefficient} of a vertex $i$ in an undirected graph is defined as the fraction of pairs of $i$'s neighbors that are themselves connected. Given the adjacency matrix of a graph, write out the formula for computing the clustering coefficient of a vertex in terms of matrix operations. Apply your formula to compute the clustering coefficient of each vertex in the graph with adjacency matrix:
    \[
    A = \begin{bmatrix}
    0 & 1 & 1 & 0 \\
    1 & 0 & 1 & 1 \\
    1 & 1 & 0 & 1 \\
    0 & 1 & 1 & 0
    \end{bmatrix}
    \]
    
    % Eigenvector centrality 
    \item The \style{eigenvector centrality} $\vect{x}$ of nodes in a network is defined as the solution to $\lambda\vect{x} = A\vect{x}$ where $A$ is the adjacency matrix and $\lambda$ is the spectral radius of $A$. Given a network represented by adjacency matrix:
    \[
    A = \begin{bmatrix}
    0 & 1 & 1 & 0 & 0\\
    1 & 0 & 1 & 1 & 0\\
    1 & 1 & 0 & 1 & 1\\
    0 & 1 & 1 & 0 & 1\\
    0 & 0 & 1 & 1 & 0
    \end{bmatrix}
    \]
    Use power iteration to approximate the eigenvector centrality scores (normalize after each iteration). How do these scores relate to each node's degree? Explain why some nodes have higher centrality than their degree would suggest.
    
    \end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THE URTHONA CYCLE
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\cleardoublepage

\thispagestyle{empty} % no headers/footers

\begin{fullwidth}
  \vspace*{\fill} % push content down to center vertically
  \centering
  \includegraphics[width=0.75\textwidth]{URTHONA.png} % adjust width as needed
  \vspace*{\fill} % push content up to center vertically
\end{fullwidth}

\cleardoublepage

%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Singular Value Decomposition}
\label{ch:10}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``build we the Mundane Shell around the Rock of Albion''}

\newthought{Every transformation harbors} hidden symmetries beneath its surface complexity. Like crystal structures buried in seemingly formless rock, these patterns reveal themselves only through careful excavation and analysis. The eigendecomposition developed in previous chapters illuminates the structure of square matrices through their action under iteration. Yet this tool, powerful as it is, reaches only part way to the deepest patterns underlying linear transformations. A more fundamental decomposition lies waiting to be unearthed.

The limitation of eigentheory to square matrices is no accident  --  eigenvalues and eigenvectors emerge naturally from iteration, which requires a transformation to map a space to itself. Yet the geometric essence of a linear transformation  --  how it stretches and rotates space  --  extends beyond such endomorphisms. Every linear transformation, whether square or rectangular, admits a canonical decomposition that reveals its intrinsic geometric character. This decomposition exposes not just preferred directions, but fundamental relationships between input and output spaces that remain invisible to eigentheory alone.

Our task is to dig beneath the surface structure of linear transformations to discover these deeper patterns. We begin with the geometric intuition of how matrices transform spheres into ellipsoids, revealing natural input and output directions. From these foundations emerges the singular value decomposition, providing both theoretical insight and practical tools for understanding linear transformations in their full generality.

The power of this decomposition lies in its fusion of algebraic and geometric perspectives. What appears as abstract factorization in coordinates manifests geometrically as an optimal sequence of simple transformations. This optimality principle  --  that the singular value decomposition provides best possible approximations in various precise senses  --  transforms our theoretical understanding into practical methods for data compression, signal processing, and dimensionality reduction. The tools we develop here will prove fundamental to modern applications from image processing to machine learning.

% ==============================================
\section{Spheres, Ellipsoids, \& Singular Values}
\label{sec:stretching}
% ==============================================

The geometry of a linear transformation $A\in\R^{m\times n}$ is vividly revealed by its action on the unit sphere in its domain, $\R^n$. The unit sphere, defined by $\|\vect{x}\|^2=1$ or $\vect{x}^T\vect{x} = 1$, is deformed by $A$ into an ellipsoid in the codomain, $\R^m$. Understanding this deformation is key.

The shape and orientation of this output ellipsoid are determined by the symmetric positive semidefinite matrix $A^TA$. The squared length of a transformed vector $A\vect{x}$ is given by:
\[
    \|A\vect{x}\|^2 = (A\vect{x})^T(A\vect{x}) = \vect{x}^T(A^TA)\vect{x}.
\]
By the Spectral Theorem (Theorem \ref{thm:spectral}), $A^TA$ (being an $n \times n$ symmetric matrix) has $n$ real, non-negative eigenvalues $\lambda_1 \geq \lambda_2 \geq \dots \geq \lambda_n \geq 0$. Let $V=[\vect{v}_1 \dots \vect{v}_n]$ be an orthogonal matrix whose columns are the corresponding orthonormal eigenvectors of $A^TA$. These eigenvectors $\vect{v}_i$ represent principal directions in the input space $\R^n$. When $\vect{x}$ is one of these eigenvectors, say $\vect{x}=\vect{v}_i$, then
\[
    \|A\vect{v}_i\|^2 = \vect{v}_i^T(A^TA)\vect{v}_i = \vect{v}_i^T(\lambda_i \vect{v}_i) = \lambda_i \|\vect{v}_i\|^2 = \lambda_i.
\]
%
\begin{marginfigure}
\centering
    \includegraphics[width=1.0in]{ellipsoid.png}
\end{marginfigure}
%
Thus, the matrix $A$ stretches the principal input direction $\vect{v}_i$ by a factor of $\sqrt{\lambda_i}$. These stretching factors are fundamental to the transformation $A$.

\begin{definition}[Singular Values]
\label{def:singular-values}
Let $A\in\R^{m\times n}$. The $n \times n$ matrix $A^TA$ is symmetric and positive semidefinite, so its eigenvalues $\lambda_i$ are real and non-negative. The \style{singular values} of $A$, denoted $\sigma_i$, are the square roots of these eigenvalues: $\sigma_i = \sqrt{\lambda_i}$. They are conventionally arranged in descending order:
\begin{marginfigure}
    The eigenvalues $\lambda_i$ specifically refer to those of $A^TA$.
\end{marginfigure}
\[
    \sigma_1 \geq \sigma_2 \geq \cdots \geq \sigma_n \geq 0.
\]
The number of non-zero singular values is $r = \rank(A^TA) = \rank(A)$. The corresponding orthonormal eigenvectors $\vect{v}_i$ of $A^TA$ are called the \style{right singular vectors} of $A$.
\end{definition}

The image of the unit sphere under $A$ is an ellipsoid (possibly degenerate if $A$ is rank-deficient) in $\R^m$. The semi-axes of this ellipsoid are aligned with certain vectors $\vect{u}_i \in \R^m$ and have lengths equal to the non-zero singular values $\sigma_i$. The directions $\vect{v}_i$ in $\R^n$ are mapped by $A$ to these semi-axis vectors: $A\vect{v}_i = \sigma_i \vect{u}_i$. The vectors $\vect{u}_i$ will be the \style{left singular vectors}.

\begin{example}[Image Transformation]
Consider the $2\times 2$ matrix:
\[
    A = \begin{bmatrix}
    3 & 1 \\
    1 & 2
    \end{bmatrix}
    \quad
    \Rightarrow
    \quad
    A^TA = \begin{bmatrix} 
    10 & 5 \\ 
    5 & 5 
    \end{bmatrix} 
\]
The characteristic polynomial of $A^TA$ is $\lambda^2 - 15\lambda + 25 = 0$.
Its eigenvalues are $\lambda_1 = (15 + \sqrt{125})/2 \approx 13.09$ and $\lambda_2 = (15 - \sqrt{125})/2 \approx 1.91$.
The singular values are $\sigma_1 = \sqrt{\lambda_1} \approx 3.618$ and $\sigma_2 = \sqrt{\lambda_2} \approx 1.382$.
The unit circle in $\R^2$ is transformed by $A$ into an ellipse whose semi-axes have lengths $\sigma_1$ and $\sigma_2$. The directions of these semi-axes in $\R^2$ (the domain) are given by the eigenvectors of $A^TA$.
\end{example}

This geometric picture -- mapping principal input directions (eigenvectors of $A^TA$) to scaled principal output directions -- forms the intuitive basis for the Singular Value Decomposition.

% ==============================================
\section{Constructing the SVD}
\label{sec:svd_construction} % Changed section title
% ==============================================

The geometric insight that a linear transformation $A$ maps orthonormal principal input directions to orthogonal principal output directions, scaled by singular values, leads directly to its most fundamental factorization. We aim to find orthogonal matrices $U$ and $V$ and a rectangular diagonal matrix $\Sigma$ such that $A = U\Sigma V^T$.

\subsection*{Step 1: Finding $V$ and the Singular Values $\sigma_i$.}
As established in Section \ref{sec:stretching}, the matrix $A^TA$ is an $n \times n$ symmetric positive semidefinite matrix. By the Spectral Theorem, there exists an $n \times n$ orthogonal matrix $V = [\vect{v}_1 \dots \vect{v}_n]$ whose columns are orthonormal eigenvectors of $A^TA$, and an $n \times n$ diagonal matrix $\Lambda = \diag(\lambda_1, \dots, \lambda_n)$ of corresponding non-negative eigenvalues, such that $A^TA = V\Lambda V^T$.
The singular values of $A$ are $\sigma_i = \sqrt{\lambda_i}$, ordered $\sigma_1 \ge \sigma_2 \ge \dots \ge \sigma_n \ge 0$. Let $r$ be the rank of $A$, which is also the number of non-zero singular values. The columns $\vect{v}_1, \dots, \vect{v}_r$ form an orthonormal basis for $(\ker A)^\perp = \im(A^T)$, and $\vect{v}_{r+1}, \dots, \vect{v}_n$ form an orthonormal basis for $\ker A = \ker(A^TA)$.

\subsection*{Step 2: Defining the Left Singular Vectors $U$.}
For each $i=1, \dots, r$ (where $\sigma_i > 0$), define the vector $\vect{u}_i \in \R^m$ by
\[
    \vect{u}_i = \frac{1}{\sigma_i} A\vect{v}_i.
\]
These $r$ vectors are orthonormal. To see this, consider their inner product:
\begin{align*}
    \vect{u}_i^T \vect{u}_j &= \left(\frac{1}{\sigma_i} A\vect{v}_i\right)^T \left(\frac{1}{\sigma_j} A\vect{v}_j\right) = \frac{1}{\sigma_i \sigma_j} \vect{v}_i^T (A^TA) \vect{v}_j \\
    &= \frac{1}{\sigma_i \sigma_j} \vect{v}_i^T (\lambda_j \vect{v}_j) \quad (\text{since } \vect{v}_j \text{ is an eigenvector of } A^TA) \\
    &= \frac{\sigma_j^2}{\sigma_i \sigma_j} (\vect{v}_i^T \vect{v}_j).
\end{align*}
Since $\{\vect{v}_k\}$ is an orthonormal set, $\vect{v}_i^T \vect{v}_j = \delta_{ij}$. Thus, $\vect{u}_i^T \vect{u}_j = \frac{\sigma_j}{\sigma_i}\delta_{ij}$. For $i=j$, $\vect{u}_i^T \vect{u}_i = 1$. For $i \neq j$, $\vect{u}_i^T \vect{u}_j = 0$.
So, $\{\vect{u}_1, \dots, \vect{u}_r\}$ is an orthonormal set of $r$ vectors in $\R^m$. These vectors form an orthonormal basis for the column space of $A$, $\im(A)$.

%\subsection*{Step 3: Completing the Orthonormal Basis $U$.}
If $r < m$, the set $\{\vect{u}_1, \dots, \vect{u}_r\}$ does not span all of $\R^m$. We can extend it to a full orthonormal basis for $\R^m$ by choosing an additional $m-r$ orthonormal vectors $\{\vect{u}_{r+1}, \dots, \vect{u}_m\}$ that form a basis for $(\im A)^\perp = \ker(A^T)$.
Let $U = [\vect{u}_1 \dots \vect{u}_r \dots \vect{u}_m]$ be the $m \times m$ orthogonal matrix whose columns are these left singular vectors.

\subsection*{Step 3: Defining the Matrix $\Sigma$.}
Let $\Sigma$ be the $m \times n$ matrix whose entry $\Sigma_{ii} = \sigma_i$ for $i=1, \dots, p = \min(m,n)$, and all other entries are zero. If $r < p$, then $\sigma_{r+1}, \dots, \sigma_p$ are zero.

With $U$, $\Sigma$, and $V$ thus constructed, we have $A\vect{v}_i = \sigma_i \vect{u}_i$ for $i=1, \dots, r$, and $A\vect{v}_i = \vect{0}$ for $i=r+1, \dots, n$ (since these $\vect{v}_i$ are in $\ker A$).
This set of vector equations can be written in matrix form as $AV = U\Sigma'$, where $\Sigma'$ is an $m \times n$ matrix whose first $r$ diagonal entries are $\sigma_1, \dots, \sigma_r$ and all other entries are zero. This $\Sigma'$ is precisely our $\Sigma$.
Since $V$ is orthogonal, $V^{-1}=V^T$, so $AV=U\Sigma$ implies $A=U\Sigma V^T$.

\begin{marginfigure}
{\em Nota bene:} The columns of $U$ are also the orthonormal eigenvectors of $AA^T$, and the non-zero eigenvalues of $AA^T$ are $\sigma_1^2, \dots, \sigma_r^2$, the same as for $A^TA$. One could alternatively start by diagonalizing $AA^T$ to find $U$ and the $\sigma_i^2$.
\end{marginfigure}

This construction leads to the central theorem:

\begin{theorem}[Singular Value Decomposition]
\label{thm:svd}
Every matrix $A\in\R^{m\times n}$ admits a decomposition
\begin{equation}
\label{eq:SVD}
    A = U\Sigma V^T
\end{equation}
where:
\begin{enumerate}
    \item $U\in\R^{m\times m}$ is an orthogonal matrix whose columns are the left singular vectors of $A$.
    \item $V\in\R^{n\times n}$ is an orthogonal matrix whose columns are the right singular vectors of $A$.
    \item $\Sigma\in\R^{m\times n}$ is a rectangular diagonal matrix, where the diagonal entries $\Sigma_{ii} = \sigma_i$ are the singular values of $A$, ordered $\sigma_1\geq\sigma_2\geq\cdots\geq\sigma_p\geq 0$ with $p=\min\{m,n\}$.
\end{enumerate}
The singular values $\sigma_i$ are uniquely determined.
\end{theorem}

This SVD reveals the fundamental action of $A$: it maps the $i$-th right singular vector $\vect{v}_i$ to $\sigma_i$ times the $i$-th left singular vector $\vect{u}_i$:
\[
    A\vect{v}_i = \sigma_i\vect{u}_i \quad\text{for } i=1, \dots, \min(m,n).
\]
If $\sigma_i=0$, then $A\vect{v}_i = \vect{0}$. The transformation $A$ essentially acts by:
\begin{enumerate}
    \item Rotating/reflecting the input space so the basis vectors $\vect{e}_i$ align with $\vect{v}_i$ (action of $V^T$).
    \item Scaling these aligned vectors by $\sigma_i$ along the new axes (action of $\Sigma$).
    \item Rotating/reflecting the result into the output space so the scaled axes align with $\vect{u}_i$ (action of $U$).
\end{enumerate}

The SVD is arguably the most important matrix factorization, providing deep insights into a matrix's structure, geometry, rank, and numerical properties. Its applications, explored in subsequent sections and chapters, are vast.

% ==============================================
\section{Interpreting the SVD}
\label{sec:svd_interpretation}
% ==============================================

The Singular Value Decomposition $A = U\Sigma V^T$ provides far more than a mere algebraic factorization of a matrix $A\in\R^{m\times n}$; it unveils the intrinsic geometry and fundamental structure of the linear transformation $A$ represents. Having constructed the components $U$, $\Sigma$, and $V$ in Section \ref{sec:svd_construction}, we now consider their meaning and how they connect to core concepts like rank, the four fundamental subspaces, and the action of $A$ on its domain.

The singular values $\sigma_i$, found on the diagonal of $\Sigma$, are the stretching factors or ``gains'' of the transformation along specific orthogonal directions. The largest singular value, $\sigma_1$, is precisely the \style{spectral norm} (or 2-norm) of $A$, representing the maximum factor by which $A$ can stretch any unit vector:
\[
    \|A\|_2 = \max_{\|\vect{x}\|=1} \|A\vect{x}\| = \sigma_1.
\]
The ordered sequence $\sigma_1 \ge \sigma_2 \ge \dots \ge \sigma_r > 0$ (where $r=\rank(A)$) indicates the relative importance of different modes of the transformation. A rapid decay in these singular values, as we shall see in Chapters \ref{ch:11} and \ref{ch:12}, suggests that the matrix $A$ (and the data it might represent) is well-approximated by a matrix of lower rank. 
\begin{marginfigure}
    This is the cornerstone of dimensionality reduction and data compression techniques like Principal Component Analysis.
\end{marginfigure}

The columns of $V = [\vect{v}_1 \dots \vect{v}_n]$ are the \style{right singular vectors}. Each $\vect{v}_i$ represents a \style{principal input direction}. When $A$ represents data (e.g., rows as observations, columns as features), these $\vect{v}_i$ correspond to principal directions or inherent patterns within the feature space. For instance, if $A$ were a document-term matrix, the $\vect{v}_i$ might align with underlying topics.
\begin{marginfigure}
    The vectors $\vect{v}_i$ are the directions in the domain that get mapped by $A$ to the semi-axes of the ellipsoid formed by transforming the unit sphere.
\end{marginfigure}

The columns of $U = [\vect{u}_1 \dots \vect{u}_m]$ are the \style{left singular vectors}, forming an orthonormal basis for the output space $\R^m$. The fundamental action of $A$ on its right singular vectors is to map them to scaled versions of its left singular vectors:
\[
    A\vect{v}_i = \sigma_i \vect{u}_i \quad \text{for } i=1, \dots, \min(m,n).
\]
If $\sigma_i=0$, then $A\vect{v}_i = \vect{0}$. The set $\{\vect{u}_1, \dots, \vect{u}_r\}$ (corresponding to non-zero $\sigma_i$) forms an orthonormal basis for the image (or column space) of $A$, $\im(A)$. These $\vect{u}_i$ are the \style{principal output directions}. In a data context where rows of $A$ are observations, the columns of $U\Sigma$ (specifically $U_r\Sigma_r$, where $U_r$ and $\Sigma_r$ contain the first $r$ components) can be interpreted as the coordinates of the transformed data in the basis of principal output directions.

The SVD provides a striking geometric refinement and an explicit construction for the concepts surrounding the Fundamental Theorem of Linear Algebra (Theorem \ref{thm:FTLAgeom}). Recall that any linear transformation $A:\R^n\rightarrow \R^m$ (using matrix notation for $T$) is associated with four fundamental subspaces. The SVD not only confirms their existence and dimensional relationships but also provides orthonormal bases for each:

\begin{itemize}
    \item The \style{column space} or \style{image} of $A$, $\im(A) \subset \R^m$:
        The left singular vectors $\{\vect{u}_1,\ldots,\vect{u}_r\}$ corresponding to non-zero singular values $\sigma_1, \dots, \sigma_r$ form an orthonormal basis for $\im(A)$. Thus, $\dim(\im A) = r$.
    \item The \style{null space} or \style{kernel} of $A$, $\ker(A) \subset \R^n$:
        The right singular vectors $\{\vect{v}_{r+1},\ldots,\vect{v}_n\}$ corresponding to zero singular values (if $r<n$) form an orthonormal basis for $\ker(A)$. Thus, $\dim(\ker A) = n-r$.
    \item The \style{row space} of $A$, $\im(A^T) \subset \R^n$:
        Since $A^T = V\Sigma^T U^T$, the right singular vectors $\{\vect{v}_1,\ldots,\vect{v}_r\}$ (which are left singular vectors for $A^T$ corresponding to non-zero $\sigma_i$) form an orthonormal basis for $\im(A^T)$. This space is also $(\ker A)^\perp$. Thus, $\dim(\im A^T) = r$.
    \item The \style{left null space} of $A$, $\ker(A^T) \subset \R^m$:
        The left singular vectors $\{\vect{u}_{r+1},\ldots,\vect{u}_m\}$ corresponding to positions associated with zero singular values (if $r<m$) form an orthonormal basis for $\ker(A^T)$. This space is also $(\im A)^\perp$. Thus, $\dim(\ker A^T) = m-r$.
\end{itemize}
The rank of $A$, $r$, is immediately identifiable as the number of non-zero singular values. The Rank-Nullity Theorem, $\dim(\ker A) + \dim(\im A^T) = n$ (or $ (n-r) + r = n$), and its counterpart for $A^T$ are thus made explicit by the SVD.

\begin{marginfigure}
{\em Think:} The SVD elegantly decomposes $\R^n = \im(A^T) \orthosum \ker(A)$ and $\R^m = \im(A) \orthosum \ker(A^T)$, with $A$ acting as an isomorphism (scaled by $\sigma_i$) between $\im(A^T)$ and $\im(A)$.
\end{marginfigure}

The SVD expresses $A$ as a sum of $r$ rank-one matrices, often called the SVD expansion:
\[
    A = U\Sigma V^T = \sum_{i=1}^r \sigma_i \vect{u}_i \vect{v}_i^T.
\]
Each term $\sigma_i \vect{u}_i \vect{v}_i^T$ is a rank-one matrix representing an outer product. This expansion shows $A$ as a linear combination of these fundamental rank-one "layers," ordered by the magnitude of their corresponding singular values $\sigma_i$. This form is pivotal for low-rank approximation (Chapter \ref{ch:12}), where truncating this sum by keeping only the first $k$ terms yields $A_k = \sum_{i=1}^k \sigma_i \vect{u}_i \vect{v}_i^T$, the best rank-$k$ approximation to $A$.

Finally, the SVD provides the most general and stable way to define the \style{pseudoinverse} $A^\dagger$ of any matrix $A$, extending the concept from Chapter \ref{ch:6}. If $A = U\Sigma V^T$, its pseudoinverse is given by:
\[
    A^\dagger = V \Sigma^\dagger U^T.
\]
Here, $\Sigma^\dagger$ is an $n \times m$ rectangular diagonal matrix. If $\Sigma_{ii} = \sigma_i > 0$, then $(\Sigma^\dagger)_{ii} = 1/\sigma_i$. If $\Sigma_{ii} = 0$, then $(\Sigma^\dagger)_{ii} = 0$. All off-diagonal entries of $\Sigma^\dagger$ are zero.

The pseudoinverse $A^\dagger$ effectively "inverts" the action of $A$ where possible:
\begin{itemize}
    \item It maps vectors from $\im(A)$ back to $\im(A^T)$ by "undoing" the scaling by $\sigma_i$: if $\vect{y} = \sigma_i \vect{u}_i \in \im(A)$, then $A^\dagger \vect{y} = \vect{v}_i$.
    \item It maps vectors from $(\im A)^\perp = \ker(A^T)$ to the zero vector in $\R^n$.
\end{itemize}
As discussed in Chapter \ref{ch:6}, $A^\dagger\vect{b}$ yields the minimum-norm least-squares solution to $A\vect{x}=\vect{b}$. The SVD construction of $A^\dagger$ makes this general for any $A$. Furthermore, $AA^\dagger = U_rU_r^T$ (where $U_r = [\vect{u}_1 \dots \vect{u}_r]$) is the orthogonal projection onto $\im(A)$, and $A^\dagger A = V_rV_r^T$ (where $V_r = [\vect{v}_1 \dots \vect{v}_r]$) is the orthogonal projection onto $\im(A^T)$.

\begin{marginfigure}
    The SVD thus offers a ``master key'' to understanding a linear transformation $A$: it provides optimal orthonormal bases for its domain and codomain ($V$ and $U$), the scaling factors along these principal axes ($\Sigma$), its rank, explicit bases for all four fundamental subspaces, and a robust definition of its generalized inverse ($A^\dagger$).
\end{marginfigure}

In essence, the SVD reveals that any linear transformation, no matter how complex its initial matrix representation, can be understood as a sequence of three fundamental geometric operations: a rotation/reflection ($V^T$), a scaling along axes ($\Sigma$), and another rotation/reflection ($U$). This profound insight is central to many modern applications of linear algebra.

% ==============================================
\section{Invariance \& Natural Structure}
\label{sec:invariance}
% ==============================================

The singular values emerge not merely as computational artifacts but as fundamental invariants of the transformation  --  quantities that remain unchanged when viewed through different orthonormal bases. If $Q_1$ and $Q_2$ are orthogonal matrices, the transformation $B=Q_1AQ_2^T$ represents the same underlying map as $A$, merely viewed through different coordinates. Yet its singular values match those of $A$ exactly, measuring intrinsic stretching factors independent of our choice of measurement frame.

Two matrix norms emerge naturally from this geometric perspective:

\begin{definition}[Matrix Norms]
\label{def:matrixnorms}
For a matrix $A\in\R^{m\times n}$:
\begin{enumerate}
    \item The \style{spectral norm} (or \style{2-norm}) measures maximal stretching:
    \[
        \|A\|_2 = \max_{\|\vect{x}\|=1} \|A\vect{x}\| = \sigma_1
    \]
    \item The \style{Frobenius norm} measures total energy:
    \[
        \|A\|_F = \left(\sum_{i,j} a_{ij}^2\right)^{1/2} = \left(\sum_{i=1}^p \sigma_i^2\right)^{1/2}
    \]
\end{enumerate}
\end{definition}
\begin{marginfigure}
    {\em Compare:} The Frobenius norm comes from the inner product in Chapter 5, Example \ref{ex:Frobenius}. It provides a natural measure of approximation error, effectively counting the total squared discrepancy across all matrix entries.
\end{marginfigure}
These norms connect deeply to the SVD structure through the fundamental metrics $A^TA$ and $AA^T$ in domain and codomain:
\[
    A^TA = V\Sigma^T\Sigma V^T = \sum_{i=1}^p \sigma_i^2\vect{v}_i\vect{v}_i^T
    \quad\text{and}\quad
    AA^T = U\Sigma\Sigma^T U^T = \sum_{i=1}^p \sigma_i^2\vect{u}_i\vect{u}_i^T
\]
The equality of nonzero eigenvalues between these matrices now emerges naturally from singular value structure.

Even more remarkable is how singular values control composition of transformations. While eigenvalues can grow explosively under matrix multiplication, singular values satisfy delicate inequalities:

\begin{lemma}[Singular Value Interlacing]
\label{lem:svinterlace}
For matrices $A$ and $B$ of compatible size with singular values in descending order:
\[
    \sigma_i(AB) \leq \sigma_i(A)\sigma_1(B)
\]
\end{lemma}

\begin{proof}[Proof Idea]
The key insight comes from the \style{minimax principle}: the $k$th singular value equals the minimal spectral norm over all rank-$(k-1)$ approximations. For any $k$-dimensional subspace $S$:
\[
    \sigma_k(AB) = \min_{\rank(X)<k} \|AB-X\|_2 \leq \|A\|_2\sigma_k(B) = \sigma_1(A)\sigma_k(B)
\]
A more careful argument using the SVDs of both $A$ and $B$ establishes the full inequality.
\end{proof}

This control of singular values under composition has no direct analog for eigenvalues. Even the product of two symmetric positive definite matrices can have complex eigenvalues, while their singular values remain real and well-behaved. This stability under composition helps explain why singular values often prove more useful than eigenvalues in analyzing iterative processes and error propagation.

The singular values also provide the most natural measure of matrix rank:
\[
    \rank(A) = \#\{\sigma_i > 0\}
\]
This equality illuminates the geometric meaning of rank as counting independent stretching directions. More subtly, small singular values indicate directions that are \textit{nearly} dependent  --  a crucial insight for the numerical computation of rank that we shall explore in Chapter \ref{ch:12}.

Each of these relationships  --  through norms, composition, and rank  --  reflects a different facet of how singular values capture the intrinsic character of linear transformations. Their collective power lies in uniting algebraic, geometric, and computational perspectives into a single coherent framework for understanding linear maps.

The singular values of a matrix provide more than just optimal approximation  --  they quantify precisely how the matrix distorts space under transformation. Recall from Chapter 1 the notion of condition number as a measure of numerical sensitivity. The SVD framework now allows us to define this concept rigorously:

\begin{definition}[Condition Number]
For a nonsingular matrix $A$ with singular values $\sigma_1\geq\sigma_2\geq\cdots\geq\sigma_n>0$, the \style{condition number} is the ratio
\[
    \cond(A) = \frac{\sigma_1}{\sigma_n}
\]
of largest to smallest singular values. For singular matrices, we set $\cond(A)=\infty$.
\end{definition}

This definition illuminates why condition number measures sensitivity. When solving $A\vect{x}=\vect{b}$, the SVD shows that $A$ stretches some directions by $\sigma_1$ while compressing others by $\sigma_n$. The ratio $\sigma_1/\sigma_n$ thus bounds how much relative errors can be amplified when computing solutions. More precisely, small perturbations $\delta\vect{b}$ in the right-hand side can produce changes in solution of magnitude up to $\cond(A)\|\delta\vect{b}\|$.

\begin{marginfigure}
{\em Example:} The matrix from Example \ref{ex:conditionintro} in Chapter 1,
\[
    A = \begin{bmatrix}
    1 & 0.999 \\
    0 & 0.001
    \end{bmatrix}
\]
has singular values $\sigma_1\approx 2$ and $\sigma_2\approx 0.001$, explaining its condition number of about 2000.
\end{marginfigure}

The condition number remains unchanged under orthogonal transformations: if $Q_1$ and $Q_2$ are orthogonal, then $\cond(Q_1AQ_2)=\cond(A)$. This invariance reflects that conditioning measures intrinsic sensitivity rather than artifacts of particular coordinate choices. Indeed, $\cond(A)$ can be characterized independently of the SVD as
\[
    \cond(A) = \|A\|\|A^{-1}\|
\]
where $\|\cdot\|$ denotes any matrix norm invariant under orthogonal transformations.

This geometric interpretation of conditioning  --  as measuring the eccentricity of how $A$ transforms the unit sphere  --  explains its fundamental importance in numerical computation. Matrices with large condition numbers map some directions to nearly zero, making accurate recovery of inputs from outputs inherently difficult regardless of what numerical method we employ.
%%%%%%%%%%%%%%%%%%%%%%%%%

The singular values also illuminate the fundamental subspaces studied in Chapter \ref{ch:3}. A transformation's kernel corresponds precisely to right singular vectors with zero singular values:
\[
    \ker(A) = \spanset\{\vect{v}_i : \sigma_i = 0\}
\]
while its image is spanned by left singular vectors with nonzero singular values:
\[
    \im(A) = \spanset\{\vect{u}_i : \sigma_i > 0\}
\]

\begin{marginfigure}
{\em Think:} The SVD provides an orthonormal basis for each of the four fundamental subspaces: kernel, image, cokernel, and coimage. This geometric refinement of the Fundamental Theorem reveals not just dimensions but natural coordinate systems.
\end{marginfigure}

This understanding transforms abstract concepts like rank and nullity into concrete geometric measurements. A singular value of zero indicates a direction that collapses under the transformation; the number of such zeros counts the nullity. The nonzero singular values measure how much each surviving direction stretches or compresses; their number gives the rank.

\begin{example}[Matrix Completion]
Consider a matrix with missing entries, like data from an incomplete survey:
\[
    A = \begin{bmatrix}
    1 & ? & 2 \\
    2 & 1 & ? \\
    ? & 2 & 3
    \end{bmatrix}
\]
If we suspect the true matrix has low rank (meaning many dependencies among entries), the SVD suggests a natural completion strategy: fill the missing entries to minimize the number of nonzero singular values. This geometric principle  --  that real data often lies near a lower-dimensional subspace  --  has profound implications for modern data science.
\end{example}

While the singular values are uniquely determined (when arranged in descending order), the singular vectors exhibit constrained non-uniqueness that reflects fundamental symmetries:
\begin{itemize}
    \item Vectors corresponding to distinct nonzero singular values are determined up to sign
    \item Vectors sharing a singular value can be rotated within their subspace
    \item Vectors corresponding to zero singular values need only form an orthonormal basis for the kernel
\end{itemize}

\begin{example}[Image Compression]
A grayscale image stored as an $m\times n$ matrix typically has full rank  --  every singular value is nonzero. Yet most singular values may be very small, indicating directions that contribute little to the image's visual content. Setting these small singular values to zero effectively reduces the rank while maintaining essential features. This process of low-rank approximation, to be studied in detail in Chapter \ref{ch:12}, exemplifies how singular values guide practical computation through geometric insight.
\end{example}

This geometric understanding transforms our view of linear transformations from mere computational objects into structured entities with intrinsic character. The SVD reveals not just how to factor matrices but how to read the deepest patterns woven into linear maps themselves. These patterns  --  of stretching, rank, and fundamental subspaces  --  will guide our development of both theoretical understanding and practical algorithms in the chapters ahead.


% ==============================================
\section{Inner Products \& the SVD}
\label{sec:svdIP}
% ==============================================

The singular value decomposition generalizes naturally to linear transformations between arbitrary inner product spaces. Given $T:V\rightarrow W$ between finite-dimensional inner product spaces, the adjoint $T^*:W\rightarrow V$ defined in Chapter \ref{ch:5} allows us to form $T^*T:V\rightarrow V$ and $TT^*:W\rightarrow W$. These self-adjoint operators play the role of $A^TA$ and $AA^T$, with their spectral properties determining the SVD structure.

\begin{marginfigure}
{\em Definition:} For vectors $\vect{u}\in W$ and $\vect{v}\in V$, the tensor product $\vect{u}\otimes\vect{v}$ denotes the rank-one operator sending $\vect{x}\mapsto \langle\vect{x},\vect{v}\rangle\vect{u}$. This generalizes the matrix outer product $\vect{u}\vect{v}^T$ to arbitrary inner product spaces.
\end{marginfigure}

More precisely, let $\{\vect{v}_1,\ldots,\vect{v}_n\}$ be orthonormal eigenvectors of $T^*T$ with eigenvalues $\{\sigma_1^2,\ldots,\sigma_n^2\}$. For each nonzero singular value $\sigma_i$, the vector $\vect{u}_i=\frac{1}{\sigma_i}T\vect{v}_i$ is well-defined, and these vectors form an orthonormal set in $W$. The transformation then admits decomposition:
\[
    T = \sum_{i=1}^r \sigma_i(\vect{u}_i\otimes\vect{v}_i)
\]
where $r=\rank(T)$. When expressed in orthonormal bases, this abstract decomposition yields exactly the matrix factorization $A=U\Sigma V^T$ developed earlier.

This coordinate-free perspective reveals that the SVD is not merely a matrix factorization but a fundamental property of linear transformations between finite-dimensional inner product spaces. The assumption of finite dimensionality is crucial  --  in infinite dimensions, the story becomes far more subtle. While compact operators between Hilbert spaces admit a similar spectral decomposition with countably many singular values approaching zero, general bounded operators may lack such decomposition entirely. This boundary between finite and infinite dimensions marks a profound transition in the structure of linear transformations.

\begin{marginfigure}
{\em BONUS!} The spectral theory of compact operators in infinite dimensions leads to deep connections with integral equations, quantum mechanics, and functional analysis. The SVD appears there as the \style{Schmidt decomposition} of an integral kernel.
\end{marginfigure}

For transformations between finite-dimensional spaces, however, the inner product structure is sufficient  --  without it, we cannot form adjoints or measure orthogonality, and the beautiful connection between input and output spaces through singular vectors dissolves. The inner product provides exactly the geometric structure needed for the SVD to emerge naturally, independent of any choice of coordinates.

\begin{example}[Finite-Dimensional Function Spaces]
Consider the space $\poly_n$ of polynomials of degree at most $n$, equipped with the $L^2$ inner product on $[0,1]$: $\langle f,g\rangle=\int_0^1 f(t)g(t)\,dt$. The differentiation operator $D:\poly_n\to\poly_{n-1}$ is linear, and its adjoint $D^*:\poly_{n-1}\to\poly_n$ involves both integration and boundary terms. Though we work with functions, the finite-dimensionality of these polynomial spaces ensures the SVD exists and is unique.
\end{example}

\begin{example}[Integration Operator]
Consider the integration operator $T:C([0,1])\to C([0,1])$ defined by
\[
    (Tf)(x) = \int_0^x f(t)\,dt
\]
This operator is bounded when $C[0,1]$ is equipped with the standard inner product $\langle f,g\rangle = \int_0^1 f(t)g(t)\,dt$. Its adjoint can be found through integration by parts:
\[
    (T^*f)(x) = \int_x^1 f(t)\,dt
\]

The composed operator $T^*T$ then has a particularly nice form:
\[
    (T^*Tf)(x) = \int_0^1 \min(x,t)f(t)\,dt
\]
This is an example of an integral operator with a symmetric kernel. While we cannot write down its singular values and vectors explicitly (they involve solutions to certain differential equations), we know they must form a complete orthonormal set in $C([0,1])$. This infinite sequence of singular values approaches zero, ensuring the operator is compact  --  a key distinction from the finite-dimensional case where singular values stay bounded away from zero unless exactly zero.

This example hints at the deeper theory of integral operators, where the SVD manifests as an infinite series rather than a finite sum. The geometric intuition remains  --  we are still decomposing the transformation into orthogonal components  --  but the infinitude of dimensions introduces subtleties of convergence \& completeness absent in finite dimensions.
\end{example}

% The SVD provides striking geometric refinement of the Fundamental Theorem from Chapter \ref{ch:3}. Recall that any linear transformation $T:V\rightarrow W$ induces four fundamental sub- or quotient-spaces:
% \begin{itemize}
%     \item The kernel $\ker T < V$
%     \item The image $\im T < W$
%     \item The cokernel $\coker T = W/\im T \iso (\im T)^\perp$
%     \item The coimage $\coim T = V/\ker T \iso (\ker T)^\perp$
% \end{itemize}
% The SVD reveals not just the existence of these spaces but their precise geometric relationship. The right singular vectors $\{\vect{v}_1,\ldots,\vect{v}_n\}$ split naturally into two sets:
% \begin{enumerate}
%     \item Those corresponding to nonzero singular values form an orthonormal basis for $(\ker T)^\perp$
%     \item Those corresponding to zero singular values form an orthonormal basis for $\ker T$
% \end{enumerate}
% Similarly, the left singular vectors $\{\vect{u}_1,\ldots,\vect{u}_m\}$ provide:
% \begin{enumerate}
%     \item An orthonormal basis for $\im T$ (nonzero singular values)
%     \item An orthonormal basis for $(\im T)^\perp$ (zero singular values)
% \end{enumerate}

% \begin{marginfigure}
% {\em Think:} The singular values quantify how much of the input survives projection onto $(\ker T)^\perp$ and transfer to $\im T$  --  they measure the ``strength'' of the transformation along each fundamental direction.
% \end{marginfigure}

% More remarkably, the SVD shows these spaces are connected by an elegant isomorphism:
% \[
%  \ker T \perp \spanset\{\vect{v}_1,\ldots,\vect{v}_r\} \iso \spanset\{\vect{u}_1,\ldots,\vect{u}_r\} \perp (\im T)^\perp
% \]
% where $r=\rank T$ and the middle isomorphism scales by singular values. This precisely quantifies how the transformation connects input to output space, with singular values measuring the strength of this connection along each fundamental direction.

% \begin{marginfigure}
% {\em Nota bene:} The earlier matrix formulation $A=U\Sigma V^T$ represents this abstract decomposition in coordinates, with $U$ and $V$ expressing the singular vectors relative to chosen orthonormal bases.
% \end{marginfigure}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Latent Semantic Structure in Text}
\label{EM:text}
% **************** EMANATION *******************

Hidden within the soil of human discourse lie the roots of text that grow into meaning. The singular value decomposition reveals these latent structures, transforming our intuitive sense that {\em words are known by the company they keep} into more precise mathematical insights. Through careful analysis of word co-occurrence patterns, we can uncover the deep relationships that give language its remarkable expressive power.

Consider a collection of documents represented through their word frequencies. Each document becomes a vector in a high-dimensional space where each dimension corresponds to a possible word. The complete corpus forms a term-document matrix $A$ where entry $a_{ij}$ represents the (weighted) occurrence of word $i$ in document $j$. This matrix, though sparse and high-dimensional, contains rich structure that the SVD can expose.

The singular value decomposition $A=U\Sigma V^T$ reveals fundamental semantic patterns:
\begin{itemize}
    \item Left singular vectors (columns of $U$) reveal word clusters that tend to occur together
    \item Right singular vectors (columns of $V$) identify document themes
    \item Singular values measure the strength of these semantic associations
\end{itemize}

More remarkably, this decomposition often captures genuine semantic relationships despite operating purely on word co-occurrence patterns. Words with similar meanings tend to appear in similar contexts, creating parallel rows in the term-document matrix. The SVD detects these parallels and groups related terms in its singular vectors.

\begin{example}[Scientific Abstract Analysis]
Consider analyzing a collection of physics abstracts. The first few left singular vectors often reveal clear semantic groupings:
\begin{enumerate}
    \item Experimental terms: ``measurement'', ``observation'', ``data'', ``experiment''
    \item Theoretical terms: ``model'', ``theory'', ``prediction'', ``framework''
    \item Quantum terms: ``state'', ``superposition'', ``entanglement'', ``qubit''
\end{enumerate}
These groupings emerge naturally from co-occurrence patterns, without any explicit semantic knowledge provided to the algorithm.
\end{example}

The singular values themselves tell an interesting story. They typically follow a power law decay, with a few large values followed by many smaller ones. This suggests that most semantic content lies in a low-dimensional subspace --- a phenomenon that enables both efficient document indexing and semantic search.

Several practical refinements (such as term frequency-inverse document frequency weighting) as well as more sophisticated modern methods together augment these SVD foundations. Word embeddings like {\em Word2Vec} create dense vector representations of words that capture subtle semantic relationships. Yet these methods still reflect the fundamental insight that meaning emerges from patterns of association --- patterns that the SVD is uniquely suited to reveal.
%
\begin{marginfigure}
{\em Historical Note:} Latent Semantic Analysis (LSA), developed in the late 1980s, used the SVD as its foundational mathematical tool. This application of matrix factorization to language transformed both theoretical linguistics and practical information retrieval systems.
\end{marginfigure}
This application exemplifies the deeper truth that linear algebra illuminates structure in seemingly unstructured data. Just as the SVD reveals preferred directions of stretching in geometric transformations, it exposes natural semantic axes in the high-dimensional space of human language. The mathematics developed in this chapter thus provides not just computational tools but genuine insight into how meaning emerges from pattern.

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Sensor Networks \& The Geometry of Measurement}
\label{EM:sensor}
% **************** EMANATION *******************

Sensors are the nerves of the Industrial Body --- temperature probes in data centers, accelerometers in smartphones, pressure gauges in industrial plants. Each device measures external aspects of reality, yet these measurements harbor systematic errors from manufacturing variations, environmental conditions, and malfunctions. The singular value decomposition provides an elegant framework for integrating and smoothing these errors through the underlying geometry of measurement.

Consider an array of $n$ sensors measuring the same physical quantity at $m$ different times or conditions. In an ideal world, these measurements would differ only by known physical variations. Reality proves messier --- each sensor has its own gain, offset, and noise characteristics. A \style{measurement matrix} $M\in\R^{m\times n}$ contains these corrupted observations:
\[
    M_{ij} = g_i(s_j + \eta_{ij}) + b_i
\]
where $s_j$ is the true signal at time $j$, $g_i$ and $b_i$ are the \style{gain} and \style{bias} of sensor $i$, and $\eta_{ij}$ represents noise.

The SVD of this measurement matrix reveals remarkable structure. After centering each sensor's readings (subtracting its mean), we obtain:
\[
    M = U\Sigma V^T = \sum_{k=1}^r \sigma_k\vect{u}_k\vect{v}_k^T
\]
The leading singular vector $\vect{v}_1$ often captures the true underlying signal, while subsequent singular vectors reveal systematic error patterns:
\begin{itemize}
    \item $\sigma_1\vect{u}_1\vect{v}_1^T$ approximates the physical variation
    \item $\sigma_2\vect{u}_2\vect{v}_2^T$ typically shows gain variation effects
    \item Higher terms capture more subtle error patterns
\end{itemize}

\begin{example}[Temperature Sensor Array]
Consider a server room monitored by 100 temperature sensors sampled every minute. Over an hour of operation (60 samples), we obtain a $60\times 100$ measurement matrix. The SVD typically reveals:
\begin{enumerate}
    \item First singular value $\sim 10\times$ larger than second, reflecting true temperature variation
    \item Second singular vector correlates with sensor positions, showing spatial bias
    \item Third and beyond capture various drift and noise patterns
\end{enumerate}
This decomposition enables both data cleaning and sensor fault detection.
\end{example}

The singular values themselves provide crucial diagnostic information. Define the \style{effective rank} of the measurement matrix as:
\[
    r_{\text{eff}} = \left(\sum_{i=1}^r \sigma_i^2\right)^2 \bigg/ \sum_{i=1}^r \sigma_i^4
\]
This quantity, always between $1$ and $r=\rank(M)$, measures how many independent patterns exist in the data. A value near 1 suggests clean measurements dominated by the physical signal; larger values indicate significant systematic errors.
%
\begin{marginfigure}
{\em Nota bene:} This effective rank formula appears in random matrix theory and quantum mechanics as the participation ratio, measuring how many components participate significantly in a system.
\end{marginfigure}

More sophisticated analysis uses the full SVD structure to calibrate the sensor array. If $\vect{v}_1$ approximates the true signal direction, we can estimate each sensor's gain by comparing its response to this reference:
\[
    \hat{g}_i = \frac{\langle \vect{m}_i, \vect{v}_1\rangle}{\|\vect{v}_1\|^2}
\]
where $\vect{m}_i$ is the $i$-th column of $M$ (centered). 
\begin{marginfigure}
    {\em Nota bene:} More complex calibration models can be addressed through careful analysis of the singular vectors.
\end{marginfigure}

This approach to sensor calibration reveals a deeper truth about physical measurement: though raw data often appears complex and corrupted, the underlying signal typically lives in a low-dimensional subspace. Systematic errors, rather than creating pure noise, generate characteristic geometric patterns that the SVD naturally detects and isolates. Modern sensor networks extend these principles through sliding window analysis for time-varying calibration and distributed computation across large arrays, yet the core insight remains: measurement errors, seemingly complex, often possess remarkably simple structure when viewed in the right coordinates.

%The SVD thus transforms sensor calibration from art to science, replacing ad-hoc corrections with systematic decomposition of signal and error. This geometric understanding guides not just calibration but the very design of sensor networks themselves, suggesting optimal sensor placement and measurement strategies aligned with the natural coordinates that emerge from physical reality.

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Tensors \& Multi-Linear Algebra}
\label{EM:tensor}
% **************** EMANATION *******************

Data rarely submits to two-dimensional representation. Though matrices provide powerful tools for analyzing paired relationships, reality often demands higher-dimensional structures. A collection of RGB images varying over time; user interactions with products across different platforms and contexts; the activations of a deep neural network responding to diverse inputs through multiple layers --- such data naturally organizes into multi-dimensional arrays called \style{tensors}. Like the transition from vectors to matrices that began our development, the leap from matrices to tensors reveals new patterns through careful extension of familiar principles.

Consider first the formal structure. A \style{tensor} of order $k$ assigns a number to each choice of $k$ indices:
\[
    \mathcal{T} = [t_{i_1i_2\cdots i_k}], \quad 1\leq i_j\leq n_j
\]
Just as matrices generalize vectors by adding a second index, tensors extend this indexing to arbitrary dimension. This algebraic definition connects naturally to the differential forms encountered in multivariable calculus --- a $k$-form is precisely an alternating $k$-tensor, measuring oriented $k$-dimensional volumes through multi-linear maps.
%
\begin{marginfigure}
{\em Nota bene:} The differential forms studied in calculus represent special tensors that change sign under odd permutations of their inputs. Their anti-symmetry makes them ideal for integration, just as symmetric tensors prove natural for deep learning.
\end{marginfigure}
%

The familiar operations of linear algebra extend naturally to this setting. Given a third-order tensor $\mathcal{T}\in\R^{n_1\times n_2\times n_3}$, we can extract matrices through \style{fibers} (fixing all but one index) or \style{slices} (fixing all but two indices):
\begin{marginfigure}
{\em Example:} In a tensor of image embeddings, fibers might represent feature trajectories across similar images, while slices capture feature relationships at fixed semantic levels.
\end{marginfigure}
\[
    \mathcal{T}_{:jk} = [t_{ijk}]_{i=1}^{n_1}, \quad \mathcal{T}_{i::} = [t_{ijk}]_{j,k=1}^{n_2,n_3}
\]
These sections provide windows into the tensor's structure, much as row and column vectors illuminated matrices.

The extension of singular value decomposition to tensors reveals profound subtlety. While matrices admit unique decomposition into orthogonal factors, tensors resist such clean factorization. The \style{CP decomposition} provides one natural generalization:
\begin{marginfigure}
    {\em Nota bene:} CP stands for CANDECOMP / PARAFAC, as you may have guessed.
\end{marginfigure}
\[
    \mathcal{T} = \sum_{r=1}^R \sigma_r\vect{a}_r\otimes\vect{b}_r\otimes\vect{c}_r
\]
expressing the tensor as a sum of rank-one components formed through outer products. Though elegant, this decomposition rarely achieves exact low-rank representation. The \style{Tucker decomposition} offers greater flexibility:
\[
    \mathcal{T} = \mathcal{S}\times_1 U\times_2 V\times_3 W
\]
where $\mathcal{S}$ denotes a small \style{core tensor} and $\times_k$ represents multiplication along mode $k$. This structure --- of a concentrated core expanded through factor matrices --- echoes how SVD reveals low-dimensional structure in matrices.

\begin{example}[Neural Network Analysis]
Modern deep networks organize their learned weights as tensors, with indices spanning input channels, output channels, and spatial dimensions. A convolutional layer operating on image data uses fourth-order tensors $\mathcal{W}\in\R^{c_{\text{out}}\times c_{\text{in}}\times h\times w}$ to transform input feature maps. Understanding how information flows through these structures demands tensor analysis:
\[
    \text{output}[i,x,y] = \sum_{j,p,q} \mathcal{W}[i,j,p,q]\cdot\text{input}[j,x+p,y+q]
\]
Tensor decompositions of these weights reveal learned feature hierarchies while enabling efficient computation through reduced parameterization.
\end{example}

Beyond raw data organization, tensors provide natural structure for modern machine learning systems. Language models process sequences through attention tensors that capture relationships across tokens, positions, and feature dimensions. Computer vision models build hierarchical representations through tensor operations that preserve spatial relationships while learning abstract features. Recommendation systems model complex interactions between users, items, and contexts through tensor factorizations that capture latent patterns.

\begin{example}[Multi-modal Learning]
Consider a deep learning system processing images with text descriptions. Each image-text pair generates embeddings in separate spaces, but their relationship forms a third-order tensor $\mathcal{T}\in\R^{n\times d_1\times d_2}$ where:
\begin{itemize}
    \item $n$ indexes training examples
    \item $d_1$ represents image embedding dimension
    \item $d_2$ represents text embedding dimension
\end{itemize}
The tensor structure captures how different modalities interact --- its decomposition reveals shared semantic spaces that enable cross-modal retrieval and generation.
\end{example}

The mathematics of tensors continues to evolve through modern applications. Training algorithms exploit tensor structure for efficient gradient computation. Neural architectures compress their parameters through tensor factorizations. Foundation models leverage tensor operations for processing multiple modalities simultaneously. 
Each development reveals new aspects of these fundamental objects, transforming abstract multi-linear maps into practical tools for artificial intelligence.

%Yet perhaps tensors' deepest significance lies in how they unify seemingly distinct computational structures. Attention mechanisms emerge naturally as tensor contractions ({\em cf.} Chapter \ref{ch:13}); convolution operations represent specialized tensor products; embedding spaces connect through tensor factorizations. Like the singular value decomposition that revealed hidden patterns in matrices, tensor analysis exposes the profound patterns underlying modern machine learning.
% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 10}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}

\item For the following matrices, $A$, compute the singular value decomposition by:
(1) computing $A^TA$ and $AA^T$; (2) finding the eigenvalues and eigenvectors of each;
(3) constructing the matrices $U$, $\Sigma$, and $V$.
\[
A = \begin{bmatrix}2 & 2\\1 & 3\end{bmatrix}
\quad : \quad
A = \begin{bmatrix}3 & 1\\1 & 2\\2 & -1\end{bmatrix}
\]

\item The \style{Hilbert matrix} $H_n$ has entries $h_{ij}=\frac{1}{i+j-1}$. For example:
\[
    H_3 = \begin{bmatrix}
    1 & 1/2 & 1/3 \\
    1/2 & 1/3 & 1/4 \\
    1/3 & 1/4 & 1/5
    \end{bmatrix}
\]
    Compute $\cond(H_3)$ using the SVD; then explain the asymptotics of $\cond(H_n)$ as $n\to\infty$.

\item Show that for any matrix $A$: $\trace(A^TA) = \sum_{i=1}^p \sigma_i^2$ where $p=\min\{m,n\}$.

\item For a square matrix $A$, prove that $|\det(A)| = \prod_{i=1}^n \sigma_i$. Then use this to show that $A$ is invertible if and only if all its singular values are nonzero.
\begin{marginfigure}
    This connects the algebraic notion of determinant with the geometric meaning of singular values as stretching factors.
\end{marginfigure}

\item For a nonsingular matrix $A$, let $H$ be the positive square root of $AA^T$ (why is this well-defined?). Show that $A=HQ$ for some orthogonal matrix $Q$. 

\item IF $A$ is invertible, how could you use the SVD to quickly compute $A^{-1}$? {\em Explain.}

\item Let $A+A^T$ and $A-A^T$ have singular values at most twice those of $A$. When does equality hold?

\item Prove that for any nonsingular matrix $A$ and orthogonal matrices $Q_1,Q_2$:
\[
    \cond(Q_1AQ_2) = \cond(A)
\]

\item For matrices $A$ and $B$ of compatible size, prove that:
\[
    \cond(AB) \leq \cond(A)\cond(B)
\]
When does equality hold?

\item Let $\vect{x}\in\R^n$ be a unit vector. The \style{Rayleigh quotient} of a matrix $A$ is defined as $R_A(\vect{x})=\vect{x}^TA^TA\vect{x}$. Prove that:
\begin{enumerate}
    \item $\sigma_n^2\leq R_A(\vect{x})\leq \sigma_1^2$ for all unit vectors $\vect{x}$
    \item Equality holds in either bound if and only if $\vect{x}$ is the corresponding right singular vector
\end{enumerate}

\item For a symmetric positive definite matrix $A$, prove that its singular values equal its eigenvalues. Under what conditions on a general matrix $A$ do its singular values equal the absolute values of its eigenvalues?

\item A real matrix $A$ is called \style{normal} if $AA^T=A^TA$. Are there any interesting properties that the SVD of a normal matrix exhibits?

\item Let $A$ be an $m\times n$ matrix with $m>n$ and full column rank. Show that the condition number of $A^TA$ is the square of the condition number of $A$. What practical implication does this have for solving least squares problems?

\item Consider a rank-$k$ approximation $A_k$ to matrix $A$ obtained by keeping only the $k$ largest singular values. Show that:
\[
    \cond(A_k) \leq \cond(A)
\]
with equality if and only if $k=\rank(A)$. Explain why this means low-rank approximations tend to be better conditioned than the original matrix.

\item Let $V$ and $W$ be finite-dimensional inner product spaces with orthonormal bases $\{\vect{e}_i\}$ and $\{\vect{f}_j\}$ respectively. Given a linear transformation $T:V\to W$, its \style{matrix elements} are $a_{ij}=\langle T\vect{e}_i,\vect{f}_j\rangle$. Show that:
\begin{enumerate}
    \item $\sum_{i,j} |a_{ij}|^2 = \sum_k \sigma_k^2$ where $\sigma_k$ are the singular values of $T$
    \item This sum is independent of the choice of orthonormal bases
\end{enumerate}

\item (Challenge) Let $V$ be a finite-dimensional inner product space and $T:V\to V$ a linear transformation. The \style{numerical range} of $T$ is defined as:
\[
    W(T) = \{\langle T\vect{v},\vect{v}\rangle : \vect{v}\in V, \|\vect{v}\|=1\}
\]
Show that if $T$ is normal (i.e., $TT^*=T^*T$), then $W(T)$ is the convex hull of the eigenvalues of $T$. How does this set relate to the singular values when $T$ is not normal?

\item (Challenge) Let $A=U\Sigma V^T$ be the SVD of a matrix $A$. Define $A_k$ by keeping only the first $k$ singular values in $\Sigma$ and setting the rest to zero. Prove that $A_k$ minimizes $\|A-B\|_F$ among all matrices $B$ of rank at most $k$. 
\begin{marginfigure}
This optimality property of the SVD truncation will be explored further in Chapter \ref{ch:12} on low-rank approximation.
\end{marginfigure}

\end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Principal Component Analysis}
\label{ch:11}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``to stretch across the heavens \& step from star to star''}

\newthought{Deep patterns lie hidden} within high-dimensional data, invisible to direct observation yet fundamental to understanding complex systems. The challenge lies not in gathering data  --  modern science and engineering generate observations in abundance  --  but in extracting meaningful structure from measurements that often span dozens or hundreds of dimensions.
\begin{marginfigure}
{\em Example:} A single human genome contains roughly 20,000 genes whose expression levels vary across conditions. Understanding this variation requires finding patterns in a 20,000-dimensional space.
\end{marginfigure}

Consider digital images, where each pixel intensity represents one dimension, or financial markets, where thousands of securities move in subtle correlation. Even simple physical systems, when fully instrumented, generate measurements whose dimension far exceeds human intuitive grasp. Within such spaces, important features often concentrate along a few key directions, like mineral deposits concentrated by geological processes.

Principal Component Analysis (PCA) provides the mathematical tools for uncovering these essential patterns. Through careful study of how measurements vary and correlate, PCA reveals natural coordinates aligned with the data's intrinsic structure. These coordinates  --  ordered by their importance in capturing variation  --  enable both dimension reduction and pattern discovery.
\begin{marginfigure}
{\em Foreshadowing:} The connection between PCA and neural networks (Chapter \ref{ch:13}) runs deep: both seek to transform high-dimensional data into more meaningful representations.
\end{marginfigure}

The foundations for this analysis emerged from our work with singular values in Chapter \ref{ch:10}. There we saw how any linear transformation admits decomposition into orthogonal stretching along principal axes. PCA applies this geometric insight to data matrices, where rows represent observations and columns represent measured variables. The singular vectors of such matrices reveal natural coordinates for understanding variation, while singular values measure the strength of pattern in each direction.
\begin{marginfigure}
{\em Nota bene:} PCA has roots in statistical analysis dating to Pearson (1901), though the SVD is more modern.
\end{marginfigure}

Our development moves from statistical foundations through geometric intuition to practical application. Though the data we analyze may seem chaotic at first glance, careful mathematical excavation often reveals underlying simplicity. Like a sculpture emerging from rough stone through careful removal of excess material, meaningful low-dimensional structure emerges from high-dimensional data through principled dimension reduction. Our task is to develop both the theory to understand this process and the tools to effect it in practice.

% ==============================================
\section{Covariance \& Correlation}
\label{sec:variance}
% ==============================================

The story of variance begins with rotation. A solid body spinning about its center of mass experiences rotational resistance determined not by total mass, but by how that mass is distributed in space. The familiar scalar moment of inertia $I = \int r^2\,dm$ measures this resistance about a single axis, but a complete description requires the full \style{inertia tensor}:
\[
    \mathcal{I} = [\mathcal{I}_{ij}]
    \quad : \quad
    \mathcal{I}_{ij} = \int (r^2\delta_{ij} - x_ix_j)\,dm
\]
\begin{marginfigure}
    {\em Definition:} the Kronecker delta $\delta_{ij}$ evaluates to $1$ if $i=j$ and $0$ otherwise.  
\end{marginfigure}
This mechanical perspective  --  of mass distributed about a center and its resistance to different rotations  --  provides surprisingly deep insight into the statistical structures we now develop.

Consider first a single random variable $Z$. Its \style{mean} or \style{expectation} $\mu=\mathbb{E}(Z)$ acts as a center of mass, while its \style{variance} $\mathbb{V}(Z)=\mathbb{E}((Z-\mu)^2)$ measures spread about this center  --  precisely analogous to the scalar moment of inertia of a mass distribution about its centroid. The \style{standard deviation} $\sigma=\sqrt{\mathbb{V}}$, like the radius of gyration in mechanics, provides a characteristic length scale of this spread.
%
\begin{marginfigure}
{\em Think:} Just as a solid's resistance to rotation depends on mass distribution about its axes, a dataset's statistical structure depends on how measurements distribute about their means in different directions.
\end{marginfigure}

In most instances, data is discrete rather than continuous, and we can represent $Z$ as a vector $\Z=(z_1,\ldots,z_n)^T$. From this, we have basic statistical measures:
\[
    \mathbb{E}(Z) = \frac{1}{n}\sum_{i=1}^n z_i \quad\text{and}\quad
    \mathbb{V}(Z) = \frac{1}{n}\sum_{i=1}^n (z_i-\mathbb{E}(Z))^2
\]
\begin{marginfigure}
    {\em Nota bene:} Instead of a $1/n$ in front of the variance, one often sees a $1/(n-1)$, especially in the context of statistics. This reflects the loss of one degree of freedom in estimating the mean from a sampling. For purposes of doing data science and dimension reduction, $1/n$ is the more appropriate scaling and is what we shall use throughout.
\end{marginfigure}

In data science and statistics, one typically {\em centers} the data, transforming $Z$ to $\hat{Z}=Z-\mathbb{E}(Z)$ with mean zero. This, then, leads to a geometric interpretation of variance as $\mathbb{V}(Z)=\hat{Z}^T\hat{Z}$ with the standard deviation interpreted as dimension-scale length: 
\[
    \sigma = \sqrt{\mathbb{V}} 
    = \frac{1}{\sqrt{n}}\sqrt{\hat{Z}^T\hat{Z}} 
    = \frac{1}{\sqrt{n}}\left\|{\hat{Z}}\right\| .
\]
This interpretation is the key to understanding the geometry of data. 

What happens with two random variables? Covariance and correlation are the key measures. Given random variables $Y$ and $Z$, their \style{covariance} 
\begin{align*}
    \cov(Y,Z) &= \mathbb{E}(\hat{Y}\hat{Z}) = \mathbb{E}((Y-\mathbb{E}(Y))(Z-\mathbb{E}(Z)))  \\
    &= \frac{1}{n} \,\hat{Y}\cdot\hat{Z}
\end{align*}
measures their tendency to vary together. Like the off-diagonal terms of the inertia matrix, covariance captures coupling between different directions of variation. Positive covariance indicates that large values of $Y$ tend to occur with large values of $Z$, while negative covariance suggests opposition  --  when one variable rises above its mean, the other tends to fall below.

That this is a dot product (scaled by dimension) should act as a balm to students who have suffered through a traditional statistics course. Filled with the geometric imagination that the dot product inspires, the Reader might guess what is to come.   

To determine the degree of alignment (or misalignment) between two data vectors, one defines a \style{correlation} to be a rescaling of the covariance to lie between $-1$ and $+1$ with a correlation of zero connoting independence. As a formula, correlation becomes a familiar friend:
\[
    \corr(Y,Z) = \frac{\cov(Y, Z)}{\sigma(Y)\,\sigma(Z)} = \frac{\hat{Y}\cdot\hat{Z}}{\|\hat{Y}\|\|\hat{Z}\|} = \cos\theta(\hat{Y},\hat{Z}) .
\]
\begin{marginfigure}
    {\em Truth:} Correlation is not causation; but it is cosine similarity.
\end{marginfigure}
It is the angle between the two centered data points.

% ==============================================
\section{Matrices \& Data}
\label{sec:datamatrix}
% ==============================================

A single vector of data corresponds to one point in a point cloud. How then shall we proceed to work with the entire data set? The reader will by now have seen the future: a collection of data points becomes a collection of vectors, assembled into a data matrix. 

For an (arbitrarily) ordered collection of $d$ variables $Z_1,\ldots,Z_d$, center them each to $\hat{Z}_i$ and arrange them into a centered data matrix $\Data$ with $d$ columns. In practice, the columns of $\Data$ correspond to individual variables and the rows correspond to samples or runs. 

To estimate these quantities from data, we organize our observations into a \style{data matrix} $\Data\in\R^{n\times d}$ where:
\begin{itemize}
    \item Each row represents one observation;
    \item Each column corresponds to one variable;
    \item Entry $x_{ij}$ is the $j$th measurement from observation $i$.
\end{itemize}
For instance, consider daily temperature measurements at three weather stations over one year:
\[
\Data = \begin{bmatrix}
    72 & 70 & 68 \\
    75 & 74 & 71 \\
    65 & 63 & 62 \\
    \vdots & \vdots & \vdots
\end{bmatrix}
\]

After centering by subtracting column means (analogous to shifting to center of mass coordinates in mechanics), the covariance matrix becomes:
\[
    \COV = \frac{1}{n}\Data^T\Data
    = \begin{bmatrix}
    25.3 & 23.1 & 20.4 \\
    23.1 & 24.7 & 19.8 \\
    20.4 & 19.8 & 22.9
    \end{bmatrix} ,
\]
where we assume $\Data$ has already been centered.

The diagonal entries show each station's temperature variance  --  station 1 shows slightly more variability than the others. The large positive off-diagonal terms indicate strong correlation between stations, as expected for nearby locations experiencing similar weather patterns. Yet the correlation is not perfect, with station pairs (1,2) showing stronger relationship than pairs involving station 3, suggesting it may be geographically more distant.

This covariance matrix is symmetric positive semidefinite  --  a property that emerges naturally from its construction. Its diagonal entries are the individual variances, while off-diagonal terms measure pairwise relationships.

Just as the inertia matrix's eigenvalues measure resistance to rotation about principal axes, the covariance matrix's eigenstructure reveals fundamental patterns of variation in our data. To better understand these patterns independent of scale, we sometimes normalize through the \style{correlation matrix} $\CORR=[\corentry_{ij}]$ where
\[
    \corentry_{ij} = \frac{\coventry_{ij}}{\sigma_i\sigma_j} = \frac{\cov(Z_i,Z_j)}{\sqrt{\mathbb{V}(Z_i)\mathbb{V}(Z_j)}}
\]
This scales all entries to the interval $[-1,1]$, measuring purely the strength and direction of linear relationships. For our temperature data, we first extract standard deviations from the diagonal entries of the covariance matrix:
\[
    \sigma_1 = \sqrt{25.3} \approx 5.03^\circ, \quad
    \sigma_2 = \sqrt{24.7} \approx 4.97^\circ, \quad
    \sigma_3 = \sqrt{22.9} \approx 4.79^\circ
\]
These measure the typical variation at each station. The complete correlation matrix then becomes:
\[
    \CORR = \begin{bmatrix}
    1.000 & 0.923 & 0.846 \\
    0.923 & 1.000 & 0.831 \\
    0.846 & 0.831 & 1.000
    \end{bmatrix}
\]

The covariance and correlation matrices transform abstract statistical relationships into concrete geometric objects. Their eigenvectors identify principal axes of variation, while their eigenvalues measure the strength of variation along these axes. This fusion of mechanical intuition, statistical theory, and geometric structure provides the foundation for the dimension reduction methods we shall develop. Even our simple temperature example suggests how datasets may harbor hidden simplicity: though we measured three variables, the strong correlations hint at underlying patterns waiting to be uncovered through principal component analysis.

% ==============================================
\section{Principal Components}
\label{sec:pca}
% ==============================================

The covariance matrix captures how our data varies along different directions in measurement space. Yet these directions, aligned with our original variables, may obscure simpler underlying patterns. Just as the projection operators of Chapter \ref{ch:6} revealed optimal approximating subspaces, we now seek coordinates aligned with the inherent structure of our data rather than arbitrary measurement choices.

Consider a centered data matrix $\Data\in\R^{n\times d}$, where each row represents one observation of $d$ variables, and each column has zero mean. The singular value decomposition studied in Chapter \ref{ch:10} provides exactly the transformation we seek:

\begin{definition}[Principal Components]
\label{def:pc}
Given a centered data matrix $\Data$, its \style{principal components} are the right singular vectors $\vect{v}_1,\ldots,\vect{v}_d$ from the SVD $\Data=U\Sigma V^T$, ordered by decreasing singular value. Each component $\vect{v}_k$ represents a direction in the original variable space that captures maximal remaining variation after accounting for previous components.
\end{definition}

These principal components transform our original variables into new features that capture the data's variation structure:

\begin{definition}[PC Scores]
\label{def:scores}
Given a principal component $\vect{v}_k$, the corresponding \style{principal component score} for observation $\vect{x}\in\R^d$ is its projection $z_k=\vect{x}^T\vect{v}_k$ onto that direction. The scores of all observations along $\vect{v}_k$ form the $k$th \style{score vector} $\vect{z}_k=\Data\vect{v}_k$.
\end{definition}

Just as the orthogonal projections of Chapter \ref{ch:6} decomposed vectors into optimal approximating components, these score vectors decompose our data into orthogonal features of decreasing importance. The relationship between singular values and variation emerges naturally: if $\sigma_k$ is the $k$th singular value of $\Data$, then $\lambda_k=\sigma_k^2/(n-1)$ gives the variance of scores along the $k$th principal component. These variances decrease as we move through components, reflecting how each successive direction captures maximal remaining variation in the data.

\begin{example}[Gene Expression Data]
Consider genetic expression measurements across thousands of genes in different cell types. Each row of our data matrix represents a cell, while columns record expression levels of different genes:
\[
    \Data = \begin{bmatrix}
    \leftarrow & \text{cell 1} & \rightarrow \\
    \leftarrow & \text{cell 2} & \rightarrow \\
    & \vdots & \\
    \leftarrow & \text{cell n} & \rightarrow
    \end{bmatrix}
    \begin{array}{l}
    \text{\small gene 1} \\
    \text{\small gene 2} \\
    \vdots \\
    \text{\small gene d}
    \end{array}
\]
Though each cell's state lives in a space of dimension $d\approx 20,000$, biological constraints often restrict variation to a much lower-dimensional manifold. Principal component analysis reveals these constraints through directions $\vect{v}_k$ that often correspond to fundamental regulatory programs or cell state transitions.

\begin{marginfigure}
{\em Foreshadowing:} The dimension reduction achieved through PCA previews how neural networks (Chapter \ref{ch:13}) learn to represent high-dimensional data through lower-dimensional features.
\end{marginfigure}

Projecting onto the first two principal components produces a two-dimensional visualization:
\[
    \begin{bmatrix}
    z_{11} & z_{12} \\
    z_{21} & z_{22} \\
    \vdots & \vdots \\
    z_{n1} & z_{n2}
    \end{bmatrix} = \Data[\vect{v}_1\;\vect{v}_2]
\]
The resulting scatter plot of points $(z_{i1},z_{i2})$ often reveals clusters of similar cell types or gradients of cellular differentiation  --  patterns invisible in the original high-dimensional space.
\end{example}

\begin{example}[Market Returns]
Daily returns of stocks in a market index provide another illuminating example. Each row of $\Data$ represents one trading day, while columns represent different stocks. The principal components extract fundamental market factors:
\begin{itemize}
    \item The first component $\vect{v}_1$ typically captures market-wide movement
    \item Subsequent components often align with sector-specific variation
    \item Later components may reveal company-specific effects
\end{itemize}
The corresponding scores measure how strongly each factor influenced returns on different days, providing a natural decomposition of market behavior.
\end{example}

\begin{marginfigure}
{\em Nota bene:} This sequence of directions generalizes the orthogonal bases of Chapter \ref{ch:4}, now optimized to capture variation in data rather than arbitrary coordinate choices.
\end{marginfigure}

The fraction of total variance captured by the first $k$ components provides a measure of how well they summarize our data:
\[
    r_k = \frac{\sum_{i=1}^k \lambda_i}{\sum_{i=1}^d \lambda_i} = \frac{\sum_{i=1}^k \sigma_i^2}{\sum_{i=1}^d \sigma_i^2}
\]
When this ratio approaches $1$ for small $k$, we have found a low-dimensional representation capturing most of the data's variation.

In practice, the choice of scaling crucially affects what patterns PCA discovers. Two standard approaches emerge:
\begin{enumerate}
    \item \style{Covariance PCA}: Use centered data directly, preserving relative scales
    \item \style{Correlation PCA}: Standardize each variable to unit variance first
\end{enumerate}
The first emphasizes directions of large absolute variation; the second focuses on patterns of correlation regardless of scale. Section \ref{sec:preprocessing} will explore these choices and their implications in detail.

Principal component analysis thus provides a systematic way to replace arbitrary measurement coordinates with natural axes aligned to variation in our data. Like the optimal projections of Chapter \ref{ch:6} and the singular vectors of Chapter \ref{ch:10}, these directions emerge from fundamental geometric and algebraic properties that the next section will carefully examine.

% ==============================================
\section{Optimality Properties}
\label{sec:optimality}
% ==============================================

Principal components provide more than just convenient coordinates for data analysis  --  they emerge naturally from fundamental optimization principles. Like the orthogonal projections of Chapter \ref{ch:6}, which minimized approximation error in geometric spaces, principal components minimize error in representing high-dimensional data through lower-dimensional summaries.

Consider first the problem of finding a single direction that best captures variation in our data. Given centered observations $\{\vect{x}_1,\ldots,\vect{x}_n\}$, we seek a unit vector $\vect{v}$ maximizing the variance of projections:
\[
    \text{maximize} \quad \frac{1}{n-1}\sum_{i=1}^n (\vect{x}_i^T\vect{v})^2 
    \quad\text{subject to}\quad \|\vect{v}\|=1
\]
This optimization has profound geometric meaning: we seek the direction along which our data shows greatest spread. Writing $\Data$ for our centered data matrix, this objective becomes:
\[
    \frac{1}{n-1}\vect{v}^T\Data^T\Data\vect{v} = \vect{v}^T\COV\vect{v}
\]
subject to $\vect{v}^T\vect{v}=1$. 

\begin{marginfigure}
{\em Think:} The optimization balances two competing goals: capturing as much variation as possible while maintaining orthogonality between components.
\end{marginfigure}

\begin{theorem}[Principal Component Optimality]
\label{thm:pcaopt}
The first principal component $\vect{v}_1$ maximizes $\vect{v}^T\COV\vect{v}$ subject to $\|\vect{v}\|=1$. Each subsequent component $\vect{v}_k$ maximizes this same objective subject to orthogonality with all previous components.
\end{theorem}

\begin{proof}
By the Rayleigh-Ritz principle, the maximum value of $\vect{v}^T\COV\vect{v}$ subject to $\|\vect{v}\|=1$ equals the largest eigenvalue of $\COV$, achieved by its corresponding eigenvector. The optimization for subsequent components follows from the same principle applied to the residual variation after removing previous components.
\end{proof}

This optimization perspective reveals PCA's fundamental character: it provides an orthogonal coordinate system optimally aligned with variation in our data. Each component extracts maximal remaining variance while maintaining orthogonality with previous components.

\begin{example}[Image Compression]
Consider a grayscale image represented as a matrix of pixel intensities. Though formally high-dimensional, natural images often concentrate variation in few directions due to spatial correlations. PCA reveals this low-dimensional structure: the first few principal components capture coherent image features, while later components often represent noise.
\begin{marginfigure}
{\em Foreshadowing:} Chapter \ref{ch:12} will develop the matrix approximation theory underlying such compression tasks, complementing our current statistical perspective.
\end{marginfigure}

For instance, retaining only the first $k$ components approximates the original image through:
\[
    \hat{\Data}_k = \sum_{i=1}^k \sigma_i\vect{u}_i\vect{v}_i^T
\]
where $\sigma_i$ are singular values and $\vect{u}_i,\vect{v}_i$ are left and right singular vectors. The fraction of variance preserved equals $r_k$ from Section \ref{sec:pca}, providing a natural measure of approximation quality.
\end{example}

Beyond variance maximization, principal components possess several equivalent optimality properties:

\begin{enumerate}
    \item They minimize mean squared reconstruction error for $k$-dimensional representations
    \item They maximize mutual information between data and its projection (under Gaussian assumptions)
    \item They provide optimal linear dimension reduction for preserving pairwise distances
\end{enumerate}

\begin{marginfigure}
{\em Example:} In financial portfolio analysis, PCA often reveals risk factors ordered by their contribution to total market variance  --  a decomposition crucial for risk management.
\end{marginfigure}

The reconstruction error perspective proves particularly illuminating. Given observations $\{\vect{x}_i\}$, consider approximating each through:
\[
    \hat{\vect{x}}_i = \sum_{j=1}^k z_{ij}\vect{v}_j
\]
where $z_{ij}$ are scores and $\vect{v}_j$ are unit vectors. The principal components minimize:
\[
    \frac{1}{n}\sum_{i=1}^n \|\vect{x}_i - \hat{\vect{x}}_i\|^2
\]
over all choices of $k$ orthonormal vectors $\{\vect{v}_j\}$. This optimality connects directly to the projection operators of Chapter \ref{ch:6}: PCA provides the best rank-$k$ approximation in terms of squared error.

\begin{example}[Signal Denoising]
Consider a time series corrupted by noise. If the underlying signal has simpler structure than the noise, principal components often achieve denoising through:
\begin{enumerate}
    \item Project the noisy data onto principal components
    \item Retain only components with variance above noise level
    \item Reconstruct using these components only
\end{enumerate}
The optimality of PCA ensures this procedure minimizes mean squared error under appropriate noise assumptions.
\end{example}

Though we have focused on statistical optimality, these same principles emerge from pure linear algebra through singular value decomposition. Chapter \ref{ch:12} will develop this complementary perspective, showing how PCA's optimality properties generalize to arbitrary matrix approximation problems while maintaining computational efficiency.

% ==============================================
\section{Preprocessing \& Scaling}
\label{sec:preprocessing}
% ==============================================

Real data rarely arrives in the pristine form assumed by our theoretical development. Consider monitoring an automated manufacturing process through five sensors:
\[
\Data = \begin{bmatrix}
    82.3 & 1205 & 4.2 & 7.1 & 156 \\
    85.1 & 1198 & 4.1 & 7.2 & 162 \\
    79.8 & 1210 & 4.3 & 6.8 & 145 \\
    \vdots & \vdots & \vdots & \vdots & \vdots
\end{bmatrix}
\quad : \quad
\textrm{cols } \Rightarrow  
\begin{array}{l}
    \text{\small Temperature (°C)} \\
    \text{\small Pressure (kPa)} \\
    \text{\small Flow (L/s)} \\
    \text{\small pH} \\
    \text{\small Conductivity ($\mu$S/cm)}
\end{array}
\]
Each row represents one measurement, but the variables differ dramatically in scale. Direct application of covariance PCA would be dominated entirely by pressure measurements, while potentially important variations in pH or flow rate vanish in the noise. Yet blindly standardizing each variable might discard meaningful scale information.

\begin{example}[Scale Effects]
\label{ex:scale}
For the manufacturing data above, the first principal component under different preprocessing choices reveals starkly different patterns:

Covariance PCA yields $\vect{v}_1 \approx (0.002, 0.999, 0.001, 0.000, 0.003)^T$, essentially capturing pressure variation alone. After standardizing to unit variance, correlation PCA gives $\vect{v}_1 \approx (0.51, 0.48, -0.42, 0.38, 0.44)^T$, revealing coordinated variation across all measurements. The choice fundamentally shapes what patterns we can discover.
\end{example}

Beyond scaling, real data suffers contamination from measurement errors, sensor failures, and genuine but extreme events. We need systematic methods to identify observations requiring special treatment. The key insight lies in measuring distance not just in terms of raw values, but in a way that accounts for the natural scales and relationships in our data:

\begin{definition}[Mahalanobis Distance]
\label{def:mahalanobis}
For an observation $\vect{x}$ from a collection with mean $\bar{\vect{x}}$ and covariance $\COV$, the \style{Mahalanobis distance} is:
\[
    d_M(\vect{x}) = \sqrt{(\vect{x}-\bar{\vect{x}})^T\COV^{-1}(\vect{x}-\bar{\vect{x}})}
\]
\begin{marginfigure}
{\em Nota bene:} The matrix $\COV^{-1}$ plays the role of the squared reciprocal standard deviation $1/\sigma^2$ in higher dimensions, accounting for both scale and correlation between variables.
\end{marginfigure}
This metric accounts for both scale and correlation structure in measuring how far an observation deviates from typical patterns. The inverse covariance matrix $\COV^{-1}$ ensures that distances properly reflect the natural variability in each direction.
\end{definition}

For our engineering applications, this distance provides a natural way to identify observations that deviate markedly from typical patterns. Experience suggests that observations with Mahalanobis distances more than twice that of typical points warrant careful investigation.
\begin{marginfigure}
{\em Think:} In one dimension, $d_M$ reduces to distance from the mean measured in standard deviations: $d_M(x) = |x-\mu|/\sigma$. This connects to our intuition about ``unusual'' values in simple measurements.
\end{marginfigure}

\begin{example}[Outlier Detection]
\label{ex:outlier}
Returning to our manufacturing data, most observations have Mahalanobis distances between 1.5 and 3 units. However, one measurement:
\[
    \vect{x} = (84.2, 1203, 12.8, 7.0, 158)^T
\]
yields $d_M = 4.9$, far exceeding the typical range. Though each individual measurement appears plausible, their combination suggests a process anomaly requiring investigation. The flow rate of 12.8 L/s, while not extreme in absolute terms, proves inconsistent with the observed temperature and pressure.
\end{example}

Missing measurements present a final challenge. When relatively rare ($<5\%$ of entries), simply omitting affected observations suffices. More extensive missingness requires careful treatment to avoid biasing our analysis. Simple mean imputation  --  replacing missing values with variable averages  --  often distorts correlation structure. More sophisticated iterative schemes use partial PCA results to estimate missing values, though such approaches risk imposing artificial patterns.

The choice of preprocessing fundamentally shapes what patterns PCA can discover. Different choices serve different purposes:
\begin{itemize}
    \item Covariance PCA preserves absolute scales when they carry meaning
    \item Correlation PCA reveals purely relational patterns
    \item Robust methods sacrifice efficiency for reliability with corrupted data
\end{itemize}
Sound engineering judgment, guided by careful examination of data quality and clear analysis goals, must inform these choices.

% ==============================================
\section{Statistical Significance}
\label{sec:significance}
% ==============================================

The singular value decomposition of our centered data matrix $\Data$ provides not just optimal directions for dimension reduction but natural measures of their importance. Each singular value $\sigma_i$ quantifies how strongly its corresponding direction contributes to the data's structure  --  precisely analogous to the dominance concepts developed in Chapter \ref{ch:9}. The art lies in determining where to truncate this sequence, balancing fidelity of representation against simplicity of model.

Consider the sequence of partial approximations to $\Data$ using increasing numbers of singular values:
\[
    \Data_k = \sum_{i=1}^k \sigma_i\vect{u}_i\vect{v}_i^T
\]
The ratio of retained to total variation provides our first measure of approximation quality:
\[
    r_k = \frac{\sum_{i=1}^k \sigma_i^2}{\sum_{i=1}^d \sigma_i^2}
\]
This quantity, like the spectral ratio from Section \ref{sec:dominant}, measures how thoroughly $k$ components capture the data's essential structure.

\begin{example}[Vibration Analysis]
\label{ex:vibration}
Consider acceleration measurements from sensors on a bridge structure, yielding singular values:
\[
    \sigma_1 = 12.5, \quad \sigma_2 = 9.5, \quad \sigma_3 = 3.6, \quad \sigma_4 = 1.3, \quad \sigma_5 = 1.1, \ldots
\]
The sharp drop after $\sigma_2$ suggests two dominant modes of vibration. Just as dominant eigenvalues governed asymptotic behavior in Chapter \ref{ch:9}, these dominant singular values identify directions essential to the bridge's motion. The first two components capture proportion
\[
    r_2 = \frac{\sigma_1^2 + \sigma_2^2}{\sum_{i=1}^d \sigma_i^2} \approx 0.85
\]
of total variation  --  a quantitative measure of their dominance.
\end{example}

\begin{marginfigure}
{\em Nota bene:} When analyzing noise-corrupted data, sharp drops in singular values often mark transition from signal to noise  --  a pattern explained theoretically through random matrix theory but visible even in simple examples.
\end{marginfigure}

This sequence of singular values connects directly to optimal approximation theory. By Section \ref{sec:optimality}, each $\Data_k$ provides the best rank-$k$ approximation to $\Data$ in terms of squared error. The truncation level $k$ thus trades approximation accuracy against model complexity  --  a balance made quantitative through the singular value spectrum.

\begin{example}[Chemical Process Data]
\label{ex:chemical}
A chemical reactor monitored through multiple sensors yields normalized singular values decreasing more gradually:
\[
    \sigma_1 = 2.05, \quad \sigma_2 = 1.76, \quad \sigma_3 = 1.52, \quad \sigma_4 = 1.18, \quad \sigma_5 = 0.89, \ldots
\]
No sharp dominance emerges, reflecting complex coupling between process variables. The cumulative proportion $r_4 \approx 0.85$ suggests retaining four components captures most significant variation while filtering sensor noise.
\end{example}

The practical choice of how many components to retain benefits from systematic validation. By partitioning our data into training and testing sets, we can assess how well different truncation levels generalize to new observations. Components that dominate in one portion of the data should maintain their dominance in others  --  a principle that helps distinguish genuine structure from sampling artifacts.

\begin{example}[Materials Spectroscopy]
\label{ex:materials}
Consider spectroscopic measurements where each sample generates thousands of intensity readings across different wavelengths. With limited samples, distinguishing significant components becomes crucial. Cross-validation reveals that although singular values continue well past $\sigma_{20}$, predictions using more than 5-6 components often perform worse on holdout data  --  a sign of overfitting despite apparent dominance in training data.
\end{example}

Sample size fundamentally affects our confidence in identified components. When analyzing $n$ observations of $d$ variables, singular values beyond index $\min\{n,d\}$ must be zero  --  a fact following directly from the SVD's matrix structure. More subtly, the ratio $n/d$ affects stability of non-zero singular values: too few observations relative to variables can create spurious apparent structure.

These considerations transform our theoretical understanding of SVD dominance into practical guidance for data analysis. Though lacking the deterministic certainty of pure matrix algebra, they provide systematic ways to identify genuinely dominant components while guarding against over-interpretation. The true test lies not in abstract significance but in whether retained components enable better prediction, control, or understanding of the systems we study.

% ==============================================
\section{Beyond Linear PCA}
\label{sec:beyondPCA}
% ==============================================

Reality rarely submits to purely linear description. Though principal components illuminate structure within linear transformations of space, many datasets harbor intrinsic nonlinearity  --  their essential patterns twist and curve through measurement space like vines growing beyond their linear supports. A temperature sensor's readings may vary sinusoidally with time; chemical reaction rates couple through nonlinear dynamics; images of handwritten digits trace complex manifolds far from any linear subspace. Understanding such data requires extending our mathematical framework beyond the linear realm that has supported our development thus far.

Consider first why linear PCA might fail. When data lies near a curved surface or manifold, no linear projection can capture its true structure. The principal components, optimal though they are for linear approximation, may entirely miss the underlying simplicity. Like shadows cast by a curved wire that seem tangled and self-intersecting, linear projections can obscure rather than reveal the true form of nonlinear data.

% \begin{marginfigure}
% {\em FIGURE:} [Illustration showing how linear PCA fails to capture a simple curved structure. Left panel shows original curved data in $\R^3$, middle panel shows best linear 2D projection which appears tangled, right panel shows true 2D representation found by nonlinear methods.]
% \end{marginfigure}

The key insight emerging from such examples is that we must adapt our notion of dimension reduction to respect the local structure of data. Rather than seeking global linear projections, we might approximate our data locally by tangent spaces that follow its curves and bends. This perspective  --  that nonlinear structure often appears nearly linear when examined at sufficiently small scales  --  provides the foundation for modern manifold learning methods.

\begin{example}[Single Cell Genomics]
Consider gene expression measurements from cells undergoing differentiation. Though measured in a space of thousands of dimensions (one per gene), the developmental process often follows a branching path as cells progress from stem-like states toward specialized types. No linear projection captures this branching structure  --  PCA might show a tangled mess where biologically meaningful progression exists. The data's true simplicity emerges only when we respect its curved, branching geometry, revealing developmental trajectories that mirror known biology.
\end{example}

% \begin{marginfigure}
% {\em FIGURE:} [Visualization showing cell differentiation data. Top: PCA projection showing apparent chaos. Bottom: Nonlinear embedding revealing clear branching structure with stem cells at center branching into distinct cell types.]
% \end{marginfigure}

One path beyond linearity leads through \style{kernel methods}. Rather than working directly in measurement space, we implicitly map our data into a higher-dimensional feature space through a kernel function $k(\vect{x},\vect{y})$ measuring similarity between observations. Within this expanded space, we perform standard PCA  --  though now operating on the kernel matrix $K=[k(\vect{x}_i,\vect{x}_j)]$ rather than the covariance of raw measurements.
%
\begin{marginfigure}
{\em Foreshadowing:} The implicit feature mappings of kernel methods preview how neural networks will learn representations transforming data into more meaningful spaces.
\end{marginfigure}

This \style{kernel PCA} reveals nonlinear structure through careful choice of kernel function. The radial basis kernel
\[
    k(\vect{x},\vect{y}) = \exp\left(-\frac{\|\vect{x}-\vect{y}\|^2}{2\sigma^2}\right)
\]
measures local similarity, allowing the method to follow curved patterns in data. Though computationally intensive for large datasets, kernel PCA provides a mathematically elegant bridge between linear and nonlinear methods.

\begin{marginfigure}
{\em BONUS!} Beyond the methods discussed here, Topological Data Analysis (TDA) provides tools for understanding the shape of data. While beyond our current scope, TDA offers powerful insights into data geometry through linear-algebraic tools such as \style{persistent homology}.
\end{marginfigure}

A different approach emerges from examining how linear PCA optimizes reconstruction error. Rather than restricting ourselves to linear mappings, we might seek general functions that compress our data while preserving its essential structure. This perspective leads naturally to \style{autoencoders}  --  neural networks trained to reconstruct their inputs through a low-dimensional bottleneck layer. Though we defer their detailed study to Chapter \ref{ch:13}, autoencoders represent a fundamental generalization of PCA's dimension reduction principle.

Local methods provide yet another path beyond linearity. Rather than seeking global structure, techniques like \style{locally linear embedding} (LLE) preserve geometry by reconstructing each point from its neighbors. If $\vect{x}_i$ denotes our $i$th observation and $\mathcal{N}(i)$ its set of nearest neighbors, LLE first computes weights $w_{ij}$ solving:
\[
    \min_{w_{ij}} \sum_{i=1}^n\left\|\vect{x}_i - \sum_{j\in\mathcal{N}(i)} w_{ij}\vect{x}_j\right\|^2
    \quad\text{subject to}\quad \sum_{j\in\mathcal{N}(i)} w_{ij} = 1
\]
These weights capture local geometry through linear approximations. A second optimization then finds low-dimensional coordinates $\vect{y}_i$ preserving these relationships:
\[
    \min_{\vect{y}_i} \sum_{i=1}^n\left\|\vect{y}_i - \sum_{j\in\mathcal{N}(i)} w_{ij}\vect{y}_j\right\|^2
\]
%
\begin{marginfigure}
{\em Think:} The two-stage optimization of LLE mirrors how manifolds appear locally linear  --  first capture local structure, then find global coordinates respecting these local patterns.
\end{marginfigure}

This local-to-global principle  --  that complex structure often reduces to simpler pieces when properly decomposed  --  appears throughout mathematics and engineering. Just as integration reduces complex functions to sums of simple pieces, or finite elements approximate complex domains through simple patches, manifold learning methods build global understanding from local approximations.

The emergence of these nonlinear methods marks a profound shift in how we think about dimension reduction. No longer constrained to linear projections, we can seek representations that truly capture how our data organizes itself in space. This freedom brings both power and challenge  --  we gain expressiveness at the cost of uniqueness and computational simplicity. The balance between these factors shapes the choice of method in practice:
\begin{itemize}
    \item Kernel PCA offers mathematical elegance but scales poorly
    \item Local methods capture fine structure but may miss global patterns
    \item Autoencoders provide flexibility but require significant data and computation
\end{itemize}

Yet these challenges should not obscure the fundamental insight: that dimension reduction, properly understood, means finding simpler representations that preserve essential structure. Whether through kernels, local approximations, or learned transformations, the goal remains to distill complex data to its meaningful essence. This principle  --  that high-dimensional data often has lower-dimensional structure waiting to be discovered  --  will guide our development through neural networks and beyond.

The path from linear PCA to modern nonlinear methods recapitulates a broader pattern in mathematical thought. Just as classical linear algebra grew to embrace infinite-dimensional spaces and nonlinear operators, our tools for understanding data must evolve beyond the linear framework that birthed them. The methods we have explored represent first steps toward this broader vision  --  one that will continue to unfold as we delve deeper into neural networks and artificial intelligence in Chapter \ref{ch:13}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Decoding Neural Population Activity}
\label{EM:neurosci}
% **************** EMANATION *******************

The coordinated firing of neural populations encodes complex behaviors through patterns of electrical activity. Modern experimental techniques enable simultaneous recording from hundreds of neurons, each contributing a time-varying signal that reflects both local computation and global brain state. While individual neurons exhibit complex and often noisy dynamics, their collective activity reveals remarkably clear structure when analyzed through the lens of dimensionality reduction.

Consider a typical motor cortex experiment where researchers record from $n=256$ neurons while a subject performs reaching movements toward different targets. Each neuron's firing rate varies with time, producing a vector $\vect{r}(t)\in\R^n$ of instantaneous population activity sampled at millisecond resolution. Over a session with 100 reaches lasting 500ms each, we obtain the centered data matrix:
\[
    R = \begin{bmatrix}
    \leftarrow & \vect{r}(0) - \bar{\vect{r}} & \rightarrow \\
    \leftarrow & \vect{r}(1) - \bar{\vect{r}} & \rightarrow \\
    & \vdots & \\
    \leftarrow & \vect{r}(50000) - \bar{\vect{r}} & \rightarrow
    \end{bmatrix} \in \R^{50000\times 256}
\]
where $\bar{\vect{r}}$ denotes the temporal average firing rate vector. The sample covariance matrix $\COV = \frac{1}{T-1}R^TR \in \R^{256\times 256}$ captures how pairs of neurons co-vary in their firing patterns.

A concrete example reveals the remarkable structure in this neural data. During a recent experiment, the first ten singular values of the normalized neural activity matrix were:
\[
    \sigma_1 = 128.4, \quad \sigma_2 = 84.2, \quad \sigma_3 = 52.1, \quad \sigma_4 = 31.5, \quad \sigma_5 = 18.7
\]
\[
    \sigma_6 = 12.3, \quad \sigma_7 = 8.1, \quad \sigma_8 = 5.4, \quad \sigma_9 = 3.8, \quad \sigma_{10} = 2.6
\]
The proportion of variance explained by the first $k$ components:
\[
    r_k = \frac{\sum_{i=1}^k \sigma_i^2}{\sum_{i=1}^{256} \sigma_i^2}
\]
reaches $r_5 = 0.86$ with just five components. This dramatic dimensionality reduction -- from 256 neurons to 5 principal components while retaining 86\% of variance -- suggests neural computation occurs in a much lower-dimensional space than raw cell counts would indicate.

The principal components themselves, computed as eigenvectors of $\COV$, reveal distinct aspects of motor encoding. The first component $\vect{v}_1$ shows coordinated activation across the population:
\[
    \vect{v}_1 = \begin{bmatrix}
    0.31 \\ 0.28 \\ 0.33 \\ \vdots \\ 0.29
    \end{bmatrix}
    \quad \text{with entries} \quad
    [\vect{v}_1]_j \approx \frac{1}{\sqrt{n}} \pm 0.1
\]
This nearly uniform weighting suggests a global activity mode, perhaps reflecting overall movement vigor. The second component shows clear anatomical organization:
\[
    \vect{v}_2 = \begin{bmatrix}
    0.42 \\ 0.38 \\ -0.35 \\ \vdots \\ -0.41
    \end{bmatrix}
\]
with positive weights for neurons preferring forward reaches and negative weights for neurons preferring backward reaches.

To validate these components against behavior, we can project the neural data at each time $t$ onto our principal components:
\[
    z_i(t) = \vect{v}_i^T(\vect{r}(t) - \bar{\vect{r}})
\]
The resulting scores $z_i(t)$ reveal tight coupling between neural trajectories and movement:
\begin{itemize}
    \item $z_1(t)$ correlates strongly with movement speed ($r=0.82$)
    \item $z_2(t)$ predicts reach direction ($r=0.76$ with target angle)
    \item $z_3(t)$ and $z_4(t)$ capture grip force modulation
\end{itemize}

Like the manufacturing data from Section 11.4, neural recordings require careful preprocessing. Common challenges include:
\begin{enumerate}
    \item Highly variable firing rates across neurons (some cells fire 100x more than others)
    \item Non-stationary baseline activity due to brain state changes
    \item Temporally correlated noise from shared inputs
    \item Missing data from transiently lost neural signals
\end{enumerate}

The choice between covariance and correlation-based PCA proves especially crucial. For the data above, correlation-based analysis reveals additional structure by preventing highly active neurons from dominating:
\[
    \tilde{\COV}_{ij} = \frac{\COV_{ij}}{\sqrt{\COV_{ii}\COV_{jj}}}
\]
The correlation-based principal components highlight coordination between neurons regardless of their absolute firing rates. This often better reflects underlying computation, as neuron-specific factors like electrode placement can artificially inflate some firing rates.

Modern experiments often record from multiple brain regions simultaneously. Consider data from both primary motor cortex (M1) and dorsal premotor cortex (PMd):
\[
    R = \begin{bmatrix} R_{M1} \\ R_{PMd} \end{bmatrix}
    \quad \text{where} \quad
    R_{M1} \in \R^{50000\times 256}
    \quad \text{and} \quad
    R_{PMd} \in \R^{50000\times 128}
\]
PCA on this combined data reveals both area-specific and cross-area patterns. Some principal components load primarily on single areas:
\[
    \vect{v}_1 = \begin{bmatrix} \vect{v}_{M1} \\ \vect{0} \end{bmatrix}
    \quad \text{or} \quad
    \vect{v}_2 = \begin{bmatrix} \vect{0} \\ \vect{v}_{PMd} \end{bmatrix}
\]
while others reflect coordinated computation:
\[
    \vect{v}_3 = \begin{bmatrix} \vect{v}_{M1} \\ \vect{v}_{PMd} \end{bmatrix}
    \quad \text{with} \quad
    \|\vect{v}_{M1}\| \approx \|\vect{v}_{PMd}\|
\]

The low-dimensional structure revealed by PCA directly impacts brain-computer interface (BCI) design. Rather than attempting to decode intended movement from individual neurons, modern BCIs first project neural activity onto data-driven principal components:
\[
    \hat{\vect{x}}(t) = W\left[\begin{array}{c}
        \vect{v}_1^T(\vect{r}(t) - \bar{\vect{r}}) \\
        \vdots \\
        \vect{v}_k^T(\vect{r}(t) - \bar{\vect{r}})
    \end{array}\right]
\]
where $W$ maps the $k$ principal component scores to decoded movement variables $\hat{\vect{x}}(t)$. This approach proves remarkably robust -- even if individual neurons are lost, the principal components often remain stable by leveraging the coordinated activity of the remaining population.

The statistical significance framework proves vital for validating low-dimensional neural structure. Consider splitting our reaching data into training and testing sets:
\[
    R_{train} = R_{1:40000,:} \quad \text{and} \quad R_{test} = R_{40001:50000,:}
\]
Computing principal components on $R_{train}$ and evaluating variance explained on $R_{test}$ helps distinguish reliable patterns from overfitting. When neural dimensions consistently align with behavior across splits while maintaining high variance explained, we gain confidence in their computational relevance.

Beyond basic research, dimensionality reduction enables sophisticated clinical applications. A paralyzed patient using a BCI requires reliable control signals despite neural variability. PCA-based decoders maintain performance by extracting consistent movement-related dimensions even as individual neurons change their tuning. This synthesis of mathematical theory and clinical practice exemplifies how dimensionality reduction transforms our understanding of neural computation while enabling practical neural engineering.

The analysis of neural population activity demonstrates both the power and limitations of PCA. Though linear methods reveal remarkable structure, the brain's intrinsic nonlinearity and dynamics suggest even richer patterns await discovery. For now, PCA provides our clearest window into how populations of neurons collectively encode behavior -- a testament to the profound utility of dimensionality reduction in modern neuroscience.

% \begin{marginfigure}
% {\em FIGURE:} [Illustration showing neural population activity projected into its first three principal components, with trajectories colored by movement direction demonstrating how low-dimensional projections preserve behaviorally relevant structure.]
% \end{marginfigure}

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Eigenfaces}
\label{EM:eigenfaces}
% **************** EMANATION *******************


The human brain's remarkable ability to recognize faces has long inspired mathematical approaches to image analysis. Among these, the method of \style{eigenfaces} stands out for both its elegant simplicity and its profound influence on computer vision. Though modern deep learning approaches have superseded it for practical applications, eigenfaces provide perhaps the clearest demonstration of how PCA can extract meaningful structure from high-dimensional data.

Consider a grayscale image of size $m\times n$ pixels. Though we naturally view this as a rectangular array of intensities, we can reshape it into a single vector in $\R^{mn}$. A face image of modest 100$\times$100 resolution thus becomes a point in $\R^{10000}$ --- a space of such high dimension that direct analysis proves hopeless. Yet faces, despite their complexity, exhibit strong statistical regularities that PCA can reveal.

Given a collection of $N$ face images $\{\vect{x}_1,\ldots,\vect{x}_N\}$, each reshaped into a vector, we first center the data by subtracting the mean face:
\[
    \bar{\vect{x}} = \frac{1}{N}\sum_{i=1}^N \vect{x}_i
    \quad:\quad
    \tilde{\vect{x}}_i = \vect{x}_i - \bar{\vect{x}}
\]
The centered images form the columns of a data matrix $\Data\in\R^{(mn)\times N}$. Its singular value decomposition $\Data = U\Sigma V^T$ yields the eigenfaces as the columns of $U$ corresponding to the largest singular values.

These eigenfaces, when reshaped back to image format, reveal striking structure. The first few capture large-scale features --- overall face shape, the position of eyes and mouth, major lighting variations. Subsequent eigenfaces encode progressively finer details, with later components often representing noise or idiosyncratic features. Typically 40-50 components suffice to capture the essential structure of face images, achieving dimension reduction by two orders of magnitude.

Any face image $\vect{x}$ can be approximated through projection onto the first $k$ eigenfaces:
\[
    \vect{x} \approx \bar{\vect{x}} + \sum_{i=1}^k c_i\vect{u}_i
    \quad\text{where}\quad
    c_i = (\vect{x}-\bar{\vect{x}})^T\vect{u}_i
\]
The coefficients $c_i$ form a \style{face code} --- a low-dimensional representation capturing the image's essential features. This encoding enables efficient face recognition: similar faces yield similar coefficients, allowing recognition through simple distance metrics in the reduced space.

The preprocessing of face images proves crucial for effective analysis. Beyond basic centering, careful attention to:
\begin{enumerate}
    \item Alignment and scaling of facial features
    \item Lighting normalization
    \item Background removal
    \item Handling of facial expressions
\end{enumerate}
significantly impacts the quality of the resulting eigenfaces. The Mahalanobis distance in face space provides a natural way to detect images that deviate markedly from typical faces --- a valuable test for reliability in practical systems.

Modern face recognition has largely moved to deep learning approaches that can handle greater variation in pose, lighting, and expression. Yet the eigenface method's core insight --- that high-dimensional face images lie near a lower-dimensional manifold --- remains valid. Indeed, the intermediate layers of modern neural networks often learn representations remarkably similar to eigenfaces, suggesting these patterns reflect fundamental structure rather than mere algorithmic artifacts.
\begin{marginfigure}
{\em BONUS!} The eigenface technique extends naturally to other types of image analysis. Researchers have applied similar methods to medical images (eigen-tumors for cancer detection), satellite data (eigen-landscapes for terrain classification), and industrial inspection (eigen-defects for quality control).
\end{marginfigure}

Beyond its practical impact, the eigenface method exemplifies how linear algebra transforms abstract theory into engineering solutions. The singular value decomposition of Chapter 10 becomes a practical tool for dimension reduction; the preprocessing principles of Section \ref{sec:preprocessing} find concrete application; the statistical considerations of Section \ref{sec:significance} guide implementation choices. This synthesis --- of mathematics, statistics, and engineering --- enables machines to begin approaching human capabilities in pattern recognition and scene understanding.

% \begin{marginfigure}
% {\em FIGURE:} [Illustration showing several eigenfaces, arranged in order of decreasing singular value, demonstrating how they capture progressively finer details of facial structure.]
% \end{marginfigure}
% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 11}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{enumerate}

\item The following measurements come from two sensors monitoring a simple process:
\[
    \begin{bmatrix}
    1.0 & 0.8 \\
    0.8 & 2.0 \\
    -0.5 & -0.2 \\
    -1.2 & -1.5
    \end{bmatrix}
\]
Compute the sample mean and covariance matrix. Find the principal components and create a scatter plot showing both the original data points and the direction of the first principal component. What proportion of variance does this component explain?

\item A $3\times 3$ correlation matrix has eigenvalues 2.4, 0.5, and 0.1. Without seeing the original matrix:
\begin{enumerate}
    \item Find its trace and determinant
    \item Calculate proportion of variance captured by first principal component
    \item What can you conclude about correlations between original variables?
\end{enumerate}

\item Given centered data matrix $\Data \in \R^{n \times 3}$ with columns representing temperature, pressure, and concentration measurements:
\begin{enumerate}
    \item Write explicit formulas for computing principal component scores
    \item Show how to project a new measurement $\vect{x}$ onto these components
    \item Explain why standardization might be important before this computation
\end{enumerate}

\item A chemical reactor is monitored through three sensors measuring temperature ($^\circ$C), pressure (kPa), and flow rate (L/s). The measurements yield covariance matrix
\[
    \COV = \begin{bmatrix}
    16 & 8 & 0 \\
    8 & 25 & -4 \\
    0 & -4 & 9
    \end{bmatrix}
\]
Find its eigenvalues and eigenvectors. What proportion of total variance is captured by the first two principal components? Give a physical interpretation of what these components might reveal about the reactor's behavior.

\item A vibration analysis system records displacement at three points on a machine, yielding covariance matrix
\[
    \COV = \begin{bmatrix}
    4 & 2 & 1 \\
    2 & 5 & 3 \\
    1 & 3 & 6
    \end{bmatrix}
\]
For the measurement $\vect{x} = (0.5, 2.8, -1.2)^T$:
\begin{enumerate}
    \item Calculate its projection onto first two principal components
    \item What proportion of the vector's energy is retained?
    \item How would standardizing the variables first change your analysis?
\end{enumerate}

\item Consider a sequence of measurements arriving one at a time. Describe an efficient algorithm to:
\begin{enumerate}
    \item Update the sample mean incrementally
    \item Update the covariance matrix without storing all past data
    \item Periodically update principal components when needed
\end{enumerate}
Why might such an incremental approach be valuable in practice?

\item In analyzing semiconductor manufacturing, three quality metrics yield correlation matrix
\[
    \CORR = \begin{bmatrix}
    1.0 & 0.8 & 0.7 \\
    0.8 & 1.0 & 0.9 \\
    0.7 & 0.9 & 1.0
    \end{bmatrix}
\]
Find its principal components and explain what patterns they reveal about the manufacturing process. What might explain the strong correlations between all variables?

\item A time series of strain measurements from a material under cyclic loading yields singular values:
\[
    \sigma_1 = 12.3, \quad \sigma_2 = 5.1, \quad \sigma_3 = 0.8, \quad \sigma_4 = 0.3, \quad \sigma_5 = 0.2
\]
Based on this decay pattern, how many components would you retain? Justify your answer both mathematically and in terms of physical meaning for the material's behavior.

\item Four accelerometers mounted on a bridge structure record vertical displacement (mm) with covariance matrix
\[
    \COV = \begin{bmatrix}
    10 & -2 & 4 & 1 \\
    -2 & 8 & 0 & 3 \\
    4 & 0 & 6 & -1 \\
    1 & 3 & -1 & 5
    \end{bmatrix}
\]
How many principal components would you retain to capture 90\% of total variance? What physical interpretation might these dominant modes have in terms of the bridge's vibration patterns?

\item Consider monitoring a chemical process where temperature ranges 150--200 °C, pressure 2000--2500 kPa, and flow 0.5--2.0 L/s. Explain why covariance PCA might be misleading here and propose a preprocessing strategy. How would you validate your choice through cross-validation? 

\item In monitoring an industrial furnace, temperature measurements (°C) from five sensors yield mean vector $\bar{\vect{x}}=(845,835,840,850,855)^T$ and measurement $\vect{x}=(850,823,841,868,859)^T$ with covariance matrix
\[
    \COV = \begin{bmatrix}
    100 & 85 & 82 & 75 & 70 \\
    85 & 120 & 90 & 80 & 75 \\
    82 & 90 & 110 & 85 & 80 \\
    75 & 80 & 85 & 90 & 75 \\
    70 & 75 & 80 & 75 & 95
    \end{bmatrix}
\]
Calculate the Mahalanobis distance of $\vect{x}$ from the mean. If typical measurements have Mahalanobis distances less than 3, does this reading warrant investigation?

\item Let $\Data$ be a centered data matrix. Prove that principal components of the correlation matrix (obtained after standardizing each variable to unit variance) differ from those of the covariance matrix unless all original variables had equal variance. When might you prefer correlation PCA to covariance PCA in practice?

\item For the Mahalanobis distance $d_M^2(\vect{x}) = (\vect{x}-\bar{\vect{x}})^T\COV^{-1}(\vect{x}-\bar{\vect{x}})$, prove it is invariant under linear transformations that preserve covariance structure. Show that its level sets are ellipsoids aligned with principal components.

\item A manufacturing quality inspection system records five measurements per part. The singular values of the centered data matrix decrease as:
\[
    \sigma_k = 10e^{-0.8k}, \quad k = 1,\ldots,5
\]
Prove that retaining the first two components captures at least 95\% of the total variance. What does the exponential decay of singular values suggest about the intrinsic dimensionality of the manufacturing variations?

\item Consider the problem of splitting  data into training and validation sets before performing PCA. Explain: (a) How to properly compute principal components using only training data; (b) How to measure reconstruction error on validation data; (c) Why this cross-validation approach provides better estimates of generalization error.

\item Consider a dataset where most measurements cluster near the mean but occasional outliers appear. Explain why PCA based on the covariance matrix might be unduly influenced by these outliers. Propose a robust alternative approach using concepts from Section \ref{sec:preprocessing}. How would you validate that your approach better captures the true data structure?

\item The correlation matrix for measurements in a chemical process is nearly block diagonal, with strong correlations within blocks but weak correlations between blocks. Explain what this structure reveals about the process. How might you use this information to design a hierarchical monitoring strategy based on PCA?

\item Explain how the singular value spectrum of a data matrix reveals intrinsic dimensionality of the underlying process. How might you distinguish between:
(a) Genuine low-dimensional structure; (b) Random correlations from finite sampling; and (c) Systematic measurement artifacts. Support your answer using concepts from Section \ref{sec:significance}.

\end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Low Rank Approximation}
\label{ch:12}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

% ALTERNATIVE CHAPTER

% # Chapter 12: Random Linear Algebra

% ## 12.1 Computational Challenges in High Dimensions
% - The curse of dimensionality in modern data analysis
% - Limitations of traditional deterministic algorithms
% - Sparse, high-dimensional matrices in practice
% - Example: Web-scale matrices, large scientific simulations, social networks

% ## 12.2 Randomized Projections
% - The Johnson-Lindenstrauss lemma: preserving distances in lower dimensions
% - Random projection matrices: Gaussian, Rademacher, sparse projections
% - Error bounds and probabilistic guarantees
% - Connection to compressed sensing

% ## 12.3 Randomized Matrix Approximation
% - Random sampling of rows and columns
% - Theoretical guarantees for matrix sketching
% - Nyström method for positive semidefinite matrices
% - CUR decomposition: interpretable low-rank approximation

% ## 12.4 Randomized SVD
% - Matrix sketch via random projection
% - Power iteration with randomization
% - Error analysis and convergence guarantees
% - Computational complexity versus accuracy tradeoffs

% ## 12.5 Monte Carlo Matrix Inversion
% - Stochastic estimation of matrix inverses
% - Hutchinson's trace estimator
% - Randomized algorithms for linear systems
% - Applications to network analysis and graph Laplacians

% ## 12.6 Stochastic Optimization for Linear Problems
% - Stochastic gradient descent for large-scale linear systems
% - Mini-batch methods for least squares problems
% - Convergence analysis and acceleration techniques
% - Connection to Chapter 13's neural network training

% ## 12.7 Concentration of Measure
% - Matrix concentration inequalities
% - Bounds on spectral norms of random matrices
% - Applications to algorithm analysis
% - The fundamental role of randomness in high dimensions



{\em ``do I not stretch the heavens abroad or fold them up like a garment''}

\newthought{A matrix of full rank} harbors redundancy in subtle forms  --  patterns obscured by profusion of detail, structure veiled by sheer abundance of data. The fundamental character of a matrix often lies not in its apparent complexity but in the simpler forms lurking beneath. This insight  --  that matrices admit approximation by versions of lower rank  --  transforms both our theoretical understanding and practical computation. The art lies in finding such approximations that preserve essential features while stripping away superfluity.

The singular value decomposition developed in Chapter \ref{ch:10} provides our first glimpse of how matrices might be approximated by simpler versions of themselves. Just as a sculptor reveals form by removing excess material, we can often capture a matrix's essential character by keeping only its dominant singular values and vectors. This process of low-rank approximation  --  of finding the best possible simple representation of complex data  --  connects deeply to both the abstract theory of matrix analysis and the practical challenges of modern computing.

Consider a matrix $A$ whose entries represent pixel intensities in a digital image. Though formally of high rank (each entry potentially independent), the structured nature of real images often allows remarkably accurate reconstruction from far fewer parameters. The theoretical foundations developed in this chapter explain both why such compression is possible and how to achieve it optimally. These same principles guide dimension reduction in data analysis, signal processing, and the emerging field of matrix completion  --  where we seek to recover full matrices from partial observations.

The path ahead moves from abstract approximation theory through practical algorithms to modern applications. We begin with the fundamental limits of low-rank approximation, establishing both what is possible and what is not. This leads naturally to computational methods that achieve these limits efficiently, even for massive datasets where direct computation proves infeasible. The theory extends to handle corrupted or incomplete data, reflecting the messy reality of real-world applications. Throughout, we maintain the perspective that effective approximation requires both mathematical understanding and sound engineering judgment.

%Our development culminates in a bridge to the neural networks and deep learning methods of Chapter \ref{ch:13}. The layered compression achieved through low-rank approximations previews how deeper architectures might transform data through multiple stages of representation. This connection  --  between classical matrix approximation and modern artificial intelligence  --  reveals a profound unity beneath seemingly distinct approaches to finding simplicity within complexity.

% ==============================================
\section{Optimal Approximation}
\label{sec:optapprox}
% ==============================================

The singular value decomposition developed in Chapter \ref{ch:10} revealed how any matrix can be expressed as a sum of rank-one terms, each capturing a different direction of variation in the data. This decomposition suggests a natural path to approximation: rather than keeping all terms, we might retain only those corresponding to the largest singular values. Such truncation raises fundamental questions about optimality  --  how well can matrices be approximated by simpler versions of themselves, and what form should such approximations take?

Let us begin by formalizing what we mean by rank-constrained approximation:

\begin{definition}[Rank-$k$ Approximation]
\label{def:rank-k}
For a matrix $A\in\R^{m\times n}$ and integer $k\leq\rank(A)$, a \style{rank-$k$ approximation} is any matrix $B\in\R^{m\times n}$ satisfying $\rank(B)=k$. The error in such an approximation is typically measured using matrix norms introduced in Chapter \ref{ch:5}, particularly:
\begin{enumerate}
    \item The Frobenius norm: $\|A-B\|_F = \sqrt{\sum_{i,j} (a_{ij}-b_{ij})^2}$
    \item The spectral norm: $\|A-B\|_2 = \sigma_1(A-B)$
\end{enumerate}
where $\sigma_1(A-B)$ denotes the largest singular value of the difference matrix.
\end{definition}

Given matrix $A$ with singular value decomposition $A=U\Sigma V^T$, we can construct a natural rank-$k$ approximation by keeping only the first $k$ singular values:
\[
    A_k = \sum_{i=1}^k \sigma_i\vect{u}_i\vect{v}_i^T
\]
This truncated SVD yields a matrix of rank exactly $k$, as each term $\sigma_i\vect{u}_i\vect{v}_i^T$ has rank one and the terms combine linearly. The error in this approximation takes a particularly elegant form:
%
\begin{marginfigure}
{\em Compare:} The truncation of singular values parallels the dimensionality reduction of PCA in Chapter 11, but now viewed through the lens of matrix approximation rather than statistics.
\end{marginfigure}
%
\[
    \|A-A_k\|_F^2 = \sum_{i=k+1}^r \sigma_i^2
\]
where $r=\rank(A)$. Each truncated singular value contributes exactly its square to the total approximation error.

The profound importance of this construction emerges from its optimality properties:

\begin{theorem}[Eckart-Young-Mirsky]
\label{thm:EYM}
Let $A\in\R^{m\times n}$ have singular value decomposition $A=U\Sigma V^T$. Then for any matrix $B$ of rank at most $k$:
\[
    \|A-A_k\|_F \leq \|A-B\|_F
\]
That is, the truncated SVD $A_k$ provides the best possible rank-$k$ approximation to $A$ in the Frobenius norm.
\end{theorem}
%
\begin{marginfigure}
{\em Historical Note:} Though named for work in the 1930s, the core insight that matrices admit optimal low-rank approximations through SVD emerged from astronomy, where Karl Pearson used it to fit planes to noisy data in 1901.
\end{marginfigure}
%
\begin{proof}
Let $B$ have rank at most $k$ and consider the matrix $A-B$. By the Fundamental Theorem (Theorem \ref{thm:FTLA}), its null space has dimension at least $n-k$. This space must have nontrivial intersection with the span of $\{\vect{v}_{k+1},\ldots,\vect{v}_n\}$, which also has dimension $n-k$. Let $\vect{x}$ be a unit vector in this intersection.

Then $\|A-B\|_F^2 \geq \|(A-B)\vect{x}\|^2 = \|A\vect{x}\|^2$ since $B\vect{x}=\vect{0}$. But $A\vect{x}$ lies in $\spanset\{\vect{u}_{k+1},\ldots,\vect{u}_r\}$ and has norm at least $\sigma_{k+1}$. Therefore:
\[
    \|A-B\|_F^2 \geq \sigma_{k+1}^2 = \|A-A_k\|_F^2
\]
establishing optimality of the truncated SVD.
\end{proof}

This remarkable result extends beyond the Frobenius norm to any unitarily invariant matrix norm  --  including the spectral norm $\|A\|_2=\sigma_1$ and the nuclear norm $\|A\|_* = \sum_i \sigma_i$ which we shall explore in Section \ref{sec:incomplete}. The geometric intuition remains: $A_k$ captures the $k$ most important directions of variation in the data, as measured by singular values.

\begin{example}[Image Compression]
\label{ex:imgcompress}
Consider a grayscale image stored as a $1024\times 1024$ matrix $A$ of pixel intensities. Though formally of rank 1024, most singular values prove negligibly small. A rank-50 approximation $A_{50}$ often captures the visually significant features while reducing storage from over 1 million values to just:
\begin{itemize}
    \item 50 singular values ($\sigma_1,\ldots,\sigma_{50}$)
    \item 50 left singular vectors ($\vect{u}_1,\ldots,\vect{u}_{50}$)
    \item 50 right singular vectors ($\vect{v}_1,\ldots,\vect{v}_{50}$)
\end{itemize}
totaling about 100,000 values  --  a 90\% reduction in storage with minimal perceptual loss.
\end{example}

The truncated SVD provides more than just optimal approximation  --  it reveals the fundamental limits of matrix approximation. For any matrix $B$ of rank at most $k$:
\[
    \|A-B\|_2 \geq \sigma_{k+1}(A)
\]
This lower bound, achieved by $A_k$, quantifies precisely how well matrices of reduced rank can approximate $A$. The relationship extends beyond the spectral norm to other important matrix norms:

\begin{lemma}[Approximation Bounds]
\label{lem:approxbounds}
For the truncated SVD approximation $A_k$:
\begin{enumerate}
    \item $\|A-A_k\|_2 = \sigma_{k+1}$
    \item $\|A-A_k\|_F^2 = \sum_{i=k+1}^r \sigma_i^2$
    \item $\|A-A_k\|_* = \sum_{i=k+1}^r \sigma_i$
\end{enumerate}
Moreover, these bounds are optimal: no rank-$k$ matrix can achieve smaller error in any of these norms.
\end{lemma}

The power of these optimal approximation results emerges most clearly through contrast with constrained approximation problems. When we require additional properties  --  preserving certain entries exactly or maintaining non-negativity of the original matrix  --  closed-form solutions generally cease to exist. Yet such constrained problems remain amenable to numerical optimization through techniques we shall explore in Section \ref{sec:algorithms}.

The profound implications of optimal approximation extend far beyond mere data compression. By revealing the fundamental limits of matrix approximation, the Eckart-Young-Mirsky theorem provides theoretical foundations for:
\begin{itemize}
    \item Principal component analysis in statistics
    \item Latent semantic analysis in natural language processing
    \item Model reduction in dynamical systems
    \item Collaborative filtering in recommendation systems
\end{itemize}

In each case, the existence of optimal low-rank approximations transforms seemingly complex problems into manageable computations. The art lies not in finding these approximations  --  the truncated SVD provides them automatically  --  but in choosing appropriate rank $k$ to balance accuracy against simplicity. This choice, guided by the singular value spectrum, exemplifies the fundamental tension between model complexity and data fidelity that pervades modern data science.


% ==============================================
\section{Approximation in Practice}
\label{sec:practice}
% ==============================================

The elegant optimality of truncated singular values, though mathematically complete, leaves open crucial questions of implementation. When working with real data  --  whether from sensor arrays, image sequences, or network traffic  --  we must balance theoretical optimality against practical constraints of storage, computation, and reliability. These engineering concerns transform abstract approximation theory into working systems through careful consideration of how data naturally structures itself.

Consider first the storage implications. A rank-$k$ approximation through truncated SVD of matrix $A\in\R^{m\times n}$ requires storing:
\begin{enumerate}
    \item $k$ singular values $\sigma_1,\ldots,\sigma_k$
    \item $k$ left singular vectors $\vect{u}_1,\ldots,\vect{u}_k\in\R^m$
    \item $k$ right singular vectors $\vect{v}_1,\ldots,\vect{v}_k\in\R^n$
\end{enumerate}
totaling $k(m+n+1)$ numbers. This yields compression precisely when $k(m+n+1) < mn$. Yet this crude parameter counting ignores the crucial question: what rank suffices to capture essential behavior?

\begin{example}[Building Sensor Network]
\label{ex:sensornets}
Consider a building instrumented with 100 temperature sensors recording hourly measurements for one year. The resulting data matrix $A\in\R^{8760\times 100}$ requires nominally 876,000 values to specify completely. Each entry $a_{ij}$ gives the temperature at sensor $j$ during hour $i$. Physical reality suggests strong redundancy: heat diffuses smoothly through space while following clear daily and seasonal patterns.

The singular value spectrum reveals this structure quantitatively:
\[
    \sigma_1 = 1247.3, \quad \sigma_2 = 423.1, \quad \sigma_3 = 89.4, \quad \sigma_4 = 12.7, \quad \sigma_5 = 3.2, \ldots
\]
This rapid decay  --  each singular value roughly a quarter of its predecessor  --  confirms strong low-rank structure. The physical meaning emerges through singular vectors: $\vect{u}_1$ captures seasonal variation, $\vect{u}_2$ reflects daily cycles, while $\vect{v}_1$ and $\vect{v}_2$ reveal primary thermal zones. A rank-4 approximation yields root-mean-square error under 0.1°C while reducing storage by 95\%.
\end{example}

Different error metrics suggest different truncation strategies. The Frobenius norm discussed in Section \ref{sec:optapprox} measures typical reconstruction error:
\[
    \|A-A_k\|_F = \sqrt{\sum_{i=k+1}^r \sigma_i^2}
\]
while the spectral norm $\|A-A_k\|_2 = \sigma_{k+1}$ bounds worst-case error  --  crucial when approximations feed into control systems where errors might compound. For our temperature monitoring example, spectral norm bounds ensure no reconstructed value differs by more than 0.5°C from measured data when keeping six singular values.

The singular value spectrum itself provides natural indicators for approximation quality:
\begin{enumerate}
    \item Sharp drops suggest clear truncation points, as with our temperature data where physical modes separate cleanly
    \item Gradual decay without clear gaps warns that no natural low-rank approximation may exist
    \item Clusters of similar singular values hint at coupled features requiring joint preservation
\end{enumerate}

\begin{example}[Image Sequence Analysis]
\label{ex:video}
A video stream storing 30 frames per second at $1024\times 1024$ resolution nominally requires over 30 million values per second. Yet most motion appears smooth both spatially and temporally. By treating each second as a $1024\times(1024\cdot 30)$ matrix, low-rank approximation often achieves 20:1 compression while maintaining visual quality. The singular vectors naturally separate persistent background features from moving elements through their different temporal structure:
\begin{itemize}
    \item Static background elements appear in early singular vectors with large singular values
    \item Moving objects emerge in later vectors with intermediate singular values
    \item Sensor noise concentrates in vectors with small singular values
\end{itemize}
This decomposition, reminiscent of the principal components studied in Chapter \ref{ch:11}, reveals how matrix approximation automatically discovers meaningful structure in data.
\end{example}

When data arrives continuously or distributes across multiple sensors, direct SVD computation may prove impossible. Such scenarios demand the streaming algorithms we shall develop in Section \ref{sec:algorithms}, building on the dominance principles studied in Chapter \ref{ch:9}. The mathematical insight that dominant directions emerge naturally through iteration transforms theoretical optimality into practical computation.

Physical constraints often restrict allowable approximations beyond mere rank reduction:
\begin{itemize}
    \item Non-negativity of reconstructed values
    \item Exact preservation of critical sensor readings
    \item Bounded derivatives between adjacent measurements
    \item Conservation laws or other physical invariants
\end{itemize}

\begin{definition}[Constrained Low-Rank Approximation]
\label{def:constrained}
Given matrix $A\in\R^{m\times n}$ and constraint set $\mathcal{C}$, the \style{constrained rank-$k$ approximation problem} seeks:
\[
    \min_{B\in\mathcal{C}} \|A-B\|_F \quad\text{subject to}\quad \rank(B)\leq k
\]
Common constraints include:
\begin{enumerate}
    \item Non-negativity: $\mathcal{C}=\{B:b_{ij}\geq 0\}$
    \item Entry-wise bounds: $\mathcal{C}=\{B:l_{ij}\leq b_{ij}\leq u_{ij}\}$
    \item Exact matching: $\mathcal{C}=\{B:b_{ij}=a_{ij}\text{ for }(i,j)\in\mathcal{I}\}$
\end{enumerate}
where $\mathcal{I}$ denotes a set of index pairs for entries that must be preserved exactly.
\end{definition}

Though such constrained approximations rarely admit closed forms like the Eckart-Young-Mirsky theorem of Section \ref{sec:optapprox}, the truncated SVD often provides excellent initialization for numerical optimization seeking physically valid approximations. We shall explore algorithms for solving such constrained problems in Section \ref{sec:algorithms}.

The practical power of low-rank approximation emerges precisely when theoretical structure aligns with natural data properties. Just as Chapter \ref{ch:11}'s principal components revealed patterns in high-dimensional data, truncated singular values expose redundancy in matrix measurements. This fusion of mathematical elegance with engineering reality transforms abstract optimization theory into practical tools for data compression and analysis.

% ==============================================
\section{Algorithms at Scale}
\label{sec:algorithms}
% ==============================================

The iterative systems studied in Chapter \ref{ch:9} revealed how matrix powers naturally amplify dominant directions while suppressing others. This same principle  --  of dominance emerging through iteration  --  provides our first path to efficient computation of singular vectors. Though direct SVD computation proves infeasible for large matrices, careful iteration can efficiently extract just the dominant singular vectors needed for low-rank approximation.

\begin{definition}[Power Iteration]
\label{def:powermethod}
For matrix $A\in\R^{m\times n}$, the \style{power method} iterates:
\begin{marginfigure}
{\em Nota bene:} Normalization here is not merely for convenience - it prevents numerical overflow while preserving the direction information we seek.
\end{marginfigure}
\[
    \vect{x}_{k+1} = \frac{A\vect{x}_k}{\|A\vect{x}_k\|}
\]
starting from random initial vector $\vect{x}_0$. When $A$ has dominant singular value $\sigma_1$ strictly larger than $\sigma_2$, these iterates converge to the dominant right singular vector $\vect{v}_1$.
\end{definition}

This convergence follows directly from singular vector expansion: writing $\vect{x}_0=\sum_i c_i\vect{v}_i$ in the right singular vector basis:
\[
    A^k\vect{x}_0 = \sum_{i=1}^r \sigma_i^k c_i\vect{u}_i
    = \sigma_1^k\left(c_1\vect{u}_1 + \sum_{i=2}^r \left(\frac{\sigma_i}{\sigma_1}\right)^k c_i\vect{u}_i\right)
\]
Since $\sigma_i/\sigma_1 < 1$ for $i>1$, repeated multiplication amplifies the dominant direction while suppressing others. The rate of convergence depends on the gap ratio $\sigma_2/\sigma_1$  --  a large gap between first and second singular values ensures rapid convergence.
\begin{marginfigure}
{\em Compare:} Like the dominance concepts of Chapter 9, power iteration amplifies strongest patterns while naturally suppressing weaker ones. Nature herself works thus: greater populations grow faster, stronger signals drown out weaker ones.
\end{marginfigure}

For low-rank approximation, we need not just one but several singular vectors. The \style{block power method} addresses this by iterating with multiple vectors simultaneously:
\[
    X_{k+1} = \text{orth}(AX_k)
\]
where $\text{orth}(\cdot)$ produces an orthonormal basis for the column space through QR factorization. This extracts an approximate basis for the dominant right singular subspace, though careful reorthogonalization proves crucial for numerical stability.

\begin{theorem}[Block Power Convergence]
\label{thm:blockpower}
Let $A\in\R^{m\times n}$ have singular values $\sigma_1>\sigma_2>\cdots>\sigma_r>0$. For block size $p$, the block power method converges to the span of the first $p$ right singular vectors at rate:
\[
    \|\sin\Theta(X_k,V_p)\|_2 \leq \left(\frac{\sigma_{p+1}}{\sigma_p}\right)^k
\]
where $\Theta(X_k,V_p)$ denotes the principal angles between the subspace spanned by $X_k$ and that spanned by the first $p$ right singular vectors.
\end{theorem}
\begin{marginfigure}
{\em Think:} The sine of principal angles measures misalignment between subspaces - like measuring how far we've strayed from true north while navigating by stars.
\end{marginfigure}

Krylov subspace methods achieve dramatically faster convergence by working in richer spaces. Rather than just the current iterate, they maintain the full subspace:
\[
    \mathcal{K}_k(A,\vect{x}) = \spanset\{\vect{x}, A\vect{x}, A^2\vect{x}, \ldots, A^{k-1}\vect{x}\}
\]
The \style{Lanczos process} builds an orthonormal basis for this space through a remarkably efficient three-term recurrence:
\begin{equation}
\label{eq:lanczos}
    \beta_{j+1}\vect{q}_{j+1} = A\vect{q}_j - \alpha_j\vect{q}_j - \beta_j\vect{q}_{j-1}
\end{equation}
\begin{marginfigure}
{\em Historical Note:} Lanczos discovered this elegant recurrence in 1950, but its power remained hidden until modern computers made large-scale computation feasible. Like a prophetic verse, its meaning emerged only with time.
\end{marginfigure}
where coefficients $\alpha_j,\beta_j$ emerge naturally during orthogonalization. This recurrence maintains both simplicity  --  requiring just one matrix multiplication per step  --  and numerical stability through careful management of orthogonality.

The Lanczos vectors $\{\vect{q}_1,\ldots,\vect{q}_k\}$ provide excellent approximations to singular vectors, with errors decreasing rapidly as $k$ grows. More remarkably, the tridiagonal matrix of coefficients:
\[
    T_k = \begin{bmatrix}
    \alpha_1 & \beta_2 & & \\
    \beta_2 & \alpha_2 & \beta_3 & \\
    & \beta_3 & \alpha_3 & \ddots \\
    & & \ddots & \ddots
    \end{bmatrix}
\]
captures the essential spectrum of $A$  --  its eigenvalues rapidly converge to singular values of $A$. This reduction from large sparse to small tridiagonal form exemplifies how careful algorithm design can extract structure efficiently.

Modern applications often involve matrices so large that even single matrix-vector products prove expensive. Randomization offers a powerful path forward through a surprisingly simple idea: project our large matrix onto a small random subspace. Consider the sketched matrix:
\[
    Y = A\Omega \quad\text{where}\quad \Omega\in\R^{n\times(k+p)}
\]
has independent standard normal entries. Though apparently reckless, this random projection tends to preserve the dominant singular subspace with remarkable accuracy.

\begin{theorem}[Random Projection]
\label{thm:randproj}
Let $A\in\R^{m\times n}$ have singular value decomposition $A=U\Sigma V^T$, and let $\Omega\in\R^{n\times(k+p)}$ have independent standard normal entries. Then with probability at least $1-5e^{-p}$:
\[
    \|A - Q_kQ_k^TA\|_2 \leq (1+11\sqrt{k/p})\sigma_{k+1}
\]
where $Q_k$ has orthonormal columns spanning the range of $Y=A\Omega$.
\end{theorem}
\begin{marginfigure}
{\em BONUS!} The factor $(1+11\sqrt{k/p})$ reveals a profound truth: we can achieve near-optimal accuracy with random projections just slightly larger than our target rank ($p \approx k$).
\end{marginfigure}
For streaming data where observations arrive sequentially:
\[
    A_t = A_{t-1} + \vect{u}_t\vect{v}_t^T
\]
%
\begin{marginfigure}
{\em Think:} The challenge of streaming computation mirrors Chapter 9's iterative systems - how can we maintain essential structure while processing infinite sequences of updates?
\end{marginfigure}
%
we seek to maintain a low-rank approximation without storing the full matrix. The \style{incremental SVD} provides an elegant solution: given rank-$k$ approximation $A_{t-1}\approx U_{t-1}\Sigma_{t-1}V_{t-1}^T$, we can efficiently update it while maintaining fixed rank through small SVDs of matrices no larger than $(k+1)\times(k+1)$.

\begin{theorem}[Incremental Error Growth]
\label{thm:incremental}
The error in incremental rank-$k$ approximation grows as:
\begin{enumerate}
    \item $O(\sqrt{t})$ in Frobenius norm
    \item $O(\log t)$ in spectral norm
\end{enumerate}
where $t$ is the number of rank-one updates processed.
\end{theorem}

This controlled degradation enables processing of essentially infinite data streams while maintaining approximate optimality. The evolution from power method through randomized algorithms to streaming updates reflects a broader pattern in computational mathematics: as problems scale beyond traditional limits, we maintain accuracy by carefully relaxing requirements  --  accepting probabilistic guarantees or approximate optimality in exchange for dramatic efficiency gains.

Practice suggests the following guidelines for algorithm selection:
\begin{enumerate}
    \item Use power iteration for quick estimates of dominant direction
    \item Apply block methods when several singular vectors are needed
    \item Choose Lanczos for highest accuracy with moderate size
    \item Employ randomized methods for extremely large problems
    \item Use incremental updates for streaming data
\end{enumerate}
Each method balances different tradeoffs between accuracy, efficiency, and implementation complexity.

% ==============================================
\section{Incompleteness \& Reconstruction}
\label{sec:incomplete}
% ==============================================

Organic data frequently arrives with missing entries. Recommender systems know only a tiny fraction of user preferences; sensor networks suffer occasional failures; laboratory measurements capture only certain interactions between proteins. These scenarios share a common mathematical structure: we observe some entries of a matrix while others remain unknown, yet the underlying data often has low rank due to natural constraints or patterns.

\begin{definition}[Matrix Completion Problem]
\label{def:completion}
Let $M\in\R^{m\times n}$ be an unknown matrix, and let $\Omega\subset\{1,\ldots,m\}\times\{1,\ldots,n\}$ denote a set of observed indices. Given observations $\{m_{ij}:(i,j)\in\Omega\}$, the \style{matrix completion problem} seeks to recover $M$ under the assumption it has low rank. Formally, we aim to solve:
\[
    \min_X \rank(X) \quad\text{subject to}\quad x_{ij} = m_{ij}\text{ for all }(i,j)\in\Omega
\]
\end{definition}
%
\begin{marginfigure}
{\em Example:} In collaborative filtering, we observe only a tiny fraction of possible user-item ratings, yet seek to predict unobserved preferences by exploiting latent low-rank structure.
\end{marginfigure}

This optimization problem proves challenging due to the discrete nature of matrix rank. A powerful relaxation emerges through the nuclear norm introduced in Section \ref{sec:optapprox}:

\begin{definition}[Nuclear Norm]
\label{def:nuclear}
The \style{nuclear norm} of a matrix $A$, denoted $\|A\|_*$, equals the sum of its singular values:
\[
    \|A\|_* = \sum_{i=1}^{\min\{m,n\}} \sigma_i(A)
\]
\begin{marginfigure}
{\em Foreshadowing:} The nuclear norm's role in matrix completion previews how modern deep learning uses carefully chosen regularization to induce desired structure in learned representations.
\end{marginfigure}
This norm provides a convex relaxation of matrix rank, since $\rank(A)$ equals the number of nonzero singular values.
\end{definition}

If $M$ has rank $r$, it admits factorization $M=UV^T$ where $U\in\R^{m\times r}$ and $V\in\R^{n\times r}$. The partial observations provide constraints:
\[
    \sum_{k=1}^r u_{ik}v_{jk} = m_{ij} \quad\text{for all }(i,j)\in\Omega
\]
where $u_{ik}$ and $v_{jk}$ are entries of $U$ and $V$ respectively. Recovery becomes possible when these equations sufficiently constrain the factors.

\begin{theorem}[Matrix Completion]
\label{thm:completion}
Let $M\in\R^{m\times n}$ be a rank-$r$ matrix with singular value decomposition $M=U\Sigma V^T$, where $\sigma_r(M)>0$. Let $\Omega$ contain entries sampled uniformly at random. Then with probability at least $1-cn^{-3}$ (for some constant $c$), $M$ is the unique solution to:
\[
    \min_X \|X\|_* \quad\text{subject to}\quad x_{ij}=m_{ij}\text{ for all }(i,j)\in\Omega
\]
provided:
\begin{enumerate}
    \item $|\Omega| \geq C\mu r(m+n)\log^2(m+n)$ entries are observed
    \item The singular vectors $\{\vect{u}_i\}$ and $\{\vect{v}_i\}$ satisfy the \style{incoherence condition}:
    \[
        \max_{i,j} \left\{\frac{m}{r}\|\proj{U}\vect{e}_i\|^2, \frac{n}{r}\|\proj{V}\vect{e}_j\|^2\right\} \leq \mu
    \]
    where $\proj{U}$ and $\proj{V}$ denote projection onto the column spaces of $U$ and $V$ respectively
\end{enumerate}
\begin{marginfigure}
{\em Think:} Incoherence ensures information spreads evenly through the matrix rather than concentrating in a few entries - like requiring good experimental design in statistics.
\end{marginfigure}
Here $C$ and $\mu$ are numerical constants independent of matrix dimensions.
\end{theorem}

The incoherence condition proves crucial  --  it ensures information spreads evenly through the matrix rather than concentrating in a few entries. When singular vectors align closely with coordinate axes, individual entries may become essential for recovery. The random sampling ensures we capture enough information about all directions.

\begin{example}[Movie Recommendations]
\label{ex:netflix}
Consider a matrix where rows represent users, columns represent movies, and entries give ratings (1-5 stars). Most users rate only a tiny fraction of movies, yet patterns emerge: users with similar tastes rate films similarly; movies of similar genre receive correlated scores. These patterns manifest mathematically as low-rank structure.

For a dataset with 1000 users and 1000 movies, traditional approaches might require all million potential ratings. Yet if underlying preferences depend on just 40 latent factors (genre preferences, production values, etc.), the matrix completion theory shows we can recover accurate predictions from roughly 80,000 observed ratings  --  a dramatic reduction enabling practical recommender systems.
\end{example}

Though theoretically elegant, nuclear norm minimization requires careful algorithmic treatment for large-scale problems. The \style{proximal gradient method} provides an efficient approach through iteration:
\begin{align*}
    Y_k &= X_k - \eta_k\nabla f(X_k) \\
    X_{k+1} &= \mathcal{S}_{\lambda\eta_k}(Y_k)
\end{align*}
where $f(X)$ measures fidelity to observed entries, $\eta_k$ controls step size, and $\mathcal{S}_\tau$ denotes singular value soft-thresholding:
\[
    \mathcal{S}_\tau(Y) = U\diag(\max\{\sigma_i-\tau,0\})V^T
\]
for SVD $Y=U\Sigma V^T$. This algorithm effectively performs gradient descent in the space of low-rank matrices.

More sophisticated models handle real-world complexities like noise and systematic bias. The \style{robust matrix completion} problem modifies our optimization to allow for errors:
\[
    \min_{X,S} \|X\|_* + \lambda\|S\|_1 \quad\text{subject to}\quad x_{ij} + s_{ij} = m_{ij}\text{ for }(i,j)\in\Omega
\]
Here $S$ captures sparse errors while $X$ maintains low-rank structure. The parameter $\lambda$ balances these competing goals  --  larger values favor low rank over error tolerance.

\begin{example}[Sensor Networks]
\label{ex:sensors}
A network of temperature sensors distributed through a building should exhibit strong low-rank structure  --  nearby sensors record similar values, while daily and seasonal patterns affect all sensors systematically. When some sensors fail, matrix completion can interpolate their readings from surviving measurements. The incoherence conditions of Theorem \ref{thm:completion} translate to physical insight: recovery works best when sensors spread evenly through the space, avoiding dense clusters or sparse regions.
\end{example}

The theoretical guarantees for matrix completion reveal a profound truth: structure often renders redundant what initially appears essential. Just as Chapter \ref{ch:10}'s singular value decomposition exposed redundancy in full matrices, completion theory shows how partial observations suffice when underlying patterns exist. This principle  --  that structure enables reconstruction from incomplete data  --  appears throughout modern data science, from compressed sensing to neural network training.


% ==============================================
\section{Robust Matrix Factorization}
\label{sec:robust}
% ==============================================

Data rarely arrives pristine enough for direct approximation. Though the low-rank methods we have developed prove powerful when patterns lie hidden in high dimensions, real measurements often suffer more fundamental corruption. Sensors fail completely rather than merely adding noise; experimental procedures introduce systematic bias; malicious actors may deliberately contaminate observations. Yet in many cases, the underlying low-rank structure persists beneath these distortions, much as a crystal's fundamental symmetries remain even when its surface is marred.

Consider first how corruption manifests in matrix observations:
\begin{enumerate}
    \item Gross errors in individual measurements
    \item Systematic bias affecting entire rows or columns
    \item Adversarial corruption targeting specific patterns
    \item Random noise from measurement uncertainty
\end{enumerate}
These distinct forms of corruption demand different mathematical treatment, yet share a common theme: they represent sparse deviations from an underlying low-rank structure.

\begin{definition}[Robust Principal Component Analysis]
\label{def:rpca}
The \style{robust principal component analysis} problem seeks to decompose an observed matrix $M$ as:
\[
    M = L + S + N
\]
where:
\begin{itemize}
    \item $L$ has low rank (capturing true patterns)
    \item $S$ is sparse (representing gross errors)
    \item $N$ contains small random noise
\end{itemize}
The noiseless version ($N=0$) admits elegant theoretical analysis while retaining essential features.
\end{definition}
\begin{marginfigure}
{\em Compare:} Like the principal components of Chapter 11, robust PCA seeks fundamental patterns in data - but now explicitly modeling and separating out corruption and noise.
\end{marginfigure}
This insight  --  that corruption often affects only a small fraction of entries while leaving most observations reliable  --  suggests combining the nuclear norm from Section \ref{sec:incomplete} with the $\ell_1$ norm to promote both low rank and sparsity:

\begin{theorem}[Principal Component Pursuit]
\label{thm:pcp}
Let $M = L_0 + S_0$ where $L_0$ has rank $r$ and $S_0$ has at most $k$ nonzero entries with random signs. Under the conditions:
\begin{enumerate}
    \item $\rank(L_0) \leq \rho_r \min\{m,n\}$
    \item $\|S_0\|_0 \leq \rho_s mn$
    \item The singular vectors of $L_0$ satisfy the incoherence condition of Theorem \ref{thm:completion}
\end{enumerate}
the solution to:
\[
    \min_{L,S} \|L\|_* + \lambda\|S\|_1 \quad\text{subject to}\quad L + S = M
\]
exactly recovers $L_0$ and $S_0$ with high probability when $\lambda = 1/\sqrt{\max\{m,n\}}$ and $\rho_r,\rho_s$ are sufficiently small constants.
\end{theorem}
\begin{marginfigure}
    {\em BONUS!} The optimization in Theorem \ref{thm:pcp}, though convex, requires careful algorithmic treatment for large-scale problems. The \style{alternating direction method of multipliers} (ADMM) provides an efficient approach through iteration.
\end{marginfigure}

\begin{example}[Video Surveillance]
\label{ex:surveillance}
Consider a fixed camera recording a scene where most variation comes from a few moving objects against a static background. The resulting data matrix $M$ has rows indexing frames and columns indexing pixels. The background contributes a low-rank component $L$ (as it changes slowly if at all), while moving objects create sparse changes $S$. Despite occasional sensor glitches or lighting changes, robust PCA successfully separates these components:
\[
    \begin{bmatrix}
    \text{frame 1} \\
    \text{frame 2} \\
    \vdots \\
    \text{frame } n
    \end{bmatrix}
    =
    \underbrace{
    \begin{bmatrix}
    \text{background} \\
    \text{background} \\
    \vdots \\
    \text{background}
    \end{bmatrix}
    }_{\text{rank } \approx 1-4}
    +
    \underbrace{
    \begin{bmatrix}
    \text{moving objects} \\
    \text{moving objects} \\
    \vdots \\
    \text{moving objects}
    \end{bmatrix}
    }_{\text{sparse}}
\]
\end{example}

More realistic scenarios often require additional structure in our decomposition. When corruptions display patterns  --  systematic bias in sensor calibration, coordinated manipulation of ratings  --  we might model $S$ through specialized norms capturing this structure:
\[
    \min_{L,S} \|L\|_* + \lambda\Omega(S) \quad\text{subject to}\quad L + S = M
\]
where $\Omega(\cdot)$ encodes our assumptions about corruption patterns.

\begin{example}[Collaborative Filtering]
\label{ex:collab}
In recommendation systems, some users deliberately manipulate ratings to promote or demote certain items. Rather than treating these as independent corruptions, we might model them through group sparsity:
\[
    \Omega(S) = \sum_{i=1}^m \sqrt{\sum_{j=1}^n s_{ij}^2}
\]
This row-wise $\ell_{2,1}$ norm encourages entire users to be identified as manipulative rather than treating each rating independently.
\end{example}

When noise levels vary across observations, weighted variants prove valuable:
\[
    \min_{L,S} \|L\|_* + \lambda\|W\odot S\|_1 \quad\text{subject to}\quad L + S = M
\]
where $W$ contains entry-wise weights and $\odot$ denotes Hadamard (element-wise) product. This allows us to place more trust in reliable measurements while remaining suspicious of potentially corrupted ones.

\begin{theorem}[Stable Recovery]
\label{thm:stable}
Under the conditions of Theorem \ref{thm:pcp}, if $M = L_0 + S_0 + N$ where $\|N\|_F \leq \epsilon$, then the solution $(L,S)$ to the robust PCA problem satisfies:
\[
    \|L-L_0\|_F^2 + \|S-S_0\|_F^2 \leq C\epsilon^2
\]
for some constant $C$ depending only on matrix dimensions.
\end{theorem}
% \begin{marginfigure}
% {\em BONUS!} The stability guarantees for matrix recovery connect deeply to random matrix theory and concentration of measure - topics beyond our scope but central to modern high-dimensional statistics.
% \end{marginfigure}

The theoretical guarantees for robust matrix factorization reveal a fundamental principle: structure can be recovered even from severely corrupted observations, provided the corruption itself exhibits some form of simplicity (like sparsity). This principle  --  that patterns become clearer when viewed through appropriate mathematical lenses  --  will guide our development of neural networks in Chapter \ref{ch:13}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{The Netflix Prize}
\label{EM:netflix}
% **************** EMANATION *******************

Like archaeologists reconstructing ancient texts from fragmentary scrolls, modern recommender systems face the challenge of inferring complete meaning from sparse observations. The Netflix Prize of 2006-2009 transformed this metaphorical reconstruction into precise mathematics, demonstrating how the theory of low-rank matrix approximation enables machines to predict human preference at unprecedented scale. This million-dollar challenge would unite the abstract mathematics developed in this chapter with practical machine learning, revealing deep connections between matrix factorization and collaborative filtering.

The mathematical landscape seemed clear: a massive but sparse matrix $R\in\R^{m\times n}$ containing roughly 100 million ratings from $m=480,000$ users across $n=17,700$ movies. Each entry $r_{ij}$ represented one user's 1-5 star rating of one movie, with over 98\% of entries missing --- most users rate only a tiny fraction of available films. The formal objective was simple: minimize the root mean squared error (RMSE) of predictions:
\[
    \text{RMSE} = \sqrt{\frac{1}{|\Omega|}\sum_{(i,j)\in\Omega}(r_{ij} - \hat{r}_{ij})^2}
\]
where $\Omega$ denotes the set of observed ratings and $\hat{r}_{ij}$ represents predicted ratings.

\begin{marginfigure}
{\em Historical Note:} The 10\% improvement target was chosen based on Netflix's internal analysis suggesting this threshold would yield meaningful improvement in user experience. Few anticipated it would take three years to achieve.
\end{marginfigure}

The winning solution revealed the profound unity between theory and practice. At its core lay the singular value decomposition and its offspring --- the very tools developed in Section \ref{sec:incomplete} for matrix completion. Yet real-world recommender systems must contend with complexities our pristine theory ignores:

\begin{enumerate}
    \item Temporal effects: viewing patterns shift over time
    \item User bias: some rate generously, others harshly
    \item Item bias: some films consistently rate higher than others
    \item Implicit feedback: not rating a film conveys information
\end{enumerate}

Pure low-rank approximation, though theoretically elegant, proved insufficient. The winning algorithm addressed these complexities through careful decomposition of the ratings matrix. For user $u$ rating item $i$ at time $t$, the predicted rating took the form:
\[
    \hat{r}_{ui}(t) = \mu + b_u(t) + b_i(t) + \vect{q}_i^T\left(\vect{p}_u + |\mathcal{N}(u)|^{-\frac{1}{2}}\sum_{j\in\mathcal{N}(u)}\vect{y}_j\right)
\]
Here:
\begin{itemize}
    \item $\mu$ represents global mean rating
    \item $b_u(t)$ and $b_i(t)$ capture time-varying user and item biases
    \item Vectors $\vect{q}_i,\vect{p}_u\in\R^f$ encode $f$ latent features
    \item The sum over $\mathcal{N}(u)$ incorporates implicit feedback
\end{itemize}

This decomposition embodies a profound insight: human preference, though complex, often reduces to the interaction of simpler patterns. The Netflix Prize revealed how customer preference emerges from interpretable features. The parameters were learned by solving the regularized optimization:
\[
    \min_{b,\vect{p},\vect{q},\vect{y}} \sum_{(u,i)\in\Omega} \left(r_{ui} - \hat{r}_{ui}\right)^2 + 
    \lambda\left(\|b_u\|^2 + \|b_i\|^2 + \|\vect{p}_u\|^2 + \|\vect{q}_i\|^2 + \sum_{j\in\mathcal{N}(u)}\|\vect{y}_j\|^2\right)
\]
matching the nuclear norm regularization developed in Section \ref{sec:incomplete}.

Implementation revealed equally important lessons about robust matrix factorization. Raw ratings proved surprisingly noisy, contaminated by:
\begin{itemize}
    \item Varying interpretation of star ratings across users
    \item Temporal drift in rating patterns
    \item Selection bias in which movies users choose to rate
    \item Adversarial ratings from competing services
\end{itemize}

These challenges demanded the robust techniques developed in Section \ref{sec:robust}. By carefully modeling and removing systematic biases while remaining suspicious of extreme ratings, the winning team achieved an RMSE improvement of 10.06\% --- just barely crossing the threshold that had seemed impossible when the contest began. Their success demonstrated how theoretical insight enables practical engineering: without the mathematical foundation of matrix factorization, no amount of algorithmic ingenuity could have succeeded.

The legacy of the Netflix Prize extends far beyond movie recommendations. Its insights now power personalization across the digital economy, from e-commerce to music streaming to social media content ranking. Each applies the same core principle: that human preference, though seemingly chaotic, often admits low-rank approximation when viewed through appropriate mathematical lenses.
%
\begin{marginfigure}
{\em Nota bene:} Though Netflix never implemented the winning algorithm due to engineering complexity, the competition's insights fundamentally transformed how industry approaches recommender systems.
\end{marginfigure}

% **************** END EMANATION ****************

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Network Anomaly \& Detection}
\label{EM:network}
% **************** EMANATION *******************

Within the pulsing arteries of the internet, traffic flows between endpoints like blood through vessels, carrying the lifeblood of modern communication. Each packet's journey from source to destination leaves traces in router logs and network monitors, creating vast matrices of traffic data. Yet hidden within this digital circulatory system lie both normal patterns --- the regular rhythms of email, web browsing, and video streaming --- and dangerous anomalies that might signal security breaches or impending failures. The theory of low-rank approximation developed in this chapter provides powerful tools for distinguishing these patterns from background noise.

Consider a network with $n$ nodes exchanging traffic over time. Each measurement produces a traffic matrix $X(t)\in\R^{n\times n}$ where entry $x_{ij}(t)$ represents the volume of data flowing from node $i$ to node $j$ during time interval $t$. Collecting these snapshots over $m$ time periods yields a third-order tensor $\mathcal{X}\in\R^{n\times n\times m}$. Though seemingly complex, this traffic tensor often admits remarkably simple description through low-rank approximation.

\begin{marginfigure}
{\em Historical Note:} Early attempts at network anomaly detection focused on simple statistical outlier tests. The application of matrix factorization techniques in the early 2000s revolutionized the field by revealing coordinated patterns invisible to simpler methods.
\end{marginfigure}

The key insight emerges from examining the tensor's structure. Normal network traffic exhibits strong patterns:
\begin{enumerate}
    \item Spatial correlation: nearby nodes often share similar traffic profiles
    \item Temporal periodicity: daily and weekly cycles dominate normal use
    \item Low intrinsic dimension: most traffic follows predictable paths
\end{enumerate}

These patterns suggest decomposing each time slice $X(t)$ into three components:
\[
    X(t) = L(t) + S(t) + N(t)
\]
where $L(t)$ captures low-rank normal traffic patterns; $S(t)$ represents sparse anomalous flows; and $N(t)$ contains measurement noise.

This decomposition exactly matches the robust PCA framework developed in Section \ref{sec:robust}. By solving:
\[
    \min_{L,S} \|L\|_* + \lambda\|S\|_1 \quad\text{subject to}\quad L + S = X
\]
for each time slice, we separate regular traffic patterns from potential anomalies. The nuclear norm $\|L\|_*$ encourages low-rank structure in normal traffic, while the $\ell_1$ norm $\|S\|_1$ promotes sparse anomalies.

\begin{example}[DDoS Attack Detection]
During a distributed denial of service (DDoS) attack, many compromised machines flood a single target with traffic. This creates a characteristic pattern in $S(t)$: many small entries in one column (multiple sources) targeting one destination. The sparsity constraint naturally highlights such coordinated anomalies even when individual flows appear innocent.

Consider a network with 1000 nodes under attack. The normal traffic component $L(t)$ typically has rank 10-20, reflecting legitimate communication patterns. During attack, $S(t)$ reveals the assault through elevated values in a single column, even though each individual source contributes only a small fraction of total traffic.
\end{example}

More sophisticated analysis exploits temporal structure through tensor decomposition. Writing the traffic tensor in matrix form by stacking time slices:
\[
    \mathbf{X} = \begin{bmatrix} X(1) \\ X(2) \\ \vdots \\ X(m) \end{bmatrix}
\]
reveals additional patterns through its singular value spectrum. Normal traffic typically produces a sharp drop after 20-30 singular values, while anomalies create distinctive perturbations to this spectrum.

\begin{marginfigure}
{\em Nota bene:} The choice of temporal resolution crucially affects what patterns emerge. Too fine a granularity amplifies noise; too coarse misses important structure. Most implementations use multiple timescales simultaneously.
\end{marginfigure}

The robust decomposition framework handles several practical challenges:
\begin{itemize}
    \item Missing data from failed monitors
    \item Quantization effects from packet counting
    \item Time-varying network topology
    \item Mixed normal and abnormal traffic
\end{itemize}

These issues connect directly to the matrix completion theory of Section \ref{sec:incomplete}. Just as Netflix must predict unseen ratings, network analysis must infer missing traffic measurements while remaining robust to corrupted observations.

Modern implementations extend these ideas through streaming algorithms that process traffic data in real-time. The incremental SVD techniques developed in Section \ref{sec:algorithms} enable continuous monitoring of network health with bounded memory requirements. When anomalies appear, their projection onto the dominant singular vectors often reveals their nature:
\begin{itemize}
    \item Port scans appear as sparse rows
    \item DDoS attacks create dense columns
    \item Worm propagation shows characteristic diagonal patterns
    \item Data exfiltration manifests as sustained point-to-point flows
\end{itemize}

Yet this power brings responsibility. The same techniques that detect malicious traffic could potentially be used to de-anonymize users or track communication patterns. As networks grow more central to modern life, the balance between security and privacy becomes increasingly delicate. These concerns mirror those we encountered with recommendation systems --- mathematical tools that are powerful but not inherently ethical.

The application of low-rank approximation to network traffic analysis exemplifies a broader principle: that complex real-world systems often possess simpler underlying structure waiting to be discovered. Just as Chapter \ref{ch:10}'s singular value decomposition revealed preferred directions in abstract vector spaces, the techniques developed here expose the fundamental patterns in digital communication. As we move toward neural networks in Chapter \ref{ch:13}, this insight --- that high-dimensional data often lives near lower-dimensional manifolds --- will prove increasingly valuable.


% **************** END EMANATION ****************
% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 12}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}

% Simple SVD computation, good starter problem
\item Let $A = \begin{bmatrix}4 & 2 & 1\\2 & 3 & 0\\1 & 0 & 2\end{bmatrix}$. Find the best rank-2 approximation $A_2$ using SVD truncation. Compare the Frobenius norm error $\|A-A_2\|_F$ to the theoretical minimum guaranteed by the Eckart-Young-Mirsky theorem.

% Basic power method calculation
\item The power method converges linearly with rate $|\sigma_2/\sigma_1|$. For matrix
\[
    A = \begin{bmatrix}3 & 1\\1 & 2\end{bmatrix}
\]
compute three iterations starting from $\vect{x}_0=(1,0)^T$. Estimate the convergence rate and compare to theory. What initial vector would give fastest convergence?

% Practical block structure analysis
\item Image compression often involves partitioning into blocks before low-rank approximation. For the matrix
\[
    A = \begin{bmatrix}
    100 & 98 & 45 & 43 \\
    97 & 95 & 47 & 44 \\
    42 & 45 & 153 & 155 \\
    44 & 46 & 152 & 154
    \end{bmatrix}
\]
compare the error from: (a) rank-2 approximation of the full matrix, versus (b) rank-1 approximations of each $2\times 2$ block. Explain which approach better captures the block structure.

% Nice concrete application with clear structured data
\item A chemical process generates noisy measurements of rank-2 structure corrupted by sparse errors:
\[
    M = \begin{bmatrix}
    2.1 & 4.2 & -15.0 & 8.1 \\
    4.0 & 8.1 & 12.2 & 16.0 \\
    6.2 & 12.0 & 18.1 & 23.9 \\
    8.0 & 16.2 & 24.1 & 32.2
    \end{bmatrix}
\]
Using patterns in the data, identify likely outliers and propose a clean rank-2 structure. Justify your decomposition.

% Basic robust PCA example
\item Consider the robust PCA problem $M=L+S$ where
\[
    M = \begin{bmatrix}
    1 & 2 & 10 \\
    2 & 4 & 3 \\
    3 & 6 & 4
    \end{bmatrix}
\]
Explain why $(3,3)$ entry likely represents an outlier. Find a reasonable decomposition into low-rank $L$ and sparse $S$ components.

% SVD properties without explicit computation
\item A rank-3 matrix $A\in\R^{5\times 4}$ has first two singular values $\sigma_1=10$ and $\sigma_2=5$. Without computing $A_1$ explicitly, find:
\begin{enumerate}
    \item The minimal achievable error $\|A-B\|_F$ for any rank-1 matrix $B$
    \item The Frobenius norm difference $\|A_2-A_1\|_F$ between best rank-2 and rank-1 approximations
    \item The unknown singular value $\sigma_3$
\end{enumerate}

% Matrix completion - simple case but requires deeper thinking
\item Consider the matrix completion problem where we observe entries
\[
    M = \begin{bmatrix}2 & ? & 1\\? & 4 & ?\\1 & ? & 2\end{bmatrix}
\]
Find a rank-1 matrix consistent with these observations, or prove none exists. How many rank-2 completions exist? Relate your answer to the sampling conditions of Theorem \ref{thm:completion}.

% Applied problem with physical constraints
\item A sensor network measures temperature at 5 locations every hour. After three days, two sensors fail simultaneously. The data matrix becomes:
\[
    M = \begin{bmatrix}
    72 & 70 & 68 & ? & ? \\
    75 & 74 & 71 & ? & ? \\
    69 & 67 & 65 & ? & ? \\
    73 & 71 & 69 & ? & ? \\
    \vdots & \vdots & \vdots & \vdots & \vdots
    \end{bmatrix}
\]
Explain how daily and spatial patterns create low-rank structure. If the first singular value explains 85% of variation and the second 12%, how would you choose the rank for completion? What physical constraints should the solution respect?

% Theoretical but with practical implications
\item Prove that for any matrix $A$ and index $k$, the difference between successive best rank approximations satisfies
\[
    \|A_k - A_{k-1}\|_F^2 = \sigma_k^2
\]
Use this to explain why examining ratios $\sigma_k/\sigma_{k+1}$ helps choose truncation rank.

% Practical algorithmic insight
\item Let $A$ have SVD $A=U\Sigma V^T$ and consider updating a rank-$k$ approximation when a new column $\vect{c}$ arrives. Prove that computing $(A|\vect{c})_k$ from scratch is unnecessary - the update requires only $O(mk)$ operations where $m$ is the row dimension. How would you maintain an approximate SVD for streaming data?

% Analysis of randomized methods
\item The randomized SVD algorithm computes $Y=AQ$ where $Q$ has orthonormal columns. Consider the error bound:
\[
    \|A - QQ^TA\|_2 \leq (1+\epsilon)\sigma_{k+1}
\]
where $\epsilon$ depends on oversampling. Explain why this motivates using $Q$ as a basis for rank-k approximation. How would you choose the oversampling parameter in practice?

% Theoretical analysis of projection
\item For matrices $A$ and $B$ of compatible size, prove that
\[
    \left\|\proj{\im A} B\right\|_F 
    \leq \|B\|_F
\]
where $\proj{\im A}$ denotes orthogonal projection onto the image of $A$. When does equality hold? How does this relate to optimal low-rank approximation?

% Block structure theory
\item Let $A$ be a matrix with blocks:
\[
    A = \begin{bmatrix}
    A_{11} & A_{12} \\
    A_{21} & A_{22}
    \end{bmatrix}
\]
Prove that $\rank(A) \geq \max\{\rank(A_{11}), \rank(A_{22})\}$. Then construct an example showing this bound can be strict. How does this inform strategies for block-wise low-rank approximation?

% Deeper theoretical analysis
\item Let $A$ be a rank-$r$ matrix whose smallest nonzero singular value is $\sigma_r$. Prove that if $\|E\|_2 < \sigma_r/2$, then $\rank(A+E) \geq r$. Use this to explain why rank estimation from noisy data requires examining singular value gaps rather than just counting nonzero values.

% Matrix completion theory
\item Explain why matrix completion typically fails when missing entries form a regular pattern rather than occurring randomly. Construct a simple $3\times 3$ example demonstrating this phenomenon. How does this relate to the incoherence conditions in Theorem \ref{thm:completion}?

% Product rank analysis
\item For two rank-$k$ matrices $A$ and $B$, prove their product $AB$ has rank at most $k$. Then show this bound is tight by constructing matrices achieving equality. How does this inform algorithms for approximate matrix multiplication?

% Analysis of Gram matrices and convergence
\item Let $\{\vect{x}_1,\ldots,\vect{x}_n\}$ be unit vectors in $\R^m$. The Gram matrix $G$ has entries $g_{ij}=\vect{x}_i^T\vect{x}_j$. Prove $G$ has rank at most $m$ and relate its singular values to principal angles between subspaces. How might this help analyze convergence of iterative methods?

% Advanced algorithmic problem
\item The weighted low-rank approximation problem seeks to minimize:
\[
    \|W \odot (A-X)\|_F \quad\text{subject to}\quad \rank(X)\leq k
\]
where $\odot$ denotes elementwise multiplication and $W$ has nonnegative entries. Explain why the solution is no longer given by truncated SVD. Propose an iterative algorithm for this problem based on alternating minimization.

% Deeper theoretical insights about nuclear norm
\item For an arbitrary matrix $A$, prove that the nuclear norm $\|A\|_*$ equals the optimal value of
\[
    \min_{UV^T=A} \frac{1}{2}(\|U\|_F^2 + \|V\|_F^2)
\]
Use this to explain why nuclear norm minimization promotes low-rank solutions in matrix completion.

% Analytical scaling problem
\item Consider an $n\times n$ matrix $A$ with exponentially decaying singular values $\sigma_k = 2^{-k}$. For error tolerance $\epsilon$:
\begin{enumerate}
    \item Find the minimal rank needed for Frobenius norm approximation
    \item Compare memory requirements of storing full versus low-rank form
    \item Analyze relative efficiency of power iteration versus randomized methods
\end{enumerate}
How do your conclusions depend on matrix dimension $n$?

% Advanced convergence analysis
\item (Challenge) Let $A$ be a rank-$r$ matrix with condition number $\kappa$. Prove that after $t$ iterations, the error in power iteration satisfies
\[
    \|\sin\Theta(\vect{x}_t,\vect{v}_1)\| \leq (1/\kappa)^t\|\tan\Theta(\vect{x}_0,\vect{v}_1)\|
\]
where $\Theta$ denotes the angle between subspaces. What does this reveal about initialization sensitivity?

\end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THE SYNTHESIS OF ALBION
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\cleardoublepage
\thispagestyle{empty} % no headers/footers

\begin{fullwidth}
  \vspace*{\fill} % push content down to center vertically
  \centering
  \includegraphics[width=0.75\textwidth]{ALBION.png} % adjust width as needed
  \vspace*{\fill} % push content up to center vertically
\end{fullwidth}

\cleardoublepage

%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
\chapter{Neural Networks \& AI}
\label{ch:13}
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=
%=%=%=%=%=%=% CHAPTER %=%=%=%=%=%=%
%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=%=

{\em ``multitudes without number work incessant: the hewn stone is plac'd in beds of mortar mingled with the ashes of Vala''}

\newthought{The recent synthesis of linear algebra} into machine intelligence marks a profound transition both mathematical and societal. Through twelve chapters we have built a careful understanding of linear transformations -- their fundamental subspaces and quotients, their geometry through singular values, their dynamics through eigentheory, their approximation through low rank structure. 

Reasoning transcends linearity, requiring tools that can capture the subtle nonlinearities pervasive in natural and artificial systems alike. Neural networks provide exactly such tools, transforming our linear foundations into building blocks for artificial intelligence through careful composition and training.

Consider first why linearity alone proves insufficient. An image recognition system must somehow map pixel intensities to object categories; a language model must transform sequences of words into meaningful representations; a robot must convert sensor readings into appropriate actions. No purely linear transformation, however carefully chosen, can capture these fundamentally nonlinear relationships. Yet the mathematics we have developed remains essential -- not as a complete solution, but as the foundational grammar from which neural networks construct their expressive power.

This construction proceeds through composition of simple elements. Each layer of a neural network performs a linear transformation followed by a nonlinear \style{activation function}. Though individually simple, these layers combine to approximate remarkably complex mappings. The low-rank approximations studied in Chapter \ref{ch:12} hint at this power -- just as truncated singular values capture dominant patterns in data, trained networks learn to extract and transform relevant features through their layered structure. Understanding this learning process demands uniting our linear algebraic insights with optimization theory, probability, and information geometry.

Modern neural architectures have evolved far beyond simple layered networks. Convolutional networks exploit spatial structure through specialized linear operations; attention mechanisms dynamically reweigh their computations based on context; residual connections create shortcuts that ease optimization. Yet beneath these sophisticated designs lie patterns familiar from our study of linear algebra -- the role of condition numbers in training stability, the importance of rank in compression, the geometry of high-dimensional representations. Even the largest language models, processing billions of parameters, build their capabilities on careful composition of fundamentally linear operations.

Our task is to bridge the gap between classical mathematics and modern artificial intelligence. We begin with the basic architecture of neural networks, showing how nonlinearity transforms simple matrix multiplication into universal function approximation. This leads naturally to gradient-based learning algorithms, where insights from matrix calculus guide efficient optimization. Modern architectures then emerge as careful refinements of these principles, each innovation grounded in mathematical understanding. Throughout, we maintain the perspective that neural networks represent not a break from classical methods but their natural evolution -- the transformation of linear building blocks into learned intelligence through principled composition and training.

The quest to understand how neural networks achieve their remarkable capabilities -- and what fundamental limits they might face -- drives much of modern research in artificial intelligence. Though we cannot hope to resolve all mysteries, the mathematical foundations developed in this text provide essential tools for analysis and innovation. As we explore the frontiers of AI, from large language models to robot control, we shall see repeatedly how linear algebraic insights illuminate both practical engineering and deeper theoretical questions about the nature of learning and intelligence.

% ==============================================
\section{Beyond Linear Transformations}
\label{sec:beyond}
% ==============================================

Reality resists reduction to purely linear description. Though the preceding chapters have built a powerful framework for understanding linear transformations  --  their geometry through singular values, their dynamics through eigentheory, their approximation through low rank structure  --  nature demands more. The patterns that define intelligence, whether natural or artificial, trace curves and manifolds far beyond any linear subspace. Like light bending through a crystal, information must be transformed nonlinearly to reveal its deeper structure.

Consider the fundamental limitations of linearity. No purely linear transformation can separate points that are not linearly separable; no matrix multiplication alone can capture the XOR function; no sequence of linear operations can represent even simple logical decisions. Yet the brain performs such feats effortlessly, transforming raw sensory data into abstract understanding through cascades of neural activity. This suggests a profound principle: intelligence emerges not from linear operations in isolation, but from their careful composition with nonlinearity.

\begin{marginfigure}
{\em Historical Note:} The XOR function outputs 1 only when exactly one of its inputs is 1:
\[
\begin{array}{cc|c}
x_1 & x_2 & \text{XOR} \\
\hline
0 & 0 & 0 \\
0 & 1 & 1 \\
1 & 0 & 1 \\
1 & 1 & 0
\end{array}
\]
Minsky and Papert's 1969 book \textit{Perceptrons} proved single-layer networks cannot learn this simple function, effectively halting neural network research for over a decade. The solution required hidden layers  --  a insight that would wait until the 1980s backpropagation revolution.
\end{marginfigure}

The mathematical foundation for this composition comes through nonlinear operations that transform their inputs componentwise:

\begin{definition}[Activation Function]
\label{def:activation}
An \style{activation function} $\sigma:\R\to\R$ is a nonlinear function applied elementwise to vectors. Common choices include:
\begin{enumerate}
    \item ReLU (Rectified Linear Unit): $\sigma(x) = \max\{0,x\}$
    \item Sigmoid: $\sigma(x) = 1/(1+e^{-x})$
    \item Hyperbolic tangent: $\sigma(x) = \tanh(x)$
\end{enumerate}
When applied to a vector $\vect{x}$, we write $\sigma(\vect{x})=(\sigma(x_1),\ldots,\sigma(x_n))^T$.
\end{definition}

These seemingly simple functions transform linear operations into powerful building blocks for computation. The ReLU function, for instance, implements a basic form of sparsity  --  setting negative values to zero while preserving positive ones. Though its derivative is discontinuous at zero, this very discontinuity proves crucial for efficient learning. The sigmoid and hyperbolic tangent functions provide smooth transitions between asymptotic limits, but modern practice favors ReLU for its computational simplicity and beneficial gradient properties.

\begin{marginfigure}
{\em Historical Note:} Early neural networks used sigmoid activation functions in analogy with biological neurons' firing rates. The shift to ReLU around 2011 marked a crucial advance, enabling much deeper networks through improved gradient flow.
\end{marginfigure}

Consider now how linear and nonlinear operations combine. Given input vector $\vect{x}\in\R^n$, \style{weight matrix} $W\in\R^{m\times n}$, and \style{bias vector} $\vect{b}\in\R^m$, a single neural network layer computes:
\begin{equation}
\label{eq:Wx+b}
    \vect{h} = \sigma(W\vect{x} + \vect{b})
\end{equation}
This composition of matrix multiplication, vector addition, and elementwise nonlinearity forms the fundamental building block of neural computation. Though each operation is simple, their combination enables remarkable expressivity:

\begin{lemma}[Universal Approximation]
\label{lem:universal}
A neural network with a single hidden layer of sufficient width can approximate any continuous function on a compact domain to arbitrary precision. More precisely, given any continuous function $f:[0,1]^n\to\R$ and error $\epsilon>0$, there exist weights $W_1,W_2$ and biases $\vect{b}_1,\vect{b}_2$ such that:
\[
    \|f(\vect{x}) - W_2\sigma(W_1\vect{x} + \vect{b}_1) + \vect{b}_2\|_\infty < \epsilon
\]
for all $\vect{x}\in[0,1]^n$.
\end{lemma}

This theoretical guarantee, though powerful, offers little practical guidance. Modern networks achieve their remarkable performance not through width but through depth  --  carefully composing many layers of alternating linear and nonlinear operations. This layered structure mirrors both biological neural networks, where different brain regions process information hierarchically, and the mathematical notion that complex functions often decompose naturally into simpler stages.

\begin{example}[Image Recognition]
Consider the task of recognizing handwritten digits. Each image arrives as a matrix of pixel intensities  --  a point in high-dimensional space. The variations in human handwriting create complex manifolds far from any linear subspace. A typical convolutional neural network transforms these images via stages:
\begin{enumerate}
    \item Linear convolution detects local patterns
    \item ReLU activation introduces nonlinearity
    \item Pooling operations provide invariance
    \item Further layers combine features hierarchically
\end{enumerate}
\begin{marginfigure}
    {\em Keep going...} For more information on CNNs, see the end of this chapter.
\end{marginfigure}
No single linear transformation could separate these digit classes, yet their composition with nonlinearity achieves remarkable accuracy.
\end{example}

The power of neural networks lies not in the complexity of their individual operations  --  both the linear transformations and activation functions remain simple  --  but in how these operations combine to approximate complex functions. Like the crystalline structure of minerals emerging from simple atomic arrangements, intelligence emerges from careful composition of fundamental mathematical operations.

This perspective transforms our understanding of what neural networks compute. Rather than viewing them as black boxes, we see them as learnable compositions of well-understood mathematical operations. The linear transformations studied in previous chapters provide the framework for information flow, while activation functions introduce the nonlinearity essential for complex computation. This unity of linear and nonlinear operations  --  of continuous transformation and discrete decision  --  provides the foundation for modern artificial intelligence.

The sections ahead will explore how these basic building blocks combine into sophisticated architectures, how networks learn through gradient-based optimization, and how modern innovations like attention mechanisms and residual connections extend these fundamental principles. Throughout, we maintain the perspective that neural networks represent not a break from classical mathematics but its natural evolution  --  the transformation of linear algebraic operations into learned intelligence through principled composition and training.

% ==============================================
\section{Network Architecture \& Matrix Factorization}
\label{sec:architecture}
% ==============================================

The layered composition of linear and nonlinear operations discussed in Section \ref{sec:beyond} suggests a natural architecture for neural computation. Like the singular value decomposition of Chapter \ref{ch:10}, neural networks factorize complex transformations into simpler components  --  not through analytically derived singular vectors, but through learned weight matrices separated by nonlinearity. This perspective transforms neural networks from biological metaphor into mathematical entity, revealing deep connections between classical matrix analysis and modern machine learning.

\begin{marginfigure}
\centering
\includegraphics[width=1.0in]{neural-net.png}
\end{marginfigure}
Consider a neural network with $L$ layers. Each layer performs the transformation described by equation (\ref{eq:Wx+b}), but now we chain these operations together:
\begin{equation}
\label{eq:feedforward}
\begin{array}{rcl}
\vect{h}_1 &=& \sigma(W_1\vect{x} + \vect{b}_1) \\
\vect{h}_2 &=& \sigma(W_2\vect{h}_1 + \vect{b}_2) \\
&\vdots& \\
\vect{y} &=& W_L\vect{h}_{L-1} + \vect{b}_L
\end{array}
\end{equation}
where $\vect{x}$ denotes input, $\vect{h}_i$ are \style{hidden layers}, and $\vect{y}$ provides output. Without the nonlinear functions $\sigma$, this would reduce to mere matrix multiplication $W_L\cdots W_1$  --  precisely the form we studied in the context of matrix factorization. The activation functions transform this linear composition into something far more expressive, yet the underlying structure of matrix multiplication remains essential.
%
\begin{marginfigure}
{\em Think:} Without activation functions, this composition would reduce to pure matrix multiplication $W_L\cdots W_1$. The nonlinearity creates expressive power beyond matrix factorization, but transforms a convex optimization problem into a non-convex one.
\end{marginfigure}
%
\begin{definition}[Feedforward Neural Network]
\label{def:ffnn}
A \style{feedforward neural network} is a function $f:\R^n\to\R^m$ parameterized by weight matrices $\{W_i\}$ and bias vectors $\{\vect{b}_i\}$ that transforms its input through successive application of equation (\ref{eq:feedforward}). The \style{width} of layer $i$ equals the number of rows in $W_i$, while the \style{depth} equals the total number of layers $L$.
\end{definition}

The choice between network depth and width presents fundamental trade-offs analogous to those encountered in low-rank approximation. Just as truncating different numbers of singular values balances approximation accuracy against complexity, the width and depth of neural networks control their expressive power. Wide but shallow networks effectively learn a large basis expansion of their target function  --  like keeping many singular values in a low-rank approximation. Deep networks instead compose simpler transformations, potentially capturing hierarchical structure more efficiently:

\begin{theorem}[Deep Network Expressivity]
\label{thm:depth}
There exist functions that can be computed by deep neural networks with polynomial size but require exponential size when restricted to bounded depth. More precisely, for any $n$ there exists a function $f:\{0,1\}^n\to\{0,1\}$ computable by a network of depth $O(n)$ and total size $O(n)$ that requires size $2^{n/\log n}$ when restricted to depth $O(\log n)$.
\end{theorem}

Modern architectures enhance this basic structure through innovations that echo ideas from matrix conditioning. \style{Skip connections} allow information to bypass layers directly:
%
\begin{marginfigure}
{\em Historical Note:} The introduction of residual networks (ResNets) in 2015 enabled training of networks with over 100 layers, shattering previous depth barriers. The insight came from asking {\em ``what if layers learned differences rather than absolute transformations?''}
\end{marginfigure}
%
\[
    \vect{h}_{i+1} = \sigma(W_i\vect{h}_i + \vect{b}_i) + \vect{h}_i
\]
Like the regularization techniques of Chapter \ref{ch:12}, these residual paths improve numerical stability by providing direct routes for gradient flow. They transform the learning task from approximating the desired function to learning its refinements  --  often a better-conditioned optimization problem. Similarly, \style{normalization layers} standardize their inputs:
\[
    \hat{\vect{h}}_i = \frac{\vect{h}_i - \mu_i}{\sqrt{\sigma_i^2 + \epsilon}}
\]
where $\mu_i,\sigma_i$ estimate mean and standard deviation. This operation controls the condition numbers of weight matrices by stabilizing the scale of intermediate representations, much as the column scaling studied in Chapter \ref{ch:1} improves matrix conditioning.
%
\begin{marginfigure}
{\em Compare:} Like the column scaling that improved matrix conditioning in Chapter 1, normalization layers stabilize computation through careful control of intermediate scales. The key difference is that these scales are learned from data rather than computed analytically.
\end{marginfigure}

These architectural choices reflect a profound principle: the effectiveness of neural networks emerges not just from their expressive power, but from how their structure enables stable optimization. Like the carefully chosen basis transformations that revealed structure in classical linear algebra, modern network architectures create pathways for efficient learning through their thoughtful composition of operations. The next section will explore how this learning process operates through systematic application of matrix calculus.

% ==============================================
\section{Chains \& Backpropagation}
\label{sec:backprop}
% ==============================================

The challenge of neural network training lies not in understanding what to optimize  --  clearly we seek parameters that minimize prediction error  --  but in computing how small changes in parameters affect network output. With potentially billions of parameters spread across many layers, direct calculation of derivatives seems hopelessly complex. Yet the chain rule of multivariable calculus provides exactly the tool we need, transforming an apparently intractable computation into an elegant recursive algorithm.

\begin{marginfigure}
{\em Historical Note:} Backpropagation first appeared in the 1974 thesis of Werbos and was subsequently ignored. Its importance for neural networks was only recognized in the 1980s through parallel discoveries by multiple researchers. The key insight -- applying the chain rule systematically through recursive computation -- comes from reverse-mode automatic differentiation techniques from the 1960s. 
\end{marginfigure}

Consider first the mathematical structure of what we wish to compute. Given a network with parameters $\PARAM$ (encoding all weights and biases), we seek to minimize some loss function $\LOSS(\PARAM)$ measuring prediction error on our training data. The challenge lies in computing $[\partial \LOSS/\partial\PARAM]$, the derivatives of loss with respect to parameters. Though $\LOSS$ is ultimately scalar-valued, it emerges from complex composition of many operations.

Common choices for the loss function include:
\begin{itemize}
    \item Mean squared error: $\LOSS(\vect{y}, \hat{\vect{y}}) = \frac{1}{n}\sum_{i=1}^n (y_i - \hat{y}_i)^2$
    \item Cross-entropy for classification: $\LOSS(\vect{y}, \hat{\vect{y}}) = -\sum_{i=1}^n y_i\log(\hat{y}_i)$
    \item L1 loss for robustness: $\LOSS(\vect{y}, \hat{\vect{y}}) = \frac{1}{n}\sum_{i=1}^n |y_i - \hat{y}_i|$
\end{itemize}
where $\vect{y}$ denotes true values and $\hat{\vect{y}}$ the network's predictions.
%
\begin{marginfigure}
{\em Nota bene:} The terms \style{loss function} and \style{cost function} are used interchangeably in the literature, with ``loss'' being more common in modern machine learning practice.
\end{marginfigure}

Each layer of the network performs a simple transformation of its inputs. For a layer with weights $W_\ell \in \R^{n_\ell \times n_{\ell-1}}$ and biases $\vect{b}_\ell \in \R^{n_\ell}$, we compute:
\[
    \vect{h}_\ell = \sigma(W_\ell\vect{h}_{\ell-1} + \vect{b}_\ell)
\]
where $\sigma$ is a nonlinear activation function applied componentwise, and $\vect{h}_{\ell-1} \in \R^{n_{\ell-1}}$ denotes the previous layer's activations. The derivatives of this operation with respect to parameters are straightforward:
\[
    \left[\frac{\partial}{\partial W_\ell}(W_\ell\vect{h}_{\ell-1} + \vect{b}_\ell)\right] = \left[\vect{h}_{\ell-1}^T \quad \vect{0}\right]
    \quad \text{and} \quad
    \left[\frac{\partial}{\partial \vect{b}_\ell}(W_\ell\vect{h}_{\ell-1} + \vect{b}_\ell)\right] = I
\]
The challenge comes in combining these simple pieces through the chain rule.

\begin{definition}[Error Signal]
\label{def:error}
The \style{error signal} $[\delta_\ell]$ at layer $\ell$ is the derivative of loss with respect to that layer's pre-activation output:
\[
    \left[\delta_\ell\right] = \left[\frac{\partial \LOSS}{\partial \vect{z}_\ell}\right] \in \R^{1 \times n_\ell}
\]
where $\vect{z}_\ell = W_\ell\vect{h}_{\ell-1} + \vect{b}_\ell \in \R^{n_\ell}$ denotes the layer's pre-activation values.
\end{definition}

\begin{lemma}[Backpropagation Rule]
\label{lem:backprop}
Let $\LOSS$ be a scalar loss function of network output. For any layer $\ell$ in a neural network:
\[
    \left[\delta_\ell\right] = \left[\delta_{\ell+1}\right]W_{\ell+1}\diag(\sigma'(\vect{z}_\ell))
\]
where $\sigma'$ denotes the derivative of the activation function. The parameter gradients are then:
\[
    \left[\frac{\partial \LOSS}{\partial W_\ell}\right] = \left[\delta_\ell\right]^T\vect{h}_{\ell-1}^T
    \quad \text{and} \quad
    \left[\frac{\partial \LOSS}{\partial \vect{b}_\ell}\right] = \left[\delta_\ell\right]^T
\]
\end{lemma}

\begin{proof}
By the chain rule:
\[
    \left[\frac{\partial \LOSS}{\partial \vect{z}_\ell}\right] = 
    \left[\frac{\partial \LOSS}{\partial \vect{z}_{\ell+1}}\right]
    \left[\frac{\partial \vect{z}_{\ell+1}}{\partial \vect{h}_\ell}\right]
    \left[\frac{\partial \vect{h}_\ell}{\partial \vect{z}_\ell}\right]
\]
The result follows from computing each factor: $[\partial \vect{z}_{\ell+1}/\partial \vect{h}_\ell] = W_{\ell+1}$ and $[\partial \vect{h}_\ell/\partial \vect{z}_\ell] = \diag(\sigma'(\vect{z}_\ell))$, noting carefully the dimensions of each matrix product.
\end{proof}

\begin{example}[Simple Neural Network]
\label{ex:simple-nn}
Consider a network with two hidden layers of width $n_1$ and $n_2$ processing input $\vect{x} \in \R^{n_0}$:
\[
\begin{array}{rcl}
\vect{h}_1 &=& \sigma(W_1\vect{x} + \vect{b}_1) \\
\vect{h}_2 &=& \sigma(W_2\vect{h}_1 + \vect{b}_2) \\
\vect{y} &=& W_3\vect{h}_2 + \vect{b}_3
\end{array}
\]
where $W_1 \in \R^{n_1 \times n_0}$, $W_2 \in \R^{n_2 \times n_1}$, and $W_3 \in \R^{n_3 \times n_2}$. For mean squared error loss $\LOSS = \frac{1}{2}\|\vect{y} - \vect{t}\|^2$ with target $\vect{t}$, backpropagation computes:
\[
\begin{array}{rcl}
\left[\delta_3\right] &=& \left[\vect{y} - \vect{t}\right]^T \\
\left[\delta_2\right] &=& \left[\delta_3\right]W_3\diag(\sigma'(\vect{z}_2)) \\
\left[\delta_1\right] &=& \left[\delta_2\right]W_2\diag(\sigma'(\vect{z}_1))
\end{array}
\]
These error signals then yield parameter gradients through matrix multiplication with stored activations.
\end{example}

\begin{marginfigure}
{\em Nota bene:} The careful tracking of matrix dimensions throughout backpropagation reveals why the algorithm produces correctly-sized gradients for each parameter matrix.
\end{marginfigure}

The algorithm's efficiency emerges from careful organization of computation:
\begin{itemize}
    \item Forward pass stores activations $\vect{h}_\ell$ and pre-activations $\vect{z}_\ell$
    \item Backward pass computes error signals $\left[\delta_\ell\right]$ from output to input
    \item Parameter gradients emerge from simple matrix operations using stored values
\end{itemize}

This recursive structure connecting forward and backward passes echoes the complementary subspaces first encountered in Chapter \ref{ch:3}. Just as kernel and image decomposed linear transformations, the forward and backward passes of backpropagation decompose the flow of information through the network. The error signals $\delta_\ell$ measure precisely how changes propagate backward through this structure.

\begin{theorem}[Backpropagation Complexity]
\label{thm:backprop-complexity}
For a network with $\NUMLAY$ layers each of width at most $n$, backpropagation computes all parameter derivatives in time $O(\NUMLAY n^2)$ using storage $O(\NUMLAY n)$.
\end{theorem}

This remarkable efficiency  --  linear in depth and quadratic in width  --  transforms neural network training from theoretical possibility to practical reality. Like the matrix factorizations studied in Chapter \ref{ch:10}, backpropagation achieves its power through careful decomposition of computation. The chain rule provides the mathematical foundation, while thoughtful algorithm design turns this insight into efficient implementation.

\begin{marginfigure}
{\em Foreshadowing:} The efficiency of backpropagation becomes crucial when we scale to deep networks with billions of parameters. The next section will show how stochastic sampling further reduces computational cost.
\end{marginfigure}

The fundamental insight of backpropagation  --  that influence flows backward through the network just as information flows forward  --  provides more than mere computational efficiency. It reveals how neural networks learn through systematic measurement of parameter influence, guided by precise mathematical principles. This fusion of calculus with computational graphs transforms the abstract possibility of gradient-based learning into practical reality.

% ==============================================
\section{Stochastic Gradient Descent}
\label{sec:sgd}
% ==============================================

The mathematics of scale prompts probabilistic thinking. Though backpropagation provides an elegant algorithm for computing gradients, applying it to every training example proves prohibitively expensive for large datasets. Like a river finding efficient paths through complex terrain, stochastic approximation transforms exact but intractable computations into efficient estimates through careful sampling. This principle  --  that randomness can accelerate computation while maintaining accuracy  --  pervades modern machine learning.

\begin{definition}[Stochastic Gradient Descent]
\label{def:sgd}
Let $\{\PARAM_t\}_{t\geq 0}$ denote a sequence of parameter vectors updated iteratively according to:
\[
    \PARAM_{t+1} = \PARAM_t - \eta_t\left[\frac{\widehat{\partial \LOSS}}{\partial \PARAM}\right]^T
\]
where $\eta_t > 0$ is the learning rate and $[\widehat{\partial \LOSS}/\partial \PARAM]$ denotes a stochastic estimate of the loss gradient.
\end{definition}

Consider the true gradient of our loss function $\LOSS$ averaged over a dataset of $n$ examples:
\[
    \left[\frac{\partial \LOSS}{\partial \PARAM}\right] = \frac{1}{n}\sum_{i=1}^n \left[\frac{\partial \LOSS_i}{\partial \PARAM}\right]
\]
Computing this exactly requires a full pass through the dataset, yet the sum's structure suggests a natural approximation. By sampling a mini-batch $\mathcal{B}$ of size $b \ll n$ uniformly at random, we obtain an unbiased estimate:
\[
    \left[\frac{\widehat{\partial \LOSS}}{\partial \PARAM}\right] = \frac{1}{b}\sum_{i\in\mathcal{B}} \left[\frac{\partial \LOSS_i}{\partial \PARAM}\right]
\]
This stochastic gradient provides the foundation for modern neural network training.
%
\begin{marginfigure}
{\em Historical Note:} The term ``gradient descent'' arose from early focus on scalar-valued functions whose derivatives naturally formed gradient vectors. Modern usage encompasses matrix derivatives, though the name persists.
\end{marginfigure}

\begin{lemma}[Mini-batch Properties]
\label{lem:minibatch}
Let $\sigma^2$ denote the variance of individual gradient estimates. The mini-batch gradient estimator satisfies:
\begin{enumerate}
    \item Unbiasedness: 
    $\displaystyle
    \mathbb{E}\left[\frac{\widehat{\partial \LOSS}}{\partial \PARAM}\right] 
    = \left[\frac{\partial \LOSS}{\partial \PARAM}\right]$
    \item Variance bound: 
    $\displaystyle
    \mathbb{V}\left[\frac{\widehat{\partial \LOSS}}{\partial \PARAM}\right] 
    =\mathbb{E}\left\|\left[
    \frac{\widehat{\partial \LOSS}}{\partial \PARAM}\right] - \left[\frac{\partial \LOSS}{\partial \PARAM}
    \right]\right\|_F^2 
    \leq \frac{\sigma^2}{b}$
\end{enumerate}
where $\|\cdot\|_F$ denotes the Frobenius norm.
\end{lemma}

\begin{example}[Binary Classification]
\label{ex:sgd-binary}
Consider training a simple network to classify points in $\R^2$ using the logistic loss. Given parameters $\vect{w}\in\R^2$ and $c\in\R$, the network computes:
\[
    p(x) = \frac{1}{1+e^{-(\vect{w}^T\vect{x} + c)}}
\]
For a dataset of points $\{(\vect{x}_i,y_i)\}_{i=1}^n$ with $y_i\in\{0,1\}$, the loss becomes:
\[
    \LOSS(\vect{w},c) = -\frac{1}{n}\sum_{i=1}^n \left[y_i\log p(\vect{x}_i) + (1-y_i)\log(1-p(\vect{x}_i))\right]
\]

With mini-batch size $b=2$, each iteration:
\begin{enumerate}
    \item Samples two points $i,j$ uniformly at random
    \item Computes predictions $p(\vect{x}_i), p(\vect{x}_j)$
    \item Updates parameters using averaged gradient:
\[
    \begin{array}{rcl}
    \vect{w}_{t+1} &=& \vect{w}_t - \frac{1}{2}\eta_t\sum_{k\in\{i,j\}} (p(\vect{x}_k)-y_k)\vect{x}_k \\
    c_{t+1} &=& c_t - \frac{1}{2}\eta_t\sum_{k\in\{i,j\}} (p(\vect{x}_k)-y_k)
    \end{array}
\]
\end{enumerate}
The stochastic updates prove far more efficient than full-batch computation while maintaining convergence.
\end{example}

Like the low-rank approximations studied in Chapter \ref{ch:12}, mini-batch derivatives trade accuracy for computational efficiency. The connection to matrix conditioning emerges through momentum methods, which transform first-order updates into an approximation of second-order information:
\[
    \begin{array}{rcl}
    \vect{m}_t &=& \beta\vect{m}_{t-1} + (1-\beta)\left[\frac{\widehat{\partial \LOSS}}{\partial \PARAM}\right]^T \\
    \PARAM_{t+1} &=& \PARAM_t - \eta_t\vect{m}_t
    \end{array}
\]
where $\beta\in[0,1)$ controls momentum and $\vect{m}_0=\vect{0}$. This averaging of derivatives reduces variance while accelerating convergence, particularly when optimization surfaces resemble the elongated valleys characteristic of ill-conditioned systems.
%
\begin{marginfigure}
{\em Nota bene:} Like the iterative methods of Chapter \ref{ch:9}, momentum accumulates information across steps to accelerate convergence. The parameter $\beta=0.9$ proves effective in many applications.
\end{marginfigure}

In order to prove results on the convergence of SGD, a few more technical terms will need to be deployed. These are not essential to our story, but are included for precision.
%
\begin{definition}[Smoothness and Strong Convexity]
\label{def:smooth-convex}
A differentiable function $f:\R^n\to\R$ is:
\begin{enumerate}
    \item \style{$L$-smooth} if its gradient is Lipschitz continuous with parameter $L>0$:
    \[
        \|\nabla f(\vect{x}) - \nabla f(\vect{y})\| \leq L\|\vect{x}-\vect{y}\|
        \quad\text{for all }\vect{x},\vect{y}\in\R^n
    \]
\begin{marginfigure}
    {\em Ouch!} Yeah, you might need more math if you want to learn AI for real
\end{marginfigure}
    \item \style{$\mu$-strongly convex} if for some $\mu>0$:
    \[
        f(\vect{y}) \geq f(\vect{x}) + \nabla f(\vect{x})^T(\vect{y}-\vect{x}) + \frac{\mu}{2}\|\vect{y}-\vect{x}\|^2
        \quad\text{for all }\vect{x},\vect{y}\in\R^n
    \]
\end{enumerate}
In effect, $L$-smoothness provides an upper bound on how quickly the gradient can change, while $\mu$-strong convexity ensures a minimum amount of curvature in all directions.
\end{definition}
%
\begin{marginfigure}
{\em Nota bene:} These conditions relate directly to matrix conditioning: for quadratic functions $f(\vect{x})=\frac{1}{2}\vect{x}^TA\vect{x}$, the ratio $L/\mu$ equals the condition number of $A$.
\end{marginfigure}

\begin{theorem}[SGD Convergence]
\label{thm:sgd-convergence}
Let $\LOSS$ be $\mu$-strongly convex and $L$-smooth. For learning rate schedule $\eta_t = \frac{2}{\mu(t+1)}$ and mini-batch size $b$, stochastic gradient descent converges in expectation:
\[
    \mathbb{E}[\|\PARAM_t - \PARAM^*\|^2] \leq \frac{4L}{b\mu^2(t+1)} \max\{\|\PARAM_0 - \PARAM^*\|^2, \sigma^2\} = O\left(\frac{1}{t}\right)
\]
where $\PARAM^*$ denotes the optimal parameters and $\sigma^2$ bounds the variance of individual gradients. For non-convex losses typical in deep learning, convergence to local minima occurs under suitable regularity conditions.
\end{theorem}
%
\begin{marginfigure}
{\em Interpretation:} The error decreases at least as fast as $1/t$, with the rate controlled by the problem's condition number $L/\mu$ and the gradient noise level $\sigma^2/b$.
\end{marginfigure}
%

Modern practice suggests several refinements of these basic principles:
\begin{enumerate}
    \item Gradient clipping to bound update magnitudes:
    \item Learning rate schedules that decrease over time:
    \item Layer-wise adaptive rates for deep networks
    \item Regularization through stochastic noise addition
\end{enumerate}
Each refinement addresses specific challenges in optimization while maintaining the core idea of stochastic approximation.
\begin{marginfigure}
{\em Historical Note:} The development of adaptive methods like Adam and RMSprop in the 2010s transformed deep learning by making training more robust to hyperparameter choices, effectively learning separate learning rates for each parameter.
\end{marginfigure}

The remarkable effectiveness of stochastic descent emerges from deep mathematical principles. Like the least squares methods of Chapter \ref{ch:6}, which approximate optimal solutions through tractable computation, mini-batch derivatives provide suitable directions for optimization. The noise inherent in these estimates often proves beneficial, helping networks escape poor local minima while providing implicit regularization. This fusion of stochastic approximation with classical optimization transforms neural network training from theoretical possibility to practical reality.
% ==============================================
\section{Attention \& Transformers}
\label{sec:attention}
% ==============================================

The synthesis of linear algebra into intelligence reaches its fullest expression in the attention mechanism of transformer architectures. Like the divine smith measuring ethereal rays with compass and scale, attention introduces a profound innovation: matrices whose entries themselves emerge dynamically from data. Each element becomes not merely a fixed weight but a learned relationship  --  a measure of relevance computed through systematic inner products. This mechanism  --  elegant in conception yet powerful in application  --  transforms the fixed weights studied throughout this text into adaptive transformations that focus computation where needed.

The intuition behind attention emerges naturally from information retrieval systems. Consider a library where each book has both descriptive metadata (title, subjects, keywords) and actual content. A user's search query must somehow match against the metadata to determine which books' content to examine and combine. This process  --  of computing relevance scores between a query and many possible keys, then using these scores to combine values  --  provides the template for neural attention.

\begin{marginfigure}
{\em Think:} The query-key-value paradigm generalizes how we naturally organize information. A query expresses what we seek, keys provide searchable descriptors, while values hold actual content to be retrieved.
\end{marginfigure}

The mathematics begins with three types of learned projections that transform input vectors into representations serving distinct roles. Given input vectors $\{\vect{x}_1,\ldots,\vect{x}_n\}$, we first project each into a query vector representing what that position is looking for, a key vector encoding what it offers to others, and a value vector containing its actual content. These roles emerge through learned matrices $W_Q$, $W_K$, and $W_V$ respectively:
\[
    Q = XW_Q, \quad K = XW_K, \quad V = XW_V
\]
where $X$ stacks the input vectors as rows. Like the coordinate transformations studied in Chapter \ref{ch:4}, these projections align input features with natural axes for the attention computation  --  but now these axes themselves emerge through learning.

\begin{definition}[Attention Mechanism]
\label{def:attention}
Given input matrix $X\in\R^{n\times d}$, the \style{attention mechanism} computes output $Y$ through three stages:
\begin{equation}
    A = QK^T, \quad \tilde{A}_{ij} = \frac{\exp(a_{ij}/\sqrt{d_k})}{\sum_{j=1}^n \exp(a_{ij}/\sqrt{d_k})}, \quad Y = \tilde{A}V
\end{equation}
where $d_k$ denotes the dimension of query/key space. The exponential normalization ensures positive weights summing to one while sharpening focus on strong matches.
\end{definition}

\begin{marginfigure}
{\em Historical Note:} The attention mechanism emerged from neural machine translation in 2015, but its true power became apparent only with the transformer architecture in 2017. 
\end{marginfigure}

Consider how this mechanism processes natural language. In the line ``Tyger, tyger, burning bright,'' each word's representation passes through the attention computation. The query projection learns to capture what context each word needs  --  ``burning'' might query for objects it modifies. The key projection learns what context each word provides  --  ``tyger'' develops a strong key signal as the subject. The value projection learns to encode meaning that can be usefully combined  --  perhaps capturing that ``bright'' modifies the burning quality. The attention weights then determine how these meanings flow between positions.

The remarkable effectiveness of this mechanism emerges from its fusion of classical and adaptive computation. Though built from familiar operations  --  matrix multiplication and normalization  --  their careful composition creates a transformation that adapts to its input. The core matrix product $QK^T$ measures relevance between all pairs of positions through their inner products, while the normalization ensures these scores form proper attention distributions.

\begin{theorem}[Attention Properties]
\label{thm:attention}
The attention mechanism satisfies:
\[
    \text{row-stochastic:}\quad \sum_{j=1}^n \tilde{a}_{ij} = 1 \quad\text{and}\quad \tilde{a}_{ij} \geq 0
\]
\[
    \text{permutation equivariant:}\quad \pi(Y) = \text{Attention}(\pi(X))
\]
for any permutation $\pi$. The computation requires $O(n^2\max\{d_k,d_v\})$ operations and $O(n^2)$ memory.
\end{theorem}

Rather than compute a single attention pattern, transformer architectures typically employ multiple attention ``heads'' operating in parallel. Each head learns different query/key/value projections, allowing the network to capture multiple types of relationship simultaneously. A learned output transformation $W_O$ then combines these parallel streams:
\[
    \text{MultiHead}(X) = W_O\begin{bmatrix}
    \text{head}_1 \\
    \vdots \\
    \text{head}_h
    \end{bmatrix}
\]

\begin{marginfigure}
{\em Think:} Like the parallel paths in residual networks or the multiple singular vectors of SVD, multiple attention heads allow the network to capture different aspects of relationship simultaneously.
\end{marginfigure}

The complete transformer architecture synthesizes these attention blocks with several innovations that echo concepts from earlier chapters. Layer normalization stabilizes computation like the column scaling of Chapter \ref{ch:1}. Residual connections enable gradient flow as in Chapter \ref{ch:9}. Position encodings add sequence order through trigonometric functions, while feed-forward layers process attention outputs through standard linear transformations. Each component builds on fundamental principles developed throughout this text, yet their combination transcends the sum of parts.

The computational demands of attention grow quadratically with sequence length, as computing all key-query interactions requires matrix multiplication $QK^T\in\R^{n\times n}$. Modern implementations address this through sparse attention patterns, clever matrix multiplication algorithms, and architectural innovations. Yet the fundamental operation remains matrix multiplication  --  the same operation that has guided our development from Chapter \ref{ch:1} onward.

The attention mechanism exemplifies how classical mathematics transforms into modern artificial intelligence. Though built from matrices and inner products, it introduces a new level of dynamism where transformations adapt to their inputs. This fusion of timeless mathematical principles with learned adaptation has driven remarkable advances in natural language processing, computer vision, and other domains. Attention unites the precision of linear algebra with the flexibility demanded by intelligence.

% ==============================================
\section{Representation Learning}
\label{sec:representation}
% ==============================================

While earlier sections revealed how networks learn to approximate functions, deeper insight emerges from understanding how they discover representations that make such approximation natural. This process  --  of learning optimal feature spaces rather than merely fitting functions
\begin{marginfigure}
{\em Foreshadowing:} The evolution from hand-crafted to learned representations parallels our journey from analytical matrix factorizations to trained neural networks.
\end{marginfigure}  --  connects the abstract machinery of previous chapters to the remarkable capabilities of modern artificial intelligence. Like the singular vectors that emerged naturally from matrix structure in Chapter \ref{ch:10}, learned representations capture fundamental patterns in data, yet transcend linearity through careful composition of transformations.

Consider first how classical feature engineering differs from learned representations. A hand-crafted feature extractor applies fixed transformations based on domain knowledge: edge detectors for images, frequency analysis for signals, parse trees for text. Though informed by expertise, such features remain limited by human intuition. Neural networks transcend this limitation by learning their own transformations, discovering representational spaces optimally suited to their tasks through end-to-end training.

\begin{definition}[Representation Space]
\label{def:repspace}
A \style{representation space} $\mathcal{H}$ for data $\mathcal{X}$ is a vector space equipped with a learned transformation $f:\mathcal{X}\to\mathcal{H}$ that maps inputs to features. In a deep network with $L$ layers, each hidden layer defines such a space $\mathcal{H}_\ell$ through its learned weights and nonlinearity:
\[
    \mathcal{H}_\ell = \{\vect{h}_\ell = \sigma_\ell(W_\ell\vect{h}_{\ell-1} + \vect{b}_\ell) : \vect{h}_{\ell-1} \in \mathcal{H}_{\ell-1}\}
\]
where $W_\ell,\vect{b}_\ell$ are learned parameters and $\sigma_\ell$ provides nonlinearity.
\end{definition}

This learned structure connects deeply to the \style{manifold hypothesis}  --  the principle that high-dimensional data often concentrates near lower-dimensional nonlinear surfaces. While the PCA methods of Chapter \ref{ch:11} discovered optimal \textit{linear} projections of data, deep networks learn \textit{nonlinear} embeddings through composition of transformations. Each layer's weights define a mapping:
\begin{marginfigure}
    {\em Think:} Every image you have ever seen lives in a high-dimensional subdomain of image space whose complement is of even higher dimension.
\end{marginfigure}
\[
    \vect{h}_\ell = \sigma(W_\ell\vect{h}_{\ell-1} + \vect{b}_\ell)
\]
that gradually transforms inputs into more structured representations. The nonlinear functions $\sigma$ allow these mappings to follow curved manifolds, while skip connections and attention mechanisms provide shortcuts through representation space.

\begin{lemma}[Representation Decomposition]
\label{lem:repdecomp}
Let $f_\ell$ denote the transformation implemented by layer $\ell$ of a neural network. Each $f_\ell$ admits decomposition:
\[
    f_\ell = \sigma_\ell \circ \tilde{f}_\ell
\]
where $\tilde{f}_\ell$ is linear and $\sigma_\ell$ provides nonlinearity. The full network learns representations by composing such transformations:
\[
    f = f_L \circ f_{L-1} \circ \cdots \circ f_1
\]
Moreover, each linear component $\tilde{f}_\ell$ induces a decomposition of its domain through the fundamental subspaces:
\[
    \mathcal{H}_{\ell-1} = \ker(\tilde{f}_\ell) \orthosum (\ker \tilde{f}_\ell)^\perp
\]
The network learns both the transformations and their associated subspace decompositions through training.
\begin{marginfigure}
{\em Nota bene:} The fundamental subspaces of Chapter \ref{ch:3} reappear naturally in each layer's learned transformations, now discovered through optimization rather than analysis.
\end{marginfigure}
\end{lemma}

\begin{example}[Visual Representations]
\label{ex:visual}
Consider how convolutional networks transform image data through their layers. Early representations capture local patterns like edges and textures  --  features reminiscent of hand-designed filters. Deeper layers learn increasingly abstract features:
\begin{itemize}
    \item Layer 1: Edges and orientations
    \item Layer 2: Textures and simple shapes
    \item Layer 3: Object parts and complex patterns
    \item Layer 4: Complete objects and scenes
\end{itemize}
This hierarchy emerges naturally through training, with each layer learning representations that make the next layer's task simpler. A precise mathematical pattern underlies this emergence: each layer's weights $W_\ell$ implement linear maps whose singular value decomposition reveals learned feature hierarchies:
\[
    W_\ell = U_\ell\Sigma_\ell V_\ell^T
\]
The right singular vectors $V_\ell$ capture input patterns while left singular vectors $U_\ell$ provide the transformed representation basis.
\begin{marginfigure}
{\em Example:} A face detector might emerge when $\vect{v}_i$ captures facial features and $\vect{u}_i$ provides their transformed representation  --  like singular vectors aligning with natural patterns in data.
\end{marginfigure}
\end{example}

The power of learned representations becomes clear through the lens of Chapter \ref{ch:12}'s approximation theory. Just as low-rank matrix factorizations capture dominant patterns in data, deep networks learn features that make complex tasks approximately linear in transformed space. This principle  --  that good representations linearize otherwise complex problems  --  appears repeatedly in modern architectures:

\begin{theorem}[Representation Linearization]
\label{thm:replinear}
Let $f:\mathcal{X}\to\mathcal{Y}$ be a smooth function between manifolds, and let $\{\vect{h}_\ell\}_{\ell=1}^L$ be the sequence of representations learned by a deep network trained to approximate $f$. Then under suitable regularity conditions:
\begin{marginfigure}
{\em BONUS!} This progressive linearization connects to universal approximation theory: deep networks can represent any continuous function by gradually straightening its graph through learned transformations.
\end{marginfigure}
\begin{enumerate}
    \item The representation error $\|\vect{h}_\ell - \vect{h}_{\ell-1}\|$ decreases with depth
    \item The nonlinearity of mappings between successive layers decreases
    \item The final layers implement approximately linear transformations
\end{enumerate}
\end{theorem}

\begin{example}[Word Embeddings]
\label{ex:word2vec}
The representation of words as vectors provides a striking example of learned structure. Rather than encoding words through hand-designed features like part of speech or semantic categories, modern language models learn embeddings directly from co-occurrence patterns in text. A word's representation emerges from a learned linear transformation $W\in\R^{d\times v}$ mapping one-hot encodings to dense vectors:
\[
    \vect{e}_w = W\vect{x}_w
\]
where $\vect{x}_w\in\R^v$ denotes the one-hot encoding of word $w$ and $d\ll v$ is the embedding dimension. The rows of $W$ form a learned basis capturing semantic relationships:
\begin{enumerate}
    \item Similar words cluster together in embedding space
    \item Differences between vectors encode analogies
    \item Projection onto principal components reveals semantic fields
\end{enumerate}
The embedding matrix $W$ effectively learns a low-dimensional manifold that smoothly captures linguistic structure.
\begin{marginfigure}
{\em Caveat:} The low-dimensional constraint $d \ll v$ is crucial  --  it forces the network to discover efficient representations rather than memorizing surface patterns.
\end{marginfigure}
\end{example}

The study of representation learning connects naturally to the statistical perspective developed in Chapter \ref{ch:11}. Just as principal components maximized explained variance in data, learned representations optimize implicit objectives through their architecture and training. Yet unlike the unique optimal projections of PCA, learned representations may vary between training runs or architectural choices. This variability often proves beneficial, as different random initializations can discover complementary feature spaces.

More formally, consider how a deep network transforms its input space through successive representations. Each layer defines a mapping $f_\ell:\mathcal{H}_{\ell-1}\to\mathcal{H}_\ell$ between hidden spaces, culminating in some final representation $\mathcal{H}_L$. The network's power lies not in any individual transformation but in how this sequence of mappings gradually builds structure. Each layer shapes and refines its input until complex patterns emerge from simple foundations.

The mathematics of representation learning reveals a profound principle: intelligence emerges not from raw computation but from learned transformations that render complex patterns simple. This insight  --  that good representations make hard problems easy  --  guides our development toward the final synthesis of linear algebra into artificial intelligence. The next section explores how these learned structures ultimately connect back to the fundamental theorem first encountered in Chapter \ref{ch:3}, completing our journey from classical mathematics to modern AI.

% ==============================================
\section{Deep Linear Algebra}
\label{sec:synthesis}
% ==============================================

The mathematics of intelligence reveals itself through fundamental patterns. Though neural networks appear to transcend the structures developed in earlier chapters, their deepest architectures echo and amplify the fundamental theorem first encountered in Chapter \ref{ch:3}. At each layer, every transformation between learned representations preserves the essential decomposition that has guided our entire development. This unity  --  of classical decomposition with learned structure  --  marks the final transformation of our mathematical journey.

Consider how neural networks shape their representational spaces. Each layer implements not merely a linear transformation followed by nonlinearity, but a learned decomposition reflecting the kernel-image relationship of the Fundamental Theorem. When a layer with weight matrix $W$ maps $\R^n$ to $\R^m$, it induces two complementary splittings:
\[
    \R^n = \ker(W) \orthosum (\ker W)^\perp \quad\text{and}\quad \R^m = \im(W) \orthosum (\im W)^\perp
\]
Unlike the fixed decompositions studied in previous chapters, these spaces emerge through training  --  the network discovers which directions to preserve and which to collapse. This learned adaptation explains why neural networks often capture patterns that elude hand-designed features: they align their fundamental decompositions with natural structure in data.

\begin{example}[Learned Decompositions]
Consider an autoencoder compressing data through a narrow hidden layer. The encoder $E:\R^n\to\R^k$ and decoder $D:\R^k\to\R^n$ ($k \ll n$) implement dimension reduction through learned transformations whose fundamental spaces acquire profound meaning:
\begin{itemize}
    \item The \style{kernel} $\ker(E)$ captures precisely those features deemed irrelevant through training
    \item The \style{image} $\im(E)$ provides the learned representation  --  an optimal $k$-dimensional summary
    \item The \style{cokernel} $\coker(E)$ measures geometric reconstruction loss
    \item The \style{coimage} $\coim(E)$ reveals the effective feature space modulo irrelevant variation
\end{itemize}
The network discovers these spaces by minimizing reconstruction error while respecting the orthogonal structure guaranteed by the Fundamental Theorem. Each space emerges not through mathematical prescription but through learned adaptation to data.
\end{example}

This principle  --  that neural networks learn task-appropriate versions of our fundamental decompositions  --  appears throughout modern architectures. In convolutional networks, each layer's kernel captures local patterns deemed irrelevant while its image preserves task-relevant features. Attention mechanisms implement learned equivalence relations, effectively constructing adaptive quotient spaces by identifying similar tokens. Residual connections create shortcuts through these decompositions, providing direct paths for information flow while maintaining the underlying structure.

The four fundamental spaces that began as abstract concepts in Chapter \ref{ch:3} thus find their deepest expression in how neural networks learn and process information:
\begin{itemize}
    \item The \style{kernel} evolves from nullspace to learned invariance, capturing input variations that should be considered equivalent
    \item The \style{image} transforms from range to learned manifold, providing representations that meaningfully encode information
    \item The \style{cokernel} measures not just algebraic deficiency but learning capacity
    \item The \style{coimage} emerges as optimal quotient space, identifying features relevant to the task
\end{itemize}

Through training, these spaces align naturally with structure inherent in data. The kernel learns task-relevant invariances; the image constrains possible representations; the cokernel guides architecture design; while the coimage captures effective features. Each layer discovers its own version of these fundamental decompositions, transforming our abstract analysis into learned intelligence.

Our exploration of linear algebra concludes as it began: with the profound unity of seemingly distinct concepts. The Fundamental Theorem, which first revealed the complementary subspaces underlying all linear transformations, reaches its fullest expression in the adaptive decompositions of neural networks. In this light, artificial intelligence appears not as revolutionary rupture but as revolutionary refinement  --  the transformation of timeless mathematical principles into learned form through the structures developed in these pages. Like the singular values that emerged from matrix structure or the eigenspaces that captured asymptotic behavior, the fundamental spaces of linear algebra reveal themselves anew in the learned patterns of modern artificial intelligence.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% **************** EMANATION *******************
\small

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Convolutional Neural Networks}
\label{EM:cnn}
% **************** EMANATION *******************

The synthesis of human and machine vision reveals itself most profoundly in how neural networks process images. When humans look at a photograph, we perceive not mere pixels but meaningful patterns woven from relationships between shapes, textures, and edges. This deep structure --- the natural geometry of visual information --- guides the design of modern artificial intelligence. Convolutional neural networks achieve their remarkable power by encoding these perceptual principles directly into their architecture, transforming the dense matrix operations of standard networks into specialized transformations that mirror how both human and machine vision parse the visual world.

Consider first how images encode their information. A grayscale image of size $h\times w$ presents itself as a matrix of intensities, yet treating this as an arbitrary array discards crucial spatial relationships. When reshaped into a vector for neural network processing, a modest 256$\times$256 image becomes a point in $\R^{65536}$ --- a space of such high dimension that learning arbitrary transformations proves hopelessly inefficient. Yet the patterns we seek to discover --- the features that distinguish cats from dogs or tumors from healthy tissue --- depend on relationships between nearby pixels rather than arbitrary long-range connections.

This locality principle transforms into mathematical structure through specialized weight matrices that encode spatial relationships. Rather than learning arbitrary transformations, convolutional layers constrain their linear operations to respect the grid-like topology of visual data. A small filter $K\in\R^{k\times k}$ defines a local feature detector that examines each image region in turn --- a dramatic reduction in parameters from the $hw\times hw$ matrices of fully connected layers.

The power of this approach emerges from its fusion of biological inspiration with mathematical structure. Just as the visual cortex processes input through localized receptive fields, convolutional networks learn hierarchical feature detectors through carefully constrained linear transformations. Early layers typically discover simple patterns:
\begin{itemize}
    \item Edge detectors at various orientations
    \item Color contrast boundaries
    \item Local texture elements
\end{itemize}
These combine in later layers to represent progressively more complex features, each emerging from learned compositions of simpler patterns.

\begin{marginfigure}
{\em Historical Note:} Though inspired by biological vision, convolutional networks achieved practical success only after careful mathematical analysis revealed the importance of proper initialization and regularization. The breakthrough performance of AlexNet in 2012 demonstrated how theoretical insight enables practical engineering.
\end{marginfigure}

Modern architectures enhance this basic structure through innovations that echo ideas from matrix conditioning. Batch normalization layers standardize their inputs across spatial locations:
\[
    \hat{x}_{ijk} = \frac{x_{ijk} - \mu_k}{\sqrt{\sigma_k^2 + \epsilon}}
\]
where $k$ indexes feature channels. Like the scaling techniques that improved matrix conditioning in Chapter \ref{ch:1}, this operation stabilizes gradient flow during training. Similarly, residual connections create shortcuts that help networks optimize despite their great depth:
\[
    Y = X + \sigma(K_2 * \sigma(K_1 * X))
\]
This structure, reminiscent of the splitting studied in Chapter \ref{ch:3}, provides direct paths for gradient flow while maintaining the spatial structure enforced by convolution.

The remarkable effectiveness of convolutional networks emerges from this fusion of biological inspiration with mathematical principle. Like the low-rank approximations studied in this chapter, they achieve efficiency through careful constraint --- not by limiting expressive power, but by encoding known properties of visual data into their matrix operations. Early layers learn localized feature detectors:
\begin{itemize}
    \item Vertical and horizontal edge detectors
    \item Center-surround contrast filters
    \item Simple texture elements
\end{itemize}
Each represents a specialized linear transformation discovered through optimization yet constrained by architectural design to respect the structure of visual data.

These learned features often exhibit striking universality. Networks trained on natural images reliably discover similar early-layer features regardless of their ultimate task, suggesting these patterns reflect fundamental structure in visual data rather than task-specific artifacts. Later layers specialize more dramatically, but still show recognizable correspondence between networks trained on related tasks.

The success of convolutional networks carries profound implications for representation learning. Their ability to discover effective features automatically, rather than requiring hand-engineered transformations, suggests deeper principles about how structure emerges from data through constrained optimization. This theme --- that carefully designed architectures can learn natural representations --- appears throughout modern machine learning, from language models to protein structure prediction.

Yet challenges remain. The very architectural constraints that enable efficient learning can sometimes prove limiting:
\begin{itemize}
    \item Translation invariance may be undesirable
    \item Long-range dependencies get ignored
    \item Spatial structure may not match the data
\end{itemize}
Modern variants like transformers address these limitations through learned attention mechanisms, yet retain the core insight that architectural constraint enables efficient learning.

The mathematical principles underlying convolutional networks extend far beyond computer vision. Their fundamental insight --- that local structure matters and should be architecturally encoded --- has inspired specialized networks for many domains:
\begin{itemize}
    \item Graph convolution for molecule modeling
    \item Spherical convolution for climate data
    \item Temporal convolution for time series
\end{itemize}
Each adapts the basic mathematics to respect domain-specific structure while maintaining the core principle that architectural constraint enables efficient learning.

Beyond their practical impact, convolutional networks exemplify how classical mathematics transforms into modern machine learning through careful engineering. The linear transformations studied throughout this text provide the foundation, while thoughtful architectural constraints enable efficient learning of natural representations. As artificial intelligence continues its rapid advance, this principle --- that structure enables learning --- will likely prove increasingly valuable.

% \begin{marginfigure}
% {\em FIGURE:} [Illustration showing learned convolutional filters from different network layers, demonstrating how simple edge and texture detectors combine to form increasingly complex feature detectors.]
% \end{marginfigure}

% **************** EMANATION *******************
\emanation
% **************** EMANATION *******************
\section*{Large Language Models \& Algebraic Reasoning}
\label{EM:llm}
% **************** EMANATION *******************

The ultimate synthesis of linear algebra into intelligence reveals itself most profoundly in how vast neural networks process language. When humans read text, we perceive not mere sequences of words but intricate webs of meaning woven from grammar, context, and knowledge. This deep structure --- the natural geometry of linguistic information --- guides the design of modern language models. Through careful composition of attention mechanisms and learned representations, these systems transform the mathematics developed throughout this text into apparent understanding, achieving capabilities that would have seemed miraculous mere years ago.

Consider first how language encodes its meaning. Each word or subword token maps to a learned vector in high-dimensional space --- a point in $\R^d$ where typically $d\approx 1024$ or larger. Yet treating these embeddings as arbitrary vectors discards the profound structure of language. When processing the phrase ``the cat sat on the mat,'' a language model must somehow capture not just individual word meanings but their grammatical relationships, semantic roles, and broader context. The patterns we seek to model --- the features that distinguish questions from statements or detect subtle implications --- emerge from complex interactions between these representations.

This relational structure transforms into mathematical form through the attention mechanisms studied earlier in this chapter. Rather than applying fixed transformations, each layer computes dynamic relationships between all pairs of tokens:
\[
    A_{ij} = \frac{\vect{q}_i^T\vect{k}_j}{\sqrt{d}} \quad\text{and}\quad \tilde{A}_{ij} = \frac{\exp(A_{ij})}{\sum_k \exp(A_{ik})}
\]
The resulting attention weights $\tilde{A}$ implement learned equivalence relations, effectively constructing quotient spaces that group tokens by their contextual roles. Like the fundamental decompositions first encountered in Chapter \ref{ch:3}, these dynamic quotients organize information flow through the network.

The power of this approach emerges from its fusion of mathematical elegance with practical effectiveness. Just as the quotient spaces of earlier chapters revealed structure by identifying equivalent elements, attention mechanisms discover contextual patterns by learning which tokens should interact. Early layers typically capture surface properties:
\begin{itemize}
    \item Basic syntactic relationships
    \item Local word associations
    \item Simple referential patterns
\end{itemize}
These combine in later layers to represent increasingly abstract relationships, each emerging from learned compositions of simpler patterns.

\begin{marginfigure}
{\em Historical Note:} The term ``attention'' emerged from neural machine translation, where these mechanisms first showed their power. Their true potential became apparent only with the transformer architecture in 2017, leading directly to modern language models.
\end{marginfigure}

The sheer scale of modern language models --- with hundreds of billions of parameters --- reveals new principles beyond those visible in smaller systems. Like massive crystals whose microscopic structure gives rise to emergent properties, these networks develop capabilities that seem to transcend their basic components:
\begin{itemize}
    \item Few-shot learning from textual examples
    \item Zero-shot transfer to novel tasks
    \item Apparent reasoning and inference
\end{itemize}
Yet their foundation remains the matrix operations studied throughout this text --- attention computing relevance scores through inner products, feed-forward layers implementing learned transformations, layer normalization stabilizing gradients.

This emergence of intelligence from scale illuminates profound questions about representation and learning. The vectors and matrices processing each token encode not just word meanings but fragments of knowledge, reasoning patterns, and implicit skills. Like the singular vectors that emerged naturally from matrix structure in Chapter \ref{ch:10}, these learned representations capture fundamental patterns in language --- but now discovered through optimization rather than analysis.

Consider how a language model processes the prompt ``Complete the pattern: 2, 4, 6, ...''. Each token's embedding transforms through layers of attention and feed-forward computation:
\[
    \vect{h}_\ell = \text{FFN}_\ell(\text{Attention}_\ell(\vect{h}_{\ell-1})) + \vect{h}_{\ell-1}
\]
The residual connections, reminiscent of the splitting studied in Chapter \ref{ch:3}, allow information to flow directly while attention mechanisms extract relevant patterns. Somehow from these mathematical operations emerges the ability to recognize and continue sequences --- an apparently simple capability that requires sophisticated processing of both language and number concepts.

Yet challenges remain. The very architectural features that enable this emergence also impose limitations:
\begin{itemize}
    \item Attention scales quadratically with sequence length
    \item Models can produce plausible but incorrect responses
    \item The basis for their capabilities remains somewhat opaque
\end{itemize}
These limitations remind us that while current systems achieve remarkable results, they represent steps along a path rather than its endpoint.

The mathematical principles underlying language models extend far beyond text processing. Their fundamental insights --- that attention enables dynamic information flow, that scale enables emergence, that careful architecture design channels learning --- have inspired applications across domains:
\begin{itemize}
    \item Protein structure prediction
    \item Scientific discovery
    \item Code generation and analysis
\end{itemize}
Each adapts the basic mathematics to new contexts while maintaining the core principle that structured computation enables complex behavior.

Beyond their practical impact, large language models exemplify how classical mathematics transforms into modern artificial intelligence through careful engineering. The linear transformations studied throughout this text provide the foundation, while thoughtful architectural choices enable efficient learning of natural representations. As these systems continue their rapid advance, this principle --- that structure enables intelligence --- will likely prove increasingly valuable.

The journey from basic linear algebra to language models tracks our text's broader theme: how timeless mathematical principles transform into practical tools through careful development. These systems forge understanding from the raw material of computation. Their success suggests not that traditional mathematics has been superseded, but that it finds new expression through the structures we create. In this light, large language models appear not as breaks from classical principles but as their natural evolution --- the transformation of linear algebraic operations into apparent intelligence through principled composition at scale.


% **************** EMANATION *******************

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\exercises
\section*{Exercises: Chapter 13}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{enumerate}

\item Let $W\in\R^{m\times n}$ and $\vect{b}\in\R^m$ define a neural network layer computing $\vect{h} = \max\{0, W\vect{x} + \vect{b}\}$. Show that without the ReLU activation $\max\{0,\cdot\}$, this reduces to transformations from Chapter \ref{ch:3}. Then for $n=m=2$, explicitly describe the regions of input space where this transformation is linear, illustrating with a diagram.

\item For the softmax function $\sigma_i(\vect{x}) = e^{x_i}/\sum_{j=1}^n e^{x_j}$: Prove that outputs sum to 1 for any input $\vect{x}$; derive the derivative matrix entries $\partial \sigma_i/\partial x_j = \sigma_i(\delta_{ij} - \sigma_j)$; and explain why adding a constant to all inputs leaves outputs unchanged.

\item Consider layer normalization $\hat{\vect{h}} = \gamma(\vect{h} - \mu)/\sqrt{\sigma^2 + \epsilon} + \beta$ where $\mu,\sigma^2$ are activation mean/variance and $\gamma,\beta$ are learned parameters. Show the output has mean $\beta$ and variance $\gamma^2$, relating this to the column scaling of Chapter \ref{ch:1}. How does this help control weight matrix condition numbers?

\item For a word embedding matrix $W\in\R^{d\times v}$ mapping vocabulary to $d$-dimensional vectors: Prove its row space dimension is at most $d$; connect this to Chapter \ref{ch:12}'s low-rank approximations; and explain what similar contexts imply about corresponding rows of $W$.

\item The residual connection $\vect{h}_{\text{out}} = \vect{h}_{\text{in}} + F(\vect{h}_{\text{in}})$ adds input directly to transformed output. Express $\partial \vect{h}_{\text{out}}/\partial \vect{h}_{\text{in}}$ in terms of $F$'s Jacobian and explain how this helps deep network training, connecting to the conditioning concepts of Chapter \ref{ch:4}.

\item Let $E:\R^n\to\R^k$ and $D:\R^k\to\R^n$ be an autoencoder's encoding/decoding maps with $k < n$. Identify $E$'s four fundamental subspaces, explaining how $\ker(E)$ relates to discarded information and $\coker(E)$ to reconstruction error. Connect to the theory developed in Chapter \ref{ch:3}.

\item For attention weights $A_{ij} = \exp(\vect{q}_i^T\vect{k}_j/\sqrt{d})/\sum_l \exp(\vect{q}_i^T\vect{k}_l/\sqrt{d})$: Prove each row sums to 1; analyze behavior as $d\to 0$ and $d\to\infty$; and relate to the stochastic matrices from Chapter \ref{ch:9}.

\item Multi-head attention combines $h$ parallel computations through $\text{MultiHead}(Q,K,V) = W_O[\text{head}_1;\ldots;\text{head}_h]$. Show this reduces to standard attention when $h=1$ and explain why multiple heads might better capture different relationship types. Connect to how SVD's singular vectors capture different patterns.

\item Consider a neural network applying $L$ layers of $\vect{h}_{\ell+1} = \sigma(W_\ell\vect{h}_\ell + \vect{b}_\ell)$. Show that without activations this reduces to matrix multiplication; explain how nonlinearity enables curved manifold approximation; and connect to representation learning from Section \ref{sec:representation}.

\item (Challenge) For a transformer using positional encodings $\{\vect{p}_1,\ldots,\vect{p}_n\}$ with sequence $\{\vect{x}_1,\ldots,\vect{x}_n\}$: Prove attention can detect relative positions through inner products; demonstrate how sinusoidal encodings facilitate this; and connect to Chapter \ref{ch:5}'s orthogonality concepts.

\item (Challenge) Let $f$ be a neural network with more parameters than training points $\{(\vect{x}_i,y_i)\}_{i=1}^n$, trained to minimize mean squared error. Show it can achieve zero training error by analyzing the associated Jacobian's rank. Relate this to the universal approximation theorem from Section \ref{sec:beyond}.

\end{enumerate}

\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\backmatter

\clearpage
\thispagestyle{empty} % no headers/footers

\begin{fullwidth}
  \vspace*{\fill} % push content down to center vertically
  \centering
  \includegraphics[width=0.5\textwidth]{OPLUS.png} % adjust width as needed
  \vspace*{\fill} % push content up to center vertically
  
{\em
\& these again surrounded by 

Four Wonders of the Almighty Incomprehensible

Pervading all amidst \& round about

Fourfold each in the other reflected 

They are named Life's in Eternity

Four Starry Universes going forward 

From Eternity to Eternity
}
\vspace*{\fill} % push content up to center vertically
\end{fullwidth}


\cleardoublepage


\begin{fullwidth}
%
\thispagestyle{empty} % no headers/footers
%
\vspace*{\fill} % push content down to center 

\begin{center}
{\bf ABOUT THE AUTHOR}

\vspace{1in}

\noindent
Robert Ghrist (Ph.D., Cornell, Applied Mathematics, 1995)

is the Andrea Mitchell PIK Professor of Mathematics and 

Electrical \& Systems Engineering at the University of Pennsylvania. 

He is a recognized leader in the field of Applied Algebraic Topology, 

working in networks, robotics, signal processing, data analysis, 

optimization, and more. He is an award-winning researcher, 

teacher, and expositor of Mathematics and its applications, 

currently serving as the Associate Dean of Undergraduate Education

in the School of Engineering \& Applied Sciences
at Penn.
\vspace{0.75in}

\noindent
He is the author of several books, such as: 

{\em Elementary Applied Topology} and the {\em Calculus Blue Guide}; 

as well as the creator of YouTube video series, including 

{\em
Calculus BLUE,
Calculus GREEN, \&
Applied Dynamical Systems}. 
%See {\tt https://www.youtube.com/c/ProfGhristMath}

\vspace{0.75in}

\noindent
In his spare time
he publishes mathematical art 

and animation
under the moniker {\em colimit}.

\end{center}

\vspace*{\fill} % push content up to center vertically
\end{fullwidth}


\clearpage

\cleardoublepage

\end{document}