% =========================================================
% Chapter 5: Neural Network Training
% =========================================================
\chapter{Neural Network Training}

%----------------------------------------------------------
\section{A Three-Step Training Framework}

Whether fitting a two-parameter logistic regression or a million-parameter deep network, the workflow decomposes into exactly three conceptual steps:

\begin{center}
\begin{tabular}{clll}
\toprule
\textbf{Step} & \textbf{Goal} & \textbf{Logistic Regression} & \textbf{Neural Network (TF)}\\
\midrule
1 & Define $\vec{x}\to\hat{y}$ & $f=\sigma(\vec{w}\cdot\vec{x}+b)$ & \texttt{Sequential([Dense(...)])}\\
2 & Measure error & Logistic loss & \texttt{model.compile(loss=...)}\\
3 & Minimise error & Gradient descent & \texttt{model.fit(X,y,epochs=...)}\\
\bottomrule
\end{tabular}
\end{center}

\subsection{Step 1 --- Architecture}

\begin{lstlisting}[caption={Binary digit classifier: 3-layer network}]
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(units=25, activation='sigmoid'),  # Hidden layer 1
    Dense(units=15, activation='sigmoid'),  # Hidden layer 2
    Dense(units=1, activation='sigmoid'),   # Output layer
])
\end{lstlisting}

TensorFlow initialises all weight matrices and bias vectors automatically.

\subsection{Step 2 --- Compile: Specify the Loss Function}

\subsubsection{Binary Cross-Entropy Loss}
For $(\vec{x},y)$ with $y\in\{0,1\}$:
\begin{equation}
L\bigl(f,y\bigr)=-y\log f-(1-y)\log(1-f).
\end{equation}
Cost over all $m$ examples:
\begin{equation}
J(\mathbf{W},\mathbf{B})=\frac{1}{m}\sum_{i=1}^m L\!\bigl(f(\vec{x}^{(i)}),y^{(i)}\bigr).
\end{equation}

\begin{lstlisting}
from tensorflow.keras.losses import BinaryCrossentropy
model.compile(loss=BinaryCrossentropy())
\end{lstlisting}

\subsubsection{Mean Squared Error (Regression)}
\begin{lstlisting}
from tensorflow.keras.losses import MeanSquaredError
model.compile(loss=MeanSquaredError())
\end{lstlisting}

\subsection{Step 3 --- Train: Backpropagation}
To minimise $J$, every parameter in every layer is updated iteratively:
\begin{align}
w_j^{[l]}&\leftarrow w_j^{[l]}-\alpha\,\pd{J}{w_j^{[l]}},\\
b_j^{[l]}&\leftarrow b_j^{[l]}-\alpha\,\pd{J}{b_j^{[l]}}.
\end{align}
Computing these partial derivatives across all layers is \textbf{backpropagation} (Section~\ref{sec:backprop}).

\begin{definition}[Epoch]
One \textbf{epoch} is a single complete pass of gradient descent over the entire training set.
\end{definition}

\begin{lstlisting}
model.fit(X, Y, epochs=100)
\end{lstlisting}

%----------------------------------------------------------
\section{Activation Functions}

\subsection{Why Not Just Use Sigmoid Everywhere?}
Using sigmoid in every hidden layer causes two problems:
\begin{enumerate}
  \item \textbf{Range constraint.} Sigmoid outputs lie in $(0,1)$, but hidden-layer activations may represent quantities with unbounded range.
  \item \textbf{Vanishing gradients.} At large $|z|$, $g'(z)\approx 0$, causing gradients to shrink exponentially during backpropagation through many layers.
\end{enumerate}
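To see the second problem concretely, the sketch below is a standalone NumPy illustration (the values of $z$ are arbitrary and not taken from any model above): it evaluates $g'(z)=g(z)\bigl(1-g(z)\bigr)$ at a few points and shows how a product of such small factors across many saturated layers collapses towards zero.

\begin{lstlisting}[caption={Sigmoid saturation: a standalone NumPy illustration}]
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_grad(z):
    s = sigmoid(z)
    return s * (1.0 - s)          # g'(z) = g(z)(1 - g(z))

for z in [0.0, 2.0, 5.0, 10.0]:   # arbitrary example values
    print(z, sigmoid_grad(z))     # 0.25, 0.105, 0.0066, 4.5e-05

# Chaining 10 saturated sigmoid layers multiplies ten such factors:
print(sigmoid_grad(5.0) ** 10)    # ~1.6e-22 -- the gradient has vanished
\end{lstlisting}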
\subsection{Catalogue of Common Activation Functions}

\subsubsection{Sigmoid}
\begin{equation}
g(z)=\frac{1}{1+e^{-z}},\quad g(z)\in(0,1).
\end{equation}
Natural choice for binary classifier output.

\subsubsection{ReLU (Rectified Linear Unit)}
\begin{equation}
g(z)=\max(0,z),\quad g(z)\in[0,\infty).
\end{equation}
The \textbf{default for hidden layers} in modern deep learning. Advantages:
\begin{itemize}
  \item \textbf{Computationally cheap}: a single comparison, no exponentiation.
  \item \textbf{Non-vanishing gradients}: for $z>0$ the gradient is exactly $1$.
\end{itemize}

\subsubsection{Linear (Identity)}
\begin{equation}
g(z)=z,\quad g(z)\in(-\infty,+\infty).
\end{equation}
Appropriate \emph{only} in the output layer for regression (any-sign $y$).

\begin{figure}[h]
\centering
\begin{tikzpicture}
\begin{axis}[
    width=0.88\textwidth, height=5.8cm,
    xlabel={$z$}, ylabel={$g(z)$},
    xmin=-5, xmax=5, ymin=-1.2, ymax=3.5,
    xtick={-4,-2,0,2,4},
    legend pos=north west,
    legend style={font=\small, fill=white},
    grid=both, grid style={gray!15, thin},
    axis lines=left,
    every axis plot/.append style={line width=1.4pt}]
  \addplot[pBlue, domain=-5:5, samples=200]{1/(1+exp(-x))};
  \addlegendentry{Sigmoid $\sigma(z)$}
  \addplot[pRed, domain=-5:0, samples=2]{0};
  \addplot[pRed, domain=0:5, samples=2, forget plot]{x};
  \addlegendentry{ReLU $\max(0,z)$}
  \addplot[pGreen, domain=-1.2:3.5, samples=2, dashed]{x};
  \addlegendentry{Linear $z$}
  \draw[pGray, dashed, thin] (axis cs:0,-1.2)--(axis cs:0,3.5);
\end{axis}
\end{tikzpicture}
\caption{Comparison of the three most common activation functions.}
\label{fig:activations}
\end{figure}

\subsection{Choosing the Right Activation}

\subsubsection{Output Layer --- Determined by Task}
\begin{center}
\begin{tabular}{llll}
\toprule
\textbf{Task} & \textbf{Target $y$} & \textbf{Activation} & \textbf{Range}\\
\midrule
Binary classification & $y\in\{0,1\}$ & Sigmoid & $(0,1)$\\
Regression (any sign) & $y\in\R$ & Linear & $(-\infty,+\infty)$\\
Regression ($\ge 0$) & $y\ge 0$ & ReLU & $[0,+\infty)$\\
\bottomrule
\end{tabular}
\end{center}

\subsubsection{Hidden Layers --- ReLU as Default}
\begin{lstlisting}[caption={Recommended: ReLU hidden layers with sigmoid output}]
model = Sequential([
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
\end{lstlisting}

\subsection{The Necessity of Non-Linearity}
What happens if we use \emph{linear} activations everywhere?

\paragraph{Mathematical proof (one hidden unit).}
\begin{align*}
a^{[1]}&=w_1 x+b_1,\\
a^{[2]}&=w_2 a^{[1]}+b_2=w_2(w_1 x+b_1)+b_2=(w_2 w_1)x+(w_2 b_1+b_2)=Wx+B.
\end{align*}
Setting $W=w_2 w_1$ and $B=w_2 b_1+b_2$ gives a \emph{linear} model. The hidden layer contributed nothing.

\textbf{Generalisation}: any depth of linear layers is equivalent to a single linear layer. Adding more linear layers provides \emph{zero} additional expressive power. Non-linear activations (ReLU) are what allow deep networks to learn hierarchical, complex features.
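The same collapse is easy to check with matrices. The sketch below is a standalone NumPy illustration (the layer shapes and random weights are made up for the example): two stacked linear layers reproduce exactly one linear map $W\vec{x}+B$ with $W=W_2W_1$ and $B=W_2\vec{b}_1+\vec{b}_2$.

\begin{lstlisting}[caption={Two linear layers collapse to one: a standalone NumPy illustration}]
import numpy as np

rng = np.random.default_rng(0)

# Two "linear-activation" dense layers with arbitrary weights.
W1, b1 = rng.normal(size=(4, 3)), rng.normal(size=4)   # layer 1: 3 -> 4
W2, b2 = rng.normal(size=(2, 4)), rng.normal(size=2)   # layer 2: 4 -> 2

x = rng.normal(size=3)

# Forward pass through the two linear layers.
a1 = W1 @ x + b1
a2 = W2 @ a1 + b2

# The equivalent single linear layer: W = W2 W1, B = W2 b1 + b2.
W, B = W2 @ W1, W2 @ b1 + b2

print(np.allclose(a2, W @ x + B))   # True: the hidden layer added nothing
\end{lstlisting}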
\paragraph{Rule of thumb.}
\begin{itemize}
  \item \textbf{Never} use linear activations in hidden layers.
  \item Use \textbf{ReLU} for hidden layers.
  \item Use \textbf{linear} in the output only for regression with $y\in\R$.
\end{itemize}

%----------------------------------------------------------
\section{Multiclass Classification}

\subsection{Definition and Motivation}
\begin{definition}[Multiclass classification]
A problem is \textbf{multiclass} when $y\in\{1,2,\ldots,N\}$ for $N>2$.
\end{definition}
Examples: handwritten digit recognition ($N=10$), medical diagnosis with multiple disease types, part-of-speech tagging.

\subsection{Softmax Regression}
Logistic regression is a special case of \emph{softmax regression} with $N=2$. For general $N$, each class $j$ has its own parameter vector $(\vec{w}_j,b_j)$.

\paragraph{Step 1: linear terms.}
\begin{equation}
z_j=\vec{w}_j\cdot\vec{x}+b_j,\quad j=1,\ldots,N.
\end{equation}

\paragraph{Step 2: softmax activation.}
\begin{equation}
\boxed{a_j=\frac{e^{z_j}}{\displaystyle\sum_{k=1}^N e^{z_k}}=P(y=j\mid\vec{x}).}
\label{eq:softmax}
\end{equation}

\paragraph{Normalisation property.}
\begin{equation}
\sum_{j=1}^N a_j=1.
\end{equation}

\begin{remark}[Coupling across units]
Unlike sigmoid or ReLU, softmax is \textbf{not element-wise}. Computing $a_j$ requires all $N$ values of $z$ simultaneously due to the shared denominator.
\end{remark}

\subsection{Cross-Entropy Loss for Softmax}
If the ground-truth label for example $i$ is $y^{(i)}=j$:
\begin{equation}
L=-\log(a_j)\quad\text{if }y=j.
\end{equation}
The total cost: $J=\frac{1}{m}\sum_{i=1}^m L(f(\vec{x}^{(i)}),y^{(i)})$.

\subsection{TensorFlow Implementation}
\begin{lstlisting}[caption={Softmax network for 10-class digit recognition}]
from tensorflow.keras.losses import SparseCategoricalCrossentropy

model = Sequential([
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=10, activation='softmax'),  # Softmax output
])
model.compile(loss=SparseCategoricalCrossentropy())
model.fit(X, Y, epochs=100)
\end{lstlisting}

%----------------------------------------------------------
\section{Advanced Concepts}

\subsection{Numerical Stability: The \texttt{from\_logits} Pattern}
When the output layer computes the softmax and the cross-entropy loss then takes its logarithm as two separate steps, the intermediate probabilities can overflow or underflow in floating point. The solution: let the final layer output raw \textbf{logits} (the $z$ values) and pass \lstinline{from_logits=True}. TensorFlow then combines the two operations into a single numerically stable formula.

\begin{definition}[Logit]
A \textbf{logit} is the raw, unscaled output $z$ of the final layer before any activation.
\end{definition}

\begin{lstlisting}[caption={Numerically stable multiclass model (recommended in production)}]
model = Sequential([
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=10, activation='linear'),  # Output logits, not probabilities
])
model.compile(loss=SparseCategoricalCrossentropy(from_logits=True))
model.fit(X, Y, epochs=100)

# Convert logits to probabilities at inference time:
import tensorflow as tf
logits = model.predict(X)
probs = tf.nn.softmax(logits)
\end{lstlisting}

\subsection{Multi-Label Classification}
\begin{definition}[Multi-label classification]
The model must predict a \emph{vector} of binary labels; multiple labels can be simultaneously active (non-mutually exclusive).
\end{definition}

\begin{example}
In autonomous driving, a single image may contain a car ($y_1=1$), no bus ($y_2=0$), and a pedestrian ($y_3=1$) simultaneously.
\end{example}

\begin{center}
\begin{tabular}{lll}
\toprule
\textbf{Feature} & \textbf{Multi-class} & \textbf{Multi-label}\\
\midrule
Output $y$ & Single integer & Vector of $N$ binaries\\
Classes & Mutually exclusive & Independent\\
Output activation & Softmax & Sigmoid (per output)\\
Loss & Sparse categorical CE & Binary CE\\
\bottomrule
\end{tabular}
\end{center}

\begin{lstlisting}[caption={Multi-label classification model}]
model = Sequential([
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=3, activation='sigmoid'),  # 3 independent binary outputs
])
model.compile(loss='binary_crossentropy')
\end{lstlisting}
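To make the contrast in the table concrete, the standalone NumPy sketch below (the logits are made-up values) applies both output activations to the same vector $z$: softmax yields a single distribution that sums to $1$, while per-output sigmoids yield independent probabilities that can each be thresholded at $0.5$.

\begin{lstlisting}[caption={Softmax vs. per-output sigmoid on the same logits (illustration)}]
import numpy as np

z = np.array([2.0, 1.0, -1.0])   # made-up logits for three classes/labels

# Softmax (multiclass): one distribution over mutually exclusive classes.
p_softmax = np.exp(z) / np.sum(np.exp(z))
print(p_softmax, p_softmax.sum())   # [0.705 0.259 0.035], sums to 1.0

# Per-output sigmoid (multi-label): independent probability per label.
p_sigmoid = 1.0 / (1.0 + np.exp(-z))
print(p_sigmoid, p_sigmoid.sum())   # [0.881 0.731 0.269], sum unconstrained
print(p_sigmoid > 0.5)              # each label thresholded independently
\end{lstlisting}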
\subsection{The Adam Optimiser}
Standard gradient descent uses a single fixed $\alpha$ for all parameters.

\begin{definition}[Adam]
\textbf{Adam} (Adaptive Moment Estimation) maintains a \emph{separate, adaptive learning rate} $\alpha_j$ for every parameter $w_j$.
\end{definition}

Intuition:
\begin{itemize}
  \item If $w_j$ consistently moves in the \emph{same direction}: increase $\alpha_j$.
  \item If $w_j$ \emph{oscillates}: decrease $\alpha_j$.
\end{itemize}

Adam maintains two running statistics (moments) per parameter: a first moment (an exponentially weighted average of the gradients) and a second moment (a weighted average of the squared gradients). The first moment divided by the square root of the second yields a per-parameter step size.

\begin{lstlisting}[caption={Using Adam in TensorFlow}]
from tensorflow.keras.optimizers import Adam

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss=SparseCategoricalCrossentropy(from_logits=True))
model.fit(X, Y, epochs=100)
\end{lstlisting}

%----------------------------------------------------------
\section{Backpropagation}
\label{sec:backprop}

\subsection{The Intuition Behind Derivatives}
Backpropagation computes the partial derivative of the cost $J$ with respect to every parameter. These derivatives are used by the optimiser to update parameters.

\begin{definition}[Derivative (informal)]
$\pd{J}{w}\approx k$ means: increasing $w$ by $\varepsilon$ causes $J$ to change by approximately $k\varepsilon$.
\end{definition}

\paragraph{Example: $J(w)=w^2$.}
\begin{center}
\begin{tabular}{ccccc}
\toprule
$w$ & $J=w^2$ & $w+\varepsilon$ & $\Delta J$ & $\pd{J}{w}=2w$\\
\midrule
$3$ & $9$ & $3.001$ & $\approx 6\varepsilon$ & $6$\\
$2$ & $4$ & $2.001$ & $\approx 4\varepsilon$ & $4$\\
$-3$ & $9$ & $-2.999$ & $\approx -6\varepsilon$ & $-6$\\
\bottomrule
\end{tabular}
\end{center}

The derivative tells the optimiser: (1) which \emph{direction} to move, and (2) how large a \emph{step} is appropriate.

\subsection{Computation Graphs and the Chain Rule}

\begin{definition}[Computation graph]
A \textbf{computation graph} is a DAG where each node represents an elementary operation and each edge carries a value. Forward propagation traverses it left-to-right; backpropagation right-to-left using the chain rule.
\end{definition}

\paragraph{Forward pass example.}
$c=wx$, $a=c+b$, $d=a-y$, $J=\tfrac{1}{2}d^2$. With $x=-2$, $y=2$, $w=2$, $b=8$:
\[c=2(-2)=-4,\quad a=-4+8=4,\quad d=4-2=2,\quad J=2.\]

\paragraph{Backward pass (chain rule).}
\begin{align*}
\pd{J}{d}&=d=2,\quad \pd{J}{a}=\pd{J}{d}\cdot 1=2,\\
\pd{J}{b}&=2,\quad \pd{J}{c}=2,\quad \pd{J}{w}=\pd{J}{c}\cdot x=2\cdot(-2)=-4.
\end{align*}

\subsection{Computational Efficiency}
\begin{center}
\begin{tabular}{lll}
\toprule
\textbf{Method} & \textbf{Complexity} & \textbf{Description}\\
\midrule
Naive numerical & $O(N\times P)$ & Re-run forward pass for each parameter\\
Backpropagation & $O(N+P)$ & Single backward pass, reuse partial derivatives\\
\bottomrule
\end{tabular}
\end{center}
Here $N$ is the number of nodes in the computation graph and $P$ the number of parameters. For a network with one million parameters, the naive approach requires one million forward passes per gradient step, which is computationally intractable. Backpropagation collapses this to a single backward pass of cost comparable to one forward pass.

Modern frameworks (TensorFlow, PyTorch) implement \textbf{automatic differentiation}: they construct the computation graph dynamically and traverse it in reverse, computing all gradients without any user-supplied formulas.
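As a sanity check, the worked example above can be handed to TensorFlow's automatic differentiation. The sketch below rebuilds the same toy graph with \texttt{tf.GradientTape} (the variable names mirror the example) and recovers $\pd{J}{w}=-4$ and $\pd{J}{b}=2$ without any hand-derived formulas; this is the same mechanism \texttt{model.fit} relies on at scale.

\begin{lstlisting}[caption={Reproducing the worked example with tf.GradientTape}]
import tensorflow as tf

# Same toy computation graph as above: J = 0.5 * (w*x + b - y)^2
x, y = -2.0, 2.0
w = tf.Variable(2.0)
b = tf.Variable(8.0)

with tf.GradientTape() as tape:
    a = w * x + b          # forward pass: a = 4
    d = a - y              # d = 2
    J = 0.5 * d ** 2       # J = 2

dJ_dw, dJ_db = tape.gradient(J, [w, b])
print(J.numpy(), dJ_dw.numpy(), dJ_db.numpy())   # 2.0  -4.0  2.0
\end{lstlisting}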
%----------------------------------------------------------
\section*{Chapter Summary}
\addcontentsline{toc}{section}{Chapter Summary}

\begin{center}
\begin{tabular}{ll}
\toprule
\textbf{Concept} & \textbf{Key takeaway}\\
\midrule
3-step training & Specify $\to$ Compile $\to$ Fit\\
Activation choice & ReLU for hidden; task-determined for output\\
Linear collapse & All-linear networks $\equiv$ linear regression\\
Multiclass & Softmax output with $N$ neurons\\
Numerical stability & Use \texttt{from\_logits=True} in production\\
Multi-label & $N$ independent sigmoid outputs\\
Adam & Per-parameter adaptive learning rates\\
Backpropagation & $O(N+P)$ vs.\ $O(N\times P)$ naive; chain rule\\
Autodiff & TF/PyTorch automate all gradient computation\\
\bottomrule
\end{tabular}
\end{center}