% =========================================================
% Chapter 5: Neural Network Training
% =========================================================
\chapter{Neural Network Training}

%----------------------------------------------------------
\section{A Three-Step Training Framework}

Whether fitting a two-parameter logistic regression or a million-parameter deep network, the workflow decomposes into exactly three conceptual steps:

\begin{center}
\begin{tabular}{clll}
\toprule
\textbf{Step} & \textbf{Goal} & \textbf{Logistic Regression} & \textbf{Neural Network (TF)}\\
\midrule
1 & Define $\vec{x}\to\hat{y}$ & $f=\sigma(\vec{w}\cdot\vec{x}+b)$ & \texttt{Sequential([Dense(...)])}\\
2 & Measure error & Logistic loss & \texttt{model.compile(loss=...)}\\
3 & Minimise error & Gradient descent & \texttt{model.fit(X,y,epochs=...)}\\
\bottomrule
\end{tabular}
\end{center}

\subsection{Step 1 --- Architecture}

\begin{lstlisting}[caption={Binary digit classifier: 3-layer network}]
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(units=25, activation='sigmoid'),  # Hidden layer 1
    Dense(units=15, activation='sigmoid'),  # Hidden layer 2
    Dense(units=1, activation='sigmoid'),   # Output layer
])
\end{lstlisting}

TensorFlow initialises all weight matrices and bias vectors automatically.

\subsection{Step 2 --- Compile: Specify the Loss Function}

\subsubsection{Binary Cross-Entropy Loss}
For $(\vec{x},y)$ with $y\in\{0,1\}$:
\begin{equation}
L\bigl(f,y\bigr)=-y\log f-(1-y)\log(1-f).
\end{equation}
Cost over all $m$ examples:
\begin{equation}
J(\mathbf{W},\mathbf{B})=\frac{1}{m}\sum_{i=1}^m L\!\bigl(f(\vec{x}^{(i)}),y^{(i)}\bigr).
\end{equation}

\begin{lstlisting}
from tensorflow.keras.losses import BinaryCrossentropy
model.compile(loss=BinaryCrossentropy())
\end{lstlisting}

\subsubsection{Mean Squared Error (Regression)}
\begin{lstlisting}
from tensorflow.keras.losses import MeanSquaredError
model.compile(loss=MeanSquaredError())
\end{lstlisting}

\subsection{Step 3 --- Train: Backpropagation}
To minimise $J$, every parameter in every layer is updated iteratively:
\begin{align}
w_j^{[l]}&\leftarrow w_j^{[l]}-\alpha\,\pd{J}{w_j^{[l]}},\\
b_j^{[l]}&\leftarrow b_j^{[l]}-\alpha\,\pd{J}{b_j^{[l]}}.
\end{align}
Computing these partial derivatives across all layers is \textbf{backpropagation} (Section~\ref{sec:backprop}).

\begin{definition}[Epoch]
One \textbf{epoch} is a single complete pass of gradient descent over the entire training set.
\end{definition}

\begin{lstlisting}
model.fit(X, Y, epochs=100)
\end{lstlisting}

%----------------------------------------------------------
\section{Activation Functions}

\subsection{Why Not Just Use Sigmoid Everywhere?}
Using sigmoid in every hidden layer causes two problems:
\begin{enumerate}
  \item \textbf{Range constraint.} Sigmoid outputs lie in $(0,1)$, but hidden-layer activations may represent quantities with unbounded range.
  \item \textbf{Vanishing gradients.} At large $|z|$, $g'(z)\approx 0$, causing gradients to shrink exponentially during backpropagation through many layers.
\end{enumerate}
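To see the second problem concretely, the sketch below is a standalone NumPy illustration (the values of $z$ are arbitrary and not taken from any model above): it evaluates $g'(z)=g(z)\bigl(1-g(z)\bigr)$ at a few points and shows how a product of such small factors across many saturated layers collapses towards zero.

\begin{lstlisting}[caption={Sigmoid saturation: a standalone NumPy illustration}]
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_grad(z):
    s = sigmoid(z)
    return s * (1.0 - s)          # g'(z) = g(z)(1 - g(z))

for z in [0.0, 2.0, 5.0, 10.0]:   # arbitrary example values
    print(z, sigmoid_grad(z))     # 0.25, 0.105, 0.0066, 4.5e-05

# Chaining 10 saturated sigmoid layers multiplies ten such factors:
print(sigmoid_grad(5.0) ** 10)    # ~1.6e-22 -- the gradient has vanished
\end{lstlisting}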
\subsection{Catalogue of Common Activation Functions}

\subsubsection{Sigmoid}
\begin{equation}
g(z)=\frac{1}{1+e^{-z}},\quad g(z)\in(0,1).
\end{equation}
Natural choice for binary classifier output.

\subsubsection{ReLU (Rectified Linear Unit)}
\begin{equation}
g(z)=\max(0,z),\quad g(z)\in[0,\infty).
\end{equation}
The \textbf{default for hidden layers} in modern deep learning. Advantages:
\begin{itemize}
  \item \textbf{Computationally cheap}: a single comparison, no exponentiation.
  \item \textbf{Non-vanishing gradients}: for $z>0$ the gradient is exactly $1$.
\end{itemize}

\subsubsection{Linear (Identity)}
\begin{equation}
g(z)=z,\quad g(z)\in(-\infty,+\infty).
\end{equation}
Appropriate \emph{only} in the output layer for regression (any-sign $y$).

\begin{figure}[h]
\centering
\begin{tikzpicture}
\begin{axis}[
    width=0.88\textwidth, height=5.8cm,
    xlabel={$z$}, ylabel={$g(z)$},
    xmin=-5, xmax=5, ymin=-1.2, ymax=3.5,
    xtick={-4,-2,0,2,4},
    legend pos=north west,
    legend style={font=\small, fill=white},
    grid=both, grid style={gray!15, thin},
    axis lines=left,
    every axis plot/.append style={line width=1.4pt}]
  \addplot[pBlue, domain=-5:5, samples=200]{1/(1+exp(-x))};
  \addlegendentry{Sigmoid $\sigma(z)$}
  \addplot[pRed, domain=-5:0, samples=2]{0};
  \addplot[pRed, domain=0:5, samples=2, forget plot]{x};
  \addlegendentry{ReLU $\max(0,z)$}
  \addplot[pGreen, domain=-1.2:3.5, samples=2, dashed]{x};
  \addlegendentry{Linear $z$}
  \draw[pGray, dashed, thin] (axis cs:0,-1.2)--(axis cs:0,3.5);
\end{axis}
\end{tikzpicture}
\caption{Comparison of the three most common activation functions.}
\label{fig:activations}
\end{figure}

\subsection{Choosing the Right Activation}

\subsubsection{Output Layer --- Determined by Task}
\begin{center}
\begin{tabular}{llll}
\toprule
\textbf{Task} & \textbf{Target $y$} & \textbf{Activation} & \textbf{Range}\\
\midrule
Binary classification & $y\in\{0,1\}$ & Sigmoid & $(0,1)$\\
Regression (any sign) & $y\in\R$ & Linear & $(-\infty,+\infty)$\\
Regression ($\ge 0$) & $y\ge 0$ & ReLU & $[0,+\infty)$\\
\bottomrule
\end{tabular}
\end{center}

\subsubsection{Hidden Layers --- ReLU as Default}
\begin{lstlisting}[caption={Recommended: ReLU hidden layers with sigmoid output}]
model = Sequential([
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
\end{lstlisting}

\subsection{The Necessity of Non-Linearity}
What happens if we use \emph{linear} activations everywhere?

\paragraph{Mathematical proof (one hidden unit).}
\begin{align*}
a^{[1]}&=w_1 x+b_1,\\
a^{[2]}&=w_2 a^{[1]}+b_2=w_2(w_1 x+b_1)+b_2=(w_2 w_1)x+(w_2 b_1+b_2)=Wx+B.
\end{align*}
Setting $W=w_2 w_1$ and $B=w_2 b_1+b_2$ gives a \emph{linear} model. The hidden layer contributed nothing.

\textbf{Generalisation}: any depth of linear layers is equivalent to a single linear layer. Adding more linear layers provides \emph{zero} additional expressive power. Non-linear activations (ReLU) are what allow deep networks to learn hierarchical, complex features.
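The same collapse is easy to check with matrices. The sketch below is a standalone NumPy illustration (the layer shapes and random weights are made up for the example): two stacked linear layers reproduce exactly one linear map $W\vec{x}+B$ with $W=W_2W_1$ and $B=W_2\vec{b}_1+\vec{b}_2$.

\begin{lstlisting}[caption={Two linear layers collapse to one: a standalone NumPy illustration}]
import numpy as np

rng = np.random.default_rng(0)

# Two "linear-activation" dense layers with arbitrary weights.
W1, b1 = rng.normal(size=(4, 3)), rng.normal(size=4)   # layer 1: 3 -> 4
W2, b2 = rng.normal(size=(2, 4)), rng.normal(size=2)   # layer 2: 4 -> 2

x = rng.normal(size=3)

# Forward pass through the two linear layers.
a1 = W1 @ x + b1
a2 = W2 @ a1 + b2

# The equivalent single linear layer: W = W2 W1, B = W2 b1 + b2.
W, B = W2 @ W1, W2 @ b1 + b2

print(np.allclose(a2, W @ x + B))   # True: the hidden layer added nothing
\end{lstlisting}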
\paragraph{Rule of thumb.}
\begin{itemize}
  \item \textbf{Never} use linear activations in hidden layers.
  \item Use \textbf{ReLU} for hidden layers.
  \item Use \textbf{linear} in the output only for regression with $y\in\R$.
\end{itemize}

%----------------------------------------------------------
\section{Multiclass Classification}

\subsection{Definition and Motivation}
\begin{definition}[Multiclass classification]
A problem is \textbf{multiclass} when $y\in\{1,2,\ldots,N\}$ for $N>2$.
\end{definition}
Examples: handwritten digit recognition ($N=10$), medical diagnosis with multiple disease types, part-of-speech tagging.

\subsection{Softmax Regression}
Logistic regression is a special case of \emph{softmax regression} with $N=2$. For general $N$, each class $j$ has its own parameter vector $(\vec{w}_j,b_j)$.

\paragraph{Step 1: linear terms.}
\begin{equation}
z_j=\vec{w}_j\cdot\vec{x}+b_j,\quad j=1,\ldots,N.
\end{equation}

\paragraph{Step 2: softmax activation.}
\begin{equation}
\boxed{a_j=\frac{e^{z_j}}{\displaystyle\sum_{k=1}^N e^{z_k}}=P(y=j\mid\vec{x}).}
\label{eq:softmax}
\end{equation}

\paragraph{Normalisation property.}
\begin{equation}
\sum_{j=1}^N a_j=1.
\end{equation}

\begin{remark}[Coupling across units]
Unlike sigmoid or ReLU, softmax is \textbf{not element-wise}. Computing $a_j$ requires all $N$ values of $z$ simultaneously due to the shared denominator.
\end{remark}

\subsection{Cross-Entropy Loss for Softmax}
If the ground-truth label for example $i$ is $y^{(i)}=j$:
\begin{equation}
L=-\log(a_j)\quad\text{if }y=j.
\end{equation}
The total cost: $J=\frac{1}{m}\sum_{i=1}^m L(f(\vec{x}^{(i)}),y^{(i)})$.

\subsection{TensorFlow Implementation}
\begin{lstlisting}[caption={Softmax network for 10-class digit recognition}]
from tensorflow.keras.losses import SparseCategoricalCrossentropy

model = Sequential([
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=10, activation='softmax'),  # Softmax output
])
model.compile(loss=SparseCategoricalCrossentropy())
model.fit(X, Y, epochs=100)
\end{lstlisting}

%----------------------------------------------------------
\section{Advanced Concepts}

\subsection{Numerical Stability: The \texttt{from\_logits} Pattern}
When the output layer computes the softmax and the cross-entropy loss then takes its logarithm as two separate steps, the intermediate probabilities can overflow or underflow in floating point. The solution: let the final layer output raw \textbf{logits} (the $z$ values) and pass \lstinline{from_logits=True}. TensorFlow then combines the two operations into a single numerically stable formula.

\begin{definition}[Logit]
A \textbf{logit} is the raw, unscaled output $z$ of the final layer before any activation.
\end{definition}

\begin{lstlisting}[caption={Numerically stable multiclass model (recommended in production)}]
model = Sequential([
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=10, activation='linear'),  # Output logits, not probabilities
])
model.compile(loss=SparseCategoricalCrossentropy(from_logits=True))
model.fit(X, Y, epochs=100)

# Convert logits to probabilities at inference time:
import tensorflow as tf
logits = model.predict(X)
probs = tf.nn.softmax(logits)
\end{lstlisting}

\subsection{Multi-Label Classification}
\begin{definition}[Multi-label classification]
The model must predict a \emph{vector} of binary labels; multiple labels can be simultaneously active (non-mutually exclusive).
\end{definition}

\begin{example}
In autonomous driving, a single image may contain a car ($y_1=1$), no bus ($y_2=0$), and a pedestrian ($y_3=1$) simultaneously.
\end{example}

\begin{center}
\begin{tabular}{lll}
\toprule
\textbf{Feature} & \textbf{Multi-class} & \textbf{Multi-label}\\
\midrule
Output $y$ & Single integer & Vector of $N$ binaries\\
Classes & Mutually exclusive & Independent\\
Output activation & Softmax & Sigmoid (per output)\\
Loss & Sparse categorical CE & Binary CE\\
\bottomrule
\end{tabular}
\end{center}

\begin{lstlisting}[caption={Multi-label classification model}]
model = Sequential([
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=3, activation='sigmoid'),  # 3 independent binary outputs
])
model.compile(loss='binary_crossentropy')
\end{lstlisting}
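To make the contrast in the table concrete, the standalone NumPy sketch below (the logits are made-up values) applies both output activations to the same vector $z$: softmax yields a single distribution that sums to $1$, while per-output sigmoids yield independent probabilities that can each be thresholded at $0.5$.

\begin{lstlisting}[caption={Softmax vs. per-output sigmoid on the same logits (illustration)}]
import numpy as np

z = np.array([2.0, 1.0, -1.0])   # made-up logits for three classes/labels

# Softmax (multiclass): one distribution over mutually exclusive classes.
p_softmax = np.exp(z) / np.sum(np.exp(z))
print(p_softmax, p_softmax.sum())   # [0.705 0.259 0.035], sums to 1.0

# Per-output sigmoid (multi-label): independent probability per label.
p_sigmoid = 1.0 / (1.0 + np.exp(-z))
print(p_sigmoid, p_sigmoid.sum())   # [0.881 0.731 0.269], sum unconstrained
print(p_sigmoid > 0.5)              # each label thresholded independently
\end{lstlisting}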
\subsection{The Adam Optimiser}
Standard gradient descent uses a single fixed $\alpha$ for all parameters.

\begin{definition}[Adam]
\textbf{Adam} (Adaptive Moment Estimation) maintains a \emph{separate, adaptive learning rate} $\alpha_j$ for every parameter $w_j$.
\end{definition}

Intuition:
\begin{itemize}
  \item If $w_j$ consistently moves in the \emph{same direction}: increase $\alpha_j$.
  \item If $w_j$ \emph{oscillates}: decrease $\alpha_j$.
\end{itemize}

Adam maintains two running statistics (moments) per parameter: a first moment (an exponentially weighted average of the gradients) and a second moment (a weighted average of the squared gradients). The first moment divided by the square root of the second yields a per-parameter step size.

\begin{lstlisting}[caption={Using Adam in TensorFlow}]
from tensorflow.keras.optimizers import Adam

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss=SparseCategoricalCrossentropy(from_logits=True))
model.fit(X, Y, epochs=100)
\end{lstlisting}

%----------------------------------------------------------
\section{Backpropagation}
\label{sec:backprop}

\subsection{The Intuition Behind Derivatives}
Backpropagation computes the partial derivative of the cost $J$ with respect to every parameter. These derivatives are used by the optimiser to update parameters.

\begin{definition}[Derivative (informal)]
$\pd{J}{w}\approx k$ means: increasing $w$ by $\varepsilon$ causes $J$ to change by approximately $k\varepsilon$.
\end{definition}

\paragraph{Example: $J(w)=w^2$.}
\begin{center}
\begin{tabular}{ccccc}
\toprule
$w$ & $J=w^2$ & $w+\varepsilon$ & $\Delta J$ & $\pd{J}{w}=2w$\\
\midrule
$3$ & $9$ & $3.001$ & $\approx 6\varepsilon$ & $6$\\
$2$ & $4$ & $2.001$ & $\approx 4\varepsilon$ & $4$\\
$-3$ & $9$ & $-2.999$ & $\approx -6\varepsilon$ & $-6$\\
\bottomrule
\end{tabular}
\end{center}

The derivative tells the optimiser: (1) which \emph{direction} to move, and (2) how large a \emph{step} is appropriate.

\subsection{Computation Graphs and the Chain Rule}

\begin{definition}[Computation graph]
A \textbf{computation graph} is a DAG where each node represents an elementary operation and each edge carries a value. Forward propagation traverses it left-to-right; backpropagation right-to-left using the chain rule.
\end{definition}

\paragraph{Forward pass example.}
$c=wx$, $a=c+b$, $d=a-y$, $J=\tfrac{1}{2}d^2$. With $x=-2$, $y=2$, $w=2$, $b=8$:
\[c=2(-2)=-4,\quad a=-4+8=4,\quad d=4-2=2,\quad J=2.\]

\paragraph{Backward pass (chain rule).}
\begin{align*}
\pd{J}{d}&=d=2,\quad \pd{J}{a}=\pd{J}{d}\cdot 1=2,\\
\pd{J}{b}&=2,\quad \pd{J}{c}=2,\quad \pd{J}{w}=\pd{J}{c}\cdot x=2\cdot(-2)=-4.
\end{align*}

\subsection{Computational Efficiency}
\begin{center}
\begin{tabular}{lll}
\toprule
\textbf{Method} & \textbf{Complexity} & \textbf{Description}\\
\midrule
Naive numerical & $O(N\times P)$ & Re-run forward pass for each parameter\\
Backpropagation & $O(N+P)$ & Single backward pass, reuse partial derivatives\\
\bottomrule
\end{tabular}
\end{center}
Here $N$ is the number of nodes in the computation graph and $P$ the number of parameters. For a network with one million parameters, the naive approach requires one million forward passes per gradient step, which is computationally intractable. Backpropagation collapses this to a single backward pass of cost comparable to one forward pass.

Modern frameworks (TensorFlow, PyTorch) implement \textbf{automatic differentiation}: they construct the computation graph dynamically and traverse it in reverse, computing all gradients without any user-supplied formulas.
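As a sanity check, the worked example above can be handed to TensorFlow's automatic differentiation. The sketch below rebuilds the same toy graph with \texttt{tf.GradientTape} (the variable names mirror the example) and recovers $\pd{J}{w}=-4$ and $\pd{J}{b}=2$ without any hand-derived formulas; this is the same mechanism \texttt{model.fit} relies on at scale.

\begin{lstlisting}[caption={Reproducing the worked example with tf.GradientTape}]
import tensorflow as tf

# Same toy computation graph as above: J = 0.5 * (w*x + b - y)^2
x, y = -2.0, 2.0
w = tf.Variable(2.0)
b = tf.Variable(8.0)

with tf.GradientTape() as tape:
    a = w * x + b          # forward pass: a = 4
    d = a - y              # d = 2
    J = 0.5 * d ** 2       # J = 2

dJ_dw, dJ_db = tape.gradient(J, [w, b])
print(J.numpy(), dJ_dw.numpy(), dJ_db.numpy())   # 2.0  -4.0  2.0
\end{lstlisting}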
%----------------------------------------------------------
\section*{Chapter Summary}
\addcontentsline{toc}{section}{Chapter Summary}

\begin{center}
\begin{tabular}{ll}
\toprule
\textbf{Concept} & \textbf{Key takeaway}\\
\midrule
3-step training & Specify $\to$ Compile $\to$ Fit\\
Activation choice & ReLU for hidden; task-determined for output\\
Linear collapse & All-linear networks $\equiv$ linear regression\\
Multiclass & Softmax output with $N$ neurons\\
Numerical stability & Use \texttt{from\_logits=True} in production\\
Multi-label & $N$ independent sigmoid outputs\\
Adam & Per-parameter adaptive learning rates\\
Backpropagation & $O(N+P)$ vs.\ $O(N\times P)$ naive; chain rule\\
Autodiff & TF/PyTorch automate all gradient computation\\
\bottomrule
\end{tabular}
\end{center}