% ========================================================= % Chapter 6: Advice for Applying Machine Learning % ========================================================= \chapter{Advice for Applying Machine Learning} \textit{This chapter provides a systematic framework for diagnosing and improving machine learning systems. We cover practical debugging strategies, the bias--variance trade-off, structured development workflows, data-centric AI, and methods for handling skewed datasets.} %---------------------------------------------------------- \section{Evaluating Model Performance} \subsection{Unacceptable Prediction Errors: What Next?} Suppose regularised linear regression produces unacceptably large errors. You face a critical decision: \emph{what should you try next?} Choosing wrongly can waste months of effort. Six commonly considered paths exist: \begin{center} \begin{tabular}{lp{9cm}} \toprule \textbf{Strategy} & \textbf{Description}\\ \midrule Data collection & Gather more training examples.\\ Feature reduction & Use a smaller feature set.\\ Feature expansion & Introduce additional informative features.\\ Polynomial complexity& Add polynomial terms for non-linear patterns.\\ Decrease $\lambda$ & Allow closer fitting of training data.\\ Increase $\lambda$ & Reduce overfitting by stronger regularisation.\\ \bottomrule \end{tabular} \end{center} Without guidance, practitioners often choose randomly. \textbf{Machine learning diagnostics} replace guessing with evidence-based reasoning. \begin{definition}[Diagnostic] A \textbf{diagnostic} is a formal test that provides insight into what is or is not working within a learning algorithm, guiding targeted improvement. \end{definition} \subsection{The Train / Test Split} The dataset is divided into two non-overlapping subsets: \begin{enumerate} \item \textbf{Training set} ($\approx70$--$80\%$): used to fit parameters $(\vec{w},b)$. \item \textbf{Test set} ($\approx20$--$30\%$): used to evaluate on unseen data. 
\end{enumerate} \paragraph{Procedure for linear regression.} \begin{enumerate} \item Minimise the regularised cost on the training set. \item Compute train and test errors \emph{without} regularisation: \begin{align} J_{\text{test}}&=\frac{1}{2m_{\text{test}}}\sum_{i=1}^{m_{\text{test}}} \bigl(f_{\vec{w},b}(\vec{x}_{\text{test}}^{(i)})-y_{\text{test}}^{(i)}\bigr)^2,\\ J_{\text{train}}&=\frac{1}{2m_{\text{train}}}\sum_{i=1}^{m_{\text{train}}} \bigl(f_{\vec{w},b}(\vec{x}_{\text{train}}^{(i)})-y_{\text{train}}^{(i)}\bigr)^2. \end{align} \end{enumerate} \begin{remark}[Overfitting diagnostic rule] If $J_{\text{train}}$ is low but $J_{\text{test}}$ is high, the model has \textbf{failed to generalise} (overfitting). \end{remark} \subsection{The Three-Way Data Split for Model Selection} A two-way split has a flaw: if we use the test set to select the best model (e.g.\ the best polynomial degree $d$), $J_{\text{test}}$ is no longer an unbiased estimate — it has been ``used up'' in the selection decision. \textbf{Solution}: split data into three disjoint subsets. \begin{center} \begin{tabular}{lll} \toprule \textbf{Subset} & \textbf{Size} & \textbf{Purpose}\\ \midrule Training set & 60\% & Fit parameters $\vec{w}$, $b$\\ Cross-validation (CV) & 20\% & Select model / tune hyperparameters\\ Test set & 20\% & Unbiased final performance estimate\\ \bottomrule \end{tabular} \end{center} \paragraph{Correct procedure.} \begin{enumerate} \item Train each candidate model on the training set. \item Evaluate all models on the CV set; choose the one with lowest $J_{\text{cv}}$. \item Report $J_{\text{test}}$ of the \emph{single} chosen model. \end{enumerate} %---------------------------------------------------------- \section{Bias and Variance} \subsection{Defining Bias and Variance} \begin{definition}[High Bias] A model has \textbf{high bias} when it is too simple to capture the underlying patterns. Both $J_{\text{train}}$ and $J_{\text{cv}}$ are high, and close to each other. 
\end{definition}
\begin{definition}[High Variance]
A model has \textbf{high variance} when it fits training noise and fails to generalise. $J_{\text{train}}$ is very low but $J_{\text{cv}}\gg J_{\text{train}}$.
\end{definition}
\begin{center}
\begin{tabular}{llll}
\toprule
\textbf{Condition} & $J_{\text{train}}$ & $J_{\text{cv}}$ & \textbf{Relationship}\\
\midrule
High Bias & High & High & $J_{\text{cv}}\approx J_{\text{train}}$\\
High Variance & Low & High & $J_{\text{cv}}\gg J_{\text{train}}$\\
Just Right & Low & Low & $J_{\text{cv}}\approx J_{\text{train}}$\\
Both High & High & Much higher & Large gap, $J_{\text{train}}$ also high\\
\bottomrule
\end{tabular}
\end{center}
\begin{figure}[htbp]
\centering
\begin{tikzpicture}
\begin{axis}[
width=10cm, height=6.5cm,
xlabel={Polynomial degree $d$}, ylabel={Error},
xmin=0.5, xmax=7.5, ymin=0, ymax=1.12,
xtick={1,2,3,4,5,6,7}, ytick=\empty,
legend pos=north east, legend style={font=\small, fill=white},
axis lines=left,
clip=false, % the ``Optimal $d$'' label sits at $y=1.12$ (= ymax) and would otherwise be clipped
every axis plot/.append style={line width=1.4pt}]
\addplot[pBlue, smooth] coordinates {(1,.95)(2,.6)(3,.35)(4,.18)(5,.09)(6,.04)(7,.02)};
\addlegendentry{$J_{\text{train}}$}
\addplot[pOrange, dashed, smooth] coordinates {(1,1.0)(2,.55)(3,.3)(4,.36)(5,.56)(6,.79)(7,1.02)};
\addlegendentry{$J_{\text{cv}}$}
\draw[pGray, thin, dashed] (axis cs:3,0)--(axis cs:3,1.08);
\node[font=\small, pGray] at (axis cs:3,1.12) {Optimal $d$};
\node[pRed, font=\small] at (axis cs:1.4,.88) {High Bias};
\node[pRed, font=\small] at (axis cs:6.1,.90) {High Variance};
\end{axis}
\end{tikzpicture}
\caption{$J_{\text{train}}$ decreases monotonically with model complexity. $J_{\text{cv}}$ follows a U-shaped curve with minimum at the optimal degree.}
\label{fig:complexity}
\end{figure}
\subsection{Regularisation and the Bias--Variance Trade-off}
\begin{itemize}
\item \textbf{Large $\lambda$}: forces $\vec{w}\approx\mathbf{0}$, model near constant $\Rightarrow$ \textbf{High Bias}.
\item \textbf{Small $\lambda$}: no constraint $\Rightarrow$ \textbf{High Variance}.
\item \textbf{Optimal $\lambda$}: balance between fitting data and simplicity.
\end{itemize}
\begin{figure}[htbp]
\centering
\begin{tikzpicture}
\begin{axis}[
width=10cm, height=6.5cm,
xlabel={$\lambda$}, ylabel={Error},
xmin=-0.1, xmax=10.5, ymin=0, ymax=1.12,
xtick={0,2,4,6,8,10}, ytick=\empty,
legend pos=north east, legend style={font=\small, fill=white},
axis lines=left,
clip=false, % the ``Optimal $\lambda$'' label sits at $y=1.12$ (= ymax) and would otherwise be clipped
every axis plot/.append style={line width=1.4pt}]
\addplot[pBlue, smooth] coordinates {(0,.02)(1,.08)(2,.18)(3,.3)(5,.52)(7,.72)(10,.95)};
\addlegendentry{$J_{\text{train}}$}
\addplot[pOrange, dashed, smooth] coordinates {(0,1.0)(1,.55)(2,.30)(3,.29)(4,.33)(5,.46)(7,.66)(10,.96)};
\addlegendentry{$J_{\text{cv}}$}
\draw[pGray, thin, dashed] (axis cs:3,0)--(axis cs:3,1.08);
\node[font=\small, pGray] at (axis cs:3,1.12) {Optimal $\lambda$};
\node[pRed, font=\small] at (axis cs:0.7,.92) {High Variance};
\node[pRed, font=\small] at (axis cs:8.5,.78) {High Bias};
\end{axis}
\end{tikzpicture}
\caption{As $\lambda$ increases, $J_{\text{train}}$ rises and $J_{\text{cv}}$ forms a U-shape. The valley identifies the optimal regularisation strength.}
\label{fig:lambda}
\end{figure}
\subsection{Baseline Performance and the Gap Framework}
Comparing $J_{\text{train}}$ in isolation can be misleading — many tasks have an irreducible noise floor.
\begin{definition}[Bias gap and variance gap]
\[
\text{Bias Gap}=J_{\text{train}}-\text{Baseline},\qquad
\text{Variance Gap}=J_{\text{cv}}-J_{\text{train}}.
\]
Large bias gap $\Rightarrow$ High Bias. Large variance gap $\Rightarrow$ High Variance.
\end{definition}
\begin{example}[Speech recognition]
$J_{\text{train}}=10.8\%$, $J_{\text{cv}}=14.8\%$, human baseline $=10.6\%$.
\begin{itemize}
\item Bias Gap $=0.2\%$ (small).
\item Variance Gap $=4.0\%$ (large).
\item Conclusion: \textbf{High Variance} problem.
\end{itemize} \end{example} \subsection{Debugging Strategies} \begin{center} \begin{tabular}{ll} \toprule \textbf{Diagnosis} & \textbf{Recommended action}\\ \midrule High Variance & Get more training examples\\ & Reduce feature set\\ & Increase $\lambda$\\ \midrule High Bias & Add more / better features\\ & Add polynomial features\\ & Decrease $\lambda$\\ \bottomrule \end{tabular} \end{center} \subsection{Neural Networks and Bias--Variance} A sufficiently large, well-regularised neural network can be made effectively low-bias. The iterative workflow: \begin{enumerate} \item Check $J_{\text{train}}$ vs.\ baseline. If high $\Rightarrow$ High Bias: \textbf{use a bigger network} (more layers/units). \item Check $J_{\text{cv}}$. If $J_{\text{cv}}\gg J_{\text{train}}$ $\Rightarrow$ High Variance: \textbf{get more data}. \item Repeat until both errors are acceptably low. \end{enumerate} \begin{lstlisting}[caption={$L_2$ regularisation in Keras}] from tensorflow.keras.regularizers import L2 model = Sequential([ Dense(units=25, activation='relu', kernel_regularizer=L2(0.01)), Dense(units=15, activation='relu', kernel_regularizer=L2(0.01)), Dense(units=1, activation='sigmoid', kernel_regularizer=L2(0.01)), ]) \end{lstlisting} %---------------------------------------------------------- \section{Machine Learning Development Process} \subsection{The Iterative Development Loop} Building a production ML model is rarely linear. It follows a continuous cycle: \begin{enumerate} \item \textbf{Choose architecture}: model family, features, hyperparameters. \item \textbf{Train model}: implement and train. \item \textbf{Run diagnostics}: bias--variance analysis and error analysis. Use findings to loop back. \end{enumerate} \subsection{Data Augmentation and Synthesis} \begin{definition}[Data augmentation] \textbf{Data augmentation} creates new training examples $(x',y)$ by applying structure-preserving transformations to existing examples $(x,y)$. 
\end{definition} \begin{itemize} \item \textbf{Computer vision}: rotation, cropping, mirroring, contrast changes, grid-based warping. \item \textbf{Speech}: adding background noise, applying filters to simulate poor phone connections. \end{itemize} \textbf{Key rule}: Augmentation distortions must be \emph{representative of real test-set noise}. Purely random pixel noise provides little benefit. \begin{definition}[Data synthesis] \textbf{Data synthesis} generates entirely new training examples from scratch (e.g.\ rendering text in diverse fonts for OCR training). \end{definition} \subsection{Transfer Learning} \begin{definition}[Transfer learning] \textbf{Transfer learning} reuses a model pre-trained on a large related task as the starting point for a new, data-scarce task. \end{definition} Standard two-step workflow: \begin{enumerate} \item \textbf{Supervised pre-training}: train a large network on a massive labelled dataset (e.g.\ 1 million ImageNet images). \item \textbf{Fine-tuning}: replace the output layer for the target task. Then: \begin{itemize} \item \textbf{Frozen layers (Option 1)}: train only the new output layer. Best for very small new datasets ($\approx50$--$100$ examples). \item \textbf{Full fine-tuning (Option 2)}: retrain all layers using pre-trained weights as initialisation. Best for moderate-to-large new datasets. \end{itemize} \end{enumerate} Transfer learning works because neural networks learn a hierarchy of features: early layers detect universal low-level patterns (edges, corners) that are task-agnostic. \subsection{Ethics and Fairness} As ML systems impact billions of users, \textbf{fairness}, \textbf{bias detection}, and \textbf{ethical development} are mandatory parts of the lifecycle. A framework: \begin{enumerate}[label=\Roman*.] \item \textbf{Team diversity}: assemble diverse teams. \item \textbf{Literature review}: consult industry guidelines and standards. 
\item \textbf{Pre-deployment auditing}: measure performance across demographic subgroups. \item \textbf{Mitigation and monitoring}: prepare rollback strategy; monitor for real-world harms post-deployment. \end{enumerate} %---------------------------------------------------------- \section{Skewed Datasets} \subsection{The Problem with Accuracy on Skewed Data} \begin{definition}[Skewed dataset] A dataset is \textbf{skewed} when the ratio of positive to negative examples is far from 50/50 (e.g.\ $99\%$ negative, $1\%$ positive). \end{definition} A classifier that \emph{always} predicts negative achieves $99\%$ accuracy on such a dataset while providing zero useful predictions. This motivates \textbf{Precision} and \textbf{Recall}. \subsection{Confusion Matrix, Precision, and Recall} \begin{center} \begin{tabular}{llcc} \toprule & & \multicolumn{2}{c}{\textbf{Actual Class}}\\ \cmidrule{3-4} & & Positive ($y=1$) & Negative ($y=0$)\\ \midrule \multirow{2}{*}{\textbf{Predicted}} & Positive ($\hat{y}=1$) & True Positive (TP) & False Positive (FP)\\ & Negative ($\hat{y}=0$) & False Negative (FN) & True Negative (TN)\\ \bottomrule \end{tabular} \end{center} \begin{align} \text{Precision}&=\frac{TP}{TP+FP},\qquad \text{Recall}=\frac{TP}{TP+FN}. \end{align} A classifier that always predicts negative has $\text{Recall}=0$, correctly exposing its uselessness. \subsection{The Precision--Recall Trade-off} Adjusting the decision threshold $\tau$ navigates the trade-off: \begin{itemize} \item Higher $\tau$ ($0.7$--$0.9$): higher precision, lower recall. Appropriate when false positives are costly. \item Lower $\tau$ ($0.3$): higher recall, lower precision. Appropriate when false negatives are costly (e.g.\ missing a deadly disease). \end{itemize} \subsection{The F1 Score} A single composite score combining precision and recall. The \emph{harmonic mean} penalises extreme imbalance: \begin{equation} F_1=\frac{2PR}{P+R}. 
\label{eq:f1}
\end{equation}
A high $F_1$ requires \emph{both} $P$ and $R$ to be high.
\begin{center}
\begin{tabular}{lcccc}
\toprule
\textbf{Algorithm} & $P$ & $R$ & \textbf{Arith.\ mean} & $F_1$\\
\midrule
Algorithm 1 & 0.50 & 0.40 & 0.450 & \textbf{0.444}\\
Algorithm 2 & 0.70 & 0.10 & 0.400 & 0.175\\
Algorithm 3 & 0.02 & 1.00 & 0.510 & 0.039\\
\bottomrule
\end{tabular}
\end{center}
Algorithm~3 achieves the highest arithmetic mean ($0.51$) but its $F_1$ of $0.039$ correctly reveals it as a poor classifier (predicts positive for everything).
%----------------------------------------------------------
\section*{Chapter Summary}
\addcontentsline{toc}{section}{Chapter Summary}
\begin{itemize}
\item \textbf{Diagnostics} replace guessing with evidence. The three-way split (train/CV/test) enables unbiased model selection.
\item \textbf{Bias} (underfitting) and \textbf{variance} (overfitting) are the two primary failure modes, diagnosed by comparing $J_{\text{train}}$ vs.\ $J_{\text{cv}}$ against a baseline.
\item The iterative development loop (architecture $\to$ train $\to$ diagnose) is the standard workflow. Large, well-regularised networks tend to be low-bias; more data cures high variance.
\item Data augmentation, synthesis, and transfer learning address data scarcity.
\item \textbf{Skewed datasets} require Precision, Recall, and $F_1$ score rather than raw accuracy.
\item Ethics and fairness must be embedded throughout the development lifecycle.
\end{itemize}