diff --git a/script/lls.pdf b/script/lls.pdf
index e1fbdace3d49dbf5d4e38e4e2f1882e8578ccd76..91a9a695f95d20143d3ff1d957a39b7dfad22647 100644
Binary files a/script/lls.pdf and b/script/lls.pdf differ
diff --git a/script/lls.tex b/script/lls.tex
index fcf6a1ffb2cb2d3a5d4a1f130c7f96b9b911c19c..518e81072f7575e1cb40313e091bb486622720a6 100644
--- a/script/lls.tex
+++ b/script/lls.tex
@@ -8,7 +8,7 @@
 	
 	We are then going to generalize this idea and talk about how linear regression can be used for polynomial approximation and compare the results with the ones
 	obtained using polynomial interpolation. Then we are going to use linear regression for solving a classification problem, in particular for recognizing
-	handwritten digits. We are then going to observe the flaws of this solution and give some motivation on alternative solutions.
+	handwritten digits.
 }
 
 \begin{document}
@@ -23,11 +23,13 @@
 		In other words, we can define our experiment as a function $f(x)$ such that:
 		
 		\begin{align}
-			f(x_i) = \hat{y_i} \\
-			\hat{f(x_i)} = f(x_i) + \mathcal{N}(\mu,\,\sigma^{2}) = \hat{y_i} + \mathcal{N}(\mu,\,\sigma^{2}) = y_i
+			f(x_i) &= \hat{y_i} \\
+			\hat{f(x_i)} &= f(x_i) + \mathcal{N}(\mu,\,\sigma^{2}) \\
+				&= \hat{y_i} + \mathcal{N}(\mu,\,\sigma^{2}) \\
+				&= y_i
 		\end{align}
 		
-		Here $\mathcal{N}(\mu,\,\sigma^{2})$ is a small measurement error, denoted as a sampled point from a normal distribution with mean $\mu$ and standard deviation $\sigma^2$.
+		Here $\mathcal{N}(\mu,\,\sigma^{2})$ is a small measurement error, denoted as a sampled point from a normal distribution with mean $\mu$ and variance $\sigma^2$.
 		Equation (1) models the outcome of an experiment with variables $x_i$. Equation (2) models the outcomes we measure when conducting 
 		an experiment with variables $x_i$. Our task is to find a good approximation of $f(x)$ using observations of $\hat{f(x)}$.
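+
+		To make this setup concrete, here is a minimal sketch in Python with NumPy of how such noisy observations could be
+		simulated; the particular function $f$ and the noise level are made-up choices for illustration only.
+
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def f(x):
+    # hypothetical "true" experiment; in practice this function is unknown
+    return 2.0 * x + 1.0
+
+x = np.linspace(0.0, 10.0, 20)               # experiment variables x_i
+y_exact = f(x)                               # exact outcomes
+y = y_exact + rng.normal(0.0, 0.5, x.shape)  # measured outcomes with Gaussian noise
+\end{verbatim}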
 		
@@ -58,10 +60,11 @@
 		However, as one can see from the example above, such a function doesn't exist, as there is no line which goes through all points. We need to find the
 		best possible line, therefore we can define our error for each data point as being $f(x_i) - y_i$ and try to minimize this error. As we want
 		$f(x_i) = y_i$, we actually want to minimize the absolute value, so as to minimize the deviation between prediction and observation.
-		Therefore, we use the mean squared error.
+		Therefore, we use the mean squared error ($MSE$).
 		
 		\begin{align}
-			MSE((x_1, y_1), \dots, (x_n, y_n)) = \frac{1}{n}\sum_{i = 0}^{n} (f(x_i) - y_i)^2 = (a x_i + b - y_i)^2
+			MSE((x_1, y_1), \dots, (x_n, y_n)) &= \frac{1}{n}\sum_{i = 1}^{n} (f(x_i) - y_i)^2 \\
+				&= \frac{1}{n}\sum_{i = 1}^{n} (a x_i + b - y_i)^2
 		\end{align}
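+
+		As a quick sanity check, the following Python/NumPy sketch evaluates this error for a candidate line
+		$f(x) = a x + b$ on some made-up data points:
+
+\begin{verbatim}
+import numpy as np
+
+def mse(a, b, x, y):
+    # mean squared error of the line f(x) = a*x + b on the data (x_i, y_i)
+    residuals = a * x + b - y
+    return np.mean(residuals ** 2)
+
+x = np.array([0.0, 1.0, 2.0, 3.0])
+y = np.array([1.1, 2.9, 5.2, 6.8])  # made-up observations
+print(mse(2.0, 1.0, x, y))          # error of the candidate line f(x) = 2x + 1
+\end{verbatim}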
 		
 	
@@ -74,7 +77,7 @@
 		\end{align*}
 		
 		We are now going to redefine our problem in matrix notation, as the derivation of the optimal parameters $\omega_1,  \dots \omega_m$ will be much easier in this notation.
-		First, we observe we can represent $f(x_i)$ as just a vector multiplications, as
+		First, we observe that we can represent $f(x_i)$ as a single vector-vector multiplication:
 		
 		\begin{align*}
-			f(x_i) &= \omega_1 x_{i,1} + \omega_2 x_{i, 2} + \dots + \omega_m x{i, m} + b \\
+			f(x_i) &= \omega_1 x_{i,1} + \omega_2 x_{i, 2} + \dots + \omega_m x_{i, m} + b \\
@@ -113,14 +116,15 @@
 			\end{bmatrix}
 		\end{align*}
 		
-		We define our model parameters $\omega_1,  \dots \omega_m$ as a vector
+		We define our model parameters $\omega_1, \dots, \omega_m, b$ as a vector
 		
 		\begin{align*}
 			\omega &= \begin{bmatrix}
 				\omega_{1} \\
 				\omega_{2} \\
 				\vdots \\
-				\omega_{m}
+				\omega_{m} \\
+				b
 			\end{bmatrix}
 		\end{align*}		
 		
@@ -135,7 +139,9 @@
 			\end{bmatrix}
 		\end{align*}
 		
-		We then observe
+		Note that the previous sum of squared errors over the individual data points,
+		$\sum_{i = 1}^{n} (a x_i + b - y_i)^2$, is expressed in matrix notation as $(X \omega - y)^{T} (X \omega - y)$.
+		Let us now look at the mean squared error
 		
 		\begin{align}
 			MSE(X, y) &= \frac{1}{n}(X \omega - y)^{T} (X \omega - y) \\
@@ -152,14 +158,17 @@
 	
 		\subsection{Minimizing the mean squared error}
 		
-			We know the mean squared error is a quadratic function with strictly positive values. Therefore, it has only one extreme, which is it's minimum.
+			We know the mean squared error is a quadratic function with non-negative values. Therefore, it has only one extremum, which is its minimum.
 			Therefore, the optimal parameters $\omega$ are those for which $MSE$ is minimal. We can find those by finding the $\omega$ for which $\frac{\partial MSE(X, y)}{\partial \omega} = 0$.
 			
 			\begin{align*}
 				& \frac{\partial MSE(X, y)}{\partial \omega} &= & 0 \\
-					\iff & \frac{\partial n MSE(X, y)}{\partial \omega} &= & 0 \\
-					\iff & \frac{\partial \omega^T X^T X \omega - 2 \omega^T X^T y + y^Ty}{\partial} &= & 0 \\
-					\iff & \frac{\partial \omega^T X^T X \omega - 2 \omega^T X^T y}{\partial} &= & 0 \\
+					\iff & \frac{\partial n MSE(X, y)}{\partial \omega} &= & 0
+														\tag{multiplying by a constant factor does not change the roots} \\
+					\iff & \frac{\partial \omega^T X^T X \omega - 2 \omega^T X^T y + y^Ty}{\partial \omega} &= & 0
+														\tag{$\frac{1}{n} n = 1$} \\
+					\iff & \frac{\partial \omega^T X^T X \omega - 2 \omega^T X^T y}{\partial \omega} &= & 0
+														\tag{$\frac{\partial y^Ty}{\partial \omega} = 0$} \\
-					\iff & \frac{\partial \omega^T X^T X \omega}{\partial \omega} - \frac{2 \omega^T X^T y}{\partial} &= & 0 \\
-					\iff & \frac{\partial \omega^T X^T X \omega}{\partial \omega} &= & \frac{2 \omega^T X^T y}{\partial} \\
+					\iff & \frac{\partial \omega^T X^T X \omega}{\partial \omega} - \frac{\partial \, 2 \omega^T X^T y}{\partial \omega} &= & 0 \\
+					\iff & \frac{\partial \omega^T X^T X \omega}{\partial \omega} &= & \frac{\partial \, 2 \omega^T X^T y}{\partial \omega} \\
 			\end{align*}
@@ -198,31 +207,40 @@
 			A symmetric square matrix can be uniquely represented by a quadratic form as learned in Linear Algebra II \cite{laii}, so we could omit the calculation of
 			$x^T A x$. For the sake of completeness, we show this result anyways. 
 			
-			We observe for $A = 
+			Note that since $A$ is symmetric, we have $A = 
 			\begin{bmatrix}
-				a_{1,1} & \dots & a_{1, k} \\
-				\vdots & \ddots & \vdots \\
-				a_{k, 1} & \dots & a_{k, k}
+				a_{1,1} & a_{1,2} & \dots & a_{1, k} \\
+				a_{2,1} & a_{2,2} & \dots & a_{2, k} \\
+				\vdots & \vdots & \ddots & \vdots \\
+				a_{k, 1} & a_{k, 2} & \dots & a_{k, k}
 			\end{bmatrix} = 
 			\begin{bmatrix}
-				a_{1,1} & \dots & a_{1, k} \\
-				\vdots & \ddots & \vdots \\
-				a_{1, k} & \dots & a_{k, k}
+			a_{1,1} & a_{1,2} & \dots & a_{1, k} \\
+			a_{1,2} & a_{2,2} & \dots & a_{2, k} \\
+			\vdots & \vdots & \ddots & \vdots \\
+			a_{1, k} & a_{2, k} & \dots & a_{k, k}
 			\end{bmatrix}
 			$
 			
 			\begin{align*}
-				\frac{\partial x^T A X}{\partial x} &= \frac{\partial \sum_{i = 0}^{k} \sum_{j = 0}^{k} x_i x_j a_{i, j} } {\partial x} \\
-					&=  \frac{\partial \sum_{i = 0}^{k} x_i^2 a_{i, i} + \sum_{j = 0}^k\sum_{l \neq j} 2 x_j x_l a_{j, l} }{\partial x}  \tag{because $a_{i, j} = a_{j, i}$ as A is symmetric} \\
+				\frac{\partial x^T A x}{\partial x} &= \frac{\partial \sum_{i = 1}^{k} \sum_{j = 1}^{k} x_i x_j a_{i, j} } {\partial x} \\
+					&=  \frac{\partial \sum_{i = 1}^{k} x_i^2 a_{i, i} + \sum_{j = 1}^k\sum_{l \neq j} x_j x_l a_{j, l} }{\partial x} \\
+					&=  \frac{\partial \sum_{i = 1}^{k} x_i^2 a_{i, i} + \sum_{j = 1}^k\sum_{l > j} 2 x_j x_l a_{j, l} }{\partial x} 
+																					\tag{because $a_{i, j} = a_{j, i}$ as $A$ is symmetric} \\
 					&= \begin{bmatrix}
-						\frac{\partial \sum_{i = 0}^{k} x_i^2 a_{i, i} + \sum_{j = 0}^k\sum_{l \neq j} 2 x_j x_l a_{j, l} }{\partial x_1} \\
+						\frac{\partial \sum_{i = 1}^{k} x_i^2 a_{i, i} + \sum_{j = 1}^k\sum_{l > j} 2 x_j x_l a_{j, l} }{\partial x_1} \\
 						\vdots \\
-						\frac{\partial \sum_{i = 0}^{k} x_i^2 a_{i, i} + \sum_{j = 0}^k\sum_{l \neq j} 2 x_j x_l a_{j, l} }{\partial x_k}
+						\frac{\partial \sum_{i = 1}^{k} x_i^2 a_{i, i} + \sum_{j = 1}^k\sum_{l > j} 2 x_j x_l a_{j, l} }{\partial x_k}
 					\end{bmatrix} \\
 					&= \begin{bmatrix}
 						2 a_{1,1} x_1 + \dots + 2 a_{1, k} x_k \\
 						\vdots \\
-						2 a_{k,1} x_1 + \dots + 2 a_{k, k} x_k
+						2 a_{1, k} x_1 + \dots + 2 a_{k, k} x_k
+					\end{bmatrix} \\
+					&= \begin{bmatrix}
+					2 a_{1,1} x_1 + \dots + 2 a_{1, k} x_k \\
+					\vdots \\
+					2 a_{k,1} x_1 + \dots + 2 a_{k, k} x_k
 					\end{bmatrix} \\
 					&= 2Ax
 			\end{align*}
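+
+			This identity is easy to check numerically. The following Python/NumPy sketch (with a randomly generated
+			symmetric matrix $A$, chosen only for illustration) compares $2Ax$ with a finite-difference approximation
+			of the gradient of $x^T A x$:
+
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(1)
+k = 4
+B = rng.normal(size=(k, k))
+A = (B + B.T) / 2                      # symmetrize to obtain a symmetric A
+x = rng.normal(size=k)
+
+def q(v):
+    return v @ A @ v                   # quadratic form v^T A v
+
+eps = 1e-6
+grad_fd = np.array([(q(x + eps * e) - q(x - eps * e)) / (2 * eps)
+                    for e in np.eye(k)])
+print(np.allclose(grad_fd, 2 * A @ x, atol=1e-4))   # expected: True
+\end{verbatim}
+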
@@ -247,16 +265,23 @@
 			we are omitting this step here though as it is out of the scope of the current tutorial).
 	
 		\subsection{Inverting possibly singular matrix}
-		In order to invert a singular matrix, often the Moore-Penrose pseudoinverse is used \cite{moorepen}.
+		A singular matrix is a square matrix that is not invertible. We know a square matrix $A$ is singular if and only if $det(A) = 0$, which can happen for $X^TX$; this is
+		the problem we address in this section.
+		
+		
+		In order to invert a singular matrix, the Moore-Penrose pseudoinverse \cite{moorepen} is often used; however, computing it involves
+		matrix decompositions such as the singular value decomposition (SVD), which are beyond the scope of this tutorial. We are instead going to
+		focus on one special matrix here, $A + I\epsilon$ with $\epsilon > 0$, and explain why it is always invertible if $A$ is a positive semidefinite square matrix.
+		
 		As we know from "Lineare Algebra für Informatiker" \cite{lai}, the characteristic polynomial of  $A \in Mat_{n \times n}(\mathbb{R})$ is
-		$\chi_A(\lambda) = det(X - I \lambda)$. Further we know
+		$\chi_A(\lambda) = det(A - I \lambda)$. Further, we know
 		
 		\begin{align*}
 			& \chi_A(\lambda) = 0 \\
-			\iff & det(X - I \lambda) = 0 \\
-			\iff & Ker(X - I \lambda) \neq \emptyset \\
+			\iff & det(A - I \lambda) = 0 \\
+			\iff & Ker(A - I \lambda) \neq \{0\} \\
-			\iff & Eig(A, \lambda) \neq \emptyset \\
+			\iff & Eig(A, \lambda) \neq \{0\} \\
-			\iff & \lambda \text{ is an eigenvalue of } X
+			\iff & \lambda \text{ is an eigenvalue of } A
 		\end{align*}
 		
-		But further, as learned in Linear Algebra II \cite{laii}, $\forall x \in \mathbb{R}^k: xAx \geq 0 \iff (\forall \lambda_i: Eig(A, \lambda_i) \neq \emptyset \iff \lambda_i > 0)$.
+		But further, as learned in Linear Algebra II \cite{laii}, $A$ is positive semidefinite, i.e. $\forall x \in \mathbb{R}^k: x^T A x \geq 0$, if and only if every eigenvalue $\lambda_i$ of $A$ satisfies $\lambda_i \geq 0$.
@@ -264,9 +289,8 @@
 		
 		$\implies \forall \epsilon > 0: det(X^TX + I\epsilon) = det(X^TX - I(-\epsilon)) \neq 0$ \\
 		
+		Otherwise, $-\epsilon$, a negative number, would be an eigenvalue of $X^TX$, which would contradict the previous statements.
 		Therefore, we could use $(X^TX + I\epsilon)^{-1}$ instead of $(X^TX)^{-1}$ for some very small $\epsilon$.
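+
+		In code, one would typically solve the corresponding linear system rather than forming the inverse explicitly.
+		The following Python/NumPy sketch illustrates this; the generated data and the choice $\epsilon = 10^{-8}$ are
+		arbitrary examples, not a prescription.
+
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(2)
+n, m = 50, 3
+X_raw = rng.normal(size=(n, m))
+y = X_raw @ np.array([1.5, -2.0, 0.5]) + 3.0 + rng.normal(0.0, 0.1, n)
+
+X = np.hstack([X_raw, np.ones((n, 1))])   # append a column of ones for the bias b
+eps = 1e-8
+# solve (X^T X + eps*I) w = X^T y instead of inverting X^T X
+w = np.linalg.solve(X.T @ X + eps * np.eye(X.shape[1]), X.T @ y)
+print(w)                                  # approximately [1.5, -2.0, 0.5, 3.0]
+\end{verbatim}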
-		
-	\clearpage
 	
 	\section{Polynomial approximation}
 	
@@ -299,13 +323,15 @@
 		
 		This way, we can approximate a polynomial of degree $k$. Often it makes sense to increment the degree $k$ until one is happy with the mean squared error.
 		Otherwise, one might have a very precise model when tested on the data the model is supposed to fit, but a very inaccurate one when tested on unseen data. This
-		phenomenon is known as "overfitting" the data. Figure Figure \ref{fig:polapint} compares results of polynomial approximation and polynomial approximation.
+		phenomenon is known as "overfitting" the data. Figure \ref{fig:polapint} compares results of polynomial approximation and polynomial interpolation.
 		
 		\begin{figure}[h!]
 			\includegraphics[width=\textwidth]{./plots/2d_pol_ap_int.png}
 			\caption{Polynomial approximation VS interpolation}
 			\label{fig:polapint}
 		\end{figure}
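+
+		As an illustration of this procedure, the following Python/NumPy sketch fits a degree-$k$ polynomial by least
+		squares on the monomial features $x^k, \dots, x, 1$; the target function, the noise, and the degree are
+		arbitrary choices for demonstration.
+
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(3)
+x = np.linspace(-1.0, 1.0, 30)
+y = np.sin(3.0 * x) + rng.normal(0.0, 0.1, x.shape)  # made-up noisy data
+
+k = 5                                   # polynomial degree
+X = np.vander(x, k + 1)                 # columns x^k, ..., x, 1
+w, *_ = np.linalg.lstsq(X, y, rcond=None)
+
+print(k, np.mean((X @ w - y) ** 2))     # degree and resulting mean squared error
+\end{verbatim}
+
+		Increasing $k$ lowers this error on the fitted data; checking the error on held-out points is a simple way to
+		detect the overfitting described above.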
+	
+		\clearpage
 		
 		
 	\section*{Classification}
@@ -313,38 +339,50 @@
 		One could also use linear regression to solve a classification problem. For this example, we are going to distinguish between hand-written digits
 		using the famous MNIST \cite{mnist} data set. \\
 		
-		The data set consist of $28 \times 28$ images. Our algorithm needs to predict the digit $y_i \in {0, \dots, 9}$ given an image $\hat{x_i} = \begin{bmatrix*}
-			x_{i, 1, 1} & \dots & x_{i, 1, 28} \\
+		The data set consists of $28 \times 28$ pixel images. Our algorithm needs to predict the digit $y_i \in \{0, \dots, 9\}$ given an image $\widetilde{x_i} = \begin{bmatrix*}
+			p_{i, 1, 1} & \dots & p_{i, 1, 28} \\
 			\vdots & \ddots & \vdots \\
-			x_{i, 28, 1} & \dots & x_{i, 28, 28}
-		\end{bmatrix*}$
+			p_{i, 28, 1} & \dots & p_{i, 28, 28}
+		\end{bmatrix*} \in Mat_{28 \times 28}([0, 1]) $, where $ [0, 1] \subset \mathbb{R}$ is the range of the pixel intensities
 		
-		First, we are transforming each image into a $28 \times 28$ - dimensional row vector
+		First, we transform each image into a $28 \cdot 28 = 784$-dimensional row vector
 		
 		\begin{align*}
-				\hat{x_i} = [x_{i, 1, 1}, \dots, x_{i, 1, 28},x_{i,2,1}, \dots, x_{i,2,28}, \dots, x_{i, 28, 28}]
+				\hat{x_i} = [p_{i, 1, 1}, \dots, p_{i, 1, 28}, p_{i,2,1}, \dots, p_{i,2,28}, \dots, p_{i, 28, 28}]
 		\end{align*}
 		
 		Then, similarly to previous sections, we define
 		
 		\begin{align*}
-			X = \begin{bmatrix}
-				\hat{x_1} & 1 \\
-				\vdots & \vdots \\
-				\hat{x_n} \vdots & 1 \\
-			\end{bmatrix}
+			X = 
+				\begin{bmatrix}
+					\hat{x_1} & 1 \\
+					\vdots & \vdots \\
+					\hat{x_n} & 1 \\
+				\end{bmatrix}
+			  =
+			  	\begin{bmatrix}
+				  	x_1 \\
+				  	\vdots \\
+				  	x_n \\
+			  	\end{bmatrix}
 		\end{align*}
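+
+		Assuming the images are available as a NumPy array of shape $(n, 28, 28)$ (how the MNIST data is loaded is
+		left open here), building $X$ amounts to a reshape followed by appending the column of ones, for example:
+
+\begin{verbatim}
+import numpy as np
+
+def build_design_matrix(images):
+    # images: array of shape (n, 28, 28) with pixel values in [0, 1]
+    n = images.shape[0]
+    flat = images.reshape(n, 28 * 28)          # one row vector per image
+    return np.hstack([flat, np.ones((n, 1))])  # append the bias column
+\end{verbatim}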
 		
 			\subsection{Distinguishing between two digits}
 			
-			In order to distinguish between digits i and j, we only take the corresponding $X_i, X_j$ such that $\forall x_k \in X_k: f(x_k) = k$.
+			In order to distinguish between two digits $i$ and $j$, we only take the corresponding $X_i, X_j$ with
+			
+			\begin{align*}
+				X_k := \{ x_i \in X \mid y_i = k \}
+			\end{align*}
+			
 			Then, we train a binary classifier, which distinguishes between i and j. \\
 			
 			We do this by training a linear regression to predict, for $x_k \in X_i \cup X_j$
 			
 			\begin{align*}
 				f(x_k) = \begin{cases}
-					1 				& k = i \\
+					1 				& y_k = i \\
 					-1              & \text{otherwise}
 				\end{cases}
 			\end{align*}
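+
+			Putting these pieces together, one possible sketch of such a binary classifier in Python/NumPy is shown
+			below; the arguments \texttt{X} and \texttt{labels} stand for the flattened images and their digit labels,
+			and the regularization $\epsilon$ is the one discussed earlier.
+
+\begin{verbatim}
+import numpy as np
+
+def train_pair_classifier(X, labels, i, j, eps=1e-8):
+    # keep only the rows whose label is i or j
+    mask = (labels == i) | (labels == j)
+    X_ij = X[mask]
+    y_ij = np.where(labels[mask] == i, 1.0, -1.0)   # targets +1 / -1
+    # least-squares fit via the regularized normal equations
+    w = np.linalg.solve(X_ij.T @ X_ij + eps * np.eye(X_ij.shape[1]),
+                        X_ij.T @ y_ij)
+    return w
+
+def predict_pair(w, X, i, j):
+    # the sign of the regression output decides between the two digits
+    return np.where(X @ w >= 0.0, i, j)
+\end{verbatim}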
diff --git a/script/res/titlepage.tex b/script/res/titlepage.tex
index 9475cf43768c92c170ea9a154ddf7a06efff666e..61c446dda42ff4878960cd57114271b7e6616a40 100644
--- a/script/res/titlepage.tex
+++ b/script/res/titlepage.tex
@@ -13,7 +13,7 @@
 	\end{abstract}
 
 
-	\begin{textblock}{100}(95,7)
+	\begin{textblock}{120}(60.5,10)
 		\makebox[\dimexpr\textwidth+1cm][r]{
 			\includegraphics[width=0.4\textwidth]{./res/fu_logo.eps}
 		}