Polynomial Fit with RANSAC

Polynomial Fit with RANSAC#

Notebook by:

Royi Avital RoyiAvital@fixelalgorithms.com

Revision History#

Version	Date	User	Content / Changes
1.0.000	24/03/2024	Royi Avital	First version

# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.linear_model import HuberRegressor, LinearRegression ,RANSACRegressor

# Miscellaneous
import math
import os
from platform import python_version
import random
import timeit

# Typing
from typing import Callable, Dict, List, Optional, Set, Tuple, Union

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter
from IPython import get_ipython
from IPython.display import Image
from IPython.display import display
from ipywidgets import Dropdown, FloatSlider, interact, IntSlider, Layout, SelectionSlider
from ipywidgets import interact

Notations#

(?) Question to answer interactively.
(!) Simple task to add code for the notebook.
(@) Optional / Extra self practice.
(#) Note / Useful resource / Food for thought.

Code Notations:

someVar    = 2; #<! Notation for a variable
vVector    = np.random.rand(4) #<! Notation for 1D array
mMatrix    = np.random.rand(4, 3) #<! Notation for 2D array
tTensor    = np.random.rand(4, 3, 2, 3) #<! Notation for nD array (Tensor)
tuTuple    = (1, 2, 3) #<! Notation for a tuple
lList      = [1, 2, 3] #<! Notation for a list
dDict      = {1: 3, 2: 2, 3: 1} #<! Notation for a dictionary
oObj       = MyClass() #<! Notation for an object
dfData     = pd.DataFrame() #<! Notation for a data frame
dsData     = pd.Series() #<! Notation for a series
hObj       = plt.Axes() #<! Notation for an object / handler / function handler

Code Exercise#

Single line fill

vallToFill = ???

Multi Line to Fill (At least one)

# You need to start writing
????

Section to Fill

#===========================Fill This===========================#
# 1. Explanation about what to do.
# !! Remarks to follow / take under consideration.
mX = ???

???
#===============================================================#

# Configuration
# %matplotlib inline

seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

# Matplotlib default color palette
lMatPltLibclr = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# sns.set_theme() #>! Apply SeaBorn theme

runInGoogleColab = 'google.colab' in str(get_ipython())

# Constants

FIG_SIZE_DEF    = (8, 8)
ELM_SIZE_DEF    = 50
CLASS_COLOR     = ('b', 'r')
EDGE_COLOR      = 'k'
MARKER_SIZE_DEF = 10
LINE_WIDTH_DEF  = 2

# Courses Packages

import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')
from utils.DataVisualization import PlotRegressionData

# General Auxiliary Functions

def GenOutlierData( vR: np.ndarray, vO: np.ndarray, vI: np.ndarray, numOutliers: int ) -> np.ndarray:

    vY = vR.copy()
    vY[vI[:numOutliers]] = vO[vI[:numOutliers]]

    return vY

def PlotPolyFit( vX: np.ndarray, vR: np.ndarray, vO: np.ndarray, vI: np.ndarray, numOutliers: int, P: int, minSamplesRatio: float, 
                numGridPts: int = 1001, hA: Optional[plt.Axes] = None, figSize: Tuple[int, int] = FIG_SIZE_DEF, markerSize: int = MARKER_SIZE_DEF, 
                lineWidth: int = LINE_WIDTH_DEF, axisTitle: Optional[str] = None ) -> None:

    if hA is None:
        hF, hA = plt.subplots(1, 1, figsize = figSize)
    else:
        hF = hA[0].get_figure()

    numSamples = len(vX)

    mX = vX[:, np.newaxis] ** range(P + 1)
    vY = GenOutlierData(vR, vO, vI, numOutliers)

    oLinReg = LinearRegression(fit_intercept = False).fit(mX, vY)
    oRanReg = RANSACRegressor(estimator = LinearRegression(fit_intercept = False), min_samples = minSamplesRatio).fit(mX, vY)
    oHubReg = HuberRegressor(fit_intercept = False).fit(mX, vY)

    #TODO: Add calculation of the R2 only on the inliers

    # Plot
    xx  = np.linspace(np.floor(np.min(vX)), np.ceil(np.max(vX)), numGridPts)
    yy1 = np.polyval(oLinReg.coef_[::-1], xx)
    yy2 = np.polyval(oRanReg.estimator_.coef_[::-1], xx)
    yy3 = np.polyval(oHubReg.coef_[::-1], xx)

    hA.plot(vX, vY, '.r', ms = markerSize, label = '$y_i$')
    hA.plot(vX[vI[:numOutliers]], vY[vI[:numOutliers]], '.y', ms = markerSize, label = 'Outliers')
    hA.plot(xx, yy1, 'g', lw = 2,  label = f'OLS, R2 = {oLinReg.score(mX, vY)}')
    hA.plot(xx, yy2, 'b', lw = 2,  label = f'RANSAC, R2 = {oRanReg.estimator_.score(mX, vY)}')
    hA.plot(xx, yy3, 'm', lw = 2,  label = f'Huber, R2 = {oHubReg.score(mX, vY)}')
    hA.set_title (f'OLS vs. RANSAC OLS vs. Huber Loss')
    hA.set_xlabel('$x$')
    hA.set_ylabel('$y$')
    hA.grid()
    hA.legend()

Robust Regression#

The concept of Robust Regression is to estimate the model parameters while minimizing the effect of the outliers.
It is usually achieved by 2 approaches:

Robust Objective
Using an outlier robust objective function. Such as the \({L}^{1}\) or Huber Loss.
Filtration
Marking the outliers and applying the objective on the inliers.

(#) RANSAC is a filtration approach which iteratively finds the inliers of the data.

# Parameters

# Data Generation
numSamples  = 150
noiseStd    = 0.3
numOutliers = 30
fctOutliers = 5

vP = np.array([1, 2])

# Data Visualization
gridNoiseStd    = 0.05
numGridPts      = 500

Generate / Load Data#

\[ y_{i} = f \left( x_{i} \right) + \epsilon_{i} \]

Where

\[ f \left( x \right) = \sum_{i = 0}^{N - 1} {p}_{i} {x}^{N - 1 - i} \]

# The Data Generating Function

def f( vX: np.ndarray, vP: np.ndarray ) -> np.ndarray:
    
    return np.polyval(vP, vX)

hF = lambda vX: f(vX, vP)

# Generate Data

vX = np.linspace(-2, 2, numSamples, endpoint = True) + (gridNoiseStd * np.random.randn(numSamples))
vN = noiseStd * np.random.randn(numSamples)
vR = f(vX, vP) + vN
vO = fctOutliers * vR
vI = np.random.permutation(numSamples)
vY = GenOutlierData(vR, vO, vI, numOutliers)

print(f'The features data shape: {vX.shape}')
print(f'The labels data shape: {vY.shape}')

The features data shape: (150,)
The labels data shape: (150,)

Plot Data#

# Plot the Data

PlotRegressionData(vX, vY)

plt.show()

../../../../_images/7fea80f88a1a352d6e671f55b37a150f0596e1c89ec297c673b56a8a036d1a9e.png

Train a Robust Polyfit Regressor#

The Linear Regressor optimization problem is given by:

\[ \arg \min_{\boldsymbol{w}} \frac{1}{2} {\left\| X \boldsymbol{w} - \boldsymbol{y} \right\|}_{2}^{2} \]

Where in Polynomial model:

\[\begin{split} \boldsymbol{X} = \begin{bmatrix} 1 & x_{1} & x_{1}^{2} & \cdots & x_{1}^{p} \\ 1 & x_{2} & x_{2}^{2} & \cdots & x_{2}^{p} \\ \vdots & \vdots & \vdots & & \vdots \\ 1 & x_{N} & x_{N}^{2} & \cdots & x_{N}^{p} \end{bmatrix} \end{split}\]

There are 2 robust model to train:

Huber Loss
Being based on a Robust Loss.
Implemented by HuberRegressor.
RANSAC Filtration of outliers by maximizing the number of inliers. Implemented by RANSACRegressor.
The model estimator parameter supports any regressor with the fit(), predict() and score() methods.

(#) Linear model can work with any features with transformation (Polynomial or not).
(#) In practice the RANSAC method can be used for any model, Linear or Non Linear.

# Interactive Plot

hPolyFit = lambda numOutliers, minSamplesRatio: PlotPolyFit(vX, vR, vO, vI, numOutliers, len(vP) - 1, minSamplesRatio)
numOutliersSlider = IntSlider(min = 0, max = numSamples, step = 1, value = 0, layout = Layout(width = '30%'))
minSamplesRatioSlider = FloatSlider(min = 0.01, max = 1.0, step = 0.01, value = 0.5, layout = Layout(width = '30%'))
interact(hPolyFit, numOutliers = numOutliersSlider, minSamplesRatio = minSamplesRatioSlider)
plt.show()

(?) Does the R2 score reflects the performance? What can be done?
(#) Once we have the model we may even use it to identify the outliers (Outlier / Anomaly Detection).

R2 score examples: