Grid search pipeline#
see also https://en.wikipedia.org/wiki/Hyperparameter_optimization
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
load#
# Load the Iris dataset into a feature matrix X and a label vector y
iris = load_iris()
X = iris.data
y = iris.target
# Hold out 30% of the samples as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
define pipeline#
# Chain three stages: feature standardization, PCA projection, and a
# logistic-regression classifier. The step names ('scaler', 'pca',
# 'logistic_regression') are the prefixes used for grid-search parameters.
steps = [
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("logistic_regression", LogisticRegression()),
]
pipeline = Pipeline(steps)
GridSearchCV#
Define a parameter grid that explores different combinations of PCA components and regularization strengths (C values) for the logistic regression step.
# Search space, keyed with the Pipeline '<step>__<param>' convention:
# three PCA dimensionalities and four logarithmically spaced C values.
param_grid = {
    "pca__n_components": [2, 3, 4],
    "logistic_regression__C": np.logspace(-4, 4, 4),
}
# Exhaustive search over param_grid with 5-fold cross-validation,
# parallelized across all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
Fit GridSearchCV#
# Fit GridSearchCV: evaluates every parameter combination in param_grid
# with 5-fold cross-validation on the training split (3 x 4 = 12 candidates).
grid_search.fit(X_train, y_train)
Fitting 5 folds for each of 12 candidates, totalling 60 fits
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('pca', PCA()),
('logistic_regression',
LogisticRegression())]),
n_jobs=-1,
param_grid={'logistic_regression__C': array([1.00000000e-04, 4.64158883e-02, 2.15443469e+01, 1.00000000e+04]),
'pca__n_components': [2, 3, 4]},
verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('pca', PCA()),
('logistic_regression',
LogisticRegression())]),
n_jobs=-1,
param_grid={'logistic_regression__C': array([1.00000000e-04, 4.64158883e-02, 2.15443469e+01, 1.00000000e+04]),
'pca__n_components': [2, 3, 4]},
verbose=1)Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA()),
('logistic_regression', LogisticRegression())])StandardScaler()
PCA()
LogisticRegression()
best model#
# Report the winning hyperparameter combination and its cross-validated score
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print("Best parameters:", best_params)
print("Best cross-validation score:", best_cv_score)
# Score the selected model on the held-out test split
test_score = grid_search.score(X_test, y_test)
print("Test set score:", test_score)
Best parameters: {'logistic_regression__C': 10000.0, 'pca__n_components': 3}
Best cross-validation score: 0.9523809523809523
Test set score: 0.9777777777777777