Manually performing a grid search across different models in scikit-learn is a bit tedious: we usually end up gluing together separate pieces of code for each model. In this post, I will share two solutions that I found useful for automating this process in two different cases: running a separate GridSearchCV per model and comparing the results in a single report, and treating the classifier itself as just another hyperparameter inside one pipeline.
The idea in the first case is pretty simple: we pass two dictionaries to a helper class, the models and the parameters; then we call the fit() method, wait until everything runs, and finally call the score_summary() method to get a nice DataFrame with the report for each model instance, according to the parameters.
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV


class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        # Run one GridSearchCV per model and keep the fitted searches around.
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            # Collect the per-fold test scores for every parameter combination.
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]
                scores.append(r.reshape(len(params), 1))
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns]
The code above defines the helper class; now you need to pass it a dictionary of models and a dictionary of parameters for each of the models.
from sklearn import datasets
breast_cancer = datasets.load_breast_cancer()
X_cancer = breast_cancer.data
y_cancer = breast_cancer.target
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

params1 = {
    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
    'RandomForestClassifier': {'n_estimators': [16, 32]},
    'AdaBoostClassifier': {'n_estimators': [16, 32]},
    'GradientBoostingClassifier': {'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0]},
    'SVC': [
        {'kernel': ['linear'], 'C': [1, 10]},
        {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
    ]
}
We create an EstimatorSelectionHelper by passing the models and the parameters, and then call the fit() method, which has a signature similar to that of the original GridSearchCV object.
helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_cancer, y_cancer, scoring='f1', n_jobs=2)
Running GridSearchCV for ExtraTreesClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for GradientBoostingClassifier.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for SVC.
Fitting 3 folds for each of 6 candidates, totalling 18 fits
After the experiments have run, we can inspect the results of each model and each set of parameters by calling the score_summary method.
helper1.score_summary(sort_by='max_score')
| | estimator | min_score | mean_score | max_score | std_score | C | gamma | kernel | learning_rate | n_estimators |
|---|---|---|---|---|---|---|---|---|---|---|
| 5 | AdaBoostClassifier | 0.962343 | 0.974907 | 0.991667 | 0.0123335 | NaN | NaN | NaN | NaN | 32 |
| 1 | ExtraTreesClassifier | 0.966387 | 0.973627 | 0.987552 | 0.00984908 | NaN | NaN | NaN | NaN | 32 |
| 4 | AdaBoostClassifier | 0.95279 | 0.966463 | 0.983333 | 0.0126727 | NaN | NaN | NaN | NaN | 16 |
| 3 | RandomForestClassifier | 0.958678 | 0.966758 | 0.979253 | 0.00896123 | NaN | NaN | NaN | NaN | 32 |
| 6 | GradientBoostingClassifier | 0.917031 | 0.947595 | 0.979253 | 0.025414 | NaN | NaN | NaN | 0.8 | 16 |
| 9 | GradientBoostingClassifier | 0.950413 | 0.962373 | 0.979079 | 0.0121747 | NaN | NaN | NaN | 1 | 32 |
| 7 | GradientBoostingClassifier | 0.95279 | 0.966317 | 0.975207 | 0.00972142 | NaN | NaN | NaN | 0.8 | 32 |
| 8 | GradientBoostingClassifier | 0.950413 | 0.962548 | 0.975207 | 0.0101286 | NaN | NaN | NaN | 1 | 16 |
| 10 | SVC | 0.95122 | 0.961108 | 0.975207 | 0.0102354 | 1 | NaN | linear | NaN | NaN |
| 2 | RandomForestClassifier | 0.953191 | 0.960593 | 0.975 | 0.0101888 | NaN | NaN | NaN | NaN | 16 |
| 0 | ExtraTreesClassifier | 0.958678 | 0.96666 | 0.974359 | 0.00640498 | NaN | NaN | NaN | NaN | 16 |
| 11 | SVC | 0.961373 | 0.963747 | 0.967213 | 0.00250593 | 10 | NaN | linear | NaN | NaN |
| 15 | SVC | 0.935484 | 0.945366 | 0.955466 | 0.00815896 | 10 | 0.0001 | rbf | NaN | NaN |
| 13 | SVC | 0.934959 | 0.946564 | 0.954733 | 0.00843008 | 1 | 0.0001 | rbf | NaN | NaN |
| 12 | SVC | 0.926407 | 0.936624 | 0.94958 | 0.00965657 | 1 | 0.001 | rbf | NaN | NaN |
| 14 | SVC | 0.918455 | 0.929334 | 0.940678 | 0.00907845 | 10 | 0.001 | rbf | NaN | NaN |
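If you only care about the single best configuration per model, a short follow-up can reduce this table further. The snippet below is a sketch, assuming the DataFrame returned by score_summary() above and sorting by mean_score instead of max_score:

summary = helper1.score_summary(sort_by='mean_score')
# Keep the best-scoring row for each estimator (rows are already sorted,
# so the first row per group is the best one).
best_per_model = (summary.sort_values('mean_score', ascending=False)
                         .groupby('estimator').head(1))
print(best_per_model[['estimator', 'mean_score', 'std_score']])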
The second approach uses a small wrapper estimator so that a single GridSearchCV can optimize over any classifier and, for each classifier, over any setting of its parameters.
from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier


class ClfSwitcher(BaseEstimator):

    def __init__(self, estimator=SGDClassifier()):
        """
        A custom BaseEstimator that can switch between classifiers.

        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
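Because ClfSwitcher exposes the wrapped classifier as a constructor parameter, scikit-learn's get_params/set_params and cloning machinery treat it like any other hyperparameter. As a quick sanity check (a sketch only, reusing the breast-cancer data loaded earlier), you can swap the wrapped estimator without changing any other code:

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Illustrative only: the same ClfSwitcher wrapper, two different classifiers.
for est in (SGDClassifier(), LogisticRegression(max_iter=1000)):
    clf = ClfSwitcher(estimator=est)
    print(est.__class__.__name__, cross_val_score(clf, X_cancer, y_cancer, cv=3).mean())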
Now we can pass anything in for the estimator parameter, and we can optimize any parameter of whatever estimator we pass in, as follows:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', ClfSwitcher()),
])

parameters = [
    {
        'clf__estimator': [SGDClassifier()],  # SVM if hinge loss / logreg if log loss
        'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        'tfidf__stop_words': ['english', None],
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
    },
    {
        'clf__estimator': [MultinomialNB()],
        'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        'tfidf__stop_words': [None],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
]

gscv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=12, return_train_score=False, verbose=3)
# train_data / train_labels stand for your own text corpus and its labels.
gscv.fit(train_data, train_labels)
You can then pretty-print the grid search results using the following script:
import pandas as pd
pd.DataFrame(gscv.cv_results_)
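Since refit is left at its default (True) here, the fitted GridSearchCV object also exposes the winning configuration directly, for example:

print(gscv.best_score_)              # best mean cross-validated score
print(gscv.best_params_)             # winning combination, including which estimator was chosen
best_pipeline = gscv.best_estimator_ # pipeline refitted on the full training data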
Resources:
https://www.davidsbatista.net/blog/2018/02/23/model_optimization/
https://stackoverflow.com/questions/50285973/pipeline-multiple-classifiers