ML Pipelines Using the Python scikit-learn (sklearn) Package
Filtering warnings
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
iris_df = load_iris()  # Load the built-in Iris dataset bundled with scikit-learn
iris_df.data
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       ...,
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])
(150 samples x 4 features; full array abridged)
X_train, X_test, y_train, y_test = train_test_split(iris_df.data, iris_df.target, test_size=0.3, random_state=0)
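A note in passing: train_test_split also accepts a stratify argument, which keeps the class proportions identical in the train and test sets. A minimal variant (not used in the rest of this walkthrough):

# Optional variant: a stratified split keeps class ratios equal in both sets
X_tr, X_te, y_tr, y_te = train_test_split(
    iris_df.data, iris_df.target, test_size=0.3, random_state=0,
    stratify=iris_df.target)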
Pipeline Creation
1. Preprocess the data using StandardScaler
2. Reduce dimensionality using PCA
3. Apply a classifier
LogisticRegressionPipeline = Pipeline([('scaler1', StandardScaler()),
                                       ('pca1', PCA(n_components=2)),
                                       ('lr_classifier', LogisticRegression(random_state=0))])
DecisionTreePipeline = Pipeline([('scaler2', StandardScaler()),
                                 ('pca2', PCA(n_components=2)),
                                 ('dt_classifier', DecisionTreeClassifier())])
RandomForestPipeline = Pipeline([('scaler3', StandardScaler()),
                                 ('pca3', PCA(n_components=2)),
                                 ('rf_classifier', RandomForestClassifier())])
Let's make a list of the pipelines
pipelines = [LogisticRegressionPipeline, DecisionTreePipeline, RandomForestPipeline]
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""
Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}
Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

for i, model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i], model.score(X_test, y_test)))
Logistic Regression Test Accuracy: 0.8666666666666667
Decision Tree Test Accuracy: 0.9111111111111111
RandomForest Test Accuracy: 0.9111111111111111
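Since Decision Tree and RandomForest tie on this single split, a sturdier comparison is cross-validation on the training set. A minimal sketch:

from sklearn.model_selection import cross_val_score

# Compare pipelines with 5-fold cross-validation instead of a single split
for i, model in enumerate(pipelines):
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("{} CV Accuracy: {:.3f} (+/- {:.3f})".format(pipe_dict[i], scores.mean(), scores.std()))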
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test) > best_accuracy:
        best_accuracy = model.score(X_test, y_test)
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy: {}'.format(pipe_dict[best_classifier]))
Classifier with best accuracy: Decision Tree
Note that Decision Tree and RandomForest tie on test accuracy here; the strict > comparison keeps the first of the tied models.
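Once a best pipeline is chosen, it can be persisted as a single object (scaler, PCA, and classifier together), so the exact preprocessing travels with the model. The file name below is just an illustration:

import joblib

# Save the entire fitted pipeline (preprocessing + model) to disk
joblib.dump(best_pipeline, 'best_pipeline.joblib')  # hypothetical file name
# loaded = joblib.load('best_pipeline.joblib')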
Hyperparameter Tuning of Pipelines Using GridSearchCV
from sklearn.model_selection import GridSearchCV
import numpy as np
Create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])
Create a list of candidate learning algorithms and their hyperparameter grids
grid_param = [
    {"classifier": [LogisticRegression()],
     "classifier__penalty": ['l2', 'l1'],
     "classifier__solver": ['liblinear'],  # liblinear supports both L1 and L2 penalties
     "classifier__C": np.logspace(0, 4, 10)
     },
    {"classifier": [LogisticRegression()],
     "classifier__penalty": ['l2'],
     "classifier__C": np.logspace(0, 4, 10),
     "classifier__solver": ['newton-cg', 'saga', 'sag', 'liblinear']  # not all of these solvers support the L1 penalty, so only L2 is used here
     },
    {"classifier": [RandomForestClassifier()],
     "classifier__n_estimators": [10, 100, 1000],
     "classifier__max_depth": [5, 8, 15, 25, 30, None],
     "classifier__min_samples_leaf": [1, 2, 5, 10, 15, 100],
     "classifier__max_leaf_nodes": [2, 5, 10]}]
Create a grid search over the pipeline to find the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)
best_model = gridsearch.fit(X_train, y_train)  # Fit the grid search
print(best_model.best_estimator_)
print("The mean accuracy of the model is:", best_model.score(X_test, y_test))
Pipeline(steps=[('classifier', LogisticRegression(solver='saga'))])
The mean accuracy of the model is: 0.9555555555555556
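GridSearchCV also exposes the winning hyperparameter combination and its mean cross-validated score directly:

# The parameter combination that won the search, and its mean CV accuracy
print(best_model.best_params_)
print(best_model.best_score_)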
Making Pipelines with make_pipeline in scikit-learn
from sklearn.pipeline import make_pipeline
Create a pipeline
pipe = make_pipeline(RandomForestClassifier())
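Unlike Pipeline, make_pipeline names each step automatically after the lowercased class name, which is why the parameter grid below uses the prefix randomforestclassifier__. You can confirm the generated names:

# make_pipeline auto-names steps after the lowercased class name
print(pipe.steps)
# [('randomforestclassifier', RandomForestClassifier())]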
Create a list of candidate learning algorithms and their hyperparameter grids
grid_param = [
    {"randomforestclassifier": [RandomForestClassifier()],
     "randomforestclassifier__n_estimators": [10, 100, 1000],
     "randomforestclassifier__max_depth": [5, 8, 15, 25, 30, None],
     "randomforestclassifier__min_samples_leaf": [1, 2, 5, 10, 15, 100],
     "randomforestclassifier__max_leaf_nodes": [2, 5, 10]}]
Create a grid search over the pipeline to find the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)
best_model = gridsearch.fit(X_train, y_train)  # Fit the grid search
best_model.score(X_test,y_test)
0.9777777777777777
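Beyond a single accuracy number, a per-class breakdown on the held-out test set can be pulled with classification_report. A short sketch:

from sklearn.metrics import classification_report

# Per-class precision, recall, and F1 on the test set
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=iris_df.target_names))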
Predict Using Best Model
Predict the target vector (here on the training data for illustration; X_test or new samples would be passed the same way)
best_model.predict(X_train)
array([1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1,
       2, 0, 0, 2, 1, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0, 0,
       2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 2, 1, 2, 1, 0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1,
       2, 0, 1, 2, 2, 0, 1, 1, 2, 1, 0, 0, 0, 2, 1, 2, 0])
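To see where the remaining errors fall, a confusion matrix on the test set (not shown in the original) works directly with the fitted grid search object:

from sklearn.metrics import confusion_matrix

# Rows = true class, columns = predicted class
print(confusion_matrix(y_test, best_model.predict(X_test)))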