ML – Pipeline with Logistic Regression, Column Transformers and Feature Selection

ML Pipeline with Logistic Regression, Column Transformers and Feature Selection

import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the churn dataset. The raw string keeps the Windows-style backslashes
# literal, so "\M" and "\c" are not parsed as (invalid) escape sequences.
df = pd.read_csv(r'Documents\Machine Learning\churn_data.csv')
df.head()

# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        10 non-null     object 
 1   Tenure            10 non-null     int64  
 2   PhoneService      10 non-null     object 
 3   Contract          10 non-null     object 
 4   PaperlessBilling  10 non-null     object 
 5   PaymentMethod     10 non-null     object 
 6   MonthlyCharges    10 non-null     float64
 7   TotalCharges      10 non-null     float64
 8   Churn             10 non-null     object 
dtypes: float64(2), int64(1), object(6)
memory usage: 848.0+ bytes

Convert the Churn column from 'Yes'/'No' strings to the integers 1/0 (the resulting dtype is int64, not float64)

# Encode the target: 'Yes' -> 1 and 'No' -> 0; any other value is left as is.
df['Churn'] = df['Churn'].replace({'Yes': 1, 'No': 0})
df.head()

df.columns

Index(['CustomerID', 'Tenure', 'PhoneService', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

# Predictor columns: everything except CustomerID, TotalCharges and the
# target column Churn.
col = ['Tenure', 'PhoneService', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges']
X = df[col]
y = df['Churn']

# Neither test_size nor random_state is given, so scikit-learn holds out its
# default 25% of the rows and the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train

X_test

y_train

7    1
4    1
0    0
3    0
6    0
2    1
5    0
Name: Churn, dtype: int64

# Target labels of the held-out test rows.
y_test

9    1
8    0
1    0
Name: Churn, dtype: int64

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Building blocks: a scaler for the numeric columns, a one-hot encoder for the
# categorical (object-dtype) columns, and the classifier.
scale = StandardScaler()
ohe = OneHotEncoder()
logreg = LogisticRegression()

# One-hot encode the four categorical columns and standardise the two numeric
# ones. These six columns are all of X, so remainder='passthrough' has nothing
# left to pass through here.
ct = make_column_transformer(
    (ohe, ['PhoneService', 'Contract', 'PaperlessBilling', 'PaymentMethod']),
    (scale, ['Tenure', 'MonthlyCharges']),
    remainder='passthrough',
)

# NOTE(review): this selector is created but never passed to make_pipeline
# below, so the fitted pipeline performs no feature selection.
feature_selection = SelectKBest(k=6)

pipe = make_pipeline(ct, logreg)
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['PhoneService', 'Contract',
                                                   'PaperlessBilling',
                                                   'PaymentMethod']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['Tenure',
                                                   'MonthlyCharges'])])),
                ('logisticregression', LogisticRegression())])

from sklearn import set_config

# Show estimators as an HTML diagram (in notebooks) instead of a text repr.
set_config(display='diagram')
pipe