ML Pipeline with Logistic Regression, Column Transformers and Feature Selection
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Load the churn dataset into a DataFrame, then preview the first rows and
# the column summary. The path is a raw string so the Windows backslashes
# are taken literally instead of being parsed as escape sequences.
df = pd.read_csv(r'Documents\Machine Learning\churn_data.csv')
df.head()
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10 entries, 0 to 9 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 10 non-null object 1 Tenure 10 non-null int64 2 PhoneService 10 non-null object 3 Contract 10 non-null object 4 PaperlessBilling 10 non-null object 5 PaymentMethod 10 non-null object 6 MonthlyCharges 10 non-null float64 7 TotalCharges 10 non-null float64 8 Churn 10 non-null object dtypes: float64(2), int64(1), object(6) memory usage: 848.0+ bytes
Convert the Churn column from 'Yes'/'No' strings to integers (1/0); the resulting dtype is int64.
# Encode the target: 'Yes' -> 1, 'No' -> 0.
# Series.map is used instead of replace so the result is a numeric column
# without relying on replace's implicit object-to-int downcast (deprecated
# in recent pandas). Any value other than 'Yes'/'No' would become NaN.
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
df.head()
df.columns
Index(['CustomerID', 'Tenure', 'PhoneService', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'], dtype='object')
# Feature columns used for modelling. CustomerID and TotalCharges are not
# part of this list, and Churn is the target.
col = ['Tenure', 'PhoneService', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges']
X = df[col]
y = df['Churn']

# Split into train and test sets using train_test_split's defaults.
# NOTE(review): no random_state is passed, so the split (and every output
# derived from it) will differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train
X_test
y_train
7 1 4 1 0 0 3 0 6 0 2 1 5 0 Name: Churn, dtype: int64
# Display the target values of the test split.
y_test
9 1 8 0 1 0 Name: Churn, dtype: int64
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
# Building blocks: a scaler for the numeric columns, a one-hot encoder for
# the categorical columns, and the logistic-regression classifier.
scale = StandardScaler()
ohe = OneHotEncoder()
logreg = LogisticRegression()

# Route each group of columns to its transformer. Every column of X is
# named in one of the two lists, so remainder='passthrough' only matters
# if further columns are added to X later.
ct = make_column_transformer(
    (ohe, ['PhoneService', 'Contract', 'PaperlessBilling', 'PaymentMethod']),
    (scale, ['Tenure', 'MonthlyCharges']),
    remainder='passthrough',
)

# NOTE(review): feature_selection is created here but is not passed to
# make_pipeline below, so the fitted pipeline performs no feature
# selection. Confirm whether it should sit between ct and logreg.
feature_selection = SelectKBest(k=6)

# Chain preprocessing and the classifier, then fit on the training split.
pipe = make_pipeline(ct, logreg)
pipe.fit(X_train, y_train)
Pipeline(steps=[('columntransformer', ColumnTransformer(remainder='passthrough', transformers=[('onehotencoder', OneHotEncoder(), ['PhoneService', 'Contract', 'PaperlessBilling', 'PaymentMethod']), ('standardscaler', StandardScaler(), ['Tenure', 'MonthlyCharges'])])), ('logisticregression', LogisticRegression())])
from sklearn import set_config
# Ask scikit-learn to render estimators as a diagram, then display the
# fitted pipeline.
set_config(display='diagram')
pipe