Feature Control

Keep in mind that I'm using the following train/test split and variable names throughout.

# train/test split; <data> and <target> are placeholders for your own dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(<data>, <target>)
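
If you want the snippets below to run end to end, here is one hypothetical stand-in for <data> and <target> (the actual dataset isn't shown in the post; any regression data with 3 numeric features works, and 56 samples split 3:1 reproduces the 42-row training set seen in the outputs).

# hypothetical stand-in dataset (not the post's original data)
import numpy as np
rng = np.random.default_rng(42)
data = rng.uniform(8.0, 44.0, size=(56, 3))   # 56 samples, 3 numeric features
target = 1.2 * data[:, 0] ** 2 + 0.5 * data[:, 1] * data[:, 2] + rng.normal(0, 30, size=56)
x_train, x_test, y_train, y_test = train_test_split(data, target, random_state=42)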

Transformer

sklearn.preprocessing.PolynomialFeatures

  • adds polynomial and interaction features built by combining the original columns.
# adding poly features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(include_bias=False)
poly.fit(x_train) # ! fit on the training data only !
poly_train = poly.transform(x_train) # transformed training data
x_train.shape, poly_train.shape
>> ((42, 3), (42, 9))
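
For reference, with 3 input columns the default degree=2 expansion gives exactly 9 features: the 3 originals, their 3 squares, and the 3 pairwise products. You can list them the same way as later in this post (output shown assuming the 3-column x_train above):

# the 9 degree-2 features
poly.get_feature_names_out()
>> array(['x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'],
      dtype=object)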

Adding many more polynomial features

# adding poly features of degree 5
poly = PolynomialFeatures(degree=5, include_bias=False)
poly.fit(x_train)
poly.get_feature_names_out()
>> array(['x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2',
       'x2^2', 'x0^3', 'x0^2 x1', 'x0^2 x2', 'x0 x1^2', 'x0 x1 x2',
       'x0 x2^2', 'x1^3', 'x1^2 x2', 'x1 x2^2', 'x2^3', 'x0^4', 'x0^3 x1',
       'x0^3 x2', 'x0^2 x1^2', 'x0^2 x1 x2', 'x0^2 x2^2', 'x0 x1^3',
       'x0 x1^2 x2', 'x0 x1 x2^2', 'x0 x2^3', 'x1^4', 'x1^3 x2',
       'x1^2 x2^2', 'x1 x2^3', 'x2^4', 'x0^5', 'x0^4 x1', 'x0^4 x2',
       'x0^3 x1^2', 'x0^3 x1 x2', 'x0^3 x2^2', 'x0^2 x1^3',
       'x0^2 x1^2 x2', 'x0^2 x1 x2^2', 'x0^2 x2^3', 'x0 x1^4',
       'x0 x1^3 x2', 'x0 x1^2 x2^2', 'x0 x1 x2^3', 'x0 x2^4', 'x1^5',
       'x1^4 x2', 'x1^3 x2^2', 'x1^2 x2^3', 'x1 x2^4', 'x2^5'],
      dtype=object)
# transform
poly_train = poly.transform(x_train)
poly_test = poly.transform(x_test)
poly_train.shape
>> (42, 55)
  • but these 55 columns (every monomial of the 3 inputs up to degree 5, minus the bias term) make it easy for a plain linear model to overfit: the test-set score collapses. See the sketch after this snippet for the assumed model setup.
    # score on test data
    model.score(poly_test, y_test)
    
    >> -129.88151143522893
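
The post doesn't show where model comes from; presumably it's an unregularized LinearRegression fit on the degree-5 features. A minimal sketch of that assumed setup (exact scores depend on the data):

# assumed setup: plain LinearRegression on the degree-5 features
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(poly_train, y_train)
model.score(poly_train, y_train) # near-perfect on the training data
model.score(poly_test, y_test)   # collapses (can even go negative) on the test data -> overfitting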
    

Regularization

sklearn.preprocessing.StandardScaler

  • this standardizes each column to zero mean and unit variance (it does not squash values into a 0..1 range; that would be MinMaxScaler).
  • regularized models like Ridge and Lasso are sensitive to feature scale, so standardize the polynomial features first. A quick sanity check follows this snippet.
    # standardize the polynomial features
    from sklearn.preprocessing import StandardScaler
    ss = StandardScaler()
    ss.fit(poly_train) # fit on the training data only
    scaled_train = ss.transform(poly_train)
    scaled_test = ss.transform(poly_test)
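
A minimal sanity check that the scaling worked, assuming the arrays from above:

# every scaled column should now have ~0 mean and ~1 standard deviation
scaled_train.mean(axis=0).round(3) # ~0 everywhere
scaled_train.std(axis=0).round(3)  # ~1 everywhere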
    

Ridge, Lasso

sklearn.linear_model.Ridge

# Ridge
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(scaled_train, y_train)
ridge.score(scaled_train, y_train), ridge.score(scaled_test, y_test)
>> (0.9896217956447125, 0.9788853860988016)
  • alpha is a hyperparameter controlling the regularization strength
    • bigger value => stronger regularization => risk of underfitting
    • smaller value => weaker regularization => risk of overfitting

Finding a good alpha value

# plot train/test scores across a range of alpha values
import numpy as np
from matplotlib import pyplot as plt

train_score = []
test_score = []
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alphas:
  ridge = Ridge(alpha=alpha)
  ridge.fit(scaled_train, y_train)
  train_score.append(ridge.score(scaled_train, y_train))
  test_score.append(ridge.score(scaled_test, y_test))

plt.plot(np.log10(alphas), train_score, color='blue')
plt.plot(np.log10(alphas), test_score, color='green')
plt.show()

[plot: training and test scores versus log10(alpha) for Ridge]

  • alpha=0.1 is a good value; a refit with it follows below.
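
Refitting Ridge with the chosen alpha (a sketch; the exact scores depend on the data):

# refit Ridge with the alpha picked from the plot
ridge = Ridge(alpha=0.1)
ridge.fit(scaled_train, y_train)
ridge.score(scaled_train, y_train), ridge.score(scaled_test, y_test)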

sklearn.linear_model.Lasso

# Lasso
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(scaled_train, y_train)
lasso.score(scaled_train, y_train), lasso.score(scaled_test, y_test)
>> (0.9898014198970121, 0.9798798667260247)
# finding a good alpha value for Lasso
train_score, test_score = [], []
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alphas:
  lasso = Lasso(alpha=alpha, max_iter=10000)
  lasso.fit(scaled_train, y_train)
  train_score.append(lasso.score(scaled_train, y_train))
  test_score.append(lasso.score(scaled_test, y_test))

plt.plot(np.log10(alphas), train_score, color='dodgerblue')
plt.plot(np.log10(alphas), test_score, color='limegreen')
plt.show()

[plot: training and test scores versus log10(alpha) for Lasso]

  • alpha=10 is a good value; a refit with it follows below.
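
Refitting Lasso with the chosen alpha (a sketch; a nice side effect of Lasso is that it zeroes out the coefficients of unhelpful features):

# refit Lasso with the chosen alpha
lasso = Lasso(alpha=10, max_iter=10000)
lasso.fit(scaled_train, y_train)
lasso.score(scaled_train, y_train), lasso.score(scaled_test, y_test)
np.sum(lasso.coef_ == 0) # number of features Lasso effectively dropped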