Keep in mind that I'm using...
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(<data>, <target>)
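For reference, a minimal runnable sketch with made-up data matching the shapes in these notes (X and y here are placeholders, not the actual dataset):
import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(56, 3)  # 56 samples, 3 features
y = np.random.rand(56)
# default test_size=0.25 -> 42 train / 14 test samples
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)
x_train.shape, x_test.shape
>> ((42, 3), (14, 3))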
sklearn.preprocessing.PolynomialFeatures
- generates new polynomial features (powers and cross-products) from the original features.
from sklearn.preprocessing import PolynomialFeatures
# default degree=2; include_bias=False drops the constant 1 column
poly = PolynomialFeatures(include_bias=False)
poly.fit(x_train)
poly_train = poly.transform(x_train)
x_train.shape, poly_train.shape
>> ((42, 3), (42, 9))
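To see what the expansion actually produces, a tiny sketch with one made-up sample [2, 3]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit_transform(np.array([[2, 3]]))
>> array([[2., 3., 4., 6., 9.]])
The five columns are x0, x1, x0^2, x0*x1, x1^2.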
Adding many more polynomial features
poly = PolynomialFeatures(degree=5, include_bias=False)  # all terms up to degree 5
poly.fit(x_train)
poly.get_feature_names_out()
>> array(['x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2',
'x2^2', 'x0^3', 'x0^2 x1', 'x0^2 x2', 'x0 x1^2', 'x0 x1 x2',
'x0 x2^2', 'x1^3', 'x1^2 x2', 'x1 x2^2', 'x2^3', 'x0^4', 'x0^3 x1',
'x0^3 x2', 'x0^2 x1^2', 'x0^2 x1 x2', 'x0^2 x2^2', 'x0 x1^3',
'x0 x1^2 x2', 'x0 x1 x2^2', 'x0 x2^3', 'x1^4', 'x1^3 x2',
'x1^2 x2^2', 'x1 x2^3', 'x2^4', 'x0^5', 'x0^4 x1', 'x0^4 x2',
'x0^3 x1^2', 'x0^3 x1 x2', 'x0^3 x2^2', 'x0^2 x1^3',
'x0^2 x1^2 x2', 'x0^2 x1 x2^2', 'x0^2 x2^3', 'x0 x1^4',
'x0 x1^3 x2', 'x0 x1^2 x2^2', 'x0 x1 x2^3', 'x0 x2^4', 'x1^5',
'x1^4 x2', 'x1^3 x2^2', 'x1^2 x2^3', 'x1 x2^4', 'x2^5'],
dtype=object)
poly_train = poly.transform(x_train)
poly_test = poly.transform(x_test)
poly_train.shape
>> (42, 55)
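The 55 follows from combinatorics: the number of monomials of degree 1 to 5 in 3 variables is C(3+5, 5) - 1, where the -1 accounts for the dropped constant (bias) term. A quick check:
from math import comb
comb(3 + 5, 5) - 1
>> 55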
Regularization
sklearn.preprocessing.StandardScaler
- Ridge and Lasso penalize coefficient size, so the features should be standardized first (see the sketch below).
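The Ridge/Lasso code below uses scaled_train and scaled_test, which these notes never define; presumably the scaling step looks like this (fit the scaler on the training set only, then transform both sets):
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(poly_train)
scaled_train = ss.transform(poly_train)
scaled_test = ss.transform(poly_test)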
sklearn.linear_model.Ridge
from sklearn.linear_model import Ridge
ridge = Ridge()  # default alpha=1
ridge.fit(scaled_train, y_train)
ridge.score(scaled_train, y_train), ridge.score(scaled_test, y_test)  # R^2 on train, test
>> (0.9896217956447125, 0.9788853860988016)
- alpha is a hyperparameter controlling regularization strength
- bigger value => stronger penalty => underfitting
- smaller value => weaker penalty => overfitting
Finding good alpha value
import numpy as np
from matplotlib import pyplot as plt

train_score = list()
test_score = list()
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(scaled_train, y_train)
    train_score.append(ridge.score(scaled_train, y_train))
    test_score.append(ridge.score(scaled_test, y_test))
# plot R^2 against log10(alpha); the test curve peaks where the model generalizes best
plt.plot(np.log10(alphas), train_score, color='blue')
plt.plot(np.log10(alphas), test_score, color='green')
plt.show()
- From the plot, alpha=0.1 gives the best test score.
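Refitting with the chosen value (scores omitted here since they depend on the split):
ridge = Ridge(alpha=0.1)
ridge.fit(scaled_train, y_train)
ridge.score(scaled_train, y_train), ridge.score(scaled_test, y_test)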
sklearn.linear_model.Lasso
from sklearn.linear_model import Lasso
lasso = Lasso()  # default alpha=1
lasso.fit(scaled_train, y_train)
lasso.score(scaled_train, y_train), lasso.score(scaled_test, y_test)
>> (0.9898014198970121, 0.9798798667260247)
train_score, test_score = [], []
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alphas:
    # max_iter raised so the coordinate-descent solver converges for small alpha
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(scaled_train, y_train)
    train_score.append(lasso.score(scaled_train, y_train))
    test_score.append(lasso.score(scaled_test, y_test))
plt.plot(np.log10(alphas), train_score, color='dodgerblue')
plt.plot(np.log10(alphas), test_score, color='limegreen')
plt.show()
- From the plot, alpha=10 gives the best test score.
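A nice side effect of Lasso: a large alpha drives some coefficients exactly to zero, so it doubles as feature selection. A quick check (the exact count depends on the data):
import numpy as np
lasso = Lasso(alpha=10, max_iter=10000)
lasso.fit(scaled_train, y_train)
np.sum(lasso.coef_ == 0)  # number of features Lasso dropped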