import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# "colors" is used below but was never defined; assume the default Matplotlib color cycle
colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]

# sine curve
def curve(x):
    return 2 * np.sin(2 * x)
# Generate data
np.random.seed(0)
X1 = np.random.uniform(low=-5, high=5, size=(1000, 1))
y1 = curve(X1) + np.random.normal(scale=2, size=(1000, 1))
y1 = y1.flatten()
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=0
)
# View training data
plt.scatter(X1_train, y1_train, label="noisy data")
plt.xlabel("feature")
plt.ylabel("target")
plt.show()
# Training data and true curve
plt.scatter(X1_train, y1_train, label="noisy data")
plt.xlabel("feature")
plt.ylabel("target")
plt.plot(
    np.sort(X1.flatten()),
    curve(np.sort(X1.flatten())),
    label="true curve",
    c=colors[1],
    lw=3,
)
plt.legend()
plt.show()
# Generate 100 features (predictors) and 1000 data points
np.random.seed(0)
X2 = np.random.normal(size=(1000, 100))
# only the first feature will matter
y2 = X2[:, 0] + np.random.normal(size=1000)
from sklearn.model_selection import train_test_split
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=0
)
from sklearn.ensemble import RandomForestRegressor
# shallow forest
forest1a = RandomForestRegressor(max_depth=4, random_state=0)
forest1a.fit(X=X1_train, y=y1_train)
# deep forest
forest1b = RandomForestRegressor(max_depth=50, random_state=0)
forest1b.fit(X=X1_train, y=y1_train)
# training data
train1a = forest1a.score(X=X1_train, y=y1_train)
train1b = forest1b.score(X=X1_train, y=y1_train)
# test data
test1a = forest1a.score(X=X1_test, y=y1_test)
test1b = forest1b.score(X=X1_test, y=y1_test)
print(f"R-squared of shallow forest on training data = {train1a:.2%}")
print(f"R-squared of deep forest on training data = {train1b:.2%}")
print("\n")
print(f"R-squared of shallow forest on test data = {test1a:.2%}")
print(f"R-squared of deep forest on test data = {test1b:.2%}")
R-squared of shallow forest on training data = 35.23%
R-squared of deep forest on training data = 86.87%

R-squared of shallow forest on test data = 23.72%
R-squared of deep forest on test data = -4.33%

The deep forest fits the training data far better, yet on the test data it does worse than a constant prediction (negative R-squared): it has memorized the noise. Cross-validation gives a principled way to choose max_depth.
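The two fits can also be compared visually. The sketch below is not part of the original cells; it plots each forest's predictions over a grid, where the deep forest chases the noise while the shallow one stays smooth:

# compare shallow vs. deep forest predictions on a grid (illustrative sketch)
grid = np.linspace(-5, 5, 200).reshape(-1, 1)
plt.scatter(X1_train, y1_train, alpha=0.3, label="training data")
plt.plot(grid, curve(grid), label="true curve", lw=3)
plt.plot(grid, forest1a.predict(grid), label="shallow forest (max_depth=4)")
plt.plot(grid, forest1b.predict(grid), label="deep forest (max_depth=50)")
plt.legend()
plt.show()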
from sklearn.model_selection import GridSearchCV
param_grid = {"max_depth": range(2, 22, 2)}
forest_cv1 = GridSearchCV(
    RandomForestRegressor(random_state=0),
    param_grid=param_grid,
)
forest_cv1.fit(X=X1_train, y=y1_train)
print(f"best hyperparameter is {forest_cv1.best_params_}")
print(f"R-squared on the test data is {forest_cv1.score(X=X1_test, y=y1_test): .2%}")
best hyperparameter is {'max_depth': 6} R-squared on the test data is 24.80%
# cross-validation results, summarized per candidate depth
pd.DataFrame(forest_cv1.cv_results_)[
    ["param_max_depth", "mean_test_score", "std_test_score", "rank_test_score"]
].round(3)

| | param_max_depth | mean_test_score | std_test_score | rank_test_score |
|---|---|---|---|---|
| 0 | 2 | 0.114 | 0.020 | 4 |
| 1 | 4 | 0.215 | 0.029 | 2 |
| 2 | 6 | 0.248 | 0.064 | 1 |
| 3 | 8 | 0.186 | 0.086 | 3 |
| 4 | 10 | 0.114 | 0.106 | 5 |
| 5 | 12 | 0.046 | 0.125 | 6 |
| 6 | 14 | 0.011 | 0.131 | 7 |
| 7 | 16 | -0.003 | 0.134 | 8 |
| 8 | 18 | -0.006 | 0.135 | 9 |
| 9 | 20 | -0.007 | 0.135 | 10 |
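The same numbers are easier to read as a curve of mean CV score against depth. A minimal sketch using the cv_results_ fields above:

# plot mean cross-validated R-squared against max_depth
results = forest_cv1.cv_results_
plt.plot(list(results["param_max_depth"]), results["mean_test_score"], marker="o")
plt.xlabel("max_depth")
plt.ylabel("mean CV R-squared")
plt.show()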
predict1 = forest_cv1.predict(X=X1_test)
plt.scatter(X1_test, y1_test, label="test data")
plt.scatter(X1_test, predict1, label="predicted")
plt.legend()
plt.show()
param_grid = {"max_depth": range(2, 22, 2)}
forest_cv2 = GridSearchCV(
    RandomForestRegressor(random_state=0),
    param_grid=param_grid,
)
forest_cv2.fit(X=X2_train, y=y2_train)
print(f"best hyperparameter is {forest_cv2.best_params_}")
print(f"R-squared on the test data is {forest_cv2.score(X=X2_test, y=y2_test):.2%}")
best hyperparameter is {'max_depth': 4}
R-squared on the test data is 51.19%
from sklearn.neural_network import MLPRegressor
param_grid = {"hidden_layer_sizes": [[100], [100, 100], [100, 100, 100]]}
net_cv1 = GridSearchCV(
    MLPRegressor(random_state=0, max_iter=1000),
    param_grid=param_grid,
)
net_cv1.fit(X=X1_train, y=y1_train)
print(f"best hyperparameter is {net_cv1.best_params_}")
print(f"R-squared on the test data is {net_cv1.score(X=X1_test, y=y1_test):.2%}")
best hyperparameter is {'hidden_layer_sizes': [100, 100, 100]}
R-squared on the test data is 25.75%
predict1 = net_cv1.predict(X=X1_test)
plt.scatter(X1_test, y1_test, label="test data")
plt.scatter(X1_test, predict1, label="predicted")
plt.legend()
plt.show()
param_grid = {"hidden_layer_sizes": [[100], [100, 100], [100, 100, 100]]}
net_cv2 = GridSearchCV(
    MLPRegressor(random_state=0, max_iter=1000),
    param_grid=param_grid,
)
net_cv2.fit(X=X2_train, y=y2_train)
print(f"best hyperparameter is {net_cv2.best_params_}")
print(f"R-squared on the test data is {net_cv2.score(X=X2_test, y=y2_test):.2%}")
best hyperparameter is {'hidden_layer_sizes': [100, 100]}
R-squared on the test data is 5.55%
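The network's near-zero test score on dataset 2 suggests it is overfitting the 99 irrelevant features. MLPRegressor also has an L2 penalty, named alpha, so tuning it alongside the architecture is one possible follow-up. This is a sketch, not part of the original notebook, and the alpha grid is illustrative:

net_cv2b = GridSearchCV(
    MLPRegressor(random_state=0, max_iter=1000),
    param_grid={
        "hidden_layer_sizes": [[100], [100, 100]],
        "alpha": [0.0001, 0.01, 1.0, 10.0],  # L2 penalty strength
    },
)
net_cv2b.fit(X=X2_train, y=y2_train)
print(f"best hyperparameters are {net_cv2b.best_params_}")
print(f"R-squared on the test data is {net_cv2b.score(X=X2_test, y=y2_test):.2%}")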
from sklearn.linear_model import LinearRegression
linear1 = LinearRegression()
linear1.fit(X=X1_train, y=y1_train)
test1 = linear1.score(X=X1_test, y=y1_test)
linear2 = LinearRegression()
linear2.fit(X=X2_train, y=y2_train)
test2 = linear2.score(X=X2_test, y=y2_test)
print(f"R-squared for dataset 1 test data is {test1:.2%}")
print(f"R-squared for dataset 2 test data is {test2:.2%}")
R-squared for dataset 1 test data is 1.18%
R-squared for dataset 2 test data is 40.97%
np.round(linear2.coef_, 3)
array([ 0.993, 0.058, -0.032, -0.037, 0.014, 0.033, -0.002, 0.014, -0.032, 0.015, 0.035, -0.004, -0.07 , -0.038, 0.03 , -0.04 , -0.007, 0.045, 0.002, -0.012, -0.014, -0.089, -0.007, 0.002, -0.052, 0.01 , 0.012, -0.111, -0.003, -0.038, 0.01 , -0.049, 0.115, 0.054, 0.053, 0.032, -0.027, -0.022, -0.033, -0.003, -0.047, -0.03 , 0.005, -0.034, -0.004, -0.041, -0.083, 0.054, 0.031, -0.007, 0.021, 0.049, -0.027, -0.073, 0.033, 0.023, 0.038, 0.035, 0.004, 0.05 , -0.018, 0.003, -0.078, 0.021, 0.008, -0.016, -0.003, -0.014, -0.028, -0.037, -0.022, 0.075, 0.003, -0.021, -0.111, 0.023, -0.015, 0.02 , 0.015, 0.018, -0.01 , -0.043, -0.091, 0.044, -0.014, -0.03 , 0.017, -0.025, 0.095, -0.001, -0.022, -0.043, -0.072, -0.019, 0.052, 0.028, 0.006, 0.016, -0.046, -0.036])
With 100 features and only one that matters, we can often do better by penalizing large coefficients, which is what the lasso does. The penalty weight is a hyperparameter, called "alpha" in scikit-learn (not to be confused with a regression intercept). The larger the penalty, the smaller the estimated betas; for large enough alpha, some or all of the estimated betas are exactly zero. Penalizing is a way to reduce model complexity and avoid overfitting.
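A quick sketch of this shrinkage on dataset 2 (the alpha values here are illustrative, not from the original analysis):

from sklearn.linear_model import Lasso
for alpha in [0.01, 0.1, 1.0]:
    lasso_demo = Lasso(alpha=alpha)
    lasso_demo.fit(X2_train, y2_train)
    # count how many of the 100 coefficients survive the penalty
    print(f"alpha={alpha}: {np.sum(lasso_demo.coef_ != 0)} nonzero coefficients")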
from sklearn.linear_model import Lasso
param_grid = {"alpha": np.arange(0.1, 2.1, 0.1)}
lasso_cv1 = GridSearchCV(Lasso(), param_grid=param_grid)
lasso_cv1.fit(X1_train, y1_train)
print(f"best hyperparameter is {lasso_cv1.best_params_}")
print(f"R-squared on the test data is {lasso_cv1.score(X1_test, y1_test):.2%}")
best hyperparameter is {'alpha': 0.1}
R-squared on the test data is 1.12%
param_grid = {"alpha": np.arange(0.1, 2.1, 0.1)}
lasso_cv2 = GridSearchCV(Lasso(), param_grid=param_grid)
lasso_cv2.fit(X2_train, y2_train)
print(f"best hyperparameter is {lasso_cv2.best_params_}")
print(f"R-squared on the test data is {lasso_cv2.score(X2_test, y2_test):.2%}")
best hyperparameter is {'alpha': 0.1}
R-squared on the test data is 51.76%
lasso = Lasso(alpha=0.1)
lasso.fit(X2_train, y2_train)
np.round(lasso.coef_, 3)
array([ 0.898, 0. , -0. , -0. , 0. , 0. , -0. , 0. , -0. , 0. , 0. , 0. , -0. , -0. , 0. , -0. , -0. , 0. , -0. , 0. , -0. , -0. , -0. , 0. , -0. , -0. , 0. , -0. , 0. , -0. , 0. , -0. , 0. , 0. , 0. , 0. , -0. , -0. , -0. , -0. , -0. , 0. , -0. , -0. , -0. , -0. , -0. , 0. , 0. , -0. , 0. , 0. , -0. , -0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , -0. , 0. , 0. , -0. , -0. , -0. , -0. , -0. , -0. , 0. , 0. , -0. , -0. , 0. , 0. , 0. , 0. , 0. , -0. , -0. , -0.009, 0. , 0. , -0. , -0. , -0. , 0. , -0. , -0. , -0. , -0. , -0. , 0. , 0. , 0. , 0. , -0. , -0. ])
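Compared with the unpenalized fit, the lasso keeps essentially only the first feature. A quick check (my addition, not an original cell) makes the contrast explicit:

# number of nonzero coefficients in the OLS fit vs. the lasso fit
print(f"OLS nonzero coefficients:   {np.sum(linear2.coef_ != 0)}")
print(f"lasso nonzero coefficients: {np.sum(lasso.coef_ != 0)}")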
url = "https://www.dropbox.com/scl/fi/g9uzsntv93waniyw9pkc2/boston.csv?rlkey=e8n7uub35p0wk2xna56ptfx23&dl=1"
df = pd.read_csv(url)
df.head(3)
| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
X = df.drop(columns=["MEDV"])
y = df.MEDV
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
Exercise: repeat the cross-validated tuning on the Boston housing data, using the train/test split above. One possible solution is sketched after this list.

- Random forest with param_grid = {"max_depth": range(2, 22, 2)}
- Neural network with param_grid = {"hidden_layer_sizes": [[100], [100, 100], [100, 100, 100]]}

Can ChatGPT help?
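One possible starting point (a sketch; it reuses the grids above and the imports from earlier cells):

# random forest on the Boston data
forest_cv3 = GridSearchCV(
    RandomForestRegressor(random_state=0),
    param_grid={"max_depth": range(2, 22, 2)},
)
forest_cv3.fit(X=X_train, y=y_train)
print(f"forest: best {forest_cv3.best_params_}, "
      f"test R-squared {forest_cv3.score(X=X_test, y=y_test):.2%}")

# neural network on the Boston data
net_cv3 = GridSearchCV(
    MLPRegressor(random_state=0, max_iter=1000),
    param_grid={"hidden_layer_sizes": [[100], [100, 100], [100, 100, 100]]},
)
net_cv3.fit(X=X_train, y=y_train)
print(f"network: best {net_cv3.best_params_}, "
      f"test R-squared {net_cv3.score(X=X_test, y=y_test):.2%}")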