More Machine Learning¶

Kerry Back, JGSB, Rice University¶

Overview¶

  • 3 data sets
    • randomly generated sine curve with noise
    • randomly generated irrelevant features
    • Boston housing prices (kaggle)
  • 4 models
    • random forest
    • neural network
    • linear regression
    • LASSO
  • Main topic: overfitting and how to avoid it

Tuning hyperparameters¶

  • In a random forest, max_depth is a hyperparameter (specified in advance, not fitted by the algorithm).
  • In a neural network, hidden_layer_sizes is a hyperparameter.
  • There are ways to use the data to select the best hyperparameter value; this is called tuning the hyperparameter. A small illustration of the distinction follows below.
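
Below is a minimal sketch of the distinction, using an estimator that appears later in this notebook (the specific values are illustrative only):

    from sklearn.ensemble import RandomForestRegressor

    # Hyperparameters such as max_depth are chosen in advance and passed to the constructor.
    forest = RandomForestRegressor(max_depth=4, random_state=0)

    # The parameters the algorithm itself fits (the tree splits) are learned only
    # when .fit() is called on training data, e.g., forest.fit(X1_train, y1_train).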

Overfitting¶

  • In general, we want to choose a hyperparameter to get the best fit to the data without overfitting.
  • More complex models may overfit the data and not work well on new data.
  • We have to assess how well the model and hyperparameter work on data not used in the training.

Train-test split¶

  • We'll use scikit-learn's train_test_split function.
  • Randomly select a subset of the data for training. The rest is used for testing.
  • Can specify test size as a fraction of the whole.

Dataset 1¶

In [2]:
import numpy as np

# sine curve
def curve(x):
    return 2 * np.sin(2 * x)

# Generate data
np.random.seed(0)
X1 = np.random.uniform(low=-5, high=5, size=(1000, 1))
y1 = curve(X1) + np.random.normal(scale=2, size=(1000, 1))
y1 = y1.flatten()

Train-test split¶

In [3]:
from sklearn.model_selection import train_test_split

X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=0
)

View¶

In [4]:
import matplotlib.pyplot as plt

# View training data
plt.scatter(X1_train, y1_train, label="noisy data")
plt.xlabel("feature")
plt.ylabel("target")
plt.show()
In [5]:
# Training data and true curve
colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]  # default color cycle (colors is not defined above, so define it here)
plt.scatter(X1_train, y1_train, label="noisy data")
plt.xlabel("feature")
plt.ylabel("target")
plt.plot(
    np.sort(X1.flatten()),
    curve(np.sort(X1.flatten())), 
    label="true curve", 
    c=colors[1],
    lw=3
)
plt.legend()
plt.show()

Dataset 2¶

In [6]:
# Generate 100 features (predictors) and 1000 data points

np.random.seed(0)
X2 = np.random.normal(size=(1000, 100))

# only the first feature will matter
y2 = X2[:, 0] + np.random.normal(size=1000)

Train-test split¶

In [7]:
from sklearn.model_selection import train_test_split

X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=0
)

Random forests¶

Shallow and deep forests for dataset 1¶

In [8]:
from sklearn.ensemble import RandomForestRegressor

# shallow forest
forest1a = RandomForestRegressor(max_depth=4, random_state=0)
forest1a.fit(X=X1_train, y=y1_train)

# deep forest
forest1b = RandomForestRegressor(max_depth=50, random_state=0)
forest1b.fit(X=X1_train, y=y1_train)
Out[8]:
RandomForestRegressor(max_depth=50, random_state=0)

$R^2$'s of shallow and deep forests for dataset 1¶

In [9]:
# training data
train1a = forest1a.score(X=X1_train, y=y1_train)
train1b = forest1b.score(X=X1_train, y=y1_train)

# test data
test1a = forest1a.score(X=X1_test, y=y1_test)
test1b = forest1b.score(X=X1_test, y=y1_test)
In [10]:
print(f"R-squared of shallow forest on training data = {train1a:.2%}")
print(f"R-squared of deep forest on training data = {train1b:.2%}")
print("\n")
print(f"R-squared of shallow forest on test data = {test1a:.2%}")
print(f"R-squared of deep forest on test data = {test1b:.2%}")
R-squared of shallow forest on training data = 35.23%
R-squared of deep forest on training data = 86.87%


R-squared of shallow forest on test data = 23.72%
R-squared of deep forest on test data = -4.33%

Avoid overfitting by cross validation¶

Cross validation¶

  • The following can be done by using GridSearchCV on the training data:
    • Split the training data into five sets (could make more or fewer sets).
    • Combine four sets for training and compute the score on the fifth ("validation") set.
    • Then choose a different one of the five sets for validation and repeat.
    • End up with five validation scores. Average them.
    • Choose hyperparameter value with highest average validation score.
  • To forecast performance on new data, compute the $R^2$ on the test data.

Cross validation explained again¶

  • Split the training data into 5 randomly chosen subsets $A, B, C, D$, and $E$.
  • Use $A \cup B \cup C \cup D$ as training data and validate on $E$.
  • Then use $B \cup C \cup D \cup E$ as training data and validate on $A$.
  • Then, ..., until we have trained and validated 5 times.
  • Average the 5 validation scores for each model.
  • Choose the hyperparameter value that gives the highest average validation score.
  • Then test on the test data to estimate performance on new data. (A hand-rolled sketch of this procedure follows below.)
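
Before handing this to GridSearchCV, here is a hand-rolled sketch of the procedure for dataset 1 with a single candidate value, max_depth=4 (the value and the shuffling choice are illustrative; GridSearchCV below automates the loop over a grid of values):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    val_scores = []
    for train_idx, val_idx in kf.split(X1_train):
        model = RandomForestRegressor(max_depth=4, random_state=0)
        model.fit(X1_train[train_idx], y1_train[train_idx])
        val_scores.append(model.score(X1_train[val_idx], y1_train[val_idx]))

    # average validation R-squared for max_depth=4; repeat for other values and keep the best
    print(np.mean(val_scores))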

Cross validate random forest on dataset 1¶

In [11]:
from sklearn.model_selection import GridSearchCV

param_grid = {"max_depth": range(2, 22, 2)}
forest_cv1 = GridSearchCV(
    RandomForestRegressor(random_state=0),
    param_grid=param_grid,
)
forest_cv1.fit(X=X1_train, y=y1_train)
print(f"best hyperparameter is {forest_cv1.best_params_}")
print(f"R-squared on the test data is {forest_cv1.score(X=X1_test, y=y1_test): .2%}")
best hyperparameter is {'max_depth': 6}
R-squared on the test data is  24.80%

See the full cross-validation results if we want¶

In [12]:
forest_cv1.cv_results_
Out[12]:
{'mean_fit_time': array([0.07377267, 0.08717914, 0.09889469, 0.11252427, 0.12421117,
        0.12569208, 0.1228476 , 0.1191843 , 0.12329407, 0.12679143]),
 'std_fit_time': array([0.00582796, 0.00928541, 0.00741403, 0.00925495, 0.00805558,
        0.00675253, 0.00531868, 0.0084519 , 0.00760523, 0.00612412]),
 'mean_score_time': array([0.00211639, 0.00513015, 0.00760536, 0.00627818, 0.0107892 ,
        0.00312495, 0.00312672, 0.00828857, 0.0067699 , 0.0001091 ]),
 'std_score_time': array([0.00423279, 0.00559281, 0.00619265, 0.00680234, 0.00666072,
        0.0062499 , 0.00625343, 0.00738377, 0.00833234, 0.0002182 ]),
 'param_max_depth': masked_array(data=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 2},
  {'max_depth': 4},
  {'max_depth': 6},
  {'max_depth': 8},
  {'max_depth': 10},
  {'max_depth': 12},
  {'max_depth': 14},
  {'max_depth': 16},
  {'max_depth': 18},
  {'max_depth': 20}],
 'split0_test_score': array([0.09616569, 0.23542263, 0.30173737, 0.28694295, 0.25095494,
        0.21625665, 0.19441994, 0.18510077, 0.18248995, 0.18137539]),
 'split1_test_score': array([ 0.10763798,  0.17905418,  0.20575719,  0.10149797, -0.01897264,
        -0.11577575, -0.17075033, -0.19293928, -0.19777477, -0.1993838 ]),
 'split2_test_score': array([ 0.12027304,  0.19937604,  0.14168575,  0.06906105, -0.00299846,
        -0.07808594, -0.1020879 , -0.11310146, -0.117921  , -0.11925511]),
 'split3_test_score': array([0.09619447, 0.19925939, 0.28839889, 0.22429493, 0.17486064,
        0.11083016, 0.06668123, 0.05486168, 0.05026746, 0.04889382]),
 'split4_test_score': array([0.15072561, 0.26089124, 0.30192806, 0.24991503, 0.16510371,
        0.09916815, 0.06724514, 0.05339673, 0.05211175, 0.05149178]),
 'mean_test_score': array([ 0.11419936,  0.2148007 ,  0.24790145,  0.18634239,  0.11378964,
         0.04647865,  0.01110162, -0.00253631, -0.00616532, -0.00737558]),
 'std_test_score': array([0.02031501, 0.02934917, 0.06405488, 0.08550536, 0.10624726,
        0.12456701, 0.13094181, 0.13424504, 0.13518521, 0.13538296]),
 'rank_test_score': array([ 4,  2,  1,  3,  5,  6,  7,  8,  9, 10])}

View fit of the best forest on the test data if we want¶

In [13]:
predict1 = forest_cv1.predict(X=X1_test)
plt.scatter(X1_test, y1_test, label="test data")
plt.scatter(X1_test, predict1, label="predicted")
plt.legend()
plt.show()

Cross validate random forest on dataset 2¶

In [14]:
param_grid = {"max_depth": range(2, 22, 2)}
forest_cv2 = GridSearchCV(
    RandomForestRegressor(random_state=0),
    param_grid=param_grid,
)
forest_cv2.fit(X=X2_train, y=y2_train)
print(f"best hyperparameter is {forest_cv2.best_params_}")
print(f"R-squared on the test data is {forest_cv2.score(X=X2_test, y=y2_test):.2%}")
best hyperparameter is {'max_depth': 4}
R-squared on the test data is 51.19%

Cross validate neural network on dataset 1¶

In [15]:
from sklearn.neural_network import MLPRegressor

param_grid = {"hidden_layer_sizes": [[100], [100, 100], [100, 100, 100]]}

net_cv1 = GridSearchCV(
    MLPRegressor(random_state=0, max_iter=1000),
    param_grid=param_grid,
)
net_cv1.fit(X=X1_train, y=y1_train)

print(f"best hyperparameter is {net_cv1.best_params_}")
print(f"R-squared on the test data is {net_cv1.score(X=X1_test, y=y1_test):.2%}")
best hyperparameter is {'hidden_layer_sizes': [100, 100, 100]}
R-squared on the test data is 25.75%

View fit of the best neural network on the test data if we want¶

In [16]:
predict1 = net_cv1.predict(X=X1_test)
plt.scatter(X1_test, y1_test, label="test data")
plt.scatter(X1_test, predict1, label="predicted")
plt.legend()
plt.show()

Cross-validate neural network on dataset 2¶

In [17]:
param_grid = {"hidden_layer_sizes": [[100], [100, 100], [100, 100, 100]]}

net_cv2 = GridSearchCV(
    MLPRegressor(random_state=0, max_iter=1000),
    param_grid=param_grid,
)
net_cv2.fit(X=X2_train, y=y2_train)

print(f"best hyperparameter is {net_cv2.best_params_}")
print(f"R-squared on the test data is {net_cv2.score(X=X2_test, y=y2_test):.2%}")
best hyperparameter is {'hidden_layer_sizes': [100, 100]}
R-squared on the test data is 5.55%

Linear regression¶

  • No hyperparameters
  • Train on the training data (instead of cross-validating)
  • Test on the test data

Linear regression on both datasets¶

In [18]:
from sklearn.linear_model import LinearRegression 

linear1 = LinearRegression()
linear1.fit(X=X1_train, y=y1_train)
test1 = linear1.score(X=X1_test, y=y1_test)

linear2 = LinearRegression()
linear2.fit(X=X2_train, y=y2_train)
test2 = linear2.score(X=X2_test, y=y2_test)

print(f"R-squared for dataset 1 test data is {test1:.2%}")
print(f"R-squared for dataset 2 test data is {test2:.2%}")
R-squared for dataset 1 test data is 1.18%
R-squared for dataset 2 test data is 40.97%

View regression coefficients if we want¶

In [19]:
np.round(linear2.coef_, 3)
Out[19]:
array([ 0.993,  0.058, -0.032, -0.037,  0.014,  0.033, -0.002,  0.014,
       -0.032,  0.015,  0.035, -0.004, -0.07 , -0.038,  0.03 , -0.04 ,
       -0.007,  0.045,  0.002, -0.012, -0.014, -0.089, -0.007,  0.002,
       -0.052,  0.01 ,  0.012, -0.111, -0.003, -0.038,  0.01 , -0.049,
        0.115,  0.054,  0.053,  0.032, -0.027, -0.022, -0.033, -0.003,
       -0.047, -0.03 ,  0.005, -0.034, -0.004, -0.041, -0.083,  0.054,
        0.031, -0.007,  0.021,  0.049, -0.027, -0.073,  0.033,  0.023,
        0.038,  0.035,  0.004,  0.05 , -0.018,  0.003, -0.078,  0.021,
        0.008, -0.016, -0.003, -0.014, -0.028, -0.037, -0.022,  0.075,
        0.003, -0.021, -0.111,  0.023, -0.015,  0.02 ,  0.015,  0.018,
       -0.01 , -0.043, -0.091,  0.044, -0.014, -0.03 ,  0.017, -0.025,
        0.095, -0.001, -0.022, -0.043, -0.072, -0.019,  0.052,  0.028,
        0.006,  0.016, -0.046, -0.036])

Lasso¶

  • OLS minimizes the mean squared error. Lasso is an example of penalized linear regression. It chooses coefficients to minimize
$$\frac{1}{2}\text{MSE} + \text{penalty} \times \sum_j |\beta_j|$$
  • The penalty is a hyperparameter. In scikit-learn it is called alpha (not to be confused with a regression intercept).

  • The larger the penalty, the smaller the estimated betas will be. For a large enough alpha, all of the estimated betas are exactly zero.

  • Penalizing is a way to reduce model complexity and avoid overfitting. (A small sketch of the shrinkage effect follows below.)
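
A quick sketch of the shrinkage effect on dataset 2 (the alpha values are arbitrary choices for illustration): as alpha grows, more and more coefficients are driven exactly to zero.

    from sklearn.linear_model import Lasso

    for alpha in [0.01, 0.1, 1.0]:
        lasso = Lasso(alpha=alpha)
        lasso.fit(X2_train, y2_train)
        num_nonzero = (lasso.coef_ != 0).sum()
        print(f"alpha = {alpha}: {num_nonzero} nonzero coefficients")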

Cross-validation for lasso on dataset 1¶

In [20]:
from sklearn.linear_model import Lasso

param_grid = {"alpha": np.arange(0.1, 2.1, 0.1)}

lasso_cv1 = GridSearchCV(Lasso(), param_grid=param_grid)
lasso_cv1.fit(X1_train, y1_train)

print(f"best hyperparameter is {lasso_cv1.best_params_}")
print(f"R-squared on the test data is {lasso_cv1.score(X1_test, y1_test):.2%}")
best hyperparameter is {'alpha': 0.1}
R-squared on the test data is 1.12%

Cross-validation for lasso on dataset 2¶

In [21]:
param_grid = {"alpha": np.arange(0.1, 2.1, 0.1)}

lasso_cv2 = GridSearchCV(Lasso(), param_grid=param_grid)
lasso_cv2.fit(X2_train, y2_train)

print(f"best hyperparameter is {lasso_cv2.best_params_}")
print(f"R-squared on the test data is {lasso_cv2.score(X2_test, y2_test):.2%}")
best hyperparameter is {'alpha': 0.1}
R-squared on the test data is 51.76%

View the regression coefficients if we want¶

In [22]:
lasso = Lasso(alpha=0.1)
lasso.fit(X2_train, y2_train)
np.round(lasso.coef_, 3)
Out[22]:
array([ 0.898,  0.   , -0.   , -0.   ,  0.   ,  0.   , -0.   ,  0.   ,
       -0.   ,  0.   ,  0.   ,  0.   , -0.   , -0.   ,  0.   , -0.   ,
       -0.   ,  0.   , -0.   ,  0.   , -0.   , -0.   , -0.   ,  0.   ,
       -0.   , -0.   ,  0.   , -0.   ,  0.   , -0.   ,  0.   , -0.   ,
        0.   ,  0.   ,  0.   ,  0.   , -0.   , -0.   , -0.   , -0.   ,
       -0.   ,  0.   , -0.   , -0.   , -0.   , -0.   , -0.   ,  0.   ,
        0.   , -0.   ,  0.   ,  0.   , -0.   , -0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   , -0.   ,  0.   ,
        0.   , -0.   , -0.   , -0.   , -0.   , -0.   , -0.   ,  0.   ,
        0.   , -0.   , -0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
       -0.   , -0.   , -0.009,  0.   ,  0.   , -0.   , -0.   , -0.   ,
        0.   , -0.   , -0.   , -0.   , -0.   , -0.   ,  0.   ,  0.   ,
        0.   ,  0.   , -0.   , -0.   ])

Some real data¶

  • Boston house prices
  • Try to predict median house price in different neighborhoods of Boston based on characteristics of the houses and characteristics of the residents
  • The data come from the University of California-Irvine Machine Learning Repository and are also available on Kaggle.
  • See https://www.kaggle.com/datasets/fedesoriano/the-boston-houseprice-data for a description of the data.
In [23]:
import pandas as pd

url = "https://www.dropbox.com/scl/fi/g9uzsntv93waniyw9pkc2/boston.csv?rlkey=e8n7uub35p0wk2xna56ptfx23&dl=1"
df = pd.read_csv(url)
df.head(3)
Out[23]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 392.83 4.03 34.7
In [24]:
X = df.drop(columns=["MEDV"])
y = df.MEDV

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

Tasks¶

  1. Run GridSearchCV on training data to find best hyperparameter
  2. Test the model on the test data

Do this for

  1. Random forest with

    param_grid = {"max_depth": range(2, 22, 2)}
  2. Neural network with

    param_grid = {"hidden_layer_sizes": [[100], [100, 100], [100, 100, 100]]}

Can ChatGPT help?
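
As a sketch of task 1 for the random forest (not a definitive solution; the neural network case is analogous, swapping in MLPRegressor and its param_grid):

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

    param_grid = {"max_depth": range(2, 22, 2)}
    forest_cv = GridSearchCV(
        RandomForestRegressor(random_state=0),
        param_grid=param_grid,
    )
    forest_cv.fit(X=X_train, y=y_train)
    print(f"best hyperparameter is {forest_cv.best_params_}")
    print(f"R-squared on the test data is {forest_cv.score(X=X_test, y=y_test):.2%}")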