import pandas as pd

# Directory prefix used when building file paths; set path_to_file to "./"
# when the file sits in your working directory.
path_to_file = "../../"

# Load the 2023-11-08 data extract and preview its first rows.
df = pd.read_csv(f"{path_to_file}data-2023-11-08.csv")
df.head()


from sklearn.ensemble import RandomForestRegressor
# Random forest regressor; max_depth=3 caps the depth of every tree.
forest = RandomForestRegressor(max_depth=3)

# Target: each row's return minus the median return of all rows sharing its
# date. transform("median") broadcasts the per-date median back onto the
# original index, so no Python-level lambda runs per group and the result
# lines up row-for-row with df.
df["target"] = df["ret"] - df.groupby("date")["ret"].transform("median")


# Names of the predictor columns used as model inputs.
features = [
    "marketcap", "pb", "mom", "volume",
    "volatility", "roe", "accruals",
]


# Distinct dates in ascending order, as a NumPy array. Sorting through pandas
# (rather than calling .sort() in place on the result of unique()) also works
# when the date column has an extension dtype, for which unique() returns an
# ExtensionArray that has no in-place sort method.
dates = df["date"].drop_duplicates().sort_values().to_numpy()

# Keep only rows from the 156 most recent dates (all rows if there are fewer).
# NOTE(review): 156 presumably means 3 years of weekly observations - confirm.
df = df[df["date"].isin(dates[-156:])]


# Fit the forest on the predictor columns against the target column.
forest.fit(df[features], df["target"])

RandomForestRegressor(max_depth=3)

RandomForestRegressor(max_depth=3)


from joblib import dump
# Serialize the model to forest.joblib under the path_to_file directory.
dump(value=forest, filename=path_to_file + "forest.joblib")

['../../forest.joblib']

	ticker	date	marketcap	pb	ret	mom	volume	volatility	roe	accruals	agr	sector
0	AACC	2011-01-14	188.3	1.4	-0.014634	-0.184615	2.078000e+04	0.071498	-0.118239	-0.182275	0.004383	Financial Services
1	AAI	2011-01-14	1012.1	2.0	0.002677	0.438224	2.775580e+06	0.128450	0.108073	-0.220045	0.002237	Industrials
2	AAIC	2011-01-14	189.3	1.0	-0.010119	0.684547	3.466000e+04	0.048505	0.136967	0.108055	0.135697	Real Estate
3	AAON	2011-01-14	479.4	4.2	0.007778	0.528685	2.817291e+05	0.044912	0.191801	-0.088557	0.011656	Basic Materials
4	AATC	2011-01-14	63.3	1.4	-0.013960	0.008216	6.800000e+03	0.049756	0.072269	-0.008792	0.089436	Technology

Training a Random Forest

MGMT 638: Data-Driven Investments: Equity

Kerry Back, Rice University

Outline

Read data

Define model and target variable

Define predictors (features)

Filter to most recent 3 years

Train the model

Save the model