import pandas as pd
# Location of the CSV snapshot that the rest of the notebook works on.
url = "https://www.dropbox.com/scl/fi/hjpebns5qv0nzh1ucl4tr/data-2023-11-13.csv?rlkey=ljnn4pf04sewolnxa96t4lpdv&dl=1"

# Read the remote CSV straight into a DataFrame, then preview the first
# five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)
| | ticker | date | marketcap | pb | ret | mom | volume | volatility | roe | accruals | agr | sector | mktvol |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AACC | 2011-01-14 | 188.3 | 1.4 | -0.014634 | -0.184615 | 2.078000e+04 | 0.071498 | -0.118239 | -0.182275 | 0.004383 | Financial Services | 0.055937 |
1 | AAI | 2011-01-14 | 1012.1 | 2.0 | 0.002677 | 0.438224 | 2.775580e+06 | 0.128450 | 0.108073 | -0.220045 | 0.002237 | Industrials | 0.055937 |
2 | AAIC | 2011-01-14 | 189.3 | 1.0 | -0.010119 | 0.684547 | 3.466000e+04 | 0.048505 | 0.136967 | 0.108055 | 0.135697 | Real Estate | 0.055937 |
3 | AAON | 2011-01-14 | 479.4 | 4.2 | 0.007778 | 0.528685 | 2.817291e+05 | 0.044912 | 0.191801 | -0.088557 | 0.011656 | Basic Materials | 0.055937 |
4 | AATC | 2011-01-14 | 63.3 | 1.4 | -0.013960 | 0.008216 | 6.800000e+03 | 0.049756 | 0.072269 | -0.008792 | 0.089436 | Technology | 0.055937 |
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor limited to depth-4 trees.
# NOTE(review): no random_state is passed, so refitting on the same data can
# produce a different model each run; set one if reproducibility matters.
forest = RandomForestRegressor(max_depth=4)

# Target: percentile rank of `ret` within each date, scaled by 100.
# The built-in groupby rank is vectorized and returns a Series aligned with
# df's index, so no per-group Python lambda is needed.
df["target"] = 100 * df.groupby("date")["ret"].rank(pct=True)

# Base characteristics, sorted alphabetically so the column order handed to
# the model does not depend on the order they were typed in.
features = sorted(
    [
        "marketcap",
        "pb",
        "mom",
        "volume",
        "volatility",
        "roe",
        "accruals",
        "agr",
    ]
)

# Interaction terms: every base characteristic multiplied by `mktvol`.
# The new names are built up front so the feature list is not extended while
# it is still being iterated.
interaction_names = [f"{name}_vol" for name in features]
for name, interaction in zip(features, interaction_names):
    df[interaction] = df[name] * df["mktvol"]
features += interaction_names

# Restrict the training sample to the 156 most recent distinct dates.
# `unique()` gives no ordering guarantee, hence the explicit in-place sort
# before slicing from the end.
dates = df.date.unique()
dates.sort()
df = df[df.date.isin(dates[-156:])]

# Fit on the base characteristics plus their `mktvol` interactions.
forest.fit(X=df[features], y=df.target)
RandomForestRegressor(max_depth=4)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor(max_depth=4)
# local machine version
# Serialize the fitted forest to a file in the current working directory.
from joblib import dump

dump(value=forest, filename="forest_ver2.joblib")
['forest_ver2.joblib']
# colab version
# Mount Google Drive inside the Colab runtime, then serialize the fitted
# forest into the mounted drive.
from joblib import dump
from google.colab import drive

drive.mount(mountpoint='/content/drive')
dump(value=forest, filename="/content/drive/My Drive/forest_ver2.joblib")