1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-05 12:00:58 +08:00
Files
qlib/qlib/contrib/model/linear.py
you-n-g 5190332c7e Add some misc features. (#1816)
* Normal mod

* Black linting

* Linting
2024-06-26 18:34:00 +08:00

114 lines
4.1 KiB
Python

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import numpy as np
import pandas as pd
from typing import Text, Union
from qlib.log import get_module_logger
from qlib.data.dataset.weight import Reweighter
from scipy.optimize import nnls
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
class LinearModel(Model):
"""Linear Model
Solve one of the following regression problems:
- `ols`: min_w |y - Xw|^2_2
- `nnls`: min_w |y - Xw|^2_2, s.t. w >= 0
- `ridge`: min_w |y - Xw|^2_2 + \alpha*|w|^2_2
- `lasso`: min_w |y - Xw|^2_2 + \alpha*|w|_1
where `w` is the regression coefficient.
"""
OLS = "ols"
NNLS = "nnls"
RIDGE = "ridge"
LASSO = "lasso"
def __init__(self, estimator="ols", alpha=0.0, fit_intercept=False, include_valid: bool = False):
"""
Parameters
----------
estimator : str
which estimator to use for linear regression
alpha : float
l1 or l2 regularization parameter
fit_intercept : bool
whether fit intercept
include_valid: bool
Should the validation data be included for training?
The validation data should be included
"""
assert estimator in [self.OLS, self.NNLS, self.RIDGE, self.LASSO], f"unsupported estimator `{estimator}`"
self.estimator = estimator
assert alpha == 0 or estimator in [self.RIDGE, self.LASSO], f"alpha is only supported in `ridge`&`lasso`"
self.alpha = alpha
self.fit_intercept = fit_intercept
self.coef_ = None
self.include_valid = include_valid
def fit(self, dataset: DatasetH, reweighter: Reweighter = None):
df_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
if self.include_valid:
try:
df_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
df_train = pd.concat([df_train, df_valid])
except KeyError:
get_module_logger("LinearModel").info("include_valid=True, but valid does not exist")
df_train = df_train.dropna()
if df_train.empty:
raise ValueError("Empty data from dataset, please check your dataset config.")
if reweighter is not None:
w: pd.Series = reweighter.reweight(df_train)
w = w.values
else:
w = None
X, y = df_train["feature"].values, np.squeeze(df_train["label"].values)
if self.estimator in [self.OLS, self.RIDGE, self.LASSO]:
self._fit(X, y, w)
elif self.estimator == self.NNLS:
self._fit_nnls(X, y, w)
else:
raise ValueError(f"unknown estimator `{self.estimator}`")
return self
def _fit(self, X, y, w):
if self.estimator == self.OLS:
model = LinearRegression(fit_intercept=self.fit_intercept, copy_X=False)
else:
model = {self.RIDGE: Ridge, self.LASSO: Lasso}[self.estimator](
alpha=self.alpha, fit_intercept=self.fit_intercept, copy_X=False
)
model.fit(X, y, sample_weight=w)
self.coef_ = model.coef_
self.intercept_ = model.intercept_
def _fit_nnls(self, X, y, w=None):
if w is not None:
raise NotImplementedError("TODO: support nnls with weight") # TODO
if self.fit_intercept:
X = np.c_[X, np.ones(len(X))] # NOTE: mem copy
coef = nnls(X, y)[0]
if self.fit_intercept:
self.coef_ = coef[:-1]
self.intercept_ = coef[-1]
else:
self.coef_ = coef
self.intercept_ = 0.0
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
if self.coef_ is None:
raise ValueError("model is not fitted yet!")
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
return pd.Series(x_test.values @ self.coef_ + self.intercept_, index=x_test.index)