diff --git a/examples/benchmarks/Linear/requirements.txt b/examples/benchmarks/Linear/requirements.txt new file mode 100644 index 000000000..6a53211f9 --- /dev/null +++ b/examples/benchmarks/Linear/requirements.txt @@ -0,0 +1,3 @@ +numpy>=1.17.4 +pandas>=1.0.1 +scikit-learn>=0.23.1 diff --git a/examples/benchmarks/Linear/workflow_config_linear.yaml b/examples/benchmarks/Linear/workflow_config_linear.yaml new file mode 100644 index 000000000..70d3eaf68 --- /dev/null +++ b/examples/benchmarks/Linear/workflow_config_linear.yaml @@ -0,0 +1,71 @@ +provider_uri: "~/.qlib/qlib_data/cn_data" +region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market + infer_processors: + - class: RobustZScoreNorm + kwargs: + fields_group: feature + clip_outlier: true + - class: Fillna + kwargs: + fields_group: feature + learn_processors: + - class: DropnaLabel + - class: CSRankNorm + kwargs: + fields_group: label + label: ["Ref($close, -2) / Ref($close, -1) - 1"] +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: LinearModel + module_path: qlib.contrib.model.linear + kwargs: + estimator: ols + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: SigAnaRecord + module_path: 
class LinearModel(Model):
    r"""Linear Model

    Solve one of the following regression problems:
        - `ols`: min_w |y - Xw|^2_2
        - `nnls`: min_w |y - Xw|^2_2, s.t. w >= 0
        - `ridge`: min_w |y - Xw|^2_2 + \alpha*|w|^2_2
        - `lasso`: min_w |y - Xw|^2_2 + \alpha*|w|_1
    where `w` is the regression coefficient.

    After a successful `fit`, the learned weights are stored in `coef_` and
    the bias term in `intercept_` (0.0 for `nnls` without intercept).
    """

    # NOTE: the docstring above is a raw string on purpose; in a normal string
    # literal `\a` (as in `\alpha`) is the BEL control character.

    OLS = "ols"
    NNLS = "nnls"
    RIDGE = "ridge"
    LASSO = "lasso"

    def __init__(self, estimator="ols", alpha=0.0, fit_intercept=False):
        """
        Parameters
        ----------
        estimator : str
            which estimator to use for linear regression,
            one of `ols`, `nnls`, `ridge`, `lasso`
        alpha : float
            l1 or l2 regularization parameter (only valid for `ridge`/`lasso`)
        fit_intercept : bool
            whether fit intercept

        Raises
        ------
        ValueError
            if `estimator` is not supported, or a non-zero `alpha` is given
            for an estimator without regularization.
        """
        # Explicit raises instead of `assert`: asserts are stripped under `python -O`.
        if estimator not in (self.OLS, self.NNLS, self.RIDGE, self.LASSO):
            raise ValueError(f"unsupported estimator `{estimator}`")
        if alpha != 0 and estimator not in (self.RIDGE, self.LASSO):
            raise ValueError("alpha is only supported in `ridge`&`lasso`")

        self.estimator = estimator
        self.alpha = alpha
        self.fit_intercept = fit_intercept

        # Both are populated by `fit`; `coef_ is None` marks an unfitted model.
        self.coef_ = None
        self.intercept_ = None

    def fit(self, dataset: DatasetH):
        """Fit the regression coefficients on the `train` segment of `dataset`.

        Parameters
        ----------
        dataset : DatasetH
            dataset providing `feature` and `label` column groups for the
            `train` segment (fetched with `DataHandlerLP.DK_L`).

        Returns
        -------
        LinearModel
            `self`, so calls can be chained.

        Raises
        ------
        ValueError
            if the training data is empty or the estimator is unknown.
        """
        df_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
        if df_train.empty:
            raise ValueError("Empty data from dataset, please check your dataset config.")

        X = df_train["feature"].values
        y = df_train["label"].values
        # Drop only the trailing single-label axis. A blanket `np.squeeze`
        # would also collapse a one-row training set into a 0-d array.
        if y.ndim == 2 and y.shape[1] == 1:
            y = y[:, 0]

        if self.estimator == self.NNLS:
            self._fit_nnls(X, y)
        elif self.estimator in (self.OLS, self.RIDGE, self.LASSO):
            self._fit(X, y)
        else:
            raise ValueError(f"unknown estimator `{self.estimator}`")

        return self

    def _fit(self, X, y):
        """Fit `ols`/`ridge`/`lasso` with the matching scikit-learn estimator."""
        # copy_X=False avoids duplicating the feature matrix; per the
        # scikit-learn docs the estimator may then overwrite X.
        if self.estimator == self.OLS:
            model = LinearRegression(fit_intercept=self.fit_intercept, copy_X=False)
        else:
            model = {self.RIDGE: Ridge, self.LASSO: Lasso}[self.estimator](
                alpha=self.alpha, fit_intercept=self.fit_intercept, copy_X=False
            )
        model.fit(X, y)
        self.coef_ = model.coef_
        self.intercept_ = model.intercept_

    def _fit_nnls(self, X, y):
        """Fit non-negative least squares with `scipy.optimize.nnls`."""
        if self.fit_intercept:
            # The intercept is modelled as the weight of an appended all-ones column.
            X = np.c_[X, np.ones(len(X))]  # NOTE: mem copy
        coef = nnls(X, y)[0]
        if self.fit_intercept:
            self.coef_ = coef[:-1]
            self.intercept_ = coef[-1]
        else:
            self.coef_ = coef
            self.intercept_ = 0.0

    def predict(self, dataset, segment="test"):
        """Predict scores for one segment of `dataset`.

        Parameters
        ----------
        dataset : DatasetH
            dataset providing the `feature` column group
            (fetched with `DataHandlerLP.DK_I`).
        segment : str
            name of the segment passed to `dataset.prepare`; defaults to
            `test`, which is the previously hard-coded behaviour.

        Returns
        -------
        pd.Series
            `X @ coef_ + intercept_`, indexed like the prepared features.

        Raises
        ------
        ValueError
            if the model has not been fitted yet.
        """
        if self.coef_ is None:
            raise ValueError("model is not fitted yet!")
        x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
        return pd.Series(x_test.values @ self.coef_ + self.intercept_, index=x_test.index)