add linear model

2026-07-03 02:50:58 +08:00 · 2020-11-29 17:22:37 +08:00
parent 0fb0109f9c
commit b3657d1c8f
3 changed files with 165 additions and 0 deletions
--- a/examples/benchmarks/Linear/requirements.txt
+++ b/examples/benchmarks/Linear/requirements.txt
@@ -0,0 +1,3 @@
+numpy>=1.17.4
+pandas>=1.0.1
+scikit-learn>=0.23.1
--- a/examples/benchmarks/Linear/workflow_config_linear.yaml
+++ b/examples/benchmarks/Linear/workflow_config_linear.yaml
@@ -0,0 +1,71 @@
+provider_uri: "~/.qlib/qlib_data/cn_data"
+region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+    infer_processors:
+        - class: RobustZScoreNorm
+          kwargs:
+              fields_group: feature
+              clip_outlier: true
+        - class: Fillna
+          kwargs:
+              fields_group: feature
+    learn_processors:
+        - class: DropnaLabel
+        - class: CSRankNorm
+          kwargs:
+              fields_group: label
+    label: ["Ref($close, -2) / Ref($close, -1) - 1"]
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy.strategy
+        kwargs:
+            topk: 50
+            n_drop: 5
+    backtest:
+        verbose: False
+        limit_threshold: 0.095
+        account: 100000000
+        benchmark: *benchmark
+        deal_price: close
+        open_cost: 0.0005
+        close_cost: 0.0015
+        min_cost: 5
+task:
+    model:
+        class: LinearModel
+        module_path: qlib.contrib.model.linear
+        kwargs:
+            estimator: ols
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha158
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record:
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: {}
+        - class: SigAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+            ana_long_short: True
+            ann_scaler: 252
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+            config: *port_analysis_config
--- a/qlib/contrib/model/linear.py
+++ b/qlib/contrib/model/linear.py
@@ -0,0 +1,91 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import numpy as np
+import pandas as pd
+
+from scipy.optimize import nnls
+from sklearn.linear_model import LinearRegression, Ridge, Lasso
+
+from ...model.base import Model
+from ...data.dataset import DatasetH
+from ...data.dataset.handler import DataHandlerLP
+
+
+class LinearModel(Model):
+    r"""Linear Model
+
+    Solve one of the following regression problems:
+        - `ols`: min_w |y - Xw|^2_2
+        - `nnls`: min_w |y - Xw|^2_2, s.t. w >= 0
+        - `ridge`: min_w |y - Xw|^2_2 + \alpha*|w|^2_2
+        - `lasso`: min_w |y - Xw|^2_2 + \alpha*|w|_1
+    where `w` is the regression coefficient.
+
+    After `fit`, the learned weights are stored in `coef_` and the intercept
+    in `intercept_`; `predict` applies them as `X @ coef_ + intercept_`.
+    """
+
+    OLS = "ols"
+    NNLS = "nnls"
+    RIDGE = "ridge"
+    LASSO = "lasso"
+
+    def __init__(self, estimator="ols", alpha=0.0, fit_intercept=False):
+        """
+        Parameters
+        ----------
+        estimator : str
+            which estimator to use for linear regression
+        alpha : float
+            l1 or l2 regularization parameter
+        fit_intercept : bool
+            whether fit intercept
+        """
+        # NOTE: asserts are stripped under `python -O`; `fit` still rejects an
+        # unknown estimator with a ValueError in that case.
+        assert estimator in [self.OLS, self.NNLS, self.RIDGE, self.LASSO], f"unsupported estimator `{estimator}`"
+        self.estimator = estimator
+
+        assert alpha == 0 or estimator in [self.RIDGE, self.LASSO], "alpha is only supported in `ridge`&`lasso`"
+        self.alpha = alpha
+
+        self.fit_intercept = fit_intercept
+
+        # Both are populated by `fit`; `coef_ is None` marks an unfitted model.
+        self.coef_ = None
+        self.intercept_ = None
+
+    def fit(self, dataset: DatasetH):
+        """Fit the configured estimator on the `train` segment of `dataset`.
+
+        Parameters
+        ----------
+        dataset : DatasetH
+            dataset whose `prepare("train", ...)` result exposes `feature`
+            and `label` column groups
+
+        Returns
+        -------
+        LinearModel
+            `self`, with `coef_` and `intercept_` set
+
+        Raises
+        ------
+        ValueError
+            if the training segment is empty or the estimator is unknown
+        """
+        df_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
+        X, y = df_train["feature"].values, np.squeeze(df_train["label"].values)
+
+        # Fail early with a clear message instead of an opaque solver error.
+        if len(X) == 0:
+            raise ValueError("empty training data, please check the dataset config")
+
+        if self.estimator in [self.OLS, self.RIDGE, self.LASSO]:
+            self._fit(X, y)
+        elif self.estimator == self.NNLS:
+            self._fit_nnls(X, y)
+        else:
+            raise ValueError(f"unknown estimator `{self.estimator}`")
+
+        return self
+
+    def _fit(self, X, y):
+        """Fit `ols`, `ridge` or `lasso` with the matching scikit-learn estimator."""
+        # copy_X=False lets scikit-learn work on `X` without duplicating it.
+        if self.estimator == self.OLS:
+            model = LinearRegression(fit_intercept=self.fit_intercept, copy_X=False)
+        else:
+            model = {self.RIDGE: Ridge, self.LASSO: Lasso}[self.estimator](
+                alpha=self.alpha, fit_intercept=self.fit_intercept, copy_X=False
+            )
+        model.fit(X, y)
+        self.coef_ = model.coef_
+        self.intercept_ = model.intercept_
+
+    def _fit_nnls(self, X, y):
+        """Fit non-negative least squares with `scipy.optimize.nnls`."""
+        if self.fit_intercept:
+            # Append a constant column so the intercept is solved as the last
+            # coefficient (and is therefore also constrained to be >= 0).
+            X = np.c_[X, np.ones(len(X))]  # NOTE: mem copy
+        coef = nnls(X, y)[0]
+        if self.fit_intercept:
+            self.coef_ = coef[:-1]
+            self.intercept_ = coef[-1]
+        else:
+            self.coef_ = coef
+            self.intercept_ = 0.0
+
+    def predict(self, dataset, segment="test"):
+        """Score one segment of `dataset` with the fitted coefficients.
+
+        Parameters
+        ----------
+        dataset : DatasetH
+            dataset to take the features from
+        segment : str
+            segment passed to `dataset.prepare`; defaults to `"test"`
+
+        Returns
+        -------
+        pd.Series
+            predictions indexed like the prepared feature frame
+
+        Raises
+        ------
+        ValueError
+            if the model has not been fitted
+        """
+        if self.coef_ is None:
+            raise ValueError("model is not fitted yet!")
+        x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
+        return pd.Series(x_test.values @ self.coef_ + self.intercept_, index=x_test.index)