diff --git a/README.md b/README.md index 4383dea26..cd0c8542f 100644 --- a/README.md +++ b/README.md @@ -196,10 +196,12 @@ Here is a list of models built on `Qlib`. - [MLP based on pytorch](qlib/contrib/model/pytorch_nn.py) - [GRU based on pytorch](qlib/contrib/model/pytorch_gru.py) - [LSTM based on pytorcn](qlib/contrib/model/pytorch_lstm.py) +- [ALSTM based on pytorcn](qlib/contrib/model/pytorch_alstm.py) - [GATs based on pytorch](qlib/contrib/model/pytorch_gats.py) - [TabNet based on pytorch](qlib/contrib/model/tabnet.py) - [SFM based on pytorch](qlib/contrib/model/pytorch_sfm.py) - +- [HATs based on pytorch](qlib/contrib/model/pytorch_hats.py) +- [TFT based on tensorflow](examples/benchmarks/TFT/tft.py) Your PR of new Quant models is highly welcomed. diff --git a/examples/benchmarks/ALSTM/requirements.txt b/examples/benchmarks/ALSTM/requirements.txt new file mode 100644 index 000000000..1fc2779c0 --- /dev/null +++ b/examples/benchmarks/ALSTM/requirements.txt @@ -0,0 +1,4 @@ +numpy==1.17.4 +pandas==1.1.2 +scikit_learn==0.23.2 +torch==1.7.0 diff --git a/examples/benchmarks/ALSTM/workflow_config_alstm.yaml b/examples/benchmarks/ALSTM/workflow_config_alstm.yaml new file mode 100644 index 000000000..bb35b6da5 --- /dev/null +++ b/examples/benchmarks/ALSTM/workflow_config_alstm.yaml @@ -0,0 +1,69 @@ +provider_uri: "~/.qlib/qlib_data/cn_data" +region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: ALSTM + module_path: qlib.contrib.model.pytorch_alstm + kwargs: + d_feat: 6 + hidden_size: 64 + num_layers: 2 + dropout: 0.0 + n_epochs: 200 + lr: 1e-3 + early_stop: 20 + batch_size: 800 + metric: IC + loss: mse + seed: 0 + GPU: 0 + rnn_type: GRU + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: ALPHA360_Denoise + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end of file diff --git a/examples/benchmarks/CatBoost/README.md b/examples/benchmarks/CatBoost/README.md new file mode 100644 index 000000000..5e4f3966f --- /dev/null +++ b/examples/benchmarks/CatBoost/README.md @@ -0,0 +1,3 @@ +# CatBoost +* Code: [https://github.com/catboost/catboost](https://github.com/catboost/catboost) +* Paper: CatBoost: unbiased boosting with categorical features. [https://proceedings.neurips.cc/paper/2018/file/14491b756b3a51daac41c24863285549-Paper.pdf](https://proceedings.neurips.cc/paper/2018/file/14491b756b3a51daac41c24863285549-Paper.pdf). \ No newline at end of file diff --git a/examples/benchmarks/DNN/workflow_config_dnn.yaml b/examples/benchmarks/DNN/workflow_config_dnn.yaml index e853726ca..0f9ae7254 100644 --- a/examples/benchmarks/DNN/workflow_config_dnn.yaml +++ b/examples/benchmarks/DNN/workflow_config_dnn.yaml @@ -30,7 +30,7 @@ task: module_path: qlib.contrib.model.pytorch_nn kwargs: loss: mse - input_dim: 360 + input_dim: 158 output_dim: 1 lr: 0.002 lr_decay: 0.96 diff --git a/examples/benchmarks/GATs/worflow_config_gats.yaml b/examples/benchmarks/GATs/workflow_config_gats.yaml similarity index 95% rename from examples/benchmarks/GATs/worflow_config_gats.yaml rename to examples/benchmarks/GATs/workflow_config_gats.yaml index 84eeff4db..37bced99d 100644 --- a/examples/benchmarks/GATs/worflow_config_gats.yaml +++ b/examples/benchmarks/GATs/workflow_config_gats.yaml @@ -37,9 +37,10 @@ task: lr: 1e-3 early_stop: 20 batch_size: 800 - metric: IC + metric: loss loss: mse - base_model: GRU + base_model: LSTM + with_pretrain: True seed: 0 GPU: 0 dataset: diff --git a/examples/benchmarks/GRU/model_gru_csi300.pkl b/examples/benchmarks/GRU/model_gru_csi300.pkl new file mode 100644 index 000000000..46347ce8c Binary files /dev/null and b/examples/benchmarks/GRU/model_gru_csi300.pkl differ diff --git a/examples/benchmarks/HATS/requirements.txt b/examples/benchmarks/HATS/requirements.txt new file mode 100644 index 000000000..16de0a438 --- /dev/null +++ b/examples/benchmarks/HATS/requirements.txt @@ -0,0 +1,4 @@ +pandas==1.1.2 +numpy==1.17.4 +scikit_learn==0.23.2 +torch==1.7.0 diff --git a/examples/benchmarks/HATS/worflow_config_hats.yaml b/examples/benchmarks/HATS/worflow_config_hats.yaml new file mode 100644 index 000000000..a7ab0d2d7 --- /dev/null +++ b/examples/benchmarks/HATS/worflow_config_hats.yaml @@ -0,0 +1,64 @@ +provider_uri: "~/.qlib/qlib_data/cn_data" +region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: HATS + module_path: qlib.contrib.model.pytorch_gats + kwargs: + d_feat: 6 + hidden_size: 64 + num_layers: 2 + dropout: 0.6 + n_epochs: 200 + lr: 1e-3 + early_stop: 20 + batch_size: 800 + metric: IC + loss: mse + base_model: GRU + seed: 0 + GPU: 0 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: ALPHA360_Denoise + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end of file diff --git a/examples/benchmarks/LSTM/model_lstm_csi300.pkl b/examples/benchmarks/LSTM/model_lstm_csi300.pkl new file mode 100644 index 000000000..ff7fee450 Binary files /dev/null and b/examples/benchmarks/LSTM/model_lstm_csi300.pkl differ diff --git a/examples/benchmarks/LightGBM/README.md b/examples/benchmarks/LightGBM/README.md new file mode 100644 index 000000000..13f408d5f --- /dev/null +++ b/examples/benchmarks/LightGBM/README.md @@ -0,0 +1,4 @@ +# LightGBM +* Code: [https://github.com/microsoft/LightGBM](https://github.com/microsoft/LightGBM) +* Paper: LightGBM: A Highly Efficient Gradient Boosting +Decision Tree. [https://proceedings.neurips.cc/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf](https://proceedings.neurips.cc/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf). \ No newline at end of file diff --git a/examples/benchmarks/SFM/README.md b/examples/benchmarks/SFM/README.md new file mode 100644 index 000000000..06ca50485 --- /dev/null +++ b/examples/benchmarks/SFM/README.md @@ -0,0 +1,4 @@ +# State-Frequency-Memory +- State Frequency Memory (SFM) is a novel recurrent network that uses Discrete Fourier Transform (DFT) to decompose the hidden states of memory cells and capture the multi-frequency trading patterns from past market data to make stock price predictions. +- The code used in Qlib is a pyTorch implementation of SFM (Zhang, L., Aggarwal, C., & Qi, G. J. (2017,)). +- Paper: Stock Price Prediction via Discovering Multi-Frequency Trading Patterns. https://www.cs.ucf.edu/~gqi/publications/kdd2017_stock.pdf. \ No newline at end of file diff --git a/examples/benchmarks/TabNet/README.md b/examples/benchmarks/TabNet/README.md new file mode 100644 index 000000000..3a233df46 --- /dev/null +++ b/examples/benchmarks/TabNet/README.md @@ -0,0 +1,4 @@ +# TabNet +* TabNet is a novel high-performance and interpretable canonical deep tabular data learning architectur. TabNet uses sequential attention to choose which features to reason from at each decision step, enabling interpretability and more effcient learning as the learning capacity is used for the most salient features. +* The code used in Qlib is a pyTorch implementation of Tabnet (Arik, S. O., & Pfister, T. (2019). [https://github.com/dreamquark-ai/tabnet](https://github.com/dreamquark-ai/tabnet) +* Paper: TabNet: Attentive Interpretable Tabular Learning. [https://arxiv.org/pdf/1908.07442.pdf](https://arxiv.org/pdf/1908.07442.pdf). \ No newline at end of file diff --git a/examples/benchmarks/XGBoost/README.md b/examples/benchmarks/XGBoost/README.md new file mode 100644 index 000000000..33e04b23b --- /dev/null +++ b/examples/benchmarks/XGBoost/README.md @@ -0,0 +1,3 @@ +# XGBoost +* Code: [https://github.com/dmlc/xgboost](https://github.com/dmlc/xgboost) +* Paper: XGBoost: A Scalable Tree Boosting System. [https://dl.acm.org/doi/pdf/10.1145/2939672.2939785](https://dl.acm.org/doi/pdf/10.1145/2939672.2939785). \ No newline at end of file diff --git a/examples/workflow_by_code_alstm.py b/examples/workflow_by_code_alstm.py new file mode 100644 index 000000000..eabce3b07 --- /dev/null +++ b/examples/workflow_by_code_alstm.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import sys +from pathlib import Path + +import qlib +import pandas as pd +from qlib.config import REG_CN +from qlib.contrib.model.pytorch_alstm import ALSTM +from qlib.contrib.data.handler import ALPHA360_Denoise +from qlib.contrib.strategy.strategy import TopkDropoutStrategy +from qlib.contrib.evaluate import ( + backtest as normal_backtest, + risk_analysis, +) +from qlib.utils import exists_qlib_data + +# from qlib.model.learner import train_model +from qlib.utils import init_instance_by_config + +import pickle + +if __name__ == "__main__": + + # use default data + provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir + if not exists_qlib_data(provider_uri): + print(f"Qlib data is not found in {provider_uri}") + sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) + from get_data import GetData + + GetData().qlib_data(target_dir=provider_uri, region=REG_CN) + + qlib.init(provider_uri=provider_uri, region=REG_CN) + + MARKET = "csi300" + BENCHMARK = "SH000300" + + ################################### + # train model + ################################### + DATA_HANDLER_CONFIG = { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", + "instruments": MARKET, + } + + TRAINER_CONFIG = { + "train_start_time": "2008-01-01", + "train_end_time": "2014-12-31", + "validate_start_time": "2015-01-01", + "validate_end_time": "2016-12-31", + "test_start_time": "2017-01-01", + "test_end_time": "2020-08-01", + } + + task = { + "model": { + "class": "ALSTM", + "module_path": "qlib.contrib.model.pytorch_alstm", + "kwargs": { + "d_feat": 6, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.0, + "n_epochs": 200, + "lr": 1e-3, + "early_stop": 20, + "batch_size": 800, + "metric": "IC", + "loss": "mse", + "seed": 0, + "GPU": 0, + "rnn_type": "GRU", + }, + }, + "dataset": { + "class": "DatasetH", + "module_path": "qlib.data.dataset", + "kwargs": { + "handler": { + "class": "ALPHA360_Denoise", + "module_path": "qlib.contrib.data.handler", + "kwargs": DATA_HANDLER_CONFIG, + }, + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ("2015-01-01", "2016-12-31"), + "test": ("2017-01-01", "2020-08-01"), + }, + }, + } + # You shoud record the data in specific sequence + # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], + } + + # model = train_model(task) + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) + model.fit(dataset) + + pred_score = model.predict(dataset) + + # save pred_score to file + pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser() + pred_score_path.parent.mkdir(exist_ok=True, parents=True) + pred_score.to_pickle(pred_score_path) + + ################################### + # backtest + ################################### + STRATEGY_CONFIG = { + "topk": 50, + "n_drop": 5, + } + BACKTEST_CONFIG = { + "verbose": False, + "limit_threshold": 0.095, + "account": 100000000, + "benchmark": BENCHMARK, + "deal_price": "close", + "open_cost": 0.0005, + "close_cost": 0.0015, + "min_cost": 5, + } + + # use default strategy + # custom Strategy, refer to: TODO: Strategy API url + strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) + report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG) + + ################################### + # analyze + # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb + ################################### + analysis = dict() + analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) + analysis["excess_return_with_cost"] = risk_analysis( + report_normal["return"] - report_normal["bench"] - report_normal["cost"] + ) + analysis_df = pd.concat(analysis) # type: pd.DataFrame + print(analysis_df) diff --git a/examples/workflow_by_code_gats.py b/examples/workflow_by_code_gats.py index 6b15b77b4..3bb4edf08 100644 --- a/examples/workflow_by_code_gats.py +++ b/examples/workflow_by_code_gats.py @@ -70,9 +70,10 @@ if __name__ == "__main__": "lr": 1e-3, "early_stop": 20, "batch_size": 800, - "metric": "IC", + "metric": "loss", "loss": "mse", - "base_model": "GRU", + "base_model": "LSTM", + "with_pretrain": True, "seed": 0, "GPU": 0, }, diff --git a/examples/workflow_by_code_hats.py b/examples/workflow_by_code_hats.py new file mode 100644 index 000000000..3ea81ba49 --- /dev/null +++ b/examples/workflow_by_code_hats.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import sys +from pathlib import Path + +import qlib +import pandas as pd +from qlib.config import REG_CN +from qlib.contrib.model.pytorch_hats import HATS +from qlib.contrib.data.handler import ALPHA360_Denoise +from qlib.contrib.strategy.strategy import TopkDropoutStrategy +from qlib.contrib.evaluate import ( + backtest as normal_backtest, + risk_analysis, +) +from qlib.utils import exists_qlib_data + +# from qlib.model.learner import train_model +from qlib.utils import init_instance_by_config + +import pickle + +if __name__ == "__main__": + + # use default data + provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir + if not exists_qlib_data(provider_uri): + print(f"Qlib data is not found in {provider_uri}") + sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) + from get_data import GetData + + GetData().qlib_data_cn(target_dir=provider_uri) + + qlib.init(provider_uri=provider_uri, region=REG_CN) + + MARKET = "csi300" + BENCHMARK = "SH000300" + + ################################### + # train model + ################################### + DATA_HANDLER_CONFIG = { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", + "instruments": MARKET, + } + + TRAINER_CONFIG = { + "train_start_time": "2008-01-01", + "train_end_time": "2014-12-31", + "validate_start_time": "2015-01-01", + "validate_end_time": "2016-12-31", + "test_start_time": "2017-01-01", + "test_end_time": "2020-08-01", + } + + task = { + "model": { + "class": "HATS", + "module_path": "qlib.contrib.model.pytorch_hats", + "kwargs": { + "d_feat": 6, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.6, + "n_epochs": 200, + "lr": 1e-3, + "early_stop": 20, + "batch_size": 800, + "metric": "IC", + "loss": "mse", + "base_model": "LSTM", + "seed": 0, + "GPU": 0, + }, + }, + "dataset": { + "class": "DatasetH", + "module_path": "qlib.data.dataset", + "kwargs": { + "handler": { + "class": "ALPHA360_Denoise", + "module_path": "qlib.contrib.data.handler", + "kwargs": DATA_HANDLER_CONFIG, + }, + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ("2015-01-01", "2016-12-31"), + "test": ("2017-01-01", "2020-08-01"), + }, + }, + } + # You shoud record the data in specific sequence + # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], + } + + # model = train_model(task) + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) + model.fit(dataset, save_path="benchmarks/HATS/model_hat.pkl") + + pred_score = model.predict(dataset) + + # save pred_score to file + pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser() + pred_score_path.parent.mkdir(exist_ok=True, parents=True) + pred_score.to_pickle(pred_score_path) + + ################################### + # backtest + ################################### + STRATEGY_CONFIG = { + "topk": 50, + "n_drop": 5, + } + BACKTEST_CONFIG = { + "verbose": False, + "limit_threshold": 0.095, + "account": 100000000, + "benchmark": BENCHMARK, + "deal_price": "close", + "open_cost": 0.0005, + "close_cost": 0.0015, + "min_cost": 5, + } + + # use default strategy + # custom Strategy, refer to: TODO: Strategy API url + strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) + report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG) + + ################################### + # analyze + # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb + ################################### + analysis = dict() + analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) + analysis["excess_return_with_cost"] = risk_analysis( + report_normal["return"] - report_normal["bench"] - report_normal["cost"] + ) + analysis_df = pd.concat(analysis) # type: pd.DataFrame + print(analysis_df) diff --git a/qlib/contrib/evaluate.py b/qlib/contrib/evaluate.py index a9b08719a..2b85f1a9b 100644 --- a/qlib/contrib/evaluate.py +++ b/qlib/contrib/evaluate.py @@ -190,7 +190,8 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k Parameters ---------- - # backtest workflow related or commmon arguments + - **backtest workflow related or commmon arguments** + pred : pandas.DataFrame predict should has index and one `score` column account : float @@ -202,7 +203,8 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k verbose : bool whether to print log - # strategy related arguments + - **strategy related arguments** + strategy : Strategy() strategy used in backtest topk : int (Default value: 50) @@ -225,7 +227,8 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k str_type: 'amount', 'weight' or 'dropout' strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy - # exchange related arguments + - **exchange related arguments** + exchange: Exchange() pass the exchange for speeding up. subscribe_fields: list diff --git a/qlib/contrib/model/catboost_model.py b/qlib/contrib/model/catboost_model.py index e487a6d1e..bba006c35 100644 --- a/qlib/contrib/model/catboost_model.py +++ b/qlib/contrib/model/catboost_model.py @@ -1,3 +1,15 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np import pandas as pd from catboost import Pool, CatBoost diff --git a/qlib/contrib/model/pytorch_alstm.py b/qlib/contrib/model/pytorch_alstm.py new file mode 100644 index 000000000..bdf1e3ea0 --- /dev/null +++ b/qlib/contrib/model/pytorch_alstm.py @@ -0,0 +1,394 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +from __future__ import division +from __future__ import print_function + +import os +import numpy as np +import pandas as pd +import copy +from sklearn.metrics import roc_auc_score, mean_squared_error +import logging +from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index +from ...log import get_module_logger, TimeInspector + +import torch +import torch.nn as nn +import torch.optim as optim + +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP + + +class ALSTM(Model): + """ALSTM Model + + Parameters + ---------- + input_dim : int + input dimension + output_dim : int + output dimension + layers : tuple + layer sizes + lr : float + learning rate + optimizer : str + optimizer name + GPU : str + the GPU ID(s) used for training + """ + + def __init__( + self, + d_feat=6, + hidden_size=64, + num_layers=2, + dropout=0.0, + n_epochs=200, + lr=0.001, + metric="IC", + batch_size=2000, + early_stop=20, + loss="mse", + optimizer="adam", + GPU="0", + seed=0, + rnn_type="GRU", + **kwargs + ): + # Set logger. + self.logger = get_module_logger("ALSTM") + self.logger.info("ALSTM pytorch version...") + + # set hyper-parameters. + self.d_feat = d_feat + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.metric = metric + self.batch_size = batch_size + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.visible_GPU = GPU + self.use_gpu = torch.cuda.is_available() + self.seed = seed + self.rnn_type = rnn_type + + self.logger.info( + "ALSTM parameters setting:" + "\nd_feat : {}" + "\nhidden_size : {}" + "\nnum_layers : {}" + "\ndropout : {}" + "\nn_epochs : {}" + "\nlr : {}" + "\nmetric : {}" + "\nbatch_size : {}" + "\nearly_stop : {}" + "\noptimizer : {}" + "\nloss_type : {}" + "\nvisible_GPU : {}" + "\nuse_GPU : {}" + "\nseed : {}" + "\nrnn_type : {}".format( + d_feat, + hidden_size, + num_layers, + dropout, + n_epochs, + lr, + metric, + batch_size, + early_stop, + optimizer.lower(), + loss, + GPU, + self.use_gpu, + seed, + self.rnn_type, + ) + ) + + if loss not in {"mse", "binary"}: + raise NotImplementedError("loss {} is not supported!".format(loss)) + self._scorer = mean_squared_error if loss == "mse" else roc_auc_score + + self.alstm_model = ALSTMModel( + d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout + ) + # def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, input_day=20, rnn_type="GRU"): + + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.alstm_model.parameters(), lr=self.lr) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.alstm_model.parameters(), lr=self.lr) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self._fitted = False + if self.use_gpu: + self.alstm_model.cuda() + # set the visible GPU + if self.visible_GPU: + os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU + + def mse(self, pred, label): + loss = (pred - label) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + + mask = torch.isfinite(label) + if self.metric == "IC": + return self.cal_ic(pred[mask], label[mask]) + + if self.metric == "" or self.metric == "loss": # use loss + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def cal_ic(self, pred, label): + return torch.mean(pred * label) + + def train_epoch(self, x_train, y_train): + + x_train_values = x_train.values + y_train_values = np.squeeze(y_train.values) * 100 + + self.alstm_model.train() + + indices = np.arange(len(x_train_values)) + np.random.shuffle(indices) + + for i in range(len(indices))[:: self.batch_size]: + + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float() + label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float() + + if self.use_gpu: + feature = feature.cuda() + label = label.cuda() + + pred = self.alstm_model(feature) + loss = self.loss_fn(pred, label) + + self.train_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_value_(self.alstm_model.parameters(), 3.0) + self.train_optimizer.step() + + def test_epoch(self, data_x, data_y): + + # prepare training data + x_values = data_x.values + y_values = np.squeeze(data_y.values) + + self.alstm_model.eval() + + scores = [] + losses = [] + + indices = np.arange(len(x_values)) + np.random.shuffle(indices) + + for i in range(len(indices))[:: self.batch_size]: + + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float() + label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float() + + if self.use_gpu: + feature = feature.cuda() + label = label.cuda() + + pred = self.alstm_model(feature) + loss = self.loss_fn(pred, label) + losses.append(loss.item()) + + score = self.metric_fn(pred, label) + scores.append(score.item()) + + return np.mean(losses), np.mean(scores) + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + verbose=True, + save_path=None, + ): + + df_train, df_valid, df_test = dataset.prepare( + ["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L + ) + + x_train, y_train = df_train["feature"], df_train["label"] + x_valid, y_valid = df_valid["feature"], df_valid["label"] + + if save_path == None: + save_path = create_save_path(save_path) + stop_steps = 0 + train_loss = 0 + best_score = -np.inf + best_epoch = 0 + evals_result["train"] = [] + evals_result["valid"] = [] + + # train + self.logger.info("training...") + self._fitted = True + # return + + for step in range(self.n_epochs): + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + self.train_epoch(x_train, y_train) + self.logger.info("evaluating...") + train_loss, train_score = self.test_epoch(x_train, y_train) + val_loss, val_score = self.test_epoch(x_valid, y_valid) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) + evals_result["train"].append(train_score) + evals_result["valid"].append(val_score) + + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.alstm_model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) + self.alstm_model.load_state_dict(best_param) + torch.save(best_param, save_path) + + if self.use_gpu: + torch.cuda.empty_cache() + + def predict(self, dataset): + if not self._fitted: + raise ValueError("model is not fitted yet!") + + x_test = dataset.prepare("test", col_set="feature") + index = x_test.index + self.alstm_model.eval() + x_values = x_test.values + sample_num = x_values.shape[0] + preds = [] + + for begin in range(sample_num)[:: self.batch_size]: + + if sample_num - begin < self.batch_size: + end = sample_num + else: + end = begin + self.batch_size + + x_batch = torch.from_numpy(x_values[begin:end]).float() + + if self.use_gpu: + x_batch = x_batch.cuda() + + with torch.no_grad(): + if self.use_gpu: + pred = self.alstm_model(x_batch).detach().cpu().numpy() + else: + pred = self.alstm_model(x_batch).detach().numpy() + + preds.append(pred) + + return pd.Series(np.concatenate(preds), index=index) + + +class GRUModel(nn.Module): + def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0): + super().__init__() + + self.rnn = nn.GRU( + input_size=d_feat, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout, + ) + self.fc_out = nn.Linear(hidden_size, 1) + + self.d_feat = d_feat + + def forward(self, x): + # x: [N, F*T] + x = x.reshape(len(x), self.d_feat, -1) # [N, F, T] + x = x.permute(0, 2, 1) # [N, T, F] + out, _ = self.rnn(x) + return self.fc_out(out[:, -1, :]).squeeze() + + +class ALSTMModel(nn.Module): + def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, rnn_type="GRU"): + super().__init__() + self.hid_size = hidden_size + self.input_size = d_feat + self.dropout = dropout + self.rnn_type = rnn_type + self.rnn_layer = num_layers + self._build_model() + + def _build_model(self): + try: + klass = getattr(nn, self.rnn_type.upper()) + except: + raise ValueError("unknown rnn_type `%s`" % self.rnn_type) + self.net = nn.Sequential() + self.net.add_module("fc_in", nn.Linear(in_features=self.input_size, out_features=self.hid_size)) + self.net.add_module("act", nn.Tanh()) + self.rnn = klass( + input_size=self.hid_size, + hidden_size=self.hid_size, + num_layers=self.rnn_layer, + batch_first=True, + dropout=self.dropout, + ) + self.fc_out = nn.Linear(in_features=self.hid_size * 2, out_features=1) + # self.fc_out = nn.Linear(in_features=self.hid_size, out_features=1) + self.att_net = nn.Sequential() + self.att_net.add_module("att_fc_in", nn.Linear(in_features=self.hid_size, out_features=int(self.hid_size / 2))) + self.att_net.add_module("att_dropout", torch.nn.Dropout(self.dropout)) + self.att_net.add_module("att_act", nn.Tanh()) + self.att_net.add_module("att_fc_out", nn.Linear(in_features=int(self.hid_size / 2), out_features=1, bias=False)) + self.att_net.add_module("att_softmax", nn.Softmax(dim=1)) + + def forward(self, inputs): + # inputs: [batch_size, input_size*input_day] + inputs = inputs.view(len(inputs), self.input_size, -1) + inputs = inputs.permute(0, 2, 1) # [batch, input_size, seq_len] -> [batch, seq_len, input_size] + rnn_out, _ = self.rnn(self.net(inputs)) # [batch, seq_len, num_directions * hidden_size] + attention_score = self.att_net(rnn_out) # [batch, seq_len, 1] + out_att = torch.mul(rnn_out, attention_score) + out_att = torch.sum(out_att, dim=1) + out = self.fc_out( + torch.cat((rnn_out[:, -1, :], out_att), dim=1) + ) # [batch, seq_len, num_directions * hidden_size] -> [batch, 1] + # out = self.fc_out(rnn_out[:, -1, :] + out_att) + return out[..., 0] diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py index 22ed6812d..07af4eda4 100755 --- a/qlib/contrib/model/pytorch_gats.py +++ b/qlib/contrib/model/pytorch_gats.py @@ -55,6 +55,7 @@ class GAT(Model): early_stop=20, loss="mse", base_model="GRU", + with_pretrain=True, optimizer="adam", GPU="0", seed=0, @@ -77,6 +78,7 @@ class GAT(Model): self.optimizer = optimizer.lower() self.loss = loss self.base_model = base_model + self.with_pretrain = with_pretrain self.visible_GPU = GPU self.use_gpu = torch.cuda.is_available() self.seed = seed @@ -95,6 +97,7 @@ class GAT(Model): "\noptimizer : {}" "\nloss_type : {}" "\nbase_model : {}" + "\nwith_pretrain : {}" "\nvisible_GPU : {}" "\nuse_GPU : {}" "\nseed : {}".format( @@ -110,6 +113,7 @@ class GAT(Model): optimizer.lower(), loss, base_model, + with_pretrain, GPU, self.use_gpu, seed, @@ -256,6 +260,25 @@ class GAT(Model): evals_result["train"] = [] evals_result["valid"] = [] + # load pretrained base_model + if self.with_pretrain: + self.logger.info("Loading pretrained model...") + if self.base_model == "LSTM": + from ...contrib.model.pytorch_lstm import LSTMModel + + pretrained_model = LSTMModel() + pretrained_model.load_state_dict(torch.load("benchmarks/LSTM/model_lstm_csi300.pkl")) + elif self.base_model == "GRU": + from ...contrib.model.pytorch_gru import GRUModel + + pretrained_model = GRUModel() + pretrained_model.load_state_dict(torch.load("benchmarks/GRU/model_gru_csi300.pkl")) + model_dict = self.GAT_model.state_dict() + pretrained_dict = {k: v for k, v in pretrained_model.state_dict().items() if k in model_dict} + model_dict.update(pretrained_dict) + self.GAT_model.load_state_dict(model_dict) + self.logger.info("Loading pretrained model Done...") + # train self.logger.info("training...") self._fitted = True diff --git a/qlib/contrib/model/pytorch_hats.py b/qlib/contrib/model/pytorch_hats.py new file mode 100644 index 000000000..7b4307e25 --- /dev/null +++ b/qlib/contrib/model/pytorch_hats.py @@ -0,0 +1,504 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +from __future__ import division +from __future__ import print_function + +import os +import numpy as np +import pandas as pd +import copy +from sklearn.metrics import roc_auc_score, mean_squared_error +import logging +from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index +from ...log import get_module_logger, TimeInspector + +import torch +import torch.nn as nn +import torch.optim as optim + +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP + + +class HATS(Model): + """HATS Model + + Parameters + ---------- + input_dim : int + input dimension + output_dim : int + output dimension + layers : tuple + layer sizes + lr : float + learning rate + optimizer : str + optimizer name + GPU : str + the GPU ID(s) used for training + """ + + def __init__( + self, + d_feat=6, + hidden_size=64, + num_layers=2, + dropout=0.5, + n_epochs=200, + lr=0.01, + metric="IC", + batch_size=800, + early_stop=20, + loss="mse", + base_model="GRU", + with_pretrain=True, + optimizer="adam", + GPU="0", + seed=0, + **kwargs + ): + # Set logger. + self.logger = get_module_logger("HATS") + self.logger.info("HATS pytorch version...") + + # set hyper-parameters. + self.d_feat = d_feat + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.metric = metric + self.batch_size = batch_size + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.base_model = base_model + self.with_pretrain = with_pretrain #### True if train HATS with pretrained base model + self.visible_GPU = GPU + self.use_gpu = torch.cuda.is_available() + self.seed = seed + + self.logger.info( + "HATS parameters setting:" + "\nd_feat : {}" + "\nhidden_size : {}" + "\nnum_layers : {}" + "\ndropout : {}" + "\nn_epochs : {}" + "\nlr : {}" + "\nmetric : {}" + "\nbatch_size : {}" + "\nearly_stop : {}" + "\noptimizer : {}" + "\nloss_type : {}" + "\nbase_model : {}" + "\nwith_pretrain : {}" ##### debug + "\nvisible_GPU : {}" + "\nuse_GPU : {}" + "\nseed : {}".format( + d_feat, + hidden_size, + num_layers, + dropout, + n_epochs, + lr, + metric, + batch_size, + early_stop, + optimizer.lower(), + loss, + base_model, + with_pretrain, ### debug + GPU, + self.use_gpu, + seed, + ) + ) + + if loss not in {"mse", "binary"}: + raise NotImplementedError("loss {} is not supported!".format(loss)) + self._scorer = mean_squared_error if loss == "mse" else roc_auc_score + + self.HATS_model = HATSModel( + d_feat=self.d_feat, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + dropout=self.dropout, + base_model=self.base_model, + ) + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.HATS_model.parameters(), lr=self.lr) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.HATS_model.parameters(), lr=self.lr) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self._fitted = False + if self.use_gpu: + self.HATS_model.cuda() + # set the visible GPU + if self.visible_GPU: + os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU + + def mse(self, pred, label): + loss = (pred - label) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + + mask = torch.isfinite(label) + if self.metric == "IC": + return self.cal_ic(pred[mask], label[mask]) + + if self.metric == "" or self.metric == "loss": # use loss + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def cal_ic(self, pred, label): + return torch.mean(pred * label) + + def train_epoch(self, x_train, y_train): + + x_train_values = x_train.values + y_train_values = np.squeeze(y_train.values) * 100 + + self.HATS_model.train() + + indices = np.arange(len(x_train_values)) + np.random.shuffle(indices) + + for i in range(len(indices))[:: self.batch_size]: + + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float() + label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float() + + if self.use_gpu: + feature = feature.cuda() + label = label.cuda() + + pred = self.HATS_model(feature) + loss = self.loss_fn(pred, label) + + self.train_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_value_(self.HATS_model.parameters(), 3.0) + self.train_optimizer.step() + + def test_epoch(self, data_x, data_y): + + # prepare training data + x_values = data_x.values + y_values = np.squeeze(data_y.values) + + self.HATS_model.eval() + + scores = [] + losses = [] + + indices = np.arange(len(x_values)) + np.random.shuffle(indices) + + for i in range(len(indices))[:: self.batch_size]: + + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float() + label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float() + + if self.use_gpu: + feature = feature.cuda() + label = label.cuda() + + pred = self.HATS_model(feature) + loss = self.loss_fn(pred, label) + losses.append(loss.item()) + + score = self.metric_fn(pred, label) + scores.append(score.item()) + + return np.mean(losses), np.mean(scores) + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + verbose=True, + save_path=None, + ): + + df_train, df_valid, df_test = dataset.prepare( + ["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L + ) + + x_train, y_train = df_train["feature"], df_train["label"] + x_valid, y_valid = df_valid["feature"], df_valid["label"] + + if save_path == None: + save_path = create_save_path(save_path) + stop_steps = 0 + train_loss = 0 + best_score = -np.inf + best_epoch = 0 + evals_result["train"] = [] + evals_result["valid"] = [] + + # load pretrained base_model + if self.with_pretrain: + self.logger.info("loading pretrained model...") + if self.base_model == "LSTM": + from ...contrib.model.pytorch_lstm import LSTMModel + + pretrained_model = LSTMModel() + pretrained_model.load_state_dict(torch.load("benchmarks/LSTM/model_lstm_csi300.pkl")) + elif self.base_model == "GRU": + from ...contrib.model.pytorch_gru import GRUModel + + pretrained_model = GRUModel() + pretrained_model.load_state_dict(torch.load("benchmarks/GRU/model_gru_csi300.pkl")) + model_dict = self.HATS_model.state_dict() + + # filter unnecessary parameters + pretrained_dict = {k: v for k, v in pretrained_model.state_dict().items() if k in model_dict} + # overwrite entries in the existing state dict + model_dict.update(pretrained_dict) + # load the new state dict + self.HATS_model.load_state_dict(model_dict) + self.logger.info("loading pretrained model Done...") + + # train + self.logger.info("training...") + self._fitted = True + # return + + for step in range(self.n_epochs): + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + self.train_epoch(x_train, y_train) + self.logger.info("evaluating...") + train_loss, train_score = self.test_epoch(x_train, y_train) + val_loss, val_score = self.test_epoch(x_valid, y_valid) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) + evals_result["train"].append(train_score) + evals_result["valid"].append(val_score) + + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.HATS_model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) + self.HATS_model.load_state_dict(best_param) + torch.save(best_param, save_path) + + if self.use_gpu: + torch.cuda.empty_cache() + + def predict(self, dataset): + if not self._fitted: + raise ValueError("model is not fitted yet!") + + x_test = dataset.prepare("test", col_set="feature") + index = x_test.index + self.HATS_model.eval() + x_values = x_test.values + sample_num = x_values.shape[0] + preds = [] + + for begin in range(sample_num)[:: self.batch_size]: + + if sample_num - begin < self.batch_size: + end = sample_num + else: + end = begin + self.batch_size + + x_batch = torch.from_numpy(x_values[begin:end]).float() + + if self.use_gpu: + x_batch = x_batch.cuda() + + with torch.no_grad(): + if self.use_gpu: + pred = self.HATS_model(x_batch).detach().cpu().numpy() + else: + pred = self.HATS_model(x_batch).detach().numpy() + + preds.append(pred) + + return pd.Series(np.concatenate(preds), index=index) + + +class HATSModel(nn.Module): + def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, base_model="GRU"): + super().__init__() + + if base_model == "GRU": + self.model = nn.GRU( + input_size=d_feat, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout, + ) + elif base_model == "LSTM": + self.model = nn.LSTM( + input_size=d_feat, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout, + ) + else: + raise ValueError("unknown base model name `%s`" % base_model) + + self.hidden_size = hidden_size + self.bn1 = nn.BatchNorm1d(num_features=hidden_size, track_running_stats=False) + self.fc = nn.Linear(hidden_size, hidden_size) + self.bn2 = nn.BatchNorm1d(num_features=hidden_size, track_running_stats=False) + self.fc_out = nn.Linear(hidden_size, 1) + self.leaky_relu = nn.LeakyReLU() + self.softmax = nn.Softmax(dim=1) + self.d_feat = d_feat + + num_head_att = [1] * num_layers + hidden_dim = [hidden_size] * num_layers + dims = [d_feat] + [d * nh for (d, nh) in zip(hidden_dim, num_head_att[:-1])] + [num_head_att[-1]] + in_dims = dims[:-1] + out_dims = [d // nh for (d, nh) in zip(dims[1:], num_head_att)] + self.attn = nn.ModuleList( + [GraphAttention(i, o, nh, dropout) for (i, o, nh) in zip(in_dims, out_dims, num_head_att)] + ) + self.bns = nn.ModuleList([nn.BatchNorm1d(dim) for dim in dims[1:-1]]) + self.dropout = nn.Dropout(dropout) + self.elu = nn.ELU() + + def forward(self, x): + x = x.reshape(len(x), self.d_feat, -1) # [N, F, T] + x = x.permute(0, 2, 1) # [N, T, F] + out, _ = self.model(x) + hidden = out[:, -1, :] + hidden = self.bn1(hidden) + attention = GraphAttention.cal_attention(hidden, hidden) + output = attention.mm(hidden) + output = self.fc(output) + output = self.bn2(output) + output = self.leaky_relu(output) + return self.fc_out(output).squeeze() + + +class GraphAttention(nn.Module): + def __init__(self, input_dim, output_dim, num_heads, dropout=0.5): + + super().__init__() + + """ + Parameters + ---------- + input_dim : int + Dimension of input node features. + output_dim : int + Dimension of output node features. + num_heads : list of ints + Number of attention heads in each hidden layer and output layer. Must be non empty. Note that len(num_heads) = len(hidden_dims)+1. + dropout : float + Dropout rate. Default: 0.5. + """ + + self.input_dim = input_dim + self.output_dim = output_dim + self.num_heads = num_heads + + self.fcs = nn.ModuleList([nn.Linear(input_dim, output_dim) for _ in range(num_heads)]) + self.a = nn.ModuleList([nn.Linear(2 * output_dim, 1) for _ in range(num_heads)]) + + self.dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(dim=0) + self.leakyrelu = nn.LeakyReLU() + + def forward(self, features, nodes, mapping, rows): + + """ + Parameters + ---------- + features : torch.Tensor + An (n' x input_dim) tensor of input node features. + node_layers : list of numpy array + node_layers[i] is an array of the nodes in the ith layer of the + computation graph. + mappings : list of dictionary + mappings[i] is a dictionary mapping node v (labelled 0 to |V|-1) + in node_layers[i] to its position in node_layers[i]. For example, + if node_layers[i] = [2,5], then mappings[i][2] = 0 and + mappings[i][5] = 1. + rows : numpy array + rows[i] is an array of neighbors of node i. + Returns + ------- + out : torch.Tensor + An (len(node_layers[-1]) x output_dim) tensor of output node features. + """ + + nprime = features.shape[0] + rows = [np.array([mapping[v] for v in row], dtype=np.int64) for row in rows] + sum_degs = np.hstack(([0], np.cumsum([len(row) for row in rows]))) + mapped_nodes = [mapping[v] for v in nodes] + indices = torch.LongTensor([[v, c] for (v, row) in zip(mapped_nodes, rows) for c in row]).t() + + out = [] + for k in range(self.num_heads): + h = self.fcs[k](features) + + nbr_h = torch.cat(tuple([h[row] for row in rows]), dim=0) + self_h = torch.cat(tuple([h[mapping[nodes[i]]].repeat(len(row), 1) for (i, row) in enumerate(rows)]), dim=0) + cat_h = torch.cat((self_h, nbr_h), dim=1) + + e = self.leakyrelu(self.a[k](cat_h)) + + alpha = [self.softmax(e[lo:hi]) for (lo, hi) in zip(sum_degs, sum_degs[1:])] + alpha = torch.cat(tuple(alpha), dim=0) + alpha = alpha.squeeze(1) + alpha = self.dropout(alpha) + + adj = torch.sparse.FloatTensor(indices, alpha, torch.Size([nprime, nprime])) + out.append(torch.sparse.mm(adj, h)[mapped_nodes]) + + return out + + def cal_attention(x, y): + + att_x = torch.mean(x, dim=1).reshape(-1, 1) + att_y = torch.mean(y, dim=1).reshape(-1, 1) + att = att_x.mm(torch.t(att_y)) + x_att = x.reshape(x.shape[0], 1, x.shape[1]).repeat(1, y.shape[0], 1) + y_att = y.reshape(1, y.shape[0], y.shape[1]).repeat(x.shape[0], 1, 1) + return ( + torch.mean( + x.reshape(x.shape[0], 1, x.shape[1]).repeat(1, y.shape[0], 1) + * y.reshape(1, y.shape[0], y.shape[1]).repeat(x.shape[0], 1, 1), + dim=2, + ) + - att + ) diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py index e0691ba16..b45e12e10 100755 --- a/qlib/contrib/model/xgboost.py +++ b/qlib/contrib/model/xgboost.py @@ -1,5 +1,14 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import numpy as np import pandas as pd diff --git a/qlib/contrib/strategy/strategy.py b/qlib/contrib/strategy/strategy.py index 6eac9bafe..f2e2a4554 100644 --- a/qlib/contrib/strategy/strategy.py +++ b/qlib/contrib/strategy/strategy.py @@ -26,7 +26,9 @@ class BaseStrategy: def generate_order_list(self, score_series, current, trade_exchange, pred_date, trade_date): """ - Parameters: + DO NOT directly change the state of current + + Parameters ----------- score_series : pd.Seires stock_id , score @@ -39,14 +41,12 @@ class BaseStrategy: predict date trade_date : pd.Timestamp trade date - - DO NOT directly change the state of current """ pass def update(self, score_series, pred_date, trade_date): """User can use this method to update strategy state each trade date. - Parameters: + Parameters ----------- score_series : pd.Series stock_id , score @@ -98,8 +98,9 @@ class AdjustTimer: """AdjustTimer Responsible for timing of position adjusting - This is designed as multiple inheritance mechanism due to + This is designed as multiple inheritance mechanism due to: - the is_adjust may need access to the internel state of a strategy + - it can be reguard as a enhancement to the existing strategy """ @@ -140,21 +141,24 @@ class WeightStrategyBase(BaseStrategy, AdjustTimer): def generate_target_weight_position(self, score, current, trade_date): """ - Parameters: + Generate target position from score for this date and the current position.The cash is not considered in the position + + Parameters ----------- - score : pred score for this trade date, pd.Series, index is stock_id, contain 'score' column - current : current position, use Position() class + score : pd.Series + pred score for this trade date, index is stock_id, contain 'score' column + current : Position() + current position trade_exchange : Exchange() - trade_date : trade date - generate target position from score for this date and the current position - The cash is not considered in the position + trade_date : pd.Timestamp + trade date """ raise NotImplementedError() def generate_order_list(self, score_series, current, trade_exchange, pred_date, trade_date): """ - Parameters: - ---------- + Parameters + ----------- score_series : pd.Seires stock_id , score current : Position() @@ -186,16 +190,29 @@ class WeightStrategyBase(BaseStrategy, AdjustTimer): class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): - def __init__(self, topk, n_drop, method="bottom", risk_degree=0.95, thresh=1, hold_thresh=1, **kwargs): + def __init__( + self, + topk, + n_drop, + method_sell="bottom", + method_buy="top", + risk_degree=0.95, + thresh=1, + hold_thresh=1, + only_tradable=False, + **kwargs, + ): """ - Parameters: + Parameters ----------- topk : int The number of stocks in the portfolio n_drop : int number of stocks to be replaced in each trading date - method : str - dropout method, random/bottom + method_sell : str + dropout method_sell, random/bottom + method_buy : str + dropout method_buy, random/top risk_degree : float position percentage of total value thresh : int @@ -203,12 +220,19 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): hold_thresh : int minimum holding days before sell stock , will check current.get_stock_count(order.stock_id) >= self.thresh + only_tradable : bool + will the strategy only consider the tradable stock when buying and selling. + if only_tradable: + strategy will make buy sell decision without checking the tradable state of the stock + else: + strategy will make decision with the tradable state of the stock info and avoid buy and sell them """ super(TopkDropoutStrategy, self).__init__() ListAdjustTimer.__init__(self, kwargs.get("adjust_dates", None)) self.topk = topk self.n_drop = n_drop - self.method = method + self.method_sell = method_sell + self.method_buy = method_buy self.risk_degree = risk_degree self.thresh = thresh # self.stock_count['code'] will be the days the stock has been hold @@ -216,6 +240,7 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): self.stock_count = {} self.hold_thresh = hold_thresh + self.only_tradable = only_tradable def get_risk_degree(self, date): """get_risk_degree @@ -229,7 +254,7 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): """ Gnererate order list according to score_series at trade_date, will not change current. - Parameters: + Parameters ----------- score_series : pd.Series stock_id , score @@ -244,24 +269,85 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): """ if not self.is_adjust(trade_date): return [] + + if self.only_tradable: + # If The strategy only consider tradable stock when make decision + # It needs following actions to filter stocks + def get_first_n(l, n, reverse=False): + cur_n = 0 + res = [] + for si in reversed(l) if reverse else l: + if trade_exchange.is_stock_tradable(stock_id=si, trade_date=trade_date): + res.append(si) + cur_n += 1 + if cur_n >= n: + break + return res[::-1] if reverse else res + + def get_last_n(l, n): + return get_first_n(l, n, reverse=True) + + def filter_stock(l): + return [si for si in l if trade_exchange.is_stock_tradable(stock_id=si, trade_date=trade_date)] + + else: + # Otherwise, the stock will make decision with out the stock tradable info + def get_first_n(l, n): + return list(l)[:n] + + def get_last_n(l, n): + return list(l)[-n:] + + def filter_stock(l): + return l + current_temp = copy.deepcopy(current) # generate order list for this adjust date sell_order_list = [] buy_order_list = [] # load score + cash = current_temp.get_cash() current_stock_list = current_temp.get_stock_list() + # last position (sorted by score) last = score_series.reindex(current_stock_list).sort_values(ascending=False).index - today = ( - score_series[~score_series.index.isin(last)] - .sort_values(ascending=False) - .index[: self.n_drop + self.topk - len(last)] - ) - comb = score_series.reindex(last.union(today)).sort_values(ascending=False).index - if self.method == "bottom": - sell = last[last.isin(comb[-self.n_drop :])] - elif self.method == "random": - sell = pd.Index(np.random.choice(last, self.n_drop) if len(last) else []) + # The new stocks today want to buy **at most** + if self.method_buy == "top": + today = get_first_n( + score_series[~score_series.index.isin(last)].sort_values(ascending=False).index, + self.n_drop + self.topk - len(last), + ) + elif self.method_buy == "random": + topk_candi = get_first_n(score_series.sort_values(ascending=False).index, self.topk) + candi = list(filter(lambda x: x not in last, topk_candi)) + n = self.n_drop + self.topk - len(last) + try: + today = np.random.choice(candi, n, replace=False) + except ValueError: + today = candi + else: + raise NotImplementedError(f"This type of input is not supported") + # combine(new stocks + last stocks), we will drop stocks from this list + # In case of dropping higher score stock and buying lower score stock. + comb = score_series.reindex(last.union(pd.Index(today))).sort_values(ascending=False).index + + # Get the stock list we really want to sell (After filtering the case that we sell high and buy low) + if self.method_sell == "bottom": + sell = last[last.isin(get_last_n(comb, self.n_drop))] + elif self.method_sell == "random": + candi = filter_stock(last) + try: + sell = pd.Index(np.random.choice(candi, self.n_drop, replace=False) if len(last) else []) + except ValueError: # No enough candidates + sell = candi + else: + raise NotImplementedError(f"This type of input is not supported") + + # Get the stock list we really want to buy buy = today[: len(sell) + self.topk - len(last)] + + # buy singal: if a stock falls into topk, it appear in the buy_sinal + buy_signal = score_series.sort_values(ascending=False).iloc[: self.topk].index + for code in current_stock_list: if not trade_exchange.is_stock_tradable(stock_id=code, trade_date=trade_date): continue @@ -285,12 +371,14 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): if trade_exchange.check_order(sell_order): sell_order_list.append(sell_order) trade_val, trade_cost, trade_price = trade_exchange.deal_order(sell_order, position=current_temp) + # update cash + cash += trade_val - trade_cost # sold del self.stock_count[code] else: # no buy signal, but the stock is kept self.stock_count[code] += 1 - elif code in buy: + elif code in buy_signal: # NOTE: This is different from the original version # get new buy signal # Only the stock fall in to topk will produce buy signal @@ -300,7 +388,7 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): # buy new stock # note the current has been changed current_stock_list = current_temp.get_stock_list() - value = current_temp.get_cash() * self.risk_degree / len(buy) if len(buy) > 0 else 0 + value = cash * self.risk_degree / len(buy) if len(buy) > 0 else 0 # open_cost should be considered in the real trading environment, while the backtest in evaluate.py does not consider it # as the aim of demo is to accomplish same strategy as evaluate.py, so comment out this line diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index c46528944..e972aba3c 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -14,9 +14,11 @@ class Dataset(Serializable): def __init__(self, *args, **kwargs): """ - init is designed to finish following steps + init is designed to finish following steps: + - setup data - The data related attributes' names should start with '_' so that it will not be saved on disk when serializing + - initialize the state of the dataset(info to prepare the data) - The name of essential state for preparing data should not start with '_' so that it could be serialized on disk when serializing. @@ -29,11 +31,15 @@ class Dataset(Serializable): """ setup the data - We split the setup_data function for following situation - - 1) User have a Dataset object with learned status on disk - - 2) User load the Dataset object from the disk(Note the init function is skiped) - - 3) User call `setup_data` to load new data - - 4) User prepare data for model based on previous status + We split the setup_data function for following situation: + + - User have a Dataset object with learned status on disk + + - User load the Dataset object from the disk(Note the init function is skiped) + + - User call `setup_data` to load new data + + - User prepare data for model based on previous status """ pass @@ -41,8 +47,9 @@ class Dataset(Serializable): """ The type of dataset depends on the model. (It could be pd.DataFrame, pytorch.DataLoader, etc.) The parameters should specify the scope for the prepared data - The method sould + The method should: - process the data + - return the processed data Returns @@ -55,11 +62,12 @@ class Dataset(Serializable): class DatasetH(Dataset): """ - Dataset with Data(H)anler + Dataset with Data(H)andler User should try to put the data preprocessing functions into handler. - Only following data processing functions should be placed in Dataset + Only following data processing functions should be placed in Dataset: - The processing is related to specific model. + - The processing is related to data split """ @@ -81,21 +89,26 @@ class DatasetH(Dataset): Parameters ---------- handler : Union[dict, DataHandler] - handler could be - 1) insntance of `DataHandler` - 2) config of `DataHandler`. Please refer to `DataHandler` + handler could be: + + - insntance of `DataHandler` + + - config of `DataHandler`. Please refer to `DataHandler` segments : list Describe the options to segment the data. - Here are some examples - 1) 'segments': { - 'train': ("2008-01-01", "2014-12-31"), - 'valid': ("2017-01-01", "2020-08-01",), - 'test': ("2015-01-01", "2016-12-31",), - } - 2) 'segments': { - 'insample': ("2008-01-01", "2014-12-31"), - 'outsample': ("2017-01-01", "2020-08-01",), - } + Here are some examples: + + .. code-block:: + + 1) 'segments': { + 'train': ("2008-01-01", "2014-12-31"), + 'valid': ("2017-01-01", "2020-08-01",), + 'test': ("2015-01-01", "2016-12-31",), + } + 2) 'segments': { + 'insample': ("2008-01-01", "2014-12-31"), + 'outsample': ("2017-01-01", "2020-08-01",), + } """ self._handler = init_instance_by_config(handler, accept_types=DataHandler) self._segments = segments.copy() @@ -114,9 +127,11 @@ class DatasetH(Dataset): ---------- segments : Union[List[str], Tuple[str], str, slice] Describe the scope of the data to be prepared - Here are some examples - 1) 'train' - 2) ['train', 'valid'] + Here are some examples: + + - 'train' + + - ['train', 'valid'] col_set : str The col_set will be passed to self._handler when fetching data data_key: str diff --git a/qlib/data/dataset/handler.py b/qlib/data/dataset/handler.py index e0a4d809a..4d3d88c38 100644 --- a/qlib/data/dataset/handler.py +++ b/qlib/data/dataset/handler.py @@ -41,7 +41,7 @@ class DataHandler(Serializable): Example of the data: The multi-index of the columns is optional. - .. code-block:: + .. code-block:: python feature label $close $volume Ref($close, 1) Mean($close, 3) $high-$low LABEL0 @@ -109,7 +109,8 @@ class DataHandler(Serializable): Parameters ---------- enable_cache : bool - default value is false + default value is false: + - if `enable_cache` == True: the processed data will be saved on disk, and handler will load the cached data from the disk directly @@ -378,8 +379,10 @@ class DataHandlerLP(DataHandler): init_type : str The type `IT_*` listed above enable_cache : bool - default value is false - if `enable_cache` == True: + default value is false: + + - if `enable_cache` == True: + the processed data will be saved on disk, and handler will load the cached data from the disk directly when we call `init` next time """ diff --git a/qlib/data/dataset/loader.py b/qlib/data/dataset/loader.py index e95dc4479..404313e80 100644 --- a/qlib/data/dataset/loader.py +++ b/qlib/data/dataset/loader.py @@ -39,14 +39,16 @@ class DataLoader(abc.ABC): pd.DataFrame: data load from the under layer source - Example of the data: - (The multi-index of the columns is optional.) - feature label - $close $volume Ref($close, 1) Mean($close, 3) $high-$low LABEL0 - datetime instrument - 2010-01-04 SH600000 81.807068 17145150.0 83.737389 83.016739 2.741058 0.0032 - SH600004 13.313329 11800983.0 13.313329 13.317701 0.183632 0.0042 - SH600005 37.796539 12231662.0 38.258602 37.919757 0.970325 0.0289 + Example of the data (The multi-index of the columns is optional.): + + .. code-block:: + + feature label + $close $volume Ref($close, 1) Mean($close, 3) $high-$low LABEL0 + datetime instrument + 2010-01-04 SH600000 81.807068 17145150.0 83.737389 83.016739 2.741058 0.0032 + SH600004 13.313329 11800983.0 13.313329 13.317701 0.183632 0.0042 + SH600005 37.796539 12231662.0 38.258602 37.919757 0.970325 0.0289 """ pass @@ -55,7 +57,7 @@ class DLWParser(DataLoader): """ (D)ata(L)oader (W)ith (P)arser for features and names - Extracting this class so that QlibDataLoader and other dataloaders(such as QdbDataLoader) can share the fields + Extracting this class so that QlibDataLoader and other dataloaders(such as QdbDataLoader) can share the fields. """ def __init__(self, config: Tuple[list, tuple, dict]): @@ -65,14 +67,16 @@ class DLWParser(DataLoader): config : Tuple[list, tuple, dict] Config will be used to describe the fields and column names - := { - "group_name1": - "group_name2": - } - or - := + .. code-block:: YAML - := ["expr", ...] | (["expr", ...], ["col_name", ...]) + := { + "group_name1": + "group_name2": + } + or + := + + := ["expr", ...] | (["expr", ...], ["col_name", ...]) """ self.is_group = isinstance(config, dict) diff --git a/scripts/README.md b/scripts/README.md index 88ebdc680..99af4a457 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -43,6 +43,8 @@ python get_data.py qlib_data --help ### US data +> Need to download data first: [Downlaod US Data](#Downlaod-US-Data) + ```python import qlib from qlib.config import REG_US @@ -52,6 +54,8 @@ qlib.init(provider_uri=provider_uri, region=REG_US) ### CN data +> Need to download data first: [Download CN Data](#Download-CN-Data) + ```python import qlib from qlib.config import REG_CN diff --git a/scripts/dump_bin.py b/scripts/dump_bin.py index 2bca4f037..9f6dd88e2 100644 --- a/scripts/dump_bin.py +++ b/scripts/dump_bin.py @@ -140,7 +140,7 @@ class DumpDataBase: def _get_source_data(self, file_path: Path) -> pd.DataFrame: df = pd.read_csv(str(file_path.resolve()), low_memory=False) - df[self.date_field_name] = df[self.date_field_name].astype(np.datetime64) + df[self.date_field_name] = df[self.date_field_name].astype(str).astype(np.datetime64) # df.drop_duplicates([self.date_field_name], inplace=True) return df @@ -339,10 +339,10 @@ class DumpDataFix(DumpDataAll): def dump(self): self._calendars_list = self._read_calendars(self._calendars_dir.joinpath(f"{self.freq}.txt")) # noinspection PyAttributeOutsideInit - self._old_instruments = self._read_instruments( - self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME) - ).to_dict( - orient="index" + self._old_instruments = ( + self._read_instruments(self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME)) + .set_index([self.symbol_field_name]) + .to_dict(orient="index") ) # type: dict self._dump_instruments() self._dump_features()