1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00

Merge branch 'main' of https://github.com/you-n-g/qlib into main

This commit is contained in:
Alex Wang
2020-11-25 19:41:25 +08:00
29 changed files with 1604 additions and 92 deletions

View File

@@ -196,10 +196,12 @@ Here is a list of models built on `Qlib`.
- [MLP based on pytorch](qlib/contrib/model/pytorch_nn.py)
- [GRU based on pytorch](qlib/contrib/model/pytorch_gru.py)
- [LSTM based on pytorcn](qlib/contrib/model/pytorch_lstm.py)
- [ALSTM based on pytorcn](qlib/contrib/model/pytorch_alstm.py)
- [GATs based on pytorch](qlib/contrib/model/pytorch_gats.py)
- [TabNet based on pytorch](qlib/contrib/model/tabnet.py)
- [SFM based on pytorch](qlib/contrib/model/pytorch_sfm.py)
<!-- - [TFT based on tensorflow](examples/benchmarks/TFT/tft.py) -->
- [HATs based on pytorch](qlib/contrib/model/pytorch_hats.py)
- [TFT based on tensorflow](examples/benchmarks/TFT/tft.py)
Your PR of new Quant models is highly welcomed.

View File

@@ -0,0 +1,4 @@
numpy==1.17.4
pandas==1.1.2
scikit_learn==0.23.2
torch==1.7.0

View File

@@ -0,0 +1,69 @@
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: ALSTM
module_path: qlib.contrib.model.pytorch_alstm
kwargs:
d_feat: 6
hidden_size: 64
num_layers: 2
dropout: 0.0
n_epochs: 200
lr: 1e-3
early_stop: 20
batch_size: 800
metric: IC
loss: mse
seed: 0
GPU: 0
rnn_type: GRU
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: ALPHA360_Denoise
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -0,0 +1,3 @@
# CatBoost
* Code: [https://github.com/catboost/catboost](https://github.com/catboost/catboost)
* Paper: CatBoost: unbiased boosting with categorical features. [https://proceedings.neurips.cc/paper/2018/file/14491b756b3a51daac41c24863285549-Paper.pdf](https://proceedings.neurips.cc/paper/2018/file/14491b756b3a51daac41c24863285549-Paper.pdf).

View File

@@ -30,7 +30,7 @@ task:
module_path: qlib.contrib.model.pytorch_nn
kwargs:
loss: mse
input_dim: 360
input_dim: 158
output_dim: 1
lr: 0.002
lr_decay: 0.96

View File

@@ -37,9 +37,10 @@ task:
lr: 1e-3
early_stop: 20
batch_size: 800
metric: IC
metric: loss
loss: mse
base_model: GRU
base_model: LSTM
with_pretrain: True
seed: 0
GPU: 0
dataset:

Binary file not shown.

View File

@@ -0,0 +1,4 @@
pandas==1.1.2
numpy==1.17.4
scikit_learn==0.23.2
torch==1.7.0

View File

@@ -0,0 +1,64 @@
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: HATS
module_path: qlib.contrib.model.pytorch_gats
kwargs:
d_feat: 6
hidden_size: 64
num_layers: 2
dropout: 0.6
n_epochs: 200
lr: 1e-3
early_stop: 20
batch_size: 800
metric: IC
loss: mse
base_model: GRU
seed: 0
GPU: 0
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: ALPHA360_Denoise
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

Binary file not shown.

View File

@@ -0,0 +1,4 @@
# LightGBM
* Code: [https://github.com/microsoft/LightGBM](https://github.com/microsoft/LightGBM)
* Paper: LightGBM: A Highly Efficient Gradient Boosting
Decision Tree. [https://proceedings.neurips.cc/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf](https://proceedings.neurips.cc/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf).

View File

@@ -0,0 +1,4 @@
# State-Frequency-Memory
- State Frequency Memory (SFM) is a novel recurrent network that uses Discrete Fourier Transform (DFT) to decompose the hidden states of memory cells and capture the multi-frequency trading patterns from past market data to make stock price predictions.
- The code used in Qlib is a pyTorch implementation of SFM (Zhang, L., Aggarwal, C., & Qi, G. J. (2017,)).
- Paper: Stock Price Prediction via Discovering Multi-Frequency Trading Patterns. https://www.cs.ucf.edu/~gqi/publications/kdd2017_stock.pdf.

View File

@@ -0,0 +1,4 @@
# TabNet
* TabNet is a novel high-performance and interpretable canonical deep tabular data learning architectur. TabNet uses sequential attention to choose which features to reason from at each decision step, enabling interpretability and more effcient learning as the learning capacity is used for the most salient features.
* The code used in Qlib is a pyTorch implementation of Tabnet (Arik, S. O., & Pfister, T. (2019). [https://github.com/dreamquark-ai/tabnet](https://github.com/dreamquark-ai/tabnet)
* Paper: TabNet: Attentive Interpretable Tabular Learning. [https://arxiv.org/pdf/1908.07442.pdf](https://arxiv.org/pdf/1908.07442.pdf).

View File

@@ -0,0 +1,3 @@
# XGBoost
* Code: [https://github.com/dmlc/xgboost](https://github.com/dmlc/xgboost)
* Paper: XGBoost: A Scalable Tree Boosting System. [https://dl.acm.org/doi/pdf/10.1145/2939672.2939785](https://dl.acm.org/doi/pdf/10.1145/2939672.2939785).

View File

@@ -0,0 +1,145 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import sys
from pathlib import Path
import qlib
import pandas as pd
from qlib.config import REG_CN
from qlib.contrib.model.pytorch_alstm import ALSTM
from qlib.contrib.data.handler import ALPHA360_Denoise
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
from qlib.contrib.evaluate import (
backtest as normal_backtest,
risk_analysis,
)
from qlib.utils import exists_qlib_data
# from qlib.model.learner import train_model
from qlib.utils import init_instance_by_config
import pickle
if __name__ == "__main__":
# use default data
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
from get_data import GetData
GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
qlib.init(provider_uri=provider_uri, region=REG_CN)
MARKET = "csi300"
BENCHMARK = "SH000300"
###################################
# train model
###################################
DATA_HANDLER_CONFIG = {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": MARKET,
}
TRAINER_CONFIG = {
"train_start_time": "2008-01-01",
"train_end_time": "2014-12-31",
"validate_start_time": "2015-01-01",
"validate_end_time": "2016-12-31",
"test_start_time": "2017-01-01",
"test_end_time": "2020-08-01",
}
task = {
"model": {
"class": "ALSTM",
"module_path": "qlib.contrib.model.pytorch_alstm",
"kwargs": {
"d_feat": 6,
"hidden_size": 64,
"num_layers": 2,
"dropout": 0.0,
"n_epochs": 200,
"lr": 1e-3,
"early_stop": 20,
"batch_size": 800,
"metric": "IC",
"loss": "mse",
"seed": 0,
"GPU": 0,
"rnn_type": "GRU",
},
},
"dataset": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "ALPHA360_Denoise",
"module_path": "qlib.contrib.data.handler",
"kwargs": DATA_HANDLER_CONFIG,
},
"segments": {
"train": ("2008-01-01", "2014-12-31"),
"valid": ("2015-01-01", "2016-12-31"),
"test": ("2017-01-01", "2020-08-01"),
},
},
}
# You shoud record the data in specific sequence
# "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
}
# model = train_model(task)
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])
model.fit(dataset)
pred_score = model.predict(dataset)
# save pred_score to file
pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser()
pred_score_path.parent.mkdir(exist_ok=True, parents=True)
pred_score.to_pickle(pred_score_path)
###################################
# backtest
###################################
STRATEGY_CONFIG = {
"topk": 50,
"n_drop": 5,
}
BACKTEST_CONFIG = {
"verbose": False,
"limit_threshold": 0.095,
"account": 100000000,
"benchmark": BENCHMARK,
"deal_price": "close",
"open_cost": 0.0005,
"close_cost": 0.0015,
"min_cost": 5,
}
# use default strategy
# custom Strategy, refer to: TODO: Strategy API url
strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)
report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)
###################################
# analyze
# If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb
###################################
analysis = dict()
analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"])
analysis["excess_return_with_cost"] = risk_analysis(
report_normal["return"] - report_normal["bench"] - report_normal["cost"]
)
analysis_df = pd.concat(analysis) # type: pd.DataFrame
print(analysis_df)

View File

@@ -70,9 +70,10 @@ if __name__ == "__main__":
"lr": 1e-3,
"early_stop": 20,
"batch_size": 800,
"metric": "IC",
"metric": "loss",
"loss": "mse",
"base_model": "GRU",
"base_model": "LSTM",
"with_pretrain": True,
"seed": 0,
"GPU": 0,
},

View File

@@ -0,0 +1,145 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import sys
from pathlib import Path
import qlib
import pandas as pd
from qlib.config import REG_CN
from qlib.contrib.model.pytorch_hats import HATS
from qlib.contrib.data.handler import ALPHA360_Denoise
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
from qlib.contrib.evaluate import (
backtest as normal_backtest,
risk_analysis,
)
from qlib.utils import exists_qlib_data
# from qlib.model.learner import train_model
from qlib.utils import init_instance_by_config
import pickle
if __name__ == "__main__":
# use default data
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
from get_data import GetData
GetData().qlib_data_cn(target_dir=provider_uri)
qlib.init(provider_uri=provider_uri, region=REG_CN)
MARKET = "csi300"
BENCHMARK = "SH000300"
###################################
# train model
###################################
DATA_HANDLER_CONFIG = {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": MARKET,
}
TRAINER_CONFIG = {
"train_start_time": "2008-01-01",
"train_end_time": "2014-12-31",
"validate_start_time": "2015-01-01",
"validate_end_time": "2016-12-31",
"test_start_time": "2017-01-01",
"test_end_time": "2020-08-01",
}
task = {
"model": {
"class": "HATS",
"module_path": "qlib.contrib.model.pytorch_hats",
"kwargs": {
"d_feat": 6,
"hidden_size": 64,
"num_layers": 2,
"dropout": 0.6,
"n_epochs": 200,
"lr": 1e-3,
"early_stop": 20,
"batch_size": 800,
"metric": "IC",
"loss": "mse",
"base_model": "LSTM",
"seed": 0,
"GPU": 0,
},
},
"dataset": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "ALPHA360_Denoise",
"module_path": "qlib.contrib.data.handler",
"kwargs": DATA_HANDLER_CONFIG,
},
"segments": {
"train": ("2008-01-01", "2014-12-31"),
"valid": ("2015-01-01", "2016-12-31"),
"test": ("2017-01-01", "2020-08-01"),
},
},
}
# You shoud record the data in specific sequence
# "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
}
# model = train_model(task)
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])
model.fit(dataset, save_path="benchmarks/HATS/model_hat.pkl")
pred_score = model.predict(dataset)
# save pred_score to file
pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser()
pred_score_path.parent.mkdir(exist_ok=True, parents=True)
pred_score.to_pickle(pred_score_path)
###################################
# backtest
###################################
STRATEGY_CONFIG = {
"topk": 50,
"n_drop": 5,
}
BACKTEST_CONFIG = {
"verbose": False,
"limit_threshold": 0.095,
"account": 100000000,
"benchmark": BENCHMARK,
"deal_price": "close",
"open_cost": 0.0005,
"close_cost": 0.0015,
"min_cost": 5,
}
# use default strategy
# custom Strategy, refer to: TODO: Strategy API url
strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)
report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)
###################################
# analyze
# If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb
###################################
analysis = dict()
analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"])
analysis["excess_return_with_cost"] = risk_analysis(
report_normal["return"] - report_normal["bench"] - report_normal["cost"]
)
analysis_df = pd.concat(analysis) # type: pd.DataFrame
print(analysis_df)

View File

@@ -190,7 +190,8 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k
Parameters
----------
# backtest workflow related or commmon arguments
- **backtest workflow related or commmon arguments**
pred : pandas.DataFrame
predict should has <datetime, instrument> index and one `score` column
account : float
@@ -202,7 +203,8 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k
verbose : bool
whether to print log
# strategy related arguments
- **strategy related arguments**
strategy : Strategy()
strategy used in backtest
topk : int (Default value: 50)
@@ -225,7 +227,8 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k
str_type: 'amount', 'weight' or 'dropout'
strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy
# exchange related arguments
- **exchange related arguments**
exchange: Exchange()
pass the exchange for speeding up.
subscribe_fields: list

View File

@@ -1,3 +1,15 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pandas as pd
from catboost import Pool, CatBoost

View File

@@ -0,0 +1,394 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import copy
from sklearn.metrics import roc_auc_score, mean_squared_error
import logging
from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index
from ...log import get_module_logger, TimeInspector
import torch
import torch.nn as nn
import torch.optim as optim
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
class ALSTM(Model):
"""ALSTM Model
Parameters
----------
input_dim : int
input dimension
output_dim : int
output dimension
layers : tuple
layer sizes
lr : float
learning rate
optimizer : str
optimizer name
GPU : str
the GPU ID(s) used for training
"""
def __init__(
self,
d_feat=6,
hidden_size=64,
num_layers=2,
dropout=0.0,
n_epochs=200,
lr=0.001,
metric="IC",
batch_size=2000,
early_stop=20,
loss="mse",
optimizer="adam",
GPU="0",
seed=0,
rnn_type="GRU",
**kwargs
):
# Set logger.
self.logger = get_module_logger("ALSTM")
self.logger.info("ALSTM pytorch version...")
# set hyper-parameters.
self.d_feat = d_feat
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = dropout
self.n_epochs = n_epochs
self.lr = lr
self.metric = metric
self.batch_size = batch_size
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
self.visible_GPU = GPU
self.use_gpu = torch.cuda.is_available()
self.seed = seed
self.rnn_type = rnn_type
self.logger.info(
"ALSTM parameters setting:"
"\nd_feat : {}"
"\nhidden_size : {}"
"\nnum_layers : {}"
"\ndropout : {}"
"\nn_epochs : {}"
"\nlr : {}"
"\nmetric : {}"
"\nbatch_size : {}"
"\nearly_stop : {}"
"\noptimizer : {}"
"\nloss_type : {}"
"\nvisible_GPU : {}"
"\nuse_GPU : {}"
"\nseed : {}"
"\nrnn_type : {}".format(
d_feat,
hidden_size,
num_layers,
dropout,
n_epochs,
lr,
metric,
batch_size,
early_stop,
optimizer.lower(),
loss,
GPU,
self.use_gpu,
seed,
self.rnn_type,
)
)
if loss not in {"mse", "binary"}:
raise NotImplementedError("loss {} is not supported!".format(loss))
self._scorer = mean_squared_error if loss == "mse" else roc_auc_score
self.alstm_model = ALSTMModel(
d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout
)
# def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, input_day=20, rnn_type="GRU"):
if optimizer.lower() == "adam":
self.train_optimizer = optim.Adam(self.alstm_model.parameters(), lr=self.lr)
elif optimizer.lower() == "gd":
self.train_optimizer = optim.SGD(self.alstm_model.parameters(), lr=self.lr)
else:
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self._fitted = False
if self.use_gpu:
self.alstm_model.cuda()
# set the visible GPU
if self.visible_GPU:
os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
def mse(self, pred, label):
loss = (pred - label) ** 2
return torch.mean(loss)
def loss_fn(self, pred, label):
mask = ~torch.isnan(label)
if self.loss == "mse":
return self.mse(pred[mask], label[mask])
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
mask = torch.isfinite(label)
if self.metric == "IC":
return self.cal_ic(pred[mask], label[mask])
if self.metric == "" or self.metric == "loss": # use loss
return -self.loss_fn(pred[mask], label[mask])
raise ValueError("unknown metric `%s`" % self.metric)
def cal_ic(self, pred, label):
return torch.mean(pred * label)
def train_epoch(self, x_train, y_train):
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values) * 100
self.alstm_model.train()
indices = np.arange(len(x_train_values))
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
pred = self.alstm_model(feature)
loss = self.loss_fn(pred, label)
self.train_optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_value_(self.alstm_model.parameters(), 3.0)
self.train_optimizer.step()
def test_epoch(self, data_x, data_y):
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
self.alstm_model.eval()
scores = []
losses = []
indices = np.arange(len(x_values))
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
pred = self.alstm_model(feature)
loss = self.loss_fn(pred, label)
losses.append(loss.item())
score = self.metric_fn(pred, label)
scores.append(score.item())
return np.mean(losses), np.mean(scores)
def fit(
self,
dataset: DatasetH,
evals_result=dict(),
verbose=True,
save_path=None,
):
df_train, df_valid, df_test = dataset.prepare(
["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
)
x_train, y_train = df_train["feature"], df_train["label"]
x_valid, y_valid = df_valid["feature"], df_valid["label"]
if save_path == None:
save_path = create_save_path(save_path)
stop_steps = 0
train_loss = 0
best_score = -np.inf
best_epoch = 0
evals_result["train"] = []
evals_result["valid"] = []
# train
self.logger.info("training...")
self._fitted = True
# return
for step in range(self.n_epochs):
self.logger.info("Epoch%d:", step)
self.logger.info("training...")
self.train_epoch(x_train, y_train)
self.logger.info("evaluating...")
train_loss, train_score = self.test_epoch(x_train, y_train)
val_loss, val_score = self.test_epoch(x_valid, y_valid)
self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
evals_result["train"].append(train_score)
evals_result["valid"].append(val_score)
if val_score > best_score:
best_score = val_score
stop_steps = 0
best_epoch = step
best_param = copy.deepcopy(self.alstm_model.state_dict())
else:
stop_steps += 1
if stop_steps >= self.early_stop:
self.logger.info("early stop")
break
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
self.alstm_model.load_state_dict(best_param)
torch.save(best_param, save_path)
if self.use_gpu:
torch.cuda.empty_cache()
def predict(self, dataset):
if not self._fitted:
raise ValueError("model is not fitted yet!")
x_test = dataset.prepare("test", col_set="feature")
index = x_test.index
self.alstm_model.eval()
x_values = x_test.values
sample_num = x_values.shape[0]
preds = []
for begin in range(sample_num)[:: self.batch_size]:
if sample_num - begin < self.batch_size:
end = sample_num
else:
end = begin + self.batch_size
x_batch = torch.from_numpy(x_values[begin:end]).float()
if self.use_gpu:
x_batch = x_batch.cuda()
with torch.no_grad():
if self.use_gpu:
pred = self.alstm_model(x_batch).detach().cpu().numpy()
else:
pred = self.alstm_model(x_batch).detach().numpy()
preds.append(pred)
return pd.Series(np.concatenate(preds), index=index)
class GRUModel(nn.Module):
def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0):
super().__init__()
self.rnn = nn.GRU(
input_size=d_feat,
hidden_size=hidden_size,
num_layers=num_layers,
batch_first=True,
dropout=dropout,
)
self.fc_out = nn.Linear(hidden_size, 1)
self.d_feat = d_feat
def forward(self, x):
# x: [N, F*T]
x = x.reshape(len(x), self.d_feat, -1) # [N, F, T]
x = x.permute(0, 2, 1) # [N, T, F]
out, _ = self.rnn(x)
return self.fc_out(out[:, -1, :]).squeeze()
class ALSTMModel(nn.Module):
def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, rnn_type="GRU"):
super().__init__()
self.hid_size = hidden_size
self.input_size = d_feat
self.dropout = dropout
self.rnn_type = rnn_type
self.rnn_layer = num_layers
self._build_model()
def _build_model(self):
try:
klass = getattr(nn, self.rnn_type.upper())
except:
raise ValueError("unknown rnn_type `%s`" % self.rnn_type)
self.net = nn.Sequential()
self.net.add_module("fc_in", nn.Linear(in_features=self.input_size, out_features=self.hid_size))
self.net.add_module("act", nn.Tanh())
self.rnn = klass(
input_size=self.hid_size,
hidden_size=self.hid_size,
num_layers=self.rnn_layer,
batch_first=True,
dropout=self.dropout,
)
self.fc_out = nn.Linear(in_features=self.hid_size * 2, out_features=1)
# self.fc_out = nn.Linear(in_features=self.hid_size, out_features=1)
self.att_net = nn.Sequential()
self.att_net.add_module("att_fc_in", nn.Linear(in_features=self.hid_size, out_features=int(self.hid_size / 2)))
self.att_net.add_module("att_dropout", torch.nn.Dropout(self.dropout))
self.att_net.add_module("att_act", nn.Tanh())
self.att_net.add_module("att_fc_out", nn.Linear(in_features=int(self.hid_size / 2), out_features=1, bias=False))
self.att_net.add_module("att_softmax", nn.Softmax(dim=1))
def forward(self, inputs):
# inputs: [batch_size, input_size*input_day]
inputs = inputs.view(len(inputs), self.input_size, -1)
inputs = inputs.permute(0, 2, 1) # [batch, input_size, seq_len] -> [batch, seq_len, input_size]
rnn_out, _ = self.rnn(self.net(inputs)) # [batch, seq_len, num_directions * hidden_size]
attention_score = self.att_net(rnn_out) # [batch, seq_len, 1]
out_att = torch.mul(rnn_out, attention_score)
out_att = torch.sum(out_att, dim=1)
out = self.fc_out(
torch.cat((rnn_out[:, -1, :], out_att), dim=1)
) # [batch, seq_len, num_directions * hidden_size] -> [batch, 1]
# out = self.fc_out(rnn_out[:, -1, :] + out_att)
return out[..., 0]

View File

@@ -55,6 +55,7 @@ class GAT(Model):
early_stop=20,
loss="mse",
base_model="GRU",
with_pretrain=True,
optimizer="adam",
GPU="0",
seed=0,
@@ -77,6 +78,7 @@ class GAT(Model):
self.optimizer = optimizer.lower()
self.loss = loss
self.base_model = base_model
self.with_pretrain = with_pretrain
self.visible_GPU = GPU
self.use_gpu = torch.cuda.is_available()
self.seed = seed
@@ -95,6 +97,7 @@ class GAT(Model):
"\noptimizer : {}"
"\nloss_type : {}"
"\nbase_model : {}"
"\nwith_pretrain : {}"
"\nvisible_GPU : {}"
"\nuse_GPU : {}"
"\nseed : {}".format(
@@ -110,6 +113,7 @@ class GAT(Model):
optimizer.lower(),
loss,
base_model,
with_pretrain,
GPU,
self.use_gpu,
seed,
@@ -256,6 +260,25 @@ class GAT(Model):
evals_result["train"] = []
evals_result["valid"] = []
# load pretrained base_model
if self.with_pretrain:
self.logger.info("Loading pretrained model...")
if self.base_model == "LSTM":
from ...contrib.model.pytorch_lstm import LSTMModel
pretrained_model = LSTMModel()
pretrained_model.load_state_dict(torch.load("benchmarks/LSTM/model_lstm_csi300.pkl"))
elif self.base_model == "GRU":
from ...contrib.model.pytorch_gru import GRUModel
pretrained_model = GRUModel()
pretrained_model.load_state_dict(torch.load("benchmarks/GRU/model_gru_csi300.pkl"))
model_dict = self.GAT_model.state_dict()
pretrained_dict = {k: v for k, v in pretrained_model.state_dict().items() if k in model_dict}
model_dict.update(pretrained_dict)
self.GAT_model.load_state_dict(model_dict)
self.logger.info("Loading pretrained model Done...")
# train
self.logger.info("training...")
self._fitted = True

View File

@@ -0,0 +1,504 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import copy
from sklearn.metrics import roc_auc_score, mean_squared_error
import logging
from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index
from ...log import get_module_logger, TimeInspector
import torch
import torch.nn as nn
import torch.optim as optim
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
class HATS(Model):
"""HATS Model
Parameters
----------
input_dim : int
input dimension
output_dim : int
output dimension
layers : tuple
layer sizes
lr : float
learning rate
optimizer : str
optimizer name
GPU : str
the GPU ID(s) used for training
"""
def __init__(
self,
d_feat=6,
hidden_size=64,
num_layers=2,
dropout=0.5,
n_epochs=200,
lr=0.01,
metric="IC",
batch_size=800,
early_stop=20,
loss="mse",
base_model="GRU",
with_pretrain=True,
optimizer="adam",
GPU="0",
seed=0,
**kwargs
):
# Set logger.
self.logger = get_module_logger("HATS")
self.logger.info("HATS pytorch version...")
# set hyper-parameters.
self.d_feat = d_feat
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = dropout
self.n_epochs = n_epochs
self.lr = lr
self.metric = metric
self.batch_size = batch_size
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
self.base_model = base_model
self.with_pretrain = with_pretrain #### True if train HATS with pretrained base model
self.visible_GPU = GPU
self.use_gpu = torch.cuda.is_available()
self.seed = seed
self.logger.info(
"HATS parameters setting:"
"\nd_feat : {}"
"\nhidden_size : {}"
"\nnum_layers : {}"
"\ndropout : {}"
"\nn_epochs : {}"
"\nlr : {}"
"\nmetric : {}"
"\nbatch_size : {}"
"\nearly_stop : {}"
"\noptimizer : {}"
"\nloss_type : {}"
"\nbase_model : {}"
"\nwith_pretrain : {}" ##### debug
"\nvisible_GPU : {}"
"\nuse_GPU : {}"
"\nseed : {}".format(
d_feat,
hidden_size,
num_layers,
dropout,
n_epochs,
lr,
metric,
batch_size,
early_stop,
optimizer.lower(),
loss,
base_model,
with_pretrain, ### debug
GPU,
self.use_gpu,
seed,
)
)
if loss not in {"mse", "binary"}:
raise NotImplementedError("loss {} is not supported!".format(loss))
self._scorer = mean_squared_error if loss == "mse" else roc_auc_score
self.HATS_model = HATSModel(
d_feat=self.d_feat,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
dropout=self.dropout,
base_model=self.base_model,
)
if optimizer.lower() == "adam":
self.train_optimizer = optim.Adam(self.HATS_model.parameters(), lr=self.lr)
elif optimizer.lower() == "gd":
self.train_optimizer = optim.SGD(self.HATS_model.parameters(), lr=self.lr)
else:
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self._fitted = False
if self.use_gpu:
self.HATS_model.cuda()
# set the visible GPU
if self.visible_GPU:
os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
def mse(self, pred, label):
loss = (pred - label) ** 2
return torch.mean(loss)
def loss_fn(self, pred, label):
mask = ~torch.isnan(label)
if self.loss == "mse":
return self.mse(pred[mask], label[mask])
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
mask = torch.isfinite(label)
if self.metric == "IC":
return self.cal_ic(pred[mask], label[mask])
if self.metric == "" or self.metric == "loss": # use loss
return -self.loss_fn(pred[mask], label[mask])
raise ValueError("unknown metric `%s`" % self.metric)
def cal_ic(self, pred, label):
return torch.mean(pred * label)
def train_epoch(self, x_train, y_train):
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values) * 100
self.HATS_model.train()
indices = np.arange(len(x_train_values))
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
pred = self.HATS_model(feature)
loss = self.loss_fn(pred, label)
self.train_optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_value_(self.HATS_model.parameters(), 3.0)
self.train_optimizer.step()
def test_epoch(self, data_x, data_y):
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
self.HATS_model.eval()
scores = []
losses = []
indices = np.arange(len(x_values))
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
pred = self.HATS_model(feature)
loss = self.loss_fn(pred, label)
losses.append(loss.item())
score = self.metric_fn(pred, label)
scores.append(score.item())
return np.mean(losses), np.mean(scores)
def fit(
self,
dataset: DatasetH,
evals_result=dict(),
verbose=True,
save_path=None,
):
df_train, df_valid, df_test = dataset.prepare(
["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
)
x_train, y_train = df_train["feature"], df_train["label"]
x_valid, y_valid = df_valid["feature"], df_valid["label"]
if save_path == None:
save_path = create_save_path(save_path)
stop_steps = 0
train_loss = 0
best_score = -np.inf
best_epoch = 0
evals_result["train"] = []
evals_result["valid"] = []
# load pretrained base_model
if self.with_pretrain:
self.logger.info("loading pretrained model...")
if self.base_model == "LSTM":
from ...contrib.model.pytorch_lstm import LSTMModel
pretrained_model = LSTMModel()
pretrained_model.load_state_dict(torch.load("benchmarks/LSTM/model_lstm_csi300.pkl"))
elif self.base_model == "GRU":
from ...contrib.model.pytorch_gru import GRUModel
pretrained_model = GRUModel()
pretrained_model.load_state_dict(torch.load("benchmarks/GRU/model_gru_csi300.pkl"))
model_dict = self.HATS_model.state_dict()
# filter unnecessary parameters
pretrained_dict = {k: v for k, v in pretrained_model.state_dict().items() if k in model_dict}
# overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
# load the new state dict
self.HATS_model.load_state_dict(model_dict)
self.logger.info("loading pretrained model Done...")
# train
self.logger.info("training...")
self._fitted = True
# return
for step in range(self.n_epochs):
self.logger.info("Epoch%d:", step)
self.logger.info("training...")
self.train_epoch(x_train, y_train)
self.logger.info("evaluating...")
train_loss, train_score = self.test_epoch(x_train, y_train)
val_loss, val_score = self.test_epoch(x_valid, y_valid)
self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
evals_result["train"].append(train_score)
evals_result["valid"].append(val_score)
if val_score > best_score:
best_score = val_score
stop_steps = 0
best_epoch = step
best_param = copy.deepcopy(self.HATS_model.state_dict())
else:
stop_steps += 1
if stop_steps >= self.early_stop:
self.logger.info("early stop")
break
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
self.HATS_model.load_state_dict(best_param)
torch.save(best_param, save_path)
if self.use_gpu:
torch.cuda.empty_cache()
def predict(self, dataset):
if not self._fitted:
raise ValueError("model is not fitted yet!")
x_test = dataset.prepare("test", col_set="feature")
index = x_test.index
self.HATS_model.eval()
x_values = x_test.values
sample_num = x_values.shape[0]
preds = []
for begin in range(sample_num)[:: self.batch_size]:
if sample_num - begin < self.batch_size:
end = sample_num
else:
end = begin + self.batch_size
x_batch = torch.from_numpy(x_values[begin:end]).float()
if self.use_gpu:
x_batch = x_batch.cuda()
with torch.no_grad():
if self.use_gpu:
pred = self.HATS_model(x_batch).detach().cpu().numpy()
else:
pred = self.HATS_model(x_batch).detach().numpy()
preds.append(pred)
return pd.Series(np.concatenate(preds), index=index)
class HATSModel(nn.Module):
def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, base_model="GRU"):
super().__init__()
if base_model == "GRU":
self.model = nn.GRU(
input_size=d_feat,
hidden_size=hidden_size,
num_layers=num_layers,
batch_first=True,
dropout=dropout,
)
elif base_model == "LSTM":
self.model = nn.LSTM(
input_size=d_feat,
hidden_size=hidden_size,
num_layers=num_layers,
batch_first=True,
dropout=dropout,
)
else:
raise ValueError("unknown base model name `%s`" % base_model)
self.hidden_size = hidden_size
self.bn1 = nn.BatchNorm1d(num_features=hidden_size, track_running_stats=False)
self.fc = nn.Linear(hidden_size, hidden_size)
self.bn2 = nn.BatchNorm1d(num_features=hidden_size, track_running_stats=False)
self.fc_out = nn.Linear(hidden_size, 1)
self.leaky_relu = nn.LeakyReLU()
self.softmax = nn.Softmax(dim=1)
self.d_feat = d_feat
num_head_att = [1] * num_layers
hidden_dim = [hidden_size] * num_layers
dims = [d_feat] + [d * nh for (d, nh) in zip(hidden_dim, num_head_att[:-1])] + [num_head_att[-1]]
in_dims = dims[:-1]
out_dims = [d // nh for (d, nh) in zip(dims[1:], num_head_att)]
self.attn = nn.ModuleList(
[GraphAttention(i, o, nh, dropout) for (i, o, nh) in zip(in_dims, out_dims, num_head_att)]
)
self.bns = nn.ModuleList([nn.BatchNorm1d(dim) for dim in dims[1:-1]])
self.dropout = nn.Dropout(dropout)
self.elu = nn.ELU()
def forward(self, x):
x = x.reshape(len(x), self.d_feat, -1) # [N, F, T]
x = x.permute(0, 2, 1) # [N, T, F]
out, _ = self.model(x)
hidden = out[:, -1, :]
hidden = self.bn1(hidden)
attention = GraphAttention.cal_attention(hidden, hidden)
output = attention.mm(hidden)
output = self.fc(output)
output = self.bn2(output)
output = self.leaky_relu(output)
return self.fc_out(output).squeeze()
class GraphAttention(nn.Module):
def __init__(self, input_dim, output_dim, num_heads, dropout=0.5):
super().__init__()
"""
Parameters
----------
input_dim : int
Dimension of input node features.
output_dim : int
Dimension of output node features.
num_heads : list of ints
Number of attention heads in each hidden layer and output layer. Must be non empty. Note that len(num_heads) = len(hidden_dims)+1.
dropout : float
Dropout rate. Default: 0.5.
"""
self.input_dim = input_dim
self.output_dim = output_dim
self.num_heads = num_heads
self.fcs = nn.ModuleList([nn.Linear(input_dim, output_dim) for _ in range(num_heads)])
self.a = nn.ModuleList([nn.Linear(2 * output_dim, 1) for _ in range(num_heads)])
self.dropout = nn.Dropout(dropout)
self.softmax = nn.Softmax(dim=0)
self.leakyrelu = nn.LeakyReLU()
def forward(self, features, nodes, mapping, rows):
"""
Parameters
----------
features : torch.Tensor
An (n' x input_dim) tensor of input node features.
node_layers : list of numpy array
node_layers[i] is an array of the nodes in the ith layer of the
computation graph.
mappings : list of dictionary
mappings[i] is a dictionary mapping node v (labelled 0 to |V|-1)
in node_layers[i] to its position in node_layers[i]. For example,
if node_layers[i] = [2,5], then mappings[i][2] = 0 and
mappings[i][5] = 1.
rows : numpy array
rows[i] is an array of neighbors of node i.
Returns
-------
out : torch.Tensor
An (len(node_layers[-1]) x output_dim) tensor of output node features.
"""
nprime = features.shape[0]
rows = [np.array([mapping[v] for v in row], dtype=np.int64) for row in rows]
sum_degs = np.hstack(([0], np.cumsum([len(row) for row in rows])))
mapped_nodes = [mapping[v] for v in nodes]
indices = torch.LongTensor([[v, c] for (v, row) in zip(mapped_nodes, rows) for c in row]).t()
out = []
for k in range(self.num_heads):
h = self.fcs[k](features)
nbr_h = torch.cat(tuple([h[row] for row in rows]), dim=0)
self_h = torch.cat(tuple([h[mapping[nodes[i]]].repeat(len(row), 1) for (i, row) in enumerate(rows)]), dim=0)
cat_h = torch.cat((self_h, nbr_h), dim=1)
e = self.leakyrelu(self.a[k](cat_h))
alpha = [self.softmax(e[lo:hi]) for (lo, hi) in zip(sum_degs, sum_degs[1:])]
alpha = torch.cat(tuple(alpha), dim=0)
alpha = alpha.squeeze(1)
alpha = self.dropout(alpha)
adj = torch.sparse.FloatTensor(indices, alpha, torch.Size([nprime, nprime]))
out.append(torch.sparse.mm(adj, h)[mapped_nodes])
return out
def cal_attention(x, y):
att_x = torch.mean(x, dim=1).reshape(-1, 1)
att_y = torch.mean(y, dim=1).reshape(-1, 1)
att = att_x.mm(torch.t(att_y))
x_att = x.reshape(x.shape[0], 1, x.shape[1]).repeat(1, y.shape[0], 1)
y_att = y.reshape(1, y.shape[0], y.shape[1]).repeat(x.shape[0], 1, 1)
return (
torch.mean(
x.reshape(x.shape[0], 1, x.shape[1]).repeat(1, y.shape[0], 1)
* y.reshape(1, y.shape[0], y.shape[1]).repeat(x.shape[0], 1, 1),
dim=2,
)
- att
)

View File

@@ -1,5 +1,14 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pandas as pd

View File

@@ -26,7 +26,9 @@ class BaseStrategy:
def generate_order_list(self, score_series, current, trade_exchange, pred_date, trade_date):
"""
Parameters:
DO NOT directly change the state of current
Parameters
-----------
score_series : pd.Seires
stock_id , score
@@ -39,14 +41,12 @@ class BaseStrategy:
predict date
trade_date : pd.Timestamp
trade date
DO NOT directly change the state of current
"""
pass
def update(self, score_series, pred_date, trade_date):
"""User can use this method to update strategy state each trade date.
Parameters:
Parameters
-----------
score_series : pd.Series
stock_id , score
@@ -98,8 +98,9 @@ class AdjustTimer:
"""AdjustTimer
Responsible for timing of position adjusting
This is designed as multiple inheritance mechanism due to
This is designed as multiple inheritance mechanism due to:
- the is_adjust may need access to the internel state of a strategy
- it can be reguard as a enhancement to the existing strategy
"""
@@ -140,21 +141,24 @@ class WeightStrategyBase(BaseStrategy, AdjustTimer):
def generate_target_weight_position(self, score, current, trade_date):
"""
Parameters:
Generate target position from score for this date and the current position.The cash is not considered in the position
Parameters
-----------
score : pred score for this trade date, pd.Series, index is stock_id, contain 'score' column
current : current position, use Position() class
score : pd.Series
pred score for this trade date, index is stock_id, contain 'score' column
current : Position()
current position
trade_exchange : Exchange()
trade_date : trade date
generate target position from score for this date and the current position
The cash is not considered in the position
trade_date : pd.Timestamp
trade date
"""
raise NotImplementedError()
def generate_order_list(self, score_series, current, trade_exchange, pred_date, trade_date):
"""
Parameters:
----------
Parameters
-----------
score_series : pd.Seires
stock_id , score
current : Position()
@@ -186,16 +190,29 @@ class WeightStrategyBase(BaseStrategy, AdjustTimer):
class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
def __init__(self, topk, n_drop, method="bottom", risk_degree=0.95, thresh=1, hold_thresh=1, **kwargs):
def __init__(
self,
topk,
n_drop,
method_sell="bottom",
method_buy="top",
risk_degree=0.95,
thresh=1,
hold_thresh=1,
only_tradable=False,
**kwargs,
):
"""
Parameters:
Parameters
-----------
topk : int
The number of stocks in the portfolio
n_drop : int
number of stocks to be replaced in each trading date
method : str
dropout method, random/bottom
method_sell : str
dropout method_sell, random/bottom
method_buy : str
dropout method_buy, random/top
risk_degree : float
position percentage of total value
thresh : int
@@ -203,12 +220,19 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
hold_thresh : int
minimum holding days
before sell stock , will check current.get_stock_count(order.stock_id) >= self.thresh
only_tradable : bool
will the strategy only consider the tradable stock when buying and selling.
if only_tradable:
strategy will make buy sell decision without checking the tradable state of the stock
else:
strategy will make decision with the tradable state of the stock info and avoid buy and sell them
"""
super(TopkDropoutStrategy, self).__init__()
ListAdjustTimer.__init__(self, kwargs.get("adjust_dates", None))
self.topk = topk
self.n_drop = n_drop
self.method = method
self.method_sell = method_sell
self.method_buy = method_buy
self.risk_degree = risk_degree
self.thresh = thresh
# self.stock_count['code'] will be the days the stock has been hold
@@ -216,6 +240,7 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
self.stock_count = {}
self.hold_thresh = hold_thresh
self.only_tradable = only_tradable
def get_risk_degree(self, date):
"""get_risk_degree
@@ -229,7 +254,7 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
"""
Gnererate order list according to score_series at trade_date, will not change current.
Parameters:
Parameters
-----------
score_series : pd.Series
stock_id , score
@@ -244,24 +269,85 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
"""
if not self.is_adjust(trade_date):
return []
if self.only_tradable:
# If The strategy only consider tradable stock when make decision
# It needs following actions to filter stocks
def get_first_n(l, n, reverse=False):
cur_n = 0
res = []
for si in reversed(l) if reverse else l:
if trade_exchange.is_stock_tradable(stock_id=si, trade_date=trade_date):
res.append(si)
cur_n += 1
if cur_n >= n:
break
return res[::-1] if reverse else res
def get_last_n(l, n):
return get_first_n(l, n, reverse=True)
def filter_stock(l):
return [si for si in l if trade_exchange.is_stock_tradable(stock_id=si, trade_date=trade_date)]
else:
# Otherwise, the stock will make decision with out the stock tradable info
def get_first_n(l, n):
return list(l)[:n]
def get_last_n(l, n):
return list(l)[-n:]
def filter_stock(l):
return l
current_temp = copy.deepcopy(current)
# generate order list for this adjust date
sell_order_list = []
buy_order_list = []
# load score
cash = current_temp.get_cash()
current_stock_list = current_temp.get_stock_list()
# last position (sorted by score)
last = score_series.reindex(current_stock_list).sort_values(ascending=False).index
today = (
score_series[~score_series.index.isin(last)]
.sort_values(ascending=False)
.index[: self.n_drop + self.topk - len(last)]
)
comb = score_series.reindex(last.union(today)).sort_values(ascending=False).index
if self.method == "bottom":
sell = last[last.isin(comb[-self.n_drop :])]
elif self.method == "random":
sell = pd.Index(np.random.choice(last, self.n_drop) if len(last) else [])
# The new stocks today want to buy **at most**
if self.method_buy == "top":
today = get_first_n(
score_series[~score_series.index.isin(last)].sort_values(ascending=False).index,
self.n_drop + self.topk - len(last),
)
elif self.method_buy == "random":
topk_candi = get_first_n(score_series.sort_values(ascending=False).index, self.topk)
candi = list(filter(lambda x: x not in last, topk_candi))
n = self.n_drop + self.topk - len(last)
try:
today = np.random.choice(candi, n, replace=False)
except ValueError:
today = candi
else:
raise NotImplementedError(f"This type of input is not supported")
# combine(new stocks + last stocks), we will drop stocks from this list
# In case of dropping higher score stock and buying lower score stock.
comb = score_series.reindex(last.union(pd.Index(today))).sort_values(ascending=False).index
# Get the stock list we really want to sell (After filtering the case that we sell high and buy low)
if self.method_sell == "bottom":
sell = last[last.isin(get_last_n(comb, self.n_drop))]
elif self.method_sell == "random":
candi = filter_stock(last)
try:
sell = pd.Index(np.random.choice(candi, self.n_drop, replace=False) if len(last) else [])
except ValueError: # No enough candidates
sell = candi
else:
raise NotImplementedError(f"This type of input is not supported")
# Get the stock list we really want to buy
buy = today[: len(sell) + self.topk - len(last)]
# buy singal: if a stock falls into topk, it appear in the buy_sinal
buy_signal = score_series.sort_values(ascending=False).iloc[: self.topk].index
for code in current_stock_list:
if not trade_exchange.is_stock_tradable(stock_id=code, trade_date=trade_date):
continue
@@ -285,12 +371,14 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
if trade_exchange.check_order(sell_order):
sell_order_list.append(sell_order)
trade_val, trade_cost, trade_price = trade_exchange.deal_order(sell_order, position=current_temp)
# update cash
cash += trade_val - trade_cost
# sold
del self.stock_count[code]
else:
# no buy signal, but the stock is kept
self.stock_count[code] += 1
elif code in buy:
elif code in buy_signal:
# NOTE: This is different from the original version
# get new buy signal
# Only the stock fall in to topk will produce buy signal
@@ -300,7 +388,7 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
# buy new stock
# note the current has been changed
current_stock_list = current_temp.get_stock_list()
value = current_temp.get_cash() * self.risk_degree / len(buy) if len(buy) > 0 else 0
value = cash * self.risk_degree / len(buy) if len(buy) > 0 else 0
# open_cost should be considered in the real trading environment, while the backtest in evaluate.py does not consider it
# as the aim of demo is to accomplish same strategy as evaluate.py, so comment out this line

View File

@@ -14,9 +14,11 @@ class Dataset(Serializable):
def __init__(self, *args, **kwargs):
"""
init is designed to finish following steps
init is designed to finish following steps:
- setup data
- The data related attributes' names should start with '_' so that it will not be saved on disk when serializing
- initialize the state of the dataset(info to prepare the data)
- The name of essential state for preparing data should not start with '_' so that it could be serialized on disk when serializing.
@@ -29,11 +31,15 @@ class Dataset(Serializable):
"""
setup the data
We split the setup_data function for following situation
- 1) User have a Dataset object with learned status on disk
- 2) User load the Dataset object from the disk(Note the init function is skiped)
- 3) User call `setup_data` to load new data
- 4) User prepare data for model based on previous status
We split the setup_data function for following situation:
- User have a Dataset object with learned status on disk
- User load the Dataset object from the disk(Note the init function is skiped)
- User call `setup_data` to load new data
- User prepare data for model based on previous status
"""
pass
@@ -41,8 +47,9 @@ class Dataset(Serializable):
"""
The type of dataset depends on the model. (It could be pd.DataFrame, pytorch.DataLoader, etc.)
The parameters should specify the scope for the prepared data
The method sould
The method should:
- process the data
- return the processed data
Returns
@@ -55,11 +62,12 @@ class Dataset(Serializable):
class DatasetH(Dataset):
"""
Dataset with Data(H)anler
Dataset with Data(H)andler
User should try to put the data preprocessing functions into handler.
Only following data processing functions should be placed in Dataset
Only following data processing functions should be placed in Dataset:
- The processing is related to specific model.
- The processing is related to data split
"""
@@ -81,21 +89,26 @@ class DatasetH(Dataset):
Parameters
----------
handler : Union[dict, DataHandler]
handler could be
1) insntance of `DataHandler`
2) config of `DataHandler`. Please refer to `DataHandler`
handler could be:
- insntance of `DataHandler`
- config of `DataHandler`. Please refer to `DataHandler`
segments : list
Describe the options to segment the data.
Here are some examples
1) 'segments': {
'train': ("2008-01-01", "2014-12-31"),
'valid': ("2017-01-01", "2020-08-01",),
'test': ("2015-01-01", "2016-12-31",),
}
2) 'segments': {
'insample': ("2008-01-01", "2014-12-31"),
'outsample': ("2017-01-01", "2020-08-01",),
}
Here are some examples:
.. code-block::
1) 'segments': {
'train': ("2008-01-01", "2014-12-31"),
'valid': ("2017-01-01", "2020-08-01",),
'test': ("2015-01-01", "2016-12-31",),
}
2) 'segments': {
'insample': ("2008-01-01", "2014-12-31"),
'outsample': ("2017-01-01", "2020-08-01",),
}
"""
self._handler = init_instance_by_config(handler, accept_types=DataHandler)
self._segments = segments.copy()
@@ -114,9 +127,11 @@ class DatasetH(Dataset):
----------
segments : Union[List[str], Tuple[str], str, slice]
Describe the scope of the data to be prepared
Here are some examples
1) 'train'
2) ['train', 'valid']
Here are some examples:
- 'train'
- ['train', 'valid']
col_set : str
The col_set will be passed to self._handler when fetching data
data_key: str

View File

@@ -41,7 +41,7 @@ class DataHandler(Serializable):
Example of the data:
The multi-index of the columns is optional.
.. code-block::
.. code-block:: python
feature label
$close $volume Ref($close, 1) Mean($close, 3) $high-$low LABEL0
@@ -109,7 +109,8 @@ class DataHandler(Serializable):
Parameters
----------
enable_cache : bool
default value is false
default value is false:
- if `enable_cache` == True:
the processed data will be saved on disk, and handler will load the cached data from the disk directly
@@ -378,8 +379,10 @@ class DataHandlerLP(DataHandler):
init_type : str
The type `IT_*` listed above
enable_cache : bool
default value is false
if `enable_cache` == True:
default value is false:
- if `enable_cache` == True:
the processed data will be saved on disk, and handler will load the cached data from the disk directly
when we call `init` next time
"""

View File

@@ -39,14 +39,16 @@ class DataLoader(abc.ABC):
pd.DataFrame:
data load from the under layer source
Example of the data:
(The multi-index of the columns is optional.)
feature label
$close $volume Ref($close, 1) Mean($close, 3) $high-$low LABEL0
datetime instrument
2010-01-04 SH600000 81.807068 17145150.0 83.737389 83.016739 2.741058 0.0032
SH600004 13.313329 11800983.0 13.313329 13.317701 0.183632 0.0042
SH600005 37.796539 12231662.0 38.258602 37.919757 0.970325 0.0289
Example of the data (The multi-index of the columns is optional.):
.. code-block::
feature label
$close $volume Ref($close, 1) Mean($close, 3) $high-$low LABEL0
datetime instrument
2010-01-04 SH600000 81.807068 17145150.0 83.737389 83.016739 2.741058 0.0032
SH600004 13.313329 11800983.0 13.313329 13.317701 0.183632 0.0042
SH600005 37.796539 12231662.0 38.258602 37.919757 0.970325 0.0289
"""
pass
@@ -55,7 +57,7 @@ class DLWParser(DataLoader):
"""
(D)ata(L)oader (W)ith (P)arser for features and names
Extracting this class so that QlibDataLoader and other dataloaders(such as QdbDataLoader) can share the fields
Extracting this class so that QlibDataLoader and other dataloaders(such as QdbDataLoader) can share the fields.
"""
def __init__(self, config: Tuple[list, tuple, dict]):
@@ -65,14 +67,16 @@ class DLWParser(DataLoader):
config : Tuple[list, tuple, dict]
Config will be used to describe the fields and column names
<config> := {
"group_name1": <fields_info1>
"group_name2": <fields_info2>
}
or
<config> := <fields_info>
.. code-block:: YAML
<fields_info> := ["expr", ...] | (["expr", ...], ["col_name", ...])
<config> := {
"group_name1": <fields_info1>
"group_name2": <fields_info2>
}
or
<config> := <fields_info>
<fields_info> := ["expr", ...] | (["expr", ...], ["col_name", ...])
"""
self.is_group = isinstance(config, dict)

View File

@@ -43,6 +43,8 @@ python get_data.py qlib_data --help
### US data
> Need to download data first: [Downlaod US Data](#Downlaod-US-Data)
```python
import qlib
from qlib.config import REG_US
@@ -52,6 +54,8 @@ qlib.init(provider_uri=provider_uri, region=REG_US)
### CN data
> Need to download data first: [Download CN Data](#Download-CN-Data)
```python
import qlib
from qlib.config import REG_CN

View File

@@ -140,7 +140,7 @@ class DumpDataBase:
def _get_source_data(self, file_path: Path) -> pd.DataFrame:
df = pd.read_csv(str(file_path.resolve()), low_memory=False)
df[self.date_field_name] = df[self.date_field_name].astype(np.datetime64)
df[self.date_field_name] = df[self.date_field_name].astype(str).astype(np.datetime64)
# df.drop_duplicates([self.date_field_name], inplace=True)
return df
@@ -339,10 +339,10 @@ class DumpDataFix(DumpDataAll):
def dump(self):
self._calendars_list = self._read_calendars(self._calendars_dir.joinpath(f"{self.freq}.txt"))
# noinspection PyAttributeOutsideInit
self._old_instruments = self._read_instruments(
self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME)
).to_dict(
orient="index"
self._old_instruments = (
self._read_instruments(self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME))
.set_index([self.symbol_field_name])
.to_dict(orient="index")
) # type: dict
self._dump_instruments()
self._dump_features()