1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-02 02:21:18 +08:00

Merge branch 'main' of https://github.com/you-n-g/qlib into main

This commit is contained in:
Alex Wang
2020-11-26 22:28:57 +08:00
13 changed files with 116 additions and 114 deletions

View File

@@ -45,13 +45,11 @@ For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative
At the module level, Qlib is a platform that consists of the above components. The components are designed as loose-coupled modules and each component could be used stand-alone.
| Name | Description |
| ------ | ----- |
| `Data layer` | `DataServer` focuses on providing high-performance infrastructure for users to manage and retrieve raw data. `DataEnhancement` will preprocess the data and provide the best dataset to be fed into the models. |
| `Interday Model` | `Interday model` focuses on producing prediction scores (aka. _alpha_). Models are trained by `Model Creator` and managed by `Model Manager`. Users could choose one or multiple models for prediction. Multiple models could be combined with `Ensemble` module. |
| `Interday Strategy` | `Portfolio Generator` will take prediction scores as input and output the orders based on the current position to achieve the target portfolio. |
| `Intraday Trading` | `Order Executor` is responsible for executing orders output by `Interday Strategy` and returning the executed results. |
| `Analysis` | Users could get a detailed analysis report of forecasting signals and portfolios in this part. |
| Name | Description |
| ------ | ----- |
| `Infrastructure` layer | `Infrastructure` layer provides underlying support for Quant research. `DataServer` provides high-performance infrastructure for users to manage and retrieve raw data. `Trainer` provides flexible interface to control the training process of models which enable algorithms controlling the training process. |
| `Workflow` layer | `Workflow` layer covers the whole workflow of quantitative investment. `Information Extractor` extracts data for models. `Forecast Model` focuses on producing all kinds of forecast signals (e.g. _alpha_, risk) for other modules. With these signals `Portfolio Generator` will generate the target portfolio and produce orders to be executed by `Order Executor`. |
| `Interface` layer | `Interface` layer tries to present a user-friendly interface for the underlying system. `Analyser` module will provide users detailed analysis reports of forecasting signals, portfolios and execution results |
* The modules with hand-drawn style are under development and will be released in the future.
* The modules with dashed borders are highly user-customizable and extendible.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 271 KiB

After

Width:  |  Height:  |  Size: 271 KiB

View File

@@ -36,7 +36,6 @@ task:
n_epochs: 200
lr: 1e-3
early_stop: 20
batch_size: 800
metric: loss
loss: mse
base_model: LSTM

View File

@@ -36,7 +36,6 @@ task:
n_epochs: 200
lr: 1e-3
early_stop: 20
batch_size: 800
metric: IC
loss: mse
base_model: GRU

View File

@@ -65,13 +65,12 @@ if __name__ == "__main__":
"n_epochs": 200,
"lr": 1e-3,
"early_stop": 20,
"batch_size": 800,
"metric": "loss",
"loss": "mse",
"base_model": "LSTM",
"with_pretrain": True,
"seed": 0,
"GPU": 0,
"GPU": "0",
},
},
"dataset": {
@@ -94,7 +93,6 @@ if __name__ == "__main__":
# "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
}
# model = train_model(task)
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])
model.fit(dataset)

View File

@@ -70,7 +70,7 @@ if __name__ == "__main__":
"lr": 1e-3,
"early_stop": 20,
"batch_size": 800,
"metric": "IC",
"metric": "loss",
"loss": "mse",
"seed": 0,
"GPU": 0,

View File

@@ -62,12 +62,11 @@ if __name__ == "__main__":
"n_epochs": 200,
"lr": 1e-3,
"early_stop": 20,
"batch_size": 800,
"metric": "IC",
"loss": "mse",
"base_model": "LSTM",
"seed": 0,
"GPU": "0",
"GPU": "2",
},
},
"dataset": {

View File

@@ -126,6 +126,7 @@ class ALPHA360(DataHandlerLP):
learn_processors=_DEFAULT_LEARN_PROCESSORS,
fit_start_time=None,
fit_end_time=None,
**kwargs,
):
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
@@ -135,7 +136,7 @@ class ALPHA360(DataHandlerLP):
"kwargs": {
"config": {
"feature": self.get_feature_config(),
"label": self.get_label_config(),
"label": kwargs.get("label", self.get_label_config()),
},
},
}
@@ -206,6 +207,7 @@ class Alpha158(DataHandlerLP):
learn_processors=_DEFAULT_LEARN_PROCESSORS,
fit_start_time=None,
fit_end_time=None,
**kwargs,
):
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
@@ -213,7 +215,7 @@ class Alpha158(DataHandlerLP):
data_loader = {
"class": "QlibDataLoader",
"kwargs": {
"config": {"feature": self.get_feature_config(), "label": self.get_label_config()},
"config": {"feature": self.get_feature_config(), "label": kwargs.get("label", self.get_label_config())},
},
}
super().__init__(

View File

@@ -9,10 +9,8 @@ import os
import numpy as np
import pandas as pd
import copy
from sklearn.metrics import roc_auc_score, mean_squared_error
import logging
from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index
from ...log import get_module_logger, TimeInspector
from ...utils import create_save_path
from ...log import get_module_logger
import torch
import torch.nn as nn
@@ -49,7 +47,6 @@ class GAT(Model):
n_epochs=200,
lr=0.001,
metric="IC",
batch_size=2000,
early_stop=20,
loss="mse",
base_model="GRU",
@@ -71,7 +68,6 @@ class GAT(Model):
self.n_epochs = n_epochs
self.lr = lr
self.metric = metric
self.batch_size = batch_size
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
@@ -90,7 +86,6 @@ class GAT(Model):
"\nn_epochs : {}"
"\nlr : {}"
"\nmetric : {}"
"\nbatch_size : {}"
"\nearly_stop : {}"
"\noptimizer : {}"
"\nloss_type : {}"
@@ -106,7 +101,6 @@ class GAT(Model):
n_epochs,
lr,
metric,
batch_size,
early_stop,
optimizer.lower(),
loss,
@@ -165,23 +159,31 @@ class GAT(Model):
def cal_ic(self, pred, label):
return torch.mean(pred * label)
def get_daily_inter(self, df, shuffle=False):
# organize the train data into daily inter as daily batches
daily_count = df.groupby(level=0).size().values
daily_index = np.roll(np.cumsum(daily_count), 1)
daily_index[0] = 0
if shuffle:
# shuffle the daily inter data
daily_shuffle = list(zip(daily_index, daily_count))
np.random.shuffle(daily_shuffle)
daily_index, daily_count = zip(*daily_shuffle)
return daily_index, daily_count
def train_epoch(self, x_train, y_train):
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values) * 100
self.GAT_model.train()
indices = np.arange(len(x_train_values))
np.random.shuffle(indices)
# organize the train data into daily inter as daily batches
daily_index, daily_count = self.get_daily_inter(x_train, shuffle=True)
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
for idx, count in zip(daily_index, daily_count):
batch = slice(idx, idx + count)
feature = torch.from_numpy(x_train_values[batch]).float()
label = torch.from_numpy(y_train_values[batch]).float()
if self.use_gpu:
feature = feature.cuda()
@@ -206,15 +208,13 @@ class GAT(Model):
scores = []
losses = []
indices = np.arange(len(x_values))
# organize the test data into daily inter as daily batches
daily_index, daily_count = self.get_daily_inter(data_x, shuffle=False)
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
for idx, count in zip(daily_index, daily_count):
batch = slice(idx, idx + count)
feature = torch.from_numpy(x_values[batch]).float()
label = torch.from_numpy(y_values[batch]).float()
if self.use_gpu:
feature = feature.cuda()
@@ -247,7 +247,6 @@ class GAT(Model):
if save_path == None:
save_path = create_save_path(save_path)
stop_steps = 0
train_loss = 0
best_score = -np.inf
best_epoch = 0
evals_result["train"] = []
@@ -314,17 +313,14 @@ class GAT(Model):
index = x_test.index
self.GAT_model.eval()
x_values = x_test.values
sample_num = x_values.shape[0]
preds = []
for begin in range(sample_num)[:: self.batch_size]:
# organize the data into daily inter as daily batches
daily_index, daily_count = self.get_daily_inter(x_test, shuffle=False)
if sample_num - begin < self.batch_size:
end = sample_num
else:
end = begin + self.batch_size
x_batch = torch.from_numpy(x_values[begin:end]).float()
for idx, count in zip(daily_index, daily_count):
batch = slice(idx, idx + count)
x_batch = torch.from_numpy(x_values[batch]).float()
if self.use_gpu:
x_batch = x_batch.cuda()

View File

@@ -46,7 +46,7 @@ class GRU(Model):
dropout=0.0,
n_epochs=200,
lr=0.001,
metric="IC",
metric="",
batch_size=2000,
early_stop=20,
loss="mse",
@@ -140,21 +140,16 @@ class GRU(Model):
def metric_fn(self, pred, label):
mask = torch.isfinite(label)
if self.metric == "IC":
return self.cal_ic(pred[mask], label[mask])
if self.metric == "" or self.metric == "loss": # use loss
return -self.loss_fn(pred[mask], label[mask])
raise ValueError("unknown metric `%s`" % self.metric)
def cal_ic(self, pred, label):
return torch.mean(pred * label)
def train_epoch(self, x_train, y_train):
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values) * 100
y_train_values = np.squeeze(y_train.values)
self.gru_model.train()
@@ -193,7 +188,6 @@ class GRU(Model):
losses = []
indices = np.arange(len(x_values))
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:

View File

@@ -54,7 +54,6 @@ class HATS(Model):
n_epochs=200,
lr=0.01,
metric="IC",
batch_size=800,
early_stop=20,
loss="mse",
base_model="GRU",
@@ -76,7 +75,6 @@ class HATS(Model):
self.n_epochs = n_epochs
self.lr = lr
self.metric = metric
self.batch_size = batch_size
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
@@ -95,7 +93,6 @@ class HATS(Model):
"\nn_epochs : {}"
"\nlr : {}"
"\nmetric : {}"
"\nbatch_size : {}"
"\nearly_stop : {}"
"\noptimizer : {}"
"\nloss_type : {}"
@@ -111,7 +108,6 @@ class HATS(Model):
n_epochs,
lr,
metric,
batch_size,
early_stop,
optimizer.lower(),
loss,
@@ -169,6 +165,18 @@ class HATS(Model):
def cal_ic(self, pred, label):
return torch.mean(pred * label)
def get_daily_inter(self, df, shuffle=False):
# organize the train data into daily inter as daily batches
daily_count = df.groupby(level=0).size().values
daily_index = np.roll(np.cumsum(daily_count), 1)
daily_index[0] = 0
if shuffle:
# shuffle the daily inter data
daily_shuffle = list(zip(daily_index, daily_count))
np.random.shuffle(daily_shuffle)
daily_index, daily_count = zip(*daily_shuffle)
return daily_index, daily_count
def train_epoch(self, x_train, y_train):
x_train_values = x_train.values
@@ -176,16 +184,13 @@ class HATS(Model):
self.HATS_model.train()
indices = np.arange(len(x_train_values))
np.random.shuffle(indices)
# organize the train data into daily inter as daily batches
daily_index, daily_count = self.get_daily_inter(x_train, shuffle=True)
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
for idx, count in zip(daily_index, daily_count):
batch = slice(idx, idx + count)
feature = torch.from_numpy(x_train_values[batch]).float()
label = torch.from_numpy(y_train_values[batch]).float()
if self.use_gpu:
feature = feature.cuda()
@@ -210,15 +215,13 @@ class HATS(Model):
scores = []
losses = []
indices = np.arange(len(x_values))
# organize the test data into daily inter as daily batches
daily_index, daily_count = self.get_daily_inter(data_x, shuffle=False)
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
for idx, count in zip(daily_index, daily_count):
batch = slice(idx, idx + count)
feature = torch.from_numpy(x_values[batch]).float()
label = torch.from_numpy(y_values[batch]).float()
if self.use_gpu:
feature = feature.cuda()
@@ -319,14 +322,12 @@ class HATS(Model):
sample_num = x_values.shape[0]
preds = []
for begin in range(sample_num)[:: self.batch_size]:
# organize the data into daily inter as daily batches
daily_index, daily_count = self.get_daily_inter(x_test, shuffle=False)
if sample_num - begin < self.batch_size:
end = sample_num
else:
end = begin + self.batch_size
x_batch = torch.from_numpy(x_values[begin:end]).float()
for idx, count in zip(daily_index, daily_count):
batch = slice(idx, idx + count)
x_batch = torch.from_numpy(x_values[batch]).float()
if self.use_gpu:
x_batch = x_batch.cuda()

40
qlib/model/trainer.py Normal file
View File

@@ -0,0 +1,40 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from qlib.utils import init_instance_by_config, flatten_dict
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord
def task_train(config: dict, experiment_name):
"""
task based training
Parameters
----------
config : dict
A dict describing the training process
"""
# model initiaiton
model = init_instance_by_config(config.get("task")["model"])
dataset = init_instance_by_config(config.get("task")["dataset"])
# start exp
with R.start(experiment_name=experiment_name):
# train model
R.log_params(**flatten_dict(config.get("task")))
model.fit(dataset)
recorder = R.get_recorder()
# generate records: prediction, backtest, and analysis
for record in config.get("task")["record"]:
if record["class"] == SignalRecord.__name__:
srconf = {"model": model, "dataset": dataset, "recorder": recorder}
record["kwargs"].update(srconf)
sr = init_instance_by_config(record)
sr.generate()
else:
rconf = {"recorder": recorder}
record["kwargs"].update(rconf)
ar = init_instance_by_config(record)
ar.generate()

View File

@@ -8,9 +8,7 @@ import qlib
import fire
import pandas as pd
import ruamel.yaml as yaml
from qlib.utils import init_instance_by_config, flatten_dict
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord
from ..model.trainer import task_train
def get_path_list(path):
@@ -54,29 +52,7 @@ def workflow(config_path, experiment_name="workflow"):
region = config.get("region")
qlib.init(provider_uri=provider_uri, region=region)
# model initiaiton
model = init_instance_by_config(config.get("task")["model"])
dataset = init_instance_by_config(config.get("task")["dataset"])
# start exp
with R.start(experiment_name=experiment_name):
# train model
R.log_params(**flatten_dict(config.get("task")))
model.fit(dataset)
recorder = R.get_recorder()
# generate records: prediction, backtest, and analysis
for record in config.get("task")["record"]:
if record["class"] == SignalRecord.__name__:
srconf = {"model": model, "dataset": dataset, "recorder": recorder}
record["kwargs"].update(srconf)
sr = init_instance_by_config(record)
sr.generate()
else:
rconf = {"recorder": recorder}
record["kwargs"].update(rconf)
ar = init_instance_by_config(record)
ar.generate()
task_train(config, experiment_name=experiment_name)
# function to run worklflow by config