1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-01 18:11:18 +08:00
Files
qlib/examples/taskmanager/task_manager_rolling_with_updating.py
2021-03-12 08:24:21 +00:00

245 lines
8.3 KiB
Python

import qlib
import fire
import mlflow
from qlib.config import C
from qlib.workflow import R
from qlib.config import REG_CN
from qlib.model.trainer import task_train
from qlib.workflow.task.manage import run_task
from qlib.workflow.task.manage import TaskManager
from qlib.workflow.task.utils import TimeAdjuster
from qlib.workflow.task.update import ModelUpdater
from qlib.workflow.task.collect import TaskCollector
from qlib.workflow.task.gen import RollingGen, task_generator
# Keyword arguments handed to the Alpha158 data handler (see dataset_config).
data_handler_config = dict(
    start_time="2013-01-01",
    end_time="2020-09-25",
    fit_start_time="2013-01-01",
    fit_end_time="2014-12-31",
    instruments="csi100",
)

# Dataset description shared by every task template below.
dataset_config = {
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": {
            "class": "Alpha158",
            "module_path": "qlib.contrib.data.handler",
            "kwargs": data_handler_config,
        },
        "segments": dict(
            train=("2013-01-01", "2014-12-31"),
            valid=("2015-01-01", "2015-12-31"),
            test=("2016-01-01", "2017-01-01"),
        ),
    },
}

# Record templates; both classes come from the same module.
record_config = [
    {"class": record_class, "module_path": "qlib.workflow.record_temp"}
    for record_class in ("SignalRecord", "SigAnaRecord")
]

# use lgb model
task_lgb_config = {
    "model": {"class": "LGBModel", "module_path": "qlib.contrib.model.gbdt"},
    "dataset": dataset_config,
    "record": record_config,
}

# use xgboost model
task_xgboost_config = {
    "model": {"class": "XGBModel", "module_path": "qlib.contrib.model.xgboost"},
    "dataset": dataset_config,
    "record": record_config,
}
# This part corresponds to "Task Generating" in the document
def task_generating(**kwargs):
    """Generate rolling tasks from the given task templates.

    The keyword arguments are forwarded unchanged to ``task_generator``
    (callers in this file pass ``task name -> task config``). The rolling
    step is read from the module-level ``rolling_step``.

    Returns whatever ``task_generator`` produces, after pretty-printing it.
    """
    import pprint

    print("========================================= task_generating =========================================")
    generated = task_generator(
        RollingGen(step=rolling_step, rtype=RollingGen.ROLL_EX),
        **kwargs,
    )
    # Show the generated tasks in an easy-to-read form
    pprint.pprint(generated)
    return generated
# This part corresponds to "Task Storing" in the document
def task_storing(tasks):
    """Create the given tasks in the task pool named by the module-level ``task_pool``."""
    print("========================================= task_storing =========================================")
    # all tasks will be saved to MongoDB
    TaskManager(task_pool=task_pool).create_task(tasks)
# This part corresponds to "Task Running" in the document
def task_running():
    """Run the tasks of ``task_pool`` with ``task_train`` under the experiment ``exp_name``."""
    print("========================================= task_running =========================================")
    # all tasks will be trained using the "task_train" method
    run_task(
        task_train,
        task_pool,
        experiment_name=exp_name,
    )
# This part corresponds to "Task Collecting" in the document
def task_collecting():
    """Collect and print the predictions of the "task_lgb" tasks.

    A ``TaskCollector`` for the experiment ``exp_name`` gathers the
    predictions; each one is named by ``get_task_key`` and only the tasks
    accepted by ``lgb_filter`` are kept.
    """
    print("========================================= task_collecting =========================================")

    def get_task_key(task_config):
        """Return ``(task_key, test-segment end formatted as YYYY-MM-DD)``.

        When the test segment has no end (``None``), the last date reported
        by ``TimeAdjuster`` is used instead.
        """
        task_key = task_config["task_key"]
        rolling_end_timestamp = task_config["dataset"]["kwargs"]["segments"]["test"][1]
        # Identity check: `== None` would defer to the object's own __eq__.
        if rolling_end_timestamp is None:
            rolling_end_timestamp = TimeAdjuster().last_date()
        return task_key, rolling_end_timestamp.strftime("%Y-%m-%d")

    def lgb_filter(task_config):
        """Accept only the results of "task_lgb"."""
        # Only the key matters here, so the rolling end date is not computed.
        return task_config["task_key"] == "task_lgb"

    task_collector = TaskCollector(exp_name)
    # name tasks by "get_task_key" and filter tasks by "lgb_filter"
    pred_rolling = task_collector.collect_predictions(get_task_key, lgb_filter)
    print(pred_rolling)
# Reset all things to the first status, be careful to save important data
def reset(force_end=False):
    """Clear the task pool and the finished recorders of the experiment.

    Calls ``remove()`` on the ``TaskManager`` of ``task_pool`` and deletes
    every recorder that ``TaskCollector(exp_name).list_recorders`` reports
    as finished.

    Args:
        force_end: when true, additionally call ``mlflow.end_run()``. This
            step is best-effort: a failure is reported on stdout and does
            not abort the reset.
    """
    print("========================================= reset =========================================")
    TaskManager(task_pool=task_pool).remove()
    exp = R.get_exp(experiment_name=exp_name)
    recs = TaskCollector(exp_name).list_recorders(only_finished=True)
    for rid in recs:
        exp.delete_recorder(rid)
    if not force_end:
        return
    try:
        mlflow.end_run()
    except Exception as err:
        # Keep the reset best-effort, but do not hide the failure.
        print(f"Warning: failed to end the active MLflow run: {err!r}")
def set_online_model_to_latest():
    """Reset the online model to the latest records of the experiment ``exp_name``."""
    print(
        "========================================= set_online_model_to_latest ========================================="
    )
    updater = ModelUpdater(experiment_name=exp_name)
    # collect_latest_records() yields a pair; only the records are needed here
    records, _ = updater.collect_latest_records()
    updater.reset_online_model(records.values())
# Run this first to see the workflow in Task Management
def first_run():
    """Reset everything, then generate, store, run and collect the initial tasks.

    Finishes by setting the online model to the latest records.
    """
    print("========================================= first_run =========================================")
    reset(force_end=True)
    # "task_xgboost" and "task_lgb" are used as the task names
    generated = task_generating(
        task_xgboost=task_xgboost_config,
        task_lgb=task_lgb_config,
    )
    task_storing(generated)
    task_running()
    task_collecting()
    set_online_model_to_latest()
# Update the predictions of online model
def update_predictions():
    """Call ``update_online_pred()`` on a ``ModelUpdater`` for the experiment ``exp_name``."""
    print("========================================= update_predictions =========================================")
    ModelUpdater(experiment_name=exp_name).update_online_pred()
# Update the models using the latest date and set them to online model
def update_model():
    """Retrain the latest models when the calendar has moved far enough.

    The interval between the calendar's last date and the start of the
    latest test segment is compared with the module-level ``rolling_step``.
    If it is larger, the test segment of each latest task is extended to the
    last calendar date and the tasks are regenerated, stored, run and
    collected. The online model is then reset to the latest records.
    """
    print("========================================= update_model =========================================")
    # get the latest recorders
    updater = ModelUpdater(experiment_name=exp_name)
    latest_records, latest_test = updater.collect_latest_records()

    # date adjustment based on trade day of Calendar in Qlib
    adjuster = TimeAdjuster()
    calendar_latest = adjuster.last_date()
    print("The latest date is ", calendar_latest)

    needs_update = adjuster.cal_interval(calendar_latest, latest_test[0]) > rolling_step
    if needs_update:
        print("Need update models!")
        tasks = {}
        for rec in latest_records.values():
            task = rec.task
            segments = task["dataset"]["kwargs"]["segments"]
            # modify the test segment to generate new tasks
            segments["test"] = (segments["test"][0], calendar_latest)
            tasks[task["task_key"]] = task

        # retrain the latest model
        task_storing(task_generating(**tasks))
        task_running()
        task_collecting()
        latest_records, _ = updater.collect_latest_records()

    # set the latest model to online model
    # NOTE(review): assumed to run whether or not retraining happened - confirm the intended nesting.
    updater.reset_online_model(latest_records.values())
# Run whole workflow completely
def whole_workflow():
    """Run ``first_run``, ``update_predictions`` and ``update_model`` in that order."""
    print("========================================= whole_workflow =========================================")
    stages = (
        first_run,  # run this at the first time
        update_predictions,  # run this every day
        update_model,  # run this every "rolling_step" days
    )
    for stage in stages:
        stage()
if __name__ == "__main__":
    # Script entry point. The commands below name functions defined in this file;
    # the chosen one is dispatched by the fire.Fire() call at the end of this block.
    ####### to train the first version's models, use the command below
    # python task_manager_rolling_with_updating.py first_run
    ####### to update the models using the latest date and set them to online model, use the command below
    # python task_manager_rolling_with_updating.py update_model
    ####### to update the predictions to the latest date, use the command below
    # python task_manager_rolling_with_updating.py update_predictions
    ####### to run whole workflow completely, use the command below
    # python task_manager_rolling_with_updating.py whole_workflow
    #################### you need to finish the configurations below #########################
    provider_uri = "~/.qlib/qlib_data/cn_data"  # data_dir
    qlib.init(provider_uri=provider_uri, region=REG_CN)
    # MongoDB settings are written into the qlib config after qlib.init()
    C["mongo"] = {
        "task_url": "mongodb://localhost:27017/",  # your MongoDB url
        "task_db_name": "rolling_db",  # database name
    }
    # The three names below are read as globals by the functions above; they are
    # only defined when this file is executed as a script.
    exp_name = "rolling_exp"  # experiment name, will be used as the experiment in MLflow
    task_pool = "rolling_task"  # task pool name, will be used as the document in MongoDB
    # Passed as `step` to RollingGen and used as the update threshold in update_model
    # (presumably counted in trading days - confirm against RollingGen / TimeAdjuster).
    rolling_step = 550
    ##########################################################################################
    fire.Fire()