qlib/examples/taskmanager/task_manager_rolling_with_updating.py

import qlib
import fire
import mlflow
from qlib.config import C
from qlib.workflow import R
from qlib.config import REG_CN
from qlib.model.trainer import task_train
from qlib.workflow.task.manage import run_task
from qlib.workflow.task.manage import TaskManager
from qlib.workflow.task.utils import TimeAdjuster
from qlib.workflow.task.update import ModelUpdater
from qlib.workflow.task.collect import TaskCollector
from qlib.workflow.task.gen import RollingGen, task_generator


data_handler_config = {
    "start_time": "2013-01-01",
    "end_time": "2020-09-25",
    "fit_start_time": "2013-01-01",
    "fit_end_time": "2014-12-31",
    "instruments": "csi100",
}

dataset_config = {
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": {
            "class": "Alpha158",
            "module_path": "qlib.contrib.data.handler",
            "kwargs": data_handler_config,
        },
        "segments": {
            "train": ("2013-01-01", "2014-12-31"),
            "valid": ("2015-01-01", "2015-12-31"),
            "test": ("2016-01-01", "2017-01-01"),
        },
    },
}

record_config = [
    {
        "class": "SignalRecord",
        "module_path": "qlib.workflow.record_temp",
    },
    {
        "class": "SigAnaRecord",
        "module_path": "qlib.workflow.record_temp",
    },
]

# use lgb model
task_lgb_config = {
    "model": {
        "class": "LGBModel",
        "module_path": "qlib.contrib.model.gbdt",
    },
    "dataset": dataset_config,
    "record": record_config,
}

# use xgboost model
task_xgboost_config = {
    "model": {
        "class": "XGBModel",
        "module_path": "qlib.contrib.model.xgboost",
    },
    "dataset": dataset_config,
    "record": record_config,
}

# This part corresponds to "Task Generating" in the document
def task_generating(**kwargs):
    print("========================================= task_generating =========================================")

    rolling_generator = RollingGen(step=rolling_step, rtype=RollingGen.ROLL_EX)

    tasks = task_generator(rolling_generator, **kwargs)

    # See the generated tasks in a easy way
    from pprint import pprint

    pprint(tasks)

    return tasks


# This part corresponds to "Task Storing" in the document
def task_storing(tasks):
    print("========================================= task_storing =========================================")
    tm = TaskManager(task_pool=task_pool)
    tm.create_task(tasks)  # all tasks will be saved to MongoDB


# This part corresponds to "Task Running" in the document
def task_running():
    print("========================================= task_running =========================================")
    run_task(task_train, task_pool, experiment_name=exp_name)  # all tasks will be trained using "task_train" method


# This part corresponds to "Task Collecting" in the document
def task_collecting():
    print("========================================= task_collecting =========================================")

    def get_task_key(task_config):
        task_key = task_config["task_key"]
        rolling_end_timestamp = task_config["dataset"]["kwargs"]["segments"]["test"][1]
        if rolling_end_timestamp == None:
            rolling_end_timestamp = TimeAdjuster().last_date()
        return task_key, rolling_end_timestamp.strftime("%Y-%m-%d")

    def lgb_filter(task_config):
        # only choose the results of "task_lgb"
        task_key, rolling_end = get_task_key(task_config)
        if task_key == "task_lgb":
            return True
        return False

    task_collector = TaskCollector(exp_name)
    pred_rolling = task_collector.collect_predictions(
        get_task_key, lgb_filter
    )  # name tasks by "get_task_key" and filter tasks by "my_filter"
    print(pred_rolling)


# Reset all things to the first status, be careful to save important data
def reset(force_end=False):
    print("========================================= reset =========================================")
    TaskManager(task_pool=task_pool).remove()

    exp = R.get_exp(experiment_name=exp_name)
    recs = TaskCollector(exp_name).list_recorders(only_finished=True)

    for rid in recs:
        exp.delete_recorder(rid)

    try:
        if force_end:
            mlflow.end_run()
    except Exception:
        pass


def set_online_model_to_latest():
    print(
        "========================================= set_online_model_to_latest ========================================="
    )
    model_updater = ModelUpdater(experiment_name=exp_name)
    latest_records, latest_test = model_updater.collect_latest_records()
    model_updater.reset_online_model(latest_records.values())


# Run this firstly to see the workflow in Task Management
def first_run():
    print("========================================= first_run =========================================")
    reset(force_end=True)

    # use "task_lgb" and "task_xgboost" as the task name
    tasks = task_generating(**{"task_xgboost": task_xgboost_config, "task_lgb": task_lgb_config})
    task_storing(tasks)
    task_running()
    task_collecting()
    set_online_model_to_latest()


# Update the predictions of online model
def update_predictions():
    print("========================================= update_predictions =========================================")
    model_updater = ModelUpdater(experiment_name=exp_name)
    model_updater.update_online_pred()


# Update the models using the latest date and set them to online model
def update_model():
    print("========================================= update_model =========================================")
    # get the latest recorders
    model_updater = ModelUpdater(experiment_name=exp_name)
    latest_records, latest_test = model_updater.collect_latest_records()
    # date adjustment based on trade day of Calendar in Qlib
    time_adjuster = TimeAdjuster()
    calendar_latest = time_adjuster.last_date()
    print("The latest date is ", calendar_latest)
    if time_adjuster.cal_interval(calendar_latest, latest_test[0]) > rolling_step:
        print("Need update models!")
        tasks = {}
        for rid, rec in latest_records.items():
            old_task = rec.task
            test_begin = old_task["dataset"]["kwargs"]["segments"]["test"][0]
            # modify the test segment to generate new tasks
            old_task["dataset"]["kwargs"]["segments"]["test"] = (test_begin, calendar_latest)
            tasks[old_task["task_key"]] = old_task

        # retrain the latest model
        new_tasks = task_generating(**tasks)
        task_storing(new_tasks)
        task_running()
        task_collecting()
        latest_records, _ = model_updater.collect_latest_records()

    # set the latest model to online model
    model_updater.reset_online_model(latest_records.values())


# Run whole workflow completely
def whole_workflow():
    print("========================================= whole_workflow =========================================")
    # run this at the first time
    first_run()
    # run this every day
    update_predictions()
    # run this every "rolling_steps" day
    update_model()


if __name__ == "__main__":
    ####### to train the first version's models, use the command below
    # python task_manager_rolling_with_updating.py first_run

    ####### to update the models using the latest date and set them to online model, use the command below
    # python task_manager_rolling_with_updating.py update_model

    ####### to update the predictions to the latest date, use the command below
    # python task_manager_rolling_with_updating.py update_predictions

    ####### to run whole workflow completely, use the command below
    # python task_manager_rolling_with_updating.py whole_workflow

    #################### you need to finish the configurations below #########################

    provider_uri = "~/.qlib/qlib_data/cn_data"  # data_dir
    qlib.init(provider_uri=provider_uri, region=REG_CN)

    C["mongo"] = {
        "task_url": "mongodb://localhost:27017/",  # your MongoDB url
        "task_db_name": "rolling_db",  # database name
    }

    exp_name = "rolling_exp"  # experiment name, will be used as the experiment in MLflow
    task_pool = "rolling_task"  # task pool name, will be used as the document in MongoDB
    rolling_step = 550

    ##########################################################################################

    fire.Fire()