1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-03 11:00:57 +08:00

online serving v10

This commit is contained in:
lzh222333
2021-05-07 09:59:15 +00:00
parent 84c56f13bd
commit 9dfd001f6f
14 changed files with 426 additions and 345 deletions

View File

@@ -7,6 +7,7 @@ Ensemble can merge the objects in an Ensemble. For example, if there are many su
from typing import Union
import pandas as pd
from qlib.utils import flatten_dict
class Ensemble:
@@ -77,19 +78,22 @@ class RollingEnsemble(Ensemble):
class AverageEnsemble(Ensemble):
def __call__(self, ensemble_dict: dict):
"""
Average a dict of same shape dataframe like `prediction` or `IC` into an ensemble.
Average and standardize a dict of same shape dataframe like `prediction` or `IC` into an ensemble.
NOTE: The values of dict must be pd.DataFrame, and have the index "datetime"
NOTE: The values of dict must be pd.DataFrame, and have the index "datetime". If it is a nested dict, then flat it.
Args:
ensemble_dict (dict): a dict like {"A": pd.DataFrame, "B": pd.DataFrame}.
The key of the dict will be ignored.
Returns:
pd.DataFrame: the complete result of averaging.
pd.DataFrame: the complete result of averaging and standardizing.
"""
# need to flatten the nested dict
ensemble_dict = flatten_dict(ensemble_dict)
values = list(ensemble_dict.values())
results = pd.concat(values, axis=1)
results = results.mean(axis=1).to_frame("score")
results = results.groupby("datetime").apply(lambda df: (df - df.mean()) / df.std())
results = results.mean(axis=1)
results = results.sort_index()
return results

View File

@@ -36,20 +36,36 @@ class Group:
self._ens_func = ens
def group(self, *args, **kwargs) -> dict:
# TODO: such design is weird when `_group_func` is the only configurable part in the class
"""
Group a set of object and change them to a dict.
For example: {(A,B,C1): object, (A,B,C2): object} -> {(A,B): {C1: object, C2: object}}
Returns:
dict: grouped dict
"""
if isinstance(getattr(self, "_group_func", None), Callable):
return self._group_func(*args, **kwargs)
else:
raise NotImplementedError(f"Please specify valid `group_func`.")
def reduce(self, *args, **kwargs) -> dict:
"""
Reduce grouped dict in some way.
For example: {(A,B): {C1: object, C2: object}} -> {(A,B): object}
Returns:
dict: reduced dict
"""
if isinstance(getattr(self, "_ens_func", None), Callable):
return self._ens_func(*args, **kwargs)
else:
raise NotImplementedError(f"Please specify valid `_ens_func`.")
def __call__(self, ungrouped_dict: dict, n_jobs=1, verbose=0, *args, **kwargs) -> dict:
"""Group the ungrouped_dict into different groups.
"""
Group the ungrouped_dict into different groups.
Args:
ungrouped_dict (dict): the ungrouped dict waiting for grouping like {name: things}

View File

@@ -12,7 +12,6 @@ In ``DelayTrainer``, the first step is only to save some necessary info to model
"""
import socket
import time
from typing import Callable, List
from qlib.data.dataset import Dataset
@@ -145,12 +144,6 @@ class Trainer:
"""
return self.delay
def reset(self):
"""
Reset the Trainer status.
"""
pass
class TrainerR(Trainer):
"""
@@ -160,42 +153,52 @@ class TrainerR(Trainer):
Assumption: models were defined by `task` and the results will saved to `Recorder`
"""
def __init__(self, experiment_name: str, train_func: Callable = task_train):
# Those tag will help you distinguish whether the Recorder has finished traning
STATUS_KEY = "train_status"
STATUS_BEGIN = "begin_task_train"
STATUS_END = "end_task_train"
def __init__(self, experiment_name: str = None, train_func: Callable = task_train):
"""
Init TrainerR.
Args:
experiment_name (str): the name of experiment.
experiment_name (str, optional): the default name of experiment.
train_func (Callable, optional): default training method. Defaults to `task_train`.
"""
super().__init__()
self.experiment_name = experiment_name
self.train_func = train_func
def train(self, tasks: list, train_func: Callable = None, **kwargs) -> List[Recorder]:
def train(self, tasks: list, train_func: Callable = None, experiment_name: str = None, **kwargs) -> List[Recorder]:
"""
Given a list of `task`s and return a list of trained Recorder. The order can be guaranteed.
Args:
tasks (list): a list of definition based on `task` dict
train_func (Callable): the train method which need at least `task`s and `experiment_name`. None for default training method.
experiment_name (str): the experiment name, None for use default name.
kwargs: the params for train_func.
Returns:
list: a list of Recorders
"""
if len(tasks) == 0:
return []
if train_func is None:
train_func = self.train_func
if experiment_name is None:
experiment_name = self.experiment_name
recs = []
for task in tasks:
rec = train_func(task, self.experiment_name, **kwargs)
rec.set_tags(**{"train_status": "begin_task_train"})
rec = train_func(task, experiment_name, **kwargs)
rec.set_tags(**{self.STATUS_KEY: self.STATUS_BEGIN})
recs.append(rec)
return recs
def end_train(self, recs: list, **kwargs) -> list:
def end_train(self, recs: list, **kwargs) -> List[Recorder]:
for rec in recs:
rec.set_tags(**{"train_status": "end_task_train"})
rec.set_tags(**{self.STATUS_KEY: self.STATUS_END})
return recs
@@ -204,12 +207,12 @@ class DelayTrainerR(TrainerR):
A delayed implementation based on TrainerR, which means `train` method may only do some preparation and `end_train` method can do the real model fitting.
"""
def __init__(self, experiment_name, train_func=begin_task_train, end_train_func=end_task_train):
def __init__(self, experiment_name: str = None, train_func=begin_task_train, end_train_func=end_task_train):
"""
Init TrainerRM.
Args:
experiment_name (str): the name of experiment.
experiment_name (str): the default name of experiment.
train_func (Callable, optional): default train method. Defaults to `begin_task_train`.
end_train_func (Callable, optional): default end_train method. Defaults to `end_task_train`.
"""
@@ -217,7 +220,7 @@ class DelayTrainerR(TrainerR):
self.end_train_func = end_train_func
self.delay = True
def end_train(self, recs, end_train_func=None, **kwargs) -> List[Recorder]:
def end_train(self, recs, end_train_func=None, experiment_name: str = None, **kwargs) -> List[Recorder]:
"""
Given a list of Recorder and return a list of trained Recorder.
This class will finish real data loading and model fitting.
@@ -225,6 +228,7 @@ class DelayTrainerR(TrainerR):
Args:
recs (list): a list of Recorder, the tasks have been saved to them
end_train_func (Callable, optional): the end_train method which need at least `recorder`s and `experiment_name`. Defaults to None for using self.end_train_func.
experiment_name (str): the experiment name, None for use default name.
kwargs: the params for end_train_func.
Returns:
@@ -232,9 +236,13 @@ class DelayTrainerR(TrainerR):
"""
if end_train_func is None:
end_train_func = self.end_train_func
if experiment_name is None:
experiment_name = self.experiment_name
for rec in recs:
end_train_func(rec, **kwargs)
rec.set_tags(**{"train_status": "end_task_train"})
if rec.list_tags()[self.STATUS_KEY] == self.STATUS_END:
continue
end_train_func(rec, experiment_name, **kwargs)
rec.set_tags(**{self.STATUS_KEY: self.STATUS_END})
return recs
@@ -246,13 +254,18 @@ class TrainerRM(Trainer):
Assumption: `task` will be saved to TaskManager and `task` will be fetched and trained from TaskManager
"""
def __init__(self, experiment_name: str, task_pool: str, train_func=task_train):
# Those tag will help you distinguish whether the Recorder has finished traning
STATUS_KEY = "train_status"
STATUS_BEGIN = "begin_task_train"
STATUS_END = "end_task_train"
def __init__(self, experiment_name: str = None, task_pool: str = None, train_func=task_train):
"""
Init TrainerR.
Args:
experiment_name (str): the name of experiment.
task_pool (str): task pool name in TaskManager.
experiment_name (str): the default name of experiment.
task_pool (str): task pool name in TaskManager. None for use same name as experiment_name.
train_func (Callable, optional): default training method. Defaults to `task_train`.
"""
super().__init__()
@@ -264,6 +277,7 @@ class TrainerRM(Trainer):
self,
tasks: list,
train_func: Callable = None,
experiment_name: str = None,
before_status: str = TaskManager.STATUS_WAITING,
after_status: str = TaskManager.STATUS_DONE,
**kwargs,
@@ -277,6 +291,7 @@ class TrainerRM(Trainer):
Args:
tasks (list): a list of definition based on `task` dict
train_func (Callable): the train method which need at least `task`s and `experiment_name`. None for default training method.
experiment_name (str): the experiment name, None for use default name.
before_status (str): the tasks in before_status will be fetched and trained. Can be STATUS_WAITING, STATUS_PART_DONE.
after_status (str): the tasks after trained will become after_status. Can be STATUS_WAITING, STATUS_PART_DONE.
kwargs: the params for train_func.
@@ -284,14 +299,21 @@ class TrainerRM(Trainer):
Returns:
list: a list of Recorders
"""
if len(tasks) == 0:
return []
if train_func is None:
train_func = self.train_func
tm = TaskManager(task_pool=self.task_pool)
if experiment_name is None:
experiment_name = self.experiment_name
task_pool = self.task_pool
if task_pool is None:
task_pool = experiment_name
tm = TaskManager(task_pool=task_pool)
_id_list = tm.create_task(tasks) # all tasks will be saved to MongoDB
run_task(
train_func,
self.task_pool,
experiment_name=self.experiment_name,
task_pool,
experiment_name=experiment_name,
before_status=before_status,
after_status=after_status,
**kwargs,
@@ -300,23 +322,15 @@ class TrainerRM(Trainer):
recs = []
for _id in _id_list:
rec = tm.re_query(_id)["res"]
rec.set_tags(**{"train_status": "begin_task_train"})
rec.set_tags(**{self.STATUS_KEY: self.STATUS_BEGIN})
recs.append(rec)
return recs
def end_train(self, recs: list, **kwargs) -> list:
for rec in recs:
rec.set_tags(**{"train_status": "end_task_train"})
rec.set_tags(**{self.STATUS_KEY: self.STATUS_END})
return recs
def reset(self):
"""
.. note::
this method will delete all task in this task_pool!
"""
tm = TaskManager(task_pool=self.task_pool)
tm.remove()
class DelayTrainerRM(TrainerRM):
"""
@@ -324,30 +338,57 @@ class DelayTrainerRM(TrainerRM):
"""
def __init__(self, experiment_name, task_pool: str, train_func=begin_task_train, end_train_func=end_task_train):
def __init__(
self,
experiment_name: str = None,
task_pool: str = None,
train_func=begin_task_train,
end_train_func=end_task_train,
):
"""
Init DelayTrainerRM.
Args:
experiment_name (str): the default name of experiment.
task_pool (str): task pool name in TaskManager. None for use same name as experiment_name.
train_func (Callable, optional): default train method. Defaults to `begin_task_train`.
end_train_func (Callable, optional): default end_train method. Defaults to `end_task_train`.
"""
super().__init__(experiment_name, task_pool, train_func)
self.end_train_func = end_train_func
self.delay = True
def train(self, tasks: list, train_func=None, **kwargs):
def train(self, tasks: list, train_func=None, experiment_name: str = None, **kwargs):
"""
Same as `train` of TrainerRM, after_status will be STATUS_PART_DONE.
Args:
tasks (list): a list of definition based on `task` dict
train_func (Callable): the train method which need at least `task`s and `experiment_name`. Defaults to None for using self.train_func.
experiment_name (str): the experiment name, None for use default name.
Returns:
list: a list of Recorders
"""
return super().train(tasks, train_func=train_func, after_status=TaskManager.STATUS_PART_DONE, **kwargs)
if len(tasks) == 0:
return []
return super().train(
tasks,
train_func=train_func,
experiment_name=experiment_name,
after_status=TaskManager.STATUS_PART_DONE,
**kwargs,
)
def end_train(self, recs, end_train_func=None, **kwargs):
def end_train(self, recs, end_train_func=None, experiment_name: str = None, **kwargs):
"""
Given a list of Recorder and return a list of trained Recorder.
This class will finish real data loading and model fitting.
NOTE: This method will train all STATUS_PART_DONE tasks in task pool, not only the ``recs``.
Args:
recs (list): a list of Recorder, the tasks have been saved to them.
end_train_func (Callable, optional): the end_train method which need at least `recorder`s and `experiment_name`. Defaults to None for using self.end_train_func.
experiment_name (str): the experiment name, None for use default name.
kwargs: the params for end_train_func.
Returns:
@@ -356,13 +397,23 @@ class DelayTrainerRM(TrainerRM):
if end_train_func is None:
end_train_func = self.end_train_func
if experiment_name is None:
experiment_name = self.experiment_name
task_pool = self.task_pool
if task_pool is None:
task_pool = experiment_name
tasks = []
for rec in recs:
tasks.append(rec.load_object("task"))
run_task(
end_train_func,
self.task_pool,
experiment_name=self.experiment_name,
task_pool,
tasks=tasks,
experiment_name=experiment_name,
before_status=TaskManager.STATUS_PART_DONE,
**kwargs,
)
for rec in recs:
rec.set_tags(**{"train_status": "end_task_train"})
rec.set_tags(**{self.STATUS_KEY: self.STATUS_END})
return recs