qlib/qlib/workflow/record_temp.py

#  Copyright (c) Microsoft Corporation.
#  Licensed under the MIT License.

import re
import logging
import warnings
import pandas as pd
from pathlib import Path
from pprint import pprint
from ..contrib.evaluate import risk_analysis
from ..contrib.backtest import backtest as normal_backtest

from ..data.dataset import DatasetH
from ..data.dataset.handler import DataHandlerLP
from ..utils import init_instance_by_config, get_module_by_module_path
from ..log import get_module_logger
from ..utils import flatten_dict
from ..utils.sample import parse_freq
from ..strategy.base import BaseStrategy
from ..contrib.eva.alpha import calc_ic, calc_long_short_return


# Module-level logger shared by every record class below; created through
# `get_module_logger` under the name "workflow" with the INFO level.
logger = get_module_logger("workflow", logging.INFO)


class RecordTemp:
    """
    Base template for record generators.

    Subclasses produce experiment results (such as IC or backtest reports) and
    store/load them through the recorder handed to the constructor.
    """

    # Optional prefix that subclasses set to group their artifacts; `None` means no prefix.
    artifact_path = None

    @classmethod
    def get_path(cls, path=None):
        """
        Build an artifact path by joining ``cls.artifact_path`` and ``path`` with "/".

        Parameters
        ----------
        path : str, optional
            the file name (or sub path) to append; skipped when it is None.

        Return
        ------
        str
            the joined path; an empty string when both parts are None.
        """
        return "/".join(part for part in (cls.artifact_path, path) if part is not None)

    def __init__(self, recorder):
        # Kept private; read access goes through the `recorder` property, which validates it.
        self._recorder = recorder

    @property
    def recorder(self):
        """
        The recorder bound to this template.

        Raise
        ------
        ValueError: if no recorder was given (it is None).
        """
        bound = self._recorder
        if bound is None:
            raise ValueError("This RecordTemp did not set recorder yet.")
        return bound

    def generate(self, **kwargs):
        """
        Generate certain records such as IC, backtest etc., and save them.
        Subclasses must override this method.

        Parameters
        ----------
        kwargs

        Raise
        ------
        NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError("Please implement the `generate` method.")

    def load(self, name):
        """
        Load a stored record through the recorder.

        Because of a conflict between a clean API and Python's inheritance, the
        full artifact path has to be passed explicitly for now::

            sar = SigAnaRecord(recorder)
            ic = sar.load(sar.get_path("ic.pkl"))

        Parameters
        ----------
        name : str
            the name of the file to be loaded.

        Return
        ------
        The stored record, as returned by ``recorder.load_object``.
        """
        return self.recorder.load_object(name)

    def list(self):
        """
        List the records this template is expected to store.

        Return
        ------
        A list of artifact paths; empty for the base template.
        """
        return []

    def check(self, parent=False):
        """
        Check that every expected record is present among the recorder's artifacts.

        Parameters
        ----------
        parent : bool
            when True, the expected records are taken from the direct base class's
            ``list`` instead of this instance's own ``list``.

        Raise
        ------
        FileExistsError: raised with the first expected record that is not stored.
        """
        stored = set(self.recorder.list_artifacts())
        # `super()` cannot be used here: the lookup has to start from the runtime class of `self`.
        expected = self.__class__.__base__.list(self) if parent else self.list()  # pylint: disable=E1101
        for record in expected:
            if record not in stored:
                raise FileExistsError(record)


class SignalRecord(RecordTemp):
    """
    This is the Signal Record class that generates the signal prediction. This class inherits the ``RecordTemp`` class.

    The following files will be stored in recorder
    - pred.pkl : the prediction returned by ``model.predict(dataset)``
    - label.pkl : the "label" columns of the "test" segment (only when the dataset is a ``DatasetH``)
    """

    def __init__(self, model=None, dataset=None, recorder=None):
        """
        Parameters
        ----------
        model :
            object exposing ``predict(dataset)``; only needed by ``generate``.
        dataset :
            the dataset passed to ``model.predict``.
        recorder :
            the recorder used to save and load the artifacts.
        """
        super().__init__(recorder=recorder)
        self.model = model
        self.dataset = dataset

    def generate(self, **kwargs):
        """
        Predict with the model, save the prediction as ``pred.pkl`` and, for ``DatasetH``
        datasets, save the test-segment label as ``label.pkl``.

        Parameters
        ----------
        kwargs
            accepted for interface compatibility; not used.
        """
        # generate prediction
        pred = self.model.predict(self.dataset)
        if isinstance(pred, pd.Series):
            # a bare series is stored as a one-column frame named "score"
            pred = pred.to_frame("score")
        self.recorder.save_objects(**{"pred.pkl": pred})

        logger.info(
            f"Signal record 'pred.pkl' has been saved as the artifact of the Experiment {self.recorder.experiment_id}"
        )
        # print out results
        pprint(f"The following are prediction results of the {type(self.model).__name__} model.")
        pprint(pred.head(5))

        if isinstance(self.dataset, DatasetH):
            # NOTE:
            # Python doesn't provide the downcasting mechanism.
            # We use the trick here to downcast the class
            orig_cls = self.dataset.__class__
            self.dataset.__class__ = DatasetH
            try:
                params = dict(segments="test", col_set="label", data_key=DataHandlerLP.DK_R)
                try:
                    # Assume the backend handler is DataHandlerLP
                    raw_label = self.dataset.prepare(**params)
                except TypeError:
                    # The argument number is not right
                    del params["data_key"]
                    # The backend handler should be DataHandler
                    raw_label = self.dataset.prepare(**params)
            finally:
                # Always undo the downcast, even when `prepare` fails, so the caller's
                # dataset object is never left with the wrong class.
                self.dataset.__class__ = orig_cls

            self.recorder.save_objects(**{"label.pkl": raw_label})

    def list(self):
        """Return the artifact names written by ``generate``."""
        return ["pred.pkl", "label.pkl"]

    def load(self, name="pred.pkl"):
        """
        Load a stored record; the prediction file is loaded by default.

        Parameters
        ----------
        name : str
            the name of the file to be loaded.
        """
        return super().load(name)


class HFSignalRecord(SignalRecord):
    """
    This is the High-Frequency Signal Record class. It reads the stored ``pred.pkl`` and ``label.pkl``
    and generates analysis results such as IC, Rank IC, long/short precision and long-short return.
    This class inherits the ``SignalRecord`` class.

    The results are saved under the ``hg_sig_analysis`` artifact path.
    """

    artifact_path = "hg_sig_analysis"

    def __init__(self, recorder, **kwargs):
        """
        Parameters
        ----------
        recorder :
            the recorder holding ``pred.pkl`` and ``label.pkl``.
        kwargs
            accepted for interface compatibility; not forwarded to the parent class.
        """
        super().__init__(recorder=recorder)

    def generate(self, **kwargs):
        """
        Compute the signal analysis from the stored prediction and label, then log the
        metrics and save the intermediate series.

        Parameters
        ----------
        kwargs
            accepted for consistency with the other ``generate`` methods; not used.
        """
        # `calc_long_short_prec` is not among the module-level imports, so it is imported here.
        # NOTE(review): assumed to live next to `calc_ic` in `contrib.eva.alpha` -- confirm.
        from ..contrib.eva.alpha import calc_long_short_prec

        pred = self.load("pred.pkl")
        raw_label = self.load("label.pkl")
        # only the first column of each frame is analysed
        score = pred.iloc[:, 0]
        label = raw_label.iloc[:, 0]

        long_pre, short_pre = calc_long_short_prec(score, label, is_alpha=True)
        ic, ric = calc_ic(score, label)
        metrics = {
            "IC": ic.mean(),
            "ICIR": ic.mean() / ic.std(),
            "Rank IC": ric.mean(),
            "Rank ICIR": ric.mean() / ric.std(),
            "Long precision": long_pre.mean(),
            "Short precision": short_pre.mean(),
        }
        objects = {"ic.pkl": ic, "ric.pkl": ric}
        objects.update({"long_pre.pkl": long_pre, "short_pre.pkl": short_pre})

        long_short_r, long_avg_r = calc_long_short_return(score, label)
        metrics.update(
            {
                "Long-Short Average Return": long_short_r.mean(),
                "Long-Short Average Sharpe": long_short_r.mean() / long_short_r.std(),
            }
        )
        objects.update(
            {
                "long_short_r.pkl": long_short_r,
                "long_avg_r.pkl": long_avg_r,
            }
        )
        self.recorder.log_metrics(**metrics)
        self.recorder.save_objects(**objects, artifact_path=self.get_path())
        pprint(metrics)

    def list(self):
        """Return the artifact paths written by ``generate``."""
        paths = [
            self.get_path("ic.pkl"),
            self.get_path("ric.pkl"),
            self.get_path("long_pre.pkl"),
            self.get_path("short_pre.pkl"),
            self.get_path("long_short_r.pkl"),
            self.get_path("long_avg_r.pkl"),
        ]
        return paths


class SigAnaRecord(SignalRecord):
    """
    Signal Analysis Record: derives IC / Rank IC statistics (and optionally long-short
    return statistics) from the stored prediction and label. Inherits ``SignalRecord``.
    """

    artifact_path = "sig_analysis"

    def __init__(self, recorder, ana_long_short=False, ann_scaler=252, **kwargs):
        """
        Parameters
        ----------
        recorder :
            the recorder used to load and save artifacts.
        ana_long_short : bool
            whether the long-short return analysis is generated as well.
        ann_scaler : int
            factor used to annualize the long-short statistics.
        kwargs
            forwarded to ``SignalRecord.__init__``.
        """
        super().__init__(recorder=recorder, **kwargs)
        self.ana_long_short = ana_long_short
        self.ann_scaler = ann_scaler

    def generate(self, **kwargs):
        """
        Make sure the parent's records exist (generating them when missing), then compute,
        log and save the signal analysis.
        """
        try:
            self.check(parent=True)
        except FileExistsError:
            # the prediction/label files are not there yet: let the parent create them
            super().generate()

        # only the first column of each stored frame takes part in the analysis
        score = self.load("pred.pkl").iloc[:, 0]
        target = self.load("label.pkl").iloc[:, 0]

        ic, ric = calc_ic(score, target)
        metrics = {}
        metrics["IC"] = ic.mean()
        metrics["ICIR"] = ic.mean() / ic.std()
        metrics["Rank IC"] = ric.mean()
        metrics["Rank ICIR"] = ric.mean() / ric.std()
        objects = {"ic.pkl": ic, "ric.pkl": ric}

        if self.ana_long_short:
            long_short_r, long_avg_r = calc_long_short_return(score, target)
            scaler = self.ann_scaler
            metrics["Long-Short Ann Return"] = long_short_r.mean() * scaler
            metrics["Long-Short Ann Sharpe"] = long_short_r.mean() / long_short_r.std() * scaler ** 0.5
            metrics["Long-Avg Ann Return"] = long_avg_r.mean() * scaler
            metrics["Long-Avg Ann Sharpe"] = long_avg_r.mean() / long_avg_r.std() * scaler ** 0.5
            objects["long_short_r.pkl"] = long_short_r
            objects["long_avg_r.pkl"] = long_avg_r

        self.recorder.log_metrics(**metrics)
        self.recorder.save_objects(**objects, artifact_path=self.get_path())
        pprint(metrics)

    def list(self):
        """Return the artifact paths written by ``generate`` for the current settings."""
        names = ["ic.pkl", "ric.pkl"]
        if self.ana_long_short:
            names += ["long_short_r.pkl", "long_avg_r.pkl"]
        return [self.get_path(name) for name in names]


class PortAnaRecord(RecordTemp):
    """
    This is the Portfolio Analysis Record class that generates the analysis results such as those of backtest. This class inherits the ``RecordTemp`` class.

    The following files will be stored in recorder (``{freq}`` is a report frequency such as the
    ones produced by ``_get_report_freq``)
    - report_normal_{freq}.pkl & positions_normal_{freq}.pkl:
        - The return report and detailed positions of the backtest, one pair per entry returned by the
          backtest function imported from ``..contrib.backtest``
    - port_analysis_{freq}.pkl : The risk analysis of your portfolio for ``risk_analysis_freq``, built from
      ``risk_analysis`` (imported from ``..contrib.evaluate``)
    """

    artifact_path = "portfolio_analysis"

    def __init__(self, recorder, config, risk_analysis_freq, **kwargs):
        """
        config["strategy"] : dict
            define the strategy class as well as the kwargs.
        config["env"] : dict
            define the env class as well as the kwargs.
        config["backtest"] : dict
            define the backtest kwargs.
        risk_analysis_freq :
            risk analysis freq of report; it is normalised through ``parse_freq`` into a
            ``"<count><freq>"`` string.
        """
        super().__init__(recorder=recorder, **kwargs)

        self.strategy_config = config["strategy"]
        self.env_config = config["env"]
        self.backtest_config = config["backtest"]
        _count, _freq = parse_freq(risk_analysis_freq)
        self.risk_analysis_freq = f"{_count}{_freq}"
        self.report_freq = self._get_report_freq(self.env_config)

    def _get_report_freq(self, env_config):
        """
        Collect the report frequencies of an env config and, recursively, of its nested ``sub_env``.

        An env contributes its ``step_bar`` (normalised through ``parse_freq``) only when its
        kwargs set ``generate_report`` to a truthy value.
        """
        env_kwargs = env_config["kwargs"]
        ret_freq = []
        if env_kwargs.get("generate_report", False):
            _count, _freq = parse_freq(env_kwargs["step_bar"])
            ret_freq.append(f"{_count}{_freq}")
        if "sub_env" in env_kwargs:
            ret_freq.extend(self._get_report_freq(env_kwargs["sub_env"]))
        return ret_freq

    def generate(self, **kwargs):
        """
        Run the backtest, save every report/positions pair, and save the risk analysis of the
        report whose frequency equals ``self.risk_analysis_freq``.

        A warning is issued (and no risk analysis is produced) when the backtest did not return
        a report for that frequency.
        """
        # custom strategy and get backtest
        report_dict = normal_backtest(env=self.env_config, strategy=self.strategy_config, **self.backtest_config)
        for report_freq, (report_normal, positions_normal) in report_dict.items():
            self.recorder.save_objects(
                **{f"report_normal_{report_freq}.pkl": report_normal}, artifact_path=PortAnaRecord.get_path()
            )
            self.recorder.save_objects(
                **{f"positions_normal_{report_freq}.pkl": positions_normal}, artifact_path=PortAnaRecord.get_path()
            )

        analysis_freq = self.risk_analysis_freq
        if analysis_freq not in report_dict:
            warnings.warn(
                f"the freq {self.risk_analysis_freq} report is not found, please set the corresponding env with `generate_report==True`"
            )
            return

        report_normal, _ = report_dict[analysis_freq]
        excess_return = report_normal["return"] - report_normal["bench"]
        analysis = {
            "excess_return_without_cost": risk_analysis(excess_return, analysis_freq),
            "excess_return_with_cost": risk_analysis(excess_return - report_normal["cost"], analysis_freq),
        }
        analysis_df = pd.concat(analysis)  # type: pd.DataFrame
        # log metrics
        self.recorder.log_metrics(**flatten_dict(analysis_df["risk"].unstack().T.to_dict()))
        # save results
        # The file is named after the analysed frequency (not after the last frequency seen in
        # the loop above) so that it matches the path reported by `list`.
        analysis_file = f"port_analysis_{analysis_freq}.pkl"
        self.recorder.save_objects(**{analysis_file: analysis_df}, artifact_path=PortAnaRecord.get_path())
        logger.info(
            f"Portfolio analysis record '{analysis_file}' has been saved as the artifact of the Experiment {self.recorder.experiment_id}"
        )
        # print out results
        pprint("The following are analysis results of the excess return without cost.")
        pprint(analysis["excess_return_without_cost"])
        pprint("The following are analysis results of the excess return with cost.")
        pprint(analysis["excess_return_with_cost"])

    def list(self):
        """Return the artifact paths expected for the configured report frequencies."""
        list_path = []
        for _freq in self.report_freq:
            list_path.extend(
                [
                    PortAnaRecord.get_path(f"report_normal_{_freq}.pkl"),
                    PortAnaRecord.get_path(f"positions_normal_{_freq}.pkl"),
                ]
            )
            if _freq == self.risk_analysis_freq:
                list_path.append(PortAnaRecord.get_path(f"port_analysis_{_freq}.pkl"))
        return list_path