qlib/tests/test_pit.py

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.


import sys
import qlib
import shutil
import unittest
import pytest
import pandas as pd
from pathlib import Path

from qlib.data import D
from qlib.tests.data import GetData

sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
from dump_pit import DumpPitData

sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts/data_collector/pit")))
from collector import Run


pd.set_option("display.width", 1000)
pd.set_option("display.max_columns", None)

DATA_DIR = Path(__file__).parent.joinpath("test_pit_data")
SOURCE_DIR = DATA_DIR.joinpath("stock_data/source")
SOURCE_DIR.mkdir(exist_ok=True, parents=True)
QLIB_DIR = DATA_DIR.joinpath("qlib_data")
QLIB_DIR.mkdir(exist_ok=True, parents=True)


class TestPIT(unittest.TestCase):
    @classmethod
    def tearDownClass(cls) -> None:
        shutil.rmtree(str(DATA_DIR.resolve()))

    @classmethod
    def setUpClass(cls) -> None:
        cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve())
        pit_dir = str(SOURCE_DIR.joinpath("pit").resolve())
        pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve())
        GetData().qlib_data(
            name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True
        )
        GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True)

        # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data.
        # bs.login()
        # Run(
        #     source_dir=pit_dir,
        #     interval="quarterly",
        # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*")
        # bs.logout()

        Run(
            source_dir=pit_dir,
            normalize_dir=pit_normalized_dir,
            interval="quarterly",
        ).normalize_data()
        DumpPitData(
            csv_path=pit_normalized_dir,
            qlib_dir=cn_data_dir,
        ).dump(interval="quarterly")

    def setUp(self):
        # qlib.init(kernels=1)  # NOTE: set kernel to 1 to make it debug easier
        provider_uri = str(QLIB_DIR.joinpath("cn_data").resolve())
        qlib.init(provider_uri=provider_uri)

    def to_str(self, obj):
        return "".join(str(obj).split())

    def check_same(self, a, b):
        self.assertEqual(self.to_str(a), self.to_str(b))

    def test_query(self):
        instruments = ["sh600519"]
        fields = ["P($$roewa_q)", "P($$yoyni_q)"]
        # Mao Tai published 2019Q2 report at 2019-07-13 & 2019-07-18
        # - http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index
        data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
        res = """
               P($$roewa_q)  P($$yoyni_q)
        count    133.000000    133.000000
        mean       0.196412      0.277930
        std        0.097591      0.030262
        min        0.000000      0.243892
        25%        0.094737      0.243892
        50%        0.255220      0.304181
        75%        0.255220      0.305041
        max        0.344644      0.305041
        """
        self.check_same(data.describe(), res)

        res = """
                               P($$roewa_q)  P($$yoyni_q)
        instrument datetime
        sh600519   2019-07-15      0.000000      0.305041
                   2019-07-16      0.000000      0.305041
                   2019-07-17      0.000000      0.305041
                   2019-07-18      0.175322      0.252650
                   2019-07-19      0.175322      0.252650
        """
        self.check_same(data.tail(), res)

    def test_no_exist_data(self):
        fields = ["P($$roewa_q)", "P($$yoyni_q)", "$close"]
        data = D.features(["sh600519", "sh601988"], fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
        data["$close"] = 1  # in case of different dataset gives different values
        expect = """
                               P($$roewa_q)  P($$yoyni_q)  $close
        instrument datetime
        sh600519   2019-01-02       0.25522      0.243892       1
                   2019-01-03       0.25522      0.243892       1
                   2019-01-04       0.25522      0.243892       1
                   2019-01-07       0.25522      0.243892       1
                   2019-01-08       0.25522      0.243892       1
        ...                             ...           ...     ...
        sh601988   2019-07-15           NaN           NaN       1
                   2019-07-16           NaN           NaN       1
                   2019-07-17           NaN           NaN       1
                   2019-07-18           NaN           NaN       1
                   2019-07-19           NaN           NaN       1

        [266 rows x 3 columns]
        """
        self.check_same(data, expect)

    @pytest.mark.slow
    def test_expr(self):
        fields = [
            "P(Mean($$roewa_q, 1))",
            "P($$roewa_q)",
            "P(Mean($$roewa_q, 2))",
            "P(Ref($$roewa_q, 1))",
            "P((Ref($$roewa_q, 1) +$$roewa_q) / 2)",
        ]
        instruments = ["sh600519"]
        data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
        expect = """
                               P(Mean($$roewa_q, 1))  P($$roewa_q)  P(Mean($$roewa_q, 2))  P(Ref($$roewa_q, 1))  P((Ref($$roewa_q, 1) +$$roewa_q) / 2)
        instrument datetime
        sh600519   2019-07-01               0.094737      0.094737               0.219691              0.344644                               0.219691
                   2019-07-02               0.094737      0.094737               0.219691              0.344644                               0.219691
                   2019-07-03               0.094737      0.094737               0.219691              0.344644                               0.219691
                   2019-07-04               0.094737      0.094737               0.219691              0.344644                               0.219691
                   2019-07-05               0.094737      0.094737               0.219691              0.344644                               0.219691
                   2019-07-08               0.094737      0.094737               0.219691              0.344644                               0.219691
                   2019-07-09               0.094737      0.094737               0.219691              0.344644                               0.219691
                   2019-07-10               0.094737      0.094737               0.219691              0.344644                               0.219691
                   2019-07-11               0.094737      0.094737               0.219691              0.344644                               0.219691
                   2019-07-12               0.094737      0.094737               0.219691              0.344644                               0.219691
                   2019-07-15               0.000000      0.000000               0.047369              0.094737                               0.047369
                   2019-07-16               0.000000      0.000000               0.047369              0.094737                               0.047369
                   2019-07-17               0.000000      0.000000               0.047369              0.094737                               0.047369
                   2019-07-18               0.175322      0.175322               0.135029              0.094737                               0.135029
                   2019-07-19               0.175322      0.175322               0.135029              0.094737                               0.135029
        """
        self.check_same(data.tail(15), expect)

    def test_unlimit(self):
        # fields = ["P(Mean($$roewa_q, 1))", "P($$roewa_q)", "P(Mean($$roewa_q, 2))", "P(Ref($$roewa_q, 1))", "P((Ref($$roewa_q, 1) +$$roewa_q) / 2)"]
        fields = ["P($$roewa_q)"]
        instruments = ["sh600519"]
        _ = D.features(instruments, fields, freq="day")  # this should not raise error
        data = D.features(instruments, fields, end_time="2020-01-01", freq="day")  # this should not raise error
        s = data.iloc[:, 0]
        # You can check the expected value based on the content in `docs/advanced/PIT.rst`
        expect = """
        instrument  datetime
        sh600519    2005-01-04         NaN
                    2007-04-30    0.090219
                    2007-08-17    0.139330
                    2007-10-23    0.245863
                    2008-03-03    0.347900
                    2008-03-13    0.395989
                    2008-04-22    0.100724
                    2008-08-28    0.249968
                    2008-10-27    0.334120
                    2009-03-25    0.390117
                    2009-04-21    0.102675
                    2009-08-07    0.230712
                    2009-10-26    0.300730
                    2010-04-02    0.335461
                    2010-04-26    0.083825
                    2010-08-12    0.200545
                    2010-10-29    0.260986
                    2011-03-21    0.307393
                    2011-04-25    0.097411
                    2011-08-31    0.248251
                    2011-10-18    0.318919
                    2012-03-23    0.403900
                    2012-04-11    0.403925
                    2012-04-26    0.112148
                    2012-08-10    0.264847
                    2012-10-26    0.370487
                    2013-03-29    0.450047
                    2013-04-18    0.099958
                    2013-09-02    0.210442
                    2013-10-16    0.304543
                    2014-03-25    0.394328
                    2014-04-25    0.083217
                    2014-08-29    0.164503
                    2014-10-30    0.234085
                    2015-04-21    0.078494
                    2015-08-28    0.137504
                    2015-10-23    0.201709
                    2016-03-24    0.264205
                    2016-04-21    0.073664
                    2016-08-29    0.136576
                    2016-10-31    0.188062
                    2017-04-17    0.244385
                    2017-04-25    0.080614
                    2017-07-28    0.151510
                    2017-10-26    0.254166
                    2018-03-28    0.329542
                    2018-05-02    0.088887
                    2018-08-02    0.170563
                    2018-10-29    0.255220
                    2019-03-29    0.344644
                    2019-04-25    0.094737
                    2019-07-15    0.000000
                    2019-07-18    0.175322
                    2019-10-16    0.255819
        Name: P($$roewa_q), dtype: float32
        """
        self.check_same(s[~s.duplicated().values], expect)

    def test_expr2(self):
        instruments = ["sh600519"]
        fields = ["P($$roewa_q)", "P($$yoyni_q)"]
        fields += ["P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1)"]
        fields += ["P(Sum($$yoyni_q, 4))"]
        fields += ["$close", "P($$roewa_q) * $close"]
        data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")
        except_data = """
                                       P($$roewa_q)  P($$yoyni_q)  P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1)  P(Sum($$yoyni_q, 4))      $close  P($$roewa_q) * $close
        instrument datetime
        sh600519   2019-01-02      0.255220      0.243892                                           1.484224                           1.661578   63.595333              16.230801
                   2019-01-03      0.255220      0.243892                                           1.484224                           1.661578   62.641907              15.987467
                   2019-01-04      0.255220      0.243892                                           1.484224                           1.661578   63.915985              16.312637
                   2019-01-07      0.255220      0.243892                                           1.484224                           1.661578   64.286530              16.407207
                   2019-01-08      0.255220      0.243892                                           1.484224                           1.661578   64.212196              16.388237
        ...                             ...           ...                                                ...                                ...         ...                    ...
                   2019-12-25      0.255819      0.219821                                           0.677052                           1.081693  122.150467              31.248409
                   2019-12-26      0.255819      0.219821                                           0.677052                           1.081693  122.301315              31.286999
                   2019-12-27      0.255819      0.219821                                           0.677052                           1.081693  125.307404              32.056015
                   2019-12-30      0.255819      0.219821                                           0.677052                           1.081693  127.763992              32.684456
                   2019-12-31      0.255819      0.219821                                           0.677052                           1.081693  127.462303              32.607277

        [244 rows x 6 columns]
        """
        self.check_same(data, except_data)

    def test_pref_operator(self):
        instruments = ["sh600519"]
        fields = [
            "PRef($$roewa_q, 201902)",
            "PRef($$yoyni_q, 201801)",
            "P($$roewa_q)",
            "P($$roewa_q) / PRef($$roewa_q, 201801)",
        ]
        data = D.features(instruments, fields, start_time="2018-04-28", end_time="2019-07-19", freq="day")
        except_data = """
                               PRef($$roewa_q, 201902)  PRef($$yoyni_q, 201801)  P($$roewa_q)  P($$roewa_q) / PRef($$roewa_q, 201801)
        instrument datetime
        sh600519   2018-05-02                      NaN                 0.395075      0.088887                                1.000000
                   2018-05-03                      NaN                 0.395075      0.088887                                1.000000
                   2018-05-04                      NaN                 0.395075      0.088887                                1.000000
                   2018-05-07                      NaN                 0.395075      0.088887                                1.000000
                   2018-05-08                      NaN                 0.395075      0.088887                                1.000000
        ...                                        ...                      ...           ...                                     ...
                   2019-07-15                 0.000000                 0.395075      0.000000                                0.000000
                   2019-07-16                 0.000000                 0.395075      0.000000                                0.000000
                   2019-07-17                 0.000000                 0.395075      0.000000                                0.000000
                   2019-07-18                 0.175322                 0.395075      0.175322                                1.972414
                   2019-07-19                 0.175322                 0.395075      0.175322                                1.972414

        [299 rows x 4 columns]
        """
        self.check_same(data, except_data)


if __name__ == "__main__":
    unittest.main()