diff --git a/README.md b/README.md index bf955e61d..98a09b619 100644 --- a/README.md +++ b/README.md @@ -1,60 +1,84 @@ + +
+ +
+ + Qlib is an AI-oriented quantitative investment platform, which aims to realize the potential, empower the research, and create the value of AI technologies in quantitative investment. -With Qlib, you can easily apply your favorite model to create a better Quant investment strategy. +With Qlib, you can easily try your ideas to create better Quant investment strategies. + +For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative Investment Platform"](https://arxiv.org/abs/2009.11189). - [Framework of Qlib](#framework-of-qlib) -- [Quick start](#quick-start) - - [Installation](#installation) - - [Get Data](#get-data) - - [Auto Quant research workflow with _estimator_](#auto-quant-research-workflow-with-estimator) - - [Customized Quant research workflow by code](#customized-quant-research-workflow-by-code) -- [More About Qlib](#more-about-qlib) - - [Offline mode and online mode](#offline-mode-and-online-mode) - - [Performance of Qlib Data Server](#performance-of-qlib-data-server) -- [Contributing](#contributing) +- [Quick Start](#Quick-Start) + - [Installation](#Installation) + - [Data Preparation](#Data-Preparation) + - [Auto Quant Research Workflow](#Auto-Quant-Research-Workflow) + - [Building Customized Quant Research Workflow by Code](#Building-Customized-Quant-Research-Workflow-by-Code) +- [More About Qlib](#More-About-Qlib) +- [Offline Mode and Online Mode of the Data Server](#Offline-Mode-and-Online-Mode-of-the-Data-Server) + - [Performance of Qlib Data Server](#Performance-of-Qlib-Data-Server) +- [Contributing](#Contributing) # Framework of Qlib -![framework](docs/_static/img/framework.png) +
+ +
-At the module level, Qlib is a platform that consists of the above components. Each component is loose-coupling and can be used stand-alone. + +At the module level, Qlib is a platform that consists of the above components. The components are designed as loose-coupled modules and each component could be used stand-alone. | Name | Description | | ------ | ----- | -| _Data layer_ | _DataServer_ focus on providing high performance infrastructure for user to retrieve and get raw data. _DataEnhancement_ will preprocess the data and provide the best dataset to be fed in to the models | -| _Interday Model_ | _Interday model_ focus on producing forecasting signals(aka. _alpha_). Models are trained by _Model Creator_ and managed by _Model Manager_. User could choose one or multiple models for forecasting. Multiple models could be combined with _Ensemble_ module | -| _Interday Strategy_ | _Portfolio Generator_ will take forecasting signals as input and output the orders based on current position to achieve target portfolio | -| _Intraday Trading_ | _Order Executor_ is responsible for executing orders produced by _Interday Strategy_ and returning the executed results. | -| _Analysis_ | User could get detailed analysis report of forecasting signal and portfolio in this part. | +| `Data layer` | `DataServer` focuses on providing high-performance infrastructure for users to manage and retrieve raw data. `DataEnhancement` will preprocess the data and provide the best dataset to be fed into the models. | +| `Interday Model` | `Interday model` focuses on producing prediction scores (aka. _alpha_). Models are trained by `Model Creator` and managed by `Model Manager`. Users could choose one or multiple models for prediction. Multiple models could be combined with `Ensemble` module. | +| `Interday Strategy` | `Portfolio Generator` will take prediction scores as input and output the orders based on the current position to achieve the target portfolio. 
| +| `Intraday Trading` | `Order Executor` is responsible for executing orders output by `Interday Strategy` and returning the executed results. | +| `Analysis` | Users could get a detailed analysis report of forecasting signals and portfolios in this part. | -* The modules with hand-drawn style is under development and will be released in the future. -* The modules with dashed border is highly user-customizable and extendible. +* The modules with hand-drawn style are under development and will be released in the future. +* The modules with dashed borders are highly user-customizable and extendible. -# Quick start +# Quick Start + +This quick start guide tries to demonstrate: +1. It's very easy to build a complete Quant research workflow and try your ideas with _Qlib_. +1. Though with *public data* and *simple models*, machine learning technologies **work very well** in practical Quant investment. ## Installation -To install Qlib from source you need _Cython_ in addition to the normal dependencies above: +Users can easily install ``Qlib`` according to the following steps: -```bash -pip install numpy -pip install --upgrade cython -``` +* Before installing ``Qlib`` from source, you need to install some dependencies: -Clone the repository and then run: -```bash -python setup.py install -``` + ```bash + pip install numpy + pip install --upgrade cython + ``` + +* Clone the repository and install ``Qlib``: + + ```bash + git clone https://github.com/microsoft/qlib.git && cd qlib + python setup.py install + ``` -## Get Data -- Load and prepare the Data: execute the following command to load the stock data: +## Data Preparation +Load and prepare data by running the following code: ```bash python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data ``` + +This dataset is created from public data collected by [crawler scripts](scripts/data_collector/), which have been released in +the same repository. +Users could create the same dataset with them. 
+ -## Auto Quant research workflow with _estimator_ -Qlib provides a tool named `estimator` to run whole workflow automatically(including building dataset, train models, backtest, analysis) +## Auto Quant Research Workflow +Qlib provides a tool named `Estimator` to run the whole workflow automatically (including building dataset, training models, backtest and evaluation). You can start an auto quant research workflow and get a graphical report analysis according to the following steps: -1. Run _estimator_ (_config.yaml_ for: [estimator_config.yaml](examples/estimator/estimator_config.yaml)): +1. Quant Research Workflow: Run `Estimator` with [estimator_config.yaml](examples/estimator/estimator_config.yaml) as follows. + ```bash + cd examples # Avoid running the program under a directory that contains `qlib` + estimator -c estimator/estimator_config.yaml + ``` + The result of `Estimator` is as follows. Please refer to [Intraday Trading](https://qlib.readthedocs.io/en/latest/component/backtest.html) for more details about the result. ```bash - cd examples # Avoid running program under the directory contains `qlib` - estimator -c estimator/estimator_config.yaml + + risk + excess_return_without_cost mean 0.000605 + std 0.005481 + annualized_return 0.152373 + information_ratio 1.751319 + max_drawdown -0.059055 + excess_return_with_cost mean 0.000410 + std 0.005478 + annualized_return 0.103265 + information_ratio 1.187411 + max_drawdown -0.075024 + + ``` - - Estimator result: - - ```bash + Here are detailed documents for [Estimator](https://qlib.readthedocs.io/en/latest/component/estimator.html). - risk - sub_bench mean 0.000662 - std 0.004487 - annual 0.166720 - sharpe 2.340526 - mdd -0.080516 - sub_cost mean 0.000577 - std 0.004482 - annual 0.145392 - sharpe 2.043494 - mdd -0.083584 - ``` - See the full documents for [Use _Estimator_ to Start An Experiment](TODO:URL). - -2. 
Analysis - - Run `examples/estimator/analyze_from_estimator.ipynb` in `jupyter notebook` - 1. forecasting signal analysis - - Cumulative Return - - ![Cumulative Return](docs/_static/img/analysis/analysis_model_cumulative_return.png) - ![long_short](docs/_static/img/analysis/analysis_model_long_short.png) - - Information Coefficient(IC) - - ![Information Coefficient](docs/_static/img/analysis/analysis_model_IC.png) - ![Monthly IC](docs/_static/img/analysis/analysis_model_monthly_IC.png) - ![IC](docs/_static/img/analysis/analysis_model_NDQ.png) - - Auto Correlation - - ![Auto Correlation](docs/_static/img/analysis/analysis_model_auto_correlation.png) - - +2. Graphical Reports Analysis: Run `examples/estimator/analyze_from_estimator.ipynb` with `jupyter notebook` to get graphical reports + - Forecasting signal (model prediction) analysis + - Cumulative Return of groups + ![Cumulative Return](docs/_static/img/analysis/analysis_model_cumulative_return.png) + - Return distribution + ![long_short](docs/_static/img/analysis/analysis_model_long_short.png) + - Information Coefficient (IC) + ![Information Coefficient](docs/_static/img/analysis/analysis_model_IC.png) + ![Monthly IC](docs/_static/img/analysis/analysis_model_monthly_IC.png) + ![IC](docs/_static/img/analysis/analysis_model_NDQ.png) + - Auto Correlation of forecasting signal (model prediction) + ![Auto Correlation](docs/_static/img/analysis/analysis_model_auto_correlation.png) - 2. portfolio analysis - - Report - - ![Report](docs/_static/img/analysis/report.png) - + - Portfolio analysis + - Backtest return + ![Report](docs/_static/img/analysis/report.png) + -## Customized Quant research workflow by code -Automatic workflow may not suite the research workflow of all Quant researchers. To support flexible Quant research workflow, Qlib also provide modularized interface to allow researchers to build their own workflow. 
[Here](TODO_URL) is a demo for customized Quant research workflow by code +## Building Customized Quant Research Workflow by Code +The automatic workflow may not suit the research workflow of all Quant researchers. To support a flexible Quant research workflow, Qlib also provides a modularized interface to allow researchers to build their own workflow by code. [Here](examples/train_backtest_analyze.ipynb) is a demo for customized Quant research workflow by code. # More About Qlib -The detailed documents are organized in [docs](docs). +The detailed documents are organized in [docs](docs/). [Sphinx](http://www.sphinx-doc.org) and the readthedocs theme is required to build the documentation in html formats. ```bash cd docs/ @@ -160,32 +177,32 @@ conda install sphinx sphinx_rtd_theme -y # pip install sphinx sphinx_rtd_theme make html ``` -You can also view the [latest document](TODO_URL) online directly. +You can also view the [latest document](http://qlib.readthedocs.io/) online directly. -The roadmap is managed as a [github project](https://github.com/microsoft/qlib/projects/1). +Qlib is in active and continuing development. Our plan is in the roadmap, which is managed as a [github project](https://github.com/microsoft/qlib/projects/1). -## Offline mode and online mode -The data server of Qlib can both deployed as offline mode and online mode. The default mode is offline mode. +# Offline Mode and Online Mode of the Data Server +The data server of Qlib can be deployed in either offline mode or online mode. The default mode is offline mode. Under offline mode, the data will be deployed locally. -Under online mode, the data will be deployed as a shared data service. The data and their cache will be shared by clients. The data retrieving performance is expected to be improved due to a higher rate of cache hits. It will use less disk space, too. The documents of the online mode can be found in [Qlib-Server](TODO_link). 
The online mode can be deployed automatically with [Azure CLI based scripts](TODO_link) +Under online mode, the data will be deployed as a shared data service. The data and their cache will be shared by all the clients. The data retrieval performance is expected to be improved due to a higher rate of cache hits. It will consume less disk space, too. The documents of the online mode can be found in [Qlib-Server](https://qlib-server.readthedocs.io/). The online mode can be deployed automatically with [Azure CLI based scripts](https://qlib-server.readthedocs.io/en/latest/build.html#one-click-deployment-in-azure). The source code of online data server can be found in [qlib-server repository](https://github.com/microsoft/qlib-server). ## Performance of Qlib Data Server -The performance of data processing is important to data-driven methods like AI technologies. As an AI-oriented platform, Qlib provides a solution for data storage and data processing. To demonstrate the performance of Qlib, We -compare Qlib with several other solutions. +The performance of data processing is important to data-driven methods like AI technologies. As an AI-oriented platform, Qlib provides a solution for data storage and data processing. To demonstrate the performance of Qlib data server, we +compare it with several other data storage solutions. -We evaluate the performance of several solutions by completing the same task, -which creates a dataset(14 features/factors) from the basic OHLCV daily data of a stock market(800 stocks each day from 2007 to 2020). The task involves data queries and processing. +We evaluate the performance of several storage solutions by finishing the same task, +which creates a dataset (14 features/factors) from the basic OHLCV daily data of a stock market (800 stocks each day from 2007 to 2020). The task involves data queries and processing. 
| | HDF5 | MySQL | MongoDB | InfluxDB | Qlib -E -D | Qlib +E -D | Qlib +E +D | | -- | ------ | ------ | -------- | --------- | ----------- | ------------ | ----------- | | Total (1CPU) (seconds) | 184.4±3.7 | 365.3±7.5 | 253.6±6.7 | 368.2±3.6 | 147.0±8.8 | 47.6±1.0 | **7.4±0.3** | | Total (64CPU) (seconds) | | | | | 8.8±0.6 | **4.2±0.2** | | -* `+(-)E` indicates with(out) `ExpressionCache` -* `+(-)D` indicates with(out) `DatasetCache` +* `+(-)E` indicates with(out) `ExpressionCache` +* `+(-)D` indicates with(out) `DatasetCache` Most general-purpose databases take too much time on loading data. After looking into the underlying implementation, we find that data go through too many layers of interfaces and unnecessary format transformations in general-purpose database solutions. Such overheads greatly slow down the data loading process. @@ -199,7 +216,7 @@ Qlib data are stored in a compact format, which is efficient to be combined into This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). 
Simply follow the instructions diff --git a/docs/_static/img/analysis/analysis_model_IC.png b/docs/_static/img/analysis/analysis_model_IC.png index 157324bab..4769e55e6 100644 Binary files a/docs/_static/img/analysis/analysis_model_IC.png and b/docs/_static/img/analysis/analysis_model_IC.png differ diff --git a/docs/_static/img/analysis/analysis_model_NDQ.png b/docs/_static/img/analysis/analysis_model_NDQ.png index 6f95ed65d..5b3430d71 100644 Binary files a/docs/_static/img/analysis/analysis_model_NDQ.png and b/docs/_static/img/analysis/analysis_model_NDQ.png differ diff --git a/docs/_static/img/analysis/analysis_model_auto_correlation.png b/docs/_static/img/analysis/analysis_model_auto_correlation.png index db330c98a..16eca717d 100644 Binary files a/docs/_static/img/analysis/analysis_model_auto_correlation.png and b/docs/_static/img/analysis/analysis_model_auto_correlation.png differ diff --git a/docs/_static/img/analysis/analysis_model_cumulative_return.png b/docs/_static/img/analysis/analysis_model_cumulative_return.png index 3b79ead49..84dd2cde3 100644 Binary files a/docs/_static/img/analysis/analysis_model_cumulative_return.png and b/docs/_static/img/analysis/analysis_model_cumulative_return.png differ diff --git a/docs/_static/img/analysis/analysis_model_long_short.png b/docs/_static/img/analysis/analysis_model_long_short.png index 8a35dc37c..c3afa07ab 100644 Binary files a/docs/_static/img/analysis/analysis_model_long_short.png and b/docs/_static/img/analysis/analysis_model_long_short.png differ diff --git a/docs/_static/img/analysis/analysis_model_monthly_IC.png b/docs/_static/img/analysis/analysis_model_monthly_IC.png index 682aa7194..86b4235eb 100644 Binary files a/docs/_static/img/analysis/analysis_model_monthly_IC.png and b/docs/_static/img/analysis/analysis_model_monthly_IC.png differ diff --git a/docs/_static/img/analysis/analysis_model_top_bottom_turnover.png b/docs/_static/img/analysis/analysis_model_top_bottom_turnover.png deleted file mode 100644 
index 4266d3f85..000000000 Binary files a/docs/_static/img/analysis/analysis_model_top_bottom_turnover.png and /dev/null differ diff --git a/docs/_static/img/analysis/cumulative_return_buy.png b/docs/_static/img/analysis/cumulative_return_buy.png index ea43fecec..23e06fa98 100644 Binary files a/docs/_static/img/analysis/cumulative_return_buy.png and b/docs/_static/img/analysis/cumulative_return_buy.png differ diff --git a/docs/_static/img/analysis/cumulative_return_buy_minus_sell.png b/docs/_static/img/analysis/cumulative_return_buy_minus_sell.png index 65ab5a823..5d976e807 100644 Binary files a/docs/_static/img/analysis/cumulative_return_buy_minus_sell.png and b/docs/_static/img/analysis/cumulative_return_buy_minus_sell.png differ diff --git a/docs/_static/img/analysis/cumulative_return_hold.png b/docs/_static/img/analysis/cumulative_return_hold.png index 2a0dfaca9..d221f5681 100644 Binary files a/docs/_static/img/analysis/cumulative_return_hold.png and b/docs/_static/img/analysis/cumulative_return_hold.png differ diff --git a/docs/_static/img/analysis/cumulative_return_sell.png b/docs/_static/img/analysis/cumulative_return_sell.png index be4eeb942..4f646b4c5 100644 Binary files a/docs/_static/img/analysis/cumulative_return_sell.png and b/docs/_static/img/analysis/cumulative_return_sell.png differ diff --git a/docs/_static/img/analysis/rank_label_buy.png b/docs/_static/img/analysis/rank_label_buy.png index bfa0eedda..2d3ed8851 100644 Binary files a/docs/_static/img/analysis/rank_label_buy.png and b/docs/_static/img/analysis/rank_label_buy.png differ diff --git a/docs/_static/img/analysis/rank_label_hold.png b/docs/_static/img/analysis/rank_label_hold.png index d176b6d91..588121835 100644 Binary files a/docs/_static/img/analysis/rank_label_hold.png and b/docs/_static/img/analysis/rank_label_hold.png differ diff --git a/docs/_static/img/analysis/rank_label_sell.png b/docs/_static/img/analysis/rank_label_sell.png index 1f9b5cb80..27040ab60 100644 Binary files 
a/docs/_static/img/analysis/rank_label_sell.png and b/docs/_static/img/analysis/rank_label_sell.png differ diff --git a/docs/_static/img/analysis/report.png b/docs/_static/img/analysis/report.png index 622808026..28fefb177 100644 Binary files a/docs/_static/img/analysis/report.png and b/docs/_static/img/analysis/report.png differ diff --git a/docs/_static/img/analysis/risk_analysis_annual.png b/docs/_static/img/analysis/risk_analysis_annual.png deleted file mode 100644 index 0a2b4ee40..000000000 Binary files a/docs/_static/img/analysis/risk_analysis_annual.png and /dev/null differ diff --git a/docs/_static/img/analysis/risk_analysis_annualized_return.png b/docs/_static/img/analysis/risk_analysis_annualized_return.png new file mode 100644 index 000000000..f15f315b3 Binary files /dev/null and b/docs/_static/img/analysis/risk_analysis_annualized_return.png differ diff --git a/docs/_static/img/analysis/risk_analysis_bar.png b/docs/_static/img/analysis/risk_analysis_bar.png index 8c6d4b9c5..6597317fd 100644 Binary files a/docs/_static/img/analysis/risk_analysis_bar.png and b/docs/_static/img/analysis/risk_analysis_bar.png differ diff --git a/docs/_static/img/analysis/risk_analysis_information_ratio.png b/docs/_static/img/analysis/risk_analysis_information_ratio.png new file mode 100644 index 000000000..3bef1069d Binary files /dev/null and b/docs/_static/img/analysis/risk_analysis_information_ratio.png differ diff --git a/docs/_static/img/analysis/risk_analysis_max_drawdown.png b/docs/_static/img/analysis/risk_analysis_max_drawdown.png new file mode 100644 index 000000000..c2e8b0818 Binary files /dev/null and b/docs/_static/img/analysis/risk_analysis_max_drawdown.png differ diff --git a/docs/_static/img/analysis/risk_analysis_mdd.png b/docs/_static/img/analysis/risk_analysis_mdd.png deleted file mode 100644 index 43cdcc9b6..000000000 Binary files a/docs/_static/img/analysis/risk_analysis_mdd.png and /dev/null differ diff --git 
a/docs/_static/img/analysis/risk_analysis_sharpe.png b/docs/_static/img/analysis/risk_analysis_sharpe.png deleted file mode 100644 index 29ac61651..000000000 Binary files a/docs/_static/img/analysis/risk_analysis_sharpe.png and /dev/null differ diff --git a/docs/_static/img/analysis/risk_analysis_std.png b/docs/_static/img/analysis/risk_analysis_std.png index b33cd6d12..49e7e287c 100644 Binary files a/docs/_static/img/analysis/risk_analysis_std.png and b/docs/_static/img/analysis/risk_analysis_std.png differ diff --git a/docs/_static/img/analysis/score_ic.png b/docs/_static/img/analysis/score_ic.png index 4398c05e0..441998724 100644 Binary files a/docs/_static/img/analysis/score_ic.png and b/docs/_static/img/analysis/score_ic.png differ diff --git a/docs/_static/img/logo/1.png b/docs/_static/img/logo/1.png new file mode 100644 index 000000000..a897c7321 Binary files /dev/null and b/docs/_static/img/logo/1.png differ diff --git a/docs/_static/img/logo/2.png b/docs/_static/img/logo/2.png new file mode 100644 index 000000000..3b1a21634 Binary files /dev/null and b/docs/_static/img/logo/2.png differ diff --git a/docs/_static/img/logo/3.png b/docs/_static/img/logo/3.png new file mode 100644 index 000000000..ec7b3e884 Binary files /dev/null and b/docs/_static/img/logo/3.png differ diff --git a/docs/_static/img/logo/white_bg_rec+word.png b/docs/_static/img/logo/white_bg_rec+word.png new file mode 100644 index 000000000..6390e7df5 Binary files /dev/null and b/docs/_static/img/logo/white_bg_rec+word.png differ diff --git a/docs/_static/img/logo/yel_bg_rec+word.png b/docs/_static/img/logo/yel_bg_rec+word.png new file mode 100644 index 000000000..a14d53364 Binary files /dev/null and b/docs/_static/img/logo/yel_bg_rec+word.png differ diff --git a/docs/_static/img/logo/yellow_bg_rec+word .png b/docs/_static/img/logo/yellow_bg_rec+word .png new file mode 100644 index 000000000..aa1bf610d Binary files /dev/null and b/docs/_static/img/logo/yellow_bg_rec+word .png differ diff --git 
a/docs/_static/img/logo/yellow_bg_rec.png b/docs/_static/img/logo/yellow_bg_rec.png new file mode 100644 index 000000000..45d7710a5 Binary files /dev/null and b/docs/_static/img/logo/yellow_bg_rec.png differ diff --git a/docs/advanced/alpha.rst b/docs/advanced/alpha.rst index 63a8be777..ba58b924f 100644 --- a/docs/advanced/alpha.rst +++ b/docs/advanced/alpha.rst @@ -45,7 +45,7 @@ Example Users can use ``Data Handler`` to build formulaic alphas `MACD` in qlib: -.. note:: Users need to initialize ``Qlib`` with `qlib.init` first. Please refer to `initialization `_. +.. note:: Users need to initialize ``Qlib`` with `qlib.init` first. Please refer to `initialization <../start/initialization.html>`_. .. code-block:: python diff --git a/docs/advanced/server.rst b/docs/advanced/server.rst new file mode 100644 index 000000000..230c4f04b --- /dev/null +++ b/docs/advanced/server.rst @@ -0,0 +1,28 @@ +.. _server: +================================= +``Online`` & ``Offline`` mode +================================= +.. currentmodule:: qlib + + +Introduction +============= + +``Qlib`` supports ``Online`` mode and ``Offline`` mode. Only the ``Offline`` mode is introduced in this document. + +The ``Online`` mode is designed to solve the following problems: + +- Manage the data in a centralized way. Users don't have to manage data of different versions. +- Reduce the amount of cache to be generated. +- Make the data accessible remotely. + +Qlib-Server +=============== + +``Qlib-Server`` is the assorted server system for ``Qlib``, which utilizes ``Qlib`` for basic calculations and provides an extensive server system and cache mechanism. With ``Qlib-Server``, the data provided for ``Qlib`` can be managed in a centralized manner. With ``Qlib-Server``, users can use ``Qlib`` in ``Online`` mode. + + + +Reference +================= +If users are interested in ``Qlib-Server`` and ``Online`` mode, please refer to `Qlib-Server Project <https://github.com/microsoft/qlib-server>`_ and `Qlib-Server Document <https://qlib-server.readthedocs.io/>`_. 
\ No newline at end of file diff --git a/docs/component/backtest.rst b/docs/component/backtest.rst index 614eab830..2d9f3a25b 100644 --- a/docs/component/backtest.rst +++ b/docs/component/backtest.rst @@ -7,7 +7,7 @@ Intraday Trading: Model&Strategy Testing Introduction =================== -``Intraday Trading`` is designed to test models and strategies, which help users to check the performance of custom model/strategy. +``Intraday Trading`` is designed to test models and strategies, which help users to check the performance of a custom model/strategy. .. note:: @@ -19,11 +19,11 @@ Introduction Example =========================== -Users need to generate a prediction score(a pandas DataFrame) with MultiIndex and a `score` column. And users need to assign a strategy used in backtest, if strategy is not assigned, +Users need to generate a `prediction score`(a pandas DataFrame) with MultiIndex and a `score` column. And users need to assign a strategy used in backtest, if strategy is not assigned, a `TopkDropoutStrategy` strategy with `(topk=50, n_drop=5, risk_degree=0.95, limit_threshold=0.0095)` will be used. -If ``Strategy`` module is not user's interested part, `TopkDropoutStrategy` is enough. +If ``Strategy`` module is not users' interested part, `TopkDropoutStrategy` is enough. -The simple example with default strategy is as follows. +The simple example of the default strategy is as follows. .. code-block:: python @@ -31,14 +31,14 @@ The simple example with default strategy is as follows. # pred_score is the prediction score report, positions = backtest(pred_score, topk=50, n_drop=0.5, verbose=False, limit_threshold=0.0095) -To know more about backtesting with specific strategy, please refer to `Strategy `_. +To know more about backtesting with a specific strategy, please refer to `Strategy `_. To know more about the prediction score `pred_score` output by ``Model``, please refer to `Interday Model: Model Training & Prediction `_. 
Prediction Score ----------------- -The prediction score is a pandas DataFrame. Its index is and it must +The `prediction score` is a pandas DataFrame. Its index is and it must contains a `score` column. A prediction sample is shown as follows. @@ -67,37 +67,44 @@ The backtest results are in the following form: .. code-block:: python - sub_bench mean 0.000662 - std 0.004487 - annual 0.166720 - sharpe 2.340526 - mdd -0.080516 - sub_cost mean 0.000577 - std 0.004482 - annual 0.145392 - sharpe 2.043494 - mdd -0.083584 + risk + excess_return_without_cost mean 0.000605 + std 0.005481 + annualized_return 0.152373 + information_ratio 1.751319 + max_drawdown -0.059055 + excess_return_with_cost mean 0.000410 + std 0.005478 + annualized_return 0.103265 + information_ratio 1.187411 + max_drawdown -0.075024 -- `sub_bench` - Returns of the portfolio without deduction of fees -- `sub_cost` - Returns of the portfolio with deduction of fees -- `mean` - Mean value of the returns sequence(difference sequence of assets). +- `excess_return_without_cost` + - `mean` + Mean value of the `CAR` (cumulative abnormal return) without cost + - `std` + The `Standard Deviation` of `CAR` (cumulative abnormal return) without cost. + - `annualized_return` + The `Annualized Rate` of `CAR` (cumulative abnormal return) without cost. + - `information_ratio` + The `Information Ratio` without cost. please refer to `Information Ratio – IR `_. + - `max_drawdown` + The `Maximum Drawdown` of `CAR` (cumulative abnormal return) without cost, please refer to `Maximum Drawdown (MDD) `_. -- `std` - Standard deviation of the returns sequence(difference sequence of assets). +- `excess_return_with_cost` + - `mean` + Mean value of the `CAR` (cumulative abnormal return) series with cost + - `std` + The `Standard Deviation` of `CAR` (cumulative abnormal return) series with cost. + - `annualized_return` + The `Annualized Rate` of `CAR` (cumulative abnormal return) with cost. 
+ - `information_ratio` + The `Information Ratio` with cost. please refer to `Information Ratio – IR `_. + - `max_drawdown` + The `Maximum Drawdown` of `CAR` (cumulative abnormal return) with cost, please refer to `Maximum Drawdown (MDD) `_. -- `annual` - Average annualized returns of the portfolio. - -- `ir` - Information Ratio, please refer to `Information Ratio – IR `_. - -- `mdd` - Maximum Drawdown, please refer to `Maximum Drawdown (MDD) `_. Reference diff --git a/docs/component/data.rst b/docs/component/data.rst index 4f8969969..a0e9a8398 100644 --- a/docs/component/data.rst +++ b/docs/component/data.rst @@ -6,79 +6,106 @@ Data Layer: Data Framework&Usage Introduction ============================ -``Data Layer`` is designed to download raw data, retrieve data, construct datasets and get frequently-used data. +``Data Layer`` provides user-friendly APIs to manage and retrieve data. It provides high-performance data infrastructure. -Also, users can building formulaic alphas with ``Data Layer`` easliy. If users are interesting formulaic alphas, please refer to `Building Formulaic Alphas <../advanced/alpha.html>`_. +It is designed for quantitative investment. For example, users could build formulaic alphas with ``Data Layer`` easily. Please refer to `Building Formulaic Alphas <../advanced/alpha.html>`_ for more details. -The ``Data Layer`` framework includes four components as follows. +The introduction of ``Data Layer`` includes the following parts. -- Raw Data +- Data Preparation - Data API - Data Handler - Cache +- Data and Cache File Structure - -Raw Data +Data Preparation ============================ -``Qlib`` provides the script ``scripts/get_data.py`` to download the raw data that will be used to initialize the qlib package, please refer to `Initialization <../start/initialization.rst>`_. 
+Qlib Format Data +------------------ -When ``Qlib`` is initialized, users can choose china-stock mode or US-stock mode, please refer to `Initialization <../start/initialization.rst>`_. +We've specially designed a data structure to manage financial data. Please refer to the `File storage design section in Qlib paper <https://arxiv.org/abs/2009.11189>`_ for detailed information. +Such data will be stored with filename suffix `.bin` (We'll call them `.bin` file, `.bin` format or qlib format). `.bin` file is designed for scientific computing on finance data. -China-Stock Market Mode +Qlib Format Dataset +-------------------- +``Qlib`` has provided an off-the-shelf dataset in `.bin` format. Users could use the script ``scripts/get_data.py`` to download the dataset as follows. + +.. code-block:: bash + + python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data + +After running the above command, users can find china-stock data in Qlib format in the ``~/.qlib/qlib_data/cn_data`` directory. + +``Qlib`` also provides the scripts in ``scripts/data_collector`` to help users crawl the latest data on the Internet and convert it to qlib format. + +When ``Qlib`` is initialized with this dataset, users could build and evaluate their own models with it. Please refer to `Initialization <../start/initialization.html>`_ for more details. + +Converting CSV Format into Qlib Format +------------------------------------------- + +``Qlib`` has provided the script ``scripts/dump_bin.py`` to convert data in CSV format into `.bin` files (Qlib format). + + +Users can download the china-stock data in CSV format as follows for reference to the CSV format. + +.. code-block:: bash + + python scripts/get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data + + +Suppose that users prepare their CSV format data in the directory ``~/.qlib/csv_data/my_data``, they can run the following command to start the conversion. + +.. 
code-block:: bash + + python scripts/dump_bin.py dump --csv_path ~/.qlib/csv_data/my_data --qlib_dir ~/.qlib/qlib_data/my_data --include_fields open,close,high,low,volume,factor + +After conversion, users can find their Qlib format data in the directory `~/.qlib/qlib_data/my_data`. + +.. note:: + + The arguments of `--include_fields` should correspond to the column names of the CSV files. The column names of the dataset provided by ``Qlib`` include `open`, `close`, `high`, `low`, `volume`, and `factor`. + + - `open` + The opening price + - `close` + The closing price + - `high` + The highest price + - `low` + The lowest price + - `volume` + The trading volume + - `factor` + The restoration (price adjustment) factor + + +China-Stock Mode & US-Stock Mode -------------------------------- -If users use ``Qlib`` in china-stock mode, china-stock data is required. The script ``scripts/get_data.py`` can be used to download china-stock data. If users want to use ``Qlib`` in china-stock mode, they need to do as follows. +- If users use ``Qlib`` in china-stock mode, china-stock data is required. Users can use ``Qlib`` in china-stock mode according to the following steps: + - Download china-stock data in Qlib format; please refer to section `Qlib Format Dataset <#qlib-format-dataset>`_. + - Initialize ``Qlib`` in china-stock mode + Suppose that users have downloaded their Qlib format data to the directory ``~/.qlib/qlib_data/cn_data``. Users only need to initialize ``Qlib`` as follows. + + .. code-block:: python -- Download data in qlib format - Run the following command to download china-stock data in csv format. - - .. code-block:: bash - - python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data - - Users can find china-stock data in qlib format in the'~/.qlib/csv_data/cn_data' directory. - -- Initialize ``Qlib`` in china-stock mode - Users only need to initialize ``Qlib`` as follows. - - ..
code-block:: python - - from qlib.config import REG_CN - qlib.init(provider_uri='~/.qlib/qlib_data/cn_data', region=REG_CN) + from qlib.config import REG_CN + qlib.init(provider_uri='~/.qlib/qlib_data/cn_data', region=REG_CN) -US-Stock Market Mode -------------------------- -If users use ``Qlib`` in US-stock mode, US-stock data is required. ``Qlib`` does not provide script to download US-stock data. If users want to use ``Qlib`` in US-stock market mode, they need to do as follows. - -- Prepare data in csv format - Users need to prepare US-stock data in csv format by themselves, which is in the same format as the china-stock data in csv format. Please download the china-stock data in csv format as follows for reference of format. - - .. code-block:: bash - - python scripts/get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data - - -- Convert data from csv format to ``Qlib`` format - ``Qlib`` provides the script ``scripts/dump_bin.py`` to convert data from csv format to qlib format. - Assuming that the users store the US-stock data in csv format in path '~/.qlib/csv_data/us_data', they need to execute the following command to convert the data from csv format to ``Qlib`` format: - - .. code-block:: bash - - python scripts/dump_bin.py dump --csv_path ~/.qlib/csv_data/us_data --qlib_dir ~/.qlib/qlib_data/us_data --include_fields open,close,high,low,volume,factor - -- Initialize ``Qlib`` in US-stock mode - Users only need to initialize ``Qlib`` as follows. - - .. code-block:: python - - from qlib.config import REG_US - qlib.init(provider_uri='~/.qlib/qlib_data/us_data', region=REG_US) +- If users use ``Qlib`` in US-stock mode, US-stock data is required. ``Qlib`` does not provide a script to download US-stock data. 
Users can use ``Qlib`` in US-stock mode according to the following steps: + - Prepare data in CSV format + - Convert data from CSV format to Qlib format; please refer to section `Converting CSV Format into Qlib Format <#converting-csv-format-into-qlib-format>`_. + - Initialize ``Qlib`` in US-stock mode + Suppose that users have prepared their Qlib format data in the directory ``~/.qlib/qlib_data/us_data``. Users only need to initialize ``Qlib`` as follows. + .. code-block:: python -Please refer to `Script API <../reference/api.html>`_ for more details. + from qlib.config import REG_US + qlib.init(provider_uri='~/.qlib/qlib_data/us_data', region=REG_US) + Data API ======================== @@ -90,10 +117,10 @@ Users can use APIs in ``qlib.data`` to retrieve data, please refer to `Data Retr Feature ------------------ -``Qlib`` provides `Feature` and `ExpressionOps` to fetch the features according to users' need. +``Qlib`` provides `Feature` and `ExpressionOps` to fetch the features according to users' needs. - `Feature` - Load data from data provider. + Load data from the data provider. Users can get features like `$high`, `$low`, `$open`, `$close`, etc., which should correspond to the arguments of `--include_fields`; please refer to section `Converting CSV Format into Qlib Format <#converting-csv-format-into-qlib-format>`_. - `ExpressionOps` `ExpressionOps` will use operator for feature construction. @@ -103,7 +130,7 @@ To know more about ``Feature``, please refer to `Feature API <../reference/api. Filter ------------------- -``Qlib`` provides `NameDFilter` and `ExpressionDFilter` to filter the instruments according to users' need. +``Qlib`` provides `NameDFilter` and `ExpressionDFilter` to filter the instruments according to users' needs. - `NameDFilter` Name dynamic instrument filter. Filter the instruments based on a regulated name format. A name rule regular expression is required.
@@ -121,14 +148,14 @@ To know more about ``Filter``, please refer to `Filter API <../reference/api.htm API ------------- -To know more about ``Data Api``, please refer to `Data Api <../reference/api.html>`_. +To know more about ``Data API``, please refer to `Data API <../reference/api.html>`_. Data Handler ================= -``Data Handler`` is a part of ``estimator`` and can also be used as a single module. +Users can use ``Data Handler`` in an automatic workflow by ``Estimator``, refer to `Estimator `_ for more details. -``Data Handler`` can be used to load raw data, prepare features and label columns, preprocess data(standardization, remove NaN, etc.), split training, validation, and test sets. It is a subclass of ``qlib.contrib.estimator.handler.BaseDataHandler``, which provides some interfaces, for example: +Also, ``Data Handler`` can be used as an independent module, by which users can easily preprocess data(standardization, remove NaN, etc.) and build datasets. It is a subclass of ``qlib.contrib.estimator.handler.BaseDataHandler``, which provides some interfaces as follows. Base Class & Interface ---------------------- @@ -139,20 +166,20 @@ Qlib provides a base class `qlib.contrib.estimator.BaseDataHandler <../reference Implement the interface to load the data features. - `setup_label` - Implement the interface to load the data labels and calculate user's labels. + Implement the interface to load the data labels and calculate the users' labels. - `setup_processed_data` Implement the interface for data preprocessing, such as preparing feature columns, discarding blank lines, and so on. -Qlib also provides two functions to help user init the data handler, user can override them for user's need. +Qlib also provides two functions to help users init the data handler, users can override them for users' needs. - `_init_kwargs` - User can init the kwargs of the data handler in this function, some kwargs may be used when init the raw df. 
+ Users can initialize the kwargs of the data handler in this function; some kwargs may be used when initializing the raw df. Kwargs are the other attributes in data.args, like dropna_label, dropna_feature - `_init_raw_df` - User can init the raw df, feature names and label names of data handler in this function. - If the index of feature df and label df are not same, user need to override this method to merge them (e.g. inner, left, right merge). + Users can initialize the raw df, feature names, and label names of the data handler in this function. + If the indexes of the feature df and the label df are not the same, users need to override this method to merge them (e.g. inner, left, right merge). If users want to load features and labels by config, users can inherit ``qlib.contrib.estimator.handler.ConfigDataHandler``, ``Qlib`` also have provided some preprocess method in this subclass. If users want to use qlib data, `QLibDataHandler` is recommended. Users can inherit their custom class from `QLibDataHandler`, which is also a subclass of `ConfigDataHandler`. @@ -160,7 +187,8 @@ If users want to use qlib data, `QLibDataHandler` is recommended. Users can inhe Usage -------------- -'Data Handler' can be used as a single module, which provides the following mehtod: + +``Data Handler`` can be used as a single module, which provides the following methods: - `get_split_data` - According to the start and end dates, return features and labels of the pandas DataFrame type used for the 'Model' @@ -178,21 +206,21 @@ Example Know more about how to run ``Data Handler`` with ``estimator``, please refer to `Estimator `_. -Qlib provides implemented data handler `QLibDataHandlerV1`. The following example shows how to run 'QLibDataHandlerV1' as a single module. +Qlib provides an implemented data handler `QLibDataHandlerClose`. The following example shows how to run `QLibDataHandlerClose` as a single module. -.. note:: User needs to initialize ``Qlib`` with `qlib.init` first, please refer to `initialization `_. +..
note:: Users need to initialize ``Qlib`` with `qlib.init` first, please refer to `initialization <../start/initialization.html>`_. .. code-block:: Python - from qlib.contrib.estimator.handler import QLibDataHandlerV1 + from qlib.contrib.estimator.handler import QLibDataHandlerClose from qlib.contrib.model.gbdt import LGBModel DATA_HANDLER_CONFIG = { "dropna_label": True, "start_date": "2007-01-01", "end_date": "2020-08-01", - "market": "csi500", + "market": "csi300", } TRAINER_CONFIG = { @@ -204,7 +232,7 @@ Qlib provides implemented data handler `QLibDataHandlerV1`. The following exampl "test_end_date": "2020-08-01", } - exampleDataHandler = QLibDataHandlerV1(**DATA_HANDLER_CONFIG) + exampleDataHandler = QLibDataHandlerClose(**DATA_HANDLER_CONFIG) # example of 'get_split_data' x_train, y_train, x_validate, y_validate, x_test, y_test = exampleDataHandler.get_split_data(**TRAINER_CONFIG) @@ -222,22 +250,17 @@ Also, the above example has been given in ``examples.estimator.train_backtest_an API --------- -To know more abot ``Data Handler``, please refer to `Data Handler API <../reference/api.html#handler>`_. +To know more about ``Data Handler``, please refer to `Data Handler API <../reference/api.html#handler>`_. Cache ========== -``Cache`` is an optional module that helps accelerate providing data by saving some frequently-used data as cache file. +``Cache`` is an optional module that helps accelerate providing data by saving some frequently-used data as cache file. ``Qlib`` provides a `Memcache` class to cache the most-frequently-used data in memory, an inheritable `ExpressionCache` class and an inheritable `DatasetCache` class. -Memory Cache --------------- +Global Memory Cache +--------------------- -Base Class & Interface -~~~~~~~~~~~~~~~~~~~~~~~ - -``Qlib`` provides a `Memcache` class to cache the most-frequently-used data in memory, an inheritable `ExpressionCache` class, and an inheritable `DatasetCache` class. 
- -`Memcache` is a memory cache mechanism that composes of three `MemCacheUnit` instances to cache **Calendar**, **Instruments**, and **Features**. The MemCache is defined globally in `cache.py` as `H`. User can use `H['c'], H['i'], H['f']` to get/set memcache. +`Memcache` is a global memory cache mechanism that composes of three `MemCacheUnit` instances to cache **Calendar**, **Instruments**, and **Features**. The `MemCache` is defined globally in `cache.py` as `H`. Users can use `H['c'], H['i'], H['f']` to get/set `memcache`. .. autoclass:: qlib.data.cache.MemCacheUnit :members: @@ -246,60 +269,42 @@ Base Class & Interface :members: -Disk Cache --------------- - -Base Class & Interface -~~~~~~~~~~~~~~~~~~~~~~~ - -`ExpressionCache` is a disk cache mechanism that saves expressions such as **Mean($close, 5)**. Users can inherit this base class to define their own cache mechanism. Users need to override `self._uri` method to define how their cache file path is generated, `self._expression` method to define what data they want to cache and how to cache it. - -`DatasetCache` is a disk cache mechanism that saves datasets. A certain dataset is regulated by a stockpool configuration (or a series of instruments, though not recommended), a list of expressions or static feature fields, the start time and end time for the collected features and the frequency. Users need to override `self._uri` method to define how their cache file path is generated, `self._expression` method to define what data they want to cache and how to cache it. - -`ExpressionCache` and `DatasetCache` actually provides the same interfaces with `ExpressionProvider` and `DatasetProvider` so that the disk cache layer is transparent to users and will only be used if they want to define their own cache mechanism. The users can plug the cache mechanism into the server system by assigning the cache class they want to use in `config.py`: - -.. 
code-block:: python - - 'ExpressionCache': 'ServerExpressionCache', - 'DatasetCache': 'ServerDatasetCache', - -Users can find the cache interface here. - ExpressionCache -^^^^^^^^^^^^^^^^^^^^ +----------------- + +`ExpressionCache` is a cache mechanism that saves expressions such as **Mean($close, 5)**. Users can inherit this base class to define their own cache mechanism that saves expressions according to the following steps. + +- Override `self._uri` method to define how the cache file path is generated +- Override `self._expression` method to define what data will be cached and how to cache it. + +The following shows the details about the interfaces: .. autoclass:: qlib.data.cache.ExpressionCache :members: +``Qlib`` has currently provided implemented disk cache `DiskExpressionCache` which inherits from `ExpressionCache` . The expressions data will be stored in the disk. + DatasetCache -^^^^^^^^^^^^^^^^^^^^ +----------------- + +`DatasetCache` is a cache mechanism that saves datasets. A certain dataset is regulated by a stock pool configuration (or a series of instruments, though not recommended), a list of expressions or static feature fields, the start time, and end time for the collected features and the frequency. Users can inherit this base class to define their own cache mechanism that saves datasets according to the following steps. + +- Override `self._uri` method to define how their cache file path is generated +- Override `self._expression` method to define what data will be cached and how to cache it. + +The following shows the details about the interfaces: .. autoclass:: qlib.data.cache.DatasetCache :members: +``Qlib`` has currently provided implemented disk cache `DiskDatasetCache` which inherits from `DatasetCache` . The datasets data will be stored in the disk. -Implemented Disk Cache -~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
note:: - - If the user does not use QlibServer, please ignore the content of this section - -Qlib has currently provided `ServerExpressionCache` class and `ServerDatasetCache` class as the cache mechanisms used for QlibServer. The class interface and file structure designed for server cache mechanism is listed below. - -DiskExpressionCache -^^^^^^^^^^^^^^^^^^^^ - -.. autoclass:: qlib.data.cache.ServerExpressionCache - -DiskDatasetCache -^^^^^^^^^^^^^^^^^^^^ - -.. autoclass:: qlib.data.cache.ServerDatasetCache Data and Cache File Structure -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +================================== + +We've specially designed a file structure to manage data and cache, please refer to the `File storage design section in Qlib paper `_ for detailed information.The file structure of data and cache is listed as follows. .. code-block:: json @@ -317,7 +322,7 @@ Data and Cache File Structure - close.day.bin - ... - ... - [cached data] updated by server when raw data is updated + [cached data] updated when raw data is updated - calculated features/ - sh600000/ - [hash(instrtument, field_expression, freq)] @@ -331,3 +336,5 @@ Data and Cache File Structure - .index : an assorted index file recording the line index of all calendars - ... + +.. TODO: refer to paper diff --git a/docs/component/estimator.rst b/docs/component/estimator.rst index 2556ae760..c8c0896ed 100644 --- a/docs/component/estimator.rst +++ b/docs/component/estimator.rst @@ -7,10 +7,10 @@ Estimator: Workflow Management Introduction =================== -The components in `Qlib Framework <../introduction/introduction.html#framework>`_ is designed in a loosely-coupled way. Users could build their own quant research workflow with these components like `Example `_ +The components in `Qlib Framework <../introduction/introduction.html#framework>`_ are designed in a loosely-coupled way. 
Users could build their own Quant research workflow with these components like `Example `_ -Besides, ``Qlib`` provides more user-friendly interfaces named ``Estimator`` to automatically run the whole workflow defined by a config. A concrete execution of the whole workflow is called an `experiment`. +Besides, ``Qlib`` provides more user-friendly interfaces named ``Estimator`` to automatically run the whole workflow defined by configuration. A concrete execution of the whole workflow is called an `experiment`. With ``Estimator``, user can easily run an `experiment`, which includes the following steps: - Data @@ -22,18 +22,13 @@ With ``Estimator``, user can easily run an `experiment`, which includes the foll - Saving & loading - Evaluation(Back-testing) -For each `experiment`, ``Qlib`` will capture the details of model training, performance evalution results and basic infomation(e.g. names, ids). The captured data will be stored in backend-storge(disk or database). +For each `experiment`, ``Qlib`` will capture the model training details, performance evaluation results and basic information (e.g. names, ids). The captured data will be stored in backend-storage (disk or database). -Example +Complete Example =================== -The following is an example: - -.. note:: Make sure install the latest version of `qlib`, please refer to `Qlib installation <../start/installation.html>`_. - -If users want to use the models and data provided by `Qlib`, they only need to do as follows. - -First, Write a simple configuration file as following, +Before getting into details, here is a complete example of ``Estimator``, which defines the workflow in typical Quant research. +Below is a typical config file of ``Estimator``. .. 
code-block:: YAML @@ -90,36 +85,37 @@ First, Write a simple configuration file as following, provider_uri: "~/.qlib/qlib_data/cn_data" region: "cn" - -Then run the following command: +After saving the config into `configuration.yaml`, users could start the workflow and test their ideas with the single command below. .. code-block:: bash estimator -c configuration.yaml -.. note:: 'estimator' is a built-in command of our program. +.. note:: The `estimator` command will be placed in a directory on your $PATH when installing ``Qlib``. Configuration File =================== -Before using ``estimator``, users need to prepare a configuration file. The following shows how to prepare each part of the configuration file. +Let's get into the details of ``Estimator`` in this section. -Experiment Field +Before using ``estimator``, users need to prepare a configuration file. The following content shows how to prepare each part of the configuration file. + +Experiment Section -------------------- -First, the configuration file needs to have a field about the experiment, whose key is `experiment`. This field and its contents determine how `estimator` tracks and persists this `experiment`. ``Qlib`` used `sacred`, a lightweight open-source tool designed to configure, organize, generate logs, and manage experiment results. The field `experiment` will determine the partial behavior of `sacred`. +First, the configuration file needs to contain a section named `experiment` that holds the basic information. This section describes how `estimator` tracks and persists the current `experiment`. ``Qlib`` uses `sacred`, a lightweight open-source tool, to configure, organize, generate logs, and manage experiment results. Part of the behavior of `sacred` is determined by the `experiment` section.
-Usually, in the running process of `estimator`, those following will be managed by `sacred`: +Following files will be saved by `sacred` after `estimator` finish an `experiment`: - `model.bin`, model binary file - `pred.pkl`, model prediction result file - `analysis.pkl`, backtest performance analysis file -- `positions.pkl`, backtest position record file +- `positions.pkl`, backtest position records file - `run`, the experiment information object, usually contains some meta information such as the experiment name, experiment date, etc. -Usually, it should contain the following: +Here is the typical configuration of `experiment section` .. code-block:: YAML @@ -138,14 +134,14 @@ Usually, it should contain the following: The meaning of each field is as follows: - `name` - The experiment name, str type, `sacred` will use this experiment name as an identifier for some important internal processes. Usually, users can see this field in `sacred` by `run` object. The default value is `test_experiment`. + The experiment name, str type, `sacred _` will use this experiment name as an identifier for some important internal processes. Users can find this field in `run` object of `sacred`. The default value is `test_experiment`. -- `observer_type` - Observer type, str type, there are two values which are `file_storage` and `mongo` respectively. If it is `file_storage`, all the above-mentioned managed contents will be stored in the `dir` directory, separated by the number of times of experiments as a subfolder. If it is `mongo`, the content will be stored in the database. The default is `file_storage`. +- `observer_type` + Observer type, str type, there are two choices which include `file_storage` and `mongo` respectively. If `file_storage` is selected, all the above-mentioned managed contents will be stored in the `dir` directory, separated by the number of times of experiments as a subfolder. If it is `mongo`, the content will be stored in the database. 
The default is `file_storage`. - For `file_storage` observer. - - `dir` - Directory url, str type, directory for `file_storage` observer type, files captures and managed by sacred with observer type of `file_storage` will be save to this directory, default is the directory of `config.json`. + - `dir` + Directory URL, str type, directory for `file_storage` observer type, files captured and managed by sacred with `file_storage` observer will be saved to this directory, which is the same directory as `config.json` by default. - For `mongo` observer. - `mongo_url` @@ -155,15 +151,17 @@ The meaning of each field is as follows: Database name, str type, required if the observer type is `mongo`. - `finetune` - Estimator will produce a model based on this flag + ``Estimator``'s behaviors to train models will base on this flag. + If you just want to train models from scratch each time instead of based on existing models, please leave `finetune=false`. Otherwise please read the + details below. The following table is the processing logic for different situations. ========== =========================================== ==================================== =========================================== ========================================== . Static Rolling - . Finetune=True Finetune=False Finetune=True Finetune=False + . 
finetune:true finetune:false finetune:true finetune:false ========== =========================================== ==================================== =========================================== ========================================== - Train - Need to provide model(Static or Rolling) - No need to provide model - Need to provide model(Static or Rolling) - Need to provide model(Static or Rolling) + Train - Need to provide model (Static or Rolling) - No need to provide model - Need to provide model (Static or Rolling) - Need to provide model (Static or Rolling) - The args in model section will be - The args in model section will be - The args in model section will be - The args in model section will be used for finetuning used for training used for finetuning used for finetuning - Update based on the provided model - Train model from scratch - Update based on the provided model - Based on the provided model update @@ -185,34 +183,40 @@ The meaning of each field is as follows: 3. If `loader.model_index` is None: - In 'Static Finetune=True', if provide 'Rolling', use the last model to update. - - For RollingTrainer with Finetune=Ture. + - For `RollingTrainer` with Finetune=True. - - If StaticTrainer is used in loader, the model will be used for initialization for finetuning. + - If `StaticTrainer` is used in loader, the model will be used for initialization for finetuning. - - If RollingTrainer is used in loader, the existing models will be used without any modification and the new models will be initialized with the model in the last period and finetune one by one. + - If `RollingTrainer` is used in loader, the existing models will be used without any modification and the new models will be initialized with the model in the last period and finetune one by one. - `exp_info_path` - experiment info save path, str type, save the experiment info and model prediction score after the experiment is finished. 
Optional parameter, the default value is `config_file_dir/ex_name/exp_info.json` + Save path of the experiment info, str type. The experiment info and the model `prediction score` are saved to this path after the experiment is finished. Optional parameter; the default value is `<config_file_dir>/ex_name/exp_info.json`. - `mode` - `train` or `test`, str type, if `mode` is test, it will load the model according to the parameters of `loader`. The default value is `train`. - Also note that when the load model failed, it will `fit` model. + `train` or `test`, str type. + - `test mode` is designed for inference. Under `test mode`, it will load the model according to the parameters of `loader` and skip model training. + - `train mode` is the default value. It will train new models by default. + Please note that when it fails to load the model, it will fall back to `fit` the model. + .. note:: - if users choose `mode` test, they need to make sure: + If users choose `test mode`, they need to make sure: - The loader of `test_start_date` must be less than or equal to the current `test_start_date`. - If other parameters of the `loader` model args are different, a warning will appear. - `loader` - If the `mode` is `test` or `finetune` is `true`, it will be used. + If you just want to train models from scratch each time instead of basing them on existing models, please ignore the `loader` section. Otherwise, please read the + details below. + + The `loader` section only works when the `mode` is `test` or `finetune` is `true`. - `model_index` Model index, int type. The index of the loaded model in loader_models (starting at 0) for the first `finetune`. The default value is None. - `exp_info_path` - Loader model experiment info path, str type.
If the field exists, the following parameters will be parsed from `exp_info_path`, and the following parameters will not work. This field and `id` must exist one. + Loader model experiment info path, str type. If this field exists, the following parameters will be parsed from `exp_info_path`, and the values given for the following parameters will not take effect. At least one of this field and `id` must be provided. - `id` The experiment id of the model that needs to be loaded, int type. If the `mode` is `test`, this value is required. This field and `exp_info_path` must exist one. @@ -222,7 +226,8 @@ The meaning of each field is as follows: - `observer_type` The experiment observer type of the model that needs to be loaded, str type. The default value is the current experiment `observer_type`. - .. note:: The observer type is a concept of the `sacred` module, which determines how files, standard input and output which are managed by sacred are stored. + + .. note:: The observer type is a concept of the `sacred` module, which determines how the files and the standard input and output managed by `sacred` are stored. - `file_storage` @@ -249,11 +254,11 @@ The meaning of each field is as follows: .. note:: - If users choose mongo observer, they need to make sure: - - have an environment with the mongodb installed and a mongo database dedicated for storing the experiments results. - - The python environment(the version of python and package) to run the experiments and the one to fetch the results are consistent. + If users choose the mongo observer, they need to make sure: + - They have an environment with MongoDB installed and a mongo database dedicated to storing the results of the experiments. + - The python environment (the version of python and package) to run the experiments and the one to fetch the results are consistent. -Model Field +Model Section ----------------- Users can use a specified model by configuration with hyper-parameters. @@ -261,7 +266,7 @@ Users can use a specified model by configuration with hyper-parameters. Custom Models ~~~~~~~~~~~~~~~~~ -Qlib support custom models, but it must be a subclass of the `qlib.contrib.model.Model`, the config for custom model may be as following.
+Qlib supports custom models, but it must be a subclass of the `qlib.contrib.model.Model`, the config for a custom model may be as following. .. code-block:: YAML @@ -274,12 +279,12 @@ Qlib support custom models, but it must be a subclass of the `qlib.contrib.model The class `SomeModel` should be in the module `custom_model`, and ``Qlib`` could parse the `module_path` to load the class. -To Know more about ``Model``, please refer to `Model `_. +To know more about ``Model``, please refer to `Model `_. -Data Field +Data Section ----------------- -``Data Handler`` can be used to load raw data, prepare features and label columns, preprocess data(standardization, remove NaN, etc.), split training, validation, and test sets. It is a subclass of `qlib.contrib.estimator.handler.BaseDataHandler`. +``Data Handler`` can be used to load raw data, prepare features and label columns, preprocess data (standardization, remove NaN, etc.), split training, validation, and test sets. It is a subclass of `qlib.contrib.estimator.handler.BaseDataHandler`. Users can use the specified data handler by config as follows. @@ -310,32 +315,32 @@ Users can use the specified data handler by config as follows. fend_time: 2018-12-11 - `class` - Data handler class, str type, which should be a subclass of `qlib.contrib.estimator.handler.BaseDataHandler`, and implements 5 important interfaces for loading features, loading raw data, preprocessing raw data, slicing train, validation, and test data. The default value is `ALPHA360`. If users want to write a data handler to retrieve the data in qlib, `QlibDataHandler` is suggested. + Data handler class, str type, which should be a subclass of `qlib.contrib.estimator.handler.BaseDataHandler`, and implements 5 important interfaces for loading features, loading raw data, preprocessing raw data, slicing train, validation, and test data. The default value is `ALPHA360`. 
If users want to write a data handler to retrieve the data in ``Qlib``, `QlibDataHandler` is suggested. - `module_path` - The module path, str type, absolute url is also supported, indicates the path of the `class` implementation of data processor class. The default value is `qlib.contrib.estimator.handler`. + The module path, str type, absolute url is also supported, indicates the path of the `class` implementation of the data processor class. The default value is `qlib.contrib.estimator.handler`. - `args` Parameters used for ``Data Handler`` initialization. - `train_start_date` - Training start time, str type, default value is `2005-01-01`. + Training start time, str type, the default value is `2005-01-01`. - `start_date` Data start date, str type. - `end_date` - Data end date, str type. the data from start_date to end_date decides which part of data will be loaded in datahandler, users can only use these data in the following parts. + Data end date, str type. the data from start_date to end_date decides which part of data will be loaded in `datahandler`, users can only use these data in the following parts. - `dropna_feature` (Optional in args) - Drop Nan feature, bool type, default value is False. + Drop Nan feature, bool type, the default value is False. - `dropna_label` (Optional in args) - Drop Nan label, bool type, default value is True. Some multi-label tasks will use this. + Drop Nan label, bool type, the default value is True. Some multi-label tasks will use this. - `normalize_method` (Optional in args) - Normalzie data by given method. str type. ``Qlib`` give two normalize method, `MinMax` and `Std`. - If users wants to build their own method, please override `_process_normalize_feature`. + Normalize data by a given method. str type. ``Qlib`` gives two normalizing methods, `MinMax` and `Std`. + If users want to build their own method, please override `_process_normalize_feature`. - `filter` Dynamically filtering the stocks based on the filter pipeline. 
@@ -353,7 +358,7 @@ Users can use the specified data handler by config as follows. The module path, str type. - `args` - The filter class parameters, this parameters are set according to the `class`, and all the parameters as kwargs to `class`. + The filter class parameters, these parameters are set according to the `class`, and all the parameters as kwargs to `class`. Custom Data Handler ~~~~~~~~~~~~~~~~~~~~~~ @@ -371,15 +376,15 @@ Qlib support custom data handler, but it must be a subclass of the ``qlib.contri The class `SomeDataHandler` should be in the module `custom_data_handler`, and ``Qlib`` could parse the `module_path` to load the class. -If users want to load features and labels by config, they can inherit ``qlib.contrib.estimator.handler.ConfigDataHandler``, ``Qlib`` also has provided some preprocess method in this subclass. -If users want to use qlib data, `QLibDataHandler` is recommended, from which users can inherit custom class. `QLibDataHandler` is also a subclass of `ConfigDataHandler`. +If users want to load features and labels by config, they can inherit ``qlib.contrib.estimator.handler.ConfigDataHandler``, ``Qlib`` also has provided some preprocess methods in this subclass. +If users want to use qlib data, `QLibDataHandler` is recommended, from which users can inherit the custom class. `QLibDataHandler` is also a subclass of `ConfigDataHandler`. -To Know more about ``Data Handler``, please refer to `Data Framework&Usage `_. +To know more about ``Data Handler``, please refer to `Data Framework&Usage `_. -Trainer Field +Trainer Section ----------------- -Users can specify the trainer ``Trainer`` by the config file, which is subclass of ``qlib.contrib.estimator.trainer.BaseTrainer`` and implement three important interfaces for training the model, restoring the model, and getting model predictions as follows. 
+Users can specify the trainer ``Trainer`` by the config file, which is a subclass of ``qlib.contrib.estimator.trainer.BaseTrainer`` and implement three important interfaces for training the model, restoring the model, and getting model predictions as follows. - `train` Implement this interface to train the model. @@ -447,7 +452,7 @@ Users can specify `trainer` with the configuration file: Custom Trainer ~~~~~~~~~~~~~~~~~~ -Qlib support custom trainer, but it must be a subclass of the `qlib.contrib.estimator.trainer.BaseTrainer`, the config for custom trainer may be as following, +Qlib supports custom trainer, but it must be a subclass of the `qlib.contrib.estimator.trainer.BaseTrainer`, the config for a custom trainer may be as following: .. code-block:: YAML @@ -465,7 +470,7 @@ Qlib support custom trainer, but it must be a subclass of the `qlib.contrib.esti The class `SomeTrainer` should be in the module `custom_trainer`, and ``Qlib`` could parse the `module_path` to load the class. -Strategy Field +Strategy Section ----------------- Users can specify strategy through a config file, for example: @@ -496,7 +501,7 @@ Users can specify strategy through a config file, for example: Custom Strategy ^^^^^^^^^^^^^^^^^^^ -Qlib support custom strategy, but it must be a subclass of the ``qlib.contrib.strategy.strategy.BaseStrategy``, the config for custom strategy may be as following, +Qlib supports custom strategy, but it must be a subclass of the ``qlib.contrib.strategy.strategy.BaseStrategy``, the config for custom strategy may be as following: .. code-block:: YAML @@ -507,9 +512,9 @@ Qlib support custom strategy, but it must be a subclass of the ``qlib.contrib.st The class `SomeStrategy` should be in the module `custom_strategy`, and ``Qlib`` could parse the `module_path` to load the class. -To Know more about ``Strategy``, please refer to `Strategy `_. +To know more about ``Strategy``, please refer to `Strategy `_. 
-Backtest Field +Backtest Section ----------------- Users can specify `backtest` through a config file, for example: @@ -532,7 +537,7 @@ Users can specify `backtest` through a config file, for example: Normal backtest parameters. All the parameters in this section will be passed to the ``qlib.contrib.evaluate.backtest`` function in the form of `**kwargs`. - `benchmark` - Stock index symbol, str or list type, the default value is `None`. + Stock index symbol, str, or list type, the default value is `None`. .. note:: @@ -556,7 +561,7 @@ Users can specify `backtest` through a config file, for example: Subscribe quote fields, array type, the default value is [`deal_price`, $close, $change, $factor]. -Qlib Data Field +Qlib Data Section -------------------- The `qlib_data` field describes the parameters of qlib initialization. @@ -574,65 +579,76 @@ The `qlib_data` field describes the parameters of qlib initialization. - If region == ``qlib.config.REG_CN``, 'qlib' will be initialized in china-stock mode. - If region == ``qlib.config.REG_US``, 'qlib' will be initialized in US-stock mode. -Please refer to `Initialization <../start/initialization.rst>`_. +Please refer to `Initialization <../start/initialization.html>`_. Experiment Result =================== Form of Experimental Result ---------------------------- -The result of the experiment is the result of the backtest, please refer to `Backtest `_. +The result of the experiment is also the result of the ``Intraday Trading (Backtest)``, please refer to `Intraday Trading `_. Get Experiment Result ---------------------------- -Users can check the experiment results from file storage directly, or check the experiment results from database, or get the experiment results through two API of a module `fetcher` provided by ``Qlib``. +Base Class & Interface +~~~~~~~~~~~~~~~~~~~~~~~ -- `get_experiments()` - The API takes two parameters. The first parameter is the experiment name. The default is all experiments. 
The second parameter is the observer type. Users can get the experiment name dictionary with a list of ids and test end date by the API as follows. +Users can check the experiment results from file storage directly, or check the experiment results from the database, or get the experiment results through two interfaces of a base class `Fetcher` provided by ``Qlib``. - .. code-block:: JSON +The `Fetcher` provides the following interfaces: + - `get_experiments(self, exp_name=None):` + The interface takes one parameter. The `exp_name` is the experiment name, the default is all experiments. Users can get the returned dictionary with a list of ids and test end date as follows. - { - "ex_a": [ - { - "id": 1, - "test_end_date": "2017-01-01" - } - ], - "ex_b": [ - ... - ] - } + .. code-block:: JSON + + { + "ex_a": [ + { + "id": 1, + "test_end_date": "2017-01-01" + } + ], + "ex_b": [ + ... + ] + } -- `get_experiment(exp_name, exp_id, fields=None)` - The API takes three parameters, the first parameter is the experiment name, the second parameter is the experiment id, and the third parameter is field list. - If fields is None, will get all fields. - - .. note:: - Currently supported fields: - ['model', 'analysis', 'positions', 'report_normal', 'pred', 'task_config', 'label'] + - `get_experiment(exp_name, exp_id, fields=None)` + The interface takes three parameters. The first parameter is the experiment name, the second parameter is the experiment id, and the third parameter is a list of fields. The default value of `fields` is None, which means all fields. + - .. code-block:: JSON + .. note:: + Currently supported fields: + ['model', 'analysis', 'positions', 'report_normal', 'pred', 'task_config', 'label'] - { - 'analysis': analysis_df, - 'pred': pred_df, - 'positions': positions_dic, - 'report_normal': report_normal_df, - } + Users can get the returned dictionary as follows. + .. 
code-block:: JSON -Here is a simple example of `FileFetcher`, which could fetch files from `file_storage` observer. + { + 'analysis': analysis_df, + 'pred': pred_df, + 'positions': positions_dic, + 'report_normal': report_normal_df, + } +Implemented `Fetcher` s & Examples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``Qlib`` provides two implemented `Fetcher` s as follows. + +`FileFetcher` +^^^^^^^^^^^^^^^ + +The `FileFetcher` is a subclass of `Fetcher`, which could fetch files from `file_storage` observer. The following is an example: .. code-block:: python >>> from qlib.contrib.estimator.fetcher import FileFetcher >>> f = FileFetcher(experiments_dir=r'./') >>> print(f.get_experiments()) - { 'test_experiment': [ { @@ -649,23 +665,25 @@ Here is a simple example of `FileFetcher`, which could fetch files from `file_st } ] } - - >>> print(f.get_experiment('test_experiment', '1')) + risk + excess_return_without_cost mean 0.000605 + std 0.005481 + annualized_return 0.152373 + information_ratio 1.751319 + max_drawdown -0.059055 + excess_return_with_cost mean 0.000410 + std 0.005478 + annualized_return 0.103265 + information_ratio 1.187411 + max_drawdown -0.075024 - risk - sub_bench mean 0.000662 - std 0.004487 - annual 0.166720 - sharpe 2.340526 - mdd -0.080516 - sub_cost mean 0.000577 - std 0.004482 - annual 0.145392 - sharpe 2.043494 - mdd -0.083584 -If users use mongo observer when training, they should initialize their fether with mongo_url + +`MongoFetcher` +^^^^^^^^^^^^^^^ + +The `MongoFetcher` is a subclass of `Fetcher`, which could fetch files from `mongo` observer. Users should initialize the fetcher with `mongo_url`. The following is an example: .. 
code-block:: python diff --git a/docs/component/model.rst b/docs/component/model.rst index e4db517a5..ad01b7eee 100644 --- a/docs/component/model.rst +++ b/docs/component/model.rst @@ -6,14 +6,14 @@ Interday Model: Model Training & Prediction Introduction =================== -``Interday Model`` is designed to make the prediction score about stocks. Users can use the ``Interday Model`` in an automatic workflow by ``Estimator``, please refer to `Estimator `_. +``Interday Model`` is designed to make the `prediction score` about stocks. Users can use the ``Interday Model`` in an automatic workflow by ``Estimator``, please refer to `Estimator `_. -Because the components in ``Qlib`` are designed in a loosely-coupled way, ``Interday Model`` can be used as a independent module also. +Because the components in ``Qlib`` are designed in a loosely-coupled way, ``Interday Model`` can be used as an independent module also. Base Class & Interface ====================== -``Qlib`` provides a base class `qlib.contrib.model.base.Model <../reference/api.html#module-qlib.contrib.model.base>`_, which all models should inherit from. +``Qlib`` provides a base class `qlib.contrib.model.base.Model <../reference/api.html#module-qlib.contrib.model.base>`_ from which all models should inherit. The base class provides the following interfaces: @@ -48,7 +48,7 @@ The base class provides the following interfaces: .. note:: - The number and names of the columns is determined by the data handler, please refer to `Data Handler `_ and `Estimator Data `_. + The number and names of the columns are determined by the data handler, please refer to `Data Handler `_ and `Estimator Data `_. - `y_train`, pd.DataFrame type, train label The following example explains the value of `y_train`: @@ -73,7 +73,7 @@ The base class provides the following interfaces: .. note:: - The number and names of the columns is determined by the ``Data Handler``, please refer to `Data Handler `_. 
+ The number and names of the columns are determined by the ``Data Handler``, please refer to `Data Handler `_. - `x_valid`, pd.DataFrame type, validation feature The format of `x_valid` is same as `x_train` @@ -86,7 +86,7 @@ The base class provides the following interfaces: `w_train` is a pandas DataFrame, whose shape and index is same as `x_train`. The float value in `w_train` represents the weight of the feature at the same position in `x_train`. - `w_valid`(Optional args, default is None), pd.DataFrame type, validation weight - `w_train` is a pandas DataFrame, whose shape and index is same as `x_valid`. The float value in `w_train` represents the weight of the feature at the same position in `x_train`. + `w_valid` is a pandas DataFrame, whose shape and index are the same as `x_valid`. The float value in `w_valid` represents the weight of the feature at the same position in `x_valid`. - `predict(self, x_test, **kwargs)` - Predict test data 'x_test' @@ -115,10 +115,10 @@ For other interfaces such as `save`, `load`, `finetune`, please refer to `Model Example ================== -``Qlib`` provides ``LightGBM`` and ``DNN`` models as the baseline, the following steps shows how to run`` LightGBM`` as an independent module. +``Qlib`` provides ``LightGBM`` and ``DNN`` models as the baseline, the following steps show how to run ``LightGBM`` as an independent module. -- Initialize ``Qlib`` with `qlib.init` first, please refer to `initialization `_. -- Run the following code to get the prediction score `pred_score` +- Initialize ``Qlib`` with `qlib.init` first, please refer to `initialization <../start/initialization.html>`_. +- Run the following code to get the `prediction score` `pred_score` .. 
code-block:: Python from qlib.contrib.estimator.handler import QLibDataHandlerClose diff --git a/docs/component/report.rst b/docs/component/report.rst index 6e9933d69..11d765b33 100644 --- a/docs/component/report.rst +++ b/docs/component/report.rst @@ -6,7 +6,7 @@ Aanalysis: Evaluation & Results Analysis Introduction =================== -``Aanalysis`` is designed to show the graphical reports of ``Intraday Trading`` , which helps users to evaluate and analyse investment portfolios visually. There are the following graphics to view: +``Analysis`` is designed to show the graphical reports of ``Intraday Trading``, which helps users to evaluate and analyse investment portfolios visually. The following are some graphics to view: - analysis_position - report_graph @@ -26,8 +26,8 @@ Users can run the following code to get all supported reports. .. code-block:: python - >>> import qlib.contrib.report as qcr - >>> print(qcr.GRAPH_NAME_LISt) + >>> import qlib.contrib.report as qcr + >>> print(qcr.GRAPH_NAME_LIST) ['analysis_position.report_graph', 'analysis_position.score_ic_graph', 'analysis_position.cumulative_return_graph', 'analysis_position.risk_analysis_graph', 'analysis_position.rank_label_graph', 'analysis_model.model_performance_graph'] .. note:: @@ -36,7 +36,7 @@ Users can run the following code to get all supported reports. -Usage&Example +Usage & Example =================== Usage of `analysis_position.report` @@ -54,9 +54,29 @@ Graphical Result .. 
note:: - Axis X: Trading day - - Axis Y: Accumulated value - - The shaded part above: Maximum drawdown corresponding to `cum return` - - The shaded part below: Maximum drawdown corresponding to `cum ex return wo cost` % + - Axis Y: + - `cum bench` + Cumulative returns series of benchmark + - `cum return wo cost` + Cumulative returns series of portfolio without cost + - `cum return w cost` + Cumulative returns series of portfolio with cost + - `return wo mdd` + Maximum drawdown series of cumulative return without cost + - `return w cost mdd` + Maximum drawdown series of cumulative return with cost + - `cum ex return wo cost` + The `CAR` (cumulative abnormal return) series of the portfolio compared to the benchmark without cost. + - `cum ex return w cost` + The `CAR` (cumulative abnormal return) series of the portfolio compared to the benchmark with cost. + - `turnover` + Turnover rate series + - `cum ex return wo cost mdd` + Drawdown series of `CAR` (cumulative abnormal return) without cost + - `cum ex return w cost mdd` + Drawdown series of `CAR` (cumulative abnormal return) with cost + - The shaded part above: Maximum drawdown corresponding to `cum return wo cost` + - The shaded part below: Maximum drawdown corresponding to `cum ex return wo cost` .. image:: ../_static/img/analysis/report.png @@ -77,7 +97,13 @@ Graphical Result .. note:: - Axis X: Trading day - - Axis Y: `Ref($close, -1)/$close - 1` and `score` IC% + - Axis Y: + - `ic` + The `Pearson correlation coefficient` series between `label` and `prediction score`. + In the above example, the `label` is formulated as `Ref($close, -1)/$close - 1`. Please refer to `Data API Feature `_ for more details. + + - `rank_ic` + The `Spearman's rank correlation coefficient` series between `label` and `prediction score`. .. image:: ../_static/img/analysis/score_ic.png @@ -96,14 +122,13 @@ Graphical Result .. note:: - - Cumulative return graphics. 
- - Axis X: Trading day - - Axis Y: - - Above axis Y: `(((Ref($close, -1)/$close - 1) * weight).sum() / weight.sum()).cumsum()` - - Below axis Y: Daily weight sum - - In the **sell** graph, `y < 0` stands for profit; in other cases, `y > 0` stands for profit. - - In the **buy_minus_sell** graph, the **y** value of the **weight** graph at the bottom is `buy_weight + sell_weight`. - - In each graph, the **red line** in the histogram on the right represents the average.% + - Axis X: Trading day + - Axis Y: + - Above axis Y: `(((Ref($close, -1)/$close - 1) * weight).sum() / weight.sum()).cumsum()` + - Below axis Y: Daily weight sum + - In the **sell** graph, `y < 0` stands for profit; in other cases, `y > 0` stands for profit. + - In the **buy_minus_sell** graph, the **y** value of the **weight** graph at the bottom is `buy_weight + sell_weight`. + - In each graph, the **red line** in the histogram on the right represents the average. .. image:: ../_static/img/analysis/cumulative_return_buy.png @@ -124,24 +149,76 @@ API :members: -.. note:: - - - annual/mdd/sharpe/std graphics - - Axis X: Trading days are grouped by month - - Axis Y: monthly(trading date) value - Graphical Result ~~~~~~~~~~~~~~~~~ +.. note:: + + - general graphics + - `std` + - `excess_return_without_cost` + The `Standard Deviation` of `CAR` (cumulative abnormal return) without cost. + - `excess_return_with_cost` + The `Standard Deviation` of `CAR` (cumulative abnormal return) with cost. + - `annualized_return` + - `excess_return_without_cost` + The `Annualized Rate` of `CAR` (cumulative abnormal return) without cost. + - `excess_return_with_cost` + The `Annualized Rate` of `CAR` (cumulative abnormal return) with cost. + - `information_ratio` + - `excess_return_without_cost` + The `Information Ratio` without cost. + - `excess_return_with_cost` + The `Information Ratio` with cost. + To know more about `Information Ratio`, please refer to `Information Ratio – IR `_. 
+ - `max_drawdown` + - `excess_return_without_cost` + The `Maximum Drawdown` of `CAR` (cumulative abnormal return) without cost. + - `excess_return_with_cost` + The `Maximum Drawdown` of `CAR` (cumulative abnormal return) with cost. + + .. image:: ../_static/img/analysis/risk_analysis_bar.png + :align: center -.. image:: ../_static/img/analysis/risk_analysis_annual.png +.. note:: -.. image:: ../_static/img/analysis/risk_analysis_mdd.png + - annualized_return/max_drawdown/information_ratio/std graphics + - Axis X: Trading days grouped by month + - Axis Y: + - annualized_return graphics + - `excess_return_without_cost_annualized_return` + The `Annualized Rate` series of monthly `CAR` (cumulative abnormal return) without cost. + - `excess_return_with_cost_annualized_return` + The `Annualized Rate` series of monthly `CAR` (cumulative abnormal return) with cost. + - max_drawdown graphics + - `excess_return_without_cost_max_drawdown` + The `Maximum Drawdown` series of monthly `CAR` (cumulative abnormal return) without cost. + - `excess_return_with_cost_max_drawdown` + The `Maximum Drawdown` series of monthly `CAR` (cumulative abnormal return) with cost. + - information_ratio graphics + - `excess_return_without_cost_information_ratio` + The `Information Ratio` series of monthly `CAR` (cumulative abnormal return) without cost. + - `excess_return_with_cost_information_ratio` + The `Information Ratio` series of monthly `CAR` (cumulative abnormal return) with cost. + - std graphics + - `excess_return_without_cost_std` + The `Standard Deviation` series of monthly `CAR` (cumulative abnormal return) without cost. + - `excess_return_with_cost_std` + The `Standard Deviation` series of monthly `CAR` (cumulative abnormal return) with cost. + -.. image:: ../_static/img/analysis/risk_analysis_sharpe.png +.. image:: ../_static/img/analysis/risk_analysis_annualized_return.png + :align: center + +.. 
image:: ../_static/img/analysis/risk_analysis_max_drawdown.png + :align: center + +.. image:: ../_static/img/analysis/risk_analysis_information_ratio.png + :align: center .. image:: ../_static/img/analysis/risk_analysis_std.png + :align: center Usage of `analysis_position.rank_label` @@ -161,13 +238,22 @@ Graphical Result - hold/sell/buy graphics: - Axis X: Trading day - - Axis Y: Percentage of `'Ref($close, -1)/$close - 1'.rank(ascending=False) / (number of lines on the day) * 100` every trading day. (`ascending=False`: The higher the value, the higher the ranking)% + - Axis Y: + Average `ranking ratio` of `label` for stocks that are held/sold/bought on the trading day. + + In the above example, the `label` is formulated as `Ref($close, -1)/$close - 1`. The `ranking ratio` can be formulated as follows. + .. math:: + + ranking\ ratio = \frac{Ascending\ Ranking\ of\ label}{Number\ of\ Stocks\ in\ the\ Portfolio} .. image:: ../_static/img/analysis/rank_label_hold.png + :align: center .. image:: ../_static/img/analysis/rank_label_buy.png + :align: center .. image:: ../_static/img/analysis/rank_label_sell.png + :align: center @@ -181,17 +267,74 @@ API :members: -Graphical Result -~~~~~~~~~~~~~~~~~ +Graphical Results +~~~~~~~~~~~~~~~~~~ + +.. 
note:: + + - cumulative return graphics + - `Group1`: + The `Cumulative Return` series of stocks group with (`ranking ratio` of label <= 20%) + - `Group2`: + The `Cumulative Return` series of stocks group with (20% < `ranking ratio` of label <= 40%) + - `Group3`: + The `Cumulative Return` series of stocks group with (40% < `ranking ratio` of label <= 60%) + - `Group4`: + The `Cumulative Return` series of stocks group with (60% < `ranking ratio` of label <= 80%) + - `Group5`: + The `Cumulative Return` series of stocks group with (80% < `ranking ratio` of label) + - `long-short`: + The Difference series between `Cumulative Return` of `Group1` and of `Group5` + - `long-average` + The Difference series between `Cumulative Return` of `Group1` and average `Cumulative Return` for all stocks. + + The `ranking ratio` can be formulated as follows. + .. math:: + + ranking\ ratio = \frac{Ascending\ Ranking\ of\ label}{Number\ of\ Stocks\ in\ the\ Portfolio} .. image:: ../_static/img/analysis/analysis_model_cumulative_return.png + :align: center + +.. note:: + - long-short/long-average + The distribution of long-short/long-average returns on each trading day + .. image:: ../_static/img/analysis/analysis_model_long_short.png + :align: center + +.. TODO: ask xiao yang for detial + +.. note:: + - Information Coefficient + - The `Pearson correlation coefficient` series between `labels` and `prediction scores` of stocks in portfolio. + - The graphics reports can be used to evaluate the `prediction scores`. .. image:: ../_static/img/analysis/analysis_model_IC.png + :align: center + +.. note:: + - Monthly IC + Monthly average of the `Information Coefficient` .. image:: ../_static/img/analysis/analysis_model_monthly_IC.png + :align: center + +.. note:: + - IC + The distribution of the `Information Coefficient` on each trading day. + - IC Normal Dist. Q-Q + The `Quantile-Quantile Plot` is used for the normal distribution of `Information Coefficient` on each trading day. .. 
image:: ../_static/img/analysis/analysis_model_NDQ.png + :align: center -.. image:: ../_static/img/analysis/analysis_model_auto_correlation.png \ No newline at end of file +.. note:: + - Auto Correlation + - The `Pearson correlation coefficient` series between the latest `prediction scores` and the `prediction scores` `lag` days ago of stocks in portfolio on each trading day. + - The graphics reports can be used to estimate the turnover rate. + + +.. image:: ../_static/img/analysis/analysis_model_auto_correlation.png + :align: center diff --git a/docs/component/strategy.rst b/docs/component/strategy.rst index 10ee714e6..c3aa88303 100644 --- a/docs/component/strategy.rst +++ b/docs/component/strategy.rst @@ -9,9 +9,9 @@ Introduction ``Interday Strategy`` is designed to adopt different trading strategies, which means that users can adopt different algorithms to generate investment portfolios based on the prediction scores of the ``Interday Model``. Users can use the ``Interday Strategy`` in an automatic workflow by ``Estimator``, please refer to `Estimator `_. -Because the componets in ``Qlib`` are designed in a loosely-coupled way, ``Interday Strategy`` can be used as a independent module also. +Because the components in ``Qlib`` are designed in a loosely-coupled way, ``Interday Strategy`` can be used as an independent module also. -``Qlib`` provides several implemented trading strategy. Also, ``Qlib`` supports costom strategy, users can customize strategies according to their own needs. +``Qlib`` provides several implemented trading strategies. Also, ``Qlib`` supports custom strategy, users can customize strategies according to their own needs. Base Class & Interface ====================== @@ -27,7 +27,7 @@ Qlib provides a base class ``qlib.contrib.strategy.BaseStrategy``. All strategy - `generate_order_list` Rerturn the order list. -User can inherit `BaseStrategy` to costomize their strategy class. 
+Users can inherit `BaseStrategy` to customize their strategy class. WeightStrategyBase -------------------- @@ -49,19 +49,18 @@ Qlib alse provides a class ``qlib.contrib.strategy.WeightStrategyBase`` that is - Generate the target amount of stocks from the target position. - Generate the order list from the target amount -Users can inherit `WeightStrategyBase` and implement the inteface `generate_target_weight_position` to costomize their strategy class, which only focuses on the target positions. +Users can inherit `WeightStrategyBase` and implement the interface `generate_target_weight_position` to customize their strategy class, which only focuses on the target positions. Implemented Strategy ==================== -Qlib provides several implemented strategy classes `TopkDropoutStrategy`. - +Qlib provides an implemented strategy class named `TopkDropoutStrategy`. TopkDropoutStrategy ------------------ `TopkDropoutStrategy` is a subclass of `BaseStrategy` and implement the interface `generate_order_list` whose process is as follows. -- Adopt the the ``Topk-Drop`` algorithm to calculate the target amount of each stock +- Adopt the ``Topk-Drop`` algorithm to calculate the target amount of each stock .. note:: ``Topk-Drop`` algorithm: @@ -70,7 +69,7 @@ TopkDropoutStrategy - `Drop`: The number of stocks sold on each trading day Currently, the number of held stocks is `Topk`. - On each trading day, the `Drop` number of held stocks with worst prediction score will be sold, and the same number of unheld stocks with best prediction score will be bought. + On each trading day, the `Drop` number of held stocks with the worst `prediction score` will be sold, and the same number of unheld stocks with the best `prediction score` will be bought. .. 
image:: ../_static/img/topk_drop.png :alt: Topk-Drop @@ -103,17 +102,17 @@ Usage & Example # custom Strategy, refer to: TODO: Strategy API url strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) - # pred_score is the prediction score output by Model + # pred_score is the `prediction score` output by Model report_normal, positions_normal = backtest( pred_score, strategy=strategy, **BACKTEST_CONFIG ) Also, the above example has been given in ``examples\train_backtest_analyze.ipynb``. -To know more about the prediction score `pred_score` output by ``Interday Model``, please refer to `Interday Model: Model Training & Prediction `_. +To know more about the `prediction score` `pred_score` output by ``Interday Model``, please refer to `Interday Model: Model Training & Prediction `_. To know more about ``Intraday Trading``, please refer to `Intraday Trading: Model&Strategy Testing `_. Reference =================== -TO konw more about ``Interday Strategy``, please refer to `Strategy API <../reference/api.html>`_. +To know more about ``Interday Strategy``, please refer to `Strategy API <../reference/api.html>`_. diff --git a/docs/conf.py b/docs/conf.py index 0e815d7e0..265bcf1f1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,23 +34,25 @@ import pkg_resources # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.todo', - 'sphinx.ext.mathjax', - 'sphinx.ext.napoleon', + "sphinx.ext.autodoc", + "sphinx.ext.todo", + "sphinx.ext.mathjax", + "sphinx.ext.napoleon", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" + + # General information about the project. 
project = u"QLib" @@ -71,15 +73,15 @@ release = pkg_resources.get_distribution("qlib").version # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = 'en_US' +language = "en_US" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -102,11 +104,15 @@ todo_include_todos = True # html_theme = "sphinx_rtd_theme" +html_logo = '_static/img/logo/1.png' + + + # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # html_context = { -# "display_github": False, +# "display_github": False, # "last_updated": True, # "commit": True, # "github_user": "Microsoft", @@ -117,15 +123,16 @@ html_theme = "sphinx_rtd_theme" # } # html_theme_options = { - 'collapse_navigation': False, - 'display_version': False, - 'navigation_depth': 3, + "logo_only": True, + "collapse_navigation": False, + "display_version": False, + "navigation_depth": 3, } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -#html_static_path = ['_static'] +# html_static_path = ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. 
@@ -133,11 +140,11 @@ html_theme_options = { # This is required for the alabaster theme # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars html_sidebars = { - '**': [ - 'about.html', - 'navigation.html', - 'relations.html', # needs 'show_related': True theme option to display - 'searchbox.html', + "**": [ + "about.html", + "navigation.html", + "relations.html", # needs 'show_related': True theme option to display + "searchbox.html", ] } @@ -145,7 +152,7 @@ html_sidebars = { # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. -htmlhelp_basename = 'qlibdoc' +htmlhelp_basename = "qlibdoc" # -- Options for LaTeX output --------------------------------------------- @@ -180,10 +187,7 @@ latex_documents = [ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'qlib', u'QLib Documentation', - [author], 1) -] +man_pages = [(master_doc, "qlib", u"QLib Documentation", [author], 1)] # -- Options for Texinfo output ------------------------------------------- @@ -192,13 +196,18 @@ man_pages = [ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'QLib', u'QLib Documentation', - author, 'QLib', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "QLib", + u"QLib Documentation", + author, + "QLib", + "One line description of project.", + "Miscellaneous", + ), ] - # -- Options for Epub output ---------------------------------------------- # Bibliographic Dublin Core info. @@ -217,8 +226,8 @@ epub_copyright = copyright # epub_uid = '' # A list of files that should not be packed into the epub file. 
-epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] -autodoc_member_order = 'bysource' -autodoc_default_flags = ['members'] +autodoc_member_order = "bysource" +autodoc_default_flags = ["members"] diff --git a/docs/hidden/online.rst b/docs/hidden/online.rst index c9c8e58f1..ddf7ce6f6 100644 --- a/docs/hidden/online.rst +++ b/docs/hidden/online.rst @@ -77,20 +77,18 @@ If Your account was saved in "./user_data/", you can see the performance of your ... Result of porfolio: - sub_bench: - risk - mean 0.001157 - std 0.003039 - annual 0.289131 - sharpe 6.017635 - mdd -0.013185 - sub_cost: - risk - mean 0.000800 - std 0.003043 - annual 0.199944 - sharpe 4.155963 - mdd -0.015517 + risk + excess_return_without_cost mean 0.000605 + std 0.005481 + annualized_return 0.152373 + information_ratio 1.751319 + max_drawdown -0.059055 + excess_return_with_cost mean 0.000410 + std 0.005478 + annualized_return 0.103265 + information_ratio 1.187411 + max_drawdown -0.075024 + Here 'SH000905' represents csi500 and 'SH000300' represents csi300 diff --git a/docs/hidden/tuner.rst b/docs/hidden/tuner.rst index 35d606c9c..6d62f899f 100644 --- a/docs/hidden/tuner.rst +++ b/docs/hidden/tuner.rst @@ -185,10 +185,10 @@ This part needs contain these fields: optim_type: max - `report_type` - The type of the report, str type, determines which kind of report you want to use. If you want to use the backtest result type, you can choose `pred_long`, `pred_long_short`, `pred_short`, `sub_bench` and `sub_cost`. If you want to use the model result type, you can only choose `model`. + The type of the report, str type, determines which kind of report you want to use. If you want to use the backtest result type, you can choose `pred_long`, `pred_long_short`, `pred_short`, `excess_return_without_cost` and `excess_return_with_cost`. If you want to use the model result type, you can only choose `model`. 
- `report_factor` - The factor you want to use in the report, str type, determines which factor you want to optimize. If your `report_type` is backtest result type, you can choose `annual`, `sharpe`, `mdd`, `mean` and `std`. If your `report_type` is model result type, you can choose `model_score` and `model_pearsonr`. + The factor you want to use in the report, str type, determines which factor you want to optimize. If your `report_type` is backtest result type, you can choose `annualized_return`, `information_ratio`, `max_drawdown`, `mean` and `std`. If your `report_type` is model result type, you can choose `model_score` and `model_pearsonr`. - `optim_type` The optimization type, str type, determines what kind of optimization you want to do. you can minimize the factor or maximize the factor, so you can choose `max`, `min` or `correlation` at this field. diff --git a/docs/index.rst b/docs/index.rst index 8aa9b93b1..b7dbc9e77 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,13 +16,14 @@ Document Structure .. toctree:: :maxdepth: 3 - :caption: INTRODUCTION: + :caption: GETTING STARTED: - Qlib + Introduction + Quick Start .. toctree:: :maxdepth: 3 - :caption: GETTING STARTED: + :caption: FIRST STEPS: Installation Initialization @@ -46,7 +47,7 @@ Document Structure :caption: ADVANCED TOPICS: Building Formulaic Alphas - + Online & Offline mode .. toctree:: :maxdepth: 3 :caption: REFERENCE: diff --git a/docs/introduction/introduction.rst b/docs/introduction/introduction.rst index f01c00ff9..3e4d11e28 100644 --- a/docs/introduction/introduction.rst +++ b/docs/introduction/introduction.rst @@ -1,45 +1,47 @@ =============================== -``Qlib``: Quantitative Library +``Qlib``: Quantitative Platform =============================== Introduction =================== +.. 
image:: ../_static/img/logo/white_bg_rec+word.png + :align: center + ``Qlib`` is an AI-oriented quantitative investment platform, which aims to realize the potential, empower the research, and create the value of AI technologies in quantitative investment. -With ``Qlib``, users can easily apply their favorite model to create better Quant investment strategy. - +With ``Qlib``, users can easily try their ideas to create better Quant investment strategies. Framework -================== - +=================== + .. image:: ../_static/img/framework.png - :alt: Framework + :align: center -At module level, ``Qlib`` is a platform that consists of the above components. Each components is loose-coupling and can be used stand-alone. +At the module level, Qlib is a platform that consists of above components. The components are designed as loose-coupled modules and each component could be used stand-alone. -====================== ======================================================================== +====================== ============================================================================== Name Description -====================== ======================================================================== -`Data layer` `DataServer` focus on providing high performance infrastructure for user - to retrieve and get raw data. `DataEnhancement` will preprocess the data - and provide the best dataset to be fed in to the models. +====================== ============================================================================== +`Data layer` `DataServer` focuses on providing high-performance infrastructure for users to + manage and retrieve raw data. `DataEnhancement` will preprocess the data and + provide the best dataset to be fed into the models. -`Interday Model` `Interday model` focus on producing forecasting signals(aka. `alpha`). - Models are trained by `Model Creator` and managed by `Model Manager`. - User could choose one or multiple models for forecasting. 
Multiple models - could be combined with `Ensemble` module. +`Interday Model` `Interday model` focuses on producing prediction scores (aka. `alpha`). Models + are trained by `Model Creator` and managed by `Model Manager`. Users could + choose one or multiple models for prediction. Multiple models could be combined + with `Ensemble` module. -`Interday Strategy` `Portfolio Generator` will take forecasting signals as input and output - the orders based on current position to achieve target portfolio. +`Interday Strategy` `Portfolio Generator` will take prediction scores as input and output the + orders based on the current position to achieve the target portfolio. `Intraday Trading` `Order Executor` is responsible for executing orders output by `Interday Strategy` and returning the executed results. -`Analysis` User could get detailed analysis report of forecasting signal and portfolio +`Analysis` Users could get a detailed analysis report of forecasting signals and portfolios in this part. -====================== ======================================================================== +====================== ============================================================================== -- The modules with hand-drawn style is under development and will be released in the future. -- The modules with dashed border is highly user-customizable and extendible. +- The modules with hand-drawn style are under development and will be released in the future. +- The modules with dashed borders are highly user-customizable and extendible. diff --git a/docs/introduction/quick.rst b/docs/introduction/quick.rst new file mode 100644 index 000000000..d91a8fe7e --- /dev/null +++ b/docs/introduction/quick.rst @@ -0,0 +1,93 @@ + +=============================== +Quick Start +=============================== + +Introduction +============== + +This ``Quick Start`` guide tries to demonstrate + +- It's very easy to build a complete Quant research workflow and try users' ideas with ``Qlib``. 
+- Though with public data and simple models, machine learning technologies work very well in practical Quant investment. + + + +Installation +================== + +Users can easily install ``Qlib`` according to the following steps: + +- Before installing ``Qlib`` from source, users need to install some dependencies: + + .. code-block:: + pip install numpy + pip install --upgrade cython + +- Clone the repository and install ``Qlib`` + + .. code-block:: + + git clone https://github.com/microsoft/qlib.git && cd qlib + python setup.py install + +To know more about `installation`, please refer to `Qlib Installation <../start/installation.html>`_. + +Prepare Data +============== + +Load and prepare data by running the following code: + +.. code-block:: + + python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data + +This dataset is created by public data collected by crawler scripts in ``scripts/data_collector/``, which have been released in the same repository. Users could create the same dataset with it. + +To know more about `prepare data`, please refer to `Data Preparation <../component/data.html>`_. + +Auto Quant Research Workflow +==================================== + +``Qlib`` provides a tool named ``Estimator`` to run the whole workflow automatically (including building dataset, training models, backtest and evaluation). Users can start an auto quant research workflow and have a graphical reports analysis according to the following steps: + +- Quant Research Workflow: + - Run ``Estimator`` with `estimator_config.yaml` as follows. + .. code-block:: + + cd examples # Avoid running program under the directory contains `qlib` + estimator -c estimator/estimator_config.yaml + + + - Estimator result + The result of ``Estimator`` is as follows, which is also the result of ``Intraday Trading``. Please refer to `Intraday Trading <../component/backtest.html>`_ for more details about the result. + + .. 
code-block:: python + + risk + excess_return_without_cost mean 0.000605 + std 0.005481 + annualized_return 0.152373 + information_ratio 1.751319 + max_drawdown -0.059055 + excess_return_with_cost mean 0.000410 + std 0.005478 + annualized_return 0.103265 + information_ratio 1.187411 + max_drawdown -0.075024 + + + To know more about `Estimator`, please refer to `Estimator <../component/estimator.html>`_. + +- Graphical Reports Analysis: + - Run ``examples/estimator/analyze_from_estimator.ipynb`` with jupyter notebook + Users can have portfolio analysis or prediction score (model prediction) analysis by running ``examples/estimator/analyze_from_estimator.ipynb``. + - Graphical Reports + Users can get graphical reports about the analysis, please refer to `Analysis: Evaluation & Results Analysis <../component/report.html>`_ for more details. + + + +Custom Model Integration +=============================================== + +``Qlib`` provides ``lightGBM`` and ``Dnn`` model as the baseline of ``Interday Model``. In addition to the default model, users can integrate their own custom models into ``Qlib``. If users are interested in the custom model, please refer to `Custom Model Integration <../start/integration.html>`_. diff --git a/docs/reference/api.rst b/docs/reference/api.rst index 536b09651..ea1a545e2 100644 --- a/docs/reference/api.rst +++ b/docs/reference/api.rst @@ -1,10 +1,11 @@ +.. _api: ================================ API Reference ================================ -Here you can find all ``QLib`` interfaces. +Here you can find all ``Qlib`` interfaces. Data @@ -49,10 +50,10 @@ Cache .. autoclass:: qlib.data.cache.DatasetCache :members: -.. autoclass:: qlib.data.cache.ServerExpressionCache +.. autoclass:: qlib.data.cache.DiskExpressionCache :members: -.. autoclass:: qlib.data.cache.ServerDatasetCache +.. 
autoclass:: qlib.data.cache.DiskDatasetCache :members: diff --git a/docs/start/getdata.rst b/docs/start/getdata.rst index 192712bb5..b352082cb 100644 --- a/docs/start/getdata.rst +++ b/docs/start/getdata.rst @@ -8,7 +8,7 @@ Data Retrieval Introduction ==================== -Users can get stock data by ``Qlib``. Following examples will demonstrate the basic user interface. +Users can get stock data with ``Qlib``. The following examples demonstrate the basic user interface. Examples ==================== @@ -16,122 +16,109 @@ Examples ``QLib`` Initialization: -.. note:: In order to get the data, users need to initialize ``Qlib`` with `qlib.init` first. Please refer to `initialization `_. +.. note:: In order to get the data, users need to initialize ``Qlib`` with `qlib.init` first. Please refer to `initialization `_. -It is recommended to use the following code to initialize qlib: +If users followed steps in `initialization `_ and downloaded the data, they should use the following code to initialize qlib .. code-block:: python - >>> import qlib - >>> qlib.init(provider_uri='~/.qlib/qlib_data/cn_data') + >> import qlib + >> qlib.init(provider_uri='~/.qlib/qlib_data/cn_data') -Load trading calendar with the given time range and frequency: +Load trading calendar with given time range and frequency: .. code-block:: python - >>> from qlib.data import D - >>> D.calendar(start_time='2010-01-01', end_time='2017-12-31', freq='day')[:2] + >> from qlib.data import D + >> D.calendar(start_time='2010-01-01', end_time='2017-12-31', freq='day')[:2] [Timestamp('2010-01-04 00:00:00'), Timestamp('2010-01-05 00:00:00')] -Parse a given market name into a stockpool config: +Parse a given market name into a stock pool config: .. 
code-block:: python - >>> from qlib.data import D - >>> D.instruments(market='all') + >> from qlib.data import D + >> D.instruments(market='all') {'market': 'all', 'filter_pipe': []} -Load instruments of certain stockpool in the given time range: +Load instruments of certain stock pool in the given time range: .. code-block:: python - >>> from qlib.data import D - >>> instruments = D.instruments(market='csi300') - >>> D.list_instruments(instruments=instruments, start_time='2010-01-01', end_time='2017-12-31', as_list=True)[:6] - + >> from qlib.data import D + >> instruments = D.instruments(market='csi300') + >> D.list_instruments(instruments=instruments, start_time='2010-01-01', end_time='2017-12-31', as_list=True)[:6] + ['SH600036', 'SH600110', 'SH600087', 'SH600900', 'SH600089', 'SZ000912'] Load dynamic instruments from a base market according to a name filter .. code-block:: python - >>> from qlib.data import D - >>> from qlib.data.filter import NameDFilter - >>> nameDFilter = NameDFilter(name_rule_re='SH[0-9]{4}55') - >>> instruments = D.instruments(market='csi300', filter_pipe=[nameDFilter]) - >>> D.list_instruments(instruments=instruments, start_time='2015-01-01', end_time='2016-02-15', as_list=True) + >> from qlib.data import D + >> from qlib.data.filter import NameDFilter + >> nameDFilter = NameDFilter(name_rule_re='SH[0-9]{4}55') + >> instruments = D.instruments(market='csi300', filter_pipe=[nameDFilter]) + >> D.list_instruments(instruments=instruments, start_time='2015-01-01', end_time='2016-02-15', as_list=True) + ['SH600655', 'SH601555'] Load dynamic instruments from a base market according to an expression filter .. 
code-block:: python - >>> from qlib.data import D - >>> from qlib.data.filter import ExpressionDFilter - >>> expressionDFilter = ExpressionDFilter(rule_expression='$close>100') - >>> instruments = D.instruments(market='csi300', filter_pipe=[expressionDFilter]) - >>> D.list_instruments(instruments=instruments, start_time='2015-01-01', end_time='2016-02-15', as_list=True) + >> from qlib.data import D + >> from qlib.data.filter import ExpressionDFilter + >> expressionDFilter = ExpressionDFilter(rule_expression='$close>2000') + >> instruments = D.instruments(market='csi300', filter_pipe=[expressionDFilter]) + >> D.list_instruments(instruments=instruments, start_time='2015-01-01', end_time='2016-02-15', as_list=True) + ['SZ000651', 'SZ000002', 'SH600655', 'SH600570'] -To know more about how to use the filter or how to build one's own filter, go to API Reference: `filter API <../reference/api.html#filter>`_ +For more details about filter, please refer `Filter API <../component/data.html>`_. -Load features of certain instruments in given time range: - -.. note:: This is not a recommended way to get features. +Load features of certain instruments in a given time range: .. 
code-block:: python - >>> from qlib.data import D - >>> instruments = ['SH600000'] - >>> fields = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low'] - >>> D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day').head() - $close $volume Ref($close,1) Mean($close,3) \ - instrument datetime - SH600000 2010-01-04 81.809998 17144536.0 NaN 81.809998 - 2010-01-05 82.419998 29827816.0 81.809998 82.114998 - 2010-01-06 80.800003 25070040.0 82.419998 81.676666 - 2010-01-07 78.989998 22077858.0 80.800003 80.736666 - 2010-01-08 79.879997 17019168.0 78.989998 79.889999 + >> from qlib.data import D + >> instruments = ['SH600000'] + >> fields = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low'] + >> D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day').head() + + $close $volume Ref($close, 1) Mean($close, 3) $high-$low + instrument datetime + SH600000 2010-01-04 86.778313 16162960.0 88.825928 88.061483 2.907631 + 2010-01-05 87.433578 28117442.0 86.778313 87.679273 3.235252 + 2010-01-06 85.713585 23632884.0 87.433578 86.641825 1.720009 + 2010-01-07 83.788803 20813402.0 85.713585 85.645322 3.030487 + 2010-01-08 84.730675 16044853.0 83.788803 84.744354 2.047623 - Sub($high,$low) - instrument datetime - SH600000 2010-01-04 2.741158 - 2010-01-05 3.049736 - 2010-01-06 1.621399 - 2010-01-07 2.856926 - 2010-01-08 1.930397 - 2010-01-08 1.930397 +Load features of certain stock pool in a given time range: -Load features of certain stockpool in given time range: - -.. note:: Since the server need to cache all-time data for your request stockpool and fields, it may take longer to process your request than before. But in the second time, your request will be processed and responded in a flash even if you change the timespan. +.. 
note:: With cache enabled, the qlib data server will cache data all the time for the requested stock pool and fields, it may take longer to process the request for the first time than that without cache. But after the first time, requests with the same stock pool and fields will hit the cache and be processed faster even the requested time period changes. .. code-block:: python - >>> from qlib.data import D - >>> from qlib.data.filter import NameDFilter, ExpressionDFilter - >>> nameDFilter = NameDFilter(name_rule_re='SH[0-9]{4}55') - >>> expressionDFilter = ExpressionDFilter(rule_expression='($close/$factor)>100') - >>> instruments = D.instruments(market='csi300', filter_pipe=[nameDFilter, expressionDFilter]) - >>> fields = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low'] - >>> D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day').head() + >> from qlib.data import D + >> from qlib.data.filter import NameDFilter, ExpressionDFilter + >> nameDFilter = NameDFilter(name_rule_re='SH[0-9]{4}55') + >> expressionDFilter = ExpressionDFilter(rule_expression='$close>Ref($close,1)') + >> instruments = D.instruments(market='csi300', filter_pipe=[nameDFilter, expressionDFilter]) + >> fields = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low'] + >> D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day').head() - $close $volume Ref($close, 1) \ - instrument datetime - SH600655 2015-06-15 4342.160156 258706.359375 4530.459961 - 2015-06-16 4409.270020 257349.718750 4342.160156 - 2015-06-17 4312.330078 235214.890625 4409.270020 - 2015-06-18 4086.729980 196772.859375 4312.330078 - 2015-06-19 3678.250000 182916.453125 4086.729980 - Mean($close, 3) high− low - instrument datetime - SH600655 2015-06-15 4480.743327 285.251465 - 2015-06-16 4427.296712 298.301270 - 2015-06-16 4354.586751 356.098145 - 2015-06-16 4269.443359 363.554932 - 2015-06-16 4025.770020 368.954346 + 
$close $volume Ref($close, 1) Mean($close, 3) $high-$low + instrument datetime + SH600655 2010-01-04 2699.567383 158193.328125 2619.070312 2626.097738 124.580566 + 2010-01-08 2612.359619 77501.406250 2584.567627 2623.220133 83.373047 + 2010-01-11 2712.982422 160852.390625 2612.359619 2636.636556 146.621582 + 2010-01-12 2788.688232 164587.937500 2712.982422 2704.676758 128.413818 + 2010-01-13 2790.604004 145460.453125 2788.688232 2764.091553 128.413818 -.. note:: When calling D.features() at client, use parameter 'disk_cache=0' to skip dataset cache, use 'disk_cache=1' to generate and use dataset cache. In addition, when calling at server, you can use 'disk_cache=2' to update the dataset cache. +For more details about features, please refer `Feature API <../component/data.html>`_. + +.. note:: When calling `D.features()` at the client, use parameter `disk_cache=0` to skip dataset cache, use `disk_cache=1` to generate and use dataset cache. In addition, when calling at the server, users can use `disk_cache=2` to update the dataset cache. API ==================== -To know more about how to use the Data, go to API Reference: `Data API <../reference/api.html#Data>`_ +To know more about how to use the Data, go to API Reference: `Data API <../reference/api.html#data>`_ diff --git a/docs/start/initialization.rst b/docs/start/initialization.rst index 94dc6e551..e2b601880 100644 --- a/docs/start/initialization.rst +++ b/docs/start/initialization.rst @@ -9,17 +9,16 @@ Qlib Initialization Initialization ========================= -Please execute the following process to initialize ``Qlib``. +Please follow the steps below to initialize ``Qlib``. -- Download and prepare the Data: execute the following command to download the stock data. +- Download and prepare the Data: execute the following command to download stock data. .. 
code-block:: bash python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data - - Know more about how to use ``get_data.py``, refer to `Raw Data <../advanced/data.html#raw-data>`_. + Please refer to `Raw Data <../component/data.html>`_ for more information about ``get_data.py``. -- Run the initialization code: run the following code in python: +- Initialize Qlib before calling other APIs: run the following code in python. .. code-block:: Python @@ -34,17 +33,17 @@ Please execute the following process to initialize ``Qlib``. Parameters ------------------- -In fact, in addition to `provider_uri` and `region`, `qlib.init` has other parameters. The following are all the parameters of `qlib.init`: +Besides `provider_uri` and `region`, `qlib.init` has other parameters. The following are several important parameters of `qlib.init`: - `provider_uri` - Type: str. The local directory where the data loaded by ``get_data.py`` is stored. + Type: str. The URI of the Qlib data. For example, it could be the location where the data loaded by ``get_data.py`` are stored. - `region` - Type: str, optional parameter(default: ``qlib.config.REG_CN``). - Currently: ``qlib.config.REG_US``('us') and ``qlib.config.REG_CN``('cn') is supported. Different value of ``region`` will - result in different stock market mode. - + Type: str, optional parameter(default: `qlib.config.REG_CN`). + Currently: ``qlib.config.REG_US`` ('us') and ``qlib.config.REG_CN`` ('cn') are supported. Different values of `region` will result in different stock market modes. - ``qlib.config.REG_US``: US stock market. - ``qlib.config.REG_CN``: China stock market. + + Different modes will result in different trading limitations and costs. - `redis_host` Type: str, optional parameter(default: "127.0.0.1"), host of `redis` The lock and cache mechanism relies on redis. @@ -57,4 +56,4 @@ In fact, in addition to `provider_uri` and `region`, `qlib.init` has other param .. 
note:: - If redis connection failed with `redis_host` and `redis_port`, cache will not be used! Please refer to `Cache <../advanced/cache.rst>`_. + If Qlib fails to connect redis via `redis_host` and `redis_port`, cache mechanism will not be used! Please refer to `Cache <../component/data.html#cache>`_ for details. diff --git a/docs/start/installation.rst b/docs/start/installation.rst index 48b4f8e19..48daad310 100644 --- a/docs/start/installation.rst +++ b/docs/start/installation.rst @@ -6,33 +6,34 @@ Installation .. currentmodule:: qlib -How to Install ``Qlib`` -==================== +``Qlib`` Installation +===================== +.. note:: -``Qlib`` only supports Python3, and supports up to Python3.8. + `Qlib` supports both `Windows` and `Linux`. It's recommended to use `Qlib` in `Linux`. ``Qlib`` supports Python3, which is up to Python3.8. -Please execute the following process to install ``Qlib``: +Please follow the steps below to install ``Qlib``: -- Change the directory to ``Qlib``, in which the file ``setup.py`` exists. -- Then, please execute the following command: +- Enter the root directory of ``Qlib``, in which the file ``setup.py`` exists. +- Then, please execute the following command to install the environment dependencies and install ``Qlib``: .. code-block:: bash $ pip install numpy $ pip install --upgrade cython + $ git clone https://github.com/microsoft/qlib.git && cd qlib $ python setup.py install .. note:: - It's recommended to use anaconda/miniconda to setup environment. - ``Qlib`` needs lightgbm and tensorflow packages, use pip to install them. + It's recommended to use anaconda/miniconda to setup the environment. ``Qlib`` needs lightgbm and pytorch packages, use pip to install them. .. note:: - Do not import qlib in the repository folder which contains ``qlib``, otherwise errors may occur. + Do not import qlib in the root directory of ``Qlib``, otherwise, errors may occur. 
-Use the following code to confirm installation successful: +Use the following code to make sure the installation successful: .. code-block:: python @@ -41,3 +42,4 @@ Use the following code to confirm installation successful: +===================== diff --git a/docs/start/integration.rst b/docs/start/integration.rst index a81e77dbd..293de8e03 100644 --- a/docs/start/integration.rst +++ b/docs/start/integration.rst @@ -9,18 +9,18 @@ Introduction Users can integrate their own custom models according to the following steps. -- Define a custom model class, which should be a subclass of the `qlib.contrib.model.base.Model <../reference/api.html#module-qlib.contrib.model.base>`_ -- Write a configuration file that describes the path and parameters of the custom model -- Test the custom model +- Define a custom model class, which should be a subclass of the `qlib.contrib.model.base.Model <../reference/api.html#module-qlib.contrib.model.base>`_. +- Write a configuration file that describes the path and parameters of the custom model. +- Test the custom model. Custom Model Class =========================== The Custom models need to inherit `qlib.contrib.model.base.Model <../reference/api.html#module-qlib.contrib.model.base>`_ and override the methods in it. - Override the `__init__` method - - ``Qlib`` passes the initialized parameters to the \_\_init\_\_ method + - ``Qlib`` passes the initialized parameters to the \_\_init\_\_ method. - The parameter must be consistent with the hyperparameters in the configuration file. - - Code Example: In the following example, the hyperparameter filed of the configuration file should contain parameters such as ‘loss:mse’. + - Code Example: In the following example, the hyperparameter filed of the configuration file should contain parameters such as `loss:mse`. .. 
code-block:: Python def __init__(self, loss='mse', **kwargs): @@ -32,9 +32,9 @@ The Custom models need to inherit `qlib.contrib.model.base.Model <../reference/a - Override the `fit` method - ``Qlib`` calls the fit method to train the model - - The parameters must include training feature 'x_train', training label 'y_train', test feature 'x_valid', test label 'y_valid'at least. - - The parameters could include some optional parameters with default values, such as train weight 'w_train', test weight 'w_valid' and 'num_boost_round = 1000'. - - Code Example: In the following example, 'num_boost_round = 1000' is an optional parameter. + - The parameters must include training feature `x_train`, training label `y_train`, test feature `x_valid`, test label `y_valid` at least. + - The parameters could include some optional parameters with default values, such as train weight `w_train`, test weight `w_valid` and `num_boost_round = 1000`. + - Code Example: In the following example, `num_boost_round = 1000` is an optional parameter. .. code-block:: Python def fit(self, x_train:pd.DataFrame, y_train:pd.DataFrame, x_valid:pd.DataFrame, y_valid:pd.DataFrame, @@ -61,10 +61,10 @@ The Custom models need to inherit `qlib.contrib.model.base.Model <../reference/a ) - Override the `predict` method - - The parameters include the test features - - Return the prediction score - - Please refer to `qlib.contrib.model.base.Model <../reference/api.html#module-qlib.contrib.model.base>`_ for the parameter types of the fit method - - Code Example:In the following example, user need to user dnn to predict the label(such as 'preds') of test data 'x_test' and return it. + - The parameters include the test features. + - Return the `prediction score`. + - Please refer to `qlib.contrib.model.base.Model <../reference/api.html#module-qlib.contrib.model.base>`_ for the parameter types of the fit method. 
+ - Code Example: In the following example, users need to use dnn to predict the label(such as `preds`) of test data `x_test` and return it. .. code-block:: Python def predict(self, x_test:pd.DataFrame, **kwargs)-> numpy.ndarray: @@ -73,9 +73,9 @@ The Custom models need to inherit `qlib.contrib.model.base.Model <../reference/a return self._model.predict(x_test.values) - Override the `score` method - - The parameters include the test features and test labels - - Return the evaluation score of model. It's recommended to adopt the loss between labels and prediction score. - - Code Example:In the following example, user need to calculate the weighted loss with test data 'x_test', test label 'y_test' and the weight 'w_test'. + - The parameters include the test features and test labels. + - Return the evaluation score of the model. It's recommended to adopt the loss between labels and `prediction score`. + - Code Example: In the following example, users need to calculate the weighted loss with test data `x_test`, test label `y_test` and the weight `w_test`. .. code-block:: Python def score(self, x_test:pd.Dataframe, y_test:pd.Dataframe, w_test:pd.DataFrame = None) -> float: @@ -87,8 +87,8 @@ The Custom models need to inherit `qlib.contrib.model.base.Model <../reference/a return scorer(y_test.values, preds, sample_weight=w_test_weight) - Override the `save` method & `load` method - - The `save` method parameter include the a `filename` that represents an absolute path, user need to save model into the path. - - The `load` method parameter include the a `buffer` read from the `filename` passed in `save` method , user need to load model from the `buffer`. + - The `save` method parameter includes the a `filename` that represents an absolute path, user need to save model into the path. + - The `load` method parameter includes the a `buffer` read from the `filename` passed in the `save` method, users need to load model from the `buffer`. - Code Example: .. 
code-block:: Python @@ -104,9 +104,9 @@ The Custom models need to inherit `qlib.contrib.model.base.Model <../reference/a Configuration File ======================= -The configuration file is described in detail in the `estimator <../advanced/estimator.html#Example>`_ document. In order to integrate the custom model into ``Qlib``, you need to modify the "model" field in the configuration file. +The configuration file is described in detail in the `estimator <../component/estimator.html#complete-example>`_ document. In order to integrate the custom model into ``Qlib``, users need to modify the "model" field in the configuration file. -- Example: The following example describes the ‘model’ field of configuration file about the custom lightgbm model mentioned above , where ‘module_path’ is the module path, ‘class’ is the class name, and ‘args’ is the hyperparameter passed into the __init__ method. All parameters in the field is passed to 'self._params' by '\*\*kwargs' in `__init__` except 'loss = mse'. +- Example: The following example describes the `model` field of the configuration file for the custom lightgbm model mentioned above, where `module_path` is the module path, `class` is the class name, and `args` are the hyperparameters passed into the `__init__` method. All parameters in the field are passed to `self._params` by `\*\*kwargs` in `__init__` except `loss = mse`. .. code-block:: YAML @@ -128,7 +128,7 @@ Users could find configuration file of the baseline of the ``Model`` in ``qlib/e Model Testing ===================== -Assuming that the configuration file is ``examples/estimator/estimator_config.yaml``, user can run the following command to test the custom model: +Assuming that the configuration file is ``examples/estimator/estimator_config.yaml``, users can run the following command to test the custom model: .. code-block:: bash @@ -137,10 +137,10 @@ Assuming that the configuration file is ``examples/estimator/estimator_config.ya ..
note:: ``estimator`` is a built-in command of ``Qlib``. -Also, ``Model`` can also be tested as a single module. An example has been given in ``examples.estimator.train_backtest_analyze.ipynb``. +In addition, ``Model`` can be tested as a single module. An example has been given in ``examples/train_backtest_analyze.ipynb``. Reference ===================== -To know more about ``Model``, please refer to `Interday Model: Model Training & Prediction <../advanced/model.rst>`_ and `Model API <../reference/api.html#module-qlib.contrib.model.base>`_. +To know more about ``Model``, please refer to `Interday Model: Model Training & Prediction <../component/model.html>`_ and `Model API <../reference/api.html#module-qlib.contrib.model.base>`_. diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..fb4151f4d --- /dev/null +++ b/examples/README.md @@ -0,0 +1,10 @@ +# Requirements + +Here are the minimal hardware requirements to run the example. +- Memory: 16G +- Free Disk: 5G + + +# NOTE +The results will slightly vary on different OSs (the variance of annualized return will be less than 2%). +The evaluation results in the `README.md` page are from Linux OS.
diff --git a/examples/estimator/estimator_config.yaml b/examples/estimator/estimator_config.yaml index b6d9fd9f7..b0c73af09 100644 --- a/examples/estimator/estimator_config.yaml +++ b/examples/estimator/estimator_config.yaml @@ -14,9 +14,8 @@ model: lambda_l1: 205.6999 lambda_l2: 580.9768 max_depth: 8 - num_leaves: 64 + num_leaves: 210 num_threads: 20 - min_data_in_leaf: 10 data: class: QLibDataHandlerClose args: @@ -52,4 +51,3 @@ qlib_data: # when testing, please modify the following parameters according to the specific environment provider_uri: "~/.qlib/qlib_data/cn_data" region: "cn" - redis_port: 4312 diff --git a/examples/train_and_backtest.py b/examples/train_and_backtest.py index 6c05b1606..db3fc7984 100644 --- a/examples/train_and_backtest.py +++ b/examples/train_and_backtest.py @@ -113,7 +113,7 @@ if __name__ == "__main__": # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb ################################### analysis = dict() - analysis["sub_bench"] = risk_analysis(report_normal["return"] - report_normal["bench"]) - analysis["sub_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"] - report_normal["cost"]) + analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) + analysis["excess_return_with_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"] - report_normal["cost"]) analysis_df = pd.concat(analysis) # type: pd.DataFrame print(analysis_df) diff --git a/examples/train_backtest_analyze.ipynb b/examples/train_backtest_analyze.ipynb index dec184ea2..b80996b54 100644 --- a/examples/train_backtest_analyze.ipynb +++ b/examples/train_backtest_analyze.ipynb @@ -1,355 +1,366 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "from pathlib import Path\n", - "\n", - "import qlib\n", - "import pandas as pd\n", - "from qlib.config import REG_CN\n", - "from 
qlib.contrib.model.gbdt import LGBModel\n", - "from qlib.contrib.estimator.handler import QLibDataHandlerClose\n", - "from qlib.contrib.strategy.strategy import TopkDropoutStrategy\n", - "from qlib.contrib.evaluate import (\n", - " backtest as normal_backtest,\n", - " risk_analysis,\n", - ")\n", - "from qlib.utils import exists_qlib_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# use default data\n", - "# NOTE: need to download data from remote: python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data\n", - "provider_uri = \"~/.qlib/qlib_data/cn_data\" # target_dir\n", - "if not exists_qlib_data(provider_uri):\n", - " print(f\"Qlib data is not found in {provider_uri}\")\n", - " sys.path.append(str(Path.cwd().parent.joinpath(\"scripts\")))\n", - " from get_data import GetData\n", - " GetData().qlib_data_cn(provider_uri)\n", - "qlib.init(provider_uri=provider_uri, region=REG_CN)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MARKET = \"csi300\"\n", - "BENCHMARK = \"SH000300\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# train model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "###################################\n", - "# train model\n", - "###################################\n", - "DATA_HANDLER_CONFIG = {\n", - " \"dropna_label\": True,\n", - " \"start_date\": \"2008-01-01\",\n", - " \"end_date\": \"2020-08-01\",\n", - " \"market\": MARKET,\n", - "}\n", - "\n", - "TRAINER_CONFIG = {\n", - " \"train_start_date\": \"2008-01-01\",\n", - " \"train_end_date\": \"2014-12-31\",\n", - " \"validate_start_date\": \"2015-01-01\",\n", - " \"validate_end_date\": \"2016-12-31\",\n", - " \"test_start_date\": \"2017-01-01\",\n", - " \"test_end_date\": \"2020-08-01\",\n", 
- "}\n", - "\n", - "# use default DataHandler\n", - "# custom DataHandler, refer to: TODO: DataHandler api url\n", - "x_train, y_train, x_validate, y_validate, x_test, y_test = QLibDataHandlerClose(**DATA_HANDLER_CONFIG).get_split_data(**TRAINER_CONFIG)\n", - "\n", - "\n", - "MODEL_CONFIG = {\n", - " \"loss\": \"mse\",\n", - " \"colsample_bytree\": 0.8879,\n", - " \"learning_rate\": 0.0421,\n", - " \"subsample\": 0.8789,\n", - " \"lambda_l1\": 205.6999,\n", - " \"lambda_l2\": 580.9768,\n", - " \"max_depth\": 8,\n", - " \"num_leaves\": 210,\n", - " \"num_threads\": 20,\n", - "}\n", - "# use default model\n", - "# custom Model, refer to: TODO: Model api url\n", - "model = LGBModel(**MODEL_CONFIG)\n", - "model.fit(x_train, y_train, x_validate, y_validate)\n", - "_pred = model.predict(x_test)\n", - "_pred = pd.DataFrame(_pred, index=x_test.index, columns=y_test.columns)\n", - "\n", - "# backtest requires pred_score\n", - "pred_score = pd.DataFrame(index=_pred.index)\n", - "pred_score[\"score\"] = _pred.iloc(axis=1)[0]\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# backtest" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "###################################\n", - "# backtest\n", - "###################################\n", - "STRATEGY_CONFIG = {\n", - " \"topk\": 50,\n", - " \"n_drop\": 5", - "}\n", - "BACKTEST_CONFIG = {\n", - " \"verbose\": False,\n", - " \"limit_threshold\": 0.095,\n", - " \"account\": 100000000,\n", - " \"benchmark\": BENCHMARK,\n", - " \"deal_price\": \"close\",\n", - " \"open_cost\": 0.0005,\n", - " \"close_cost\": 0.0015,\n", - " \"min_cost\": 5,\n", - " \n", - "}\n", - "\n", - "# use default strategy\n", - "# custom Strategy, refer to: TODO: Strategy api url\n", - "strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)\n", - "report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)\n" - ] 
- }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# analyze" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "###################################\n", - "# analyze\n", - "# If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb\n", - "###################################\n", - "analysis = dict()\n", - "analysis[\"sub_bench\"] = risk_analysis(report_normal[\"return\"] - report_normal[\"bench\"])\n", - "analysis[\"sub_cost\"] = risk_analysis(\n", - " report_normal[\"return\"] - report_normal[\"bench\"] - report_normal[\"cost\"]\n", - ")\n", - "analysis_df = pd.concat(analysis) # type: pd.DataFrame\n", - "print(analysis_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# analyze graphs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from qlib.contrib.report import analysis_model, analysis_position" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get label data\n", - "from qlib.data import D\n", - "pred_df_dates = pred_score.index.get_level_values(level='datetime')\n", - "features_df = D.features(D.instruments(MARKET), ['Ref($close, -1)/$close - 1'], pred_df_dates.min(), pred_df_dates.max())\n", - "features_df.columns = ['label']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## analysis position" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### report" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis_position.report_graph(report_normal)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### score IC" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pred_label = 
pd.concat([features_df, pred_score], axis=1, sort=True).reindex(features_df.index)\n", - "analysis_position.score_ic_graph(pred_label)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### cumulative return" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "analysis_position.cumulative_return_graph(positions_normal, report_normal, features_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### risk analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "analysis_position.risk_analysis_graph(analysis_df, report_normal)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### rank label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis_position.rank_label_graph(positions_normal, features_df, pred_df_dates.min(), pred_df_dates.max())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## analysis model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### model performance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "analysis_model.model_performance_graph(pred_label)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, 
+ "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "import qlib\n", + "import pandas as pd\n", + "from qlib.config import REG_CN\n", + "from qlib.contrib.model.gbdt import LGBModel\n", + "from qlib.contrib.estimator.handler import QLibDataHandlerClose\n", + "from qlib.contrib.strategy.strategy import TopkDropoutStrategy\n", + "from qlib.contrib.evaluate import (\n", + " backtest as normal_backtest,\n", + " risk_analysis,\n", + ")\n", + "from qlib.utils import exists_qlib_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# use default data\n", + "# NOTE: need to download data from remote: python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data\n", + "provider_uri = \"~/.qlib/qlib_data/cn_data\" # target_dir\n", + "if not exists_qlib_data(provider_uri):\n", + " print(f\"Qlib data is not found in {provider_uri}\")\n", + " sys.path.append(str(Path.cwd().parent.joinpath(\"scripts\")))\n", + " from get_data import GetData\n", + " GetData().qlib_data_cn(provider_uri)\n", + "qlib.init(provider_uri=provider_uri, region=REG_CN)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MARKET = \"csi300\"\n", + "BENCHMARK = \"SH000300\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# train model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "###################################\n", + "# train model\n", + "###################################\n", + "DATA_HANDLER_CONFIG = {\n", + " \"dropna_label\": True,\n", + " \"start_date\": \"2008-01-01\",\n", + " \"end_date\": \"2020-08-01\",\n", + " \"market\": MARKET,\n", + "}\n", + "\n", + "TRAINER_CONFIG = {\n", + " \"train_start_date\": \"2008-01-01\",\n", + " \"train_end_date\": 
\"2014-12-31\",\n", + " \"validate_start_date\": \"2015-01-01\",\n", + " \"validate_end_date\": \"2016-12-31\",\n", + " \"test_start_date\": \"2017-01-01\",\n", + " \"test_end_date\": \"2020-08-01\",\n", + "}\n", + "\n", + "# use default DataHandler\n", + "# custom DataHandler, refer to: TODO: DataHandler api url\n", + "x_train, y_train, x_validate, y_validate, x_test, y_test = QLibDataHandlerClose(**DATA_HANDLER_CONFIG).get_split_data(**TRAINER_CONFIG)\n", + "\n", + "\n", + "MODEL_CONFIG = {\n", + " \"loss\": \"mse\",\n", + " \"colsample_bytree\": 0.8879,\n", + " \"learning_rate\": 0.0421,\n", + " \"subsample\": 0.8789,\n", + " \"lambda_l1\": 205.6999,\n", + " \"lambda_l2\": 580.9768,\n", + " \"max_depth\": 8,\n", + " \"num_leaves\": 210,\n", + " \"num_threads\": 20,\n", + "}\n", + "# use default model\n", + "# custom Model, refer to: TODO: Model api url\n", + "model = LGBModel(**MODEL_CONFIG)\n", + "model.fit(x_train, y_train, x_validate, y_validate)\n", + "_pred = model.predict(x_test)\n", + "_pred = pd.DataFrame(_pred, index=x_test.index, columns=y_test.columns)\n", + "\n", + "# backtest requires pred_score\n", + "pred_score = pd.DataFrame(index=_pred.index)\n", + "pred_score[\"score\"] = _pred.iloc(axis=1)[0]\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# backtest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "###################################\n", + "# backtest\n", + "###################################\n", + "STRATEGY_CONFIG = {\n", + " \"topk\": 50,\n", + " \"n_drop\": 5}\n", + "BACKTEST_CONFIG = {\n", + " \"verbose\": False,\n", + " \"limit_threshold\": 0.095,\n", + " \"account\": 100000000,\n", + " \"benchmark\": BENCHMARK,\n", + " \"deal_price\": \"close\",\n", + " \"open_cost\": 0.0005,\n", + " \"close_cost\": 0.0015,\n", + " \"min_cost\": 5,\n", + " \n", + "}\n", + "\n", + "# use default strategy\n", + "# custom Strategy, refer 
to: TODO: Strategy api url\n", + "strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)\n", + "report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# analyze" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "###################################\n", + "# analyze\n", + "# If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb\n", + "###################################\n", + "analysis = dict()\n", + "analysis[\"excess_return_without_cost\"] = risk_analysis(report_normal[\"return\"] - report_normal[\"bench\"])\n", + "analysis[\"excess_return_with_cost\"] = risk_analysis(\n", + " report_normal[\"return\"] - report_normal[\"bench\"] - report_normal[\"cost\"]\n", + ")\n", + "analysis_df = pd.concat(analysis) # type: pd.DataFrame\n", + "print(analysis_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# analyze graphs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.contrib.report import analysis_model, analysis_position" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get label data\n", + "from qlib.data import D\n", + "pred_df_dates = pred_score.index.get_level_values(level='datetime')\n", + "features_df = D.features(D.instruments(MARKET), ['Ref($close, -1)/$close - 1'], pred_df_dates.min(), pred_df_dates.max())\n", + "features_df.columns = ['label']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## analysis position" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### report" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"analysis_position.report_graph(report_normal)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### score IC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pred_label = pd.concat([features_df, pred_score], axis=1, sort=True).reindex(features_df.index)\n", + "analysis_position.score_ic_graph(pred_label)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### cumulative return" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "analysis_position.cumulative_return_graph(positions_normal, report_normal, features_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### risk analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "analysis_position.risk_analysis_graph(analysis_df, report_normal)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### rank label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "analysis_position.rank_label_graph(positions_normal, features_df, pred_df_dates.min(), pred_df_dates.max())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## analysis model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### model performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "analysis_model.model_performance_graph(pred_label)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/qlib/__init__.py b/qlib/__init__.py index 1e79540d8..a80c71643 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. -__version__ = "0.4.6.dev" +__version__ = "0.5.0" import os import copy diff --git a/qlib/config.py b/qlib/config.py index 18f34e0d9..687945c54 100644 --- a/qlib/config.py +++ b/qlib/config.py @@ -28,7 +28,7 @@ _default_config = { "default_disk_cache": 1, # 0:skip/1:use "disable_disk_cache": False, # disable disk cache; if High-frequency data generally disable_disk_cache=True "mem_cache_size_limit": 500, - # memory cache expire second, only in used 'ClientDatasetCache' and 'client D.calendar' + # memory cache expire second, only in used 'DatasetURICache' and 'client D.calendar' # default 1 hour "mem_cache_expire": 60 * 60, # memory cache space limit, default 5GB, only in used client @@ -86,8 +86,8 @@ _default_server_config = { "redis_task_db": 1, "kernels": 64, # cache - "expression_cache": "ServerExpressionCache", - "dataset_cache": "ServerDatasetCache", + "expression_cache": "DiskExpressionCache", + "dataset_cache": "DiskDatasetCache", } _default_client_config = { @@ -102,8 +102,8 @@ _default_client_config = { "provider_uri": "~/.qlib/qlib_data/cn_data", # cache # Using parameter 'remote' to announce the client is using server_cache, and the writing access will be disabled. 
- "expression_cache": "ServerExpressionCache", - "dataset_cache": "ServerDatasetCache", + "expression_cache": "DiskExpressionCache", + "dataset_cache": "DiskDatasetCache", "calendar_cache": None, # client config "kernels": 16, diff --git a/qlib/contrib/estimator/estimator.py b/qlib/contrib/estimator/estimator.py index 151b2b002..3a7dce438 100644 --- a/qlib/contrib/estimator/estimator.py +++ b/qlib/contrib/estimator/estimator.py @@ -121,7 +121,7 @@ class Estimator(object): else: raise ValueError("unexpected mode: %s" % self.ex_config.mode) analysis = self.backtest() - self.logger.info(analysis) + print(analysis) self.logger.info( "experiment id: {}, experiment name: {}".format(self.ex.experiment.current_run._id, self.ex_config.name) ) @@ -182,8 +182,8 @@ class Estimator(object): # analysis["pred_long"] = risk_analysis(long_short_reports["long"]) # analysis["pred_short"] = risk_analysis(long_short_reports["short"]) # analysis["pred_long_short"] = risk_analysis(long_short_reports["long_short"]) - analysis["sub_bench"] = risk_analysis(report_normal["return"] - report_normal["bench"]) - analysis["sub_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"] - report_normal["cost"]) + analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) + analysis["excess_return_with_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"] - report_normal["cost"]) analysis_df = pd.concat(analysis) # type: pd.DataFrame TimeInspector.log_cost_time( "Finished generating analysis," " average turnover is: {0:.4f}.".format(report_normal["turnover"].mean()) diff --git a/qlib/contrib/estimator/launcher.py b/qlib/contrib/estimator/launcher.py index d8c4fc276..80717a32c 100644 --- a/qlib/contrib/estimator/launcher.py +++ b/qlib/contrib/estimator/launcher.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-# coding=utf-8 import argparse import importlib diff --git a/qlib/contrib/evaluate.py b/qlib/contrib/evaluate.py index d6149531c..eaece5cf5 100644 --- a/qlib/contrib/evaluate.py +++ b/qlib/contrib/evaluate.py @@ -27,14 +27,15 @@ def risk_analysis(r, N=252): r : pandas.Series daily return series N: int - scaler for annualizing sharpe ratio (day: 250, week: 50, month: 12) + scaler for annualizing information_ratio (day: 250, week: 50, month: 12) """ mean = r.mean() std = r.std(ddof=1) - annual = mean * N - sharpe = mean / std * np.sqrt(N) - mdd = (r.cumsum() - r.cumsum().cummax()).min() - data = {"mean": mean, "std": std, "annual": annual, "sharpe": sharpe, "mdd": mdd} + annualized_return = mean * N + information_ratio = mean / std * np.sqrt(N) + max_drawdown = (r.cumsum() - r.cumsum().cummax()).min() + data = {"mean": mean, "std": std, "annualized_return": annualized_return, + "information_ratio": information_ratio, "max_drawdown": max_drawdown} res = pd.Series(data, index=data.keys()).to_frame("risk") return res diff --git a/qlib/contrib/online/operator.py b/qlib/contrib/online/operator.py index 500e732ff..a00e17d40 100644 --- a/qlib/contrib/online/operator.py +++ b/qlib/contrib/online/operator.py @@ -279,7 +279,7 @@ class Operator(object): self.show(id, path, bench) def show(self, id, path, bench="SH000905"): - """show the newly report (mean, std, sharpe, annual) + """show the newly report (mean, std, information_ratio, annualized_return) Parameters ---------- @@ -299,14 +299,14 @@ class Operator(object): report["bench"] = bench analysis_result = {} r = (report["return"] - report["bench"]).dropna() - analysis_result["sub_bench"] = risk_analysis(r) + analysis_result["excess_return_without_cost"] = risk_analysis(r) r = (report["return"] - report["bench"] - report["cost"]).dropna() - analysis_result["sub_cost"] = risk_analysis(r) + analysis_result["excess_return_with_cost"] = risk_analysis(r) print("Result:") - print("sub_bench:") - 
print(analysis_result["sub_bench"]) - print("sub_cost:") - print(analysis_result["sub_cost"]) + print("excess_return_without_cost:") + print(analysis_result["excess_return_without_cost"]) + print("excess_return_with_cost:") + print(analysis_result["excess_return_with_cost"]) def run(): diff --git a/qlib/contrib/online/user.py b/qlib/contrib/online/user.py index d8a8fdabe..9b33ec24c 100644 --- a/qlib/contrib/online/user.py +++ b/qlib/contrib/online/user.py @@ -53,7 +53,7 @@ class User: def showReport(self, benchmark="SH000905"): """ - show the newly report (mean, std, sharpe, annual) + show the newly report (mean, std, information_ratio, annualized_return) Parameter benchmark : string bench that to be compared, 'SH000905' for csi500 @@ -61,14 +61,14 @@ class User: bench = D.features([benchmark], ["$change"], disk_cache=True).loc[benchmark, "$change"] report = self.account.report.generate_report_dataframe() report["bench"] = bench - analysis_result = {"pred": {}, "sub_bench": {}, "sub_cost": {}} + analysis_result = {"pred": {}, "excess_return_without_cost": {}, "excess_return_with_cost": {}} r = (report["return"] - report["bench"]).dropna() - analysis_result["sub_bench"][0] = risk_analysis(r) + analysis_result["excess_return_without_cost"][0] = risk_analysis(r) r = (report["return"] - report["bench"] - report["cost"]).dropna() - analysis_result["sub_cost"][0] = risk_analysis(r) + analysis_result["excess_return_with_cost"][0] = risk_analysis(r) self.logger.info("Result of porfolio:") - self.logger.info("sub_bench:") - self.logger.info(analysis_result["sub_bench"][0]) - self.logger.info("sub_cost:") - self.logger.info(analysis_result["sub_cost"][0]) + self.logger.info("excess_return_without_cost:") + self.logger.info(analysis_result["excess_return_without_cost"][0]) + self.logger.info("excess_return_with_cost:") + self.logger.info(analysis_result["excess_return_with_cost"][0]) return report diff --git a/qlib/contrib/report/__init__.py b/qlib/contrib/report/__init__.py 
index 06309f412..3638ebfa2 100644 --- a/qlib/contrib/report/__init__.py +++ b/qlib/contrib/report/__init__.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -GRAPH_NAME_LISt = [ +GRAPH_NAME_LIST = [ "analysis_position.report_graph", "analysis_position.score_ic_graph", "analysis_position.cumulative_return_graph", diff --git a/qlib/contrib/report/analysis_model/analysis_model_performance.py b/qlib/contrib/report/analysis_model/analysis_model_performance.py index cf0bff18f..b879a467d 100644 --- a/qlib/contrib/report/analysis_model/analysis_model_performance.py +++ b/qlib/contrib/report/analysis_model/analysis_model_performance.py @@ -35,7 +35,7 @@ def _group_return( # Group t_df = pd.DataFrame( { - "Group-%d" + "Group%d" % (i + 1): pred_label_drop.groupby(level="datetime")["label"].apply( lambda x: x[len(x) // N * i : len(x) // N * (i + 1)].mean() ) @@ -45,11 +45,11 @@ def _group_return( t_df.index = pd.to_datetime(t_df.index) # Long-Short - t_df["long-short"] = t_df["Group-1"] - t_df["Group-%d" % N] + t_df["long-short"] = t_df["Group1"] - t_df["Group%d" % N] # Long-Average t_df["long-average"] = ( - t_df["Group-1"] - pred_label.groupby(level="datetime")["label"].mean() + t_df["Group1"] - pred_label.groupby(level="datetime")["label"].mean() ) t_df = t_df.dropna(how="all") # for days which does not contain label diff --git a/qlib/contrib/report/analysis_position/cumulative_return.py b/qlib/contrib/report/analysis_position/cumulative_return.py index 8efd6cd98..da0d88eba 100644 --- a/qlib/contrib/report/analysis_position/cumulative_return.py +++ b/qlib/contrib/report/analysis_position/cumulative_return.py @@ -228,11 +228,11 @@ def cumulative_return_graph( Graph desc: - Axis X: Trading day - Axis Y: - - Above axis Y: (((Ref($close, -1)/$close - 1) * weight).sum() / weight.sum()).cumsum() + - Above axis Y: `(((Ref($close, -1)/$close - 1) * weight).sum() / weight.sum()).cumsum()` - Below axis Y: Daily weight sum - - In the sell 
graph, y < 0 stands for profit; in other cases, y > 0 stands for profit. - - In the buy_minus_sell graph, the y value of the weight graph at the bottom is buy_weight + sell_weight. - - In each graph, the red line in the histogram on the right represents the average. + - In the **sell** graph, `y < 0` stands for profit; in other cases, `y > 0` stands for profit. + - In the **buy_minus_sell** graph, the **y** value of the **weight** graph at the bottom is `buy_weight + sell_weight`. + - In each graph, the **red line** in the histogram on the right represents the average. :param position: position data :param report_normal: @@ -250,7 +250,7 @@ def cumulative_return_graph( :param label_data: `D.features` result; index is `pd.MultiIndex`, index name is [`instrument`, `datetime`]; columns names is [`label`]. - **The ``label`` T is the change from T to T+1**, it is recommended to use ``close``, example: D.features(D.instruments('csi500'), ['Ref($close, -1)/$close-1']) + **The label T is the change from T to T+1**, it is recommended to use ``close``, example: `D.features(D.instruments('csi500'), ['Ref($close, -1)/$close-1'])` .. code-block:: python diff --git a/qlib/contrib/report/analysis_position/rank_label.py b/qlib/contrib/report/analysis_position/rank_label.py index 6427ac900..75119c597 100644 --- a/qlib/contrib/report/analysis_position/rank_label.py +++ b/qlib/contrib/report/analysis_position/rank_label.py @@ -99,7 +99,7 @@ def rank_label_graph( :param position: position data; **qlib.contrib.backtest.backtest.backtest** result :param label_data: **D.features** result; index is **pd.MultiIndex**, index name is **[instrument, datetime]**; columns names is **[label]**. 
- **The ``label`` T is the change from T to T+1**, it is recommended to use ``close``, example: D.features(D.instruments('csi500'), ['Ref($close, -1)/$close-1']) + **The label T is the change from T to T+1**, it is recommended to use ``close``, example: `D.features(D.instruments('csi500'), ['Ref($close, -1)/$close-1'])` .. code-block:: python diff --git a/qlib/contrib/report/analysis_position/risk_analysis.py b/qlib/contrib/report/analysis_position/risk_analysis.py index 300974195..4341e750a 100644 --- a/qlib/contrib/report/analysis_position/risk_analysis.py +++ b/qlib/contrib/report/analysis_position/risk_analysis.py @@ -32,10 +32,10 @@ def _get_risk_analysis_data_with_report( # analysis["pred_long_short"] = risk_analysis(report_long_short_df["long_short"]) if not report_normal_df.empty: - analysis["sub_bench"] = risk_analysis( + analysis["excess_return_without_cost"] = risk_analysis( report_normal_df["return"] - report_normal_df["bench"] ) - analysis["sub_cost"] = risk_analysis( + analysis["excess_return_with_cost"] = risk_analysis( report_normal_df["return"] - report_normal_df["bench"] - report_normal_df["cost"] @@ -97,7 +97,7 @@ def _get_monthly_risk_analysis_with_report(report_normal_df: pd.DataFrame) -> pd def _get_monthly_analysis_with_feature( - monthly_df: pd.DataFrame, feature: str = "annual" + monthly_df: pd.DataFrame, feature: str = "annualized_return" ) -> pd.DataFrame: """ @@ -156,7 +156,7 @@ def _get_monthly_risk_analysis_figure(report_normal_df: pd.DataFrame) -> Iterabl # report_long_short_df=report_long_short_df, ) - for _feature in ["annual", "mdd", "sharpe", "std"]: + for _feature in ["annualized_return", "max_drawdown", "information_ratio", "std"]: _temp_df = _get_monthly_analysis_with_feature(_monthly_df, _feature) yield ScatterGraph( _temp_df, @@ -200,8 +200,8 @@ def risk_analysis_graph( # analysis['pred_long'] = risk_analysis(report_long_short_df['long']) # analysis['pred_short'] = risk_analysis(report_long_short_df['short']) # 
analysis['pred_long_short'] = risk_analysis(report_long_short_df['long_short']) - analysis['sub_bench'] = risk_analysis(report_normal_df['return'] - report_normal_df['bench']) - analysis['sub_cost'] = risk_analysis(report_normal_df['return'] - report_normal_df['bench'] - report_normal_df['cost']) + analysis['excess_return_without_cost'] = risk_analysis(report_normal_df['return'] - report_normal_df['bench']) + analysis['excess_return_with_cost'] = risk_analysis(report_normal_df['return'] - report_normal_df['bench'] - report_normal_df['cost']) analysis_df = pd.concat(analysis) analysis_position.risk_analysis_graph(analysis_df, report_normal_df) @@ -213,17 +213,17 @@ def risk_analysis_graph( .. code-block:: python - risk - sub_bench mean 0.000662 - std 0.004487 - annual 0.166720 - sharpe 2.340526 - mdd -0.080516 - sub_cost mean 0.000577 - std 0.004482 - annual 0.145392 - sharpe 2.043494 - mdd -0.083584 + risk + excess_return_without_cost mean 0.000692 + std 0.005374 + annualized_return 0.174495 + information_ratio 2.045576 + max_drawdown -0.079103 + excess_return_with_cost mean 0.000499 + std 0.005372 + annualized_return 0.125625 + information_ratio 1.473152 + max_drawdown -0.088263 :param report_normal_df: **df.index.name** must be **date**, df.columns must contain **return**, **turnover**, **cost**, **bench** diff --git a/qlib/contrib/tuner/config.py b/qlib/contrib/tuner/config.py index 28796bcf2..4825ca092 100644 --- a/qlib/contrib/tuner/config.py +++ b/qlib/contrib/tuner/config.py @@ -61,26 +61,26 @@ class OptimizationConfig(object): "pred_long", "pred_long_short", "pred_short", - "sub_bench", - "sub_cost", + "excess_return_without_cost", + "excess_return_with_cost", "model", ]: raise ValueError( - "report_type should be one of pred_long, pred_long_short, pred_short, sub_bench, sub_cost and model" + "report_type should be one of pred_long, pred_long_short, pred_short, excess_return_without_cost, excess_return_with_cost and model" ) - self.report_factor = 
config.get("report_factor", "sharpe") + self.report_factor = config.get("report_factor", "information_ratio") if self.report_factor not in [ - "annual", - "sharpe", - "mdd", + "annualized_return", + "information_ratio", + "max_drawdown", "mean", "std", "model_score", "model_pearsonr", ]: raise ValueError( - "report_factor should be one of annual, sharpe, mdd, mean, std, model_pearsonr and model_score" + "report_factor should be one of annualized_return, information_ratio, max_drawdown, mean, std, model_pearsonr and model_score" ) self.optim_type = config.get("optim_type", "max") diff --git a/qlib/data/__init__.py b/qlib/data/__init__.py index b6eb66468..ef5fe4708 100644 --- a/qlib/data/__init__.py +++ b/qlib/data/__init__.py @@ -28,9 +28,9 @@ from .data import ( from .cache import ( ExpressionCache, DatasetCache, - ServerExpressionCache, - ServerDatasetCache, + DiskExpressionCache, + DiskDatasetCache, SimpleDatasetCache, - ClientDatasetCache, - ClientCalendarCache, + DatasetURICache, + MemoryCalendarCache, ) diff --git a/qlib/data/cache.py b/qlib/data/cache.py index 33327107d..19f92353c 100644 --- a/qlib/data/cache.py +++ b/qlib/data/cache.py @@ -385,11 +385,11 @@ class DatasetCache(BaseProviderCache): return instruments, fields, freq -class ServerExpressionCache(ExpressionCache): +class DiskExpressionCache(ExpressionCache): """Prepared cache mechanism for server.""" def __init__(self, provider, **kwargs): - super(ServerExpressionCache, self).__init__(provider) + super(DiskExpressionCache, self).__init__(provider) self.r = get_redis_connection() # remote==True means client is using this module, writing behaviour will not be allowed. 
self.remote = kwargs.get("remote", False) @@ -575,11 +575,11 @@ class ServerExpressionCache(ExpressionCache): return 0 -class ServerDatasetCache(DatasetCache): +class DiskDatasetCache(DatasetCache): """Prepared cache mechanism for server.""" def __init__(self, provider, **kwargs): - super(ServerDatasetCache, self).__init__(provider) + super(DiskDatasetCache, self).__init__(provider) self.r = get_redis_connection() self.remote = kwargs.get("remote", False) if self.remote: @@ -612,7 +612,7 @@ class ServerDatasetCache(DatasetCache): :return: """ - im = ServerDatasetCache.IndexManager(cache_path) + im = DiskDatasetCache.IndexManager(cache_path) index_data = im.get_index(start_time, end_time) if index_data.shape[0] > 0: start, stop = ( @@ -625,9 +625,7 @@ class ServerDatasetCache(DatasetCache): with pd.HDFStore(cache_path, mode="r") as store: if "/{}".format(im.KEY) in store.keys(): df = store.select(key=im.KEY, start=start, stop=stop) - df.reset_index(inplace=True) - df.set_index(["instrument", "datetime"], inplace=True) - df.sort_index(inplace=True) + df = df.swaplevel("datetime", "instrument").sort_index() # read cache and need to replace not-space fields to field df = cls.cache_to_origin_data(df, fields) @@ -684,10 +682,7 @@ class ServerDatasetCache(DatasetCache): freq=freq, ) if not features.empty: - features.reset_index(inplace=True) - features.set_index(["datetime", "instrument"], inplace=True) - features.sort_index(inplace=True) - features = features.loc[start_time:end_time] + features = features.sort_index().loc(axis=0)[:, start_time:end_time] return features def _dataset_uri( @@ -851,11 +846,11 @@ class ServerDatasetCache(DatasetCache): features = self.provider.dataset(instruments, fields, _calendar[0], _calendar[-1], freq) - # sort index by datetime - if not features.empty: - features.reset_index(inplace=True) - features.set_index(["datetime", "instrument"], inplace=True) - features.sort_index(inplace=True) + if features.empty: + return features + + # swap 
index and sorted + features = features.swaplevel("instrument", "datetime").sort_index() # write cache data with pd.HDFStore(cache_path + ".data") as store: @@ -881,7 +876,7 @@ class ServerDatasetCache(DatasetCache): pickle.dump(meta, f) os.chmod(cache_path + ".meta", stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH) # write index file - im = ServerDatasetCache.IndexManager(cache_path) + im = DiskDatasetCache.IndexManager(cache_path) index_data = im.build_index_from_data(features) im.update(index_data) @@ -890,7 +885,7 @@ class ServerDatasetCache(DatasetCache): # temporarily os.replace(cache_path + ".data", cache_path) # the fields of the cached features are converted to the original fields - return features + return features.swaplevel("datetime", "instrument") def update(self, cache_uri): cp_cache_uri = os.path.join(self.dtst_cache_path, cache_uri) @@ -900,7 +895,7 @@ class ServerDatasetCache(DatasetCache): self.clear_cache(cp_cache_uri) return 2 - im = ServerDatasetCache.IndexManager(cp_cache_uri) + im = DiskDatasetCache.IndexManager(cp_cache_uri) with CacheUtils.writer_lock(self.r, "dataset-%s" % cache_uri): with open(cp_cache_uri + ".meta", "rb") as f: d = pickle.load(f) @@ -1061,11 +1056,11 @@ class SimpleDatasetCache(DatasetCache): return self.cache_to_origin_data(data, fields) -class ClientDatasetCache(DatasetCache): +class DatasetURICache(DatasetCache): """Prepared cache mechanism for server.""" def __init__(self, provider): - super(ClientDatasetCache, self).__init__(provider) + super(DatasetURICache, self).__init__(provider) def _uri(self, instruments, fields, start_time, end_time, freq, disk_cache=1, **kwargs): return hash_args(*self.normalize_uri_args(instruments, fields, freq), disk_cache) @@ -1117,7 +1112,7 @@ class ClientDatasetCache(DatasetCache): get_module_logger("cache").debug(f"get feature from {C.dataset_provider}") else: mnt_feature_uri = os.path.join(C.mount_path, C.dataset_cache_dir_name, feature_uri) - df = 
ServerDatasetCache.read_data_from_cache(mnt_feature_uri, start_time, end_time, fields) + df = DiskDatasetCache.read_data_from_cache(mnt_feature_uri, start_time, end_time, fields) get_module_logger("cache").debug("get feature from uri cache") return df @@ -1127,7 +1122,7 @@ class CalendarCache(BaseProviderCache): pass -class ClientCalendarCache(CalendarCache): +class MemoryCalendarCache(CalendarCache): def calendar(self, start_time=None, end_time=None, freq="day", future=False): uri = self._uri(start_time, end_time, freq, future) result, expire = MemCacheExpire.get_cache(H["c"], uri) diff --git a/qlib/data/data.py b/qlib/data/data.py index 010e3668a..7908669c4 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -24,7 +24,7 @@ from .ops import * from ..log import get_module_logger from ..utils import parse_field, read_bin, hash_args, normalize_cache_fields from .base import Feature -from .cache import ServerDatasetCache, ServerExpressionCache +from .cache import DiskDatasetCache, DiskExpressionCache @six.add_metaclass(abc.ABCMeta) @@ -357,7 +357,7 @@ class DatasetProvider(object): whether to skip(0)/use(1)/replace(2) disk_cache """ - return ServerDatasetCache._uri(instruments, fields, start_time, end_time, freq, disk_cache) + return DiskDatasetCache._uri(instruments, fields, start_time, end_time, freq, disk_cache) @staticmethod def get_instruments_d(instruments, freq): @@ -452,7 +452,7 @@ class DatasetProvider(object): if len(new_data) > 0: data = pd.concat(new_data, names=["instrument"], sort=False) - data = ServerDatasetCache.cache_to_origin_data(data, column_names) + data = DiskDatasetCache.cache_to_origin_data(data, column_names) else: data = pd.DataFrame(columns=column_names) @@ -915,7 +915,7 @@ class ClientDatasetProvider(DatasetProvider): try: # pre-mound nfs, used for demo mnt_feature_uri = os.path.join(C.mount_path, C.dataset_cache_dir_name, feature_uri) - df = ServerDatasetCache.read_data_from_cache(mnt_feature_uri, start_time, end_time, fields) + df 
= DiskDatasetCache.read_data_from_cache(mnt_feature_uri, start_time, end_time, fields) get_module_logger("data").debug("finish slicing data") if return_uri: return df, feature_uri diff --git a/qlib/data/filter.py b/qlib/data/filter.py index 1552aeee7..368a9ddcc 100644 --- a/qlib/data/filter.py +++ b/qlib/data/filter.py @@ -142,6 +142,7 @@ class SeriesDFilter(BaseDFilter): the series of bool value indicating whether the date satisfies the filter condition and exists in target timestamp """ fstart, fend = list(filter_series.keys())[0], list(filter_series.keys())[-1] + filter_series = filter_series.astype('bool') # Make sure the filter_series is boolean timestamp_series[fstart:fend] = timestamp_series[fstart:fend] & filter_series return timestamp_series diff --git a/qlib/data/ops.py b/qlib/data/ops.py index 104296a0e..ca85baf6c 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -914,10 +914,7 @@ class IdxMax(Rolling): if self.N == 0: series = series.expanding(min_periods=1).apply(lambda x: x.argmax() + 1, raw=True) else: - series = series.rolling(self.N, min_periods=1).apply( - lambda x: x.argmax() + 1, - raw=True, - ) + series = series.rolling(self.N, min_periods=1).apply(lambda x: x.argmax() + 1, raw=True) return series @@ -965,10 +962,7 @@ class IdxMin(Rolling): if self.N == 0: series = series.expanding(min_periods=1).apply(lambda x: x.argmin() + 1, raw=True) else: - series = series.rolling(self.N, min_periods=1).apply( - lambda x: x.argmin() + 1, - raw=True, - ) + series = series.rolling(self.N, min_periods=1).apply(lambda x: x.argmin() + 1, raw=True) return series @@ -1194,11 +1188,12 @@ class Rsquare(Rolling): super(Rsquare, self).__init__(feature, N, "rsquare") def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + _series = self.feature.load(instrument, start_index, end_index, freq) if self.N == 0: - series = pd.Series(expanding_rsquare(series.values), 
index=series.index) + series = pd.Series(expanding_rsquare(_series.values), index=_series.index) else: - series = pd.Series(rolling_rsquare(series.values, self.N), index=series.index) + series = pd.Series(rolling_rsquare(_series.values, self.N), index=_series.index) + series.loc[np.isclose(_series.rolling(self.N, min_periods=1).std(), 0, atol=2e-05)] = np.nan return series @@ -1341,12 +1336,7 @@ class PairRolling(ExpressionOps): if self.N == 0: return np.inf return ( - max( - self.feature_left.get_longest_back_rolling(), - self.feature_right.get_longest_back_rolling(), - ) - + self.N - - 1 + max(self.feature_left.get_longest_back_rolling(), self.feature_right.get_longest_back_rolling()) + self.N - 1 ) def get_extended_window_size(self): @@ -1382,6 +1372,18 @@ class Corr(PairRolling): def __init__(self, feature_left, feature_right, N): super(Corr, self).__init__(feature_left, feature_right, N, "corr") + def _load_internal(self, instrument, start_index, end_index, freq): + res = super(Corr, self)._load_internal(instrument, start_index, end_index, freq) + + # NOTE: Load uses MemCache, so calling load again will not cause performance degradation + series_left = self.feature_left.load(instrument, start_index, end_index, freq) + series_right = self.feature_right.load(instrument, start_index, end_index, freq) + res.loc[ + np.isclose(series_left.rolling(self.N, min_periods=1).std(), 0, atol=2e-05) + | np.isclose(series_right.rolling(self.N, min_periods=1).std(), 0, atol=2e-05) + ] = np.nan + return res + class Cov(PairRolling): """Rolling Covariance @@ -1403,3 +1405,4 @@ class Cov(PairRolling): def __init__(self, feature_left, feature_right, N): super(Cov, self).__init__(feature_left, feature_right, N, "cov") + diff --git a/scripts/data_collector/csi/collector.py b/scripts/data_collector/csi/collector.py index 22cef51b6..cc6833a7c 100644 --- a/scripts/data_collector/csi/collector.py +++ b/scripts/data_collector/csi/collector.py @@ -2,6 +2,7 @@ # Licensed under the MIT 
License. import re +import sys import bisect from io import BytesIO from pathlib import Path @@ -12,16 +13,17 @@ import pandas as pd from lxml import etree from loguru import logger +CUR_DIR = Path(__file__).resolve().parent +sys.path.append(str(CUR_DIR.parent.parent)) +from data_collector.utils import get_hs_calendar_list as get_calendar_list + + NEW_COMPANIES_URL = "http://www.csindex.com.cn/uploads/file/autofile/cons/000300cons.xls" CSI300_CHANGES_URL = "http://www.csindex.com.cn/zh-CN/search/total?key=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89%E6%8C%87%E6%95%B0%E6%A0%B7%E6%9C%AC%E8%82%A1%E7%9A%84%E5%85%AC%E5%91%8A" -CSI300_BENCH_URL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid=1.000300&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20220101" - CSI300_START_DATE = pd.Timestamp("2005-01-01") -CUR_DIR = Path(__file__).resolve().parent - class CSI300: @@ -50,12 +52,7 @@ class CSI300: Returns ------- """ - # TODO: get calendar from MSN - if self._calendar_list is None: - logger.info("get all trading date") - value_list = requests.get(CSI300_BENCH_URL).json()["data"]["klines"] - self._calendar_list = sorted(map(lambda x: pd.Timestamp(x.split(",")[0]), value_list)) - return self._calendar_list + return get_calendar_list(bench=True) def _get_trading_date_by_shift(self, trading_date: pd.Timestamp, shift=1): """get trading date by shift diff --git a/scripts/data_collector/msn/README.md b/scripts/data_collector/msn/README.md deleted file mode 100644 index c197d2b97..000000000 --- a/scripts/data_collector/msn/README.md +++ /dev/null @@ -1 +0,0 @@ -# TODO: Support collecting data from MSN \ No newline at end of file diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py new file mode 100644 index 000000000..e3f949fd1 --- /dev/null +++ b/scripts/data_collector/utils.py @@ -0,0 +1,103 @@ +import 
re +import requests + +import pandas as pd +from lxml import etree + +SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}" +CSI300_BENCH_URL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid=1.000300&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20220101" +SH600000_BENCH_URL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid=1.600000&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20220101" + +_BENCH_CALENDAR_LIST = None +_ALL_CALENDAR_LIST = None +_HS_SYMBOLS = None + + +def get_hs_calendar_list(bench=False) -> list: + """get SH/SZ history calendar list + + Parameters + ---------- + bench: bool + whether to get the bench calendar list, by default False + + Returns + ------- + history calendar list + """ + global _ALL_CALENDAR_LIST + global _BENCH_CALENDAR_LIST + + def _get_calendar(url): + _value_list = requests.get(url).json()["data"]["klines"] + return sorted(map(lambda x: pd.Timestamp(x.split(",")[0]), _value_list)) + + # TODO: get calendar from MSN + if bench: + if _BENCH_CALENDAR_LIST is None: + _BENCH_CALENDAR_LIST = _get_calendar(CSI300_BENCH_URL) + return _BENCH_CALENDAR_LIST + + if _ALL_CALENDAR_LIST is None: + _ALL_CALENDAR_LIST = _get_calendar(SH600000_BENCH_URL) + return _ALL_CALENDAR_LIST + + +def get_hs_stock_symbols() -> list: + """get SH/SZ stock symbols + + Returns + ------- + stock symbols + """ + global _HS_SYMBOLS + if _HS_SYMBOLS is None: + _res = set() + for _k, _v in (("ha", "ss"), ("sa", "sz"), ("gem", "sz")): + resp = requests.get(SYMBOLS_URL.format(s_type=_k)) + _res |= set( + map( + lambda x: "{}.{}".format(re.findall(r"\d+", x)[0], _v), + etree.HTML(resp.text).xpath("//div[@class='result']/ul//li/a/text()"), + ) + ) + _HS_SYMBOLS = sorted(list(_res)) + return _HS_SYMBOLS + + +def symbol_suffix_to_prefix(symbol: str, capital: bool = True) -> str: + 
"""symbol suffix to prefix + + Parameters + ---------- + symbol: str + symbol + capital : bool + by default True + Returns + ------- + + """ + code, exchange = symbol.split(".") + if exchange.lower() in ["sh", "ss"]: + res = f"sh{code}" + else: + res = f"{exchange}{code}" + return res.upper() if capital else res.lower() + + +def symbol_prefix_to_sufix(symbol: str, capital: bool = True) -> str: + """symbol prefix to sufix + + Parameters + ---------- + symbol: str + symbol + capital : bool + by default True + Returns + ------- + + """ + res = f"{symbol[:-2]}.{symbol[-2:]}" + return res.upper() if capital else res.lower() diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index bfa095c58..b652311a6 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import re import sys from pathlib import Path from concurrent.futures import ThreadPoolExecutor, as_completed @@ -11,45 +10,33 @@ import requests import numpy as np import pandas as pd from tqdm import tqdm -from lxml import etree from loguru import logger from yahooquery import Ticker CUR_DIR = Path(__file__).resolve().parent sys.path.append(str(CUR_DIR.parent.parent)) from dump_bin import DumpData +from data_collector.utils import get_hs_calendar_list as get_calendar_list, get_hs_stock_symbols -SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}" CSI300_BENCH_URL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid=1.000300&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20220101" class YahooCollector: - def __init__(self, save_dir: [str, Path], max_workers=4): + def __init__(self, save_dir: [str, Path], max_workers=4, asynchronous=True, max_collector_count=3): self.save_dir = Path(save_dir).expanduser().resolve() 
self.save_dir.mkdir(parents=True, exist_ok=True) self._stock_list = None self.max_workers = max_workers + self._asynchronous = asynchronous + self._max_collector_count = max_collector_count @property def stock_list(self): if self._stock_list is None: - self._stock_list = self.get_stock_list() + self._stock_list = get_hs_stock_symbols() return self._stock_list - @staticmethod - def get_stock_list() -> list: - _res = set() - for _k, _v in (("ha", "ss"), ("sa", "sz"), ("gem", "sz")): - resp = requests.get(SYMBOLS_URL.format(s_type=_k)) - _res |= set( - map( - lambda x: "{}.{}".format(re.findall(r"\d+", x)[0], _v), - etree.HTML(resp.text).xpath("//div[@class='result']/ul//li/a/text()"), - ) - ) - return sorted(list(_res)) - def save_stock(self, symbol, df: pd.DataFrame): """save stock data to file @@ -69,19 +56,16 @@ class YahooCollector: df["symbol"] = symbol df.to_csv(stock_path, index=False) - def collector_data(self): - """collector data + def _collector(self, stock_list): - """ - logger.info("start collector yahoo data......") error_symbol = [] with ThreadPoolExecutor(max_workers=self.max_workers) as worker: futures = {} - p_bar = tqdm(total=len(self.stock_list)) - for symbols in [ - self.stock_list[i : i + self.max_workers] for i in range(0, len(self.stock_list), self.max_workers) - ]: - resp = Ticker(symbols, asynchronous=True, max_workers=self.max_workers).history(period="max") + p_bar = tqdm(total=len(stock_list)) + for symbols in [stock_list[i : i + self.max_workers] for i in range(0, len(stock_list), self.max_workers)]: + resp = Ticker(symbols, asynchronous=self._asynchronous, max_workers=self.max_workers).history( + period="max" + ) if isinstance(resp, dict): for symbol, df in resp.items(): if isinstance(df, pd.DataFrame): @@ -106,12 +90,26 @@ class YahooCollector: logger.error(e) error_symbol.append(futures[future]) p_bar.update() + print(error_symbol) + logger.info(f"error symbol nums: {len(error_symbol)}") + logger.info(f"current get symbol nums: 
{len(stock_list)}") + return error_symbol - logger.info(error_symbol) - logger.info(len(error_symbol)) - logger.info(len(self.stock_list)) + def collector_data(self): + """collector data + + """ + logger.info("start collector yahoo data......") + stock_list = self.stock_list + for i in range(self._max_collector_count): + if not stock_list: + break + logger.info(f"getting data: {i+1}") + stock_list = self._collector(stock_list) + logger.info(f"{i+1} finish.") # TODO: from MSN + logger.info(f"get bench data: csi300(SH000300)......") df = pd.DataFrame(map(lambda x: x.split(","), requests.get(CSI300_BENCH_URL).json()["data"]["klines"])) df.columns = ["date", "open", "close", "high", "low", "volume", "money", "change"] df["date"] = pd.to_datetime(df["date"]) @@ -164,8 +162,14 @@ class Run: def _normalize(file_path: Path): columns = ["open", "close", "high", "low", "volume"] df = pd.read_csv(file_path) - df.sort_values("date", inplace=True) - df.loc[df["volume"] <= 0, set(df.columns) - {"symbol", "date"}] = np.nan + df.set_index("date", inplace=True) + df.index = pd.to_datetime(df.index) + + # using China stock market data calendar + df = df.reindex(pd.Index(get_calendar_list())) + df.sort_index(inplace=True) + + df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), set(df.columns) - {"symbol"}] = np.nan df["factor"] = df["adjclose"] / df["close"] for _col in columns: if _col == "volume": @@ -176,7 +180,8 @@ class Run: df["change"] = _tmp_series / _tmp_series.shift(1) - 1 columns += ["change", "factor"] df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), columns] = np.nan - df.loc[:, columns + ["date"]].to_csv(self.normalize_dir.joinpath(file_path.name), index=False) + df.index.names = ["date"] + df.loc[:, columns].to_csv(self.normalize_dir.joinpath(file_path.name)) with ThreadPoolExecutor(max_workers=self.max_workers) as worker: file_list = list(self.source_dir.glob("*.csv")) @@ -192,12 +197,13 @@ class Run: $ python collector.py manual_adj_data --normalize_dir 
~/.qlib/stock_data/normalize """ + def _adj(file_path: Path): df = pd.read_csv(file_path) - df = df.loc[:, ["open", "close", "high", "low", "volume", "change", "factor"]] + df = df.loc[:, ["open", "close", "high", "low", "volume", "change", "factor", "date"]] df.sort_values("date", inplace=True) df = df.set_index("date") - df = df.loc[df.first_valid_index():] + df = df.loc[df.first_valid_index() :] _close = df["close"].iloc[0] for _col in df.columns: if _col == "volume": @@ -214,7 +220,6 @@ class Run: for _ in worker.map(_adj, file_list): p_bar.update() - def dump_data(self): """dump yahoo data diff --git a/scripts/get_data.py b/scripts/get_data.py index b6f5b64f5..0ab40df1c 100644 --- a/scripts/get_data.py +++ b/scripts/get_data.py @@ -52,21 +52,23 @@ class GetData: for _file in tqdm(zp.namelist()): zp.extract(_file, str(target_dir.resolve())) - def qlib_data_cn(self, target_dir="~/.qlib/qlib_data/cn_data"): + def qlib_data_cn(self, target_dir="~/.qlib/qlib_data/cn_data", version="v1"): """download cn qlib data from remote Parameters ---------- target_dir: str data save directory + version: str + data version, value from [v0, v1], by default v1 Examples --------- - python get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data + python get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data --version v1 ------- """ - file_name = "qlib_data_cn.zip" + file_name = f"qlib_data_cn_{version}.zip" self._download_data(file_name, target_dir) def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"): diff --git a/setup.py b/setup.py index 479fa8dda..01e8f71d9 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ from setuptools import find_packages, setup, Extension # Package meta-data. 
NAME = "qlib" -DESCRIPTION = "A Quantitative-research Library" +DESCRIPTION = "A Quantitative-research Platform" REQUIRES_PYTHON = ">=3.5.0" VERSION = "0.4.6.dev" diff --git a/tests/test_all_pipeline.py b/tests/test_all_pipeline.py index 643cfa496..e30d10774 100644 --- a/tests/test_all_pipeline.py +++ b/tests/test_all_pipeline.py @@ -123,8 +123,8 @@ def backtest(pred): def analyze(report_normal): _analysis = dict() - _analysis["sub_bench"] = risk_analysis(report_normal["return"] - report_normal["bench"]) - _analysis["sub_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"] - report_normal["cost"]) + _analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) + _analysis["excess_return_with_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"] - report_normal["cost"]) analysis_df = pd.concat(_analysis) # type: pd.DataFrame print(analysis_df) return analysis_df @@ -157,7 +157,7 @@ class TestAllFlow(unittest.TestCase): ) analyze_df = analyze(TestAllFlow.REPORT_NORMAL) self.assertGreaterEqual( - analyze_df.loc(axis=0)["sub_cost", "annual"].values[0], 0.10, "backtest failed", + analyze_df.loc(axis=0)["excess_return_with_cost", "annualized_return"].values[0], 0.10, "backtest failed", ) diff --git a/tests/test_dump_data.py b/tests/test_dump_data.py index 39fb00f40..46cfbaadc 100644 --- a/tests/test_dump_data.py +++ b/tests/test_dump_data.py @@ -37,7 +37,7 @@ class TestDumpData(unittest.TestCase): def setUpClass(cls) -> None: GetData().csv_data_cn(SOURCE_DIR) TestDumpData.DUMP_DATA = DumpData(csv_path=SOURCE_DIR, qlib_dir=QLIB_DIR) - TestDumpData.STOCK_NAMES = list(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.iterdir())) + TestDumpData.STOCK_NAMES = list(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv"))) provider_uri = str(QLIB_DIR.resolve()) qlib.init( provider_uri=provider_uri, diff --git a/tests/test_get_data.py b/tests/test_get_data.py index c6465b564..935e7982d 100644 --- 
a/tests/test_get_data.py +++ b/tests/test_get_data.py @@ -45,7 +45,7 @@ class TestGetData(unittest.TestCase): def test_1_csv_data(self): GetData().csv_data_cn(SOURCE_DIR) stock_name = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv"))) - self.assertEqual(len(stock_name), 96, "get csv data failed") + self.assertEqual(len(stock_name), 85, "get csv data failed") if __name__ == "__main__":