1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-29 09:01:18 +08:00

Compare commits

..

234 Commits

Author SHA1 Message Date
Young
0da9b909f6 update version number 2021-02-02 09:48:41 +00:00
Jactus
5f87fc32ad Fix CI 2021-02-02 17:35:07 +08:00
Young
97d354fa73 update version for releasing 2021-02-02 09:24:33 +00:00
Young
a87fb5a68c fix contrib data freq 2021-02-02 16:52:50 +08:00
Young
835b47a7e7 simplify parameters 2021-02-02 16:52:50 +08:00
Young
802dac81c9 move freq params to dataloader 2021-02-02 16:52:50 +08:00
Wendi Li
bdc70c192a Update pytorch_nn.py 2021-02-02 14:48:12 +08:00
Wendi Li
213f809148 Update pytorch_alstm_ts.py 2021-02-02 14:47:41 +08:00
Wendi Li
f3fd5e0773 Update pytorch_gats.py 2021-02-02 14:47:31 +08:00
Wendi Li
decf74cbdf Update pytorch_gru.py 2021-02-02 14:47:20 +08:00
Wendi Li
b4a92d55f8 Update pytorch_gru_ts.py 2021-02-02 14:47:00 +08:00
Wendi Li
ebc31b9bdb Update pytorch_lstm.py 2021-02-02 14:46:49 +08:00
Wendi Li
56ebe9bf36 Update pytorch_lstm_ts.py 2021-02-02 14:46:21 +08:00
Wendi Li
ddd68fc761 Update pytorch_alstm.py 2021-02-02 14:34:57 +08:00
bxdd
f50463aca9 Fix bug in alpha360 2021-02-01 18:33:51 +08:00
Jactus
c0e7cbc983 Add filter_pipe API 2021-01-29 12:47:04 +08:00
you-n-g
828993b397 Merge pull request #222 from bxdd/rl-highfreq-include-examples
Qlib Highfreq Support & Highfreq DataHanlder/Operator/Processor Examples
2021-01-29 00:08:10 +08:00
bxdd
8ef89b4fa8 update 2021-01-28 15:01:07 +00:00
bxdd
76cf9dad99 update 2021-01-28 14:30:20 +00:00
bxdd
f3eb02a0bd update docstring 2021-01-28 14:26:30 +00:00
bxdd
ffa68fd010 update 2021-01-28 14:25:55 +00:00
bxdd
f6dd006c35 update 2021-01-28 11:31:15 +00:00
you-n-g
8c29105bca Update cache.py 2021-01-27 19:52:33 +08:00
bxdd
948b829ff4 add get_data in highfreq 2021-01-27 10:34:31 +00:00
Jactus
304a0c3d7a Add paper year 2021-01-27 18:15:52 +08:00
bxdd
02dea2aeb6 update paused 2021-01-27 07:42:00 +00:00
bxdd
6fc4f2b249 fix a bug 2021-01-27 07:02:59 +00:00
bxdd
2a5f06ee9e update dataset test 2021-01-27 06:25:40 +00:00
zhupr
7f9216dc90 Fix the number of minutes on the first and last trading day of high frequency 2021-01-27 10:59:46 +08:00
zhupr
263ccdfe6f US stock code supports Windows 2021-01-27 10:59:46 +08:00
zhupr
1a8f1bfc57 support collecting yahoo 1min data 2021-01-27 10:59:46 +08:00
bxdd
9dc11a9e3c Merge github.com:microsoft/qlib into qlib_register_ops 2021-01-26 17:12:33 +00:00
bxdd
3bdd54308b update some little code 2021-01-26 17:02:30 +00:00
bxdd
1b569d371d simpson vwap 2021-01-26 14:32:08 +00:00
you-n-g
36e5c601de Merge pull request #78 from zhupr/main
Fix the error when the stock code is a number
2021-01-26 21:50:21 +08:00
zhupr
ae45711e2b Merge remote-tracking branch 'qlib/main' into save_inst 2021-01-26 19:42:59 +08:00
you-n-g
bcc47aa4cb Merge pull request #92 from bxdd/qlib_register_ops
Support Register of Custom Feature Operators Easily
2021-01-26 18:53:43 +08:00
bxdd
ee94634b23 black 2021-01-26 08:47:53 +00:00
bxdd
2016ebbbb2 update tests 2021-01-26 08:47:07 +00:00
zhupr
1eaf09cce1 version removed .dev 2021-01-26 16:29:26 +08:00
zhupr
7579f4b4c0 Merge remote-tracking branch 'qlib/main' into save_inst 2021-01-26 16:14:11 +08:00
zhupr
1a1c45981c US stock code supports Windows 2021-01-26 16:06:38 +08:00
bxdd
e4ecea55e4 fix 2021-01-26 07:41:22 +00:00
bxdd
58616fced9 black format 2021-01-26 07:33:50 +00:00
bxdd
8e9ca22b07 del some print 2021-01-26 07:33:26 +00:00
bxdd
6a145df87c fix bug 2021-01-26 07:32:06 +00:00
bxdd
06dbd02b99 black format 2021-01-25 17:59:48 +00:00
bxdd
ffedb6382f add highfreq example 2021-01-25 17:58:45 +00:00
zhupr
3f9f295a87 add register in config 2021-01-24 11:22:02 +08:00
Wendi Li
84d77f4585 Update pytorch_nn.py 2021-01-24 10:40:47 +08:00
you-n-g
afdf58b4fa Update serial.py 2021-01-24 10:36:56 +08:00
Alex Wang
2b6d16feb1 fix naming 2021-01-22 19:16:57 +08:00
Alex Wang
0a86a6f392 update format 2021-01-22 19:16:57 +08:00
Alex Wang
5da5ad4b9f tabnet 2021-01-22 19:16:57 +08:00
you-n-g
dd07810b66 Update README.md 2021-01-22 12:53:05 +08:00
bxdd
a762248d98 update test&docs 2021-01-22 01:06:32 +09:00
bxdd
80c9a47e51 Merge github.com:microsoft/qlib into qlib_register_ops 2021-01-22 00:52:30 +09:00
王雪
784e73bceb black formatting 2021-01-21 00:07:03 +08:00
王雪
5ad1b4cc33 for IDE auto-complete with global Wrapper
R, D, Cal, Inst, FeatureD, ExpressionD, DatasetD, D
2021-01-21 00:07:03 +08:00
王雪
e85646762c Update .gitignore 2021-01-20 22:12:35 +08:00
Young
fc81a39317 Add dataset standalone usage example 2021-01-20 21:14:27 +08:00
you-n-g
d44c5bb2b2 Update README.md 2021-01-20 21:14:03 +08:00
bxdd
c622d3f6f8 Update data.rst 2021-01-20 18:55:30 +08:00
bxdd
6daaa79519 add register ops config 2021-01-20 18:44:53 +09:00
zhupr
3dda2cb379 Merge remote-tracking branch 'qlib/main' into qlib_register_ops 2021-01-20 15:16:06 +08:00
zhupr
4fcfde7cfb Initialization is split into: set_config and config_based_on_C 2021-01-20 15:06:18 +08:00
bxdd
3403c00b6b Update requirements.txt
fix readthedocs cant find cmake error
2021-01-19 20:35:11 +08:00
bxdd
ecdfe49fd1 del custom ops test for check the CI status 2021-01-19 20:39:15 +09:00
bxdd
cc214a3462 black format 2021-01-19 09:14:17 +08:00
bxdd
65d8af41e7 restructure backtest 2021-01-19 09:14:17 +08:00
bxdd
0e0970f06e update backtest 2021-01-19 09:14:17 +08:00
bxdd
917261dbf6 update backtest 2021-01-19 09:14:17 +08:00
bxdd
6a9105e065 add highfreq_backtest 2021-01-19 09:14:17 +08:00
王雪
570bb272eb fix setup error
why required pymongo
2021-01-18 19:37:24 +08:00
Wendi Li
0524a47cf4 Update pytorch_lstm_ts.py 2021-01-18 12:20:40 +08:00
Wendi Li
9abc0b0d4f Update pytorch_gru_ts.py 2021-01-18 12:20:31 +08:00
Wendi Li
fe60e40927 Update pytorch_gats_ts.py 2021-01-18 12:20:20 +08:00
Wendi Li
740c297618 Update pytorch_alstm_ts.py 2021-01-18 12:20:00 +08:00
Anon-Artist
b4a088efe8 Update cli.py 2021-01-14 18:42:33 +08:00
Jactus
b34890772f Make note more clear 2021-01-13 19:19:48 +08:00
Jactus
054ffa29f6 Update readme 2021-01-13 19:19:48 +08:00
Jactus
74e08c9e37 Add deepcopy to config 2021-01-13 19:19:48 +08:00
Jactus
ea96c9e22d Update docs and support Python 3.9 2021-01-13 19:19:48 +08:00
王雪
86e7c44c6b Update initialization.rst
need line changing
2021-01-13 15:28:05 +08:00
you-n-g
64cf2e2df8 Update data.rst 2021-01-12 18:43:05 +08:00
Jactus
4361a4049a Fix create_recorder bug 2021-01-07 18:30:18 +08:00
Zhichong Fang
231f37376b Fix unrecognized config bug 2021-01-07 18:28:17 +08:00
you-n-g
328cdeda4a Update README.md 2021-01-07 11:12:49 +08:00
Zhichong Fang
4dbc8e52ec Update data.py
Fix some typo
2021-01-06 16:36:23 +08:00
Young
ba447d3448 update valute 2021-01-06 14:43:14 +08:00
zhupr
df556532d0 Fix the error when the stock code is a number 2021-01-06 11:21:33 +08:00
Wendi Li
18e040f506 Update workflow_config_gru_Alpha158.yaml
Delete a redundant parameter.
2021-01-04 17:05:21 +08:00
Wendi Li
aefc98b1d7 Update workflow_config_lstm_Alpha158.yaml
Delete a redundant parameter.
2021-01-04 17:05:13 +08:00
Jactus
46c8d791ac Fix doc bugs 2020-12-30 23:51:05 +08:00
Young
afcd91a2d0 black format 2020-12-28 12:04:03 +00:00
Young
4a30d9d1ec update github issue template 2020-12-28 12:02:01 +00:00
you-n-g
2da2e9bd9e Update README.md 2020-12-26 20:21:30 +08:00
you-n-g
3e6877ff0f Update README.md 2020-12-25 22:01:18 +08:00
zhupr
a0f32036a6 Fix the first trading day of the calendar extra in report_df 2020-12-24 11:22:48 +08:00
bxdd
d8f36df7f4 debug on macos 2020-12-23 18:28:05 +00:00
bxdd
cb3b6c5bde black format 2020-12-23 16:41:32 +00:00
bxdd
b11712fa54 fix cant find ops error on Windows 2020-12-23 16:39:17 +00:00
Jactus
660edeb94f Remove fm in recorder 2020-12-23 21:14:53 +08:00
Jactus
95de4088df Fix recorder temp dir bug 2020-12-23 21:14:53 +08:00
hadrianl
e8d7a22651 fix _adjust_size 2020-12-23 17:39:04 +08:00
hadrianl
4a62b929ad add _get_value_size and remove _limit_flag 2020-12-23 17:39:04 +08:00
hadrianl
5efe82fb56 make code cleaner 2020-12-23 17:39:04 +08:00
hadrianl
40bbafcaab black format 2020-12-23 17:39:04 +08:00
hadrianl
4c4f0f3c5e black format 2020-12-23 17:39:04 +08:00
hadrianl
ae0e0eca3d better MemCacheUnit implement 2020-12-23 17:39:04 +08:00
bxdd
7e37fa710a update alpha.rst 2020-12-21 23:31:31 +08:00
bxdd
e0c460c33c Update alpha.rst 2020-12-21 23:31:31 +08:00
bxdd
53f501ac19 del import 2020-12-21 12:44:27 +00:00
bxdd
132df027a5 update format 2020-12-21 12:09:25 +00:00
bxdd
7d97fd39ce update ops register 2020-12-21 12:06:42 +00:00
Young
995fa98fc6 add more doc to PortAnaRecord 2020-12-20 16:11:07 +08:00
Maciej Domagała
824de921d1 fixing typos #4 2020-12-19 11:59:23 +08:00
Maciej Domagała
66d9bd1a68 fixing typos #3
I just randomly find these by the way. Good work on the framework!
2020-12-18 20:16:54 +08:00
you-n-g
1c0bb2f827 Merge pull request #97 from Derek-Wds/main
Update benchmark performance
2020-12-17 17:12:40 +08:00
Maciej Domagała
ea018ed4dc fixing typos #2 2020-12-17 17:12:18 +08:00
hadrianl
f3f1867b14 fix wrong attribute 2020-12-17 15:04:07 +08:00
hadrianl
8bbfd8810c formatting 2020-12-17 15:04:07 +08:00
hadrianl
3f84c3768a Make __getattr__ to raise AttributeError instead of return it.Avoid using try except. 2020-12-17 15:04:07 +08:00
Dingsu Wang
7372a3a598 Merge branch 'main' into main 2020-12-17 14:43:21 +08:00
Jactus
4b4cd38ca6 Update benchmark results 2020-12-17 14:41:12 +08:00
you-n-g
7d40ba753a Update README.md 2020-12-17 00:35:35 +08:00
Young
9b60214e0c make info more friendly 2020-12-16 02:16:06 +00:00
Young
f7e775f941 make message more friendly 2020-12-16 02:14:38 +00:00
Young
aefbf3b5f1 update collect info 2020-12-15 13:24:29 +00:00
G_will
3f85af05e5 Refactor to Python3 style 2020-12-15 20:37:43 +08:00
Jactus
192c2dc5ef Add demo 2020-12-15 20:33:32 +08:00
Jactus
911edd7839 Add stale bot 2020-12-15 20:31:38 +08:00
Maciej Domagała
3d47dd78c8 Typo fix 2020-12-15 20:29:30 +08:00
Jactus
8f6ab0af54 Format 2020-12-14 19:23:43 +08:00
Jactus
cb0b6fcdaa Update CI and script 2020-12-14 19:23:43 +08:00
Yifan Deng (FA Talent)
6b8824dd29 Update Sign in ops.py 2020-12-14 16:55:23 +08:00
Yifan Deng
c217e7c479 Update ops.py
Fix the bug when Sign followed by True/False
2020-12-14 16:55:23 +08:00
you-n-g
ea4fe1577b Update README.md 2020-12-14 13:05:12 +08:00
you-n-g
1bab07e419 Update README.md 2020-12-13 22:45:07 +08:00
bxdd
422d1d8c93 Update README.md 2020-12-12 19:41:16 +08:00
bxdd
c8f9b1162d Update README.md 2020-12-12 19:01:00 +08:00
Young
e2bdef7ffe update version number to dev 2020-12-12 10:09:18 +00:00
Young
e49b590322 Release qlib 0.6.1 2020-12-12 09:51:52 +00:00
bxdd
9d19294f15 update Note 2020-12-12 17:42:23 +08:00
bxdd
b0e7a85601 update readme 2020-12-12 17:42:23 +08:00
you-n-g
8ea45802df Update README.md 2020-12-12 14:04:21 +08:00
Jactus
bba94d72dc Add author names 2020-12-11 17:19:12 +08:00
Jactus
d6dd423dc2 Update benchmark performance 2020-12-11 17:19:12 +08:00
Jactus
c10955d026 Update tft 2020-12-11 14:33:16 +08:00
Jactus
d642c7b6ea Update benchmark performance 2020-12-11 09:55:37 +08:00
G_will
9307bcc8d1 fix typo
fix typo
2020-12-10 20:11:57 +08:00
Jactus
99f3820e42 Update readme 2020-12-10 19:37:18 +08:00
Jactus
b04d2c39c8 Update CI 2020-12-10 19:37:18 +08:00
bxdd
0cdc5e125a update docs 2020-12-10 10:08:29 +00:00
bxdd
2de812f262 update ops docs 2020-12-10 10:04:09 +00:00
bxdd
16450c2876 fix import 2020-12-10 09:54:05 +00:00
bxdd
729b57e4a7 add example script 2020-12-10 09:11:12 +00:00
bxdd
87cc52cd05 black format 2020-12-10 09:02:43 +00:00
bxdd
0be57d51be support register custom feature ops easily 2020-12-10 09:00:00 +00:00
bxdd
9c482ebbe2 black format 2020-12-10 15:50:14 +08:00
bxdd
eb67f1037a update setup 2020-12-10 15:50:14 +08:00
bxdd
59282c8965 fix req 2020-12-10 15:50:14 +08:00
bxdd
03ab67ad5c fix req 2020-12-10 15:50:14 +08:00
bxdd
e2d862bfb2 fix system package 2020-12-10 15:50:14 +08:00
bxdd
936d5abb1f fix docs req 2020-12-10 15:50:14 +08:00
bxdd
7296780149 fix setup 2020-12-10 15:50:14 +08:00
bxdd
97c053ba73 update setup 2020-12-10 15:50:14 +08:00
bxdd
2c5864204e del qlib.readthedocs.yml 2020-12-09 13:58:17 +00:00
bxdd
6562c9aaa4 Merge https://github.com/microsoft/qlib into main 2020-12-09 13:55:56 +00:00
bxdd
85a217c121 update readthedocs 2020-12-09 13:54:22 +00:00
bxdd
f156280a51 Merge pull request #84 from bxdd/qlib_readthedocs
Fix readthedocs
2020-12-09 21:43:51 +08:00
bxdd
e8eb034a97 fix the config of readthedocs 2020-12-09 13:41:18 +00:00
bxdd
7763cf5a5c add the config of readthedocs 2020-12-09 21:27:39 +08:00
bxdd
053736c0ea add the config of readthedocs 2020-12-09 13:23:59 +00:00
Young
74ac230edb rename for pytest 2020-12-09 20:24:56 +08:00
Young
303021cd47 CI updating 2020-12-09 20:24:56 +08:00
Young
c0f1696adb add downcast to save data 2020-12-09 20:24:56 +08:00
Young
361d168890 remove dataloader 2020-12-09 20:24:56 +08:00
lwwang1995
73669de392 Fix code. 2020-12-09 20:24:56 +08:00
Wendi Li
89ec87e45b Add files via upload 2020-12-09 17:20:36 +08:00
Wendi Li
15cdfeb121 Add files via upload 2020-12-09 17:20:36 +08:00
Wendi Li
1bbd026195 Add files via upload 2020-12-09 17:20:36 +08:00
Jactus
a5c098de92 Update tft results 2020-12-09 17:20:36 +08:00
bxdd
a63ba3e819 black format 2020-12-09 17:20:36 +08:00
bxdd
56e579e20f add arg weight_decay 2020-12-09 17:20:36 +08:00
bxdd
2873813562 update mlp model 2020-12-09 17:20:36 +08:00
Jactus
a8ac56a82f Format 2020-12-09 17:20:36 +08:00
Jactus
6ef339b1ec Update mlp results and add doc 2020-12-09 17:20:36 +08:00
Jactus
579caa757c Update readme 2020-12-09 17:20:36 +08:00
Jactus
a1e579ff39 Update readme 2020-12-09 17:20:36 +08:00
Jactus
217019a640 Update benchmark readme 2020-12-09 17:20:36 +08:00
Jactus
c14404afe1 Add benchmark results 2020-12-09 17:20:36 +08:00
lwwang1995
4596a7e000 Delete the setting of SFM on the Alpha158. 2020-12-09 17:20:36 +08:00
lwwang1995
ec40845513 Update settings. 2020-12-09 17:20:36 +08:00
lwwang1995
dcfa8110e8 Fix bugs for Gats model. 2020-12-09 17:20:36 +08:00
lwwang1995
666e1ffcbd Update settings. 2020-12-09 17:20:36 +08:00
lwwang1995
70fb760830 Fix bugs for models. 2020-12-09 17:20:36 +08:00
lwwang1995
4a748525bc Fix bugs of model. 2020-12-09 17:20:36 +08:00
Young
fb4a2e65cc support multi indexing of TSDatasetSample 2020-12-09 17:20:36 +08:00
lwwang1995
71ad651514 Add sample for Gats. 2020-12-09 17:20:36 +08:00
lwwang1995
65a9a72a88 Update models. 2020-12-09 17:20:36 +08:00
lwwang1995
ec0d7838ac Update models. 2020-12-09 17:20:36 +08:00
lwwang1995
752f17e51e Fix SFM bug. 2020-12-09 17:20:36 +08:00
lwwang1995
8d42092a7e Fix models. 2020-12-09 17:20:36 +08:00
lwwang1995
412c9eee2e Update models. 2020-12-09 17:20:36 +08:00
Young
abb90ca2f6 fix sampler performance bug 2020-12-09 17:20:36 +08:00
lwwang1995
a7c6aea386 Update record to support time series dataset. 2020-12-09 17:20:36 +08:00
lwwang1995
a88697151a Test CSRankNorm. 2020-12-09 17:20:36 +08:00
Young
d2107c9957 dataset performance optm 2020-12-09 17:20:36 +08:00
lwwang1995
65902e424c Add filter columns. 2020-12-09 17:20:36 +08:00
lwwang1995
bf8de72605 Update test_dataset 2020-12-09 17:20:36 +08:00
lwwang1995
60f62482b7 Update test_dataset 2020-12-09 17:20:36 +08:00
Young
d2d865fb7a fix bug of workflow 2020-12-09 17:20:36 +08:00
Young
5d5f8c8868 update TimeSeriesDataset 2020-12-09 17:20:36 +08:00
you-n-g
d093afd684 Merge pull request #74 from Derek-Wds/main
Update scripts
2020-12-04 14:30:42 +08:00
Dingsu Wang
46396c229a Update README.md 2020-12-04 14:28:13 +08:00
Jactus
eef90c7901 Update readme 2020-12-04 14:25:26 +08:00
Jactus
895b1e7944 Update CI 2020-12-04 13:10:45 +08:00
Jactus
2fb7774927 Update config names 2020-12-04 13:02:09 +08:00
Jactus
86b0b63771 Update table generator 2020-12-04 12:55:22 +08:00
Jactus
99adc514a5 Update CI 2020-12-04 11:51:45 +08:00
Jactus
07fb9031c6 Update setup 2020-12-04 10:47:20 +08:00
Jactus
f237a344c3 Update 2020-12-04 10:31:50 +08:00
Dingsu Wang
2cb888c8b9 Merge branch 'main' into main 2020-12-04 09:45:00 +08:00
Jactus
ab762b3cd7 Update lightgbm lr 2020-12-02 20:04:00 +08:00
Jactus
703ae5d4aa Update tft and readme 2020-12-02 18:00:26 +08:00
Jactus
91c3dfddf5 Update run_all_model 2020-12-02 17:55:11 +08:00
Jactus
745b93138d Update devices 2020-12-02 13:28:32 +08:00
Jactus
7f385345bb Fix GPU 2020-12-02 13:25:29 +08:00
Jactus
d109d3d44e Fix GPU 2020-12-02 12:23:42 +08:00
Jactus
a2603fe27a Update config 2020-12-02 09:41:24 +08:00
Young
e5590de2a4 udpate header and test docs 2020-12-01 10:11:29 +00:00
Jactus
77884db3a5 Fix and update run_all_model 2020-12-01 11:54:26 +08:00
Jactus
bb5f3cb33d Fix and move to version 0.6.0 2020-12-01 09:59:46 +08:00
114 changed files with 6647 additions and 1768 deletions

View File

@@ -28,7 +28,8 @@ Steps to reproduce the behavior:
## Environment
**Note**: One could run `python scripts/collect_info.py` under the `qlib` directory to get the following information.
**Note**: User could run `cd scripts && python collect_info.py all` under project directory to get system information
and paste them here directly.
- Qlib version:
- Python version:
@@ -37,4 +38,4 @@ Steps to reproduce the behavior:
## Additional Notes
<!-- Add any other information about the problem here. -->
<!-- Add any other information about the problem here. -->

62
.github/stale.yml vendored Normal file
View File

@@ -0,0 +1,62 @@
# Configuration for probot-stale - https://github.com/probot/stale
# Number of days of inactivity before an Issue or Pull Request becomes stale
daysUntilStale: 60
# Number of days of inactivity before an Issue or Pull Request with the stale label is closed.
# Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale.
daysUntilClose: 7
# Only issues or pull requests with all of these labels are check if stale. Defaults to `[]` (disabled)
onlyLabels: []
# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
exemptLabels:
- bug
- pinned
- security
- "[Status] Maybe Later"
# Set to true to ignore issues in a project (defaults to false)
exemptProjects: false
# Set to true to ignore issues in a milestone (defaults to false)
exemptMilestones: false
# Set to true to ignore issues with an assignee (defaults to false)
exemptAssignees: false
# Label to use when marking as stale
staleLabel: wontfix
# Comment to post when marking as stale. Set to `false` to disable
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be closed if no further activity occurs. Thank you
for your contributions.
# Comment to post when removing the stale label.
# unmarkComment: >
# Your comment here.
# Comment to post when closing a stale Issue or Pull Request.
# closeComment: >
# Your comment here.
# Limit the number of actions per hour, from 1-30. Default is 30
limitPerRun: 30
# Limit to only `issues` or `pulls`
# only: issues
# Optionally, specify configuration settings that are specific to just 'issues' or 'pulls':
# pulls:
# daysUntilStale: 30
# markComment: >
# This pull request has been automatically marked as stale because it has not had
# recent activity. It will be closed if no further activity occurs. Thank you
# for your contributions.
# issues:
# exemptLabels:
# - confirmed

View File

@@ -12,8 +12,8 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest, ubuntu-16.04, ubuntu-18.04, macos-latest]
python-version: [3.6, 3.7, 3.8]
os: [windows-latest, ubuntu-16.04, ubuntu-18.04, ubuntu-20.04, macos-latest]
python-version: [3.6, 3.7, 3.8, 3.9]
steps:
- uses: actions/checkout@v2
@@ -23,37 +23,96 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
- name: Lint with Black
run: |
pip install --upgrade cython
pip install numpy jupyter jupyter_contrib_nbextensions
python setup.py install
cd ..
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe -m pip install black
$CONDA\\python.exe -m black qlib -l 120 --check --diff
else
sudo $CONDA/bin/python -m pip install black
$CONDA/bin/python -m black qlib -l 120 --check --diff
fi
shell: bash
# Test Qlib installed with pip
- name: Install Qlib with pip
run: |
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe -m pip install pyqlib --ignore-installed ruamel.yaml --user
else
sudo $CONDA/bin/python -m pip install pyqlib --ignore-installed ruamel.yaml
fi
shell: bash
- name: Install Lightgbm for MacOS
if: runner.os == 'macOS'
run: |
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Microsoft/qlib/main/.github/brew_install.sh)"
HOMEBREW_NO_AUTO_UPDATE=1 brew install lightgbm
- name: Test data downloads
run: |
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
else
$CONDA/bin/python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
fi
shell: bash
- name: Test workflow by config (install from pip)
run: |
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe qlib\\workflow\\cli.py examples\\benchmarks\\LightGBM\\workflow_config_lightgbm_Alpha158.yaml
$CONDA\\python.exe -m pip uninstall -y pyqlib
else
$CONDA/bin/python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
sudo $CONDA/bin/python -m pip uninstall -y pyqlib
fi
shell: bash
# Test Qlib installed from source
- name: Install Qlib from source
run: |
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe -m pip install --upgrade cython
$CONDA\\python.exe -m pip install numpy jupyter jupyter_contrib_nbextensions
$CONDA\\python.exe -m pip install -U scipy scikit-learn # installing without this line will cause errors on GitHub Actions, while instsalling locally won't
$CONDA\\python.exe setup.py install
else
sudo $CONDA/bin/python -m pip install --upgrade cython
sudo $CONDA/bin/python -m pip install numpy jupyter jupyter_contrib_nbextensions
sudo $CONDA/bin/python -m pip install -U scipy scikit-learn # installing without this line will cause errors on GitHub Actions, while instsalling locally won't
sudo $CONDA/bin/python setup.py install
fi
shell: bash
- name: Install test dependencies
run: |
python -m pip install --upgrade pip
pip install black pytest
- name: Lint with Black
run: |
cd ..
python -m black qlib -l 120 --check --diff
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe -m pip install --upgrade pip
$CONDA\\python.exe -m pip install black pytest
else
sudo $CONDA/bin/python -m pip install --upgrade pip
sudo $CONDA/bin/python -m pip install black pytest
fi
shell: bash
- name: Unit tests with Pytest
run: |
cd tests
pytest . --durations=0
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe -m pytest . --durations=0
else
$CONDA/bin/python -m pytest . --durations=0
fi
shell: bash
- name: Test data downloads
- name: Test workflow by config (install from source)
run: |
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
- name: Test workflow by config
run: |
qrun examples/benchmarks/LightGBM/workflow_config_lightgbm.yaml
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe qlib\\workflow\\cli.py examples\\benchmarks\\LightGBM\\workflow_config_lightgbm_Alpha158.yaml
else
$CONDA/bin/python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
fi
shell: bash

1
.gitignore vendored
View File

@@ -2,6 +2,7 @@
__pycache__/
*.pyc
*.pyd
*.so
*.ipynb
.ipynb_checkpoints

21
.readthedocs.yml Normal file
View File

@@ -0,0 +1,21 @@
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
# Build all formats
formats: all
# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.7
install:
- requirements: docs/requirements.txt
- method: setuptools
path: .

View File

@@ -114,7 +114,7 @@ Version 0.4.1
Version 0.4.2
--------------------
- Refactor DataHandler
- Add ``ALPHA360`` DataHandler
- Add ``Alpha360`` DataHandler
Version 0.4.3

View File

@@ -34,6 +34,7 @@ For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative
- [More About Qlib](#more-about-qlib)
- [Offline Mode and Online Mode](#offline-mode-and-online-mode)
- [Performance of Qlib Data Server](#performance-of-qlib-data-server)
- [Related Reports](#related-reports)
- [Contributing](#contributing)
@@ -61,17 +62,36 @@ At the module level, Qlib is a platform that consists of the above components. T
This quick start guide tries to demonstrate
1. It's very easy to build a complete Quant research workflow and try your ideas with _Qlib_.
1. Though with *public data* and *simple models*, machine learning technologies **work very well** in practical Quant investment.
2. Though with *public data* and *simple models*, machine learning technologies **work very well** in practical Quant investment.
Here is a quick **[demo](https://terminalizer.com/view/3f24561a4470)** shows how to install ``Qlib``, and run LightGBM with ``qrun``. **But**, please make sure you have already prepared the data following the [instruction](#data-preparation).
## Installation
Users can easily install ``Qlib`` by pip according to the following command
This table demonstrates the supported Python version of `Qlib`:
| | install with pip | install from source | plot |
| ------------- |:---------------------:|:--------------------:|:----:|
| Python 3.6 | :heavy_check_mark: | :heavy_check_mark: (only with `Anaconda`) | :heavy_check_mark: |
| Python 3.7 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| Python 3.8 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| Python 3.9 | :x: | :heavy_check_mark: | :x: |
**Note**:
1. Please pay attention that installing cython in Python 3.6 will raise some error when installing ``Qlib`` from source. If users use Python 3.6 on their machines, it is recommended to *upgrade* Python to version 3.7 or use `conda`'s Python to install ``Qlib`` from source.
2. For Python 3.9, `Qlib` supports running workflows such as training models, doing backtest and plot most of the related figures (those included in [notebook](examples/workflow_by_code.ipynb)). However, plotting for the *model performance* is not supported for now and we will fix this when the dependent packages are upgraded in the future.
### Install with pip
Users can easily install ``Qlib`` by pip according to the following command.
```bash
pip install pyqlib
```
Also, users can install ``Qlib`` by the source code according to the following steps:
**Note**: pip will install the latest stable qlib. However, the main branch of qlib is in active development. If you want to test the latest scripts or functions in the main branch. Please install qlib with the methods below.
### Install from source
Also, users can install the latest dev version ``Qlib`` by the source code according to the following steps:
* Before installing ``Qlib`` from source, users need to install some dependencies:
@@ -80,13 +100,20 @@ Also, users can install ``Qlib`` by the source code according to the following s
pip install --upgrade cython
```
* Clone the repository and install ``Qlib``:
```bash
git clone https://github.com/microsoft/qlib.git && cd qlib
python setup.py install
```
* Clone the repository and install ``Qlib`` as follows.
* If you haven't installed qlib by the command ``pip install pyqlib`` before:
```bash
git clone https://github.com/microsoft/qlib.git && cd qlib
python setup.py install
```
* If you have already installed the stable version by the command ``pip install pyqlib``:
```bash
git clone https://github.com/microsoft/qlib.git && cd qlib
pip install .
```
**Note**: **Only** the command ``pip install .`` **can** overwrite the stable version installed by ``pip install pyqlib``, while the command ``python setup.py install`` **can't**.
**Tips**: If you fail to install `Qlib` or run the examples in your environment, comparing your steps and the [CI workflow](.github/workflows/test.yml) may help you find the problem.
## Data Preparation
Load and prepare data by running the following code:
@@ -130,12 +157,16 @@ Users could create the same dataset with it.
## Auto Quant Research Workflow
Qlib provides a tool named `qrun` to run the whole workflow automatically (including building dataset, training models, backtest and evaluation). You can start an auto quant research workflow and have a graphical reports analysis according to the following steps:
1. Quant Research Workflow: Run `qrun` with lightgbm workflow config ([workflow_config_lightgbm.yaml](examples/benchmarks/LightGBM/workflow_config_lightgbm.yaml)) as following.
1. Quant Research Workflow: Run `qrun` with lightgbm workflow config ([workflow_config_lightgbm_Alpha158.yaml](examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml) as following.
```bash
cd examples # Avoid running program under the directory contains `qlib`
qrun benchmarks/LightGBM/workflow_config_lightgbm.yaml
qrun benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
```
The result of `qrun` is as follows, please refer to please refer to [Intraday Trading](https://qlib.readthedocs.io/en/latest/component/backtest.html) for more details about the result.
If users want to use `qrun` under debug mode, please use the following command:
```bash
python -m pdb qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
```
The result of `qrun` is as follows, please refer to [Intraday Trading](https://qlib.readthedocs.io/en/latest/component/backtest.html) for more details about the result.
```bash
@@ -153,9 +184,6 @@ Qlib provides a tool named `qrun` to run the whole workflow automatically (inclu
annualized_return 0.128982
information_ratio 1.444287
max_drawdown -0.091078
```
Here are detailed documents for `qrun` and [workflow](https://qlib.readthedocs.io/en/latest/component/workflow.html).
@@ -193,19 +221,22 @@ The automatic workflow may not suite the research workflow of all Quant research
# [Quant Model Zoo](examples/benchmarks)
Here is a list of models built on `Qlib`.
- [GBDT based on LightGBM](qlib/contrib/model/gbdt.py)
- [GBDT based on Catboost](qlib/contrib/model/catboost_model.py)
- [GBDT based on XGBoost](qlib/contrib/model/xgboost.py)
- [GBDT based on XGBoost (Tianqi Chen, et al. 2016)](qlib/contrib/model/xgboost.py)
- [GBDT based on LightGBM (Guolin Ke, et al. 2017)](qlib/contrib/model/gbdt.py)
- [GBDT based on Catboost (Liudmila Prokhorenkova, et al. 2017)](qlib/contrib/model/catboost_model.py)
- [MLP based on pytorch](qlib/contrib/model/pytorch_nn.py)
- [GRU based on pytorch](qlib/contrib/model/pytorch_gru.py)
- [LSTM based on pytorcn](qlib/contrib/model/pytorch_lstm.py)
- [ALSTM based on pytorcn](qlib/contrib/model/pytorch_alstm.py)
- [GATs based on pytorch](qlib/contrib/model/pytorch_gats.py)
- [SFM based on pytorch](qlib/contrib/model/pytorch_sfm.py)
<!-- - [TFT based on tensorflow](examples/benchmarks/TFT/tft.py) -->
- [LSTM based on pytorch (Sepp Hochreiter, et al. 1997)](qlib/contrib/model/pytorch_lstm.py)
- [GRU based on pytorch (Kyunghyun Cho, et al. 2014)](qlib/contrib/model/pytorch_gru.py)
- [ALSTM based on pytorch (Yao Qin, et al. 2017)](qlib/contrib/model/pytorch_alstm.py)
- [GATs based on pytorch (Petar Velickovic, et al. 2017)](qlib/contrib/model/pytorch_gats.py)
- [SFM based on pytorch (Liheng Zhang, et al. 2017)](qlib/contrib/model/pytorch_sfm.py)
- [TFT based on tensorflow (Bryan Lim, et al. 2019)](examples/benchmarks/TFT/tft.py)
- [TabNet based on pytorch (Sercan O. Arik, et al. 2019)](qlib/contrib/model/pytorch_tabnet.py)
Your PR of new Quant models is highly welcomed.
The performance of each model on the `Alpha158` and `Alpha360` dataset can be found [here](examples/benchmarks/README.md).
## Run a single model
All the models listed above are runnable with ``Qlib``. Users can find the config files we provide and some details about the model through the [benchmarks](examples/benchmarks) folder. More information can be retrieved at the model files listed above.
@@ -216,9 +247,9 @@ All the models listed above are runnable with ``Qlib``. Users can find the confi
- User can use the script [`run_all_model.py`](examples/run_all_model.py) listed in the `examples` folder to run a model. Here is an example of the specific shell command to be used: `python run_all_model.py --models=lightgbm`, where the `--models` arguments can take any number of models listed above(the available models can be found in [benchmarks](examples/benchmarks/)). For more use cases, please refer to the file's [docstrings](examples/run_all_model.py).
## Run multiple models
`Qlib` also provides a script [`run_all_model.py`](examples/run_all_model.py) which can run multiple models for several iterations. (**Note**: the script only supprots *Linux* now. Other OS will be supported in the future.)
`Qlib` also provides a script [`run_all_model.py`](examples/run_all_model.py) which can run multiple models for several iterations. (**Note**: the script only support *Linux* for now. Other OS will be supported in the future. Besides, it doesn't support parrallel running the same model for multiple times as well, and this will be fixed in the future development too.)
The script will create a unique virtual environment for each model, and delete the environments after training. Thus, only experiment results such as `IC` and `backtest` results will be generated and stored. (**Note**: the script will erase your previous experiment records created by running itself.)
The script will create a unique virtual environment for each model, and delete the environments after training. Thus, only experiment results such as `IC` and `backtest` results will be generated and stored.
Here is an example of running all the models for 10 iterations:
```python
@@ -229,12 +260,12 @@ It also provides the API to run specific models at once. For more use cases, ple
# Quant Dataset Zoo
Dataset plays a very important role in Quant. Here is a list of the datasets built on `Qlib`.
Dataset plays a very important role in Quant. Here is a list of the datasets built on `Qlib`:
| Dataset | US Market | China Market |
| -- | -- | -- |
| [Alpha360](./qlib/contrib/data/handler.py) | √ | √ |
| [Alpha158](./qlib/contrib/data/handler.py) | √ | √ |
| [Alpha158](./qlib/contrib/data/handler.py) | √ | √ |
[Here](https://qlib.readthedocs.io/en/latest/advanced/alpha.html) is a tutorial to build dataset with `Qlib`.
Your PR to build new Quant dataset is highly welcomed.
@@ -281,7 +312,11 @@ Such overheads greatly slow down the data loading process.
Qlib data are stored in a compact format, which is efficient to be combined into arrays for scientific computation.
# Related Reports
- [Guide To Qlib: Microsofts AI Investment Platform](https://analyticsindiamag.com/qlib/)
- [【华泰金工林晓明团队】微软AI量化投资平台Qlib体验——华泰人工智能系列之四十](https://mp.weixin.qq.com/s/Brcd7im4NibJOJzZfMn6tQ)
- [微软也搞AI量化平台还是开源的](https://mp.weixin.qq.com/s/47bP5YwxfTp2uTHjUBzJQQ)
- [微矿Qlib业内首个AI量化投资开源平台](https://mp.weixin.qq.com/s/vsJv7lsgjEi-ALYUz4CvtQ)
# Contributing

12
docs/_static/demo.sh vendored Normal file
View File

@@ -0,0 +1,12 @@
#!/bin/sh
git clone https://github.com/microsoft/qlib.git
cd qlib
ls
pip install pyqlib
# or
# pip install numpy
# pip install --upgrade cython
# python setup.py install
cd examples
ls
qrun benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml

View File

@@ -50,57 +50,37 @@ Users can use ``Data Handler`` to build formulaic alphas `MACD` in qlib:
.. code-block:: python
>> from qlib.data.dataset.handler import QLibDataHandler
>> from qlib.data.dataset.loader import QlibDataLoader
>> MACD_EXP = '(EMA($close, 12) - EMA($close, 26))/$close - EMA((EMA($close, 12) - EMA($close, 26))/$close, 9)/$close'
>> fields = [MACD_EXP] # MACD
>> names = ['MACD']
>> labels = ['$close'] # label
>> labels = ['Ref($close, -2)/Ref($close, -1) - 1'] # label
>> label_names = ['LABEL']
>> data_handler = QLibDataHandler(start_date='2010-01-01', end_date='2017-12-31', fields=fields, names=names, labels=labels, label_names=label_names)
>> TRAINER_CONFIG = {
.. "train_start_date": "2007-01-01",
.. "train_end_date": "2014-12-31",
.. "validate_start_date": "2015-01-01",
.. "validate_end_date": "2016-12-31",
.. "test_start_date": "2017-01-01",
.. "test_end_date": "2020-08-01",
>> data_loader_config = {
.. "feature": (fields, names),
.. "label": (labels, label_names)
.. }
>> feature_train, label_train, feature_validate, label_validate, feature_test, label_test = data_handler.get_split_data(**TRAINER_CONFIG)
>> print(feature_train, label_train)
MACD
instrument datetime
SH600000 2010-01-04 -0.008625
2010-01-05 -0.007234
2010-01-06 -0.007693
2010-01-07 -0.009633
2010-01-08 -0.009891
... ...
SZ300251 2014-12-25 0.043072
2014-12-26 0.041345
2014-12-29 0.042733
2014-12-30 0.042066
2014-12-31 0.036299
[322025 rows x 1 columns]
LABEL
instrument datetime
SH600000 2010-01-04 4.260015
2010-01-05 4.292182
2010-01-06 4.207747
2010-01-07 4.113258
2010-01-08 4.159496
... ...
SZ300251 2014-12-25 4.343212
2014-12-26 4.470587
2014-12-29 4.762474
2014-12-30 4.369748
2014-12-31 4.182222
[322025 rows x 1 columns]
>> data_loader = QlibDataLoader(config=data_loader_config)
>> df = data_loader.load(instruments='csi300', start_time='2010-01-01', end_time='2017-12-31')
>> print(df)
feature label
MACD LABEL
datetime instrument
2010-01-04 SH600000 -0.011547 -0.019672
SH600004 0.002745 -0.014721
SH600006 0.010133 0.002911
SH600008 -0.001113 0.009818
SH600009 0.025878 -0.017758
... ... ...
2017-12-29 SZ300124 0.007306 -0.005074
SZ300136 -0.013492 0.056352
SZ300144 -0.000966 0.011853
SZ300251 0.004383 0.021739
SZ300315 -0.030557 0.012455
Reference
===========
To learn more about ``Data Handler``, please refer to `Data Handler <../component/data.html>`_
To learn more about ``Data Loader``, please refer to `Data Loader <../component/data.html#data-loader>`_
To learn more about ``Data API``, please refer to `Data API <../component/data.html>`_

View File

@@ -126,17 +126,17 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
The arguments of `--include_fields` should correspond with the column names of CSV files. The columns names of dataset provided by ``Qlib`` should include open, close, high, low, volume and factor at least.
- `open`
The opening price
The adjusted opening price
- `close`
The closing price
The adjusted closing price
- `high`
The highest price
The adjusted highest price
- `low`
The lowest price
The adjusted lowest price
- `volume`
The trading volume
The adjusted trading volume
- `factor`
The Restoration factor
The Restoration factor. Normally, ``factor = adjusted_price / original_price``, `adjusted price` reference: `split adjusted <https://www.investopedia.com/terms/s/splitadjusted.asp>`_
In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended.
@@ -195,6 +195,7 @@ Feature
- `ExpressionOps`
`ExpressionOps` will use operator for feature construction.
To know more about ``Operator``, please refer to `Operator API <../reference/api.html#module-qlib.data.ops>`_.
Also, ``Qlib`` supports users to define their own custom ``Operator``, an example has been given in ``tests/test_register_ops.py``.
To know more about ``Feature``, please refer to `Feature API <../reference/api.html#module-qlib.data.base>`_.
@@ -295,6 +296,7 @@ The ``Processor`` module in ``Qlib`` is designed to be learnable and it is respo
- ``RobustZScoreNorm``: `processor` that applies robust z-score normalization.
- ``CSZScoreNorm``: `processor` that applies cross sectional z-score normalization.
- ``CSRankNorm``: `processor` that applies cross sectional rank normalization.
- ``CSZFillna``: `processor` that fills N/A values in a cross sectional way by the mean of the column.
Users can also create their own `processor` by inheriting the base class of ``Processor``. Please refer to the implementation of all the processors for more information (`Processor Link <https://github.com/microsoft/qlib/blob/main/qlib/data/dataset/processor.py>`_).

View File

@@ -34,8 +34,9 @@ Here is a general view of the structure of the system:
- Recorder 2
- ...
- ...
This experiment management system defines a set of interface and provided a concrete implementation based on the machine learning platform: ``MLFlow`` (`link <https://mlflow.org/>`_).
This experiment management system defines a set of interface and provided a concrete implementation ``MLflowExpManager``, which is based on the machine learning platform: ``MLFlow`` (`link <https://mlflow.org/>`_).
If users set the implementation of ``ExpManager`` to be ``MLflowExpManager``, they can use the command `mlflow ui` to visualize and check the experiment results. For more information, pleaes refer to the related documents `here <https://www.mlflow.org/docs/latest/cli.html#mlflow-ui>`_.
Qlib Recorder
===================
@@ -91,7 +92,7 @@ Record Template
The ``RecordTemp`` class is a class that enables generate experiment results such as IC and backtest in a certain format. We have provided three different `Record Template` class:
- ``SignalRecord``: This class generates the `preidction` results of the model.
- ``SignalRecord``: This class generates the `prediction` results of the model.
- ``SigAnaRecord``: This class generates the `IC`, `ICIR`, `Rank IC` and `Rank ICIR` of the model.
- ``PortAnaRecord``: This class generates the results of `backtest`. The detailed information about `backtest` as well as the available `strategy`, users can refer to `Strategy <../component/strategy.html>`_ and `Backtest <../component/backtest.html>`_.

View File

@@ -103,6 +103,12 @@ After saving the config into `configuration.yaml`, users could start the workflo
qrun configuration.yaml
If users want to use ``qrun`` under debug mode, please use the following command:
.. code-block:: bash
python -m pdb qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
.. note::
`qrun` will be placed in your $PATH directory when installing ``Qlib``.

View File

@@ -226,3 +226,8 @@ epub_exclude_files = ["search.html"]
autodoc_member_order = "bysource"
autodoc_default_flags = ["members"]
autodoc_default_options = {
"members": True,
"member-order": "bysource",
"special-members": "__init__",
}

View File

@@ -1 +1,5 @@
Cython==0.29.21
Cython
cmake
numpy
scipy
scikit-learn

View File

@@ -63,6 +63,7 @@ Besides `provider_uri` and `region`, `qlib.init` has other parameters. The follo
If Qlib fails to connect redis via `redis_host` and `redis_port`, cache mechanism will not be used! Please refer to `Cache <../component/data.html#cache>`_ for details.
- `exp_manager`
Type: dict, optional parameter, the setting of `experiment manager` to be used in qlib. Users can specify an experiment manager class, as well as the tracking URI for all the experiments. However, please be aware that we only support input of a dictionary in the following style for `exp_manager`. For more information about `exp_manager`, users can refer to `Recorder: Experiment Management <../component/recorder.html>`_.
.. code-block:: Python
# For example, if you want to set your tracking_uri to a <specific folder>, you can initialize qlib below

View File

@@ -1,6 +1,6 @@
# Requirements
Here is the minimal hardware requirements to run the example.
Here is the minimal hardware requirements to run the `workflow_by_code` example.
- Memory: 16G
- Free Disk: 5G

View File

@@ -0,0 +1,93 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors:
- class: FilterCol
kwargs:
fields_group: feature
col_list: ["RESI5", "WVMA5", "RSQR5", "KLEN", "RSQR10", "CORR5", "CORD5", "CORR10",
"ROC60", "RESI10", "VSTD5", "RSQR60", "CORR60", "WVMA60", "STD5",
"RSQR20", "CORD60", "CORD10", "CORR20", "KLOW"
]
- class: RobustZScoreNorm
kwargs:
fields_group: feature
clip_outlier: true
- class: Fillna
kwargs:
fields_group: feature
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: ALSTM
module_path: qlib.contrib.model.pytorch_alstm_ts
kwargs:
d_feat: 20
hidden_size: 64
num_layers: 2
dropout: 0.0
n_epochs: 200
lr: 1e-3
early_stop: 10
batch_size: 800
metric: loss
loss: mse
n_jobs: 20
GPU: 0
rnn_type: GRU
dataset:
class: TSDatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha158
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
step_len: 20
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -54,7 +54,6 @@ task:
batch_size: 800
metric: loss
loss: mse
seed: 0
GPU: 0
rnn_type: GRU
dataset:
@@ -62,7 +61,7 @@ task:
module_path: qlib.data.dataset
kwargs:
handler:
class: ALPHA360
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:

View File

@@ -0,0 +1,72 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors: []
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: CatBoostModel
module_path: qlib.contrib.model.catboost_model
kwargs:
loss: RMSE
learning_rate: 0.0421
subsample: 0.8789
max_depth: 6
num_leaves: 100
thread_count: 20
grow_policy: Lossguide
bootstrap_type: Poisson
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -0,0 +1,92 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors:
- class: FilterCol
kwargs:
fields_group: feature
col_list: ["RESI5", "WVMA5", "RSQR5", "KLEN", "RSQR10", "CORR5", "CORD5", "CORR10",
"ROC60", "RESI10", "VSTD5", "RSQR60", "CORR60", "WVMA60", "STD5",
"RSQR20", "CORD60", "CORD10", "CORR20", "KLOW"
]
- class: RobustZScoreNorm
kwargs:
fields_group: feature
clip_outlier: true
- class: Fillna
kwargs:
fields_group: feature
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: GATs
module_path: qlib.contrib.model.pytorch_gats_ts
kwargs:
d_feat: 20
hidden_size: 64
num_layers: 2
dropout: 0.7
n_epochs: 200
lr: 1e-4
early_stop: 10
metric: loss
loss: mse
base_model: LSTM
with_pretrain: True
model_path: "benchmarks/LSTM/csi300_lstm_ts.pkl"
GPU: 0
dataset:
class: TSDatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha158
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
step_len: 20
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -56,14 +56,13 @@ task:
base_model: LSTM
with_pretrain: True
model_path: "benchmarks/LSTM/model_lstm_csi300.pkl"
seed: 0
GPU: 0
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: ALPHA360
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
@@ -74,6 +73,11 @@ task:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:

Binary file not shown.

View File

@@ -0,0 +1,92 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors:
- class: FilterCol
kwargs:
fields_group: feature
col_list: ["RESI5", "WVMA5", "RSQR5", "KLEN", "RSQR10", "CORR5", "CORD5", "CORR10",
"ROC60", "RESI10", "VSTD5", "RSQR60", "CORR60", "WVMA60", "STD5",
"RSQR20", "CORD60", "CORD10", "CORR20", "KLOW"
]
- class: RobustZScoreNorm
kwargs:
fields_group: feature
clip_outlier: true
- class: Fillna
kwargs:
fields_group: feature
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: GRU
module_path: qlib.contrib.model.pytorch_gru_ts
kwargs:
d_feat: 20
hidden_size: 64
num_layers: 2
dropout: 0.0
n_epochs: 200
lr: 2e-4
early_stop: 10
batch_size: 800
metric: loss
loss: mse
n_jobs: 20
GPU: 0
dataset:
class: TSDatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha158
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
step_len: 20
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -54,14 +54,13 @@ task:
batch_size: 800
metric: loss
loss: mse
seed: 0
GPU: 0
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: ALPHA360
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:

Binary file not shown.

View File

@@ -0,0 +1,92 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors:
- class: FilterCol
kwargs:
fields_group: feature
col_list: ["RESI5", "WVMA5", "RSQR5", "KLEN", "RSQR10", "CORR5", "CORD5", "CORR10",
"ROC60", "RESI10", "VSTD5", "RSQR60", "CORR60", "WVMA60", "STD5",
"RSQR20", "CORD60", "CORD10", "CORR20", "KLOW"
]
- class: RobustZScoreNorm
kwargs:
fields_group: feature
clip_outlier: true
- class: Fillna
kwargs:
fields_group: feature
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: LSTM
module_path: qlib.contrib.model.pytorch_lstm_ts
kwargs:
d_feat: 20
hidden_size: 64
num_layers: 2
dropout: 0.0
n_epochs: 200
lr: 1e-3
early_stop: 10
batch_size: 800
metric: loss
loss: mse
n_jobs: 20
GPU: 0
dataset:
class: TSDatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha158
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
step_len: 20
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -54,14 +54,13 @@ task:
batch_size: 800
metric: loss
loss: mse
seed: 0
GPU: 0
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: ALPHA360
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:

View File

@@ -32,7 +32,7 @@ task:
kwargs:
loss: mse
colsample_bytree: 0.8879
learning_rate: 0.0421
learning_rate: 0.2
subsample: 0.8789
lambda_l1: 205.6999
lambda_l2: 580.9768

View File

@@ -0,0 +1,73 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors: []
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: LGBModel
module_path: qlib.contrib.model.gbdt
kwargs:
loss: mse
colsample_bytree: 0.8879
learning_rate: 0.0421
subsample: 0.8789
lambda_l1: 205.6999
lambda_l2: 580.9768
max_depth: 8
num_leaves: 210
num_threads: 20
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -65,8 +65,9 @@ task:
lr_decay_steps: 100
optimizer: adam
max_steps: 8000
batch_size: 4096
batch_size: 8192
GPU: 0
weight_decay: 0.0002
dataset:
class: DatasetH
module_path: qlib.data.dataset

View File

@@ -0,0 +1,82 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors:
- class: RobustZScoreNorm
kwargs:
fields_group: feature
clip_outlier: true
- class: Fillna
kwargs:
fields_group: feature
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: DNNModelPytorch
module_path: qlib.contrib.model.pytorch_nn
kwargs:
loss: mse
input_dim: 360
output_dim: 1
lr: 0.002
lr_decay: 0.96
lr_decay_steps: 100
optimizer: adam
max_steps: 8000
batch_size: 4096
GPU: 0
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -0,0 +1,35 @@
# Benchmarks Performance
Here are the results of each benchmark model running on Qlib's `Alpha360` and `Alpha158` dataset with China's A shared-stock & CSI300 data respectively. The values of each metric are the mean and std calculated based on 20 runs.
The numbers shown below demonstrate the performance of the entire `workflow` of each model. We will update the `workflow` as well as models in the near future for better results.
## Alpha360 dataset
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
|---|---|---|---|---|---|---|---|---|
| Linear | Alpha360 | 0.0150±0.00 | 0.1049±0.00| 0.0284±0.00 | 0.1970±0.00 | -0.0659±0.00 | -0.7072±0.00| -0.2955±0.00 |
| CatBoost (Liudmila Prokhorenkova, et al.) | Alpha360 | 0.0397±0.00 | 0.2878±0.00| 0.0470±0.00 | 0.3703±0.00 | 0.0342±0.00 | 0.4092±0.00| -0.1057±0.00 |
| XGBoost (Tianqi Chen, et al.) | Alpha360 | 0.0400±0.00 | 0.3031±0.00| 0.0461±0.00 | 0.3862±0.00 | 0.0528±0.00 | 0.6307±0.00| -0.1113±0.00 |
| LightGBM (Guolin Ke, et al.) | Alpha360 | 0.0399±0.00 | 0.3075±0.00| 0.0492±0.00 | 0.4019±0.00 | 0.0323±0.00 | 0.4370±0.00| -0.0917±0.00 |
| MLP | Alpha360 | 0.0285±0.00 | 0.1981±0.02| 0.0402±0.00 | 0.2993±0.02 | 0.0073±0.02 | 0.0880±0.22| -0.1446±0.03 |
| GRU (Kyunghyun Cho, et al.) | Alpha360 | 0.0490±0.01 | 0.3787±0.05| 0.0581±0.00 | 0.4664±0.04 | 0.0726±0.02 | 0.9817±0.34| -0.0902±0.03 |
| LSTM (Sepp Hochreiter, et al.) | Alpha360 | 0.0443±0.01 | 0.3401±0.05| 0.0536±0.01 | 0.4248±0.05 | 0.0627±0.03 | 0.8441±0.48| -0.0882±0.03 |
| ALSTM (Yao Qin, et al.) | Alpha360 | 0.0493±0.01 | 0.3778±0.06| 0.0585±0.00 | 0.4606±0.04 | 0.0513±0.03 | 0.6727±0.38| -0.1085±0.02 |
| GATs (Petar Velickovic, et al.) | Alpha360 | 0.0475±0.00 | 0.3515±0.02| 0.0592±0.00 | 0.4585±0.01 | 0.0876±0.02 | 1.1513±0.27| -0.0795±0.02 |
## Alpha158 dataset
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
|---|---|---|---|---|---|---|---|---|
| Linear | Alpha158 | 0.0393±0.00 | 0.2980±0.00| 0.0475±0.00 | 0.3546±0.00 | 0.0795±0.00 | 1.0712±0.00| -0.1449±0.00 |
| CatBoost (Liudmila Prokhorenkova, et al.) | Alpha158 | 0.0503±0.00 | 0.3586±0.00| 0.0483±0.00 | 0.3667±0.00 | 0.1080±0.00 | 1.1561±0.00| -0.0787±0.00 |
| XGBoost (Tianqi Chen, et al.) | Alpha158 | 0.0481±0.00 | 0.3659±0.00| 0.0495±0.00 | 0.4033±0.00 | 0.1111±0.00 | 1.2915±0.00| -0.0893±0.00 |
| LightGBM (Guolin Ke, et al.) | Alpha158 | 0.0475±0.00 | 0.3979±0.00| 0.0485±0.00 | 0.4123±0.00 | 0.1143±0.00 | 1.2744±0.00| -0.0800±0.00 |
| MLP | Alpha158 | 0.0358±0.00 | 0.2738±0.03| 0.0425±0.00 | 0.3221±0.01 | 0.0836±0.02 | 1.0323±0.25| -0.1127±0.02 |
| TabNet with pretrain (Sercan O. Arikm et al) | Alpha158 | 0.0344±0.00|0.205±0.11|0.0398±0.00 |0.3479±0.01|0.0827±0.02|1.1141±0.32 |-0.0925±0.02 |
| TFT (Bryan Lim, et al.) | Alpha158 (with selected 20 features) | 0.0343±0.00 | 0.2071±0.02| 0.0107±0.00 | 0.0660±0.02 | 0.0623±0.02 | 0.5818±0.20| -0.1762±0.01 |
| GRU (Kyunghyun Cho, et al.) | Alpha158 (with selected 20 features) | 0.0311±0.00 | 0.2418±0.04| 0.0425±0.00 | 0.3434±0.02 | 0.0330±0.02 | 0.4805±0.30| -0.1021±0.02 |
| LSTM (Sepp Hochreiter, et al.) | Alpha158 (with selected 20 features) | 0.0312±0.00 | 0.2394±0.04| 0.0418±0.00 | 0.3324±0.03 | 0.0298±0.02 | 0.4198±0.33| -0.1348±0.03 |
| ALSTM (Yao Qin, et al.) | Alpha158 (with selected 20 features) | 0.0385±0.01 | 0.3022±0.06| 0.0478±0.00 | 0.3874±0.04 | 0.0486±0.03 | 0.7141±0.45| -0.1088±0.03 |
| GATs (Petar Velickovic, et al.) | Alpha158 (with selected 20 features) | 0.0349±0.00 | 0.2511±0.01| 0.0457±0.00 | 0.3537±0.01 | 0.0578±0.02 | 0.8221±0.25| -0.0824±0.02 |
- The selected 20 features are based on the feature importance of a lightgbm-based model.

View File

@@ -1,3 +1,3 @@
# State-Frequency-Memory
- State Frequency Memory (SFM) is a novel recurrent network that uses Discrete Fourier Transform to decompose the hidden states of memory cells and capture the multi-frequency trading patterns from past market data to make stock price predictions.
- Paper: Stock Price Prediction via Discovering Multi-Frequency Trading Patterns. [https://www.cs.ucf.edu/~gqi/publications/kdd2017_stock.pdf.](https://www.cs.ucf.edu/~gqi/publications/kdd2017_stock.pdf.)
- Paper: Stock Price Prediction via Discovering Multi-Frequency Trading Patterns. [http://www.eecs.ucf.edu/~gqi/publications/kdd2017_stock.pdf.](http://www.eecs.ucf.edu/~gqi/publications/kdd2017_stock.pdf)

View File

@@ -57,14 +57,13 @@ task:
eval_steps: 5
loss: mse
optimizer: adam
GPU: 1
seed: 710
GPU: 0
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: ALPHA360
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:

View File

@@ -1,219 +1,229 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Custom formatting functions for Alpha158 dataset.
Defines dataset specific column definitions and data transformations.
"""
import data_formatters.base
import libs.utils as utils
import sklearn.preprocessing
GenericDataFormatter = data_formatters.base.GenericDataFormatter
DataTypes = data_formatters.base.DataTypes
InputTypes = data_formatters.base.InputTypes
class Alpha158Formatter(GenericDataFormatter):
"""Defines and formats data for the Alpha158 dataset.
Attributes:
column_definition: Defines input and data type of column used in the
experiment.
identifiers: Entity identifiers used in experiments.
"""
_column_definition = [
("instrument", DataTypes.CATEGORICAL, InputTypes.ID),
("LABEL0", DataTypes.REAL_VALUED, InputTypes.TARGET),
("date", DataTypes.DATE, InputTypes.TIME),
("month", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
("day_of_week", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
# Selected 10 features
("RESI5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("WVMA5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("RSQR5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("KLEN", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("RSQR10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("CORR5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("CORD5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("CORR10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("ROC60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("RESI10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("const", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
]
def __init__(self):
"""Initialises formatter."""
self.identifiers = None
self._real_scalers = None
self._cat_scalers = None
self._target_scaler = None
self._num_classes_per_cat_input = None
def split_data(self, df, valid_boundary=2016, test_boundary=2018):
"""Splits data frame into training-validation-test data frames.
This also calibrates scaling object, and transforms data for each split.
Args:
df: Source data frame to split.
valid_boundary: Starting year for validation data
test_boundary: Starting year for test data
Returns:
Tuple of transformed (train, valid, test) data.
"""
print("Formatting train-valid-test splits.")
index = df["year"]
train = df.loc[index < valid_boundary]
valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
test = df.loc[index >= test_boundary]
self.set_scalers(train)
return (self.transform_inputs(data) for data in [train, valid, test])
def set_scalers(self, df):
"""Calibrates scalers using the data supplied.
Args:
df: Data to use to calibrate scalers.
"""
print("Setting scalers with training data...")
column_definitions = self.get_column_definition()
id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions)
# Extract identifiers in case required
self.identifiers = list(df[id_column].unique())
# Format real scalers
real_inputs = utils.extract_cols_from_data_type(
DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
)
data = df[real_inputs].values
self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
df[[target_column]].values
) # used for predictions
# Format categorical scalers
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
)
categorical_scalers = {}
num_classes = []
for col in categorical_inputs:
# Set all to str so that we don't have mixed integer/string columns
srs = df[col].apply(str)
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
num_classes.append(srs.nunique())
# Set categorical scaler outputs
self._cat_scalers = categorical_scalers
self._num_classes_per_cat_input = num_classes
def transform_inputs(self, df):
"""Performs feature transformations.
This includes both feature engineering, preprocessing and normalisation.
Args:
df: Data frame to transform.
Returns:
Transformed data frame.
"""
output = df.copy()
if self._real_scalers is None and self._cat_scalers is None:
raise ValueError("Scalers have not been set!")
column_definitions = self.get_column_definition()
real_inputs = utils.extract_cols_from_data_type(
DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
)
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
)
# Format real inputs
output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
# Format categorical inputs
for col in categorical_inputs:
string_df = df[col].apply(str)
output[col] = self._cat_scalers[col].transform(string_df)
return output
def format_predictions(self, predictions):
"""Reverts any normalisation to give predictions in original scale.
Args:
predictions: Dataframe of model predictions.
Returns:
Data frame of unnormalised predictions.
"""
output = predictions.copy()
column_names = predictions.columns
for col in column_names:
if col not in {"forecast_time", "identifier"}:
output[col] = self._target_scaler.inverse_transform(predictions[col])
return output
# Default params
def get_fixed_params(self):
"""Returns fixed model parameters for experiments."""
fixed_params = {
"total_time_steps": 6 + 6,
"num_encoder_steps": 6,
"num_epochs": 100,
"early_stopping_patience": 10,
"multiprocessing_workers": 5,
}
return fixed_params
def get_default_model_params(self):
"""Returns default optimised model parameters."""
model_params = {
"dropout_rate": 0.4,
"hidden_layer_size": 16,
"learning_rate": 0.0001,
"minibatch_size": 128,
"max_gradient_norm": 0.0135,
"num_heads": 1,
"stack_size": 1,
}
return model_params
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Custom formatting functions for Alpha158 dataset.
Defines dataset specific column definitions and data transformations.
"""
import data_formatters.base
import libs.utils as utils
import sklearn.preprocessing
GenericDataFormatter = data_formatters.base.GenericDataFormatter
DataTypes = data_formatters.base.DataTypes
InputTypes = data_formatters.base.InputTypes
class Alpha158Formatter(GenericDataFormatter):
"""Defines and formats data for the Alpha158 dataset.
Attributes:
column_definition: Defines input and data type of column used in the
experiment.
identifiers: Entity identifiers used in experiments.
"""
_column_definition = [
("instrument", DataTypes.CATEGORICAL, InputTypes.ID),
("LABEL0", DataTypes.REAL_VALUED, InputTypes.TARGET),
("date", DataTypes.DATE, InputTypes.TIME),
("month", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
("day_of_week", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
# Selected features
("RESI5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("WVMA5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("RSQR5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("KLEN", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("RSQR10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("CORR5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("CORD5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("CORR10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("ROC60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("RESI10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("VSTD5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("RSQR60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("CORR60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("WVMA60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("STD5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("RSQR20", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("CORD60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("CORD10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("CORR20", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("KLOW", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
("const", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
]
def __init__(self):
"""Initialises formatter."""
self.identifiers = None
self._real_scalers = None
self._cat_scalers = None
self._target_scaler = None
self._num_classes_per_cat_input = None
def split_data(self, df, valid_boundary=2016, test_boundary=2018):
"""Splits data frame into training-validation-test data frames.
This also calibrates scaling object, and transforms data for each split.
Args:
df: Source data frame to split.
valid_boundary: Starting year for validation data
test_boundary: Starting year for test data
Returns:
Tuple of transformed (train, valid, test) data.
"""
print("Formatting train-valid-test splits.")
index = df["year"]
train = df.loc[index < valid_boundary]
valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
test = df.loc[index >= test_boundary]
self.set_scalers(train)
return (self.transform_inputs(data) for data in [train, valid, test])
def set_scalers(self, df):
"""Calibrates scalers using the data supplied.
Args:
df: Data to use to calibrate scalers.
"""
print("Setting scalers with training data...")
column_definitions = self.get_column_definition()
id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions)
# Extract identifiers in case required
self.identifiers = list(df[id_column].unique())
# Format real scalers
real_inputs = utils.extract_cols_from_data_type(
DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
)
data = df[real_inputs].values
self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
df[[target_column]].values
) # used for predictions
# Format categorical scalers
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
)
categorical_scalers = {}
num_classes = []
for col in categorical_inputs:
# Set all to str so that we don't have mixed integer/string columns
srs = df[col].apply(str)
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
num_classes.append(srs.nunique())
# Set categorical scaler outputs
self._cat_scalers = categorical_scalers
self._num_classes_per_cat_input = num_classes
def transform_inputs(self, df):
"""Performs feature transformations.
This includes both feature engineering, preprocessing and normalisation.
Args:
df: Data frame to transform.
Returns:
Transformed data frame.
"""
output = df.copy()
if self._real_scalers is None and self._cat_scalers is None:
raise ValueError("Scalers have not been set!")
column_definitions = self.get_column_definition()
real_inputs = utils.extract_cols_from_data_type(
DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
)
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
)
# Format real inputs
output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
# Format categorical inputs
for col in categorical_inputs:
string_df = df[col].apply(str)
output[col] = self._cat_scalers[col].transform(string_df)
return output
def format_predictions(self, predictions):
"""Reverts any normalisation to give predictions in original scale.
Args:
predictions: Dataframe of model predictions.
Returns:
Data frame of unnormalised predictions.
"""
output = predictions.copy()
column_names = predictions.columns
for col in column_names:
if col not in {"forecast_time", "identifier"}:
output[col] = self._target_scaler.inverse_transform(predictions[col])
return output
# Default params
def get_fixed_params(self):
"""Returns fixed model parameters for experiments."""
fixed_params = {
"total_time_steps": 6 + 6,
"num_encoder_steps": 6,
"num_epochs": 100,
"early_stopping_patience": 10,
"multiprocessing_workers": 5,
}
return fixed_params
def get_default_model_params(self):
"""Returns default optimised model parameters."""
model_params = {
"dropout_rate": 0.4,
"hidden_layer_size": 160,
"learning_rate": 0.0001,
"minibatch_size": 128,
"max_gradient_norm": 0.0135,
"num_heads": 1,
"stack_size": 1,
}
return model_params

View File

@@ -25,7 +25,7 @@ import os
import data_formatters.qlib_Alpha158
class ExperimentConfig(object):
class ExperimentConfig:
"""Defines experiment configs and paths to outputs.
Attributes:

View File

@@ -320,7 +320,7 @@ class InterpretableMultiHeadAttention:
return outputs, attn
class TFTDataCache(object):
class TFTDataCache:
"""Caches data for the TFT."""
_data_cache = {}
@@ -348,7 +348,7 @@ class TFTDataCache(object):
# TFT model definitions.
class TemporalFusionTransformer(object):
class TemporalFusionTransformer:
"""Defines Temporal Fusion Transformer.
Attributes:
@@ -972,7 +972,7 @@ class TemporalFusionTransformer(object):
valid_quantiles = self.quantiles
output_size = self.output_size
class QuantileLossCalculator(object):
class QuantileLossCalculator:
"""Computes the combined quantile loss for prespecified quantiles.
Attributes:

View File

@@ -1,249 +1,291 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
import data_formatters.base
import expt_settings.configs
import libs.hyperparam_opt
import libs.tft_model
import libs.utils as utils
import os
import datetime as dte
from qlib.model.base import ModelFT
from qlib.data.dataset import DatasetH
from qlib.data.dataset.handler import DataHandlerLP
# To register new datasets, please add them here.
ALLOW_DATASET = ["Alpha158"]
DATASET_SETTING = {
"Alpha158": {
"feature_col": ["RESI5", "WVMA5", "RSQR5", "KLEN", "RSQR10", "CORR5", "CORD5", "CORR10", "ROC60", "RESI10"],
"label_col": ["LABEL0"],
},
}
# To register new datasets, please add their configurations here.
def get_shifted_label(data_df, shifts=5, col_shift="LABEL0"):
return data_df[[col_shift]].groupby("instrument").apply(lambda df: df.shift(shifts))
def fill_test_na(test_df):
test_df_res = test_df.copy()
feature_cols = ~test_df_res.columns.str.contains("label", case=False)
test_feature_fna = test_df_res.loc[:, feature_cols].groupby("datetime").apply(lambda df: df.fillna(df.mean()))
test_df_res.loc[:, feature_cols] = test_feature_fna
return test_df_res
def process_qlib_data(df, dataset, fillna=False):
"""Prepare data to fit the TFT model.
Args:
df: Original DataFrame.
fillna: Whether to fill the data with the mean values.
Returns:
Transformed DataFrame.
"""
# Several features selected manually
feature_col = DATASET_SETTING[dataset]["feature_col"]
label_col = DATASET_SETTING[dataset]["label_col"]
temp_df = df.loc[:, feature_col + label_col]
if fillna:
temp_df = fill_test_na(temp_df)
temp_df = temp_df.swaplevel()
temp_df = temp_df.sort_index()
temp_df = temp_df.reset_index(level=0)
dates = pd.to_datetime(temp_df.index)
temp_df["date"] = dates
temp_df["day_of_week"] = dates.dayofweek
temp_df["month"] = dates.month
temp_df["year"] = dates.year
temp_df["const"] = 1.0
return temp_df
def process_predicted(df, col_name):
"""Transform the TFT predicted data into Qlib format.
Args:
df: Original DataFrame.
fillna: New column name.
Returns:
Transformed DataFrame.
"""
df_res = df.copy()
df_res = df_res.rename(columns={"forecast_time": "datetime", "identifier": "instrument", "t+4": col_name})
df_res = df_res.set_index(["datetime", "instrument"]).sort_index()
df_res = df_res[[col_name]]
return df_res
def format_score(forecast_df, col_name="pred", label_shift=5):
pred = process_predicted(forecast_df, col_name=col_name)
pred = get_shifted_label(pred, shifts=-label_shift, col_shift=col_name)
pred = pred.dropna()[col_name]
return pred
def transform_df(df, col_name="LABEL0"):
df_res = df["feature"]
df_res[col_name] = df["label"]
return df_res
class TFTModel(ModelFT):
"""TFT Model"""
def __init__(self, **kwargs):
self.model = None
def _prepare_data(self, dataset: DatasetH):
df_train, df_valid = dataset.prepare(
["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
)
return transform_df(df_train), transform_df(df_valid)
def fit(
self,
dataset: DatasetH,
DATASET="Alpha158",
MODEL_FOLDER="qlib_alpha158_model",
LABEL_COL="LABEL0",
LABEL_SHIFT=5,
USE_GPU_ID=0,
**kwargs
):
if DATASET not in ALLOW_DATASET:
raise AssertionError("The dataset is not supported, please make a new formatter to fit this dataset")
dtrain, dvalid = self._prepare_data(dataset)
dtrain.loc[:, LABEL_COL] = get_shifted_label(dtrain, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
dvalid.loc[:, LABEL_COL] = get_shifted_label(dvalid, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
train = process_qlib_data(dtrain, DATASET, fillna=True).dropna()
valid = process_qlib_data(dvalid, DATASET, fillna=True).dropna()
ExperimentConfig = expt_settings.configs.ExperimentConfig
config = ExperimentConfig(DATASET)
self.data_formatter = config.make_data_formatter()
self.model_folder = MODEL_FOLDER
self.gpu_id = USE_GPU_ID
self.label_shift = LABEL_SHIFT
self.expt_name = DATASET
self.label_col = LABEL_COL
use_gpu = (True, self.gpu_id)
# ===========================Training Process===========================
ModelClass = libs.tft_model.TemporalFusionTransformer
if not isinstance(self.data_formatter, data_formatters.base.GenericDataFormatter):
raise ValueError(
"Data formatters should inherit from"
+ "AbstractDataFormatter! Type={}".format(type(self.data_formatter))
)
default_keras_session = tf.keras.backend.get_session()
if use_gpu[0]:
self.tf_config = utils.get_default_tensorflow_config(tf_device="gpu", gpu_id=use_gpu[1])
else:
self.tf_config = utils.get_default_tensorflow_config(tf_device="cpu")
self.data_formatter.set_scalers(train)
# Sets up default params
fixed_params = self.data_formatter.get_experiment_params()
params = self.data_formatter.get_default_model_params()
# Wendi: 合并调优的参数和非调优的参数
params = {**params, **fixed_params}
if not os.path.exists(self.model_folder):
os.makedirs(self.model_folder)
params["model_folder"] = self.model_folder
print("*** Begin training ***")
best_loss = np.Inf
tf.reset_default_graph()
self.tf_graph = tf.Graph()
with self.tf_graph.as_default():
self.sess = tf.Session(config=self.tf_config)
tf.keras.backend.set_session(self.sess)
self.model = ModelClass(params, use_cudnn=use_gpu[0])
self.sess.run(tf.global_variables_initializer())
self.model.fit(train_df=train, valid_df=valid)
print("*** Finished training ***")
saved_model_dir = self.model_folder + "/" + "saved_model"
if not os.path.exists(saved_model_dir):
os.makedirs(saved_model_dir)
self.model.save(saved_model_dir)
def extract_numerical_data(data):
"""Strips out forecast time and identifier columns."""
return data[[col for col in data.columns if col not in {"forecast_time", "identifier"}]]
# p50_loss = utils.numpy_normalised_quantile_loss(
# extract_numerical_data(targets), extract_numerical_data(p50_forecast),
# 0.5)
# p90_loss = utils.numpy_normalised_quantile_loss(
# extract_numerical_data(targets), extract_numerical_data(p90_forecast),
# 0.9)
tf.keras.backend.set_session(default_keras_session)
print("Training completed.".format(dte.datetime.now()))
# ===========================Training Process===========================
def predict(self, dataset):
if self.model is None:
raise ValueError("model is not fitted yet!")
d_test = dataset.prepare("test", col_set=["feature", "label"])
d_test = transform_df(d_test)
d_test.loc[:, self.label_col] = get_shifted_label(d_test, shifts=self.label_shift, col_shift=self.label_col)
test = process_qlib_data(d_test, self.expt_name, fillna=True).dropna()
use_gpu = (True, self.gpu_id)
# ===========================Predicting Process===========================
default_keras_session = tf.keras.backend.get_session()
# Sets up default params
fixed_params = self.data_formatter.get_experiment_params()
params = self.data_formatter.get_default_model_params()
params = {**params, **fixed_params}
print("*** Begin predicting ***")
tf.reset_default_graph()
with self.tf_graph.as_default():
tf.keras.backend.set_session(self.sess)
output_map = self.model.predict(test, return_targets=True)
targets = self.data_formatter.format_predictions(output_map["targets"])
p50_forecast = self.data_formatter.format_predictions(output_map["p50"])
p90_forecast = self.data_formatter.format_predictions(output_map["p90"])
tf.keras.backend.set_session(default_keras_session)
predict50 = format_score(p50_forecast, "pred", 1)
predict90 = format_score(p90_forecast, "pred", 1)
predict = (predict50 + predict90) / 2 # self.label_shift
# ===========================Predicting Process===========================
return predict
def finetune(self, dataset: DatasetH):
"""
finetune model
Parameters
----------
dataset : DatasetH
dataset for finetuning
"""
pass
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
import data_formatters.base
import expt_settings.configs
import libs.hyperparam_opt
import libs.tft_model
import libs.utils as utils
import os
import datetime as dte
from qlib.model.base import ModelFT
from qlib.data.dataset import DatasetH
from qlib.data.dataset.handler import DataHandlerLP
# To register new datasets, please add them here.
ALLOW_DATASET = ["Alpha158", "Alpha360"]
# To register new datasets, please add their configurations here.
DATASET_SETTING = {
"Alpha158": {
"feature_col": [
"RESI5",
"WVMA5",
"RSQR5",
"KLEN",
"RSQR10",
"CORR5",
"CORD5",
"CORR10",
"ROC60",
"RESI10",
"VSTD5",
"RSQR60",
"CORR60",
"WVMA60",
"STD5",
"RSQR20",
"CORD60",
"CORD10",
"CORR20",
"KLOW",
],
"label_col": "LABEL0",
},
"Alpha360": {
"feature_col": [
"HIGH0",
"LOW0",
"OPEN0",
"CLOSE1",
"HIGH1",
"VOLUME1",
"LOW1",
"VOLUME3",
"OPEN1",
"VOLUME4",
"CLOSE2",
"CLOSE4",
"VOLUME5",
"LOW2",
"CLOSE3",
"VOLUME2",
"HIGH2",
"LOW4",
"VOLUME8",
"VOLUME11",
],
"label_col": "LABEL0",
},
}
def get_shifted_label(data_df, shifts=5, col_shift="LABEL0"):
return data_df[[col_shift]].groupby("instrument").apply(lambda df: df.shift(shifts))
def fill_test_na(test_df):
test_df_res = test_df.copy()
feature_cols = ~test_df_res.columns.str.contains("label", case=False)
test_feature_fna = test_df_res.loc[:, feature_cols].groupby("datetime").apply(lambda df: df.fillna(df.mean()))
test_df_res.loc[:, feature_cols] = test_feature_fna
return test_df_res
def process_qlib_data(df, dataset, fillna=False):
"""Prepare data to fit the TFT model.
Args:
df: Original DataFrame.
fillna: Whether to fill the data with the mean values.
Returns:
Transformed DataFrame.
"""
# Several features selected manually
feature_col = DATASET_SETTING[dataset]["feature_col"]
label_col = [DATASET_SETTING[dataset]["label_col"]]
temp_df = df.loc[:, feature_col + label_col]
if fillna:
temp_df = fill_test_na(temp_df)
temp_df = temp_df.swaplevel()
temp_df = temp_df.sort_index()
temp_df = temp_df.reset_index(level=0)
dates = pd.to_datetime(temp_df.index)
temp_df["date"] = dates
temp_df["day_of_week"] = dates.dayofweek
temp_df["month"] = dates.month
temp_df["year"] = dates.year
temp_df["const"] = 1.0
return temp_df
def process_predicted(df, col_name):
"""Transform the TFT predicted data into Qlib format.
Args:
df: Original DataFrame.
fillna: New column name.
Returns:
Transformed DataFrame.
"""
df_res = df.copy()
df_res = df_res.rename(columns={"forecast_time": "datetime", "identifier": "instrument", "t+4": col_name})
df_res = df_res.set_index(["datetime", "instrument"]).sort_index()
df_res = df_res[[col_name]]
return df_res
def format_score(forecast_df, col_name="pred", label_shift=5):
pred = process_predicted(forecast_df, col_name=col_name)
pred = get_shifted_label(pred, shifts=-label_shift, col_shift=col_name)
pred = pred.dropna()[col_name]
return pred
def transform_df(df, col_name="LABEL0"):
df_res = df["feature"]
df_res[col_name] = df["label"]
return df_res
class TFTModel(ModelFT):
"""TFT Model"""
def __init__(self, **kwargs):
self.model = None
self.params = {"DATASET": "Alpha158", "label_shift": 5}
self.params.update(kwargs)
def _prepare_data(self, dataset: DatasetH):
df_train, df_valid = dataset.prepare(
["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
)
return transform_df(df_train), transform_df(df_valid)
def fit(self, dataset: DatasetH, MODEL_FOLDER="qlib_tft_model", USE_GPU_ID=0, **kwargs):
DATASET = self.params["DATASET"]
LABEL_SHIFT = self.params["label_shift"]
LABEL_COL = DATASET_SETTING[DATASET]["label_col"]
if DATASET not in ALLOW_DATASET:
raise AssertionError("The dataset is not supported, please make a new formatter to fit this dataset")
dtrain, dvalid = self._prepare_data(dataset)
dtrain.loc[:, LABEL_COL] = get_shifted_label(dtrain, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
dvalid.loc[:, LABEL_COL] = get_shifted_label(dvalid, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
train = process_qlib_data(dtrain, DATASET, fillna=True).dropna()
valid = process_qlib_data(dvalid, DATASET, fillna=True).dropna()
ExperimentConfig = expt_settings.configs.ExperimentConfig
config = ExperimentConfig(DATASET)
self.data_formatter = config.make_data_formatter()
self.model_folder = MODEL_FOLDER
self.gpu_id = USE_GPU_ID
self.label_shift = LABEL_SHIFT
self.expt_name = DATASET
self.label_col = LABEL_COL
use_gpu = (True, self.gpu_id)
# ===========================Training Process===========================
ModelClass = libs.tft_model.TemporalFusionTransformer
if not isinstance(self.data_formatter, data_formatters.base.GenericDataFormatter):
raise ValueError(
"Data formatters should inherit from"
+ "AbstractDataFormatter! Type={}".format(type(self.data_formatter))
)
default_keras_session = tf.keras.backend.get_session()
if use_gpu[0]:
self.tf_config = utils.get_default_tensorflow_config(tf_device="gpu", gpu_id=use_gpu[1])
else:
self.tf_config = utils.get_default_tensorflow_config(tf_device="cpu")
self.data_formatter.set_scalers(train)
# Sets up default params
fixed_params = self.data_formatter.get_experiment_params()
params = self.data_formatter.get_default_model_params()
# Wendi: 合并调优的参数和非调优的参数
params = {**params, **fixed_params}
if not os.path.exists(self.model_folder):
os.makedirs(self.model_folder)
params["model_folder"] = self.model_folder
print("*** Begin training ***")
best_loss = np.Inf
tf.reset_default_graph()
self.tf_graph = tf.Graph()
with self.tf_graph.as_default():
self.sess = tf.Session(config=self.tf_config)
tf.keras.backend.set_session(self.sess)
self.model = ModelClass(params, use_cudnn=use_gpu[0])
self.sess.run(tf.global_variables_initializer())
self.model.fit(train_df=train, valid_df=valid)
print("*** Finished training ***")
saved_model_dir = self.model_folder + "/" + "saved_model"
if not os.path.exists(saved_model_dir):
os.makedirs(saved_model_dir)
self.model.save(saved_model_dir)
def extract_numerical_data(data):
"""Strips out forecast time and identifier columns."""
return data[[col for col in data.columns if col not in {"forecast_time", "identifier"}]]
# p50_loss = utils.numpy_normalised_quantile_loss(
# extract_numerical_data(targets), extract_numerical_data(p50_forecast),
# 0.5)
# p90_loss = utils.numpy_normalised_quantile_loss(
# extract_numerical_data(targets), extract_numerical_data(p90_forecast),
# 0.9)
tf.keras.backend.set_session(default_keras_session)
print("Training completed.".format(dte.datetime.now()))
# ===========================Training Process===========================
def predict(self, dataset):
if self.model is None:
raise ValueError("model is not fitted yet!")
d_test = dataset.prepare("test", col_set=["feature", "label"])
d_test = transform_df(d_test)
d_test.loc[:, self.label_col] = get_shifted_label(d_test, shifts=self.label_shift, col_shift=self.label_col)
test = process_qlib_data(d_test, self.expt_name, fillna=True).dropna()
use_gpu = (True, self.gpu_id)
# ===========================Predicting Process===========================
default_keras_session = tf.keras.backend.get_session()
# Sets up default params
fixed_params = self.data_formatter.get_experiment_params()
params = self.data_formatter.get_default_model_params()
params = {**params, **fixed_params}
print("*** Begin predicting ***")
tf.reset_default_graph()
with self.tf_graph.as_default():
tf.keras.backend.set_session(self.sess)
output_map = self.model.predict(test, return_targets=True)
targets = self.data_formatter.format_predictions(output_map["targets"])
p50_forecast = self.data_formatter.format_predictions(output_map["p50"])
p90_forecast = self.data_formatter.format_predictions(output_map["p90"])
tf.keras.backend.set_session(default_keras_session)
predict50 = format_score(p50_forecast, "pred", 1)
predict90 = format_score(p90_forecast, "pred", 1)
predict = (predict50 + predict90) / 2 # self.label_shift
# ===========================Predicting Process===========================
return predict
def finetune(self, dataset: DatasetH):
"""
finetune model
Parameters
----------
dataset : DatasetH
dataset for finetuning
"""
pass

Binary file not shown.

View File

@@ -0,0 +1,4 @@
pandas==1.1.2
numpy==1.17.4
scikit_learn==0.23.2
torch==1.7.0

View File

@@ -0,0 +1,74 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors:
- class: RobustZScoreNorm
kwargs:
fields_group: feature
clip_outlier: true
- class: Fillna
kwargs:
fields_group: feature
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: TabnetModel
module_path: qlib.contrib.model.pytorch_tabnet
kwargs:
pretrain: True
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha158
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
pretrain: [2008-01-01, 2014-12-31]
pretrain_validation: [2015-01-01, 2020-08-01]
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -0,0 +1,71 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors: []
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: XGBModel
module_path: qlib.contrib.model.xgboost
kwargs:
eval_metric: rmse
colsample_bytree: 0.8879
eta: 0.0421
max_depth: 8
n_estimators: 647
subsample: 0.8789
nthread: 20
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

View File

@@ -0,0 +1,172 @@
from qlib.data.dataset.handler import DataHandler, DataHandlerLP
from qlib.data.dataset.processor import Processor
from qlib.utils import get_cls_kwargs
from qlib.log import TimeInspector
class HighFreqHandler(DataHandlerLP):
def __init__(
self,
instruments="csi300",
start_time=None,
end_time=None,
infer_processors=[],
learn_processors=[],
fit_start_time=None,
fit_end_time=None,
drop_raw=True,
):
def check_transform_proc(proc_l):
new_l = []
for p in proc_l:
p["kwargs"].update(
{
"fit_start_time": fit_start_time,
"fit_end_time": fit_end_time,
}
)
new_l.append(p)
return new_l
infer_processors = check_transform_proc(infer_processors)
learn_processors = check_transform_proc(learn_processors)
data_loader = {
"class": "QlibDataLoader",
"kwargs": {
"config": self.get_feature_config(),
"swap_level": False,
"freq": "1min",
},
}
super().__init__(
instruments=instruments,
start_time=start_time,
end_time=end_time,
data_loader=data_loader,
infer_processors=infer_processors,
learn_processors=learn_processors,
drop_raw=drop_raw,
)
def get_feature_config(self):
fields = []
names = []
template_if = "If(IsNull({1}), {0}, {1})"
template_paused = "Select(Or(IsNull($paused), Eq($paused, 0.0)), {0})"
template_fillnan = "BFillNan(FFillNan({0}))"
# Because there is no vwap field in the yahoo data, a method similar to Simpson integration is used to approximate vwap
simpson_vwap = "($open + 2*$high + 2*$low + $close)/6"
def get_normalized_price_feature(price_field, shift=0):
"""Get normalized price feature ops"""
if shift == 0:
template_norm = "{0}/Ref(DayLast({1}), 240)"
else:
template_norm = "Ref({0}, " + str(shift) + ")/Ref(DayLast({1}), 240)"
feature_ops = template_norm.format(
template_if.format(
template_fillnan.format(template_paused.format("$close")),
template_paused.format(price_field),
),
template_fillnan.format(template_paused.format("$close")),
)
return feature_ops
fields += [get_normalized_price_feature("$open", 0)]
fields += [get_normalized_price_feature("$high", 0)]
fields += [get_normalized_price_feature("$low", 0)]
fields += [get_normalized_price_feature("$close", 0)]
fields += [get_normalized_price_feature(simpson_vwap, 0)]
names += ["$open", "$high", "$low", "$close", "$vwap"]
fields += [get_normalized_price_feature("$open", 240)]
fields += [get_normalized_price_feature("$high", 240)]
fields += [get_normalized_price_feature("$low", 240)]
fields += [get_normalized_price_feature("$close", 240)]
fields += [get_normalized_price_feature(simpson_vwap, 240)]
names += ["$open_1", "$high_1", "$low_1", "$close_1", "$vwap_1"]
fields += [
"{0}/Ref(DayLast(Mean({0}, 7200)), 240)".format(
"If(IsNull({0}), 0, If(Or(Gt({1}, Mul(1.001, {3})), Lt({1}, Mul(0.999, {2}))), 0, {0}))".format(
template_paused.format("$volume"),
template_paused.format(simpson_vwap),
template_paused.format("$low"),
template_paused.format("$high"),
)
)
]
names += ["$volume"]
fields += [
"Ref({0}, 240)/Ref(DayLast(Mean({0}, 7200)), 240)".format(
"If(IsNull({0}), 0, If(Or(Gt({1}, Mul(1.001, {3})), Lt({1}, Mul(0.999, {2}))), 0, {0}))".format(
template_paused.format("$volume"),
template_paused.format(simpson_vwap),
template_paused.format("$low"),
template_paused.format("$high"),
)
)
]
names += ["$volume_1"]
fields += [template_paused.format("Date($close)")]
names += ["date"]
return fields, names
class HighFreqBacktestHandler(DataHandler):
def __init__(
self,
instruments="csi300",
start_time=None,
end_time=None,
):
data_loader = {
"class": "QlibDataLoader",
"kwargs": {
"config": self.get_feature_config(),
"swap_level": False,
"freq": "1min",
},
}
super().__init__(
instruments=instruments,
start_time=start_time,
end_time=end_time,
data_loader=data_loader,
)
def get_feature_config(self):
fields = []
names = []
template_if = "If(IsNull({1}), {0}, {1})"
template_paused = "Select(Or(IsNull($paused), Eq($paused, 0.0)), {0})"
template_fillnan = "BFillNan(FFillNan({0}))"
# Because there is no vwap field in the yahoo data, a method similar to Simpson integration is used to approximate vwap
simpson_vwap = "($open + 2*$high + 2*$low + $close)/6"
fields += [
template_fillnan.format(template_paused.format("$close")),
]
names += ["$close0"]
fields += [
template_if.format(
template_fillnan.format(template_paused.format("$close")),
template_paused.format(simpson_vwap),
)
]
names += ["$vwap0"]
fields += [
"If(IsNull({0}), 0, If(Or(Gt({1}, Mul(1.001, {3})), Lt({1}, Mul(0.999, {2}))), 0, {0}))".format(
template_paused.format("$volume"),
template_paused.format(simpson_vwap),
template_paused.format("$low"),
template_paused.format("$high"),
)
]
names += ["$volume0"]
return fields, names

View File

@@ -0,0 +1,56 @@
import numpy as np
import pandas as pd
import importlib
from qlib.data.ops import ElemOperator, PairOperator
from qlib.config import C
from qlib.data.cache import H
from qlib.data.data import Cal
def get_calendar_day(freq="day", future=False):
flag = f"{freq}_future_{future}_day"
if flag in H["c"]:
_calendar = H["c"][flag]
else:
_calendar = np.array(list(map(lambda x: x.date(), Cal.load_calendar(freq, future))))
H["c"][flag] = _calendar
return _calendar
class DayLast(ElemOperator):
def _load_internal(self, instrument, start_index, end_index, freq):
_calendar = get_calendar_day(freq=freq)
series = self.feature.load(instrument, start_index, end_index, freq)
return series.groupby(_calendar[series.index]).transform("last")
class FFillNan(ElemOperator):
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
return series.fillna(method="ffill")
class BFillNan(ElemOperator):
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
return series.fillna(method="bfill")
class Date(ElemOperator):
def _load_internal(self, instrument, start_index, end_index, freq):
_calendar = get_calendar_day(freq=freq)
series = self.feature.load(instrument, start_index, end_index, freq)
return pd.Series(_calendar[series.index], index=series.index)
class Select(PairOperator):
def _load_internal(self, instrument, start_index, end_index, freq):
series_condition = self.feature_left.load(instrument, start_index, end_index, freq)
series_feature = self.feature_right.load(instrument, start_index, end_index, freq)
return series_feature.loc[series_condition]
class IsNull(ElemOperator):
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
return series.isnull()

View File

@@ -0,0 +1,72 @@
import numpy as np
import pandas as pd
from qlib.data.dataset.processor import Processor
from qlib.data.dataset.utils import fetch_df_by_index
class HighFreqNorm(Processor):
def __init__(self, fit_start_time, fit_end_time):
self.fit_start_time = fit_start_time
self.fit_end_time = fit_end_time
def fit(self, df_features):
fetch_df = fetch_df_by_index(df_features, slice(self.fit_start_time, self.fit_end_time), level="datetime")
del df_features
df_values = fetch_df.values
names = {
"price": slice(0, 10),
"volume": slice(10, 12),
}
self.feature_med = {}
self.feature_std = {}
self.feature_vmax = {}
self.feature_vmin = {}
for name, name_val in names.items():
part_values = df_values[:, name_val].astype(np.float32)
if name == "volume":
part_values = np.log1p(part_values)
self.feature_med[name] = np.nanmedian(part_values)
part_values = part_values - self.feature_med[name]
self.feature_std[name] = np.nanmedian(np.absolute(part_values)) * 1.4826 + 1e-12
part_values = part_values / self.feature_std[name]
self.feature_vmax[name] = np.nanmax(part_values)
self.feature_vmin[name] = np.nanmin(part_values)
def __call__(self, df_features):
df_features.set_index("date", append=True, drop=True, inplace=True)
df_values = df_features.values
names = {
"price": slice(0, 10),
"volume": slice(10, 12),
}
for name, name_val in names.items():
if name == "volume":
df_values[:, name_val] = np.log1p(df_values[:, name_val])
df_values[:, name_val] -= self.feature_med[name]
df_values[:, name_val] /= self.feature_std[name]
slice0 = df_values[:, name_val] > 3.0
slice1 = df_values[:, name_val] > 3.5
slice2 = df_values[:, name_val] < -3.0
slice3 = df_values[:, name_val] < -3.5
df_values[:, name_val][slice0] = (
3.0 + (df_values[:, name_val][slice0] - 3.0) / (self.feature_vmax[name] - 3) * 0.5
)
df_values[:, name_val][slice1] = 3.5
df_values[:, name_val][slice2] = (
-3.0 - (df_values[:, name_val][slice2] + 3.0) / (self.feature_vmin[name] + 3) * 0.5
)
df_values[:, name_val][slice3] = -3.5
idx = df_features.index.droplevel("datetime").drop_duplicates()
idx.set_names(["instrument", "datetime"], inplace=True)
# Reshape is specifically for adapting to RL high-freq executor
feat = df_values[:, [0, 1, 2, 3, 4, 10]].reshape(-1, 6 * 240)
feat_1 = df_values[:, [5, 6, 7, 8, 9, 11]].reshape(-1, 6 * 240)
df_new_features = pd.DataFrame(
data=np.concatenate((feat, feat_1), axis=1),
index=idx,
columns=["FEATURE_%d" % i for i in range(12 * 240)],
).sort_index()
return df_new_features

View File

@@ -0,0 +1,166 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import sys
import fire
from pathlib import Path
import qlib
import pickle
import numpy as np
import pandas as pd
from qlib.config import HIGH_FREQ_CONFIG
from qlib.contrib.model.gbdt import LGBModel
from qlib.contrib.data.handler import Alpha158
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
from qlib.contrib.evaluate import (
backtest as normal_backtest,
risk_analysis,
)
from qlib.utils import init_instance_by_config, exists_qlib_data
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.ops import Operators
from qlib.data.data import Cal
from qlib.tests.data import GetData
from highfreq_ops import get_calendar_day, DayLast, FFillNan, BFillNan, Date, Select, IsNull
class HighfreqWorkflow(object):
SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull], "expression_cache": None}
MARKET = "all"
BENCHMARK = "SH000300"
start_time = "2020-09-14 00:00:00"
end_time = "2021-01-18 16:00:00"
train_end_time = "2020-11-30 16:00:00"
test_start_time = "2020-12-01 00:00:00"
DATA_HANDLER_CONFIG0 = {
"start_time": start_time,
"end_time": end_time,
"freq": "1min",
"fit_start_time": start_time,
"fit_end_time": train_end_time,
"instruments": MARKET,
"infer_processors": [{"class": "HighFreqNorm", "module_path": "highfreq_processor", "kwargs": {}}],
}
DATA_HANDLER_CONFIG1 = {
"start_time": start_time,
"end_time": end_time,
"freq": "1min",
"instruments": MARKET,
}
task = {
"dataset": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "HighFreqHandler",
"module_path": "highfreq_handler",
"kwargs": DATA_HANDLER_CONFIG0,
},
"segments": {
"train": (start_time, train_end_time),
"test": (
test_start_time,
end_time,
),
},
},
},
"dataset_backtest": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "HighFreqBacktestHandler",
"module_path": "highfreq_handler",
"kwargs": DATA_HANDLER_CONFIG1,
},
"segments": {
"train": (start_time, train_end_time),
"test": (
test_start_time,
end_time,
),
},
},
},
}
def _init_qlib(self):
"""initialize qlib"""
# use yahoo_cn_1min data
QLIB_INIT_CONFIG = {**HIGH_FREQ_CONFIG, **self.SPEC_CONF}
provider_uri = QLIB_INIT_CONFIG.get("provider_uri")
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
GetData().qlib_data(target_dir=provider_uri, interval="1min", region=REG_CN)
qlib.init(**QLIB_INIT_CONFIG)
def _prepare_calender_cache(self):
"""preload the calendar for cache"""
# This code used the copy-on-write feature of Linux to avoid calculating the calendar multiple times in the subprocess
# This code may accelerate, but may be not useful on Windows and Mac Os
Cal.calendar(freq="1min")
get_calendar_day(freq="1min")
def get_data(self):
"""use dataset to get highreq data"""
self._init_qlib()
self._prepare_calender_cache()
dataset = init_instance_by_config(self.task["dataset"])
xtrain, xtest = dataset.prepare(["train", "test"])
print(xtrain, xtest)
dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])
backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
print(backtest_train, backtest_test)
del xtrain, xtest
del backtest_train, backtest_test
def dump_and_load_dataset(self):
"""dump and load dataset state on disk"""
self._init_qlib()
self._prepare_calender_cache()
dataset = init_instance_by_config(self.task["dataset"])
dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])
##=============dump dataset=============
dataset.to_pickle(path="dataset.pkl")
dataset_backtest.to_pickle(path="dataset_backtest.pkl")
del dataset, dataset_backtest
##=============reload dataset=============
with open("dataset.pkl", "rb") as file_dataset:
dataset = pickle.load(file_dataset)
with open("dataset_backtest.pkl", "rb") as file_dataset_backtest:
dataset_backtest = pickle.load(file_dataset_backtest)
self._prepare_calender_cache()
##=============reload_dataset=============
dataset.init(init_type=DataHandlerLP.IT_LS)
dataset_backtest.init()
##=============get data=============
xtrain, xtest = dataset.prepare(["train", "test"])
backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
print(xtrain, xtest)
print(backtest_train, backtest_test)
del xtrain, xtest
del backtest_train, backtest_test
if __name__ == "__main__":
fire.Fire(HighfreqWorkflow)

View File

@@ -15,6 +15,7 @@ import traceback
import functools
import statistics
import subprocess
from datetime import datetime
from pathlib import Path
from operator import xor
from pprint import pprint
@@ -45,8 +46,6 @@ if not exists_qlib_data(provider_uri):
GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
qlib.init(provider_uri=provider_uri, region=REG_CN, exp_manager=exp_manager)
if os.path.isdir(exp_path):
shutil.rmtree(exp_path)
# decorator to check the arguments
def only_allow_defined_args(function_to_decorate):
@@ -70,9 +69,9 @@ def handler(signum, frame):
os.system("kill -9 %d" % os.getpid())
signal.signal(signal.SIGTSTP, handler)
signal.signal(signal.SIGINT, handler)
# function to calculate the mean and std of a list in the results dictionary
def cal_mean_std(results) -> dict:
mean_std = dict()
@@ -136,9 +135,9 @@ def get_all_folders(models, exclude) -> dict:
# function to get all the files under the model folder
def get_all_files(folder_path) -> (str, str):
yaml_path = str(Path(f"{folder_path}") / "*.yaml")
req_path = str(Path(f"{folder_path}") / "*.txt")
def get_all_files(folder_path, dataset) -> (str, str):
yaml_path = str(Path(f"{folder_path}") / f"*{dataset}*.yaml")
req_path = str(Path(f"{folder_path}") / f"*.txt")
return glob.glob(yaml_path)[0], glob.glob(req_path)[0]
@@ -152,6 +151,10 @@ def get_all_results(folders) -> dict:
result["annualized_return_with_cost"] = list()
result["information_ratio_with_cost"] = list()
result["max_drawdown_with_cost"] = list()
result["ic"] = list()
result["icir"] = list()
result["rank_ic"] = list()
result["rank_icir"] = list()
for recorder_id in recorders:
if recorders[recorder_id].status == "FINISHED":
recorder = R.get_recorder(recorder_id=recorder_id, experiment_name=fn)
@@ -159,19 +162,27 @@ def get_all_results(folders) -> dict:
result["annualized_return_with_cost"].append(metrics["excess_return_with_cost.annualized_return"])
result["information_ratio_with_cost"].append(metrics["excess_return_with_cost.information_ratio"])
result["max_drawdown_with_cost"].append(metrics["excess_return_with_cost.max_drawdown"])
result["ic"].append(metrics["IC"])
result["icir"].append(metrics["ICIR"])
result["rank_ic"].append(metrics["Rank IC"])
result["rank_icir"].append(metrics["Rank ICIR"])
results[fn] = result
return results
# function to generate and save markdown table
def gen_and_save_md_table(metrics):
table = "| Model Name | Annualized Return | Information Ratio | Max Drawdown |\n"
table += "|---|---|---|---|\n"
def gen_and_save_md_table(metrics, dataset):
table = "| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |\n"
table += "|---|---|---|---|---|---|---|---|---|\n"
for fn in metrics:
ic = metrics[fn]["ic"]
icir = metrics[fn]["icir"]
ric = metrics[fn]["rank_ic"]
ricir = metrics[fn]["rank_icir"]
ar = metrics[fn]["annualized_return_with_cost"]
ir = metrics[fn]["information_ratio_with_cost"]
md = metrics[fn]["max_drawdown_with_cost"]
table += f"| {fn} | {ar[0]:9.4f}±{ar[1]:9.2f} | {ir[0]:9.4f}±{ir[1]:9.2f}| {md[0]:9.4f}±{md[1]:9.2f} |\n"
table += f"| {fn} | {dataset} | {ic[0]:5.4f}±{ic[1]:2.2f} | {icir[0]:5.4f}±{icir[1]:2.2f}| {ric[0]:5.4f}±{ric[1]:2.2f} | {ricir[0]:5.4f}±{ricir[1]:2.2f} | {ar[0]:5.4f}±{ar[1]:2.2f} | {ir[0]:5.4f}±{ir[1]:2.2f}| {md[0]:5.4f}±{md[1]:2.2f} |\n"
pprint(table)
with open("table.md", "w") as f:
f.write(table)
@@ -180,10 +191,11 @@ def gen_and_save_md_table(metrics):
# function to run the all the models
@only_allow_defined_args
def run(times=1, models=None, exclude=False):
def run(times=1, models=None, dataset="Alpha360", exclude=False):
"""
Please be aware that this function can only work under Linux. MacOS and Windows will be supported in the future.
Any PR to enhance this method is highly welcomed.
Any PR to enhance this method is highly welcomed. Besides, this script doesn't support parrallel running the same model
for multiple times, and this will be fixed in the future development.
Parameters:
-----------
@@ -193,6 +205,8 @@ def run(times=1, models=None, exclude=False):
determines the specific model or list of models to run or exclude.
exclude : boolean
determines whether the model being used is excluded or included.
dataset : str
determines the dataset to be used for each model.
Usage:
-------
@@ -206,13 +220,16 @@ def run(times=1, models=None, exclude=False):
# Case 2 - run specific models multiple times
python run_all_model.py 3 mlp
# Case 3 - run other models except those are given as arguments for multiple times
python run_all_model.py 3 [mlp,tft,lstm] True
# Case 3 - run specific models multiple times with specific dataset
python run_all_model.py 3 mlp Alpha158
# Case 4 - run specific models for one time
# Case 4 - run other models except those are given as arguments for multiple times
python run_all_model.py 3 [mlp,tft,lstm] --exclude=True
# Case 5 - run specific models for one time
python run_all_model.py --models=[mlp,lightgbm]
# Case 5 - run other models except those are given as aruments for one time
# Case 6 - run other models except those are given as aruments for one time
python run_all_model.py --models=[mlp,tft,sfm] --exclude=True
"""
@@ -226,7 +243,7 @@ def run(times=1, models=None, exclude=False):
env_path, python_path, conda_activate = create_env()
# get all files
sys.stderr.write("Retrieving files...\n")
yaml_path, req_path = get_all_files(folders[fn])
yaml_path, req_path = get_all_files(folders[fn], dataset)
sys.stderr.write("\n")
# install requirements.txt
sys.stderr.write("Installing requirements.txt...\n")
@@ -240,6 +257,7 @@ def run(times=1, models=None, exclude=False):
sys.stderr.write("\n")
# install qlib
sys.stderr.write("Installing qlib...\n")
execute(f"{python_path} -m pip install --upgrade pip") # TODO: FIX ME!
execute(f"{python_path} -m pip install --upgrade cython") # TODO: FIX ME!
if fn == "TFT":
execute(
@@ -272,12 +290,15 @@ def run(times=1, models=None, exclude=False):
results = cal_mean_std(results)
# generating md table
sys.stderr.write(f"Generating markdown table...\n")
gen_and_save_md_table(results)
gen_and_save_md_table(results, dataset)
sys.stderr.write("\n")
# print erros
sys.stderr.write(f"Here are some of the errors of the models...\n")
pprint(errors)
sys.stderr.write("\n")
# move results folder
shutil.move(exp_path, exp_path + f"_{dataset}_{datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}")
shutil.move("table.md", f"table_{dataset}_{datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}.md")
if __name__ == "__main__":

View File

@@ -17,7 +17,7 @@ from qlib.contrib.evaluate import (
from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.tests.data import GetData
if __name__ == "__main__":
@@ -25,9 +25,6 @@ if __name__ == "__main__":
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
from get_data import GetData
GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
qlib.init(provider_uri=provider_uri, region=REG_CN)
@@ -98,6 +95,7 @@ if __name__ == "__main__":
"open_cost": 0.0005,
"close_cost": 0.0015,
"min_cost": 5,
"return_order": True,
},
}
@@ -105,6 +103,11 @@ if __name__ == "__main__":
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])
# NOTE: This line is optional
# It demonstrates that the dataset can be used standalone.
example_df = dataset.prepare("train")
print(example_df.head())
# start exp
with R.start(experiment_name="workflow"):
R.log_params(**flatten_dict(task))

View File

@@ -2,91 +2,49 @@
# Licensed under the MIT License.
__version__ = "0.6.0"
__version__ = "0.6.3"
import os
import re
import sys
import copy
import yaml
import logging
import platform
import subprocess
from pathlib import Path
from .utils import can_use_cache, init_instance_by_config, get_module_by_module_path
from .workflow.utils import experiment_exit_handler
# init qlib
def init(default_conf="client", **kwargs):
from .config import C, REG_CN, REG_US, QlibConfig
from .data.data import register_all_wrappers
from .log import get_module_logger, set_log_with_config
from .config import C
from .log import get_module_logger
from .data.cache import H
from .workflow import R, QlibRecorder
C.reset()
H.clear()
_logging_config = C.logging_config
if "logging_config" in kwargs:
_logging_config = kwargs["logging_config"]
# set global config
if _logging_config:
set_log_with_config(_logging_config)
# FIXME: this logger ignored the level in config
LOG = get_module_logger("Initialization", level=logging.INFO)
LOG.info(f"default_conf: {default_conf}.")
logger = get_module_logger("Initialization", level=logging.INFO)
C.set_mode(default_conf)
C.set_region(kwargs.get("region", C["region"] if "region" in C else REG_CN))
for k, v in kwargs.items():
C[k] = v
if k not in C:
LOG.warning("Unrecognized config %s" % k)
C.resolve_path()
if not (C["expression_cache"] is None and C["dataset_cache"] is None):
# check redis
if not can_use_cache():
LOG.warning(
f"redis connection failed(host={C['redis_host']} port={C['redis_port']}), cache will not be used!"
)
C["expression_cache"] = None
C["dataset_cache"] = None
C.set(default_conf, **kwargs)
# check path if server/local
if C.get_uri_type() == QlibConfig.LOCAL_URI:
if C.get_uri_type() == C.LOCAL_URI:
if not os.path.exists(C["provider_uri"]):
if C["auto_mount"]:
LOG.error(
logger.error(
f"Invalid provider uri: {C['provider_uri']}, please check if a valid provider uri has been set. This path does not exist."
)
else:
LOG.warning(f"auto_path is False, please make sure {C['mount_path']} is mounted")
elif C.get_uri_type() == QlibConfig.NFS_URI:
logger.warning(f"auto_path is False, please make sure {C['mount_path']} is mounted")
elif C.get_uri_type() == C.NFS_URI:
_mount_nfs_uri(C)
else:
raise NotImplementedError(f"This type of URI is not supported")
LOG.info("qlib successfully initialized based on %s settings." % default_conf)
register_all_wrappers()
LOG.info(f"data_path={C.get_data_path()}")
C.register()
if "flask_server" in C:
LOG.info(f"flask_server={C['flask_server']}, flask_port={C['flask_port']}")
# set up QlibRecorder
exp_manager = init_instance_by_config(C["exp_manager"])
qr = QlibRecorder(exp_manager)
R.register(qr)
# clean up experiment when python program ends
experiment_exit_handler()
logger.info(f"flask_server={C['flask_server']}, flask_port={C['flask_port']}")
logger.info("qlib successfully initialized based on %s settings." % default_conf)
logger.info(f"data_path={C.get_data_path()}")
def _mount_nfs_uri(C):

View File

@@ -11,26 +11,27 @@ Two modes are supported
"""
import copy
from pathlib import Path
import re
import os
import re
import copy
import logging
import multiprocessing
from pathlib import Path
class Config:
def __init__(self, default_conf):
self.__dict__["_default_config"] = default_conf # avoiding conflictions with __getattr__
self.__dict__["_default_config"] = copy.deepcopy(default_conf) # avoiding conflictions with __getattr__
self.reset()
def __getitem__(self, key):
return self.__dict__["_config"][key]
def __getattr__(self, attr):
try:
if attr in self.__dict__["_config"]:
return self.__dict__["_config"][attr]
except KeyError:
return AttributeError(f"No such {attr} in self._config")
raise AttributeError(f"No such {attr} in self._config")
def __setitem__(self, key, value):
self.__dict__["_config"][key] = value
@@ -59,6 +60,9 @@ class Config:
def update(self, *args, **kwargs):
self.__dict__["_config"].update(*args, **kwargs)
def set_conf_from_C(self, config_c):
self.update(**config_c.__dict__["_config"])
# REGION CONST
REG_CN = "cn"
@@ -86,7 +90,6 @@ _default_config = {
# How many tasks belong to one process. Recommend 1 for high-frequency data and None for daily data.
"maxtasksperchild": None,
"default_disk_cache": 1, # 0:skip/1:use
"disable_disk_cache": False, # disable disk cache; if High-frequency data generally disable_disk_cache=True
"mem_cache_size_limit": 500,
# memory cache expire second, only in used 'DatasetURICache' and 'client D.calendar'
# default 1 hour
@@ -184,9 +187,17 @@ MODE_CONF = {
"timeout": 100,
"logging_level": "INFO",
"region": REG_CN,
## Custom Operator
"custom_ops": [],
},
}
HIGH_FREQ_CONFIG = {
"provider_uri": "~/.qlib/qlib_data/yahoo_cn_1min",
"dataset_cache": None,
"expression_cache": "DiskExpressionCache",
"region": REG_CN,
}
_default_region_config = {
REG_CN: {
@@ -207,6 +218,10 @@ class QlibConfig(Config):
LOCAL_URI = "local"
NFS_URI = "nfs"
def __init__(self, default_conf):
super().__init__(default_conf)
self._registered = False
def set_mode(self, mode):
# raise KeyError
self.update(MODE_CONF[mode])
@@ -243,6 +258,64 @@ class QlibConfig(Config):
else:
raise NotImplementedError(f"This type of uri is not supported")
def set(self, default_conf="client", **kwargs):
from .utils import set_log_with_config, get_module_logger, can_use_cache
self.reset()
_logging_config = self.logging_config
if "logging_config" in kwargs:
_logging_config = kwargs["logging_config"]
# set global config
if _logging_config:
set_log_with_config(_logging_config)
# FIXME: this logger ignored the level in config
logger = get_module_logger("Initialization", level=logging.INFO)
logger.info(f"default_conf: {default_conf}.")
self.set_mode(default_conf)
self.set_region(kwargs.get("region", self["region"] if "region" in self else REG_CN))
for k, v in kwargs.items():
if k not in self:
logger.warning("Unrecognized config %s" % k)
self[k] = v
self.resolve_path()
if not (self["expression_cache"] is None and self["dataset_cache"] is None):
# check redis
if not can_use_cache():
logger.warning(
f"redis connection failed(host={self['redis_host']} port={self['redis_port']}), cache will not be used!"
)
self["expression_cache"] = None
self["dataset_cache"] = None
def register(self):
from .utils import init_instance_by_config
from .data.ops import register_all_ops
from .data.data import register_all_wrappers
from .workflow import R, QlibRecorder
from .workflow.utils import experiment_exit_handler
register_all_ops(self)
register_all_wrappers(self)
# set up QlibRecorder
exp_manager = init_instance_by_config(self["exp_manager"])
qr = QlibRecorder(exp_manager)
R.register(qr)
# clean up experiment when python program ends
experiment_exit_handler()
self._registered = True
@property
def registered(self):
return self._registered
# global config
C = QlibConfig(_default_config)

View File

@@ -1,9 +1,324 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# -*- coding: utf-8 -*-
from .order import Order
from .account import Account
from .position import Position
from .exchange import Exchange
from .report import Report
from .backtest import backtest as backtest_func, get_date_range
import numpy as np
import inspect
from ...utils import init_instance_by_config
from ...log import get_module_logger
from ...config import C
logger = get_module_logger("backtest caller")
def get_strategy(
strategy=None,
topk=50,
margin=0.5,
n_drop=5,
risk_degree=0.95,
str_type="dropout",
adjust_dates=None,
):
"""get_strategy
There will be 3 ways to return a stratgy. Please follow the code.
Parameters
----------
strategy : Strategy()
strategy used in backtest.
topk : int (Default value: 50)
top-N stocks to buy.
margin : int or float(Default value: 0.5)
- if isinstance(margin, int):
sell_limit = margin
- else:
sell_limit = pred_in_a_day.count() * margin
buffer margin, in single score_mode, continue holding stock if it is in nlargest(sell_limit).
sell_limit should be no less than topk.
n_drop : int
number of stocks to be replaced in each trading date.
risk_degree: float
0-1, 0.95 for example, use 95% money to trade.
str_type: 'amount', 'weight' or 'dropout'
strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy.
Returns
-------
:class: Strategy
an initialized strategy object
"""
# There will be 3 ways to return a strategy.
if strategy is None:
# 1) create strategy with param `strategy`
str_cls_dict = {
"amount": "TopkAmountStrategy",
"weight": "TopkWeightStrategy",
"dropout": "TopkDropoutStrategy",
}
logger.info("Create new strategy ")
from .. import strategy as strategy_pool
str_cls = getattr(strategy_pool, str_cls_dict.get(str_type))
strategy = str_cls(
topk=topk,
buffer_margin=margin,
n_drop=n_drop,
risk_degree=risk_degree,
adjust_dates=adjust_dates,
)
elif isinstance(strategy, (dict, str)):
# 2) create strategy with init_instance_by_config
logger.info("Create new strategy ")
strategy = init_instance_by_config(strategy)
from ..strategy.strategy import BaseStrategy
# else: nothing happens. 3) Use the strategy directly
if not isinstance(strategy, BaseStrategy):
raise TypeError("Strategy not supported")
return strategy
def get_exchange(
pred,
exchange=None,
subscribe_fields=[],
open_cost=0.0015,
close_cost=0.0025,
min_cost=5.0,
trade_unit=None,
limit_threshold=None,
deal_price=None,
extract_codes=False,
shift=1,
):
"""get_exchange
Parameters
----------
# exchange related arguments
exchange: Exchange().
subscribe_fields: list
subscribe fields.
open_cost : float
open transaction cost.
close_cost : float
close transaction cost.
min_cost : float
min transaction cost.
trade_unit : int
100 for China A.
deal_price: str
dealing price type: 'close', 'open', 'vwap'.
limit_threshold : float
limit move 0.1 (10%) for example, long and short with same limit.
extract_codes: bool
will we pass the codes extracted from the pred to the exchange.
NOTE: This will be faster with offline qlib.
Returns
-------
:class: Exchange
an initialized Exchange object
"""
if trade_unit is None:
trade_unit = C.trade_unit
if limit_threshold is None:
limit_threshold = C.limit_threshold
if deal_price is None:
deal_price = C.deal_price
if exchange is None:
logger.info("Create new exchange")
# handle exception for deal_price
if deal_price[0] != "$":
deal_price = "$" + deal_price
if extract_codes:
codes = sorted(pred.index.get_level_values("instrument").unique())
else:
codes = "all" # TODO: We must ensure that 'all.txt' includes all the stocks
dates = sorted(pred.index.get_level_values("datetime").unique())
dates = np.append(dates, get_date_range(dates[-1], left_shift=1, right_shift=shift))
exchange = Exchange(
trade_dates=dates,
codes=codes,
deal_price=deal_price,
subscribe_fields=subscribe_fields,
limit_threshold=limit_threshold,
open_cost=open_cost,
close_cost=close_cost,
min_cost=min_cost,
trade_unit=trade_unit,
)
return exchange
def get_executor(
executor=None,
trade_exchange=None,
verbose=True,
):
"""get_executor
There will be 3 ways to return a executor. Please follow the code.
Parameters
----------
executor : BaseExecutor
executor used in backtest.
trade_exchange : Exchange
exchange used in executor
verbose : bool
whether to print log.
Returns
-------
:class: BaseExecutor
an initialized BaseExecutor object
"""
# There will be 3 ways to return a executor.
if executor is None:
# 1) create executor with param `executor`
logger.info("Create new executor ")
from ..online.executor import SimulatorExecutor
executor = SimulatorExecutor(trade_exchange=trade_exchange, verbose=verbose)
elif isinstance(executor, (dict, str)):
# 2) create executor with config
logger.info("Create new executor ")
executor = init_instance_by_config(executor)
from ..online.executor import BaseExecutor
# 3) Use the executor directly
if not isinstance(executor, BaseExecutor):
raise TypeError("Executor not supported")
return executor
# This is the API for compatibility for legacy code
def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, return_order=False, **kwargs):
"""This function will help you set a reasonable Exchange and provide default value for strategy
Parameters
----------
- **backtest workflow related or commmon arguments**
pred : pandas.DataFrame
predict should has <datetime, instrument> index and one `score` column.
account : float
init account value.
shift : int
whether to shift prediction by one day.
benchmark : str
benchmark code, default is SH000905 CSI 500.
verbose : bool
whether to print log.
return_order : bool
whether to return order list
- **strategy related arguments**
strategy : Strategy()
strategy used in backtest.
topk : int (Default value: 50)
top-N stocks to buy.
margin : int or float(Default value: 0.5)
- if isinstance(margin, int):
sell_limit = margin
- else:
sell_limit = pred_in_a_day.count() * margin
buffer margin, in single score_mode, continue holding stock if it is in nlargest(sell_limit).
sell_limit should be no less than topk.
n_drop : int
number of stocks to be replaced in each trading date.
risk_degree: float
0-1, 0.95 for example, use 95% money to trade.
str_type: 'amount', 'weight' or 'dropout'
strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy.
- **exchange related arguments**
exchange: Exchange()
pass the exchange for speeding up.
subscribe_fields: list
subscribe fields.
open_cost : float
open transaction cost. The default value is 0.002(0.2%).
close_cost : float
close transaction cost. The default value is 0.002(0.2%).
min_cost : float
min transaction cost.
trade_unit : int
100 for China A.
deal_price: str
dealing price type: 'close', 'open', 'vwap'.
limit_threshold : float
limit move 0.1 (10%) for example, long and short with same limit.
extract_codes: bool
will we pass the codes extracted from the pred to the exchange.
.. note:: This will be faster with offline qlib.
- **executor related arguments**
executor : BaseExecutor()
executor used in backtest.
verbose : bool
whether to print log.
"""
# check strategy:
spec = inspect.getfullargspec(get_strategy)
str_args = {k: v for k, v in kwargs.items() if k in spec.args}
strategy = get_strategy(**str_args)
# init exchange:
spec = inspect.getfullargspec(get_exchange)
ex_args = {k: v for k, v in kwargs.items() if k in spec.args}
trade_exchange = get_exchange(pred, **ex_args)
# init executor:
executor = get_executor(executor=kwargs.get("executor"), trade_exchange=trade_exchange, verbose=verbose)
# run backtest
report_dict = backtest_func(
pred=pred,
strategy=strategy,
executor=executor,
trade_exchange=trade_exchange,
shift=shift,
verbose=verbose,
account=account,
benchmark=benchmark,
return_order=return_order,
)
# for compatibility of the old API. return the dict positions
positions = report_dict.get("positions")
report_dict.update({"positions": {k: p.position for k, p in positions.items()}})
return report_dict

View File

@@ -5,7 +5,6 @@
import numpy as np
import pandas as pd
from ...utils import get_date_by_shift, get_date_range
from ..online.executor import SimulatorExecutor
from ...data import D
from .account import Account
from ...config import C
@@ -15,7 +14,7 @@ from ...data.dataset.utils import get_level_index
LOG = get_module_logger("backtest")
def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark):
def backtest(pred, strategy, executor, trade_exchange, shift, verbose, account, benchmark, return_order):
"""Parameters
----------
pred : pandas.DataFrame
@@ -69,9 +68,9 @@ def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark)
raise ValueError(f"The benchmark {_codes} does not exist. Please provide the right benchmark")
bench = _temp_result.groupby(level="datetime")[_temp_result.columns.tolist()[0]].mean()
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], shift=shift))
executor = SimulatorExecutor(trade_exchange, verbose=verbose)
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], left_shift=1, right_shift=shift))
if return_order:
multi_order_list = []
# trading apart
for pred_date, trade_date in zip(predict_dates, trade_dates):
# for loop predict date and trading date
@@ -103,6 +102,8 @@ def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark)
)
else:
order_list = []
if return_order:
multi_order_list.append((trade_account, order_list, trade_date))
# 4. Get result after executing order list
# NOTE: The following operation will modify order.amount.
# NOTE: If it is buy and the cash is insufficient, the tradable amount will be recalculated
@@ -115,7 +116,11 @@ def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark)
report_df = trade_account.report.generate_report_dataframe()
report_df["bench"] = bench
positions = trade_account.get_positions()
return report_df, positions
report_dict = {"report_df": report_df, "positions": positions}
if return_order:
report_dict.update({"order_list": multi_order_list})
return report_dict
def update_account(trade_account, trade_info, trade_exchange, trade_date):

View File

@@ -43,16 +43,18 @@ _DEFAULT_INFER_PROCESSORS = [
]
class ALPHA360(DataHandlerLP):
class Alpha360(DataHandlerLP):
def __init__(
self,
instruments="csi500",
start_time=None,
end_time=None,
freq="day",
infer_processors=_DEFAULT_INFER_PROCESSORS,
learn_processors=_DEFAULT_LEARN_PROCESSORS,
fit_start_time=None,
fit_end_time=None,
filter_pipe=None,
**kwargs,
):
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
@@ -65,13 +67,15 @@ class ALPHA360(DataHandlerLP):
"feature": self.get_feature_config(),
"label": kwargs.get("label", self.get_label_config()),
},
"filter_pipe": filter_pipe,
"freq": freq,
},
}
super().__init__(
instruments,
start_time,
end_time,
instruments=instruments,
start_time=start_time,
end_time=end_time,
data_loader=data_loader,
learn_processors=learn_processors,
infer_processors=infer_processors,
@@ -119,7 +123,7 @@ class ALPHA360(DataHandlerLP):
return fields, names
class ALPHA360vwap(ALPHA360):
class Alpha360vwap(Alpha360):
def get_label_config(self):
return (["Ref($vwap, -2)/Ref($vwap, -1) - 1"], ["LABEL0"])
@@ -130,11 +134,13 @@ class Alpha158(DataHandlerLP):
instruments="csi500",
start_time=None,
end_time=None,
freq="day",
infer_processors=[],
learn_processors=_DEFAULT_LEARN_PROCESSORS,
fit_start_time=None,
fit_end_time=None,
process_type=DataHandlerLP.PTYPE_A,
filter_pipe=None,
**kwargs,
):
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
@@ -143,13 +149,18 @@ class Alpha158(DataHandlerLP):
data_loader = {
"class": "QlibDataLoader",
"kwargs": {
"config": {"feature": self.get_feature_config(), "label": kwargs.get("label", self.get_label_config())},
"config": {
"feature": self.get_feature_config(),
"label": kwargs.get("label", self.get_label_config()),
},
"filter_pipe": filter_pipe,
"freq": freq,
},
}
super().__init__(
instruments,
start_time,
end_time,
instruments=instruments,
start_time=start_time,
end_time=end_time,
data_loader=data_loader,
infer_processors=infer_processors,
learn_processors=learn_processors,

View File

@@ -6,17 +6,16 @@ from __future__ import print_function
import numpy as np
import pandas as pd
import inspect
import warnings
from ..log import get_module_logger
from . import strategy as strategy_pool
from .strategy.strategy import BaseStrategy
from .backtest.exchange import Exchange
from .backtest.backtest import backtest as backtest_func, get_date_range
from .backtest import get_exchange, backtest as backtest_func
from .backtest.backtest import get_date_range
from ..data import D
from ..config import C
from ..data.dataset.utils import get_level_index
logger = get_module_logger("Evaluate")
@@ -46,144 +45,6 @@ def risk_analysis(r, N=252):
return res
def get_strategy(
strategy=None,
topk=50,
margin=0.5,
n_drop=5,
risk_degree=0.95,
str_type="amount",
adjust_dates=None,
):
"""get_strategy
Parameters
----------
strategy : Strategy()
strategy used in backtest.
topk : int (Default value: 50)
top-N stocks to buy.
margin : int or float(Default value: 0.5)
- if isinstance(margin, int):
sell_limit = margin
- else:
sell_limit = pred_in_a_day.count() * margin
buffer margin, in single score_mode, continue holding stock if it is in nlargest(sell_limit).
sell_limit should be no less than topk.
n_drop : int
number of stocks to be replaced in each trading date.
risk_degree: float
0-1, 0.95 for example, use 95% money to trade.
str_type: 'amount', 'weight' or 'dropout'
strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy.
Returns
-------
:class: Strategy
an initialized strategy object
"""
if strategy is None:
str_cls_dict = {
"amount": "TopkAmountStrategy",
"weight": "TopkWeightStrategy",
"dropout": "TopkDropoutStrategy",
}
logger.info("Create new streategy ")
str_cls = getattr(strategy_pool, str_cls_dict.get(str_type))
strategy = str_cls(
topk=topk,
buffer_margin=margin,
n_drop=n_drop,
risk_degree=risk_degree,
adjust_dates=adjust_dates,
)
if not isinstance(strategy, BaseStrategy):
raise TypeError("Strategy not supported")
return strategy
def get_exchange(
pred,
exchange=None,
subscribe_fields=[],
open_cost=0.0015,
close_cost=0.0025,
min_cost=5.0,
trade_unit=None,
limit_threshold=None,
deal_price=None,
extract_codes=False,
shift=1,
):
"""get_exchange
Parameters
----------
# exchange related arguments
exchange: Exchange().
subscribe_fields: list
subscribe fields.
open_cost : float
open transaction cost.
close_cost : float
close transaction cost.
min_cost : float
min transaction cost.
trade_unit : int
100 for China A.
deal_price: str
dealing price type: 'close', 'open', 'vwap'.
limit_threshold : float
limit move 0.1 (10%) for example, long and short with same limit.
extract_codes: bool
will we pass the codes extracted from the pred to the exchange.
NOTE: This will be faster with offline qlib.
Returns
-------
:class: Exchange
an initialized Exchange object
"""
if trade_unit is None:
trade_unit = C.trade_unit
if limit_threshold is None:
limit_threshold = C.limit_threshold
if deal_price is None:
deal_price = C.deal_price
if exchange is None:
logger.info("Create new exchange")
# handle exception for deal_price
if deal_price[0] != "$":
deal_price = "$" + deal_price
if extract_codes:
codes = sorted(pred.index.get_level_values("instrument").unique())
else:
codes = "all" # TODO: We must ensure that 'all.txt' includes all the stocks
dates = sorted(pred.index.get_level_values("datetime").unique())
dates = np.append(dates, get_date_range(dates[-1], shift=shift))
exchange = Exchange(
trade_dates=dates,
codes=codes,
deal_price=deal_price,
subscribe_fields=subscribe_fields,
limit_threshold=limit_threshold,
open_cost=open_cost,
close_cost=close_cost,
min_cost=min_cost,
trade_unit=trade_unit,
)
return exchange
# This is the API for compatibility for legacy code
def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **kwargs):
"""This function will help you set a reasonable Exchange and provide default value for strategy
@@ -249,30 +110,22 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k
will we pass the codes extracted from the pred to the exchange.
.. note:: This will be faster with offline qlib.
- **executor related arguments**
executor : BaseExecutor()
executor used in backtest.
verbose : bool
whether to print log.
"""
# check strategy:
spec = inspect.getfullargspec(get_strategy)
str_args = {k: v for k, v in kwargs.items() if k in spec.args}
strategy = get_strategy(**str_args)
# init exchange:
spec = inspect.getfullargspec(get_exchange)
ex_args = {k: v for k, v in kwargs.items() if k in spec.args}
trade_exchange = get_exchange(pred, **ex_args)
# run backtest
report_df, positions = backtest_func(
pred=pred,
strategy=strategy,
trade_exchange=trade_exchange,
shift=shift,
verbose=verbose,
account=account,
benchmark=benchmark,
warnings.warn(
"this function is deprecated, please use backtest function in qlib.contrib.backtest", DeprecationWarning
)
# for compatibility of the old API. return the dict positions
positions = {k: p.position for k, p in positions.items()}
return report_df, positions
report_dict = backtest_func(
pred=pred, account=account, shift=shift, benchmark=benchmark, verbose=verbose, return_order=False, **kwargs
)
return report_dict.get("report_df"), report_dict.get("positions")
def long_short_backtest(
@@ -340,7 +193,7 @@ def long_short_backtest(
_pred_dates = pred.index.get_level_values(level="datetime")
predict_dates = D.calendar(start_time=_pred_dates.min(), end_time=_pred_dates.max())
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], shift=shift))
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], left_shift=1, right_shift=shift))
long_returns = {}
short_returns = {}

View File

@@ -56,8 +56,8 @@ class ALSTM(Model):
early_stop=20,
loss="mse",
optimizer="adam",
GPU="0",
seed=0,
GPU=0,
seed=None,
**kwargs
):
# Set logger.
@@ -76,7 +76,7 @@ class ALSTM(Model):
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
self.visible_GPU = GPU
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu")
self.use_gpu = torch.cuda.is_available()
self.seed = seed
@@ -113,8 +113,9 @@ class ALSTM(Model):
)
)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.ALSTM_model = ALSTMModel(
d_feat=self.d_feat,
@@ -130,11 +131,7 @@ class ALSTM(Model):
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self._fitted = False
if self.use_gpu:
self.ALSTM_model.cuda()
# set the visible GPU
if self.visible_GPU:
os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
self.ALSTM_model.to(self.device)
def mse(self, pred, label):
loss = (pred - label) ** 2
@@ -172,12 +169,8 @@ class ALSTM(Model):
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
pred = self.ALSTM_model(feature)
loss = self.loss_fn(pred, label)
@@ -205,12 +198,8 @@ class ALSTM(Model):
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device)
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device)
pred = self.ALSTM_model(feature)
loss = self.loss_fn(pred, label)
@@ -298,10 +287,7 @@ class ALSTM(Model):
else:
end = begin + self.batch_size
x_batch = torch.from_numpy(x_values[begin:end]).float()
if self.use_gpu:
x_batch = x_batch.cuda()
x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device)
with torch.no_grad():
if self.use_gpu:

View File

@@ -0,0 +1,331 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import copy
from sklearn.metrics import roc_auc_score, mean_squared_error
import logging
from ...utils import (
unpack_archive_with_buffer,
save_multiple_parts_file,
create_save_path,
drop_nan_by_y_index,
)
from ...log import get_module_logger, TimeInspector
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from ...model.base import Model
from ...data.dataset import DatasetH, TSDatasetH
from ...data.dataset.handler import DataHandlerLP
class ALSTM(Model):
"""ALSTM Model
Parameters
----------
d_feat : int
input dimension for each time step
metric: str
the evaluate metric used in early stop
optimizer : str
optimizer name
GPU : str
the GPU ID(s) used for training
"""
def __init__(
self,
d_feat=6,
hidden_size=64,
num_layers=2,
dropout=0.0,
n_epochs=200,
lr=0.001,
metric="",
batch_size=2000,
early_stop=20,
loss="mse",
optimizer="adam",
n_jobs=10,
GPU=0,
seed=None,
**kwargs
):
# Set logger.
self.logger = get_module_logger("ALSTM")
self.logger.info("ALSTM pytorch version...")
# set hyper-parameters.
self.d_feat = d_feat
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = dropout
self.n_epochs = n_epochs
self.lr = lr
self.metric = metric
self.batch_size = batch_size
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu")
self.n_jobs = n_jobs
self.use_gpu = torch.cuda.is_available()
self.seed = seed
self.logger.info(
"ALSTM parameters setting:"
"\nd_feat : {}"
"\nhidden_size : {}"
"\nnum_layers : {}"
"\ndropout : {}"
"\nn_epochs : {}"
"\nlr : {}"
"\nmetric : {}"
"\nbatch_size : {}"
"\nearly_stop : {}"
"\noptimizer : {}"
"\nloss_type : {}"
"\nvisible_GPU : {}"
"\nn_jobs : {}"
"\nuse_GPU : {}"
"\nseed : {}".format(
d_feat,
hidden_size,
num_layers,
dropout,
n_epochs,
lr,
metric,
batch_size,
early_stop,
optimizer.lower(),
loss,
GPU,
n_jobs,
self.use_gpu,
seed,
)
)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.ALSTM_model = ALSTMModel(
d_feat=self.d_feat,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
dropout=self.dropout,
).to(self.device)
if optimizer.lower() == "adam":
self.train_optimizer = optim.Adam(self.ALSTM_model.parameters(), lr=self.lr)
elif optimizer.lower() == "gd":
self.train_optimizer = optim.SGD(self.ALSTM_model.parameters(), lr=self.lr)
else:
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self._fitted = False
self.ALSTM_model.to(self.device)
def mse(self, pred, label):
loss = (pred - label) ** 2
return torch.mean(loss)
def loss_fn(self, pred, label):
mask = ~torch.isnan(label)
if self.loss == "mse":
return self.mse(pred[mask], label[mask])
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
mask = torch.isfinite(label)
if self.metric == "" or self.metric == "loss":
return -self.loss_fn(pred[mask], label[mask])
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, data_loader):
self.ALSTM_model.train()
for data in data_loader:
feature = data[:, :, 0:-1].to(self.device)
label = data[:, -1, -1].to(self.device)
pred = self.ALSTM_model(feature.float())
loss = self.loss_fn(pred, label)
self.train_optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_value_(self.ALSTM_model.parameters(), 3.0)
self.train_optimizer.step()
def test_epoch(self, data_loader):
self.ALSTM_model.eval()
scores = []
losses = []
for data in data_loader:
feature = data[:, :, 0:-1].to(self.device)
# feature[torch.isnan(feature)] = 0
label = data[:, -1, -1].to(self.device)
pred = self.ALSTM_model(feature.float())
loss = self.loss_fn(pred, label)
losses.append(loss.item())
score = self.metric_fn(pred, label)
scores.append(score.item())
return np.mean(losses), np.mean(scores)
def fit(
self,
dataset,
evals_result=dict(),
verbose=True,
save_path=None,
):
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
train_loader = DataLoader(dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs)
valid_loader = DataLoader(dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs)
if save_path == None:
save_path = create_save_path(save_path)
stop_steps = 0
train_loss = 0
best_score = -np.inf
best_epoch = 0
evals_result["train"] = []
evals_result["valid"] = []
# train
self.logger.info("training...")
self._fitted = True
for step in range(self.n_epochs):
self.logger.info("Epoch%d:", step)
self.logger.info("training...")
self.train_epoch(train_loader)
self.logger.info("evaluating...")
train_loss, train_score = self.test_epoch(train_loader)
val_loss, val_score = self.test_epoch(valid_loader)
self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
evals_result["train"].append(train_score)
evals_result["valid"].append(val_score)
if val_score > best_score:
best_score = val_score
stop_steps = 0
best_epoch = step
best_param = copy.deepcopy(self.ALSTM_model.state_dict())
else:
stop_steps += 1
if stop_steps >= self.early_stop:
self.logger.info("early stop")
break
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
self.ALSTM_model.load_state_dict(best_param)
torch.save(best_param, save_path)
if self.use_gpu:
torch.cuda.empty_cache()
def predict(self, dataset):
if not self._fitted:
raise ValueError("model is not fitted yet!")
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
dl_test.config(fillna_type="ffill+bfill")
test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs)
self.ALSTM_model.eval()
preds = []
for data in test_loader:
feature = data[:, :, 0:-1].to(self.device)
with torch.no_grad():
if self.use_gpu:
pred = self.ALSTM_model(feature.float()).detach().cpu().numpy()
else:
pred = self.ALSTM_model(feature.float()).detach().numpy()
preds.append(pred)
return pd.Series(np.concatenate(preds), index=dl_test.get_index())
class ALSTMModel(nn.Module):
def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, rnn_type="GRU"):
super().__init__()
self.hid_size = hidden_size
self.input_size = d_feat
self.dropout = dropout
self.rnn_type = rnn_type
self.rnn_layer = num_layers
self._build_model()
def _build_model(self):
try:
klass = getattr(nn, self.rnn_type.upper())
except:
raise ValueError("unknown rnn_type `%s`" % self.rnn_type)
self.net = nn.Sequential()
self.net.add_module("fc_in", nn.Linear(in_features=self.input_size, out_features=self.hid_size))
self.net.add_module("act", nn.Tanh())
self.rnn = klass(
input_size=self.hid_size,
hidden_size=self.hid_size,
num_layers=self.rnn_layer,
batch_first=True,
dropout=self.dropout,
)
self.fc_out = nn.Linear(in_features=self.hid_size * 2, out_features=1)
self.att_net = nn.Sequential()
self.att_net.add_module(
"att_fc_in",
nn.Linear(in_features=self.hid_size, out_features=int(self.hid_size / 2)),
)
self.att_net.add_module("att_dropout", torch.nn.Dropout(self.dropout))
self.att_net.add_module("att_act", nn.Tanh())
self.att_net.add_module(
"att_fc_out",
nn.Linear(in_features=int(self.hid_size / 2), out_features=1, bias=False),
)
self.att_net.add_module("att_softmax", nn.Softmax(dim=1))
def forward(self, inputs):
rnn_out, _ = self.rnn(self.net(inputs)) # [batch, seq_len, num_directions * hidden_size]
attention_score = self.att_net(rnn_out) # [batch, seq_len, 1]
out_att = torch.mul(rnn_out, attention_score)
out_att = torch.sum(out_att, dim=1)
out = self.fc_out(
torch.cat((rnn_out[:, -1, :], out_att), dim=1)
) # [batch, seq_len, num_directions * hidden_size] -> [batch, 1]
return out[..., 0]

View File

@@ -61,8 +61,8 @@ class GATs(Model):
with_pretrain=True,
model_path=None,
optimizer="adam",
GPU="0",
seed=0,
GPU=0,
seed=None,
**kwargs
):
# Set logger.
@@ -83,7 +83,7 @@ class GATs(Model):
self.base_model = base_model
self.with_pretrain = with_pretrain
self.model_path = model_path
self.visible_GPU = GPU
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu")
self.use_gpu = torch.cuda.is_available()
self.seed = seed
@@ -123,8 +123,11 @@ class GATs(Model):
seed,
)
)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.GAT_model = GATModel(
d_feat=self.d_feat,
hidden_size=self.hidden_size,
@@ -140,11 +143,7 @@ class GATs(Model):
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self._fitted = False
if self.use_gpu:
self.GAT_model.cuda()
# set the visible GPU
if self.visible_GPU:
os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
self.GAT_model.to(self.device)
def mse(self, pred, label):
loss = (pred - label) ** 2
@@ -190,12 +189,8 @@ class GATs(Model):
for idx, count in zip(daily_index, daily_count):
batch = slice(idx, idx + count)
feature = torch.from_numpy(x_train_values[batch]).float()
label = torch.from_numpy(y_train_values[batch]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
feature = torch.from_numpy(x_train_values[batch]).float().to(self.device)
label = torch.from_numpy(y_train_values[batch]).float().to(self.device)
pred = self.GAT_model(feature)
loss = self.loss_fn(pred, label)
@@ -221,12 +216,8 @@ class GATs(Model):
for idx, count in zip(daily_index, daily_count):
batch = slice(idx, idx + count)
feature = torch.from_numpy(x_values[batch]).float()
label = torch.from_numpy(y_values[batch]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
feature = torch.from_numpy(x_values[batch]).float().to(self.device)
label = torch.from_numpy(y_values[batch]).float().to(self.device)
pred = self.GAT_model(feature)
loss = self.loss_fn(pred, label)
@@ -330,10 +321,7 @@ class GATs(Model):
for idx, count in zip(daily_index, daily_count):
batch = slice(idx, idx + count)
x_batch = torch.from_numpy(x_values[batch]).float()
if self.use_gpu:
x_batch = x_batch.cuda()
x_batch = torch.from_numpy(x_values[batch]).float().to(self.device)
with torch.no_grad():
if self.use_gpu:

View File

@@ -0,0 +1,413 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import copy
from sklearn.metrics import roc_auc_score, mean_squared_error
import logging
from ...utils import (
unpack_archive_with_buffer,
save_multiple_parts_file,
create_save_path,
drop_nan_by_y_index,
)
from ...log import get_module_logger, TimeInspector
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Sampler
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
from ...contrib.model.pytorch_lstm import LSTMModel
from ...contrib.model.pytorch_gru import GRUModel
class DailyBatchSampler(Sampler):
def __init__(self, data_source):
self.data_source = data_source
self.data = self.data_source.data.loc[self.data_source.get_index()]
self.daily_count = self.data.groupby(level=0).size().values # calculate number of samples in each batch
self.daily_index = np.roll(np.cumsum(self.daily_count), 1) # calculate begin index of each batch
self.daily_index[0] = 0
def __iter__(self):
for idx, count in zip(self.daily_index, self.daily_count):
yield np.arange(idx, idx + count)
def __len__(self):
return len(self.data_source)
class GATs(Model):
"""GATs Model
Parameters
----------
lr : float
learning rate
d_feat : int
input dimensions for each time step
metric : str
the evaluate metric used in early stop
optimizer : str
optimizer name
GPU : str
the GPU ID(s) used for training
"""
def __init__(
self,
d_feat=20,
hidden_size=64,
num_layers=2,
dropout=0.0,
n_epochs=200,
lr=0.001,
metric="",
early_stop=20,
loss="mse",
base_model="GRU",
with_pretrain=True,
model_path=None,
optimizer="adam",
GPU="0",
n_jobs=10,
seed=None,
**kwargs
):
# Set logger.
self.logger = get_module_logger("GATs")
self.logger.info("GATs pytorch version...")
# set hyper-parameters.
self.d_feat = d_feat
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = dropout
self.n_epochs = n_epochs
self.lr = lr
self.metric = metric
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
self.base_model = base_model
self.with_pretrain = with_pretrain
self.model_path = model_path
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu")
self.n_jobs = n_jobs
self.use_gpu = torch.cuda.is_available()
self.seed = seed
self.logger.info(
"GATs parameters setting:"
"\nd_feat : {}"
"\nhidden_size : {}"
"\nnum_layers : {}"
"\ndropout : {}"
"\nn_epochs : {}"
"\nlr : {}"
"\nmetric : {}"
"\nearly_stop : {}"
"\noptimizer : {}"
"\nloss_type : {}"
"\nbase_model : {}"
"\nwith_pretrain : {}"
"\nmodel_path : {}"
"\nvisible_GPU : {}"
"\nuse_GPU : {}"
"\nseed : {}".format(
d_feat,
hidden_size,
num_layers,
dropout,
n_epochs,
lr,
metric,
early_stop,
optimizer.lower(),
loss,
base_model,
with_pretrain,
model_path,
GPU,
self.use_gpu,
seed,
)
)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.GAT_model = GATModel(
d_feat=self.d_feat,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
dropout=self.dropout,
base_model=self.base_model,
)
if optimizer.lower() == "adam":
self.train_optimizer = optim.Adam(self.GAT_model.parameters(), lr=self.lr)
elif optimizer.lower() == "gd":
self.train_optimizer = optim.SGD(self.GAT_model.parameters(), lr=self.lr)
else:
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self._fitted = False
self.GAT_model.to(self.device)
def mse(self, pred, label):
loss = (pred - label) ** 2
return torch.mean(loss)
def loss_fn(self, pred, label):
mask = ~torch.isnan(label)
if self.loss == "mse":
return self.mse(pred[mask], label[mask])
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
mask = torch.isfinite(label)
if self.metric == "" or self.metric == "loss":
return -self.loss_fn(pred[mask], label[mask])
raise ValueError("unknown metric `%s`" % self.metric)
def get_daily_inter(self, df, shuffle=False):
# organize the train data into daily batches
daily_count = df.groupby(level=0).size().values
daily_index = np.roll(np.cumsum(daily_count), 1)
daily_index[0] = 0
if shuffle:
# shuffle data
daily_shuffle = list(zip(daily_index, daily_count))
np.random.shuffle(daily_shuffle)
daily_index, daily_count = zip(*daily_shuffle)
return daily_index, daily_count
def train_epoch(self, data_loader):
self.GAT_model.train()
for data in data_loader:
data = data.squeeze()
feature = data[:, :, 0:-1].to(self.device)
label = data[:, -1, -1].to(self.device)
pred = self.GAT_model(feature.float())
loss = self.loss_fn(pred, label)
self.train_optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_value_(self.GAT_model.parameters(), 3.0)
self.train_optimizer.step()
def test_epoch(self, data_loader):
self.GAT_model.eval()
scores = []
losses = []
for data in data_loader:
data = data.squeeze()
feature = data[:, :, 0:-1].to(self.device)
# feature[torch.isnan(feature)] = 0
label = data[:, -1, -1].to(self.device)
pred = self.GAT_model(feature.float())
loss = self.loss_fn(pred, label)
losses.append(loss.item())
score = self.metric_fn(pred, label)
scores.append(score.item())
return np.mean(losses), np.mean(scores)
def fit(
self,
dataset,
evals_result=dict(),
verbose=True,
save_path=None,
):
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
sampler_train = DailyBatchSampler(dl_train)
sampler_valid = DailyBatchSampler(dl_valid)
train_loader = DataLoader(dl_train, sampler=sampler_train, num_workers=self.n_jobs)
valid_loader = DataLoader(dl_valid, sampler=sampler_valid, num_workers=self.n_jobs)
if save_path == None:
save_path = create_save_path(save_path)
stop_steps = 0
train_loss = 0
best_score = -np.inf
best_epoch = 0
evals_result["train"] = []
evals_result["valid"] = []
# load pretrained base_model
if self.with_pretrain:
if self.model_path == None:
raise ValueError("the path of the pretrained model should be given first!")
self.logger.info("Loading pretrained model...")
if self.base_model == "LSTM":
pretrained_model = LSTMModel(
d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers
)
pretrained_model.load_state_dict(torch.load(self.model_path))
elif self.base_model == "GRU":
pretrained_model = GRUModel(
d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers
)
pretrained_model.load_state_dict(torch.load(self.model_path))
else:
raise ValueError("unknown base model name `%s`" % self.base_model)
model_dict = self.GAT_model.state_dict()
pretrained_dict = {k: v for k, v in pretrained_model.state_dict().items() if k in model_dict}
model_dict.update(pretrained_dict)
self.GAT_model.load_state_dict(model_dict)
self.logger.info("Loading pretrained model Done...")
# train
self.logger.info("training...")
self._fitted = True
for step in range(self.n_epochs):
self.logger.info("Epoch%d:", step)
self.logger.info("training...")
self.train_epoch(train_loader)
self.logger.info("evaluating...")
train_loss, train_score = self.test_epoch(train_loader)
val_loss, val_score = self.test_epoch(valid_loader)
self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
evals_result["train"].append(train_score)
evals_result["valid"].append(val_score)
if val_score > best_score:
best_score = val_score
stop_steps = 0
best_epoch = step
best_param = copy.deepcopy(self.GAT_model.state_dict())
else:
stop_steps += 1
if stop_steps >= self.early_stop:
self.logger.info("early stop")
break
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
self.GAT_model.load_state_dict(best_param)
torch.save(best_param, save_path)
if self.use_gpu:
torch.cuda.empty_cache()
def predict(self, dataset):
if not self._fitted:
raise ValueError("model is not fitted yet!")
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
dl_test.config(fillna_type="ffill+bfill")
sampler_test = DailyBatchSampler(dl_test)
test_loader = DataLoader(dl_test, sampler=sampler_test, num_workers=self.n_jobs)
self.GAT_model.eval()
preds = []
for data in test_loader:
data = data.squeeze()
feature = data[:, :, 0:-1].to(self.device)
with torch.no_grad():
if self.use_gpu:
pred = self.GAT_model(feature.float()).detach().cpu().numpy()
else:
pred = self.GAT_model(feature.float()).detach().numpy()
preds.append(pred)
return pd.Series(np.concatenate(preds), index=dl_test.get_index())
class GATModel(nn.Module):
def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, base_model="GRU"):
super().__init__()
if base_model == "GRU":
self.rnn = nn.GRU(
input_size=d_feat,
hidden_size=hidden_size,
num_layers=num_layers,
batch_first=True,
dropout=dropout,
)
elif base_model == "LSTM":
self.rnn = nn.LSTM(
input_size=d_feat,
hidden_size=hidden_size,
num_layers=num_layers,
batch_first=True,
dropout=dropout,
)
else:
raise ValueError("unknown base model name `%s`" % base_model)
self.hidden_size = hidden_size
self.d_feat = d_feat
self.transformation = nn.Linear(self.hidden_size, self.hidden_size)
self.a = nn.Parameter(torch.randn(self.hidden_size * 2, 1))
self.a.requires_grad = True
self.fc = nn.Linear(self.hidden_size, self.hidden_size)
self.fc_out = nn.Linear(hidden_size, 1)
self.leaky_relu = nn.LeakyReLU()
self.softmax = nn.Softmax(dim=1)
def cal_attention(self, x, y):
x = self.transformation(x)
y = self.transformation(y)
sample_num = x.shape[0]
dim = x.shape[1]
e_x = x.expand(sample_num, sample_num, dim)
e_y = torch.transpose(e_x, 0, 1)
attention_in = torch.cat((e_x, e_y), 2).view(-1, dim * 2)
self.a_t = torch.t(self.a)
attention_out = self.a_t.mm(torch.t(attention_in)).view(sample_num, sample_num)
attention_out = self.leaky_relu(attention_out)
att_weight = self.softmax(attention_out)
return att_weight
def forward(self, x):
out, _ = self.rnn(x)
hidden = out[:, -1, :]
att_weight = self.cal_attention(hidden, hidden)
hidden = att_weight.mm(hidden) + hidden
hidden = self.fc(hidden)
hidden = self.leaky_relu(hidden)
return self.fc_out(hidden).squeeze()

View File

@@ -56,8 +56,8 @@ class GRU(Model):
early_stop=20,
loss="mse",
optimizer="adam",
GPU="0",
seed=0,
GPU=0,
seed=None,
**kwargs
):
# Set logger.
@@ -76,7 +76,7 @@ class GRU(Model):
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
self.visible_GPU = GPU
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu")
self.use_gpu = torch.cuda.is_available()
self.seed = seed
@@ -113,8 +113,9 @@ class GRU(Model):
)
)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.gru_model = GRUModel(
d_feat=self.d_feat,
@@ -130,11 +131,7 @@ class GRU(Model):
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self._fitted = False
if self.use_gpu:
self.gru_model.cuda()
# set the visible GPU
if self.visible_GPU:
os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
self.gru_model.to(self.device)
def mse(self, pred, label):
loss = (pred - label) ** 2
@@ -172,12 +169,8 @@ class GRU(Model):
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
pred = self.gru_model(feature)
loss = self.loss_fn(pred, label)
@@ -205,12 +198,8 @@ class GRU(Model):
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device)
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device)
pred = self.gru_model(feature)
loss = self.loss_fn(pred, label)
@@ -298,10 +287,7 @@ class GRU(Model):
else:
end = begin + self.batch_size
x_batch = torch.from_numpy(x_values[begin:end]).float()
if self.use_gpu:
x_batch = x_batch.cuda()
x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device)
with torch.no_grad():
if self.use_gpu:

View File

@@ -0,0 +1,301 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import copy
from sklearn.metrics import roc_auc_score, mean_squared_error
import logging
from ...utils import (
unpack_archive_with_buffer,
save_multiple_parts_file,
create_save_path,
drop_nan_by_y_index,
)
from ...log import get_module_logger, TimeInspector
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from ...model.base import Model
from ...data.dataset import DatasetH, TSDatasetH
from ...data.dataset.handler import DataHandlerLP
class GRU(Model):
"""GRU Model
Parameters
----------
d_feat : int
input dimension for each time step
metric: str
the evaluate metric used in early stop
optimizer : str
optimizer name
GPU : str
the GPU ID(s) used for training
"""
def __init__(
self,
d_feat=6,
hidden_size=64,
num_layers=2,
dropout=0.0,
n_epochs=200,
lr=0.001,
metric="",
batch_size=2000,
early_stop=20,
loss="mse",
optimizer="adam",
n_jobs=10,
GPU=0,
seed=None,
**kwargs
):
# Set logger.
self.logger = get_module_logger("GRU")
self.logger.info("GRU pytorch version...")
# set hyper-parameters.
self.d_feat = d_feat
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = dropout
self.n_epochs = n_epochs
self.lr = lr
self.metric = metric
self.batch_size = batch_size
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu")
self.n_jobs = n_jobs
self.use_gpu = torch.cuda.is_available()
self.seed = seed
self.logger.info(
"GRU parameters setting:"
"\nd_feat : {}"
"\nhidden_size : {}"
"\nnum_layers : {}"
"\ndropout : {}"
"\nn_epochs : {}"
"\nlr : {}"
"\nmetric : {}"
"\nbatch_size : {}"
"\nearly_stop : {}"
"\noptimizer : {}"
"\nloss_type : {}"
"\nvisible_GPU : {}"
"\nn_jobs : {}"
"\nuse_GPU : {}"
"\nseed : {}".format(
d_feat,
hidden_size,
num_layers,
dropout,
n_epochs,
lr,
metric,
batch_size,
early_stop,
optimizer.lower(),
loss,
GPU,
n_jobs,
self.use_gpu,
seed,
)
)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.GRU_model = GRUModel(
d_feat=self.d_feat,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
dropout=self.dropout,
).to(self.device)
if optimizer.lower() == "adam":
self.train_optimizer = optim.Adam(self.GRU_model.parameters(), lr=self.lr)
elif optimizer.lower() == "gd":
self.train_optimizer = optim.SGD(self.GRU_model.parameters(), lr=self.lr)
else:
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self._fitted = False
self.GRU_model.to(self.device)
def mse(self, pred, label):
loss = (pred - label) ** 2
return torch.mean(loss)
def loss_fn(self, pred, label):
mask = ~torch.isnan(label)
if self.loss == "mse":
return self.mse(pred[mask], label[mask])
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
mask = torch.isfinite(label)
if self.metric == "" or self.metric == "loss":
return -self.loss_fn(pred[mask], label[mask])
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, data_loader):
self.GRU_model.train()
for data in data_loader:
feature = data[:, :, 0:-1].to(self.device)
label = data[:, -1, -1].to(self.device)
pred = self.GRU_model(feature.float())
loss = self.loss_fn(pred, label)
self.train_optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_value_(self.GRU_model.parameters(), 3.0)
self.train_optimizer.step()
def test_epoch(self, data_loader):
self.GRU_model.eval()
scores = []
losses = []
for data in data_loader:
feature = data[:, :, 0:-1].to(self.device)
# feature[torch.isnan(feature)] = 0
label = data[:, -1, -1].to(self.device)
pred = self.GRU_model(feature.float())
loss = self.loss_fn(pred, label)
losses.append(loss.item())
score = self.metric_fn(pred, label)
scores.append(score.item())
return np.mean(losses), np.mean(scores)
def fit(
self,
dataset,
evals_result=dict(),
verbose=True,
save_path=None,
):
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
train_loader = DataLoader(dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs)
valid_loader = DataLoader(dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs)
if save_path == None:
save_path = create_save_path(save_path)
stop_steps = 0
train_loss = 0
best_score = -np.inf
best_epoch = 0
evals_result["train"] = []
evals_result["valid"] = []
# train
self.logger.info("training...")
self._fitted = True
for step in range(self.n_epochs):
self.logger.info("Epoch%d:", step)
self.logger.info("training...")
self.train_epoch(train_loader)
self.logger.info("evaluating...")
train_loss, train_score = self.test_epoch(train_loader)
val_loss, val_score = self.test_epoch(valid_loader)
self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
evals_result["train"].append(train_score)
evals_result["valid"].append(val_score)
if val_score > best_score:
best_score = val_score
stop_steps = 0
best_epoch = step
best_param = copy.deepcopy(self.GRU_model.state_dict())
else:
stop_steps += 1
if stop_steps >= self.early_stop:
self.logger.info("early stop")
break
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
self.GRU_model.load_state_dict(best_param)
torch.save(best_param, save_path)
if self.use_gpu:
torch.cuda.empty_cache()
def predict(self, dataset):
if not self._fitted:
raise ValueError("model is not fitted yet!")
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
dl_test.config(fillna_type="ffill+bfill")
test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs)
self.GRU_model.eval()
preds = []
for data in test_loader:
feature = data[:, :, 0:-1].to(self.device)
with torch.no_grad():
if self.use_gpu:
pred = self.GRU_model(feature.float()).detach().cpu().numpy()
else:
pred = self.GRU_model(feature.float()).detach().numpy()
preds.append(pred)
return pd.Series(np.concatenate(preds), index=dl_test.get_index())
class GRUModel(nn.Module):
def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0):
super().__init__()
self.rnn = nn.GRU(
input_size=d_feat,
hidden_size=hidden_size,
num_layers=num_layers,
batch_first=True,
dropout=dropout,
)
self.fc_out = nn.Linear(hidden_size, 1)
self.d_feat = d_feat
def forward(self, x):
out, _ = self.rnn(x)
return self.fc_out(out[:, -1, :]).squeeze()

View File

@@ -56,8 +56,8 @@ class LSTM(Model):
early_stop=20,
loss="mse",
optimizer="adam",
GPU="0",
seed=0,
GPU=0,
seed=None,
**kwargs
):
# Set logger.
@@ -76,7 +76,7 @@ class LSTM(Model):
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
self.visible_GPU = GPU
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu")
self.use_gpu = torch.cuda.is_available()
self.seed = seed
@@ -113,8 +113,9 @@ class LSTM(Model):
)
)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.lstm_model = LSTMModel(
d_feat=self.d_feat,
@@ -130,11 +131,7 @@ class LSTM(Model):
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self._fitted = False
if self.use_gpu:
self.lstm_model.cuda()
# set the visible GPU
if self.visible_GPU:
os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
self.lstm_model.to(self.device)
def mse(self, pred, label):
loss = (pred - label) ** 2
@@ -172,12 +169,8 @@ class LSTM(Model):
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
pred = self.lstm_model(feature)
loss = self.loss_fn(pred, label)
@@ -205,12 +198,8 @@ class LSTM(Model):
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
if self.use_gpu:
feature = feature.cuda()
label = label.cuda()
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device)
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device)
pred = self.lstm_model(feature)
loss = self.loss_fn(pred, label)
@@ -298,10 +287,7 @@ class LSTM(Model):
else:
end = begin + self.batch_size
x_batch = torch.from_numpy(x_values[begin:end]).float()
if self.use_gpu:
x_batch = x_batch.cuda()
x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device)
with torch.no_grad():
if self.use_gpu:

View File

@@ -0,0 +1,301 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import copy
from sklearn.metrics import roc_auc_score, mean_squared_error
import logging
from ...utils import (
unpack_archive_with_buffer,
save_multiple_parts_file,
create_save_path,
drop_nan_by_y_index,
)
from ...log import get_module_logger, TimeInspector
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from ...model.base import Model
from ...data.dataset import DatasetH, TSDatasetH
from ...data.dataset.handler import DataHandlerLP
class LSTM(Model):
"""LSTM Model
Parameters
----------
d_feat : int
input dimension for each time step
metric: str
the evaluate metric used in early stop
optimizer : str
optimizer name
GPU : str
the GPU ID(s) used for training
"""
def __init__(
self,
d_feat=6,
hidden_size=64,
num_layers=2,
dropout=0.0,
n_epochs=200,
lr=0.001,
metric="",
batch_size=2000,
early_stop=20,
loss="mse",
optimizer="adam",
n_jobs=10,
GPU=0,
seed=None,
**kwargs
):
# Set logger.
self.logger = get_module_logger("LSTM")
self.logger.info("LSTM pytorch version...")
# set hyper-parameters.
self.d_feat = d_feat
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = dropout
self.n_epochs = n_epochs
self.lr = lr
self.metric = metric
self.batch_size = batch_size
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu")
self.n_jobs = n_jobs
self.use_gpu = torch.cuda.is_available()
self.seed = seed
self.logger.info(
"LSTM parameters setting:"
"\nd_feat : {}"
"\nhidden_size : {}"
"\nnum_layers : {}"
"\ndropout : {}"
"\nn_epochs : {}"
"\nlr : {}"
"\nmetric : {}"
"\nbatch_size : {}"
"\nearly_stop : {}"
"\noptimizer : {}"
"\nloss_type : {}"
"\nvisible_GPU : {}"
"\nn_jobs : {}"
"\nuse_GPU : {}"
"\nseed : {}".format(
d_feat,
hidden_size,
num_layers,
dropout,
n_epochs,
lr,
metric,
batch_size,
early_stop,
optimizer.lower(),
loss,
GPU,
n_jobs,
self.use_gpu,
seed,
)
)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.LSTM_model = LSTMModel(
d_feat=self.d_feat,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
dropout=self.dropout,
).to(self.device)
if optimizer.lower() == "adam":
self.train_optimizer = optim.Adam(self.LSTM_model.parameters(), lr=self.lr)
elif optimizer.lower() == "gd":
self.train_optimizer = optim.SGD(self.LSTM_model.parameters(), lr=self.lr)
else:
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self._fitted = False
self.LSTM_model.to(self.device)
def mse(self, pred, label):
loss = (pred - label) ** 2
return torch.mean(loss)
def loss_fn(self, pred, label):
mask = ~torch.isnan(label)
if self.loss == "mse":
return self.mse(pred[mask], label[mask])
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
mask = torch.isfinite(label)
if self.metric == "" or self.metric == "loss":
return -self.loss_fn(pred[mask], label[mask])
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, data_loader):
self.LSTM_model.train()
for data in data_loader:
feature = data[:, :, 0:-1].to(self.device)
label = data[:, -1, -1].to(self.device)
pred = self.LSTM_model(feature.float())
loss = self.loss_fn(pred, label)
self.train_optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_value_(self.LSTM_model.parameters(), 3.0)
self.train_optimizer.step()
def test_epoch(self, data_loader):
self.LSTM_model.eval()
scores = []
losses = []
for data in data_loader:
feature = data[:, :, 0:-1].to(self.device)
# feature[torch.isnan(feature)] = 0
label = data[:, -1, -1].to(self.device)
pred = self.LSTM_model(feature.float())
loss = self.loss_fn(pred, label)
losses.append(loss.item())
score = self.metric_fn(pred, label)
scores.append(score.item())
return np.mean(losses), np.mean(scores)
def fit(
self,
dataset,
evals_result=dict(),
verbose=True,
save_path=None,
):
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
train_loader = DataLoader(dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs)
valid_loader = DataLoader(dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs)
if save_path == None:
save_path = create_save_path(save_path)
stop_steps = 0
train_loss = 0
best_score = -np.inf
best_epoch = 0
evals_result["train"] = []
evals_result["valid"] = []
# train
self.logger.info("training...")
self._fitted = True
for step in range(self.n_epochs):
self.logger.info("Epoch%d:", step)
self.logger.info("training...")
self.train_epoch(train_loader)
self.logger.info("evaluating...")
train_loss, train_score = self.test_epoch(train_loader)
val_loss, val_score = self.test_epoch(valid_loader)
self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
evals_result["train"].append(train_score)
evals_result["valid"].append(val_score)
if val_score > best_score:
best_score = val_score
stop_steps = 0
best_epoch = step
best_param = copy.deepcopy(self.LSTM_model.state_dict())
else:
stop_steps += 1
if stop_steps >= self.early_stop:
self.logger.info("early stop")
break
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
self.LSTM_model.load_state_dict(best_param)
torch.save(best_param, save_path)
if self.use_gpu:
torch.cuda.empty_cache()
def predict(self, dataset):
if not self._fitted:
raise ValueError("model is not fitted yet!")
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
dl_test.config(fillna_type="ffill+bfill")
test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs)
self.LSTM_model.eval()
preds = []
for data in test_loader:
feature = data[:, :, 0:-1].to(self.device)
with torch.no_grad():
if self.use_gpu:
pred = self.LSTM_model(feature.float()).detach().cpu().numpy()
else:
pred = self.LSTM_model(feature.float()).detach().numpy()
preds.append(pred)
return pd.Series(np.concatenate(preds), index=dl_test.get_index())
class LSTMModel(nn.Module):
def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0):
super().__init__()
self.rnn = nn.LSTM(
input_size=d_feat,
hidden_size=hidden_size,
num_layers=num_layers,
batch_first=True,
dropout=dropout,
)
self.fc_out = nn.Linear(hidden_size, 1)
self.d_feat = d_feat
def forward(self, x):
out, _ = self.rnn(x)
return self.fc_out(out[:, -1, :]).squeeze()

View File

@@ -50,7 +50,7 @@ class DNNModelPytorch(Model):
self,
input_dim,
output_dim,
layers=(256, 512, 768, 512, 256, 128, 64),
layers=(256,),
lr=0.001,
max_steps=300,
batch_size=2000,
@@ -60,8 +60,9 @@ class DNNModelPytorch(Model):
lr_decay_steps=100,
optimizer="gd",
loss="mse",
GPU="0",
seed=0,
GPU=0,
seed=None,
weight_decay=0.0,
**kwargs
):
# Set logger.
@@ -79,9 +80,10 @@ class DNNModelPytorch(Model):
self.lr_decay_steps = lr_decay_steps
self.optimizer = optimizer.lower()
self.loss_type = loss
self.visible_GPU = GPU
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu")
self.use_GPU = torch.cuda.is_available()
self.seed = seed
self.weight_decay = weight_decay
self.logger.info(
"DNN parameters setting:"
@@ -98,7 +100,8 @@ class DNNModelPytorch(Model):
"\neval_steps : {}"
"\nseed : {}"
"\nvisible_GPU : {}"
"\nuse_GPU : {}".format(
"\nuse_GPU : {}"
"\nweight_decay : {}".format(
layers,
lr,
max_steps,
@@ -113,11 +116,13 @@ class DNNModelPytorch(Model):
seed,
GPU,
self.use_GPU,
weight_decay,
)
)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
if loss not in {"mse", "binary"}:
raise NotImplementedError("loss {} is not supported!".format(loss))
@@ -125,9 +130,9 @@ class DNNModelPytorch(Model):
self.dnn_model = Net(input_dim, output_dim, layers, loss=self.loss_type)
if optimizer.lower() == "adam":
self.train_optimizer = optim.Adam(self.dnn_model.parameters(), lr=self.lr)
self.train_optimizer = optim.Adam(self.dnn_model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
elif optimizer.lower() == "gd":
self.train_optimizer = optim.SGD(self.dnn_model.parameters(), lr=self.lr)
self.train_optimizer = optim.SGD(self.dnn_model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
else:
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
@@ -146,11 +151,7 @@ class DNNModelPytorch(Model):
)
self._fitted = False
if self.use_GPU:
self.dnn_model.cuda()
# set the visible GPU
if self.visible_GPU:
os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
self.dnn_model.to(self.device)
def fit(
self,
@@ -187,13 +188,9 @@ class DNNModelPytorch(Model):
w_train_values = torch.from_numpy(w_train.values).float()
train_num = y_train_values.shape[0]
# prepare validation data
x_val_auto = torch.from_numpy(x_valid.values).float()
y_val_auto = torch.from_numpy(y_valid.values).float()
w_val_auto = torch.from_numpy(w_valid.values).float()
if self.use_GPU:
x_val_auto = x_val_auto.cuda()
y_val_auto = y_val_auto.cuda()
w_val_auto = w_val_auto.cuda()
x_val_auto = torch.from_numpy(x_valid.values).float().to(self.device)
y_val_auto = torch.from_numpy(y_valid.values).float().to(self.device)
w_val_auto = torch.from_numpy(w_valid.values).float().to(self.device)
for step in range(self.max_steps):
if stop_steps >= self.early_stop_rounds:
@@ -204,14 +201,9 @@ class DNNModelPytorch(Model):
self.dnn_model.train()
self.train_optimizer.zero_grad()
choice = np.random.choice(train_num, self.batch_size)
x_batch_auto = x_train_values[choice]
y_batch_auto = y_train_values[choice]
w_batch_auto = w_train_values[choice]
if self.use_GPU:
x_batch_auto = x_batch_auto.cuda()
y_batch_auto = y_batch_auto.cuda()
w_batch_auto = w_batch_auto.cuda()
x_batch_auto = x_train_values[choice].to(self.device)
y_batch_auto = y_train_values[choice].to(self.device)
w_batch_auto = w_train_values[choice].to(self.device)
# forward
preds = self.dnn_model(x_batch_auto)
@@ -267,7 +259,7 @@ class DNNModelPytorch(Model):
loss = torch.mul(sqr_loss, w).mean()
return loss
elif loss_type == "binary":
loss = nn.BCELoss()
loss = nn.BCELoss(weight=w)
return loss(pred, target)
else:
raise NotImplementedError("loss {} is not supported!".format(loss_type))
@@ -276,9 +268,7 @@ class DNNModelPytorch(Model):
if not self._fitted:
raise ValueError("model is not fitted yet!")
x_test_pd = dataset.prepare("test", col_set="feature")
x_test = torch.from_numpy(x_test_pd.values).float()
if self.use_GPU:
x_test = x_test.cuda()
x_test = torch.from_numpy(x_test_pd.values).float().to(self.device)
self.dnn_model.eval()
with torch.no_grad():
@@ -306,7 +296,7 @@ class DNNModelPytorch(Model):
self._fitted = True
class AverageMeter(object):
class AverageMeter:
"""Computes and stores the average and current value"""
def __init__(self):

View File

@@ -217,7 +217,7 @@ class SFM(Model):
loss="mse",
optimizer="gd",
GPU="0",
seed=0,
seed=None,
**kwargs
):
# Set logger.
@@ -239,7 +239,7 @@ class SFM(Model):
self.eval_steps = eval_steps
self.optimizer = optimizer.lower()
self.loss = loss
self.device = "cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu"
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu")
self.use_gpu = torch.cuda.is_available()
self.seed = seed
@@ -282,8 +282,9 @@ class SFM(Model):
)
)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.sfm_model = SFM_Model(
d_feat=self.d_feat,
@@ -463,7 +464,7 @@ class SFM(Model):
return pd.Series(np.concatenate(preds), index=index)
class AverageMeter(object):
class AverageMeter:
"""Computes and stores the average and current value"""
def __init__(self):

View File

@@ -0,0 +1,642 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import copy
from sklearn.metrics import roc_auc_score, mean_squared_error
import logging
from ...utils import (
unpack_archive_with_buffer,
save_multiple_parts_file,
create_save_path,
drop_nan_by_y_index,
)
from ...log import get_module_logger, TimeInspector
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Function
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
class TabnetModel(Model):
def __init__(
self,
d_feat=158,
out_dim=64,
final_out_dim=1,
batch_size=4096,
n_d=64,
n_a=64,
n_shared=2,
n_ind=2,
n_steps=5,
n_epochs=100,
pretrain_n_epochs=50,
relax=1.3,
vbs=2048,
seed=993,
optimizer="adam",
loss="mse",
metric="",
early_stop=20,
GPU="1",
pretrain_loss="custom",
ps=0.3,
lr=0.01,
pretrain=True,
pretrain_file="./pretrain/best.model",
):
"""
TabNet model for Qlib
Args
ps: probability to generate the bernoulli mask
"""
# set hyper-parameters.
self.d_feat = d_feat
self.out_dim = out_dim
self.final_out_dim = final_out_dim
self.lr = lr
self.batch_size = batch_size
self.optimizer = optimizer.lower()
self.pretrain_loss = pretrain_loss
self.seed = seed
self.ps = ps
self.n_epochs = n_epochs
self.logger = get_module_logger("TabNet")
self.pretrain_n_epochs = pretrain_n_epochs
self.device = "cuda:%s" % (GPU) if torch.cuda.is_available() else "cpu"
self.loss = loss
self.metric = metric
self.early_stop = early_stop
self.pretrain = pretrain
self.pretrain_file = pretrain_file
self.logger.info(
"TabNet:"
"\nbatch_size : {}"
"\nvirtual bs : {}"
"\nGPU : {}"
"\npretrain: {}".format(self.batch_size, vbs, GPU, pretrain)
)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.tabnet_model = TabNet(
inp_dim=self.d_feat, out_dim=self.out_dim, vbs=vbs, relax=relax, device=self.device
).to(self.device)
self.tabnet_decoder = TabNet_Decoder(self.out_dim, self.d_feat, n_shared, n_ind, vbs, n_steps, self.device).to(
self.device
)
if optimizer.lower() == "adam":
self.pretrain_optimizer = optim.Adam(
list(self.tabnet_model.parameters()) + list(self.tabnet_decoder.parameters()), lr=self.lr
)
self.train_optimizer = optim.Adam(self.tabnet_model.parameters(), lr=self.lr)
elif optimizer.lower() == "gd":
self.pretrain_optimizer = optim.SGD(
list(self.tabnet_model.parameters()) + list(self.tabnet_decoder.parameters()), lr=self.lr
)
self.train_optimizer = optim.SGD(self.tabnet_model.parameters(), lr=self.lr)
else:
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
def pretrain_fn(self, dataset=DatasetH, pretrain_file="./pretrain/best.model"):
# make a directory if pretrian director does not exist
if pretrain_file.startswith("./pretrain") and not os.path.exists("pretrain"):
self.logger.info("make folder to store model...")
os.makedirs("pretrain")
[df_train, df_valid] = dataset.prepare(
["pretrain", "pretrain_validation"],
col_set=["feature", "label"],
data_key=DataHandlerLP.DK_L,
)
df_train.fillna(df_train.mean(), inplace=True)
df_valid.fillna(df_valid.mean(), inplace=True)
x_train = df_train["feature"]
x_valid = df_valid["feature"]
# Early stop setup
stop_steps = 0
train_loss = 0
best_loss = np.inf
for epoch_idx in range(self.pretrain_n_epochs):
self.logger.info("epoch: %s" % (epoch_idx))
self.logger.info("pre-training...")
self.pretrain_epoch(x_train)
self.logger.info("evaluating...")
train_loss = self.pretrain_test_epoch(x_train)
valid_loss = self.pretrain_test_epoch(x_valid)
self.logger.info("train %.6f, valid %.6f" % (train_loss, valid_loss))
if valid_loss < best_loss:
self.logger.info("Save Model...")
torch.save(self.tabnet_model.state_dict(), pretrain_file)
best_loss = valid_loss
else:
stop_steps += 1
if stop_steps >= self.early_stop:
self.logger.info("early stop")
break
def fit(
self,
dataset: DatasetH,
evals_result=dict(),
verbose=True,
save_path=None,
):
if self.pretrain:
# there is a pretrained model, load the model
self.logger.info("Pretrain...")
self.pretrain_fn(dataset, self.pretrain_file)
self.logger.info("Load Pretrain model")
self.tabnet_model.load_state_dict(torch.load(self.pretrain_file))
# adding one more linear layer to fit the final output dimension
self.tabnet_model = FinetuneModel(self.out_dim, self.final_out_dim, self.tabnet_model).to(self.device)
df_train, df_valid = dataset.prepare(
["train", "valid"],
col_set=["feature", "label"],
data_key=DataHandlerLP.DK_L,
)
df_train.fillna(df_train.mean(), inplace=True)
x_train, y_train = df_train["feature"], df_train["label"]
x_valid, y_valid = df_valid["feature"], df_valid["label"]
stop_steps = 0
train_loss = 0
best_score = np.inf
best_epoch = 0
evals_result["train"] = []
evals_result["valid"] = []
self.logger.info("training...")
self._fitted = True
for epoch_idx in range(self.n_epochs):
self.logger.info("epoch: %s" % (epoch_idx))
self.logger.info("training...")
self.train_epoch(x_train, y_train)
self.logger.info("evaluating...")
train_loss, train_score = self.test_epoch(x_train, y_train)
valid_loss, val_score = self.test_epoch(x_valid, y_valid)
self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
evals_result["train"].append(train_score)
evals_result["valid"].append(val_score)
if val_score < best_score:
best_score = val_score
stop_steps = 0
best_epoch = epoch_idx
else:
stop_steps += 1
if stop_steps >= self.early_stop:
self.logger.info("early stop")
break
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
def predict(self, dataset):
if not self._fitted:
raise ValueError("model is not fitted yet!")
x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)
index = x_test.index
self.tabnet_model.eval()
x_values = torch.from_numpy(x_test.values)
x_values[torch.isnan(x_values)] = 0
sample_num = x_values.shape[0]
preds = []
for begin in range(sample_num)[:: self.batch_size]:
if sample_num - begin < self.batch_size:
end = sample_num
else:
end = begin + self.batch_size
x_batch = x_values[begin:end].float().to(self.device)
priors = torch.ones(end - begin, self.d_feat).to(self.device)
with torch.no_grad():
pred = self.tabnet_model(x_batch, priors).detach().cpu().numpy()
preds.append(pred)
return pd.Series(np.concatenate(preds), index=index)
def test_epoch(self, data_x, data_y):
# prepare training data
x_values = torch.from_numpy(data_x.values)
y_values = torch.from_numpy(np.squeeze(data_y.values))
x_values[torch.isnan(x_values)] = 0
y_values[torch.isnan(y_values)] = 0
self.tabnet_model.eval()
scores = []
losses = []
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = x_values[indices[i : i + self.batch_size]].float().to(self.device)
label = y_values[indices[i : i + self.batch_size]].float().to(self.device)
priors = torch.ones(self.batch_size, self.d_feat).to(self.device)
pred = self.tabnet_model(feature, priors)
loss = self.loss_fn(pred, label)
losses.append(loss.item())
score = self.metric_fn(pred, label)
scores.append(score.item())
return np.mean(losses), np.mean(scores)
def train_epoch(self, x_train, y_train):
x_train_values = torch.from_numpy(x_train.values)
y_train_values = torch.from_numpy(np.squeeze(y_train.values))
x_train_values[torch.isnan(x_train_values)] = 0
y_train_values[torch.isnan(y_train_values)] = 0
self.tabnet_model.train()
indices = np.arange(len(x_train_values))
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = x_train_values[indices[i : i + self.batch_size]].float().to(self.device)
label = y_train_values[indices[i : i + self.batch_size]].float().to(self.device)
priors = torch.ones(self.batch_size, self.d_feat).to(self.device)
pred = self.tabnet_model(feature, priors)
loss = self.loss_fn(pred, label)
self.train_optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_value_(self.tabnet_model.parameters(), 3.0)
self.train_optimizer.step()
def pretrain_epoch(self, x_train):
train_set = torch.from_numpy(x_train.values)
train_set[torch.isnan(train_set)] = 0
indices = np.arange(len(train_set))
np.random.shuffle(indices)
self.tabnet_model.train()
self.tabnet_decoder.train()
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
S_mask = torch.bernoulli(torch.empty(self.batch_size, self.d_feat).fill_(self.ps))
x_train_values = train_set[indices[i : i + self.batch_size]] * (1 - S_mask)
y_train_values = train_set[indices[i : i + self.batch_size]] * (S_mask)
S_mask = S_mask.to(self.device)
feature = x_train_values.float().to(self.device)
label = y_train_values.float().to(self.device)
priors = 1 - S_mask
(vec, sparse_loss) = self.tabnet_model(feature, priors)
f = self.tabnet_decoder(vec)
loss = self.pretrain_loss_fn(label, f, S_mask)
self.pretrain_optimizer.zero_grad()
loss.backward()
self.pretrain_optimizer.step()
def pretrain_test_epoch(self, x_train):
train_set = torch.from_numpy(x_train.values)
train_set[torch.isnan(train_set)] = 0
indices = np.arange(len(train_set))
self.tabnet_model.eval()
self.tabnet_decoder.eval()
losses = []
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
S_mask = torch.bernoulli(torch.empty(self.batch_size, self.d_feat).fill_(self.ps))
x_train_values = train_set[indices[i : i + self.batch_size]] * (1 - S_mask)
y_train_values = train_set[indices[i : i + self.batch_size]] * (S_mask)
feature = x_train_values.float().to(self.device)
label = y_train_values.float().to(self.device)
S_mask = S_mask.to(self.device)
priors = 1 - S_mask
(vec, sparse_loss) = self.tabnet_model(feature, priors)
f = self.tabnet_decoder(vec)
loss = self.pretrain_loss_fn(label, f, S_mask)
losses.append(loss.item())
return np.mean(losses)
def pretrain_loss_fn(self, f_hat, f, S):
"""
Pretrain loss function defined in the original paper, read "Tabular self-supervised learning" in https://arxiv.org/pdf/1908.07442.pdf
"""
down_mean = torch.mean(f, dim=0)
down = torch.sqrt(torch.sum(torch.square(f - down_mean), dim=0))
up = (f_hat - f) * S
return torch.sum(torch.square(up / down))
def loss_fn(self, pred, label):
mask = ~torch.isnan(label)
if self.loss == "mse":
return self.mse(pred[mask], label[mask])
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
mask = torch.isfinite(label)
if self.metric == "" or self.metric == "loss":
return -self.loss_fn(pred[mask], label[mask])
raise ValueError("unknown metric `%s`" % self.metric)
def mse(self, pred, label):
loss = (pred - label) ** 2
return torch.mean(loss)
class FinetuneModel(nn.Module):
"""
FinuetuneModel for adding a layer by the end
"""
def __init__(self, input_dim, output_dim, trained_model):
super().__init__()
self.model = trained_model
self.fc = nn.Linear(input_dim, output_dim)
def forward(self, x, priors):
return self.fc(self.model(x, priors)[0]).squeeze() # take the vec out
class DecoderStep(nn.Module):
def __init__(self, inp_dim, out_dim, shared, n_ind, vbs, device):
super().__init__()
self.fea_tran = FeatureTransformer(inp_dim, out_dim, shared, n_ind, vbs, device)
self.fc = nn.Linear(out_dim, out_dim)
def forward(self, x):
x = self.fea_tran(x)
return self.fc(x)
class TabNet_Decoder(nn.Module):
def __init__(self, inp_dim, out_dim, n_shared, n_ind, vbs, n_steps, device):
"""
TabNet decoder that is used in pre-training
"""
self.out_dim = out_dim
super().__init__()
if n_shared > 0:
self.shared = nn.ModuleList()
self.shared.append(nn.Linear(inp_dim, 2 * out_dim))
for x in range(n_shared - 1):
self.shared.append(nn.Linear(out_dim, 2 * out_dim)) # preset the linear function we will use
else:
self.shared = None
self.n_steps = n_steps
self.steps = nn.ModuleList()
for x in range(n_steps):
self.steps.append(DecoderStep(inp_dim, out_dim, self.shared, n_ind, vbs, device))
def forward(self, x):
out = torch.zeros(x.size(0), self.out_dim).to(x.device)
for step in self.steps:
out += step(x)
return out
class TabNet(nn.Module):
def __init__(
self, inp_dim=6, out_dim=6, n_d=64, n_a=64, n_shared=2, n_ind=2, n_steps=5, relax=1.2, vbs=1024, device="cpu"
):
"""
TabNet AKA the original encoder
Args:
n_d: dimension of the features used to calculate the final results
n_a: dimension of the features input to the attention transformer of the next step
n_shared: numbr of shared steps in feature transfomer(optional)
n_ind: number of independent steps in feature transformer
n_steps: number of steps of pass through tabbet
relax coefficient:
virtual batch size:
"""
super().__init__()
# set the number of shared step in feature transformer
if n_shared > 0:
self.shared = nn.ModuleList()
self.shared.append(nn.Linear(inp_dim, 2 * (n_d + n_a)))
for x in range(n_shared - 1):
self.shared.append(nn.Linear(n_d + n_a, 2 * (n_d + n_a))) # preset the linear function we will use
else:
self.shared = None
self.first_step = FeatureTransformer(inp_dim, n_d + n_a, self.shared, n_ind, vbs, device)
self.steps = nn.ModuleList()
for x in range(n_steps - 1):
self.steps.append(DecisionStep(inp_dim, n_d, n_a, self.shared, n_ind, relax, vbs, device))
self.fc = nn.Linear(n_d, out_dim)
self.bn = nn.BatchNorm1d(inp_dim, momentum=0.01)
self.n_d = n_d
def forward(self, x, priors):
assert not torch.isnan(x).any()
x = self.bn(x)
x_a = self.first_step(x)[:, self.n_d :]
sparse_loss = torch.zeros(1).to(x.device)
out = torch.zeros(x.size(0), self.n_d).to(x.device)
for step in self.steps:
x_te, l = step(x, x_a, priors)
out += F.relu(x_te[:, : self.n_d]) # split the feautre from feat_transformer
x_a = x_te[:, self.n_d :]
sparse_loss += l
return self.fc(out), sparse_loss
class GBN(nn.Module):
"""
Ghost Batch Normalization
an efficient way of doing batch normalization
Args:
vbs: virtual batch size
"""
def __init__(self, inp, vbs=1024, momentum=0.01):
super().__init__()
self.bn = nn.BatchNorm1d(inp, momentum=momentum)
self.vbs = vbs
def forward(self, x):
chunk = torch.chunk(x, x.size(0) // self.vbs, 0)
res = [self.bn(y) for y in chunk]
return torch.cat(res, 0)
class GLU(nn.Module):
"""
GLU block that extracts only the most essential information
Args:
vbs: virtual batch size
"""
def __init__(self, inp_dim, out_dim, fc=None, vbs=1024):
super().__init__()
if fc:
self.fc = fc
else:
self.fc = nn.Linear(inp_dim, out_dim * 2)
self.bn = GBN(out_dim * 2, vbs=vbs)
self.od = out_dim
def forward(self, x):
x = self.bn(self.fc(x))
return torch.mul(x[:, : self.od], torch.sigmoid(x[:, self.od :]))
class AttentionTransformer(nn.Module):
"""
Args:
relax: relax coefficient. The greater it is, we can
use the same features more. When it is set to 1
we can use every feature only once
"""
def __init__(self, d_a, inp_dim, relax, vbs=1024):
super().__init__()
self.fc = nn.Linear(d_a, inp_dim)
self.bn = GBN(inp_dim, vbs=vbs)
self.r = relax
# a:feature from previous decision step
def forward(self, a, priors):
a = self.bn(self.fc(a))
mask = SparsemaxFunction.apply(a * priors)
priors = priors * (self.r - mask) # updating the prior
return mask
class FeatureTransformer(nn.Module):
def __init__(self, inp_dim, out_dim, shared, n_ind, vbs, device):
super().__init__()
first = True
self.shared = nn.ModuleList()
if shared:
self.shared.append(GLU(inp_dim, out_dim, shared[0], vbs=vbs))
first = False
for fc in shared[1:]:
self.shared.append(GLU(out_dim, out_dim, fc, vbs=vbs))
else:
self.shared = None
self.independ = nn.ModuleList()
if first:
self.independ.append(GLU(inp, out_dim, vbs=vbs))
for x in range(first, n_ind):
self.independ.append(GLU(out_dim, out_dim, vbs=vbs))
self.scale = torch.sqrt(torch.tensor([0.5], device=device))
def forward(self, x):
if self.shared:
x = self.shared[0](x)
for glu in self.shared[1:]:
x = torch.add(x, glu(x))
x = x * self.scale
for glu in self.independ:
x = torch.add(x, glu(x))
x = x * self.scale
return x
class DecisionStep(nn.Module):
"""
One step for the TabNet
"""
def __init__(self, inp_dim, n_d, n_a, shared, n_ind, relax, vbs, device):
super().__init__()
self.atten_tran = AttentionTransformer(n_a, inp_dim, relax, vbs)
self.fea_tran = FeatureTransformer(inp_dim, n_d + n_a, shared, n_ind, vbs, device)
def forward(self, x, a, priors):
mask = self.atten_tran(a, priors)
sparse_loss = ((-1) * mask * torch.log(mask + 1e-10)).mean()
x = self.fea_tran(x * mask)
return x, sparse_loss
def make_ix_like(input, dim=0):
d = input.size(dim)
rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype)
view = [1] * input.dim()
view[0] = -1
return rho.view(view).transpose(0, dim)
class SparsemaxFunction(Function):
"""
SparseMax function for replacing reLU
"""
@staticmethod
def forward(ctx, input, dim=-1):
ctx.dim = dim
max_val, _ = input.max(dim=dim, keepdim=True)
input -= max_val # same numerical stability trick as for softmax
tau, supp_size = SparsemaxFunction.threshold_and_support(input, dim=dim)
output = torch.clamp(input - tau, min=0)
ctx.save_for_backward(supp_size, output)
return output
@staticmethod
def backward(ctx, grad_output):
supp_size, output = ctx.saved_tensors
dim = ctx.dim
grad_input = grad_output.clone()
grad_input[output == 0] = 0
v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze()
v_hat = v_hat.unsqueeze(dim)
grad_input = torch.where(output != 0, grad_input - v_hat, grad_input)
return grad_input, None
@staticmethod
def threshold_and_support(input, dim=-1):
input_srt, _ = torch.sort(input, descending=True, dim=dim)
input_cumsum = input_srt.cumsum(dim) - 1
rhos = make_ix_like(input, dim)
support = rhos * input_srt > input_cumsum
support_size = support.sum(dim=dim).unsqueeze(dim)
tau = input_cumsum.gather(dim, support_size - 1)
tau /= support_size.to(input.dtype)
return tau, support_size

View File

@@ -21,7 +21,7 @@ from .executor import SimulatorExecutor
from .executor import save_score_series, load_score_series
class Operator(object):
class Operator:
def __init__(self, client: str):
"""
Parameters

View File

@@ -38,7 +38,7 @@ def _calculate_report_data(df: pd.DataFrame) -> pd.DataFrame:
:param df:
:return:
"""
index_names = df.index.names
df.index = df.index.strftime("%Y-%m-%d")
report_df = pd.DataFrame()
@@ -58,6 +58,8 @@ def _calculate_report_data(df: pd.DataFrame) -> pd.DataFrame:
report_df["turnover"] = df["turnover"]
report_df.sort_index(ascending=True, inplace=True)
report_df.index.names = index_names
return report_df

View File

@@ -17,7 +17,7 @@ from plotly.figure_factory import create_distplot
from ...utils import get_module_by_module_path
class BaseGraph(object):
class BaseGraph:
""""""
_name = None
@@ -204,7 +204,7 @@ class HistogramGraph(BaseGraph):
return _data
class SubplotsGraph(object):
class SubplotsGraph:
"""Create subplots same as df.plot(subplots=True)
Simple package for `plotly.tools.subplots`

View File

@@ -30,7 +30,7 @@ class BaseStrategy:
Parameters
-----------
score_series : pd.Seires
score_series : pd.Series
stock_id , score.
current : Position()
current state of position.

View File

@@ -6,7 +6,7 @@ import copy
import os
class TunerConfigManager(object):
class TunerConfigManager:
def __init__(self, config_path):
if not config_path:
@@ -27,7 +27,7 @@ class TunerConfigManager(object):
self.qlib_client_config = config.get("qlib_client", dict())
class PipelineExperimentConfig(object):
class PipelineExperimentConfig:
def __init__(self, config, TUNER_CONFIG_MANAGER):
"""
:param config: The config dict for tuner experiment
@@ -53,7 +53,7 @@ class PipelineExperimentConfig(object):
yaml.dump(TUNER_CONFIG_MANAGER.config, fp)
class OptimizationConfig(object):
class OptimizationConfig:
def __init__(self, config, TUNER_CONFIG_MANAGER):
self.report_type = config.get("report_type", "pred_long")

View File

@@ -11,7 +11,7 @@ from ...log import get_module_logger, TimeInspector
from ...utils import get_module_by_module_path
class Pipeline(object):
class Pipeline:
GLOBAL_BEST_PARAMS_NAME = "global_best_params.json"

View File

@@ -19,7 +19,7 @@ from hyperopt import fmin, tpe
from hyperopt import STATUS_OK, STATUS_FAIL
class Tuner(object):
class Tuner:
def __init__(self, tuner_config, optim_config):
self.logger = get_module_logger("Tuner", sh_level=logging.INFO)

View File

@@ -8,7 +8,7 @@ from libc.math cimport sqrt, isnan, NAN
from libcpp.vector cimport vector
cdef class Expanding(object):
cdef class Expanding:
"""1-D array expanding"""
cdef vector[double] barv
cdef int na_count

View File

@@ -8,7 +8,7 @@ from libc.math cimport sqrt, isnan, NAN
from libcpp.deque cimport deque
cdef class Rolling(object):
cdef class Rolling:
"""1-D array rolling"""
cdef int window
cdef deque[double] barv

View File

@@ -157,7 +157,7 @@ class Expression(abc.ABC):
@abc.abstractmethod
def _load_internal(self, instrument, start_index, end_index, freq):
pass
raise NotImplementedError("This function must be implemented in your newly defined feature")
@abc.abstractmethod
def get_longest_back_rolling(self):

View File

@@ -13,6 +13,7 @@ import pickle
import traceback
import redis_lock
import contextlib
import abc
from pathlib import Path
import numpy as np
import pandas as pd
@@ -32,43 +33,107 @@ from ..utils import (
from ..log import get_module_logger
from .base import Feature
from .ops import *
from .ops import Operators
class QlibCacheException(RuntimeError):
pass
class MemCacheUnit(OrderedDict):
class MemCacheUnit(abc.ABC):
"""Memory Cache Unit."""
# TODO: use min_heap to replace ordereddict for better performance
def __init__(self, *args, **kwargs):
self.size_limit = kwargs.pop("size_limit", None)
# limit_type: check size_limit type, length(call fun: len) or size(call fun: sys.getsizeof)
self.limit_type = kwargs.pop("limit_type", "length")
super(MemCacheUnit, self).__init__(*args, **kwargs)
self._check_size_limit()
self.size_limit = kwargs.pop("size_limit", 0)
self._size = 0
self.od = OrderedDict()
def __setitem__(self, key, value):
super(MemCacheUnit, self).__setitem__(key, value)
self._check_size_limit()
# TODO: thread safe?__setitem__ failure might cause inconsistent size?
def __getitem__(self, key):
value = super(MemCacheUnit, self).__getitem__(key)
super(MemCacheUnit, self).__delitem__(key)
super(MemCacheUnit, self).__setitem__(key, value)
return value
# precalculate the size after od.__setitem__
self._adjust_size(key, value)
def _check_size_limit(self):
if self.size_limit is not None:
get_cur_size = lambda x: len(x) if self.limit_type == "length" else sum(map(sys.getsizeof, x.values()))
while get_cur_size(self) > self.size_limit:
self.od.__setitem__(key, value)
# move the key to end,make it latest
self.od.move_to_end(key)
if self.limited:
# pop the oldest items beyond size limit
while self._size > self.size_limit:
self.popitem(last=False)
def __getitem__(self, key):
v = self.od.__getitem__(key)
self.od.move_to_end(key)
return v
class MemCache(object):
def __contains__(self, key):
return key in self.od
def __len__(self):
return self.od.__len__()
def __repr__(self):
return f"{self.__class__.__name__}<size_limit:{self.size_limit if self.limited else 'no limit'} total_size:{self._size}>\n{self.od.__repr__()}"
def set_limit_size(self, limit):
self.size_limit = limit
@property
def limited(self):
"""whether memory cache is limited"""
return self.size_limit > 0
@property
def total_size(self):
return self._size
def clear(self):
self._size = 0
self.od.clear()
def popitem(self, last=True):
k, v = self.od.popitem(last=last)
self._size -= self._get_value_size(v)
return k, v
def pop(self, key):
v = self.od.pop(key)
self._size -= self._get_value_size(v)
return v
def _adjust_size(self, key, value):
if key in self.od:
self._size -= self._get_value_size(self.od[key])
self._size += self._get_value_size(value)
@abc.abstractmethod
def _get_value_size(self, value):
raise NotImplementedError
class MemCacheLengthUnit(MemCacheUnit):
def __init__(self, size_limit=0):
super().__init__(size_limit=size_limit)
def _get_value_size(self, value):
return 1
class MemCacheSizeofUnit(MemCacheUnit):
def __init__(self, size_limit=0):
super().__init__(size_limit=size_limit)
def _get_value_size(self, value):
return sys.getsizeof(value)
class MemCache:
"""Memory cache."""
def __init__(self, mem_cache_size_limit=None, limit_type="length"):
@@ -79,21 +144,19 @@ class MemCache(object):
mem_cache_size_limit: cache max size.
limit_type: length or sizeof; length(call fun: len), size(call fun: sys.getsizeof).
"""
if limit_type not in ["length", "sizeof"]:
size_limit = C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit
if limit_type == "length":
klass = MemCacheLengthUnit
elif limit_type == "sizeof":
klass = MemCacheSizeofUnit
else:
raise ValueError(f"limit_type must be length or sizeof, your limit_type is {limit_type}")
self.__calendar_mem_cache = MemCacheUnit(
size_limit=C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit,
limit_type=limit_type,
)
self.__instrument_mem_cache = MemCacheUnit(
size_limit=C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit,
limit_type=limit_type,
)
self.__feature_mem_cache = MemCacheUnit(
size_limit=C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit,
limit_type=limit_type,
)
self.__calendar_mem_cache = klass(size_limit)
self.__instrument_mem_cache = klass(size_limit)
self.__feature_mem_cache = klass(size_limit)
def __getitem__(self, key):
if key == "c":
@@ -140,7 +203,7 @@ class MemCacheExpire:
return value, expire
class CacheUtils(object):
class CacheUtils:
LOCK_ID = "QLIB"
@staticmethod
@@ -224,7 +287,7 @@ class CacheUtils(object):
current_cache_wlock.release()
class BaseProviderCache(object):
class BaseProviderCache:
"""Provider cache base class"""
def __init__(self, provider):
@@ -762,8 +825,8 @@ class DiskDatasetCache(DatasetCache):
.. note:: The start is closed. The end is open!!!!!
- Each line contains two element <timestamp, end_index>
- It indicates the `end_index` of the data for `timestamp`
- Each line contains two element <start_index, end_index> with a timestamp as its index.
- It indicates the `start_index`(included) and `end_index`(excluded) of the data for `timestamp`
- meta data: cache/d41366901e25de3ec47297f12e2ba11d.meta

View File

@@ -12,7 +12,7 @@ from ..log import get_module_logger
import pickle
class Client(object):
class Client:
"""A client class
Provide the connection tool functions for ClientProvider.

View File

@@ -15,14 +15,13 @@ import importlib
import traceback
import numpy as np
import pandas as pd
from pathlib import Path
from multiprocessing import Pool
from .cache import H
from ..config import C
from .ops import *
from .ops import Operators
from ..log import get_module_logger
from ..utils import parse_field, read_bin, hash_args, normalize_cache_fields
from ..utils import parse_field, read_bin, hash_args, normalize_cache_fields, code_to_fname
from .base import Feature
from .cache import DiskDatasetCache, DiskExpressionCache
from ..utils import Wrapper, init_instance_by_config, register_wrapper, get_module_by_module_path
@@ -118,7 +117,7 @@ class CalendarProvider(abc.ABC):
if flag in H["c"]:
_calendar, _calendar_index = H["c"][flag]
else:
_calendar = np.array(self._load_calendar(freq, future))
_calendar = np.array(self.load_calendar(freq, future))
_calendar_index = {x: i for i, x in enumerate(_calendar)} # for fast search
H["c"][flag] = _calendar, _calendar_index
return _calendar, _calendar_index
@@ -215,20 +214,6 @@ class InstrumentProvider(abc.ABC):
return cls.LIST
raise ValueError(f"Unknown instrument type {inst}")
def convert_instruments(self, instrument):
_instruments_map = getattr(self, "_instruments_map", None)
if _instruments_map is None:
_df_list = []
# FIXME: each process will read these files
for _path in Path(C.get_data_path()).joinpath("instruments").glob("*.txt"):
_df = pd.read_csv(_path, sep="\t", names=["inst", "start_datetime", "end_datetime", "save_inst"])
_df_list.append(_df.iloc[:, [0, -1]])
df = pd.concat(_df_list, sort=False).sort_values("save_inst")
df = df.drop_duplicates(subset=["save_inst"], keep="first").fillna(axis=1, method="ffill")
_instruments_map = df.set_index("inst").iloc[:, 0].to_dict()
setattr(self, "_instruments_map", _instruments_map)
return _instruments_map.get(instrument, instrument)
class FeatureProvider(abc.ABC):
"""Feature provider class
@@ -481,11 +466,10 @@ class DatasetProvider(abc.ABC):
"""
# FIXME: Windows OS or MacOS using spawn: https://docs.python.org/3.8/library/multiprocessing.html?highlight=spawn#contexts-and-start-methods
global C
C = g_config
# NOTE: This place is compatible with windows, windows multi-process is spawn
if getattr(ExpressionD, "_provider", None) is None:
register_all_wrappers()
if not C.registered:
C.set_conf_from_C(g_config)
C.register()
obj = dict()
for field in column_names:
@@ -520,7 +504,7 @@ class LocalCalendarProvider(CalendarProvider):
"""Calendar file uri."""
return os.path.join(C.get_data_path(), "calendars", "{}.txt")
def _load_calendar(self, freq, future):
def load_calendar(self, freq, future):
"""Load original calendar timestamp from file.
Parameters
@@ -587,10 +571,16 @@ class LocalInstrumentProvider(InstrumentProvider):
fname = self._uri_inst.format(market)
if not os.path.exists(fname):
raise ValueError("instruments not exists for market " + market)
_instruments = dict()
df = pd.read_csv(fname, sep="\t", names=["inst", "start_datetime", "end_datetime", "save_inst"])
df["start_datetime"] = pd.to_datetime(df["start_datetime"])
df["end_datetime"] = pd.to_datetime(df["end_datetime"])
df = pd.read_csv(
fname,
sep="\t",
usecols=[0, 1, 2],
names=["inst", "start_datetime", "end_datetime"],
dtype={"inst": str},
parse_dates=["start_datetime", "end_datetime"],
)
for row in df.itertuples(index=False):
_instruments.setdefault(row[0], []).append((row[1], row[2]))
return _instruments
@@ -647,7 +637,7 @@ class LocalFeatureProvider(FeatureProvider):
def feature(self, instrument, field, start_index, end_index, freq):
# validate
field = str(field).lower()[1:]
instrument = Inst.convert_instruments(instrument)
instrument = code_to_fname(instrument)
uri_data = self._uri_data.format(instrument.lower(), field, freq)
if not os.path.exists(uri_data):
get_module_logger("data").warning("WARN: data not found for %s.%s" % (instrument, field))
@@ -682,6 +672,8 @@ class LocalExpressionProvider(ExpressionProvider):
series = series.astype(np.float32)
except ValueError:
pass
except TypeError:
pass
if not series.empty:
series = series.loc[start_index:end_index]
return series
@@ -969,8 +961,7 @@ class BaseProvider:
is a provider class.
"""
disk_cache = C.default_disk_cache if disk_cache is None else disk_cache
if C.disable_disk_cache:
disk_cache = False
fields = list(fields) # In case of tuple.
try:
return DatasetD.dataset(instruments, fields, start_time, end_time, freq, disk_cache)
except TypeError:
@@ -1035,15 +1026,34 @@ class ClientProvider(BaseProvider):
DatasetD.set_conn(self.client)
Cal = Wrapper()
Inst = Wrapper()
FeatureD = Wrapper()
ExpressionD = Wrapper()
DatasetD = Wrapper()
D = Wrapper()
import sys
if sys.version_info >= (3, 9):
from typing import Annotated
CalendarProviderWrapper = Annotated[CalendarProvider, Wrapper]
InstrumentProviderWrapper = Annotated[InstrumentProvider, Wrapper]
FeatureProviderWrapper = Annotated[FeatureProvider, Wrapper]
ExpressionProviderWrapper = Annotated[ExpressionProvider, Wrapper]
DatasetProviderWrapper = Annotated[DatasetProvider, Wrapper]
BaseProviderWrapper = Annotated[BaseProvider, Wrapper]
else:
CalendarProviderWrapper = CalendarProvider
InstrumentProviderWrapper = InstrumentProvider
FeatureProviderWrapper = FeatureProvider
ExpressionProviderWrapper = ExpressionProvider
DatasetProviderWrapper = DatasetProvider
BaseProviderWrapper = BaseProvider
Cal: CalendarProviderWrapper = Wrapper()
Inst: InstrumentProviderWrapper = Wrapper()
FeatureD: FeatureProviderWrapper = Wrapper()
ExpressionD: ExpressionProviderWrapper = Wrapper()
DatasetD: DatasetProviderWrapper = Wrapper()
D: BaseProviderWrapper = Wrapper()
def register_all_wrappers():
def register_all_wrappers(C):
"""register_all_wrappers"""
logger = get_module_logger("data")
module = get_module_by_module_path("qlib.data")
@@ -1052,7 +1062,7 @@ def register_all_wrappers():
if getattr(C, "calendar_cache", None) is not None:
_calendar_provider = init_instance_by_config(C.calendar_cache, module, provide=_calendar_provider)
register_wrapper(Cal, _calendar_provider, "qlib.data")
logger.debug(f"registering Cal {C.calendar_provider}-{C.calenar_cache}")
logger.debug(f"registering Cal {C.calendar_provider}-{C.calendar_cache}")
register_wrapper(Inst, C.instrument_provider, "qlib.data")
logger.debug(f"registering Inst {C.instrument_provider}")

View File

@@ -1,10 +1,14 @@
from ...utils.serial import Serializable
from typing import Union, List, Tuple
from ...utils import init_instance_by_config
from ...utils import init_instance_by_config, np_ffill
from ...log import get_module_logger
from .handler import DataHandler, DataHandlerLP
from inspect import getfullargspec
import pandas as pd
import numpy as np
import bisect
from ...utils import lazy_sort_index
from .utils import get_level_index
class Dataset(Serializable):
@@ -72,18 +76,22 @@ class DatasetH(Dataset):
- The processing is related to data split.
"""
def __init__(self, handler: Union[dict, DataHandler], segments: list):
def __init__(self, handler: Union[dict, DataHandler], segments: dict):
"""
Parameters
----------
handler : Union[dict, DataHandler]
handler will be passed into setup_data.
segments : list
segments : dict
handler will be passed into setup_data.
"""
super().__init__(handler, segments)
def setup_data(self, handler: Union[dict, DataHandler], segments: list):
def init(self, **kwargs):
"""Initialize the DatasetH, Only parameters belonging to handler.init will be passed in"""
self.handler.init(**kwargs)
def setup_data(self, handler: Union[dict, DataHandler], segments: dict):
"""
Setup the underlying data.
@@ -96,7 +104,7 @@ class DatasetH(Dataset):
- config of `DataHandler`. Please refer to `DataHandler`
segments : list
segments : dict
Describe the options to segment the data.
Here are some examples:
@@ -112,8 +120,18 @@ class DatasetH(Dataset):
'outsample': ("2017-01-01", "2020-08-01",),
}
"""
self._handler = init_instance_by_config(handler, accept_types=DataHandler)
self._segments = segments.copy()
self.handler = init_instance_by_config(handler, accept_types=DataHandler)
self.segments = segments.copy()
def _prepare_seg(self, slc: slice, **kwargs):
"""
Give a slice, retrieve the according data
Parameters
----------
slc : slice
"""
return self.handler.fetch(slc, **kwargs)
def prepare(
self,
@@ -136,7 +154,7 @@ class DatasetH(Dataset):
- ['train', 'valid']
col_set : str
The col_set will be passed to self._handler when fetching data.
The col_set will be passed to self.handler when fetching data.
data_key : str
The data to fetch: DK_*
Default is DK_I, which indicate fetching data for **inference**.
@@ -152,14 +170,263 @@ class DatasetH(Dataset):
logger = get_module_logger("DatasetH")
fetch_kwargs = {"col_set": col_set}
fetch_kwargs.update(kwargs)
if "data_key" in getfullargspec(self._handler.fetch).args:
if "data_key" in getfullargspec(self.handler.fetch).args:
fetch_kwargs["data_key"] = data_key
else:
logger.info(f"data_key[{data_key}] is ignored.")
# Handle all kinds of segments format
if isinstance(segments, (list, tuple)):
return [self._handler.fetch(slice(*self._segments[seg]), **fetch_kwargs) for seg in segments]
return [self._prepare_seg(slice(*self.segments[seg]), **fetch_kwargs) for seg in segments]
elif isinstance(segments, str):
return self._handler.fetch(slice(*self._segments[segments]), **fetch_kwargs)
return self._prepare_seg(slice(*self.segments[segments]), **fetch_kwargs)
elif isinstance(segments, slice):
return self._prepare_seg(segments, **fetch_kwargs)
else:
raise NotImplementedError(f"This type of input is not supported")
class TSDataSampler:
"""
(T)ime-(S)eries DataSampler
This is the result of TSDatasetH
It works like `torch.data.utils.Dataset`, it provides a very convient interface for constructing time-series
dataset based on tabular data.
If user have further requirements for processing data, user could process them based on `TSDataSampler` or create
more powerful subclasses.
Known Issues:
- For performance issues, this Sampler will convert dataframe into arrays for better performance. This could result
in a different data type
"""
def __init__(self, data: pd.DataFrame, start, end, step_len: int, fillna_type: str = "none"):
"""
Build a dataset which looks like torch.data.utils.Dataset.
Parameters
----------
data : pd.DataFrame
The raw tabular data
start :
The indexable start time
end :
The indexable end time
step_len : int
The length of the time-series step
fillna_type : int
How will qlib handle the sample if there is on sample in a specific date.
none:
fill with np.nan
ffill:
ffill with previous sample
ffill+bfill:
ffill with previous samples first and fill with later samples second
"""
self.start = start
self.end = end
self.step_len = step_len
self.fillna_type = fillna_type
assert get_level_index(data, "datetime") == 0
self.data = lazy_sort_index(data)
self.data_arr = np.array(self.data) # Get index from numpy.array will much faster than DataFrame.values! But
# NOTE: append last line with full NaN for better performance in `__getitem__`
self.data_arr = np.append(self.data_arr, np.full((1, self.data_arr.shape[1]), np.nan), axis=0)
self.nan_idx = -1 # The last line is all NaN
# the data type will be changed
# The index of usable data is between start_idx and end_idx
self.start_idx, self.end_idx = self.data.index.slice_locs(start=pd.Timestamp(start), end=pd.Timestamp(end))
# self.index_link = self.build_link(self.data)
self.idx_df, self.idx_map = self.build_index(self.data)
self.idx_arr = np.array(self.idx_df.values, dtype=np.float64) # for better performance
def get_index(self):
"""
Get the pandas index of the data, it will be useful in following scenarios
- Special sampler will be used (e.g. user want to sample day by day)
"""
return self.data.index[self.start_idx : self.end_idx]
def config(self, **kwargs):
# Config the attributes
for k, v in kwargs.items():
setattr(self, k, v)
@staticmethod
def build_index(data: pd.DataFrame) -> dict:
"""
The relation of the data
Parameters
----------
data : pd.DataFrame
The dataframe with <datetime, DataFrame>
Returns
-------
dict:
{<index>: <prev_index or None>}
# get the previous index of a line given index
"""
# object incase of pandas converting int to flaot
idx_df = pd.Series(range(data.shape[0]), index=data.index, dtype=np.object)
idx_df = lazy_sort_index(idx_df.unstack())
# NOTE: the correctness of `__getitem__` depends on columns sorted here
idx_df = lazy_sort_index(idx_df, axis=1)
idx_map = {}
for i, (_, row) in enumerate(idx_df.iterrows()):
for j, real_idx in enumerate(row):
if not np.isnan(real_idx):
idx_map[real_idx] = (i, j)
return idx_df, idx_map
def _get_indices(self, row: int, col: int) -> np.array:
"""
get series indices of self.data_arr from the row, col indices of self.idx_df
Parameters
----------
row : int
the row in self.idx_df
col : int
the col in self.idx_df
Returns
-------
np.array:
The indices of data of the data
"""
indices = self.idx_arr[max(row - self.step_len + 1, 0) : row + 1, col]
if len(indices) < self.step_len:
indices = np.concatenate([np.full((self.step_len - len(indices),), np.nan), indices])
if self.fillna_type == "ffill":
indices = np_ffill(indices)
elif self.fillna_type == "ffill+bfill":
indices = np_ffill(np_ffill(indices)[::-1])[::-1]
else:
assert self.fillna_type == "none"
return indices
def _get_row_col(self, idx) -> Tuple[int]:
"""
get the col index and row index of a given sample index in self.idx_df
Parameters
----------
idx :
the input of `__getitem__`
Returns
-------
Tuple[int]:
the row and col index
"""
# The the right row number `i` and col number `j` in idx_df
if isinstance(idx, (int, np.integer)):
real_idx = self.start_idx + idx
if self.start_idx <= real_idx < self.end_idx:
i, j = self.idx_map[real_idx] # TODO: The performance of this line is not good
else:
raise KeyError(f"{real_idx} is out of [{self.start_idx}, {self.end_idx})")
elif isinstance(idx, tuple):
# <TSDataSampler object>["datetime", "instruments"]
date, inst = idx
date = pd.Timestamp(date)
i = bisect.bisect_right(self.idx_df.index, date) - 1
# NOTE: This relies on the idx_df columns sorted in `__init__`
j = bisect.bisect_left(self.idx_df.columns, inst)
else:
raise NotImplementedError(f"This type of input is not supported")
return i, j
def __getitem__(self, idx: Union[int, Tuple[object, str], List[int]]):
"""
# We have two method to get the time-series of a sample
tsds is a instance of TSDataSampler
# 1) sample by int index directly
tsds[len(tsds) - 1]
# 2) sample by <datetime,instrument> index
tsds['2016-12-31', "SZ300315"]
# The return value will be similar to the data retrieved by following code
df.loc(axis=0)['2015-01-01':'2016-12-31', "SZ300315"].iloc[-30:]
Parameters
----------
idx : Union[int, Tuple[object, str]]
"""
# Multi-index type
mtit = (list, np.ndarray)
if isinstance(idx, mtit):
indices = [self._get_indices(*self._get_row_col(i)) for i in idx]
indices = np.concatenate(indices)
else:
indices = self._get_indices(*self._get_row_col(idx))
# 1) for better performance, use the last nan line for padding the lost date
# 2) In case of precision problems. We use np.float64. # TODO: I'm not sure if whether np.float64 will result in
# precision problems. It will not cause any problems in my tests at least
indices = np.nan_to_num(indices.astype(np.float64), nan=self.nan_idx).astype(np.int)
data = self.data_arr[indices]
if isinstance(idx, mtit):
# if we get multiple indexes, addition dimension should be added.
# <sample_idx, step_idx, feature_idx>
data = data.reshape(-1, self.step_len, *data.shape[1:])
return data
def __len__(self):
return self.end_idx - self.start_idx
class TSDatasetH(DatasetH):
"""
(T)ime-(S)eries Dataset (H)andler
Covnert the tabular data to Time-Series data
Requirements analysis
The typical workflow of a user to get time-series data for an sample
- process features
- slice proper data from data handler: dimension of sample <feature, >
- Build relation of samples by <time, instrument> index
- Be able to sample times series of data <timestep, feature>
- It will be better if the interface is like "torch.utils.data.Dataset"
- User could build customized batch based on the data
- The dimension of a batch of data <batch_idx, feature, timestep>
"""
def __init__(self, step_len=30, *args, **kwargs):
self.step_len = step_len
super().__init__(*args, **kwargs)
def setup_data(self, *args, **kwargs):
super().setup_data(*args, **kwargs)
cal = self.handler.fetch(col_set=self.handler.CS_RAW).index.get_level_values("datetime").unique()
cal = sorted(cal)
# Get the datatime index for building timestamp
self.cal = cal
def _prepare_seg(self, slc: slice, **kwargs) -> TSDataSampler:
# Dataset decide how to slice data(Get more data for timeseries).
start, end = slc.start, slc.stop
start_idx = bisect.bisect_left(self.cal, pd.Timestamp(start))
pad_start_idx = max(0, start_idx - self.step_len)
pad_start = self.cal[pad_start_idx]
# TSDatasetH will retrieve more data for complete
data = super()._prepare_seg(slice(pad_start, end), **kwargs)
tsds = TSDataSampler(data=data, start=start, end=end, step_len=self.step_len)
return tsds

View File

@@ -83,22 +83,42 @@ class DataHandler(Serializable):
# Setup data loader
assert data_loader is not None # to make start_time end_time could have None default value
# what data source to load data
self.data_loader = init_instance_by_config(
data_loader,
None if (isinstance(data_loader, dict) and "module_path" in data_loader) else data_loader_module,
accept_types=DataLoader,
)
# what data to be loaded from data source
# For IDE auto-completion.
self.instruments = instruments
self.start_time = start_time
self.end_time = end_time
self.fetch_orig = fetch_orig
if init_data:
with TimeInspector.logt("Init data"):
self.init()
super().__init__()
def init(self, enable_cache: bool = True):
def conf_data(self, **kwargs):
"""
configuration of data.
# what data to be loaded from data source
This method will be used when loading pickled handler from dataset.
The data will be initialized with different time range.
"""
attr_list = {"instruments", "start_time", "end_time"}
for k, v in kwargs.items():
if k in attr_list:
setattr(self, k, v)
else:
raise KeyError("Such config is not supported.")
def init(self, enable_cache: bool = False):
"""
initialize the data.
In case of running intialization for multiple time, it will do nothing for the second time.
@@ -155,6 +175,9 @@ class DataHandler(Serializable):
select a set of meaningful columns.(e.g. features, columns)
if cal_set == CS_RAW:
the raw dataset will be returned.
- if isinstance(col_set, List[str]):
select several sets of meaningful columns, the returned data has multiple levels
@@ -259,6 +282,7 @@ class DataHandlerLP(DataHandler):
infer_processors=[],
learn_processors=[],
process_type=PTYPE_A,
drop_raw=False,
**kwargs,
):
"""
@@ -300,6 +324,8 @@ class DataHandlerLP(DataHandler):
- self._learn will be processed by infer_processors + learn_processors
- (e.g. self._infer processed by learn_processors )
drop_raw: bool
Whether to drop the raw data
"""
# Setup preprocessor
@@ -316,6 +342,7 @@ class DataHandlerLP(DataHandler):
)
self.process_type = process_type
self.drop_raw = drop_raw
super().__init__(instruments, start_time, end_time, data_loader, **kwargs)
def get_all_processors(self):
@@ -345,7 +372,7 @@ class DataHandlerLP(DataHandler):
"""
# data for inference
_infer_df = self._data
if len(self.infer_processors) > 0: # avoid modifying the original data
if len(self.infer_processors) > 0 and not self.drop_raw: # avoid modifying the original data
_infer_df = _infer_df.copy()
for proc in self.infer_processors:
@@ -375,6 +402,9 @@ class DataHandlerLP(DataHandler):
_learn_df = proc(_learn_df)
self._learn = _learn_df
if self.drop_raw:
del self._data
# init type
IT_FIT_SEQ = "fit_seq" # the input of `fit` will be the output of the previous processor
IT_FIT_IND = "fit_ind" # the input of `fit` will be the original df
@@ -413,6 +443,10 @@ class DataHandlerLP(DataHandler):
# TODO: Be able to cache handler data. Save the memory for data processing
def _get_df_by_key(self, data_key: str = DK_I) -> pd.DataFrame:
if data_key == self.DK_R and self.drop_raw:
raise AttributeError(
"DataHandlerLP has not attribute _data, please set drop_raw = False if you want to use raw data"
)
df = getattr(self, {self.DK_R: "_data", self.DK_I: "_infer", self.DK_L: "_learn"}[data_key])
return df

View File

@@ -10,7 +10,9 @@ import pandas as pd
from typing import Tuple, Union
from qlib.data import D
from qlib.utils import load_dataset
from qlib.data import filter as filter_module
from qlib.data.filter import BaseDFilter
from qlib.utils import load_dataset, init_instance_by_config
class DataLoader(abc.ABC):
@@ -76,6 +78,7 @@ class DLWParser(DataLoader):
<config> := <fields_info>
<fields_info> := ["expr", ...] | (["expr", ...], ["col_name", ...])
# NOTE: list or tuple will be treated as the things when parsing
"""
self.is_group = isinstance(config, dict)
@@ -85,9 +88,15 @@ class DLWParser(DataLoader):
self.fields = self._parse_fields_info(config)
def _parse_fields_info(self, fields_info: Tuple[list, tuple]) -> Tuple[list, list]:
if isinstance(fields_info, list):
if len(fields_info) == 0:
raise ValueError("The size of fields must be greater than 0")
if not isinstance(fields_info, (list, tuple)):
raise TypeError("Unsupported type")
if isinstance(fields_info[0], str):
exprs = names = fields_info
elif isinstance(fields_info, tuple):
elif isinstance(fields_info[0], (list, tuple)):
exprs, names = fields_info
else:
raise NotImplementedError(f"This type of input is not supported")
@@ -132,7 +141,7 @@ class DLWParser(DataLoader):
class QlibDataLoader(DLWParser):
"""Same as QlibDataLoader. The fields can be define by config"""
def __init__(self, config: Tuple[list, tuple, dict], filter_pipe=None):
def __init__(self, config: Tuple[list, tuple, dict], filter_pipe=None, swap_level=True, freq="day"):
"""
Parameters
----------
@@ -140,8 +149,19 @@ class QlibDataLoader(DLWParser):
Please refer to the doc of DLWParser
filter_pipe :
Filter pipe for the instruments
swap_level :
Whether to swap level of MultiIndex
"""
if filter_pipe is not None:
assert isinstance(filter_pipe, list), "The type of `filter_pipe` must be list."
filter_pipe = [
init_instance_by_config(fp, None if "module_path" in fp else filter_module, accept_types=BaseDFilter)
for fp in filter_pipe
]
self.filter_pipe = filter_pipe
self.swap_level = swap_level
self.freq = freq
super().__init__(config)
def load_group_df(self, instruments, exprs: list, names: list, start_time=None, end_time=None) -> pd.DataFrame:
@@ -153,9 +173,10 @@ class QlibDataLoader(DLWParser):
elif self.filter_pipe is not None:
warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list")
df = D.features(instruments, exprs, start_time, end_time)
df = D.features(instruments, exprs, start_time, end_time, self.freq)
df.columns = names
df = df.swaplevel().sort_index() # NOTE: always return <datetime, instrument>
if self.swap_level:
df = df.swaplevel().sort_index() # NOTE: if swaplevel, return <datetime, instrument>
return df

View File

@@ -102,6 +102,21 @@ class DropCol(Processor):
return df.loc[:, ~mask]
class FilterCol(Processor):
def __init__(self, fields_group="feature", col_list=[]):
self.fields_group = fields_group
self.col_list = col_list
def __call__(self, df):
cols = get_group_columns(df, self.fields_group)
all_cols = df.columns
diff_cols = np.setdiff1d(all_cols.get_level_values(-1), cols.get_level_values(-1))
self.col_list = np.union1d(diff_cols, self.col_list)
mask = df.columns.get_level_values(-1).isin(self.col_list)
return df.loc[:, mask]
class TanhProcess(Processor):
""" Use tanh to process noise data"""

View File

@@ -6,6 +6,7 @@ from __future__ import division
from __future__ import print_function
import sys
import abc
import numpy as np
import pandas as pd
@@ -17,57 +18,12 @@ from ..log import get_module_logger
try:
from ._libs.rolling import rolling_slope, rolling_rsquare, rolling_resi
from ._libs.expanding import expanding_slope, expanding_rsquare, expanding_resi
except ImportError as err:
print("Do not import qlib package in the repository directory!")
except ImportError:
print(
"#### Do not import qlib package in the repository directory in case of importing qlib from . without compiling #####"
)
raise
__all__ = (
"Ref",
"Max",
"Min",
"Sum",
"Mean",
"Std",
"Var",
"Skew",
"Kurt",
"Med",
"Mad",
"Slope",
"Rsquare",
"Resi",
"Rank",
"Quantile",
"Count",
"EMA",
"WMA",
"Corr",
"Cov",
"Delta",
"Abs",
"Sign",
"Log",
"Power",
"Add",
"Sub",
"Mul",
"Div",
"Greater",
"Less",
"And",
"Or",
"Not",
"Gt",
"Ge",
"Lt",
"Le",
"Eq",
"Ne",
"Mask",
"IdxMax",
"IdxMin",
"If",
)
np.seterr(invalid="ignore")
@@ -77,12 +33,39 @@ np.seterr(invalid="ignore")
class ElemOperator(ExpressionOps):
"""Element-wise Operator
Parameters
----------
feature : Expression
feature instance
Returns
----------
Expression
feature operation output
"""
def __init__(self, feature):
self.feature = feature
def __str__(self):
return "{}({})".format(type(self).__name__, self.feature)
def get_longest_back_rolling(self):
return self.feature.get_longest_back_rolling()
def get_extended_window_size(self):
return self.feature.get_extended_window_size()
class NpElemOperator(ElemOperator):
"""Numpy Element-wise Operator
Parameters
----------
feature : Expression
feature instance
func : str
feature operation method
numpy feature operation method
Returns
----------
@@ -93,22 +76,14 @@ class ElemOperator(ExpressionOps):
def __init__(self, feature, func):
self.feature = feature
self.func = func
def __str__(self):
return "{}({})".format(type(self).__name__, self.feature)
super(NpElemOperator, self).__init__(feature)
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
return getattr(np, self.func)(series)
def get_longest_back_rolling(self):
return self.feature.get_longest_back_rolling()
def get_extended_window_size(self):
return self.feature.get_extended_window_size()
class Abs(ElemOperator):
class Abs(NpElemOperator):
"""Feature Absolute Value
Parameters
@@ -126,7 +101,7 @@ class Abs(ElemOperator):
super(Abs, self).__init__(feature, "abs")
class Sign(ElemOperator):
class Sign(NpElemOperator):
"""Feature Sign
Parameters
@@ -143,8 +118,17 @@ class Sign(ElemOperator):
def __init__(self, feature):
super(Sign, self).__init__(feature, "sign")
def _load_internal(self, instrument, start_index, end_index, freq):
"""
To avoid error raised by bool type input, we transform the data into float32.
"""
series = self.feature.load(instrument, start_index, end_index, freq)
# TODO: More precision types should be configurable
series = series.astype(np.float32)
return getattr(np, self.func)(series)
class Log(ElemOperator):
class Log(NpElemOperator):
"""Feature Log
Parameters
@@ -162,7 +146,7 @@ class Log(ElemOperator):
super(Log, self).__init__(feature, "log")
class Power(ElemOperator):
class Power(NpElemOperator):
"""Feature Power
Parameters
@@ -188,7 +172,7 @@ class Power(ElemOperator):
return getattr(np, self.func)(series, self.exponent)
class Mask(ElemOperator):
class Mask(NpElemOperator):
"""Feature Mask
Parameters
@@ -215,7 +199,7 @@ class Mask(ElemOperator):
return self.feature.load(self.instrument, start_index, end_index, freq)
class Not(ElemOperator):
class Not(NpElemOperator):
"""Not Operator
Parameters
@@ -254,28 +238,13 @@ class PairOperator(ExpressionOps):
two features' operation output
"""
def __init__(self, feature_left, feature_right, func):
def __init__(self, feature_left, feature_right):
self.feature_left = feature_left
self.feature_right = feature_right
self.func = func
def __str__(self):
return "{}({},{})".format(type(self).__name__, self.feature_left, self.feature_right)
def _load_internal(self, instrument, start_index, end_index, freq):
assert any(
[isinstance(self.feature_left, Expression), self.feature_right, Expression]
), "at least one of two inputs is Expression instance"
if isinstance(self.feature_left, Expression):
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
else:
series_left = self.feature_left # numeric value
if isinstance(self.feature_right, Expression):
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
else:
series_right = self.feature_right
return getattr(np, self.func)(series_left, series_right)
def get_longest_back_rolling(self):
if isinstance(self.feature_left, Expression):
left_br = self.feature_left.get_longest_back_rolling()
@@ -301,7 +270,46 @@ class PairOperator(ExpressionOps):
return max(ll, rl), max(lr, rr)
class Add(PairOperator):
class NpPairOperator(PairOperator):
"""Numpy Pair-wise operator
Parameters
----------
feature_left : Expression
feature instance or numeric value
feature_right : Expression
feature instance or numeric value
func : str
operator function
Returns
----------
Feature:
two features' operation output
"""
def __init__(self, feature_left, feature_right, func):
self.feature_left = feature_left
self.feature_right = feature_right
self.func = func
super(NpPairOperator, self).__init__(feature_left, feature_right)
def _load_internal(self, instrument, start_index, end_index, freq):
assert any(
[isinstance(self.feature_left, Expression), self.feature_right, Expression]
), "at least one of two inputs is Expression instance"
if isinstance(self.feature_left, Expression):
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
else:
series_left = self.feature_left # numeric value
if isinstance(self.feature_right, Expression):
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
else:
series_right = self.feature_right
return getattr(np, self.func)(series_left, series_right)
class Add(NpPairOperator):
"""Add Operator
Parameters
@@ -321,7 +329,7 @@ class Add(PairOperator):
super(Add, self).__init__(feature_left, feature_right, "add")
class Sub(PairOperator):
class Sub(NpPairOperator):
"""Subtract Operator
Parameters
@@ -341,7 +349,7 @@ class Sub(PairOperator):
super(Sub, self).__init__(feature_left, feature_right, "subtract")
class Mul(PairOperator):
class Mul(NpPairOperator):
"""Multiply Operator
Parameters
@@ -361,7 +369,7 @@ class Mul(PairOperator):
super(Mul, self).__init__(feature_left, feature_right, "multiply")
class Div(PairOperator):
class Div(NpPairOperator):
"""Division Operator
Parameters
@@ -381,7 +389,7 @@ class Div(PairOperator):
super(Div, self).__init__(feature_left, feature_right, "divide")
class Greater(PairOperator):
class Greater(NpPairOperator):
"""Greater Operator
Parameters
@@ -401,7 +409,7 @@ class Greater(PairOperator):
super(Greater, self).__init__(feature_left, feature_right, "maximum")
class Less(PairOperator):
class Less(NpPairOperator):
"""Less Operator
Parameters
@@ -421,7 +429,7 @@ class Less(PairOperator):
super(Less, self).__init__(feature_left, feature_right, "minimum")
class Gt(PairOperator):
class Gt(NpPairOperator):
"""Greater Than Operator
Parameters
@@ -441,7 +449,7 @@ class Gt(PairOperator):
super(Gt, self).__init__(feature_left, feature_right, "greater")
class Ge(PairOperator):
class Ge(NpPairOperator):
"""Greater Equal Than Operator
Parameters
@@ -461,7 +469,7 @@ class Ge(PairOperator):
super(Ge, self).__init__(feature_left, feature_right, "greater_equal")
class Lt(PairOperator):
class Lt(NpPairOperator):
"""Less Than Operator
Parameters
@@ -481,7 +489,7 @@ class Lt(PairOperator):
super(Lt, self).__init__(feature_left, feature_right, "less")
class Le(PairOperator):
class Le(NpPairOperator):
"""Less Equal Than Operator
Parameters
@@ -501,7 +509,7 @@ class Le(PairOperator):
super(Le, self).__init__(feature_left, feature_right, "less_equal")
class Eq(PairOperator):
class Eq(NpPairOperator):
"""Equal Operator
Parameters
@@ -521,7 +529,7 @@ class Eq(PairOperator):
super(Eq, self).__init__(feature_left, feature_right, "equal")
class Ne(PairOperator):
class Ne(NpPairOperator):
"""Not Equal Operator
Parameters
@@ -541,7 +549,7 @@ class Ne(PairOperator):
super(Ne, self).__init__(feature_left, feature_right, "not_equal")
class And(PairOperator):
class And(NpPairOperator):
"""And Operator
Parameters
@@ -561,7 +569,7 @@ class And(PairOperator):
super(And, self).__init__(feature_left, feature_right, "bitwise_and")
class Or(PairOperator):
class Or(NpPairOperator):
"""Or Operator
Parameters
@@ -1430,3 +1438,93 @@ class Cov(PairRolling):
def __init__(self, feature_left, feature_right, N):
super(Cov, self).__init__(feature_left, feature_right, N, "cov")
OpsList = [
Ref,
Max,
Min,
Sum,
Mean,
Std,
Var,
Skew,
Kurt,
Med,
Mad,
Slope,
Rsquare,
Resi,
Rank,
Quantile,
Count,
EMA,
WMA,
Corr,
Cov,
Delta,
Abs,
Sign,
Log,
Power,
Add,
Sub,
Mul,
Div,
Greater,
Less,
And,
Or,
Not,
Gt,
Ge,
Lt,
Le,
Eq,
Ne,
Mask,
IdxMax,
IdxMin,
If,
]
class OpsWrapper(object):
"""Ops Wrapper"""
def __init__(self):
self._ops = {}
def reset(self):
self._ops = {}
def register(self, ops_list):
for operator in ops_list:
if not issubclass(operator, ExpressionOps):
raise TypeError("operator must be subclass of ExpressionOps, not {}".format(operator))
if operator.__name__ in self._ops:
get_module_logger(self.__class__.__name__).warning(
"The custom operator [{}] will override the qlib default definition".format(operator.__name__)
)
self._ops[operator.__name__] = operator
def __getattr__(self, key):
if key not in self._ops:
raise AttributeError("The operator [{0}] is not registered".format(key))
return self._ops[key]
Operators = OpsWrapper()
def register_all_ops(C):
"""register all operator"""
logger = get_module_logger("ops")
Operators.reset()
Operators.register(OpsList)
if getattr(C, "custom_ops", None) is not None:
Operators.register(C.custom_ops)
logger.debug("register custom operator {}".format(C.custom_ops))

View File

@@ -36,7 +36,7 @@ def get_module_logger(module_name, level=None):
return module_logger
class TimeInspector(object):
class TimeInspector:
timer_logger = get_module_logger("timer", level=logging.WARNING)

View File

@@ -30,11 +30,6 @@ class Model(BaseModel):
The attribute names of learned model should `not` start with '_'. So that the model could be
dumped to disk.
Parameters
----------
dataset : Dataset
dataset will generate the processed data from model training.
The following code example shows how to retrieve `x_train`, `y_train` and `w_train` from the `dataset`:
.. code-block:: Python
@@ -53,6 +48,12 @@ class Model(BaseModel):
except KeyError as e:
w_train = pd.DataFrame(np.ones_like(y_train.values), index=y_train.index)
w_valid = pd.DataFrame(np.ones_like(y_valid.values), index=y_valid.index)
Parameters
----------
dataset : Dataset
dataset will generate the processed data from model training.
"""
raise NotImplementedError()

View File

@@ -9,7 +9,7 @@ import scipy.optimize as so
from typing import Optional, Union, Callable, List
class PortfolioOptimizer(object):
class PortfolioOptimizer:
"""Portfolio Optimizer
The following optimization algorithms are supported:

27
qlib/tests/__init__.py Normal file
View File

@@ -0,0 +1,27 @@
import sys
import unittest
from ..utils import exists_qlib_data
from .data import GetData
from .. import init
from ..config import REG_CN
class TestAutoData(unittest.TestCase):
_setup_kwargs = {}
@classmethod
def setUpClass(cls) -> None:
# use default data
provider_uri = "~/.qlib/qlib_data/cn_data_simple" # target_dir
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
GetData().qlib_data(
name="qlib_data_simple",
region="cn",
interval="1d",
target_dir=provider_uri,
delete_old=False,
)
init(provider_uri=provider_uri, region=REG_CN, **cls._setup_kwargs)

167
qlib/tests/data.py Normal file
View File

@@ -0,0 +1,167 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import re
import qlib
import shutil
import zipfile
import requests
import datetime
from tqdm import tqdm
from pathlib import Path
from loguru import logger
class GetData:
DATASET_VERSION = "v1"
REMOTE_URL = "http://fintech.msra.cn/stock_data/downloads"
QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip"
def __init__(self, delete_zip_file=False):
"""
Parameters
----------
delete_zip_file : bool, optional
Whether to delete the zip file, value from True or False, by default False
"""
self.delete_zip_file = delete_zip_file
def normalize_dataset_version(self, dataset_version: str = None):
if dataset_version is None:
dataset_version = self.DATASET_VERSION
return dataset_version
def merge_remote_url(self, file_name: str, dataset_version: str = None):
return f"{self.REMOTE_URL}/{self.normalize_dataset_version(dataset_version)}/{file_name}"
def _download_data(
self, file_name: str, target_dir: [Path, str], delete_old: bool = True, dataset_version: str = None
):
target_dir = Path(target_dir).expanduser()
target_dir.mkdir(exist_ok=True, parents=True)
# saved file name
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name
target_path = target_dir.joinpath(_target_file_name)
url = self.merge_remote_url(file_name, dataset_version)
resp = requests.get(url, stream=True)
if resp.status_code != 200:
raise requests.exceptions.HTTPError()
chunk_size = 1024
logger.warning(
f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)"
)
logger.info(f"{file_name} downloading......")
with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar:
with target_path.open("wb") as fp:
for chunk in resp.iter_content(chunk_size=chunk_size):
fp.write(chunk)
p_bar.update(chunk_size)
self._unzip(target_path, target_dir, delete_old)
if self.delete_zip_file:
target_path.unlink()
def check_dataset(self, file_name: str, dataset_version: str = None):
url = self.merge_remote_url(file_name, dataset_version)
resp = requests.get(url, stream=True)
status = True
if resp.status_code == 404:
status = False
return status
@staticmethod
def _unzip(file_path: Path, target_dir: Path, delete_old: bool = True):
if delete_old:
logger.warning(
f"will delete the old qlib data directory(features, instruments, calendars, features_cache, dataset_cache): {target_dir}"
)
GetData._delete_qlib_data(target_dir)
logger.info(f"{file_path} unzipping......")
with zipfile.ZipFile(str(file_path.resolve()), "r") as zp:
for _file in tqdm(zp.namelist()):
zp.extract(_file, str(target_dir.resolve()))
@staticmethod
def _delete_qlib_data(file_dir: Path):
logger.info(f"delete {file_dir}")
rm_dirs = []
for _name in ["features", "calendars", "instruments", "features_cache", "dataset_cache"]:
_p = file_dir.joinpath(_name)
if _p.exists():
rm_dirs.append(str(_p.resolve()))
if rm_dirs:
flag = input(
f"Will be deleted: "
f"\n\t{rm_dirs}"
f"\nIf you do not need to delete {file_dir}, please change the <--target_dir>"
f"\nAre you sure you want to delete, yes(Y/y), no (N/n):"
)
if str(flag) not in ["Y", "y"]:
exit()
for _p in rm_dirs:
logger.warning(f"delete: {_p}")
shutil.rmtree(_p)
def qlib_data(
self,
name="qlib_data",
target_dir="~/.qlib/qlib_data/cn_data",
version=None,
interval="1d",
region="cn",
delete_old=True,
):
"""download cn qlib data from remote
Parameters
----------
target_dir: str
data save directory
name: str
dataset name, value from [qlib_data, qlib_data_simple], by default qlib_data
version: str
data version, value from [v1, ...], by default None(use script to specify version)
interval: str
data freq, value from [1d], by default 1d
region: str
data region, value from [cn, us], by default cn
delete_old: bool
delete an existing directory, by default True
Examples
---------
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
-------
"""
qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__))
def _get_file_name(v):
return self.QLIB_DATA_NAME.format(
dataset_name=name, region=region.lower(), interval=interval.lower(), qlib_version=v
)
file_name = _get_file_name(qlib_version)
if not self.check_dataset(file_name, version):
file_name = _get_file_name("latest")
self._download_data(file_name.lower(), target_dir, delete_old, dataset_version=version)
def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"):
"""download cn csv data from remote
Parameters
----------
target_dir: str
data save directory
Examples
---------
python get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data
-------
"""
file_name = "csv_data_cn.zip"
self._download_data(file_name, target_dir)

View File

@@ -27,7 +27,7 @@ from pathlib import Path
from typing import Union, Tuple
from ..config import C
from ..log import get_module_logger
from ..log import get_module_logger, set_log_with_config
log = get_module_logger("utils")
@@ -55,6 +55,22 @@ def read_bin(file_path, start_index, end_index):
return series
def np_ffill(arr: np.array):
"""
forward fill a 1D numpy array
Parameters
----------
arr : np.array
Input numpy 1D array
"""
mask = np.isnan(arr.astype(np.float)) # np.isnan only works on np.float
# get fill index
idx = np.where(~mask, np.arange(mask.shape[0]), 0)
np.maximum.accumulate(idx, out=idx)
return arr[idx]
#################### Search ####################
def lower_bound(data, val, level=0):
"""multi fields list lower bound.
@@ -146,7 +162,7 @@ def parse_field(field):
# - $open+$close -> Feature("open")+Feature("close")
if not isinstance(field, str):
field = str(field)
return re.sub(r"\$(\w+)", r'Feature("\1")', field)
return re.sub(r"\$(\w+)", r'Feature("\1")', re.sub(r"(\w+\s*)\(", r"Operators.\1(", field))
def get_module_by_module_path(module_path):
@@ -263,8 +279,10 @@ def compare_dict_value(src_data: dict, dst_data: dict):
def create_save_path(save_path=None):
"""Create save path
:param save_path:
:return:
Parameters
----------
save_path: str
"""
if save_path:
if not os.path.exists(save_path):
@@ -455,30 +473,28 @@ def is_tradable_date(cur_date):
return str(cur_date.date()) == str(D.calendar(start_time=cur_date, future=True)[0].date())
def get_date_range(trading_date, shift, future=False):
def get_date_range(trading_date, left_shift=0, right_shift=0, future=False):
"""get trading date range by shift
:param trading_date:
:param shift: int
:param future: bool
:return:
Parameters
----------
trading_date: pd.Timestamp
left_shift: int
right_shift: int
future: bool
"""
from ..data import D
calendar = D.calendar(future=future)
if pd.to_datetime(trading_date) not in list(calendar):
raise ValueError("{} is not trading day!".format(str(trading_date)))
day_index = bisect.bisect_left(calendar, trading_date)
if 0 <= (day_index + shift) < len(calendar):
if shift > 0:
return calendar[day_index + 1 : day_index + 1 + shift]
else:
return calendar[day_index + shift : day_index]
else:
return calendar
start = get_date_by_shift(trading_date, left_shift, future=future)
end = get_date_by_shift(trading_date, right_shift, future=future)
calendar = D.calendar(start, end, future=future)
return calendar
def get_date_by_shift(trading_date, shift, future=False):
def get_date_by_shift(trading_date, shift, future=False, clip_shift=True):
"""get trading date with shift bias wil cur_date
e.g. : shift == 1, return next trading date
shift == -1, return previous trading date
@@ -486,8 +502,22 @@ def get_date_by_shift(trading_date, shift, future=False):
trading_date : pandas.Timestamp
current date
shift : int
clip_shift: bool
"""
return get_date_range(trading_date, shift, future)[0 if shift < 0 else -1] if shift != 0 else trading_date
from qlib.data import D
cal = D.calendar(future=future)
if pd.to_datetime(trading_date) not in list(cal):
raise ValueError("{} is not trading day!".format(str(trading_date)))
_index = bisect.bisect_left(cal, trading_date)
shift_index = _index + shift
if shift_index < 0 or shift_index >= len(cal):
if clip_shift:
shift_index = np.clip(shift_index, 0, len(cal) - 1)
else:
raise IndexError(f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range")
return cal[shift_index]
def get_next_trading_date(trading_date, future=False):
@@ -613,18 +643,31 @@ def exists_qlib_data(qlib_dir):
# check instruments
code_names = set(map(lambda x: x.name.lower(), features_dir.iterdir()))
_instrument = instruments_dir.joinpath("all.txt")
df = pd.read_csv(_instrument, sep="\t", names=["inst", "start_datetime", "end_datetime", "save_inst"])
df = df.iloc[:, [0, -1]].fillna(axis=1, method="ffill")
miss_code = set(df.iloc[:, -1].apply(str.lower)) - set(code_names)
miss_code = set(pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower)) - set(code_names)
if miss_code and any(map(lambda x: "sht" not in x, miss_code)):
return False
return True
def lexsort_index(df: pd.DataFrame) -> pd.DataFrame:
def check_qlib_data(qlib_config):
inst_dir = Path(qlib_config["provider_uri"]).joinpath("instruments")
for _p in inst_dir.glob("*.txt"):
try:
assert len(pd.read_csv(_p, sep="\t", nrows=0, header=None).columns) == 3, (
f"\nThe {str(_p.resolve())} of qlib data is not equal to 3 columns:"
f"\n\tIf you are using the data provided by qlib: "
f"https://qlib.readthedocs.io/en/latest/component/data.html#qlib-format-dataset"
f"\n\tIf you are using your own data, please dump the data again: "
f"https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format"
)
except AssertionError:
raise
def lazy_sort_index(df: pd.DataFrame, axis=0) -> pd.DataFrame:
"""
make the df index lexsorted
make the df index sorted
df.sort_index() will take a lot of time even when `df.is_lexsorted() == True`
This function could avoid such case
@@ -638,10 +681,11 @@ def lexsort_index(df: pd.DataFrame) -> pd.DataFrame:
pd.DataFrame:
sorted dataframe
"""
if df.index.is_lexsorted():
idx = df.index if axis == 0 else df.columns
if idx.is_monotonic_increasing:
return df
else:
return df.sort_index()
return df.sort_index(axis=axis)
def flatten_dict(d, parent_key="", sep="."):
@@ -669,7 +713,7 @@ def flatten_dict(d, parent_key="", sep="."):
#################### Wrapper #####################
class Wrapper(object):
class Wrapper:
"""Wrapper class for anything that needs to set up during qlib.init"""
def __init__(self):
@@ -711,3 +755,36 @@ def load_dataset(path_or_obj):
elif extension == ".csv":
return pd.read_csv(path_or_obj, parse_dates=True, index_col=[0, 1])
raise ValueError(f"unsupported file type `{extension}`")
def code_to_fname(code: str):
"""stock code to file name
Parameters
----------
code: str
"""
# NOTE: In windows, the following name is I/O device, and the file with the corresponding name cannot be created
# reference: https://superuser.com/questions/86999/why-cant-i-name-a-folder-or-file-con-in-windows
replace_names = ["CON", "PRN", "AUX", "NUL"]
replace_names += [f"COM{i}" for i in range(10)]
replace_names += [f"LPT{i}" for i in range(10)]
prefix = "_qlib_"
if str(code).upper() in replace_names:
code = prefix + str(code)
return code
def fname_to_code(fname: str):
"""file name to stock code
Parameters
----------
fname: str
"""
prefix = "_qlib_"
if fname.startswith(prefix):
fname = fname.lstrip(prefix)
return fname

View File

@@ -27,11 +27,6 @@ class Serializable:
def dump_all(self):
"""
will the object dump all object
Parameters
----------
self : [TODO:type]
[TODO:description]
"""
return getattr(self, "_dump_all", False)
@@ -39,11 +34,6 @@ class Serializable:
def exclude(self):
"""
What attribute will be dumped
Parameters
----------
self : [TODO:type]
[TODO:description]
"""
return getattr(self, "_exclude", [])

View File

@@ -3,6 +3,7 @@
from contextlib import contextmanager
from .expm import MLflowExpManager
from .exp import Experiment
from .recorder import Recorder
from ..utils import Wrapper
@@ -165,7 +166,7 @@ class QlibRecorder:
"""
return self.get_exp(experiment_id, experiment_name).list_recorders()
def get_exp(self, experiment_id=None, experiment_name=None, create: bool = True):
def get_exp(self, experiment_id=None, experiment_name=None, create: bool = True) -> Experiment:
"""
Method for retrieving an experiment with given id or name. Once the `create` argument is set to
True, if no valid experiment is found, this method will create one for you. Otherwise, it will
@@ -461,5 +462,14 @@ class QlibRecorder:
self.get_exp().get_recorder().set_tags(**kwargs)
import sys
if sys.version_info >= (3, 9):
from typing import Annotated
QlibRecorderWrapper = Annotated[QlibRecorder, Wrapper]
else:
QlibRecorderWrapper = QlibRecorder
# global record
R = Wrapper()
R: QlibRecorderWrapper = Wrapper()

View File

@@ -44,7 +44,7 @@ def sys_config(config, config_path):
# worflow handler function
def workflow(config_path, experiment_name="workflow", uri_folder="mlruns"):
with open(config_path) as fp:
config = yaml.load(fp, Loader=yaml.Loader)
config = yaml.load(fp, Loader=yaml.SafeLoader)
# config the `sys` section
sys_config(config, config_path)

View File

@@ -65,13 +65,13 @@ class Experiment:
"""
raise NotImplementedError(f"Please implement the `end` method.")
def create_recorder(self, name=None):
def create_recorder(self, recorder_name=None):
"""
Create a recorder for each experiment.
Parameters
----------
name : str
recorder_name : str
the name of the recorder to be created.
Returns

View File

@@ -5,10 +5,9 @@ import re
import pandas as pd
from pathlib import Path
from pprint import pprint
from ..contrib.evaluate import (
backtest as normal_backtest,
risk_analysis,
)
from ..contrib.evaluate import risk_analysis
from ..contrib.backtest import backtest as normal_backtest
from ..data.dataset import DatasetH
from ..data.dataset.handler import DataHandlerLP
from ..utils import init_instance_by_config, get_module_by_module_path
@@ -130,18 +129,25 @@ class SignalRecord(RecordTemp):
pprint(f"The following are prediction results of the {type(self.model).__name__} model.")
pprint(pred.head(5))
# save according label
if isinstance(self.dataset, DatasetH):
params = dict(self=self.dataset, segments="test", col_set="label", data_key=DataHandlerLP.DK_R)
# NOTE:
# Python doesn't provide the downcasting mechanism.
# We use the trick here to downcast the class
orig_cls = self.dataset.__class__
self.dataset.__class__ = DatasetH
params = dict(segments="test", col_set="label", data_key=DataHandlerLP.DK_R)
try:
# Assume the backend handler is DataHandlerLP
raw_label = DatasetH.prepare(**params)
raw_label = self.dataset.prepare(**params)
except TypeError:
# The argument number is not right
del params["data_key"]
# The backend handler should be DataHandler
raw_label = DatasetH.prepare(**params)
raw_label = self.dataset.prepare(**params)
self.recorder.save_objects(**{"label.pkl": raw_label})
self.dataset.__class__ = orig_cls
def list(self):
return ["pred.pkl", "label.pkl"]
@@ -206,6 +212,11 @@ class SigAnaRecord(SignalRecord):
class PortAnaRecord(SignalRecord):
"""
This is the Portfolio Analysis Record class that generates the analysis results such as those of backtest. This class inherits the ``RecordTemp`` class.
The following files will be stored in recorder
- report_normal.pkl & positions_normal.pkl:
- The return report and detailed positions of the backtest, returned by `qlib/contrib/evaluate.py:backtest`
- port_analysis.pkl : The risk analysis of your portfolio, returned by `qlib/contrib/evaluate.py:risk_analysis`
"""
artifact_path = "portfolio_analysis"
@@ -229,9 +240,14 @@ class PortAnaRecord(SignalRecord):
# custom strategy and get backtest
pred_score = super().load()
report_normal, positions_normal = normal_backtest(pred_score, strategy=self.strategy, **self.backtest_config)
report_dict = normal_backtest(pred_score, strategy=self.strategy, **self.backtest_config)
report_normal = report_dict.get("report_df")
positions_normal = report_dict.get("positions")
self.recorder.save_objects(**{"report_normal.pkl": report_normal}, artifact_path=PortAnaRecord.get_path())
self.recorder.save_objects(**{"positions_normal.pkl": positions_normal}, artifact_path=PortAnaRecord.get_path())
order_normal = report_dict.get("order_list")
if order_normal:
self.recorder.save_objects(**{"order_normal.pkl": order_normal}, artifact_path=PortAnaRecord.get_path())
# analysis
analysis = dict()

View File

@@ -2,7 +2,7 @@
# Licensed under the MIT License.
import mlflow
import shutil, os, pickle, tempfile, codecs
import shutil, os, pickle, tempfile, codecs, pickle
from pathlib import Path
from datetime import datetime
from ..utils.objm import FileManager
@@ -202,9 +202,6 @@ class MLflowRecorder(Recorder):
super(MLflowRecorder, self).__init__(experiment_id, name)
self._uri = uri
self.artifact_uri = None
# set up file manager for saving objects
self.temp_dir = tempfile.mkdtemp()
self.fm = FileManager(Path(self.temp_dir).absolute())
self.client = mlflow.tracking.MlflowClient(tracking_uri=self._uri)
# construct from mlflow run
if mlflow_run is not None:
@@ -248,16 +245,18 @@ class MLflowRecorder(Recorder):
self.end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if self.status != Recorder.STATUS_S:
self.status = status
shutil.rmtree(self.temp_dir)
def save_objects(self, local_path=None, artifact_path=None, **kwargs):
assert self._uri is not None, "Please start the experiment and recorder first before using recorder directly."
if local_path is not None:
self.client.log_artifacts(self.id, local_path, artifact_path)
else:
temp_dir = Path(tempfile.mkdtemp()).resolve()
for name, data in kwargs.items():
self.fm.save_obj(data, name)
self.client.log_artifact(self.id, self.fm.path / name, artifact_path)
with (temp_dir / name).open("wb") as f:
pickle.dump(data, f)
self.client.log_artifact(self.id, temp_dir / name, artifact_path)
shutil.rmtree(temp_dir)
def load_object(self, name):
assert self._uri is not None, "Please start the experiment and recorder first before using recorder directly."

Some files were not shown because too many files have changed in this diff Show More