1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-03 02:50:58 +08:00

Supporting Arctic Backend Provider & Orderbook, Tick Data Example (#744)

* change weight_decay & batchsize

* del weight_decay

* big weight_decay

* mid weight_decay

* small layer

* 2 layer

* full layer

* no weight decay

* divide into two data source

* change parse field

* delete some debug

* add Toperator

* new format of arctic

* fix cache bug to arctic read

* fix connection problem

* add some operator

* final version for arcitc

* clear HZ cache

* remove not used function

* add topswrappers

* successfully import data and run first test

* A simpler version to support arctic

* Successfully run all high-freq expressions

* Black format and fix add docs

* Add docs for download and test data

* update scripts and docs

* Add docs

* fix bug

* Refine docs

* fix test bug

* fix CI error

* clean code

Co-authored-by: bxdd <bxddream@gmail.com>
Co-authored-by: wangwenxi.handsome <wangwenxi.handsome@gmail.com>
Co-authored-by: Young <afe.young@gmail.com>
This commit is contained in:
luocy16
2022-01-18 09:13:11 +08:00
committed by GitHub
parent 7f274b1e4e
commit 2bb8a4ce0e
16 changed files with 923 additions and 90 deletions

View File

@@ -0,0 +1,51 @@
# Introduction
This example demonstrates how Qlib supports data that does not have a fixed, shared frequency.
For example,
- Daily price and volume data are fixed-frequency data: they arrive at a fixed frequency (i.e. daily).
- Orders are not fixed-frequency data: they may arrive at any point in time.

To support such non-fixed-frequency data, Qlib implements an Arctic-based backend.
Here is an example of importing and querying data with this backend.
# Installation
Please refer to [the installation docs](https://docs.mongodb.com/manual/installation/) of MongoDB.
With its default settings, the current version of the script tries to connect to localhost **via the default port without authentication**.

Run the following command to install the necessary libraries:
```
pip install pytest
```
# Importing example data
1. (Optional) Please follow the first part of [this section](https://github.com/microsoft/qlib#data-preparation) to **get the 1min data** of Qlib.
2. Please run the following commands to download the example data:
```bash
cd examples/orderbook_data/
wget http://fintech.msra.cn/stock_data/downloads/highfreq_orderboook_example_data.tar.bz2
tar xf highfreq_orderboook_example_data.tar.bz2
```
3. Please import the example data into your MongoDB instance:
```bash
cd examples/orderbook_data/
python create_dataset.py initialize_library  # Initialize the libraries
python create_dataset.py import_data  # Import the example data
```
# Query Examples
After importing the data, you can run `example.py` to create some high-frequency features.
```bash
cd examples/orderbook_data/
pytest -s --disable-warnings example.py # If you want to run all examples
pytest -s --disable-warnings example.py::TestClass::test_exp_10 # If you want to run a specific example
```
# Known limitations
Expression computation across different frequencies is not supported yet.

View File

@@ -0,0 +1,315 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
NOTE:
- This script is a demo for importing example data into Qlib
- !!!!!!!!!!!!!!!TODO!!!!!!!!!!!!!!!!!!!:
- Its structure is not well designed and is very ugly; contributions that make importing datasets easier are welcome
"""
from datetime import date, datetime as dt
import os
from pathlib import Path
import random
import shutil
import time
import traceback
from arctic import Arctic, chunkstore
import arctic
from arctic import Arctic, CHUNK_STORE
from arctic.chunkstore.chunkstore import CHUNK_SIZE
import fire
from joblib import Parallel, delayed, parallel
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas.core.indexes.datetimes import date_range
from pymongo.mongo_client import MongoClient
# Directory that contains this script; every working directory below is resolved relative to it.
DIRNAME = Path(__file__).absolute().resolve().parent
# CONFIG
# NOTE: joblib interprets n_jobs=-1 as "use all CPUs"; n_jobs=-2 would leave one CPU free.
N_JOBS = -1
LOG_FILE_PATH = DIRNAME / "log_file"  # per-process failure / bad-timestamp logs
DATA_PATH = DIRNAME / "raw_data"  # scratch directory the daily archives are copied to and extracted in
DATABASE_PATH = DIRNAME / "orig_data"  # source directory holding the `<date>_<doc_type>.tar.gz` archives
DATA_INFO_PATH = DIRNAME / "data_info"  # per-day file-count logs
DATA_FINISH_INFO_PATH = DIRNAME / "./data_finish_info"  # per-day completion logs
DOC_TYPE = ["Tick", "Order", "OrderQueue", "Transaction", "Day", "Minute"]  # document types accepted by add_data
MAX_SIZE = 3000 * 1024 * 1024 * 1024  # quota value handed to Arctic.set_quota for every library
ALL_STOCK_PATH = DATABASE_PATH / "all.txt"  # tab-separated listing; the first column is the symbol name
ARCTIC_SRV = "127.0.0.1"  # host passed to Arctic(...) and MongoClient(...)
def get_library_name(doc_type):
    """Return the Arctic library name used for ``doc_type``.

    The name is the lower-cased document type, except that "Tick" (compared
    case-insensitively) maps to the plural form "ticks".
    """
    lowered = str.lower(doc_type)
    return "ticks" if lowered == "tick" else lowered
def is_stock(exchange_place, code):
    """Tell whether ``code`` looks like a stock code on the given exchange.

    - "SH": the code must start with "6".
    - "SZ": the code must start with "0" or with "30".
    - any other exchange value: always True.
    """
    if exchange_place == "SH":
        return code[0] == "6"
    if exchange_place == "SZ":
        return code[0] == "0" or code[:2] == "30"
    return True
def add_one_stock_daily_data(filepath, type, exchange_place, arc, date):
    """Load one stock's CSV file and store its rows in the matching Arctic library.

    Parameters
    ----------
    filepath :
        Path of the csv file; the file name without ``.csv`` is used as the stock code.
    type :
        Document type, e.g. "Tick", "Order", "OrderQueue", ...
    exchange_place :
        "SZ" or "SH".
    arc :
        Arctic connection created by the calling process.
    date :
        Only used in the progress messages printed by this function.

    Returns
    -------
    list or None
        The row numbers whose date/time could not be parsed (those rows are
        dropped before storing).  ``None`` when the file is skipped because
        its code does not look like a stock of ``exchange_place``.
    """
    code = os.path.split(filepath)[-1].split(".csv")[0]
    if exchange_place == "SH" and code[0] != "6":
        return
    if exchange_place == "SZ" and code[0] != "0" and code[:2] != "30":
        return

    df = pd.read_csv(filepath, encoding="gbk", dtype={"code": str})

    def format_time(day, hms):
        day = str(day)
        hms = str(hms)
        # A leading "1" means the hour has two digits (>= 10 o'clock); otherwise it has one.
        hour_width = 2 if hms[0] == "1" else 1
        minute_end = hour_width + 2
        second_end = minute_end + 2
        return (
            "-".join([day[0:4], day[4:6], day[6:8]])
            + " "
            + ":".join([hms[:hour_width], hms[hour_width:minute_end], hms[minute_end:second_end] + "." + hms[second_end:]])
        )

    # Parse every timestamp exactly once: keep the good ones, remember the row
    # number of the bad ones so that the entire row can be discarded.
    parsed_timestamps = []
    error_index_list = []
    for index, (day, hms) in enumerate(zip(df["date"], df["time"])):
        try:
            parsed_timestamps.append(pd.Timestamp(format_time(day, hms)))
        except Exception:
            error_index_list.append(index)  # The row number of the error line

    # to-do: writing to logs
    if error_index_list:
        print("error: {}, {}".format(filepath, len(error_index_list)))
        df = df.drop(error_index_list)

    # Replace the raw date/time columns by a proper datetime index named "date".
    df = df.drop(columns=["date", "time", "name", "code", "wind_code"])
    df["date"] = pd.DatetimeIndex(parsed_timestamps)
    df.set_index("date", inplace=True)

    if str.lower(type) == "orderqueue":
        # Collapse ab1~ab50 into one comma-separated column holding the first `ab_items` values.
        # int(...) guards against iterrows() upcasting the count to float.
        df["ab"] = [
            ",".join([str(int(row["ab" + str(i + 1)])) for i in range(int(row["ab_items"]))])
            for _, row in df.iterrows()
        ]
        df = df.drop(columns=["ab" + str(i) for i in range(1, 51)])

    lib_name = get_library_name(type)
    # arc.initialize_library(lib_name, lib_type=CHUNK_STORE)
    lib = arc[lib_name]

    symbol = "".join([exchange_place, code])
    if symbol in lib.list_symbols():
        print("update {0}, date={1}".format(symbol, date))
        if df.empty:
            return error_index_list
        lib.update(symbol, df, chunk_size="D")
    else:
        print("write {0}, date={1}".format(symbol, date))
        lib.write(symbol, df, chunk_size="D")
    return error_index_list
def add_one_stock_daily_data_wrapper(filepath, type, exchange_place, index, date):
    """Import one csv file with its own Arctic connection and log any problem.

    Opens a fresh ``Arctic`` connection, delegates to
    ``add_one_stock_daily_data`` and always resets the connection afterwards.
    Rows with unparsable timestamps are recorded in a
    ``temp_timestamp_error_*`` file and any exception in a ``temp_fail_*``
    file, both under ``LOG_FILE_PATH``; exceptions are not re-raised.

    ``index`` is only used to print a progress line every 100 files.
    """
    pid = os.getpid()
    code = os.path.split(filepath)[-1].split(".csv")[0]
    arc = Arctic(ARCTIC_SRV)
    try:
        if index % 100 == 0:
            print("index = {}, filepath = {}".format(index, filepath))
        error_index_list = add_one_stock_daily_data(filepath, type, exchange_place, arc, date)
        if error_index_list:  # neither None (skipped file) nor empty
            error_log = os.path.join(LOG_FILE_PATH, "temp_timestamp_error_{0}_{1}_{2}.txt".format(pid, date, type))
            # `with` guarantees the handle is closed even if the write fails
            with open(error_log, "a+") as f:
                f.write("{}, {}, {}\n".format(filepath, error_index_list, exchange_place + "_" + code))
    except Exception as e:
        info = traceback.format_exc()
        print("error:" + str(e))
        fail_log = os.path.join(LOG_FILE_PATH, "temp_fail_{0}_{1}_{2}.txt".format(pid, date, type))
        with open(fail_log, "a+") as f:
            f.write("fail:" + str(filepath) + "\n" + str(e) + "\n" + str(info) + "\n")
    finally:
        arc.reset()
def add_data(tick_date, doc_type, stock_name_dict):
    """Import all csv files of one trading day and one document type.

    Steps:
    1. copy ``<tick_date>_<doc_type>.tar.gz`` from ``DATABASE_PATH`` to ``DATA_PATH``
       and extract its ``SH`` and ``SZ`` sub-folders there;
    2. keep the csv files whose code is listed in ``stock_name_dict``;
    3. import them in parallel through ``add_one_stock_daily_data_wrapper``;
    4. remove the copied archive and the extracted folder and write a finish log.

    Parameters
    ----------
    tick_date : str
        Date string used in the archive and folder names (e.g. "20201231").
    doc_type : str
        One of ``DOC_TYPE``; otherwise a message is printed and nothing is done.
    stock_name_dict : dict
        ``{"SH": [...codes...], "SZ": [...codes...]}``.

    Any exception is caught, printed and appended to a ``temp_fail_*`` file
    under ``LOG_FILE_PATH``.
    """
    # Local import so that this function brings its own dependency into scope.
    import subprocess

    pid = os.getpid()
    if doc_type not in DOC_TYPE:
        print("doc_type not in {}".format(DOC_TYPE))
        return

    def run(*args):
        # Argument-list form (no shell): paths containing spaces or shell
        # metacharacters are passed through verbatim.  As with the previous
        # os.system calls, a non-zero exit status is deliberately ignored.
        subprocess.run([str(arg) for arg in args], check=False)

    try:
        begin_time = time.time()
        day_dir_name = tick_date + "_" + doc_type
        archive_name = day_dir_name + ".tar.gz"
        archive_path = DATA_PATH / archive_name
        day_dir = DATA_PATH / day_dir_name

        run("cp", DATABASE_PATH / archive_name, f"{DATA_PATH}/")
        for exchange in ("SH", "SZ"):
            run("tar", "-xvzf", archive_path, "-C", f"{DATA_PATH}/", f"{day_dir_name}/{exchange}")
        for path in (
            DATA_PATH,
            day_dir,
            day_dir / "SH",
            day_dir / "SZ",
            day_dir / "SH" / tick_date,
            day_dir / "SZ" / tick_date,
        ):
            run("chmod", "777", path)

        print("tick_date={}".format(tick_date))

        temp_data_path_sh = os.path.join(DATA_PATH, day_dir_name, "SH", tick_date)
        temp_data_path_sz = os.path.join(DATA_PATH, day_dir_name, "SZ", tick_date)

        # Only codes that both exist on disk and are listed in stock_name_dict are imported.
        sz_files = set()
        if os.path.exists(temp_data_path_sz):
            sz_files = {
                i.split(".csv")[0] for i in os.listdir(temp_data_path_sz) if i[:2] == "30" or i[0] == "0"
            } & set(stock_name_dict["SZ"])
        sz_file_nums = len(sz_files)

        sh_files = set()
        if os.path.exists(temp_data_path_sh):
            sh_files = {i.split(".csv")[0] for i in os.listdir(temp_data_path_sh) if i[0] == "6"} & set(
                stock_name_dict["SH"]
            )
        sh_file_nums = len(sh_files)
        print("sz_file_nums:{}, sh_file_nums:{}".format(sz_file_nums, sh_file_nums))

        with (DATA_INFO_PATH / "data_info_log_{}_{}".format(doc_type, tick_date)).open("w+") as f:
            f.write("sz:{}, sh:{}, date:{}:".format(sz_file_nums, sh_file_nums, tick_date) + "\n")

        for exchange, data_path, names in (
            ("SH", temp_data_path_sh, sh_files),
            ("SZ", temp_data_path_sz, sz_files),
        ):
            if names:
                # write is not thread-safe, update may be thread-safe
                Parallel(n_jobs=N_JOBS)(
                    delayed(add_one_stock_daily_data_wrapper)(
                        os.path.join(data_path, name + ".csv"), doc_type, exchange, index, tick_date
                    )
                    for index, name in enumerate(names)
                )

        # Clean up the copied archive and the extracted files.
        run("rm", "-f", archive_path)
        run("rm", "-rf", day_dir)

        total_time = time.time() - begin_time
        with (DATA_FINISH_INFO_PATH / "data_info_finish_log_{}_{}".format(doc_type, tick_date)).open("w+") as f:
            f.write("finish: date:{}, consume_time:{}, end_time: {}".format(tick_date, total_time, time.time()) + "\n")
    except Exception as e:
        info = traceback.format_exc()
        print("date error:" + str(e))
        fail_log = os.path.join(LOG_FILE_PATH, "temp_fail_{0}_{1}_{2}.txt".format(pid, tick_date, doc_type))
        with open(fail_log, "a+") as f:
            f.write("fail:" + str(tick_date) + "\n" + str(e) + "\n" + str(info) + "\n")
class DSCreator:
    """Dataset creator

    Its public methods are meant to be used as command line sub-commands
    (``clear``, ``initialize_library``, ``import_data``).
    """

    def clear(self):
        """Drop the whole "arctic" database from the MongoDB server at ``ARCTIC_SRV``."""
        client = MongoClient(ARCTIC_SRV)
        try:
            client.drop_database("arctic")
        finally:
            # release the connection instead of leaving it to the garbage collector
            client.close()

    def initialize_library(self):
        """Create one CHUNK_STORE library per entry of ``DOC_TYPE``."""
        arc = Arctic(ARCTIC_SRV)
        for doc_type in DOC_TYPE:
            arc.initialize_library(get_library_name(doc_type), lib_type=CHUNK_STORE)

    def _get_empty_folder(self, fp: Path):
        """Make sure ``fp`` exists and is empty: any existing content is removed first."""
        fp = Path(fp)
        if fp.exists():
            shutil.rmtree(fp)
        fp.mkdir(parents=True, exist_ok=True)

    def import_data(self, doc_type_l=["Tick", "Transaction", "Order"]):
        """Import the example data of every document type in ``doc_type_l``.

        For each document type a placeholder row is written for every stock of
        ``ALL_STOCK_PATH`` that the library does not contain yet, then the
        daily archives are imported with ``add_data``.

        NOTE: the default list is never mutated here, so sharing it between calls is harmless.
        """
        # clear all the old files
        for fp in LOG_FILE_PATH, DATA_INFO_PATH, DATA_FINISH_INFO_PATH, DATA_PATH:
            self._get_empty_folder(fp)

        arc = Arctic(ARCTIC_SRV)
        for doc_type in DOC_TYPE:
            # arc.initialize_library(get_library_name(doc_type), lib_type=CHUNK_STORE)
            arc.set_quota(get_library_name(doc_type), MAX_SIZE)
        arc.reset()

        # doc_type = 'Day'
        for doc_type in doc_type_l:
            date_list = sorted({int(path.split("_")[0]) for path in os.listdir(DATABASE_PATH) if doc_type in path})
            date_list = [str(d) for d in date_list]

            with open(ALL_STOCK_PATH, "r") as f:
                stock_name_list = [lines.split("\t")[0] for lines in f.readlines()]
            stock_name_dict = {
                "SH": [stock_name[2:] for stock_name in stock_name_list if "SH" in stock_name],
                "SZ": [stock_name[2:] for stock_name in stock_name_list if "SZ" in stock_name],
            }

            lib_name = get_library_name(doc_type)
            a = Arctic(ARCTIC_SRV)
            # a.initialize_library(lib_name, lib_type=CHUNK_STORE)

            # a set makes the per-stock membership test O(1) instead of a list scan
            stock_name_exist = set(a[lib_name].list_symbols())
            lib = a[lib_name]
            initialize_count = 0
            for stock_name in stock_name_list:
                if stock_name not in stock_name_exist:
                    initialize_count += 1
                    # A placeholder for stocks
                    pdf = pd.DataFrame(index=[pd.Timestamp("1900-01-01")])
                    pdf.index.name = "date"  # an col named date is necessary
                    lib.write(stock_name, pdf)
            print("initialize count: {}".format(initialize_count))
            print("tasks: {}".format(date_list))
            a.reset()

            # date_list = [files.split("_")[0] for files in os.listdir("./raw_data_price") if "tar" in files]
            # print(len(date_list))
            # NOTE: the dates discovered above are only printed; the example imports this single day.
            date_list = ["20201231"]  # for test
            Parallel(n_jobs=min(2, len(date_list)))(
                delayed(add_data)(tick_date, doc_type, stock_name_dict) for tick_date in date_list
            )
if __name__ == "__main__":
    # Hand DSCreator to python-fire so its methods can be invoked from the command line,
    # e.g. `python create_dataset.py initialize_library` or `python create_dataset.py import_data`.
    fire.Fire(DSCreator)

View File

@@ -0,0 +1,308 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from arctic.arctic import Arctic
import qlib
from qlib.data import D
import unittest
class TestClass(unittest.TestCase):
    """
    Useful commands
    - run all tests: pytest examples/orderbook_data/example.py
    - run a single test: pytest -s --pdb --disable-warnings examples/orderbook_data/example.py::TestClass::test_basic01
    """

    def setUp(self):
        """
        Configure for arctic
        """
        provider_uri = "~/.qlib/qlib_data/yahoo_cn_1min"
        qlib.init(
            provider_uri=provider_uri,
            mem_cache_size_limit=1024 ** 3 * 2,
            mem_cache_type="sizeof",
            kernels=1,
            expression_provider={"class": "LocalExpressionProvider", "kwargs": {"time2idx": False}},
            feature_provider={"class": "ArcticFeatureProvider", "kwargs": {"uri": "127.0.0.1"}},
            dataset_provider={
                "class": "LocalDatasetProvider",
                "kwargs": {
                    "align_time": False,  # Order book is not fixed, so it can't be align to a shared fixed frequency calendar
                },
            },
        )
        # self.stocks_list = ["SH600519"]
        self.stocks_list = ["SZ000725"]

    def test_basic(self):
        """Query raw level-1/level-2 ask and bid fields at tick frequency."""
        # NOTE: this data contains a lot of zeros in $askX and $bidX
        df = D.features(
            self.stocks_list,
            fields=["$ask1", "$ask2", "$bid1", "$bid2"],
            freq="ticks",
            start_time="20201230",
            end_time="20210101",
        )
        print(df)

    def test_basic_without_time(self):
        """Query a raw tick field without specifying a time range."""
        df = D.features(self.stocks_list, fields=["$ask1"], freq="ticks")
        print(df)

    def test_basic01(self):
        """Resample a tick field to 1 minute."""
        df = D.features(
            self.stocks_list,
            fields=["TResample($ask1, '1min', 'last')"],
            freq="ticks",
            start_time="20201230",
            end_time="20210101",
        )
        print(df)

    def test_basic02(self):
        """Query a raw field at transaction frequency."""
        df = D.features(
            self.stocks_list,
            fields=["$function_code"],
            freq="transaction",
            start_time="20201230",
            end_time="20210101",
        )
        print(df)

    def test_basic03(self):
        """Query a raw field at order frequency."""
        df = D.features(
            self.stocks_list,
            fields=["$function_code"],
            freq="order",
            start_time="20201230",
            end_time="20210101",
        )
        print(df)

    # Here are some popular expressions for high-frequency
    # 1) some shared expression
    expr_sum_buy_ask_1 = "(TResample($ask1, '1min', 'last') + TResample($bid1, '1min', 'last'))"
    total_volume = (
        "TResample("
        + "+".join([f"${name}{i}" for i in range(1, 11) for name in ["asize", "bsize"]])
        + ", '1min', 'sum')"
    )

    @staticmethod
    def total_func(name, method):
        """Build the 1-minute resample (with ``method``) of the sum of ``$<name>1`` .. ``$<name>10``."""
        return "TResample(" + "+".join([f"${name}{i}" for i in range(1, 11)]) + ",'1min', '{}')".format(method)

    def test_exp_01(self):
        exprs = []
        names = []
        for name in ["asize", "bsize"]:
            for i in range(1, 11):
                exprs.append(f"TResample(${name}{i}, '1min', 'mean') / ({self.total_volume})")
                names.append(f"v_{name}_{i}")
        df = D.features(self.stocks_list, fields=exprs, freq="ticks")
        df.columns = names
        print(df)

    # 2) some often used papers;
    def test_exp_02(self):
        def spread_func(index):
            return f"2 * TResample($ask{index} - $bid{index}, '1min', 'last') / {self.expr_sum_buy_ask_1}"

        def mid_func(index):
            return f"2 * TResample(($ask{index} + $bid{index})/2, '1min', 'last') / {self.expr_sum_buy_ask_1}"

        exprs = []
        names = []
        for i in range(1, 11):
            exprs.extend([spread_func(i), mid_func(i)])
            names.extend([f"p_spread_{i}", f"p_mid_{i}"])
        df = D.features(self.stocks_list, fields=exprs, freq="ticks")
        df.columns = names
        print(df)

    def test_exp_03(self):
        def expr3_func1(name, index_left, index_right):
            return f"2 * TResample(Abs(${name}{index_left} - ${name}{index_right}), '1min', 'last') / {self.expr_sum_buy_ask_1}"

        # Accumulate every adjacent-level difference; the lists must be created
        # once, outside the loops, otherwise only the last pair would survive.
        exprs = []
        names = []
        for name in ["ask", "bid"]:
            for i in range(1, 10):
                exprs.append(expr3_func1(name, i + 1, i))
                names.append(f"p_diff_{name}_{i}_{i+1}")
        exprs.extend([expr3_func1("ask", 10, 1), expr3_func1("bid", 1, 10)])
        names.extend(["p_diff_ask_10_1", "p_diff_bid_1_10"])
        df = D.features(self.stocks_list, fields=exprs, freq="ticks")
        df.columns = names
        print(df)

    def test_exp_04(self):
        exprs = []
        names = []
        for name in ["asize", "bsize"]:
            exprs.append(f"(({ self.total_func(name, 'mean')}) / 10) / {self.total_volume}")
            names.append(f"v_avg_{name}")

        df = D.features(self.stocks_list, fields=exprs, freq="ticks")
        df.columns = names
        print(df)

    def test_exp_05(self):
        exprs = [
            f"2 * Sub({ self.total_func('ask', 'last')}, {self.total_func('bid', 'last')})/{self.expr_sum_buy_ask_1}",
            f"Sub({ self.total_func('asize', 'mean')}, {self.total_func('bsize', 'mean')})/{self.total_volume}",
        ]
        names = ["p_accspread", "v_accspread"]

        df = D.features(self.stocks_list, fields=exprs, freq="ticks")
        df.columns = names
        print(df)

    # (p|v)_diff_(ask|bid|asize|bsize)_(time_interval)
    def test_exp_06(self):
        t = 3

        def expr6_price_func(name, index, method):
            return f'2 * (TResample(${name}{index}, "{t}s", "{method}") - Ref(TResample(${name}{index}, "{t}s", "{method}"), 1)) / {t}'

        exprs = []
        names = []
        for i in range(1, 11):
            for name in ["bid", "ask"]:
                exprs.append(
                    f"TResample({expr6_price_func(name, i, 'last')}, '1min', 'mean') / {self.expr_sum_buy_ask_1}"
                )
                names.append(f"p_diff_{name}{i}_{t}s")

        for i in range(1, 11):
            for name in ["asize", "bsize"]:
                exprs.append(f"TResample({expr6_price_func(name, i, 'mean')}, '1min', 'mean') / {self.total_volume}")
                names.append(f"v_diff_{name}{i}_{t}s")

        df = D.features(self.stocks_list, fields=exprs, freq="ticks")
        df.columns = names
        print(df)

    # TODOs:
    # Following expressions may be implemented in the future
    # expr7_2 = lambda funccode, bsflag, time_interval: \
    #    "TResample(TRolling(TEq(@transaction.function_code,  {}) & TEq(@transaction.bs_flag ,{}), '{}s', 'sum') / \
    #    TRolling(@transaction.function_code, '{}s', 'count') , '1min', 'mean')".format(ord(funccode), bsflag,time_interval,time_interval)
    # create_dataset(7, "SH600000", [expr7_2("C")] + [expr7(funccode, ordercode) for funccode in ['B','S'] for ordercode in ['0','1']])
    # create_dataset(7,  ["SH600000"], [expr7_2("C", 48)] )

    @staticmethod
    def expr7_init(funccode, ordercode, time_interval):
        # NOTE: based on on order frequency (i.e. freq="order")
        return f"Rolling(Eq($function_code,  {ord(funccode)}) & Eq($order_kind ,{ord(ordercode)}), '{time_interval}s', 'sum') / Rolling($function_code, '{time_interval}s', 'count')"

    # (la|lb|ma|mb|ca|cb)_intensity_(time_interval)
    def test_exp_07_1(self):
        # NOTE: based on transaction frequency (i.e. freq="transaction")
        def expr7_3(funccode, code, time_interval):
            # Reuses the shared builder; the " ," spacing keeps the expression text unchanged.
            return f"TResample({self.expr7_3_init(funccode, code, time_interval)} , '1min', 'mean')"

        exprs = [expr7_3("C", "Gt", "3"), expr7_3("C", "Lt", "3")]
        names = ["ca_intensity_3s", "cb_intensity_3s"]

        df = D.features(self.stocks_list, fields=exprs, freq="transaction")
        df.columns = names
        print(df)

    trans_dict = {"B": "a", "S": "b", "0": "l", "1": "m"}

    def test_exp_07_2(self):
        # NOTE: based on on order frequency
        def expr7(funccode, ordercode, time_interval):
            return f"TResample({self.expr7_init(funccode, ordercode, time_interval)}, '1min', 'mean')"

        exprs = []
        names = []
        for funccode in ["B", "S"]:
            for ordercode in ["0", "1"]:
                exprs.append(expr7(funccode, ordercode, "3"))
                names.append(self.trans_dict[ordercode] + self.trans_dict[funccode] + "_intensity_3s")
        # expr7_init is an order-frequency expression, so it is evaluated on the
        # "order" data, like the other tests built on expr7_init.
        df = D.features(self.stocks_list, fields=exprs, freq="order")
        df.columns = names
        print(df)

    @staticmethod
    def expr7_3_init(funccode, code, time_interval):
        # NOTE: It depends on transaction frequency
        return f"Rolling(Eq($function_code, {ord(funccode)}) & {code}($ask_order, $bid_order) , '{time_interval}s', 'sum') / Rolling($function_code, '{time_interval}s', 'count')"

    # (la|lb|ma|mb|ca|cb)_relative_intensity_(time_interval_small)_(time_interval_big)
    def test_exp_08_1(self):
        def expr8_1(funccode, ordercode, time_interval_short, time_interval_long):
            return f"TResample(Gt({self.expr7_init(funccode, ordercode, time_interval_short)},{self.expr7_init(funccode, ordercode, time_interval_long)}), '1min', 'mean')"

        exprs = []
        names = []
        for funccode in ["B", "S"]:
            for ordercode in ["0", "1"]:
                exprs.append(expr8_1(funccode, ordercode, "10", "900"))
                names.append(self.trans_dict[ordercode] + self.trans_dict[funccode] + "_relative_intensity_10s_900s")

        df = D.features(self.stocks_list, fields=exprs, freq="order")
        df.columns = names
        print(df)

    def test_exp_08_2(self):
        # NOTE: It depends on transaction frequency
        def expr8_2(funccode, ordercode, time_interval_short, time_interval_long):
            return f"TResample(Gt({self.expr7_3_init(funccode, ordercode, time_interval_short)},{self.expr7_3_init(funccode, ordercode, time_interval_long)}), '1min', 'mean')"

        exprs = [expr8_2("C", "Gt", "10", "900"), expr8_2("C", "Lt", "10", "900")]
        names = ["ca_relative_intensity_10s_900s", "cb_relative_intensity_10s_900s"]

        df = D.features(self.stocks_list, fields=exprs, freq="transaction")
        df.columns = names
        print(df)

    ## v9(la|lb|ma|mb|ca|cb)_diff_intensity_(time_interval1)_(time_interval2)
    # 1) calculating the original data
    # 2) Resample data to 3s and calculate the changing rate
    # 3) Resample data to 1min

    def test_exp_09_trans(self):
        exprs = [
            f'TResample(Div(Sub(TResample({self.expr7_3_init("C", "Gt", "3")}, "3s", "last"), Ref(TResample({self.expr7_3_init("C", "Gt", "3")}, "3s","last"), 1)), 3), "1min", "mean")',
            f'TResample(Div(Sub(TResample({self.expr7_3_init("C", "Lt", "3")}, "3s", "last"), Ref(TResample({self.expr7_3_init("C", "Lt", "3")}, "3s","last"), 1)), 3), "1min", "mean")',
        ]
        names = ["ca_diff_intensity_3s_3s", "cb_diff_intensity_3s_3s"]
        df = D.features(self.stocks_list, fields=exprs, freq="transaction")
        df.columns = names
        print(df)

    def test_exp_09_order(self):
        exprs = []
        names = []
        for funccode in ["B", "S"]:
            for ordercode in ["0", "1"]:
                exprs.append(
                    f'TResample(Div(Sub(TResample({self.expr7_init(funccode, ordercode, "3")}, "3s", "last"), Ref(TResample({self.expr7_init(funccode, ordercode, "3")},"3s", "last"), 1)), 3) ,"1min", "mean")'
                )
                names.append(self.trans_dict[ordercode] + self.trans_dict[funccode] + "_diff_intensity_3s_3s")
        df = D.features(self.stocks_list, fields=exprs, freq="order")
        df.columns = names
        print(df)

    def test_exp_10(self):
        exprs = []
        names = []
        for i in [5, 10, 30, 60]:
            exprs.append(
                f'TResample(Ref(TResample($ask1 + $bid1, "1s", "ffill"), {-i}) / TResample($ask1 + $bid1, "1s", "ffill") - 1, "1min", "mean" )'
            )
            # one plain string per expression (a generator object is not a usable column name)
            names.append(f"lag_{i}_change_rate")
        df = D.features(self.stocks_list, fields=exprs, freq="ticks")
        df.columns = names
        print(df)
if __name__ == "__main__":
    # Allow running the examples directly with `python example.py` in addition to pytest.
    unittest.main()