1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-03 19:10:58 +08:00
Files
qlib/qlib/rl/order_execution/interpreter.py
Yuge Zhang 9a40fd3cdc Qlib RL framework (stage 1) - single-asset order execution (#1076)
* rl init

* aux info

* Reward config

* update

* simple

* update saoe init

* update simulator and seed

* minor

* minor

* update sim

* checkpoint

* obs

* Update interpreter

* init qlib simulator

* checkpoint

* Refine codebase

* checkpoint

* checkpoint

* Add one test

* More tests

* Simulator checkpoint

* checkpoint

* First-step tested

* Checkpoint

* Update data_queue API

* Checkpoint

* Update test

* Move files

* Checkpoint

* Single-quote -> double-quote

* Fix finite env tests

* Tested with mypy

* pep-574

* No call for env done

* Update finite env docs

* Fix csv writer

* Refine tester

* Update logger

* Add another logger test

* Checkpoint

* Add network sanity test

* steps per episode is not correct

* Cleanup code, ready for PR

* Reformat with black

* Fix pylint for py37

* Fix lint

* Fix lint

* Fix flake

* update mypy command

* mypy

* Update exclude pattern

* Use pyproject.toml

* test

* .

* .

* Refactor pipeline

* .

* defaults run bash

* .

* Revert and skip follow_imports

* Fix toml issue

* fix mypy

* .

* .

* .

* Fix install

* Minor fix

* Fix test

* Fix test

* Remove requirements

* Revert

* fix tests

* Fix lint

* .

* .

* .

* .

* .

* update install from source command

* .

* Fix data download

* .

* .

* .

* .

* .

* .

* Fix py37

* Ignore tests on non-linux

* resolve comments

* fix tests

* resolve comments

* some typo

* style updates

* More comments

* fix dummy

* add warning

* Align precision in some system

* Added some impl notes

Co-authored-by: Young <afe.young@gmail.com>
2022-05-21 18:19:24 +08:00

223 lines
8.2 KiB
Python

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import annotations
import math
from pathlib import Path
from typing import Any, cast
import numpy as np
import pandas as pd
from gym import spaces
from qlib.constant import EPS
from qlib.rl.interpreter import StateInterpreter, ActionInterpreter
from qlib.rl.data import pickle_styled
from qlib.typehint import TypedDict
from .simulator_simple import SAOEState
# Public API of this module: the names exported by a star-import.
# These are the two state interpreters and the two action interpreters defined in this file.
__all__ = [
"FullHistoryStateInterpreter",
"CurrentStepStateInterpreter",
"CategoricalActionInterpreter",
"TwapRelativeActionInterpreter",
]
def canonicalize(value: int | float | np.ndarray | pd.DataFrame | dict) -> np.ndarray | dict:
    """Recursively convert a value to 32-bit numeric numpy types.

    Parameters
    ----------
    value
        A scalar, a numpy array, a data frame, or a (possibly nested) dict of those.

    Returns
    -------
    np.ndarray | dict
        * A data frame is turned into a numpy array and then handled like any other array.
        * Floats and floating-point arrays become ``float32`` arrays.
        * Ints, bools and signed-integer arrays become ``int32`` arrays.
        * A dict is rebuilt with every value canonicalized (keys are kept as they are).
        * Anything else (e.g. unsigned / boolean / object arrays, strings) is returned unchanged.
    """
    if isinstance(value, pd.DataFrame):
        # Send the frame's array through the same dtype rules instead of returning it raw;
        # otherwise a float64 / int64 frame would escape the 32-bit contract stated above.
        return canonicalize(value.to_numpy())
    if isinstance(value, (float, np.floating)) or (isinstance(value, np.ndarray) and value.dtype.kind == "f"):
        return np.array(value, dtype=np.float32)
    if isinstance(value, (int, bool, np.integer)) or (isinstance(value, np.ndarray) and value.dtype.kind == "i"):
        return np.array(value, dtype=np.int32)
    if isinstance(value, dict):
        return {k: canonicalize(v) for k, v in value.items()}
    return value
class FullHistoryObs(TypedDict):
    """Keys of the observation produced by ``FullHistoryStateInterpreter.interpret``.

    All values are annotated as ``Any``; the notes below describe what the
    interpreter puts under each key before the values are canonicalized.
    """

    # Today's processed data, with every row from the current time onwards set to zero.
    data_processed: Any
    # Yesterday's processed data, passed through unmasked.
    data_processed_prev: Any
    # Whether the order direction is BUY.
    acquiring: Any
    # Number of ticks strictly before the current time, capped at ``data_ticks - 1``.
    cur_tick: Any
    # Current step reported by the environment status, capped at ``max_step - 1``.
    cur_step: Any
    # The interpreter's ``max_step``.
    num_step: Any
    # Amount of the order.
    target: Any
    # Position held in the current state.
    position: Any
    # Order amount followed by the ``position`` column of the executed steps, cut to ``max_step`` entries.
    position_history: Any
class FullHistoryStateInterpreter(StateInterpreter[SAOEState, FullHistoryObs]):
    """Observe the whole history: today (up to the current moment) and yesterday.

    Parameters
    ----------
    data_dir
        Directory from which the feature-engineered data is loaded.
    max_step
        Upper-bound estimate of the number of steps. For example, 390min / 30min-per-step = 13 steps.
    data_ticks
        Total number of records. For example, in per-minute SAOE this is the length of the day in minutes.
    data_dim
        Number of dimensions in the data.
    """

    def __init__(self, data_dir: Path, max_step: int, data_ticks: int, data_dim: int) -> None:
        self.data_dir, self.max_step = data_dir, max_step
        self.data_ticks, self.data_dim = data_ticks, data_dim

    def interpret(self, state: SAOEState) -> FullHistoryObs:
        """Turn a simulator state into a canonicalized ``FullHistoryObs``."""
        order = state.order
        processed = pickle_styled.load_intraday_processed_data(
            self.data_dir,
            order.stock_id,
            pd.Timestamp(order.start_time.date()),
            self.data_dim,
            state.ticks_index,
        )

        # Slot 0 is the order amount; the following slots are the ``position`` column of the executed steps.
        executed_steps = state.history_steps
        position_history = np.zeros(self.max_step + 1, dtype=np.float32)
        position_history[0] = order.amount
        position_history[1 : len(executed_steps) + 1] = executed_steps["position"].to_numpy()

        assert self.env is not None

        # The ``min`` calls and the final slice keep every index inside the declared observation space,
        # even for the observation taken after the simulator's last step (the done step),
        # so that the policy network always receives in-range values.
        masked_today = self._mask_future_info(processed.today, state.cur_time)
        elapsed_ticks = np.sum(state.ticks_index < state.cur_time)
        clipped_step = min(self.env.status["cur_step"], self.max_step - 1)

        raw_obs = {
            "data_processed": masked_today,
            "data_processed_prev": processed.yesterday,
            "acquiring": order.direction == order.BUY,
            "cur_tick": min(elapsed_ticks, self.data_ticks - 1),
            "cur_step": clipped_step,
            "num_step": self.max_step,
            "target": order.amount,
            "position": state.position,
            "position_history": position_history[: self.max_step],
        }
        return cast(FullHistoryObs, canonicalize(raw_obs))

    @property
    def observation_space(self):
        """Gym ``Dict`` space with one entry per key of the observation."""
        history_shape = (self.data_ticks, self.data_dim)
        last_tick = self.data_ticks - 1
        last_step = self.max_step - 1
        return spaces.Dict(
            {
                "data_processed": spaces.Box(-np.inf, np.inf, shape=history_shape),
                "data_processed_prev": spaces.Box(-np.inf, np.inf, shape=history_shape),
                "acquiring": spaces.Discrete(2),
                "cur_tick": spaces.Box(0, last_tick, shape=(), dtype=np.int32),
                "cur_step": spaces.Box(0, last_step, shape=(), dtype=np.int32),
                # TODO: support arbitrary length index
                "num_step": spaces.Box(self.max_step, self.max_step, shape=(), dtype=np.int32),
                "target": spaces.Box(-EPS, np.inf, shape=()),
                "position": spaces.Box(-EPS, np.inf, shape=()),
                "position_history": spaces.Box(-EPS, np.inf, shape=(self.max_step,)),
            }
        )

    @staticmethod
    def _mask_future_info(arr: pd.DataFrame, current: pd.Timestamp) -> pd.DataFrame:
        """Return a deep copy of ``arr`` with all rows from ``current`` onwards (inclusive) set to zero."""
        masked = arr.copy(deep=True)
        masked.loc[current:] = 0.0
        return masked
class CurrentStateObs(TypedDict):
    """Keys of the observation produced by ``CurrentStepStateInterpreter.interpret``."""

    # Whether the order direction is BUY.
    acquiring: bool
    # Current step as reported by the environment status.
    cur_step: int
    # The interpreter's ``max_step``.
    num_step: int
    # Amount of the order.
    target: float
    # Position held in the current state.
    position: float
class CurrentStepStateInterpreter(StateInterpreter[SAOEState, CurrentStateObs]):
    """Observation built from the current step only.

    Meant for policies that depend on the latest state and not on history.
    The list of keys is intentionally short; add more if your policy needs more information.
    """

    def __init__(self, max_step: int):
        self.max_step = max_step

    @property
    def observation_space(self):
        """Gym ``Dict`` space with one entry per key of ``CurrentStateObs``."""
        total_steps = self.max_step
        return spaces.Dict(
            {
                "acquiring": spaces.Discrete(2),
                "cur_step": spaces.Box(0, total_steps - 1, shape=(), dtype=np.int32),
                "num_step": spaces.Box(total_steps, total_steps, shape=(), dtype=np.int32),
                "target": spaces.Box(-EPS, np.inf, shape=()),
                "position": spaces.Box(-EPS, np.inf, shape=()),
            }
        )

    def interpret(self, state: SAOEState) -> CurrentStateObs:
        """Pick the current-step fields out of ``state`` and the environment status."""
        assert self.env is not None
        cur_step = self.env.status["cur_step"]
        # NOTE(review): ``cur_step == max_step`` passes this check but lies outside the
        # ``Box(0, max_step - 1)`` declared in ``observation_space`` -- confirm this is intended.
        assert cur_step <= self.max_step
        return CurrentStateObs(
            acquiring=state.order.direction == state.order.BUY,
            cur_step=cur_step,
            num_step=self.max_step,
            target=state.order.amount,
            position=state.position,
        )
class CategoricalActionInterpreter(ActionInterpreter[SAOEState, int, float]):
    """Convert a discrete policy action to a continuous action, then multiply it by ``order.amount``.

    Parameters
    ----------
    values
        It can be a list of length $L$: $[a_1, a_2, \\ldots, a_L]$.
        Then when the policy gives decision $x$, $a_x$ times order amount is the output.
        It can also be a positive integer $n$ (Python or numpy integer), in which case
        the list of length $n+1$ is auto-generated, i.e., $[0, 1/n, 2/n, \\ldots, n/n]$.

    Raises
    ------
    ValueError
        If ``values`` is an integer that is not positive.
    """

    def __init__(self, values: int | list[float]):
        # numpy integers are accepted as well; otherwise they would be stored as a scalar
        # and ``len(self.action_values)`` would fail later.
        if isinstance(values, (int, np.integer)):
            if values <= 0:
                # 0 would divide by zero and a negative count would give an empty action list.
                raise ValueError(f"`values` must be a positive integer when given as an integer, got {values}.")
            num_bins = int(values)
            values = [i / num_bins for i in range(num_bins + 1)]
        self.action_values = values

    @property
    def action_space(self) -> spaces.Discrete:
        """Discrete space with one action per entry of ``action_values``."""
        return spaces.Discrete(len(self.action_values))

    def interpret(self, state: SAOEState, action: int) -> float:
        """Return the amount to trade for ``action``, never more than the current position."""
        assert 0 <= action < len(self.action_values)
        return min(state.position, state.order.amount * self.action_values[action])
class TwapRelativeActionInterpreter(ActionInterpreter[SAOEState, float, float]):
    """Convert a continuous ratio to a deal amount.

    The ratio is relative to TWAP over the remainder of the day.
    For example, with 5 steps left and a remaining position of 300,
    a TWAP strategy would trade 60 in each step.
    When this interpreter receives action $a$, its output is $60 \\cdot a$.
    """

    @property
    def action_space(self) -> spaces.Box:
        """Scalar, non-negative, unbounded-above ``float32`` box."""
        return spaces.Box(0, np.inf, shape=(), dtype=np.float32)

    def interpret(self, state: SAOEState, action: float) -> float:
        """Scale the per-step TWAP volume by ``action``, never exceeding the current position."""
        assert self.env is not None
        total_steps = math.ceil(len(state.ticks_for_order) / state.ticks_per_step)
        # NOTE(review): this is zero when ``cur_step`` equals the estimated total, which makes
        # the division below raise -- confirm callers never interpret an action at that point.
        steps_left = total_steps - self.env.status["cur_step"]
        per_step_volume = state.position / steps_left
        scaled_volume = per_step_volume * action
        return min(state.position, scaled_volume)