mirror of
https://github.com/microsoft/qlib.git
synced 2026-07-01 18:11:18 +08:00
482 lines
17 KiB
Python
482 lines
17 KiB
Python
import gym
|
|
|
|
gym.logger.set_level(40)
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pickle as pkl
|
|
import datetime
|
|
import random
|
|
import os
|
|
import json
|
|
import time
|
|
import tianshou as ts
|
|
import copy
|
|
from multiprocessing import Process, Pipe, Queue
|
|
from typing import List, Tuple, Union, Optional, Callable, Any
|
|
from tianshou.env.utils import CloudpickleWrapper
|
|
from scipy.stats import pearsonr
|
|
from sklearn.metrics import roc_auc_score
|
|
|
|
import sys
|
|
|
|
sys.path.append("..")
|
|
from util import merge_dicts, nan_weighted_avg, robust_auc
|
|
import reward
|
|
import observation
|
|
import action
|
|
|
|
# Numerical tolerance: remaining positions, traded volumes and fill-rate
# deviations smaller than this are treated as zero by the environments below.
ZERO = 1e-7
|
|
|
|
|
|
class StockEnv(gym.Env):
    """Single-asset environment

    One episode executes one order: ``target`` shares of instrument ``ins`` on
    ``date``.  Every call to :meth:`step` walks over up to ``time_interval``
    consecutive rows of the raw data frame, and the episode ends as soon as
    the remaining position drops below ``ZERO`` or ``interval_num`` steps
    have been taken.

    Config keys read by ``__init__``: ``max_step_num``, ``limit``,
    ``time_interval``, ``interval_num``, ``features``, ``obs``, ``action``,
    ``reward`` plus the optional ``offset`` (default ``0``), ``last_reward``
    (default ``None``) and ``log`` (default ``True``).
    """

    def __init__(self, config):
        """Build the observation, action and reward components from ``config``.

        :param config: mapping with the keys listed in the class docstring.
            ``config["reward"]`` maps a class name found in the ``reward``
            module to that class's config, which must contain ``coefficient``.
        """
        self.max_step_num = config["max_step_num"]
        self.limit = config["limit"]
        self.time_interval = config["time_interval"]
        self.interval_num = config["interval_num"]
        self.offset = config.get("offset", 0)
        self.last_reward = config.get("last_reward")
        self.log = config.get("log", True)

        # The observation config is completed in place, as before; these
        # assignments are idempotent so re-using ``config`` is harmless.
        obs_conf = config["obs"]["config"]
        obs_conf["features"] = config["features"]
        obs_conf["time_interval"] = self.time_interval
        obs_conf["max_step_num"] = self.max_step_num
        self.obs = getattr(observation, config["obs"]["name"])(obs_conf)
        self.action_func = getattr(action, config["action"]["name"])(config["action"]["config"])

        self.reward_func_list = []
        self.reward_log_dict = {}
        self.reward_coef = []
        for name, conf in config["reward"].items():
            # Work on a copy: popping from the caller's dict would make a
            # second environment built from the same config fail with KeyError.
            conf = dict(conf)
            self.reward_coef.append(conf.pop("coefficient"))
            self.reward_func_list.append(getattr(reward, name)(conf))
            self.reward_log_dict[name] = 0.0

        self.observation_space = self.obs.get_space()
        self.action_space = self.action_func.get_space()

    def toggle_log(self, log):
        """Enable or disable the per-row trade log built in ``reset``/``step``."""
        self.log = log

    def _trading_window(self):
        """Return the slice of raw-data rows that belong to the episode."""
        return slice(self.offset, self.offset + self.max_step_num)

    def _observe_state(self, *extra):
        """Call the observation builder with the current episode state."""
        return self.obs(
            self.raw_df,
            self.feature_dfs,
            self.t,
            self.interval,
            self.position,
            self.target,
            self.is_buy,
            self.max_step_num,
            self.interval_num,
            *extra,
        )

    def _report_bad_day_vwap(self):
        """Print diagnostics when ``day_vwap`` is not a finite number.

        This is a best-effort debugging aid: the episode continues afterwards,
        and a failure to write the dump file is reported instead of raised.
        """
        try:
            is_finite = bool(np.isfinite(self.day_vwap))
        except (TypeError, ValueError):
            is_finite = False
        if is_finite:
            return
        print(self.raw_df)
        print(self.ins)
        print(self.day_vwap)
        try:
            self.raw_df.to_pickle("/nfs_data1/kanren/error_df.pkl")
        except OSError as err:
            # The dump location only exists on the original author's machine.
            print(f"could not dump the offending data frame: {err}")

    def reset(self, sample):
        """Start a new episode and return the initial observation.

        :param sample: 8-tuple ``(ins, date, raw_df_values, raw_df_columns,
            raw_df_index, feature_dfs, target, is_buy)`` describing the order,
            or ``None`` to keep the attributes set by a previous call.
            NOTE: a finished episode deletes ``feature_dfs``, so passing
            ``None`` after such an episode fails when the observation is built.
        :return: the observation produced by the configured observation class.
        """
        for key in self.reward_log_dict:
            self.reward_log_dict[key] = 0.0

        if sample is not None:
            (
                self.ins,
                self.date,
                raw_values,
                raw_columns,
                raw_index,
                self.feature_dfs,
                self.target,
                self.is_buy,
            ) = sample
            self.raw_df = pd.DataFrame(index=raw_index, data=raw_values, columns=raw_columns)

        start_time = time.time()
        # Nothing is loaded between the two timestamps, so this is ~0; the
        # attribute is kept because external code may read it.
        self.load_time = time.time() - start_time

        window = self._trading_window()
        self.day_vwap = nan_weighted_avg(
            self.raw_df["$vwap0"].values[window],
            self.raw_df["$volume0"].values[window],
        )
        self._report_bad_day_vwap()
        self.day_twap = np.nanmean(self.raw_df["$vwap0"].values[window])

        self.t = -1 + self.offset
        self.interval = 0
        self.position = self.target
        self.eps_start = time.time()

        self.state = self._observe_state()

        if self.log:
            index_array = [
                np.array([self.ins] * self.max_step_num),
                self.raw_df.index.to_numpy()[window],
                np.array([self.date] * self.max_step_num),
            ]
            # Column order matters: ``step`` writes by position (0, 2 and 4).
            # v_t: The amount of shares the agent hopes to trade
            # max_vol_t: The max amount of shares that can be traded
            # traded_t: The amount of shares that is actually traded
            # action: the action of agent, may have various meanings in different settings.
            self.traded_log = pd.DataFrame(
                data={
                    "$v_t": np.nan,
                    "$max_vol_t": (self.raw_df["$volume0"] * self.limit).values[window],
                    "$traded_t": np.nan,
                    "$vwap_t": self.raw_df["$vwap0"].values[window],
                    "action": np.nan,
                },
                index=index_array,
            )

        self.done = False
        # ``step`` treats ``limit >= 1`` as "no volume cap"; use the same
        # threshold here so that ``limit == 1`` cannot report a fill rate
        # above 100%.
        if self.limit >= 1:
            self.this_valid = np.inf
        else:
            self.this_valid = np.nansum(self.raw_df["$volume0"].values) * self.limit
        self.this_cash = 0

        self.step_time = []
        self.action_log = [np.nan] * self.interval_num
        self.reset_time = time.time() - start_time
        self.real_eps_time = self.reset_time
        self.total_reward = 0
        self.total_instant_rew = 0
        self.last_rew = 0
        return self.state

    def step(self, action):
        """Trade for one interval.

        :param action: agent action; it is turned into a volume for the
            interval by the configured action class.
        :return: ``(state, reward, done, info)``.  ``info`` is empty until the
            episode is done, then it carries the episode metrics.
        """
        start_time = time.time()
        self.action_log[self.interval] = action
        volume_t = self.action_func(
            action,
            self.target,
            self.position,
            max_step_num=self.max_step_num,
            t=self.t - self.offset,
            interval=self.interval,
            interval_num=self.interval_num,
        )
        self.interval += 1
        step_reward = 0.0
        time_left = self.max_step_num - self.t - 1 + self.offset
        rows_in_step = min(self.time_interval, time_left)
        last_t = self.max_step_num - 1 + self.offset

        for _ in range(self.time_interval):
            # The interval volume is spread evenly over its rows.
            v_t = volume_t / rows_in_step
            self.t += 1
            is_last_row = self.t == last_t
            if is_last_row:
                # Last chance to trade: try to liquidate everything left.
                v_t = self.position
            if self.log:
                log_index = self.t - self.offset
                self.traded_log.iat[log_index, 0] = v_t
                self.traded_log.iat[log_index, 4] = action
            vwap_t, vol_t = self.raw_df.iloc[self.t][["$vwap0", "$volume0"]]
            max_vol_t = np.inf if self.limit >= 1 else self.limit * vol_t
            # Never trade more than what is left or what the market allows.
            if v_t > min(self.position, max_vol_t):
                v_t = self.position if self.position <= max_vol_t else max_vol_t
            self.position -= v_t
            self.this_cash += vwap_t * v_t
            if self.log:
                self.traded_log.iat[log_index, 2] = v_t

            # Price advantage of this row in basis points, against the day's
            # VWAP and TWAP; the sign is flipped for sell orders.
            if self.is_buy:
                performance_raise = (1 - vwap_t / self.day_vwap) * 10000
                PA_t = (1 - vwap_t / self.day_twap) * 10000
            else:
                performance_raise = (vwap_t / self.day_vwap - 1) * 10000
                PA_t = (vwap_t / self.day_twap - 1) * 10000

            for coef, reward_func in zip(self.reward_coef, self.reward_func_list):
                if reward_func.isinstant:
                    tmp_r = reward_func(performance_raise, v_t, self.target, PA_t)
                    step_reward += tmp_r * coef
                    self.reward_log_dict[type(reward_func).__name__] += tmp_r

            if is_last_row:
                break

        if self.position < ZERO:
            self.done = True
        if self.interval == self.interval_num:
            self.done = True

        self.step_time.append(time.time() - start_time)
        self.real_eps_time += time.time() - start_time

        if self.done:
            return self._finish_episode(action, step_reward)

        self.state = self._observe_state(action)
        return self.state, step_reward, self.done, {}

    def _finish_episode(self, action, step_reward):
        """Add the end-of-episode rewards and assemble the final ``info`` dict."""
        this_traded = self.target - self.position
        this_vwap = (self.this_cash / this_traded) if this_traded > ZERO else self.day_vwap
        valid = min(self.target, self.this_valid)
        this_ffr = (this_traded / valid) if valid > ZERO else 1.0
        if abs(this_ffr - 1.0) < ZERO:
            this_ffr = 1.0
        this_ffr *= 100  # fill rate in percent
        this_vv_ratio = this_vwap / self.day_vwap
        vwap = self.raw_df["$vwap0"].values[self.offset : self.max_step_num + self.offset]
        this_tt_ratio = this_vwap / np.nanmean(vwap)

        if self.is_buy:
            performance_raise = (1 - this_vv_ratio) * 10000
            PA = (1 - this_tt_ratio) * 10000
        else:
            performance_raise = (this_vv_ratio - 1) * 10000
            PA = (this_tt_ratio - 1) * 10000

        for coef, reward_func in zip(self.reward_coef, self.reward_func_list):
            if not reward_func.isinstant:
                tmp_r = reward_func(performance_raise, this_ffr, this_tt_ratio, self.is_buy)
                step_reward += tmp_r * coef
                self.reward_log_dict[type(reward_func).__name__] += tmp_r

        self.state = self._observe_state(action)

        if self.log:
            res = pd.DataFrame(
                {
                    "target": self.target,
                    "sell": not self.is_buy,
                    "vwap": this_vwap,
                    "this_vv_ratio": this_vv_ratio,
                    "this_ffr": this_ffr,
                },
                index=[[self.ins], [self.date]],
            )

        money = self.target * self.day_vwap
        # Buy and sell episodes report the same metrics; only the key suffix differs.
        side = "buy" if self.is_buy else "sell"
        info = {
            "money": money,
            f"money_{side}": money,
            "action": self.action_log,
            "ffr": this_ffr,
            "obs0_PR": performance_raise,
            f"ffr_{side}": this_ffr,
            f"PR_{side}": performance_raise,
            "PA": PA,
            f"PA_{side}": PA,
            "vwap": this_vwap,
        }
        info = merge_dicts(info, self.reward_log_dict)
        if self.log:
            info["df"] = self.traded_log
            info["res"] = res
        # The feature frames are not needed once the episode is over.
        del self.feature_dfs
        return self.state, step_reward, self.done, info
|
|
|
|
|
|
class StockEnv_Acc(StockEnv):
    """``StockEnv`` whose ``step`` handles all rows of an interval at once with array operations."""

    def step(self, action):
        """Trade for one interval.

        :param action: agent action; it is turned into a volume for the
            interval by the configured action class.
        :return: ``(state, reward, done, info)``; ``info`` is empty until the
            episode is done.
        """
        tic = time.time()
        self.action_log[self.interval] = action
        order_volume = self.action_func(
            action,
            self.target,
            self.position,
            max_step_num=self.max_step_num,
            t=self.t - self.offset,
            interval=self.interval,
            interval_num=self.interval_num,
        )
        self.interval += 1
        step_reward = 0.0

        # Number of rows covered by this step: a full interval, or whatever
        # is left of the episode.
        n_rows = min(self.time_interval, self.max_step_num - self.t - 1 + self.offset)

        # Spread the interval volume evenly over its rows.
        fills = np.repeat(order_volume / n_rows, n_rows)
        rows = np.arange(self.t + 1, self.t + n_rows + 1)
        if self.log:
            log_rows = rows - self.offset
            self.traded_log.iloc[log_rows, 0] = fills
            self.traded_log.iloc[log_rows, 4] = action

        market = self.raw_df.iloc[rows]
        prices = market["$vwap0"].values
        volumes = market["$volume0"].values
        if self.limit < 1:
            cap = self.limit * volumes
        else:
            cap = np.inf
        fills = np.minimum(fills, cap)

        if self.t + n_rows == self.max_step_num - 1 + self.offset:
            # Final row of the episode: push the untraded remainder into the
            # last fill, then re-apply the volume cap.
            fills[-1] += self.position - fills.sum()
            fills = np.minimum(fills, cap)

        cash_delta = (fills * prices).sum()
        filled = fills.sum()
        avg_price = np.nan_to_num(cash_delta / filled)
        self.t += n_rows
        self.position -= filled
        self.this_cash += cash_delta
        if self.log:
            self.traded_log.iloc[log_rows, 2] = fills

        # Price advantage of the interval in basis points; sign flipped for sells.
        if self.is_buy:
            performance_raise = (1 - avg_price / self.day_vwap) * 10000
            PA_t = (1 - avg_price / self.day_twap) * 10000
        else:
            performance_raise = (avg_price / self.day_vwap - 1) * 10000
            PA_t = (avg_price / self.day_twap - 1) * 10000

        for coef, reward_func in zip(self.reward_coef, self.reward_func_list):
            if reward_func.isinstant:
                tmp_r = reward_func(performance_raise, fills, self.target, PA_t)
                step_reward += tmp_r * coef
                self.reward_log_dict[type(reward_func).__name__] += tmp_r

        if self.position < ZERO or self.interval == self.interval_num:
            self.done = True

        self.step_time.append(time.time() - tic)
        self.real_eps_time += time.time() - tic

        if not self.done:
            self.state = self.obs(
                self.raw_df,
                self.feature_dfs,
                self.t,
                self.interval,
                self.position,
                self.target,
                self.is_buy,
                self.max_step_num,
                self.interval_num,
                action,
            )
            return self.state, step_reward, self.done, {}

        # ---- episode finished: compute the summary metrics ----
        traded = self.target - self.position
        episode_vwap = (self.this_cash / traded) if traded > ZERO else self.day_vwap
        valid = min(self.target, self.this_valid)
        fill_rate = (traded / valid) if valid > ZERO else 1.0
        if abs(fill_rate - 1.0) < ZERO:
            fill_rate = 1.0
        fill_rate *= 100  # percent
        vv_ratio = episode_vwap / self.day_vwap
        window_vwap = self.raw_df["$vwap0"].values[self.offset : self.max_step_num + self.offset]
        tt_ratio = episode_vwap / np.nanmean(window_vwap)

        if self.is_buy:
            performance_raise = (1 - vv_ratio) * 10000
            PA = (1 - tt_ratio) * 10000
        else:
            performance_raise = (vv_ratio - 1) * 10000
            PA = (tt_ratio - 1) * 10000

        for coef, reward_func in zip(self.reward_coef, self.reward_func_list):
            if not reward_func.isinstant:
                tmp_r = reward_func(performance_raise, fill_rate, tt_ratio, self.is_buy)
                step_reward += tmp_r * coef
                self.reward_log_dict[type(reward_func).__name__] += tmp_r

        self.state = self.obs(
            self.raw_df,
            self.feature_dfs,
            self.t,
            self.interval,
            self.position,
            self.target,
            self.is_buy,
            self.max_step_num,
            self.interval_num,
            action,
        )

        if self.log:
            res = pd.DataFrame(
                {
                    "target": self.target,
                    "sell": not self.is_buy,
                    "vwap": episode_vwap,
                    "this_vv_ratio": vv_ratio,
                    "this_ffr": fill_rate,
                },
                index=[[self.ins], [self.date]],
            )

        money = self.target * self.day_vwap
        if self.is_buy:
            info = {
                "money": money,
                "money_buy": money,
                "action": self.action_log,
                "ffr": fill_rate,
                "obs0_PR": performance_raise,
                "ffr_buy": fill_rate,
                "PR_buy": performance_raise,
                "PA": PA,
                "PA_buy": PA,
                "vwap": episode_vwap,
            }
        else:
            info = {
                "money": money,
                "money_sell": money,
                "action": self.action_log,
                "ffr": fill_rate,
                "obs0_PR": performance_raise,
                "ffr_sell": fill_rate,
                "PR_sell": performance_raise,
                "PA": PA,
                "PA_sell": PA,
                "vwap": episode_vwap,
            }
        info = merge_dicts(info, self.reward_log_dict)
        if self.log:
            info["df"] = self.traded_log
            info["res"] = res
        # The feature frames are dropped once the episode is over.
        del self.feature_dfs
        return self.state, step_reward, self.done, info
|