1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-01 18:11:18 +08:00
Files
qlib/examples/trade/env/env_rl.py
Yuchen Fang a03b08bb4c format
2021-01-28 00:41:02 +08:00

482 lines
17 KiB
Python

import gym
# Raise gym's logger threshold before anything else logs.
# NOTE(review): 40 presumably corresponds to gym's ERROR level - confirm.
gym.logger.set_level(40)
import numpy as np
import pandas as pd
import pickle as pkl
import datetime
import random
import os
import json
import time
import tianshou as ts
import copy
from multiprocessing import Process, Pipe, Queue
from typing import List, Tuple, Union, Optional, Callable, Any
from tianshou.env.utils import CloudpickleWrapper
from scipy.stats import pearsonr
from sklearn.metrics import roc_auc_score
import sys
# Must run before the project-local imports below. The entry ".." is resolved
# against the current working directory, not against this file's location.
sys.path.append("..")
from util import merge_dicts, nan_weighted_avg, robust_auc
import reward
import observation
import action
# Tolerance used by the environments below: a remaining position, traded
# amount or tradable volume smaller than this is treated as zero.
ZERO = 1e-7
class StockEnv(gym.Env):
    """Single-asset order-execution environment.

    One episode executes one order (``target`` shares, buy or sell) described
    by the sample handed to :meth:`reset`. The agent acts once per *interval*;
    ``action_func`` turns each action into a volume that is spread evenly over
    up to ``time_interval`` consecutive rows of ``raw_df``.

    Keys read from ``config``:

    * ``max_step_num``, ``limit``, ``time_interval``, ``interval_num`` (required)
    * ``offset`` (default ``0``), ``last_reward`` (default ``None``),
      ``log`` (default ``True``)
    * ``features`` and ``obs`` (``{"name": ..., "config": ...}``)
    * ``action`` (``{"name": ..., "config": ...}``)
    * ``reward``: mapping ``name -> conf`` where every ``conf`` carries a
      ``"coefficient"`` entry; the remaining entries are passed to the reward
      class of that name.
    """

    # Where ``reset`` dumps ``raw_df`` when the day VWAP is NaN/inf. Kept as a
    # class attribute so deployments can override it without editing code.
    ERROR_DUMP_PATH = "/nfs_data1/kanren/error_df.pkl"

    def __init__(self, config):
        """Build the observation, action and reward components from ``config``."""
        self.max_step_num = config["max_step_num"]
        self.limit = config["limit"]
        self.time_interval = config["time_interval"]
        self.interval_num = config["interval_num"]
        self.offset = config.get("offset", 0)
        self.last_reward = config.get("last_reward")
        self.log = config.get("log", True)

        # The observation config is completed in place with values that live
        # at the top level of ``config``.
        obs_conf = config["obs"]["config"]
        obs_conf["features"] = config["features"]
        obs_conf["time_interval"] = self.time_interval
        obs_conf["max_step_num"] = self.max_step_num
        self.obs = getattr(observation, config["obs"]["name"])(obs_conf)
        self.action_func = getattr(action, config["action"]["name"])(config["action"]["config"])

        self.reward_func_list = []
        self.reward_log_dict = {}
        self.reward_coef = []
        for name, conf in config["reward"].items():
            # Work on a copy: popping from the caller's dict would make a
            # second environment built from the same config fail with KeyError.
            conf = dict(conf)
            self.reward_coef.append(conf.pop("coefficient"))
            self.reward_func_list.append(getattr(reward, name)(conf))
            self.reward_log_dict[name] = 0.0

        self.observation_space = self.obs.get_space()
        self.action_space = self.action_func.get_space()

    def toggle_log(self, log):
        """Enable or disable the per-minute trade log built in :meth:`reset`."""
        self.log = log

    def _make_obs(self, *extra):
        """Call the observation builder with the current episode state.

        ``extra`` is appended to the positional arguments; ``step`` uses it to
        pass the latest action.
        """
        return self.obs(
            self.raw_df,
            self.feature_dfs,
            self.t,
            self.interval,
            self.position,
            self.target,
            self.is_buy,
            self.max_step_num,
            self.interval_num,
            *extra
        )

    def _signed_bps(self, price_ratio):
        """Return the advantage of ``price_ratio`` over 1, scaled by 10000.

        Positive means a better price: a ratio below 1 for a buy order, above
        1 for a sell order.
        """
        if self.is_buy:
            return (1 - price_ratio) * 10000
        return (price_ratio - 1) * 10000

    def _report_invalid_vwap(self):
        """Print diagnostics for a non-finite day VWAP and try to dump ``raw_df``."""
        print(self.raw_df)
        print(self.ins)
        print(self.day_vwap)
        try:
            self.raw_df.to_pickle(self.ERROR_DUMP_PATH)
        except OSError as err:
            # The dump is a debugging aid only; a missing or read-only target
            # directory must not abort the episode.
            print("could not dump raw_df to {}: {}".format(self.ERROR_DUMP_PATH, err))

    def reset(self, sample):
        """Start a new episode.

        :param sample: an 8-tuple ``(ins, date, raw_df_values, raw_df_columns,
            raw_df_index, feature_dfs, target, is_buy)``, or ``None`` to reuse
            the data of the previous sample. ``feature_dfs`` is deleted when an
            episode finishes, so ``None`` only works before that point.
        :return: the initial observation.
        """
        for key in self.reward_log_dict:
            self.reward_log_dict[key] = 0.0

        if sample is not None:
            (
                self.ins,
                self.date,
                self.raw_df_values,
                self.raw_df_columns,
                self.raw_df_index,
                self.feature_dfs,
                self.target,
                self.is_buy,
            ) = sample
            self.raw_df = pd.DataFrame(
                index=self.raw_df_index, data=self.raw_df_values, columns=self.raw_df_columns,
            )
            del self.raw_df_values, self.raw_df_columns, self.raw_df_index

        # NOTE(review): the timer starts after the sample has been unpacked,
        # so ``load_time`` is always ~0. Left unchanged because ``reset_time``
        # and ``real_eps_time`` are derived from the same start point.
        start_time = time.time()
        self.load_time = time.time() - start_time

        # Rows of ``raw_df`` that belong to this episode.
        window = slice(self.offset, self.offset + self.max_step_num)
        self.day_vwap = nan_weighted_avg(
            self.raw_df["$vwap0"].values[window], self.raw_df["$volume0"].values[window],
        )
        # Explicit check instead of ``assert`` (asserts vanish under ``-O``).
        # Values that cannot be tested at all are reported the same way.
        try:
            vwap_is_finite = bool(np.isfinite(self.day_vwap))
        except (TypeError, ValueError):
            vwap_is_finite = False
        if not vwap_is_finite:
            self._report_invalid_vwap()
        self.day_twap = np.nanmean(self.raw_df["$vwap0"].values[window])

        self.t = -1 + self.offset
        self.interval = 0
        self.position = self.target
        self.eps_start = time.time()
        self.state = self._make_obs()

        if self.log:
            index_array = [
                np.array([self.ins] * self.max_step_num),
                self.raw_df.index.to_numpy()[window],
                np.array([self.date] * self.max_step_num),
            ]
            # Column order matters: ``step`` writes by position
            # (0 = $v_t, 2 = $traded_t, 4 = action).
            self.traded_log = pd.DataFrame(
                data={
                    "$v_t": np.nan,
                    "$max_vol_t": (self.raw_df["$volume0"] * self.limit).values[window],
                    "$traded_t": np.nan,
                    "$vwap_t": self.raw_df["$vwap0"].values[window],
                    "action": np.nan,
                },
                index=index_array,
            )
            # v_t: the amount of shares the agent hopes to trade
            # max_vol_t: the maximum amount of shares that can be traded
            # traded_t: the amount of shares that is actually traded
            # action: the action of the agent; its meaning depends on the setting.

        self.done = False
        # ``step`` lifts the per-minute volume cap when ``limit >= 1``; use the
        # same threshold here so the fill-rate denominator agrees with it.
        if self.limit >= 1:
            self.this_valid = np.inf
        else:
            self.this_valid = np.nansum(self.raw_df["$volume0"].values) * self.limit
        self.this_cash = 0
        self.step_time = []
        self.action_log = [np.nan] * self.interval_num
        self.reset_time = time.time() - start_time
        self.real_eps_time = self.reset_time
        self.total_reward = 0
        self.total_instant_rew = 0
        self.last_rew = 0
        return self.state

    def step(self, action):
        """Execute one interval.

        :param action: the agent's action; converted to a volume by
            ``action_func`` and logged as-is.
        :return: ``(state, reward, done, info)``. ``info`` is empty until the
            episode ends.
        """
        start_time = time.time()
        self.action_log[self.interval] = action
        volume_t = self.action_func(
            action,
            self.target,
            self.position,
            max_step_num=self.max_step_num,
            t=self.t - self.offset,
            interval=self.interval,
            interval_num=self.interval_num,
        )
        self.interval += 1
        step_reward = 0.0
        last_t = self.max_step_num - 1 + self.offset
        time_left = self.max_step_num - self.t - 1 + self.offset

        for _ in range(self.time_interval):
            # Spread the interval's volume evenly over its minutes.
            v_t = volume_t / min(self.time_interval, time_left)
            self.t += 1
            is_last_minute = self.t == last_t
            if is_last_minute:
                # Last chance: try to trade everything that is left.
                v_t = self.position
            if self.log:
                log_index = self.t - self.offset
                self.traded_log.iat[log_index, 0] = v_t
                self.traded_log.iat[log_index, 4] = action

            vwap_t, vol_t = self.raw_df.iloc[self.t][["$vwap0", "$volume0"]]
            max_vol_t = np.inf if self.limit >= 1 else self.limit * vol_t
            # Never trade more than the remaining position or the volume cap.
            if v_t > min(self.position, max_vol_t):
                v_t = self.position if self.position <= max_vol_t else max_vol_t

            self.position -= v_t
            self.this_cash += vwap_t * v_t
            if self.log:
                self.traded_log.iat[log_index, 2] = v_t

            performance_raise = self._signed_bps(vwap_t / self.day_vwap)
            PA_t = self._signed_bps(vwap_t / self.day_twap)
            for coef, reward_func in zip(self.reward_coef, self.reward_func_list):
                if reward_func.isinstant:
                    tmp_r = reward_func(performance_raise, v_t, self.target, PA_t)
                    step_reward += tmp_r * coef
                    self.reward_log_dict[type(reward_func).__name__] += tmp_r

            if is_last_minute:
                break

        if self.position < ZERO:
            self.done = True
        if self.interval == self.interval_num:
            self.done = True

        self.step_time.append(time.time() - start_time)
        self.real_eps_time += time.time() - start_time

        if not self.done:
            self.state = self._make_obs(action)
            return self.state, step_reward, self.done, {}
        return self._finish_episode(action, step_reward)

    def _finish_episode(self, action, step_reward):
        """Add end-of-episode rewards and build the final ``step`` result.

        :param action: the last action, forwarded to the observation builder.
        :param step_reward: instant reward accumulated by the final step.
        :return: ``(state, reward, done, info)`` with the episode summary in
            ``info``.
        """
        this_traded = self.target - self.position
        this_vwap = (self.this_cash / this_traded) if this_traded > ZERO else self.day_vwap
        valid = min(self.target, self.this_valid)
        # Fulfilment rate in percent, snapped to exactly 100 when within ZERO.
        this_ffr = (this_traded / valid) if valid > ZERO else 1.0
        if abs(this_ffr - 1.0) < ZERO:
            this_ffr = 1.0
        this_ffr *= 100

        this_vv_ratio = this_vwap / self.day_vwap
        vwap = self.raw_df["$vwap0"].values[self.offset : self.max_step_num + self.offset]
        this_tt_ratio = this_vwap / np.nanmean(vwap)
        performance_raise = self._signed_bps(this_vv_ratio)
        PA = self._signed_bps(this_tt_ratio)

        for coef, reward_func in zip(self.reward_coef, self.reward_func_list):
            if not reward_func.isinstant:
                tmp_r = reward_func(performance_raise, this_ffr, this_tt_ratio, self.is_buy)
                step_reward += tmp_r * coef
                self.reward_log_dict[type(reward_func).__name__] += tmp_r

        self.state = self._make_obs(action)

        money = self.target * self.day_vwap
        side = "buy" if self.is_buy else "sell"
        info = {
            "money": money,
            "money_" + side: money,
            "action": self.action_log,
            "ffr": this_ffr,
            "obs0_PR": performance_raise,
            "ffr_" + side: this_ffr,
            "PR_" + side: performance_raise,
            "PA": PA,
            "PA_" + side: PA,
            "vwap": this_vwap,
        }
        info = merge_dicts(info, self.reward_log_dict)
        if self.log:
            info["df"] = self.traded_log
            info["res"] = pd.DataFrame(
                {
                    "target": self.target,
                    "sell": not self.is_buy,
                    "vwap": this_vwap,
                    "this_vv_ratio": this_vv_ratio,
                    "this_ffr": this_ffr,
                },
                index=[[self.ins], [self.date]],
            )
        # Release the per-sample feature frames once the episode is over.
        del self.feature_dfs
        return self.state, step_reward, self.done, info
class StockEnv_Acc(StockEnv):
    """``StockEnv`` variant whose ``step`` handles a whole interval with array ops.

    All minutes of the interval are processed at once instead of one by one,
    and instant rewards are evaluated once per interval on the interval's
    average traded price.
    """

    def step(self, action):
        """Execute one interval.

        :param action: the agent's action; converted to a volume by
            ``action_func`` and logged as-is.
        :return: ``(state, reward, done, info)``. ``info`` is empty until the
            episode ends.
        """
        tic = time.time()
        self.action_log[self.interval] = action
        volume_t = self.action_func(
            action,
            self.target,
            self.position,
            max_step_num=self.max_step_num,
            t=self.t - self.offset,
            interval=self.interval,
            interval_num=self.interval_num,
        )
        self.interval += 1
        step_reward = 0.0

        # Number of minutes this interval covers (shorter at the end of the day).
        n_minutes = min(self.time_interval, self.max_step_num - self.t - 1 + self.offset)
        v_t = np.repeat(volume_t / n_minutes, n_minutes)
        minutes = np.arange(self.t + 1, self.t + n_minutes + 1)
        if self.log:
            log_index = minutes - self.offset
            self.traded_log.iloc[log_index, 0] = v_t
            self.traded_log.iloc[log_index, 4] = action

        bars = self.raw_df.iloc[minutes]
        vwap_t = bars["$vwap0"].values
        vol_t = bars["$volume0"].values
        if self.limit < 1:
            max_vol_t = self.limit * vol_t
        else:
            max_vol_t = np.inf
        v_t = np.minimum(v_t, max_vol_t)
        if self.t + n_minutes == self.max_step_num - 1 + self.offset:
            # Final minute of the day: push whatever is still open into the
            # last slot, then re-apply the volume cap.
            v_t[-1] += self.position - v_t.sum()
            v_t = np.minimum(v_t, max_vol_t)

        this_money = (v_t * vwap_t).sum()
        this_vol = v_t.sum()
        # NaN from 0/0 (nothing traded) becomes 0 here.
        interval_vwap = np.nan_to_num(this_money / this_vol)
        self.t += n_minutes
        self.position -= this_vol
        self.this_cash += this_money
        if self.log:
            self.traded_log.iloc[log_index, 2] = v_t

        if self.is_buy:
            performance_raise = (1 - interval_vwap / self.day_vwap) * 10000
            PA_t = (1 - interval_vwap / self.day_twap) * 10000
        else:
            performance_raise = (interval_vwap / self.day_vwap - 1) * 10000
            PA_t = (interval_vwap / self.day_twap - 1) * 10000
        for coef, reward_func in zip(self.reward_coef, self.reward_func_list):
            if reward_func.isinstant:
                tmp_r = reward_func(performance_raise, v_t, self.target, PA_t)
                step_reward += tmp_r * coef
                self.reward_log_dict[type(reward_func).__name__] += tmp_r

        if self.position < ZERO or self.interval == self.interval_num:
            self.done = True

        self.step_time.append(time.time() - tic)
        self.real_eps_time += time.time() - tic

        if not self.done:
            self.state = self.obs(
                self.raw_df,
                self.feature_dfs,
                self.t,
                self.interval,
                self.position,
                self.target,
                self.is_buy,
                self.max_step_num,
                self.interval_num,
                action,
            )
            return self.state, step_reward, self.done, {}

        # ---- episode finished: summary metrics and end-of-episode rewards ----
        this_traded = self.target - self.position
        if this_traded > ZERO:
            this_vwap = self.this_cash / this_traded
        else:
            this_vwap = self.day_vwap
        valid = min(self.target, self.this_valid)
        if valid > ZERO:
            this_ffr = this_traded / valid
        else:
            this_ffr = 1.0
        if abs(this_ffr - 1.0) < ZERO:
            this_ffr = 1.0
        this_ffr *= 100

        this_vv_ratio = this_vwap / self.day_vwap
        vwap = self.raw_df["$vwap0"].values[self.offset : self.max_step_num + self.offset]
        this_tt_ratio = this_vwap / np.nanmean(vwap)
        if self.is_buy:
            performance_raise = (1 - this_vv_ratio) * 10000
            PA = (1 - this_tt_ratio) * 10000
        else:
            performance_raise = (this_vv_ratio - 1) * 10000
            PA = (this_tt_ratio - 1) * 10000

        for coef, reward_func in zip(self.reward_coef, self.reward_func_list):
            if not reward_func.isinstant:
                tmp_r = reward_func(performance_raise, this_ffr, this_tt_ratio, self.is_buy)
                step_reward += tmp_r * coef
                self.reward_log_dict[type(reward_func).__name__] += tmp_r

        self.state = self.obs(
            self.raw_df,
            self.feature_dfs,
            self.t,
            self.interval,
            self.position,
            self.target,
            self.is_buy,
            self.max_step_num,
            self.interval_num,
            action,
        )
        if self.log:
            res = pd.DataFrame(
                {
                    "target": self.target,
                    "sell": not self.is_buy,
                    "vwap": this_vwap,
                    "this_vv_ratio": this_vv_ratio,
                    "this_ffr": this_ffr,
                },
                index=[[self.ins], [self.date]],
            )

        money = self.target * self.day_vwap
        if self.is_buy:
            info = {
                "money": money,
                "money_buy": money,
                "action": self.action_log,
                "ffr": this_ffr,
                "obs0_PR": performance_raise,
                "ffr_buy": this_ffr,
                "PR_buy": performance_raise,
                "PA": PA,
                "PA_buy": PA,
                "vwap": this_vwap,
            }
        else:
            info = {
                "money": money,
                "money_sell": money,
                "action": self.action_log,
                "ffr": this_ffr,
                "obs0_PR": performance_raise,
                "ffr_sell": this_ffr,
                "PR_sell": performance_raise,
                "PA": PA,
                "PA_sell": PA,
                "vwap": this_vwap,
            }
        info = merge_dicts(info, self.reward_log_dict)
        if self.log:
            info["df"] = self.traded_log
            info["res"] = res
        # Release the per-sample feature frames once the episode is over.
        del self.feature_dfs
        return self.state, step_reward, self.done, info