1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-30 01:21:18 +08:00
Files
qlib/examples/trade/executor.py
Yuchen Fang a03b08bb4c format
2021-01-28 00:41:02 +08:00

338 lines
13 KiB
Python

import env
from vecenv import *
import sampler
import logger
import json
import os
import agent
import model
import policy
import random
import tianshou as ts
import tqdm
from tianshou.utils import tqdm_config, MovAvg
from torch.utils.tensorboard import SummaryWriter
from collector import *
import numpy as np
from util import merge_dicts
def get_best_gpu(force=None):
    """Return the index of the GPU that currently reports the most free memory.

    The free memory of every GPU is read from ``nvidia-smi`` (csv output, one
    header line followed by one line per GPU).

    :param force: If not None, this value is returned unchanged and ``nvidia-smi`` is not invoked.
    :returns: ``force`` when given, otherwise the zero-based position of the GPU with the
        largest ``memory.free`` value in the ``nvidia-smi`` listing.
    :raises ValueError: If ``nvidia-smi`` reports no GPU rows or a row is not an integer.
    """
    if force is not None:
        return force
    # The context manager closes the pipe even if reading from it fails.
    with os.popen("nvidia-smi --query-gpu=memory.free --format=csv") as pipe:
        report = pipe.read()
    cleaned = report.replace("MiB", "").replace("memory.free", "")
    # Drop the header line and ignore blank lines, so the result does not depend
    # on whether the output ends with a trailing newline.
    rows = [row for row in cleaned.split("\n")[1:] if row.strip()]
    if not rows:
        raise ValueError("nvidia-smi did not report any GPU; cannot pick a device")
    free_memory = [int(row) for row in rows]
    best = int(np.argmax(free_memory))
    print("the best GPU is ", best, " with free memories of ", rows[best])
    return best
def setup_seed(seed):
    """Seed the random number generators used by this module.

    Seeds PyTorch (``manual_seed`` and ``cuda.manual_seed_all``), NumPy and
    Python's ``random`` with the same value, then sets cuDNN's
    ``deterministic`` flag.

    :param seed: The seed value handed to every generator.
    """
    # Same call order as before: torch, torch.cuda, numpy, random.
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
class BaseExecutor(object):
    """Shared setup for executors.

    The constructor prepares the log directory, selects the torch device,
    optionally seeds the random generators, builds or loads the policy and
    opens a TensorBoard ``SummaryWriter``. Subclasses implement ``train``,
    ``train_round`` and ``eval``.
    """

    def __init__(
        self, log_dir, resources, env_conf, optim=None, policy_conf=None, network=None, policy_path=None, seed=None,
    ):
        """A base class for executor

        :param log_dir: The directory to write all the logs.
        :type log_dir: string
        :param resources: A dict which describes available computational resources.
        :type resources: dict
        :param env_conf: Configurations for the environments.
        :type env_conf: dict
        :param optim: Optimization configuration, defaults to None
        :type optim: dict, optional
        :param policy_conf: Configurations for the RL algorithm, defaults to None
        :type policy_conf: dict, optional
        :param network: Configurations for policy network, defaults to None
        :type network: dict, optional
        :param policy_path: If is not None, would load the policy from this path, defaults to None
        :type policy_path: string, optional
        :param seed: Random seed, defaults to None
        :type seed: int, optional
        """
        self.log_dir = log_dir
        print(self.log_dir)
        # exist_ok avoids the check-then-create race when the directory appears concurrently.
        os.makedirs(self.log_dir, exist_ok=True)
        if resources["device"] == "cuda":
            # NOTE: the concrete device is written back into the caller's dict;
            # the agent branch below merges `resources` into the policy config.
            resources["device"] = "cuda:" + str(get_best_gpu())
        self.device = torch.device(resources["device"])
        # Compare against None so that an explicit seed of 0 is honoured.
        if seed is not None:
            setup_seed(seed)
        assert policy_path is not None or policy_conf is not None, "Policy must be defined"
        if policy_path:
            # NOTE(security): torch.load unpickles arbitrary objects; only load
            # policy files that come from a trusted source.
            self.policy = torch.load(policy_path, map_location=self.device)
            self.policy.actor.extractor.device = self.device
        elif hasattr(agent, policy_conf["name"]):
            policy_conf["config"] = merge_dicts(policy_conf["config"], resources)
            self.policy = getattr(agent, policy_conf["name"])(policy_conf["config"])
        else:
            assert network is not None
            # The extractor class may be named separately; its constructor
            # arguments come from network["config"] in both cases.
            if "extractor" in network:
                extractor_name = network["extractor"]["name"]
            else:
                extractor_name = network["name"]
            net = getattr(model, extractor_name + "_Extractor")(device=self.device, **network["config"])
            net.to(self.device)
            actor = getattr(model, network["name"] + "_Actor")(extractor=net, device=self.device, **network["config"])
            actor.to(self.device)
            critic = getattr(model, network["name"] + "_Critic")(
                extractor=net, device=self.device, **network["config"]
            )
            critic.to(self.device)
            self.optim = torch.optim.Adam(
                list(actor.parameters()) + list(critic.parameters()),
                lr=optim["lr"],
                weight_decay=optim.get("weight_decay", 0.0),
            )
            self.dist = torch.distributions.Categorical
            try:
                self.policy = getattr(ts.policy, policy_conf["name"])(
                    actor, critic, self.optim, self.dist, **policy_conf["config"]
                )
            except Exception:
                # Any failure with the tianshou policy falls back to the local
                # `policy` module. `Exception` (rather than a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                self.policy = getattr(policy, policy_conf["name"])(
                    actor, critic, self.optim, self.dist, **policy_conf["config"]
                )
        self.writer = SummaryWriter(self.log_dir)

    def train(
        self,
        max_epoch,
        step_per_epoch,
        repeat_per_collect,
        collect_per_step,
        batch_size,
        iteration=0,
        global_step=0,
        early_stopping=5,
        *args,
        **kargs,
    ):
        """Run the whole training process.

        :param max_epoch: The total number of epoch.
        :param step_per_epoch: The times of bp in one epoch.
        :param collect_per_step: Number of episodes to collect before one bp.
        :param repeat_per_collect: Times of bps after every round of experience collecting.
        :param batch_size: Batch size when bp.
        :param iteration: The iteration when starting the training, used when fine tuning. (Default value = 0)
        :param global_step: The number of steps when starting the training, used when fine tuning. (Default value = 0)
        :param early_stopping: If the test reward does not reach a new high in `early_stopping` iterations, the training would stop. (Default value = 5)
        :returns: The result on test set.
        :raises NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError

    def train_round(self, repeat_per_collect, collect_per_step, batch_size, *args, **kargs):
        """Do a round of training

        :param collect_per_step: Number of episodes to collect before one bp.
        :param repeat_per_collect: Times of bps after every round of experience collecting.
        :param batch_size: Batch size when bp.
        :raises NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError

    def eval(self, order_dir, save_res=False, logdir=None, *args, **kargs):
        """Evaluate the policy on orders in order_dir

        :param order_dir: the orders to be evaluated on.
        :param save_res: whether the result of evaluation be saved to self.log_dir/res.json (Default value = False)
        :param logdir: the place to save the .log and .pkl log files to. If None, don't save logfiles. (Default value = None)
        :returns: The result of evaluation.
        :raises NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError
class Executor(BaseExecutor):
    """Executor that trains and evaluates a policy on a vectorized environment."""

    def __init__(
        self,
        log_dir,
        resources,
        env_conf,
        train_paths,
        valid_paths,
        test_paths,
        io_conf,
        optim=None,
        policy_conf=None,
        network=None,
        policy_path=None,
        seed=None,
        share_memory=False,
        buffer_size=200000,
        q_learning=False,
        *args,
        **kargs,
    ):
        """Build the vectorized environment, collectors, samplers and loggers.

        :param log_dir: The directory to write all the logs.
        :type log_dir: string
        :param resources: A dict which describes available computational resources.
        :type resources: dict
        :param env_conf: Configurations for the environments.
        :type env_conf: dict
        :param train_paths: The paths of training datasets including orders, backtest files and features.
        :type train_paths: dict
        :param valid_paths: The paths of validation datasets including orders, backtest files and features.
        :type valid_paths: dict
        :param test_paths: The paths of test datasets including orders, backtest files and features.
        :type test_paths: dict
        :param io_conf: Configuration for sampler and loggers.
        :type io_conf: dict
        :param share_memory: Whether to use shared memory vecenv, defaults to False
        :type share_memory: bool, optional
        :param buffer_size: The size of replay buffer, defaults to 200000
        :type buffer_size: int, optional
        :param q_learning: Selects the update call used in ``train_round``; when False the
            training collector is reset before every collection, defaults to False
        :type q_learning: bool, optional
        """
        super().__init__(log_dir, resources, env_conf, optim, policy_conf, network, policy_path, seed)
        single_env = getattr(env, env_conf["name"])
        env_conf = merge_dicts(env_conf, train_paths)
        env_conf["log"] = True
        print("CPU_COUNT:", resources["num_cpus"])
        vec_env_cls = ShmemVectorEnv if share_memory else SubprocVectorEnv
        # One environment factory per worker; all of them share the same env_conf.
        self.env = vec_env_cls([lambda: single_env(env_conf) for _ in range(resources["num_cpus"])])
        self.test_collector = Collector(policy=self.policy, env=self.env, testing=True, reward_metric=np.sum)
        self.train_collector = Collector(
            self.policy, self.env, buffer=ts.data.ReplayBuffer(buffer_size), reward_metric=np.sum,
        )
        self.train_paths = train_paths
        self.test_paths = test_paths
        self.valid_paths = valid_paths
        # NOTE: the sampler configs are aliases, not copies, so the "features"
        # entry is also written into the caller's train_paths / test_paths dicts.
        train_sampler_conf = train_paths
        train_sampler_conf["features"] = env_conf["features"]
        test_sampler_conf = test_paths
        test_sampler_conf["features"] = env_conf["features"]
        self.train_sampler = getattr(sampler, io_conf["train_sampler"])(train_sampler_conf)
        self.test_sampler = getattr(sampler, io_conf["test_sampler"])(test_sampler_conf)
        self.train_logger = logger.InfoLogger()
        # Stored as a class; eval() instantiates it per evaluation run.
        self.test_logger = getattr(logger, io_conf["test_logger"])
        self.q_learning = q_learning

    def train(
        self,
        max_epoch,
        step_per_epoch,
        repeat_per_collect,
        collect_per_step,
        batch_size,
        iteration=0,
        global_step=0,
        early_stopping=5,
        train_step_min=0,
        log_valid=True,
        *args,
        **kargs,
    ):
        """Run the whole training process and evaluate the best policy on the test set.

        After every epoch the policy is evaluated on the validation orders. The
        weights with the highest validation ``"rew"`` are restored before the
        final evaluation on the test orders.

        :param max_epoch: The total number of epoch.
        :param step_per_epoch: The number of training rounds in one epoch.
        :param repeat_per_collect: Times of bps after every round of experience collecting.
        :param collect_per_step: Number of episodes to collect before one bp.
        :param batch_size: Batch size when bp.
        :param iteration: The iteration when starting the training, used when fine tuning. (Default value = 0)
        :param global_step: The number of steps when starting the training, used when fine tuning. (Default value = 0)
        :param early_stopping: Stop after this many epochs without a new best validation reward. (Default value = 5)
        :param train_step_min: Epochs without improvement are only counted once ``global_step`` has
            reached this value. (Default value = 0)
        :param log_valid: Whether validation runs write log files under ``{log_dir}/valid/{iteration}/``.
            (Default value = True)
        :returns: The result on test set.
        """
        import copy  # local import: keeps the file's top-level import block untouched

        best_epoch, best_reward = -1, -1
        # Initialised up front so that max_epoch == 0 does not hit unbound locals.
        best_state = None
        early_stop_round = 0
        stat = {}
        for epoch in range(1, 1 + max_epoch):
            with tqdm.tqdm(total=step_per_epoch, desc=f"Epoch #{epoch}", **tqdm_config) as t:
                while t.n < t.total:
                    result, losses = self.train_round(repeat_per_collect, collect_per_step, batch_size, iteration)
                    global_step += result["n/st"]
                    iteration += 1
                    for k, v in result.items():
                        self.writer.add_scalar("Train/" + k, v, global_step=global_step)
                    for k, v in losses.items():
                        # Losses are logged as moving averages, one MovAvg per key.
                        if k not in stat:
                            stat[k] = MovAvg()
                        stat[k].add(v)
                        self.writer.add_scalar("Train/" + k, stat[k].get(), global_step=global_step)
                    t.update(1)
                if t.n <= t.total:
                    t.update()
            result = self.eval(
                self.valid_paths["order_dir"], logdir=f"{self.log_dir}/valid/{iteration}/" if log_valid else None,
            )
            for k, v in result.items():
                self.writer.add_scalar("Valid/" + k, v, global_step=global_step)
            if best_epoch == -1 or best_reward < result["rew"]:
                best_reward = result["rew"]
                best_epoch = epoch
                # state_dict() holds references to the live parameter tensors, which
                # later updates overwrite in place; deep-copy to keep a real snapshot.
                best_state = copy.deepcopy(self.policy.state_dict())
                early_stop_round = 0
                torch.save(self.policy, f"{self.log_dir}/policy_best")
            elif global_step >= train_step_min:
                early_stop_round += 1
            torch.save(self.policy, f"{self.log_dir}/policy_{epoch}")
            print(
                f'Epoch #{epoch}: test_reward: {result["rew"]:.4f}, '
                f"best_reward: {best_reward:.4f} in #{best_epoch}"
            )
            if early_stop_round >= early_stopping:
                print("Early stopped")
                break
        print("Testing...")
        # No snapshot exists when no epoch ran; then the current policy is tested as is.
        if best_state is not None:
            self.policy.load_state_dict(best_state)
        result = self.eval(self.test_paths["order_dir"], logdir=f"{self.log_dir}/test/", save_res=True)
        for k, v in result.items():
            self.writer.add_scalar("Test/" + k, v, global_step=global_step)
        return result

    def train_round(self, repeat_per_collect, collect_per_step, batch_size, *args, **kargs):
        """Do a round of training: collect episodes, then update the policy.

        :param repeat_per_collect: Times of bps after every round of experience collecting.
        :param collect_per_step: Number of episodes to collect before one bp.
        :param batch_size: Batch size when bp.
        :returns: A tuple ``(result, losses)``: the collection result merged with the
            training logger summary, and the value returned by ``policy.update``.
        """
        self.policy.train()
        self.env.toggle_log(False)
        self.env.sampler = self.train_sampler
        if not self.q_learning:
            # Without q_learning the collector is reset before every collection.
            self.train_collector.reset()
        result = self.train_collector.collect(n_episode=collect_per_step, log_fn=self.train_logger)
        result = merge_dicts(result, self.train_logger.summary())
        if self.q_learning:
            losses = self.policy.update(batch_size, self.train_collector.buffer)
        else:
            losses = self.policy.update(
                0, self.train_collector.buffer, batch_size=batch_size, repeat=repeat_per_collect,
            )
        return result, losses

    def eval(self, order_dir, save_res=False, logdir=None, *args, **kargs):
        """Evaluate the policy on orders in order_dir

        :param order_dir: the orders to be evaluated on.
        :param save_res: whether the result of evaluation be saved to self.log_dir/res.json (Default value = False)
        :param logdir: the directory handed to the test logger; it is created if missing.
            If None, the training logger is used instead. (Default value = None)
        :returns: The collection result merged with the logger summary.
        """
        print(f"start evaluating on {order_dir}")
        self.policy.eval()
        self.env.toggle_log(True)
        self.test_sampler.reset(order_dir)
        self.env.sampler = self.test_sampler
        self.test_collector.reset()
        if logdir is not None:
            os.makedirs(logdir, exist_ok=True)
            eval_logger = self.test_logger(logdir, order_dir)
            eval_logger.reset()
        else:
            eval_logger = self.train_logger
        result = self.test_collector.collect(log_fn=eval_logger)
        result = merge_dicts(result, eval_logger.summary())
        if save_res:
            with open(self.log_dir + "/res.json", "w") as f:
                json.dump(result, f, sort_keys=True, indent=4)
        print(f"finish evaluating on {order_dir}")
        return result