# qlib/examples/rl_order_execution/exp_configs/train_ppo.yml

# Simulator settings.
# NOTE(review): units are not stated in this section; the "240 min / 5 min"
# comment on state_interpreter.kwargs.data_ticks suggests minutes — confirm.
simulator:
  data_granularity: 5
  # NOTE(review): 240 / 30 = 8, which equals the max_step value used by the
  # interpreters and the reward — presumably these are linked; confirm.
  time_per_step: 30
  vol_limit: null  # explicitly null (no value set)
# Environment execution settings.
env:
  concurrency: 48
  # NOTE(review): accepted values are defined by the consumer, not here;
  # "shmem" presumably selects a shared-memory worker mode — confirm.
  parallel_mode: shmem
# Action interpreter specification.
# class / module_path identify the implementation; kwargs presumably holds
# its constructor arguments — confirm against the loader.
action_interpreter:
  class: CategoricalActionInterpreter
  module_path: qlib.rl.order_execution.interpreter
  kwargs:
    max_step: 8
    values: 4
# State interpreter specification, including the nested provider that
# supplies processed data from data_dir.
state_interpreter:
  class: FullHistoryStateInterpreter
  module_path: qlib.rl.order_execution.interpreter
  kwargs:
    max_step: 8
    data_dim: 5
    data_ticks: 48  # 48 = 240 min / 5 min
    # Nested component spec, same class / module_path / kwargs layout.
    processed_data_provider:
      class: PickleProcessedDataProvider
      module_path: qlib.rl.data.pickle_styled
      kwargs:
        data_dir: ./data/pickle_dataframe/feature
# Reward specification.
reward:
  class: PPOReward
  module_path: qlib.rl.order_execution.reward
  kwargs:
    max_step: 8
    # Index window used by the reward; the end index is derived below.
    start_time_index: 0
    end_time_index: 46  # 46 = (240 - 5) min / 5 min - 1
# Data loading settings: loader-level knobs first, then the data source.
data:
  num_workers: 0
  queue_size: 20
  source:
    # Input locations (relative paths).
    order_dir: ./data/orders
    data_dir: ./data/pickle_dataframe/backtest
    proc_data_dim: 5
    # Time window.
    # NOTE(review): 240 matches the "240 min" used in the tick comments
    # elsewhere in this file, so these are presumably minutes — confirm.
    total_time: 240
    default_start_time_index: 0
    default_end_time_index: 235
# Network specification: class name and, presumably, the module it is loaded
# from (module_path) — confirm against the loader. No kwargs are set here.
network:
  class: Recurrent
  module_path: qlib.rl.order_execution.network
# Policy specification.
policy:
  class: PPO  # PPO, DQN
  module_path: qlib.rl.order_execution.policy
  kwargs:
    lr: 0.0001
# Runtime settings: a fixed seed value and CUDA use switched off.
runtime:
  seed: 42
  use_cuda: false
# Trainer settings, grouped by concern.
trainer:
  # Run length and early stopping.
  max_epoch: 500
  earlystop_patience: 50
  # Collection and update sizes.
  episode_per_collect: 10000
  repeat_per_collect: 25
  batch_size: 1024
  # Validation cadence.
  val_every_n_epoch: 4
  # Checkpointing.
  checkpoint_path: ./outputs/ppo
  checkpoint_every_n_iters: 1