# qlib/examples/rl/experiment_config/training/config.yml

# Simulator settings.
simulator:
  # NOTE(review): data.source.total_time (240) / 30 = 8, which equals the
  # max_step given to both interpreters - presumably these must stay in
  # sync; confirm before changing any one of them.
  time_per_step: 30
  # Explicit null. Presumably means "no volume limit" - confirm against the
  # code that consumes this key.
  vol_limit: null
# Environment settings.
env:
  concurrency: 1
  # NOTE(review): "dummy" presumably selects a non-parallel environment,
  # which would be consistent with concurrency: 1 - confirm.
  parallel_mode: dummy
# Action interpreter, described by class name, module path and kwargs.
# Keys follow the class / module_path / kwargs order that the nested
# processed_data_provider entry in this file uses.
action_interpreter:
  class: CategoricalActionInterpreter
  module_path: qlib.rl.order_execution.interpreter
  kwargs:
    # Same value as state_interpreter.kwargs.max_step.
    max_step: 8
    values: 14
# State interpreter, described by class name, module path and kwargs.
state_interpreter:
  class: FullHistoryStateInterpreter
  module_path: qlib.rl.order_execution.interpreter
  kwargs:
    # Same value as action_interpreter.kwargs.max_step.
    max_step: 8
    # Same value as data.source.total_time.
    data_ticks: 240
    # Same value as data.source.proc_data_dim.
    data_dim: 6
    # Nested provider entry, using the same class / module_path / kwargs
    # layout as the top-level component entries.
    processed_data_provider:
      class: PickleProcessedDataProvider
      module_path: qlib.rl.data.pickle_styled
      kwargs:
        # Relative path; presumably resolved against the working directory
        # of the process that loads this file - confirm.
        data_dir: ./data/pickle_dataframe/feature
# Reward, described by class name, module path and kwargs.
reward:
  class: PAPenaltyReward
  module_path: qlib.rl.order_execution.reward
  kwargs:
    # Written as a float (100.0), not an integer.
    penalty: 100.0
# Data settings.
data:
  source:
    # Both directories are relative paths; presumably resolved against the
    # working directory of the process that loads this file - confirm.
    order_dir: ./data/training_order_split
    data_dir: ./data/pickle_dataframe/backtest
    # Same value as state_interpreter.kwargs.data_ticks.
    total_time: 240
    # The default window [0, 240] spans the whole of total_time.
    default_start_time: 0
    default_end_time: 240
    # Same value as state_interpreter.kwargs.data_dim.
    proc_data_dim: 6
  # NOTE(review): 0 presumably means data is loaded without worker
  # processes - confirm against the data loader that reads this key.
  num_workers: 0
  queue_size: 20
# Network, described by class name and module path only; no kwargs are
# given for it in this file.
network:
  class: Recurrent
  module_path: qlib.rl.order_execution.network
# Policy, described by class name, module path and kwargs.
policy:
  class: PPO
  module_path: qlib.rl.order_execution.policy
  kwargs:
    # Plain decimal notation on purpose: some YAML 1.1 loaders read
    # exponent forms such as 1e-4 as strings rather than floats.
    lr: 0.0001
# Runtime settings.
runtime:
  seed: 42
  # CUDA is switched off in this configuration.
  use_cuda: false
# Trainer settings.
trainer:
  # NOTE(review): max_epoch and earlystop_patience are both 2, so early
  # stopping presumably cannot trigger before training ends - looks like
  # an example-sized run; confirm before reusing for real training.
  max_epoch: 2
  repeat_per_collect: 5
  earlystop_patience: 2
  episode_per_collect: 20
  batch_size: 16
  val_every_n_epoch: 1
  # Relative path; presumably resolved against the working directory of
  # the process that loads this file - confirm.
  checkpoint_path: ./checkpoints
  checkpoint_every_n_iters: 1