1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00
Files
qlib/examples/rl/scripts/gen_backtest_orders.py
Huoran Li 577923a9f0 Fix RL example bug (#1384)
* Fix data pipeline

* Add TODO
2022-12-06 20:49:56 +08:00

56 lines
1.8 KiB
Python

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
import os
import pandas as pd
import numpy as np
import pickle
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=20220926)
parser.add_argument("--num_order", type=int, default=10)
args = parser.parse_args()
np.random.seed(args.seed)
path = os.path.join("data", "pickle", "backtesttest.pkl")
df = pickle.load(open(path, "rb")).reset_index()
df["date"] = df["datetime"].dt.date.astype("datetime64")
instruments = sorted(set(df["instrument"]))
# TODO: The example is expected to be able to handle data containing missing values.
# TODO: Currently, we just simply skip dates that contain missing data. We will add
# TODO: this feature in the future.
skip_dates = {}
for instrument in instruments:
csv_df = pd.read_csv(os.path.join("data", "csv", f"{instrument}.csv"))
csv_df = csv_df[csv_df["close"].isna()]
dates = set([str(d).split(" ")[0] for d in csv_df["date"]])
skip_dates[instrument] = dates
df_list = []
for instrument in instruments:
print(instrument)
cur_df = df[df["instrument"] == instrument]
dates = sorted(set([str(d).split(" ")[0] for d in cur_df["date"]]))
dates = [date for date in dates if date not in skip_dates[instrument]]
n = args.num_order
df_list.append(
pd.DataFrame(
{
"date": sorted(np.random.choice(dates, size=n, replace=False)),
"instrument": [instrument] * n,
"amount": np.random.randint(low=3, high=11, size=n) * 100.0,
"order_type": np.random.randint(low=0, high=2, size=n),
}
).set_index(["date", "instrument"]),
)
total_df = pd.concat(df_list)
total_df.to_csv("data/backtest_orders.csv")