qlib/examples/workflow_by_code.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a href=\"https://colab.research.google.com/github/microsoft/qlib/blob/main/examples/workflow_by_code.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#  Copyright (c) Microsoft Corporation.\n",
    "#  Licensed under the MIT License."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys, site\n",
    "from pathlib import Path\n",
    "\n",
    "################################# NOTE #################################\n",
    "#  Please be aware that if colab installs the latest numpy and pyqlib  #\n",
    "#  in this cell, users should RESTART the runtime in order to run the  #\n",
    "#  following cells successfully.                                       #\n",
    "########################################################################\n",
    "\n",
    "try:\n",
    "    import qlib\n",
    "except ImportError:\n",
    "    # install qlib\n",
    "    ! pip install --upgrade numpy\n",
    "    ! pip install pyqlib\n",
    "    # reload\n",
    "    site.main()\n",
    "\n",
    "scripts_dir = Path.cwd().parent.joinpath(\"scripts\")\n",
    "if not scripts_dir.joinpath(\"get_data.py\").exists():\n",
    "    # download get_data.py script\n",
    "    scripts_dir = Path(\"~/tmp/qlib_code/scripts\").expanduser().resolve()\n",
    "    scripts_dir.mkdir(parents=True, exist_ok=True)\n",
    "    import requests\n",
    "    with requests.get(\"https://raw.githubusercontent.com/microsoft/qlib/main/scripts/get_data.py\") as resp:\n",
    "        with open(scripts_dir.joinpath(\"get_data.py\"), \"wb\") as fp:\n",
    "            fp.write(resp.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import qlib\n",
    "import pandas as pd\n",
    "from qlib.config import REG_CN\n",
    "from qlib.contrib.model.gbdt import LGBModel\n",
    "from qlib.contrib.data.handler import Alpha158\n",
    "from qlib.contrib.evaluate import (\n",
    "    backtest as normal_backtest,\n",
    "    risk_analysis,\n",
    ")\n",
    "from qlib.utils import exists_qlib_data, init_instance_by_config\n",
    "from qlib.workflow import R\n",
    "from qlib.workflow.record_temp import SignalRecord, PortAnaRecord\n",
    "from qlib.utils import flatten_dict\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# use default data\n",
    "# NOTE: need to download data from remote: python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data\n",
    "provider_uri = \"~/.qlib/qlib_data/cn_data\"  # target_dir\n",
    "if not exists_qlib_data(provider_uri):\n",
    "    print(f\"Qlib data is not found in {provider_uri}\")\n",
    "    sys.path.append(str(scripts_dir))\n",
    "    from get_data import GetData\n",
    "    GetData().qlib_data(target_dir=provider_uri, region=REG_CN)\n",
    "qlib.init(provider_uri=provider_uri, region=REG_CN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "market = \"csi300\"\n",
    "benchmark = \"SH000300\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# train model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "###################################\n",
    "# train model\n",
    "###################################\n",
    "data_handler_config = {\n",
    "    \"start_time\": \"2008-01-01\",\n",
    "    \"end_time\": \"2020-08-01\",\n",
    "    \"fit_start_time\": \"2008-01-01\",\n",
    "    \"fit_end_time\": \"2014-12-31\",\n",
    "    \"instruments\": market,\n",
    "}\n",
    "\n",
    "task = {\n",
    "    \"model\": {\n",
    "        \"class\": \"LGBModel\",\n",
    "        \"module_path\": \"qlib.contrib.model.gbdt\",\n",
    "        \"kwargs\": {\n",
    "            \"loss\": \"mse\",\n",
    "            \"colsample_bytree\": 0.8879,\n",
    "            \"learning_rate\": 0.0421,\n",
    "            \"subsample\": 0.8789,\n",
    "            \"lambda_l1\": 205.6999,\n",
    "            \"lambda_l2\": 580.9768,\n",
    "            \"max_depth\": 8,\n",
    "            \"num_leaves\": 210,\n",
    "            \"num_threads\": 20,\n",
    "        },\n",
    "    },\n",
    "    \"dataset\": {\n",
    "        \"class\": \"DatasetH\",\n",
    "        \"module_path\": \"qlib.data.dataset\",\n",
    "        \"kwargs\": {\n",
    "            \"handler\": {\n",
    "                \"class\": \"Alpha158\",\n",
    "                \"module_path\": \"qlib.contrib.data.handler\",\n",
    "                \"kwargs\": data_handler_config,\n",
    "            },\n",
    "            \"segments\": {\n",
    "                \"train\": (\"2008-01-01\", \"2014-12-31\"),\n",
    "                \"valid\": (\"2015-01-01\", \"2016-12-31\"),\n",
    "                \"test\": (\"2017-01-01\", \"2020-08-01\"),\n",
    "            },\n",
    "        },\n",
    "    },\n",
    "}\n",
    "\n",
    "# model initiaiton\n",
    "model = init_instance_by_config(task[\"model\"])\n",
    "dataset = init_instance_by_config(task[\"dataset\"])\n",
    "\n",
    "# start exp to train model\n",
    "with R.start(experiment_name=\"train_model\"):\n",
    "    R.log_params(**flatten_dict(task))\n",
    "    model.fit(dataset)\n",
    "    R.save_objects(trained_model=model)\n",
    "    rid = R.get_recorder().id\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# prediction, backtest & analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "###################################\n",
    "# prediction, backtest & analysis\n",
    "###################################\n",
    "port_analysis_config = {\n",
    "    \"executor\": {\n",
    "        \"class\": \"SimulatorExecutor\",\n",
    "        \"module_path\": \"qlib.backtest.executor\",\n",
    "        \"kwargs\": {\n",
    "            \"time_per_step\": \"day\",\n",
    "            \"generate_portfolio_metrics\": True,\n",
    "        },\n",
    "    },\n",
    "    \"strategy\": {\n",
    "        \"class\": \"TopkDropoutStrategy\",\n",
    "        \"module_path\": \"qlib.contrib.strategy.model_strategy\",\n",
    "        \"kwargs\": {\n",
    "            \"model\": model,\n",
    "            \"dataset\": dataset,\n",
    "            \"topk\": 50,\n",
    "            \"n_drop\": 5,\n",
    "        },\n",
    "    },\n",
    "    \"backtest\": {\n",
    "        \"start_time\": \"2017-01-01\",\n",
    "        \"end_time\": \"2020-08-01\",\n",
    "        \"account\": 100000000,\n",
    "        \"benchmark\": benchmark,\n",
    "        \"exchange_kwargs\": {\n",
    "            \"freq\": \"day\",\n",
    "            \"limit_threshold\": 0.095,\n",
    "            \"deal_price\": \"close\",\n",
    "            \"open_cost\": 0.0005,\n",
    "            \"close_cost\": 0.0015,\n",
    "            \"min_cost\": 5,\n",
    "        },\n",
    "    },\n",
    "}\n",
    "\n",
    "# backtest and analysis\n",
    "with R.start(experiment_name=\"backtest_analysis\"):\n",
    "    recorder = R.get_recorder(recorder_id=rid, experiment_name=\"train_model\")\n",
    "    model = recorder.load_object(\"trained_model\")\n",
    "\n",
    "    # prediction\n",
    "    recorder = R.get_recorder()\n",
    "    ba_rid = recorder.id\n",
    "    sr = SignalRecord(model, dataset, recorder)\n",
    "    sr.generate()\n",
    "\n",
    "    # backtest & analysis\n",
    "    par = PortAnaRecord(recorder, port_analysis_config, \"day\")\n",
    "    par.generate()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# analyze graphs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from qlib.contrib.report import analysis_model, analysis_position\n",
    "from qlib.data import D\n",
    "recorder = R.get_recorder(recorder_id=ba_rid, experiment_name=\"backtest_analysis\")\n",
    "print(recorder)\n",
    "pred_df = recorder.load_object(\"pred.pkl\")\n",
    "pred_df_dates = pred_df.index.get_level_values(level='datetime')\n",
    "report_normal_df = recorder.load_object(\"portfolio_analysis/report_normal_1day.pkl\")\n",
    "positions = recorder.load_object(\"portfolio_analysis/positions_normal_1day.pkl\")\n",
    "analysis_df = recorder.load_object(\"portfolio_analysis/port_analysis_1day.pkl\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## analysis position"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "analysis_position.report_graph(report_normal_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### risk analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "analysis_position.risk_analysis_graph(analysis_df, report_normal_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## analysis model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_df = dataset.prepare(\"test\", col_set=\"label\")\n",
    "label_df.columns = ['label']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### score IC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_label = pd.concat([label_df, pred_df], axis=1, sort=True).reindex(label_df.index)\n",
    "analysis_position.score_ic_graph(pred_label)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### model performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "analysis_model.model_performance_graph(pred_label)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}