refine code to use qrun

2026-07-03 11:00:57 +08:00 · 2023-07-03 17:56:22 +08:00
parent 9119bcdd3c
commit 6cb87ecfd1
4 changed files with 113 additions and 21 deletions
--- a/qlib/finco/data_cache_demo.py
+++ b/qlib/finco/data_cache_demo.py
@@ -0,0 +1,53 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""
+    The motivation of this demo
+    - To show that the data modules of Qlib are serializable: users can dump processed data to disk to avoid duplicated data preprocessing
+"""
+
+from copy import deepcopy
+from pathlib import Path
+import pickle
+from pprint import pprint
+import subprocess
+import yaml
+from qlib.log import TimeInspector
+
+from qlib import init
+from qlib.data.dataset.handler import DataHandlerLP
+from qlib.utils import init_instance_by_config
+
+# For portability, the workspace path is built relative to this file's location
+DIRNAME = Path(__file__).absolute().resolve().parent / "finco_workspace"
+
+if __name__ == "__main__":
+    init()
+
+    config_path = DIRNAME / "workflow_config.yaml"  # workspace-relative instead of a machine-specific absolute path
+
+    # 1) baseline: time a plain `qrun` without handler cache (currently disabled)
+    # with TimeInspector.logt("The original time without handler cache:"):
+    #     subprocess.run(f"qrun {config_path}", shell=True)
+
+    # 2) dump handler (instantiation and dump are currently disabled; only the target path is computed)
+    task_config = yaml.safe_load(config_path.read_text())  # read_text closes the file handle
+    # hd_conf = task_config["task"]["dataset"]["kwargs"]["handler"]
+    # pprint(hd_conf)
+    # hd: DataHandlerLP = init_instance_by_config(hd_conf)
+    hd_path = DIRNAME / "handler.pkl"
+    # hd.to_pickle(hd_path, dump_all=True)
+
+    # 3) create new task whose handler is a file:// reference to the pickled handler
+    new_task_config = deepcopy(task_config)
+    new_task_config["task"]["dataset"]["kwargs"]["handler"] = f"file://{hd_path}"
+    new_task_config["sys"] = {"path": [str(config_path.parent.resolve())]}
+    new_task_path = DIRNAME / "workflow_config.yaml"
+    print("The location of the new task", new_task_path)
+
+    # save new task; NOTE: new_task_path equals config_path, so the loaded config is overwritten in place
+    with new_task_path.open("w") as f:
+        yaml.safe_dump(new_task_config, f, indent=4, sort_keys=False)
+
+    # 4) train model with new task (currently disabled)
+    # with TimeInspector.logt("The time for task with handler cache:"):
+    #     subprocess.run(f"qrun {new_task_path}", shell=True)
--- a/qlib/finco/llm.py
+++ b/qlib/finco/llm.py
@@ -62,7 +62,7 @@ class APIBackend(Singleton):
            try:
                response = self.create_chat_completion(**kwargs)
                return response
-            except (openai.error.RateLimitError, openai.error.Timeout) as e:
+            except (openai.error.RateLimitError, openai.error.Timeout, openai.error.APIError) as e:
                print(e)
                print(f"Retrying {i+1}th time...")
                time.sleep(1)
--- a/qlib/finco/prompt_template.yaml
+++ b/qlib/finco/prompt_template.yaml
@@ -126,7 +126,9 @@ HyperparameterActionTask_system : |-
  The user has provided the requirements, chose the predefined classes and made plan and reason to each component. You should strictly follow user's choice and you should provide the reason of your hyperparameter choices if exist and some suggestion if the user wants to finetune the hyperparameters after the hyperparameter. 

  You only need to response the hyperparameters in the exact format in exsample below with no explanation or conversation. "Hyperparameters:", "Reason:", "Improve suggestion:" are key tags so always include them in response.
-  {% if target_module == "DataHandler" %}
+  {% if target_module == "Dataset" %}
+  Caution, if the user chose {qlib.data.dataset}-{DatasetH}, always remember to set hyperparameter: {segments}!
+  {% elif target_module == "DataHandler" %}
  Qlib has these processors {processor_name}-{hyperparameter kwargs}: 
  {DropnaProcessor}-{['fields_group']},{DropnaLabel}-{['fields_group']},{CSRankNorm}-{['fields_group']},{ProcessInf}-{[]},{Processor}-{[]},{MinMaxNorm}-{['fit_start_time', 'fit_end_time', 'fields_group']},{CSZFillna}-{['fields_group']},{TanhProcess}-{[]},{CSZScoreNorm}-{['fields_group', 'method']},{RobustZScoreNorm}-{['fit_start_time', 'fit_end_time', 'fields_group', 'clip_outlier']},{FilterCol}-{['fields_group', 'col_list']},{HashStockFormat}-{[]},{ZScoreNorm}-{['fit_start_time', 'fit_end_time', 'fields_group']},{DropCol}-{['col_list']},{Fillna}-{['fields_group', 'fill_value']}. 
  You can choose some of them to use in {infer_processors} or {learn_processors} if necessary and pick the kwargs of them.
@@ -342,6 +344,9 @@ ConfigActionTask_system: |-
  {%for module_path, class_name in classes%}{% raw %}{{% endraw %}{{module_path}}{% raw %}}{% endraw %}-{% raw %}{{% endraw %}{{class_name}}{% raw %}}{% endraw %}.{% endfor %} and you have decided all the hyperparameters.
  {% endif %}

+  The predefined classes and user's hint are hard requirements, you should copy them to your answer with no modification to avoid errors!
+  "```yaml(.*)" and "```" are key tags in response, always include them in your response!
+
  Default in user's hyperparameter means using default value in Qlib code. So always remember to avoid puting them in the config and delete this key in yaml string!!!
  You only output the target component part of the config, Don't output all the config file!!!

@@ -352,6 +357,7 @@ ConfigActionTask_system: |-
  4. each predefined class's hyperparameter to initialize the class

  You will response the YAML config with no explanation and interaction.
+  Most importantly, always make sure the yaml string in your response can be converted to a yaml object without any format issue!

  Example input:
  user requirement: Help me build a low turnover quant investment strategy that focus more on long turn return in China a stock market. I want to use a big LSTM model and add several MLP layer before the head.
@@ -441,7 +447,7 @@ ConfigActionTask_system: |-
  target component: Backtest
  {% endif %}
  Example output:
-  """yaml{% if target_module == "Dataset" %}
+  ```yaml{% if target_module == "Dataset" %}
  dataset:
    class: DatasetH
    module_path: qlib.data.dataset
@@ -511,7 +517,7 @@ ConfigActionTask_system: |-
        open_cost: 0.0005
        close_cost: 0.0015
        min_cost: 5
-  {% endif %}"""
+  {% endif %}```

 ConfigActionTask_user: |-
  user requirement: {{user_requirement}}
--- a/qlib/finco/task.py
+++ b/qlib/finco/task.py
@@ -262,8 +262,27 @@ class TrainTask(Task):
        if confirm is False:
            return []

-        command = f"qrun {workflow_path}"
-        self._output = subprocess.check_output(command, shell=True, cwd=workspace)
+        command = ["qrun", str(workflow_path)]
+        try:
+            # Run the command and capture the output
+            workspace = self._context_manager.get_context("workspace")
+            result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, text=True, cwd=str(workspace))
+        
+        except subprocess.CalledProcessError as e:
+            print(f"An error occurred while running the subprocess: {e.stderr} {e.stdout}")
+            real_error = e.stderr+e.stdout
+            if "model" in  e.stdout.lower():
+                return [HyperparameterActionTask("Model", regenerate=True, error=real_error), ConfigActionTask("Model"), YamlEditTask("Model"), TrainTask()]
+            elif "dataset" in  e.stdout.lower() or "handler" in  e.stdout.lower():
+                return [HyperparameterActionTask("Dataset", regenerate=True, error=real_error), HyperparameterActionTask("DataHandler", regenerate=True, error=real_error), ConfigActionTask("Dataset"), ConfigActionTask("DataHandler"), YamlEditTask("Dataset"), YamlEditTask("DataHandler"), TrainTask()]
+            else:
+                ret_list = []
+                for component in COMPONENT_LIST:
+                    ret_list.append(HyperparameterActionTask(component, regenerate=True, error=real_error))
+                    ret_list.append(ConfigActionTask(component))
+                    ret_list.append(YamlEditTask(component))
+                ret_list.append(TrainTask())
+                return ret_list

        return [AnalysisTask()]

@@ -403,11 +422,12 @@ class DifferentiatedComponentActionTask(ActionTask):


 class HyperparameterActionTask(ActionTask):
-    def __init__(self, component, regenerate=False, error=None) -> None:
+    def __init__(self, component, regenerate=False, error=None, error_type=None) -> None:
        super().__init__()
        self.target_component = component
        self.regenerate = regenerate
        self.error = error
+        self.error_type = error_type

    def execute(self):
        user_prompt = self._context_manager.get_context("user_prompt")
@@ -444,7 +464,10 @@ class HyperparameterActionTask(ActionTask):
        )
        former_messages = []
        if self.regenerate:
-            user_prompt = f"your hyperparameter cannot be initialized, may be caused by wrong format of the value or wrong name or some value is not supported in Qlib.\nPlease rewrite the hyperparameters and answer with exact required format in system prompt and reply with no more explainations.\nThe error message: {self.error}. Please correct the former answer accordingly.\nHyperparameters, Reason and Improve suggestion should always be included."
+            if self.error_type == "yaml":
+                user_prompt = f"your yaml config generated from your hyperparameter is not in the right format.\n The Yaml string generated from the hyperparameters is not in the right format.\nPlease rewrite the hyperparameters and answer with exact required format in system prompt and reply with no more explainations.\nThe error message: {self.error}. Please correct the former answer accordingly.\nHyperparameters, Reason and Improve suggestion should always be included."
+            else:
+                user_prompt = f"your hyperparameter cannot be initialized, may be caused by wrong format of the value or wrong name or some value is not supported in Qlib.\nPlease rewrite the hyperparameters and answer with exact required format in system prompt and reply with no more explainations.\nThe error message: {self.error}. Please correct the former answer accordingly.\nHyperparameters, Reason and Improve suggestion should always be included."
            former_messages = self._context_manager.get_context("chat_history")[self.__class__.__name__][self.target_component][1:]
        response = APIBackend().build_messages_and_create_chat_completion(
            user_prompt, system_prompt, former_messages=former_messages
@@ -472,11 +495,9 @@ class HyperparameterActionTask(ActionTask):


 class ConfigActionTask(ActionTask):
-    def __init__(self, component, reconfig=False, error=None) -> None:
+    def __init__(self, component) -> None:
        super().__init__()
        self.target_component = component
-        self.reconfig = reconfig
-        self.error = error
    
    def execute(self):
        user_prompt = self._context_manager.get_context("user_prompt")
@@ -494,9 +515,9 @@ class ConfigActionTask(ActionTask):
            target_component_hyperparameters=target_component_hyperparameters
        )
        former_messages = []
-        if self.reconfig and user_prompt == self._context_manager.get_context("chat_history")[self.__class__.__name__][self.target_component][-2]["content"]:
-            user_prompt = f"your config cannot be converted to YAML, may be caused by wrong format. Please rewrite the yaml and answer with exact required format in system prompt and reply with no more explainations.\nerror message: {self.error}\n"
-            former_messages = self._context_manager.get_context("chat_history")[self.__class__.__name__][self.target_component][1:]
+        # if self.reconfig and user_prompt == self._context_manager.get_context("chat_history")[self.__class__.__name__][self.target_component][-2]["content"]:
+        #     user_prompt = f"your config cannot be converted to YAML, may be caused by wrong format. Please rewrite the yaml and answer with exact required format in system prompt and reply with no more explainations.\nerror message: {self.error}\n"
+        #     former_messages = self._context_manager.get_context("chat_history")[self.__class__.__name__][self.target_component][1:]
        response = APIBackend().build_messages_and_create_chat_completion(
            user_prompt, system_prompt, former_messages=former_messages
        )
@@ -509,10 +530,13 @@ class ConfigActionTask(ActionTask):
            yaml_config = yaml.safe_load(io.StringIO(config))
        except yaml.YAMLError as e:
            self.logger.info(f"Yaml file is not in the correct format: {e}")
-            return_tasks = [HyperparameterActionTask(self.target_component, regenerate=True, error=str(e)),  ConfigActionTask(self.target_component, reconfig=True, error=str(e))]
+            return_tasks = [HyperparameterActionTask(self.target_component, regenerate=True, error=str(e), error_type="yaml"),  ConfigActionTask(self.target_component)]
            return return_tasks
        
-        if self.target_component == "DataHandler":
+        if self.target_component == "Dataset":
+            if 'handler' in yaml_config["dataset"]:
+                del yaml_config['dataset']['handler']
+        elif self.target_component == "DataHandler":
            for processor in yaml_config['handler']['kwargs']['infer_processors']:
                if "kwargs" in processor and "fields_group" in processor["kwargs"]:
                    del processor["kwargs"]['fields_group']
@@ -520,8 +544,12 @@ class ConfigActionTask(ActionTask):
                if "kwargs" in processor and "fields_group" in processor["kwargs"]:
                    del processor["kwargs"]['fields_group']
            
-            if 'freq' in yaml_config['handler']['kwargs'] and yaml_config['handler']['kwargs']['freq'] == '1d':
-                yaml_config['handler']['kwargs']['freq'] = "day"
+            if 'freq' in yaml_config['handler']['kwargs']:
+                yaml_config['handler']['kwargs']['freq'] = "day" # TODO hot fix freq because no data
+        elif self.target_component == "Record":
+            for record in yaml_config['record']:
+                if record['class'] == 'SigAnaRecord' and 'label_col' in record['kwargs']:
+                    del record['kwargs']["label_col"]
        
        def remove_default(config):
            if isinstance(config, dict):
@@ -688,12 +716,17 @@ class YamlEditTask(ActionTask):
        else:
            real_target_config_key = self.target_config_key

-
-
        # 3) replace the module
        assert isinstance(update_config, dict) and real_target_config_key in update_config, "The config file is not in the correct format"
        assert self.replace_key_value_recursive(target_config, real_target_config_key, update_config[real_target_config_key]), "Replace of the yaml file failed."
-        
+
+        # TODO hotfix for the bug that the record signalrecord config is not updated
+        for record in target_config['task']['record']:
+            if record['class'] == 'SignalRecord':
+                if 'model' in record['kwargs']:
+                    del record['kwargs']["model"]
+                if 'dataset' in record['kwargs']:
+                    del record['kwargs']["dataset"]
        
        # 4) save the config file
        with self.original_config_location.open("w") as f: