1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-05 20:11:08 +08:00

Connect TrainTask & Rolling & DDG-DA (#1599)

* Connect train task to ddg-da & rolling

* Pylint & black formatting

* Formatting
This commit is contained in:
you-n-g
2023-07-17 09:58:58 +08:00
committed by GitHub
parent 5e0873ca81
commit 1c9841b15e
14 changed files with 584 additions and 356 deletions

View File

@@ -15,4 +15,6 @@ MAX_TOKENS=1600
MAX_RETRY=120
CONTINOUS_MODE=True
DEBUG_MODE=True
DEBUG_MODE=True
# TEMPERATURE=

View File

@@ -10,4 +10,4 @@ def get_finco_path() -> Path:
return the template path
Because the template path is located in the folder. We don't know where it is located. So __file__ for this module will be used.
"""
return DIRNAME
return DIRNAME

View File

@@ -1,5 +1,5 @@
# TODO: use pydantic for other modules in Qlib
from pydantic import BaseSettings
# from pydantic_settings import BaseSettings
from qlib.finco.utils import SingletonBaseClass
import os

97
qlib/finco/context.py Normal file
View File

@@ -0,0 +1,97 @@
from dataclasses import dataclass, field
import copy
from pathlib import Path
from typing import Optional, List
from qlib.finco.log import FinCoLog
from qlib.typehint import Literal
from qlib.finco.utils import similarity
@dataclass
class Design:
plan: str
classes: str
decision: str
@dataclass
class Exp:
"""Experiment"""
# compoments
dataset: Optional[Design] = None
datahandler: Optional[Design] = None
model: Optional[Design] = None
record: Optional[Design] = None
strategy: Optional[Design] = None
backtest: Optional[Design] = None
# basic
template: Optional[Path] = None
# rolling strategy. None indicates no rolling
rolling: Optional[Literal["base", "ddgda"]] = None
@dataclass
class StructContext:
"""Part of the context have clear meaning and structure, so they will be saved here and can be easily retrieved and understood"""
# TODO: move more content in WorkflowContextManager.context to here
workspace: Path
exp_list: List[Exp] = field(default_factory=list) # the planned experiments
class WorkflowContextManager:
"""Context Manager stores the context of the workflow"""
"""All context are key value pairs which saves the input, output and status of the whole workflow"""
def __init__(self, workspace: Path) -> None:
self.context = {}
self.logger = FinCoLog()
# this context is public
self.struct_context = StructContext(workspace) # TODO: move more content in context to here
self.set_context("workspace", workspace) # TODO: remove me
def set_context(self, key, value):
if key in self.context:
self.logger.warning("The key already exists in the context, the value will be overwritten")
self.context[key] = value
def get_context(self, key):
# NOTE: if the key doesn't exist, return None. In the future, we may raise an error to detect abnormal behavior
if key not in self.context:
self.logger.warning("The key doesn't exist in the context")
return None
return self.context[key]
def update_context(self, key, new_value):
# NOTE: if the key doesn't exist, return None. In the future, we may raise an error to detect abnormal behavior
if key not in self.context:
self.logger.warning("The key doesn't exist in the context")
self.context.update({key: new_value})
def get_all_context(self):
"""return a deep copy of the context"""
"""TODO: do we need to return a deep copy?"""
return copy.deepcopy(self.context)
def retrieve(self, query: str) -> dict:
if query in self.context.keys():
return {query: self.context.get(query)}
# Note: retrieve information from context by string similarity maybe abandon in future
scores = {}
for k, v in self.context.items():
scores.update({k: max(similarity(query, k), similarity(query, v))})
max_score_key = max(scores, key=scores.get)
return {max_score_key: self.context.get(max_score_key)}
def clear(self, reserve: list = None):
if reserve is None:
reserve = []
_context = {k: self.get_context(k) for k in reserve}
self.context = _context

View File

@@ -65,6 +65,7 @@ class YamlStorage(Storage):
This class is responsible for storage and loading of Knowledge related data in yaml format.
"""
DEFAULT_NAME = "storage.yml"
def __init__(self, path: Union[str, Path]):
@@ -82,7 +83,7 @@ class YamlStorage(Storage):
def save(self, **kwargs):
"""use pickle as the default save method"""
Path.mkdir(self.path.parent, exist_ok=True)
with open(self.path, 'w') as f:
with open(self.path, "w") as f:
yaml.dump(self.documents, f)
@@ -190,9 +191,14 @@ class ExperimentKnowledge(Knowledge):
def brief(self):
docs = []
for recorder in self.storage.recs:
docs.append({"exp_name": self.storage.exp.name, "record_info": recorder.info,
"config": recorder.load_object("config"),
"context_summary": recorder.load_object("context_summary")})
docs.append(
{
"exp_name": self.storage.exp.name,
"record_info": recorder.info,
"config": recorder.load_object("config"),
"context_summary": recorder.load_object("context_summary"),
}
)
return docs
@@ -295,7 +301,7 @@ class InfrastructureKnowledge(Knowledge):
"""
functions = []
for py_file_path in Path(directory).rglob('*.py'):
for py_file_path in Path(directory).rglob("*.py"):
for _functions in self.get_functions_with_docstrings(py_file_path):
functions.append(_functions)
@@ -314,7 +320,7 @@ class InfrastructureKnowledge(Knowledge):
docstring = None
for line in lines:
if line.strip().startswith("def ") or line.strip().startswith("class "):
func = line.strip().split(' ')[1].split('(')[0]
func = line.strip().split(" ")[1].split("(")[0]
if func.startswith("__"):
continue
if current_func is not None:
@@ -336,7 +342,6 @@ class InfrastructureKnowledge(Knowledge):
class Topic:
def __init__(self, name: str, describe: Template):
self.name = name
self.describe = describe
@@ -347,9 +352,7 @@ class Topic:
def summarize(self, docs: list):
self.logger.info(f"Summarize topic: \nname: {self.name}\ndescribe: {self.describe.module}")
prompt_workflow_selection = self.describe.render(docs=docs)
response = APIBackend().build_messages_and_create_chat_completion(
user_prompt=prompt_workflow_selection
)
response = APIBackend().build_messages_and_create_chat_completion(user_prompt=prompt_workflow_selection)
self.knowledge = response
self.docs = docs
@@ -395,22 +398,26 @@ class KnowledgeBase:
def load_practice_knowledge(self, path: Path) -> PracticeKnowledge:
self.practice_knowledge = PracticeKnowledge(
YamlStorage(path.joinpath(f"{self.KT_PRACTICE}/{YamlStorage.DEFAULT_NAME}")))
YamlStorage(path.joinpath(f"{self.KT_PRACTICE}/{YamlStorage.DEFAULT_NAME}"))
)
return self.practice_knowledge
def load_execute_knowledge(self, path: Path) -> ExecuteKnowledge:
self.execute_knowledge = ExecuteKnowledge(
YamlStorage(path.joinpath(f"{self.KT_EXECUTE}/{YamlStorage.DEFAULT_NAME}")))
YamlStorage(path.joinpath(f"{self.KT_EXECUTE}/{YamlStorage.DEFAULT_NAME}"))
)
return self.execute_knowledge
def load_finance_knowledge(self, path: Path) -> FinanceKnowledge:
self.finance_knowledge = FinanceKnowledge(
YamlStorage(path.joinpath(f"{self.KT_FINANCE}/{YamlStorage.DEFAULT_NAME}")))
YamlStorage(path.joinpath(f"{self.KT_FINANCE}/{YamlStorage.DEFAULT_NAME}"))
)
return self.finance_knowledge
def load_infrastructure_knowledge(self, path: Path) -> InfrastructureKnowledge:
self.infrastructure_knowledge = InfrastructureKnowledge(
YamlStorage(path.joinpath(f"{self.KT_INFRASTRUCTURE}/{YamlStorage.DEFAULT_NAME}")))
YamlStorage(path.joinpath(f"{self.KT_INFRASTRUCTURE}/{YamlStorage.DEFAULT_NAME}"))
)
return self.infrastructure_knowledge
def get_knowledge(self, knowledge_type: str = None):
@@ -423,8 +430,12 @@ class KnowledgeBase:
elif knowledge_type == self.KT_INFRASTRUCTURE:
knowledge = self.infrastructure_knowledge.knowledge
else:
knowledge = self.execute_knowledge.knowledge + self.practice_knowledge.knowledge \
+ self.finance_knowledge.knowledge + self.infrastructure_knowledge.knowledge
knowledge = (
self.execute_knowledge.knowledge
+ self.practice_knowledge.knowledge
+ self.finance_knowledge.knowledge
+ self.infrastructure_knowledge.knowledge
)
return knowledge
def query(self, knowledge_type: str = None, content: str = None, n: int = 5):
@@ -450,8 +461,9 @@ class KnowledgeBase:
similar_n_docs = [knowledge[i] for i in similar_n_indexes]
prompt = Template(
"""find the most relevant doc with this query: '{{content}}' from docs='{{docs}}'.
Just return the most relevant item I provided, no more explain. For example: {'function': 'config.resolve_path', 'docstring': None}""")
"""find the most relevant doc with this query: '{{content}}' from docs='{{docs}}'.
Just return the most relevant item I provided, no more explain. For example: {'function': 'config.resolve_path', 'docstring': None}"""
)
prompt_workflow_selection = prompt.render(content=content, docs=similar_n_docs)
response = APIBackend().build_messages_and_create_chat_completion(
user_prompt=prompt_workflow_selection, system_prompt="You are an excellent assistant."

View File

@@ -43,7 +43,7 @@ class APIBackend(SingletonBaseClass):
"content": system_prompt,
}
]
messages.extend(former_messages[-1*cfg.max_past_message_include:])
messages.extend(former_messages[-1 * cfg.max_past_message_include :])
messages.append(
{
"role": "user",
@@ -82,7 +82,6 @@ class APIBackend(SingletonBaseClass):
temperature: float = None,
max_tokens: Optional[int] = None,
) -> str:
if self.debug_mode:
key = json.dumps(messages)
if key in self.cache:

View File

@@ -12,6 +12,7 @@ class LogColors:
"""
ANSI color codes for use in console output.
"""
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
@@ -58,8 +59,11 @@ def formatting_log(logger, title="Info"):
a context manager, print liens before and after a function
"""
length = {"Start": 120, "Task": 120, "Info": 60, "Interact": 60, "End": 120}.get(title, 60)
color, bold = (LogColors.YELLOW, LogColors.BOLD) \
if title in ["Start", "Task", "Info", "Interact", "End"] else (LogColors.CYAN, "")
color, bold = (
(LogColors.YELLOW, LogColors.BOLD)
if title in ["Start", "Task", "Info", "Interact", "End"]
else (LogColors.CYAN, "")
)
logger.info("")
logger.info(f"{color}{bold}{'-'} {title} {'-' * (length - len(title))}{LogColors.END}")
yield
@@ -99,12 +103,12 @@ class FinCoLog(SingletonBaseClass):
f"{LogColors.MAGENTA}{LogColors.BOLD}Role:{LogColors.END} "
f"{LogColors.CYAN}{m['role']}{LogColors.END}\n"
+ f"{LogColors.MAGENTA}{LogColors.BOLD}Content:{LogColors.END} "
f"{LogColors.CYAN}{m['content']}{LogColors.END}\n")
f"{LogColors.CYAN}{m['content']}{LogColors.END}\n"
)
def log_response(self, response: str):
with formatting_log(self.logger, "GPT Response"):
self.logger.info(
f"{LogColors.CYAN}{response}{LogColors.END}\n")
self.logger.info(f"{LogColors.CYAN}{response}{LogColors.END}\n")
# TODO:
# It looks wierd if we only have logger
@@ -118,14 +122,13 @@ class FinCoLog(SingletonBaseClass):
def plain_info(self, *args):
for arg in args:
self.logger.info(
f"{LogColors.YELLOW}{LogColors.BOLD}Info:{LogColors.END}{LogColors.WHITE}{arg}{LogColors.END}")
f"{LogColors.YELLOW}{LogColors.BOLD}Info:{LogColors.END}{LogColors.WHITE}{arg}{LogColors.END}"
)
def warning(self, *args):
for arg in args:
self.logger.warning(
f"{LogColors.BLUE}{LogColors.BOLD}Warning:{LogColors.END}{arg}")
self.logger.warning(f"{LogColors.BLUE}{LogColors.BOLD}Warning:{LogColors.END}{arg}")
def error(self, *args):
for arg in args:
self.logger.error(
f"{LogColors.RED}{LogColors.BOLD}Error:{LogColors.END}{arg}")
self.logger.error(f"{LogColors.RED}{LogColors.BOLD}Error:{LogColors.END}{arg}")

View File

@@ -10,8 +10,9 @@ from qlib.finco import get_finco_path
class PromptTemplate(SingletonBaseClass):
def __init__(self) -> None:
super().__init__()
_template = yaml.load(open(Path.joinpath(get_finco_path(), "prompt_template.yaml"), "r"),
Loader=yaml.FullLoader)
_template = yaml.load(
open(Path.joinpath(get_finco_path(), "prompt_template.yaml"), "r"), Loader=yaml.FullLoader
)
for k, v in _template.items():
if k == "mods":
continue
@@ -28,5 +29,5 @@ class PromptTemplate(SingletonBaseClass):
file_path = Path(file_path)
Path.mkdir(file_path.parent, exist_ok=True)
with open(file_path, 'w') as f:
with open(file_path, "w") as f:
yaml.dump(self.__dict__, f)

File diff suppressed because it is too large Load Diff

View File

@@ -10,13 +10,25 @@ data_handler_config: &data_handler_config
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors:
- class: RobustZScoreNorm
kwargs:
fields_group: feature
clip_outlier: true
- class: Fillna
kwargs:
fields_group: feature
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
model: <MODEL>
dataset: <DATASET>
signal: <PRED>
topk: 50
n_drop: 5
backtest:
@@ -32,18 +44,11 @@ port_analysis_config: &port_analysis_config
min_cost: 5
task:
model:
class: LGBModel
module_path: qlib.contrib.model.gbdt
class: LinearModel
module_path: qlib.contrib.model.linear
kwargs:
loss: mse
colsample_bytree: 0.8879
learning_rate: 0.2
subsample: 0.8789
lambda_l1: 205.6999
lambda_l2: 580.9768
max_depth: 8
num_leaves: 210
num_threads: 20
estimator: ridge
alpha: 0.05
dataset:
class: DatasetH
module_path: qlib.data.dataset
@@ -65,7 +70,7 @@ task:
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ana_long_short: True
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp

View File

@@ -20,6 +20,7 @@ class SingletonBaseClass(metaclass=SingletonMeta):
This class becomes necessary
"""
# TODO: Add move this class to Qlib's general utils.
@@ -42,4 +43,4 @@ def similarity(text1, text2):
def random_string(length=10):
letters = string.ascii_letters + string.digits
return ''.join(random.choice(letters) for i in range(length))
return "".join(random.choice(letters) for i in range(length))

View File

@@ -1,67 +1,40 @@
import sys
import copy
import shutil
from pathlib import Path
from typing import List
from qlib.finco.task import HighLevelPlanTask, SummarizeTask, TrainTask
from pathlib import Path
from qlib.finco.task import HighLevelPlanTask, SummarizeTask
from qlib.finco.prompt_template import PromptTemplate, Template
from qlib.finco.log import FinCoLog, LogColors
from qlib.finco.utils import similarity
from qlib.finco.llm import APIBackend
from qlib.finco.conf import Config
from qlib.finco.knowledge import KnowledgeBase, Topic
from qlib.finco.context import WorkflowContextManager
class WorkflowContextManager:
"""Context Manager stores the context of the workflow"""
"""All context are key value pairs which saves the input, output and status of the whole workflow"""
def __init__(self) -> None:
self.context = {}
self.logger = FinCoLog()
def set_context(self, key, value):
if key in self.context:
self.logger.warning("The key already exists in the context, the value will be overwritten")
self.context[key] = value
def get_context(self, key):
# NOTE: if the key doesn't exist, return None. In the future, we may raise an error to detect abnormal behavior
if key not in self.context:
self.logger.warning("The key doesn't exist in the context")
return None
return self.context[key]
def update_context(self, key, new_value):
# NOTE: if the key doesn't exist, return None. In the future, we may raise an error to detect abnormal behavior
if key not in self.context:
self.logger.warning("The key doesn't exist in the context")
self.context.update({key: new_value})
def get_all_context(self):
"""return a deep copy of the context"""
"""TODO: do we need to return a deep copy?"""
return copy.deepcopy(self.context)
def retrieve(self, query: str) -> dict:
if query in self.context.keys():
return {query: self.context.get(query)}
# Note: retrieve information from context by string similarity maybe abandon in future
scores = {}
for k, v in self.context.items():
scores.update({k: max(similarity(query, k), similarity(query, v))})
max_score_key = max(scores, key=scores.get)
return {max_score_key: self.context.get(max_score_key)}
def clear(self, reserve: list = None):
if reserve is None:
reserve = []
_context = {k: self.get_context(k) for k in reserve}
self.context = _context
# TODO: it is not necessary in current phase
# class TaskDAG:
# """
# This is a Task manager. it maintains a graph and a stack stucture to manager the task
# The reason why the DGA relationship is maintained outside instead of inside the task is that
# - To make the creating of task simpler(user don't have to care about the relation-ship)
# - To manage the relation ship when poping and executing the tasks is relatively easier instead of scattering them everywhere
# """
# def __init__(self) -> None:
# self._finished = []
# self._stack = []
# self._dag = defaultdict(list) # from id(object) -> list of id(object)
#
# def pop(self):
# return self._stack.pop(0)
#
# def push(self, task: Union[Task, List[Task]], parent: Optional[Task] = None):
# if isinstance(task, Task):
# task = [task]
# if parent is not None:
# self._dag
#
# def done(self) -> bool:
# return len(self._stack) == 0
class WorkflowManager:
@@ -78,8 +51,7 @@ class WorkflowManager:
self._confirm_and_rm()
self.prompt_template = PromptTemplate()
self.context = WorkflowContextManager()
self.context.set_context("workspace", self._workspace)
self.context = WorkflowContextManager(workspace=self._workspace)
self.default_user_prompt = "Please help me build a low turnover strategy that focus more on longterm return in China A csi300. Please help to use lightgbm model."
def _confirm_and_rm(self):
@@ -92,7 +64,8 @@ class WorkflowManager:
f"If you do not need to delete {self._workspace},"
f" please change the workspace dir or rename existing files\n"
f"Are you sure you want to delete, yes(Y/y), no (N/n):",
color=LogColors.WHITE)
color=LogColors.WHITE,
)
)
if str(flag) not in ["Y", "y"]:
sys.exit()
@@ -150,10 +123,12 @@ class WorkflowManager:
# TODO: sort the task list based on the priority of the task
# task_list = sorted(task_list, key=lambda x: x.task_type)
t = task_list.pop(0)
self.logger.info(f"Task finished: {[str(task) for task in task_finished]}",
f"Task in queue: {task_list_info}",
f"Executing task: {str(t)}",
title="Task")
self.logger.info(
f"Task finished: {[str(task) for task in task_finished]}",
f"Task in queue: {task_list_info}",
f"Executing task: {str(t)}",
title="Task",
)
t.assign_context_manager(self.context)
res = t.execute()
@@ -174,9 +149,10 @@ class LearnManager:
self.epoch = 0
self.wm = WorkflowManager()
self.topics = [Topic(name=topic, describe=self.wm.prompt_template.get(f"Topic_{topic}")) for topic in
self.__DEFAULT_TOPICS]
self.knowledge_base = KnowledgeBase(workdir=Path.cwd().joinpath('knowledge'))
self.topics = [
Topic(name=topic, describe=self.wm.prompt_template.get(f"Topic_{topic}")) for topic in self.__DEFAULT_TOPICS
]
self.knowledge_base = KnowledgeBase(workdir=Path.cwd().joinpath("knowledge"))
self.knowledge_base.execute_knowledge.add([])
self.knowledge_base.query(knowledge_type="infrastructure", content="resolve_path")
@@ -209,14 +185,17 @@ class LearnManager:
for task in task_finished:
prompt_workflow_selection = self.wm.prompt_template.get(f"{self.__class__.__name__}_user").render(
summary=summary, brief=knowledge_of_topics,
summary=summary,
brief=knowledge_of_topics,
task_finished=[str(t) for t in task_finished],
task=task.__class__.__name__, system=task.system.render(), user_prompt=user_prompt
task=task.__class__.__name__,
system=task.system.render(),
user_prompt=user_prompt,
)
response = APIBackend().build_messages_and_create_chat_completion(
user_prompt=prompt_workflow_selection,
system_prompt=self.wm.prompt_template.get(f"{self.__class__.__name__}_system").render()
system_prompt=self.wm.prompt_template.get(f"{self.__class__.__name__}_system").render(),
)
# todo: response assertion

View File

@@ -12,4 +12,4 @@ if [ -e $DIR/cridential.sh ]; then
fi
# run the command
python -m qlib.finco.cli "please help me build a low turnover strategy that focus more on longterm return"
python -m qlib.finco.cli "build an A-share stock market daily portfolio in quantitative investment and minimize the maximum drawdown."

View File

@@ -176,11 +176,12 @@ setup(
],
"finco": [
# finco is not necessary for all Qlib users; So a single require section is used for it.
"openapi",
"openai",
"pydantic", # Please add it to basic requirements after the design of pydantic is state.
"pydantic-settings",
"python-dotenv", # I don't think this is necessary if we use pydantic.
"fuzzywuzzy",
"python-Levenshtein" # not necessary but accelerate fuzzywuzzy calculation
"python-Levenshtein", # not necessary but accelerate fuzzywuzzy calculation
],
},
include_package_data=True,