mirror of
https://github.com/microsoft/qlib.git
synced 2026-06-06 05:51:17 +08:00
fixed a problem with multi index caused by the default value of groupkey (#1917)
* fixed a problem with multi index caused by the default value of groupkey * modify group_key default value * limit pandas version * format with black * fix docs error * fix docs error * fixed bugs caused by pandas upgrade * remove needless code * reformat with black * limit version & add docs
This commit is contained in:
@@ -462,6 +462,14 @@ python run_all_model.py run 10
|
|||||||
|
|
||||||
It also provides the API to run specific models at once. For more use cases, please refer to the file's [docstrings](examples/run_all_model.py).
|
It also provides the API to run specific models at once. For more use cases, please refer to the file's [docstrings](examples/run_all_model.py).
|
||||||
|
|
||||||
|
### Breaking change
|
||||||
|
In `pandas`, `group_keys` is one of the parameters of the `groupby` method. From version 1.5 to 2.0 of `pandas`, the default value of `group_keys` changed from `no_default` to `True`, which causes qlib to raise errors at runtime. We therefore set `group_keys=False` explicitly, but this does not guarantee that every program will run correctly; programs that may still be affected include:
|
||||||
|
* qlib\examples\rl_order_execution\scripts\gen_training_orders.py
|
||||||
|
* qlib\examples\benchmarks\TRA\src\dataset.MTSDatasetH.py
|
||||||
|
* qlib\examples\benchmarks\TFT\tft.py
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## [Adapting to Market Dynamics](examples/benchmarks_dynamic)
|
## [Adapting to Market Dynamics](examples/benchmarks_dynamic)
|
||||||
|
|
||||||
Due to the non-stationary nature of the financial market environment, the data distribution may change across periods, which causes the performance of models built on training data to decay on future test data.
|
Due to the non-stationary nature of the financial market environment, the data distribution may change across periods, which causes the performance of models built on training data to decay on future test data.
|
||||||
|
|||||||
@@ -599,7 +599,7 @@ class TemporalFusionTransformer:
|
|||||||
print("Getting valid sampling locations.")
|
print("Getting valid sampling locations.")
|
||||||
valid_sampling_locations = []
|
valid_sampling_locations = []
|
||||||
split_data_map = {}
|
split_data_map = {}
|
||||||
for identifier, df in data.groupby(id_col):
|
for identifier, df in data.groupby(id_col, group_key=False):
|
||||||
print("Getting locations for {}".format(identifier))
|
print("Getting locations for {}".format(identifier))
|
||||||
num_entries = len(df)
|
num_entries = len(df)
|
||||||
if num_entries >= self.time_steps:
|
if num_entries >= self.time_steps:
|
||||||
@@ -678,7 +678,7 @@ class TemporalFusionTransformer:
|
|||||||
input_cols = [tup[0] for tup in self.column_definition if tup[2] not in {InputTypes.ID, InputTypes.TIME}]
|
input_cols = [tup[0] for tup in self.column_definition if tup[2] not in {InputTypes.ID, InputTypes.TIME}]
|
||||||
|
|
||||||
data_map = {}
|
data_map = {}
|
||||||
for _, sliced in data.groupby(id_col):
|
for _, sliced in data.groupby(id_col, group_keys=False):
|
||||||
col_mappings = {"identifier": [id_col], "time": [time_col], "outputs": [target_col], "inputs": input_cols}
|
col_mappings = {"identifier": [id_col], "time": [time_col], "outputs": [target_col], "inputs": input_cols}
|
||||||
|
|
||||||
for k in col_mappings:
|
for k in col_mappings:
|
||||||
|
|||||||
@@ -78,13 +78,15 @@ DATASET_SETTING = {
|
|||||||
|
|
||||||
|
|
||||||
def get_shifted_label(data_df, shifts=5, col_shift="LABEL0"):
|
def get_shifted_label(data_df, shifts=5, col_shift="LABEL0"):
|
||||||
return data_df[[col_shift]].groupby("instrument").apply(lambda df: df.shift(shifts))
|
return data_df[[col_shift]].groupby("instrument", group_keys=False).apply(lambda df: df.shift(shifts))
|
||||||
|
|
||||||
|
|
||||||
def fill_test_na(test_df):
|
def fill_test_na(test_df):
|
||||||
test_df_res = test_df.copy()
|
test_df_res = test_df.copy()
|
||||||
feature_cols = ~test_df_res.columns.str.contains("label", case=False)
|
feature_cols = ~test_df_res.columns.str.contains("label", case=False)
|
||||||
test_feature_fna = test_df_res.loc[:, feature_cols].groupby("datetime").apply(lambda df: df.fillna(df.mean()))
|
test_feature_fna = (
|
||||||
|
test_df_res.loc[:, feature_cols].groupby("datetime", group_keys=False).apply(lambda df: df.fillna(df.mean()))
|
||||||
|
)
|
||||||
test_df_res.loc[:, feature_cols] = test_feature_fna
|
test_df_res.loc[:, feature_cols] = test_feature_fna
|
||||||
return test_df_res
|
return test_df_res
|
||||||
|
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ def _create_ts_slices(index, seq_len):
|
|||||||
assert index.is_lexsorted(), "index should be sorted"
|
assert index.is_lexsorted(), "index should be sorted"
|
||||||
|
|
||||||
# number of dates for each code
|
# number of dates for each code
|
||||||
sample_count_by_codes = pd.Series(0, index=index).groupby(level=0).size().values
|
sample_count_by_codes = pd.Series(0, index=index).groupby(level=0, group_keys=False).size().values
|
||||||
|
|
||||||
# start_index for each code
|
# start_index for each code
|
||||||
start_index_of_codes = np.roll(np.cumsum(sample_count_by_codes), 1)
|
start_index_of_codes = np.roll(np.cumsum(sample_count_by_codes), 1)
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ class DayLast(ElemOperator):
|
|||||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||||
_calendar = get_calendar_day(freq=freq)
|
_calendar = get_calendar_day(freq=freq)
|
||||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||||
return series.groupby(_calendar[series.index]).transform("last")
|
return series.groupby(_calendar[series.index], group_keys=False).transform("last")
|
||||||
|
|
||||||
|
|
||||||
class FFillNan(ElemOperator):
|
class FFillNan(ElemOperator):
|
||||||
@@ -44,7 +44,7 @@ class FFillNan(ElemOperator):
|
|||||||
|
|
||||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||||
return series.fillna(method="ffill")
|
return series.ffill()
|
||||||
|
|
||||||
|
|
||||||
class BFillNan(ElemOperator):
|
class BFillNan(ElemOperator):
|
||||||
@@ -63,7 +63,7 @@ class BFillNan(ElemOperator):
|
|||||||
|
|
||||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||||
return series.fillna(method="bfill")
|
return series.bfill()
|
||||||
|
|
||||||
|
|
||||||
class Date(ElemOperator):
|
class Date(ElemOperator):
|
||||||
|
|||||||
@@ -19,9 +19,9 @@ def generate_order(stock: str, start_idx: int, end_idx: int) -> bool:
|
|||||||
|
|
||||||
df["date"] = df["datetime"].dt.date.astype("datetime64")
|
df["date"] = df["datetime"].dt.date.astype("datetime64")
|
||||||
df = df.set_index(["instrument", "datetime", "date"])
|
df = df.set_index(["instrument", "datetime", "date"])
|
||||||
df = df.groupby("date").take(range(start_idx, end_idx)).droplevel(level=0)
|
df = df.groupby("date", group_keys=False).take(range(start_idx, end_idx)).droplevel(level=0)
|
||||||
|
|
||||||
order_all = pd.DataFrame(df.groupby(level=(2, 0)).mean().dropna())
|
order_all = pd.DataFrame(df.groupby(level=(2, 0), group_keys=False).mean().dropna())
|
||||||
order_all["amount"] = np.random.lognormal(-3.28, 1.14) * order_all["$volume0"]
|
order_all["amount"] = np.random.lognormal(-3.28, 1.14) * order_all["$volume0"]
|
||||||
order_all = order_all[order_all["amount"] > 0.0]
|
order_all = order_all[order_all["amount"] > 0.0]
|
||||||
order_all["order_type"] = 0
|
order_all["order_type"] = 0
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ readme = {file = "README.md", content-type = "text/markdown"}
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"pyyaml",
|
"pyyaml",
|
||||||
"numpy",
|
"numpy",
|
||||||
"pandas",
|
"pandas>=0.24",
|
||||||
"mlflow",
|
"mlflow",
|
||||||
"filelock>=3.16.0",
|
"filelock>=3.16.0",
|
||||||
"redis",
|
"redis",
|
||||||
@@ -67,10 +67,13 @@ lint = [
|
|||||||
"flake8",
|
"flake8",
|
||||||
"nbqa",
|
"nbqa",
|
||||||
]
|
]
|
||||||
|
# snowballstemmer, a dependency of sphinx, released version 3.0.0 on 2025-05-08,
|
||||||
|
# which causes errors in the build process, so the version is capped below 3.0 for now.
|
||||||
docs = [
|
docs = [
|
||||||
"sphinx",
|
"sphinx",
|
||||||
"sphinx_rtd_theme",
|
"sphinx_rtd_theme",
|
||||||
"readthedocs_sphinx_ext",
|
"readthedocs_sphinx_ext",
|
||||||
|
"snowballstemmer<3.0",
|
||||||
]
|
]
|
||||||
package = [
|
package = [
|
||||||
"twine",
|
"twine",
|
||||||
|
|||||||
@@ -104,7 +104,7 @@ class PandasQuote(BaseQuote):
|
|||||||
def __init__(self, quote_df: pd.DataFrame, freq: str) -> None:
|
def __init__(self, quote_df: pd.DataFrame, freq: str) -> None:
|
||||||
super().__init__(quote_df=quote_df, freq=freq)
|
super().__init__(quote_df=quote_df, freq=freq)
|
||||||
quote_dict = {}
|
quote_dict = {}
|
||||||
for stock_id, stock_val in quote_df.groupby(level="instrument"):
|
for stock_id, stock_val in quote_df.groupby(level="instrument", group_keys=False):
|
||||||
quote_dict[stock_id] = stock_val.droplevel(level="instrument")
|
quote_dict[stock_id] = stock_val.droplevel(level="instrument")
|
||||||
self.data = quote_dict
|
self.data = quote_dict
|
||||||
|
|
||||||
@@ -137,7 +137,7 @@ class NumpyQuote(BaseQuote):
|
|||||||
"""
|
"""
|
||||||
super().__init__(quote_df=quote_df, freq=freq)
|
super().__init__(quote_df=quote_df, freq=freq)
|
||||||
quote_dict = {}
|
quote_dict = {}
|
||||||
for stock_id, stock_val in quote_df.groupby(level="instrument"):
|
for stock_id, stock_val in quote_df.groupby(level="instrument", group_keys=False):
|
||||||
quote_dict[stock_id] = idd.MultiData(stock_val.droplevel(level="instrument"))
|
quote_dict[stock_id] = idd.MultiData(stock_val.droplevel(level="instrument"))
|
||||||
quote_dict[stock_id].sort_index() # To support more flexible slicing, we must sort data first
|
quote_dict[stock_id].sort_index() # To support more flexible slicing, we must sort data first
|
||||||
self.data = quote_dict
|
self.data = quote_dict
|
||||||
|
|||||||
@@ -311,7 +311,7 @@ class Position(BasePosition):
|
|||||||
freq=freq,
|
freq=freq,
|
||||||
disk_cache=True,
|
disk_cache=True,
|
||||||
).dropna()
|
).dropna()
|
||||||
price_dict = price_df.groupby(["instrument"]).tail(1).reset_index(level=1, drop=True)["$close"].to_dict()
|
price_dict = price_df.groupby(["instrument"], group_keys=False).tail(1)["$close"].to_dict()
|
||||||
|
|
||||||
if len(price_dict) < len(stock_list):
|
if len(price_dict) < len(stock_list):
|
||||||
lack_stock = set(stock_list) - set(price_dict)
|
lack_stock = set(stock_list) - set(price_dict)
|
||||||
|
|||||||
@@ -114,7 +114,11 @@ class PortfolioMetrics:
|
|||||||
_temp_result, _ = get_higher_eq_freq_feature(_codes, fields, start_time, end_time, freq=freq)
|
_temp_result, _ = get_higher_eq_freq_feature(_codes, fields, start_time, end_time, freq=freq)
|
||||||
if len(_temp_result) == 0:
|
if len(_temp_result) == 0:
|
||||||
raise ValueError(f"The benchmark {_codes} does not exist. Please provide the right benchmark")
|
raise ValueError(f"The benchmark {_codes} does not exist. Please provide the right benchmark")
|
||||||
return _temp_result.groupby(level="datetime")[_temp_result.columns.tolist()[0]].mean().fillna(0)
|
return (
|
||||||
|
_temp_result.groupby(level="datetime", group_keys=False)[_temp_result.columns.tolist()[0]]
|
||||||
|
.mean()
|
||||||
|
.fillna(0)
|
||||||
|
)
|
||||||
|
|
||||||
def _sample_benchmark(
|
def _sample_benchmark(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ def _create_ts_slices(index, seq_len):
|
|||||||
assert index.is_monotonic_increasing, "index should be sorted"
|
assert index.is_monotonic_increasing, "index should be sorted"
|
||||||
|
|
||||||
# number of dates for each instrument
|
# number of dates for each instrument
|
||||||
sample_count_by_insts = index.to_series().groupby(level=0).size().values
|
sample_count_by_insts = index.to_series().groupby(level=0, group_keys=False).size().values
|
||||||
|
|
||||||
# start index for each instrument
|
# start index for each instrument
|
||||||
start_index_of_insts = np.roll(np.cumsum(sample_count_by_insts), 1)
|
start_index_of_insts = np.roll(np.cumsum(sample_count_by_insts), 1)
|
||||||
|
|||||||
@@ -55,14 +55,18 @@ class ConfigSectionProcessor(Processor):
|
|||||||
|
|
||||||
# Label
|
# Label
|
||||||
cols = df_focus.columns[df_focus.columns.str.contains("^LABEL")]
|
cols = df_focus.columns[df_focus.columns.str.contains("^LABEL")]
|
||||||
df_focus[cols] = df_focus[cols].groupby(level="datetime").apply(_label_norm)
|
df_focus[cols] = df_focus[cols].groupby(level="datetime", group_keys=False).apply(_label_norm)
|
||||||
|
|
||||||
# Features
|
# Features
|
||||||
cols = df_focus.columns[df_focus.columns.str.contains("^KLEN|^KLOW|^KUP")]
|
cols = df_focus.columns[df_focus.columns.str.contains("^KLEN|^KLOW|^KUP")]
|
||||||
df_focus[cols] = df_focus[cols].apply(lambda x: x**0.25).groupby(level="datetime").apply(_feature_norm)
|
df_focus[cols] = (
|
||||||
|
df_focus[cols].apply(lambda x: x**0.25).groupby(level="datetime", group_keys=False).apply(_feature_norm)
|
||||||
|
)
|
||||||
|
|
||||||
cols = df_focus.columns[df_focus.columns.str.contains("^KLOW2|^KUP2")]
|
cols = df_focus.columns[df_focus.columns.str.contains("^KLOW2|^KUP2")]
|
||||||
df_focus[cols] = df_focus[cols].apply(lambda x: x**0.5).groupby(level="datetime").apply(_feature_norm)
|
df_focus[cols] = (
|
||||||
|
df_focus[cols].apply(lambda x: x**0.5).groupby(level="datetime", group_keys=False).apply(_feature_norm)
|
||||||
|
)
|
||||||
|
|
||||||
_cols = [
|
_cols = [
|
||||||
"KMID",
|
"KMID",
|
||||||
@@ -88,25 +92,35 @@ class ConfigSectionProcessor(Processor):
|
|||||||
]
|
]
|
||||||
pat = "|".join(["^" + x for x in _cols])
|
pat = "|".join(["^" + x for x in _cols])
|
||||||
cols = df_focus.columns[df_focus.columns.str.contains(pat) & (~df_focus.columns.isin(["HIGH0", "LOW0"]))]
|
cols = df_focus.columns[df_focus.columns.str.contains(pat) & (~df_focus.columns.isin(["HIGH0", "LOW0"]))]
|
||||||
df_focus[cols] = df_focus[cols].groupby(level="datetime").apply(_feature_norm)
|
df_focus[cols] = df_focus[cols].groupby(level="datetime", group_keys=False).apply(_feature_norm)
|
||||||
|
|
||||||
cols = df_focus.columns[df_focus.columns.str.contains("^STD|^VOLUME|^VMA|^VSTD")]
|
cols = df_focus.columns[df_focus.columns.str.contains("^STD|^VOLUME|^VMA|^VSTD")]
|
||||||
df_focus[cols] = df_focus[cols].apply(np.log).groupby(level="datetime").apply(_feature_norm)
|
df_focus[cols] = df_focus[cols].apply(np.log).groupby(level="datetime", group_keys=False).apply(_feature_norm)
|
||||||
|
|
||||||
cols = df_focus.columns[df_focus.columns.str.contains("^RSQR")]
|
cols = df_focus.columns[df_focus.columns.str.contains("^RSQR")]
|
||||||
df_focus[cols] = df_focus[cols].fillna(0).groupby(level="datetime").apply(_feature_norm)
|
df_focus[cols] = df_focus[cols].fillna(0).groupby(level="datetime", group_keys=False).apply(_feature_norm)
|
||||||
|
|
||||||
cols = df_focus.columns[df_focus.columns.str.contains("^MAX|^HIGH0")]
|
cols = df_focus.columns[df_focus.columns.str.contains("^MAX|^HIGH0")]
|
||||||
df_focus[cols] = df_focus[cols].apply(lambda x: (x - 1) ** 0.5).groupby(level="datetime").apply(_feature_norm)
|
df_focus[cols] = (
|
||||||
|
df_focus[cols]
|
||||||
|
.apply(lambda x: (x - 1) ** 0.5)
|
||||||
|
.groupby(level="datetime", group_keys=False)
|
||||||
|
.apply(_feature_norm)
|
||||||
|
)
|
||||||
|
|
||||||
cols = df_focus.columns[df_focus.columns.str.contains("^MIN|^LOW0")]
|
cols = df_focus.columns[df_focus.columns.str.contains("^MIN|^LOW0")]
|
||||||
df_focus[cols] = df_focus[cols].apply(lambda x: (1 - x) ** 0.5).groupby(level="datetime").apply(_feature_norm)
|
df_focus[cols] = (
|
||||||
|
df_focus[cols]
|
||||||
|
.apply(lambda x: (1 - x) ** 0.5)
|
||||||
|
.groupby(level="datetime", group_keys=False)
|
||||||
|
.apply(_feature_norm)
|
||||||
|
)
|
||||||
|
|
||||||
cols = df_focus.columns[df_focus.columns.str.contains("^CORR|^CORD")]
|
cols = df_focus.columns[df_focus.columns.str.contains("^CORR|^CORD")]
|
||||||
df_focus[cols] = df_focus[cols].apply(np.exp).groupby(level="datetime").apply(_feature_norm)
|
df_focus[cols] = df_focus[cols].apply(np.exp).groupby(level="datetime", group_keys=False).apply(_feature_norm)
|
||||||
|
|
||||||
cols = df_focus.columns[df_focus.columns.str.contains("^WVMA")]
|
cols = df_focus.columns[df_focus.columns.str.contains("^WVMA")]
|
||||||
df_focus[cols] = df_focus[cols].apply(np.log1p).groupby(level="datetime").apply(_feature_norm)
|
df_focus[cols] = df_focus[cols].apply(np.log1p).groupby(level="datetime", group_keys=False).apply(_feature_norm)
|
||||||
|
|
||||||
df[selected_cols] = df_focus.values
|
df[selected_cols] = df_focus.values
|
||||||
|
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ def calc_long_short_prec(
|
|||||||
long precision and short precision in time level
|
long precision and short precision in time level
|
||||||
"""
|
"""
|
||||||
if is_alpha:
|
if is_alpha:
|
||||||
label = label - label.mean(level=date_col)
|
label = label - label.groupby(level=date_col, group_keys=False).mean()
|
||||||
if int(1 / quantile) >= len(label.index.get_level_values(1).unique()):
|
if int(1 / quantile) >= len(label.index.get_level_values(1).unique()):
|
||||||
raise ValueError("Need more instruments to calculate precision")
|
raise ValueError("Need more instruments to calculate precision")
|
||||||
|
|
||||||
@@ -47,23 +47,25 @@ def calc_long_short_prec(
|
|||||||
if dropna:
|
if dropna:
|
||||||
df.dropna(inplace=True)
|
df.dropna(inplace=True)
|
||||||
|
|
||||||
group = df.groupby(level=date_col)
|
group = df.groupby(level=date_col, group_keys=False)
|
||||||
|
|
||||||
def N(x):
|
def N(x):
|
||||||
return int(len(x) * quantile)
|
return int(len(x) * quantile)
|
||||||
|
|
||||||
# find the top/low quantile of prediction and treat them as long and short target
|
# find the top/low quantile of prediction and treat them as long and short target
|
||||||
long = group.apply(lambda x: x.nlargest(N(x), columns="pred").label).reset_index(level=0, drop=True)
|
long = group.apply(lambda x: x.nlargest(N(x), columns="pred").label)
|
||||||
short = group.apply(lambda x: x.nsmallest(N(x), columns="pred").label).reset_index(level=0, drop=True)
|
short = group.apply(lambda x: x.nsmallest(N(x), columns="pred").label)
|
||||||
|
|
||||||
groupll = long.groupby(date_col)
|
groupll = long.groupby(date_col, group_keys=False)
|
||||||
l_dom = groupll.apply(lambda x: x > 0)
|
l_dom = groupll.apply(lambda x: x > 0)
|
||||||
l_c = groupll.count()
|
l_c = groupll.count()
|
||||||
|
|
||||||
groups = short.groupby(date_col)
|
groups = short.groupby(date_col, group_keys=False)
|
||||||
s_dom = groups.apply(lambda x: x < 0)
|
s_dom = groups.apply(lambda x: x < 0)
|
||||||
s_c = groups.count()
|
s_c = groups.count()
|
||||||
return (l_dom.groupby(date_col).sum() / l_c), (s_dom.groupby(date_col).sum() / s_c)
|
return (l_dom.groupby(date_col, group_keys=False).sum() / l_c), (
|
||||||
|
s_dom.groupby(date_col, group_keys=False).sum() / s_c
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def calc_long_short_return(
|
def calc_long_short_return(
|
||||||
@@ -100,7 +102,7 @@ def calc_long_short_return(
|
|||||||
df = pd.DataFrame({"pred": pred, "label": label})
|
df = pd.DataFrame({"pred": pred, "label": label})
|
||||||
if dropna:
|
if dropna:
|
||||||
df.dropna(inplace=True)
|
df.dropna(inplace=True)
|
||||||
group = df.groupby(level=date_col)
|
group = df.groupby(level=date_col, group_keys=False)
|
||||||
|
|
||||||
def N(x):
|
def N(x):
|
||||||
return int(len(x) * quantile)
|
return int(len(x) * quantile)
|
||||||
@@ -173,8 +175,8 @@ def calc_ic(pred: pd.Series, label: pd.Series, date_col="datetime", dropna=False
|
|||||||
ic and rank ic
|
ic and rank ic
|
||||||
"""
|
"""
|
||||||
df = pd.DataFrame({"pred": pred, "label": label})
|
df = pd.DataFrame({"pred": pred, "label": label})
|
||||||
ic = df.groupby(date_col).apply(lambda df: df["pred"].corr(df["label"]))
|
ic = df.groupby(date_col, group_keys=False).apply(lambda df: df["pred"].corr(df["label"]))
|
||||||
ric = df.groupby(date_col).apply(lambda df: df["pred"].corr(df["label"], method="spearman"))
|
ric = df.groupby(date_col, group_keys=False).apply(lambda df: df["pred"].corr(df["label"], method="spearman"))
|
||||||
if dropna:
|
if dropna:
|
||||||
return ic.dropna(), ric.dropna()
|
return ic.dropna(), ric.dropna()
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ class InternalData:
|
|||||||
|
|
||||||
def _calc_perf(self, pred, label):
|
def _calc_perf(self, pred, label):
|
||||||
df = pd.DataFrame({"pred": pred, "label": label})
|
df = pd.DataFrame({"pred": pred, "label": label})
|
||||||
df = df.groupby("datetime").corr(method="spearman")
|
df = df.groupby("datetime", group_keys=False).corr(method="spearman")
|
||||||
corr = df.loc(axis=0)[:, "pred"]["label"].droplevel(axis=0, level=-1)
|
corr = df.loc(axis=0)[:, "pred"]["label"].droplevel(axis=0, level=-1)
|
||||||
return corr
|
return corr
|
||||||
|
|
||||||
@@ -161,7 +161,7 @@ class MetaTaskDS(MetaTask):
|
|||||||
raise ValueError(f"Most of samples are dropped. Please check this task: {task}")
|
raise ValueError(f"Most of samples are dropped. Please check this task: {task}")
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
d_test.groupby("datetime").size().shape[0] >= 5
|
d_test.groupby("datetime", group_keys=False).size().shape[0] >= 5
|
||||||
), "In this segment, this trading dates is less than 5, you'd better check the data."
|
), "In this segment, this trading dates is less than 5, you'd better check the data."
|
||||||
|
|
||||||
sample_time_belong = np.zeros((d_train.shape[0], time_perf.shape[1]))
|
sample_time_belong = np.zeros((d_train.shape[0], time_perf.shape[1]))
|
||||||
|
|||||||
@@ -125,7 +125,11 @@ class MetaModelDS(MetaTaskModel):
|
|||||||
loss_l.setdefault(phase, []).append(running_loss)
|
loss_l.setdefault(phase, []).append(running_loss)
|
||||||
|
|
||||||
pred_y_all = pd.concat(pred_y_all)
|
pred_y_all = pd.concat(pred_y_all)
|
||||||
ic = pred_y_all.groupby("datetime").apply(lambda df: df["pred"].corr(df["label"], method="spearman")).mean()
|
ic = (
|
||||||
|
pred_y_all.groupby("datetime", group_keys=False)
|
||||||
|
.apply(lambda df: df["pred"].corr(df["label"], method="spearman"))
|
||||||
|
.mean()
|
||||||
|
)
|
||||||
|
|
||||||
R.log_metrics(**{f"loss/{phase}": running_loss, "step": epoch})
|
R.log_metrics(**{f"loss/{phase}": running_loss, "step": epoch})
|
||||||
R.log_metrics(**{f"ic/{phase}": ic, "step": epoch})
|
R.log_metrics(**{f"ic/{phase}": ic, "step": epoch})
|
||||||
|
|||||||
@@ -166,7 +166,7 @@ class DEnsembleModel(Model, FeatureInt):
|
|||||||
|
|
||||||
# calculate weights
|
# calculate weights
|
||||||
h["bins"] = pd.cut(h["h_value"], self.bins_sr)
|
h["bins"] = pd.cut(h["h_value"], self.bins_sr)
|
||||||
h_avg = h.groupby("bins")["h_value"].mean()
|
h_avg = h.groupby("bins", group_keys=False, observed=False)["h_value"].mean()
|
||||||
weights = pd.Series(np.zeros(N, dtype=float))
|
weights = pd.Series(np.zeros(N, dtype=float))
|
||||||
for b in h_avg.index:
|
for b in h_avg.index:
|
||||||
weights[h["bins"] == b] = 1.0 / (self.decay**k_th * h_avg[b] + 0.1)
|
weights[h["bins"] == b] = 1.0 / (self.decay**k_th * h_avg[b] + 0.1)
|
||||||
|
|||||||
@@ -90,8 +90,14 @@ class HFLGBModel(ModelFT, LightGBMFInt):
|
|||||||
if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
|
if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
|
||||||
l_name = df_train["label"].columns[0]
|
l_name = df_train["label"].columns[0]
|
||||||
# Convert label into alpha
|
# Convert label into alpha
|
||||||
df_train["label"][l_name] = df_train["label"][l_name] - df_train["label"][l_name].mean(level=0)
|
df_train.loc[:, ("label", l_name)] = (
|
||||||
df_valid["label"][l_name] = df_valid["label"][l_name] - df_valid["label"][l_name].mean(level=0)
|
df_train.loc[:, ("label", l_name)]
|
||||||
|
- df_train.loc[:, ("label", l_name)].groupby(level=0, group_keys=False).mean()
|
||||||
|
)
|
||||||
|
df_valid.loc[:, ("label", l_name)] = (
|
||||||
|
df_valid.loc[:, ("label", l_name)]
|
||||||
|
- df_valid.loc[:, ("label", l_name)].groupby(level=0, group_keys=False).mean()
|
||||||
|
)
|
||||||
|
|
||||||
def mapping_fn(x):
|
def mapping_fn(x):
|
||||||
return 0 if x < 0 else 1
|
return 0 if x < 0 else 1
|
||||||
|
|||||||
@@ -214,8 +214,10 @@ class ADARNN(Model):
|
|||||||
def calc_all_metrics(pred):
|
def calc_all_metrics(pred):
|
||||||
"""pred is a pandas dataframe that has two attributes: score (pred) and label (real)"""
|
"""pred is a pandas dataframe that has two attributes: score (pred) and label (real)"""
|
||||||
res = {}
|
res = {}
|
||||||
ic = pred.groupby(level="datetime").apply(lambda x: x.label.corr(x.score))
|
ic = pred.groupby(level="datetime", group_keys=False).apply(lambda x: x.label.corr(x.score))
|
||||||
rank_ic = pred.groupby(level="datetime").apply(lambda x: x.label.corr(x.score, method="spearman"))
|
rank_ic = pred.groupby(level="datetime", group_keys=False).apply(
|
||||||
|
lambda x: x.label.corr(x.score, method="spearman")
|
||||||
|
)
|
||||||
res["ic"] = ic.mean()
|
res["ic"] = ic.mean()
|
||||||
res["icir"] = ic.mean() / ic.std()
|
res["icir"] = ic.mean() / ic.std()
|
||||||
res["ric"] = rank_ic.mean()
|
res["ric"] = rank_ic.mean()
|
||||||
|
|||||||
@@ -226,7 +226,7 @@ class ADD(Model):
|
|||||||
|
|
||||||
def get_daily_inter(self, df, shuffle=False):
|
def get_daily_inter(self, df, shuffle=False):
|
||||||
# organize the train data into daily batches
|
# organize the train data into daily batches
|
||||||
daily_count = df.groupby(level=0).size().values
|
daily_count = df.groupby(level=0, group_keys=False).size().values
|
||||||
daily_index = np.roll(np.cumsum(daily_count), 1)
|
daily_index = np.roll(np.cumsum(daily_count), 1)
|
||||||
daily_index[0] = 0
|
daily_index[0] = 0
|
||||||
if shuffle:
|
if shuffle:
|
||||||
@@ -349,7 +349,7 @@ class ADD(Model):
|
|||||||
return best_score
|
return best_score
|
||||||
|
|
||||||
def gen_market_label(self, df, raw_label):
|
def gen_market_label(self, df, raw_label):
|
||||||
market_label = raw_label.groupby("datetime").mean().squeeze()
|
market_label = raw_label.groupby("datetime", group_keys=False).mean().squeeze()
|
||||||
bins = [-np.inf, self.lo, self.hi, np.inf]
|
bins = [-np.inf, self.lo, self.hi, np.inf]
|
||||||
market_label = pd.cut(market_label, bins, labels=False)
|
market_label = pd.cut(market_label, bins, labels=False)
|
||||||
market_label.name = ("market_return", "market_return")
|
market_label.name = ("market_return", "market_return")
|
||||||
@@ -357,7 +357,7 @@ class ADD(Model):
|
|||||||
return df
|
return df
|
||||||
|
|
||||||
def fit_thresh(self, train_label):
|
def fit_thresh(self, train_label):
|
||||||
market_label = train_label.groupby("datetime").mean().squeeze()
|
market_label = train_label.groupby("datetime", group_keys=False).mean().squeeze()
|
||||||
self.lo, self.hi = market_label.quantile([1 / 3, 2 / 3])
|
self.lo, self.hi = market_label.quantile([1 / 3, 2 / 3])
|
||||||
|
|
||||||
def fit(
|
def fit(
|
||||||
|
|||||||
@@ -163,7 +163,7 @@ class GATs(Model):
|
|||||||
|
|
||||||
def get_daily_inter(self, df, shuffle=False):
|
def get_daily_inter(self, df, shuffle=False):
|
||||||
# organize the train data into daily batches
|
# organize the train data into daily batches
|
||||||
daily_count = df.groupby(level=0).size().values
|
daily_count = df.groupby(level=0, group_keys=False).size().values
|
||||||
daily_index = np.roll(np.cumsum(daily_count), 1)
|
daily_index = np.roll(np.cumsum(daily_count), 1)
|
||||||
daily_index[0] = 0
|
daily_index[0] = 0
|
||||||
if shuffle:
|
if shuffle:
|
||||||
|
|||||||
@@ -27,7 +27,9 @@ class DailyBatchSampler(Sampler):
|
|||||||
def __init__(self, data_source):
|
def __init__(self, data_source):
|
||||||
self.data_source = data_source
|
self.data_source = data_source
|
||||||
# calculate number of samples in each batch
|
# calculate number of samples in each batch
|
||||||
self.daily_count = pd.Series(index=self.data_source.get_index()).groupby("datetime").size().values
|
self.daily_count = (
|
||||||
|
pd.Series(index=self.data_source.get_index()).groupby("datetime", group_keys=False).size().values
|
||||||
|
)
|
||||||
self.daily_index = np.roll(np.cumsum(self.daily_count), 1) # calculate begin index of each batch
|
self.daily_index = np.roll(np.cumsum(self.daily_count), 1) # calculate begin index of each batch
|
||||||
self.daily_index[0] = 0
|
self.daily_index[0] = 0
|
||||||
|
|
||||||
@@ -181,7 +183,7 @@ class GATs(Model):
|
|||||||
|
|
||||||
def get_daily_inter(self, df, shuffle=False):
|
def get_daily_inter(self, df, shuffle=False):
|
||||||
# organize the train data into daily batches
|
# organize the train data into daily batches
|
||||||
daily_count = df.groupby(level=0).size().values
|
daily_count = df.groupby(level=0, group_keys=False).size().values
|
||||||
daily_index = np.roll(np.cumsum(daily_count), 1)
|
daily_index = np.roll(np.cumsum(daily_count), 1)
|
||||||
daily_index[0] = 0
|
daily_index[0] = 0
|
||||||
if shuffle:
|
if shuffle:
|
||||||
|
|||||||
@@ -177,7 +177,7 @@ class HIST(Model):
|
|||||||
|
|
||||||
def get_daily_inter(self, df, shuffle=False):
|
def get_daily_inter(self, df, shuffle=False):
|
||||||
# organize the train data into daily batches
|
# organize the train data into daily batches
|
||||||
daily_count = df.groupby(level=0).size().values
|
daily_count = df.groupby(level=0, group_keys=False).size().values
|
||||||
daily_index = np.roll(np.cumsum(daily_count), 1)
|
daily_index = np.roll(np.cumsum(daily_count), 1)
|
||||||
daily_index[0] = 0
|
daily_index[0] = 0
|
||||||
if shuffle:
|
if shuffle:
|
||||||
|
|||||||
@@ -170,7 +170,7 @@ class IGMTF(Model):
|
|||||||
|
|
||||||
def get_daily_inter(self, df, shuffle=False):
|
def get_daily_inter(self, df, shuffle=False):
|
||||||
# organize the train data into daily batches
|
# organize the train data into daily batches
|
||||||
daily_count = df.groupby(level=0).size().values
|
daily_count = df.groupby(level=0, group_keys=False).size().values
|
||||||
daily_index = np.roll(np.cumsum(daily_count), 1)
|
daily_index = np.roll(np.cumsum(daily_count), 1)
|
||||||
daily_index[0] = 0
|
daily_index[0] = 0
|
||||||
if shuffle:
|
if shuffle:
|
||||||
|
|||||||
@@ -368,7 +368,7 @@ class KRNN(Model):
|
|||||||
|
|
||||||
def get_daily_inter(self, df, shuffle=False):
|
def get_daily_inter(self, df, shuffle=False):
|
||||||
# organize the train data into daily batches
|
# organize the train data into daily batches
|
||||||
daily_count = df.groupby(level=0).size().values
|
daily_count = df.groupby(level=0, group_keys=False).size().values
|
||||||
daily_index = np.roll(np.cumsum(daily_count), 1)
|
daily_index = np.roll(np.cumsum(daily_count), 1)
|
||||||
daily_index[0] = 0
|
daily_index[0] = 0
|
||||||
if shuffle:
|
if shuffle:
|
||||||
|
|||||||
@@ -96,7 +96,7 @@ class DayCumsum(ElemOperator):
|
|||||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||||
_calendar = get_calendar_day(freq=freq)
|
_calendar = get_calendar_day(freq=freq)
|
||||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||||
return series.groupby(_calendar[series.index]).transform(self.period_cusum)
|
return series.groupby(_calendar[series.index], group_keys=False).transform(self.period_cusum)
|
||||||
|
|
||||||
|
|
||||||
class DayLast(ElemOperator):
|
class DayLast(ElemOperator):
|
||||||
@@ -116,7 +116,7 @@ class DayLast(ElemOperator):
|
|||||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||||
_calendar = get_calendar_day(freq=freq)
|
_calendar = get_calendar_day(freq=freq)
|
||||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||||
return series.groupby(_calendar[series.index]).transform("last")
|
return series.groupby(_calendar[series.index], group_keys=False).transform("last")
|
||||||
|
|
||||||
|
|
||||||
class FFillNan(ElemOperator):
|
class FFillNan(ElemOperator):
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ def _group_return(pred_label: pd.DataFrame = None, reverse: bool = False, N: int
|
|||||||
t_df = pd.DataFrame(
|
t_df = pd.DataFrame(
|
||||||
{
|
{
|
||||||
"Group%d"
|
"Group%d"
|
||||||
% (i + 1): pred_label_drop.groupby(level="datetime")["label"].apply(
|
% (i + 1): pred_label_drop.groupby(level="datetime", group_keys=False)["label"].apply(
|
||||||
lambda x: x[len(x) // N * i : len(x) // N * (i + 1)].mean() # pylint: disable=W0640
|
lambda x: x[len(x) // N * i : len(x) // N * (i + 1)].mean() # pylint: disable=W0640
|
||||||
)
|
)
|
||||||
for i in range(N)
|
for i in range(N)
|
||||||
@@ -50,7 +50,7 @@ def _group_return(pred_label: pd.DataFrame = None, reverse: bool = False, N: int
|
|||||||
t_df["long-short"] = t_df["Group1"] - t_df["Group%d" % N]
|
t_df["long-short"] = t_df["Group1"] - t_df["Group%d" % N]
|
||||||
|
|
||||||
# Long-Average
|
# Long-Average
|
||||||
t_df["long-average"] = t_df["Group1"] - pred_label.groupby(level="datetime")["label"].mean()
|
t_df["long-average"] = t_df["Group1"] - pred_label.groupby(level="datetime", group_keys=False)["label"].mean()
|
||||||
|
|
||||||
t_df = t_df.dropna(how="all") # for days which does not contain label
|
t_df = t_df.dropna(how="all") # for days which does not contain label
|
||||||
# Cumulative Return By Group
|
# Cumulative Return By Group
|
||||||
@@ -137,7 +137,9 @@ def _pred_ic(
|
|||||||
|
|
||||||
ic_df = pd.concat(
|
ic_df = pd.concat(
|
||||||
[
|
[
|
||||||
pred_label.groupby(level="datetime").apply(partial(_corr_series, method=_methods_mapping[m])).rename(m)
|
pred_label.groupby(level="datetime", group_keys=False)
|
||||||
|
.apply(partial(_corr_series, method=_methods_mapping[m]))
|
||||||
|
.rename(m)
|
||||||
for m in methods
|
for m in methods
|
||||||
],
|
],
|
||||||
axis=1,
|
axis=1,
|
||||||
@@ -145,7 +147,7 @@ def _pred_ic(
|
|||||||
_ic = ic_df.iloc(axis=1)[0]
|
_ic = ic_df.iloc(axis=1)[0]
|
||||||
|
|
||||||
_index = _ic.index.get_level_values(0).astype("str").str.replace("-", "").str.slice(0, 6)
|
_index = _ic.index.get_level_values(0).astype("str").str.replace("-", "").str.slice(0, 6)
|
||||||
_monthly_ic = _ic.groupby(_index).mean()
|
_monthly_ic = _ic.groupby(_index, group_keys=False).mean()
|
||||||
_monthly_ic.index = pd.MultiIndex.from_arrays(
|
_monthly_ic.index = pd.MultiIndex.from_arrays(
|
||||||
[_monthly_ic.index.str.slice(0, 4), _monthly_ic.index.str.slice(4, 6)],
|
[_monthly_ic.index.str.slice(0, 4), _monthly_ic.index.str.slice(4, 6)],
|
||||||
names=["year", "month"],
|
names=["year", "month"],
|
||||||
@@ -220,8 +222,10 @@ def _pred_ic(
|
|||||||
|
|
||||||
def _pred_autocorr(pred_label: pd.DataFrame, lag=1, **kwargs) -> tuple:
|
def _pred_autocorr(pred_label: pd.DataFrame, lag=1, **kwargs) -> tuple:
|
||||||
pred = pred_label.copy()
|
pred = pred_label.copy()
|
||||||
pred["score_last"] = pred.groupby(level="instrument")["score"].shift(lag)
|
pred["score_last"] = pred.groupby(level="instrument", group_keys=False)["score"].shift(lag)
|
||||||
ac = pred.groupby(level="datetime").apply(lambda x: x["score"].rank(pct=True).corr(x["score_last"].rank(pct=True)))
|
ac = pred.groupby(level="datetime", group_keys=False).apply(
|
||||||
|
lambda x: x["score"].rank(pct=True).corr(x["score_last"].rank(pct=True))
|
||||||
|
)
|
||||||
_df = ac.to_frame("value")
|
_df = ac.to_frame("value")
|
||||||
ac_figure = ScatterGraph(
|
ac_figure = ScatterGraph(
|
||||||
_df,
|
_df,
|
||||||
@@ -235,13 +239,13 @@ def _pred_autocorr(pred_label: pd.DataFrame, lag=1, **kwargs) -> tuple:
|
|||||||
|
|
||||||
def _pred_turnover(pred_label: pd.DataFrame, N=5, lag=1, **kwargs) -> tuple:
|
def _pred_turnover(pred_label: pd.DataFrame, N=5, lag=1, **kwargs) -> tuple:
|
||||||
pred = pred_label.copy()
|
pred = pred_label.copy()
|
||||||
pred["score_last"] = pred.groupby(level="instrument")["score"].shift(lag)
|
pred["score_last"] = pred.groupby(level="instrument", group_keys=False)["score"].shift(lag)
|
||||||
top = pred.groupby(level="datetime").apply(
|
top = pred.groupby(level="datetime", group_keys=False).apply(
|
||||||
lambda x: 1
|
lambda x: 1
|
||||||
- x.nlargest(len(x) // N, columns="score").index.isin(x.nlargest(len(x) // N, columns="score_last").index).sum()
|
- x.nlargest(len(x) // N, columns="score").index.isin(x.nlargest(len(x) // N, columns="score_last").index).sum()
|
||||||
/ (len(x) // N)
|
/ (len(x) // N)
|
||||||
)
|
)
|
||||||
bottom = pred.groupby(level="datetime").apply(
|
bottom = pred.groupby(level="datetime", group_keys=False).apply(
|
||||||
lambda x: 1
|
lambda x: 1
|
||||||
- x.nsmallest(len(x) // N, columns="score")
|
- x.nsmallest(len(x) // N, columns="score")
|
||||||
.index.isin(x.nsmallest(len(x) // N, columns="score_last").index)
|
.index.isin(x.nsmallest(len(x) // N, columns="score_last").index)
|
||||||
@@ -313,7 +317,7 @@ def model_performance_graph(
|
|||||||
2017-12-15 -0.102778 -0.102778
|
2017-12-15 -0.102778 -0.102778
|
||||||
|
|
||||||
|
|
||||||
:param lag: `pred.groupby(level='instrument')['score'].shift(lag)`. It will be only used in the auto-correlation computing.
|
:param lag: `pred.groupby(level='instrument', group_keys=False)['score'].shift(lag)`. It will be only used in the auto-correlation computing.
|
||||||
:param N: group number, default 5.
|
:param N: group number, default 5.
|
||||||
:param reverse: if `True`, `pred['score'] *= -1`.
|
:param reverse: if `True`, `pred['score'] *= -1`.
|
||||||
:param rank: if **True**, calculate rank ic.
|
:param rank: if **True**, calculate rank ic.
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ def _get_cum_return_data_with_position(
|
|||||||
|
|
||||||
_cumulative_return_df["label"] = _cumulative_return_df["label"] - _cumulative_return_df["bench"]
|
_cumulative_return_df["label"] = _cumulative_return_df["label"] - _cumulative_return_df["bench"]
|
||||||
_cumulative_return_df = _cumulative_return_df.dropna()
|
_cumulative_return_df = _cumulative_return_df.dropna()
|
||||||
df_gp = _cumulative_return_df.groupby(level="datetime")
|
df_gp = _cumulative_return_df.groupby(level="datetime", group_keys=False)
|
||||||
result_list = []
|
result_list = []
|
||||||
for gp in df_gp:
|
for gp in df_gp:
|
||||||
date = gp[0]
|
date = gp[0]
|
||||||
|
|||||||
@@ -132,7 +132,7 @@ def _calculate_label_rank(df: pd.DataFrame) -> pd.DataFrame:
|
|||||||
g_df["excess_return"] = g_df[_label_name] - g_df[_label_name].mean()
|
g_df["excess_return"] = g_df[_label_name] - g_df[_label_name].mean()
|
||||||
return g_df
|
return g_df
|
||||||
|
|
||||||
return df.groupby(level="datetime").apply(_calculate_day_value)
|
return df.groupby(level="datetime", group_keys=False).apply(_calculate_day_value)
|
||||||
|
|
||||||
|
|
||||||
def get_position_data(
|
def get_position_data(
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ def _get_figure_with_position(
|
|||||||
)
|
)
|
||||||
|
|
||||||
res_dict = dict()
|
res_dict = dict()
|
||||||
_pos_gp = _position_df.groupby(level=1)
|
_pos_gp = _position_df.groupby(level=1, group_keys=False)
|
||||||
for _item in _pos_gp:
|
for _item in _pos_gp:
|
||||||
_date = _item[0]
|
_date = _item[0]
|
||||||
_day_df = _item[1]
|
_day_df = _item[1]
|
||||||
|
|||||||
@@ -63,9 +63,11 @@ def _get_monthly_risk_analysis_with_report(report_normal_df: pd.DataFrame) -> pd
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# Group by month
|
# Group by month
|
||||||
report_normal_gp = report_normal_df.groupby([report_normal_df.index.year, report_normal_df.index.month])
|
report_normal_gp = report_normal_df.groupby(
|
||||||
|
[report_normal_df.index.year, report_normal_df.index.month], group_keys=False
|
||||||
|
)
|
||||||
# report_long_short_gp = report_long_short_df.groupby(
|
# report_long_short_gp = report_long_short_df.groupby(
|
||||||
# [report_long_short_df.index.year, report_long_short_df.index.month]
|
# [report_long_short_df.index.year, report_long_short_df.index.month], group_keys=False
|
||||||
# )
|
# )
|
||||||
|
|
||||||
gp_month = sorted(set(report_normal_gp.size().index))
|
gp_month = sorted(set(report_normal_gp.size().index))
|
||||||
@@ -97,7 +99,7 @@ def _get_monthly_analysis_with_feature(monthly_df: pd.DataFrame, feature: str =
|
|||||||
:param feature:
|
:param feature:
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
_monthly_df_gp = monthly_df.reset_index().groupby(["level_1"])
|
_monthly_df_gp = monthly_df.reset_index().groupby(["level_1"], group_keys=False)
|
||||||
|
|
||||||
_name_df = _monthly_df_gp.get_group(feature).set_index(["level_0", "level_1"])
|
_name_df = _monthly_df_gp.get_group(feature).set_index(["level_0", "level_1"])
|
||||||
_temp_df = _name_df.pivot_table(index="date", values=["risk"], columns=_name_df.index)
|
_temp_df = _name_df.pivot_table(index="date", values=["risk"], columns=_name_df.index)
|
||||||
|
|||||||
@@ -15,8 +15,10 @@ def _get_score_ic(pred_label: pd.DataFrame):
|
|||||||
"""
|
"""
|
||||||
concat_data = pred_label.copy()
|
concat_data = pred_label.copy()
|
||||||
concat_data.dropna(axis=0, how="any", inplace=True)
|
concat_data.dropna(axis=0, how="any", inplace=True)
|
||||||
_ic = concat_data.groupby(level="datetime").apply(lambda x: x["label"].corr(x["score"]))
|
_ic = concat_data.groupby(level="datetime", group_keys=False).apply(lambda x: x["label"].corr(x["score"]))
|
||||||
_rank_ic = concat_data.groupby(level="datetime").apply(lambda x: x["label"].corr(x["score"], method="spearman"))
|
_rank_ic = concat_data.groupby(level="datetime", group_keys=False).apply(
|
||||||
|
lambda x: x["label"].corr(x["score"], method="spearman")
|
||||||
|
)
|
||||||
return pd.DataFrame({"ic": _ic, "rank_ic": _rank_ic})
|
return pd.DataFrame({"ic": _ic, "rank_ic": _rank_ic})
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -72,10 +72,10 @@ class ValueCNT(FeaAnalyser):
|
|||||||
self._val_cnt = {}
|
self._val_cnt = {}
|
||||||
for col, item in self._dataset.items():
|
for col, item in self._dataset.items():
|
||||||
if not super().skip(col):
|
if not super().skip(col):
|
||||||
self._val_cnt[col] = item.groupby(DT_COL_NAME).apply(lambda s: len(s.unique()))
|
self._val_cnt[col] = item.groupby(DT_COL_NAME, group_keys=False).apply(lambda s: len(s.unique()))
|
||||||
self._val_cnt = pd.DataFrame(self._val_cnt)
|
self._val_cnt = pd.DataFrame(self._val_cnt)
|
||||||
if self.ratio:
|
if self.ratio:
|
||||||
self._val_cnt = self._val_cnt.div(self._dataset.groupby(DT_COL_NAME).size(), axis=0)
|
self._val_cnt = self._val_cnt.div(self._dataset.groupby(DT_COL_NAME, group_keys=False).size(), axis=0)
|
||||||
|
|
||||||
# TODO: transfer this feature to other analysers
|
# TODO: transfer this feature to other analysers
|
||||||
ymin, ymax = self._val_cnt.min().min(), self._val_cnt.max().max()
|
ymin, ymax = self._val_cnt.min().min(), self._val_cnt.max().max()
|
||||||
@@ -98,7 +98,7 @@ class FeaInfAna(NumFeaAnalyser):
|
|||||||
self._inf_cnt = {}
|
self._inf_cnt = {}
|
||||||
for col, item in self._dataset.items():
|
for col, item in self._dataset.items():
|
||||||
if not super().skip(col):
|
if not super().skip(col):
|
||||||
self._inf_cnt[col] = item.apply(np.isinf).astype(np.int).groupby(DT_COL_NAME).sum()
|
self._inf_cnt[col] = item.apply(np.isinf).astype(np.int).groupby(DT_COL_NAME, group_keys=False).sum()
|
||||||
self._inf_cnt = pd.DataFrame(self._inf_cnt)
|
self._inf_cnt = pd.DataFrame(self._inf_cnt)
|
||||||
|
|
||||||
def skip(self, col):
|
def skip(self, col):
|
||||||
@@ -111,7 +111,7 @@ class FeaInfAna(NumFeaAnalyser):
|
|||||||
|
|
||||||
class FeaNanAna(FeaAnalyser):
|
class FeaNanAna(FeaAnalyser):
|
||||||
def calc_stat_values(self):
|
def calc_stat_values(self):
|
||||||
self._nan_cnt = self._dataset.isna().groupby(DT_COL_NAME).sum()
|
self._nan_cnt = self._dataset.isna().groupby(DT_COL_NAME, group_keys=False).sum()
|
||||||
|
|
||||||
def skip(self, col):
|
def skip(self, col):
|
||||||
return (col not in self._nan_cnt) or (self._nan_cnt[col].sum() == 0)
|
return (col not in self._nan_cnt) or (self._nan_cnt[col].sum() == 0)
|
||||||
@@ -123,8 +123,8 @@ class FeaNanAna(FeaAnalyser):
|
|||||||
|
|
||||||
class FeaNanAnaRatio(FeaAnalyser):
|
class FeaNanAnaRatio(FeaAnalyser):
|
||||||
def calc_stat_values(self):
|
def calc_stat_values(self):
|
||||||
self._nan_cnt = self._dataset.isna().groupby(DT_COL_NAME).sum()
|
self._nan_cnt = self._dataset.isna().groupby(DT_COL_NAME, group_keys=False).sum()
|
||||||
self._total_cnt = self._dataset.groupby(DT_COL_NAME).size()
|
self._total_cnt = self._dataset.groupby(DT_COL_NAME, group_keys=False).size()
|
||||||
|
|
||||||
def skip(self, col):
|
def skip(self, col):
|
||||||
return (col not in self._nan_cnt) or (self._nan_cnt[col].sum() == 0)
|
return (col not in self._nan_cnt) or (self._nan_cnt[col].sum() == 0)
|
||||||
@@ -176,8 +176,8 @@ class FeaSkewTurt(NumFeaAnalyser):
|
|||||||
|
|
||||||
class FeaMeanStd(NumFeaAnalyser):
|
class FeaMeanStd(NumFeaAnalyser):
|
||||||
def calc_stat_values(self):
|
def calc_stat_values(self):
|
||||||
self._std = self._dataset.groupby(DT_COL_NAME).std()
|
self._std = self._dataset.groupby(DT_COL_NAME, group_keys=False).std()
|
||||||
self._mean = self._dataset.groupby(DT_COL_NAME).mean()
|
self._mean = self._dataset.groupby(DT_COL_NAME, group_keys=False).mean()
|
||||||
|
|
||||||
def plot_single(self, col, ax):
|
def plot_single(self, col, ax):
|
||||||
self._mean[col].plot(ax=ax, label="mean")
|
self._mean[col].plot(ax=ax, label="mean")
|
||||||
|
|||||||
@@ -347,7 +347,7 @@ class SBBStrategyEMA(SBBStrategyBase):
|
|||||||
self.signal = {}
|
self.signal = {}
|
||||||
|
|
||||||
if not signal_df.empty:
|
if not signal_df.empty:
|
||||||
for stock_id, stock_val in signal_df.groupby(level="instrument"):
|
for stock_id, stock_val in signal_df.groupby(level="instrument", group_keys=False):
|
||||||
self.signal[stock_id] = stock_val["signal"].droplevel(level="instrument")
|
self.signal[stock_id] = stock_val["signal"].droplevel(level="instrument")
|
||||||
|
|
||||||
def reset_level_infra(self, level_infra):
|
def reset_level_infra(self, level_infra):
|
||||||
@@ -434,7 +434,7 @@ class ACStrategy(BaseStrategy):
|
|||||||
self.signal = {}
|
self.signal = {}
|
||||||
|
|
||||||
if not signal_df.empty:
|
if not signal_df.empty:
|
||||||
for stock_id, stock_val in signal_df.groupby(level="instrument"):
|
for stock_id, stock_val in signal_df.groupby(level="instrument", group_keys=False):
|
||||||
self.signal[stock_id] = stock_val["volatility"].droplevel(level="instrument")
|
self.signal[stock_id] = stock_val["volatility"].droplevel(level="instrument")
|
||||||
|
|
||||||
def reset_level_infra(self, level_infra):
|
def reset_level_infra(self, level_infra):
|
||||||
|
|||||||
@@ -842,7 +842,7 @@ class DiskDatasetCache(DatasetCache):
|
|||||||
def build_index_from_data(data, start_index=0):
|
def build_index_from_data(data, start_index=0):
|
||||||
if data.empty:
|
if data.empty:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
line_data = data.groupby("datetime").size()
|
line_data = data.groupby("datetime", group_keys=False).size()
|
||||||
line_data.sort_index(inplace=True)
|
line_data.sort_index(inplace=True)
|
||||||
index_end = line_data.cumsum()
|
index_end = line_data.cumsum()
|
||||||
index_start = index_end.shift(1, fill_value=0)
|
index_start = index_end.shift(1, fill_value=0)
|
||||||
|
|||||||
@@ -352,7 +352,7 @@ class CSRankNorm(Processor):
|
|||||||
def __call__(self, df):
|
def __call__(self, df):
|
||||||
# try not modify original dataframe
|
# try not modify original dataframe
|
||||||
cols = get_group_columns(df, self.fields_group)
|
cols = get_group_columns(df, self.fields_group)
|
||||||
t = df[cols].groupby("datetime").rank(pct=True)
|
t = df[cols].groupby("datetime", group_keys=False).rank(pct=True)
|
||||||
t -= 0.5
|
t -= 0.5
|
||||||
t *= 3.46 # NOTE: towards unit std
|
t *= 3.46 # NOTE: towards unit std
|
||||||
df[cols] = t
|
df[cols] = t
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ class HashingStockStorage(BaseHandlerStorage):
|
|||||||
def __init__(self, df):
|
def __init__(self, df):
|
||||||
self.hash_df = dict()
|
self.hash_df = dict()
|
||||||
self.stock_level = get_level_index(df, "instrument")
|
self.stock_level = get_level_index(df, "instrument")
|
||||||
for k, v in df.groupby(level="instrument"):
|
for k, v in df.groupby(level="instrument", group_keys=False):
|
||||||
self.hash_df[k] = v
|
self.hash_df[k] = v
|
||||||
self.columns = df.columns
|
self.columns = df.columns
|
||||||
|
|
||||||
|
|||||||
@@ -187,7 +187,7 @@ class MockInstrumentStorage(MockStorageBase, InstrumentStorage):
|
|||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
instruments = {}
|
instruments = {}
|
||||||
for symbol, group in self.df.groupby(by="symbol"):
|
for symbol, group in self.df.groupby(by="symbol", group_keys=False):
|
||||||
start = group["datetime"].iloc[0]
|
start = group["datetime"].iloc[0]
|
||||||
end = group["datetime"].iloc[-1]
|
end = group["datetime"].iloc[-1]
|
||||||
instruments[symbol] = [(start, end)]
|
instruments[symbol] = [(start, end)]
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class ParallelExt(Parallel):
|
|||||||
|
|
||||||
|
|
||||||
def datetime_groupby_apply(
|
def datetime_groupby_apply(
|
||||||
df, apply_func: Union[Callable, Text], axis=0, level="datetime", resample_rule="M", n_jobs=-1
|
df, apply_func: Union[Callable, Text], axis=0, level="datetime", resample_rule="ME", n_jobs=-1
|
||||||
):
|
):
|
||||||
"""datetime_groupby_apply
|
"""datetime_groupby_apply
|
||||||
This function will apply the `apply_func` on the datetime level index.
|
This function will apply the `apply_func` on the datetime level index.
|
||||||
@@ -57,12 +57,12 @@ def datetime_groupby_apply(
|
|||||||
|
|
||||||
def _naive_group_apply(df):
|
def _naive_group_apply(df):
|
||||||
if isinstance(apply_func, str):
|
if isinstance(apply_func, str):
|
||||||
return getattr(df.groupby(axis=axis, level=level), apply_func)()
|
return getattr(df.groupby(axis=axis, level=level, group_keys=False), apply_func)()
|
||||||
return df.groupby(axis=axis, level=level).apply(apply_func)
|
return df.groupby(level=level, group_keys=False).apply(apply_func)
|
||||||
|
|
||||||
if n_jobs != 1:
|
if n_jobs != 1:
|
||||||
dfs = ParallelExt(n_jobs=n_jobs)(
|
dfs = ParallelExt(n_jobs=n_jobs)(
|
||||||
delayed(_naive_group_apply)(sub_df) for idx, sub_df in df.resample(resample_rule, axis=axis, level=level)
|
delayed(_naive_group_apply)(sub_df) for idx, sub_df in df.resample(resample_rule, level=level)
|
||||||
)
|
)
|
||||||
return pd.concat(dfs, axis=axis).sort_index()
|
return pd.concat(dfs, axis=axis).sort_index()
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -194,9 +194,9 @@ def resam_ts_data(
|
|||||||
if isinstance(feature.index, pd.MultiIndex):
|
if isinstance(feature.index, pd.MultiIndex):
|
||||||
if callable(method):
|
if callable(method):
|
||||||
method_func = method
|
method_func = method
|
||||||
return feature.groupby(level="instrument").apply(method_func, **method_kwargs)
|
return feature.groupby(level="instrument", group_keys=False).apply(method_func, **method_kwargs)
|
||||||
elif isinstance(method, str):
|
elif isinstance(method, str):
|
||||||
return getattr(feature.groupby(level="instrument"), method)(**method_kwargs)
|
return getattr(feature.groupby(level="instrument", group_keys=False), method)(**method_kwargs)
|
||||||
else:
|
else:
|
||||||
if callable(method):
|
if callable(method):
|
||||||
method_func = method
|
method_func = method
|
||||||
|
|||||||
@@ -652,7 +652,7 @@ class MultiPassPortAnaRecord(PortAnaRecord):
|
|||||||
combined_df = pd.concat(risk_analysis_df_map[_analysis_freq])
|
combined_df = pd.concat(risk_analysis_df_map[_analysis_freq])
|
||||||
|
|
||||||
# Calculate return and information ratio's mean, std and mean/std
|
# Calculate return and information ratio's mean, std and mean/std
|
||||||
multi_pass_port_analysis_df = combined_df.groupby(level=[0, 1]).apply(
|
multi_pass_port_analysis_df = combined_df.groupby(level=[0, 1], group_keys=False).apply(
|
||||||
lambda x: pd.Series(
|
lambda x: pd.Series(
|
||||||
{"mean": x["risk"].mean(), "std": x["risk"].std(), "mean_std": x["risk"].mean() / x["risk"].std()}
|
{"mean": x["risk"].mean(), "std": x["risk"].std(), "mean_std": x["risk"].mean() / x["risk"].std()}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -808,7 +808,7 @@ def calc_paused_num(df: pd.DataFrame, _date_field_name, _symbol_field_name):
|
|||||||
all_nan_nums = 0
|
all_nan_nums = 0
|
||||||
# Record the number of consecutive occurrences of trading days that are not nan throughout the day
|
# Record the number of consecutive occurrences of trading days that are not nan throughout the day
|
||||||
not_nan_nums = 0
|
not_nan_nums = 0
|
||||||
for _date, _df in df.groupby("_tmp_date"):
|
for _date, _df in df.groupby("_tmp_date", group_keys=False):
|
||||||
_df["paused"] = 0
|
_df["paused"] = 0
|
||||||
if not _df.loc[_df["volume"] < 0].empty:
|
if not _df.loc[_df["volume"] < 0].empty:
|
||||||
logger.warning(f"volume < 0, will fill np.nan: {_date} {_symbol}")
|
logger.warning(f"volume < 0, will fill np.nan: {_date} {_symbol}")
|
||||||
|
|||||||
@@ -458,7 +458,7 @@ class DumpDataUpdate(DumpDataBase):
|
|||||||
error_code = {}
|
error_code = {}
|
||||||
with ProcessPoolExecutor(max_workers=self.works) as executor:
|
with ProcessPoolExecutor(max_workers=self.works) as executor:
|
||||||
futures = {}
|
futures = {}
|
||||||
for _code, _df in self._all_data.groupby(self.symbol_field_name):
|
for _code, _df in self._all_data.groupby(self.symbol_field_name, group_keys=False):
|
||||||
_code = fname_to_code(str(_code).lower()).upper()
|
_code = fname_to_code(str(_code).lower()).upper()
|
||||||
_start, _end = self._get_date(_df, is_begin_end=True)
|
_start, _end = self._get_date(_df, is_begin_end=True)
|
||||||
if not (isinstance(_start, pd.Timestamp) and isinstance(_end, pd.Timestamp)):
|
if not (isinstance(_start, pd.Timestamp) and isinstance(_end, pd.Timestamp)):
|
||||||
|
|||||||
@@ -7,8 +7,8 @@ from qlib.tests import TestAutoData
|
|||||||
class TestDataset(TestAutoData):
|
class TestDataset(TestAutoData):
|
||||||
def testCSI300(self):
|
def testCSI300(self):
|
||||||
close_p = D.features(D.instruments("csi300"), ["$close"])
|
close_p = D.features(D.instruments("csi300"), ["$close"])
|
||||||
size = close_p.groupby("datetime").size()
|
size = close_p.groupby("datetime", group_keys=False).size()
|
||||||
cnt = close_p.groupby("datetime").count()["$close"]
|
cnt = close_p.groupby("datetime", group_keys=False).count()["$close"]
|
||||||
size_desc = size.describe(percentiles=np.arange(0.1, 1.0, 0.1))
|
size_desc = size.describe(percentiles=np.arange(0.1, 1.0, 0.1))
|
||||||
cnt_desc = cnt.describe(percentiles=np.arange(0.1, 1.0, 0.1))
|
cnt_desc = cnt.describe(percentiles=np.arange(0.1, 1.0, 0.1))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user