1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-02 02:21:18 +08:00
Files
qlib/qlib/contrib/data/utils/sepdf.py

211 lines
6.8 KiB
Python

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import pandas as pd
from typing import Dict, Iterable, Union
def align_index(df_dict, join):
res = {}
for k, df in df_dict.items():
if join is not None and k != join:
df = df.reindex(df_dict[join].index)
res[k] = df
return res
# Mocking the pd.DataFrame class
class SepDataFrame:
"""
(Sep)erate DataFrame
We usually concat multiple dataframe to be processed together(Such as feature, label, weight, filter).
However, they are usually be used separately at last.
This will result in extra cost for concatenating and splitting data(reshaping and copying data in the memory is very expensive)
SepDataFrame tries to act like a DataFrame whose column with multiindex
"""
# TODO:
# SepDataFrame try to behave like pandas dataframe, but it is still not them same
# Contributions are welcome to make it more complete.
def __init__(self, df_dict: Dict[str, pd.DataFrame], join: str, skip_align=False):
"""
initialize the data based on the dataframe dictionary
Parameters
----------
df_dict : Dict[str, pd.DataFrame]
dataframe dictionary
join : str
how to join the data
It will reindex the dataframe based on the join key.
If join is None, the reindex step will be skipped
skip_align :
for some cases, we can improve performance by skipping aligning index
"""
self.join = join
if skip_align:
self._df_dict = df_dict
else:
self._df_dict = align_index(df_dict, join)
@property
def loc(self):
return SDFLoc(self, join=self.join)
@property
def index(self):
return self._df_dict[self.join].index
def apply_each(self, method: str, skip_align=True, *args, **kwargs):
"""
Assumptions:
- inplace methods will return None
"""
inplace = False
df_dict = {}
for k, df in self._df_dict.items():
df_dict[k] = getattr(df, method)(*args, **kwargs)
if df_dict[k] is None:
inplace = True
if not inplace:
return SepDataFrame(df_dict=df_dict, join=self.join, skip_align=skip_align)
def sort_index(self, *args, **kwargs):
return self.apply_each("sort_index", True, *args, **kwargs)
def copy(self, *args, **kwargs):
return self.apply_each("copy", True, *args, **kwargs)
def _update_join(self):
if self.join not in self:
if len(self._df_dict) > 0:
self.join = next(iter(self._df_dict.keys()))
else:
# NOTE: this will change the behavior of previous reindex when all the keys are empty
self.join = None
def __getitem__(self, item):
# TODO: behave more like pandas when multiindex
return self._df_dict[item]
def __setitem__(self, item: str, df: Union[pd.DataFrame, pd.Series]):
# TODO: consider the join behavior
if not isinstance(item, tuple):
self._df_dict[item] = df
else:
# NOTE: corner case of MultiIndex
_df_dict_key, *col_name = item
col_name = tuple(col_name)
if _df_dict_key in self._df_dict:
if len(col_name) == 1:
col_name = col_name[0]
self._df_dict[_df_dict_key][col_name] = df
else:
if isinstance(df, pd.Series):
if len(col_name) == 1:
col_name = col_name[0]
self._df_dict[_df_dict_key] = df.to_frame(col_name)
else:
df_copy = df.copy() # avoid changing df
df_copy.columns = pd.MultiIndex.from_tuples([(*col_name, *idx) for idx in df.columns.to_list()])
self._df_dict[_df_dict_key] = df_copy
def __delitem__(self, item: str):
del self._df_dict[item]
self._update_join()
def __contains__(self, item):
return item in self._df_dict
def __len__(self):
return len(self._df_dict[self.join])
def droplevel(self, *args, **kwargs):
raise NotImplementedError(f"Please implement the `droplevel` method")
@property
def columns(self):
dfs = []
for k, df in self._df_dict.items():
df = df.head(0)
df.columns = pd.MultiIndex.from_product([[k], df.columns])
dfs.append(df)
return pd.concat(dfs, axis=1).columns
# Useless methods
@staticmethod
def merge(df_dict: Dict[str, pd.DataFrame], join: str):
all_df = df_dict[join]
for k, df in df_dict.items():
if k != join:
all_df = all_df.join(df)
return all_df
class SDFLoc:
"""Mock Class"""
def __init__(self, sdf: SepDataFrame, join):
self._sdf = sdf
self.axis = None
self.join = join
def __call__(self, axis):
self.axis = axis
return self
def __getitem__(self, args):
if self.axis == 1:
if isinstance(args, str):
return self._sdf[args]
elif isinstance(args, (tuple, list)):
new_df_dict = {k: self._sdf[k] for k in args}
return SepDataFrame(new_df_dict, join=self.join if self.join in args else args[0], skip_align=True)
else:
raise NotImplementedError(f"This type of input is not supported")
elif self.axis == 0:
return SepDataFrame(
{k: df.loc(axis=0)[args] for k, df in self._sdf._df_dict.items()}, join=self.join, skip_align=True
)
else:
df = self._sdf
if isinstance(args, tuple):
ax0, *ax1 = args
if len(ax1) == 0:
ax1 = None
if ax1 is not None:
df = df.loc(axis=1)[ax1]
if ax0 is not None:
df = df.loc(axis=0)[ax0]
return df
else:
return df.loc(axis=0)[args]
# Patch pandas DataFrame
# Tricking isinstance to accept SepDataFrame as its subclass
import builtins
def _isinstance(instance, cls):
if isinstance_orig(instance, SepDataFrame): # pylint: disable=E0602 # noqa: F821
if isinstance(cls, Iterable):
for c in cls:
if c is pd.DataFrame:
return True
elif cls is pd.DataFrame:
return True
return isinstance_orig(instance, cls) # pylint: disable=E0602 # noqa: F821
builtins.isinstance_orig = builtins.isinstance
builtins.isinstance = _isinstance
if __name__ == "__main__":
sdf = SepDataFrame({}, join=None)
print(isinstance(sdf, (pd.DataFrame,)))
print(isinstance(sdf, pd.DataFrame))