1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-30 09:31:18 +08:00
Files
qlib/qlib/utils/index_data.py
you-n-g be4646b4b7 Adjust rolling api (#1594)
* Intermediate version

* Fix yaml template & Successfully run rolling

* Be compatible with benchmark

* Get same results with previous linear model

* Black formatting

* Update black

* Update the placeholder mechanism

* Update CI

* Update CI

* Upgrade Black

* Fix CI and simplify code

* Fix CI

* Move the data processing caching mechanism into utils.

* Adjusting DDG-DA

* Organize import
2023-07-14 12:16:12 +08:00

644 lines
21 KiB
Python

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Motivation of index_data
- Pandas has a lot of user-friendly interfaces. However, integrating too much features in a single tool bring too much overhead and makes it much slower than numpy.
Some users just want a simple numpy dataframe with indices and don't want such a complicated tools.
Such users are the target of `index_data`
`index_data` try to behave like pandas (some API will be different because we try to be simpler and more intuitive) but don't compromise the performance. It provides the basic numpy data and simple indexing feature. If users call APIs which may compromise the performance, index_data will raise Errors.
"""
from __future__ import annotations
from typing import Dict, Tuple, Union, Callable, List
import bisect
import numpy as np
import pandas as pd
def concat(data_list: Union[SingleData], axis=0) -> MultiData:
"""concat all SingleData by index.
TODO: now just for SingleData.
Parameters
----------
data_list : List[SingleData]
the list of all SingleData to concat.
Returns
-------
MultiData
the MultiData with ndim == 2
"""
if axis == 0:
raise NotImplementedError(f"please implement this func when axis == 0")
elif axis == 1:
# get all index and row
all_index = set()
for index_data in data_list:
all_index = all_index | set(index_data.index)
all_index = list(all_index)
all_index.sort()
all_index_map = dict(zip(all_index, range(len(all_index))))
# concat all
tmp_data = np.full((len(all_index), len(data_list)), np.NaN)
for data_id, index_data in enumerate(data_list):
assert isinstance(index_data, SingleData)
now_data_map = [all_index_map[index] for index in index_data.index]
tmp_data[now_data_map, data_id] = index_data.data
return MultiData(tmp_data, all_index)
else:
raise ValueError(f"axis must be 0 or 1")
def sum_by_index(data_list: Union[SingleData], new_index: list, fill_value=0) -> SingleData:
"""concat all SingleData by new index.
Parameters
----------
data_list : List[SingleData]
the list of all SingleData to sum.
new_index : list
the new_index of new SingleData.
fill_value : float
fill the missing values or replace np.NaN.
Returns
-------
SingleData
the SingleData with new_index and values after sum.
"""
data_list = [data.to_dict() for data in data_list]
data_sum = {}
for id in new_index:
item_sum = 0
for data in data_list:
if id in data and not np.isnan(data[id]):
item_sum += data[id]
else:
item_sum += fill_value
data_sum[id] = item_sum
return SingleData(data_sum)
class Index:
"""
This is for indexing(rows or columns)
Read-only operations has higher priorities than others.
So this class is designed in a **read-only** way to shared data for queries.
Modifications will results in new Index.
NOTE: the indexing has following flaws
- duplicated index value is not well supported (only the first appearance will be considered)
- The order of the index is not considered!!!! So the slicing will not behave like pandas when indexings are ordered
"""
def __init__(self, idx_list: Union[List, pd.Index, "Index", int]):
self.idx_list: np.ndarray = None # using array type for index list will make things easier
if isinstance(idx_list, Index):
# Fast read-only copy
self.idx_list = idx_list.idx_list
self.index_map = idx_list.index_map
self._is_sorted = idx_list._is_sorted
elif isinstance(idx_list, int):
self.index_map = self.idx_list = np.arange(idx_list)
self._is_sorted = True
else:
self.idx_list = np.array(idx_list)
# NOTE: only the first appearance is indexed
self.index_map = dict(zip(self.idx_list, range(len(self))))
self._is_sorted = False
def __getitem__(self, i: int):
return self.idx_list[i]
def _convert_type(self, item):
"""
After user creates indices with Type A, user may query data with other types with the same info.
This method try to make type conversion and make query sane rather than raising KeyError strictly
Parameters
----------
item :
The item to query index
"""
if self.idx_list.dtype.type is np.datetime64:
if isinstance(item, pd.Timestamp):
# This happens often when creating index based on pandas.DatetimeIndex and query with pd.Timestamp
return item.to_numpy()
return item
def index(self, item) -> int:
"""
Given the index value, get the integer index
Parameters
----------
item :
The item to query
Returns
-------
int:
The index of the item
Raises
------
KeyError:
If the query item does not exist
"""
try:
return self.index_map[self._convert_type(item)]
except IndexError as index_e:
raise KeyError(f"{item} can't be found in {self}") from index_e
def __or__(self, other: "Index"):
return Index(idx_list=list(set(self.idx_list) | set(other.idx_list)))
def __eq__(self, other: "Index"):
# NOTE: np.nan is not supported in the index
if self.idx_list.shape != other.idx_list.shape:
return False
return (self.idx_list == other.idx_list).all()
def __len__(self):
return len(self.idx_list)
def is_sorted(self):
return self._is_sorted
def sort(self) -> Tuple["Index", np.ndarray]:
"""
sort the index
Returns
-------
Tuple["Index", np.ndarray]:
the sorted Index and the changed index
"""
sorted_idx = np.argsort(self.idx_list)
idx = Index(self.idx_list[sorted_idx])
idx._is_sorted = True
return idx, sorted_idx
def tolist(self):
"""return the index with the format of list."""
return self.idx_list.tolist()
class LocIndexer:
"""
`Indexer` will behave like the `LocIndexer` in Pandas
Read-only operations has higher priorities than others.
So this class is designed in a read-only way to shared data for queries.
Modifications will results in new Index.
"""
def __init__(self, index_data: "IndexData", indices: List[Index], int_loc: bool = False):
self._indices: List[Index] = indices
self._bind_id = index_data # bind index data
self._int_loc = int_loc
assert self._bind_id.data.ndim == len(self._indices)
@staticmethod
def proc_idx_l(indices: List[Union[List, pd.Index, Index]], data_shape: Tuple = None) -> List[Index]:
"""process the indices from user and output a list of `Index`"""
res = []
for i, idx in enumerate(indices):
res.append(Index(data_shape[i] if len(idx) == 0 else idx))
return res
def _slc_convert(self, index: Index, indexing: slice) -> slice:
"""
convert value-based indexing to integer-based indexing.
Parameters
----------
index : Index
index data.
indexing : slice
value based indexing data with slice type for indexing.
Returns
-------
slice:
the integer based slicing
"""
if index.is_sorted():
int_start = None if indexing.start is None else bisect.bisect_left(index, indexing.start)
int_stop = None if indexing.stop is None else bisect.bisect_right(index, indexing.stop)
else:
int_start = None if indexing.start is None else index.index(indexing.start)
int_stop = None if indexing.stop is None else index.index(indexing.stop) + 1
return slice(int_start, int_stop)
def __getitem__(self, indexing):
"""
Parameters
----------
indexing :
query for data
Raises
------
KeyError:
If the non-slice index is queried but does not exist, `KeyError` is raised.
"""
# 1) convert slices to int loc
if not isinstance(indexing, tuple):
# NOTE: tuple is not supported for indexing
indexing = (indexing,)
# TODO: create a subclass for single value query
assert len(indexing) <= len(self._indices)
int_indexing = []
for dim, index in enumerate(self._indices):
if dim < len(indexing):
_indexing = indexing[dim]
if not self._int_loc: # type converting is only necessary when it is not `iloc`
if isinstance(_indexing, slice):
_indexing = self._slc_convert(index, _indexing)
elif isinstance(_indexing, (IndexData, np.ndarray)):
if isinstance(_indexing, IndexData):
_indexing = _indexing.data
assert _indexing.ndim == 1
if _indexing.dtype != bool:
_indexing = np.array(list(index.index(i) for i in _indexing))
else:
_indexing = index.index(_indexing)
else:
# Default to select all when user input is not given
_indexing = slice(None)
int_indexing.append(_indexing)
# 2) select data and index
new_data = self._bind_id.data[tuple(int_indexing)]
# return directly if it is scalar
if new_data.ndim == 0:
return new_data
# otherwise we go on to the index part
new_indices = [idx[indexing] for idx, indexing in zip(self._indices, int_indexing)]
# 3) squash dimensions
new_indices = [
idx for idx in new_indices if isinstance(idx, np.ndarray) and idx.ndim > 0
] # squash the zero dim indexing
if new_data.ndim == 1:
cls = SingleData
elif new_data.ndim == 2:
cls = MultiData
else:
raise ValueError("Not supported")
return cls(new_data, *new_indices)
class BinaryOps:
def __init__(self, method_name):
self.method_name = method_name
def __get__(self, obj, *args):
# bind object
self.obj = obj
return self
def __call__(self, other):
self_data_method = getattr(self.obj.data, self.method_name)
if isinstance(other, (int, float, np.number)):
return self.obj.__class__(self_data_method(other), *self.obj.indices)
elif isinstance(other, self.obj.__class__):
other_aligned = self.obj._align_indices(other)
return self.obj.__class__(self_data_method(other_aligned.data), *self.obj.indices)
else:
return NotImplemented
def index_data_ops_creator(*args, **kwargs):
"""
meta class for auto generating operations for index data.
"""
for method_name in ["__add__", "__sub__", "__rsub__", "__mul__", "__truediv__", "__eq__", "__gt__", "__lt__"]:
args[2][method_name] = BinaryOps(method_name=method_name)
return type(*args)
class IndexData(metaclass=index_data_ops_creator):
"""
Base data structure of SingleData and MultiData.
NOTE:
- For performance issue, only **np.floating** is supported in the underlayer data !!!
- Boolean based on np.floating is also supported. Here are some examples
.. code-block:: python
np.array([ np.nan]).any() -> True
np.array([ np.nan]).all() -> True
np.array([1. , 0.]).any() -> True
np.array([1. , 0.]).all() -> False
"""
loc_idx_cls = LocIndexer
def __init__(self, data: np.ndarray, *indices: Union[List, pd.Index, Index]):
self.data = data
self.indices = indices
# get the expected data shape
# - The index has higher priority
self.data = np.array(data)
expected_dim = max(self.data.ndim, len(indices))
data_shape = []
for i in range(expected_dim):
idx_l = indices[i] if len(indices) > i else []
if len(idx_l) == 0:
data_shape.append(self.data.shape[i])
else:
data_shape.append(len(idx_l))
data_shape = tuple(data_shape)
# broadcast the data to expected shape
if self.data.shape != data_shape:
self.data = np.broadcast_to(self.data, data_shape)
self.data = self.data.astype(np.float64)
# Please notice following cases when converting the type
# - np.array([None, 1]).astype(np.float64) -> array([nan, 1.])
# create index from user's index data.
self.indices: List[Index] = self.loc_idx_cls.proc_idx_l(indices, data_shape)
for dim in range(expected_dim):
assert self.data.shape[dim] == len(self.indices[dim])
self.ndim = expected_dim
# indexing related methods
@property
def loc(self):
return self.loc_idx_cls(index_data=self, indices=self.indices)
@property
def iloc(self):
return self.loc_idx_cls(index_data=self, indices=self.indices, int_loc=True)
@property
def index(self):
return self.indices[0]
@property
def columns(self):
return self.indices[1]
def __getitem__(self, args):
# NOTE: this tries to behave like a numpy array to be compatible with numpy aggregating function like nansum and nanmean
return self.iloc[args]
def _align_indices(self, other: "IndexData") -> "IndexData":
"""
Align all indices of `other` to `self` before performing the arithmetic operations.
This function will return a new IndexData rather than changing data in `other` inplace
Parameters
----------
other : "IndexData"
the index in `other` is to be changed
Returns
-------
IndexData:
the data in `other` with index aligned to `self`
"""
raise NotImplementedError(f"please implement _align_indices func")
def sort_index(self, axis=0, inplace=True):
assert inplace, "Only support sorting inplace now"
self.indices[axis], sorted_idx = self.indices[axis].sort()
self.data = np.take(self.data, sorted_idx, axis=axis)
# The code below could be simpler like methods in __getattribute__
def __invert__(self):
return self.__class__(~self.data.astype(bool), *self.indices)
def abs(self):
"""get the abs of data except np.NaN."""
tmp_data = np.absolute(self.data)
return self.__class__(tmp_data, *self.indices)
def replace(self, to_replace: Dict[np.number, np.number]):
assert isinstance(to_replace, dict)
tmp_data = self.data.copy()
for num in to_replace:
if num in tmp_data:
tmp_data[self.data == num] = to_replace[num]
return self.__class__(tmp_data, *self.indices)
def apply(self, func: Callable):
"""apply a function to data."""
tmp_data = func(self.data)
return self.__class__(tmp_data, *self.indices)
def __len__(self):
"""the length of the data.
Returns
-------
int
the length of the data.
"""
return len(self.data)
def sum(self, axis=None, dtype=None, out=None):
assert out is None and dtype is None, "`out` is just for compatible with numpy's aggregating function"
# FIXME: weird logic and not general
if axis is None:
return np.nansum(self.data)
elif axis == 0:
tmp_data = np.nansum(self.data, axis=0)
return SingleData(tmp_data, self.columns)
elif axis == 1:
tmp_data = np.nansum(self.data, axis=1)
return SingleData(tmp_data, self.index)
else:
raise ValueError(f"axis must be None, 0 or 1")
def mean(self, axis=None, dtype=None, out=None):
assert out is None and dtype is None, "`out` is just for compatible with numpy's aggregating function"
# FIXME: weird logic and not general
if axis is None:
return np.nanmean(self.data)
elif axis == 0:
tmp_data = np.nanmean(self.data, axis=0)
return SingleData(tmp_data, self.columns)
elif axis == 1:
tmp_data = np.nanmean(self.data, axis=1)
return SingleData(tmp_data, self.index)
else:
raise ValueError(f"axis must be None, 0 or 1")
def isna(self):
return self.__class__(np.isnan(self.data), *self.indices)
def fillna(self, value=0.0, inplace: bool = False):
if inplace:
self.data = np.nan_to_num(self.data, nan=value)
else:
return self.__class__(np.nan_to_num(self.data, nan=value), *self.indices)
def count(self):
return len(self.data[~np.isnan(self.data)])
def all(self):
if None in self.data:
return self.data[self.data is not None].all()
else:
return self.data.all()
@property
def empty(self):
return len(self.data) == 0
@property
def values(self):
return self.data
class SingleData(IndexData):
def __init__(
self, data: Union[int, float, np.number, list, dict, pd.Series] = [], index: Union[List, pd.Index, Index] = []
):
"""A data structure of index and numpy data.
It's used to replace pd.Series due to high-speed.
Parameters
----------
data : Union[int, float, np.number, list, dict, pd.Series]
the input data
index : Union[list, pd.Index]
the index of data.
empty list indicates that auto filling the index to the length of data
"""
# for special data type
if isinstance(data, dict):
assert len(index) == 0
if len(data) > 0:
index, data = zip(*data.items())
else:
index, data = [], []
elif isinstance(data, pd.Series):
assert len(index) == 0
index, data = data.index, data.values
elif isinstance(data, (int, float, np.number)):
data = [data]
super().__init__(data, index)
assert self.ndim == 1
def _align_indices(self, other):
if self.index == other.index:
return other
elif set(self.index) == set(other.index):
return other.reindex(self.index)
else:
raise ValueError(
f"The indexes of self and other do not meet the requirements of the four arithmetic operations"
)
def reindex(self, index: Index, fill_value=np.NaN) -> SingleData:
"""reindex data and fill the missing value with np.NaN.
Parameters
----------
new_index : list
new index
fill_value:
what value to fill if index is missing
Returns
-------
SingleData
reindex data
"""
# TODO: This method can be more general
if self.index == index:
return self
tmp_data = np.full(len(index), fill_value, dtype=np.float64)
for index_id, index_item in enumerate(index):
try:
tmp_data[index_id] = self.loc[index_item]
except KeyError:
pass
return SingleData(tmp_data, index)
def add(self, other: SingleData, fill_value=0):
# TODO: add and __add__ are a little confusing.
# This could be a more general
common_index = self.index | other.index
common_index, _ = common_index.sort()
tmp_data1 = self.reindex(common_index, fill_value)
tmp_data2 = other.reindex(common_index, fill_value)
return tmp_data1.fillna(fill_value) + tmp_data2.fillna(fill_value)
def to_dict(self):
"""convert SingleData to dict.
Returns
-------
dict
data with the dict format.
"""
return dict(zip(self.index, self.data.tolist()))
def to_series(self):
return pd.Series(self.data, index=self.index)
def __repr__(self) -> str:
return str(pd.Series(self.data, index=self.index))
class MultiData(IndexData):
def __init__(
self,
data: Union[int, float, np.number, list] = [],
index: Union[List, pd.Index, Index] = [],
columns: Union[List, pd.Index, Index] = [],
):
"""A data structure of index and numpy data.
It's used to replace pd.DataFrame due to high-speed.
Parameters
----------
data : Union[list, np.ndarray]
the dim of data must be 2.
index : Union[List, pd.Index, Index]
the index of data.
columns: Union[List, pd.Index, Index]
the columns of data.
"""
if isinstance(data, pd.DataFrame):
index, columns, data = data.index, data.columns, data.values
super().__init__(data, index, columns)
assert self.ndim == 2
def _align_indices(self, other):
if self.indices == other.indices:
return other
else:
raise ValueError(
f"The indexes of self and other do not meet the requirements of the four arithmetic operations"
)
def __repr__(self) -> str:
return str(pd.DataFrame(self.data, index=self.index, columns=self.columns))