1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-03 11:00:57 +08:00

init commit

This commit is contained in:
Young
2020-09-22 01:43:21 +00:00
parent aa51e5aad3
commit 99ebd87cba
131 changed files with 20218 additions and 0 deletions

36
qlib/data/__init__.py Normal file
View File

@@ -0,0 +1,36 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
from .data import (
D,
CalendarProvider,
InstrumentProvider,
FeatureProvider,
ExpressionProvider,
DatasetProvider,
LocalCalendarProvider,
LocalInstrumentProvider,
LocalFeatureProvider,
LocalExpressionProvider,
LocalDatasetProvider,
ClientCalendarProvider,
ClientInstrumentProvider,
ClientDatasetProvider,
BaseProvider,
LocalProvider,
ClientProvider,
)
from .cache import (
ExpressionCache,
DatasetCache,
ServerExpressionCache,
ServerDatasetCache,
SimpleDatasetCache,
ClientDatasetCache,
ClientCalendarCache,
)

View File

@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

View File

@@ -0,0 +1,152 @@
# cython: profile=False
# cython: boundscheck=False, wraparound=False, cdivision=True
cimport cython
cimport numpy as np
import numpy as np
from libc.math cimport sqrt, isnan, NAN
from libcpp.vector cimport vector
cdef class Expanding(object):
"""1-D array expanding"""
cdef vector[double] barv
cdef int na_count
def __init__(self):
self.na_count = 0
cdef double update(self, double val):
pass
cdef class Mean(Expanding):
"""1-D array expanding mean"""
cdef double vsum
def __init__(self):
super(Mean, self).__init__()
self.vsum = 0
cdef double update(self, double val):
self.barv.push_back(val)
if isnan(val):
self.na_count += 1
else:
self.vsum += val
return self.vsum / (self.barv.size() - self.na_count)
cdef class Slope(Expanding):
"""1-D array expanding slope"""
cdef double x_sum
cdef double x2_sum
cdef double y_sum
cdef double xy_sum
def __init__(self):
super(Slope, self).__init__()
self.x_sum = 0
self.x2_sum = 0
self.y_sum = 0
self.xy_sum = 0
cdef double update(self, double val):
self.barv.push_back(val)
cdef size_t size = self.barv.size()
if isnan(val):
self.na_count += 1
else:
self.x_sum += size
self.x2_sum += size * size
self.y_sum += val
self.xy_sum += size * val
cdef int N = size - self.na_count
return (N*self.xy_sum - self.x_sum*self.y_sum) / \
(N*self.x2_sum - self.x_sum*self.x_sum)
cdef class Resi(Expanding):
"""1-D array expanding residuals"""
cdef double x_sum
cdef double x2_sum
cdef double y_sum
cdef double xy_sum
def __init__(self):
super(Resi, self).__init__()
self.x_sum = 0
self.x2_sum = 0
self.y_sum = 0
self.xy_sum = 0
cdef double update(self, double val):
self.barv.push_back(val)
cdef size_t size = self.barv.size()
if isnan(val):
self.na_count += 1
else:
self.x_sum += size
self.x2_sum += size * size
self.y_sum += val
self.xy_sum += size * val
cdef int N = size - self.na_count
slope = (N*self.xy_sum - self.x_sum*self.y_sum) / \
(N*self.x2_sum - self.x_sum*self.x_sum)
x_mean = self.x_sum / N
y_mean = self.y_sum / N
interp = y_mean - slope*x_mean
return val - (slope*size + interp)
cdef class Rsquare(Expanding):
"""1-D array expanding rsquare"""
cdef double x_sum
cdef double x2_sum
cdef double y_sum
cdef double y2_sum
cdef double xy_sum
def __init__(self):
super(Rsquare, self).__init__()
self.x_sum = 0
self.x2_sum = 0
self.y_sum = 0
self.y2_sum = 0
self.xy_sum = 0
cdef double update(self, double val):
self.barv.push_back(val)
cdef size_t size = self.barv.size()
if isnan(val):
self.na_count += 1
else:
self.x_sum += size
self.x2_sum += size
self.y_sum += val
self.y2_sum += val * val
self.xy_sum += size * val
cdef int N = size - self.na_count
cdef double rvalue = (N*self.xy_sum - self.x_sum*self.y_sum) / \
sqrt((N*self.x2_sum - self.x_sum*self.x_sum) * (N*self.y2_sum - self.y_sum*self.y_sum))
return rvalue * rvalue
cdef np.ndarray[double, ndim=1] expanding(Expanding r, np.ndarray a):
cdef int i
cdef int N = len(a)
cdef np.ndarray[double, ndim=1] ret = np.empty(N)
for i in range(N):
ret[i] = r.update(a[i])
return ret
def expanding_mean(np.ndarray a):
cdef Mean r = Mean()
return expanding(r, a)
def expanding_slope(np.ndarray a):
cdef Slope r = Slope()
return expanding(r, a)
def expanding_rsquare(np.ndarray a):
cdef Rsquare r = Rsquare()
return expanding(r, a)
def expanding_resi(np.ndarray a):
cdef Resi r = Resi()
return expanding(r, a)

207
qlib/data/_libs/rolling.pyx Normal file
View File

@@ -0,0 +1,207 @@
# cython: profile=False
# cython: boundscheck=False, wraparound=False, cdivision=True
cimport cython
cimport numpy as np
import numpy as np
from libc.math cimport sqrt, isnan, NAN
from libcpp.deque cimport deque
cdef class Rolling(object):
"""1-D array rolling"""
cdef int window
cdef deque[double] barv
cdef int na_count
def __init__(self, int window):
self.window = window
self.na_count = window
cdef int i
for i in range(window):
self.barv.push_back(NAN)
cdef double update(self, double val):
pass
cdef class Mean(Rolling):
"""1-D array rolling mean"""
cdef double vsum
def __init__(self, int window):
super(Mean, self).__init__(window)
self.vsum = 0
cdef double update(self, double val):
self.barv.push_back(val)
if not isnan(self.barv.front()):
self.vsum -= self.barv.front()
else:
self.na_count -= 1
self.barv.pop_front()
if isnan(val):
self.na_count += 1
# return NAN
else:
self.vsum += val
return self.vsum / (self.window - self.na_count)
cdef class Slope(Rolling):
"""1-D array rolling slope"""
cdef double i_sum # can be used as i2_sum
cdef double x_sum
cdef double x2_sum
cdef double y_sum
cdef double xy_sum
def __init__(self, int window):
super(Slope, self).__init__(window)
self.i_sum = 0
self.x_sum = 0
self.x2_sum = 0
self.y_sum = 0
self.xy_sum = 0
cdef double update(self, double val):
self.barv.push_back(val)
self.xy_sum = self.xy_sum - self.y_sum
self.x2_sum = self.x2_sum + self.i_sum - 2*self.x_sum
self.x_sum = self.x_sum - self.i_sum
cdef double _val
_val = self.barv.front()
if not isnan(_val):
self.i_sum -= 1
self.y_sum -= _val
else:
self.na_count -= 1
self.barv.pop_front()
if isnan(val):
self.na_count += 1
# return NAN
else:
self.i_sum += 1
self.x_sum += self.window
self.x2_sum += self.window * self.window
self.y_sum += val
self.xy_sum += self.window * val
cdef int N = self.window - self.na_count
return (N*self.xy_sum - self.x_sum*self.y_sum) / \
(N*self.x2_sum - self.x_sum*self.x_sum)
cdef class Resi(Rolling):
"""1-D array rolling residuals"""
cdef double i_sum # can be used as i2_sum
cdef double x_sum
cdef double x2_sum
cdef double y_sum
cdef double xy_sum
def __init__(self, int window):
super(Resi, self).__init__(window)
self.i_sum = 0
self.x_sum = 0
self.x2_sum = 0
self.y_sum = 0
self.xy_sum = 0
cdef double update(self, double val):
self.barv.push_back(val)
self.xy_sum = self.xy_sum - self.y_sum
self.x2_sum = self.x2_sum + self.i_sum - 2*self.x_sum
self.x_sum = self.x_sum - self.i_sum
cdef double _val
_val = self.barv.front()
if not isnan(_val):
self.i_sum -= 1
self.y_sum -= _val
else:
self.na_count -= 1
self.barv.pop_front()
if isnan(val):
self.na_count += 1
# return NAN
else:
self.i_sum += 1
self.x_sum += self.window
self.x2_sum += self.window * self.window
self.y_sum += val
self.xy_sum += self.window * val
cdef int N = self.window - self.na_count
slope = (N*self.xy_sum - self.x_sum*self.y_sum) / \
(N*self.x2_sum - self.x_sum*self.x_sum)
x_mean = self.x_sum / N
y_mean = self.y_sum / N
interp = y_mean - slope*x_mean
return val - (slope*self.window + interp)
cdef class Rsquare(Rolling):
"""1-D array rolling rsquare"""
cdef double i_sum
cdef double x_sum
cdef double x2_sum
cdef double y_sum
cdef double y2_sum
cdef double xy_sum
def __init__(self, int window):
super(Rsquare, self).__init__(window)
self.i_sum = 0
self.x_sum = 0
self.x2_sum = 0
self.y_sum = 0
self.y2_sum = 0
self.xy_sum = 0
cdef double update(self, double val):
self.barv.push_back(val)
self.xy_sum = self.xy_sum - self.y_sum
self.x2_sum = self.x2_sum + self.i_sum - 2*self.x_sum
self.x_sum = self.x_sum - self.i_sum
cdef double _val
_val = self.barv.front()
if not isnan(_val):
self.i_sum -= 1
self.y_sum -= _val
self.y2_sum -= _val * _val
else:
self.na_count -= 1
self.barv.pop_front()
if isnan(val):
self.na_count += 1
# return NAN
else:
self.i_sum += 1
self.x_sum += self.window
self.x2_sum += self.window * self.window
self.y_sum += val
self.y2_sum += val * val
self.xy_sum += self.window * val
cdef int N = self.window - self.na_count
cdef double rvalue
rvalue = (N*self.xy_sum - self.x_sum*self.y_sum) / \
sqrt((N*self.x2_sum - self.x_sum*self.x_sum) * (N*self.y2_sum - self.y_sum*self.y_sum))
return rvalue * rvalue
cdef np.ndarray[double, ndim=1] rolling(Rolling r, np.ndarray a):
cdef int i
cdef int N = len(a)
cdef np.ndarray[double, ndim=1] ret = np.empty(N)
for i in range(N):
ret[i] = r.update(a[i])
return ret
def rolling_mean(np.ndarray a, int window):
cdef Mean r = Mean(window)
return rolling(r, a)
def rolling_slope(np.ndarray a, int window):
cdef Slope r = Slope(window)
return rolling(r, a)
def rolling_rsquare(np.ndarray a, int window):
cdef Rsquare r = Rsquare(window)
return rolling(r, a)
def rolling_resi(np.ndarray a, int window):
cdef Resi r = Resi(window)
return rolling(r, a)

229
qlib/data/base.py Normal file
View File

@@ -0,0 +1,229 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import abc
import six
import pandas as pd
@six.add_metaclass(abc.ABCMeta)
class Expression(object):
"""Expression base class"""
def __str__(self):
return type(self).__name__
def __repr__(self):
return str(self)
def __gt__(self, other):
from .ops import Gt
return Gt(self, other)
def __ge__(self, other):
from .ops import Ge
return Ge(self, other)
def __lt__(self, other):
from .ops import Lt
return Lt(self, other)
def __le__(self, other):
from .ops import Le
return Le(self, other)
def __eq__(self, other):
from .ops import Eq
return Eq(self, other)
def __ne__(self, other):
from .ops import Ne
return Ne(self, other)
def __add__(self, other):
from .ops import Add
return Add(self, other)
def __radd__(self, other):
from .ops import Add
return Add(other, self)
def __sub__(self, other):
from .ops import Sub
return Sub(self, other)
def __rsub__(self, other):
from .ops import Sub
return Sub(other, self)
def __mul__(self, other):
from .ops import Mul
return Mul(self, other)
def __rmul__(self, other):
from .ops import Mul
return Mul(self, other)
def __div__(self, other):
from .ops import Div
return Div(self, other)
def __rdiv__(self, other):
from .ops import Div
return Div(other, self)
def __truediv__(self, other):
from .ops import Div
return Div(self, other)
def __rtruediv__(self, other):
from .ops import Div
return Div(other, self)
def __pow__(self, other):
from .ops import Power
return Power(self, other)
def __and__(self, other):
from .ops import And
return And(self, other)
def __rand__(self, other):
from .ops import And
return And(other, self)
def __or__(self, other):
from .ops import Or
return Or(self, other)
def __ror__(self, other):
from .ops import Or
return Or(other, self)
def load(self, instrument, start_index, end_index, freq):
"""load feature
Parameters
----------
instrument : str
instrument code
start_index : str
feature start index [in calendar]
end_index : str
feature end index [in calendar]
freq : str
feature frequency
Returns
----------
pd.Series
feature series: The index of the series is the calendar index
"""
from .cache import H
# cache
args = str(self), instrument, start_index, end_index, freq
if args in H["f"]:
return H["f"][args]
if start_index is None or end_index is None or start_index > end_index:
raise ValueError("Invalid index range: {} {}".format(start_index, end_index))
series = self._load_internal(instrument, start_index, end_index, freq)
series.name = str(self)
H["f"][args] = series
return series
@abc.abstractmethod
def _load_internal(self, instrument, start_index, end_index, freq):
pass
@abc.abstractmethod
def get_longest_back_rolling(self):
"""Get the longest length of historical data the feature has accessed
This is designed for getting the needed range of the data to calculate
the features in specific range at first. However, situations like
Ref(Ref($close, -1), 1) can not be handled rightly.
So this will only used for detecting the length of historical data needed.
"""
# TODO: forward operator like Ref($close, -1) is not supported yet.
raise NotImplementedError("This function must be implemented in your newly defined feature")
@abc.abstractmethod
def get_extended_window_size(self):
"""get_extend_window_size
For to calculate this Operator in range[start_index, end_index]
We have to get the *leaf feature* in
range[start_index - lft_etd, end_index + rght_etd].
Returns
----------
(int, int)
lft_etd, rght_etd
"""
raise NotImplementedError("This function must be implemented in your newly defined feature")
class Feature(Expression):
"""Static Expression
This kind of feature will load data from provider
"""
def __init__(self, name=None):
if name:
self._name = name.lower()
else:
self._name = type(self).__name__.lower()
def __str__(self):
return "$" + self._name
def _load_internal(self, instrument, start_index, end_index, freq):
# load
from .data import FeatureD
return FeatureD.feature(instrument, str(self), start_index, end_index, freq)
def get_longest_back_rolling(self):
return 0
def get_extended_window_size(self):
return 0, 0
@six.add_metaclass(abc.ABCMeta)
class ExpressionOps(Expression):
"""Operator Expression
This kind of feature will use operator for feature
construction on the fly.
"""
pass

1149
qlib/data/cache.py Normal file

File diff suppressed because it is too large Load Diff

102
qlib/data/client.py Normal file
View File

@@ -0,0 +1,102 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import socketio
from .. import __version__
from ..log import get_module_logger
import pickle
class Client(object):
"""A client class
Provide the connection tool functions for ClientProvider.
"""
def __init__(self, host, port):
super(Client, self).__init__()
self.sio = socketio.Client()
self.server_host = host
self.server_port = port
self.logger = get_module_logger(self.__class__.__name__)
# bind connect/disconnect callbacks
self.sio.on(
"connect",
lambda: self.logger.debug("Connect to server {}".format(self.sio.connection_url)),
)
self.sio.on("disconnect", lambda: self.logger.debug("Disconnect from server!"))
def connect_server(self):
"""Connect to server."""
try:
self.sio.connect("ws://" + self.server_host + ":" + str(self.server_port))
except socketio.exceptions.ConnectionError:
self.logger.error("Cannot connect to server - check your network or server status")
def disconnect(self):
"""Disconnect from server."""
try:
self.sio.eio.disconnect(True)
except Exception as e:
self.logger.error("Cannot disconnect from server : %s" % e)
def send_request(self, request_type, request_content, msg_queue, msg_proc_func=None):
"""Send a certain request to server.
Parameters
----------
request_type : str
type of proposed request, 'calendar'/'instrument'/'feature'
request_content : dict
records the information of the request
msg_proc_func : func
the function to process the message when receiving response, should have arg `*args`
msg_queue: Queue
The queue to pass the messsage after callback
"""
head_info = {"version": __version__}
def request_callback(*args):
"""callback_wrapper
:param *args: args[0] is the response content
"""
# args[0] is the response content
self.logger.debug("receive data and enter queue")
msg = dict(args[0])
if msg["detailed_info"] is not None:
if msg["status"] != 0:
self.logger.error(msg["detailed_info"])
else:
self.logger.info(msg["detailed_info"])
if msg["status"] != 0:
ex = ValueError(f"Bad response(status=={msg['status']}), detailed info: {msg['detailed_info']}")
msg_queue.put(ex)
else:
if msg_proc_func is not None:
try:
ret = msg_proc_func(msg["result"])
except Exception as e:
self.logger.exception("Error when processing message.")
ret = e
else:
ret = msg["result"]
msg_queue.put(ret)
self.disconnect()
self.logger.debug("disconnected")
self.logger.debug("try connecting")
self.connect_server()
self.logger.debug("connected")
# The pickle is for passing some parameters with special type(such as
# pd.Timestamp)
request_content = {"head": head_info, "body": pickle.dumps(request_content)}
self.sio.on(request_type + "_response", request_callback)
self.logger.debug("try sending")
self.sio.emit(request_type + "_request", request_content)
self.sio.wait()

1110
qlib/data/data.py Normal file

File diff suppressed because it is too large Load Diff

375
qlib/data/filter.py Normal file
View File

@@ -0,0 +1,375 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import print_function
from abc import abstractmethod
import re
import pandas as pd
import numpy as np
import six
import abc
from .data import Cal, DatasetD
@six.add_metaclass(abc.ABCMeta)
class BaseDFilter(object):
"""Dynamic Instruments Filter Abstract class
Users can override this class to construct their own filter
Override __init__ to input filter regulations
Override filter_main to use the regulations to filter instruments
"""
def __init__(self):
pass
@staticmethod
def from_config(config):
"""Construct an instance from config dict.
Parameters
----------
config : dict
dict of config parameters
"""
raise NotImplementedError("Subclass of BaseDFilter must reimplement `from_config` method")
@abstractmethod
def to_config(self):
"""Construct an instance from config dict.
Returns
----------
dict
return the dict of config parameters
"""
raise NotImplementedError("Subclass of BaseDFilter must reimplement `to_config` method")
@six.add_metaclass(abc.ABCMeta)
class SeriesDFilter(BaseDFilter):
"""Dynamic Instruments Filter Abstract class to filter a series of certain features
Filters should provide parameters:
- filter start time
- filter end time
- filter rule
Override __init__ to assign a certain rule to filter the series.
Override _getFilterSeries to use the rule to filter the series and get a dict of {inst => series}, or override filter_main for more advanced series filter rule
"""
def __init__(self, fstart_time=None, fend_time=None):
"""Init function for filter base class.
Filter a set of instruments based on a certain rule within a certain period assigned by fstart_time and fend_time.
Parameters
----------
fstart_time: str
the time for the filter rule to start filter the instruments
fend_time: str
the time for the filter rule to stop filter the instruments
"""
super(SeriesDFilter, self).__init__()
self.filter_start_time = pd.Timestamp(fstart_time) if fstart_time else None
self.filter_end_time = pd.Timestamp(fend_time) if fend_time else None
def _getTimeBound(self, instruments):
"""Get time bound for all instruments.
Parameters
----------
instruments: dict
the dict of instruments in the form {instrument_name => list of timestamp tuple}
Returns
----------
pd.Timestamp, pd.Timestamp
the lower time bound and upper time bound of all the instruments
"""
trange = Cal.calendar(freq=self.filter_freq)
ubound, lbound = trange[0], trange[-1]
for _, timestamp in instruments.items():
if timestamp:
lbound = timestamp[0][0] if timestamp[0][0] < lbound else lbound
ubound = timestamp[-1][-1] if timestamp[-1][-1] > ubound else ubound
return lbound, ubound
def _toSeries(self, time_range, target_timestamp):
"""Convert the target timestamp to a pandas series of bool value within a time range.
Make the time inside the target_timestamp range TRUE, others FALSE.
Parameters
----------
time_range : D.calendar
the time range of the instruments
target_timestamp : list
the list of tuple (timestamp, timestamp)
Returns
----------
pd.Series
the series of bool value for an instrument
"""
# Construct a whole dict of {date => bool}
timestamp_series = {timestamp: False for timestamp in time_range}
# Convert to pd.Series
timestamp_series = pd.Series(timestamp_series)
# Fill the date within target_timestamp with TRUE
for start, end in target_timestamp:
timestamp_series[Cal.calendar(start_time=start, end_time=end, freq=self.filter_freq)] = True
return timestamp_series
def _filterSeries(self, timestamp_series, filter_series):
"""Filter the timestamp series with filter series by using element-wise AND operation of the two series
Parameters
----------
timestamp_series : pd.Series
the series of bool value indicating existing time
filter_series : pd.Series
the series of bool value indicating filter feature
Returns
----------
pd.Series
the series of bool value indicating whether the date satisfies the filter condition and exists in target timestamp
"""
fstart, fend = list(filter_series.keys())[0], list(filter_series.keys())[-1]
timestamp_series[fstart:fend] = timestamp_series[fstart:fend] & filter_series
return timestamp_series
def _toTimestamp(self, timestamp_series):
"""Convert the timestamp series to a list of tuple (timestamp, timestamp) indicating a continuous range of TRUE
Parameters
----------
timestamp_series: pd.Series
the series of bool value after being filtered
Returns
----------
list
the list of tuple (timestamp, timestamp)
"""
# sort the timestamp_series according to the timestamps
timestamp_series.sort_index()
timestamp = []
_lbool = None
_ltime = None
for _ts, _bool in timestamp_series.items():
# there is likely to be NAN when the filter series don't have the
# bool value, so we just change the NAN into False
if _bool == np.nan:
_bool = False
if _lbool is None:
_cur_start = _ts
_lbool = _bool
_ltime = _ts
continue
if (_lbool, _bool) == (True, False):
if _cur_start:
timestamp.append((_cur_start, _ltime))
elif (_lbool, _bool) == (False, True):
_cur_start = _ts
_lbool = _bool
_ltime = _ts
if _lbool:
timestamp.append((_cur_start, _ltime))
return timestamp
def __call__(self, instruments, start_time=None, end_time=None, freq="day"):
"""Call this filter to get filtered instruments list"""
self.filter_freq = freq
return self.filter_main(instruments, start_time, end_time)
@abstractmethod
def _getFilterSeries(self, instruments, fstart, fend):
"""Get filter series based on the rules assigned during the initialization and the input time range.
Parameters
----------
instruments : dict
the dict of instruments to be filtered
fstart : pd.Timestamp
start time of filter
fend : pd.Timestamp
end time of filter
.. note:: fstart/fend indicates the intersection of instruments start/end time and filter start/end time
Returns
----------
pd.Dataframe
a series of {pd.Timestamp => bool}
"""
raise NotImplementedError("Subclass of SeriesDFilter must reimplement `getFilterSeries` method")
def filter_main(self, instruments, start_time=None, end_time=None):
"""Implement this method to filter the instruments.
Parameters
----------
instruments: dict
input instruments to be filtered
start_time: str
start of the time range
end_time: str
end of the time range
Returns
----------
dict
filtered instruments, same structure as input instruments
"""
lbound, ubound = self._getTimeBound(instruments)
start_time = pd.Timestamp(start_time or lbound)
end_time = pd.Timestamp(end_time or ubound)
_instruments_filtered = {}
_all_calendar = Cal.calendar(start_time=start_time, end_time=end_time, freq=self.filter_freq)
_filter_calendar = Cal.calendar(
start_time=self.filter_start_time and max(self.filter_start_time, _all_calendar[0]) or _all_calendar[0],
end_time=self.filter_end_time and min(self.filter_end_time, _all_calendar[-1]) or _all_calendar[-1],
freq=self.filter_freq,
)
_all_filter_series = self._getFilterSeries(instruments, _filter_calendar[0], _filter_calendar[-1])
for inst, timestamp in instruments.items():
# Construct a whole map of date
_timestamp_series = self._toSeries(_all_calendar, timestamp)
# Get filter series
if inst in _all_filter_series:
_filter_series = _all_filter_series[inst]
else:
if self.keep:
_filter_series = pd.Series({timestamp: True for timestamp in _filter_calendar})
else:
_filter_series = pd.Series({timestamp: False for timestamp in _filter_calendar})
# Calculate bool value within the range of filter
_timestamp_series = self._filterSeries(_timestamp_series, _filter_series)
# Reform the map to (start_timestamp, end_timestamp) format
_timestamp = self._toTimestamp(_timestamp_series)
# Remove empty timestamp
if _timestamp:
_instruments_filtered[inst] = _timestamp
return _instruments_filtered
class NameDFilter(SeriesDFilter):
"""Name dynamic instrument filter
Filter the instruments based on a regulated name format.
A name rule regular expression is required.
"""
def __init__(self, name_rule_re, fstart_time=None, fend_time=None):
"""Init function for name filter class
params:
------
name_rule_re: str
regular expression for the name rule
"""
super(NameDFilter, self).__init__(fstart_time, fend_time)
self.name_rule_re = name_rule_re
def _getFilterSeries(self, instruments, fstart, fend):
all_filter_series = {}
filter_calendar = Cal.calendar(start_time=fstart, end_time=fend, freq=self.filter_freq)
for inst, timestamp in instruments.items():
if re.match(self.name_rule_re, inst):
_filter_series = pd.Series({timestamp: True for timestamp in filter_calendar})
else:
_filter_series = pd.Series({timestamp: False for timestamp in filter_calendar})
all_filter_series[inst] = _filter_series
return all_filter_series
@staticmethod
def from_config(config):
return NameDFilter(
name_rule_re=config["name_rule_re"],
fstart_time=config["filter_start_time"],
fend_time=config["filter_end_time"],
)
def to_config(self):
return {
"filter_type": "NameDFilter",
"name_rule_re": self.name_rule_re,
"filter_start_time": str(self.filter_start_time) if self.filter_start_time else self.filter_start_time,
"filter_end_time": str(self.filter_end_time) if self.filter_end_time else self.filter_end_time,
}
class ExpressionDFilter(SeriesDFilter):
"""Expression dynamic instrument filter
Filter the instruments based on a certain expression.
An expression rule indicating a certain feature field is required.
Examples
----------
- *basic features filter* : rule_expression = '$close/$open>5'
- *cross-sectional features filter* : rule_expression = '$rank($close)<10'
- *time-sequence features filter* : rule_expression = '$Ref($close, 3)>100'
"""
def __init__(self, rule_expression, fstart_time=None, fend_time=None, keep=False):
"""Init function for expression filter class
params:
------
fstart_time: str
filter the feature starting from this time
fend_time: str
filter the feature ending by this time
rule_expression: str
an input expression for the rule
keep: bool
whether to keep the instruments of which features don't exist in the filter time span
"""
super(ExpressionDFilter, self).__init__(fstart_time, fend_time)
self.rule_expression = rule_expression
self.keep = keep
def _getFilterSeries(self, instruments, fstart, fend):
# do not use dataset cache
try:
_features = DatasetD.dataset(
instruments,
[self.rule_expression],
fstart,
fend,
freq=self.filter_freq,
disk_cache=0,
)
except TypeError:
# use LocalDatasetProvider
_features = DatasetD.dataset(instruments, [self.rule_expression], fstart, fend, freq=self.filter_freq)
rule_expression_field_name = list(_features.keys())[0]
all_filter_series = _features[rule_expression_field_name]
return all_filter_series
def from_config(config):
return ExpressionDFilter(
rule_expression=config["rule_expression"],
fstart_time=config["filter_start_time"],
fend_time=config["filter_end_time"],
keep=config["keep"],
)
def to_config(self):
return {
"filter_type": "ExpressionDFilter",
"rule_expression": self.rule_expression,
"filter_start_time": str(self.filter_start_time) if self.filter_start_time else self.filter_start_time,
"filter_end_time": str(self.filter_end_time) if self.filter_end_time else self.filter_end_time,
"keep": self.keep,
}

1405
qlib/data/ops.py Normal file

File diff suppressed because it is too large Load Diff