From 1a0ac1ab6d644ff94ddfe19d56340c89603e7f7a Mon Sep 17 00:00:00 2001 From: you-n-g Date: Wed, 19 Jan 2022 10:39:37 +0800 Subject: [PATCH] Remove arctic from Qlib core to Contrib (#865) * Remove arctic from Qlib core to Contrib * fix empty df bug --- examples/orderbook_data/README.md | 1 + examples/orderbook_data/example.py | 6 +++- qlib/contrib/data/data.py | 55 ++++++++++++++++++++++++++++++ qlib/data/__init__.py | 1 - qlib/data/data.py | 43 +---------------------- setup.py | 1 - 6 files changed, 62 insertions(+), 45 deletions(-) create mode 100644 qlib/contrib/data/data.py diff --git a/examples/orderbook_data/README.md b/examples/orderbook_data/README.md index a4b25ac1a..c0aadd5c6 100644 --- a/examples/orderbook_data/README.md +++ b/examples/orderbook_data/README.md @@ -17,6 +17,7 @@ Current version of script with default value tries to connect localhost **via de Run following command to install necessary libraries ``` pip install pytest +pip install arctic # NOTE: pip may fail to resolve the right package dependencies! Please make sure the dependencies are satisfied.
``` # Importing example data diff --git a/examples/orderbook_data/example.py b/examples/orderbook_data/example.py index 6e3232229..3daa0a1ee 100644 --- a/examples/orderbook_data/example.py +++ b/examples/orderbook_data/example.py @@ -25,7 +25,11 @@ class TestClass(unittest.TestCase): mem_cache_type="sizeof", kernels=1, expression_provider={"class": "LocalExpressionProvider", "kwargs": {"time2idx": False}}, - feature_provider={"class": "ArcticFeatureProvider", "kwargs": {"uri": "127.0.0.1"}}, + feature_provider={ + "class": "ArcticFeatureProvider", + "module_path": "qlib.contrib.data.data", + "kwargs": {"uri": "127.0.0.1"}, + }, dataset_provider={ "class": "LocalDatasetProvider", "kwargs": { diff --git a/qlib/contrib/data/data.py b/qlib/contrib/data/data.py new file mode 100644 index 000000000..c153cfb8f --- /dev/null +++ b/qlib/contrib/data/data.py @@ -0,0 +1,55 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# We remove arctic from core framework of Qlib to contrib due to +# - Arctic has very strict limitation on pandas and numpy version +# - https://github.com/man-group/arctic/pull/908 +# - pip fail to computing the right version number!!!! +# - Maybe we can solve this problem by poetry + +# FIXME: So if you want to use arctic-based provider, please install arctic manually +# `pip install arctic` may not be enough. +from arctic import Arctic +import pandas as pd +import pymongo + +from qlib.data.data import FeatureProvider + + +class ArcticFeatureProvider(FeatureProvider): + def __init__( + self, uri="127.0.0.1", retry_time=0, market_transaction_time_list=[("09:15", "11:30"), ("13:00", "15:00")] + ): + super().__init__() + self.uri = uri + # TODO: + # retry connecting if error occurs + # does it real matters? 
+ self.retry_time = retry_time + # NOTE: this is especially important for TResample operator + self.market_transaction_time_list = market_transaction_time_list + + def feature(self, instrument, field, start_index, end_index, freq): + field = str(field)[1:] + with pymongo.MongoClient(self.uri) as client: + # TODO: this will result in frequently connecting the server and performance issue + arctic = Arctic(client) + + if freq not in arctic.list_libraries(): + raise ValueError("lib {} not in arctic".format(freq)) + + if instrument not in arctic[freq].list_symbols(): + # instruments does not exist + return pd.Series() + else: + df = arctic[freq].read(instrument, columns=[field], chunk_range=(start_index, end_index)) + s = df[field] + + if not s.empty: + s = pd.concat( + [ + s.between_time(time_tuple[0], time_tuple[1]) + for time_tuple in self.market_transaction_time_list + ] + ) + return s diff --git a/qlib/data/__init__.py b/qlib/data/__init__.py index 6549d16f7..ef5fe4708 100644 --- a/qlib/data/__init__.py +++ b/qlib/data/__init__.py @@ -15,7 +15,6 @@ from .data import ( LocalCalendarProvider, LocalInstrumentProvider, LocalFeatureProvider, - ArcticFeatureProvider, LocalExpressionProvider, LocalDatasetProvider, ClientCalendarProvider, diff --git a/qlib/data/data.py b/qlib/data/data.py index 9849f36ed..587d21d8d 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -17,11 +17,9 @@ import pandas as pd from multiprocessing import Pool from typing import Iterable, Union from typing import List, Union -from arctic import Arctic # For supporting multiprocessing in outer code, joblib is used from joblib import delayed -import pymongo from .cache import H from ..config import C @@ -582,7 +580,7 @@ class DatasetProvider(abc.ABC): data.index = _calendar[data.index.values.astype(int)] data.index.names = ["datetime"] - if spans is not None: + if not data.empty and spans is not None: mask = np.zeros(len(data), dtype=bool) for begin, end in spans: mask |= (data.index >= begin) 
& (data.index <= end) @@ -702,45 +700,6 @@ class LocalFeatureProvider(FeatureProvider, ProviderBackendMixin): return self.backend_obj(instrument=instrument, field=field, freq=freq)[start_index : end_index + 1] -class ArcticFeatureProvider(FeatureProvider): - def __init__( - self, uri="127.0.0.1", retry_time=0, market_transaction_time_list=[("09:15", "11:30"), ("13:00", "15:00")] - ): - super().__init__() - self.uri = uri - # TODO: - # retry connecting if error occurs - # does it real matters? - self.retry_time = retry_time - # NOTE: this is especially important for TResample operator - self.market_transaction_time_list = market_transaction_time_list - - def feature(self, instrument, field, start_index, end_index, freq): - field = str(field)[1:] - with pymongo.MongoClient(self.uri) as client: - # TODO: this will result in frequently connecting the server and performance issue - arctic = Arctic(client) - - if freq not in arctic.list_libraries(): - raise ValueError("lib {} not in arctic".format(freq)) - - if instrument not in arctic[freq].list_symbols(): - # instruments does not exist - return pd.Series() - else: - df = arctic[freq].read(instrument, columns=[field], chunk_range=(start_index, end_index)) - s = df[field] - - if not s.empty: - s = pd.concat( - [ - s.between_time(time_tuple[0], time_tuple[1]) - for time_tuple in self.market_transaction_time_list - ] - ) - return s - - class LocalExpressionProvider(ExpressionProvider): """Local expression data provider class diff --git a/setup.py b/setup.py index 1b9d4f490..ab397e1cf 100644 --- a/setup.py +++ b/setup.py @@ -78,7 +78,6 @@ REQUIRED = [ "dill", "dataclasses;python_version<'3.7'", "filelock", - "arctic", ] # Numpy include