From c14a99a735cdfffc2d4f1fbdbb687269c71f7b85 Mon Sep 17 00:00:00 2001 From: zhupr Date: Wed, 25 Nov 2020 17:35:26 +0800 Subject: [PATCH] Fix TopkDropoutStrategy && dump_bin --- qlib/contrib/strategy/strategy.py | 168 ++++++++++++++++++++++-------- scripts/README.md | 4 + scripts/dump_bin.py | 10 +- 3 files changed, 136 insertions(+), 46 deletions(-) diff --git a/qlib/contrib/strategy/strategy.py b/qlib/contrib/strategy/strategy.py index 6eac9bafe..2fc5dbc0f 100644 --- a/qlib/contrib/strategy/strategy.py +++ b/qlib/contrib/strategy/strategy.py @@ -26,7 +26,7 @@ class BaseStrategy: def generate_order_list(self, score_series, current, trade_exchange, pred_date, trade_date): """ - Parameters: + Parameters ----------- score_series : pd.Seires stock_id , score @@ -46,7 +46,7 @@ class BaseStrategy: def update(self, score_series, pred_date, trade_date): """User can use this method to update strategy state each trade date. - Parameters: + Parameters ----------- score_series : pd.Series stock_id , score @@ -140,12 +140,15 @@ class WeightStrategyBase(BaseStrategy, AdjustTimer): def generate_target_weight_position(self, score, current, trade_date): """ - Parameters: + Parameters ----------- - score : pred score for this trade date, pd.Series, index is stock_id, contain 'score' column - current : current position, use Position() class - trade_exchange : Exchange() - trade_date : trade date + score : pd.Series + pred score for this trade date, index is stock_id, contain 'score' column + current : Position + current position, use Position() class + trade_exchange : Exchange + trade_date : str, pd.Timestamp + trade date generate target position from score for this date and the current position The cash is not considered in the position """ @@ -153,7 +156,7 @@ class WeightStrategyBase(BaseStrategy, AdjustTimer): def generate_order_list(self, score_series, current, trade_exchange, pred_date, trade_date): """ - Parameters: + Parameters ---------- score_series : pd.Seires stock_id , 
score @@ -186,16 +189,29 @@ class WeightStrategyBase(BaseStrategy, AdjustTimer): class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): - def __init__(self, topk, n_drop, method="bottom", risk_degree=0.95, thresh=1, hold_thresh=1, **kwargs): + def __init__( + self, + topk, + n_drop, + method_sell="bottom", + method_buy="top", + risk_degree=0.95, + thresh=1, + hold_thresh=1, + only_tradable=False, + **kwargs, + ): """ - Parameters: - ----------- + Parameters + ---------- topk : int The number of stocks in the portfolio n_drop : int number of stocks to be replaced in each trading date - method : str - dropout method, random/bottom + method_sell : str + dropout method_sell, random/bottom + method_buy : str + dropout method_buy, random/top risk_degree : float position percentage of total value thresh : int @@ -203,12 +219,19 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): hold_thresh : int minimum holding days before sell stock , will check current.get_stock_count(order.stock_id) >= self.thresh + only_tradable : bool + will the strategy only consider the tradable stock when buying and selling. 
+ if only_tradable: + strategy will make decision with the tradable state of the stock info and avoid buying and selling untradable stocks + else: + strategy will make buy sell decision without checking the tradable state of the stock """ super(TopkDropoutStrategy, self).__init__() ListAdjustTimer.__init__(self, kwargs.get("adjust_dates", None)) self.topk = topk self.n_drop = n_drop - self.method = method + self.method_sell = method_sell + self.method_buy = method_buy self.risk_degree = risk_degree self.thresh = thresh # self.stock_count['code'] will be the days the stock has been hold @@ -216,6 +239,7 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): self.stock_count = {} self.hold_thresh = hold_thresh + self.only_tradable = only_tradable def get_risk_degree(self, date): """get_risk_degree @@ -226,42 +250,102 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): return self.risk_degree def generate_order_list(self, score_series, current, trade_exchange, pred_date, trade_date): - """ - Gnererate order list according to score_series at trade_date, will not change current. - - Parameters: - ----------- - score_series : pd.Series - stock_id , score - current : Position() - current of account - trade_exchange : Exchange() - exchange - pred_date : pd.Timestamp - predict date - trade_date : pd.Timestamp - trade date + """Generate order list according to score_series at trade_date. + Will not change current. 
+ Parameters + ---------- + score_series : pd.Series + stock_id , score + current : Position() + current of account + trade_exchange : Exchange() + exchange + pred_date : pd.Timestamp + predict date + trade_date : pd.Timestamp + trade date """ if not self.is_adjust(trade_date): return [] + + if self.only_tradable: + # If the strategy only considers tradable stocks when making decisions, + # it needs the following helpers to filter stocks + def get_first_n(l, n, reverse=False): + cur_n = 0 + res = [] + for si in reversed(l) if reverse else l: + if trade_exchange.is_stock_tradable(stock_id=si, trade_date=trade_date): + res.append(si) + cur_n += 1 + if cur_n >= n: + break + return res[::-1] if reverse else res + + def get_last_n(l, n): + return get_first_n(l, n, reverse=True) + + def filter_stock(l): + return [si for si in l if trade_exchange.is_stock_tradable(stock_id=si, trade_date=trade_date)] + + else: + # Otherwise, the strategy will make decisions without the stock tradable info + def get_first_n(l, n): + return list(l)[:n] + + def get_last_n(l, n): + return list(l)[-n:] + + def filter_stock(l): + return l + current_temp = copy.deepcopy(current) # generate order list for this adjust date sell_order_list = [] buy_order_list = [] # load score + cash = current_temp.get_cash() current_stock_list = current_temp.get_stock_list() + # last position (sorted by score) last = score_series.reindex(current_stock_list).sort_values(ascending=False).index - today = ( - score_series[~score_series.index.isin(last)] - .sort_values(ascending=False) - .index[: self.n_drop + self.topk - len(last)] - ) - comb = score_series.reindex(last.union(today)).sort_values(ascending=False).index - if self.method == "bottom": - sell = last[last.isin(comb[-self.n_drop :])] - elif self.method == "random": - sell = pd.Index(np.random.choice(last, self.n_drop) if len(last) else []) + # The new stocks we want to buy today **at most** + if self.method_buy == "top": + today = get_first_n( 
+ score_series[~score_series.index.isin(last)].sort_values(ascending=False).index, + self.n_drop + self.topk - len(last), + ) + elif self.method_buy == "random": + topk_candi = get_first_n(score_series.sort_values(ascending=False).index, self.topk) + candi = list(filter(lambda x: x not in last, topk_candi)) + n = self.n_drop + self.topk - len(last) + try: + today = np.random.choice(candi, n, replace=False) + except ValueError: + today = candi + else: + raise NotImplementedError(f"This type of input is not supported") + # combine(new stocks + last stocks), we will drop stocks from this list + # This avoids dropping a higher score stock and buying a lower score stock. + comb = score_series.reindex(last.union(pd.Index(today))).sort_values(ascending=False).index + + # Get the stock list we really want to sell (After filtering the case that we sell high and buy low) + if self.method_sell == "bottom": + sell = last[last.isin(get_last_n(comb, self.n_drop))] + elif self.method_sell == "random": + candi = filter_stock(last) + try: + sell = pd.Index(np.random.choice(candi, self.n_drop, replace=False) if len(last) else []) + except ValueError: # Not enough candidates + sell = candi + else: + raise NotImplementedError(f"This type of input is not supported") + + # Get the stock list we really want to buy buy = today[: len(sell) + self.topk - len(last)] + + # buy signal: if a stock falls into topk, it appears in buy_signal + buy_signal = score_series.sort_values(ascending=False).iloc[: self.topk].index + for code in current_stock_list: if not trade_exchange.is_stock_tradable(stock_id=code, trade_date=trade_date): continue @@ -285,12 +369,14 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): if trade_exchange.check_order(sell_order): sell_order_list.append(sell_order) trade_val, trade_cost, trade_price = trade_exchange.deal_order(sell_order, position=current_temp) + # update cash + cash += trade_val - trade_cost # sold del self.stock_count[code] else: # no buy signal, but the 
stock is kept self.stock_count[code] += 1 - elif code in buy: + elif code in buy_signal: # NOTE: This is different from the original version # get new buy signal # Only the stock fall in to topk will produce buy signal @@ -300,7 +386,7 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer): # buy new stock # note the current has been changed current_stock_list = current_temp.get_stock_list() - value = current_temp.get_cash() * self.risk_degree / len(buy) if len(buy) > 0 else 0 + value = cash * self.risk_degree / len(buy) if len(buy) > 0 else 0 # open_cost should be considered in the real trading environment, while the backtest in evaluate.py does not consider it # as the aim of demo is to accomplish same strategy as evaluate.py, so comment out this line diff --git a/scripts/README.md b/scripts/README.md index 88ebdc680..99af4a457 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -43,6 +43,8 @@ python get_data.py qlib_data --help ### US data +> Need to download data first: [Downlaod US Data](#Downlaod-US-Data) + ```python import qlib from qlib.config import REG_US @@ -52,6 +54,8 @@ qlib.init(provider_uri=provider_uri, region=REG_US) ### CN data +> Need to download data first: [Download CN Data](#Download-CN-Data) + ```python import qlib from qlib.config import REG_CN diff --git a/scripts/dump_bin.py b/scripts/dump_bin.py index 2bca4f037..9f6dd88e2 100644 --- a/scripts/dump_bin.py +++ b/scripts/dump_bin.py @@ -140,7 +140,7 @@ class DumpDataBase: def _get_source_data(self, file_path: Path) -> pd.DataFrame: df = pd.read_csv(str(file_path.resolve()), low_memory=False) - df[self.date_field_name] = df[self.date_field_name].astype(np.datetime64) + df[self.date_field_name] = df[self.date_field_name].astype(str).astype(np.datetime64) # df.drop_duplicates([self.date_field_name], inplace=True) return df @@ -339,10 +339,10 @@ class DumpDataFix(DumpDataAll): def dump(self): self._calendars_list = self._read_calendars(self._calendars_dir.joinpath(f"{self.freq}.txt")) 
# noinspection PyAttributeOutsideInit - self._old_instruments = self._read_instruments( - self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME) - ).to_dict( - orient="index" + self._old_instruments = ( + self._read_instruments(self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME)) + .set_index([self.symbol_field_name]) + .to_dict(orient="index") ) # type: dict self._dump_instruments() self._dump_features()