1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-02 18:40:58 +08:00

Merge pull request #280 from yongzhengqi/main

Implement Enhanced Indexing as a Portfolio Optimizer
This commit is contained in:
you-n-g
2021-03-17 12:07:39 +08:00
committed by GitHub
14 changed files with 624 additions and 224 deletions

View File

@@ -7,7 +7,6 @@ import numpy as np
import pandas as pd
from ..backtest.order import Order
from ...utils import get_pre_trading_date
from .order_generator import OrderGenWInteract
@@ -390,11 +389,11 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
current_stock_list = current_temp.get_stock_list()
value = cash * self.risk_degree / len(buy) if len(buy) > 0 else 0
# open_cost should be considered in the real trading environment, while the backtest in evaluate.py does not consider it
# as the aim of demo is to accomplish same strategy as evaluate.py, so comment out this line
# open_cost should be considered in the real trading environment, while the backtest in evaluate.py does not
# consider it as the aim of demo is to accomplish same strategy as evaluate.py, so comment out this line
# value = value / (1+trade_exchange.open_cost) # set open_cost limit
for code in buy:
# check is stock supended
# check is stock suspended
if not trade_exchange.is_stock_tradable(stock_id=code, trade_date=trade_date):
continue
# buy order

View File

@@ -43,7 +43,8 @@ class Model(BaseModel):
# get weights
try:
wdf_train, wdf_valid = dataset.prepare(["train", "valid"], col_set=["weight"], data_key=DataHandlerLP.DK_L)
wdf_train, wdf_valid = dataset.prepare(["train", "valid"], col_set=["weight"],
data_key=DataHandlerLP.DK_L)
w_train, w_valid = wdf_train["weight"], wdf_valid["weight"]
except KeyError as e:
w_train = pd.DataFrame(np.ones_like(y_train.values), index=y_train.index)

View File

@@ -0,0 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from .base import RiskModel
from .poet import POETCovEstimator
from .shrink import ShrinkCovEstimator
from .structured import StructuredCovEstimator

View File

@@ -0,0 +1,147 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import inspect
import numpy as np
import pandas as pd
from typing import Union
from qlib.model.base import BaseModel
class RiskModel(BaseModel):
    """Risk Model

    A risk model is used to estimate the covariance matrix of stock returns.

    The public entry point is `predict`, which converts the input into a 2D array,
    optionally turns prices into returns, scales them, handles nan / centering and
    then delegates the actual estimation to `_predict`.
    Child classes are expected to override `_predict`.
    """

    MASK_NAN = "mask"
    FILL_NAN = "fill"
    IGNORE_NAN = "ignore"

    def __init__(self, nan_option: str = "ignore", assume_centered: bool = False, scale_return: bool = True):
        """
        Args:
            nan_option (str): nan handling option (`ignore`/`mask`/`fill`).
            assume_centered (bool): whether the data is assumed to be centered.
            scale_return (bool): whether scale returns as percentage.
        """
        # nan
        assert nan_option in [
            self.MASK_NAN,
            self.FILL_NAN,
            self.IGNORE_NAN,
        ], f"`nan_option={nan_option}` is not supported"
        self.nan_option = nan_option

        self.assume_centered = assume_centered
        self.scale_return = scale_return

    def predict(
        self,
        X: Union[pd.Series, pd.DataFrame, np.ndarray],
        return_corr: bool = False,
        is_price: bool = True,
        return_decomposed_components=False,
    ) -> Union[pd.DataFrame, np.ndarray, tuple]:
        """
        Args:
            X (pd.Series, pd.DataFrame or np.ndarray): data from which to estimate the covariance,
                with variables as columns and observations as rows.
            return_corr (bool): whether return the correlation matrix.
            is_price (bool): whether `X` contains price (if not assume stock returns).
            return_decomposed_components (bool): whether return decomposed components of the covariance matrix.

        Returns:
            pd.DataFrame or np.ndarray: estimated covariance (or correlation).
            If `return_decomposed_components` is set, the tuple `(F, cov_b, var_u)` produced by
            `_predict(X, return_decomposed_components=True)` is returned instead.
        """
        assert (
            not return_corr or not return_decomposed_components
        ), "Can only return either correlation matrix or decomposed components."

        # transform input into 2D array
        if not isinstance(X, (pd.Series, pd.DataFrame)):
            columns = None
        else:
            if isinstance(X.index, pd.MultiIndex):
                if isinstance(X, pd.DataFrame):
                    X = X.iloc[:, 0].unstack(level="instrument")  # always use the first column
                else:
                    X = X.unstack(level="instrument")
            else:
                # X is 2D DataFrame
                pass
            columns = X.columns  # will be used to restore dataframe
            X = X.values

        # calculate pct_change
        if is_price:
            X = X[1:] / X[:-1] - 1  # NOTE: resulting `n - 1` rows

        # scale return
        if self.scale_return:
            # NOTE: build a new array instead of using `X *= 100`. When `is_price=False`, `X` may still be the
            # caller's own array (or the buffer behind `DataFrame.values`), so an in-place multiplication would
            # silently modify the caller's data and fails on read-only buffers.
            X = X * 100

        # handle nan and centered
        X = self._preprocess(X)

        # return decomposed components if needed
        if return_decomposed_components:
            # only estimators whose `_predict` accepts this keyword can return the decomposition
            assert (
                "return_decomposed_components" in inspect.getfullargspec(self._predict).args
            ), "This risk model does not support return decomposed components of the covariance matrix "

            F, cov_b, var_u = self._predict(X, return_decomposed_components=True)
            return F, cov_b, var_u

        # estimate covariance
        S = self._predict(X)

        # return correlation if needed
        if return_corr:
            vola = np.sqrt(np.diag(S))
            corr = S / np.outer(vola, vola)
            if columns is None:
                return corr
            return pd.DataFrame(corr, index=columns, columns=columns)

        # return covariance
        if columns is None:
            return S
        return pd.DataFrame(S, index=columns, columns=columns)

    def _predict(self, X: np.ndarray) -> np.ndarray:
        """covariance estimation implementation

        This method should be overridden by child classes.

        By default, this method implements the empirical covariance estimation.

        Args:
            X (np.ndarray): data matrix containing multiple variables (columns) and observations (rows).

        Returns:
            np.ndarray: covariance matrix.
        """
        xTx = np.asarray(X.T.dot(X))
        N = len(X)
        if isinstance(X, np.ma.MaskedArray):
            M = 1 - X.mask
            N = M.T.dot(M)  # each pair has distinct number of samples
        return xTx / N

    def _preprocess(self, X: np.ndarray) -> Union[np.ndarray, np.ma.MaskedArray]:
        """handle nan and centerize data

        Note:
            if `nan_option='mask'` then the returned array will be `np.ma.MaskedArray`.
        """
        # handle nan
        if self.nan_option == self.FILL_NAN:
            X = np.nan_to_num(X)
        elif self.nan_option == self.MASK_NAN:
            X = np.ma.masked_invalid(X)
        # centralize (column means are computed with `np.nanmean`, i.e. nan entries are skipped)
        if not self.assume_centered:
            X = X - np.nanmean(X, axis=0)
        return X

View File

@@ -0,0 +1,84 @@
import numpy as np
from qlib.model.riskmodel import RiskModel
class POETCovEstimator(RiskModel):
    """Principal Orthogonal Complement Thresholding Estimator (POET)

    Reference:
        [1] Fan, J., Liao, Y., & Mincheva, M. (2013). Large covariance estimation by thresholding principal
            orthogonal complements. Journal of the Royal Statistical Society. Series B: Statistical Methodology,
            75(4), 603-680. https://doi.org/10.1111/rssb.12016
        [2] http://econweb.rutgers.edu/yl1114/papers/poet/POET.m
    """

    THRESH_SOFT = "soft"
    THRESH_HARD = "hard"
    THRESH_SCAD = "scad"

    def __init__(self, num_factors: int = 0, thresh: float = 1.0, thresh_method: str = "soft", **kwargs):
        """
        Args:
            num_factors (int): number of factors (if set to zero, no factor model will be used).
            thresh (float): the non-negative constant for thresholding.
            thresh_method (str): thresholding method, which can be
                - 'soft': soft thresholding.
                - 'hard': hard thresholding.
                - 'scad': scad thresholding.
            kwargs: see `RiskModel` for more information.
        """
        super().__init__(**kwargs)

        # zero is explicitly allowed for both parameters, so the messages say "non-negative"
        assert num_factors >= 0, "`num_factors` requires a non-negative integer"
        self.num_factors = num_factors

        assert thresh >= 0, "`thresh` requires a non-negative float number"
        self.thresh = thresh

        assert thresh_method in [
            self.THRESH_HARD,
            self.THRESH_SOFT,
            self.THRESH_SCAD,
        ], "`thresh_method` should be `soft`/`hard`/`scad`"
        self.thresh_method = thresh_method

    def _predict(self, X: np.ndarray) -> np.ndarray:
        """POET covariance estimation

        Args:
            X (np.ndarray): data matrix containing multiple variables (columns) and observations (rows).

        Returns:
            np.ndarray: estimated covariance, i.e. the thresholded residual covariance plus the low-rank
                factor part (the latter is zero when `num_factors == 0`).
        """
        Y = X.T  # NOTE: to match POET's implementation
        p, n = Y.shape

        if self.num_factors > 0:
            # `Y.T @ Y` is symmetric, so use `eigh`: it always returns real eigenpairs, while the general
            # purpose `eig` can return complex values (spurious imaginary parts) for such matrices.
            Dd, V = np.linalg.eigh(Y.T.dot(Y))
            V = V[:, np.argsort(Dd)]
            # eigenvectors of the `num_factors` largest eigenvalues, largest first
            F = V[:, -self.num_factors :][:, ::-1] * np.sqrt(n)
            LamPCA = Y.dot(F) / n
            uhat = np.asarray(Y - LamPCA.dot(F.T))
            Lowrank = np.asarray(LamPCA.dot(LamPCA.T))
            rate = 1 / np.sqrt(p) + np.sqrt(np.log(p) / n)
        else:
            uhat = np.asarray(Y)
            rate = np.sqrt(np.log(p) / n)
            Lowrank = 0

        lamb = rate * self.thresh
        SuPCA = uhat.dot(uhat.T) / n
        SuDiag = np.diag(np.diag(SuPCA))
        # square root of the diagonal and its inverse are reused below, compute them once
        SuDiagSqrt = SuDiag ** 0.5
        SuDiagSqrtInv = np.linalg.inv(SuDiagSqrt)
        R = SuDiagSqrtInv.dot(SuPCA).dot(SuDiagSqrtInv)  # residual correlation matrix

        if self.thresh_method == self.THRESH_HARD:
            M = R * (np.abs(R) > lamb)
        elif self.thresh_method == self.THRESH_SOFT:
            res = np.abs(R) - lamb
            res = (res + np.abs(res)) / 2  # max(res, 0)
            M = np.sign(R) * res
        else:
            M1 = (np.abs(R) < 2 * lamb) * np.sign(R) * (np.abs(R) - lamb) * (np.abs(R) > lamb)
            M2 = (np.abs(R) < 3.7 * lamb) * (np.abs(R) >= 2 * lamb) * (2.7 * R - 3.7 * np.sign(R) * lamb) / 1.7
            M3 = (np.abs(R) >= 3.7 * lamb) * R
            M = M1 + M2 + M3

        # thresholding is applied to off-diagonal entries only: the diagonal is reset to one
        Rthresh = M - np.diag(np.diag(M)) + np.eye(p)
        SigmaU = SuDiagSqrt.dot(Rthresh).dot(SuDiagSqrt)
        SigmaY = SigmaU + Lowrank
        return SigmaY

View File

@@ -1,133 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import warnings
import numpy as np
import pandas as pd
from typing import Union
from qlib.model.base import BaseModel
class RiskModel(BaseModel):
    """Risk Model

    A risk model is used to estimate the covariance matrix of stock returns.

    The public entry point is `predict`, which converts the input into a 2D array,
    optionally turns prices into returns, scales them, handles nan / centering and
    then delegates the actual estimation to `_predict`.
    Child classes are expected to override `_predict`.
    """

    MASK_NAN = "mask"
    FILL_NAN = "fill"
    IGNORE_NAN = "ignore"

    def __init__(self, nan_option: str = "ignore", assume_centered: bool = False, scale_return: bool = True):
        """
        Args:
            nan_option (str): nan handling option (`ignore`/`mask`/`fill`).
            assume_centered (bool): whether the data is assumed to be centered.
            scale_return (bool): whether scale returns as percentage.
        """
        # nan
        assert nan_option in [
            self.MASK_NAN,
            self.FILL_NAN,
            self.IGNORE_NAN,
        ], f"`nan_option={nan_option}` is not supported"
        self.nan_option = nan_option

        self.assume_centered = assume_centered
        self.scale_return = scale_return

    def predict(
        self, X: Union[pd.Series, pd.DataFrame, np.ndarray], return_corr: bool = False, is_price: bool = True
    ) -> Union[pd.DataFrame, np.ndarray]:
        """
        Args:
            X (pd.Series, pd.DataFrame or np.ndarray): data from which to estimate the covariance,
                with variables as columns and observations as rows.
            return_corr (bool): whether return the correlation matrix.
            is_price (bool): whether `X` contains price (if not assume stock returns).

        Returns:
            pd.DataFrame or np.ndarray: estimated covariance (or correlation).
        """
        # transform input into 2D array
        if not isinstance(X, (pd.Series, pd.DataFrame)):
            columns = None
        else:
            if isinstance(X.index, pd.MultiIndex):
                if isinstance(X, pd.DataFrame):
                    X = X.iloc[:, 0].unstack(level="instrument")  # always use the first column
                else:
                    X = X.unstack(level="instrument")
            else:
                # X is 2D DataFrame
                pass
            columns = X.columns  # will be used to restore dataframe
            X = X.values

        # calculate pct_change
        if is_price:
            X = X[1:] / X[:-1] - 1  # NOTE: resulting `n - 1` rows

        # scale return
        if self.scale_return:
            # NOTE: build a new array instead of using `X *= 100`. When `is_price=False`, `X` may still be the
            # caller's own array (or the buffer behind `DataFrame.values`), so an in-place multiplication would
            # silently modify the caller's data and fails on read-only buffers.
            X = X * 100

        # handle nan and centered
        X = self._preprocess(X)

        # estimate covariance
        S = self._predict(X)

        # return correlation if needed
        if return_corr:
            vola = np.sqrt(np.diag(S))
            corr = S / np.outer(vola, vola)
            if columns is None:
                return corr
            return pd.DataFrame(corr, index=columns, columns=columns)

        # return covariance
        if columns is None:
            return S
        return pd.DataFrame(S, index=columns, columns=columns)

    def _predict(self, X: np.ndarray) -> np.ndarray:
        """covariance estimation implementation

        This method should be overridden by child classes.

        By default, this method implements the empirical covariance estimation.

        Args:
            X (np.ndarray): data matrix containing multiple variables (columns) and observations (rows).

        Returns:
            np.ndarray: covariance matrix.
        """
        xTx = np.asarray(X.T.dot(X))
        N = len(X)
        if isinstance(X, np.ma.MaskedArray):
            M = 1 - X.mask
            N = M.T.dot(M)  # each pair has distinct number of samples
        return xTx / N

    def _preprocess(self, X: np.ndarray) -> Union[np.ndarray, np.ma.MaskedArray]:
        """handle nan and centerize data

        Note:
            if `nan_option='mask'` then the returned array will be `np.ma.MaskedArray`.
        """
        # handle nan
        if self.nan_option == self.FILL_NAN:
            X = np.nan_to_num(X)
        elif self.nan_option == self.MASK_NAN:
            X = np.ma.masked_invalid(X)
        # centerize (column means are computed with `np.nanmean`, i.e. nan entries are skipped)
        if not self.assume_centered:
            X = X - np.nanmean(X, axis=0)
        return X
from qlib.model.riskmodel import RiskModel
class ShrinkCovEstimator(RiskModel):
@@ -162,8 +36,9 @@ class ShrinkCovEstimator(RiskModel):
[3] Ledoit, O., & Wolf, M. (2003). Improved estimation of the covariance matrix of stock returns
with an application to portfolio selection.
Journal of Empirical Finance, 10(5), 603-621. https://doi.org/10.1016/S0927-5398(03)00007-0
[4] Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O. (2010). Shrinkage algorithms for MMSE covariance estimation.
IEEE Transactions on Signal Processing, 58(10), 5016-5029. https://doi.org/10.1109/TSP.2010.2053029
[4] Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O. (2010). Shrinkage algorithms for MMSE covariance
estimation. IEEE Transactions on Signal Processing, 58(10), 5016-5029.
https://doi.org/10.1109/TSP.2010.2053029
[5] https://www.econ.uzh.ch/dam/jcr:ffffffff-935a-b0d6-0000-00007f64e5b9/cov1para.m.zip
[6] https://www.econ.uzh.ch/dam/jcr:ffffffff-935a-b0d6-ffff-ffffde5e2d4e/covCor.m.zip
[7] https://www.econ.uzh.ch/dam/jcr:ffffffff-935a-b0d6-0000-0000648dfc98/covMarket.m.zip
@@ -384,84 +259,3 @@ class ShrinkCovEstimator(RiskModel):
alpha = max(0, min(1, kappa / t))
return alpha
class POETCovEstimator(RiskModel):
    """Principal Orthogonal Complement Thresholding Estimator (POET)

    Reference:
        [1] Fan, J., Liao, Y., & Mincheva, M. (2013). Large covariance estimation by thresholding principal
            orthogonal complements. Journal of the Royal Statistical Society. Series B: Statistical Methodology,
            75(4), 603-680. https://doi.org/10.1111/rssb.12016
        [2] http://econweb.rutgers.edu/yl1114/papers/poet/POET.m
    """

    THRESH_SOFT = "soft"
    THRESH_HARD = "hard"
    THRESH_SCAD = "scad"

    def __init__(self, num_factors: int = 0, thresh: float = 1.0, thresh_method: str = "soft", **kwargs):
        """
        Args:
            num_factors (int): number of factors (if set to zero, no factor model will be used).
            thresh (float): the non-negative constant for thresholding.
            thresh_method (str): thresholding method, which can be
                - 'soft': soft thresholding.
                - 'hard': hard thresholding.
                - 'scad': scad thresholding.
            kwargs: see `RiskModel` for more information.
        """
        super().__init__(**kwargs)

        # zero is explicitly allowed for both parameters, so the messages say "non-negative"
        assert num_factors >= 0, "`num_factors` requires a non-negative integer"
        self.num_factors = num_factors

        assert thresh >= 0, "`thresh` requires a non-negative float number"
        self.thresh = thresh

        assert thresh_method in [
            self.THRESH_HARD,
            self.THRESH_SOFT,
            self.THRESH_SCAD,
        ], "`thresh_method` should be `soft`/`hard`/`scad`"
        self.thresh_method = thresh_method

    def _predict(self, X: np.ndarray) -> np.ndarray:
        """POET covariance estimation

        Args:
            X (np.ndarray): data matrix containing multiple variables (columns) and observations (rows).

        Returns:
            np.ndarray: estimated covariance, i.e. the thresholded residual covariance plus the low-rank
                factor part (the latter is zero when `num_factors == 0`).
        """
        Y = X.T  # NOTE: to match POET's implementation
        p, n = Y.shape

        if self.num_factors > 0:
            # `Y.T @ Y` is symmetric, so use `eigh`: it always returns real eigenpairs, while the general
            # purpose `eig` can return complex values (spurious imaginary parts) for such matrices.
            Dd, V = np.linalg.eigh(Y.T.dot(Y))
            V = V[:, np.argsort(Dd)]
            # eigenvectors of the `num_factors` largest eigenvalues, largest first
            F = V[:, -self.num_factors :][:, ::-1] * np.sqrt(n)
            LamPCA = Y.dot(F) / n
            uhat = np.asarray(Y - LamPCA.dot(F.T))
            Lowrank = np.asarray(LamPCA.dot(LamPCA.T))
            rate = 1 / np.sqrt(p) + np.sqrt(np.log(p) / n)
        else:
            uhat = np.asarray(Y)
            rate = np.sqrt(np.log(p) / n)
            Lowrank = 0

        lamb = rate * self.thresh
        SuPCA = uhat.dot(uhat.T) / n
        SuDiag = np.diag(np.diag(SuPCA))
        # square root of the diagonal and its inverse are reused below, compute them once
        SuDiagSqrt = SuDiag ** 0.5
        SuDiagSqrtInv = np.linalg.inv(SuDiagSqrt)
        R = SuDiagSqrtInv.dot(SuPCA).dot(SuDiagSqrtInv)  # residual correlation matrix

        if self.thresh_method == self.THRESH_HARD:
            M = R * (np.abs(R) > lamb)
        elif self.thresh_method == self.THRESH_SOFT:
            res = np.abs(R) - lamb
            res = (res + np.abs(res)) / 2  # max(res, 0)
            M = np.sign(R) * res
        else:
            M1 = (np.abs(R) < 2 * lamb) * np.sign(R) * (np.abs(R) - lamb) * (np.abs(R) > lamb)
            M2 = (np.abs(R) < 3.7 * lamb) * (np.abs(R) >= 2 * lamb) * (2.7 * R - 3.7 * np.sign(R) * lamb) / 1.7
            M3 = (np.abs(R) >= 3.7 * lamb) * R
            M = M1 + M2 + M3

        # thresholding is applied to off-diagonal entries only: the diagonal is reset to one
        Rthresh = M - np.diag(np.diag(M)) + np.eye(p)
        SigmaU = SuDiagSqrt.dot(Rthresh).dot(SuDiagSqrt)
        SigmaY = SigmaU + Lowrank
        return SigmaY

View File

@@ -0,0 +1,84 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import numpy as np
import pandas as pd
from typing import Union
from sklearn.decomposition import PCA, FactorAnalysis
from qlib.model.riskmodel import RiskModel
class StructuredCovEstimator(RiskModel):
    """Structured Covariance Estimator

    This estimator assumes observations can be predicted by multiple factors
        X = B @ F.T + U
    where `X` contains observations (row) of multiple variables (column),
    `F` contains the factor loadings (num_variables x num_factors),
    `B` contains the factor values of each observation (num_observations x num_factors)
    and `U` is the residual.

    Therefore the structured covariance can be estimated by
        cov(X) = F @ cov(B) @ F.T + diag(var(U))

    We use latent factor models to estimate the structured covariance.
    Specifically, the following latent factor models are supported:
        - `pca`: Principal Component Analysis
        - `fa`: Factor Analysis

    Reference: [1] Fan, J., Liao, Y., & Liu, H. (2016). An overview of the estimation of large covariance and
    precision matrices. Econometrics Journal, 19(1), C1-C32. https://doi.org/10.1111/ectj.12061
    """

    FACTOR_MODEL_PCA = "pca"
    FACTOR_MODEL_FA = "fa"
    DEFAULT_NAN_OPTION = "fill"

    def __init__(self, factor_model: str = "pca", num_factors: int = 10, **kwargs):
        """
        Args:
            factor_model (str): the latent factor models used to estimate the structured covariance (`pca`/`fa`).
            num_factors (int): number of components to keep.
            kwargs: see `RiskModel` for more information
        """
        # only `nan_option="fill"` is accepted; it is also used when the caller does not specify one
        if "nan_option" in kwargs:
            assert kwargs["nan_option"] in [self.DEFAULT_NAN_OPTION], "nan_option={} is not supported".format(
                kwargs["nan_option"]
            )
        else:
            kwargs["nan_option"] = self.DEFAULT_NAN_OPTION

        super().__init__(**kwargs)

        assert factor_model in [
            self.FACTOR_MODEL_PCA,
            self.FACTOR_MODEL_FA,
        ], "factor_model={} is not supported".format(factor_model)
        self.solver = PCA if factor_model == self.FACTOR_MODEL_PCA else FactorAnalysis

        self.num_factors = num_factors

    def _predict(self, X: np.ndarray, return_decomposed_components=False) -> Union[np.ndarray, tuple]:
        """
        covariance estimation implementation

        Args:
            X (np.ndarray): data matrix containing multiple variables (columns) and observations (rows).
            return_decomposed_components (bool): whether return decomposed components of the covariance matrix.

        Returns:
            tuple or np.ndarray: decomposed covariance matrix `(F, cov_b, var_u)` or covariance matrix.
        """
        model = self.solver(n_components=self.num_factors, random_state=0).fit(X)

        F = model.components_.T  # num_features x num_factors
        B = model.transform(X)  # num_samples x num_factors
        U = X - B @ F.T
        # `np.cov` collapses to a 0-d array when there is a single factor, which cannot be used
        # in the matrix products below, so always keep `cov_b` two-dimensional.
        cov_b = np.atleast_2d(np.cov(B.T))  # num_factors x num_factors
        var_u = np.var(U, axis=0)  # diagonal

        if return_decomposed_components:
            return F, cov_b, var_u

        cov_x = F @ cov_b @ F.T + np.diag(var_u)

        return cov_x

View File

@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

View File

@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from .base import BaseOptimizer
from .optimizer import PortfolioOptimizer
from .enhanced_indexing import EnhancedIndexingOptimizer

View File

@@ -0,0 +1,13 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import abc
class BaseOptimizer(abc.ABC):
    """Abstract interface for optimization-based portfolio construction.

    Concrete optimizers implement `__call__`; this class cannot be instantiated directly.
    """

    @abc.abstractmethod
    def __call__(self, *args, **kwargs) -> object:
        """Generate an optimized portfolio allocation."""

View File

@@ -0,0 +1,143 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import numpy as np
import cvxpy as cp
import pandas as pd
from typing import Union
from qlib.portfolio.optimizer import BaseOptimizer
class EnhancedIndexingOptimizer(BaseOptimizer):
    """
    Portfolio Optimizer with Enhanced Indexing

    The optimizer maximizes `w @ u - lamb * risk(w)` where the risk is computed from the
    decomposed covariance (`F`, `covB`, `varU`), subject to benchmark / industry deviation
    limits and an optional turnover limit.

    Note:
        This optimizer always assumes full investment and no-shorting.
    """

    START_FROM_W0 = "w0"
    START_FROM_BENCH = "benchmark"

    def __init__(
        self,
        lamb: float = 10,
        delta: float = 0.4,
        bench_dev: float = 0.01,
        inds_dev: float = None,
        scale_alpha: bool = True,
        verbose: bool = False,
        warm_start: str = None,
        max_iters: int = 10000,
    ):
        """
        Args:
            lamb (float): risk aversion parameter (larger `lamb` means less focus on return)
            delta (float): turnover rate limit
            bench_dev (float): benchmark deviation limit
            inds_dev (float/None): industry deviation limit, set `inds_dev` to None to ignore industry specific
                restriction
            scale_alpha (bool): if to scale alpha to match the volatility of the covariance matrix
            verbose (bool): if print detailed information about the solver
            warm_start (str/None): whether try to warm start (`w0`/`benchmark`/None)
                (https://www.cvxpy.org/tutorial/advanced/index.html#warm-start)
            max_iters (int): `max_iters` option passed to the solver
        """
        # zero is accepted by all the checks below, hence "non-negative" in the messages
        assert lamb >= 0, "risk aversion parameter `lamb` should be non-negative"
        self.lamb = lamb

        assert delta >= 0, "turnover limit `delta` should be non-negative"
        self.delta = delta

        assert bench_dev >= 0, "benchmark deviation limit `bench_dev` should be non-negative"
        self.bench_dev = bench_dev

        assert inds_dev is None or inds_dev >= 0, "industry deviation limit `inds_dev` should be non-negative or None."
        self.inds_dev = inds_dev

        assert warm_start in [
            None,
            self.START_FROM_W0,
            self.START_FROM_BENCH,
        ], "illegal warm start option"
        self.start_from_w0 = warm_start == self.START_FROM_W0
        self.start_from_bench = warm_start == self.START_FROM_BENCH

        self.scale_alpha = scale_alpha
        self.verbose = verbose
        self.max_iters = max_iters

    def __call__(
        self,
        u: Union[np.ndarray, pd.Series],
        F: np.ndarray,
        covB: np.ndarray,
        varU: np.ndarray,
        w0: np.ndarray,
        w_bench: np.ndarray,
        inds_onehot: np.ndarray = None,
    ) -> Union[np.ndarray, pd.Series]:
        """
        Args:
            u (np.ndarray or pd.Series): expected returns (a.k.a., alpha)
            F, covB, varU (np.ndarray): see StructuredCovEstimator
            w0 (np.ndarray): initial weights (for turnover control), set to None to skip the turnover constraint
            w_bench (np.ndarray): benchmark weights
            inds_onehot (np.ndarray): industry (onehot), required when `inds_dev` is not None

        Returns:
            np.ndarray: optimized portfolio allocation (the solver's value of the weight variable
                wrapped by `np.asarray`, even when `u` is given as `pd.Series`)
        """
        assert inds_onehot is not None or self.inds_dev is None, "Industry onehot vector is required."

        # transform dataframe into array
        if isinstance(u, pd.Series):
            u = u.values

        # scale alpha to match volatility
        if self.scale_alpha:
            u_std = u.std()
            # A constant (or single asset) alpha has zero standard deviation: dividing by it would turn the
            # whole vector into nan/inf and break the solver, so the normalization is skipped in that case.
            if u_std > 0:
                u = u / u_std
            x_variance = np.mean(np.diag(F @ covB @ F.T) + varU)
            u = u * x_variance ** 0.5

        w = cp.Variable(len(u))  # num_assets
        v = w @ F  # num_factors
        ret = w @ u
        risk = cp.quad_form(v, covB) + cp.sum(cp.multiply(varU, w ** 2))
        obj = cp.Maximize(ret - self.lamb * risk)
        d_bench = w - w_bench
        cons = [
            w >= 0,  # no shorting
            cp.sum(w) == 1,  # full investment
            d_bench >= -self.bench_dev,
            d_bench <= self.bench_dev,
        ]

        if self.inds_dev is not None:
            d_inds = d_bench @ inds_onehot
            cons.append(d_inds >= -self.inds_dev)
            cons.append(d_inds <= self.inds_dev)

        if w0 is not None:
            turnover = cp.sum(cp.abs(w - w0))
            cons.append(turnover <= self.delta)

        # optionally provide an initial value for the solver
        warm_start = False
        if self.start_from_w0:
            if w0 is None:
                print("Warning: try warm start with w0, but w0 is `None`.")
            else:
                w.value = w0
                warm_start = True
        elif self.start_from_bench:
            w.value = w_bench
            warm_start = True

        prob = cp.Problem(obj, cons)
        prob.solve(solver=cp.SCS, verbose=self.verbose, warm_start=warm_start, max_iters=self.max_iters)

        if prob.status != "optimal":
            print("Warning: solve failed.", prob.status)

        return np.asarray(w.value)

View File

@@ -1,15 +1,17 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import warnings
import numpy as np
import pandas as pd
import scipy.optimize as so
from typing import Optional, Union, Callable, List
from qlib.portfolio.optimizer import BaseOptimizer
class PortfolioOptimizer:
class PortfolioOptimizer(BaseOptimizer):
"""Portfolio Optimizer
The following optimization algorithms are supported:
@@ -42,6 +44,7 @@ class PortfolioOptimizer:
lamb (float): risk aversion parameter (larger `lamb` means more focus on return)
delta (float): turnover rate limit
alpha (float): l2 norm regularizer
scale_alpha (bool): if to scale alpha to match the volatility of the covariance matrix
tol (float): tolerance for optimization termination
"""
assert method in [self.OPT_GMV, self.OPT_MVO, self.OPT_RP, self.OPT_INV], f"method `{method}` is not supported"
@@ -57,6 +60,7 @@ class PortfolioOptimizer:
self.alpha = alpha
self.tol = tol
self.scale_alpha = scale_alpha
def __call__(
self,
@@ -83,18 +87,18 @@ class PortfolioOptimizer:
if u is not None:
assert len(u) == len(S), "`u` has mismatched shape"
if isinstance(u, pd.Series):
assert all(u.index == index), "`u` has mismatched index"
assert u.index.equals(index), "`u` has mismatched index"
u = u.values
# transform initial weights
if w0 is not None:
assert len(w0) == len(S), "`w0` has mismatched shape"
if isinstance(w0, pd.Series):
assert all(w0.index == index), "`w0` has mismatched index"
assert w0.index.equals(index), "`w0` has mismatched index"
w0 = w0.values
# scale alpha to match volatility
if u is not None:
if u is not None and self.scale_alpha:
u = u / u.std()
u *= np.mean(np.diag(S)) ** 0.5
@@ -173,7 +177,7 @@ class PortfolioOptimizer:
"""
return self._solve(len(S), self._get_objective_rp(S), *self._get_constrains(w0))
def _get_objective_gmv(self, S: np.ndarray) -> np.ndarray:
def _get_objective_gmv(self, S: np.ndarray) -> Callable:
"""global minimum variance optimization objective
Optimization objective
@@ -185,7 +189,7 @@ class PortfolioOptimizer:
return func
def _get_objective_mvo(self, S: np.ndarray, u: np.ndarray = None) -> np.ndarray:
def _get_objective_mvo(self, S: np.ndarray, u: np.ndarray = None) -> Callable:
"""mean-variance optimization objective
Optimization objective
@@ -199,7 +203,7 @@ class PortfolioOptimizer:
return func
def _get_objective_rp(self, S: np.ndarray) -> np.ndarray:
def _get_objective_rp(self, S: np.ndarray) -> Callable:
"""risk-parity optimization objective
Optimization objective
@@ -247,7 +251,11 @@ class PortfolioOptimizer:
# add l2 regularization
wrapped_obj = obj
if self.alpha > 0:
wrapped_obj = lambda x: obj(x) + self.alpha * np.sum(np.square(x))
def opt_obj(x):
return obj(x) + self.alpha * np.sum(np.square(x))
wrapped_obj = opt_obj
# solve
x0 = np.ones(n) / n # init results

View File

@@ -55,6 +55,7 @@ REQUIRED = [
"tornado",
"joblib>=0.17.0",
"ruamel.yaml>=0.16.12",
"scikit-learn>=0.22",
]
# Numpy include

View File

@@ -0,0 +1,111 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import unittest
import numpy as np
from scipy.linalg import sqrtm
from qlib.model.riskmodel import StructuredCovEstimator
class TestStructuredCovEstimator(unittest.TestCase):
    """Compare `StructuredCovEstimator` against numpy's empirical covariance on random data."""

    def _assert_close(self, estimated, expected, eps):
        """Check that every entry of `estimated` differs from `expected` by less than `eps`."""
        gap = abs(estimated - expected)
        self.assertTrue((gap < eps).all())

    def test_random_covariance(self):
        # Estimate the covariance of a purely random matrix.
        n_var, n_obs, eps = 10, 200, 1e-6

        model = StructuredCovEstimator(scale_return=False, assume_centered=True)
        data = np.random.rand(n_obs, n_var)

        estimated = model.predict(data, is_price=False)
        # numpy treats rows as variables while qlib treats columns as variables, hence the transpose.
        expected = np.cov(data.T)

        self._assert_close(estimated, expected, eps)

    def test_nan_option_covariance(self):
        # Make sure an explicit `nan_option` is forwarded correctly.
        n_var, n_obs, eps = 10, 200, 1e-6

        model = StructuredCovEstimator(scale_return=False, assume_centered=True, nan_option="fill")
        data = np.random.rand(n_obs, n_var)

        estimated = model.predict(data, is_price=False)
        # numpy treats rows as variables while qlib treats columns as variables, hence the transpose.
        expected = np.cov(data.T)

        self._assert_close(estimated, expected, eps)

    def test_decompose_covariance(self):
        # Make sure `return_decomposed_components` is forwarded correctly.
        n_var, n_obs = 10, 200

        model = StructuredCovEstimator(scale_return=False, assume_centered=True, nan_option="fill")
        data = np.random.rand(n_obs, n_var)

        components = model.predict(data, is_price=False, return_decomposed_components=True)
        F, cov_b, var_u = components

        self.assertTrue(all(part is not None for part in (F, cov_b, var_u)))

    def test_constructed_covariance(self):
        # Estimate the covariance of a matrix crafted to carry significant correlation.
        n_var, n_obs, eps = 7, 500, 0.1

        model = StructuredCovEstimator(scale_return=False, assume_centered=True, num_factors=n_var - 1)

        # keep drawing candidate matrices until the matrix square root is real-valued
        while True:
            cov = np.random.rand(n_var, n_var)
            np.fill_diagonal(cov, 1)
            sqrt_cov = sqrtm(cov)
            if not np.iscomplex(sqrt_cov).any():
                break

        data = np.random.rand(n_obs, n_var) @ sqrt_cov

        estimated = model.predict(data, is_price=False)
        # numpy treats rows as variables while qlib treats columns as variables, hence the transpose.
        expected = np.cov(data.T)

        self._assert_close(estimated, expected, eps)

    def test_decomposition(self):
        # Estimate the covariance of data generated under the factor assumption,
        # i.e. observations are a combination of several factors plus noise.
        n_var, n_obs, n_factor, eps = 30, 100, 10, 0.1

        model = StructuredCovEstimator(scale_return=False, assume_centered=True, num_factors=n_factor)

        loadings = np.random.rand(n_var, n_factor)
        factors = np.random.rand(n_factor, n_obs)
        noise = np.random.rand(n_obs, n_var)
        data = (loadings @ factors).T + noise

        estimated = model.predict(data, is_price=False)
        # numpy treats rows as variables while qlib treats columns as variables, hence the transpose.
        expected = np.cov(data.T)

        self._assert_close(estimated, expected, eps)


if __name__ == "__main__":
    unittest.main()