1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-02 10:31:00 +08:00

Add Structured Covariance Estimator to riskmodel.py

This commit is contained in:
Charles Young
2021-02-09 20:28:42 +08:00
parent 12c8bfa545
commit 988b42e159

View File

@@ -39,7 +39,7 @@ class RiskModel(BaseModel):
self.scale_return = scale_return
def predict(
self, X: Union[pd.Series, pd.DataFrame, np.ndarray], return_corr: bool = False, is_price: bool = True
self, X: Union[pd.Series, pd.DataFrame, np.ndarray], return_corr: bool = False, is_price: bool = True
) -> Union[pd.DataFrame, np.ndarray]:
"""
Args:
@@ -373,7 +373,8 @@ class ShrinkCovEstimator(RiskModel):
roff1 = np.sum(v1 * cov_mkt[:, None].T) / var_mkt - np.sum(np.diag(v1) * cov_mkt) / var_mkt
v3 = z.T.dot(z) / t - var_mkt * S
roff3 = (
np.sum(v3 * np.outer(cov_mkt, cov_mkt)) / var_mkt ** 2 - np.sum(np.diag(v3) * cov_mkt ** 2) / var_mkt ** 2
np.sum(v3 * np.outer(cov_mkt, cov_mkt)) / var_mkt ** 2 - np.sum(
np.diag(v3) * cov_mkt ** 2) / var_mkt ** 2
)
roff = 2 * roff1 - roff3
rho = rdiag + roff
@@ -433,7 +434,7 @@ class POETCovEstimator(RiskModel):
if self.num_factors > 0:
Dd, V = np.linalg.eig(Y.T.dot(Y))
V = V[:, np.argsort(Dd)]
F = V[:, -self.num_factors :][:, ::-1] * np.sqrt(n)
F = V[:, -self.num_factors:][:, ::-1] * np.sqrt(n)
LamPCA = Y.dot(F) / n
uhat = np.asarray(Y - LamPCA.dot(F.T))
Lowrank = np.asarray(LamPCA.dot(LamPCA.T))
@@ -465,3 +466,137 @@ class POETCovEstimator(RiskModel):
SigmaY = SigmaU + Lowrank
return SigmaY
class StructuredCovEstimator(RiskModel):
    """Structured Covariance Estimator

    This estimator assumes observations can be explained by a small number of factors

        X = B @ F.T + U

    where `X` holds observations (rows) of multiple variables (columns), `F` holds
    the factor loadings of every variable (rows) on every factor (columns), `B` holds
    the factor values of every observation (rows) on every factor (columns) and `U`
    is the residual matrix with the same shape as `X`.

    The structured covariance is then estimated as

        cov(X) = F @ cov(B) @ F.T + diag(var(U))

    i.e. the residual covariance is approximated by its diagonal only.

    Latent factor models are used to estimate the decomposition.
    Specifically, the following latent factor models are supported:
        - `pca`: Principal Component Analysis
        - `fa`: Factor Analysis

    Reference: [1] Fan, J., Liao, Y., & Liu, H. (2016). An overview of the estimation of large covariance and
        precision matrices. Econometrics Journal, 19(1), C1-C32. https://doi.org/10.1111/ectj.12061
    """

    FACTOR_MODEL_PCA = "pca"
    FACTOR_MODEL_FA = "fa"

    def __init__(
        self,
        factor_model: str = "pca",
        num_factors: int = 10,
        nan_option: str = "ignore",
        assume_centered: bool = False,
        scale_return: bool = True,
    ):
        """
        Args:
            factor_model (str): the latent factor models used to estimate the structured covariance (`pca`/`fa`).
            num_factors (int): number of components to keep.
            nan_option (str): nan handling option (`ignore`/`fill`).
            assume_centered (bool): whether the data is assumed to be centered.
            scale_return (bool): whether scale returns as percentage.
        """
        super().__init__(nan_option, assume_centered, scale_return)

        assert factor_model in [
            self.FACTOR_MODEL_PCA,
            self.FACTOR_MODEL_FA,
        ], 'factor_model={} is not supported'.format(factor_model)
        # `PCA` / `FactorAnalysis` are resolved at module level; the solver is
        # stored as a class and instantiated on every `_predict` call.
        self.solver = PCA if factor_model == self.FACTOR_MODEL_PCA else FactorAnalysis

        self.num_factors = num_factors

    def predict(
        self,
        X: Union[pd.Series, pd.DataFrame, np.ndarray],
        return_corr: bool = False,
        is_price: bool = True,
        return_decomposed_components: bool = False,
    ) -> Union[pd.DataFrame, np.ndarray, tuple]:
        """
        Args:
            X (pd.Series, pd.DataFrame or np.ndarray): data from which to estimate the covariance,
                with variables as columns and observations as rows.
            return_corr (bool): whether return the correlation matrix.
            is_price (bool): whether `X` contains price (if not assume stock returns).
            return_decomposed_components (bool): whether return decomposed components of the covariance matrix.

        Returns:
            tuple or pd.DataFrame or np.ndarray: decomposed covariance matrix `(F, cov_b, var_u)`,
                or the estimated covariance or correlation. A DataFrame is returned only when
                the input was a pandas object; decomposed components are always raw arrays.
        """
        assert not return_corr or not return_decomposed_components, \
            'Can only return either correlation matrix or decomposed components.'

        # transform input into 2D array
        if not isinstance(X, (pd.Series, pd.DataFrame)):
            columns = None
        else:
            if isinstance(X.index, pd.MultiIndex):
                if isinstance(X, pd.DataFrame):
                    X = X.iloc[:, 0].unstack(level="instrument")  # always use the first column
                else:
                    X = X.unstack(level="instrument")
            else:
                # X is 2D DataFrame
                pass
            columns = X.columns  # will be used to restore dataframe
            X = X.values

        # calculate pct_change
        if is_price:
            X = X[1:] / X[:-1] - 1  # NOTE: resulting `n - 1` rows

        # scale return
        if self.scale_return:
            # Build a new array instead of `X *= 100`: when `is_price` is False,
            # `X` is still the caller's array (or the buffer returned by
            # `DataFrame.values`), and in-place scaling would modify the
            # caller's data.
            X = X * 100

        # handle nan and centered
        X = self._preprocess(X)

        if return_decomposed_components:
            return self._predict(X, return_structured=True)

        # estimate covariance
        S = self._predict(X)

        # return correlation if needed
        if return_corr:
            vola = np.sqrt(np.diag(S))
            S = S / np.outer(vola, vola)

        # restore the pandas labels only when the input carried them
        if columns is None:
            return S
        return pd.DataFrame(S, index=columns, columns=columns)

    def _predict(self, X: np.ndarray, return_structured: bool = False) -> Union[np.ndarray, tuple]:
        """
        covariance estimation implementation

        Args:
            X (np.ndarray): data matrix containing multiple variables (columns) and observations (rows).
            return_structured (bool): whether return decomposed components of the covariance matrix.

        Returns:
            tuple or np.ndarray: decomposed covariance matrix `(F, cov_b, var_u)` or covariance matrix.
        """
        model = self.solver(self.num_factors, random_state=0).fit(X)

        F = model.components_.T  # num_features x num_factors
        B = model.transform(X)  # num_samples x num_factors
        U = X - B @ F.T  # residuals, same shape as X
        # `np.cov` collapses to a 0-d array when there is a single factor, which
        # `@` rejects; force the documented num_factors x num_factors shape.
        cov_b = np.atleast_2d(np.cov(B.T))  # num_factors x num_factors
        var_u = np.var(U, axis=0)  # diagonal

        if return_structured:
            return F, cov_b, var_u

        cov_x = F @ cov_b @ F.T + np.diag(var_u)

        return cov_x