Skip to content

Commit

Permalink
翻新barra部分和中性化函数
Browse files Browse the repository at this point in the history
  • Loading branch information
chen-001 committed Aug 5, 2024
1 parent 2e1929d commit 75e69fb
Show file tree
Hide file tree
Showing 3 changed files with 253 additions and 25 deletions.
29 changes: 27 additions & 2 deletions pure_ocean_breeze/jason/data/read_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__updated__ = "2023-07-26 16:42:17"
__updated__ = "2024-08-01 18:57:24"

import os
import numpy as np
Expand All @@ -9,6 +9,7 @@
from pure_ocean_breeze.jason.state.states import STATES
from pure_ocean_breeze.jason.state.homeplace import HomePlace
from pure_ocean_breeze.jason.state.decorators import *
from pure_ocean_breeze.jason.data.tools import boom_one
from cachier import cachier

try:
Expand Down Expand Up @@ -440,9 +441,33 @@ def deal_dummy(industry_dummy):
def moon_read_barra():
    """Read the barra style-factor parquet files plus a few derived factors.

    Returns a dict mapping factor display name -> weekly wide DataFrame
    (index is date, columns are stock codes), resampled with
    resample("W").last().

    NOTE(review): this span was recovered from an unmarked diff view and may
    interleave removed and added lines — verify against the committed file.
    """
    styles = os.listdir(homeplace.barra_data_file)
    # keep only real parquet files, skipping hidden dot-files
    styles = [i for i in styles if (i.endswith(".parquet")) and (i[0] != ".")]
    # 'together' files are pre-merged bundles, not individual style factors
    styles=[i for i in styles if 'together' not in i]
    # English file stem -> Chinese display name
    rename_dict = {
        "size": "市值",
        "nonlinearsize": "非线性市值",
        "booktoprice": "估值",
        "earningsyield": "盈利",
        "growth": "成长",
        "leverage": "杠杆",
        "liquidity": "流动性",
        "momentum": "动量",
        "residualvolatility": "波动率",
        "beta": "贝塔",
    }
    # extra factors derived from daily returns / turnover, smoothed to weekly
    facs_dict = {
        "反转_5天收益率均值": boom_one(read_daily(ret=1)),
        "波动_20天收益率标准差": read_daily(ret=1)
        .rolling(20, min_periods=10)
        .std()
        .resample("W")
        .last(),
        "换手_5天换手率均值": boom_one(read_daily(tr=1)),
    }
    barras = {}
    for s in styles:
        k = s.split(".")[0]
        v = pd.read_parquet(homeplace.barra_data_file + s).resample("W").last()
        # NOTE(review): the next two lines look like an unmarked old/new diff
        # pair; likely only the rename_dict[k] keying is current — confirm
        barras[k] = v
        barras[rename_dict[k]] = v
    barras.update(facs_dict)

    return barras
218 changes: 215 additions & 3 deletions pure_ocean_breeze/jason/data/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
针对一些不常见的文件格式,读取数据文件的一些工具函数,以及其他数据工具
"""

__updated__ = "2023-07-10 12:46:10"
__updated__ = "2024-08-01 18:55:29"

import os
import pandas as pd
Expand All @@ -16,6 +16,8 @@
import joblib
import mpire
import statsmodels.formula.api as smf
import polars as pl
import polars_ols as pls

from pure_ocean_breeze.jason.state.homeplace import HomePlace
from pure_ocean_breeze.jason.state.decorators import do_on_dfs
Expand Down Expand Up @@ -317,6 +319,20 @@ def standardlize(df: pd.DataFrame, all_pos: bool = 0) -> pd.DataFrame:
return df


@do_on_dfs
def boom_one(
    df: pd.DataFrame, backsee: int = 5, daily: bool = 0, min_periods: int = None
) -> pd.DataFrame:
    """Smooth `df` with a trailing rolling mean over `backsee` rows.

    `min_periods` defaults to half of `backsee` (truncated to int).
    When `daily` is falsy (the default) the smoothed frame is downsampled
    to weekly frequency via resample("W").last(); otherwise the daily
    rolling mean is returned unchanged.
    """
    if min_periods is None:
        min_periods = int(backsee * 0.5)
    smoothed = df.rolling(backsee, min_periods=min_periods).mean()
    if daily:
        return smoothed
    return smoothed.resample("W").last()

@do_on_dfs
def count_value(df: pd.DataFrame, with_zero: bool = 0) -> int:
"""计算dataframe中总共有多少(非0)非空的值
Expand Down Expand Up @@ -1236,7 +1252,7 @@ def count_pos_neg(s: Union[pd.Series, pd.DataFrame]):



def de_cross(
def de_cross_old(
y: pd.DataFrame, xs: Union[List[pd.DataFrame], pd.DataFrame]
) -> pd.DataFrame:
"""使用若干因子对某个因子进行正交化处理
Expand Down Expand Up @@ -1270,4 +1286,200 @@ def sing(date:pd.Timestamp):
with mpire.WorkerPool(20) as pool:
dfs=pool.map(sing,dates)
dfs=pd.concat(dfs).reset_index().pivot(index='date',columns='code',values='fac1')
return dfs
return dfs


def de_cross(
    y: Union[pd.DataFrame, pl.DataFrame],
    xs: Union[list[pd.DataFrame], list[pl.DataFrame]],
) -> pd.DataFrame:
    """Orthogonalize factor `y` against factors `xs` (polars implementation).

    Speed: ~11.5 seconds for 10 barra factors over 2016-2022.

    Parameters
    ----------
    y : Union[pd.DataFrame, pl.DataFrame]
        Factor under study, wide format: index is date, columns are stock codes.
    xs : Union[list[pd.DataFrame], list[pl.DataFrame]]
        Factors to be neutralized away, each in the same wide format as `y`.

    Returns
    -------
    pd.DataFrame
        Per-date cross-sectional OLS residuals, same wide format as `y`.
    """
    if isinstance(y, pd.DataFrame):
        # rename_axis returns a new object, so the caller's DataFrame is no
        # longer mutated (the old code set y.index.name in place)
        y = pl.from_pandas(y.rename_axis("date").reset_index())
    if isinstance(xs[0], pd.DataFrame):
        # likewise: build copies instead of mutating the caller's frames
        xs = [pl.from_pandas(x.rename_axis("date").reset_index()) for x in xs]
    # long format: one row per (date, code)
    y = y.unpivot(index="date", variable_name="code").drop_nulls()
    xs = [x.unpivot(index="date", variable_name="code").drop_nulls() for x in xs]
    # inner-join each regressor; colliding "value" columns become value_0, value_1, ...
    for num, i in enumerate(xs):
        y = y.join(i, on=["date", "code"], suffix=f"_{num}")
    y = (
        y.select(
            "date",
            "code",
            pl.col("value")
            .least_squares.ols(
                *[pl.col(f"value_{i}") for i in range(len(xs))],
                add_intercept=True,
                mode="residuals",
            )
            .over("date")  # one cross-sectional regression per date
            .alias("resid"),
        )
        .pivot("code", index="date", values="resid")
        .sort('date')
        .to_pandas()
        .set_index("date")
    )
    return y


def de_cross_special_for_barra_daily_jason(
    y: Union[pd.DataFrame, pl.DataFrame],
) -> pd.DataFrame:
    """Orthogonalize factor `y` against a fixed set of daily barra factors.

    Speed: ~3.2 seconds for 10 barra factors over 2016-2022.

    NOTE(review): a second function with this exact name is defined later in
    the file (reading "barra_daily_together_jason.parquet" instead of
    "barra_daily_together.parquet"); that later definition shadows this one,
    so this body is effectively dead code — rename one of them.

    Parameters
    ----------
    y : Union[pd.DataFrame, pl.DataFrame]
        Factor under study, wide format: index is date, columns are stock codes.

    Returns
    -------
    pd.DataFrame
        Per-date OLS residuals of `y` on the barra factors, same wide format.
    """
    if isinstance(y, pd.DataFrame):
        # NOTE(review): sets index.name in place, mutating the caller's DataFrame
        y.index.name='date'
        y = pl.from_pandas(y.reset_index())
    y = y.unpivot(index="date", variable_name="code").drop_nulls()
    xs = pl.read_parquet(
        homeplace.barra_data_file+"barra_daily_together.parquet" # my copy of this data is missing 2020-08-04 and 2020-08-05; your version may not be — harmless for speed testing
    )
    y = y.join(xs, on=["date", "code"])
    cols = y.columns[3:]
    y = (
        y.select(
            "date",
            "code",
            pl.col("value")
            .least_squares.ols(
                *[pl.col(i) for i in cols],
                add_intercept=True,
                mode="residuals",
            )
            .over("date")
            .alias("resid"),
        )
        .pivot("code", index="date", values="resid")
        .to_pandas()
        .set_index("date")
        .sort_index()
    )
    return y


def de_cross_special_for_barra_daily_jason(
    y: Union[pd.DataFrame, pl.DataFrame],
) -> pd.DataFrame:
    """Orthogonalize factor `y` against a fixed set of daily barra factors.

    Speed: ~3.2 seconds for 10 barra factors over 2016-2022.

    NOTE(review): this re-defines the function of the same name declared
    earlier in the file and therefore shadows it; the two differ only in the
    parquet file they read. Consider renaming one of them.

    Parameters
    ----------
    y : Union[pd.DataFrame, pl.DataFrame]
        Factor under study, wide format: index is date, columns are stock codes.

    Returns
    -------
    pd.DataFrame
        Per-date OLS residuals of `y` on the barra factors, same wide format.
    """
    if isinstance(y, pd.DataFrame):
        # rename_axis returns a new object, so the caller's DataFrame is no
        # longer mutated (the old code set y.index.name in place)
        y = pl.from_pandas(y.rename_axis("date").reset_index())
    y = y.unpivot(index="date", variable_name="code").drop_nulls()
    xs = pl.read_parquet(
        # my copy of this data is missing 2020-08-04 and 2020-08-05; your
        # version may not be — harmless for speed testing
        homeplace.barra_data_file + "barra_daily_together_jason.parquet"
    )
    y = y.join(xs, on=["date", "code"])
    cols = y.columns[3:]  # regressors: every joined column after date/code/value
    y = (
        y.select(
            "date",
            "code",
            pl.col("value")
            .least_squares.ols(
                *[pl.col(i) for i in cols],
                add_intercept=True,
                mode="residuals",
            )
            .over("date")  # one cross-sectional regression per date
            .alias("resid"),
        )
        .pivot("code", index="date", values="resid")
        .to_pandas()
        .set_index("date")
        .sort_index()
    )
    return y


def de_cross_special_for_barra_weekly(
    y: Union[pd.DataFrame, pl.DataFrame], with_corr: int = 1
) -> pd.DataFrame:
    """Orthogonalize factor `y` against the weekly barra + industry factor set.

    Parameters
    ----------
    y : Union[pd.DataFrame, pl.DataFrame]
        Factor under study, wide format: index is date, columns are stock codes.
    with_corr : int
        When truthy (the default), additionally return a one-row table of the
        mean per-date correlation between `y` and each style factor.

    Returns
    -------
    pd.DataFrame or tuple[pd.DataFrame, pd.DataFrame]
        Per-date OLS residuals in the same wide format as `y`; plus the
        correlation table when `with_corr` is truthy.
    """
    if isinstance(y, pd.DataFrame):
        # rename_axis returns a new object, so the caller's DataFrame is no
        # longer mutated (the old code set y.index.name in place)
        y = pl.from_pandas(y.rename_axis("date").reset_index())
    y = y.unpivot(index="date", variable_name="code").drop_nulls()
    xs = pl.read_parquet(
        homeplace.barra_data_file + "barra_industry_weekly_together.parquet"
    )
    y = y.join(xs, on=["date", "code"])
    cols = y.columns[3:]  # regressors: every joined column after date/code/value
    yresid = (
        y.select(
            "date",
            "code",
            pl.col("value")
            .least_squares.ols(
                *[pl.col(i) for i in cols],
                add_intercept=True,
                mode="residuals",
            )
            .over("date")  # one cross-sectional regression per date
            .alias("resid"),
        )
        .pivot("code", index="date", values="resid")
        .to_pandas()
        .set_index("date")
        .sort_index()
    )
    if with_corr:
        # columns 3:16 are assumed to be the 13 style factors (10 barra + 3
        # derived), the remaining joined columns being industry dummies —
        # TODO confirm against the parquet schema
        colss = y.columns[3:16]
        corr = (
            y[y.columns[:16]]
            .select(*[pl.corr('value', i).over('date').mean().alias(i) for i in colss])
            .to_pandas()
        )
        corr.index = ['相关系数']
        # NOTE(review): DataFrame.applymap is deprecated since pandas 2.1
        # (renamed to DataFrame.map); kept for compatibility with older pandas
        corr = corr.applymap(to_percent)
        return yresid, corr
    else:
        return yresid
31 changes: 11 additions & 20 deletions pure_ocean_breeze/jason/labor/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
select_max,
select_min,
merge_many,
boom_one,
de_cross_special_for_barra_weekly,
)
from pure_ocean_breeze.jason.labor.comment import (
comments_on_twins,
Expand Down Expand Up @@ -261,19 +263,7 @@ def deboth(df: pd.DataFrame) -> pd.DataFrame:
return shen()


@do_on_dfs
def boom_one(
    df: pd.DataFrame, backsee: int = 5, daily: bool = 0, min_periods: int = None
) -> pd.DataFrame:
    """Smooth `df` with a trailing rolling mean over `backsee` rows.

    `min_periods` defaults to half of `backsee` (truncated to int). When
    `daily` is falsy (the default) the smoothed frame is downsampled to
    weekly frequency via resample("W").last(); otherwise the daily rolling
    mean is returned unchanged.

    NOTE(review): duplicate of boom_one in data/tools.py — prefer importing
    that single definition instead of keeping two copies.
    """
    if min_periods is None:
        min_periods = int(backsee * 0.5)
    if not daily:
        df_mean = (
            df.rolling(backsee, min_periods=min_periods).mean().resample("W").last()
        )
    else:
        df_mean = df.rolling(backsee, min_periods=min_periods).mean()
    return df_mean



@do_on_dfs
Expand Down Expand Up @@ -1264,7 +1254,7 @@ def get_total_comments(self, groups_num):
# self.group_mean_rets_monthly = (
# self.group_mean_rets_monthly - self.group_mean_rets_monthly.mean()
# )
mar=self.market_ret.loc[self.factors_out.index]
mar=self.market_ret.reindex(self.factors_out.index)
self.group_mean_rets_monthly = (
self.group_mean_rets_monthly - mar.mean()
)*self.freq_ctrl.counts_one_year
Expand Down Expand Up @@ -2666,11 +2656,12 @@ def sing(dfs: list[pd.DataFrame], date: pd.Timestamp):


@do_on_dfs
def sun(factor:pd.DataFrame,rolling_days:int=10):
def sun(factor:pd.DataFrame,rolling_days:int=10,with_pri:bool=1):
'''先单因子测试,再测试其与常用风格之间的关系'''
ractor=boom_one(factor.rank(axis=1),rolling_days)
factor=boom_one(factor,rolling_days)
shen=pure_moonnight(factor)
pfi=pure_snowtrain(ractor)
shen=pure_moonnight(pfi,neutralize=1)
display(pfi.show_corr())
if with_pri:
factor=boom_one(factor,rolling_days)
shen=pure_moonnight(factor)
pfi=de_cross_special_for_barra_weekly(ractor)
shen=pure_moonnight(pfi[0],neutralize=1)
display(pfi[1])

0 comments on commit 75e69fb

Please sign in to comment.