From 75e69fb07d1303ee348c5a8fda0f022e9fdd8ef0 Mon Sep 17 00:00:00 2001 From: chenzongwei <17695480342@163.com> Date: Tue, 6 Aug 2024 07:58:39 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BF=BB=E6=96=B0barra=E9=83=A8=E5=88=86?= =?UTF-8?q?=E5=92=8C=E4=B8=AD=E6=80=A7=E5=8C=96=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pure_ocean_breeze/jason/data/read_data.py | 29 ++- pure_ocean_breeze/jason/data/tools.py | 218 +++++++++++++++++++++- pure_ocean_breeze/jason/labor/process.py | 31 ++- 3 files changed, 253 insertions(+), 25 deletions(-) diff --git a/pure_ocean_breeze/jason/data/read_data.py b/pure_ocean_breeze/jason/data/read_data.py index 2a0750d..478045b 100644 --- a/pure_ocean_breeze/jason/data/read_data.py +++ b/pure_ocean_breeze/jason/data/read_data.py @@ -1,4 +1,4 @@ -__updated__ = "2023-07-26 16:42:17" +__updated__ = "2024-08-01 18:57:24" import os import numpy as np @@ -9,6 +9,7 @@ from pure_ocean_breeze.jason.state.states import STATES from pure_ocean_breeze.jason.state.homeplace import HomePlace from pure_ocean_breeze.jason.state.decorators import * +from pure_ocean_breeze.jason.data.tools import boom_one from cachier import cachier try: @@ -440,9 +441,33 @@ def deal_dummy(industry_dummy): def moon_read_barra(): styles = os.listdir(homeplace.barra_data_file) styles = [i for i in styles if (i.endswith(".parquet")) and (i[0] != ".")] + styles=[i for i in styles if 'together' not in i] + rename_dict = { + "size": "市值", + "nonlinearsize": "非线性市值", + "booktoprice": "估值", + "earningsyield": "盈利", + "growth": "成长", + "leverage": "杠杆", + "liquidity": "流动性", + "momentum": "动量", + "residualvolatility": "波动率", + "beta": "贝塔", + } + facs_dict = { + "反转_5天收益率均值": boom_one(read_daily(ret=1)), + "波动_20天收益率标准差": read_daily(ret=1) + .rolling(20, min_periods=10) + .std() + .resample("W") + .last(), + "换手_5天换手率均值": boom_one(read_daily(tr=1)), + } barras = {} for s in styles: k = s.split(".")[0] v = pd.read_parquet(homeplace.barra_data_file + s).resample("W").last() - barras[k] = v + barras[rename_dict[k]] = v + barras.update(facs_dict) + return barras \ No newline at end of file diff --git a/pure_ocean_breeze/jason/data/tools.py b/pure_ocean_breeze/jason/data/tools.py index 460f4fb..2583a1f 100644 --- a/pure_ocean_breeze/jason/data/tools.py +++ b/pure_ocean_breeze/jason/data/tools.py @@ -2,7 +2,7 @@ 针对一些不常见的文件格式,读取数据文件的一些工具函数,以及其他数据工具 """ -__updated__ = "2023-07-10 12:46:10" +__updated__ = "2024-08-01 18:55:29" import os import pandas as pd @@ -16,6 +16,8 @@ import joblib import mpire import statsmodels.formula.api as smf +import polars as pl +import polars_ols as pls from pure_ocean_breeze.jason.state.homeplace import HomePlace from pure_ocean_breeze.jason.state.decorators import do_on_dfs @@ -317,6 +319,20 @@ def standardlize(df: pd.DataFrame, all_pos: bool = 0) -> pd.DataFrame: return df +@do_on_dfs +def boom_one( + df: pd.DataFrame, backsee: int = 5, daily: bool = 0, min_periods: int = None +) -> pd.DataFrame: + if min_periods is None: + min_periods = int(backsee * 0.5) + if not daily: + df_mean = ( + df.rolling(backsee, min_periods=min_periods).mean().resample("W").last() + ) + else: + df_mean = df.rolling(backsee, min_periods=min_periods).mean() + return df_mean + @do_on_dfs def count_value(df: pd.DataFrame, with_zero: bool = 0) -> int: """计算dataframe中总共有多少(非0)非空的值 @@ -1236,7 +1252,7 @@ def count_pos_neg(s: Union[pd.Series, pd.DataFrame]): -def de_cross( +def de_cross_old( y: pd.DataFrame, xs: Union[List[pd.DataFrame], pd.DataFrame] ) -> pd.DataFrame: """使用若干因子对某个因子进行正交化处理 @@ -1270,4 +1286,200 @@ def sing(date:pd.Timestamp): with mpire.WorkerPool(20) as pool: dfs=pool.map(sing,dates) dfs=pd.concat(dfs).reset_index().pivot(index='date',columns='code',values='fac1') - return dfs \ No newline at end of file + return dfs + + +def de_cross( + y: Union[pd.DataFrame, pl.DataFrame], + xs: Union[list[pd.DataFrame], list[pl.DataFrame]], +) -> pd.DataFrame: + """因子正交函数,使用polars库实现 + 速度:10个barra因子、2016-2022、大约11.5秒 + + Parameters + ---------- + y : Union[pd.DataFrame, pl.DataFrame] + 要研究的因子,形式与h5存数据的形式相同,index是时间,columns是股票 + xs : Union[list[pd.DataFrame], list[pl.DataFrame]] + 要被正交掉的干扰因子们,传入一个列表,每个都是h5存储的那种形式的df,index是时间,columns是股票 + + Returns + ------- + pd.DataFrame + 正交后的残差,形式与y相同,index是时间,columns是股票 + """ + if isinstance(y, pd.DataFrame): + y.index.name='date' + y = pl.from_pandas(y.reset_index()) + if isinstance(xs[0], pd.DataFrame): + for i in range(len(xs)): + xs[i].index.name='date' + xs = [pl.from_pandas(x.reset_index()) for x in xs] + y = y.unpivot(index="date", variable_name="code").drop_nulls() + xs = [x.unpivot(index="date", variable_name="code").drop_nulls() for x in xs] + for num, i in enumerate(xs): + y = y.join(i, on=["date", "code"], suffix=f"_{num}") + y = ( + y.select( + "date", + "code", + pl.col("value") + .least_squares.ols( + *[pl.col(f"value_{i}") for i in range(len(xs))], + add_intercept=True, + mode="residuals", + ) + .over("date") + .alias("resid"), + ) + .pivot("code", index="date", values="resid") + .sort('date') + .to_pandas() + .set_index("date") + ) + return y + + +def de_cross_special_for_barra_daily_jason( + y: Union[pd.DataFrame, pl.DataFrame], +) -> pd.DataFrame: + """因子正交函数,但固定了xs为barra数据 + 速度:10个barra因子、2016-2022、大约3.2秒 + + Parameters + ---------- + y : Union[pd.DataFrame, pl.DataFrame] + 要研究的因子,形式与h5存数据的形式相同,index是时间,columns是股票 + + Returns + ------- + pd.DataFrame + 正交后的残差,形式与y相同,index是时间,columns是股票 + """ + if isinstance(y, pd.DataFrame): + y.index.name='date' + y = pl.from_pandas(y.reset_index()) + y = y.unpivot(index="date", variable_name="code").drop_nulls() + xs = pl.read_parquet( + homeplace.barra_data_file+"barra_daily_together.parquet" # 我这个数据缺2020-08-04 和 2020-08-05,给你的版本可能不缺?不过测速用无伤大雅 + ) + y = y.join(xs, on=["date", "code"]) + cols = y.columns[3:] + y = ( + y.select( + "date", + "code", + pl.col("value") + .least_squares.ols( + *[pl.col(i) for i in cols], + add_intercept=True, + mode="residuals", + ) + .over("date") + .alias("resid"), + ) + .pivot("code", index="date", values="resid") + .to_pandas() + .set_index("date") + .sort_index() + ) + return y + + +def de_cross_special_for_barra_daily_jason( + y: Union[pd.DataFrame, pl.DataFrame], +) -> pd.DataFrame: + """因子正交函数,但固定了xs为barra数据 + 速度:10个barra因子、2016-2022、大约3.2秒 + + Parameters + ---------- + y : Union[pd.DataFrame, pl.DataFrame] + 要研究的因子,形式与h5存数据的形式相同,index是时间,columns是股票 + + Returns + ------- + pd.DataFrame + 正交后的残差,形式与y相同,index是时间,columns是股票 + """ + if isinstance(y, pd.DataFrame): + y.index.name='date' + y = pl.from_pandas(y.reset_index()) + y = y.unpivot(index="date", variable_name="code").drop_nulls() + xs = pl.read_parquet( + homeplace.barra_data_file+"barra_daily_together_jason.parquet" # 我这个数据缺2020-08-04 和 2020-08-05,给你的版本可能不缺?不过测速用无伤大雅 + ) + y = y.join(xs, on=["date", "code"]) + cols = y.columns[3:] + y = ( + y.select( + "date", + "code", + pl.col("value") + .least_squares.ols( + *[pl.col(i) for i in cols], + add_intercept=True, + mode="residuals", + ) + .over("date") + .alias("resid"), + ) + .pivot("code", index="date", values="resid") + .to_pandas() + .set_index("date") + .sort_index() + ) + return y + + +def de_cross_special_for_barra_weekly( + y: Union[pd.DataFrame, pl.DataFrame],with_corr:int=1 +) -> pd.DataFrame: + """因子正交函数,但固定了xs为barra数据 + 速度:10个barra因子、2016-2022、大约3.2秒 + + Parameters + ---------- + y : Union[pd.DataFrame, pl.DataFrame] + 要研究的因子,形式与h5存数据的形式相同,index是时间,columns是股票 + + Returns + ------- + pd.DataFrame + 正交后的残差,形式与y相同,index是时间,columns是股票 + """ + if isinstance(y, pd.DataFrame): + y.index.name='date' + y = pl.from_pandas(y.reset_index()) + y = y.unpivot(index="date", variable_name="code").drop_nulls() + xs = pl.read_parquet( + homeplace.barra_data_file+"barra_industry_weekly_together.parquet" # 我这个数据缺2020-08-04 和 2020-08-05,给你的版本可能不缺?不过测速用无伤大雅 + ) + y = y.join(xs, on=["date", "code"]) + cols = y.columns[3:] + yresid = ( + y.select( + "date", + "code", + pl.col("value") + .least_squares.ols( + *[pl.col(i) for i in cols], + add_intercept=True, + mode="residuals", + ) + .over("date") + .alias("resid"), + ) + .pivot("code", index="date", values="resid") + .to_pandas() + .set_index("date") + .sort_index() + ) + if with_corr: + colss=y.columns[3:16] + corr=y[y.columns[:16]].select(*[pl.corr('value',i).over('date').mean().alias(i) for i in colss]).to_pandas() + corr.index=['相关系数'] + corr=corr.applymap(to_percent) + return yresid,corr + else: + return yresid \ No newline at end of file diff --git a/pure_ocean_breeze/jason/labor/process.py b/pure_ocean_breeze/jason/labor/process.py index 5ef59ef..a8cc91a 100644 --- a/pure_ocean_breeze/jason/labor/process.py +++ b/pure_ocean_breeze/jason/labor/process.py @@ -62,6 +62,8 @@ select_max, select_min, merge_many, + boom_one, + de_cross_special_for_barra_weekly, ) from pure_ocean_breeze.jason.labor.comment import ( comments_on_twins, @@ -261,19 +263,7 @@ def deboth(df: pd.DataFrame) -> pd.DataFrame: return shen() -@do_on_dfs -def boom_one( - df: pd.DataFrame, backsee: int = 5, daily: bool = 0, min_periods: int = None -) -> pd.DataFrame: - if min_periods is None: - min_periods = int(backsee * 0.5) - if not daily: - df_mean = ( - df.rolling(backsee, min_periods=min_periods).mean().resample("W").last() - ) - else: - df_mean = df.rolling(backsee, min_periods=min_periods).mean() - return df_mean + @do_on_dfs @@ -1264,7 +1254,7 @@ def get_total_comments(self, groups_num): # self.group_mean_rets_monthly = ( # self.group_mean_rets_monthly - self.group_mean_rets_monthly.mean() # ) - mar=self.market_ret.loc[self.factors_out.index] + mar=self.market_ret.reindex(self.factors_out.index) self.group_mean_rets_monthly = ( self.group_mean_rets_monthly - mar.mean() )*self.freq_ctrl.counts_one_year @@ -2666,11 +2656,12 @@ def sing(dfs: list[pd.DataFrame], date: pd.Timestamp): @do_on_dfs -def sun(factor:pd.DataFrame,rolling_days:int=10): +def sun(factor:pd.DataFrame,rolling_days:int=10,with_pri:bool=1): '''先单因子测试,再测试其与常用风格之间的关系''' ractor=boom_one(factor.rank(axis=1),rolling_days) - factor=boom_one(factor,rolling_days) - shen=pure_moonnight(factor) - pfi=pure_snowtrain(ractor) - shen=pure_moonnight(pfi,neutralize=1) - display(pfi.show_corr()) \ No newline at end of file + if with_pri: + factor=boom_one(factor,rolling_days) + shen=pure_moonnight(factor) + pfi=de_cross_special_for_barra_weekly(ractor) + shen=pure_moonnight(pfi[0],neutralize=1) + display(pfi[1]) \ No newline at end of file