From 8d30b05622f40355d1e07a3fbc208ef0db21b6bc Mon Sep 17 00:00:00 2001 From: chenzongwei <63836858+chen-001@users.noreply.github.com> Date: Tue, 4 Jul 2023 09:24:33 +0800 Subject: [PATCH] v4.0.3 --- README.md | 2 +- pure_ocean_breeze/__init__.py | 4 +- pure_ocean_breeze/data/read_data.py | 150 +++++++++++---- pure_ocean_breeze/labor/process.py | 172 +++++++++++++----- .../version4.md" | 8 + 5 files changed, 257 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 870a053..8189565 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ### 全新大版本📢 * v4.0.0 — 2023.06.28 -> 因子框架4.0版本来啦!逐笔数据&任意秒级数据来啦! +> 因子框架4.0版本来啦!逐笔数据&任意秒级数据&全新的因子成果数据库来啦! * v3.0.0 — 2022.08.16 diff --git a/pure_ocean_breeze/__init__.py b/pure_ocean_breeze/__init__.py index a7ccc53..c5acedf 100644 --- a/pure_ocean_breeze/__init__.py +++ b/pure_ocean_breeze/__init__.py @@ -2,8 +2,8 @@ 一个量化多因子研究的框架,包含数据、回测、因子加工等方面的功能 """ -__updated__ = "2023-07-03 01:29:04" -__version__ = "4.0.2" +__updated__ = "2023-07-03 20:59:40" +__version__ = "4.0.3" __author__ = "chenzongwei" __author_email__ = "winterwinter999@163.com" __url__ = "https://github.com/chen-001/pure_ocean_breeze" diff --git a/pure_ocean_breeze/data/read_data.py b/pure_ocean_breeze/data/read_data.py index 9d9cf3c..386f762 100644 --- a/pure_ocean_breeze/data/read_data.py +++ b/pure_ocean_breeze/data/read_data.py @@ -1,12 +1,14 @@ -__updated__ = "2023-06-21 14:48:09" +__updated__ = "2023-07-04 09:13:50" import os import numpy as np import pandas as pd import datetime -from typing import Union, Dict, Tuple +import deprecation +from typing import Any, Union, Dict, Tuple from loguru import logger +from pure_ocean_breeze import __version__ from pure_ocean_breeze.state.states import STATES from pure_ocean_breeze.state.homeplace import HomePlace from pure_ocean_breeze.state.decorators import * @@ -172,33 +174,33 @@ def read_daily( elif open: opens = pd.read_parquet( homeplace.daily_data_file + "opens.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) df = opens elif close: closes = pd.read_parquet( homeplace.daily_data_file + "closes.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) df = closes elif high: highs = pd.read_parquet( homeplace.daily_data_file + "highs.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) df = highs elif low: lows = pd.read_parquet( homeplace.daily_data_file + "lows.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) df = lows elif vwap: df = ( pd.read_parquet(homeplace.daily_data_file + "vwaps.parquet") * read_daily(adjfactor=1, start=start) - * read_daily(state=1,start=start) + * read_daily(state=1, start=start) ) elif tr: trs = pd.read_parquet(homeplace.daily_data_file + "trs.parquet").replace( 0, np.nan - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) df = trs elif sharenum: sharenums = pd.read_parquet(homeplace.daily_data_file + "sharenums.parquet") @@ -208,7 +210,7 @@ def read_daily( elif amount: volumes = pd.read_parquet( homeplace.daily_data_file + "amounts.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) df = volumes elif money: df = pd.read_parquet( @@ -220,14 +222,14 @@ def read_daily( elif flow_cap: closes = pd.read_parquet( homeplace.daily_data_file + "closes_unadj.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) sharenums = pd.read_parquet(homeplace.daily_data_file + "sharenums.parquet") 
flow_cap = closes * sharenums df = flow_cap elif total_cap: closes = pd.read_parquet( homeplace.daily_data_file + "closes_unadj.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) sharenums = pd.read_parquet( homeplace.daily_data_file + "total_sharenums.parquet" ) @@ -237,9 +239,9 @@ def read_daily( # df=pd.read_parquet(homeplace.daily_data_file+'adjfactors.parquet') df = ( read_daily(close=1, start=start) - * read_daily(state=1,start=start) + * read_daily(state=1, start=start) / read_daily(close=1, start=start, unadjust=1) - * read_daily(state=1,start=start) + * read_daily(state=1, start=start) ) elif st: st = pd.read_parquet(homeplace.daily_data_file + "sts.parquet") @@ -278,46 +280,46 @@ def read_daily( ) / read_daily(close=1, start=start) elif pb: df = pd.read_parquet(homeplace.daily_data_file + "pb.parquet") * read_daily( - state=1,start=start + state=1, start=start ) elif pe: df = pd.read_parquet(homeplace.daily_data_file + "pe.parquet") * read_daily( - state=1,start=start + state=1, start=start ) elif pettm: - df = pd.read_parquet(homeplace.daily_data_file + "pettm.parquet") * read_daily( - state=1,start=start - ) + df = pd.read_parquet( + homeplace.daily_data_file + "pettm.parquet" + ) * read_daily(state=1, start=start) elif iret: df = pd.read_parquet( homeplace.daily_data_file + "idiosyncratic_ret.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) elif ivol: df = read_daily(iret=1, start=start) df = df.rolling(20, min_periods=10).std() elif illiquidity: df = pd.read_parquet( homeplace.daily_data_file + "illiquidity.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) elif swindustry_ret: df = pd.read_parquet( homeplace.daily_data_file + "股票对应申万一级行业每日收益率.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) elif zxindustry_ret: df = pd.read_parquet( homeplace.daily_data_file + "股票对应中信一级行业每日收益率.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) elif stop_up: df = ( pd.read_parquet(homeplace.daily_data_file + "stop_ups.parquet") * read_daily(adjfactor=1, start=start) - * read_daily(state=1,start=start) + * read_daily(state=1, start=start) ) elif stop_down: df = ( pd.read_parquet(homeplace.daily_data_file + "stop_downs.parquet") * read_daily(adjfactor=1, start=start) - * read_daily(state=1,start=start) + * read_daily(state=1, start=start) ) elif zxindustry_dummy_code: df = pd.read_parquet(homeplace.daily_data_file + "中信一级行业哑变量代码版.parquet") @@ -349,35 +351,35 @@ def read_daily( if open: opens = pd.read_parquet( homeplace.daily_data_file + "opens_unadj.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) df = opens elif close: closes = pd.read_parquet( homeplace.daily_data_file + "closes_unadj.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) df = closes elif high: highs = pd.read_parquet( homeplace.daily_data_file + "highs_unadj.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) df = highs elif low: lows = pd.read_parquet( homeplace.daily_data_file + "lows_unadj.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) df = lows elif vwap: df = pd.read_parquet( homeplace.daily_data_file + "vwaps.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) elif stop_up: df = pd.read_parquet( homeplace.daily_data_file + "stop_ups.parquet" - ) * 
read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) elif stop_down: df = pd.read_parquet( homeplace.daily_data_file + "stop_downs.parquet" - ) * read_daily(state=1,start=start) + ) * read_daily(state=1, start=start) else: raise IOError("阁下总得读点什么吧?🤒") if "date" not in df.columns: @@ -755,6 +757,12 @@ def get_industry_dummies( return ress +@deprecation.deprecated( + deprecated_in="4.0", + removed_in="5.0", + current_version=__version__, + details="由于因子成果数据库升级,3.x版本的因子成果读取函数将下线", +) def database_read_final_factors( name: str = None, order: int = None, @@ -877,6 +885,12 @@ def database_read_final_factors( return df, "" +@deprecation.deprecated( + deprecated_in="4.0", + removed_in="5.0", + current_version=__version__, + details="由于因子成果数据库升级,3.x版本的因子成果读取函数将下线", +) def database_read_primary_factors(name: str, name2: str = None) -> pd.DataFrame: """根据因子名字,读取初级因子的因子值 @@ -900,3 +914,77 @@ def database_read_primary_factors(name: str, name2: str = None) -> pd.DataFrame: df = pd.read_parquet(homeplace.factor_data_file + name) df = df[sorted(list(df.columns))] return df + + +class FactorDone(object): + def __init__( + self, + order: str, + name: str = None, + place: str = None, + son_name: str = None, + freq: str = "月", + ) -> None: + self.homeplace = HomePlace() + self.order = order + self.freq = freq + self.qdb = Questdb() + try: + self.factor_infos = self.qdb.get_data( + f"select * from factor_infos where order='{self.order}' and freq='{self.freq}'" + ) + except Exception: + self.factor_infos = pd.DataFrame() + self.name = name + self.place = place + self.son_name = son_name + if (self.place is None) or (self.name is None): + final_line = self.qdb.get_data( + f"select * from factor_infos where order='{self.order}' and freq='{self.freq}'" + ) + self.name = final_line.name.iloc[0] + self.place = final_line.place.iloc[0] + if son_name is None: + self.file = f"因子{self.order}_{self.name}_{self.freq}_{self.place}.parquet" + self.son_factors = {} + if self.factor_infos.shape[0] > 0: + for row in self.factor_infos.dropna().itertuples(): + self.son_factors[row.son_name] = FactorDone( + order=row.order, + name=row.name, + place=row.place, + son_name=row.son_name, + freq=row.freq, + ) + else: + self.file = f"因子{self.order}_{self.name}_{self.son_name}_{self.freq}_{self.place}.parquet" + + + def __call__(self, son_name: str = None) -> Union[pd.DataFrame, dict]: + if son_name is None: + return pd.read_parquet(self.homeplace.final_factor_file + self.file) + else: + return self.son_factors[son_name]() + + def save_factor(self, factor: pd.DataFrame): + try: + son_info = self.qdb.get_data( + f"select * from factor_infos where order='{self.order}' and son_name='{self.son_name}' and freq='{self.freq}'" + ) + except Exception: + logger.warning(f"本次为第一次写入{self.name}_{self.son_name}因子") + son_info = pd.DataFrame() + if son_info.shape[0] == 0: + self.qdb.write_via_df( + pd.DataFrame( + { + "order": [self.order], + "name": [self.name], + "place": [self.place], + "son_name": [self.son_name], + "freq": [self.freq], + } + ), + "factor_infos", + ) + factor.to_parquet(self.homeplace.final_factor_file + self.file) diff --git a/pure_ocean_breeze/labor/process.py b/pure_ocean_breeze/labor/process.py index 92140ca..2b5ca9f 100644 --- a/pure_ocean_breeze/labor/process.py +++ b/pure_ocean_breeze/labor/process.py @@ -1,4 +1,4 @@ -__updated__ = "2023-07-02 02:33:46" +__updated__ = "2023-07-03 21:16:22" import warnings @@ -46,6 +46,7 @@ read_zxindustry_prices, database_read_final_factors, read_index_single, + 
FactorDone, ) from pure_ocean_breeze.state.homeplace import HomePlace @@ -1001,9 +1002,10 @@ def market_kind( def show_corr( fac1: pd.DataFrame, fac2: pd.DataFrame, - method: str = "spearman", + method: str = "pearson", plt_plot: bool = 1, show_series: bool = 0, + old_way: bool = 0, ) -> float: """展示两个因子的截面相关性 @@ -1014,21 +1016,28 @@ def show_corr( fac2 : pd.DataFrame 因子2 method : str, optional - 计算相关系数的方法, by default "spearman" + 计算相关系数的方法, by default "pearson" plt_plot : bool, optional 是否画出相关系数的时序变化图, by default 1 show_series : bool, optional 返回相关性的序列,而非均值 + old_way : bool, optional + 使用3.x版本的方式求相关系数 Returns ------- `float` 平均截面相关系数 """ - if method == "spearman": - corr = show_x_with_func(fac1, fac2, lambda x: x.rank().corr().iloc[0, 1]) + if old_way: + if method == "spearman": + corr = show_x_with_func(fac1, fac2, lambda x: x.rank().corr().iloc[0, 1]) + else: + corr = show_x_with_func( + fac1, fac2, lambda x: x.corr(method=method).iloc[0, 1] + ) else: - corr = show_x_with_func(fac1, fac2, lambda x: x.corr(method=method).iloc[0, 1]) + corr = fac1.corrwith(fac2, axis=1, method=method) if show_series: return corr else: @@ -1043,7 +1052,7 @@ def show_corrs( factor_names: List[str] = None, print_bool: bool = True, show_percent: bool = True, - method: str = "spearman", + method: str = "pearson", ) -> pd.DataFrame: """展示很多因子两两之间的截面相关性 @@ -1058,7 +1067,7 @@ def show_corrs( show_percent : bool, optional 是否以百分数的形式展示, by default True method : str, optional - 计算相关系数的方法, by default "spearman" + 计算相关系数的方法, by default "pearson" Returns ------- @@ -1223,7 +1232,12 @@ def de_cross( @do_on_dfs def show_corrs_with_old( - df: pd.DataFrame = None, method: str = "spearman", only_new: bool = 1 + df: pd.DataFrame = None, + method: str = "pearson", + only_new: bool = 1, + with_son_factors: bool = 1, + freq: str = "M", + old_database: bool = 0, ) -> pd.DataFrame: """计算新因子和已有因子的相关系数 @@ -1232,9 +1246,16 @@ def show_corrs_with_old( df : pd.DataFrame, optional 新因子, by default None method : str, optional - 求相关系数的方法, by default 'spearman' + 求相关系数的方法, by default 'pearson' only_new : bool, optional 仅计算新因子与旧因子之间的相关系数, by default 1 + with_son_factors : bool, optional + 计算新因子与数据库中各个细分因子的相关系数, by default 1 + freq : str, optional + 读取因子数据的频率, by default 'M' + old_database : bool, optional + 使用3.x版本的数据库, by default 0 + Returns ------- @@ -1242,43 +1263,102 @@ def show_corrs_with_old( 相关系数矩阵 """ if df is not None: - df0 = df.resample("M").last() + df0 = df.resample(freq).last() if df.shape[0] / df0.shape[0] > 2: daily = 1 else: daily = 0 - nums = os.listdir(homeplace.final_factor_file) - nums = sorted( - set( - [ - int(i.split("多因子")[1].split("_月")[0]) - for i in nums - if i.endswith("月.parquet") - ] + if old_database: + nums = os.listdir(homeplace.final_factor_file) + nums = sorted( + set( + [ + int(i.split("多因子")[1].split("_月")[0]) + for i in nums + if i.endswith("月.parquet") + ] + ) ) - ) - olds = [] - for i in nums: - try: + olds = [] + for i in nums: + try: + if daily: + old = database_read_final_factors(order=i)[0] + else: + old = database_read_final_factors(order=i)[0].resample("M").last() + olds.append(old) + except Exception: + break + if df is not None: + if only_new: + corrs = [ + to_percent(show_corr(df, i, plt_plot=0, method=method)) + for i in olds + ] + corrs = pd.Series(corrs, index=[f"old{i}" for i in nums]) + corrs = corrs.to_frame(f"{method}相关系数").T + else: + olds = [df] + olds + corrs = show_corrs( + olds, ["new"] + [f"old{i}" for i in nums], method=method + ) + else: + corrs = show_corrs(olds, [f"old{i}" 
for i in nums], method=method) + else: + qdb = Questdb() + if freq == "M": + factor_infos = qdb.get_data("select * from factor_infos where freq='月'") + else: + factor_infos = qdb.get_data("select * from factor_infos where freq='周'") + if not with_son_factors: + old_orders = list(set(factor_infos.order)) if daily: - old = database_read_final_factors(order=i)[0] + olds = [FactorDone(order=i)() for i in old_orders] else: - old = database_read_final_factors(order=i)[0].resample("M").last() - olds.append(old) - except Exception: - break - if df is not None: - if only_new: - corrs = [ - to_percent(show_corr(df, i, plt_plot=0, method=method)) for i in olds + olds = [FactorDone(order=i)().resample(freq).last() for i in old_orders] + else: + old_orders = [ + i.order + i.son_name.replace("因子", "") + for i in factor_infos.dropna().itertuples() ] - corrs = pd.Series(corrs, index=[f"old{i}" for i in nums]) - corrs = corrs.to_frame(f"{method}相关系数").T + if daily: + olds = [ + FactorDone(order=i.order)(i.son_name) + for i in factor_infos.dropna().itertuples() + ] + else: + olds = [ + FactorDone(order=i.order)(i.son_name).resample(freq).last() + for i in factor_infos.dropna().itertuples() + ] + if df is not None: + if only_new: + corrs = [ + to_percent(show_corr(df, i, plt_plot=0, method=method)) + for i in olds + ] + corrs = pd.Series(corrs, index=old_orders) + corrs = corrs.to_frame(f"{method}相关系数") + if corrs.shape[0] <= 30: + ... + elif corrs.shape[0] <= 60: + corrs = corrs.reset_index() + corrs.columns = ["因子名称", "相关系数"] + corrs1 = corrs.iloc[:30, :] + corrs2 = corrs.iloc[30:, :].reset_index(drop=True) + corrs = pd.concat([corrs1, corrs2], axis=1).fillna('') + elif corrs.shape[0] <= 90: + corrs = corrs.reset_index() + corrs.columns = ["因子名称", "相关系数"] + corrs1 = corrs.iloc[:30, :] + corrs2 = corrs.iloc[30:60, :].reset_index(drop=True) + corrs3 = corrs.iloc[60:90, :].reset_index(drop=True) + corrs = pd.concat([corrs1, corrs2, corrs3], axis=1).fillna('') + else: + olds = [df] + olds + corrs = show_corrs(olds, old_orders, method=method) else: - olds = [df] + olds - corrs = show_corrs(olds, ["new"] + [f"old{i}" for i in nums], method=method) - else: - corrs = show_corrs(olds, [f"old{i}" for i in nums], method=method) + corrs = show_corrs(olds, old_orders, method=method) return corrs @@ -2915,7 +2995,6 @@ def standardlize_in_cross_section(self, df): df = (df - df.mean()) / df.std() df = df.T return df - class pure_fallmount(pure_fall): @@ -3526,28 +3605,31 @@ def get_daily_factors( many_days=many_days, n_jobs=n_jobs, ) - if len(self.factor_new)>0: + if len(self.factor_new) > 0: self.factor_new = pd.concat(self.factor_new) # 拼接新的和旧的 self.factor = pd.concat([self.factor_old, self.factor_new]).sort_index() self.factor = drop_duplicates_index(self.factor.dropna(how="all")) - new_end_date = datetime.datetime.strftime(self.factor.index.max(), "%Y%m%d") + new_end_date = datetime.datetime.strftime( + self.factor.index.max(), "%Y%m%d" + ) # 存入本地 self.factor.to_parquet(self.factor_file) logger.info(f"截止到{new_end_date}的因子值计算完了") # 删除存储在questdb的中途备份数据 try: - self.factor_steps.do_order(f"drop table '{self.factor_file_pinyin}'") + self.factor_steps.do_order( + f"drop table '{self.factor_file_pinyin}'" + ) logger.info("备份在questdb的表格已删除") except Exception: logger.warning("删除questdb中表格时,存在某个未知错误,请当心") else: - logger.warning('由于某种原因,更新的因子值计算失败,建议检查🤒') + logger.warning("由于某种原因,更新的因子值计算失败,建议检查🤒") # 拼接新的和旧的 self.factor = pd.concat([self.factor_old]).sort_index() self.factor = drop_duplicates_index(self.factor.dropna(how="all")) 
- else: self.factor = drop_duplicates_index(self.factor_old) # 存入本地 @@ -6698,7 +6780,7 @@ def get_daily_factors( if self.factor_old is not None: self.factor = pd.concat([self.factor_old, self.factor_new]).sort_index() else: - self.factor=self.factor_new.sort_index() + self.factor = self.factor_new.sort_index() self.factor = drop_duplicates_index(self.factor.dropna(how="all")) new_end_date = datetime.datetime.strftime(self.factor.index.max(), "%Y%m%d") # 存入本地 diff --git "a/\346\233\264\346\226\260\346\227\245\345\277\227/version4.md" "b/\346\233\264\346\226\260\346\227\245\345\277\227/version4.md" index 283431e..0f70942 100644 --- "a/\346\233\264\346\226\260\346\227\245\345\277\227/version4.md" +++ "b/\346\233\264\346\226\260\346\227\245\345\277\227/version4.md" @@ -1,6 +1,14 @@ ## 更新日志🗓 — v4 +* v4.0.3 — 2023.7.4 + +> 1. 新增了全新的因子成果数据库FactorDone,每个最终复合因子,都附带其细分因子 +> 2. 对show_corr函数的默认计算方式进行了提速,并新增old_way参数,用于使用旧版方式计算 +> 3. 将show_corr、show_corrs、show_corrs_with_old函数中相关系数的默认计算方式调整为pearson相关系数 +> 4. 对show_corrs_with_old的内容进行了升级,以支持计算因子与已有因子的细分因子之间的相关系数 + + * v4.0.2 — 2023.7.3 > 1. 暂时取消了对numpy的强制依赖
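
---

The first v4.0.3 changelog item introduces the `FactorDone` factor-result database defined in the `read_data.py` hunk above. Below is a minimal usage sketch, assuming a populated Questdb `factor_infos` table and existing factor parquet files; the order `"001"` and the son-factor name `"反转因子"` are hypothetical placeholders, not values from this patch:

```python
# Sketch of FactorDone as added in this patch: constructing it looks up
# metadata in Questdb's factor_infos table; calling it reads parquet files.
from pure_ocean_breeze.data.read_data import FactorDone

fac = FactorDone(order="001", freq="月")  # name/place are looked up when omitted
composite = fac()             # no argument: read the final composite factor
son = fac("反转因子")          # a son_name: read that sub-factor's DataFrame
fac.save_factor(composite)    # write the parquet and register it in factor_infos
```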
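Changelog items 2 and 3 replace `show_corr`'s per-date loop with a single vectorized `DataFrame.corrwith` call and switch the default correlation method to pearson; the new `old_way=1` flag restores the 3.x loop. A self-contained sketch on synthetic data showing why the two paths agree:

```python
# The new default path in show_corr computes cross-sectional (row-wise)
# correlations in one corrwith call; the old_way=1 path correlates each
# date's cross-section separately. Both yield the same per-date series.
import numpy as np
import pandas as pd

dates = pd.date_range("2023-01-02", periods=4, freq="B")
codes = [f"{i:06d}.SZ" for i in range(50)]
fac1 = pd.DataFrame(np.random.randn(4, 50), index=dates, columns=codes)
fac2 = pd.DataFrame(np.random.randn(4, 50), index=dates, columns=codes)

fast = fac1.corrwith(fac2, axis=1, method="pearson")  # new vectorized path
slow = pd.Series(                                     # old per-date loop
    {d: fac1.loc[d].corr(fac2.loc[d], method="pearson") for d in dates}
)
assert np.allclose(fast.sort_index(), slow.sort_index())
print(fast.mean())  # show_corr returns this mean unless show_series=1
```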
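Changelog item 4 extends `show_corrs_with_old` so a new factor can be correlated against each stored factor's son factors. A hypothetical call pattern, assuming the Questdb backend is reachable; `new_factor` below is random stand-in data, so this only illustrates the signature added in this patch:

```python
import numpy as np
import pandas as pd
from pure_ocean_breeze.labor.process import show_corrs_with_old

# synthetic stand-in for a freshly computed daily date-by-stock factor
dates = pd.date_range("2018-01-01", "2023-06-30", freq="B")
new_factor = pd.DataFrame(
    np.random.randn(len(dates), 100),
    index=dates,
    columns=[f"{i:06d}.SZ" for i in range(100)],
)

# month-end correlations against stored factors and their son factors
corrs = show_corrs_with_old(new_factor, with_son_factors=1, freq="M")

# 3.x behaviour: flat-file database and spearman correlations
corrs_old = show_corrs_with_old(new_factor, method="spearman", old_database=1)
```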