Commit v4.0.0
v4.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
chen-001 committed Jun 29, 2023
1 parent 2e5469c commit 04adb7a
Showing 10 changed files with 1,017 additions and 176 deletions.
10 changes: 7 additions & 3 deletions README.md
@@ -1,10 +1,14 @@
# pure_ocean_breeze
#### **Everyone's factor backtesting framework**
#### **Everyone's factor framework**
##### Our slogan: price-volume factors are the best!
***

### Major new versions 📢
* v4.0.0 — 2023.06.28
> Version 4.0 of the factor framework is here! Tick-by-tick data & arbitrary second-level data have arrived!
* v3.0.0 — 2022.08.16

>Version 3.0 of the backtesting framework is here! Module split & documentation have arrived! [pure_ocean_breeze documentation](https://chen-001.github.io/pure_ocean_breeze/)
* v2.0.0 — 2022.07.12
>Version 2.0 of the backtesting framework is here! Database & auto-update & final factor library features are live!
@@ -16,8 +20,8 @@
>* When installing the framework for the first time, please run the initialization to point the paths at your own files
>* Initialize with the following statements
>>```python
>>import pure_ocean_breeze.initialize.initialize
>>pure_ocean_breeze.initialize.initialize.initialize()
>>import pure_ocean_breeze as p
>>p.ini()
>>```
>* Then simply follow the prompts
>* Note: do not write paths with backslashes \; use / instead (a read-back sketch follows this list)
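>* To double-check the result, the saved settings can be read back; a minimal sketch (the file name and the keys shown follow the save_dict written by initialize.py later in this commit):
>>```python
>>import os
>>import pickle
>>
>># p.ini() pickles its settings into the home directory
>>with open(os.path.expanduser("~") + "/paths.settings", "rb") as f:
>>    settings = pickle.load(f)
>>print(settings["update_data_file"], settings["tick_by_tick_data"])
>>```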
13 changes: 9 additions & 4 deletions pure_ocean_breeze/data/database.py
@@ -1,4 +1,4 @@
__updated__ = "2023-05-16 11:40:07"
__updated__ = "2023-06-28 12:29:06"

import pandas as pd
import pymysql
@@ -732,9 +732,14 @@ def show_all_dates(self, table_name: str, mul_100=False) -> list:
        list
            All dates in the table
"""
        df = self.get_data(
            f"select distinct(date) from {self.database_name}.{table_name}"
        ).sort_values("date")
        # second-level tables store a DateTime; collapse it to yyyymmdd integers
        if "second" in table_name:
            df = self.get_data(
                f"select distinct(toYYYYMMDD(date)) from {table_name}"
            ).sort_values("date")
        else:
            df = self.get_data(
                f"select distinct(date) from {table_name}"
            ).sort_values("date")
        if mul_100:
            return [i for i in list(df.date) if i != 0]
        else:
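For intuition, ClickHouse's `toYYYYMMDD` collapses a DateTime to an integer date, so second-level tables come back in the same yyyymmdd form the other tables use. A minimal pandas sketch of the equivalent conversion (illustrative only, not part of this commit):

```python
import pandas as pd

# ClickHouse: toYYYYMMDD(toDateTime('2023-06-28 09:30:01')) == 20230628
ts = pd.to_datetime(["2023-06-28 09:30:01", "2023-06-28 14:59:59"])
dates = sorted({int(t.strftime("%Y%m%d")) for t in ts})  # [20230628]
```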
69 changes: 1 addition & 68 deletions pure_ocean_breeze/data/tools.py
@@ -2,7 +2,7 @@
Utility functions for reading data files in some uncommon formats, plus other data tools
"""

__updated__ = "2023-06-20 22:25:36"
__updated__ = "2023-06-28 16:09:30"

import os
import pandas as pd
@@ -47,73 +47,6 @@ def is_notebook() -> bool:
return False


@do_on_dfs
@deprecation.deprecated(
    deprecated_in="3.0",
    removed_in="4.0",
    current_version=__version__,
    details="Given the diversity of h5 files, one-click reading of h5 files is no longer supported as of version 4.0",
)
def read_h5(path: str) -> Dict:
    """
    Reads a HDF5 file into a dictionary of pandas DataFrames.

    Parameters
    ----------
    path : str
        The path to the HDF5 file.

    Returns
    -------
    `Dict`
        A dictionary of pandas DataFrames.
    """
    res = {}
    import h5py

    a = h5py.File(path)
    for k, v in tqdm.tqdm(list(a.items()), desc="Loading data..."):
        value = list(v.values())[-1]
        try:
            col = [i.decode("utf-8") for i in list(list(v.values())[0])]
        except Exception:
            col = list(list(v.values())[0])
        try:
            ind = [i.decode("utf-8") for i in list(list(v.values())[1])]
        except Exception:
            ind = list(list(v.values())[1])
        res[k] = pd.DataFrame(value, columns=col, index=ind)
    return res


@do_on_dfs
@deprecation.deprecated(
    deprecated_in="3.0",
    removed_in="4.0",
    current_version=__version__,
    details="Given the diversity of h5 files, one-click reading of h5 files is no longer supported as of version 4.0",
)
def read_h5_new(path: str) -> pd.DataFrame:
    """Read an h5 file

    Parameters
    ----------
    path : str
        Path to the h5 file

    Returns
    -------
    `pd.DataFrame`
        The first value of the dictionary that was read
    """
    import h5py

    a = h5py.File(path)
    v = list(a.values())[0]
    v = a[v.name][:]
    return pd.DataFrame(v)


@do_on_dfs
def read_mat(path: str) -> pd.DataFrame:
    """Read a mat file
96 changes: 94 additions & 2 deletions pure_ocean_breeze/data/write_data.py
@@ -1,4 +1,4 @@
__updated__ = "2023-06-21 14:50:52"
__updated__ = "2023-06-27 20:38:39"

import time

@@ -53,6 +53,12 @@
from functools import reduce
from typing import Union, List
import dcube as dc
import py7zr  # extracting .7z archives
import unrar
import zipfile  # extracting .zip archives
import rarfile  # extracting .rar archives
import shutil  # removing extracted csv folders
import chardet  # detecting csv encodings
from tenacity import retry, stop_after_attempt
import questdb.ingress as qdbing
from pure_ocean_breeze.state.homeplace import HomePlace
@@ -89,6 +95,92 @@
from pure_ocean_breeze.labor.process import pure_fama



# To be added later
def database_update_second_data_to_clickhouse():
    ...



def convert_tick_by_tick_data_to_parquet(file_name: str, PATH: str, delete_7z: bool = False):
    try:
        # list this day's per-stock csv files, skipping hidden files
        files = sorted(os.listdir(file_name))
        files = [i for i in files if i[0] != "."]
        files = [file_name + "/" + i for i in files]
        dfs = []
        for i in files:
            # detect each csv's encoding before reading it
            with open(i, "rb") as f:
                tmp = chardet.detect(f.read())
            df = pd.read_csv(i, encoding=tmp["encoding"])
            # prepend the date (the folder name) to the intraday time stamps
            df.Time = file_name.split("/")[-1] + " " + df.Time
            df.Time = pd.to_datetime(df.Time)
            df = df.rename(
                columns={
                    "TranID": "tranid",
                    "Time": "date",
                    "Price": "price",
                    "Volume": "money",
                    "SaleOrderVolume": "salemoney",
                    "BuyOrderVolume": "buymoney",
                    "Type": "action",
                    "SaleOrderID": "saleid",
                    "SaleOrderPrice": "saleprice",
                    "BuyOrderID": "buyid",
                    "BuyOrderPrice": "buyprice",
                }
            )
            # turn the file name (stock code) into an exchange-suffixed code
            df = df.assign(code=add_suffix(i.split("/")[-1].split(".")[0]))
            dfs.append(df)
        dfs = pd.concat(dfs)
        dfs.to_parquet(
            f"{'/'.join(PATH.split('/')[:-2])}/data/{file_name.split('/')[-1]}.parquet"
        )
        # logger.success(f"Tick-by-tick data for {file_name.split('/')[-1]} has been written!")
        shutil.rmtree(file_name + "/", True)
        if delete_7z:
            os.remove(file_name + ".7z")

        # logger.warning(f"The csv version of {file_name.split('/')[-1]}'s tick-by-tick data has been deleted")
    except Exception:
        # some archives unpack into a nested folder of the same name; retry one level down
        file_name = file_name + "/" + file_name.split("/")[-1]
        convert_tick_by_tick_data_to_parquet(file_name, PATH)
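A read-back sketch for the parquet produced above (the path is hypothetical; note the `Volume` columns are renamed to `money`/`salemoney`/`buymoney` by the function):

```python
import pandas as pd

# hypothetical output file: one parquet per trading day
day = pd.read_parquet("/Volumes/My Passport/data/2018-01-02.parquet")
per_stock = day.groupby("code")["money"].sum()  # per-stock daily totals
```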


def convert_tick_by_tick_data_daily(day_path: str, PATH: str):
    try:
        # parquet files already produced on the external drive
        olds = os.listdir("/Volumes/My Passport/data/")
        theday = day_path.split("/")[-1].split(".")[0]
        olds_ok = [i for i in olds if theday in i]
        if len(olds_ok) == 0:
            if os.path.exists(day_path.split(".")[0]):
                ...
                # print(f"{day_path.split('.')[0]} already exists")
            elif day_path.endswith(".7z"):
                archive = py7zr.SevenZipFile(day_path, mode="r")
                archive.extractall(path="/".join(day_path.split("/")[:-1]))
                archive.close()
            elif day_path.endswith(".zip"):
                f = zipfile.ZipFile(day_path, "r")  # path of the archive
                f.extractall("/".join(day_path.split("/")[:-1]))  # extraction destination
                f.close()
            elif day_path.endswith(".rar"):
                f = rarfile.RarFile(day_path, "r")
                f.extractall("/".join(day_path.split("/")[:-1]))
                f.close()
            convert_tick_by_tick_data_to_parquet(day_path.split(".")[0], PATH)
        else:
            print(f"{theday} is already there, skipping")
    except Exception:
        logger.error(f"{day_path} failed, beware!!!")


def convert_tick_by_tick_data_monthly(month_path: str, PATH: str):
    files = os.listdir(month_path)
    files = [i for i in files if i.startswith("20")]
    date = month_path.split("/")[-1]
    files = [month_path + "/" + i for i in files]  # each looks like 2018-01-02.7z
    for i in tqdm.auto.tqdm(files, f"progress for {date}"):
        convert_tick_by_tick_data_daily(i, PATH)
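A usage sketch for the conversion pipeline above (the paths are hypothetical; a month folder is expected to hold one archive per trading day, e.g. 2018-01-02.7z):

```python
# hypothetical month folder holding daily .7z/.zip/.rar archives
PATH = "/Volumes/My Passport/tick/raw/"
convert_tick_by_tick_data_monthly("/Volumes/My Passport/tick/2018-01", PATH)
# each day is extracted, merged into one parquet, and its csv folder removed
```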


def database_update_minute_data_to_clickhouse_and_questdb(
kind: str, web_port: str = "9001"
) -> None:
@@ -556,7 +648,7 @@ def to_mat(df, row, name, ind="date", col="code"):

    # pettm
    partpe = df2s[["date", "code", "pe_ttm"]].pivot(
        index="date", columns="code", values="pe"
        index="date", columns="code", values="pe_ttm"
    )
    partpe_old = pd.read_parquet(homeplace.daily_data_file + "pettm.parquet")
    partpe_new = pd.concat([partpe_old, partpe])
10 changes: 9 additions & 1 deletion pure_ocean_breeze/initialize/initialize.py
@@ -2,7 +2,7 @@
import os


def initialize():
def ini():
    user_file = os.path.expanduser("~") + "/"
    # daily data path
    daily_data_file = input("Please set the storage path for daily data (end with '/'; do not enter backslashes '\\', replace them all with '/'):")
@@ -39,6 +39,13 @@ def initialize():
    final_factor_file = input("Please set the storage path for final factor outputs (end with '/'; do not enter backslashes '\\', replace them all with '/'):")
    if final_factor_file[-1] != "/":
        final_factor_file = final_factor_file + "/"
    # tick-by-tick stock data path
    tick_by_tick_data = input("Please set the storage path for tick-by-tick stock data (end with '/'; do not enter backslashes '\\', replace them all with '/'):")
    while "/" not in tick_by_tick_data:
        print("Please do not enter backslashes '\\'; replace them with '/' and end the path with '/'")
        tick_by_tick_data = input("Please set the storage path for tick-by-tick stock data (end with '/'; do not enter backslashes '\\', replace them all with '/'):")
    if tick_by_tick_data[-1] != "/":
        tick_by_tick_data = tick_by_tick_data + "/"
    # dcube (数立方) token
    api_token = input("Please enter your dcube (数立方) token:")
save_dict = {
@@ -48,6 +55,7 @@ def initialize():
"update_data_file": update_data_file,
"final_factor_file": final_factor_file,
"api_token": api_token,
'tick_by_tick_data': tick_by_tick_data,
}
save_dict_file = open(user_file + "paths.settings", "wb")
pickle.dump(save_dict, save_dict_file)
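As an aside, the `while` guard above only re-prompts when the input contains no '/' at all; a sketch of what it accepts (hypothetical inputs):

```python
# "D:\tick_data"   -> no '/' present: the warning prints and the prompt repeats
# "D:/tick_data"   -> accepted; a trailing '/' is appended automatically
# "D:\data/tick"   -> note: passes the guard despite the backslash
```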