V0.9.39 更新一批代码 (waditu#181)

* 0.9.39 新增两个 rolling 工具函数 * 0.9.39 加入2024交易日历 * 0.9.39 update * 0.9.39 新增特征计算相关工具函数 * 0.9.39 update
Peter-Don · Dec 17, 2023 · c55a00b · c55a00b
1 parent 8950731
commit c55a00b
Show file tree

Hide file tree

Showing 11 changed files with 233 additions and 10 deletions.
diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -5,7 +5,7 @@ name: Python package
 
 on:
   push:
-    branches: [ master, V0.9.38 ]
+    branches: [ master, V0.9.39 ]
   pull_request:
     branches: [ master ]
 

diff --git a/czsc/__init__.py b/czsc/__init__.py
@@ -111,12 +111,16 @@
     normalize_feature,
     normalize_ts_feature,
     feture_cross_layering,
+    rolling_rank,
+    rolling_norm,
+    rolling_qcut,
+    find_most_similarity,
 )
 
-__version__ = "0.9.38"
+__version__ = "0.9.39"
 __author__ = "zengbin93"
 __email__ = "[email protected]"
-__date__ = "20231126"
+__date__ = "20231212"
 
 
 def welcome():

diff --git a/czsc/connectors/cooperation.py b/czsc/connectors/cooperation.py
@@ -4,6 +4,8 @@
 email: [email protected]
 create_dt: 2023/11/15 20:45
 describe: CZSC开源协作团队内部使用数据接口
+
+接口说明：https://s0cqcxuy3p.feishu.cn/wiki/F3HGw9vDPisWtSkJr1ac5DEcnNh
 """
 import os
 import czsc

diff --git a/czsc/traders/weight_backtest.py b/czsc/traders/weight_backtest.py
@@ -209,7 +209,8 @@ def __init__(self, dfw, digits=2, **kwargs) -> None:
         self.fee_rate = kwargs.get('fee_rate', 0.0002)
         self.dfw['weight'] = self.dfw['weight'].astype('float').round(digits)
         self.symbols = list(self.dfw['symbol'].unique().tolist())
-        self.results = self.backtest(n_jobs=kwargs.get('n_jobs', int(cpu_count() / 2)))
+        default_n_jobs = min(cpu_count() // 2, len(self.symbols))
+        self.results = self.backtest(n_jobs=kwargs.get('n_jobs', default_n_jobs))
 
     def get_symbol_daily(self, symbol):
         """获取某个合约的每日收益率

diff --git a/czsc/utils/calendar.py b/czsc/utils/calendar.py
@@ -13,6 +13,15 @@
 calendar = pd.read_feather(Path(__file__).parent / "china_calendar.feather")
 
 
+def prepare_chain_calendar():
+    import tushare as ts
+    pro = ts.pro_api()
+    df = pro.trade_cal(exchange='', start_date='20100101', end_date='20301231')
+    df['cal_date'] = pd.to_datetime(df['cal_date'])
+    df = df.sort_values('cal_date').reset_index(drop=True)[['cal_date', 'is_open']]
+    df.to_feather(Path(__file__).parent / "china_calendar.feather")
+
+
 def is_trading_date(date=datetime.now()):
     """判断是否是交易日"""
     date = pd.to_datetime(pd.to_datetime(date).date())

diff --git a/czsc/utils/china_calendar.feather b/czsc/utils/china_calendar.feather
diff --git a/czsc/utils/data_client.py b/czsc/utils/data_client.py
@@ -29,6 +29,10 @@ def get_url_token(url):
     if file_token.exists():
         return open(file_token, 'r', encoding='utf-8').read()
     logger.warning(f"请设置 {url} 的访问凭证码，如果没有请联系管理员申请")
+    token = input(f"请输入 {url} 的访问凭证码（token）：")
+    if token:
+        set_url_token(token, url)
+        return token
     return None
 
 

diff --git a/czsc/utils/features.py b/czsc/utils/features.py
@@ -159,3 +159,112 @@ def _layering(x):
     df[f"{x_col}分层"] = df[f"{x_col}分层"].fillna(-1)
     df[f'{x_col}分层'] = df[f'{x_col}分层'].apply(lambda x: f'第{str(int(x+1)).zfill(2)}层')
     return df
+
+
+def rolling_rank(df: pd.DataFrame, col, n=None, new_col=None, **kwargs):
+    """计算序列的滚动排名
+
+    :param df: pd.DataFrame
+        待计算的数据
+    :param col: str
+        待计算的列
+    :param n: int
+        滚动窗口大小, 默认为None, 表示计算 expanding 排名，否则计算 rolling 排名
+    :param new_col: str
+        新列名，默认为 None, 表示使用 f'{col}_rank' 作为新列名
+    :param kwargs:
+        min_periods: int
+            最小计算周期
+    """
+    min_periods = kwargs.get('min_periods', 2)
+    new_col = new_col if new_col else f'{col}_rank'
+    if n is None:
+        df[new_col] = df[col].expanding(min_periods=min_periods).rank()
+    else:
+        df[new_col] = df[col].rolling(window=n, min_periods=min_periods).rank()
+    df[new_col] = df[new_col].fillna(0)
+
+
+def rolling_norm(df: pd.DataFrame, col, n=None, new_col=None, **kwargs):
+    """计算序列的滚动归一化值
+
+    :param df: pd.DataFrame
+        待计算的数据
+    :param col: str
+        待计算的列
+    :param n: int
+        滚动窗口大小, 默认为None, 表示计算 expanding ，否则计算 rolling
+    :param new_col: str
+        新列名，默认为 None, 表示使用 f'{col}_norm' 作为新列名
+    :param kwargs:
+        min_periods: int
+            最小计算周期
+    """
+    min_periods = kwargs.get('min_periods', 2)
+    new_col = new_col if new_col else f'{col}_norm'
+
+    if n is None:
+        df[new_col] = df[col].expanding(min_periods=min_periods).apply(lambda x: (x[-1] - x.mean()) / x.std(), raw=True)
+    else:
+        df[new_col] = df[col].rolling(window=n, min_periods=min_periods).apply(lambda x: (x[-1] - x.mean()) / x.std(), raw=True)
+    df[new_col] = df[new_col].fillna(0)
+
+
+def rolling_qcut(df: pd.DataFrame, col, n=None, new_col=None, **kwargs):
+    """计算序列的滚动分位数
+
+    :param df: pd.DataFrame
+        待计算的数据
+    :param col: str
+        待计算的列
+    :param n: int
+        滚动窗口大小, 默认为None, 表示计算 expanding ，否则计算 rolling
+    :param new_col: str
+        新列名，默认为 None, 表示使用 f'{col}_qcut' 作为新列名
+    :param kwargs:
+
+        - min_periods: int 最小计算周期
+        - q: int 分位数数量
+    """
+    q = kwargs.get('q', 10)
+    min_periods = kwargs.get('min_periods', q)
+    new_col = new_col if new_col else f'{col}_qcut'
+
+    def __qcut_func(x):
+        return pd.qcut(x, q=q, labels=False, duplicates='drop')[-1]
+
+    if n is None:
+        df[new_col] = df[col].expanding(min_periods=min_periods).apply(__qcut_func, raw=True)
+    else:
+        df[new_col] = df[col].rolling(window=n, min_periods=min_periods).apply(__qcut_func, raw=True)
+    df[new_col] = df[new_col].fillna(-1)
+
+
+def find_most_similarity(vector: pd.Series, matrix: pd.DataFrame, n=10, metric='cosine', **kwargs):
+    """寻找向量在矩阵中最相似的n个向量
+
+    :param vector: 1维向量, Series结构
+    :param matrix: 2维矩阵, DataFrame结构, 每一列是一个向量，列名是向量的标记
+    :param n: int, 返回最相似的n个向量
+    :param metric: str, 计算相似度的方法，
+
+        - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
+        'manhattan']. These metrics support sparse matrix
+        inputs.
+        ['nan_euclidean'] but it does not yet support sparse matrices.
+
+        - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
+        'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
+        'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
+        'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
+        See the documentation for scipy.spatial.distance for details on these
+        metrics. These metrics do not support sparse matrix inputs.
+
+    :param kwargs: 其他参数
+    """
+    from sklearn.metrics.pairwise import pairwise_distances
+    metric = kwargs.get('metric', 'cosine')
+    sim = pairwise_distances(vector.values.reshape(1, -1), matrix.T, metric=metric).reshape(-1)
+    sim = pd.Series(sim, index=matrix.columns)
+    sim = sim.sort_values(ascending=False)[:n]
+    return sim
diff --git a/czsc/utils/st_components.py b/czsc/utils/st_components.py
@@ -109,8 +109,9 @@ def show_sectional_ic(df, x_col, y_col, method='pearson', **kwargs):
     col4.dataframe(dfm.style.background_gradient(cmap='RdYlGn_r', axis=None).format('{:.4f}', na_rep='MISS'),
                    use_container_width=True)
 
-    fig = px.histogram(df, x=x_col, marginal="box", title="因子数据分布图")
-    st.plotly_chart(fig, use_container_width=True)
+    if kwargs.get("show_factor_histgram", False):
+        fig = px.histogram(df, x=x_col, marginal="box", title="因子数据分布图")
+        st.plotly_chart(fig, use_container_width=True)
 
 
 def show_factor_returns(df, x_col, y_col):

diff --git a/test/test_calendar.py b/test/test_calendar.py
@@ -3,10 +3,16 @@
 
 
 def test_is_trading_date():
-    assert is_trading_date('2023-09-08') == True
-    assert is_trading_date('2023-09-09') == False
-    assert is_trading_date('2023-09-10') == False
-    assert is_trading_date('2023-09-10 12:00') == False
+    test_cases = [
+        ('2023-09-08', True),
+        ('2023-09-09', False),
+        ('2023-09-10', False),
+        ('2023-09-10 12:00', False),
+        ('2024-04-18', True),
+        ('2024-01-13', False)
+    ]
+    for date, expected_result in test_cases:
+        assert is_trading_date(date) == expected_result
 
 
 def test_next_trading_date():

diff --git a/test/test_utils.py b/test/test_utils.py
@@ -5,6 +5,7 @@
 create_dt: 2022/2/16 20:31
 describe: czsc.utils 单元测试
 """
+import sys
 import pytest
 import pandas as pd
 import numpy as np
@@ -117,3 +118,89 @@ def test_daily_performance():
     result = daily_performance([0.01, 0.02, -0.01, 0.03, 0.02, -0.02, 0.01, -0.01, 0.02, 0.01])
     assert result == {'年化': 2.016, '夏普': 5, '最大回撤': 0.02, '卡玛': 10, '日胜率': 0.7,
                       '年化波动率': 0.2439, '非零覆盖': 1.0, '盈亏平衡点': 0.7, '最大新高时间': 4}
+
+
+def test_find_most_similarity():
+    from czsc.utils.features import find_most_similarity
+
+    # 创建一个向量和一个矩阵
+    vector = pd.Series(np.random.rand(10))
+    matrix = pd.DataFrame(np.random.rand(10, 100))
+
+    # 调用函数
+    result = find_most_similarity(vector, matrix, n=5, metric='cosine')
+
+    # 检查结果的类型
+    assert isinstance(result, pd.Series)
+
+    # 检查结果的长度im
+    assert len(result) == 5
+
+    # 检查结果的索引
+    assert all(isinstance(index, int) for index in result.index)
+
+    # 检查结果的值
+    assert all(0 <= value <= 1 for value in result.values)
+
+
+def test_rolling_qcut():
+    from czsc.utils.features import rolling_qcut
+
+    # 创建一个DataFrame
+    df = pd.DataFrame({
+        'col1': np.random.rand(100),
+    })
+
+    # 调用函数
+    rolling_qcut(df, 'col1', n=10, new_col='col1_qcut', q=5, min_periods=5)
+
+    # 检查新列是否已添加到df
+    assert 'col1_qcut' in df.columns
+
+    # 检查新列的长度
+    assert len(df['col1_qcut']) == len(df['col1'])
+
+    # 检查新列的值
+    assert all(-1 <= value < 5 for value in df['col1_qcut'].dropna())
+
+
+def test_rolling_norm():
+    from czsc.utils.features import rolling_norm
+
+    df = pd.DataFrame({
+        'col1': np.random.rand(100),
+    })
+
+    # 调用函数
+    rolling_norm(df, 'col1', n=10, new_col='col1_norm')
+
+    # 检查新列是否已添加到df
+    assert 'col1_norm' in df.columns
+
+    # 检查新列的长度
+    assert len(df['col1_norm']) == len(df['col1'])
+
+    # 检查新列的值
+    assert all(-3 <= value <= 3 for value in df['col1_norm'].dropna())
+
+
+@pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python3.8 or higher")
+def test_rolling_rank():
+    from czsc.utils.features import rolling_rank
+
+    # 创建一个DataFrame
+    df = pd.DataFrame({
+        'col1': np.random.rand(100),
+    })
+
+    # 调用函数
+    rolling_rank(df, 'col1', n=10, new_col='col1_rank')
+
+    # 检查新列是否已添加到df
+    assert 'col1_rank' in df.columns
+
+    # 检查新列的长度
+    assert len(df['col1_rank']) == len(df['col1'])
+
+    # 检查新列的值
+    assert all(0 <= value <= 100 for value in df['col1_rank'].dropna())