Add function to group by hot words

dongjiashun · Aug 1, 2023 · 03aee86 · 03aee86
1 parent 2a3139b
commit 03aee86
Show file tree

Hide file tree

Showing 10 changed files with 273 additions and 57 deletions.
diff --git a/examples/data_runner/kdata_runner.py b/examples/data_runner/kdata_runner.py
@@ -6,6 +6,7 @@
 from examples.recorder_utils import run_data_recorder
 from examples.report_utils import inform
 from zvt import init_log
+from zvt.api.selector import get_entity_ids_by_filter
 from zvt.domain import (
     Stock,
     Stock1dHfqKdata,
@@ -16,6 +17,7 @@
     BlockCategory,
     Index,
     Index1dKdata,
+    StockNews,
 )
 from zvt.informer import EmailInformer
 from zvt.utils import next_date, current_date
@@ -25,6 +27,22 @@
 sched = BackgroundScheduler()
 
 
+@sched.scheduled_job("cron", hour=16, minute=30, day_of_week="mon-fri")
+def record_stock_news(data_provider="em"):
+    normal_stock_ids = get_entity_ids_by_filter(
+        provider="em", ignore_delist=True, ignore_st=False, ignore_new_stock=False
+    )
+
+    run_data_recorder(
+        entity_ids=normal_stock_ids,
+        day_data=True,
+        domain=StockNews,
+        data_provider=data_provider,
+        force_update=False,
+        sleeping_time=2,
+    )
+
+
 @sched.scheduled_job("cron", hour=15, minute=30, day_of_week="mon-fri")
 def record_stock_data(data_provider="em", entity_provider="em", sleeping_time=2):
     # A股指数
@@ -38,17 +56,6 @@ def record_stock_data(data_provider="em", entity_provider="em", sleeping_time=2)
         sleeping_time=sleeping_time,
     )
 
-    # A股标的
-    run_data_recorder(domain=Stock, data_provider=data_provider, force_update=False)
-    # A股后复权行情
-    run_data_recorder(
-        domain=Stock1dHfqKdata,
-        data_provider=data_provider,
-        entity_provider=entity_provider,
-        day_data=True,
-        sleeping_time=sleeping_time,
-    )
-
     # 板块(概念，行业)
     run_data_recorder(domain=Block, entity_provider=entity_provider, data_provider=entity_provider, force_update=False)
     # 板块行情(概念，行业)
@@ -76,8 +83,24 @@ def record_stock_data(data_provider="em", entity_provider="em", sleeping_time=2)
         title="report 新概念",
         entity_provider=entity_provider,
         entity_type="block",
-        em_group="关注板块",
-        em_group_over_write=True,
+        em_group="练气",
+        em_group_over_write=False,
+    )
+
+    # A股标的
+    run_data_recorder(domain=Stock, data_provider=data_provider, force_update=False)
+    # A股后复权行情
+    normal_stock_ids = get_entity_ids_by_filter(
+        provider="em", ignore_delist=True, ignore_st=False, ignore_new_stock=False
+    )
+
+    run_data_recorder(
+        entity_ids=normal_stock_ids,
+        domain=Stock1dHfqKdata,
+        data_provider=data_provider,
+        entity_provider=entity_provider,
+        day_data=True,
+        sleeping_time=sleeping_time,
     )
 
 

diff --git a/examples/hot.json b/examples/hot.json
@@ -0,0 +1,44 @@
+{
+  "新能源": [
+    "新能源",
+    "锂电 锂电池",
+    "钠离子电池",
+    "光伏",
+    "太阳能",
+    "储能",
+    "TOPCON电池",
+    "风电",
+    "核电"
+  ],
+  "新能车": [
+    "新能车 新能源汽车",
+    "整车 汽车整车",
+    "汽车零部件 汽车零件",
+    "无人驾驶",
+    "压铸一体化 一体化压铸"
+  ],
+  "人工智能": [
+    "人工智能 AI",
+    "GPT CHATGPT",
+    "算力"
+  ],
+  "机器人": [
+    "机器人",
+    "减速器",
+    "伺服 伺服系统",
+    "控制系统",
+    "电机"
+  ],
+  "核心资产": [
+    "核心资产",
+    "白马",
+    "沪深300",
+    "基金重仓",
+    "上证50"
+  ],
+  "人民币国际化": [
+    "人民币国际化",
+    "一带一路",
+    "跨境支付"
+  ]
+}
diff --git a/examples/reports/report_tops.py b/examples/reports/report_tops.py
@@ -23,11 +23,11 @@ def report_top_stocks():
         entity_type="stock",
         entity_provider="em",
         data_provider="em",
-        periods=[*range(2, 21)],
+        periods=[*range(2, 27)],
         ignore_new_stock=True,
         ignore_st=True,
         adjust_type=None,
-        top_count=20,
+        top_count=25,
         turnover_threshold=0,
         turnover_rate_threshold=0,
         informer=email_informer,
@@ -40,7 +40,7 @@ def report_top_stocks():
         entity_type="stock",
         entity_provider="em",
         data_provider="em",
-        periods=[*range(21, 60)],
+        periods=[*range(27, 67)],
         ignore_new_stock=True,
         ignore_st=True,
         adjust_type=None,
@@ -123,7 +123,7 @@ def report_top_stockhks():
         entity_provider="em",
         data_provider="em",
         top_count=10,
-        periods=[*range(2, 10)],
+        periods=[*range(2, 27)],
         ignore_new_stock=False,
         ignore_st=False,
         adjust_type=None,
@@ -152,30 +152,30 @@ def report_top_stockhks():
         return_type=TopType.positive,
     )
 
-    report_top_entities(
-        entity_type="stockhk",
-        entity_provider="em",
-        data_provider="em",
-        top_count=20,
-        periods=[365, 750],
-        ignore_new_stock=True,
-        ignore_st=False,
-        adjust_type=None,
-        turnover_threshold=50000000,
-        turnover_rate_threshold=0.005,
-        informer=email_informer,
-        em_group="谁有我惨",
-        em_group_over_write=False,
-        return_type=TopType.negative,
-    )
+    # report_top_entities(
+    #     entity_type="stockhk",
+    #     entity_provider="em",
+    #     data_provider="em",
+    #     top_count=20,
+    #     periods=[365, 750],
+    #     ignore_new_stock=True,
+    #     ignore_st=False,
+    #     adjust_type=None,
+    #     turnover_threshold=50000000,
+    #     turnover_rate_threshold=0.005,
+    #     informer=email_informer,
+    #     em_group="谁有我惨",
+    #     em_group_over_write=False,
+    #     return_type=TopType.negative,
+    # )
 
 
 if __name__ == "__main__":
     init_log("report_tops.log")
 
     report_top_stocks()
     report_top_blocks()
-    # report_top_stockhks()
+    report_top_stockhks()
 
     sched.start()
 

diff --git a/examples/reports/report_vol_up.py b/examples/reports/report_vol_up.py
@@ -19,7 +19,7 @@
 
 
 @sched.scheduled_job("cron", hour=17, minute=0, day_of_week="mon-fri")
-def report_vol_up():
+def report_vol_up_stocks():
     target_date = get_latest_kdata_date(entity_type="stock", adjust_type=AdjustType.hfq, provider="em")
     entity_ids = get_mini_and_small_stock(timestamp=target_date, provider="em")
 
@@ -34,7 +34,7 @@ def report_vol_up():
         em_group_over_write=True,
         filter_by_volume=False,
         adjust_type=AdjustType.hfq,
-        start_timestamp="2019-01-01",
+        start_timestamp="2021-01-01",
         # factor args
         windows=[120, 250],
         over_mode="or",
@@ -56,7 +56,7 @@ def report_vol_up():
         em_group_over_write=False,
         filter_by_volume=False,
         adjust_type=AdjustType.hfq,
-        start_timestamp="2019-01-01",
+        start_timestamp="2021-01-01",
         # factor args
         windows=[120, 250],
         over_mode="or",
@@ -66,6 +66,9 @@ def report_vol_up():
         entity_ids=entity_ids,
     )
 
+
+@sched.scheduled_job("cron", hour=17, minute=30, day_of_week="mon-fri")
+def report_vol_up_stockhks():
     report_targets(
         factor_cls=VolumeUpMaFactor,
         entity_provider="em",
@@ -77,11 +80,11 @@ def report_vol_up():
         em_group_over_write=False,
         filter_by_volume=False,
         adjust_type=AdjustType.hfq,
-        start_timestamp="2019-01-01",
+        start_timestamp="2021-01-01",
         # factor args
         windows=[120, 250],
         over_mode="or",
-        up_intervals=20,
+        up_intervals=60,
         turnover_threshold=100000000,
         turnover_rate_threshold=0.01,
     )
@@ -90,8 +93,8 @@ def report_vol_up():
 if __name__ == "__main__":
     init_log("report_vol_up.log")
 
-    report_vol_up()
-
+    report_vol_up_stocks()
+    report_vol_up_stockhks()
     sched.start()
 
     sched._thread.join()
diff --git a/examples/utils.py b/examples/utils.py
@@ -1,7 +1,16 @@
 # -*- coding: utf-8 -*-
+import json
 import logging
+import os
+import pprint
 
 import eastmoneypy
+import pandas as pd
+
+from zvt.api.stats import get_top_performance_entities_by_periods
+from zvt.contract.api import get_entities
+from zvt.domain import StockNews
+from zvt.utils import next_date, today
 
 logger = logging.getLogger(__name__)
 
@@ -19,3 +28,99 @@ def add_to_eastmoney(codes, group, entity_type="stock", over_write=True):
 
     for code in codes:
         eastmoneypy.add_to_group(code=code, entity_type=entity_type, group_name=group)
+
+
+def get_hot_words_config():
+    with open(os.path.join(os.path.dirname(__file__), "hot.json")) as f:
+        return json.load(f)
+
+
+def count_hot_words(text: str):
+    text = text.upper()
+    hot_words_config = get_hot_words_config()
+    word_stats = {}
+    topic_stats = {}
+    for topic in hot_words_config:
+        topic_count = 0
+        for word in hot_words_config[topic]:
+            word_stats[word] = text.count(word)
+            topic_count = topic_count + word_stats[word]
+        topic_stats[topic] = topic_count
+    return topic_stats, word_stats
+
+
+def hot_stats(data: pd.Series):
+    pass
+
+
+def group_stocks_by_topic(entities, start_timestamp=None):
+    if not start_timestamp:
+        start_timestamp = next_date(today(), -180)
+    stock_map = {}
+    for entity in entities:
+        stock_map[entity.entity_id] = {"code": entity.code, "name": entity.name}
+    df = StockNews.query_data(start_timestamp=start_timestamp, entity_ids=[entity.entity_id for entity in entities])
+    df = df.groupby("entity_id")["news_title"].apply(",".join).reset_index()
+
+    hot_words_config = get_hot_words_config()
+
+    hot_stocks_map = {}
+    topic_count = {}
+    word_count = {}
+    for _, row in df[["entity_id", "news_title"]].iterrows():
+        entity_id = row["entity_id"]
+        text = row["news_title"]
+
+        is_hot = False
+        for topic in hot_words_config:
+            topic_count.setdefault(topic, 0)
+            for words in hot_words_config[topic]:
+                hot_stocks_map.setdefault(words, [])
+                word_count.setdefault(words, 0)
+                for word in words.split():
+                    count = text.count(word)
+                    if count > 0:
+                        word_count[words] = word_count[words] + 1
+                        topic_count[topic] = topic_count[topic] + 1
+                        hot_stocks_map[words].append(
+                            (f"{stock_map[entity_id]['code']}({stock_map[entity_id]['name']})", count)
+                        )
+                        is_hot = True
+        if not is_hot:
+            hot_stocks_map.setdefault("其他", [])
+            hot_stocks_map["其他"].append((f"{stock_map[entity_id]['code']}({stock_map[entity_id]['name']})", 0))
+
+    sorted_topics = sorted(topic_count.items(), key=lambda item: item[1], reverse=True)
+    sorted_words = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
+
+    result = []
+    for topic, count in sorted_topics:
+        topic_words = hot_words_config[topic]
+        topic_words_stocks = [
+            (f"{words}({count})", sorted(hot_stocks_map[words], key=lambda item: item[1], reverse=True))
+            for (words, count) in sorted_words
+            if words in topic_words
+        ]
+        result.append((f"{topic}({count})", topic_words_stocks))
+
+    result.append(("其他", [("其他", hot_stocks_map["其他"])]))
+
+    return result
+
+
+if __name__ == "__main__":
+    ids = get_top_performance_entities_by_periods(entity_provider="em", data_provider="em")
+
+    entities = get_entities(provider="em", entity_type="stock", entity_ids=ids, return_type="domain")
+
+    group_info = group_stocks_by_topic(entities=entities)
+    info = ""
+    for group in group_info:
+        topic = group[0]
+        info = info + f"^^^^^^ {topic} ^^^^^^\n"
+        for topic_word, stocks_count in group[1]:
+            info = info + f"{topic_word}\n"
+            stocks = [f"{stock_count[0]} {stock_count[1]}" for stock_count in stocks_count]
+            info = info + "\n".join(stocks) + "\n"
+
+    print(info)