From 50238cf865164c9429a008f088ef444fa2636893 Mon Sep 17 00:00:00 2001 From: David Chiu Date: Sun, 23 Apr 2023 15:50:29 +0800 Subject: [PATCH] Add Course 223 --- code/Course_223.ipynb | 478 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 478 insertions(+) create mode 100644 code/Course_223.ipynb diff --git a/code/Course_223.ipynb b/code/Course_223.ipynb new file mode 100644 index 0000000..f315d6b --- /dev/null +++ b/code/Course_223.ipynb @@ -0,0 +1,478 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "## 安裝 langchain 與 LLama_index 等必要套件" + ], + "metadata": { + "id": "w484mALE0pTn" + } + }, + { + "cell_type": "code", + "source": [ + "! pip install langchain" + ], + "metadata": { + "id": "NFJ8WLrQ0tF-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "! pip install llama_index" + ], + "metadata": { + "id": "jQF2HW1X4EQT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "! pip install pypdf" + ], + "metadata": { + "id": "DoOpfRMa4wdL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "nltk.download('punkt')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Crz5qh_S6251", + "outputId": "797e3f81-9b69-4499-dde5-d834bd72c335" + }, + "execution_count": 62, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 62 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## 讀取外資PDF報告 " + ], + "metadata": { + "id": "SLu5VkLR0u55" + } + }, + { + "cell_type": "code", + "source": [ + "! gdown https://drive.google.com/uc?id=1Z_ww5ZASdIq0uZrg8jt_d7E_PzFQdYH-" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dN5KoUeA4L0Z", + "outputId": "949ff416-1409-4af6-fcc6-1c49f558e649" + }, + "execution_count": 53, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=1Z_ww5ZASdIq0uZrg8jt_d7E_PzFQdYH-\n", + "To: /content/TSMC2023Q1.zip\n", + "\r 0% 0.00/10.7M [00:00'\n", + "index = GPTSimpleVectorIndex([])\n", + "index.insert_nodes(nodes)" + ], + "metadata": { + "id": "CiYLW-8v1DOW" + }, + "execution_count": 72, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "nodes[0]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sntYXc7-8gw3", + "outputId": "3ca1ecf0-2edb-49b5-e798-c52be2ec2714" + }, + "execution_count": 71, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Node(text=\"ab 31 March 2023Global Research and Evidence Lab\\nTaiwan Semiconductor Manufacturing\\nQ123 earnings preview\\nSofter Q2 and 2023 outlook generally priced in\\nTSMC's Q123 earnings call is scheduled for 20 April (Thu) after market close.\\n\\nWe expect \\nQ1 earnings to be near the high-end of guidance (sales to fall 12% QoQ in US$ terms vs \\nour previous estimate of a 14% QoQ decline, GM at 55.3% vs 55.0% previously), \\nalthough we estimate Q2 sales to decline 8% QoQ, below the implied guidance of a 6% \\nQoQ decline provided in mid-January.\\n\\nNote that we cut our TSMC estimates in late \\nFebruary to factor in the potential downside of high-end smartphones and CPUs \\ndemand.\\n\\nWe are already more conservative on 2023 sales, but now slightly lift to a 1.5% \\ndecline in US$ terms from a 2.3% decline to reflect a slightly better Q1.\\n\\nHowever, we are \\ncutting capex estimates to US$32/35bn for 2023/24 from US$33/36bn to factor in the \\npotential impact of a more meaningful cyclical correction.\", doc_id='55c5bbab-7229-43ea-b306-fab5694e106c', embedding=None, doc_hash='e1674473d691a05a3e943e06982887d5578d5fcbe6030e759eafdd57344cf400', extra_info={'Document': 'citi'}, node_info=None, relationships={: 'f8879bdc-dc25-4dc3-9519-0aedc3a6ccc4', : '14dd1d82-80fd-408c-bc66-fd94e5659c2c'})" + ] + }, + "metadata": {}, + "execution_count": 71 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## 查詢外資報告資料" + ], + "metadata": { + "id": "fYNpzmXy1EnE" + } + }, + { + "cell_type": "code", + "source": [ + "index.query('#zh-tw 請問citi 對台積電的看法? 請用正體中文')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bdofQhqs1GtZ", + "outputId": "d23bebf0-1ac1-41ee-b8a8-93e78254200d" + }, + "execution_count": 73, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Response(response='\\n台積電的業績會在H2有所改善,但具體的程度仍然不確定,Citi仍然維持買入評級,並將價格目標定為NT$690.00,並將2023年EPS微調為NT$33.6,2024年EPS保持在NT$40.7。', source_nodes=[NodeWithScore(node=Node(text=\"For \\nsmartphones, we believe Apple has recently trimmed legacy N5/N4 app processor \\norders, while MediaTek and Qualcomm may remain cautious ahead of H2.\\n\\nWhat will the earnings call focus on?\\n\\n1) TSMC's expectations for the destocking cycle (which the company forecasted to finish \\nin H123) and the growth trajectory from H223; 2) HPC visibility and growth potential \\nfrom generative AI; 3) a capex update; 4) the profitability outlook, taking account of \\noverseas expansion and its US subsidy; 5) N3 ramp and customer migration.\\n\\nValuation: maintain Buy rating and price target of NT$690.00\\nWe fine-tune 2023E EPS from NT$33.4 to NT$33.6 to reflect the expected slightly better \\nQ1 performance, but maintain 2024E EPS at NT$40.7.\\n\\nAlthough the magnitude of the \\npotential H2 recovery is uncertain, we believe consensus expectations have been \\nlowered.\", doc_id='519d4b59-2463-45ee-bf96-3f5c86a4ba78', embedding=None, doc_hash='cfbe266a8a815a57aee4234c3fa66688159d7df353aae2be6ef038bb1d545b57', extra_info={'Document': 'citi'}, node_info=None, relationships={: 'f8879bdc-dc25-4dc3-9519-0aedc3a6ccc4', : '14dd1d82-80fd-408c-bc66-fd94e5659c2c', : '22bf400f-0473-4de9-84a3-b35ba115b391'}), score=0.7777160902823769)], extra_info={'519d4b59-2463-45ee-bf96-3f5c86a4ba78': {'Document': 'citi'}})" + ] + }, + "metadata": {}, + "execution_count": 73 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(index.query('#zh-tw 請問citi 對台積電的看法? 請用正體中文').response)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ydpfd-1A9TMy", + "outputId": "c40458ba-4968-4fe2-8d64-97163f123ca9" + }, + "execution_count": 74, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "台積電的業績會在H2有所改善,但具體的程度仍然不確定,Citi仍然維持買入評級,並將價格目標定為NT$690.00,並將2023年EPS微調為NT$33.6,2024年EPS保持在NT$40.7。\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## 將所有外資報告鍵入索引中" + ], + "metadata": { + "id": "OSEr2Aex1HSR" + } + }, + { + "cell_type": "code", + "source": [ + "def build_nodes(f):\n", + " loader = PyPDFLoader(f)\n", + " pages = loader.load_and_split()\n", + "\n", + " splitter = NLTKTextSplitter(chunk_size=1000, chunk_overlap=200)\n", + " parser = SimpleNodeParser(text_splitter=splitter)\n", + "\n", + " docs = Document(pages[0].page_content)\n", + " nodes = parser.get_nodes_from_documents([docs])\n", + " for e in nodes:\n", + " e.extra_info= {'Document':f}\n", + " return nodes" + ], + "metadata": { + "id": "M-1WfZkpEqO6" + }, + "execution_count": 92, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import glob\n", + "index = GPTSimpleVectorIndex([])\n", + "for f in glob.glob('TSMC2023Q1/*'):\n", + " nodes = build_nodes(f)\n", + " index.insert_nodes(nodes)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "y0_KF4z7E2K2", + "outputId": "abc30a5a-c862-4768-c05d-3ed4e0e69d51" + }, + "execution_count": 94, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:langchain.text_splitter:Created a chunk of size 1809, which is longer than the specified 1000\n", + "WARNING:langchain.text_splitter:Created a chunk of size 1208, which is longer than the specified 1000\n", + "WARNING:langchain.text_splitter:Created a chunk of size 1040, which is longer than the specified 1000\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## 查詢外資對台積電的看法" + ], + "metadata": { + "id": "mmfUxldC1O6x" + } + }, + { + "cell_type": "code", + "source": [ + "print(index.query('#zh-tw 請問JPM對台積電目標價的看法? 請用正體中文回答').response)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a5lWiBtqDUjc", + "outputId": "4bfea3f6-e1f2-48a7-8928-35d399d9f9fe" + }, + "execution_count": 96, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "JPM對台積電的目標價是NT$650.0。\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## 歡迎訂閱按讚分享開啟小鈴鐺\n", + "\n", + "給資料科學家的 Python 基礎課: \n", + "- https://www.youtube.com/watch?v=uzInb5gbl4M \n", + "\n", + "大數學堂 - 學習資料科學的第一站: \n", + "- https://www.youtube.com/channel/UCSmvtvsTjqkvKLqpmsFWRQw " + ], + "metadata": { + "id": "Gjhkmdh22172" + } + } + ] +} \ No newline at end of file