Skip to content

Commit

Permalink
add requirements.txt to replace conda-.yml documentation; remove 2 lines from scraper to ensure compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
raphaelzhou1 committed Nov 10, 2023
1 parent f1a63f1 commit e4b77a2
Show file tree
Hide file tree
Showing 9 changed files with 2,648 additions and 19 deletions.
2 changes: 1 addition & 1 deletion fingpt/FinGPT_RAG/multisource_retrieval/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

## Setup

* Visit environment_news_scraping.yml for the environment setup
* Set up your .env file, can refer to /FinGPT_sentiment/.env.example
* Visit FinGPT_RAG/requirements.txt for the environment setup

``` python

Expand Down
Empty file.
2,487 changes: 2,487 additions & 0 deletions fingpt/FinGPT_RAG/multisource_retrieval/data/sent_valid_scraped.csv

Large diffs are not rendered by default.

15 changes: 6 additions & 9 deletions fingpt/FinGPT_RAG/multisource_retrieval/news_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from scrapers.cnbc import scrape_cnbc
from scrapers.market_screener import scrape_market_screener
from scrapers import url_encode
from scrapers.google.scrape_google import scrape_google

# TODO: Twitter API requests # https://twitter.com/bryan4665/

Expand Down Expand Up @@ -607,24 +608,20 @@ def select_column_and_classify():
choices=column_names)
if not sentence_column:
raise ValueError("Invalid context selected selection")
classification_column = gui.buttonbox("Column Selection",
"Select the column for classification in the CSV:",
choices=column_names)
if not classification_column:
raise ValueError("Invalid context classification column selection")

counter = 0 # Counter variable to track the number of rows processed
row_index_input = gui.enterbox("Enter the row index to classify", "Row Index Input")
row_index_input = gui.enterbox("Enter the row index to classify", "Row Index Input", 1)
if row_index_input is None or not row_index_input.isdigit() or int(row_index_input) >= len(df):
row_index = 1 # Set a default starting index
else:
row_index = int(row_index_input)

print("loaded file as df: ", df)

for row_index, row in itertools.islice(df.iterrows(), row_index, None):
# If role is not empty or N/A or has the same sentence as "contextualized_sentence", means context is added, then skip
if process_existing_file and row["link"] != "N/A" and not pd.isnull(row["link"]) and row[sentence_column] != row["contextualized_sentence"]:
continue

# if process_existing_file and row["link"] != "N/A" and not pd.isnull(row["link"]) and row[sentence_column] != row["contextualized_sentence"]:
# continue
target_sentence = row[sentence_column]
ticker, remaining_sentence, link = split_sentence(target_sentence)

Expand Down
Empty file.
Empty file.
163 changes: 154 additions & 9 deletions fingpt/FinGPT_RAG/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,154 @@
tokenizers>=0.13.3
bitsandbytes
datasets>=2.8.0
sentencepiece>=0.1.97
protobuf==3.20.3
accelerate>=0.15.0
torch>=1.12.0
deepspeed>=0.9.0
git+https://github.com/huggingface/transformers
accelerate==0.23.0
aiohttp==3.8.5
aiosignal==1.3.1
anyio==4.0.0
appnope==0.1.3
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.0
async-lru==2.0.4
async-timeout==4.0.3
attrs==23.1.0
Babel==2.12.1
backcall==0.2.0
beautifulsoup4==4.12.2
bleach==6.0.0
bs4==0.0.1
certifi==2023.7.22
cffi==1.16.0
charset-normalizer==3.3.0
comm==0.1.4
datasets==2.14.5
debugpy==1.8.0
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.7
easygui==0.98.2
executing==2.0.0
fastjsonschema==2.18.1
filelock==3.12.4
fqdn==1.5.1
frozenlist==1.4.0
fsspec==2023.6.0
h11==0.14.0
huggingface-hub==0.16.4
idna==3.4
ipykernel==6.25.2
ipython==8.16.0
ipython-genutils==0.2.0
ipywidgets==8.1.1
isoduration==20.11.0
jedi==0.19.0
Jinja2==3.1.2
joblib==1.3.2
json5==0.9.14
jsonpointer==2.4
jsonschema==4.19.1
jsonschema-specifications==2023.7.1
jupyter==1.0.0
jupyter-console==6.6.3
jupyter-events==0.7.0
jupyter-lsp==2.2.0
jupyter_client==8.3.1
jupyter_core==5.3.2
jupyter_server==2.7.3
jupyter_server_terminals==0.4.4
jupyterlab==4.0.6
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.9
jupyterlab_server==2.25.0
loguru==0.7.2
lxml==4.9.3
MarkupSafe==2.1.3
matplotlib-inline==0.1.6
mistune==3.0.2
mpmath==1.3.0
multidict==6.0.4
multiprocess==0.70.15
nbclient==0.8.0
nbconvert==7.8.0
nbformat==5.9.2
nest-asyncio==1.5.8
networkx==3.1
notebook==7.0.4
notebook_shim==0.2.3
numpy==1.26.0
oauthlib==3.2.2
openai==0.28.1
outcome==1.2.0
overrides==7.4.0
packaging==23.2
pandas==2.1.1
pandocfilters==1.5.0
parso==0.8.3
peft==0.5.0
pexpect==4.8.0
pickleshare==0.7.5
platformdirs==3.10.0
prometheus-client==0.17.1
prompt-toolkit==3.0.39
psutil==5.9.5
ptyprocess==0.7.0
pure-eval==0.2.2
pyarrow==13.0.0
pycparser==2.21
Pygments==2.16.1
PySocks==1.7.1
python-dateutil==2.8.2
python-dotenv==1.0.0
python-json-logger==2.0.7
pytz==2023.3.post1
PyYAML==6.0.1
pyzmq==25.1.1
qtconsole==5.4.4
QtPy==2.4.0
referencing==0.30.2
regex==2023.10.3
requests==2.31.0
requests-oauthlib==1.3.1
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rpds-py==0.10.3
safetensors==0.3.3
scikit-learn==1.3.1
scipy==1.11.3
searchtweets==1.7.6
selenium==4.13.0
Send2Trash==1.8.2
simplejson==3.19.1
six==1.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.5
stack-data==0.6.3
sympy==1.12
terminado==0.17.1
threadpoolctl==3.2.0
tinycss2==1.2.1
tokenizers==0.14.0
torch==2.0.1
tornado==6.3.3
tqdm==4.66.1
traitlets==5.10.1
transformers==4.34.0
trio==0.22.2
trio-websocket==0.11.1
tushare==1.2.89
tweepy==4.14.0
tweet-parser==1.13.2
types-python-dateutil==2.8.19.14
typing_extensions==4.8.0
tzdata==2023.3
uri-template==1.3.0
urllib3==2.0.5
wcwidth==0.2.8
webcolors==1.13
webdriver-manager==4.0.1
webencodings==0.5.1
websocket-client==0.57.0
widgetsnbextension==4.0.9
wsproto==1.2.0
xxhash==3.3.0
yarl==1.9.2
zenrows==1.3.1

0 comments on commit e4b77a2

Please sign in to comment.