From 315d1ba3826477bcaddbd47e730fcad994edd2ef Mon Sep 17 00:00:00 2001 From: frandier Date: Thu, 10 Nov 2022 18:34:58 -0500 Subject: [PATCH] Save unique tweets --- main.py | 6 ++++-- requirements.txt | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index f7a3269..b39b758 100644 --- a/main.py +++ b/main.py @@ -4,7 +4,7 @@ import numpy as np import spacy from dotenv import load_dotenv -from pymongo import MongoClient +from pymongo import MongoClient, ASCENDING from sklearn.feature_extraction.text import TfidfVectorizer from art import tprint @@ -73,6 +73,7 @@ def new_database(): try: client = MongoClient(os.environ.get("mongo_uri")) db = client.twitter + db.tweets.create_index([('id', ASCENDING)], unique=True) return db except: print("Could not connect to MongoDB") @@ -90,6 +91,7 @@ def get_tweets_from_twitter(client): def save_tweets(db, tweets): for tweet in tweets: tw = { + "id": tweet.id, "text": tweet.text, "user": tweet.user.screen_name, "location": tweet.user.location, @@ -100,7 +102,7 @@ def save_tweets(db, tweets): tw_id = db.tweets.insert_one(tw).inserted_id print("Tweet inserted with id: ", tw_id) except: - print("Could not insert tweet") + print("Could not insert tweet", tweet.id) def clean_and_normalize_tweets(db, clean, nlp): tweets = db.tweets.find() diff --git a/requirements.txt b/requirements.txt index 59cae46..d0e7042 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,51 @@ +ansiwrap==0.8.4 +art==5.7 +blis==0.7.8 +catalogue==2.0.8 certifi==2022.9.24 charset-normalizer==2.1.1 +click==8.1.3 +confection==0.0.3 +cymem==2.0.6 dnspython==2.2.1 +es-core-news-md @ https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.4.0/es_core_news_md-3.4.0-py3-none-any.whl idna==3.4 +Jinja2==3.1.2 +joblib==1.2.0 +langcodes==3.3.0 +MarkupSafe==2.1.1 +murmurhash==1.0.8 +nltk==3.7 +numpy==1.23.3 oauthlib==3.2.1 +packaging==21.3 +pandas==1.5.0 +pathy==0.6.2 +preshed==3.0.7 +pydantic==1.9.2 pymongo==4.2.0 +pyparsing==3.0.9 +python-dateutil==2.8.2 python-dotenv==0.21.0 +pytz==2022.4 +regex==2022.9.13 requests==2.28.1 requests-oauthlib==1.3.1 +scikit-learn==1.1.2 +scipy==1.9.2 +six==1.16.0 +sklearn==0.0 +smart-open==5.2.1 +spacy==3.4.1 +spacy-legacy==3.0.10 +spacy-loggers==1.0.3 +srsly==2.4.4 +textwrap3==0.9.2 +thinc==8.1.3 +threadpoolctl==3.1.0 +tqdm==4.64.1 tweepy==4.10.1 +typer==0.4.2 +typing_extensions==4.4.0 urllib3==1.26.12 +wasabi==0.10.1