Skip to content

Commit

Permalink
Update config.py
Browse files Browse the repository at this point in the history
  • Loading branch information
slyfox1186 authored Oct 4, 2024
1 parent 0e805aa commit 059da6d
Showing 1 changed file with 18 additions and 20 deletions.
38 changes: 18 additions & 20 deletions config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# config.py
# config.py

import os

Expand All @@ -7,33 +7,31 @@
class Config:
ALLOWED_EXTENSIONS = {'html'}
# --------------------- Log Setup ----------------------
DATE_FMT = '%m-%d-%Y_%H:%M:%S-%p'
LOG_FILE = 'download_wikipedia.log'
LOG_FORMAT = os.environ.get('LOG_FORMAT', '[%(asctime)s] [%(levelname)s] %(message)s')
LOG_LEVEL = 'INFO' # Set to 'DEBUG' for more detailed logs
LOG_MAX_BYTES = 10 * 1024 * 1024 # 10 MB
LOG_FORMAT = '%(asctime)s ::: %(levelname)s ::: %(message)s'
LOG_LEVEL = 'DEBUG' # Changed to DEBUG to capture detailed logs
LOG_MAX_BYTES = 10 * 1024 * 1024 # 10 MB
# -------------------- Internet Setup--------------------
CHECKSUM = None
CHUNK_SIZE = 1024 * 1024 # 1 MB
CONNECTION_COOLDOWN = 10 # seconds
CHUNK_SIZE = 1048576 # 1 MB
CONNECTION_COOLDOWN = 10 # seconds
INCREASE_FAILURE_LIMIT = 2
INCREASE_WAIT_TIME = 900 # seconds
MAX_PART_SIZE = 10 # Maximum size per file part in megabytes
INCREASE_WAIT_TIME = 900 # seconds
MAX_CONNECTIONS = 3
MAX_RETRIES = 10 # Reduced from 200 to 10
OPTIMAL_CONNECTION_TIMEOUT = 300 # seconds
RETRY_BACKOFF = 2.0
WEB_TIMEOUT = 15 # seconds
MAX_PART_SIZE_MB = 25 # Maximum size per file part in megabytes; NOTE: MAX_PART_SIZE is set to 10 MB -- keep the two values consistent
MAX_RETRIES = 20
MAX_PART_SIZE = 10 * 1024 * 1024 # Maximum size per file part in bytes (10 MB)
OPTIMAL_CONNECTION_TIMEOUT = 300 # seconds
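MAX_BACKOFF = 2.0 # Backoff factor -- NOTE(review): the name suggests a maximum delay rather than a multiplier; confirm the intended meaning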
WEB_TIMEOUT = 30 # seconds
USER_AGENT = os.environ.get(
'USER_AGENT',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0'
)
MAX_BACKOFF = 10 # seconds
# -------------------- Wikipedia Setup-------------------
DOWNLOAD_FOLDER = './database_files'
# DOWNLOAD_URL_TEST = 'http://download.wikimedia.org/enwiki/20240920/enwiki-20240920-pages-articles-multistream-index.txt.bz2'
DOWNLOAD_URL_FULL = 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2' # Large dump for production
DOWNLOAD_URL_TEST = 'https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-stub-articles.xml.gz' # Smaller dump, great for testing

# For a list of possible downloads visit either site:
# https://dumps.wikimedia.org/enwiki/latest/
# https://dumps.wikimedia.org/simplewiki/latest/
# For a list of possible downloads, visit either https://dumps.wikimedia.org/enwiki/latest/ or https://dumps.wikimedia.org/simplewiki/latest/
# LARGE_DOWNLOAD_URL = 'http://download.wikimedia.org/enwiki/20240920/enwiki-20240920-pages-articles-multistream-index.txt.bz2'
LARGE_DOWNLOAD_URL = 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2'
SMALL_DOWNLOAD_URL = 'https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-stub-articles.xml.gz'

0 comments on commit 059da6d

Please sign in to comment.