Skip to content

Commit 9a9ef9a

Browse files
authored
Codeforces Problem Scraper Added
1 parent bc2de17 commit 9a9ef9a

File tree

1 file changed

+128
-27
lines changed

1 file changed

+128
-27
lines changed

Coderforces_Problem_Scrapper/Codeforces_problem_scrapper.py

+128-27
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,103 @@
22
from selenium import webdriver # Automated webdriver
33
from PIL import Image
44
from fpdf import FPDF # For converting images to pdf
5+
56
DRIVER_PATH = ''
67

7-
def getproblem():
8+
9+
def select_difficulty():
    """
    Prompt the user for a minimum and maximum problem difficulty rating.

    Re-prompts until both values parse as integers (a stray letter no
    longer crashes the whole script), and swaps the two values if they
    were entered in the wrong order so min <= max.

    :return: [min_rating, max_rating] as a two-element list of ints
    """
    print("\nEnter the Range between 800 to 3500: ")
    while True:
        try:
            low = int(input("Min: "))
            high = int(input("Max: "))
            break
        except ValueError:
            # Previously a non-numeric entry raised ValueError and aborted.
            print("Please enter whole numbers only.")

    # Tolerate reversed input instead of producing an empty rating range.
    if low > high:
        low, high = high, low

    return [low, high]
20+
21+
22+
def extracting_problem_links(diff_level):
    """
    Collect problem URLs from the Codeforces problemset filtered by rating.

    Opens the problemset listing for the requested rating range, discovers
    how many result pages exist, then walks the pages harvesting problem
    links until the requested number of questions has been gathered or the
    pages run out.

    :param diff_level: [min_rating, max_rating] entered by the user
    :return: list of problem URLs (may be shorter than requested when the
             site has fewer matching problems)
    """
    no_of_questions = int(input("\nHow many Questions you want to scrape: "))

    pblms_links = []

    options = webdriver.ChromeOptions()
    options.headless = True
    driver = webdriver.Chrome(DRIVER_PATH, options=options)
    try:
        print("\nRequesting URL ...")
        driver.get(f"https://codeforces.com/problemset/?tags={diff_level[0]}-{diff_level[1]}")

        # ---------------- Getting no. of pages to scrape ----------------
        print("\nFinding available pages to scrape....")
        available_pages = driver.find_elements_by_css_selector("div.pagination a")
        page_links = [anchor.get_attribute("href") for anchor in available_pages]
        print(f"Available Pages to scrape are: {len(page_links[:-1])}")

        # BUG FIX: the original passed `"td.id.dark.left a" and "td.id.left a"`,
        # which in Python evaluates to the SECOND string only, silently
        # skipping rows styled with the `dark` class.  A comma groups both
        # selectors in CSS so every problem row is matched.
        row_selector = "td.id.dark.left a, td.id.left a"

        # Page 1 is already loaded (link None); subsequent pages are
        # navigated to by their URL before scraping.
        page = 1
        for link in [None] + page_links[1:-1]:
            print(f"\nScraping Page {page}")
            if link is not None:
                driver.get(link)  # Going to next Page

            for element in driver.find_elements_by_css_selector(row_selector):
                # Saving the link in pblms_links
                pblms_links.append(element.get_attribute("href"))

                # If we scraped required no. of questions then return early.
                if len(pblms_links) == no_of_questions:
                    print(f"URLs of Question Scraped till now: {len(pblms_links)}")
                    print(f"\nURLs Scrapped Successfully {len(pblms_links)} out of {no_of_questions}")
                    return pblms_links

            print(f"URLs of Question Scraped till now: {len(pblms_links)}")
            page += 1

        # Scraped all the available questions but still the count is less.
        print(f"\n{len(pblms_links)} out of {no_of_questions} URLs able to scrapped !!!")
        return pblms_links
    finally:
        # Release the browser even on early return or error; the original
        # leaked a headless Chrome process per call.
        driver.quit()
99+
100+
101+
def getproblem(URLs):
    """
    Save each Codeforces problem page in *URLs* as a PDF.

    For every URL the page is rendered in a headless Chrome window resized
    to the full document height, the problem statement (the `ttypography`
    element) is screenshotted to a temporary PNG, and that image is wrapped
    into a single-page PDF named after the problem title inside
    ./problems_pdf.

    :param URLs: sequence of Codeforces problem URLs to capture
    """
    path = 'image.png'  # temporary screenshot file, overwritten per problem

    # Creating a Target Output Folder
    target_folder = './problems_pdf'
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    options = webdriver.ChromeOptions()
    # Headless = True for taking a scrolling snapshot
    options.headless = True
    driver = webdriver.Chrome(DRIVER_PATH, options=options)
    try:
        for file_counter, url in enumerate(URLs, start=1):
            driver.get(url)
            # Size the window to the whole document so nothing is cut off.
            required_height = driver.execute_script(
                'return document.body.parentNode.scrollHeight')
            driver.set_window_size(1366, required_height)

            # Title looks like "A. Problem Name"; drop the "A. " prefix.
            # NOTE(review): problems past index Z or titles with characters
            # invalid in filenames are not handled — confirm upstream.
            title = driver.find_element_by_class_name("title").text
            filename = title[3:] + '.pdf'

            # Taking SS of everything within the ttypography class
            driver.find_element_by_class_name('ttypography').screenshot(path)

            # Opening image with Pillow to capture its height and width so
            # the PDF page can be sized to the image plus a small margin.
            cover = Image.open(path)
            WIDTH, HEIGHT = cover.size
            MARGIN = 10
            pdf = FPDF(unit='pt', format=[WIDTH + 2 * MARGIN, HEIGHT + 2 * MARGIN])
            pdf.add_page()  # Adding new page to the pdf
            pdf.image(path, MARGIN, MARGIN)

            # Saving the pdf with the title-derived filename.
            pdf.output(os.path.join(target_folder, filename), "F")
            print(f'File saved in your directory ./problems_pdf/{filename} ({file_counter}/{len(URLs)}) !')
    finally:
        # Always shut Chrome down; the original leaked the driver process.
        driver.quit()
49148

50149

51150
if __name__ == "__main__":
    DRIVER_PATH = input("Enter DRIVER PATH location: ")
    diff = select_difficulty()  # Accepting difficulty level from user
    problems_link = extracting_problem_links(diff)  # scraping the required no. of links
    getproblem(problems_link)  # saving the Questions in PDF file.
    # The temporary screenshot only exists if at least one problem was
    # captured; the original unconditional remove raised FileNotFoundError
    # when nothing was scraped.
    if os.path.exists('image.png'):
        os.remove('image.png')

0 commit comments

Comments
 (0)