from selenium import webdriver  # Automated webdriver
from PIL import Image
from fpdf import FPDF  # For converting images to PDF
+
DRIVER_PATH = ''

- def getproblem():
+
+ def select_difficulty():
+     """
+     Lets the user choose the difficulty range.
+     :return: difficulty_level, a [min, max] list
+     """
+     difficulty_level = []
+     print("\nEnter a range between 800 and 3500:")
+     difficulty_level.append(int(input("Min: ")))
+     difficulty_level.append(int(input("Max: ")))
+
+     return difficulty_level
+
+
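select_difficulty accepts whatever bounds are typed. A minimal hardening sketch that re-prompts until the values fit the 800-3500 rating scale named in the prompt (select_difficulty_validated is a hypothetical helper, not part of this commit):

def select_difficulty_validated():
    while True:
        lo = int(input("Min: "))
        hi = int(input("Max: "))
        if 800 <= lo <= hi <= 3500:
            return [lo, hi]
        print("Please enter values with 800 <= Min <= Max <= 3500.")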
+ def extracting_problem_links(diff_level):
+     """
+     First saves the links of the result pages to scrape from, then the link
+     of every question, collecting them in a list.
+     :param diff_level: difficulty range entered by the user
+     :return pblms_links: links of all the available questions to scrape
+     """
+     no_of_questions = int(input("\nHow many questions do you want to scrape: "))
+
+     pblms_link_scraped = 0
+     pblms_links = []
+     page = 1
+     options = webdriver.ChromeOptions()
+     options.headless = True
+     driver = webdriver.Chrome(DRIVER_PATH, options=options)
+     print("\nRequesting URL ...")
+     driver.get(f"https://codeforces.com/problemset/?tags={diff_level[0]}-{diff_level[1]}")
+
+     # =================== Getting no. of pages to scrape =============================
+
+     # Collect the pagination links so we know how many result pages exist
+     page_links = []
+
+     print("\nFinding available pages to scrape....")
+
+     available_pages = driver.find_elements_by_css_selector("div.pagination a")
+     for page_no in available_pages:
+         page_links.append(page_no.get_attribute("href"))
+
+     # The last pagination anchor is the next-page arrow, so it is not counted
+     print(f"Available pages to scrape: {len(page_links[:-1])}")
+
+     # ===================================================================================
+
+     # ***************************** SCRAPING PAGE 1 *************************************
+     print(f"\nScraping page {page}")
+
+     # A comma-separated CSS selector list matches both the dark and light table rows
+     elements = driver.find_elements_by_css_selector("td.id.dark.left a, td.id.left a")
+     for element in elements:
+         # Saving the link in pblms_links
+         pblms_links.append(element.get_attribute("href"))
+         pblms_link_scraped += 1
+
+         # If we have scraped the required no. of questions, return early
+         if pblms_link_scraped == no_of_questions:
+             print(f"URLs of questions scraped till now: {pblms_link_scraped}")
+             print(f"\nURLs scraped successfully: {pblms_link_scraped} out of {no_of_questions}")
+             return pblms_links
+     page += 1
+     print(f"URLs of questions scraped till now: {pblms_link_scraped}")
+     # *************************************************************************************
+
+     # ----------------------------- SCRAPING SUBSEQUENT PAGES -----------------------------
+     # Page 1 is already scraped, so skip the first pagination link; the slice
+     # also drops the trailing next-page arrow
+     for link in page_links[1:-1]:
+         print(f"\nScraping page {page}")
+
+         # Going to the next page
+         driver.get(link)
+         elements = driver.find_elements_by_css_selector("td.id.dark.left a, td.id.left a")
+         for element in elements:
+             # Saving the link in pblms_links
+             pblms_links.append(element.get_attribute("href"))
+             pblms_link_scraped += 1
+
+             # If we have scraped the required no. of questions, return early
+             if pblms_link_scraped == no_of_questions:
+                 print(f"URLs of questions scraped till now: {pblms_link_scraped}")
+                 print(f"\nURLs scraped successfully: {pblms_link_scraped} out of {no_of_questions}")
+                 return pblms_links
+
+         print(f"URLs of questions scraped till now: {pblms_link_scraped}")
+         page += 1
+     # ----------------------------------------------------------------------------------------------
+
+     # All available questions were scraped, but the count is still short of the request
+     print(f"\n{pblms_link_scraped} out of {no_of_questions} URLs could be scraped !!!")
+     return pblms_links
+
+
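The lookups above rely on Selenium 3's find_elements_by_* helpers and on options.headless, both of which were removed in Selenium 4. A rough Selenium 4 equivalent of the link scraping, sketched here for reference (the tags value is an example; Selenium Manager locates the driver, so no DRIVER_PATH is needed):

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
driver.get("https://codeforces.com/problemset/?tags=800-1200")
# The same comma-separated selector list, via the By API
links = [a.get_attribute("href")
         for a in driver.find_elements(By.CSS_SELECTOR, "td.id.dark.left a, td.id.left a")]
driver.quit()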
+ def getproblem(URLs):
    """
    getproblem(): takes the list of scraped problem URLs and, using Selenium and
    the Chrome webdriver, captures a screenshot of the
@@ -13,42 +107,49 @@ def getproblem():
    Then saves image.png as a PDF file by using the fpdf library.
    """
-     # Taking input from the user to search for the problem
-     Pblm_id = input("Enter the Problem ID: ")
-     difficulty = input("Enter the difficulty level: ")
-     filename = input('Enter the file name to store Question: ') + '.pdf'
-
-     # Going to the specific URL
-     url = "https://codeforces.com/problemset/problem/" + Pblm_id + "/" + difficulty
    path = 'image.png'
-     options = webdriver.ChromeOptions()

+     # Creating the target output folder if it does not exist yet
+     target_folder = './problems_pdf'
+     if not os.path.exists(target_folder):
+         os.makedirs(target_folder)
+
+     options = webdriver.ChromeOptions()
    # Headless = True for taking a scrolling snapshot
    options.headless = True
    driver = webdriver.Chrome(DRIVER_PATH, options=options)
-     driver.get(url)
-     # Deciding height by tag
-     required_height = driver.execute_script(
-         'return document.body.parentNode.scrollHeight')
-     driver.set_window_size(1366, required_height)
+     file_counter = 1
+
+     for url in URLs:
+         driver.get(url)
+         # Resize the window to the full document height so nothing is cut off
+         required_height = driver.execute_script(
+             'return document.body.parentNode.scrollHeight')
+         driver.set_window_size(1366, required_height)
+
+         title = driver.find_element_by_class_name("title").text
+         # Drop the leading problem index (e.g. "A. ") to build the file name
+         filename = title[3:] + '.pdf'

-     # Taking SS of everything within the ttypography class
-     driver.find_element_by_class_name('ttypography').screenshot(path)
+         # Taking a screenshot of everything within the ttypography class
+         driver.find_element_by_class_name('ttypography').screenshot(path)

-     # Opening image with pillow so based to capture its height and width
-     cover = Image.open(path)
-     WIDTH, HEIGHT = cover.size
-     MARGIN = 10
-     # based on image's height and width we are adjusting the pdf margin and borders
-     pdf = FPDF(unit='pt', format=[WIDTH + 2 * MARGIN, HEIGHT + 2 * MARGIN])
-     pdf.add_page()  # Adding new page to the pdf
-     pdf.image(path, MARGIN, MARGIN)
-     pdf.output(filename, "F")  # saving the pdf with the specified filename
+         # Opening the image with Pillow to read its height and width
+         cover = Image.open(path)
+         WIDTH, HEIGHT = cover.size
+         MARGIN = 10
+         # The PDF page size is the image size plus a margin on every side
+         pdf = FPDF(unit='pt', format=[WIDTH + 2 * MARGIN, HEIGHT + 2 * MARGIN])
+         pdf.add_page()  # Adding a new page to the pdf
+         pdf.image(path, MARGIN, MARGIN)

-     print(f'\nGreat Success!!! Check your directory for {filename} file!')
+         pdf.output(os.path.join(target_folder, filename), "F")  # saving the pdf with the problem title as filename
+         print(f'File saved in your directory ./problems_pdf/{filename} ({file_counter}/{len(URLs)})!')
+         file_counter += 1
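Because the FPDF object is created with unit='pt' and the image is placed without explicit width or height, fpdf 1.x renders one screenshot pixel per point, so the page wraps the image exactly plus the margin. A standalone sketch of that step under the same fpdf 1.x API (image_to_pdf is a hypothetical helper, not part of this commit):

from PIL import Image
from fpdf import FPDF

def image_to_pdf(image_path, pdf_path, margin=10):
    # Page size in points: with unit='pt', one image pixel maps to one point
    width, height = Image.open(image_path).size
    pdf = FPDF(unit='pt', format=[width + 2 * margin, height + 2 * margin])
    pdf.add_page()
    pdf.image(image_path, margin, margin)  # placed at the top-left margin offset
    pdf.output(pdf_path, 'F')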


if __name__ == "__main__":
    DRIVER_PATH = input("Enter DRIVER PATH location: ")
-     getproblem()
+     diff = select_difficulty()  # Accepting the difficulty level from the user
+     problems_link = extracting_problem_links(diff)  # scraping the required no. of links
+     getproblem(problems_link)  # saving the questions as PDF files
    os.remove('image.png')