Skip to content

Commit 2ac8f5e

Browse files
Merge pull request avinashkranjan#837 from smriti26raina/issue-754
Added script for Codechef Scrapper
2 parents 91e787a + 127b4a6 commit 2ac8f5e

File tree

3 files changed

+161
-0
lines changed

3 files changed

+161
-0
lines changed

Codechef Scrapper/README.md

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Codechef Scraper
2+
This Python script lets the user scrape 'n' CodeChef problems from any category/difficulty on https://www.codechef.com/, as chosen by the user. The script saves the information for each scraped problem into a separate PDF.
3+
4+
5+
## Prerequisite Steps:
6+
Install the required packages by running the following command in your terminal. (Make sure you're in the same project directory.)
7+
8+
```
9+
pip3 install -r requirements.txt
10+
11+
```
12+
13+
To run this script, you need to have Selenium installed and a Chrome WebDriver available in your `$PATH`. You can download ChromeDriver directly from the link below-
14+
https://chromedriver.chromium.org/downloads
15+
Alternatively, you can set the path to ChromeDriver explicitly using
16+
17+
```
18+
driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
19+
20+
```
21+
22+
23+
24+
## Running the script:
25+
After installing all the requirements, run this command in your terminal.
26+
27+
```
28+
python3 codechef.py
29+
30+
```
31+
32+
## Output:
33+
This script will generate 'n' different PDFs in the working directory, each storing one problem's information (problem title, problem statement, test cases, problem link).
34+
35+
![image](https://user-images.githubusercontent.com/30191221/113629602-46a4ff80-9684-11eb-8938-c6e8f934d3ae.png)
36+
37+
![image](https://user-images.githubusercontent.com/30191221/113629697-64726480-9684-11eb-9d14-3b1ac515d40e.png)
38+
39+
Author:
40+
[Smriti Raina](https://github.com/smriti26raina)

Codechef Scrapper/codechef.py

+117
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
from selenium import webdriver
import os

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from fpdf import FPDF

# Run Chrome headless so no browser window pops up while scraping.
options = webdriver.ChromeOptions()
options.add_argument("--headless")

# "none" page-load strategy: driver.get() returns immediately; we rely on
# the explicit WebDriverWait below instead of full-page-load completion.
capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"

driver = webdriver.Chrome(desired_capabilities=capa, options=options)
baseurl = "https://www.codechef.com/problems"
wait = WebDriverWait(driver, 15)  # up to 15 s for elements to appear

# map to get url path segment from its problem difficulty name
problem_difficulty = {"Beginner": "school", "Easy": "easy", "Medium": "medium", "Hard": "hard", "Challenge": "challenge"}
23+
24+
# get_problems returns the name and links of the problems
def get_problems(category, no_of_problems):
    """Scrape names and URLs of the first `no_of_problems` problems in a category.

    category: URL path segment for the difficulty page (e.g. "easy", "school").
    no_of_problems: how many rows of the listing table to read.
    Returns a dict mapping problem name -> problem URL (possibly shorter than
    requested if the page lists fewer problems).
    Exits the whole process if the listing page fails to render in time.
    """
    # A map to store problem name and problem url
    problem_info = {}
    try:
        driver.get(baseurl + '/' + category)
        # wait till the first element is loaded
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='primary-content']/div/div[2]/div/div[2]/table/tbody/tr[1]/td[1]/div/a/b")))
    except TimeoutException:
        print("Couldn't fetch problem. Network issue or page slow to render. Try again")
        os._exit(-1)

    for problem_index in range(1, no_of_problems + 1):
        try:
            problem_name = driver.find_element_by_xpath("//*[@id='primary-content']/div/div[2]/div/div[2]/table/tbody/tr[{}]/td[1]/div/a/b".format(problem_index)).text
            problem_url = driver.find_element_by_xpath("//*[@id='primary-content']/div/div[2]/div/div[2]/table/tbody/tr[{}]/td[1]/div/a".format(problem_index)).get_attribute('href')
        except NoSuchElementException:
            # Fewer problems listed than requested: stop and return what we found
            # instead of crashing with an uncaught exception.
            break
        print(problem_name, " ", problem_url)
        problem_info[problem_name] = problem_url
    return problem_info
45+
46+
# get_problem_description returns content of the problem
def get_problem_description(problem_url, problem_name):
    """Fetch one problem page and return its content as a dict.

    Returns {'title', 'statement', 'test_case', 'url'} on success, or None
    when the page times out or a required element cannot be located.
    """
    try:
        driver.get(problem_url)
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='problem-statement']/p[1]")))
        problem_title = problem_name
        problem_statement = driver.find_element_by_xpath("//*[@id='problem-statement']/p[1]").text
        problem_test_cases = driver.find_element_by_xpath("//*[@id='problem-statement']/pre[1]").text

        # Some pages keep input and output in separate <pre> tags; stitch them
        # together when the first block doesn't already contain "Output".
        if problem_test_cases.find("Output") == -1:
            problem_test_cases = "Input\n" + problem_test_cases
            problem_test_cases += "\nOutput\n"
            problem_test_cases += driver.find_element_by_xpath("//*[@id='problem-statement']/pre[2]").text

        # Page-load strategy is "none", so stop any still-pending loading.
        driver.execute_script("window.stop();")
        problem = {'title': problem_title, 'statement': problem_statement, 'test_case': problem_test_cases, 'url': problem_url}
        return problem

    # Handling exceptions: return None explicitly (the original fell through
    # the except handlers and returned None implicitly).
    except NoSuchElementException:
        print("Couldn't scrap the element, Unable to locate it")
        return None
    except TimeoutException:
        print("Couldn't scrap the element, Unable to locate it")
        return None
76+
77+
78+
79+
80+
81+
# storing the information in the pdf
def convert_to_pdf(problem):
    """Write one problem dict to "<title>.pdf" in the current directory.

    problem: dict with 'title', 'statement', 'test_case', 'url' keys,
    as produced by get_problem_description().
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size = 15)
    # Replace characters that aren't in the latin-1 character set
    # (classic FPDF only supports latin-1 text).
    title = problem["title"].encode('latin-1', 'replace').decode('latin-1')
    statement = problem["statement"].encode('latin-1', 'replace').decode('latin-1')
    test_case = problem["test_case"].encode('latin-1', 'replace').decode('latin-1')
    url = problem["url"]
    # add sections to pdf
    pdf.cell(200, 10, txt=title, ln=1, align='C')
    pdf.multi_cell(200, 10, txt=statement, align='L')
    pdf.multi_cell(200, 10, txt=test_case, align='L')
    pdf.write(5, 'Problem_Link: ')
    pdf.write(5, url, url)  # third argument makes the text a clickable link

    # Problem titles may contain characters that are invalid in file names
    # (e.g. '/'); replace them so pdf.output() doesn't fail.
    safe_title = "".join('_' if c in '\\/:*?"<>|' else c for c in title)
    pdf.output(safe_title + ".pdf")
100+
101+
102+
# main function
def main():
    """Prompt for a difficulty and a count, then scrape and export PDFs."""
    category = input("Enter the difficulty level from the following \n Beginner \n Easy \n Medium \n Hard \n Challenge \n\n").strip()
    # Guard against a KeyError on an unrecognized difficulty name.
    if category not in problem_difficulty:
        print("Unknown difficulty level: " + category)
        return
    no_of_problems = int(input("\n Enter the number of problems to be scrapped: \n"))
    info = get_problems(problem_difficulty[category], no_of_problems)
    for name, url in info.items():
        problem = get_problem_description(url, name)
        # Skip problems whose page could not be scraped.
        if problem is not None:
            convert_to_pdf(problem)
113+
114+
if __name__ == '__main__':
    try:
        main()
    finally:
        # Release the browser session even if scraping fails part-way,
        # and only when running as a script (not on import).
        driver.close()

Codechef Scrapper/requirements.txt

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
fpdf==1.7.2
2+
requests==2.24.0
3+
selenium==3.141.0
4+
urllib3==1.25.11

0 commit comments

Comments
 (0)