Skip to content

Commit 2ac8f5e

Browse files
Merge pull request avinashkranjan#837 from smriti26raina/issue-754
Added script for Codechef Scrapper
2 parents 91e787a + 127b4a6 commit 2ac8f5e

File tree

3 files changed

+161
-0
lines changed

3 files changed

+161
-0
lines changed

Codechef Scrapper/README.md

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Codechef Scraper
2+
This Python script lets the user scrape 'n' CodeChef problems from any category/difficulty on https://www.codechef.com/, as chosen by the user. The script saves the information for each scraped problem into a separate PDF.
3+
4+
5+
## Prerequisite Steps:
6+
Install the required packages by running the following command in your terminal. (Make sure you're in the same project directory.)
7+
8+
```
9+
pip3 install -r requirements.txt
10+
11+
```
12+
13+
To run this script, you need to have Selenium installed and a Chrome WebDriver available in your `$PATH`. You can download ChromeDriver directly from the link below-
14+
https://chromedriver.chromium.org/downloads
15+
Alternatively, you can set the path to ChromeDriver explicitly using
16+
17+
```
18+
driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
19+
20+
```
21+
22+
23+
24+
## Running the script:
25+
After installing all the requirements, run this command in your terminal.
26+
27+
```
28+
python3 codechef.py
29+
30+
```
31+
32+
## Output:
33+
This script will generate 'n' different PDFs in the working directory, each storing one problem's information (problem title, problem statement, test cases, problem link).
34+
35+
![image](https://user-images.githubusercontent.com/30191221/113629602-46a4ff80-9684-11eb-8938-c6e8f934d3ae.png)
36+
37+
![image](https://user-images.githubusercontent.com/30191221/113629697-64726480-9684-11eb-9d14-3b1ac515d40e.png)
38+
39+
Author:
40+
[Smriti Raina](https://github.com/smriti26raina)

Codechef Scrapper/codechef.py

+117
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
from selenium import webdriver
import os

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from fpdf import FPDF

# Run Chrome headless so no browser window pops up while scraping.
options = webdriver.ChromeOptions()
options.add_argument("--headless")

# "none" page-load strategy: driver.get() returns immediately; we rely on
# the explicit WebDriverWait below instead of full-page-load completion.
capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"

driver = webdriver.Chrome(desired_capabilities=capa, options=options)
baseurl = "https://www.codechef.com/problems"
wait = WebDriverWait(driver, 15)  # up to 15 s for elements to appear

# map to get url path segment from its problem difficulty name
problem_difficulty = {"Beginner": "school", "Easy": "easy", "Medium": "medium", "Hard": "hard", "Challenge": "challenge"}
23+
24+
# get_problems returns the name and links of the problems
def get_problems(category, no_of_problems):
    """Scrape names and URLs of the first `no_of_problems` problems in a category.

    category: URL path segment for the difficulty page (e.g. "easy", "school").
    no_of_problems: how many rows of the listing table to read.
    Returns a dict mapping problem name -> problem URL (possibly shorter than
    requested if the page lists fewer problems).
    Exits the whole process if the listing page fails to render in time.
    """
    # A map to store problem name and problem url
    problem_info = {}
    try:
        driver.get(baseurl + '/' + category)
        # wait till the first element is loaded
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='primary-content']/div/div[2]/div/div[2]/table/tbody/tr[1]/td[1]/div/a/b")))
    except TimeoutException:
        print("Couldn't fetch problem. Network issue or page slow to render. Try again")
        os._exit(-1)

    for problem_index in range(1, no_of_problems + 1):
        try:
            problem_name = driver.find_element_by_xpath("//*[@id='primary-content']/div/div[2]/div/div[2]/table/tbody/tr[{}]/td[1]/div/a/b".format(problem_index)).text
            problem_url = driver.find_element_by_xpath("//*[@id='primary-content']/div/div[2]/div/div[2]/table/tbody/tr[{}]/td[1]/div/a".format(problem_index)).get_attribute('href')
        except NoSuchElementException:
            # Fewer problems listed than requested: stop and return what we found
            # instead of crashing with an uncaught exception.
            break
        print(problem_name, " ", problem_url)
        problem_info[problem_name] = problem_url
    return problem_info
45+
46+
# get_problem_description returns content of the problem
def get_problem_description(problem_url, problem_name):
    """Fetch one problem page and return its content as a dict.

    Returns {'title', 'statement', 'test_case', 'url'} on success, or None
    when the page times out or a required element cannot be located.
    """
    try:
        driver.get(problem_url)
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='problem-statement']/p[1]")))
        problem_title = problem_name
        problem_statement = driver.find_element_by_xpath("//*[@id='problem-statement']/p[1]").text
        problem_test_cases = driver.find_element_by_xpath("//*[@id='problem-statement']/pre[1]").text

        # Some pages keep input and output in separate <pre> tags; stitch them
        # together when the first block doesn't already contain "Output".
        if problem_test_cases.find("Output") == -1:
            problem_test_cases = "Input\n" + problem_test_cases
            problem_test_cases += "\nOutput\n"
            problem_test_cases += driver.find_element_by_xpath("//*[@id='problem-statement']/pre[2]").text

        # Page-load strategy is "none", so stop any still-pending loading.
        driver.execute_script("window.stop();")
        problem = {'title': problem_title, 'statement': problem_statement, 'test_case': problem_test_cases, 'url': problem_url}
        return problem

    # Handling exceptions: return None explicitly (the original fell through
    # the except handlers and returned None implicitly).
    except NoSuchElementException:
        print("Couldn't scrap the element, Unable to locate it")
        return None
    except TimeoutException:
        print("Couldn't scrap the element, Unable to locate it")
        return None
76+
77+
78+
79+
80+
81+
# storing the information in the pdf
def convert_to_pdf(problem):
    """Write one problem dict to "<title>.pdf" in the current directory.

    problem: dict with 'title', 'statement', 'test_case', 'url' keys,
    as produced by get_problem_description().
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size = 15)
    # Replace characters that aren't in the latin-1 character set
    # (classic FPDF only supports latin-1 text).
    title = problem["title"].encode('latin-1', 'replace').decode('latin-1')
    statement = problem["statement"].encode('latin-1', 'replace').decode('latin-1')
    test_case = problem["test_case"].encode('latin-1', 'replace').decode('latin-1')
    url = problem["url"]
    # add sections to pdf
    pdf.cell(200, 10, txt=title, ln=1, align='C')
    pdf.multi_cell(200, 10, txt=statement, align='L')
    pdf.multi_cell(200, 10, txt=test_case, align='L')
    pdf.write(5, 'Problem_Link: ')
    pdf.write(5, url, url)  # third argument makes the text a clickable link

    # Problem titles may contain characters that are invalid in file names
    # (e.g. '/'); replace them so pdf.output() doesn't fail.
    safe_title = "".join('_' if c in '\\/:*?"<>|' else c for c in title)
    pdf.output(safe_title + ".pdf")
100+
101+
102+
# main function
def main():
    """Prompt for a difficulty and a count, then scrape and export PDFs."""
    category = input("Enter the difficulty level from the following \n Beginner \n Easy \n Medium \n Hard \n Challenge \n\n").strip()
    # Guard against a KeyError on an unrecognized difficulty name.
    if category not in problem_difficulty:
        print("Unknown difficulty level: " + category)
        return
    no_of_problems = int(input("\n Enter the number of problems to be scrapped: \n"))
    info = get_problems(problem_difficulty[category], no_of_problems)
    for name, url in info.items():
        problem = get_problem_description(url, name)
        # Skip problems whose page could not be scraped.
        if problem is not None:
            convert_to_pdf(problem)
113+
114+
if __name__ == '__main__':
    try:
        main()
    finally:
        # Release the browser session even if scraping fails part-way,
        # and only when running as a script (not on import).
        driver.close()

Codechef Scrapper/requirements.txt

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
fpdf==1.7.2
2+
requests==2.24.0
3+
selenium==3.141.0
4+
urllib3==1.25.11

0 commit comments

Comments
 (0)