forked from Anjan50/Python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweb_scrapper.py
77 lines (53 loc) · 1.51 KB
/
web_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
# Requirements (third-party packages):
#   requests
#   bs4
#   selenium
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
# Ask the user for a page to scrape and download it.
# raw_input() exists only on Python 2, while the rest of this script calls
# print() as a function; fall back to input() when raw_input is undefined so
# the prompt works on Python 3 without eval()-ing the answer on Python 2.
try:
    read_line = raw_input
except NameError:
    read_line = input
url = read_line("enter url: ").strip()
# Without a timeout requests waits forever on an unresponsive server.
source = requests.get(url, timeout=30)
def get_chrome_web_driver(options):
    """Start a Chrome WebDriver session using the local ``./chromedriver``.

    Args:
        options: Chrome options object handed through to
            ``webdriver.Chrome`` (for example the value returned by
            ``get_web_driver_options``).

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    # Imported locally so this function carries its own extra dependencies.
    import selenium
    from selenium.webdriver.chrome.service import Service

    driver_path = "./chromedriver"
    major_version = int(selenium.__version__.split(".")[0])
    if major_version >= 4:
        # Selenium 4 receives the driver binary through a Service object;
        # the positional path and ``chrome_options=`` are no longer accepted
        # by recent 4.x releases.
        return webdriver.Chrome(service=Service(driver_path), options=options)
    # Selenium 3.x: positional executable path, with ``options=`` in place
    # of the deprecated ``chrome_options=`` keyword.
    return webdriver.Chrome(driver_path, options=options)
def get_web_driver_options():
    """Return a freshly constructed ``webdriver.ChromeOptions`` instance."""
    return webdriver.ChromeOptions()
def set_ignore_certificate_error(options):
    """Add the ``--ignore-certificate-errors`` argument to *options*.

    Args:
        options: Object exposing ``add_argument(str)``; it is modified in
            place and nothing is returned.
    """
    options.add_argument('--ignore-certificate-errors')
def set_browser_as_incognito(options):
    """Add the ``--incognito`` argument to *options*.

    Args:
        options: Object exposing ``add_argument(str)``; it is modified in
            place and nothing is returned.
    """
    options.add_argument('--incognito')
# ---------------------------------------------------------------------------
# Part 1: inspect the page the user asked for (``source`` is the response
# fetched earlier in this script).
# ---------------------------------------------------------------------------
# Name the parser explicitly: the generic 'html' feature lets bs4 pick
# whichever HTML parser is installed, so output can differ between machines.
soup = BeautifulSoup(source.text, 'html.parser')

title = soup.find('title')
print("this is with html tags :", title)

# find() returns None when the tag is absent, so guard before reading .text.
heading = soup.find('h1')
print("this is without html tags:", heading.text if heading is not None else None)

links = soup.find('a')
print(links)
if links is not None:
    # Tag.get() yields None for a missing attribute instead of raising
    # KeyError (many anchors have no class, some have no href).
    print(links.get('href'))
    print(links.get('class'))

many_link = soup.find_all('a')
total_links = len(many_link)
print("total links in my website :", total_links)
print()
for link in many_link[:6]:
    print(link)

# The remaining checks look at the second anchor; skip them on pages that
# have fewer than two links rather than failing with IndexError.
if total_links > 1:
    second_link = many_link[1]
    print(second_link)
    print()
    print("href is :", second_link.get('href'))
    nested_div = second_link.find('div')
    print(nested_div)
    print()
    if nested_div is not None:
        # bs4 exposes the class attribute as a list of class names.
        div_classes = nested_div.get('class', [])
        print(div_classes)
        print(type(div_classes))
        print()
        print("class name of div is :", " ".join(div_classes))

# ---------------------------------------------------------------------------
# Part 2: fixed example against a Wikipedia article.
# ---------------------------------------------------------------------------
# Without a timeout requests waits forever on an unresponsive server.
wiki = requests.get("https://en.wikipedia.org/wiki/World_War_II", timeout=30)
soup = BeautifulSoup(wiki.text, 'html.parser')
print(soup.find('title'))

ww2_contents = soup.find_all("div", class_='toc')
for toc in ww2_contents:
    print(toc.text)

# NOTE(review): a space-separated class_ string is matched against the exact
# class attribute value, so this depends on the order of the two classes in
# the page markup - confirm against the current article HTML.
overview = soup.find_all('table', class_='infobox vevent')
for infobox in overview:
    print(infobox.text)