# webscraper.py
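# Requires the 'selenium' and 'webdriver-manager' packages:
#   pip install selenium webdriver-manager
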
import json
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Set up the service for ChromeDriver
service = Service(ChromeDriverManager().install())
# Initialize the Chrome WebDriver with the service
driver = webdriver.Chrome(service=service)
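# Optional sketch (an assumption, not part of the original script): to run without
# a visible browser window, construct the driver with headless Chrome options:
#   options = webdriver.ChromeOptions()
#   options.add_argument('--headless=new')
#   driver = webdriver.Chrome(service=service, options=options)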
# URL of the fish listing page
listing_url = 'https://www.fishkeeper.co.uk/fish/marine/fish'
# Open the page
driver.get(listing_url)
# Track unique fish names in a set and their detail-page URLs in a list
fish_names = set()
fish_urls = []
scroll_pause_time = 3 # Pause time between scrolls
while True:
    # Scroll down in small increments so lazily loaded cards have a chance to render
    driver.execute_script("window.scrollBy(0, 600);")
    # Wait for a short time to allow new elements to load
    time.sleep(scroll_pause_time)
    # Recheck all fish cards currently on the page
    fish_elements = driver.find_elements(By.CLASS_NAME, 'data-card__name')
    fish_links = driver.find_elements(By.CSS_SELECTOR, 'a.data-card')
    # Record any fish names and links we have not seen before
    new_names_added = False
    for fish, link in zip(fish_elements, fish_links):
        if fish.text not in fish_names:
            fish_names.add(fish.text)
            fish_urls.append(link.get_attribute('href'))
            new_names_added = True
    # If no new names were added and we have reached the bottom, stop scrolling.
    # '>=' rather than '==' guards against sub-pixel rounding in the scroll offset.
    new_height = driver.execute_script("return document.body.scrollHeight")
    last_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
    if last_height >= new_height and not new_names_added:
        break
# Map URL slugs to display categories, based on known patterns in the URLs.
# More specific slugs must come first: the matching loop below breaks on the
# first hit, and "angelfish" is a substring of "dwarf-angelfish".
categories = {
    "dwarf-angelfish": "Dwarf Angelfish",
    "angelfish": "Angelfish",
    "anthias": "Anthias",
    "basslets": "Basslets",
    "dottybacks": "Dottybacks",
    "blennies": "Blennies",
    "gobies": "Gobies",
    "butterflyfish": "Butterfly fish",
    "cardinalfish": "Cardinal fish",
    "damselfish": "Damsel fish",
    "clownfish": "Clown fish",
    "hawkfish": "Hawk fish",
    "lion": "Lion fish",
    "scorpion": "Scorpion fish",
    "miscellaneous-fish": "Miscellaneous Fish",
    "pufferfish": "Puffer fish",
    "rabbitfish": "Rabbit fish",
    "triggers": "Triggers",
    "filefish": "File fish",
    "wrasses": "Wrasses"
}
# Initialize an empty list to store fish details
fish_data = []
# Now visit each fish's detail page to scrape detailed information
for index, url in enumerate(fish_urls):
    driver.get(url)
    print(f"\nScraping details for fish {index + 1}/{len(fish_urls)}: {url}")
    fish_details = {}
    # Determine the category based on the URL
    fish_category = "Miscellaneous Fish"  # Default category
    for key, value in categories.items():
        if key in url.lower():
            fish_category = value
            break
    fish_details['category'] = fish_category
    # Wait for the Overview section to load
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'data-view__title'))
        )
        # Scrape the fish name (confirming we're on the right page)
        fish_name = driver.find_element(By.CLASS_NAME, 'data-view__description').text.strip()
        fish_details['name'] = fish_name
        # Scrape the Overview section (key details)
        overview_table = driver.find_element(By.CLASS_NAME, 'data-view__table')
        rows = overview_table.find_elements(By.TAG_NAME, 'tr')
        overview_details = {}
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, 'td')
            if len(cells) < 2:
                continue  # Skip rows that lack a label/value pair
            label = cells[0].text.strip()
            # Rows styled with 'italic-style' keep their value in that element;
            # otherwise the value is simply the second cell
            if 'italic-style' in (row.get_attribute('class') or ''):
                value = row.find_element(By.CLASS_NAME, 'italic-style').text.strip()
            else:
                value = cells[1].text.strip()
            overview_details[label] = value
        fish_details['overview'] = overview_details
        # Scrape the Description section
        try:
            description_section = driver.find_element(By.ID, 'care')
            fish_details['description'] = description_section.text.strip()
        except NoSuchElementException:
            fish_details['description'] = "No description found."
        # Scrape the Feeding section
        try:
            feeding_section = driver.find_element(By.ID, 'feeding')
            fish_details['feeding'] = feeding_section.text.strip()
        except NoSuchElementException:
            fish_details['feeding'] = "No feeding details found."
        # Scrape the Breeding section
        try:
            breeding_section = driver.find_element(By.ID, 'breeding')
            fish_details['breeding'] = breeding_section.text.strip()
        except NoSuchElementException:
            fish_details['breeding'] = "No breeding details found."
        # Append the fish details to the list
        fish_data.append(fish_details)
    except Exception as e:
        print(f"Failed to scrape details for {url}: {e}")
    # Pause briefly before the next fish to avoid overwhelming the server
    time.sleep(2)
# Save the fish data to a JSON file
with open('fish_data.json', 'w') as json_file:
    json.dump(fish_data, json_file, indent=4)
print("Data has been saved to fish_data.json")
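
# Each entry in fish_data.json should have the following shape (a sketch based on
# the fields collected above, not verified output; values depend on the page):
# {
#     "category": "...",
#     "name": "...",
#     "overview": {"<label>": "<value>", ...},
#     "description": "...",
#     "feeding": "...",
#     "breeding": "..."
# }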
# Close the browser
driver.quit()
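
# Run with: python webscraper.py (assumes Chrome is installed locally;
# webdriver_manager downloads a matching ChromeDriver automatically).
# Wrapping the scraping code in try/finally would guarantee driver.quit()
# runs even if an unhandled error occurs mid-scrape.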