diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4b2c497 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +# Python files +__pycache__ +*.pyc +*.xml + +# Application files +error_results.txt \ No newline at end of file diff --git a/gui.py b/gui.py index fb6376a..746e728 100644 --- a/gui.py +++ b/gui.py @@ -1,31 +1,107 @@ import linker import tkinter as tk +from tkinter import filedialog, ttk -class Application(tk.Frame): +class AuthDialog(): + auth = () + def __init__(self, parent): + self.win = tk.Toplevel(parent) + self.win.wm_title("Authentication Required") + + self.u = tk.Label(self.win, text="Username") + self.u.grid(row=0, column=0) + + self.auth_user = tk.Entry(self.win) + self.auth_user.grid(row=0, column=1) + + self.p = tk.Label(self.win, text="Password") + self.p.grid(row=1, column=0) + + self.auth_pass = tk.Entry(self.win) + self.auth_pass.grid(row=1, column=1) + + self.b = ttk.Button(self.win, text="Enter", command=self.return_auth) + self.b.grid(row=1, column=0) + + def return_auth(self): + print("clicked") + self.auth = self.auth_user.get(), self.auth_pass.get() + self.win.destroy() + + +class LinkerGUI(tk.Frame): PAD_X = 25 PAD_Y = 25 + user = '' + pswd = '' + def __init__(self, master=None): tk.Frame.__init__(self, master) self.grid() self.createWidgets() + def get_broken_links(self): + self.results.delete(0,tk.END) + filename = self.file_input.get() + if filename != "": + errors = linker.check_links(filename) + if errors == 401: + print("Auth required") + popup = AuthDialog(self) + self.wait_window(popup.win) + auth = popup.auth + + print(auth) + errors = linker.check_links(filename, auth) + + for url, error, location in errors: + self.results.insert(tk.END, "===== BROKEN LINK DETECTED ======") + self.results.insert(tk.END, "Broken Link path: ", url) + self.results.insert(tk.END, "Error Code: ", error) + self.results.insert(tk.END, "Location of broken URL: ",location) + self.results.insert(tk.END, "================================") + self.results.insert(tk.END, " ") + + else: + print("No file specified!") + + def browse_file(self): + self.filename = filedialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("XML Files","*.xml"),("all files","*.*"))) + self.file_input.insert(0, self.filename) + def createWidgets(self): - self.filename = tk.Label(self,text="Filename") - self.filename.grid(row=0) + # Set title of window + self.winfo_toplevel().title("Linker") + + self.filename_label = tk.Label(self,text="Filename") + self.filename_label.grid(row=0) self.file_input = tk.Entry(self) self.file_input.grid(row=0, column=1) + + self.results = tk.Listbox(self) + self.results.config(width=70) + self.results.grid(row=1, column=0, columnspan=3) + + # ====== Buttons ====== # + # Browse + self.browse = tk.Button(self) + self.browse["text"] = "Browse..." + self.browse["command"] = self.browse_file + self.browse.grid(row=0, column=2) + + # Quit + self.QUIT = tk.Button(self, text="QUIT", fg="red", command=self.quit) + self.QUIT.grid(row=3, column=0) + # Enter self.enter = tk.Button(self) self.enter["text"] = "Enter" - self.enter["command"] = lambda: linker.check_links(self.file_input.get()) - self.enter.grid(row=1, column=0) + self.enter["command"] = self.get_broken_links + self.enter.grid(row=3, column=2) - self.QUIT = tk.Button(self, text="QUIT", fg="red", - command=self.quit) - self.QUIT.grid(row=1, column=1) def run(): root = tk.Tk() - app = Application(master=root) + app = LinkerGUI(master=root) app.mainloop() \ No newline at end of file diff --git a/linker.py b/linker.py index 59e275c..554a5e3 100644 --- a/linker.py +++ b/linker.py @@ -2,7 +2,7 @@ import requests from bs4 import BeautifulSoup as Soup -def check_links(site_map_file): +def check_links(site_map_file, auth=None): tree = ET.parse(site_map_file) root = tree.getroot() url_count = len(list(root)) @@ -11,23 +11,43 @@ def check_links(site_map_file): broken_links = [] # Every checked page url, including it's links from a tags checked_links = [] - + site_url = root[0][0].text + print(site_url) + + try: + if auth: + print(auth) + r = requests.get(site_url, auth=(auth[0], auth[1])) + else: + r = requests.get(site_url) + except: + print("Could not reach site") + quit + + if r.status_code == 401: + return 401 # Loop over every tag from the site map - for index, page in enumerate(root): + for page_index, page in enumerate(root): # Get the tag its contents url = page[0].text # Encoded to include multi-lang urls url.encode('utf-8') - print('{} of {} | Checking url [{}]'.format(index + 1, url_count, url)) - if url not in checked_links and "/assets" not in url: + if (url not in checked_links) and ("/assets" not in url): + print('Page {} of {} | Checking url [{}]'.format(page_index + 1, url_count, url)) + try: - r = requests.get(url) + if auth: + r = requests.get(url, auth=(auth[0], auth[1])) + else: + r = requests.get(url) + except: print("Uh oh, something went wrong checking {}".format(url)) broken_links.append((url, "Unknown error", url)) status_code = r.status_code + if status_code != 200: print('Non-OK response ({}) on url: {}'.format(url,status_code)) broken_links.append((url, status_code, url)) @@ -39,31 +59,55 @@ def check_links(site_map_file): links = soup.find_all('a') links_count = len(links) - for index, link in enumerate(links): + # Checks every link's href () on the page + for link_index, link in enumerate(links): link_url = link.get('href') - print('{} of {} | Checking url [{}]'.format(index + 1, links_count, link_url)) if link_url not in checked_links: + checked_links.append(link_url) + + # Allows for links that are relative, ie - /contact + if link_url and not (link_url.startswith("http") or link_url.startswith("mailto:")): + if link_url.startswith("/"): + link_url = site_url[0:-1] + link_url + else: + link_url = site_url + link_url + + print('Page {} of {} | Link {} of {} | Checking url [{}]'.format(page_index + 1, url_count, link_index + 1, links_count, link_url)) try: - r = requests.get(link_url) + if auth: + r = requests.get(link_url, auth=(auth[0],auth[1])) + else: + r = requests.get(link_url) except: print("Uh oh, something went wrong checking {}".format(link_url)) + if link_url == '': + link_url = link broken_links.append((link_url, "Unknown error", url)) - status_code = r.status_code if status_code != 200: print('Non-OK response ({}) on link_url: {}'.format(link_url,status_code)) + print(link) broken_links.append((link_url, status_code, url)) - - checked_links.append(link_url) - else: - print('Link already checked') - continue - - for url, error, location in broken_links: - print( "Error: " + str(error), " => URL: " + url, "Location: " + location) - - + else: + continue + # Outputs results to a file and terminal, returns results + with open('error_results.txt', 'w') as file: + for url, error, location in broken_links: + broken_link = """ + ===== BROKEN LINK ============ + Broken Link Path: {} + Location: {} + Error: {} + ============================== + """.format(str(url), str(location), str(error)) + print( "Error: ", str(error), " => URL: ", str(url), "Location: ", str(location)) + file.write(broken_link) + + + return broken_links + +# Allows command line running def run(): print("Enter sitemap url - ie, https://www.google.com/sitemap.xml [Leave blank to use local file]:") url = input() @@ -82,9 +126,6 @@ def run(): raise SystemExit("Could not download the xml file, please try again.") - check_links(site_map_file) - - if __name__ == "__main__": run() \ No newline at end of file diff --git a/main.py b/main.py index adb36a8..db6eefb 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,3 @@ -import linker, gui +import gui -gui.run() -linker.run() \ No newline at end of file +gui.run() \ No newline at end of file