Skip to content

Commit

Permalink
Merge branch 'feature/command-auth' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
Merton committed Aug 13, 2018
2 parents 988ee20 + ff4717a commit 56a218b
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 38 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ __pycache__
broken_link_output.txt

# Application files
config.ini
config.ini
/log/*
28 changes: 21 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,34 +11,48 @@ To install the required packages use `pip`:

## Usage
### Command Line & Server side
To run the script, ensure you have python (min <2.7) installed and run:
`python linker.py`

Copy the contents of the `config.ini.example` file into `config.ini` in the same directory.

Modify your config file with the details for your site:

**GENERAL CONFIG**
#### GENERAL CONFIG

|Config option|Description|
|-------------|-----------|
SiteName|The name of your site. It is for display only and is used in the output (logs and email reports) for easier identification
UseLocalFile|yes (default) / no
LocalSitemapFile | File path + name relative to this directory
DownloadSitemap | yes / no (default)
RemoteSitemapUrl | The url of the sitemap hosted on your website
OutputToFile | yes (default) / no
OutputFileName | Name of the file in which the results will be stored. It can be placed elsewhere by using a relative path
LogfileDirectory| The directory where logs will be saved; ensure you have the correct permissions for the directory. The script will create a directory per site, i.e. `<LogFileDirectory>/linker/<SiteName>/<date-of-scan>`

**EMAIL CONFIG**

#### EMAIL CONFIG

|Config option|Description|
|-------------|-----------|
SiteName|The name of your site, this is for display only and gets used in the output for easier identification
EmailOutput|yes / no (default)
AdminEmailAddress|The address that emails will be sent from
AdminEmailPassword|The password of the Admins email account *PLAIN TEXT!*
AdminEmailPassword|The password of the Admin's email account -> **PLAIN TEXT!**
RecipientEmailAddress|The recipient's email where the output gets sent to

#### AUTH CONFIG

For sites that are protected behind a username and password, you can authenticate by providing the username and password in the config.

**WARNING** These are stored in plain text, so restrict the file permissions on `config.ini` to keep them as secure as possible.

|Config option|Description|
|-------------|-----------|
SiteUsername| The username for the protected site
SitePassword| The password for the protected site -> **Plain Text**


To run the script, ensure you have Python 3.2 or newer installed, then run:
`python linker.py`

### Graphical interface
Enter the linker directory, and run:
`python main.py`
Expand Down
15 changes: 12 additions & 3 deletions config.ini.example
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
;COPY THE CONTENTS OF THIS EXAMPLE INTO A "config.ini" FILE

[OPTIONS]
;Use exisiting sitemap on system, include xml extension
;Used in logging & email reports
SiteName=

;Use an existing sitemap on the system; include the .xml extension
UseLocalFile=yes
LocalSitemapFile=public/sitemap.xml
LocalSitemapFile=../public/sitemap.xml

;Downloads and uses the sitemap from the given url (e.g. google.com/sitemap.xml)
DownloadSitemap=no
Expand All @@ -12,12 +15,18 @@ RemoteSitemapUrl=
OutputToFile=yes
OutputFileName=broken_link_output.txt

;Logging - The directory where logs will be stored
LogfileDirectory=./log

;Emails the results to the Recipient address from the Admin address
[EMAIL]
SiteName=
EmailOutput=yes

AdminEmailAddress=
AdminEmailPassword=

RecipientEmailAddress=

[AUTH]
SiteUsername=
SitePassword=
100 changes: 73 additions & 27 deletions linker.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import configparser
import os
import os, sys
import smtplib
import datetime

Expand All @@ -10,6 +10,7 @@
import requests
from bs4 import BeautifulSoup as Soup

import logging

def check_links(site_map_file, auth=None):
tree = ET.parse(site_map_file)
Expand All @@ -22,18 +23,21 @@ def check_links(site_map_file, auth=None):
checked_links = []
site_url = root[0][0].text
print(site_url)
logging.info(site_url)

try:
if auth:
print(auth)
r = requests.get(site_url, auth=(auth[0], auth[1]))
else:
r = requests.get(site_url)
except:
print("Could not reach site")
log = "Could not reach site"
print(log)
logging.error(log)
quit

if r.status_code == 401:
logging.error("Received a 401 error from the site, try adding auth into the config.ini and retry")
return 401
# Loop over every <url> tag from the site map
for page_index, page in enumerate(root):
Expand All @@ -43,22 +47,27 @@ def check_links(site_map_file, auth=None):
url.encode('utf-8')

if (url not in checked_links) and ("/assets" not in url):
print('Page {} of {} | Checking url [{}]'.format(page_index + 1, url_count, url))

log = 'Page {} of {} | Checking url [{}]'.format(page_index + 1, url_count, url)
print(log)
logging.info(log)
try:
if auth:
r = requests.get(url, auth=(auth[0], auth[1]))
else:
r = requests.get(url)

except:
print("Uh oh, something went wrong checking {}".format(url))
log = "Uh oh, something went wrong checking {}".format(url)
print(log)
logging.info(log)
broken_links.append((url, "Unknown error", url))

status_code = r.status_code

if status_code != 200:
print('Non-OK response ({}) on url: {}'.format(url,status_code))
log = 'Non-OK response ({}) for url: {}'.format(status_code,url)
print(log)
logging.info(log)
broken_links.append((url, status_code, url))

checked_links.append(url)
Expand All @@ -76,31 +85,42 @@ def check_links(site_map_file, auth=None):
checked_links.append(link_url)

# Allows for links that are relative, ie - /contact
if link_url and not (link_url.startswith("http") or link_url.startswith("mailto:")):
if link_url and not (link_url.startswith("http")):
if link_url.startswith("/"):
link_url = site_url[0:-1] + link_url
elif link_url.startswith("mailto:"):
continue
else:
link_url = site_url + link_url

print('Page {} of {} | Link {} of {} | Checking url [{}]'.format(page_index + 1, url_count, link_index + 1, links_count, link_url))
log = 'Page {} of {} | Link {} of {} | Checking url [{}]'.format(page_index + 1, url_count, link_index + 1, links_count, link_url)
print(log)
logging.info(log)
try:
if auth:
r = requests.get(link_url, auth=(auth[0],auth[1]))
else:
r = requests.get(link_url)
except:
print("Uh oh, something went wrong checking {}".format(link_url))
log = "Uh oh, something went wrong checking {}".format(link_url)
print(log)
logging.info(log)

if link_url == '':
link_url = link
broken_links.append((link_url, "Unknown error", url))
else:
broken_links.append((link_url, "Unknown error", url))

status_code = r.status_code
if status_code != 200:
print('Non-OK response ({}) on link_url: {}'.format(link_url,status_code))
print(link)
log = 'Non-OK response ({}) at the link url: {}'.format(status_code,link_url)
print(log)
logging.info(log)

broken_links.append((link_url, status_code, url))
else:
continue

return broken_links

def download_map(url):
Expand All @@ -114,7 +134,9 @@ def download_map(url):

return site_map_file
except:
raise SystemExit("Could not download the xml file, please try again.")
log = "Could not download the xml file, please try again."
logging.error(log)
raise sys.exit(log)


def send_mail(config, subject, message):
Expand Down Expand Up @@ -146,20 +168,45 @@ def run():

gen_conf = config['GENERAL']
email_conf = config['EMAIL']
auth_conf = config['AUTH']
auth = (auth_conf['SiteUsername'],auth_conf['SitePassword'])

scan_date = datetime.datetime.now().strftime("%Y-%m-%d")

log_file_path = '{}/linker/{}/'.format(gen_conf['LogFileDirectory'], gen_conf['SiteName'])
log_file_name = scan_date
os.makedirs(log_file_path, exist_ok=True)

logging.basicConfig(
filename=log_file_path + log_file_name,
level=logging.INFO,
format=' %(asctime)s - %(levelname)s - %(message)s'
)

# Local file
if gen_conf['UseLocalFile'] == 'yes':
site_map_file = gen_conf['LocalSitemapFile']

# Download sitemap from given url
elif gen_conf['DownloadSitemap'] == 'yes':
site_map_file = download_map(gen_conf['RemoteSitemapUrl'])

else:
log = "Please specify either a local sitemap, or the address to download it remotely."
logging.error(log)
sys.exit(log)

# Check links
if os.path.exists(site_map_file):
broken_links = check_links(site_map_file)
if auth != ('', ''):
broken_links = check_links(site_map_file, auth)
else:
broken_links = check_links(site_map_file)

if broken_links == 401:
sys.exit("The site you are trying to check requires authenticating. Please add the auth details in the config.ini file and try again.")

else:
print("The file at {} could not be found. Please check the config and ensure the filepath is correct.".format(site_map_file))
log = "The file at {} could not be found. Please check the config and ensure the filepath is correct.".format(site_map_file)
logging.error(log)
sys.exit(log)

# If sitemap was downloaded, remove
try:
Expand All @@ -170,18 +217,16 @@ def run():

###### Output ######
# Outputs results to a file, the terminal and via email
scan_date = datetime.datetime.now().strftime("%c")

count_broken_links = len(broken_links)
# count_broken_links = 0

if count_broken_links > 0:
subject = "ALERT: Site Report for {} - {} Broken Links detected".format(email_conf['SiteName'], count_broken_links)
subject = "ALERT: Site Report for {} - {} Broken Links detected".format(gen_conf['SiteName'], count_broken_links)
message = """
Linker Scan - FAILED, Broken links found
Site: {}
Date Scanned: {}
No. Broken Links: {} \n
""".format(email_conf["SiteName"], scan_date, count_broken_links)
""".format(gen_conf["SiteName"], scan_date, count_broken_links)

# Formats the links into a human readable format
for url, error, location in broken_links:
Expand All @@ -193,6 +238,7 @@ def run():
==============================
""".format(str(url), str(location), str(error))
print( "Error: ", str(error), " => URL: ", str(url), "Location: ", str(location))
logging.info(broken_link)
message += broken_link

# Writes to file
Expand All @@ -201,12 +247,12 @@ def run():
file.write(message)
else:
# Emails the output to address specified in config.ini
subject = "PASSED: Site Report for {} - No Broken Links detected".format(email_conf['SiteName'])
subject = "PASSED: Site Report for {} - No Broken Links detected".format(gen_conf['SiteName'])
message = """
Linker Scan - PASSED, No broken links found
Site: {}
Date Scanned: {}
""".format(email_conf['SiteName'], scan_date)
""".format(gen_conf['SiteName'], scan_date)

if email_conf['EmailOutput'] == 'yes':
send_mail(email_conf, subject, message)
Expand Down

0 comments on commit 56a218b

Please sign in to comment.