Skip to content

Commit

Permalink
Merge branch 'release/0.2'
Browse files Browse the repository at this point in the history
  • Loading branch information
Merton committed Aug 10, 2018
2 parents 341b322 + 1a02d52 commit 48b940b
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 32 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,8 @@ __pycache__
*.pyc
*.xml

# Generated files
broken_link_output.txt

# Application files
error_results.txt
config.ini
37 changes: 30 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,42 @@ To install the required packages use `pip`:
`pip install -r requirements.txt`

## Usage
### Command Line & Server side
To run the script, ensure you have Python 3 installed (`linker.py` imports the Python 3 `configparser` module) and run:
`python linker.py`

Copy the contents of the `config.ini.example` file into `config.ini` in the same directory.

Modify your config file with the details for your site:

**GENERAL CONFIG**

|Config option|Description|
|-------------|-----------|
UseLocalFile|yes (default) / no
LocalSitemapFile | File path + name relative to this directory
DownloadSitemap | yes / no (default)
RemoteSitemapUrl | The url of the sitemap hosted on your website
OutputToFile | yes (default) / no
OutputFileName | Name of the file the results are written to. A relative path can be used to place it in another directory

**EMAIL CONFIG**

|Config option|Description|
|-------------|-----------|
SiteName|The name of your site, this is for display only and gets used in the output for easier identification
EmailOutput|yes / no (default)
AdminEmailAddress|The address that emails will be sent from
AdminEmailPassword|The password of the Admin's email account (*stored in PLAIN TEXT!*)
RecipientEmailAddress|The recipient's email where the output gets sent to

### Graphical interface
Enter the linker directory, and run:
`python main.py`

From here you can enter or browse for the filename of the XML sitemap, and click enter.

#### HTTP Auth
If your site has a username and password,

### Command Line
To run the script, ensure you have python (min <2.7) installed and run:
`python linker.py`

You will be asked if you want to use a sitemap hosted on your website, these are often found at `/sitemap.xml`. If you would rather use a local xml file, leave it blank and you will be prompted for the filepath of the sitemap.
If your site has http authentication, then you will be asked to enter the username and password for the site. These details are not stored.

The script will carry out the test on every url, and then output a report of all the broken links found.
23 changes: 23 additions & 0 deletions config.ini.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
;COPY THE CONTENTS OF THIS EXAMPLE INTO A "config.ini" FILE

[OPTIONS]
;Use existing sitemap on system, include xml extension
UseLocalFile=yes
LocalSitemapFile=public/sitemap.xml

;Downloads and uses the sitemap from the given url (ie, google.com/sitemap.xml)
DownloadSitemap=no
RemoteSitemapUrl=

OutputToFile=yes
OutputFileName=broken_link_output.txt

;Emails the results to the Recipient address from the Admin address
[EMAIL]
SiteName=
EmailOutput=yes

AdminEmailAddress=
AdminEmailPassword=

RecipientEmailAddress=
127 changes: 103 additions & 24 deletions linker.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
import configparser
import os
import smtplib
import datetime

from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import xml.etree.ElementTree as ET
import requests
from bs4 import BeautifulSoup as Soup


def check_links(site_map_file, auth=None):
tree = ET.parse(site_map_file)
root = tree.getroot()
Expand Down Expand Up @@ -91,24 +100,12 @@ def check_links(site_map_file, auth=None):
broken_links.append((link_url, status_code, url))
else:
continue
# Outputs results to a file and terminal, returns results
with open('error_results.txt', 'w') as file:
for url, error, location in broken_links:
broken_link = """
===== BROKEN LINK ============
Broken Link Path: {}
Location: {}
Error: {}
==============================
""".format(str(url), str(location), str(error))
print( "Error: ", str(error), " => URL: ", str(url), "Location: ", str(location))
file.write(broken_link)


return broken_links

def download_map(url):
# Download XML sitemap from given web address and save to file
site_map_file = 'xml_sitemap.xml'
site_map_file = 'tmp_sitemap.xml'
try:
# Get the xml map from the site
xml = requests.get(url, stream=True)
Expand All @@ -119,18 +116,100 @@ def download_map(url):
except:
raise SystemExit("Could not download the xml file, please try again.")


def send_mail(config, subject, message):
    """Send a plain-text email using the EMAIL settings from config.ini.

    Args:
        config: Mapping (e.g. the ``[EMAIL]`` config section) providing the
            keys ``AdminEmailAddress``, ``AdminEmailPassword`` and
            ``RecipientEmailAddress``.
        subject: Text used for the ``Subject`` header.
        message: Plain-text body of the email.

    Raises:
        KeyError: If one of the required keys is missing from ``config``.
        smtplib.SMTPException: If the TLS upgrade, login or send fails.
        OSError: If the connection to the SMTP server cannot be made.
    """
    sender = config['AdminEmailAddress']
    recipient = config['RecipientEmailAddress']

    # Build the message before connecting so a bad config key fails
    # without ever opening a network connection.
    msg = MIMEMultipart()
    msg['From'] = sender
    msg['To'] = recipient
    msg['Subject'] = subject
    msg.attach(MIMEText(message, 'plain'))

    # The context manager issues QUIT and closes the socket even when
    # starttls/login/sendmail raises, so the connection is never leaked.
    with smtplib.SMTP('smtp.gmail.com', 587) as server:
        server.starttls()
        server.login(sender, config['AdminEmailPassword'])
        server.sendmail(sender, recipient, msg.as_string())


# Allows command line running
def run():
    """Scan the configured sitemap for broken links and report the result.

    Reads ``config.ini`` from the current directory, picks the sitemap
    (a local file, or one downloaded via ``download_map``), runs
    ``check_links`` on it and then:

    * prints every broken link to the terminal,
    * writes the report to ``OutputFileName`` when ``OutputToFile`` is
      ``yes`` and broken links were found,
    * emails the report via ``send_mail`` when ``EmailOutput`` is ``yes``.

    Raises:
        SystemExit: If the config has no general section, no sitemap source
            is enabled, or the sitemap file does not exist.
        KeyError: If the ``[EMAIL]`` section or a required option is missing.
    """
    config = configparser.ConfigParser()
    config.read('config.ini')

    # config.ini.example names the general section [OPTIONS] while this
    # function reads [GENERAL]; accept either so the shipped example works.
    if config.has_section('GENERAL'):
        gen_conf = config['GENERAL']
    elif config.has_section('OPTIONS'):
        gen_conf = config['OPTIONS']
    else:
        raise SystemExit(
            "config.ini must contain a [GENERAL] (or [OPTIONS]) section. "
            "Copy config.ini.example to config.ini and fill it in."
        )
    email_conf = config['EMAIL']

    # Local file takes precedence over downloading.
    if gen_conf['UseLocalFile'] == 'yes':
        site_map_file = gen_conf['LocalSitemapFile']
    elif gen_conf['DownloadSitemap'] == 'yes':
        site_map_file = download_map(gen_conf['RemoteSitemapUrl'])
    else:
        # Previously fell through and crashed with an unbound variable.
        raise SystemExit(
            "No sitemap source configured. Set UseLocalFile or "
            "DownloadSitemap to 'yes' in config.ini."
        )

    try:
        if not os.path.exists(site_map_file):
            # Stop here: there are no results to report without a sitemap.
            raise SystemExit("The file at {} could not be found. Please check the config and ensure the filepath is correct.".format(site_map_file))
        broken_links = check_links(site_map_file)
    finally:
        # If sitemap was downloaded, remove it (best effort, also on errors).
        try:
            os.remove('tmp_sitemap.xml')
        except FileNotFoundError:
            pass

    ###### Output ######
    # Outputs results to a file, the terminal and via email
    scan_date = datetime.datetime.now().strftime("%c")
    count_broken_links = len(broken_links)

    if count_broken_links > 0:
        subject = "ALERT: Site Report for {} - {} Broken Links detected".format(email_conf['SiteName'], count_broken_links)
        parts = ["""
Linker Scan - FAILED, Broken links found
Site: {}
Date Scanned: {}
No. Broken Links: {} \n
""".format(email_conf["SiteName"], scan_date, count_broken_links)]

        # Formats the links into a human readable format
        for url, error, location in broken_links:
            print( "Error: ", str(error), " => URL: ", str(url), "Location: ", str(location))
            parts.append("""
===== BROKEN LINK ============
Destination: {}
Location: {}
Error: {}
==============================
""".format(str(url), str(location), str(error)))
        message = "".join(parts)

        # Writes to file
        if gen_conf['OutputToFile'] == 'yes':
            with open(gen_conf['OutputFileName'], 'w') as file:
                file.write(message)
    else:
        subject = "PASSED: Site Report for {} - No Broken Links detected".format(email_conf['SiteName'])
        message = """
Linker Scan - PASSED, No broken links found
Site: {}
Date Scanned: {}
""".format(email_conf['SiteName'], scan_date)

    # Emails the output to address specified in config.ini
    if email_conf['EmailOutput'] == 'yes':
        send_mail(email_conf, subject, message)

if __name__ == "__main__":
run()
Expand Down

0 comments on commit 48b940b

Please sign in to comment.