\d{2})-(?P\d{2})'
+ tokens = []
+ offset = 0
+
+ # Escape all special RegEx chars
+ escaped_fmt = re.escape(fmt)
+
+ # Extract the bracketed expressions to be reinserted later.
+ escaped_fmt = re.sub(self._ESCAPE_RE, "#", escaped_fmt)
+
+ # Any number of S is the same as one.
+ # TODO: allow users to specify the number of digits to parse
+ escaped_fmt = re.sub(r"S+", "S", escaped_fmt)
+
+ escaped_data = re.findall(self._ESCAPE_RE, fmt)
+
+ fmt_pattern = escaped_fmt
+
+ for m in self._FORMAT_RE.finditer(escaped_fmt):
+ token = m.group(0)
+ try:
+ input_re = self._input_re_map[token]
+ except KeyError:
+ raise ParserError("Unrecognized token '{}'".format(token))
+ input_pattern = "(?P<{}>{})".format(token, input_re.pattern)
+ tokens.append(token)
+ # a pattern doesn't have the same length as the token
+ # it replaces! We keep the difference in the offset variable.
+ # This works because the string is scanned left-to-right and matches
+ # are returned in the order found by finditer.
+ fmt_pattern = (
+ fmt_pattern[: m.start() + offset]
+ + input_pattern
+ + fmt_pattern[m.end() + offset :]
+ )
+ offset += len(input_pattern) - (m.end() - m.start())
+
+ final_fmt_pattern = ""
+ split_fmt = fmt_pattern.split(r"\#")
+
+ # Due to the way Python splits, 'split_fmt' will always be longer
+ for i in range(len(split_fmt)):
+ final_fmt_pattern += split_fmt[i]
+ if i < len(escaped_data):
+ final_fmt_pattern += escaped_data[i][1:-1]
+
+ # Wrap final_fmt_pattern in a custom word boundary to strictly
+ # match the formatting pattern and filter out date and time formats
+ # that include junk such as: blah1998-09-12 blah, blah 1998-09-12blah,
+ # blah1998-09-12blah. The custom word boundary matches every character
+ # that is not a whitespace character to allow for searching for a date
+ # and time string in a natural language sentence. Therefore, searching
+ # for a string of the form YYYY-MM-DD in "blah 1998-09-12 blah" will
+ # work properly.
+ # Certain punctuation before or after the target pattern such as
+ # "1998-09-12," is permitted. For the full list of valid punctuation,
+ # see the documentation.
+
+ starting_word_boundary = (
+ r"(?\s])" # This is the list of punctuation that is ok before the pattern (i.e. "It can't not be these characters before the pattern")
+ r"(\b|^)" # The \b is to block cases like 1201912 but allow 201912 for pattern YYYYMM. The ^ was necessary to allow a negative number through i.e. before epoch numbers
+ )
+ ending_word_boundary = (
+ r"(?=[\,\.\;\:\?\!\"\'\`\[\]\{\}\(\)\<\>]?" # Positive lookahead stating that these punctuation marks can appear after the pattern at most 1 time
+ r"(?!\S))" # Don't allow any non-whitespace character after the punctuation
+ )
+ bounded_fmt_pattern = r"{}{}{}".format(
+ starting_word_boundary, final_fmt_pattern, ending_word_boundary
+ )
+
+ return tokens, re.compile(bounded_fmt_pattern, flags=re.IGNORECASE)
+
+ def _parse_token(self, token, value, parts):
+
+ if token == "YYYY":
+ parts["year"] = int(value)
+
+ elif token == "YY":
+ value = int(value)
+ parts["year"] = 1900 + value if value > 68 else 2000 + value
+
+ elif token in ["MMMM", "MMM"]:
+ parts["month"] = self.locale.month_number(value.lower())
+
+ elif token in ["MM", "M"]:
+ parts["month"] = int(value)
+
+ elif token in ["DDDD", "DDD"]:
+ parts["day_of_year"] = int(value)
+
+ elif token in ["DD", "D"]:
+ parts["day"] = int(value)
+
+ elif token in ["Do"]:
+ parts["day"] = int(value)
+
+ elif token.upper() in ["HH", "H"]:
+ parts["hour"] = int(value)
+
+ elif token in ["mm", "m"]:
+ parts["minute"] = int(value)
+
+ elif token in ["ss", "s"]:
+ parts["second"] = int(value)
+
+ elif token == "S":
+ # We have the *most significant* digits of an arbitrary-precision integer.
+ # We want the six most significant digits as an integer, rounded.
+ # IDEA: add nanosecond support somehow? Need datetime support for it first.
+ value = value.ljust(7, str("0"))
+
+ # floating-point (IEEE-754) defaults to half-to-even rounding
+ seventh_digit = int(value[6])
+ if seventh_digit == 5:
+ rounding = int(value[5]) % 2
+ elif seventh_digit > 5:
+ rounding = 1
+ else:
+ rounding = 0
+
+ parts["microsecond"] = int(value[:6]) + rounding
+
+ elif token == "X":
+ parts["timestamp"] = float(value)
+
+ elif token == "x":
+ parts["expanded_timestamp"] = int(value)
+
+ elif token in ["ZZZ", "ZZ", "Z"]:
+ parts["tzinfo"] = TzinfoParser.parse(value)
+
+ elif token in ["a", "A"]:
+ if value in (self.locale.meridians["am"], self.locale.meridians["AM"]):
+ parts["am_pm"] = "am"
+ elif value in (self.locale.meridians["pm"], self.locale.meridians["PM"]):
+ parts["am_pm"] = "pm"
+
+ elif token == "W":
+ parts["weekdate"] = value
+
+ @staticmethod
+ def _build_datetime(parts):
+
+ weekdate = parts.get("weekdate")
+
+ if weekdate is not None:
+ # we can use strptime (%G, %V, %u) in python 3.6 but these tokens aren't available before that
+ year, week = int(weekdate[0]), int(weekdate[1])
+
+ if weekdate[2] is not None:
+ day = int(weekdate[2])
+ else:
+ # day not given, default to 1
+ day = 1
+
+ dt = iso_to_gregorian(year, week, day)
+ parts["year"] = dt.year
+ parts["month"] = dt.month
+ parts["day"] = dt.day
+
+ timestamp = parts.get("timestamp")
+
+ if timestamp is not None:
+ return datetime.fromtimestamp(timestamp, tz=tz.tzutc())
+
+ expanded_timestamp = parts.get("expanded_timestamp")
+
+ if expanded_timestamp is not None:
+ return datetime.fromtimestamp(
+ normalize_timestamp(expanded_timestamp), tz=tz.tzutc(),
+ )
+
+ day_of_year = parts.get("day_of_year")
+
+ if day_of_year is not None:
+ year = parts.get("year")
+ month = parts.get("month")
+ if year is None:
+ raise ParserError(
+ "Year component is required with the DDD and DDDD tokens."
+ )
+
+ if month is not None:
+ raise ParserError(
+ "Month component is not allowed with the DDD and DDDD tokens."
+ )
+
+ date_string = "{}-{}".format(year, day_of_year)
+ try:
+ dt = datetime.strptime(date_string, "%Y-%j")
+ except ValueError:
+ raise ParserError(
+ "The provided day of year '{}' is invalid.".format(day_of_year)
+ )
+
+ parts["year"] = dt.year
+ parts["month"] = dt.month
+ parts["day"] = dt.day
+
+ am_pm = parts.get("am_pm")
+ hour = parts.get("hour", 0)
+
+ if am_pm == "pm" and hour < 12:
+ hour += 12
+ elif am_pm == "am" and hour == 12:
+ hour = 0
+
+ # Support for midnight at the end of day
+ if hour == 24:
+ if parts.get("minute", 0) != 0:
+ raise ParserError("Midnight at the end of day must not contain minutes")
+ if parts.get("second", 0) != 0:
+ raise ParserError("Midnight at the end of day must not contain seconds")
+ if parts.get("microsecond", 0) != 0:
+ raise ParserError(
+ "Midnight at the end of day must not contain microseconds"
+ )
+ hour = 0
+ day_increment = 1
+ else:
+ day_increment = 0
+
+ # account for rounding up to 1000000
+ microsecond = parts.get("microsecond", 0)
+ if microsecond == 1000000:
+ microsecond = 0
+ second_increment = 1
+ else:
+ second_increment = 0
+
+ increment = timedelta(days=day_increment, seconds=second_increment)
+
+ return (
+ datetime(
+ year=parts.get("year", 1),
+ month=parts.get("month", 1),
+ day=parts.get("day", 1),
+ hour=hour,
+ minute=parts.get("minute", 0),
+ second=parts.get("second", 0),
+ microsecond=microsecond,
+ tzinfo=parts.get("tzinfo"),
+ )
+ + increment
+ )
+
+ def _parse_multiformat(self, string, formats):
+
+ _datetime = None
+
+ for fmt in formats:
+ try:
+ _datetime = self.parse(string, fmt)
+ break
+ except ParserMatchError:
+ pass
+
+ if _datetime is None:
+ raise ParserError(
+ "Could not match input '{}' to any of the following formats: {}".format(
+ string, ", ".join(formats)
+ )
+ )
+
+ return _datetime
+
+ # generates a capture group of choices separated by an OR operator
+ @staticmethod
+ def _generate_choice_re(choices, flags=0):
+ return re.compile(r"({})".format("|".join(choices)), flags=flags)
+
+
+class TzinfoParser(object):
+ _TZINFO_RE = re.compile(r"^([\+\-])?(\d{2})(?:\:?(\d{2}))?$")
+
+ @classmethod
+ def parse(cls, tzinfo_string):
+
+ tzinfo = None
+
+ if tzinfo_string == "local":
+ tzinfo = tz.tzlocal()
+
+ elif tzinfo_string in ["utc", "UTC", "Z"]:
+ tzinfo = tz.tzutc()
+
+ else:
+
+ iso_match = cls._TZINFO_RE.match(tzinfo_string)
+
+ if iso_match:
+ sign, hours, minutes = iso_match.groups()
+ if minutes is None:
+ minutes = 0
+ seconds = int(hours) * 3600 + int(minutes) * 60
+
+ if sign == "-":
+ seconds *= -1
+
+ tzinfo = tz.tzoffset(None, seconds)
+
+ else:
+ tzinfo = tz.gettz(tzinfo_string)
+
+ if tzinfo is None:
+ raise ParserError(
+ 'Could not parse timezone expression "{}"'.format(tzinfo_string)
+ )
+
+ return tzinfo
diff --git a/script.module.slyguy/resources/modules/arrow/util.py b/script.module.slyguy/resources/modules/arrow/util.py
new file mode 100644
index 00000000..785da867
--- /dev/null
+++ b/script.module.slyguy/resources/modules/arrow/util.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+import datetime
+import numbers
+
+from arrow.constants import MAX_TIMESTAMP, MAX_TIMESTAMP_MS, MAX_TIMESTAMP_US
+
+
+def total_seconds(td):
+ """Get total seconds for timedelta."""
+ return td.total_seconds()
+
+
+def is_timestamp(value):
+ """Check if value is a valid timestamp."""
+ if isinstance(value, bool):
+ return False
+ if not (
+ isinstance(value, numbers.Integral)
+ or isinstance(value, float)
+ or isinstance(value, str)
+ ):
+ return False
+ try:
+ float(value)
+ return True
+ except ValueError:
+ return False
+
+
+def normalize_timestamp(timestamp):
+ """Normalize millisecond and microsecond timestamps into normal timestamps."""
+ if timestamp > MAX_TIMESTAMP:
+ if timestamp < MAX_TIMESTAMP_MS:
+ timestamp /= 1e3
+ elif timestamp < MAX_TIMESTAMP_US:
+ timestamp /= 1e6
+ else:
+ raise ValueError(
+ "The specified timestamp '{}' is too large.".format(timestamp)
+ )
+ return timestamp
+
+
+# Credit to https://stackoverflow.com/a/1700069
+def iso_to_gregorian(iso_year, iso_week, iso_day):
+ """Converts an ISO week date tuple into a datetime object."""
+
+ if not 1 <= iso_week <= 53:
+ raise ValueError("ISO Calendar week value must be between 1-53.")
+
+ if not 1 <= iso_day <= 7:
+ raise ValueError("ISO Calendar day value must be between 1-7")
+
+ # The first week of the year always contains 4 Jan.
+ fourth_jan = datetime.date(iso_year, 1, 4)
+ delta = datetime.timedelta(fourth_jan.isoweekday() - 1)
+ year_start = fourth_jan - delta
+ gregorian = year_start + datetime.timedelta(days=iso_day - 1, weeks=iso_week - 1)
+
+ return gregorian
+
+
+# Python 2.7 / 3.0+ definitions for isstr function.
+
+try: # pragma: no cover
+ basestring
+
+ def isstr(s):
+ return isinstance(s, basestring) # noqa: F821
+
+
+except NameError: # pragma: no cover
+
+ def isstr(s):
+ return isinstance(s, str)
+
+
+__all__ = ["total_seconds", "is_timestamp", "isstr", "iso_to_gregorian"]
diff --git a/script.module.slyguy/resources/modules/bs4/__init__.py b/script.module.slyguy/resources/modules/bs4/__init__.py
new file mode 100644
index 00000000..eb068730
--- /dev/null
+++ b/script.module.slyguy/resources/modules/bs4/__init__.py
@@ -0,0 +1,465 @@
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup uses a pluggable XML or HTML parser to parse a
+(possibly invalid) document into a tree representation. Beautiful Soup
+provides provides methods and Pythonic idioms that make it easy to
+navigate, search, and modify the parse tree.
+
+Beautiful Soup works with Python 2.6 and up. It works better if lxml
+and/or html5lib is installed.
+
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/bs4/doc/
+"""
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "4.4.1"
+__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
+__license__ = "MIT"
+
+__all__ = ['BeautifulSoup']
+
+import os
+import re
+import six
+import warnings
+
+from .builder import builder_registry, ParserRejectedMarkup
+from .dammit import UnicodeDammit
+from .element import (
+ CData,
+ Comment,
+ DEFAULT_OUTPUT_ENCODING,
+ Declaration,
+ Doctype,
+ NavigableString,
+ PageElement,
+ ProcessingInstruction,
+ ResultSet,
+ SoupStrainer,
+ Tag,
+ )
+
+class BeautifulSoup(Tag):
+ """
+ This class defines the basic interface called by the tree builders.
+
+ These methods will be called by the parser:
+ reset()
+ feed(markup)
+
+ The tree builder may call these methods from its feed() implementation:
+ handle_starttag(name, attrs) # See note about return value
+ handle_endtag(name)
+ handle_data(data) # Appends to the current data node
+ endData(containerClass=NavigableString) # Ends the current data node
+
+ No matter how complicated the underlying parser is, you should be
+ able to build a tree using 'start tag' events, 'end tag' events,
+ 'data' events, and "done with data" events.
+
+ If you encounter an empty-element tag (aka a self-closing tag,
+ like HTML's
tag), call handle_starttag and then
+ handle_endtag.
+ """
+ ROOT_TAG_NAME = u'[document]'
+
+ # If the end-user gives no indication which tree builder they
+ # want, look for one with these features.
+ DEFAULT_BUILDER_FEATURES = ['html', 'fast']
+
+ ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
+
+ NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+
+ def __init__(self, markup="", features=None, builder=None,
+ parse_only=None, from_encoding=None, exclude_encodings=None,
+ **kwargs):
+ """The Soup object is initialized as the 'root tag', and the
+ provided markup (which can be a string or a file-like object)
+ is fed into the underlying parser."""
+
+ if 'convertEntities' in kwargs:
+ warnings.warn(
+ "BS4 does not respect the convertEntities argument to the "
+ "BeautifulSoup constructor. Entities are always converted "
+ "to Unicode characters.")
+
+ if 'markupMassage' in kwargs:
+ del kwargs['markupMassage']
+ warnings.warn(
+ "BS4 does not respect the markupMassage argument to the "
+ "BeautifulSoup constructor. The tree builder is responsible "
+ "for any necessary markup massage.")
+
+ if 'smartQuotesTo' in kwargs:
+ del kwargs['smartQuotesTo']
+ warnings.warn(
+ "BS4 does not respect the smartQuotesTo argument to the "
+ "BeautifulSoup constructor. Smart quotes are always converted "
+ "to Unicode characters.")
+
+ if 'selfClosingTags' in kwargs:
+ del kwargs['selfClosingTags']
+ warnings.warn(
+ "BS4 does not respect the selfClosingTags argument to the "
+ "BeautifulSoup constructor. The tree builder is responsible "
+ "for understanding self-closing tags.")
+
+ if 'isHTML' in kwargs:
+ del kwargs['isHTML']
+ warnings.warn(
+ "BS4 does not respect the isHTML argument to the "
+ "BeautifulSoup constructor. Suggest you use "
+ "features='lxml' for HTML and features='lxml-xml' for "
+ "XML.")
+
+ def deprecated_argument(old_name, new_name):
+ if old_name in kwargs:
+ warnings.warn(
+ 'The "%s" argument to the BeautifulSoup constructor '
+ 'has been renamed to "%s."' % (old_name, new_name))
+ value = kwargs[old_name]
+ del kwargs[old_name]
+ return value
+ return None
+
+ parse_only = parse_only or deprecated_argument(
+ "parseOnlyThese", "parse_only")
+
+ from_encoding = from_encoding or deprecated_argument(
+ "fromEncoding", "from_encoding")
+
+ if len(kwargs) > 0:
+ arg = list(kwargs.keys()).pop()
+ raise TypeError(
+ "__init__() got an unexpected keyword argument '%s'" % arg)
+
+ if builder is None:
+ original_features = features
+ if isinstance(features, six.string_types):
+ features = [features]
+ if features is None or len(features) == 0:
+ features = self.DEFAULT_BUILDER_FEATURES
+ builder_class = builder_registry.lookup(*features)
+ if builder_class is None:
+ raise FeatureNotFound(
+ "Couldn't find a tree builder with the features you "
+ "requested: %s. Do you need to install a parser library?"
+ % ",".join(features))
+ builder = builder_class()
+ if not (original_features == builder.NAME or
+ original_features in builder.ALTERNATE_NAMES):
+ if builder.is_xml:
+ markup_type = "XML"
+ else:
+ markup_type = "HTML"
+ warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+ parser=builder.NAME,
+ markup_type=markup_type))
+
+ self.builder = builder
+ self.is_xml = builder.is_xml
+ self.builder.soup = self
+
+ self.parse_only = parse_only
+
+ if hasattr(markup, 'read'): # It's a file-type object.
+ markup = markup.read()
+ elif len(markup) <= 256:
+ # Print out warnings for a couple beginner problems
+ # involving passing non-markup to Beautiful Soup.
+ # Beautiful Soup will still parse the input as markup,
+ # just in case that's what the user really wants.
+ if (isinstance(markup, six.text_type)
+ and not os.path.supports_unicode_filenames):
+ possible_filename = markup.encode("utf8")
+ else:
+ possible_filename = markup
+ is_file = False
+ try:
+ is_file = os.path.exists(possible_filename)
+ except Exception as e:
+ # This is almost certainly a problem involving
+ # characters not valid in filenames on this
+ # system. Just let it go.
+ pass
+ if is_file:
+ if isinstance(markup, six.text_type):
+ markup = markup.encode("utf8")
+ warnings.warn(
+ '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
+ if markup[:5] == "http:" or markup[:6] == "https:":
+ # TODO: This is ugly but I couldn't get it to work in
+ # Python 3 otherwise.
+ if ((isinstance(markup, bytes) and not b' ' in markup)
+ or (isinstance(markup, six.text_type) and not u' ' in markup)):
+ if isinstance(markup, six.text_type):
+ markup = markup.encode("utf8")
+ warnings.warn(
+ '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
+
+ for (self.markup, self.original_encoding, self.declared_html_encoding,
+ self.contains_replacement_characters) in (
+ self.builder.prepare_markup(
+ markup, from_encoding, exclude_encodings=exclude_encodings)):
+ self.reset()
+ try:
+ self._feed()
+ break
+ except ParserRejectedMarkup:
+ pass
+
+ # Clear out the markup and remove the builder's circular
+ # reference to this object.
+ self.markup = None
+ self.builder.soup = None
+
+ def __copy__(self):
+ return type(self)(self.encode(), builder=self.builder)
+
+ def __getstate__(self):
+ # Frequently a tree builder can't be pickled.
+ d = dict(self.__dict__)
+ if 'builder' in d and not self.builder.picklable:
+ del d['builder']
+ return d
+
+ def _feed(self):
+ # Convert the document to Unicode.
+ self.builder.reset()
+
+ self.builder.feed(self.markup)
+ # Close out any unfinished strings and close all the open tags.
+ self.endData()
+ while self.currentTag.name != self.ROOT_TAG_NAME:
+ self.popTag()
+
+ def reset(self):
+ Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
+ self.hidden = 1
+ self.builder.reset()
+ self.current_data = []
+ self.currentTag = None
+ self.tagStack = []
+ self.preserve_whitespace_tag_stack = []
+ self.pushTag(self)
+
+ def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
+ """Create a new tag associated with this soup."""
+ return Tag(None, self.builder, name, namespace, nsprefix, attrs)
+
+ def new_string(self, s, subclass=NavigableString):
+ """Create a new NavigableString associated with this soup."""
+ return subclass(s)
+
+ def insert_before(self, successor):
+ raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
+
+ def insert_after(self, successor):
+ raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
+
+ def popTag(self):
+ tag = self.tagStack.pop()
+ if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
+ self.preserve_whitespace_tag_stack.pop()
+ #print "Pop", tag.name
+ if self.tagStack:
+ self.currentTag = self.tagStack[-1]
+ return self.currentTag
+
+ def pushTag(self, tag):
+ #print "Push", tag.name
+ if self.currentTag:
+ self.currentTag.contents.append(tag)
+ self.tagStack.append(tag)
+ self.currentTag = self.tagStack[-1]
+ if tag.name in self.builder.preserve_whitespace_tags:
+ self.preserve_whitespace_tag_stack.append(tag)
+
+ def endData(self, containerClass=NavigableString):
+ if self.current_data:
+ current_data = u''.join(self.current_data)
+ # If whitespace is not preserved, and this string contains
+ # nothing but ASCII spaces, replace it with a single space
+ # or newline.
+ if not self.preserve_whitespace_tag_stack:
+ strippable = True
+ for i in current_data:
+ if i not in self.ASCII_SPACES:
+ strippable = False
+ break
+ if strippable:
+ if '\n' in current_data:
+ current_data = '\n'
+ else:
+ current_data = ' '
+
+ # Reset the data collector.
+ self.current_data = []
+
+ # Should we add this string to the tree at all?
+ if self.parse_only and len(self.tagStack) <= 1 and \
+ (not self.parse_only.text or \
+ not self.parse_only.search(current_data)):
+ return
+
+ o = containerClass(current_data)
+ self.object_was_parsed(o)
+
+ def object_was_parsed(self, o, parent=None, most_recent_element=None):
+ """Add an object to the parse tree."""
+ parent = parent or self.currentTag
+ previous_element = most_recent_element or self._most_recent_element
+
+ next_element = previous_sibling = next_sibling = None
+ if isinstance(o, Tag):
+ next_element = o.next_element
+ next_sibling = o.next_sibling
+ previous_sibling = o.previous_sibling
+ if not previous_element:
+ previous_element = o.previous_element
+
+ o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
+
+ self._most_recent_element = o
+ parent.contents.append(o)
+
+ if parent.next_sibling:
+ # This node is being inserted into an element that has
+ # already been parsed. Deal with any dangling references.
+ index = parent.contents.index(o)
+ if index == 0:
+ previous_element = parent
+ previous_sibling = None
+ else:
+ previous_element = previous_sibling = parent.contents[index-1]
+ if index == len(parent.contents)-1:
+ next_element = parent.next_sibling
+ next_sibling = None
+ else:
+ next_element = next_sibling = parent.contents[index+1]
+
+ o.previous_element = previous_element
+ if previous_element:
+ previous_element.next_element = o
+ o.next_element = next_element
+ if next_element:
+ next_element.previous_element = o
+ o.next_sibling = next_sibling
+ if next_sibling:
+ next_sibling.previous_sibling = o
+ o.previous_sibling = previous_sibling
+ if previous_sibling:
+ previous_sibling.next_sibling = o
+
+ def _popToTag(self, name, nsprefix=None, inclusivePop=True):
+ """Pops the tag stack up to and including the most recent
+ instance of the given tag. If inclusivePop is false, pops the tag
+ stack up to but *not* including the most recent instqance of
+ the given tag."""
+ #print "Popping to %s" % name
+ if name == self.ROOT_TAG_NAME:
+ # The BeautifulSoup object itself can never be popped.
+ return
+
+ most_recently_popped = None
+
+ stack_size = len(self.tagStack)
+ for i in range(stack_size - 1, 0, -1):
+ t = self.tagStack[i]
+ if (name == t.name and nsprefix == t.prefix):
+ if inclusivePop:
+ most_recently_popped = self.popTag()
+ break
+ most_recently_popped = self.popTag()
+
+ return most_recently_popped
+
+ def handle_starttag(self, name, namespace, nsprefix, attrs):
+ """Push a start tag on to the stack.
+
+ If this method returns None, the tag was rejected by the
+ SoupStrainer. You should proceed as if the tag had not occured
+ in the document. For instance, if this was a self-closing tag,
+ don't call handle_endtag.
+ """
+
+ # print "Start tag %s: %s" % (name, attrs)
+ self.endData()
+
+ if (self.parse_only and len(self.tagStack) <= 1
+ and (self.parse_only.text
+ or not self.parse_only.search_tag(name, attrs))):
+ return None
+
+ tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
+ self.currentTag, self._most_recent_element)
+ if tag is None:
+ return tag
+ if self._most_recent_element:
+ self._most_recent_element.next_element = tag
+ self._most_recent_element = tag
+ self.pushTag(tag)
+ return tag
+
+ def handle_endtag(self, name, nsprefix=None):
+ #print "End tag: " + name
+ self.endData()
+ self._popToTag(name, nsprefix)
+
+ def handle_data(self, data):
+ self.current_data.append(data)
+
+ def decode(self, pretty_print=False,
+ eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+ formatter="minimal"):
+ """Returns a string or Unicode representation of this document.
+ To get Unicode, pass None for encoding."""
+
+ if self.is_xml:
+ # Print the XML declaration
+ encoding_part = ''
+ if eventual_encoding != None:
+ encoding_part = ' encoding="%s"' % eventual_encoding
+ prefix = u'\n' % encoding_part
+ else:
+ prefix = u''
+ if not pretty_print:
+ indent_level = None
+ else:
+ indent_level = 0
+ return prefix + super(BeautifulSoup, self).decode(
+ indent_level, eventual_encoding, formatter)
+
+# Alias to make it easier to type import: 'from bs4 import _soup'
+_s = BeautifulSoup
+_soup = BeautifulSoup
+
+class BeautifulStoneSoup(BeautifulSoup):
+ """Deprecated interface to an XML parser."""
+
+ def __init__(self, *args, **kwargs):
+ kwargs['features'] = 'xml'
+ warnings.warn(
+ 'The BeautifulStoneSoup class is deprecated. Instead of using '
+ 'it, pass features="xml" into the BeautifulSoup constructor.')
+ super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
+
+
+class StopParsing(Exception):
+ pass
+
+class FeatureNotFound(ValueError):
+ pass
+
+
+#By default, act as an HTML pretty-printer.
+if __name__ == '__main__':
+ import sys
+ soup = BeautifulSoup(sys.stdin)
+ print(soup.prettify())
diff --git a/script.module.slyguy/resources/modules/bs4/builder/__init__.py b/script.module.slyguy/resources/modules/bs4/builder/__init__.py
new file mode 100644
index 00000000..b6e0ffbb
--- /dev/null
+++ b/script.module.slyguy/resources/modules/bs4/builder/__init__.py
@@ -0,0 +1,313 @@
+from collections import defaultdict
+import itertools
+import sys
+import six
+from bs4.element import (
+ CharsetMetaAttributeValue,
+ ContentMetaAttributeValue,
+ whitespace_re
+ )
+
+__all__ = [
+ 'HTMLTreeBuilder',
+ 'SAXTreeBuilder',
+ 'TreeBuilder',
+ 'TreeBuilderRegistry',
+ ]
+
+# Some useful features for a TreeBuilder to have.
+FAST = 'fast'
+PERMISSIVE = 'permissive'
+STRICT = 'strict'
+XML = 'xml'
+HTML = 'html'
+HTML_5 = 'html5'
+
+
+class TreeBuilderRegistry(object):
+
+ def __init__(self):
+ self.builders_for_feature = defaultdict(list)
+ self.builders = []
+
+ def register(self, treebuilder_class):
+ """Register a treebuilder based on its advertised features."""
+ for feature in treebuilder_class.features:
+ self.builders_for_feature[feature].insert(0, treebuilder_class)
+ self.builders.insert(0, treebuilder_class)
+
+ def lookup(self, *features):
+ if len(self.builders) == 0:
+ # There are no builders at all.
+ return None
+
+ if len(features) == 0:
+ # They didn't ask for any features. Give them the most
+ # recently registered builder.
+ return self.builders[0]
+
+ # Go down the list of features in order, and eliminate any builders
+ # that don't match every feature.
+ features = list(features)
+ features.reverse()
+ candidates = None
+ candidate_set = None
+ while len(features) > 0:
+ feature = features.pop()
+ we_have_the_feature = self.builders_for_feature.get(feature, [])
+ if len(we_have_the_feature) > 0:
+ if candidates is None:
+ candidates = we_have_the_feature
+ candidate_set = set(candidates)
+ else:
+ # Eliminate any candidates that don't have this feature.
+ candidate_set = candidate_set.intersection(
+ set(we_have_the_feature))
+
+ # The only valid candidates are the ones in candidate_set.
+ # Go through the original list of candidates and pick the first one
+ # that's in candidate_set.
+ if candidate_set is None:
+ return None
+ for candidate in candidates:
+ if candidate in candidate_set:
+ return candidate
+ return None
+
+# The BeautifulSoup class will take feature lists from developers and use them
+# to look up builders in this registry.
+builder_registry = TreeBuilderRegistry()
+
+class TreeBuilder(object):
+ """Turn a document into a Beautiful Soup object tree."""
+
+ NAME = "[Unknown tree builder]"
+ ALTERNATE_NAMES = []
+ features = []
+
+ is_xml = False
+ picklable = False
+ preserve_whitespace_tags = set()
+ empty_element_tags = None # A tag will be considered an empty-element
+ # tag when and only when it has no contents.
+
+ # A value for these tag/attribute combinations is a space- or
+ # comma-separated list of CDATA, rather than a single CDATA.
+ cdata_list_attributes = {}
+
+
+ def __init__(self):
+ self.soup = None
+
+ def reset(self):
+ pass
+
+ def can_be_empty_element(self, tag_name):
+ """Might a tag with this name be an empty-element tag?
+
+ The final markup may or may not actually present this tag as
+ self-closing.
+
+ For instance: an HTMLBuilder does not consider a tag to be
+ an empty-element tag (it's not in
+ HTMLBuilder.empty_element_tags). This means an empty
tag
+ will be presented as "
", not "".
+
+ The default implementation has no opinion about which tags are
+ empty-element tags, so a tag will be presented as an
+ empty-element tag if and only if it has no contents.
+ "" will become "", and "bar" will
+ be left alone.
+ """
+ if self.empty_element_tags is None:
+ return True
+ return tag_name in self.empty_element_tags
+
+ def feed(self, markup):
+ raise NotImplementedError()
+
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None):
+ return markup, None, None, False
+
+ def test_fragment_to_document(self, fragment):
+ """Wrap an HTML fragment to make it look like a document.
+
+ Different parsers do this differently. For instance, lxml
+ introduces an empty tag, and html5lib
+ doesn't. Abstracting this away lets us write simple tests
+ which run HTML fragments through the parser and compare the
+ results against other HTML fragments.
+
+ This method should not be used outside of tests.
+ """
+ return fragment
+
+ def set_up_substitutions(self, tag):
+ return False
+
+ def _replace_cdata_list_attribute_values(self, tag_name, attrs):
+ """Replaces class="foo bar" with class=["foo", "bar"]
+
+ Modifies its input in place.
+ """
+ if not attrs:
+ return attrs
+ if self.cdata_list_attributes:
+ universal = self.cdata_list_attributes.get('*', [])
+ tag_specific = self.cdata_list_attributes.get(
+ tag_name.lower(), None)
+ for attr in list(attrs.keys()):
+ if attr in universal or (tag_specific and attr in tag_specific):
+ # We have a "class"-type attribute whose string
+ # value is a whitespace-separated list of
+ # values. Split it into a list.
+ value = attrs[attr]
+ if isinstance(value, six.string_types):
+ values = whitespace_re.split(value)
+ else:
+ # html5lib sometimes calls setAttributes twice
+ # for the same tag when rearranging the parse
+ # tree. On the second call the attribute value
+ # here is already a list. If this happens,
+ # leave the value alone rather than trying to
+ # split it again.
+ values = value
+ attrs[attr] = values
+ return attrs
+
+class SAXTreeBuilder(TreeBuilder):
+ """A Beautiful Soup treebuilder that listens for SAX events."""
+
+ def feed(self, markup):
+ raise NotImplementedError()
+
+ def close(self):
+ pass
+
+ def startElement(self, name, attrs):
+ attrs = dict((key[1], value) for key, value in list(attrs.items()))
+ #print "Start %s, %r" % (name, attrs)
+ self.soup.handle_starttag(name, attrs)
+
+ def endElement(self, name):
+ #print "End %s" % name
+ self.soup.handle_endtag(name)
+
+ def startElementNS(self, nsTuple, nodeName, attrs):
+ # Throw away (ns, nodeName) for now.
+ self.startElement(nodeName, attrs)
+
+ def endElementNS(self, nsTuple, nodeName):
+ # Throw away (ns, nodeName) for now.
+ self.endElement(nodeName)
+ #handler.endElementNS((ns, node.nodeName), node.nodeName)
+
+ def startPrefixMapping(self, prefix, nodeValue):
+ # Ignore the prefix for now.
+ pass
+
+ def endPrefixMapping(self, prefix):
+ # Ignore the prefix for now.
+ # handler.endPrefixMapping(prefix)
+ pass
+
+ def characters(self, content):
+ self.soup.handle_data(content)
+
+ def startDocument(self):
+ pass
+
+ def endDocument(self):
+ pass
+
+
+class HTMLTreeBuilder(TreeBuilder):
+ """This TreeBuilder knows facts about HTML.
+
+ Such as which tags are empty-element tags.
+ """
+
+ preserve_whitespace_tags = set(['pre', 'textarea'])
+ empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
+ 'spacer', 'link', 'frame', 'base'])
+
+ # The HTML standard defines these attributes as containing a
+ # space-separated list of values, not a single value. That is,
+ # class="foo bar" means that the 'class' attribute has two values,
+ # 'foo' and 'bar', not the single value 'foo bar'. When we
+ # encounter one of these attributes, we will parse its value into
+ # a list of values if possible. Upon output, the list will be
+ # converted back into a string.
+ cdata_list_attributes = {
+ "*" : ['class', 'accesskey', 'dropzone'],
+ "a" : ['rel', 'rev'],
+ "link" : ['rel', 'rev'],
+ "td" : ["headers"],
+ "th" : ["headers"],
+ "td" : ["headers"],
+ "form" : ["accept-charset"],
+ "object" : ["archive"],
+
+ # These are HTML5 specific, as are *.accesskey and *.dropzone above.
+ "area" : ["rel"],
+ "icon" : ["sizes"],
+ "iframe" : ["sandbox"],
+ "output" : ["for"],
+ }
+
+ def set_up_substitutions(self, tag):
+ # We are only interested in tags
+ if tag.name != 'meta':
+ return False
+
+ http_equiv = tag.get('http-equiv')
+ content = tag.get('content')
+ charset = tag.get('charset')
+
+ # We are interested in tags that say what encoding the
+ # document was originally in. This means HTML 5-style
+ # tags that provide the "charset" attribute. It also means
+ # HTML 4-style tags that provide the "content"
+ # attribute and have "http-equiv" set to "content-type".
+ #
+ # In both cases we will replace the value of the appropriate
+ # attribute with a standin object that can take on any
+ # encoding.
+ meta_encoding = None
+ if charset is not None:
+ # HTML 5 style:
+ #
+ meta_encoding = charset
+ tag['charset'] = CharsetMetaAttributeValue(charset)
+
+ elif (content is not None and http_equiv is not None
+ and http_equiv.lower() == 'content-type'):
+ # HTML 4 style:
+ #
+ tag['content'] = ContentMetaAttributeValue(content)
+
+ return (meta_encoding is not None)
+
+def register_treebuilders_from(module):
+ """Copy TreeBuilders from the given module into this module."""
+ # I'm fairly sure this is not the best way to do this.
+ this_module = sys.modules['bs4.builder']
+ for name in module.__all__:
+ obj = getattr(module, name)
+
+ if issubclass(obj, TreeBuilder):
+ setattr(this_module, name, obj)
+ this_module.__all__.append(name)
+ # Register the builder while we're at it.
+ this_module.builder_registry.register(obj)
+
+class ParserRejectedMarkup(Exception):
+ pass
+
+# Builders are registered in reverse order of priority, so that custom
+# builder registrations will take precedence. In general, we want lxml
+# to take precedence over html5lib, because it's faster. And we only
+# want to use HTMLParser as a last result.
+from . import _htmlparser
+register_treebuilders_from(_htmlparser)
\ No newline at end of file
diff --git a/script.module.slyguy/resources/modules/bs4/builder/_htmlparser.py b/script.module.slyguy/resources/modules/bs4/builder/_htmlparser.py
new file mode 100644
index 00000000..cb87ebae
--- /dev/null
+++ b/script.module.slyguy/resources/modules/bs4/builder/_htmlparser.py
@@ -0,0 +1,263 @@
+"""Use the HTMLParser library to parse HTML files that aren't too bad."""
+
+__all__ = [
+ 'HTMLParserTreeBuilder',
+ ]
+
+from six.moves.html_parser import HTMLParser
+
+try:
+ from HTMLParser import HTMLParseError
+except ImportError as e:
+ # HTMLParseError is removed in Python 3.5. Since it can never be
+ # thrown in 3.5, we can just define our own class as a placeholder.
+ class HTMLParseError(Exception):
+ pass
+
+import sys
+import six
+import warnings
+
+# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
+# argument, which we'd like to set to False. Unfortunately,
+# http://bugs.python.org/issue13273 makes strict=True a better bet
+# before Python 3.2.3.
+#
+# At the end of this file, we monkeypatch HTMLParser so that
+# strict=True works well on Python 3.2.2.
+major, minor, release = sys.version_info[:3]
+CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
+CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
+CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
+
+
+from bs4.element import (
+ CData,
+ Comment,
+ Declaration,
+ Doctype,
+ ProcessingInstruction,
+ )
+from bs4.dammit import EntitySubstitution, UnicodeDammit
+
+from bs4.builder import (
+ HTML,
+ HTMLTreeBuilder,
+ STRICT,
+ )
+
+
+HTMLPARSER = 'html.parser'
+
+class BeautifulSoupHTMLParser(HTMLParser):
+ def handle_starttag(self, name, attrs):
+ # XXX namespace
+ attr_dict = {}
+ for key, value in attrs:
+ # Change None attribute values to the empty string
+ # for consistency with the other tree builders.
+ if value is None:
+ value = ''
+ attr_dict[key] = value
+ attrvalue = '""'
+ self.soup.handle_starttag(name, None, None, attr_dict)
+
+ def handle_endtag(self, name):
+ self.soup.handle_endtag(name)
+
+ def handle_data(self, data):
+ self.soup.handle_data(data)
+
+ def handle_charref(self, name):
+ # XXX workaround for a bug in HTMLParser. Remove this once
+ # it's fixed in all supported versions.
+ # http://bugs.python.org/issue13633
+ if name.startswith('x'):
+ real_name = int(name.lstrip('x'), 16)
+ elif name.startswith('X'):
+ real_name = int(name.lstrip('X'), 16)
+ else:
+ real_name = int(name)
+
+ try:
+ data = six.unichr(real_name)
+ except (ValueError, OverflowError) as e:
+ data = u"\N{REPLACEMENT CHARACTER}"
+
+ self.handle_data(data)
+
+ def handle_entityref(self, name):
+ character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
+ if character is not None:
+ data = character
+ else:
+ data = "&%s;" % name
+ self.handle_data(data)
+
+ def handle_comment(self, data):
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(Comment)
+
+ def handle_decl(self, data):
+ self.soup.endData()
+ if data.startswith("DOCTYPE "):
+ data = data[len("DOCTYPE "):]
+ elif data == 'DOCTYPE':
+ # i.e. ""
+ data = ''
+ self.soup.handle_data(data)
+ self.soup.endData(Doctype)
+
+ def unknown_decl(self, data):
+ if data.upper().startswith('CDATA['):
+ cls = CData
+ data = data[len('CDATA['):]
+ else:
+ cls = Declaration
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(cls)
+
+ def handle_pi(self, data):
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(ProcessingInstruction)
+
+
+class HTMLParserTreeBuilder(HTMLTreeBuilder):
+
+ is_xml = False
+ picklable = True
+ NAME = HTMLPARSER
+ features = [NAME, HTML, STRICT]
+
+ def __init__(self, *args, **kwargs):
+ if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
+ kwargs['strict'] = False
+ if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
+ kwargs['convert_charrefs'] = False
+ self.parser_args = (args, kwargs)
+
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None, exclude_encodings=None):
+ """
+ :return: A 4-tuple (markup, original encoding, encoding
+ declared within markup, whether any characters had to be
+ replaced with REPLACEMENT CHARACTER).
+ """
+ if isinstance(markup, six.text_type):
+ yield (markup, None, None, False)
+ return
+
+ try_encodings = [user_specified_encoding, document_declared_encoding]
+ dammit = UnicodeDammit(markup, try_encodings, is_html=True,
+ exclude_encodings=exclude_encodings)
+ yield (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding,
+ dammit.contains_replacement_characters)
+
+ def feed(self, markup):
+ args, kwargs = self.parser_args
+ parser = BeautifulSoupHTMLParser(*args, **kwargs)
+ parser.soup = self.soup
+ try:
+ parser.feed(markup)
+ except HTMLParseError as e:
+ warnings.warn(RuntimeWarning(
+ "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
+ raise e
+
+# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
+# 3.2.3 code. This ensures they don't treat markup like as a
+# string.
+#
+# XXX This code can be removed once most Python 3 users are on 3.2.3.
+if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
+ import re
+ attrfind_tolerant = re.compile(
+ r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
+ r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
+ HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
+
+ locatestarttagend = re.compile(r"""
+ <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
+ (?:\s+ # whitespace before attribute name
+ (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
+ (?:\s*=\s* # value indicator
+ (?:'[^']*' # LITA-enclosed value
+ |\"[^\"]*\" # LIT-enclosed value
+ |[^'\">\s]+ # bare value
+ )
+ )?
+ )
+ )*
+ \s* # trailing whitespace
+""", re.VERBOSE)
+ BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
+
+ from html.parser import tagfind, attrfind
+
+ def parse_starttag(self, i):
+ self.__starttag_text = None
+ endpos = self.check_for_whole_start_tag(i)
+ if endpos < 0:
+ return endpos
+ rawdata = self.rawdata
+ self.__starttag_text = rawdata[i:endpos]
+
+ # Now parse the data between i+1 and j into a tag and attrs
+ attrs = []
+ match = tagfind.match(rawdata, i+1)
+ assert match, 'unexpected call to parse_starttag()'
+ k = match.end()
+ self.lasttag = tag = rawdata[i+1:k].lower()
+ while k < endpos:
+ if self.strict:
+ m = attrfind.match(rawdata, k)
+ else:
+ m = attrfind_tolerant.match(rawdata, k)
+ if not m:
+ break
+ attrname, rest, attrvalue = m.group(1, 2, 3)
+ if not rest:
+ attrvalue = None
+ elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
+ attrvalue[:1] == '"' == attrvalue[-1:]:
+ attrvalue = attrvalue[1:-1]
+ if attrvalue:
+ attrvalue = self.unescape(attrvalue)
+ attrs.append((attrname.lower(), attrvalue))
+ k = m.end()
+
+ end = rawdata[k:endpos].strip()
+ if end not in (">", "/>"):
+ lineno, offset = self.getpos()
+ if "\n" in self.__starttag_text:
+ lineno = lineno + self.__starttag_text.count("\n")
+ offset = len(self.__starttag_text) \
+ - self.__starttag_text.rfind("\n")
+ else:
+ offset = offset + len(self.__starttag_text)
+ if self.strict:
+ self.error("junk characters in start tag: %r"
+ % (rawdata[k:endpos][:20],))
+ self.handle_data(rawdata[i:endpos])
+ return endpos
+ if end.endswith('/>'):
+ # XHTML-style empty tag:
+ self.handle_startendtag(tag, attrs)
+ else:
+ self.handle_starttag(tag, attrs)
+ if tag in self.CDATA_CONTENT_ELEMENTS:
+ self.set_cdata_mode(tag)
+ return endpos
+
+ def set_cdata_mode(self, elem):
+ self.cdata_elem = elem.lower()
+ self.interesting = re.compile(r'\s*%s\s*>' % self.cdata_elem, re.I)
+
+ BeautifulSoupHTMLParser.parse_starttag = parse_starttag
+ BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
+
+ CONSTRUCTOR_TAKES_STRICT = True
diff --git a/script.module.slyguy/resources/modules/bs4/dammit.py b/script.module.slyguy/resources/modules/bs4/dammit.py
new file mode 100644
index 00000000..63a51c22
--- /dev/null
+++ b/script.module.slyguy/resources/modules/bs4/dammit.py
@@ -0,0 +1,841 @@
+# -*- coding: utf-8 -*-
+"""Beautiful Soup bonus library: Unicode, Dammit
+
+This library converts a bytestream to Unicode through any means
+necessary. It is heavily based on code from Mark Pilgrim's Universal
+Feed Parser. It works best on XML and HTML, but it does not rewrite the
+XML or HTML to reflect a new encoding; that's the tree builder's job.
+"""
+__license__ = "MIT"
+
+from pdb import set_trace
+import codecs
+import six
+from six.moves.html_entities import codepoint2name
+import re
+import logging
+import string
+
+# Import a library to autodetect character encodings.
+chardet_type = None
+try:
+ # First try the fast C implementation.
+ # PyPI package: cchardet
+ import cchardet
+ def chardet_dammit(s):
+ return cchardet.detect(s)['encoding']
+except ImportError:
+ try:
+ # Fall back to the pure Python implementation
+ # Debian package: python-chardet
+ # PyPI package: chardet
+ import chardet
+ def chardet_dammit(s):
+ return chardet.detect(s)['encoding']
+ #import chardet.constants
+ #chardet.constants._debug = 1
+ except ImportError:
+ # No chardet available.
+ def chardet_dammit(s):
+ return None
+
+# Available from http://cjkpython.i18n.org/.
+try:
+ import iconv_codec
+except ImportError:
+ pass
+
+xml_encoding_re = re.compile(
+ '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+html_meta_re = re.compile(
+ '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+
+class EntitySubstitution(object):
+
+ """Substitute XML or HTML entities for the corresponding characters."""
+
+ def _populate_class_variables():
+ lookup = {}
+ reverse_lookup = {}
+ characters_for_re = []
+ for codepoint, name in list(codepoint2name.items()):
+ character = six.unichr(codepoint)
+ if codepoint != 34:
+ # There's no point in turning the quotation mark into
+ # ", unless it happens within an attribute value, which
+ # is handled elsewhere.
+ characters_for_re.append(character)
+ lookup[character] = name
+ # But we do want to turn " into the quotation mark.
+ reverse_lookup[name] = character
+ re_definition = "[%s]" % "".join(characters_for_re)
+ return lookup, reverse_lookup, re.compile(re_definition)
+ (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
+ CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
+
+ CHARACTER_TO_XML_ENTITY = {
+ "'": "apos",
+ '"': "quot",
+ "&": "amp",
+ "<": "lt",
+ ">": "gt",
+ }
+
+ BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ ")")
+
+ AMPERSAND_OR_BRACKET = re.compile("([<>&])")
+
+ @classmethod
+ def _substitute_html_entity(cls, matchobj):
+ entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
+ return "&%s;" % entity
+
+ @classmethod
+ def _substitute_xml_entity(cls, matchobj):
+ """Used with a regular expression to substitute the
+ appropriate XML entity for an XML special character."""
+ entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
+ return "&%s;" % entity
+
+ @classmethod
+ def quoted_attribute_value(self, value):
+ """Make a value into a quoted XML attribute, possibly escaping it.
+
+ Most strings will be quoted using double quotes.
+
+ Bob's Bar -> "Bob's Bar"
+
+ If a string contains double quotes, it will be quoted using
+ single quotes.
+
+ Welcome to "my bar" -> 'Welcome to "my bar"'
+
+ If a string contains both single and double quotes, the
+ double quotes will be escaped, and the string will be quoted
+ using double quotes.
+
+ Welcome to "Bob's Bar" -> "Welcome to "Bob's bar"
+ """
+ quote_with = '"'
+ if '"' in value:
+ if "'" in value:
+ # The string contains both single and double
+ # quotes. Turn the double quotes into
+ # entities. We quote the double quotes rather than
+ # the single quotes because the entity name is
+ # """ whether this is HTML or XML. If we
+ # quoted the single quotes, we'd have to decide
+ # between ' and &squot;.
+ replace_with = """
+ value = value.replace('"', replace_with)
+ else:
+ # There are double quotes but no single quotes.
+ # We can use single quotes to quote the attribute.
+ quote_with = "'"
+ return quote_with + value + quote_with
+
+ @classmethod
+ def substitute_xml(cls, value, make_quoted_attribute=False):
+ """Substitute XML entities for special XML characters.
+
+ :param value: A string to be substituted. The less-than sign
+ will become <, the greater-than sign will become >,
+ and any ampersands will become &. If you want ampersands
+ that appear to be part of an entity definition to be left
+ alone, use substitute_xml_containing_entities() instead.
+
+ :param make_quoted_attribute: If True, then the string will be
+ quoted, as befits an attribute value.
+ """
+ # Escape angle brackets and ampersands.
+ value = cls.AMPERSAND_OR_BRACKET.sub(
+ cls._substitute_xml_entity, value)
+
+ if make_quoted_attribute:
+ value = cls.quoted_attribute_value(value)
+ return value
+
+ @classmethod
+ def substitute_xml_containing_entities(
+ cls, value, make_quoted_attribute=False):
+ """Substitute XML entities for special XML characters.
+
+ :param value: A string to be substituted. The less-than sign will
+ become <, the greater-than sign will become >, and any
+ ampersands that are not part of an entity defition will
+ become &.
+
+ :param make_quoted_attribute: If True, then the string will be
+ quoted, as befits an attribute value.
+ """
+ # Escape angle brackets, and ampersands that aren't part of
+ # entities.
+ value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
+ cls._substitute_xml_entity, value)
+
+ if make_quoted_attribute:
+ value = cls.quoted_attribute_value(value)
+ return value
+
+ @classmethod
+ def substitute_html(cls, s):
+ """Replace certain Unicode characters with named HTML entities.
+
+ This differs from data.encode(encoding, 'xmlcharrefreplace')
+ in that the goal is to make the result more readable (to those
+ with ASCII displays) rather than to recover from
+ errors. There's absolutely nothing wrong with a UTF-8 string
+ containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
+ character with "é" will make it more readable to some
+ people.
+ """
+ return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
+ cls._substitute_html_entity, s)
+
+
+class EncodingDetector:
+ """Suggests a number of possible encodings for a bytestring.
+
+ Order of precedence:
+
+ 1. Encodings you specifically tell EncodingDetector to try first
+ (the override_encodings argument to the constructor).
+
+ 2. An encoding declared within the bytestring itself, either in an
+ XML declaration (if the bytestring is to be interpreted as an XML
+ document), or in a tag (if the bytestring is to be
+ interpreted as an HTML document.)
+
+ 3. An encoding detected through textual analysis by chardet,
+ cchardet, or a similar external library.
+
+ 4. UTF-8.
+
+ 5. Windows-1252.
+ """
+ def __init__(self, markup, override_encodings=None, is_html=False,
+ exclude_encodings=None):
+ self.override_encodings = override_encodings or []
+ exclude_encodings = exclude_encodings or []
+ self.exclude_encodings = set([x.lower() for x in exclude_encodings])
+ self.chardet_encoding = None
+ self.is_html = is_html
+ self.declared_encoding = None
+
+ # First order of business: strip a byte-order mark.
+ self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
+
+ def _usable(self, encoding, tried):
+ if encoding is not None:
+ encoding = encoding.lower()
+ if encoding in self.exclude_encodings:
+ return False
+ if encoding not in tried:
+ tried.add(encoding)
+ return True
+ return False
+
+ @property
+ def encodings(self):
+ """Yield a number of encodings that might work for this markup."""
+ tried = set()
+ for e in self.override_encodings:
+ if self._usable(e, tried):
+ yield e
+
+ # Did the document originally start with a byte-order mark
+ # that indicated its encoding?
+ if self._usable(self.sniffed_encoding, tried):
+ yield self.sniffed_encoding
+
+ # Look within the document for an XML or HTML encoding
+ # declaration.
+ if self.declared_encoding is None:
+ self.declared_encoding = self.find_declared_encoding(
+ self.markup, self.is_html)
+ if self._usable(self.declared_encoding, tried):
+ yield self.declared_encoding
+
+ # Use third-party character set detection to guess at the
+ # encoding.
+ if self.chardet_encoding is None:
+ self.chardet_encoding = chardet_dammit(self.markup)
+ if self._usable(self.chardet_encoding, tried):
+ yield self.chardet_encoding
+
+ # As a last-ditch effort, try utf-8 and windows-1252.
+ for e in ('utf-8', 'windows-1252'):
+ if self._usable(e, tried):
+ yield e
+
+ @classmethod
+ def strip_byte_order_mark(cls, data):
+ """If a byte-order mark is present, strip it and return the encoding it implies."""
+ encoding = None
+ if isinstance(data, six.text_type):
+ # Unicode data cannot have a byte-order mark.
+ return data, encoding
+ if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
+ and (data[2:4] != '\x00\x00'):
+ encoding = 'utf-16be'
+ data = data[2:]
+ elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
+ and (data[2:4] != '\x00\x00'):
+ encoding = 'utf-16le'
+ data = data[2:]
+ elif data[:3] == b'\xef\xbb\xbf':
+ encoding = 'utf-8'
+ data = data[3:]
+ elif data[:4] == b'\x00\x00\xfe\xff':
+ encoding = 'utf-32be'
+ data = data[4:]
+ elif data[:4] == b'\xff\xfe\x00\x00':
+ encoding = 'utf-32le'
+ data = data[4:]
+ return data, encoding
+
+ @classmethod
+ def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
+ """Given a document, tries to find its declared encoding.
+
+ An XML encoding is declared at the beginning of the document.
+
+ An HTML encoding is declared in a tag, hopefully near the
+ beginning of the document.
+ """
+ if search_entire_document:
+ xml_endpos = html_endpos = len(markup)
+ else:
+ xml_endpos = 1024
+ html_endpos = max(2048, int(len(markup) * 0.05))
+
+ declared_encoding = None
+ declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+ if not declared_encoding_match and is_html:
+ declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+ if declared_encoding_match is not None:
+ declared_encoding = declared_encoding_match.groups()[0].decode(
+ 'ascii', 'replace')
+ if declared_encoding:
+ return declared_encoding.lower()
+ return None
+
+class UnicodeDammit:
+ """A class for detecting the encoding of a *ML document and
+ converting it to a Unicode string. If the source encoding is
+ windows-1252, can replace MS smart quotes with their HTML or XML
+ equivalents."""
+
+ # This dictionary maps commonly seen values for "charset" in HTML
+ # meta tags to the corresponding Python codec names. It only covers
+ # values that aren't in Python's aliases and can't be determined
+ # by the heuristics in find_codec.
+ CHARSET_ALIASES = {"macintosh": "mac-roman",
+ "x-sjis": "shift-jis"}
+
+ ENCODINGS_WITH_SMART_QUOTES = [
+ "windows-1252",
+ "iso-8859-1",
+ "iso-8859-2",
+ ]
+
+ def __init__(self, markup, override_encodings=[],
+ smart_quotes_to=None, is_html=False, exclude_encodings=[]):
+ self.smart_quotes_to = smart_quotes_to
+ self.tried_encodings = []
+ self.contains_replacement_characters = False
+ self.is_html = is_html
+
+ self.detector = EncodingDetector(
+ markup, override_encodings, is_html, exclude_encodings)
+
+ # Short-circuit if the data is in Unicode to begin with.
+ if isinstance(markup, six.text_type) or markup == '':
+ self.markup = markup
+ self.unicode_markup = six.text_type(markup)
+ self.original_encoding = None
+ return
+
+ # The encoding detector may have stripped a byte-order mark.
+ # Use the stripped markup from this point on.
+ self.markup = self.detector.markup
+
+ u = None
+ for encoding in self.detector.encodings:
+ markup = self.detector.markup
+ u = self._convert_from(encoding)
+ if u is not None:
+ break
+
+ if not u:
+ # None of the encodings worked. As an absolute last resort,
+ # try them again with character replacement.
+
+ for encoding in self.detector.encodings:
+ if encoding != "ascii":
+ u = self._convert_from(encoding, "replace")
+ if u is not None:
+ logging.warning(
+ "Some characters could not be decoded, and were "
+ "replaced with REPLACEMENT CHARACTER.")
+ self.contains_replacement_characters = True
+ break
+
+ # If none of that worked, we could at this point force it to
+ # ASCII, but that would destroy so much data that I think
+ # giving up is better.
+ self.unicode_markup = u
+ if not u:
+ self.original_encoding = None
+
+ def _sub_ms_char(self, match):
+ """Changes a MS smart quote character to an XML or HTML
+ entity, or an ASCII character."""
+ orig = match.group(1)
+ if self.smart_quotes_to == 'ascii':
+ sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
+ else:
+ sub = self.MS_CHARS.get(orig)
+ if type(sub) == tuple:
+ if self.smart_quotes_to == 'xml':
+ sub = ''.encode() + sub[1].encode() + ';'.encode()
+ else:
+ sub = '&'.encode() + sub[0].encode() + ';'.encode()
+ else:
+ sub = sub.encode()
+ return sub
+
+ def _convert_from(self, proposed, errors="strict"):
+ proposed = self.find_codec(proposed)
+ if not proposed or (proposed, errors) in self.tried_encodings:
+ return None
+ self.tried_encodings.append((proposed, errors))
+ markup = self.markup
+ # Convert smart quotes to HTML if coming from an encoding
+ # that might have them.
+ if (self.smart_quotes_to is not None
+ and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
+ smart_quotes_re = b"([\x80-\x9f])"
+ smart_quotes_compiled = re.compile(smart_quotes_re)
+ markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
+
+ try:
+ #print "Trying to convert document to %s (errors=%s)" % (
+ # proposed, errors)
+ u = self._to_unicode(markup, proposed, errors)
+ self.markup = u
+ self.original_encoding = proposed
+ except Exception as e:
+ #print "That didn't work!"
+ #print e
+ return None
+ #print "Correct encoding: %s" % proposed
+ return self.markup
+
+ def _to_unicode(self, data, encoding, errors="strict"):
+ '''Given a string and its encoding, decodes the string into Unicode.
+ %encoding is a string recognized by encodings.aliases'''
+ return six.text_type(data, encoding, errors)
+
+ @property
+ def declared_html_encoding(self):
+ if not self.is_html:
+ return None
+ return self.detector.declared_encoding
+
+ def find_codec(self, charset):
+ value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
+ or (charset and self._codec(charset.replace("-", "")))
+ or (charset and self._codec(charset.replace("-", "_")))
+ or (charset and charset.lower())
+ or charset
+ )
+ if value:
+ return value.lower()
+ return None
+
+ def _codec(self, charset):
+ if not charset:
+ return charset
+ codec = None
+ try:
+ codecs.lookup(charset)
+ codec = charset
+ except (LookupError, ValueError):
+ pass
+ return codec
+
+
+ # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
+ MS_CHARS = {b'\x80': ('euro', '20AC'),
+ b'\x81': ' ',
+ b'\x82': ('sbquo', '201A'),
+ b'\x83': ('fnof', '192'),
+ b'\x84': ('bdquo', '201E'),
+ b'\x85': ('hellip', '2026'),
+ b'\x86': ('dagger', '2020'),
+ b'\x87': ('Dagger', '2021'),
+ b'\x88': ('circ', '2C6'),
+ b'\x89': ('permil', '2030'),
+ b'\x8A': ('Scaron', '160'),
+ b'\x8B': ('lsaquo', '2039'),
+ b'\x8C': ('OElig', '152'),
+ b'\x8D': '?',
+ b'\x8E': ('#x17D', '17D'),
+ b'\x8F': '?',
+ b'\x90': '?',
+ b'\x91': ('lsquo', '2018'),
+ b'\x92': ('rsquo', '2019'),
+ b'\x93': ('ldquo', '201C'),
+ b'\x94': ('rdquo', '201D'),
+ b'\x95': ('bull', '2022'),
+ b'\x96': ('ndash', '2013'),
+ b'\x97': ('mdash', '2014'),
+ b'\x98': ('tilde', '2DC'),
+ b'\x99': ('trade', '2122'),
+ b'\x9a': ('scaron', '161'),
+ b'\x9b': ('rsaquo', '203A'),
+ b'\x9c': ('oelig', '153'),
+ b'\x9d': '?',
+ b'\x9e': ('#x17E', '17E'),
+ b'\x9f': ('Yuml', ''),}
+
+ # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
+ # horrors like stripping diacritical marks to turn á into a, but also
+ # contains non-horrors like turning “ into ".
+ MS_CHARS_TO_ASCII = {
+ b'\x80' : 'EUR',
+ b'\x81' : ' ',
+ b'\x82' : ',',
+ b'\x83' : 'f',
+ b'\x84' : ',,',
+ b'\x85' : '...',
+ b'\x86' : '+',
+ b'\x87' : '++',
+ b'\x88' : '^',
+ b'\x89' : '%',
+ b'\x8a' : 'S',
+ b'\x8b' : '<',
+ b'\x8c' : 'OE',
+ b'\x8d' : '?',
+ b'\x8e' : 'Z',
+ b'\x8f' : '?',
+ b'\x90' : '?',
+ b'\x91' : "'",
+ b'\x92' : "'",
+ b'\x93' : '"',
+ b'\x94' : '"',
+ b'\x95' : '*',
+ b'\x96' : '-',
+ b'\x97' : '--',
+ b'\x98' : '~',
+ b'\x99' : '(TM)',
+ b'\x9a' : 's',
+ b'\x9b' : '>',
+ b'\x9c' : 'oe',
+ b'\x9d' : '?',
+ b'\x9e' : 'z',
+ b'\x9f' : 'Y',
+ b'\xa0' : ' ',
+ b'\xa1' : '!',
+ b'\xa2' : 'c',
+ b'\xa3' : 'GBP',
+ b'\xa4' : '$', #This approximation is especially parochial--this is the
+ #generic currency symbol.
+ b'\xa5' : 'YEN',
+ b'\xa6' : '|',
+ b'\xa7' : 'S',
+ b'\xa8' : '..',
+ b'\xa9' : '',
+ b'\xaa' : '(th)',
+ b'\xab' : '<<',
+ b'\xac' : '!',
+ b'\xad' : ' ',
+ b'\xae' : '(R)',
+ b'\xaf' : '-',
+ b'\xb0' : 'o',
+ b'\xb1' : '+-',
+ b'\xb2' : '2',
+ b'\xb3' : '3',
+ b'\xb4' : ("'", 'acute'),
+ b'\xb5' : 'u',
+ b'\xb6' : 'P',
+ b'\xb7' : '*',
+ b'\xb8' : ',',
+ b'\xb9' : '1',
+ b'\xba' : '(th)',
+ b'\xbb' : '>>',
+ b'\xbc' : '1/4',
+ b'\xbd' : '1/2',
+ b'\xbe' : '3/4',
+ b'\xbf' : '?',
+ b'\xc0' : 'A',
+ b'\xc1' : 'A',
+ b'\xc2' : 'A',
+ b'\xc3' : 'A',
+ b'\xc4' : 'A',
+ b'\xc5' : 'A',
+ b'\xc6' : 'AE',
+ b'\xc7' : 'C',
+ b'\xc8' : 'E',
+ b'\xc9' : 'E',
+ b'\xca' : 'E',
+ b'\xcb' : 'E',
+ b'\xcc' : 'I',
+ b'\xcd' : 'I',
+ b'\xce' : 'I',
+ b'\xcf' : 'I',
+ b'\xd0' : 'D',
+ b'\xd1' : 'N',
+ b'\xd2' : 'O',
+ b'\xd3' : 'O',
+ b'\xd4' : 'O',
+ b'\xd5' : 'O',
+ b'\xd6' : 'O',
+ b'\xd7' : '*',
+ b'\xd8' : 'O',
+ b'\xd9' : 'U',
+ b'\xda' : 'U',
+ b'\xdb' : 'U',
+ b'\xdc' : 'U',
+ b'\xdd' : 'Y',
+ b'\xde' : 'b',
+ b'\xdf' : 'B',
+ b'\xe0' : 'a',
+ b'\xe1' : 'a',
+ b'\xe2' : 'a',
+ b'\xe3' : 'a',
+ b'\xe4' : 'a',
+ b'\xe5' : 'a',
+ b'\xe6' : 'ae',
+ b'\xe7' : 'c',
+ b'\xe8' : 'e',
+ b'\xe9' : 'e',
+ b'\xea' : 'e',
+ b'\xeb' : 'e',
+ b'\xec' : 'i',
+ b'\xed' : 'i',
+ b'\xee' : 'i',
+ b'\xef' : 'i',
+ b'\xf0' : 'o',
+ b'\xf1' : 'n',
+ b'\xf2' : 'o',
+ b'\xf3' : 'o',
+ b'\xf4' : 'o',
+ b'\xf5' : 'o',
+ b'\xf6' : 'o',
+ b'\xf7' : '/',
+ b'\xf8' : 'o',
+ b'\xf9' : 'u',
+ b'\xfa' : 'u',
+ b'\xfb' : 'u',
+ b'\xfc' : 'u',
+ b'\xfd' : 'y',
+ b'\xfe' : 'b',
+ b'\xff' : 'y',
+ }
+
+ # A map used when removing rogue Windows-1252/ISO-8859-1
+ # characters in otherwise UTF-8 documents.
+ #
+ # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
+ # Windows-1252.
+ WINDOWS_1252_TO_UTF8 = {
+ 0x80 : b'\xe2\x82\xac', # €
+ 0x82 : b'\xe2\x80\x9a', # ‚
+ 0x83 : b'\xc6\x92', # ƒ
+ 0x84 : b'\xe2\x80\x9e', # „
+ 0x85 : b'\xe2\x80\xa6', # …
+ 0x86 : b'\xe2\x80\xa0', # †
+ 0x87 : b'\xe2\x80\xa1', # ‡
+ 0x88 : b'\xcb\x86', # ˆ
+ 0x89 : b'\xe2\x80\xb0', # ‰
+ 0x8a : b'\xc5\xa0', # Š
+ 0x8b : b'\xe2\x80\xb9', # ‹
+ 0x8c : b'\xc5\x92', # Œ
+ 0x8e : b'\xc5\xbd', # Ž
+ 0x91 : b'\xe2\x80\x98', # ‘
+ 0x92 : b'\xe2\x80\x99', # ’
+ 0x93 : b'\xe2\x80\x9c', # “
+ 0x94 : b'\xe2\x80\x9d', # ”
+ 0x95 : b'\xe2\x80\xa2', # •
+ 0x96 : b'\xe2\x80\x93', # –
+ 0x97 : b'\xe2\x80\x94', # —
+ 0x98 : b'\xcb\x9c', # ˜
+ 0x99 : b'\xe2\x84\xa2', # ™
+ 0x9a : b'\xc5\xa1', # š
+ 0x9b : b'\xe2\x80\xba', # ›
+ 0x9c : b'\xc5\x93', # œ
+ 0x9e : b'\xc5\xbe', # ž
+ 0x9f : b'\xc5\xb8', # Ÿ
+ 0xa0 : b'\xc2\xa0', #
+ 0xa1 : b'\xc2\xa1', # ¡
+ 0xa2 : b'\xc2\xa2', # ¢
+ 0xa3 : b'\xc2\xa3', # £
+ 0xa4 : b'\xc2\xa4', # ¤
+ 0xa5 : b'\xc2\xa5', # ¥
+ 0xa6 : b'\xc2\xa6', # ¦
+ 0xa7 : b'\xc2\xa7', # §
+ 0xa8 : b'\xc2\xa8', # ¨
+ 0xa9 : b'\xc2\xa9', # ©
+ 0xaa : b'\xc2\xaa', # ª
+ 0xab : b'\xc2\xab', # «
+ 0xac : b'\xc2\xac', # ¬
+ 0xad : b'\xc2\xad', #
+ 0xae : b'\xc2\xae', # ®
+ 0xaf : b'\xc2\xaf', # ¯
+ 0xb0 : b'\xc2\xb0', # °
+ 0xb1 : b'\xc2\xb1', # ±
+ 0xb2 : b'\xc2\xb2', # ²
+ 0xb3 : b'\xc2\xb3', # ³
+ 0xb4 : b'\xc2\xb4', # ´
+ 0xb5 : b'\xc2\xb5', # µ
+ 0xb6 : b'\xc2\xb6', # ¶
+ 0xb7 : b'\xc2\xb7', # ·
+ 0xb8 : b'\xc2\xb8', # ¸
+ 0xb9 : b'\xc2\xb9', # ¹
+ 0xba : b'\xc2\xba', # º
+ 0xbb : b'\xc2\xbb', # »
+ 0xbc : b'\xc2\xbc', # ¼
+ 0xbd : b'\xc2\xbd', # ½
+ 0xbe : b'\xc2\xbe', # ¾
+ 0xbf : b'\xc2\xbf', # ¿
+ 0xc0 : b'\xc3\x80', # À
+ 0xc1 : b'\xc3\x81', # Á
+ 0xc2 : b'\xc3\x82', # Â
+ 0xc3 : b'\xc3\x83', # Ã
+ 0xc4 : b'\xc3\x84', # Ä
+ 0xc5 : b'\xc3\x85', # Å
+ 0xc6 : b'\xc3\x86', # Æ
+ 0xc7 : b'\xc3\x87', # Ç
+ 0xc8 : b'\xc3\x88', # È
+ 0xc9 : b'\xc3\x89', # É
+ 0xca : b'\xc3\x8a', # Ê
+ 0xcb : b'\xc3\x8b', # Ë
+ 0xcc : b'\xc3\x8c', # Ì
+ 0xcd : b'\xc3\x8d', # Í
+ 0xce : b'\xc3\x8e', # Î
+ 0xcf : b'\xc3\x8f', # Ï
+ 0xd0 : b'\xc3\x90', # Ð
+ 0xd1 : b'\xc3\x91', # Ñ
+ 0xd2 : b'\xc3\x92', # Ò
+ 0xd3 : b'\xc3\x93', # Ó
+ 0xd4 : b'\xc3\x94', # Ô
+ 0xd5 : b'\xc3\x95', # Õ
+ 0xd6 : b'\xc3\x96', # Ö
+ 0xd7 : b'\xc3\x97', # ×
+ 0xd8 : b'\xc3\x98', # Ø
+ 0xd9 : b'\xc3\x99', # Ù
+ 0xda : b'\xc3\x9a', # Ú
+ 0xdb : b'\xc3\x9b', # Û
+ 0xdc : b'\xc3\x9c', # Ü
+ 0xdd : b'\xc3\x9d', # Ý
+ 0xde : b'\xc3\x9e', # Þ
+ 0xdf : b'\xc3\x9f', # ß
+ 0xe0 : b'\xc3\xa0', # à
+ 0xe1 : b'\xa1', # á
+ 0xe2 : b'\xc3\xa2', # â
+ 0xe3 : b'\xc3\xa3', # ã
+ 0xe4 : b'\xc3\xa4', # ä
+ 0xe5 : b'\xc3\xa5', # å
+ 0xe6 : b'\xc3\xa6', # æ
+ 0xe7 : b'\xc3\xa7', # ç
+ 0xe8 : b'\xc3\xa8', # è
+ 0xe9 : b'\xc3\xa9', # é
+ 0xea : b'\xc3\xaa', # ê
+ 0xeb : b'\xc3\xab', # ë
+ 0xec : b'\xc3\xac', # ì
+ 0xed : b'\xc3\xad', # í
+ 0xee : b'\xc3\xae', # î
+ 0xef : b'\xc3\xaf', # ï
+ 0xf0 : b'\xc3\xb0', # ð
+ 0xf1 : b'\xc3\xb1', # ñ
+ 0xf2 : b'\xc3\xb2', # ò
+ 0xf3 : b'\xc3\xb3', # ó
+ 0xf4 : b'\xc3\xb4', # ô
+ 0xf5 : b'\xc3\xb5', # õ
+ 0xf6 : b'\xc3\xb6', # ö
+ 0xf7 : b'\xc3\xb7', # ÷
+ 0xf8 : b'\xc3\xb8', # ø
+ 0xf9 : b'\xc3\xb9', # ù
+ 0xfa : b'\xc3\xba', # ú
+ 0xfb : b'\xc3\xbb', # û
+ 0xfc : b'\xc3\xbc', # ü
+ 0xfd : b'\xc3\xbd', # ý
+ 0xfe : b'\xc3\xbe', # þ
+ }
+
+ MULTIBYTE_MARKERS_AND_SIZES = [
+ (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
+ (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
+ (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
+ ]
+
+ FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
+ LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
+
+ @classmethod
+ def detwingle(cls, in_bytes, main_encoding="utf8",
+ embedded_encoding="windows-1252"):
+ """Fix characters from one encoding embedded in some other encoding.
+
+ Currently the only situation supported is Windows-1252 (or its
+ subset ISO-8859-1), embedded in UTF-8.
+
+ The input must be a bytestring. If you've already converted
+ the document to Unicode, you're too late.
+
+ The output is a bytestring in which `embedded_encoding`
+ characters have been converted to their `main_encoding`
+ equivalents.
+ """
+ if embedded_encoding.replace('_', '-').lower() not in (
+ 'windows-1252', 'windows_1252'):
+ raise NotImplementedError(
+ "Windows-1252 and ISO-8859-1 are the only currently supported "
+ "embedded encodings.")
+
+ if main_encoding.lower() not in ('utf8', 'utf-8'):
+ raise NotImplementedError(
+ "UTF-8 is the only currently supported main encoding.")
+
+ byte_chunks = []
+
+ chunk_start = 0
+ pos = 0
+ while pos < len(in_bytes):
+ byte = in_bytes[pos]
+ if not isinstance(byte, int):
+ # Python 2.x
+ byte = ord(byte)
+ if (byte >= cls.FIRST_MULTIBYTE_MARKER
+ and byte <= cls.LAST_MULTIBYTE_MARKER):
+ # This is the start of a UTF-8 multibyte character. Skip
+ # to the end.
+ for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
+ if byte >= start and byte <= end:
+ pos += size
+ break
+ elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
+ # We found a Windows-1252 character!
+ # Save the string up to this point as a chunk.
+ byte_chunks.append(in_bytes[chunk_start:pos])
+
+ # Now translate the Windows-1252 character into UTF-8
+ # and add it as another, one-byte chunk.
+ byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
+ pos += 1
+ chunk_start = pos
+ else:
+ # Go on to the next character.
+ pos += 1
+ if chunk_start == 0:
+ # The string is unchanged.
+ return in_bytes
+ else:
+ # Store the final chunk.
+ byte_chunks.append(in_bytes[chunk_start:])
+ return b''.join(byte_chunks)
+
diff --git a/script.module.slyguy/resources/modules/bs4/diagnose.py b/script.module.slyguy/resources/modules/bs4/diagnose.py
new file mode 100644
index 00000000..aba97f22
--- /dev/null
+++ b/script.module.slyguy/resources/modules/bs4/diagnose.py
@@ -0,0 +1,216 @@
+"""Diagnostic functions, mainly for use when doing tech support."""
+
+__license__ = "MIT"
+
+import cProfile
+from StringIO import StringIO
+from HTMLParser import HTMLParser
+import bs4
+from bs4 import BeautifulSoup, __version__
+from bs4.builder import builder_registry
+
+import os
+import pstats
+import random
+import tempfile
+import time
+import traceback
+import sys
+import cProfile
+
+def diagnose(data):
+ """Diagnostic suite for isolating common problems."""
+ print("Diagnostic running on Beautiful Soup %s" % __version__)
+ print("Python version %s" % sys.version)
+
+ basic_parsers = ["html.parser", "html5lib", "lxml"]
+ for name in basic_parsers:
+ for builder in builder_registry.builders:
+ if name in builder.features:
+ break
+ else:
+ basic_parsers.remove(name)
+ print(
+ "I noticed that %s is not installed. Installing it may help." %
+ name)
+
+ if 'lxml' in basic_parsers:
+ basic_parsers.append(["lxml", "xml"])
+ try:
+ from lxml import etree
+ print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
+ except ImportError, e:
+ print(
+ "lxml is not installed or couldn't be imported.")
+
+
+ if 'html5lib' in basic_parsers:
+ try:
+ import html5lib
+ print("Found html5lib version %s" % html5lib.__version__)
+ except ImportError, e:
+ print(
+ "html5lib is not installed or couldn't be imported.")
+
+ if hasattr(data, 'read'):
+ data = data.read()
+ elif os.path.exists(data):
+ print('"%s" looks like a filename. Reading data from the file.' % data)
+ data = open(data).read()
+ elif data.startswith("http:") or data.startswith("https:"):
+ print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
+ print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
+ return
+ print
+
+ for parser in basic_parsers:
+ print("Trying to parse your markup with %s" % parser)
+ success = False
+ try:
+ soup = BeautifulSoup(data, parser)
+ success = True
+ except Exception, e:
+ print("%s could not parse the markup." % parser)
+ traceback.print_exc()
+ if success:
+ print("Here's what %s did with the markup:" % parser)
+ print soup.prettify()
+
+ print "-" * 80
+
+def lxml_trace(data, html=True, **kwargs):
+ """Print out the lxml events that occur during parsing.
+
+ This lets you see how lxml parses a document when no Beautiful
+ Soup code is running.
+ """
+ from lxml import etree
+ for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
+ print("%s, %4s, %s" % (event, element.tag, element.text))
+
+class AnnouncingParser(HTMLParser):
+ """Announces HTMLParser parse events, without doing anything else."""
+
+ def _p(self, s):
+ print(s)
+
+ def handle_starttag(self, name, attrs):
+ self._p("%s START" % name)
+
+ def handle_endtag(self, name):
+ self._p("%s END" % name)
+
+ def handle_data(self, data):
+ self._p("%s DATA" % data)
+
+ def handle_charref(self, name):
+ self._p("%s CHARREF" % name)
+
+ def handle_entityref(self, name):
+ self._p("%s ENTITYREF" % name)
+
+ def handle_comment(self, data):
+ self._p("%s COMMENT" % data)
+
+ def handle_decl(self, data):
+ self._p("%s DECL" % data)
+
+ def unknown_decl(self, data):
+ self._p("%s UNKNOWN-DECL" % data)
+
+ def handle_pi(self, data):
+ self._p("%s PI" % data)
+
+def htmlparser_trace(data):
+ """Print out the HTMLParser events that occur during parsing.
+
+ This lets you see how HTMLParser parses a document when no
+ Beautiful Soup code is running.
+ """
+ parser = AnnouncingParser()
+ parser.feed(data)
+
+_vowels = "aeiou"
+_consonants = "bcdfghjklmnpqrstvwxyz"
+
+def rword(length=5):
+ "Generate a random word-like string."
+ s = ''
+ for i in range(length):
+ if i % 2 == 0:
+ t = _consonants
+ else:
+ t = _vowels
+ s += random.choice(t)
+ return s
+
+def rsentence(length=4):
+ "Generate a random sentence-like string."
+ return " ".join(rword(random.randint(4,9)) for i in range(length))
+
+def rdoc(num_elements=1000):
+ """Randomly generate an invalid HTML document."""
+ tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
+ elements = []
+ for i in range(num_elements):
+ choice = random.randint(0,3)
+ if choice == 0:
+ # New tag.
+ tag_name = random.choice(tag_names)
+ elements.append("<%s>" % tag_name)
+ elif choice == 1:
+ elements.append(rsentence(random.randint(1,4)))
+ elif choice == 2:
+ # Close a tag.
+ tag_name = random.choice(tag_names)
+ elements.append("%s>" % tag_name)
+ return "" + "\n".join(elements) + ""
+
+def benchmark_parsers(num_elements=100000):
+ """Very basic head-to-head performance benchmark."""
+ print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+ data = rdoc(num_elements)
+ print "Generated a large invalid HTML document (%d bytes)." % len(data)
+
+ for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
+ success = False
+ try:
+ a = time.time()
+ soup = BeautifulSoup(data, parser)
+ b = time.time()
+ success = True
+ except Exception, e:
+ print "%s could not parse the markup." % parser
+ traceback.print_exc()
+ if success:
+ print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+
+ from lxml import etree
+ a = time.time()
+ etree.HTML(data)
+ b = time.time()
+ print "Raw lxml parsed the markup in %.2fs." % (b-a)
+
+ import html5lib
+ parser = html5lib.HTMLParser()
+ a = time.time()
+ parser.parse(data)
+ b = time.time()
+ print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+
+def profile(num_elements=100000, parser="lxml"):
+
+ filehandle = tempfile.NamedTemporaryFile()
+ filename = filehandle.name
+
+ data = rdoc(num_elements)
+ vars = dict(bs4=bs4, data=data, parser=parser)
+ cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
+
+ stats = pstats.Stats(filename)
+ # stats.strip_dirs()
+ stats.sort_stats("cumulative")
+ stats.print_stats('_html5lib|bs4', 50)
+
+if __name__ == '__main__':
+ diagnose(sys.stdin.read())
diff --git a/script.module.slyguy/resources/modules/bs4/element.py b/script.module.slyguy/resources/modules/bs4/element.py
new file mode 100644
index 00000000..69d3c8ab
--- /dev/null
+++ b/script.module.slyguy/resources/modules/bs4/element.py
@@ -0,0 +1,1730 @@
+__license__ = "MIT"
+
+from pdb import set_trace
+import collections
+import re
+import sys
+import six
+import warnings
+from bs4.dammit import EntitySubstitution
+
+DEFAULT_OUTPUT_ENCODING = "utf-8"
+PY3K = (sys.version_info[0] > 2)
+
+whitespace_re = re.compile("\s+")
+
+def _alias(attr):
+ """Alias one attribute name to another for backward compatibility"""
+ @property
+ def alias(self):
+ return getattr(self, attr)
+
+ @alias.setter
+ def alias(self):
+ return setattr(self, attr)
+ return alias
+
+
+class NamespacedAttribute(six.text_type):
+
+ def __new__(cls, prefix, name, namespace=None):
+ if name is None:
+ obj = six.text_type.__new__(cls, prefix)
+ elif prefix is None:
+ # Not really namespaced.
+ obj = six.text_type.__new__(cls, name)
+ else:
+ obj = six.text_type.__new__(cls, prefix + ":" + name)
+ obj.prefix = prefix
+ obj.name = name
+ obj.namespace = namespace
+ return obj
+
+class AttributeValueWithCharsetSubstitution(six.text_type):
+ """A stand-in object for a character encoding specified in HTML."""
+
+class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
+ """A generic stand-in for the value of a meta tag's 'charset' attribute.
+
+ When Beautiful Soup parses the markup '', the
+ value of the 'charset' attribute will be one of these objects.
+ """
+
+ def __new__(cls, original_value):
+ obj = six.text_type.__new__(cls, original_value)
+ obj.original_value = original_value
+ return obj
+
+ def encode(self, encoding):
+ return encoding
+
+
+class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
+ """A generic stand-in for the value of a meta tag's 'content' attribute.
+
+ When Beautiful Soup parses the markup:
+
+
+ The value of the 'content' attribute will be one of these objects.
+ """
+
+ CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+
+ def __new__(cls, original_value):
+ match = cls.CHARSET_RE.search(original_value)
+ if match is None:
+ # No substitution necessary.
+ return six.text_type.__new__(six.text_type, original_value)
+
+ obj = six.text_type.__new__(cls, original_value)
+ obj.original_value = original_value
+ return obj
+
+ def encode(self, encoding):
+ def rewrite(match):
+ return match.group(1) + encoding
+ return self.CHARSET_RE.sub(rewrite, self.original_value)
+
+class HTMLAwareEntitySubstitution(EntitySubstitution):
+
+ """Entity substitution rules that are aware of some HTML quirks.
+
+ Specifically, the contents of