for example
if not REGEXES['divToPElementsRe'].search(
- str(''.join(map(str, map(tostring, list(elem)))))):
- #self.debug("Altering %s to p" % (describe(elem)))
+ str_(b''.join(map(tostring, list(elem))))):
+ #log.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
#print "Fixed element "+describe(elem)
@@ -432,8 +410,7 @@ def reverse_tags(self, node, *tag_names):
yield e
def sanitize(self, node, candidates):
- MIN_LEN = self.options.get('min_text_length',
- self.TEXT_LENGTH_THRESHOLD)
+ MIN_LEN = self.min_text_length
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
header.drop_tree()
@@ -461,7 +438,7 @@ def sanitize(self, node, candidates):
tag = el.tag
if weight + content_score < 0:
- self.debug("Cleaned %s with score %6.3f and weight %-3s" %
+ log.debug("Removed %s with score %6.3f and weight %-3s" %
(describe(el), content_score, weight, ))
el.drop_tree()
elif el.text_content().count(",") < 10:
@@ -500,9 +477,12 @@ def sanitize(self, node, candidates):
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
to_remove = True
- elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
+ elif content_length < MIN_LEN and counts["img"] == 0:
reason = "too short content length %s without a single image" % content_length
to_remove = True
+ elif content_length < MIN_LEN and counts["img"] > 2:
+ reason = "too short content length %s and too many images" % content_length
+ to_remove = True
elif weight < 25 and link_density > 0.2:
reason = "too many links %.3f for its weight %s" % (
link_density, weight)
@@ -514,22 +494,25 @@ def sanitize(self, node, candidates):
elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
reason = "