Updates old HTML formatter

loguic · Apr 19, 2016 · 97dc01b · 97dc01b
1 parent 9263ed2
commit 97dc01b
Showing 1 changed file with 65 additions and 43 deletions.
diff --git a/codeformatter/lib/htmlbeautifier/__init__.py b/codeformatter/lib/htmlbeautifier/__init__.py
@@ -1,3 +1,5 @@
+# For support with this formatter, tag @RevanProdigalKnight in the issue
+
 from __future__ import print_function
 import sys
 import re
@@ -15,7 +17,6 @@ def __init__(self):
 		self.indent_char = ' '
 		self.indent_with_tabs = False
 		self.expand_tags = False
-		self.expand_javascript = False
 		self.minimum_attribute_count = 2
 		self.first_attribute_on_new_line = False
 		self.reduce_empty_tags = False
@@ -27,12 +28,11 @@ def __repr__(self):
 indent_char = [%s]
 indent_with_tabs = [%s]
 expand_tags = [%s]
-expand_javascript = [%s]
 minimum_attribute_count = %d
 first_attribute_on_new_line = [%s]
 reduce_empty_tags = [%s]
 exception_on_tag_mismatch = [%s]
-custom_singletons = [%s]""" % (self.indent_size, self.indent_char, self.indent_with_tabs, self.expand_tags, self.expand_javascript, self.minimum_attribute_count, self.first_attribute_on_new_line, self.reduce_empty_tags, self.exception_on_tag_mismatch, self.custom_singletons)
+custom_singletons = [%s]""" % (self.indent_size, self.indent_char, self.indent_with_tabs, self.expand_tags, self.minimum_attribute_count, self.first_attribute_on_new_line, self.reduce_empty_tags, self.exception_on_tag_mismatch, self.custom_singletons)
 
 def default_options():
 	return BeautifierOptions()
@@ -62,7 +62,6 @@ def __init__(self, source_text, opts=default_options()):
 		self.opts = opts
 		self.exception_on_tag_mismatch = opts.exception_on_tag_mismatch
 		self.expand_tags = opts.expand_tags
-		self.expand_javascript = opts.expand_javascript
 		self.minimum_attribute_count = opts.minimum_attribute_count
 		self.first_attribute_on_new_line = opts.first_attribute_on_new_line
 		self.reduce_empty_tags = opts.reduce_empty_tags
@@ -75,14 +74,16 @@ def __init__(self, source_text, opts=default_options()):
 			self.tab_size = sublime.load_settings('Preferences.sublime-settings').get('tab_size',4)
 		self.indent_level = 0
 		# These are the tags that are currently defined as being void by the HTML5 spec, and should be self-closing (a.k.a. singletons)
-		self.singletons = r'<(area|base|br|col|command|embed|hr|img|input|keygen|link|meta|param|source|track|wbr<%= custom %>)([^>]*)>'
+		self.singletons = r'<(area|base|br|col|command|embed|hr|img|input|keygen|link|meta|param|source|track|wbr<%= custom %>)([^>]*?)/?>(?:\s*?</\1>)?'
 		if not opts.custom_singletons == '':
 			self.singletons = re.sub(r'<%= custom %>','|' + opts.custom_singletons,self.singletons)
 		else:
 			self.singletons = re.sub(r'<%= custom %>','',self.singletons)
-
-	def remove_newlines(self,str):
-		return re.sub(r'\n\s*',r'',str.group(1))
+		# Compile singletons regex since it's used so often (twice before the loop, then once per loop iteration)
+		self.singletons = re.compile(self.singletons,re.I)
+		self.removed_css = []
+		self.removed_js = []
+		self.removed_comments = []
 
 	def expand_tag(self,str):
 		_str = str.group(0) # cache the original string in a variable for faster access
@@ -115,74 +116,95 @@ def expand_tag(self,str):
 			tag += '\n' + (((self.indent_level * self.indent_size) + extra_tabs) * self.indent_char) + (indent * ' ') + l
 		return tag
 
+	def remove_newlines(self,ch=''): return lambda str: re.sub(r'\n\s*',ch,str.group(0)) # Build a re.sub callback that replaces every newline (plus the whitespace after it) inside the whole match with ch
+
+	def remove(self,pattern,replacement,findList,raw): # Save every line-leading match of pattern from raw into findList and return raw with each match swapped for replacement
+		pattern = re.compile(r'(?<=\n)\s*?' + pattern,re.S|re.I) # A match must start right after a newline (optional whitespace first); re.S lets '.' span lines, re.I ignores case
+		findList.extend(pattern.findall(raw)) # Appended in document order; findall yields whole matches only while pattern has no capture groups
+		return pattern.sub((lambda match: match.group(0)[:-len(match.group(0).lstrip())] + replacement),raw) # Preserve the indentation from the beginning of the match
+
+	def remove_js(self,raw): return self.remove(r'<script[^>]*>.*?</script>','/* SCRIPT */',self.removed_js,raw) # Swap each line-leading <script>...</script> block for a '/* SCRIPT */' placeholder, saving the original in self.removed_js
+	def remove_css(self,raw): return self.remove(r'<style[^>]*>.*?</style>','/* STYLE */',self.removed_css,raw) # Swap each line-leading <style>...</style> block for a '/* STYLE */' placeholder, saving the original in self.removed_css
+	def remove_comments(self,raw): return self.remove(r'<!--.*?-->','/* COMMENT */',self.removed_comments,raw) # Swap each line-leading <!-- ... --> comment for a '/* COMMENT */' placeholder, saving the original in self.removed_comments
+
+	def reindent(self,raw,match):
+		# Dedent the saved block raw by its smallest per-line leading indent, then prefix every line with indent_char repeated len(match.group(1)) times
+		prev_newline = r'(?<=\n)'
+		lowest_indent = -1
+		for l in re.split(r'\n',raw):
+			indent = len(l) - len(l.lstrip()) # Leading whitespace only; strip() also counted trailing whitespace (e.g. '\r') as indentation
+			if lowest_indent == -1 or lowest_indent > indent: lowest_indent = indent
+		indent = len(match.group(1)) * self.indent_char
+		return indent + re.sub(prev_newline,indent,re.sub(prev_newline + (lowest_indent * self.indent_char),'',raw.lstrip())) # Force new indentation
+
+	def getNextFrom(self,_list): # Build a re.sub callback that replaces each successive match with the next item of _list, reindented via self.reindent
+		it = iter(_list) # One iterator shared by the closure, so successive matches consume successive saved items
+		return lambda match: self.reindent(next(it),match) # Builtin next() works on Python 2.6+ and 3; it.next() exists only on Python 2
+
+	def replace(self,pattern,replaceList,raw): return re.compile(r'(?<=\n)(\s*?)' + pattern,re.S|re.I).sub(self.getNextFrom(replaceList),raw) # Substitute each line-leading placeholder with the next item of replaceList; group 1 captures the placeholder's leading whitespace for reindent
+	def replace_comments(self,raw): return self.replace(r'/\* COMMENT \*/',self.removed_comments,raw) # Put the saved comments back in place of the '/* COMMENT */' placeholders
+	def replace_css(self,raw): return self.replace(r'/\* STYLE \*/',self.removed_css,raw) # Put the saved <style> blocks back in place of the '/* STYLE */' placeholders
+	def replace_js(self,raw): return self.replace(r'/\* SCRIPT \*/',self.removed_js,raw) # Put the saved <script> blocks back in place of the '/* SCRIPT */' placeholders
+
 	def beautify(self):
 		beautiful = ''
+
+		replaceWithSpace = self.remove_newlines(' ')
+
 		raw = self.source_text
 
-		# Replace single-line javascript comments with block comments so that expansion of the elements inside doesn't
-		# become un-commented, ignoring commented CDATA tags
-		if self.expand_javascript:
-			raw = re.sub(r'(?<=\s)//(?!<!\[CDATA\[|\]\]>)(.+)\n',r'/*\1*/\n',raw)
-			raw = re.sub(r';+',r';',raw)
+		# Remove JS, CSS, and comments from raw source
+		raw = self.remove_js(raw)
+		raw = self.remove_css(raw)
+		raw = self.remove_comments(raw)
 
-		# Add newlines before/after tags (excluding CDATA). This separates single-line HTML comments into 3 lines as well
+		# Add newlines before/after tags (excluding CDATA)
 		raw = re.sub(r'(<[^! ]|(?<!/\*|//)\]\]>|(?<!<!\[endif\])-->)',r'\n\1',raw)
 		raw = re.sub(r'(>|(?<!/\*|//)<!\[CDATA\[|<!--(?!\[if .+?\]>))',r'\1\n',raw)
 
-		# Add newlines before=after javascript braces/switch cases/comments
-		if self.expand_javascript:
-			raw = re.sub(r'(\}|\*/)',r'\n\1',raw)
-			raw = re.sub(r'(\{|/\*|(?<!\();(?!\))|(?:case [^:]+|default):)',r'\1\n',raw)
-			raw = re.sub(r'\[(?!(?:if .+?\]>|endif\]-->))([^\]]+)\]',r'[\n\1\n]',raw)# Split javascript array entries onto new lines
-			raw = re.sub(r'(\[[^\[\]]{0,10}\])',self.remove_newlines,raw)# Fix javascript regex that was broken by the previous regex replace
-			raw = re.sub(r',(?!;$)([^:;\{]+:[^,])',r',\n\1',raw)		# Split javascript object entries onto new lines
-			raw = re.sub(r'({[^\{}]{0,10}})',self.remove_newlines,raw)# Fix javascript regex that was broken by the previous regex replace
-			raw = re.sub(r'((for|while)\s+?(\([^\)]+\))\s+?\{)',self.remove_newlines,raw)	# Put all the content of a loop def on the same line
-			raw = re.sub(r'\},\s*?\{',r'},\n{',raw)
-			# Fix CSS that will have been expanded by this option as well so that new CSS rulesets begin on their own line
-			raw = re.sub(r'\}(.*?)(\{|;)',r'}\n\1\2',raw)
-			# Fix AngularJS/Blade/etc brace ({{}}) templates that will have been broken into multiple lines
-			raw = re.sub(r'(\{{2,})(.*?)(\}{2,})',r'\1 \2 \3',re.sub(r'(\{(?:\s*\{)+[\s\S]*?\}(?:\s*\})+)',self.remove_newlines,raw))
-
-		raw = re.sub(r'("[^"]*")',self.remove_newlines,raw)				# Put all content between double-quote marks back on the same line
-		raw = re.sub(self.singletons,r'<\1\2/>',raw)							# Replace all singleton tags with /-delimited tags (XHTML style)
-		raw = raw.replace('//>','/>')															# Fix the singleton tags if they were already /-delimited
+		# Fix AngularJS/Blade/etc brace ({{}}, {{::}}, etc) templates that will have been broken into multiple lines
+		raw = re.sub(r'(\{{2,}(?:::)?)\s?(.*?)\s?(\}{2,})',r'\1 \2 \3',re.sub(r'\{(?:\s*\{)+\s?[\s\S]*?\s?\}(?:\s*\})+',self.remove_newlines(),raw))
+
+		raw = re.sub(r'"[^"]*"',replaceWithSpace,raw)							# Put all content between double-quote marks back on the same line
+
+		# Re-join start tags that are already on multiple lines (ignore end tags)
+		raw = re.compile(r'(?<=\n)<(?!/).*?>(?=\n)',re.S).sub(replaceWithSpace,raw)
+
+		raw = self.singletons.sub(r'<\1\2/>',raw)									# Replace all singleton tags with /-delimited tags (XHTML style)
+		raw = self.singletons.sub(replaceWithSpace,raw)
+		raw = re.sub(r'(?<!\s)\s(?=/?>)','',raw)
 		raw = re.sub(r'\n{2,}',r'\n',raw)													# Replace multiple newlines with just one
 
 		for l in re.split('\n',raw):
 			l = l.strip()																						# Trim whitespace from the line
-			if l == '' or l == ';': continue												# If the line has no content (HTML or JavaScript), skip
+			if l == '': continue																		# If the line has no content, skip
 
 			# If the line starts with </, or an end CDATA/block comment tag, reduce indentation
 			if re.match(r'</|]]>|(?:<!\[endif\])?-->',l): self.indent_level -= 1
-			# If the line starts with }, a switch case, or the end of a block comment, reduce indentation
-			if self.expand_javascript and re.match(r'\}|\]|(?:case [^:]+|default):|\*/',l): self.indent_level -= 1
 
 			beautiful += (self.indent_char * self.indent_level * self.indent_size)
 			if self.expand_tags:
 				beautiful += re.sub(r'^<.*>$',self.expand_tag,l)
-				# beautiful += re.sub(r'(<[^/!][^>]+>)',self.expand_tag,l)
 			else:
 				beautiful += l
 			beautiful += '\n'
 
-			if re.search(self.singletons,l): pass										# If the tag is a singleton, indentation stays the same
+			if self.singletons.search(l): pass											# If the tag is a singleton, indentation stays the same
 			else:
 				# If the line starts with a begin CDATA/block comment tag or a tag, indent the next line
 				if re.match(r'<!--|<!\[CDATA\[|<[^/?! ]',l): self.indent_level += 1
-				# If the line starts with a block comment, switch case, or ends with {, indent the next line}
-				if self.expand_javascript:
-					if re.match(r'/\*|(?:case [^:]+|default):',l) or re.search(r'(?:\{|\[)$',l): self.indent_level += 1
 
 		# If the end of the document is not at the same indentation as the beginning, the tags aren't matched
 		if not self.indent_level == 0 and self.exception_on_tag_mismatch:
 			raise Exception("Mismatched tags")
 
 		# Put all matched start/end tags with no content between them on the same line and return
 		if self.reduce_empty_tags:
-			beautiful = re.sub(r'<([\w\-]+)([^>]*)>\s+</\1>',r'<\1\2></\1>',beautiful)
+			beautiful = re.sub(r'<(\S+)([^>]*)>\s+</\1>',r'<\1\2></\1>',beautiful)
 
-		# Put all single-line comments back on a single line - I separated them out earlier for simplicity's sake
-		beautiful = re.sub(r'<!--\n\s+(.+)\n\s+-->',r'<!-- \1 -->',beautiful)
+		# Replace JS, CSS, and comments in the opposite order of their removal
+		beautiful = self.replace_comments(beautiful)
+		beautiful = self.replace_css(beautiful)
+		beautiful = self.replace_js(beautiful)
 
 		return beautiful