Skip to content

Commit

Permalink
Merge pull request mixmark-io#34 from mathiasbynens/decode-html
Browse files Browse the repository at this point in the history
Decode HTML character references in code spans and blocks
  • Loading branch information
domchristie committed Jul 16, 2014
2 parents c4482b2 + 56f5a6a commit 69ccee0
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 49 deletions.
3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
"author": "Dom Christie",
"main": "src/to-markdown.js",
"version": "0.0.1",
"dependencies": {
"he": ">=0.4.1"
},
"devDependencies": {
"nodeunit" : ">=0.6.0",
"zombie" : ">=0.12.0",
Expand Down
47 changes: 26 additions & 21 deletions src/to-markdown.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@
*
*/

// Obtain the `he` library: if no `he` object is already in scope (for example
// one supplied by a separate <script> tag) and a `require` function exists,
// load the module through require().
if (typeof he !== 'object' && typeof require === 'function') {
var he = require('he');
}

var toMarkdown = function(string) {

var ELEMENTS = [
{
patterns: 'p',
Expand Down Expand Up @@ -58,7 +62,7 @@ var toMarkdown = function(string) {
{
patterns: 'code',
replacement: function(str, attrs, innerHTML) {
return innerHTML ? '`' + innerHTML + '`' : '';
return innerHTML ? '`' + he.decode(innerHTML) + '`' : '';
}
},
{
Expand All @@ -72,7 +76,7 @@ var toMarkdown = function(string) {
}
}
];

for(var i = 0, len = ELEMENTS.length; i < len; i++) {
if(typeof ELEMENTS[i].patterns === 'string') {
string = replaceEls(string, { tag: ELEMENTS[i].patterns, replacement: ELEMENTS[i].replacement, type: ELEMENTS[i].type });
Expand All @@ -83,7 +87,7 @@ var toMarkdown = function(string) {
}
}
}

function replaceEls(html, elProperties) {
var pattern = elProperties.type === 'void' ? '<' + elProperties.tag + '\\b([^>]*)\\/?>' : '<' + elProperties.tag + '\\b([^>]*)>([\\s\\S]*?)<\\/' + elProperties.tag + '>',
regex = new RegExp(pattern, 'gi'),
Expand All @@ -98,45 +102,46 @@ var toMarkdown = function(string) {
}
return markdown;
}

/**
 * Builds a case-insensitive RegExp that locates an attribute inside a tag's
 * attribute string and captures its value in group 1.
 *
 * The attribute name must appear at the start of the string or directly after
 * whitespace, so that looking up `src` or `title` does not match the tail of a
 * longer name such as `data-src` or `subtitle`.
 *
 * The value may be unquoted or wrapped in single or double quotes; it is
 * captured up to the first quote character of either kind.
 *
 * @param {string} attr - attribute name; inserted into the pattern verbatim
 * @returns {RegExp} pattern whose first capture group is the attribute value
 */
function attrRegExp(attr) {
  // (?:^|\s) is non-capturing, so the value stays in capture group 1.
  return new RegExp('(?:^|\\s)' + attr + '\\s*=\\s*["\']?([^"\']*)["\']?', 'i');
}

// Pre code blocks

string = string.replace(/<pre\b[^>]*>`([\s\S]*)`<\/pre>/gi, function(str, innerHTML) {
innerHTML = innerHTML.replace(/^\t+/g, ' '); // convert tabs to spaces (you know it makes sense)
innerHTML = innerHTML.replace(/\n/g, '\n ');
return '\n\n ' + innerHTML + '\n';
var text = he.decode(innerHTML);
text = text.replace(/^\t+/g, ' '); // convert tabs to spaces (you know it makes sense)
text = text.replace(/\n/g, '\n ');
return '\n\n ' + text + '\n';
});

// Lists

// Escape numbers that could trigger an ol
// If there are more than three spaces before the code, it would be in a pre tag
// Make sure we are escaping the period not matching any character
string = string.replace(/^(\s{0,3}\d+)\. /g, '$1\\. ');
// Converts lists that have no child lists (of same type) first, then works it's way up

// Converts lists that have no child lists (of same type) first, then works its way up
// Matches a <ul>/<ol> element whose content contains no further "<ul" or "<ol"
// opening tag; the closing tag must be of the same type as the opening one (\1).
var noChildrenRegex = /<(ul|ol)\b[^>]*>(?:(?!<ul|<ol)[\s\S])*?<\/\1>/gi;
// Keep converting until no such list is left: each pass hands every match to
// replaceLists and substitutes whatever it returns.
while(string.match(noChildrenRegex)) {
string = string.replace(noChildrenRegex, function(str) {
return replaceLists(str);
});
}

function replaceLists(html) {

html = html.replace(/<(ul|ol)\b[^>]*>([\s\S]*?)<\/\1>/gi, function(str, listType, innerHTML) {
var lis = innerHTML.split('</li>');
lis.splice(lis.length - 1, 1);

for(i = 0, len = lis.length; i < len; i++) {
if(lis[i]) {
var prefix = (listType === 'ol') ? (i + 1) + ". " : "* ";
lis[i] = lis[i].replace(/\s*<li[^>]*>([\s\S]*)/i, function(str, innerHTML) {

innerHTML = innerHTML.replace(/^\s+/, '');
innerHTML = innerHTML.replace(/\n\n/g, '\n\n ');
// indent nested lists
Expand All @@ -149,15 +154,15 @@ var toMarkdown = function(string) {
});
return '\n\n' + html.replace(/[ \t]+\n|\s+$/g, '');
}

// Blockquotes
// Matches a <blockquote> element whose content contains no further
// "<blockquote" opening tag.
var deepest = /<blockquote\b[^>]*>((?:(?!<blockquote)[\s\S])*?)<\/blockquote>/gi;
// Keep converting until no such blockquote is left: each pass hands every
// match to replaceBlockquotes and substitutes whatever it returns.
while(string.match(deepest)) {
string = string.replace(deepest, function(str) {
return replaceBlockquotes(str);
});
}

function replaceBlockquotes(html) {
html = html.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, function(str, inner) {
inner = inner.replace(/^\s+|\s+$/g, '');
Expand All @@ -168,14 +173,14 @@ var toMarkdown = function(string) {
});
return html;
}

/**
 * Tidies the line breaks of the converted output.
 *
 * @param {string} string - text to tidy
 * @returns {string} the text with leading/trailing tabs, carriage returns and
 *   newlines removed and blank-line runs collapsed
 */
function cleanUp(string) {
  return string
    // strip tabs, CRs and LFs (but not spaces) from both ends
    .replace(/^[\t\r\n]+|[\t\r\n]+$/g, '')
    // a whitespace-only stretch between two newlines becomes one blank line
    .replace(/\n\s+\n/g, '\n\n')
    // never allow more than two consecutive line breaks
    .replace(/\n{3,}/g, '\n\n');
}

return cleanUp(string);
};

Expand Down
1 change: 1 addition & 0 deletions test/test-runner.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
<link rel="stylesheet" href="lib/qunit.css"></link>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.6.4/jquery.min.js"></script>
<script src="lib/qunit.js"></script>
<script src="../node_modules/he/he.js"></script>
<script src="../src/to-markdown.js"></script>
<script src="tests.js"></script>
</head>
Expand Down
60 changes: 32 additions & 28 deletions test/tests.js
Original file line number Diff line number Diff line change
@@ -1,47 +1,49 @@
$(function(){

test("converting p elements", function() {
equal(toMarkdown("<p>Lorem ipsum</p>"), "Lorem ipsum", "We expect p tags to be wrapped with two line breaks");
equal(toMarkdown("<p class='intro'>Lorem ipsum</p>"), "Lorem ipsum", "We expect p tags to be wrapped with two line breaks");
});

test("converting emphasis elements", function() {
equal(toMarkdown("<b>Hello world</b>"), "**Hello world**", "We expect <b>Hello world</b> to be converted to **Hello world**");
equal(toMarkdown("<strong>Hello world</strong>"), "**Hello world**", "We expect <strong>Hello world</strong> to be converted to **Hello world**");
equal(toMarkdown("<b></b>"), "", "We expect b tags to be removed");

equal(toMarkdown("<i>Hello world</i>"), "_Hello world_", "We expect <i>Hello world</i> to be converted to _Hello world_");
equal(toMarkdown("<em>Hello world</em>"), "_Hello world_", "We expect <em>Hello world</em> to be converted to _Hello world_");
equal(toMarkdown("<em id='one' class='cowabunga'>Hello world</em>"), "_Hello world_", "We expect <em id='one' class='cowabunga'>Hello world</em> to be converted to _Hello world_");
equal(toMarkdown("<em id='one' class='cowabunga'></em>"), "", "We expect empty em tags to be removed");
});

test("converting inline code elements", function() {
equal(toMarkdown("<code>print()</code>"), "`print()`", "We expect inline code tags to be converted to backticks");
equal(toMarkdown("<code></code>"), "", "We expect empty code tags to be removed");
equal(toMarkdown("<code>&lt;video&gt;</code>"), "`<video>`", "We expect HTML character references to be decoded");
equal(toMarkdown("<code>foo&#x1D306;bar</code>"), "`foo\uD834\uDF06bar`", "We expect HTML character references to be decoded");
});

test("converting heading elements", function() {
equal(toMarkdown("<h1>Hello world</h1>"), "# Hello world", "We expect <h1>Hello world</h1> to be converted to # Hello world");
equal(toMarkdown("<h3>Hello world</h3>"), "### Hello world", "We expect <h3>Hello world</h3> to be converted to ### Hello world");
equal(toMarkdown("<h6>Hello world</h6>"), "###### Hello world", "We expect <h6>Hello world</h6> to be converted to ###### Hello world");

equal(toMarkdown("<h8>Hello world</h8>"), "<h8>Hello world</h8>", "We expect <h8>Hello world</h8> to be converted to <h8>Hello world</h8>");
});

test("converting hr elements", function() {
equal(toMarkdown("<hr />"), "* * *", "We expect hr elements to be converted to * * *");
equal(toMarkdown("<hr/>"), "* * *", "We expect hr elements to be converted to * * *");
equal(toMarkdown("<hr>"), "* * *", "We expect hr elements to be converted to * * *");
equal(toMarkdown("<hr class='fancy' />"), "* * *", "We expect hr elements to be converted to * * *");
});

test("converting br elements", function() {
equal(toMarkdown("Hello<br />world"), "Hello\nworld", "We expect br elements to be converted to \n");
equal(toMarkdown("Hello<br/>world"), "Hello\nworld", "We expect br elements to be converted to \n");
equal(toMarkdown("Hello<br>world"), "Hello\nworld", "We expect br elements to be converted to \n");
});

test("converting img elements", function() {
equal(toMarkdown("<img src='http://example.com/logo.png' />"), "![](http://example.com/logo.png)", "We expect img elements to be converted properly");
equal(toMarkdown('<img src="http://example.com/logo.png" />'), "![](http://example.com/logo.png)", "We expect img elements to be converted properly");
Expand All @@ -51,36 +53,38 @@ $(function(){
equal(toMarkdown("<img src='http://example.com/logo.png' alt='Example logo' />"), "![Example logo](http://example.com/logo.png)", "We expect img elements to be converted properly with alt attrs");
equal(toMarkdown("<img src='http://example.com/logo.png' alt='Example logo' title='Example title' />"), "![Example logo](http://example.com/logo.png \"Example title\")", "We expect img elements to be converted properly with alt and title attrs");
});

test("converting anchor elements", function() {
equal(toMarkdown("<a href='http://example.com/about'>About us</a>"), "[About us](http://example.com/about)", "We expect anchor elements to be converted properly");
equal(toMarkdown('<a href="http://www.example.com/about" title="About this company">About us</a>'), '[About us](http://www.example.com/about "About this company")', "We expect an anchor element with a title tag to have correct markdown");
equal(toMarkdown('<a class="some really messy stuff" href="/about" id="donuts3" title="About this company">About us</a>'), '[About us](/about "About this company")', "We expect an anchor element with a title tag to have correct markdown");
equal(toMarkdown('<a id="donuts3">About us</a>'), '<a id="donuts3">About us</a>', "Anchor tags without an href should not be converted");
});

test("converting code blocks", function() {
var codeHtml = [
"<pre><code>def hello_world",
"<pre><code>def foo",
" # 42 &lt; 9001",
" 'Hello world!'",
"end</code></pre>"
],
codeMd = [
" def hello_world",
" def foo",
" # 42 < 9001",
" 'Hello world!'",
" end"
];
equal(toMarkdown(codeHtml.join('\n')), codeMd.join('\n'), "We expect code blocks to be converted");
});

test("converting list elements", function() {
equal(toMarkdown('1986. What a great season.'), '1986\\. What a great season.','We expect numbers that could trigger an ol to be escaped');
equal(toMarkdown("<ol>\n\t<li>Hello world</li>\n\t<li>Lorem ipsum</li>\n</ol>"), "1. Hello world\n2. Lorem ipsum", "We expect ol elements to be converted properly");
equal(toMarkdown("<ul>\n\t<li>Hello world</li>\n\t<li>Lorem ipsum</li>\n</ul>"), "* Hello world\n* Lorem ipsum", "We expect ul elements with line breaks and tabs to be converted properly");
equal(toMarkdown("<ul class='blargh'><li class='first'>Hello world</li><li>Lorem ipsum</li></ul>"), "* Hello world\n* Lorem ipsum", "We expect ul elements with attributes to be converted properly");
equal(toMarkdown("<ul><li>Hello world</li><li>Lorem ipsum</li></ul><ul><li>Hello world</li><li>Lorem ipsum</li></ul>"), "* Hello world\n* Lorem ipsum\n\n* Hello world\n* Lorem ipsum", "We expect multiple ul elements to be converted properly");
equal(toMarkdown("<ul><li><p>Hello world</p></li><li>Lorem ipsum</li></ul>"), "* Hello world\n\n* Lorem ipsum", "We expect li elements with ps to be converted properly");

var lisWithPsHtml = [
"<ol>",
" <li>",
Expand All @@ -92,17 +96,17 @@ $(function(){
" </li>",
"</ol>"
].join('\n'),

lisWithPsMd = [
"1. This is a list item with two paragraphs. Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.",
"",
" Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus. Donec sit amet nisl. Aliquam semper ipsum sit amet velit.",
"",
"2. Suspendisse id sem consectetuer libero luctus adipiscing."
].join('\n');

equal(toMarkdown(lisWithPsHtml), lisWithPsMd,'We expect lists with paragraphs to be converted');

var nestedListHtml = [
"<ul>",
" <li>This is a list item at root level</li>",
Expand Down Expand Up @@ -134,7 +138,7 @@ $(function(){
"* This is a third item at root level"
].join('\n');
equal(toMarkdown(nestedListHtml), nestedListMd, "We expect nested lists to be converted properly");

nestedListHtml = [
"<ul>",
" <li>This is a list item at root level</li>",
Expand Down Expand Up @@ -166,7 +170,7 @@ $(function(){
"* This is a third item at root level"
].join('\n');
equal(toMarkdown(nestedListHtml), nestedListMd, "We expect nested lists to be converted properly");

var html = [
"<ul>",
" <li>",
Expand All @@ -182,11 +186,11 @@ $(function(){
"",
" > This is a blockquote inside a list item."
].join('\n');

// needs fixing: see https://github.com/domchristie/to-markdown/issues/2
equal(toMarkdown(html), md, "We expect lists with blockquotes to be converted");
});

test("converting blockquotes", function() {
var html = [
"<blockquote>",
Expand All @@ -201,7 +205,7 @@ $(function(){
"> Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse id sem consectetuer libero luctus adipiscing."
].join('\n');
equal(toMarkdown(html), md, "We expect blockquotes with two paragraphs to be converted");

html = [
"<blockquote>",
" <p>This is the first level of quoting.</p>",
Expand All @@ -221,7 +225,7 @@ $(function(){
"> Back to the first level."
].join('\n');
equal(toMarkdown(html), md, "We expect nested blockquotes to be converted");

html = [
"<blockquote>",
" <h2>This is a header.</h2>",
Expand All @@ -230,7 +234,7 @@ $(function(){
" <li>This is the second list item.</li>",
" </ol>",
" <p>Here's some example code:</p>",
" <pre><code>return shell_exec(\"echo $input | $markdown_script\");</code></pre>",
" <pre><code>return 1 &lt; 2 ? shell_exec(\"echo $input | $markdown_script\") : 0;</code></pre>",
"</blockquote>"
].join('\n');
md = [
Expand All @@ -241,8 +245,8 @@ $(function(){
"> ",
"> Here's some example code:",
"> ",
"> return shell_exec(\"echo $input | $markdown_script\");"
"> return 1 < 2 ? shell_exec(\"echo $input | $markdown_script\") : 0;"
].join('\n');
strictEqual(toMarkdown(html), md, "We expect html in blockquotes to be converted");
strictEqual(toMarkdown(html), md, "We expect HTML in blockquotes to be converted");
});
});
});

0 comments on commit 69ccee0

Please sign in to comment.