Skip to content

Commit

Permalink
feat: add rewriteHtml (#207)
Browse files Browse the repository at this point in the history
* feat: add `rewriteHtml`

* chore: rewrite twitter tags

* chore: handle when name = property

* chore: ensure content is not empty

* chore: treat "fb:" as "og:"
  • Loading branch information
Kikobeats authored Oct 18, 2024
1 parent 2e0163e commit ec99777
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 2 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,13 @@ Default: `false`

When `true`, relative CSS/HTML URLs present in the HTML markup are rewritten into absolute URLs.

##### rewriteHtml

Type: `boolean`<br>
Default: `false`

When `true`, it rewrites some common mistakes related to HTML meta tags.

## License

**html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.<br>
Expand Down
27 changes: 27 additions & 0 deletions src/html.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
'use strict'

const { get, split, nth, castArray, forEach } = require('lodash')
const debug = require('debug-logfmt')('html-get:rewrite')
const localhostUrl = require('localhost-url-regex')
const { TAGS: URL_TAGS } = require('html-urls')
const isHTML = require('is-html-content')
Expand Down Expand Up @@ -89,6 +90,29 @@ const addBody = ({ url, headers, html }) => {
return `<!DOCTYPE html><html><head></head><body>${element}</body></html>`
}

/**
 * Tell whether a meta tag identifier belongs to the Open Graph family.
 * The `fb:` prefix is treated the same way as `og:`.
 *
 * @param {string} [prop=''] - Value of a `name` or `property` attribute.
 * @returns {boolean} `true` when the value starts with `og:` or `fb:`.
 */
const isOpenGraph = (prop = '') => {
  for (const prefix of ['og:', 'fb:']) {
    if (prop.startsWith(prefix)) return true
  }
  return false
}

/**
 * Fix commonly misused `<meta>` attributes, mutating the document in place.
 *
 * - Open Graph identifiers (`og:` / `fb:`) declared under `name` are moved
 *   to `property`.
 * - Non Open Graph identifiers declared under `property` are moved to `name`,
 *   but only when doing so does not overwrite a different, already present
 *   `name` value.
 *
 * Tags whose `content` attribute is missing or empty are left untouched.
 *
 * @param {object} opts
 * @param {Function} opts.$ - jQuery/cheerio-style selector function bound to
 *   the document being rewritten.
 * @returns {void}
 */
const rewriteMetaTags = ({ $ }) => {
  $('meta').each((_, element) => {
    const el = $(element)
    // Nothing worth normalizing when the tag carries no value.
    if (!el.attr('content')) return

    const name = el.attr('name')
    const property = el.attr('property')

    if (property !== name && isOpenGraph(name)) {
      // Convert 'name' to 'property' for Open Graph tags if 'property' is not
      // already set to the same identifier.
      el.removeAttr('name').attr('property', name)
      debug('og', el.attr())
    } else if (
      property &&
      !isOpenGraph(property) &&
      (!name || name === property)
    ) {
      // Convert 'property' to 'name' for non-Open Graph tags. A tag that
      // already has a different `name` is skipped: overwriting it would
      // discard the original identifier.
      el.removeAttr('property').attr('name', property)
      debug('meta', el.attr())
    }
  })
}

const rewriteHtmlUrls = ({ $, url }) => {
forEach(URL_TAGS, (tagName, urlAttr) => {
$(tagName.join(',')).each(function () {
Expand Down Expand Up @@ -156,6 +180,7 @@ module.exports = ({
hide,
remove,
rewriteUrls,
rewriteHtml,
scripts,
modules
}) => {
Expand All @@ -167,6 +192,8 @@ module.exports = ({

if (rewriteUrls) rewriteHtmlUrls({ $, url })

if (rewriteHtml) rewriteMetaTags({ $, url })

addHead({ $, url, headers })

if (styles) injectStyle({ $, styles })
Expand Down
8 changes: 6 additions & 2 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ const getContent = PCancelable.fn(
mutoolPath,
puppeteerOpts,
rewriteUrls,
rewriteHtml,
toEncode
},
onCancel
Expand All @@ -224,7 +225,8 @@ const getContent = PCancelable.fn(
const html = addHtml({
...content,
...(isFetchMode ? puppeteerOpts : undefined),
rewriteUrls
rewriteUrls,
rewriteHtml
})

return { ...content, html }
Expand All @@ -245,7 +247,8 @@ module.exports = PCancelable.fn(
mutoolPath = defaultMutoolPath(),
prerender = 'auto',
puppeteerOpts,
rewriteUrls = false
rewriteUrls = false,
rewriteHtml = false
} = {},
onCancel
) => {
Expand All @@ -268,6 +271,7 @@ module.exports = PCancelable.fn(
mutoolPath,
puppeteerOpts,
rewriteUrls,
rewriteHtml,
toEncode
})

Expand Down
154 changes: 154 additions & 0 deletions test/html/rewrite-html.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
'use strict'

const test = require('ava')
const cheerio = require('cheerio')

const { prettyHtml } = require('../util')

const html = require('../../src/html')

// Build the HTML fixture used by every test in this file.
//
// `meta` is an array of raw `<meta ...>` tag strings; they are joined with
// newlines and injected into the `<head>` of a fixed document that already
// contains a title, an `og:site_name` tag, a canonical link and a charset tag.
// The resulting markup is passed through `prettyHtml` (from ../util) before
// being returned — presumably to normalize formatting; confirm against util.
const composeHtml = meta =>
prettyHtml(`
<!DOCTYPE html>
<html>
<head>
<title>kikobeats.com</title>
<meta property="og:site_name" content="kikobeats.com">
<link rel="canonical" href="https://kikobeats.com"><meta charset="utf-8">
${meta.join('\n')}
</head>
<body></body>
</html>`)

// A tag declaring the same Open Graph identifier under both `name` and
// `property` must keep both attributes after the rewrite.
test("don't rewrite og if property is already present", async t => {
  const expected = 'This Pin was discovered by NMA Group'
  const markup = composeHtml([
    '<meta content="This Pin was discovered by NMA Group" data-app="true" name="og:description" property="og:description">'
  ])

  const output = html({
    rewriteHtml: true,
    url: 'https://kikobeats.com',
    html: markup,
    headers: { 'content-type': 'text/html; charset=utf-8' }
  })

  const $ = cheerio.load(output)
  const contentOf = selector => $(selector).attr('content')

  t.is(contentOf('meta[name="og:description"]'), expected)
  t.is(contentOf('meta[property="og:description"]'), expected)
})

// `fb:` identifiers follow the Open Graph rule: whether the source markup
// used `property` or `name`, the output must expose them under `property`.
test('fb propietary tags should be treat as og', async t => {
  const variants = [
    '<meta content="1234" property="fb:app_id">',
    '<meta content="1234" name="fb:app_id">'
  ]

  for (const tag of variants) {
    const output = html({
      rewriteHtml: true,
      url: 'https://kikobeats.com',
      html: composeHtml([tag]),
      headers: { 'content-type': 'text/html; charset=utf-8' }
    })

    const $ = cheerio.load(output)
    t.is($('meta[property="fb:app_id"]').attr('content'), '1234')
    t.is($('meta[name="fb:app_id"]').attr('content'), undefined)
  }
})

// Tags with an empty `content` attribute are skipped by the rewrite, so the
// original `name` attribute must still be there and no `property` is added.
test("don't rewrite og if content is empty", async t => {
  const markup = composeHtml(['<meta content="" name="twitter:description">'])
  const output = html({
    rewriteHtml: true,
    url: 'https://kikobeats.com',
    html: markup,
    headers: { 'content-type': 'text/html; charset=utf-8' }
  })

  const $ = cheerio.load(output)
  const contentOf = selector => $(selector).attr('content')

  t.is(contentOf('meta[name="twitter:description"]'), '')
  t.is(contentOf('meta[property="twitter:description"]'), undefined)
})

// Open Graph identifiers wrongly declared under `name` must be reachable
// through `property` once the rewrite has run.
test('rewrite multiple og wrong markup', async t => {
  const tags = [
    '<meta name="og:title" content="Kiko Beats">',
    '<meta name="og:description" content="Personal website of Kiko Beats">',
    '<meta name="og:image" content="https://kikobeats.com/image.jpg">'
  ]

  const output = html({
    rewriteHtml: true,
    url: 'https://kikobeats.com',
    html: composeHtml(tags),
    headers: { 'content-type': 'text/html; charset=utf-8' }
  })

  const $ = cheerio.load(output)
  const contentOf = selector => $(selector).attr('content')

  t.is(contentOf('meta[property="og:title"]'), 'Kiko Beats')
  t.is(
    contentOf('meta[property="og:description"]'),
    'Personal website of Kiko Beats'
  )
  t.is(
    contentOf('meta[property="og:image"]'),
    'https://kikobeats.com/image.jpg'
  )
})

// Plain (non Open Graph) identifiers wrongly declared under `property` must
// be reachable through `name` once the rewrite has run.
test('rewrite multiple meta wrong markup', async t => {
  const tags = [
    '<meta property="title" content="Kiko Beats">',
    '<meta property="description" content="Personal website of Kiko Beats">',
    '<meta property="image" content="https://kikobeats.com/image.jpg">'
  ]

  const output = html({
    rewriteHtml: true,
    url: 'https://kikobeats.com',
    html: composeHtml(tags),
    headers: { 'content-type': 'text/html; charset=utf-8' }
  })

  const $ = cheerio.load(output)
  const contentOf = selector => $(selector).attr('content')

  t.is(contentOf('meta[name="title"]'), 'Kiko Beats')
  t.is(
    contentOf('meta[name="description"]'),
    'Personal website of Kiko Beats'
  )
  t.is(
    contentOf('meta[name="image"]'),
    'https://kikobeats.com/image.jpg'
  )
})

// `twitter:` identifiers are not Open Graph, so when they are declared under
// `property` the rewrite must move them to `name`.
test('rewrite multiple twitter wrong markup', async t => {
  const tags = [
    '<meta property="twitter:title" content="Kiko Beats">',
    '<meta property="twitter:description" content="Personal website of Kiko Beats">',
    '<meta property="twitter:image" content="https://kikobeats.com/image.jpg">'
  ]

  const output = html({
    rewriteHtml: true,
    url: 'https://kikobeats.com',
    html: composeHtml(tags),
    headers: { 'content-type': 'text/html; charset=utf-8' }
  })

  const $ = cheerio.load(output)
  const contentOf = selector => $(selector).attr('content')

  t.is(contentOf('meta[name="twitter:title"]'), 'Kiko Beats')
  t.is(
    contentOf('meta[name="twitter:description"]'),
    'Personal website of Kiko Beats'
  )
  t.is(
    contentOf('meta[name="twitter:image"]'),
    'https://kikobeats.com/image.jpg'
  )
})

0 comments on commit ec99777

Please sign in to comment.