Skip to content

Commit

Permalink
chore(images): add absolute url joining
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 31, 2024
1 parent 888e83a commit 753e5f9
Show file tree
Hide file tree
Showing 7 changed files with 198 additions and 14 deletions.
71 changes: 66 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.19"
version = "0.0.20"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
Expand All @@ -19,6 +19,7 @@ html5ever = "0.27"
lazy_static = "1"
percent-encoding = "2"
auto_encoder = "0"
url = "2"

[dev-dependencies]
spectral = "0.6.0"
Expand Down
22 changes: 22 additions & 0 deletions src/ignore.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
use super::Handle;
use super::StructuredPrinter;
use super::TagHandler;
use super::TagHandlerFactory;

#[derive(Clone)]
/// Tag handler, and its own factory, for tags that should be left out of the output.
///
/// Its `handle` and `after_handle` implementations write nothing to the printer, and
/// `skip_descendants` returns `true`.
pub struct IgnoreTagFactory;

impl TagHandlerFactory for IgnoreTagFactory {
fn instantiate(&self) -> Box<dyn TagHandler> {
Box::new(self.clone())
}
}

impl TagHandler for IgnoreTagFactory {
// Intentionally a no-op: an ignored tag writes nothing to the printer when it is opened.
fn handle(&mut self, _tag: &Handle, _printer: &mut StructuredPrinter) {}
// Intentionally a no-op: nothing was emitted for the tag, so there is nothing to finish.
fn after_handle(&mut self, _printer: &mut StructuredPrinter) {}
// Always `true`. NOTE(review): presumably the tree walker uses this to avoid visiting
// the tag's children — confirm against `walk`.
fn skip_descendants(&self) -> bool {
true
}
}
24 changes: 22 additions & 2 deletions src/images.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
use std::sync::Arc;

use super::common::get_tag_attr;
use super::StructuredPrinter;
use super::TagHandler;
use markup5ever_rcdom::Handle;
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
use url::Url;

const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');

Expand All @@ -12,12 +15,19 @@ const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').ad
pub struct ImgHandler {
/// NOTE(review): presumably set when the image's `style` attribute contains
/// `display: block` — confirm in `handle`.
block_mode: bool,
/// CommonMark flag passed to `ImgHandler::new`.
commonmark: bool,
/// Base URL used to turn image sources that start with `/` into absolute URLs.
/// `None` leaves such sources untouched.
url: Option<Arc<Url>>,
}

impl ImgHandler {
pub fn new(commonmark: bool) -> Self {
/// Creates an image handler.
///
/// # Arguments
/// `commonmark` is stored on the handler as its CommonMark flag
/// `url` is an optional base URL used to make image sources absolute; only the `Arc`
/// is cloned, the `Url` itself is shared
///
/// All remaining fields take their `Default` values.
pub fn new(commonmark: bool, url: &Option<std::sync::Arc<Url>>) -> Self {
    Self {
        commonmark,
        // Cloning an `Option<Arc<_>>` only bumps the reference count when it is `Some`.
        url: url.clone(),
        ..Default::default()
    }
}
Expand All @@ -26,7 +36,7 @@ impl ImgHandler {
impl TagHandler for ImgHandler {
fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter) {
// hack: detect if the image has associated style and has display in block mode
let style_tag = get_tag_attr(tag, "src");
let style_tag = get_tag_attr(tag, "style");

if let Some(style) = style_tag {
if style.contains("display: block") {
Expand All @@ -41,6 +51,7 @@ impl TagHandler for ImgHandler {

// try to extract attrs
let src = get_tag_attr(tag, "src");

let alt = get_tag_attr(tag, "alt");
let title = get_tag_attr(tag, "title");
let height = get_tag_attr(tag, "height");
Expand Down Expand Up @@ -73,10 +84,19 @@ impl TagHandler for ImgHandler {
// need to escape URL if it contains spaces
// don't have any geometry-controlling attrs, post markdown natively
let mut img_url = src.unwrap_or_default();

if img_url.contains(' ') {
img_url = utf8_percent_encode(&img_url, FRAGMENT).to_string();
}

if img_url.starts_with("/") {
if let Some(ref u) = self.url {
if let Ok(n) = u.join(&img_url) {
img_url = n.to_string();
}
}
}

printer.append_str(&format!(
"![{}]({}{})",
alt.unwrap_or_default(),
Expand Down
59 changes: 55 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@ pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
use regex::Regex;
use std::boxed::Box;
use std::collections::HashMap;
use std::sync::Arc;
use url::Url;
pub mod anchors;
pub mod codes;
pub mod common;
pub mod containers;
pub mod dummy;
pub mod headers;
pub mod iframes;
pub mod ignore;
pub mod images;
pub mod lists;
pub mod paragraphs;
Expand Down Expand Up @@ -66,25 +69,72 @@ lazy_static! {
/// `html` is the source HTML as a `&str`
/// `custom` is the set of custom tag handler producers for the tags you want; it can be empty
/// `commonmark` is for adjusting the markdown output to CommonMark
pub fn parse_html_custom(
pub fn parse_html_custom_base(
    html: &str,
    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
    commonmark: bool,
    url: &Option<Url>,
) -> String {
    match parse_document(RcDom::default(), ParseOpts::default())
        .from_utf8()
        .read_from(&mut html.as_bytes())
    {
        Ok(dom) => {
            let mut result = StructuredPrinter::default();

            // Wrap the optional base URL in an `Arc` once, so every handler created
            // during the walk can share it instead of copying the `Url`.
            let base_url = url.clone().map(Arc::new);

            walk(&dom.document, &mut result, custom, commonmark, &base_url);

            // we want to eventually remove the clean step.
            clean_markdown(&result.data)
        }
        // The document could not be read: return an empty string.
        _ => Default::default(),
    }
}

/// Custom variant of the main function. Allows passing custom tag<->tag factory pairs
/// in order to register custom tag handlers for the tags you want.
///
/// You can also override standard tag handlers this way.
///
/// No base URL is supplied, so image sources starting with `/` are not made absolute.
/// # Arguments
/// `html` is the source HTML as a `&str`
/// `custom` is the set of custom tag handler producers for the tags you want; it can be empty
/// `commonmark` is for adjusting the markdown output to CommonMark
pub fn parse_html_custom(
html: &str,
custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
commonmark: bool,
) -> String {
parse_html_custom_base(html, custom, commonmark, &None)
}

/// Variant of [`parse_html_custom`] that also takes a base URL.
///
/// Allows passing custom tag<->tag factory pairs in order to register custom tag
/// handlers for the tags you want. You can also override standard tag handlers this way.
/// # Arguments
/// `html` is the source HTML as a `&str`
/// `custom` is the set of custom tag handler producers for the tags you want; it can be empty
/// `commonmark` is for adjusting the markdown output to CommonMark
/// `url` is the base URL used to make image sources that start with `/` absolute;
/// pass `&None` to leave them as they are
pub fn parse_html_custom_with_url(
    html: &str,
    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
    commonmark: bool,
    url: &Option<Url>,
) -> String {
    // `url` is already a reference; forward it as-is rather than re-borrowing it.
    parse_html_custom_base(html, custom, commonmark, url)
}

/// Main function of this library. Parses incoming HTML, converts it into Markdown
/// and returns converted string.
/// # Arguments
Expand Down Expand Up @@ -123,6 +173,7 @@ fn walk(
result: &mut StructuredPrinter,
custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
commonmark: bool,
url: &Option<Arc<Url>>,
) {
let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler);
let mut tag_name = String::default();
Expand Down Expand Up @@ -211,7 +262,7 @@ fn walk(
}
"pre" | "code" => Box::new(CodeHandler::default()),
// images, links
"img" => Box::new(ImgHandler::new(commonmark)),
"img" => Box::new(ImgHandler::new(commonmark, url)),
"a" => Box::new(AnchorHandler::default()),
// lists
"ol" | "ul" | "menu" => Box::new(ListHandler),
Expand Down Expand Up @@ -246,7 +297,7 @@ fn walk(
continue;
}

walk(child, result, custom, commonmark);
walk(child, result, custom, commonmark, url);

if let NodeData::Element { ref name, .. } = child.data {
if let Some(el) = result.siblings.get_mut(&current_depth) {
Expand Down
2 changes: 1 addition & 1 deletion src/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,6 @@ where
/// and concatenates their text, recursively.
fn to_text(tag: &Handle, commonmark: bool) -> String {
let mut printer = StructuredPrinter::default();
walk(tag, &mut printer, &HashMap::default(), commonmark);
walk(tag, &mut printer, &HashMap::default(), commonmark, &None);
clean_markdown(&printer.data)
}
31 changes: 30 additions & 1 deletion tests/integration.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
extern crate spectral;

use html2md::parse_html;
use html2md::ignore::IgnoreTagFactory;
use html2md::{parse_html, parse_html_custom, parse_html_custom_with_url};
use std::collections::HashMap;
use std::fs::File;
use std::io::prelude::*;
use url::Url;

use indoc::indoc;
use spectral::prelude::*;
Expand Down Expand Up @@ -128,3 +131,29 @@ fn test_tables_crash2() {
assert_that!(table_with_vertical_header).contains(indoc! {"\n\n## At a Glance\n\n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |\n\n"
});
}

/// Converts a real-world page with script/style/noscript/iframe tags ignored and a base
/// URL supplied, and checks that some markdown is produced.
#[test]
fn test_html_from_text() {
    // `fs::read_to_string` opens the file and reads it in a single call.
    let html = std::fs::read_to_string("test-samples/real-world-1.html")
        .expect("File must be readable");

    // Register the ignoring handler for every tag whose content is not page text.
    let mut tag_factory: HashMap<String, Box<dyn html2md::TagHandlerFactory>> = HashMap::new();
    for name in ["script", "style", "noscript", "iframe"] {
        tag_factory.insert(String::from(name), Box::new(IgnoreTagFactory));
    }

    let base_url = Url::parse("https://spider.cloud").expect("static base URL must parse");

    let result = parse_html_custom_with_url(&html, &tag_factory, false, &Some(base_url));
    assert!(!result.is_empty());
}

0 comments on commit 753e5f9

Please sign in to comment.