Skip to content

Commit

Permalink
DEV: Small Amazon fixes (discourse#463)
Browse files Browse the repository at this point in the history
* DEV: Small Amazon fixes

* Use the canonical URL as the link within the generated onebox

* Use the `data-old-hires` value for the image URL, if available

* Version bump to 2.2.10

* Truncate description, regardless of where it is sourced from

Co-authored-by: Jarek Radosz <[email protected]>
  • Loading branch information
jbrw and CvX authored Apr 2, 2021
1 parent 6e90f46 commit d84e820
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 14 deletions.
34 changes: 22 additions & 12 deletions lib/onebox/engine/amazon_onebox.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,20 @@ class AmazonOnebox
matches_regexp(/^https?:\/\/(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?<tld>com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx|nl|pl|sa|sg|se|com\.tr|ae)\//)

def url
# Have we cached the HTML body of the requested URL?
# If so, try to grab the canonical URL from that document,
# If possible, fetch the cached HTML body immediately so we can
# try to grab the canonical URL from that document,
# rather than guess at the best URL structure to use
if @body_cacher && @body_cacher.respond_to?('cache_response_body?')
if @body_cacher.cached_response_body_exists?(uri.to_s)
@raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, @body_cacher)
canonical_link = @raw.at('//link[@rel="canonical"]/@href')
return canonical_link.to_s if canonical_link
if body_cacher&.respond_to?('cache_response_body?')
if body_cacher.cache_response_body?(uri.to_s) && body_cacher.cached_response_body_exists?(uri.to_s)
@raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher)
end
end

if @raw
canonical_link = @raw.at('//link[@rel="canonical"]/@href')
return canonical_link.to_s if canonical_link
end

if match && match[:id]
return "https://www.amazon.#{tld}/dp/#{Onebox::Helpers.uri_encode(match[:id])}"
end
Expand All @@ -45,7 +48,7 @@ def http_params
private

def match
@match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[^\/]+)(?:\/|$)/mi)
@match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[A-Z0-9]+)(?:\/|\?|$)/mi)
end

def image
Expand All @@ -60,6 +63,10 @@ def image
end

if (landing_image = raw.css("#landingImage")) && landing_image.any?
attributes = landing_image.first.attributes

return attributes["data-old-hires"].to_s if attributes["data-old-hires"]

landing_image.first["src"].to_s
end

Expand Down Expand Up @@ -110,7 +117,7 @@ def data
end

result = {
link: link,
link: url,
title: title,
by_info: authors,
image: og.image || image,
Expand Down Expand Up @@ -141,7 +148,7 @@ def data
end

result = {
link: link,
link: url,
title: title,
by_info: authors,
image: og.image || image,
Expand All @@ -157,7 +164,7 @@ def data
else
title = og.title || CGI.unescapeHTML(raw.css("title").inner_text)
result = {
link: link,
link: url,
title: title,
image: og.image || image,
price: price
Expand All @@ -167,7 +174,10 @@ def data
result[:by_info] = Onebox::Helpers.clean(result[:by_info].inner_html) if result[:by_info]

summary = raw.at("#productDescription")
result[:description] = og.description || (summary && summary.inner_text) || CGI.unescapeHTML(Onebox::Helpers.truncate(raw.css("meta[name=description]").first["content"], 250))

description = og.description || summary&.inner_text
description ||= raw.css("meta[name=description]").first&.[]("content")
result[:description] = CGI.unescapeHTML(Onebox::Helpers.truncate(description, 250)) if description
end

result[:price] = nil if result[:price].start_with?("$0") || result[:price] == 0
Expand Down
5 changes: 4 additions & 1 deletion lib/onebox/engine/html.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@ def http_params
end

def raw
body_cacher = self.options[:body_cacher] if self.options
@raw ||= Onebox::Helpers.fetch_html_doc(url, http_params, body_cacher)
end

def body_cacher
self.options&.[](:body_cacher)
end

def html?
raw.respond_to(:css)
end
Expand Down
2 changes: 1 addition & 1 deletion lib/onebox/version.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# frozen_string_literal: true

module Onebox
VERSION = "2.2.9"
VERSION = "2.2.10"
end
7 changes: 7 additions & 0 deletions spec/lib/onebox/engine/amazon_onebox_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ def check_link(tdl, link)
expect(described_class.new("http://www.amazon.fr/gp/product/B01BYD0TZM").url)
.to eq("https://www.amazon.fr/dp/B01BYD0TZM")
end

let(:long_url) { "https://www.amazon.ca/gp/product/B087Z3N428?pf_rd_r=SXABADD0ZZ3NF9Q5F8TW&pf_rd_p=05378fd5-c43e-4948-99b1-a65b129fdd73&pd_rd_r=0237fb28-7f47-49f4-986a-be0c78e52863&pd_rd_w=FfIoI&pd_rd_wg=Hw4qq&ref_=pd_gw_unk" }

it "removes parameters from the URL" do
expect(described_class.new(long_url).url)
.not_to include("?pf_rd_r")
end
end

describe "#to_html" do
Expand Down

0 comments on commit d84e820

Please sign in to comment.