From e415a1351bf3e7eff0a33574400fa89c0941d9bf Mon Sep 17 00:00:00 2001 From: "Scott J. Goldman" Date: Fri, 31 Aug 2012 22:47:19 -0700 Subject: [PATCH 1/3] When testing if a blob is indexable, check size first Otherwise, charlock_holmes will allocate another large binary buffer for testing the encoding, which is a problem if the binary blob is many hundreds of MB large. It'll just fail and crash ruby. --- github-linguist.gemspec | 1 + lib/linguist/blob_helper.rb | 6 +++--- test/test_blob.rb | 7 +++++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/github-linguist.gemspec b/github-linguist.gemspec index 246824ac0a..3b1e37019b 100644 --- a/github-linguist.gemspec +++ b/github-linguist.gemspec @@ -12,6 +12,7 @@ Gem::Specification.new do |s| s.add_dependency 'escape_utils', '~> 0.2.3' s.add_dependency 'mime-types', '~> 1.19' s.add_dependency 'pygments.rb', '>= 0.2.13' + s.add_development_dependency 'mocha' s.add_development_dependency 'json' s.add_development_dependency 'rake' s.add_development_dependency 'yajl-ruby' diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index b2d72f1140..fbbaff9cfe 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -250,7 +250,9 @@ def generated? # # Return true or false def indexable? - if binary? + if size > 100 * 1024 + false + elsif binary? false elsif extname == '.txt' true @@ -260,8 +262,6 @@ def indexable? false elsif generated? false - elsif size > 100 * 1024 - false else true end diff --git a/test/test_blob.rb b/test/test_blob.rb index 0832b8fdb4..17e9ef8c20 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -2,6 +2,7 @@ require 'linguist/samples' require 'test/unit' +require 'mocha' require 'mime/types' require 'pygments' @@ -261,6 +262,12 @@ def test_indexable assert !blob("Text/dump.sql").indexable? assert !blob("Binary/github.po").indexable? assert !blob("Binary/linguist.gem").indexable? + + # large binary blobs should fail on size check first, not call + # into charlock_holmes and alloc big buffers for testing encoding + b = blob("Binary/octocat.ai") + b.expects(:binary?).never + assert !b.indexable? end def test_language From 04394750e77ed8dc71b93a22ab3c9c8b79cbf57e Mon Sep 17 00:00:00 2001 From: "Scott J. Goldman" Date: Sat, 1 Sep 2012 22:59:49 -0700 Subject: [PATCH 2/3] When testing if a blob is safe to colorize, check size first Similar to e415a13 --- lib/linguist/blob_helper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index fbbaff9cfe..04c3f0a9a9 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -160,7 +160,7 @@ def large? # # Return true or false def safe_to_colorize? - text? && !large? && !high_ratio_of_long_lines? + !large? && text? && !high_ratio_of_long_lines? end # Internal: Does the blob have a ratio of long lines? From fc435a254171ff7ac8ff1291fc03a071add3c346 Mon Sep 17 00:00:00 2001 From: "Scott J. Goldman" Date: Fri, 31 Aug 2012 22:49:54 -0700 Subject: [PATCH 3/3] Linguist 2.3.2 --- github-linguist.gemspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/github-linguist.gemspec b/github-linguist.gemspec index 3b1e37019b..14590ce00e 100644 --- a/github-linguist.gemspec +++ b/github-linguist.gemspec @@ -1,6 +1,6 @@ Gem::Specification.new do |s| s.name = 'github-linguist' - s.version = '2.3.1' + s.version = '2.3.2' s.summary = "GitHub Language detection" s.authors = "GitHub"