Skip to content

Commit

Permalink
Added new IsIgnoreRobotsDotTextIfRootDisallowedEnabled config value w…
Browse files Browse the repository at this point in the history
…hich completely ignores the robots.txt file if it blocks the root. I.e., act as if there is no robots.txt file.
  • Loading branch information
sjdirect committed Mar 31, 2015
1 parent 21bed57 commit 9bd3d7d
Show file tree
Hide file tree
Showing 9 changed files with 56 additions and 3 deletions.
1 change: 1 addition & 0 deletions Abot.Demo/App.config
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
isRespectRobotsDotTextEnabled="false"
isRespectMetaRobotsNoFollowEnabled="false"
isRespectAnchorRelNoFollowEnabled="false"
isIgnoreRobotsDotTextIfRootDisallowedEnabled="false"
robotsDotTextUserAgentString="abot"
maxRobotsDotTextCrawlDelayInSeconds="5"
minCrawlDelayPerDomainMilliSeconds="1000"/>
Expand Down
1 change: 1 addition & 0 deletions Abot.Tests.Integration/App.config
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
isRespectRobotsDotTextEnabled="false"
isRespectMetaRobotsNoFollowEnabled="false"
isRespectAnchorRelNoFollowEnabled="false"
isIgnoreRobotsDotTextIfRootDisallowedEnabled="false"
robotsDotTextUserAgentString="abot"
maxRobotsDotTextCrawlDelayInSeconds="5"
minCrawlDelayPerDomainMilliSeconds="0" />
Expand Down
3 changes: 2 additions & 1 deletion Abot.Tests.Unit/App.config
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@
<politeness
isRespectRobotsDotTextEnabled="true"
isRespectMetaRobotsNoFollowEnabled="true"
isRespectAnchorRelNoFollowEnabled="true"
isRespectAnchorRelNoFollowEnabled="true"
isIgnoreRobotsDotTextIfRootDisallowedEnabled="true"
robotsDotTextUserAgentString="zzzz"
maxRobotsDotTextCrawlDelayInSeconds="5"
minCrawlDelayPerDomainMilliSeconds="55" />
Expand Down
3 changes: 3 additions & 0 deletions Abot.Tests.Unit/Core/AbotConfigurationSectionHandlerTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public void GetSetion_FillsConfigValuesFromAppConfigFile()
Assert.AreEqual(true, _uut.Politeness.IsRespectRobotsDotTextEnabled);
Assert.AreEqual(true, _uut.Politeness.IsRespectMetaRobotsNoFollowEnabled);
Assert.AreEqual(true, _uut.Politeness.IsRespectAnchorRelNoFollowEnabled);
Assert.AreEqual(true, _uut.Politeness.IsIgnoreRobotsDotTextIfRootDisallowedEnabled);
Assert.AreEqual("zzzz", _uut.Politeness.RobotsDotTextUserAgentString);
Assert.AreEqual(55, _uut.Politeness.MinCrawlDelayPerDomainMilliSeconds);
Assert.AreEqual(5, _uut.Politeness.MaxRobotsDotTextCrawlDelayInSeconds);
Expand Down Expand Up @@ -83,6 +84,7 @@ public void Convert_CovertsFromSectionObjectToDtoObject()
Assert.AreEqual(result.MinRetryDelayInMilliseconds, _uut.CrawlBehavior.MinRetryDelayInMilliseconds);

Assert.AreEqual(result.IsRespectRobotsDotTextEnabled, _uut.Politeness.IsRespectRobotsDotTextEnabled);
Assert.AreEqual(result.IsIgnoreRobotsDotTextIfRootDisallowedEnabled, _uut.Politeness.IsIgnoreRobotsDotTextIfRootDisallowedEnabled);
Assert.AreEqual(result.RobotsDotTextUserAgentString, _uut.Politeness.RobotsDotTextUserAgentString);
Assert.AreEqual(result.MinCrawlDelayPerDomainMilliSeconds, _uut.Politeness.MinCrawlDelayPerDomainMilliSeconds);
Assert.AreEqual(result.MaxRobotsDotTextCrawlDelayInSeconds, _uut.Politeness.MaxRobotsDotTextCrawlDelayInSeconds);
Expand All @@ -106,6 +108,7 @@ public void SectionHandlerDefaults_MatchPocoDefaults()
Assert.AreEqual(pocoDefaults.IsRespectRobotsDotTextEnabled, _uut.Politeness.IsRespectRobotsDotTextEnabled);
Assert.AreEqual(pocoDefaults.IsRespectMetaRobotsNoFollowEnabled, _uut.Politeness.IsRespectMetaRobotsNoFollowEnabled);
Assert.AreEqual(pocoDefaults.IsRespectAnchorRelNoFollowEnabled, _uut.Politeness.IsRespectAnchorRelNoFollowEnabled);
Assert.AreEqual(pocoDefaults.IsIgnoreRobotsDotTextIfRootDisallowedEnabled, _uut.Politeness.IsIgnoreRobotsDotTextIfRootDisallowedEnabled);
Assert.AreEqual(pocoDefaults.IsUriRecrawlingEnabled, _uut.CrawlBehavior.IsUriRecrawlingEnabled);
Assert.AreEqual(pocoDefaults.MaxConcurrentThreads, _uut.CrawlBehavior.MaxConcurrentThreads);
Assert.AreEqual(pocoDefaults.MaxRobotsDotTextCrawlDelayInSeconds, _uut.Politeness.MaxRobotsDotTextCrawlDelayInSeconds);
Expand Down
28 changes: 28 additions & 0 deletions Abot.Tests.Unit/Crawler/PoliteWebCrawlerTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,34 @@ public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_PageIsDisallowed
_fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(0));
}

[Test]
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_PageIsDisallowed_IsIgnoreRobotsDotTextIfRootDisallowedEnabledTrue_CallsHttpRequester()
{
    // Arrange: robots.txt disallows every url, but the ignore-if-root-disallowed
    // override is enabled, so the crawler should still make the http request.
    // (Removed an unused local CrawledPage "homePage" that was never referenced.)
    CrawledPage page1 = new CrawledPage(_rootUri);

    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(false);
    _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);
    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision {Allow = true});
    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
    _dummyConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled = true;
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    // Act
    _unitUnderTest.Crawl(_rootUri);

    // Assert: every Setup above was exercised — in particular MakeRequest was
    // called even though robots.txt disallowed the root uri.
    _fakeCrawlDecisionMaker.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeHttpRequester.VerifyAll();
}

[Test]
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_UsesCorrectUserAgentString()
{
Expand Down
6 changes: 6 additions & 0 deletions Abot/Core/AbotConfigurationSectionHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ public bool IsRespectAnchorRelNoFollowEnabled
get { return (bool)this["isRespectAnchorRelNoFollowEnabled"]; }
}

/// <summary>
/// Maps the "isIgnoreRobotsDotTextIfRootDisallowedEnabled" attribute of the
/// politeness config element. Not required; when the attribute is omitted the
/// indexer yields default(bool), i.e. false.
/// </summary>
[ConfigurationProperty("isIgnoreRobotsDotTextIfRootDisallowedEnabled", IsRequired = false)]
public bool IsIgnoreRobotsDotTextIfRootDisallowedEnabled
{
get { return (bool)this["isIgnoreRobotsDotTextIfRootDisallowedEnabled"]; }
}

[ConfigurationProperty("robotsDotTextUserAgentString", IsRequired = false, DefaultValue = "abot")]
public string RobotsDotTextUserAgentString
{
Expand Down
9 changes: 8 additions & 1 deletion Abot/Crawler/PoliteWebCrawler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,14 @@ protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
if (_robotsDotText != null)
allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);

if (!allowedByRobots)
if (!allowedByRobots && pageToCrawl.IsRoot && _crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled)
{
string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
_logger.DebugFormat(message);
allowedByRobots = true;
_robotsDotText = null;
}
else if (!allowedByRobots)
{
string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
_logger.DebugFormat(message);
Expand Down
7 changes: 6 additions & 1 deletion Abot/Poco/CrawlConfiguration.cs
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,11 @@ public CrawlConfiguration()
/// </summary>
public bool IsRespectAnchorRelNoFollowEnabled { get; set; }

/// <summary>
/// If true and the robots.txt file disallows crawling the root uri, the whole
/// robots.txt file is discarded for that site and the crawl proceeds as if no
/// robots.txt file exists. Presumably only consulted when
/// IsRespectRobotsDotTextEnabled is also true — confirm against PoliteWebCrawler.
/// </summary>
public bool IsIgnoreRobotsDotTextIfRootDisallowedEnabled { get; set; }

/// <summary>
/// The user agent string to use when checking robots.txt file for specific directives. Some examples of other crawler's user agent values are "googlebot", "slurp" etc...
/// </summary>
Expand All @@ -190,7 +195,7 @@ public CrawlConfiguration()
/// <summary>
/// The minimum number of milliseconds to wait between http requests to the
/// same domain (politeness throttle).
/// </summary>
public int MinCrawlDelayPerDomainMilliSeconds { get; set; }

#endregion
}
}
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ using Abot.Poco;
isRespectRobotsDotTextEnabled="false"
isRespectMetaRobotsNoFollowEnabled="false"
isRespectAnchorRelNoFollowEnabled="false"
isIgnoreRobotsDotTextIfRootDisallowedEnabled="true"
robotsDotTextUserAgentString="abot"
maxRobotsDotTextCrawlDelayInSeconds="5"
minCrawlDelayPerDomainMilliSeconds="0"/>
Expand Down

0 comments on commit 9bd3d7d

Please sign in to comment.