A perennial problem in web development is the requirement to verify that a piece of web content has been published. Recently I had a requirement to write a service that would check that a number of Sitecore items (Articles) had actually been published before the site published the URLs pointing to that content.
While it is trivial to identify whether content can be accessed, the complications that follow from this essentially revolve around the fact that querying a web page is a non-deterministic operation: it may complete or it may not. In the context of serving a web page, it is normally assumed that most operations are, if not synchronous, then at least short-lived and deterministic. So if we need to verify that 10 or more Articles are, in fact, live, we need to do so in a way that will not block if there are problems with the retrieval of any of the Articles.
One solution to this would be to make the WebRequests asynchronously so that some of the requests could be made concurrently, but although this improves the situation, we still have a problem if the calls do not complete in a timely manner. An elegant solution to this is to make a set of paired tasks where each fetch task is paired with a delay task of finite duration. By doing this we are effectively saying “Try and retrieve the web page, but if the delay terminates first, end the task pair”.
The code below implements a service that offers functionality to retrieve the HttpStatusCode of the pages supplied, in two versions: with and without paired delay Tasks. The non-delay pair version, while simpler, could theoretically block, because although HttpClient.GetAsync is used, it depends on all calls ultimately returning with a code at some point. This could be only after minutes have elapsed, depending on configuration.
The delay pair implementation, by contrast, can only take a maximum of n milliseconds per call, where n is the number of milliseconds supplied in the call. While a timeout of, say, 2 seconds is quite a significant period of time when applied to ten or more requests, if they all took the maximum time, chances are the website being accessed has bigger problems than a couple of unpublished pages.
I have also included the tests (properly described as integration tests, not unit tests) written for this code at the end of this article.
namespace MyProject.Services
{
    /// <summary>
    /// Requests the URL of each supplied article and reports the HTTP status code that came back,
    /// optionally giving up on any individual request after a caller-supplied timeout.
    /// </summary>
    public class LiveArticleVerificationService
    {
        private const string LogErrorString = "WebCheckService:ProcessUrlAsync Exception Thrown {0}, Article ID={1} Url={2}";

        /// <summary>
        /// Synchronous entry point: checks every article and blocks the calling thread until all
        /// checks have either completed or timed out.
        /// </summary>
        /// <param name="articles">The articles whose URLs should be requested.</param>
        /// <param name="timeOutMilliseconds">
        /// Maximum time to wait for each request, in milliseconds. When <c>null</c>, each request
        /// is awaited until the HTTP client itself completes or fails it.
        /// </param>
        /// <returns>One <see cref="LiveArticle"/> per distinct article Id.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="articles"/> is <c>null</c>.</exception>
        public List<LiveArticle> Run(List<Article> articles, int? timeOutMilliseconds = null)
        {
            // GetAwaiter().GetResult() rethrows the original exception rather than wrapping it in
            // an AggregateException. Every await below uses ConfigureAwait(false), so blocking here
            // does not depend on the caller's synchronization context being free.
            return RunAsync(articles, timeOutMilliseconds).GetAwaiter().GetResult();
        }

        /// <summary>
        /// Asynchronous counterpart of <see cref="Run"/> for callers that can await the result.
        /// </summary>
        /// <param name="articles">The articles whose URLs should be requested.</param>
        /// <param name="timeOutMilliseconds">
        /// Maximum time to wait for each request, in milliseconds, or <c>null</c> for no
        /// additional limit.
        /// </param>
        /// <returns>
        /// One <see cref="LiveArticle"/> per distinct article Id. A request that did not finish
        /// within the timeout is reported as <see cref="HttpStatusCode.RequestTimeout"/>; a request
        /// that threw is reported as <see cref="HttpStatusCode.InternalServerError"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="articles"/> is <c>null</c>.</exception>
        public async Task<List<LiveArticle>> RunAsync(List<Article> articles, int? timeOutMilliseconds = null)
        {
            if (articles == null)
            {
                throw new ArgumentNullException("articles");
            }

            if (articles.Count == 0)
            {
                return new List<LiveArticle>();
            }

            TimeSpan? timeout = timeOutMilliseconds.HasValue
                ? TimeSpan.FromMilliseconds(timeOutMilliseconds.Value)
                : (TimeSpan?) null;

            // Results are collected per call rather than in an instance field, so running the
            // service twice on the same instance neither leaks earlier results nor throws on a
            // key that an earlier run already added.
            var liveArticles = new Dictionary<Guid, LiveArticle>();

            using (var client = new HttpClient { MaxResponseContentBufferSize = 1000000 })
            {
                // ToList() starts every request immediately so that they run concurrently.
                List<Task<KeyValuePair<Article, HttpStatusCode>>> checks = articles
                    .Select(article => timeout.HasValue
                        ? ProcessUrlWithTimeOutAsync(article, client, timeout.Value)
                        : ProcessUrlAsync(article, client))
                    .ToList();

                KeyValuePair<Article, HttpStatusCode>[] outcomes =
                    await Task.WhenAll(checks).ConfigureAwait(false);

                // Requests that lost the race against their delay are still in flight; cancel them
                // instead of leaving them running after the result has been reported. Each
                // cancelled request is then logged by the catch block in ProcessUrlAsync.
                client.CancelPendingRequests();

                foreach (KeyValuePair<Article, HttpStatusCode> outcome in outcomes)
                {
                    // Indexer assignment: if two articles share an Id the later one wins instead
                    // of Dictionary.Add throwing.
                    liveArticles[new Guid(outcome.Key.Id)] = new LiveArticle(outcome.Key, outcome.Value);
                }
            }

            return liveArticles.Values.ToList();
        }

        /// <summary>
        /// Requests a single article URL and pairs the article with the resulting status code.
        /// Any exception is logged and reported as <see cref="HttpStatusCode.InternalServerError"/>.
        /// </summary>
        private async Task<KeyValuePair<Article, HttpStatusCode>> ProcessUrlAsync(Article article, HttpClient client)
        {
            try
            {
                // Only the status code is needed, so the response is released as soon as it is read.
                using (HttpResponseMessage response = await client.GetAsync(article.Url).ConfigureAwait(false))
                {
                    return new KeyValuePair<Article, HttpStatusCode>(article, response.StatusCode);
                }
            }
            catch (Exception ex)
            {
                Logger.Error(string.Format(LogErrorString, ex.Message, article.Id, article.Url));
            }

            return new KeyValuePair<Article, HttpStatusCode>(article, HttpStatusCode.InternalServerError);
        }

        /// <summary>
        /// Races the request for one article against a delay of <paramref name="timeout"/>.
        /// If the delay finishes first the article is reported as
        /// <see cref="HttpStatusCode.RequestTimeout"/>.
        /// </summary>
        private async Task<KeyValuePair<Article, HttpStatusCode>> ProcessUrlWithTimeOutAsync(
            Article article, HttpClient client, TimeSpan timeout)
        {
            Task<KeyValuePair<Article, HttpStatusCode>> fetchTask = ProcessUrlAsync(article, client);
            Task firstToFinish = await Task.WhenAny(fetchTask, Task.Delay(timeout)).ConfigureAwait(false);

            if (firstToFinish == fetchTask)
            {
                // Already completed, so this await returns immediately with the fetched status.
                return await fetchTask.ConfigureAwait(false);
            }

            return new KeyValuePair<Article, HttpStatusCode>(article, HttpStatusCode.RequestTimeout);
        }
    }
}
The following integration tests exercise the verification service code under a number of conditions: pages that exist, pages that do not exist, and a range of timeout values.
using NUnit.Framework;

namespace MyProject.Services.Tests.ArticleManagement
{
    /// <summary>
    /// Integration tests for <see cref="LiveArticleVerificationService"/>.
    /// They request real public web sites, so they need network access and their outcome
    /// depends on how those sites respond at the time of the run.
    /// </summary>
    [TestFixture]
    public class LiveArticleVerificationServiceTests
    {
        // Status codes the service is allowed to report in the tests that tolerate a mix of outcomes.
        private readonly HttpStatusCode[] _expectedHttpStatusCodes =
        {
            HttpStatusCode.OK,
            HttpStatusCode.NotFound,
            HttpStatusCode.RequestTimeout
        };

        private List<Article> _articles;

        /// <summary>Rebuilds the list of four existing pages before every test.</summary>
        [SetUp]
        public void SetUp()
        {
            _articles = new List<Article>
            {
                NewArticle("http://www.google.com/"),
                NewArticle("http://www.bbc.co.uk/"),
                NewArticle("http://www.guardian.co.uk/"),
                NewArticle("http://facebook.com")
            };
        }

        /// <summary>
        /// Seven URLs (four existing, three made up), no timeout: every article is reported and
        /// every reported status is one of the expected codes.
        /// </summary>
        [Test]
        public void VerificationServiceReturns200ForExtantUrlsAnd404ForNonExistantNoTimeOut()
        {
            _articles.Add(NewArticle("http://www.google.com/wombat"));
            _articles.Add(NewArticle("http://www.google.com/banana"));
            _articles.Add(NewArticle("http://www.google.com/aardvark"));

            List<LiveArticle> liveArticles = new LiveArticleVerificationService().Run(_articles);
            List<HttpStatusCode> statuses = StatusesOf(liveArticles);

            Assert.AreEqual(7, liveArticles.Count);
            Assert.IsFalse(statuses.Except(_expectedHttpStatusCodes).Any());
        }

        /// <summary>Four existing URLs, no timeout: at least one is reported as 200 OK.</summary>
        [Test]
        public void VerificationServiceReturns200ForExtantUrlsNoTimeOut()
        {
            List<LiveArticle> liveArticles = new LiveArticleVerificationService().Run(_articles);
            List<HttpStatusCode> statuses = StatusesOf(liveArticles);

            Assert.AreEqual(4, liveArticles.Count);
            CollectionAssert.Contains(statuses, HttpStatusCode.OK);
        }

        /// <summary>Four existing URLs, 1000 ms timeout: at least one is reported as 200 OK.</summary>
        [Test]
        public void VerificationServiceReturns200ForExtantUrlsWithTimeOut1000()
        {
            List<LiveArticle> liveArticles = new LiveArticleVerificationService().Run(_articles, 1000);
            List<HttpStatusCode> statuses = StatusesOf(liveArticles);

            Assert.AreEqual(4, liveArticles.Count);
            CollectionAssert.Contains(statuses, HttpStatusCode.OK);
        }

        /// <summary>
        /// Four existing URLs, 500 ms timeout: every reported status is one of the expected codes.
        /// </summary>
        [Test]
        public void VerificationServiceReturns200ForExtantUrlsWithTimeOut500()
        {
            List<LiveArticle> liveArticles = new LiveArticleVerificationService().Run(_articles, 500);
            List<HttpStatusCode> statuses = StatusesOf(liveArticles);

            Assert.AreEqual(4, liveArticles.Count);
            Assert.IsFalse(statuses.Except(_expectedHttpStatusCodes).Any());
        }

        /// <summary>
        /// Four existing URLs, 100 ms timeout: at least one is reported as 408 Request Timeout.
        /// </summary>
        [Test]
        public void VerificationServiceReturns408ForTimedOutWithTooShortTimeTimeOut()
        {
            List<LiveArticle> liveArticles = new LiveArticleVerificationService().Run(_articles, 100);
            List<HttpStatusCode> statuses = StatusesOf(liveArticles);

            Assert.AreEqual(4, liveArticles.Count);
            CollectionAssert.Contains(statuses, HttpStatusCode.RequestTimeout);
        }

        // Builds an article with a fresh random Id pointing at the given URL.
        private static Article NewArticle(string url)
        {
            return new Article {Id = Guid.NewGuid().ToString(), Url = url};
        }

        // Projects the service results onto just their status codes.
        private static List<HttpStatusCode> StatusesOf(IEnumerable<LiveArticle> liveArticles)
        {
            return liveArticles.Select(liveArticle => liveArticle.Status).ToList();
        }
    }
}