Web Scraping material


Scraper


using System;
using System.Collections.Generic;
using System.Collections.Concurrent;
using System.Linq;
using System.Text;
using log4net;

using FashionExchange.Common.DAL;
using FashionExchange.Common.BLL;
using HtmlAgilityPack;
using System.Net;
using System.Threading;
using FashionExchange.Common.Utils;
using System.IO;
using System.Web.Hosting;
using FashionExchange.Common.Enums;
using OpenQA.Selenium.Remote;

namespace FashionExchange.Common.Scrapers
{
    public abstract class Scraper
    {
        /// <summary>
        /// Logger named after the concrete scraper class. It is requested from log4net's
        /// LogManager on every access rather than stored in a field.
        /// </summary>
        protected ILog log
        {
            get
            {
                string loggerName = GetType().Name;
                return LogManager.GetLogger(loggerName);
            }
        }

        /// <summary>
        /// Store matched to this scraper by the name of its concrete class.
        /// The lookup goes through StoreManager on every access; Scrape treats a null
        /// result as "store not found in database".
        /// </summary>
        public Store Store
        {
            get
            {
                string scraperClassName = GetType().Name;
                return StoreManager.GetStoreByScraperClassName(scraperClassName);
            }
        }

        // Backing field for SiteMap; stays null until the property is first read.
        protected SiteMap _siteMap = null;

        /// <summary>
        /// Site map for the store, created from Store.Url on first access and reused afterwards.
        /// </summary>
        protected virtual SiteMap SiteMap
        {
            get { return _siteMap ?? (_siteMap = new SiteMap(Store.Url)); }
        }
        // Per-run tallies; Scrape copies them into the returned ScrapeResult.
        protected int productAddedCount = 0, productDeletedCount = 0, productUpdatedCount = 0, productMergedCount = 0;
        protected int warningCount = 0, errorCount = 0;
        // Number of distinct product urls selected for the current run.
        protected int productUrlCount = 0;
        // Errors grouped by stack trace: stack trace -> (first exception seen with that trace, urls that hit it).
        private Dictionary<string, KeyValuePair<Exception, List<string>>> scrapeErrorDict = new Dictionary<string, KeyValuePair<Exception, List<string>>>();
        // Names of the products added or updated during this run, compared case-insensitively.
        private HashSet<string> updatedProductNames = new HashSet<string>(StringComparer.InvariantCultureIgnoreCase);

        // Per-scraper request settings. Every base implementation returns null; concrete
        // scrapers override the ones they need.
        // NOTE(review): the two Dictionary declarations below carry no type arguments in this copy
        // of the file (they look stripped during extraction) - restore them from the original source.

        /// <summary>Cookies handed to DownloadWebPage when product pages are downloaded.</summary>
        protected virtual CookieContainer CookieContainer
        {
            get { return null; }
        }

        /// <summary>Proxy handed to DownloadWebPage when product pages are downloaded.</summary>
        protected virtual WebProxy Proxy
        {
            get { return null; }
        }

        /// <summary>Http headers handed to DownloadWebPage when product pages are downloaded.</summary>
        protected virtual Dictionary HttpHeaders
        {
            get { return null; }
        }

        /// <summary>
        /// Presumably the http headers for product photo requests; it is not referenced in the
        /// visible part of this class - TODO confirm against its callers.
        /// </summary>
        protected virtual Dictionary HttpHeadersForProductPhoto
        {
            get { return null; }
        }

        /// <summary>
        /// Selenium driver handed to DownloadWebPage; Scrape disposes it when the run ends.
        /// </summary>
        protected virtual RemoteWebDriver WebDriver
        {
            get { return null; }
        }

        // Scraping context for the page currently being parsed. Each backing field is
        // [ThreadStatic], so every thread reads and writes its own copy; the values are
        // assigned in SetupAndGetProducts just before GetProducts is called.

        [ThreadStatic]
        private static HtmlDocument _productPage;

        /// <summary>Html document of the product page being parsed on the current thread.</summary>
        protected HtmlDocument ProductPage
        {
            get
            {
                return _productPage;
            }
            private set
            {
                _productPage = value;
            }
        }

        [ThreadStatic]
        private static WebException _webException;

        /// <summary>Download error recorded for the current page, or null when there was none.</summary>
        protected WebException WebException
        {
            get
            {
                return _webException;
            }
            private set
            {
                _webException = value;
            }
        }

        [ThreadStatic]
        private static string _productUrl;

        /// <summary>Url the current product page was requested with.</summary>
        protected string ProductUrl
        {
            get
            {
                return _productUrl;
            }
            private set
            {
                _productUrl = value;
            }
        }

        [ThreadStatic]
        private static string _redirectedProductUrl;

        /// <summary>Url the request was redirected to; left unset when no redirect happened.</summary>
        protected string RedirectedProductUrl
        {
            get
            {
                return _redirectedProductUrl;
            }
            private set
            {
                _redirectedProductUrl = value;
            }
        }

        // abstract methods

        /// <summary>Collects the product urls to scrape from the store's website.</summary>
        protected abstract List<string> GetProductUrls();

        /// <summary>
        /// Builds the products found on the current page. Called by SetupAndGetProducts after
        /// ProductPage, ProductUrl, RedirectedProductUrl and WebException have been set.
        /// </summary>
        protected abstract List<Product> GetProducts();

        // The single-value extractors below are not called in the visible part of this class;
        // presumably concrete scrapers use them from GetProducts against ProductPage - TODO confirm.
        protected abstract string GetName();
        protected abstract decimal GetPrice();
        protected abstract decimal GetSalePrice();
        protected abstract string GetCategory();
        protected abstract string GetBrand();
        protected abstract string GetDescription();
        protected abstract string GetImageUrl();

        // virtual methods
        // NOTE(review): the List return types below carry no type argument in this copy of the
        // file (they look stripped during extraction). The element types cannot be determined
        // from the visible code - restore them from the original source.
        // Default implementation: returns a new, empty list. Presumably overridden by scrapers
        // whose products have variations - TODO confirm.
        protected virtual List GetVariations()
        {
            return new List();
        }
        // Default implementation: returns a new, empty list. Presumably overridden by scrapers
        // that extract product sizes - TODO confirm.
        protected virtual List GetSizes()
        {
            return new List();
        }
        
        /// <summary>
        /// Downloads a web page for callers that do not need the final (possibly redirected) url.
        /// Every argument is forwarded unchanged to the overload that reports the redirected url.
        /// </summary>
        protected HtmlDocument DownloadWebPage(string url, CookieContainer cookieContainer = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null, Dictionary httpHeaders = null, bool checkUrlLoaded = true, RemoteWebDriver webDriver = null)
        {
            // The redirected url is required by the overload but is of no interest here.
            string discardedRedirectUrl;
            HtmlDocument page = DownloadWebPage(
                url,
                out discardedRedirectUrl,
                cookieContainer: cookieContainer,
                retryOnError: retryOnError,
                retryOnTimeout: retryOnTimeout,
                retryCount: retryCount,
                retryDelayInMilisecond: retryDelayInMilisecond,
                proxy: proxy,
                httpHeaders: httpHeaders,
                checkUrlLoaded: checkUrlLoaded,
                webDriver: webDriver);
            return page;
        }
        /// <summary>
        /// Downloads a web page through ScraperUtil.LoadHtml, forwarding every argument as given.
        /// Virtual so that a concrete scraper can replace the download mechanism.
        /// </summary>
        /// <param name="url">Url to download.</param>
        /// <param name="redirectedUrl">Set by ScraperUtil.LoadHtml; DownloadProductPages compares it with the requested url to detect a redirect.</param>
        protected virtual HtmlDocument DownloadWebPage(string url, out string redirectedUrl, CookieContainer cookieContainer = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null, Dictionary httpHeaders = null, bool checkUrlLoaded = true, RemoteWebDriver webDriver = null)
        {
            HtmlDocument loadedPage = ScraperUtil.LoadHtml(
                url,
                out redirectedUrl,
                cookieContainer: cookieContainer,
                retryOnError: retryOnError,
                retryOnTimeout: retryOnTimeout,
                retryCount: retryCount,
                retryDelayInMilisecond: retryDelayInMilisecond,
                proxy: proxy,
                httpHeaders: httpHeaders,
                checkUrlLoaded: checkUrlLoaded,
                webDriver: webDriver);
            return loadedPage;
        }

        /// <summary>
        /// Stores the page being parsed in the current thread's scraping context (ProductPage,
        /// ProductUrl, RedirectedProductUrl, WebException) and then asks the concrete scraper
        /// for the products on that page.
        /// </summary>
        /// <param name="productPage">Html document of the downloaded page.</param>
        /// <param name="productUrl">Url the page was requested with.</param>
        /// <param name="redirectedProductUrl">Url the request was redirected to, if any.</param>
        /// <param name="webException">Download error recorded for the page, if any.</param>
        /// <returns>Whatever the concrete scraper's GetProducts returns for this page.</returns>
        private List<Product> SetupAndGetProducts(HtmlDocument productPage, string productUrl, string redirectedProductUrl, WebException webException)
        {
            // The context must be in place before GetProducts runs: the concrete scraper
            // reads it through the protected properties.
            ProductPage = productPage;
            ProductUrl = productUrl;
            RedirectedProductUrl = redirectedProductUrl;
            WebException = webException;

            return GetProducts();
        }

        /// <summary>
        /// Runs one scrape of the store: builds the list of product urls, downloads and parses
        /// them on worker threads, and returns the statistics and errors of the run.
        /// </summary>
        /// <param name="scrapeNewUrl">When true, product urls are collected from the website through GetProductUrls.</param>
        /// <param name="scrapeExistingUrlAgeInHour">When set, urls of stored products whose DateUpdated is older than this many hours are scraped again.</param>
        /// <param name="scrapeExistingUrlPriceDroppedInDay">When set, urls of stored products that have a price record newer than the look-back date with a price above the product's current price are scraped again.</param>
        /// <returns>Counters, timings and grouped errors for the run.</returns>
        /// <exception cref="Exception">Thrown when no store is registered for this scraper class.</exception>
        public ScrapeResult Scrape(bool scrapeNewUrl, double? scrapeExistingUrlAgeInHour, int? scrapeExistingUrlPriceDroppedInDay)
        {
            log.InfoFormat("Scraping for store {0} has started. Scrape New Url: {1} Scrape Existing Url Age (Hour): {2} Scrape Existing Url Price Dropped (Day): {3}", this.GetType().Name, scrapeNewUrl, scrapeExistingUrlAgeInHour, scrapeExistingUrlPriceDroppedInDay);

            DateTime startTime = DateTime.Now;
            int newProductUrlCount = 0, existingProductUrlCount = 0;

            if (Store == null)
            {
                log.ErrorFormat("Store not found in database: Scraper Class Name={0}", this.GetType().Name);
                errorCount++;
                throw new Exception(String.Format("Store not found in database: Scraper Class Name={0}", this.GetType().Name));
            }

            // scrape all product urls from website and merge with existing product urls in database
            List<string> productUrls = new List<string>();
            try
            {
                // Get product urls from website
                if (scrapeNewUrl)
                {
                    List<string> newProductUrls = GetProductUrls();
                    if (!newProductUrls.Any())
                        throw new Exception("No new product urls found");

                    newProductUrls = newProductUrls.Where(p => !String.IsNullOrEmpty(p)).Select(p => WebUtility.HtmlDecode(p.Trim())).Distinct().ToList();
                    newProductUrlCount = newProductUrls.Count;
                    productUrls.AddRange(newProductUrls);
                }

                // Get product urls for existing products
                if (scrapeExistingUrlAgeInHour.HasValue)
                {
                    DateTime urlAgeDateTime = DateTime.Now.AddHours(-scrapeExistingUrlAgeInHour.Value);
                    List<string> existingProductUrls = ProductManager.GetProductsByStore(Store.Id).Where(m => m.DateUpdated < urlAgeDateTime).Select(p => p.Url).Distinct().ToList();
                    existingProductUrlCount += existingProductUrls.Count;
                    productUrls.InsertRange(0, existingProductUrls); // scrape existing product urls first
                }

                // Get product urls for existing products that have been price dropped
                if (scrapeExistingUrlPriceDroppedInDay.HasValue)
                {
                    List<string> existingPriceDroppedProductUrls = DataContext.GetCurrentDataContext().ExecuteStoreQuery<string>(
                        @"Select Product.Url
                        from ProductPrice with(nolock)
                        inner join Product with(nolock) on Product.Id = ProductPrice.ProductId
                        where StoreId = {0} and Product.Deleted = 0 and ProductPrice.DateCreated > {1}
                        and CASE WHEN ProductPrice.SalePrice IS NULL THEN ProductPrice.Price ELSE ProductPrice.SalePrice END 
                        > CASE WHEN Product.SalePrice IS NULL THEN Product.Price ELSE Product.SalePrice END
                        group by ProductPrice.ProductId, Product.Url",
                        Store.Id,
                        DateTime.Today.AddDays(-scrapeExistingUrlPriceDroppedInDay.Value - 1)).ToList();
                    // Accumulate instead of overwriting, so the urls selected by the age filter
                    // above stay in the reported total when both filters are active.
                    existingProductUrlCount += existingPriceDroppedProductUrls.Count;
                    productUrls.InsertRange(0, existingPriceDroppedProductUrls); // scrape existing product urls first
                }

                // merge new product urls and existing product urls
                productUrls = productUrls.Distinct().ToList();
                productUrlCount = productUrls.Count;

                // download product pages async
                List<Thread> downloadThreads = new List<Thread>();
                ConcurrentQueue<string> productUrlQ = new ConcurrentQueue<string>(productUrls);
                ConcurrentQueue<ProductPage> productPageQ = new ConcurrentQueue<ProductPage>();

                for (int i = 0; i < Store.DownloadThreadCount; i++)
                {
                    Thread thread = new Thread(new ThreadStart(() => DownloadProductPages(productUrlQ, productPageQ)));
                    downloadThreads.Add(thread);
                    thread.Start();
                }

                List<Thread> scrapeThreads = new List<Thread>();
                for (int i = 0; i < Store.ScrapeThreadCount; i++)
                {
                    Thread thread = new Thread(new ThreadStart(() => Scrape(productPageQ, downloadThreads)));
                    scrapeThreads.Add(thread);
                    thread.Start();
                }

                foreach (Thread thread in scrapeThreads)
                    thread.Join();

                // check if there are product urls still in the queue
                if (!productUrlQ.IsEmpty)
                {
                    // Read the count once so the error counter and the message agree even if a
                    // download thread is still draining the queue.
                    int remainingUrlCount = productUrlQ.Count;
                    errorCount += remainingUrlCount;
                    throw new TimeoutException("Scraper thread timed out. " + remainingUrlCount + " product urls still in queue");
                }
            }
            catch (Exception e)
            {
                log.ErrorFormat("An error has occured while scraping website: {0}", e.ToString());

                List<string> failedUrls = new List<string>();
                if (e.Data.Contains(ParameterInfo.ExceptionData.Url))
                {
                    object failedUrl = e.Data[ParameterInfo.ExceptionData.Url];
                    if (failedUrl != null)
                        failedUrls.Add(failedUrl.ToString());
                }

                // The worker threads may already have recorded an error under the same stack
                // trace; Dictionary.Add would then throw from inside this catch block.
                KeyValuePair<Exception, List<string>> knownError;
                if (scrapeErrorDict.TryGetValue(e.StackTrace, out knownError))
                    knownError.Value.AddRange(failedUrls);
                else
                    scrapeErrorDict.Add(e.StackTrace, new KeyValuePair<Exception, List<string>>(e, failedUrls));

                errorCount++;
            }
            finally
            {
                // Read the virtual property once so the instance that is null-checked is the
                // instance that gets disposed.
                RemoteWebDriver webDriver = WebDriver;
                if (webDriver != null)
                    webDriver.Dispose();
            }

            log.InfoFormat("Scraping for store {0} is completed.", Store.Name);
            log.InfoFormat("Products Added={0} Updated={1} Deleted={2} Merged={3}", productAddedCount, productUpdatedCount, productDeletedCount, productMergedCount);
            log.InfoFormat("Warning={0} Error={1}", warningCount, errorCount);

            // return result
            var storeProducts = ProductManager.GetProductsByStore(Store.Id);

            ScrapeResult result = new ScrapeResult();
            result.Store = Store;
            result.ScrapeType = StoreManager.GetScrapeType(ScrapeTypeCode.Web);
            result.NewProductUrlCount = newProductUrlCount;
            result.ExistingProductUrlCount = existingProductUrlCount;
            result.TotalDistinctProductUrlCount = productUrlCount;
            result.ProductAddedCount = productAddedCount;
            result.ProductUpdatedCount = productUpdatedCount;
            result.ProductDeletedCount = productDeletedCount;
            result.ProductMergedCount = productMergedCount;
            result.SaleProductCount = storeProducts.Where(m => m.SalePrice != null).Count();
            result.TotalProductCount = storeProducts.Count();
            result.WarningCount = warningCount;
            result.ErrorCount = errorCount;
            result.DownloadThreadCount = Store.DownloadThreadCount;
            result.ScrapeThreadCount = Store.ScrapeThreadCount;
            result.StartDateTime = startTime;
            result.EndDateTime = DateTime.Now;
            result.ProductWithProductSizeCount = storeProducts.Where(m => m.ProductSizes.Any()).Count();
            result.ProductSizeAvailableCount = storeProducts.Where(m => m.ProductSizes.Any()).SelectMany(m => m.ProductSizes).Where(m => m.Available).Count();
            result.ProductSizeWithColourCount = storeProducts.Where(m => m.ProductSizes.Any()).SelectMany(m => m.ProductSizes).Where(m => m.Colour != null).Count();
            result.TotalProductSizeCount = storeProducts.Where(m => m.ProductSizes.Any()).SelectMany(m => m.ProductSizes).Count();
            result.ProductWithBrandCount = storeProducts.Where(m => m.BrandUnparsed != null).Count();
            result.UniqueBrandCount = storeProducts.Where(m => m.BrandUnparsed != null).Select(m => m.BrandUnparsed).Distinct().Count();

            // One ScrapeError per distinct stack trace, carrying up to three sample urls.
            foreach (KeyValuePair<string, KeyValuePair<Exception, List<string>>> scrapeErrorOuterPair in scrapeErrorDict)
            {
                KeyValuePair<Exception, List<string>> scrapeErrorPair = scrapeErrorOuterPair.Value;

                ScrapeError scrapeError = new ScrapeError();
                scrapeError.Url1 = scrapeErrorPair.Value.FirstOrDefault();
                scrapeError.Url2 = scrapeErrorPair.Value.Skip(1).FirstOrDefault();
                scrapeError.Url3 = scrapeErrorPair.Value.Skip(2).FirstOrDefault();
                scrapeError.Exception = scrapeErrorPair.Key.GetType().ToString();
                scrapeError.Message = scrapeErrorPair.Key.Message;
                scrapeError.StackTrace = scrapeErrorPair.Key.StackTrace;
                // An error recorded without any url still counts as one occurrence.
                scrapeError.Count = scrapeErrorPair.Value.Any() ? scrapeErrorPair.Value.Count : 1;

                result.ScrapeErrors.Add(scrapeError);
            }

            return result;
        }

        /// <summary>
        /// Body of a scraper thread: parses downloaded product pages from the queue until the
        /// queue is empty and no download thread is alive, or until it has waited ten minutes
        /// in a row without receiving a page.
        /// </summary>
        /// <param name="productPageQ">Queue the download threads put downloaded pages on.</param>
        /// <param name="downloadThreads">Download threads feeding the queue; polled for liveness.</param>
        private void Scrape(ConcurrentQueue<ProductPage> productPageQ, List<Thread> downloadThreads)
        {
            const int pollIntervalInMilisecond = 500;
            const int maxWaitCount = 1200;     // 1200 polls of 0.5 second = 10 minutes
            const int logEveryWaitCount = 8;   // report progress every 4 seconds of waiting

            int waitCount = 0;
            while (!productPageQ.IsEmpty || downloadThreads.Any(t => t.IsAlive))
            {
                ProductPage productPage;
                if (productPageQ.TryDequeue(out productPage))
                {
                    // A page arrived, so the idle timer starts over.
                    waitCount = 0;
                    Scrape(productPage);
                    continue;
                }

                if (waitCount == maxWaitCount)
                {
                    log.Info("Waited 10 minutes for product page. Stopping scraper thread");
                    return;
                }

                Thread.Sleep(pollIntervalInMilisecond); // wait for 0.5 second if there are no product page to be parse in queue

                waitCount++;
                if (waitCount % logEveryWaitCount == 0)
                    log.InfoFormat("Waited {0} seconds for download product page", waitCount * 0.5);
            }
        }
        /// <summary>
        /// Reconciles one downloaded product page with the products stored for that url:
        /// deletes stored products when the page is gone (redirect to the home page, 404 or 410),
        /// otherwise filters the products parsed from the page and updates, adds, deletes or
        /// merges stored products accordingly.
        /// </summary>
        /// <remarks>
        /// This method runs concurrently on every scraper thread. The shared counters are
        /// therefore changed through Interlocked, and updatedProductNames and scrapeErrorDict are
        /// only touched while holding their monitor. DownloadProductPages also writes to
        /// scrapeErrorDict and errorCount and should follow the same discipline.
        /// Any exception is logged and recorded in scrapeErrorDict; it never leaves the method.
        /// </remarks>
        /// <param name="productPage">Downloaded page together with its url, redirect url and download error.</param>
        private void Scrape(ProductPage productPage)
        {
            // Same comparison the rest of this method and updatedProductNames use for names and urls.
            const StringComparison ignoreCase = StringComparison.InvariantCultureIgnoreCase;

            DateTime startTime = DateTime.Now;
            FashionExchangeEntities db = DataContext.GetCurrentDataContext();

            log.InfoFormat("Parsing product page: {0}", productPage.Url);

            try
            {
                // responseUrl is the product url to scrape. It can be the redirected url
                string responseUrl = productPage.Url;

                // get existing products from database
                List<Product> productsFromDB = ProductManager.GetProductsByStoreIdAndUrl(Store.Id, productPage.Url).ToList();
                if (productsFromDB.Any())
                    log.InfoFormat("Found {0} product from database", productsFromDB.Count);

                if (!String.IsNullOrEmpty(productPage.RedirectedUrl))
                {
                    // check if it's redirect to home page
                    if (new Uri(productPage.RedirectedUrl).PathAndQuery == "/" || productPage.RedirectedUrl.TrimEnd('/') == Store.Url.Trim('/'))
                    {
                        log.Info("Redirected to home page");
                        if (productsFromDB.Any())
                        {
                            Interlocked.Add(ref productDeletedCount, productsFromDB.Count);
                            ProductManager.DeleteProduct(productsFromDB);
                            log.Info("All existing product are deleted");
                        }
                        return;
                    }

                    // retrieve product from database if url is redirected
                    responseUrl = productPage.RedirectedUrl;

                    IEnumerable<Product> existingProductsWithRedirectedUrl = ProductManager.GetProductsByStoreIdAndUrl(Store.Id, responseUrl).ToList();

                    log.InfoFormat("Found {0} existing product from database using redirected url", existingProductsWithRedirectedUrl.Count());
                    LogProduct(existingProductsWithRedirectedUrl);

                    productsFromDB.AddRange(existingProductsWithRedirectedUrl);
                }

                // check if there was an error when downloading product page
                if (productPage.WebException != null)
                {
                    if (productPage.WebException.Response != null)
                    {
                        // delete existing products if server return 404 not found or 410 Gone
                        HttpWebResponse errorResponse = (HttpWebResponse)productPage.WebException.Response;
                        if (errorResponse.StatusCode == HttpStatusCode.NotFound || errorResponse.StatusCode == HttpStatusCode.Gone)
                        {
                            log.InfoFormat("Status={0} Message={1}", errorResponse.StatusCode, productPage.WebException.Message);
                            if (productsFromDB.Any())
                            {
                                Interlocked.Add(ref productDeletedCount, productsFromDB.Count);
                                ProductManager.DeleteProduct(productsFromDB);
                                log.Info("All existing product are deleted");
                            }
                            return;
                        }
                    }
                }

                // parse products from product page
                List<Product> productsFromWeb = SetupAndGetProducts(productPage.HtmlDocument, productPage.Url, productPage.RedirectedUrl, productPage.WebException);
                if (productsFromWeb.Any())
                {
                    log.InfoFormat("Found {0} product from web:", productsFromWeb.Count);
                    LogProduct(productsFromWeb);
                }

                // skip products with duplicate names, remove zero dollar, gift card an out of stock products
                // Every rejected product is detached and removed from productsFromWeb; the index is
                // stepped back so the element that slides into slot i is examined next.
                List<Product> productsToMerged = new List<Product>();
                for (int i = 0; i < productsFromWeb.Count; i++)
                {
                    Product product = productsFromWeb[i];

                    // Check for zero dollar product
                    if (product.Price == 0)
                    {
                        ProductManager.Detach(product);
                        productsFromWeb.Remove(product);
                        --i;
                        continue;
                    }

                    // check if product is out of stock or unavailable
                    if (StringUtil.ContainsIgnoreCase(product.Name, "out of stock")
                        || StringUtil.ContainsIgnoreCase(product.Name, "unavailable")
                        || StringUtil.ContainsIgnoreCase(product.Name, "sold out")
                        || StringUtil.ContainsIgnoreCase(product.Name, "gift card")
                        || StringUtil.ContainsIgnoreCase(product.Name, "gift voucher"))
                    {
                        ProductManager.Detach(product);
                        productsFromWeb.Remove(product);
                        --i;
                        continue;
                    }

                    // Check for duplicates parsed from product page
                    if (productsFromWeb.Where(m => m.Name == product.Name).Count() > 1)
                    {
                        // Keep the first product carrying this name and drop the others.
                        List<Product> duplicateProducts = (from p in productsFromWeb
                                                           where p.Name == product.Name
                                                           select p).Skip(1).ToList();
                        log.InfoFormat("Found {0} product with same name parsed from web", duplicateProducts.Count);
                        foreach (Product duplicateProduct in duplicateProducts)
                        {
                            ProductManager.Detach(duplicateProduct);
                            productsFromWeb.Remove(duplicateProduct);
                        }
                    }

                    for (int index = 0; index < product.ProductSizes.Count(); index++)
                    {
                        ProductSize size = product.ProductSizes.ElementAt(index);

                        // Check if there are duplicate sizes
                        if (product.ProductSizes.Where(m => m.Colour == size.Colour && m.Size == size.Size && m.Available == size.Available).Count() > 1)
                        {
                            product.ProductSizes.Remove(size);
                            DataContext.GetCurrentDataContext().Detach(size);
                            index--;
                        }

                        // throw exception if size is out of stock or sold out
                        // (the exception is caught below and recorded as a scrape error for this url)
                        if (StringUtil.ContainsIgnoreCase(size.Size, "out of stock") || StringUtil.ContainsIgnoreCase(size.Size, "sold out") || StringUtil.ContainsIgnoreCase(size.Size, "unavailable"))
                        {
                            throw new Exception("Size out of stock. Size: " + size.Size);
                        }
                    }

                    // Check if all product sizes are unavailable
                    if (product.ProductSizes.Any() && !product.ProductSizes.Where(m => m.Available).Any())
                    {
                        ProductManager.Detach(product);
                        productsFromWeb.Remove(product);
                        --i;
                        continue;
                    }

                    // Check if merge product size is true, product from web has no sizes, product from database has size, then delete product from database
                    if (Store.MergeProductSize && !product.ProductSizes.Any() && productsFromDB.Where(m => String.Equals(m.Name, product.Name, ignoreCase) && m.ProductSizes.Any()).Any())
                    {
                        log.Warn("Product from web have no sizes");
                        ProductManager.Detach(product);
                        productsFromWeb.Remove(product);
                        --i;
                        continue;
                    }

                    // Check in database if there are any same name products but with different url, there should only be one
                    Product sameNameDifferentUrlProduct = ProductManager.GetProductByStoreAndName(Store.Id, product.Name);
                    if (sameNameDifferentUrlProduct != null && !String.Equals(sameNameDifferentUrlProduct.Url, productPage.Url, ignoreCase) && !String.Equals(sameNameDifferentUrlProduct.Url, responseUrl, ignoreCase))
                    {
                        // check if product category or brand need to be merged
                        bool needMerge = false;

                        // merge category
                        if (!String.IsNullOrWhiteSpace(product.CategoryUnparsed))
                        {
                            if (String.IsNullOrWhiteSpace(sameNameDifferentUrlProduct.CategoryUnparsed))
                            {
                                sameNameDifferentUrlProduct.CategoryUnparsed = product.CategoryUnparsed;
                                needMerge = true;
                            }
                            else if (!sameNameDifferentUrlProduct.CategoryUnparsed.Contains(product.CategoryUnparsed))
                            {
                                sameNameDifferentUrlProduct.CategoryUnparsed += " " + product.CategoryUnparsed;
                                needMerge = true;
                            }
                        }

                        // merge brand
                        if (String.IsNullOrWhiteSpace(sameNameDifferentUrlProduct.BrandUnparsed) && !String.IsNullOrWhiteSpace(product.BrandUnparsed))
                        {
                            sameNameDifferentUrlProduct.BrandUnparsed = product.BrandUnparsed;
                            needMerge = true;
                        }

                        if (needMerge)
                            productsToMerged.Add(sameNameDifferentUrlProduct);

                        // check if products to be merged have different prices
                        if (sameNameDifferentUrlProduct.Price != product.Price || sameNameDifferentUrlProduct.SalePrice != product.SalePrice)
                        {
                            log.Warn("Product to merge have different prices. Existing Product Url: " + sameNameDifferentUrlProduct.Url + " New Product Url: " + product.Url);
                            Interlocked.Increment(ref warningCount);
                        }

                        ProductManager.Detach(product);
                        productsFromWeb.Remove(product);
                        --i;
                        continue;
                    }

                    // Check if product has already been added or updated within this scrape job
                    bool alreadyUpdated;
                    lock (updatedProductNames)
                    {
                        alreadyUpdated = updatedProductNames.Contains(product.Name);
                    }
                    if (alreadyUpdated)
                    {
                        log.Info("Product has already been updated");
                        ProductManager.Detach(product);
                        productsFromWeb.Remove(product);
                        --i;

                        // Remove products from db if any. This scenario occurs when new product url is redirected to url same as existing product url
                        Product productFromDB = productsFromDB.Where(m => String.Equals(m.Name, product.Name, ignoreCase)).FirstOrDefault();
                        if (productFromDB != null)
                        {
                            ProductManager.Detach(productFromDB);
                            productsFromDB.Remove(productFromDB);
                        }
                        continue;
                    }

                    // Check if product from web doesn't have brand, but existing product from DB has brand. This to ensure existing product brand does not get removed
                    if (String.IsNullOrWhiteSpace(product.BrandUnparsed))
                    {
                        Product brandedProductFromDB = productsFromDB.Where(m => m.Url == product.Url && m.BrandUnparsed != null).FirstOrDefault();
                        if (brandedProductFromDB != null)
                        {
                            log.Info("Product from web doesn't have brand. But product from database has brand");
                            product.BrandUnparsed = brandedProductFromDB.BrandUnparsed;
                        }
                    }
                }

                // The three selections below are materialised once (ToList) so that Any, Count,
                // the ProductManager call and the name bookkeeping all work on the same snapshot.
                // They stay typed as IEnumerable so the same ProductManager overloads are chosen.

                // update if product from db exist by (url or redirectedUrl) and name
                IEnumerable<Product> productsToBeUpdated = productsFromWeb
                    .Where(p => (productsFromDB.Any(m => String.Equals(m.Url, p.Url, ignoreCase)) || productsFromDB.Any(m => String.Equals(m.Url, productPage.Url, ignoreCase)))
                                && productsFromDB.Any(m => String.Equals(m.Name, p.Name, ignoreCase)))
                    .ToList();
                if (productsToBeUpdated.Any())
                {
                    int updatedCount = productsToBeUpdated.Count();
                    Interlocked.Add(ref productUpdatedCount, updatedCount);
                    ProductManager.UpdateProduct(productsToBeUpdated, false, true);
                    lock (updatedProductNames)
                    {
                        foreach (Product productToBeUpdated in productsToBeUpdated)
                            updatedProductNames.Add(productToBeUpdated.Name);
                    }
                    log.InfoFormat("Updated {0} product", updatedCount);
                }

                // add if product from db does not exist by name
                IEnumerable<Product> productsToBeAdded = productsFromWeb
                    .Where(p => !productsFromDB.Any(m => String.Equals(m.Name, p.Name, ignoreCase)))
                    .ToList();
                if (productsToBeAdded.Any())
                {
                    int addedCount = productsToBeAdded.Count();
                    Interlocked.Add(ref productAddedCount, addedCount);
                    ProductManager.NewProduct(productsToBeAdded, false);
                    lock (updatedProductNames)
                    {
                        foreach (Product productToBeAdded in productsToBeAdded)
                            updatedProductNames.Add(productToBeAdded.Name);
                    }
                    log.InfoFormat("Added {0} product", addedCount);
                }

                // delete if product from db cannot be found from product page by name
                IEnumerable<Product> productsToBeDeleted = productsFromDB
                    .Where(p => !productsFromWeb.Any(m => String.Equals(m.Name, p.Name, ignoreCase)))
                    .ToList();
                if (productsToBeDeleted.Any())
                {
                    int deletedCount = productsToBeDeleted.Count();
                    Interlocked.Add(ref productDeletedCount, deletedCount);
                    ProductManager.DeleteProduct(productsToBeDeleted, false);
                    log.InfoFormat("Deleted {0} product", deletedCount);
                }

                // merge if product from db exist by name but different url
                if (productsToMerged.Any())
                {
                    Interlocked.Add(ref productMergedCount, productsToMerged.Count);
                    ProductManager.UpdateProduct(productsToMerged, false, false);
                    log.InfoFormat("Merged {0} product", productsToMerged.Count);
                }

                db.SaveChanges();
            }
            catch (Exception e)
            {
                log.ErrorFormat("Error parsing product page. Url: {0} Exception: {1}", productPage.Url, e.ToString());

                // Errors are grouped by stack trace; several scraper threads can land here at once.
                lock (scrapeErrorDict)
                {
                    KeyValuePair<Exception, List<string>> knownError;
                    if (scrapeErrorDict.TryGetValue(e.StackTrace, out knownError))
                        knownError.Value.Add(productPage.Url);
                    else
                        scrapeErrorDict.Add(e.StackTrace, new KeyValuePair<Exception, List<string>>(e, new List<string>() { productPage.Url }));
                }

                Interlocked.Increment(ref errorCount);
            }
            finally
            {
                // discard all changes / caching to database context
                DataContext.DisposeDataContext();
                log.InfoFormat("Time taken {0}ms", Math.Round(DateTime.Now.Subtract(startTime).TotalMilliseconds, MidpointRounding.AwayFromZero));
            }
        }

        /// <summary>
        /// Download worker: takes product urls from <paramref name="productUrlQueue"/> until it is empty,
        /// downloads each page and enqueues the result on <paramref name="productPageQueue"/>.
        /// </summary>
        /// <remarks>
        /// A page whose download fails with a <see cref="WebException"/> is still enqueued, with the
        /// exception attached, so the consumer can react to it. Any other exception is logged, recorded
        /// in scrapeErrorDict (keyed by stack trace) and counted in errorCount; that page is dropped.
        /// </remarks>
        /// <param name="productUrlQueue">Product urls waiting to be downloaded.</param>
        /// <param name="productPageQueue">Receives the downloaded (or failed) product pages.</param>
        private void DownloadProductPages(ConcurrentQueue productUrlQueue, ConcurrentQueue productPageQueue)
        {
            string productUrl;
            while (productUrlQueue.TryDequeue(out productUrl))
            {
                // Back-pressure: if too many pages are waiting to be parsed, wait 0.5 second before downloading more.
                while (productPageQueue.Count > Store.DownloadThreadCount * 5)
                    Thread.Sleep(500);

                ProductPage productPage = new ProductPage();
                try
                {
                    productPage.Url = productUrl;

                    // Snapshot of the remaining urls, used only for the progress figures in the log.
                    int productUrlQueueCount = productUrlQueue.Count;
                    log.InfoFormat("Downloading product page [{0}/{1}]: {2}", productUrlCount - productUrlQueueCount, productUrlCount, productPage.Url);

                    DateTime startTime = DateTime.Now;
                    string responseUrl;
                    productPage.HtmlDocument = DownloadWebPage(productPage.Url, out responseUrl, cookieContainer: CookieContainer, proxy: Proxy, httpHeaders: HttpHeaders, webDriver: WebDriver);

                    if (responseUrl != productPage.Url)
                    {
                        log.InfoFormat("Product url has been redirected: {0}", responseUrl);
                        productPage.RedirectedUrl = responseUrl;
                    }

                    DateTime endTime = DateTime.Now;

                    log.InfoFormat("Downloaded product page [{0}/{1}] Time taken {2}ms", productUrlCount - productUrlQueueCount, productUrlCount, Math.Round(endTime.Subtract(startTime).TotalMilliseconds, MidpointRounding.AwayFromZero));
                    productPageQueue.Enqueue(productPage);
                }
                catch (WebException e)
                {
                    // Http-level failures are passed on to the consumer together with the page.
                    log.InfoFormat("There was an error downloading product page: {0} {1}", String.IsNullOrEmpty(productPage.RedirectedUrl) ? productPage.Url : productPage.RedirectedUrl, e.ToString());
                    productPage.WebException = e;
                    productPageQueue.Enqueue(productPage);
                }
                catch (Exception e)
                {
                    log.ErrorFormat("An error ocurred while downloading product page: {0} {1}", productPage.Url, e.ToString());

                    // This worker may run on several threads at once (see Store.DownloadThreadCount) and
                    // Dictionary / List are not safe for concurrent writes, so serialize the bookkeeping.
                    // NOTE(review): other writers of scrapeErrorDict should take the same lock.
                    lock (scrapeErrorDict)
                    {
                        if (scrapeErrorDict.ContainsKey(e.StackTrace))
                            scrapeErrorDict[e.StackTrace].Value.Add(productPage.Url);
                        else
                            scrapeErrorDict.Add(e.StackTrace, new KeyValuePair>(e, new List() { productPage.Url }));
                    }

                    // errorCount++ is a non-atomic read-modify-write; use an atomic increment instead.
                    Interlocked.Increment(ref errorCount);
                }
            }
        }

        /// <summary>
        /// Test helper that scrapes only the given product url. It is intended not to update the database.
        /// </summary>
        /// <param name="productUrl">Url of the product page to scrape.</param>
        /// <returns>
        /// The products parsed from the page; an empty list when the server answered
        /// 404 Not Found or 410 Gone for the page.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The download worker produced no page (the download failed with a non-web error, which
        /// the worker has already logged).
        /// </exception>
        public IEnumerable TestScrapeProduct(string productUrl)
        {
            log.InfoFormat("Scraper test on url: {0}", productUrl);
            List productUrls = new List();
            productUrls.Add(productUrl);
            ConcurrentQueue productUrlQ = new ConcurrentQueue(productUrls);
            ConcurrentQueue productPageQ = new ConcurrentQueue();

            List products = new List();
            try
            {
                // Run the regular download worker on its own thread and wait for it to finish.
                Thread thread = new Thread(new ThreadStart(() => DownloadProductPages(productUrlQ, productPageQ)));

                thread.Start();

                thread.Join();

                // The worker enqueues nothing when the download failed with a non-web exception;
                // fail with a clear message instead of a NullReferenceException further down.
                ProductPage productPage;
                if (!productPageQ.TryDequeue(out productPage))
                    throw new InvalidOperationException("Product page could not be downloaded. Url: " + productUrl);

                // check if there was an error when downloading product page
                if (productPage.WebException != null)
                {
                    if (productPage.WebException.Response != null)
                    {
                        // the server reports the page as missing (404 Not Found / 410 Gone): nothing to scrape
                        HttpWebResponse errorResponse = (HttpWebResponse)productPage.WebException.Response;
                        if (errorResponse.StatusCode == HttpStatusCode.NotFound || errorResponse.StatusCode == HttpStatusCode.Gone)
                        {
                            log.InfoFormat("Status={0} Message={1}", errorResponse.StatusCode, productPage.WebException.Message);
                            return products;
                        }
                    }
                }

                products = SetupAndGetProducts(productPage.HtmlDocument, productPage.Url, productPage.RedirectedUrl, productPage.WebException);
                log.InfoFormat("Products found: {0}", products.Count());

                LogProduct(products);

                return products;
            }
            catch (Exception e)
            {
                log.ErrorFormat("Error getting products from url: {0}", e.ToString());
                throw;
            }
            finally
            {
                // Release the browser driver, if this scraper uses one.
                if (WebDriver != null)
                    WebDriver.Dispose();
            }
        }

        /// <summary>
        /// Test helper that collects all product urls of the store through GetProductUrls and logs
        /// how many were found (total and distinct). Failures are logged and not rethrown.
        /// </summary>
        public void ScrapeProductUrls()
        {
            log.InfoFormat("Url scraping test for store {0} has started.", Store.Name);

            try
            {
                List foundUrls = GetProductUrls();

                int totalCount = foundUrls.Count();
                int distinctCount = foundUrls.Distinct().Count();

                log.InfoFormat("Url scraping test has completed. Found {0} urls. {1} distinct.", totalCount, distinctCount);
            }
            catch (Exception scrapeError)
            {
                // Report the failure in two entries: a fixed headline followed by the exception details.
                log.ErrorFormat("Error occured while getting product urls.");
                log.Error(scrapeError.ToString());
            }
        }

        /// <summary>
        /// Builds the product objects for the product page that is currently being scraped.
        /// </summary>
        /// <remarks>
        /// Name, category and brand are read through GetName / GetCategory / GetBrand and normalized with
        /// ScraperUtil.NormalizeText; blank values become null. When GetVariations yields variations, one
        /// product is created per variation; otherwise a single product is built from GetPrice,
        /// GetSalePrice, GetSizes and GetImageUrl. The product photo is downloaded for every product.
        /// Description scraping is currently disabled (GetDescription is not called).
        /// </remarks>
        /// <returns>The products created for the page.</returns>
        /// <exception cref="Exception">A size entry has a colour but no size.</exception>
        protected List CreateProductObjects()
        {
            List products = new List();

            string productName = NormalizeTextOrNull(GetName());

            // Categories are usually scraped from product page's breadcrumb. Also check and remove product name from category
            string category = NormalizeTextOrNull(GetCategory());
            if (category != null && !String.IsNullOrEmpty(productName))
            {
                if (String.Compare(category, productName, true) == 0)
                    category = null;
                else
                {
                    category = category.Replace(productName, String.Empty).Trim();

                    // Nothing left once the product name is removed: treat it like any other blank category.
                    if (category.Length == 0)
                        category = null;
                }
            }

            string brand = NormalizeTextOrNull(GetBrand());

            // The url stored on the product is the redirect target when the page was redirected.
            // check if there are product variations
            List productVariations = GetVariations();
            if (productVariations != null && productVariations.Any())
            {
                foreach (ProductVariation variation in productVariations)
                {
                    Product product = new Product();

                    // Product name is "<page name> <variation name>", with the page name removed from the
                    // variation name so it is not repeated; the variation name alone when the page has no name.
                    variation.Name = variation.Name ?? String.Empty;
                    product.Name = (String.IsNullOrWhiteSpace(productName)) ?
                        variation.Name.Trim() : String.Format("{0} {1}", productName, variation.Name.Replace(productName, String.Empty).Trim());
                    product.Name = ScraperUtil.NormalizeText(product.Name);
                    product.CategoryUnparsed = category;
                    product.BrandUnparsed = brand;
                    product.Price = Math.Round(variation.Price, 2, MidpointRounding.AwayFromZero);

                    // A sale price is only kept when it is a real discount on the regular price.
                    if (variation.SalePrice > 0 && variation.SalePrice < variation.Price)
                        product.SalePrice = Math.Round(variation.SalePrice, 2, MidpointRounding.AwayFromZero);

                    if (variation.ProductSizes != null)
                    {
                        foreach (ProductSize productSize in variation.ProductSizes)
                        {
                            if (TryNormalizeProductSize(productSize))
                                product.ProductSizes.Add(productSize);
                        }
                    }

                    product.Url = String.IsNullOrEmpty(RedirectedProductUrl) ? ProductUrl : RedirectedProductUrl;
                    product.Store = Store;

                    DownloadProductPhoto(variation.ImageUrl, product);

                    products.Add(product);
                }
            }
            else
            {
                Product product = new Product();
                product.Name = productName;
                product.CategoryUnparsed = category;
                product.BrandUnparsed = brand;

                product.Price = Math.Round(GetPrice(), 2, MidpointRounding.AwayFromZero);

                // A sale price is only kept when it is a real discount on the regular price.
                decimal salePrice = Math.Round(GetSalePrice(), 2, MidpointRounding.AwayFromZero);
                if (salePrice > 0 && salePrice < product.Price)
                    product.SalePrice = salePrice;

                foreach (ProductSize productSize in GetSizes() ?? Enumerable.Empty())
                {
                    if (TryNormalizeProductSize(productSize))
                        product.ProductSizes.Add(productSize);
                }

                product.Url = String.IsNullOrEmpty(RedirectedProductUrl) ? ProductUrl : RedirectedProductUrl;
                product.Store = Store;

                DownloadProductPhoto(GetImageUrl(), product);

                products.Add(product);
            }
            return products;
        }

        /// <summary>
        /// Returns the normalized form of <paramref name="text"/>, or null when it is null or blank.
        /// </summary>
        private static string NormalizeTextOrNull(string text)
        {
            return String.IsNullOrWhiteSpace(text) ? null : ScraperUtil.NormalizeText(text);
        }

        /// <summary>
        /// Normalizes colour and size of <paramref name="productSize"/> in place.
        /// </summary>
        /// <returns>
        /// True when the entry has a size and should be added to the product; false when it has
        /// neither size nor colour and should be skipped.
        /// </returns>
        /// <exception cref="Exception">The entry has a colour but no size.</exception>
        private static bool TryNormalizeProductSize(ProductSize productSize)
        {
            productSize.Colour = NormalizeTextOrNull(productSize.Colour);

            if (!String.IsNullOrEmpty(productSize.Size))
            {
                productSize.Size = ScraperUtil.NormalizeText(productSize.Size);
                return true;
            }

            if (String.IsNullOrEmpty(productSize.Colour))
                return false;

            throw new Exception("Product size has colour but no size. Colour: " + productSize.Colour);
        }

        /// <summary>
        /// Downloads the photo at <paramref name="imageUrl"/> for <paramref name="product"/> through
        /// ProductManager.DownloadProductPhoto, using this scraper's Proxy and HttpHeadersForProductPhoto.
        /// </summary>
        /// <param name="imageUrl">Url of the product image.</param>
        /// <param name="product">Product the photo belongs to.</param>
        /// <remarks>Any failure is logged together with the image url and then rethrown unchanged.</remarks>
        protected void DownloadProductPhoto(string imageUrl, Product product)
        {
            try
            {
                WebProxy photoProxy = Proxy;
                ProductManager.DownloadProductPhoto(
                    imageUrl,
                    product,
                    proxy: photoProxy,
                    httpHeaders: HttpHeadersForProductPhoto);
            }
            catch (Exception)
            {
                // Add the image url to the log for context; the caller still sees the original exception.
                log.ErrorFormat("Error downloading / processing product image. Image url: {0}", imageUrl);
                throw;
            }
        }

        /// <summary>
        /// Writes one numbered info log line per product: name and price, plus the sale price
        /// when the product has one greater than zero.
        /// </summary>
        /// <param name="products">Products to list in the log.</param>
        private void LogProduct(IEnumerable products)
        {
            int position = 0;
            foreach (Product product in products)
            {
                position++;

                // No sale price: short form with the regular price only.
                if (!(product.SalePrice > 0))
                {
                    log.InfoFormat("{0}. {1} ${2}", position, product.Name, product.Price);
                    continue;
                }

                log.InfoFormat("{0}. {1} Regular Price: ${2} Sale Price: ${3}", position, product.Name, product.Price, product.SalePrice);
            }
        }
    }
}



ScraperUtil


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Net;
using HtmlAgilityPack;
using log4net;
using System.Xml;
using FashionExchange.Common.DAL;
using FashionExchange.Common.Enums;
using Newtonsoft.Json.Linq;
using System.Web;
using System.Collections.Specialized;
using OpenQA.Selenium.PhantomJS;
using System.Runtime.Caching;
using OpenQA.Selenium.Firefox;
using OpenQA.Selenium.Remote;
using System.IO.Compression;
using System.Web.Hosting;

namespace FashionExchange.Common.Utils
{
    public class ScraperUtil
    {
        // Logger named after this class.
        private static ILog log = LogManager.GetLogger(typeof(ScraperUtil).Name);
        // Number of retries CheckForRedirectUrl performs before giving up.
        private const int defaultRetryCount = 3;
        // Urls passed to LoadHtml; LoadHtml uses it to reject a url that has already been loaded twice.
        // NOTE(review): 1000 is presumably the queue's capacity - confirm against FixedSizeConcurrentQueue.
        private static FixedSizeConcurrentQueue loadedHtmlUrls = new FixedSizeConcurrentQueue(1000);
        // NOTE(review): not referenced in this part of the file; presumably the same kind of history for plain http GET requests - confirm.
        private static FixedSizeConcurrentQueue loadedHttpGetUrls = new FixedSizeConcurrentQueue(1000);

        /// <summary>
        /// Requests <paramref name="url"/> without following redirects and reports where it points.
        /// </summary>
        /// <param name="url">Url to check; it is HTML-decoded before the request is made.</param>
        /// <param name="proxy">Optional proxy for the request.</param>
        /// <param name="useHead">
        /// When true the request is sent as HEAD; after a protocol error the check is retried
        /// without forcing HEAD.
        /// </param>
        /// <returns>
        /// The value of the response's Location header (returned as sent by the server, so it may be
        /// relative), or the decoded <paramref name="url"/> when there is no such header.
        /// </returns>
        /// <remarks>
        /// Timeouts and IOExceptions are retried up to defaultRetryCount times; the last failure, and
        /// any other web error, is rethrown.
        /// </remarks>
        public static string CheckForRedirectUrl(string url, WebProxy proxy = null, bool useHead = true)
        {
            string responseUrl = String.Empty;

            // NOTE(review): FixDotInUri is defined elsewhere; presumably it adjusts Uri parsing for urls containing "./" - confirm.
            if (url.Contains("./"))
                FixDotInUri();

            url = WebUtility.HtmlDecode(url);

            // retry if request has been timed out or refused
            for (int i = 0; i <= defaultRetryCount; i++)
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                    if (useHead)
                        request.Method = "HEAD";
                    if (proxy != null)
                        request.Proxy = proxy;
                    request.UserAgent = FashionExchangeSetting.UserAgentChrome;
                    // The redirect target is what we are after, so do not let the framework follow it.
                    request.AllowAutoRedirect = false;

                    // 'using' releases the response even if reading it throws.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        string location = response.GetResponseHeader("Location");
                        responseUrl = String.IsNullOrEmpty(location) ? url : location;
                    }

                    return responseUrl;
                }
                catch (WebException e)
                {
                    // give up if already retried n times
                    if (i == defaultRetryCount)
                        throw;

                    if (e.Status == WebExceptionStatus.Timeout)
                        log.Warn(e.Message);
                    else if (e.Status == WebExceptionStatus.ProtocolError)
                    {
                        // Some servers reject HEAD; give the url one more chance without it.
                        if (useHead)
                        {
                            log.Warn(e.Message + " Retry without http request = head.");
                            useHead = false;
                        }
                        else
                        {
                            throw;
                        }
                    }
                    else
                        throw;
                }
                catch (IOException e)
                {
                    // give up if already retried n times
                    if (i == defaultRetryCount)
                        throw;

                    // Leave a trace of the failed attempt instead of retrying silently.
                    log.Warn(e.Message);
                }
            }

            // Only reached if the loop ends without returning or throwing.
            return responseUrl;
        }

        /// <summary>
        /// Convenience overload of LoadHtml for callers that do not need the redirected url:
        /// forwards every argument to the overload with the out parameter and discards that value.
        /// </summary>
        public static HtmlDocument LoadHtml(string url, CookieContainer cookieContainer = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null, Dictionary httpHeaders = null, bool checkUrlLoaded = true, RemoteWebDriver webDriver = null)
        {
            string ignoredRedirectUrl;
            HtmlDocument document = LoadHtml(
                url,
                out ignoredRedirectUrl,
                cookieContainer: cookieContainer,
                retryOnError: retryOnError,
                retryOnTimeout: retryOnTimeout,
                retryCount: retryCount,
                retryDelayInMilisecond: retryDelayInMilisecond,
                proxy: proxy,
                httpHeaders: httpHeaders,
                checkUrlLoaded: checkUrlLoaded,
                webDriver: webDriver);
            return document;
        }
        /// <summary>
        /// Loads the page at <paramref name="url"/> into an HtmlDocument, following redirects manually and
        /// retrying on failure, then rewrites relative a/@href and img/@src values to absolute urls.
        /// </summary>
        /// <param name="url">Url to load; it is HTML-decoded before the request is made.</param>
        /// <param name="redirectedUrl">Receives the url the page was finally loaded from (the decoded url when there was no redirect).</param>
        /// <param name="cookieContainer">Cookie container assigned to every http request.</param>
        /// <param name="retryOnError">When true, any exception triggers a retry.</param>
        /// <param name="retryOnTimeout">When true, request timeouts and 504 Gateway Timeout responses trigger a retry.</param>
        /// <param name="retryCount">Number of retries after the first attempt.</param>
        /// <param name="retryDelayInMilisecond">Pause before each retry; no pause when zero or less.</param>
        /// <param name="proxy">Optional proxy for the http requests.</param>
        /// <param name="httpHeaders">Extra request headers, applied through SetRequestHeaders.</param>
        /// <param name="checkUrlLoaded">When true, refuse a url that is already recorded twice in the loaded-url history.</param>
        /// <param name="webDriver">When given, the page is loaded through this browser driver instead of an http request.</param>
        /// <returns>The loaded document; null only if no attempt was made (negative <paramref name="retryCount"/>).</returns>
        /// <exception cref="Exception">The url was already loaded twice, or loading failed and is not (or no longer) retried.</exception>
        public static HtmlDocument LoadHtml(string url, out string redirectedUrl, CookieContainer cookieContainer = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null, Dictionary httpHeaders = null, bool checkUrlLoaded = true, RemoteWebDriver webDriver = null)
        {
            // NOTE(review): FixDotInUri is defined elsewhere; presumably it adjusts Uri parsing for urls containing "./" - confirm.
            if (url.Contains("./"))
                FixDotInUri();

            // check if url has already been loaded in the past
            if (checkUrlLoaded && loadedHtmlUrls.Contains(url) && loadedHtmlUrls.Where(m => m == url).Count() >= 2)
            {
                Exception e = new Exception("Url has already been loaded more than 2 times. Url: " + url);
                e.Data[ParameterInfo.ExceptionData.Url] = url;
                throw e;
            }
            else
                loadedHtmlUrls.Enqueue(url);

            url = WebUtility.HtmlDecode(url);
            redirectedUrl = url;

            bool hasConnectionClosedError = false;

            // retry if request has been timed out or refused
            for (int i = 0; i <= retryCount; i++)
            {
                try
                {
                    HtmlDocument htmlDocument = new HtmlDocument();
                    bool redirected = false;
                    // Urls visited during this attempt, used to detect redirect loops.
                    List redirectedUrls = new List();
                    redirectedUrls.Add(redirectedUrl);

                    if (webDriver != null)
                    {
                        webDriver.Navigate().GoToUrl(url);
                        // Fixed 5 second wait before the page source is read.
                        System.Threading.Thread.Sleep(1000 * 5);
                        HtmlNode.ElementsFlags.Remove("option");
                        HtmlNode.ElementsFlags.Remove("form");

                        htmlDocument.LoadHtml(webDriver.PageSource);

                        if (url != webDriver.Url)
                            redirectedUrl = webDriver.Url;
                    }
                    else
                    {
                        do
                        {
                            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(redirectedUrl);
                            request.Method = "GET";
                            request.UserAgent = FashionExchangeSetting.UserAgentChrome;
                            // Redirects are followed by hand below so the final url can be reported.
                            request.AllowAutoRedirect = false;
                            request.CookieContainer = cookieContainer;
                            request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                            /* Sometimes .net framework doesn't handle http request properly and instead throws 
                             * web exception "The request was aborted: The connection was closed unexpectedly". 
                             * Using a different HttpVersion.Version10 can get around this issue. */
                            if (hasConnectionClosedError)
                                request.ProtocolVersion = HttpVersion.Version10;

                            if (proxy != null)
                                request.Proxy = proxy;

                            SetRequestHeaders(request, httpHeaders);

                            // 'using' releases the response (and its connection) even when redirect handling
                            // or parsing throws and the request is retried.
                            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                            {
                                // Redirects are deliberately not followed for www.styletread.com.au urls.
                                if (!redirectedUrl.Contains("www.styletread.com.au")
                                 && (response.StatusCode == HttpStatusCode.Redirect
                                 || response.StatusCode == HttpStatusCode.TemporaryRedirect
                                 || response.StatusCode == HttpStatusCode.MovedPermanently
                                 || response.StatusCode == HttpStatusCode.Moved))
                                {
                                    string location = response.GetResponseHeader("Location");

                                    // append root url address it does not exist in redirect url
                                    if (!location.StartsWith("http"))
                                    {
                                        Uri uri = new Uri(redirectedUrl);
                                        string domainUrl = new UriBuilder(uri.Scheme, uri.DnsSafeHost).ToString();
                                        redirectedUrl = domainUrl + location.TrimStart('/'); // domainUrl will always have '/' at the end
                                    }
                                    else
                                        redirectedUrl = location;

                                    // append hash tag
                                    if (url.Contains("#") && !redirectedUrl.Contains("#"))
                                        redirectedUrl += StringUtil.SubstringToEnd(url, "#", includeStartStr: true);

                                    // check if the new url has been redirected before. if yes throw an exception, else add new redirect url to redirectUrls list
                                    if (redirectedUrls.Contains(redirectedUrl))
                                        throw new WebException("Too many automatic redirections were attempted.", WebExceptionStatus.ProtocolError);
                                    else
                                        redirectedUrls.Add(redirectedUrl);

                                    redirected = true;
                                }
                                else
                                {
                                    redirected = false;
                                    HtmlNode.ElementsFlags.Remove("option");
                                    HtmlNode.ElementsFlags.Remove("form");

                                    // The response may carry no charset (CharacterSet null or blank); fall back
                                    // to UTF-8 instead of failing on the encoding lookup.
                                    string charset = (response.CharacterSet ?? String.Empty).Trim('"');
                                    Encoding pageEncoding = charset.Length == 0 ? Encoding.UTF8 : Encoding.GetEncoding(charset);

                                    htmlDocument.Load(response.GetResponseStream(), pageEncoding);
                                }
                            }
                        } while (redirected);
                    }

                    // convert all img src and a href from relative to absolute urls
                    Uri finalUrl = new Uri(redirectedUrl);
                    string baseUrl = finalUrl.Scheme + "://" + finalUrl.Authority;
                    foreach (HtmlNode linkNode in htmlDocument.DocumentNode.SelectNodes("//a[@href and not(starts-with(@href, 'http'))]") ?? Enumerable.Empty())
                    {
                        linkNode.Attributes["href"].Value = ToAbsolutePageUrl(finalUrl, baseUrl, linkNode.Attributes["href"].Value);
                    }
                    foreach (HtmlNode imgNode in htmlDocument.DocumentNode.SelectNodes("//img[@src and not(starts-with(@src, 'http'))]") ?? Enumerable.Empty())
                    {
                        imgNode.Attributes["src"].Value = ToAbsolutePageUrl(finalUrl, baseUrl, imgNode.Attributes["src"].Value);
                    }

                    return htmlDocument;
                }
                catch (Exception e)
                {
                    log.Info("An exception occured while loading url: " + redirectedUrl);
                    e.Data[ParameterInfo.ExceptionData.Url] = redirectedUrl;

                    // give up if already retried n times
                    if (i == retryCount)
                        throw;

                    // If web exception is connection closed unexpectedly, retry with different Protocol Version (i.e. HttpVersion10)
                    // Else retry if request timeout and retryOnTimeout = true, or if retryOnError = true
                    if (e is WebException && ((WebException)e).Status == WebExceptionStatus.ConnectionClosed && !hasConnectionClosedError)
                        hasConnectionClosedError = true;
                    else if (retryOnTimeout
                        && e is WebException
                        && ((((WebException)e).Status == WebExceptionStatus.Timeout)
                            || (((WebException)e).Response != null && ((HttpWebResponse)((WebException)e).Response).StatusCode == HttpStatusCode.GatewayTimeout)))
                        log.Warn(e.Message);
                    else if (retryOnTimeout
                        && e is OpenQA.Selenium.WebDriverException
                        && ((OpenQA.Selenium.WebDriverException)e).InnerException is WebException
                        && ((WebException)((OpenQA.Selenium.WebDriverException)e).InnerException).Status == WebExceptionStatus.Timeout)
                        log.Warn(e.Message);
                    else if (retryOnError)
                        log.Warn(e.Message);
                    else
                        throw;

                    if (retryDelayInMilisecond > 0)
                    {
                        log.Info("Retry loading html page after " + retryDelayInMilisecond + " milliseconds. Retry count: " + (i + 1));
                        System.Threading.Thread.Sleep(retryDelayInMilisecond);
                    }
                    else
                    {
                        log.Info("Retry loading html page. Retry count: " + (i + 1));
                    }
                }
            }
            return null;
        }

        /// <summary>
        /// Converts a link found on a loaded page (not starting with "http") to an absolute url.
        /// </summary>
        /// <param name="finalUrl">Url the page was loaded from.</param>
        /// <param name="baseUrl">Scheme and authority of <paramref name="finalUrl"/>, e.g. "http://host".</param>
        /// <param name="relativeUrl">The href/src value to convert.</param>
        /// <returns>The absolute url.</returns>
        private static string ToAbsolutePageUrl(Uri finalUrl, string baseUrl, string relativeUrl)
        {
            // protocol-relative: only the scheme is missing
            if (relativeUrl.StartsWith("//"))
                return finalUrl.Scheme + ":" + relativeUrl;

            // query only: same page, different query string
            if (relativeUrl.StartsWith("?"))
                return baseUrl + finalUrl.AbsolutePath + relativeUrl;

            if (relativeUrl.StartsWith("../"))
            {
                // work out number of times traverse to parent
                int backUpParentCount = 0;
                int relativeStartIndex = 0;
                while (relativeUrl.IndexOf("../", relativeStartIndex) == relativeStartIndex)
                {
                    relativeStartIndex += "../".Length;
                    backUpParentCount++;
                }

                // work out number of slashes we need to traverse and get the position index
                int slashIndex = 0;
                if (finalUrl.AbsolutePath.LastIndexOf('/') != 0)
                {
                    slashIndex = finalUrl.AbsolutePath.LastIndexOf('/');
                    for (int backUpParentCounter = 0; backUpParentCounter < backUpParentCount; backUpParentCounter++)
                    {
                        slashIndex = finalUrl.AbsolutePath.LastIndexOf('/', slashIndex - 1);
                        // reached the site root: cannot go up any further
                        if (slashIndex == 0)
                            break;
                    }
                }

                return baseUrl + finalUrl.AbsolutePath.Substring(0, slashIndex) + "/" + relativeUrl.Substring(relativeStartIndex);
            }

            // root-relative
            if (relativeUrl.StartsWith("/"))
                return baseUrl + relativeUrl;

            // plain relative path on a page at the site root
            if (finalUrl.AbsolutePath == "/")
                return baseUrl + "/" + relativeUrl;

            // plain relative path: replace the last segment of the page's path
            return baseUrl + finalUrl.AbsolutePath.Substring(0, finalUrl.AbsolutePath.TrimEnd('/').LastIndexOf('/')) + "/" + relativeUrl;
        }

        /// <summary>
        /// Downloads <paramref name="url"/> with an http GET and loads the response into an XmlDocument,
        /// retrying on failure.
        /// </summary>
        /// <param name="url">Url to load; it is HTML-decoded before the request is made.</param>
        /// <param name="cookieContainer">Cookie container assigned to the request.</param>
        /// <param name="retryOnError">When true, any exception triggers a retry.</param>
        /// <param name="retryOnTimeout">When true, request timeouts and 504 Gateway Timeout responses trigger a retry.</param>
        /// <param name="retryCount">Number of retries after the first attempt.</param>
        /// <param name="retryDelayInMilisecond">Pause before each retry; no pause when zero or less.</param>
        /// <param name="httpHeaders">Extra request headers, applied through SetRequestHeaders.</param>
        /// <returns>The loaded document; null only if no attempt was made (negative <paramref name="retryCount"/>).</returns>
        /// <exception cref="Exception">Loading failed and is not (or no longer) retried; the url is stored in the exception's Data.</exception>
        public static XmlDocument LoadXml(string url, CookieContainer cookieContainer = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, Dictionary httpHeaders = null)
        {
            // NOTE(review): FixDotInUri is defined elsewhere; presumably it adjusts Uri parsing for urls containing "./" - confirm.
            if (url.Contains("./"))
                FixDotInUri();

            url = WebUtility.HtmlDecode(url);

            // retry if request has been timed out or refused
            for (int i = 0; i <= retryCount; i++)
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                    request.Method = "GET";
                    request.UserAgent = FashionExchangeSetting.UserAgentChrome;
                    request.CookieContainer = cookieContainer;

                    SetRequestHeaders(request, httpHeaders);

                    // 'using' releases the response (and its connection) even when the xml cannot be
                    // parsed and the request is retried.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        XmlDocument xml = new XmlDocument();

                        xml.Load(response.GetResponseStream());

                        return xml;
                    }
                }
                catch (Exception e)
                {
                    log.InfoFormat("An exception occured while loading xml. Url: {0}", url);
                    e.Data[ParameterInfo.ExceptionData.Url] = url;

                    // give up if already retried n times
                    if (i == retryCount)
                        throw;

                    // Retry on timeout when retryOnTimeout = true, or on any error when retryOnError = true
                    if (retryOnTimeout
                        && e is WebException
                        && ((((WebException)e).Status == WebExceptionStatus.Timeout)
                            || (((WebException)e).Response != null && ((HttpWebResponse)((WebException)e).Response).StatusCode == HttpStatusCode.GatewayTimeout)))
                        log.Warn(e.Message);
                    else if (retryOnError)
                        log.Warn(e.Message);
                    else
                        throw;

                    if (retryDelayInMilisecond > 0)
                    {
                        log.Info("Retry loading xml after " + retryDelayInMilisecond + " milliseconds. Retry count: " + (i + 1));
                        System.Threading.Thread.Sleep(retryDelayInMilisecond);
                    }
                    else
                        log.Info("Retry loading xml. Retry count: " + (i + 1));
                }
            }
            return null;
        }

        /// <summary>
        /// Removes default namespace declarations of the form xmlns="..." from an xml string.
        /// Prefixed declarations (xmlns:prefix="...") do not match the search text and are left in place.
        /// </summary>
        /// <param name="xml">Xml text to clean.</param>
        /// <returns>The xml text with every matched declaration replaced by an empty string.</returns>
        public static string RemoveXmlNamespaces(string xml)
        {
            string stripped = xml;

            // The declarations are collected from the original text once, then removed one by one.
            foreach (string declaration in StringUtil.Substrings(xml, "xmlns=\"", "\"", includeStartStr: true, includeEndStr: true))
            {
                stripped = stripped.Replace(declaration, String.Empty);
            }

            return stripped;
        }

        /// <summary>
        /// Decodes HTML entities in <paramref name="text"/> and then applies the fixed replacement list below,
        /// in order: line breaks, tabs and non-breaking spaces become spaces, a set of control characters and
        /// symbols is removed, a set of accented Latin letters is replaced by unaccented ones, and fraction
        /// characters are spelled out. The result is trimmed and runs of spaces are collapsed to one space.
        /// </summary>
        /// <param name="text">Text to normalize.</param>
        /// <returns>The normalized text; a null or empty input is returned unchanged.</returns>
        public static string NormalizeText(string text)
        {
            if (String.IsNullOrEmpty(text))
                return text;

            // The replacements run sequentially on the decoded text, so their order is significant.
            text = WebUtility.HtmlDecode(text)
                .Replace("\n", " ")
                .Replace("\r", " ")
                .Replace("\t", " ")
                .Replace("\u00A0", " ")
                // characters from the C1 control range (U+0080 - U+009F) are dropped
                .Replace("\u0085", String.Empty)
                .Replace("\u0080", String.Empty)
                .Replace("\u0093", String.Empty)
                .Replace("\u0099", String.Empty)
                .Replace("\u0091", String.Empty)
                .Replace("\u0092", String.Empty)
                .Replace("\u0094", String.Empty)
                .Replace("\u009f", String.Empty)
                .Replace("\u008c", String.Empty)
                // full-width parentheses become their ASCII counterparts
                .Replace("\uff08", "(")
                .Replace("\uff09", ")")
                .Replace("¢", String.Empty)
                .Replace("„", String.Empty)
                .Replace("€", String.Empty)
                .Replace("™", String.Empty)
                .Replace("®", String.Empty)
                .Replace("“", String.Empty)
                .Replace("–º", String.Empty)
                .Replace("â", String.Empty)
                .Replace("Â", String.Empty)
                // NOTE(review): as written this replaces "&" with itself; the search text was possibly an
                // HTML entity that got decoded when this listing was produced - confirm against the original file.
                .Replace("&", "&")
                .Replace("è", "e")
                .Replace("é", "e")
                .Replace("ë", "e")
                .Replace("ç", "c")
                .Replace("ć", "c")
                .Replace("Ć", "C")
                .Replace("ô", "o")
                .Replace("ó", "o")
                .Replace("ò", "o")
                .Replace("Ò", "O")
                .Replace("ø", "o")
                .Replace("Ê", "E")
                .Replace("É", "E")
                .Replace("È", "E")
                .Replace("Ë", "E")
                .Replace("ù", "u")
                .Replace("ü", "u")
                .Replace("ä", "a")
                .Replace("Ã", "A")
                .Replace("Å", "A")
                .Replace("å", "a")
                .Replace("à", "a")
                .Replace("á", "a")
                .Replace("–", "-")
                .Replace("�", "?")
                .Replace("’", "'")
                .Replace("⁺", "+")
                // NOTE(review): the two search literals below hold non-printing characters. String.Replace throws
                // ArgumentException for an empty search string, so verify the characters survived any copy/paste.
                .Replace("", String.Empty) // hidden character, ref to task #480
                .Replace("​", String.Empty) // hidden character, ref to task #4165
                .Replace("▽", String.Empty)
                .Replace("`", "'")
                .Replace("ï", "i")
                .Replace("Ï", "I")
                .Replace("í", "i")
                .Replace("Í", "I")
                // fraction characters are spelled out with ASCII digits
                .Replace("⅜", "3/8")
                .Replace("¾", "3/4")
                .Replace("¼", "1/4")
                .Replace("½", "1/2")
                .Replace("⅝", "5/8")
                .Replace("⅛", "1/8")
                .Replace("″", "\"")
                .Replace("ð", String.Empty)
                .Trim();

            // remove extra spaces
            while (text.Contains("  "))
                text = text.Replace("  ", " ");

            return text;
        }

        /// <summary>
        /// Sends an HTTP GET request and returns the response body as a string, retrying on timeouts
        /// (and optionally on any error).
        /// </summary>
        /// <param name="url">Address to request. HTML entities in it are decoded before the request is created.</param>
        /// <param name="cookieContainer">Optional cookies attached to the request.</param>
        /// <param name="accept">Optional Accept header value.</param>
        /// <param name="httpHeaders">Optional extra headers, applied through SetRequestHeaders.</param>
        /// <param name="retryOnError">When true, every failure is retried, not only timeouts.</param>
        /// <param name="retryOnTimeout">When true, request timeouts and 504 Gateway Timeout responses are retried.</param>
        /// <param name="retryCount">Maximum number of retries after the first attempt.</param>
        /// <param name="retryDelayInMilisecond">Pause between attempts; values of 0 or less retry immediately.</param>
        /// <param name="proxy">Optional proxy used for the request.</param>
        /// <param name="checkUrlLoaded">
        /// When true, the call is rejected if the same url is already recorded two or more times in
        /// loadedHttpGetUrls. The url is recorded whenever the call is not rejected, whatever this flag is.
        /// </param>
        /// <returns>The response body; null only when <paramref name="retryCount"/> is negative, because no attempt is made.</returns>
        /// <exception cref="Exception">
        /// Thrown when the url was already loaded too often, or rethrown from the last attempt (with the url
        /// stored in its Data collection) when the failure is not retryable or the retries are used up.
        /// </exception>
        public static string CreateHttpGetRequest(string url, CookieContainer cookieContainer = null, string accept = null, Dictionary httpHeaders = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null, bool checkUrlLoaded = true)
        {
            // check if url has already been loaded in the past
            if (checkUrlLoaded && loadedHttpGetUrls.Count(m => m == url) >= 2)
            {
                Exception e = new Exception("Url has already been loaded more than 2 times. Url: " + url);
                e.Data[ParameterInfo.ExceptionData.Url] = url;
                throw e;
            }
            else
                loadedHttpGetUrls.Enqueue(url);

            url = WebUtility.HtmlDecode(url);

            bool hasConnectionClosedError = false;

            // retry if request has been timed out or refused
            for (int i = 0; i <= retryCount; i++)
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                    request.Method = "GET";
                    request.UserAgent = FashionExchangeSetting.UserAgentChrome;
                    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
                    request.CookieContainer = cookieContainer;

                    /* Sometimes .net framework doesn't handle http request properly and instead throws 
                     * web exception "The request was aborted: The connection was closed unexpectedly". 
                     * Using a different HttpVersion.Version10 can get around this issue. */
                    if (hasConnectionClosedError)
                        request.ProtocolVersion = HttpVersion.Version10;

                    if (proxy != null)
                        request.Proxy = proxy;

                    if (!String.IsNullOrEmpty(accept))
                        request.Accept = accept;

                    SetRequestHeaders(request, httpHeaders);

                    // Dispose the response even when reading the body fails, so a broken transfer
                    // does not leave the connection open while the request is retried.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (StreamReader reader = new StreamReader(response.GetResponseStream()))
                    {
                        return reader.ReadToEnd();
                    }
                }
                catch (Exception e)
                {
                    log.InfoFormat("A WebException occured while sending request. Url: {0}", url);
                    e.Data[ParameterInfo.ExceptionData.Url] = url;

                    // give up if already retried n times
                    if (i == retryCount)
                        throw;

                    WebException webException = e as WebException;
                    bool isTimeout = webException != null
                        && (webException.Status == WebExceptionStatus.Timeout
                            || (webException.Response != null && ((HttpWebResponse)webException.Response).StatusCode == HttpStatusCode.GatewayTimeout));

                    // If web exception is connection closed unexpectedly, retry with different Protocol Version (i.e. HttpVersion10)
                    // Else retry if request timeout and retryOnTimeout = true, or if retryOnError = true
                    if (webException != null && webException.Status == WebExceptionStatus.ConnectionClosed && !hasConnectionClosedError)
                        hasConnectionClosedError = true;
                    else if ((retryOnTimeout && isTimeout) || retryOnError)
                        log.Warn(e.Message);
                    else
                        throw;

                    if (retryDelayInMilisecond > 0)
                    {
                        log.Info("Retry sending request after " + retryDelayInMilisecond + " milliseconds. Retry count: " + (i + 1));
                        System.Threading.Thread.Sleep(retryDelayInMilisecond);
                    }
                    else
                    {
                        // This is the GET helper; the message used to say "posting" (copied from the POST helper).
                        log.Info("Retry sending request. Retry count: " + (i + 1));
                    }
                }
            }
            return null;
        }

        /// <summary>
        /// Sends an HTTP POST request with a UTF-8 encoded body and returns the response body as a string,
        /// retrying on timeouts (and optionally on any error).
        /// </summary>
        /// <param name="url">Address to post to. HTML entities in it are decoded before the request is created.</param>
        /// <param name="postData">Request body; null is sent as an empty body.</param>
        /// <param name="httpHeaders">Optional extra headers, applied through SetRequestHeaders.</param>
        /// <param name="cookieContainer">Optional cookies attached to the request.</param>
        /// <param name="ContentType">Content-Type header of the request body.</param>
        /// <param name="retryOnError">When true, every failure is retried, not only timeouts.</param>
        /// <param name="retryOnTimeout">When true, request timeouts and 504 Gateway Timeout responses are retried.</param>
        /// <param name="retryCount">Maximum number of retries after the first attempt.</param>
        /// <param name="retryDelayInMilisecond">Pause between attempts; values of 0 or less retry immediately.</param>
        /// <param name="proxy">Optional proxy used for the request.</param>
        /// <returns>The response body; null only when <paramref name="retryCount"/> is negative, because no attempt is made.</returns>
        /// <exception cref="Exception">
        /// The failure of the last attempt is rethrown (with the url stored in its Data collection) when it is
        /// not retryable or the retries are used up.
        /// </exception>
        public static string CreateHttpPostRequest(string url, string postData, Dictionary httpHeaders = null, CookieContainer cookieContainer = null, string ContentType = "application/x-www-form-urlencoded; charset=UTF-8", bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null)
        {
            postData = postData ?? String.Empty;

            url = WebUtility.HtmlDecode(url);

            bool hasConnectionClosedError = false;

            byte[] postDataBytes = UTF8Encoding.UTF8.GetBytes(postData);

            // retry if request has been timed out or refused
            for (int i = 0; i <= retryCount; i++)
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                    request.Method = "POST";
                    request.ContentLength = postDataBytes.Length;
                    request.ContentType = ContentType;
                    request.UserAgent = FashionExchangeSetting.UserAgentChrome;
                    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
                    request.CookieContainer = cookieContainer;

                    /* Sometimes .net framework doesn't handle http request properly and instead throws 
                     * web exception "The request was aborted: The connection was closed unexpectedly". 
                     * Using a different HttpVersion.Version10 can get around this issue. */
                    if (hasConnectionClosedError)
                        request.ProtocolVersion = HttpVersion.Version10;

                    if (proxy != null)
                        request.Proxy = proxy;

                    SetRequestHeaders(request, httpHeaders);

                    using (Stream postStream = request.GetRequestStream())
                    {
                        postStream.Write(postDataBytes, 0, postDataBytes.Length);
                    }

                    // Dispose the response even when reading the body fails, so a broken transfer
                    // does not leave the connection open while the request is retried.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (StreamReader reader = new StreamReader(response.GetResponseStream()))
                    {
                        return reader.ReadToEnd();
                    }
                }
                catch (Exception e)
                {
                    log.InfoFormat("An exception occured while posting request. Url: {0}", url);
                    e.Data[ParameterInfo.ExceptionData.Url] = url;

                    // give up if already retried n times
                    if (i == retryCount)
                        throw;

                    WebException webException = e as WebException;
                    bool isTimeout = webException != null
                        && (webException.Status == WebExceptionStatus.Timeout
                            || (webException.Response != null && ((HttpWebResponse)webException.Response).StatusCode == HttpStatusCode.GatewayTimeout));

                    // If web exception is connection closed unexpectedly, retry with different Protocol Version (i.e. HttpVersion10)
                    // Else retry if request timeout and retryOnTimeout = true, or if retryOnError = true
                    if (webException != null && webException.Status == WebExceptionStatus.ConnectionClosed && !hasConnectionClosedError)
                        hasConnectionClosedError = true;
                    else if ((retryOnTimeout && isTimeout) || retryOnError)
                        log.Warn(e.Message);
                    else
                        throw;

                    if (retryDelayInMilisecond > 0)
                    {
                        log.Info("Retry posting request after " + retryDelayInMilisecond + " milliseconds. Retry count: " + (i + 1));
                        System.Threading.Thread.Sleep(retryDelayInMilisecond);
                    }
                    else
                    {
                        log.Info("Retry posting request. Retry count: " + (i + 1));
                    }
                }
            }

            return null;
        }

        /// <summary>
        /// Downloads the resource at <paramref name="destFilePath"/> with an HTTP GET request and writes it to
        /// <paramref name="localFilePath"/>, creating the file or overwriting an existing one.
        /// </summary>
        /// <param name="destFilePath">Despite its name, this is the remote url to download from.</param>
        /// <param name="localFilePath">Path of the local file to write.</param>
        public static void DownloadFile(string destFilePath, string localFilePath)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(destFilePath);
            request.Method = WebRequestMethods.Http.Get;

            // The using blocks release the connection and the file handle even when the transfer fails
            // part-way. The file is only opened after the server has answered, as before.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (FileStream writer = new FileStream(localFilePath, FileMode.Create))
            {
                responseStream.CopyTo(writer);
            }
        }

        /// <summary>
        /// Copies the given header name/value pairs onto <paramref name="request"/>. Headers that
        /// HttpWebRequest exposes through dedicated members (User-Agent, Referer, Range, Expect, Accept,
        /// Connection keep-alive, Content-Type) are routed to those members; everything else is written
        /// to the Headers collection.
        /// </summary>
        /// <param name="request">Request to configure.</param>
        /// <param name="httpHeaders">Headers to apply; null or empty leaves the request untouched.</param>
        public static void SetRequestHeaders(HttpWebRequest request, Dictionary httpHeaders)
        {
            if (httpHeaders == null || !httpHeaders.Any())
                return;

            foreach (var header in httpHeaders)
            {
                var name = header.Key;
                var value = header.Value;

                switch (name)
                {
                    case "User-Agent":
                        request.UserAgent = value;
                        break;

                    case "Referer":
                        request.Referer = value;
                        break;

                    case "Range":
                        // The end of the range is the text after "-"; the start is the text between
                        // "=" (when a range specifier is present) and "-".
                        long rangeEnd = Convert.ToInt64(StringUtil.SubstringToEnd(value, "-"));
                        if (value.Contains("="))
                        {
                            string specifier = StringUtil.SubstringFromStart(value, "=");
                            long rangeStart = Convert.ToInt64(StringUtil.Substring(value, "=", "-"));
                            request.AddRange(specifier, rangeStart, rangeEnd);
                        }
                        else
                        {
                            long rangeStart = Convert.ToInt64(StringUtil.SubstringFromStart(value, "-"));
                            request.AddRange(rangeStart, rangeEnd);
                        }
                        break;

                    case "Expect":
                        request.ServicePoint.Expect100Continue = Convert.ToBoolean(value);
                        break;

                    case "Accept":
                        request.Accept = value;
                        break;

                    case "Connection":
                        if (String.Equals(value, "Keep-Alive"))
                        {
                            // For some reason Keep-Alive is set only on first request. Reflection is used to set it on every request.
                            ServicePoint point = request.ServicePoint;
                            System.Reflection.PropertyInfo behaviour = point.GetType().GetProperty("HttpBehaviour", System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic);
                            behaviour.SetValue(point, (byte)0, null);
                        }
                        else if (String.Equals(value, "keep-alive"))
                        {
                            // Lower-case variant. The reflection call appends keep-alive instead of replacing the existing value,
                            // so the first request to a domain carries "keep-alive,Keep-Alive"; later requests carry "keep-alive".
                            request.Headers.GetType().InvokeMember(
                                "ChangeInternal",
                                System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.InvokeMethod,
                                Type.DefaultBinder,
                                request.Headers,
                                new object[] { "Connection", "keep-alive" }
                            );
                        }
                        else
                        {
                            // Any other Connection value is written like an ordinary header.
                            request.Headers[name] = value;
                        }
                        break;

                    case "Proxy-Authorization":
                        request.Headers[name] = value;
                        request.PreAuthenticate = true;
                        break;

                    case "Content-Type":
                        request.ContentType = value;
                        break;

                    default:
                        request.Headers[name] = value;
                        break;
                }
            }
        }

        /*
         * Workaround for the .Net framework Uri behaviour that drops a dot at the end of a path segment.
         * For example: http://www.thevetshed.com.au/buy/adjustable-3-8-puppy-kitten-cat-harness-w-lead./CATHARN
         * becomes http://www.thevetshed.com.au/buy/adjustable-3-8-puppy-kitten-cat-harness-w-lead/CATHARN
         *
         * The http and https parsers are looked up through reflection and their CanonicalizeAsFilePath flag
         * is cleared. Nothing happens when the private members cannot be found.
         *
         * Reference: http://stackoverflow.com/questions/856885/httpwebrequest-to-url-with-dot-at-the-end
         */
        private static void FixDotInUri()
        {
            // Bit of the parser flags that stands for CanonicalizeAsFilePath.
            const int canonicalizeAsFilePath = 0x1000000;

            System.Reflection.MethodInfo syntaxLookup = typeof(UriParser).GetMethod("GetSyntax", System.Reflection.BindingFlags.Static | System.Reflection.BindingFlags.NonPublic);
            System.Reflection.FieldInfo parserFlags = typeof(UriParser).GetField("m_Flags", System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic);
            if (syntaxLookup == null || parserFlags == null)
                return;

            string[] schemes = new[] { "http", "https" };
            for (int index = 0; index < schemes.Length; index++)
            {
                UriParser schemeParser = (UriParser)syntaxLookup.Invoke(null, new object[] { schemes[index] });
                if (schemeParser == null)
                    continue;

                int currentFlags = (int)parserFlags.GetValue(schemeParser);
                if ((currentFlags & canonicalizeAsFilePath) == 0)
                    continue;

                parserFlags.SetValue(schemeParser, currentFlags & ~canonicalizeAsFilePath);
            }
        }

        /// <summary>
        /// Logs in to the admin site at <paramref name="hostAddress"/> with the configured admin credentials
        /// and returns the cookies set by the login response.
        /// </summary>
        /// <param name="hostAddress">Base address the admin login path is appended to.</param>
        /// <returns>The cookie container that was attached to the login request.</returns>
        public static CookieContainer GetLoginCookieContainer(string hostAddress)
        {
            // login
            string loginUrl = hostAddress + FashionExchangeSetting.AdminLoginPath;

            // Percent-encode both values: the body is application/x-www-form-urlencoded, so a reserved
            // character such as '&', '=', '+' or '%' in a credential would otherwise corrupt the form fields.
            // NOTE(review): assumes the settings hold the raw (not already url-encoded) credentials - confirm.
            string userName = Uri.EscapeDataString(FashionExchangeSetting.AdminUserName ?? String.Empty);
            string password = Uri.EscapeDataString(FashionExchangeSetting.AdminPassword ?? String.Empty);
            string postData = String.Format("email={0}&password={1}", userName, password);
            byte[] postDataBytes = UTF8Encoding.UTF8.GetBytes(postData);

            HttpWebRequest loginReq = (HttpWebRequest)WebRequest.Create(loginUrl);
            loginReq.Method = "POST";
            loginReq.ContentType = "application/x-www-form-urlencoded";
            loginReq.ContentLength = postDataBytes.Length;
            // Redirects are not followed; the cookies of the login response itself are what is wanted.
            loginReq.AllowAutoRedirect = false;
            loginReq.CookieContainer = new CookieContainer();
            using (Stream postStream = loginReq.GetRequestStream())
            {
                postStream.Write(postDataBytes, 0, postDataBytes.Length);
            }

            // The body is not needed; the response is only awaited so the cookies get stored.
            using (HttpWebResponse loginResp = (HttpWebResponse)loginReq.GetResponse())
            {
            }

            return loginReq.CookieContainer;
        }

        /// <summary>
        /// Posts a scrape request to the admin scrape-store endpoint of <paramref name="hostAddress"/> without
        /// waiting for the scrape to finish: the request times out after 2 seconds and that timeout is
        /// treated as success.
        /// </summary>
        /// <param name="hostAddress">Base address the admin scrape-store path is appended to.</param>
        /// <param name="storeIds">Stores to scrape; null is treated as an empty list.</param>
        /// <param name="loginCookieContainer">Cookies attached to the request (e.g. from GetLoginCookieContainer).</param>
        /// <remarks>
        /// Every other parameter is forwarded unchanged as a form field whose name comes from
        /// ParameterInfo.QueryString; nullable values that are null are sent as empty fields.
        /// </remarks>
        /// <exception cref="WebException">Any request failure other than a timeout.</exception>
        public static void ScrapeStore(string hostAddress, int[] storeIds = null, bool scrapeNewUrl = false, double? scrapeExistingUrlAgeInHour = null, int? scrapeExistingUrlPriceDroppedInDay = null, bool uploadProductPhoto = false, bool checkNewsletterProductAvailabilityAndWatermarkPhoto = false, bool uploadNewsletterPhoto = false, bool uploadOfferImage = false, bool syncUserActivity = false, bool reindex = false, bool uploadIndex = false, bool sendPriceAlert = false, bool sendSaleAlert = false, bool deleteOldIndex = false, bool deleteOldPhoto = false, int scraperThreadCount = 10, CookieContainer loginCookieContainer = null, bool notifyTodaysSale = false, bool checkStoreForScrapeAlert = false, bool sendScrapeResultSummary = false, int? deleteProductAgeInDay = null)
        {
            if (storeIds == null)
                storeIds = new int[0];

            // One StoreId field per store, followed by one field per option.
            string postData = String.Join("&", storeIds.Select(m => ParameterInfo.QueryString.StoreId + "=" + m));
            postData += "&" + ParameterInfo.QueryString.ScrapeNewUrl + "=" + scrapeNewUrl;
            postData += "&" + ParameterInfo.QueryString.ScrapeExistingUrlAgeInHour + "=" + scrapeExistingUrlAgeInHour;
            postData += "&" + ParameterInfo.QueryString.ScrapeExistingUrlPriceDroppedInDay + "=" + scrapeExistingUrlPriceDroppedInDay;
            postData += "&" + ParameterInfo.QueryString.UploadProductPhoto + "=" + uploadProductPhoto;
            postData += "&" + ParameterInfo.QueryString.CheckNewsletterProductAvailabilityAndWatermarkPhoto + "=" + checkNewsletterProductAvailabilityAndWatermarkPhoto;
            postData += "&" + ParameterInfo.QueryString.UploadNewsletterPhoto + "=" + uploadNewsletterPhoto;
            postData += "&" + ParameterInfo.QueryString.UploadOfferImage + "=" + uploadOfferImage;
            postData += "&" + ParameterInfo.QueryString.SyncUserActivity + "=" + syncUserActivity;
            postData += "&" + ParameterInfo.QueryString.Reindex + "=" + reindex;
            postData += "&" + ParameterInfo.QueryString.UploadIndex + "=" + uploadIndex;
            postData += "&" + ParameterInfo.QueryString.SendPriceAlert + "=" + sendPriceAlert;
            postData += "&" + ParameterInfo.QueryString.SendSaleAlert + "=" + sendSaleAlert;
            postData += "&" + ParameterInfo.QueryString.DeleteOldIndex + "=" + deleteOldIndex;
            postData += "&" + ParameterInfo.QueryString.DeleteOldPhoto + "=" + deleteOldPhoto;
            postData += "&" + ParameterInfo.QueryString.ScraperThreadCount + "=" + scraperThreadCount;
            postData += "&" + ParameterInfo.QueryString.NotifyTodaysSale + "=" + notifyTodaysSale;
            postData += "&" + ParameterInfo.QueryString.CheckStoreForScrapeAlert + "=" + checkStoreForScrapeAlert;
            postData += "&" + ParameterInfo.QueryString.SendScrapeResultSummary + "=" + sendScrapeResultSummary;
            postData += "&" + ParameterInfo.QueryString.DeleteProductAgeInDay + "=" + deleteProductAgeInDay;
            byte[] postDataBytes = UTF8Encoding.UTF8.GetBytes(postData);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(hostAddress + FashionExchangeSetting.AdminScrapeStorePath);
            request.Method = WebRequestMethods.Http.Post;
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = postDataBytes.Length;
            request.AllowAutoRedirect = false;
            request.CookieContainer = loginCookieContainer;
            request.Timeout = 1000 * 2;

            using (Stream postStream = request.GetRequestStream())
                postStream.Write(postDataBytes, 0, postDataBytes.Length);

            try
            {
                // If the server does answer within the timeout, release the response right away so the
                // connection is not left open.
                using (request.GetResponse())
                {
                }
            }
            catch (WebException e)
            {
                // Request was set to timeout after 2 seconds because we don't want to wait for the request to complete
                if (e.Status == WebExceptionStatus.Timeout)
                    return;
                else
                    throw;
            }
        }

        // Proxy lists fetched so far. GetProxies looks entries up by ProxyNetwork value and fills them on first use.
        private static Dictionary> proxies = new Dictionary>();
        // Proxies that passed a test request, keyed in GetProxies by a (proxy network, test url authority) tuple.
        private static Dictionary, List> proxiesValidatedByUrl = new Dictionary, List>();
        // Object GetProxies locks on while it checks and populates `proxies`.
        private static Object proxyLock = new Object();
        public static List GetProxies(ProxyNetwork proxyNetwork = ProxyNetwork.MyPrivateProxy, string testUrl = null, int maxTestUrlAttemptCount = 30, int maxWorkingProxyCount = 20, bool disableCache = false)
        {
            if (proxyNetwork == ProxyNetwork.MyPrivateProxy)
            {
                lock (proxyLock)
                {
                    if (!proxies.ContainsKey(ProxyNetwork.MyPrivateProxy))
                    {
                        log.Info("Get MyPrivateProxy proxy list...");
                        proxies[ProxyNetwork.MyPrivateProxy] = new List();
                        string proxyListResponse = ScraperUtil.CreateHttpGetRequest("https://api.myprivateproxy.net/v1/fetchProxies/json/full/gkytva0jbisl9olooyzjmpbdfcrpmn9i", checkUrlLoaded: false);
                        JObject proxyListJson = JObject.Parse("{Wrapper:" + proxyListResponse + "}");
                        foreach (JToken proxyListItem in proxyListJson.SelectToken("Wrapper"))
                        {
                            log.Info("Found proxy: " + proxyListItem["proxy_ip"].ToString() + ":" + proxyListItem["proxy_port"].ToString());
                            WebProxy proxy = new WebProxy(proxyListItem["proxy_ip"].ToString(), Convert.ToInt32(proxyListItem["proxy_port"].ToString()));
                            proxy.Credentials = new NetworkCredential(proxyListItem["username"].ToString(), proxyListItem["password"].ToString());
                            proxies[ProxyNetwork.MyPrivateProxy].Add(proxy);
                        }
                    }
                }
            }
            else if (proxyNetwork == ProxyNetwork.ProxyBonanza)
            {
                lock (proxyLock)
                {
                    if (!proxies.ContainsKey(ProxyNetwork.ProxyBonanza))
                    {
                        proxies[ProxyNetwork.ProxyBonanza] = new List();
                        string apiUrl = "https://api.proxybonanza.com/v1/userpackages/49575.json";
                        Dictionary httpHeaders = new Dictionary();
                        httpHeaders.Add("Authorization", "mIAWG1CKaz3cSjWUV2wnAaszmNS6nck6C8kQIDBmOheslgMOFp!43336");
                        JObject proxyListJson = JObject.Parse(ScraperUtil.CreateHttpGetRequest(apiUrl, httpHeaders: httpHeaders, checkUrlLoaded: false));
                        foreach (JToken proxyItem in proxyListJson.SelectToken("data.ippacks"))
                        {
                            log.Info("Found proxy: " + proxyItem["ip"].ToString() + ":" + proxyItem["port_http"].ToString());

                            WebProxy proxy = new WebProxy(proxyItem["ip"].ToString(), Convert.ToInt32(proxyItem["port_http"].ToString()));
                            proxy.Credentials = new NetworkCredential(proxyListJson.SelectToken("data.login").ToString(), proxyListJson.SelectToken("data.password").ToString());
                            proxies[ProxyNetwork.ProxyBonanza].Add(proxy);
                        }
                    }
                }
            }
            else if (proxyNetwork == ProxyNetwork.BinaryLane)
            {
                lock (proxyLock)
                {
                    if (!proxies.ContainsKey(ProxyNetwork.BinaryLane))
                    {
                        proxies[ProxyNetwork.BinaryLane] = new List();
                        WebProxy proxy = new WebProxy("43.229.63.22", 29842);
                        proxy.Credentials = new NetworkCredential("scrapeserver", "95EpLZ");
                        proxies[ProxyNetwork.BinaryLane].Add(proxy);
                    }
                }
            }
            else if (proxyNetwork == ProxyNetwork.BinaryLaneScrapeServer)
            {
                lock (proxyLock)
                {
                    if (!proxies.ContainsKey(ProxyNetwork.BinaryLaneScrapeServer))
                    {
                        proxies[ProxyNetwork.BinaryLaneScrapeServer] = new List();
                        WebProxy proxy = new WebProxy("43.229.63.22", 29843);
                        proxies[ProxyNetwork.BinaryLaneScrapeServer].Add(proxy);
                    }
                }
            }
            else if (proxyNetwork == ProxyNetwork.LuminatiGlobalShared) // Luminati Global Shared proxies are slightly different because it has huge list of IPs
            {
                if (disableCache && String.IsNullOrEmpty(testUrl))
                    throw new Exception("test url is required when disable cache is enabled");

                if (String.IsNullOrEmpty(testUrl))
                {
                    string sessionId = new Random().Next().ToString();
                    string userName = "lum-customer-hl_8238b460-zone-zone1-session-" + sessionId;
                    string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";

                    WebProxy proxy = new WebProxy(proxyAddress, 22225);
                    proxy.Credentials = new NetworkCredential(userName, "200fbti3d9xt");

                    return new List() { proxy };
                }

                Tuple luminatiAndTestUrl = new Tuple(proxyNetwork, new Uri(testUrl).Authority);
                if (!disableCache && proxiesValidatedByUrl.ContainsKey(luminatiAndTestUrl))
                    return proxiesValidatedByUrl[luminatiAndTestUrl];

                proxiesValidatedByUrl[luminatiAndTestUrl] = new List();

                for (int attempt = 0; attempt < maxTestUrlAttemptCount && proxiesValidatedByUrl[luminatiAndTestUrl].Count < maxWorkingProxyCount; attempt++)
                {
                    string sessionId = new Random().Next().ToString();
                    string userName = "lum-customer-hl_8238b460-zone-zone1-session-" + sessionId;
                    string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";
                    WebProxy proxy = new WebProxy(proxyAddress, 22225);
                    proxy.Credentials = new NetworkCredential(userName, "200fbti3d9xt");
                    try
                    {
                        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(testUrl);
                        request.UserAgent = FashionExchangeSetting.UserAgentChrome;
                        request.Proxy = proxy;
                        request.Timeout = 1000 * 10;
                        HttpWebResponse response = (HttpWebResponse)request.GetResponse();

                        log.Info("Found working proxy. Test Url: " + testUrl + " Proxy IP: " + CreateHttpGetRequest("https://api.ipify.org/", proxy: proxy, checkUrlLoaded: false));
                        proxiesValidatedByUrl[luminatiAndTestUrl].Add(proxy);
                    }
                    catch (WebException e)
                    {
                        if ((e.Status == WebExceptionStatus.Timeout)
                        || (e.Response != null && ((HttpWebResponse)e.Response).StatusCode == HttpStatusCode.GatewayTimeout))
                            log.Info("Timed out downloading test url. Test Url: " + testUrl + " Proxy IP: " + CreateHttpGetRequest("https://api.ipify.org/", proxy: proxy, checkUrlLoaded: false));
                        else if (e.Response != null && (((HttpWebResponse)e.Response).StatusCode == HttpStatusCode.Unauthorized || ((HttpWebResponse)e.Response).StatusCode == HttpStatusCode.Forbidden))
                            log.Info("Forbidden or unauthorized downloading test url. Test Url: " + testUrl + " Proxy IP: " + CreateHttpGetRequest("https://api.ipify.org/", proxy: proxy, checkUrlLoaded: false));
                        else
                            log.Info("Error downloading test url. Test Url: " + testUrl + " Proxy IP: " + CreateHttpGetRequest("https://api.ipify.org/", proxy: proxy, checkUrlLoaded: false) + " Exception: " + e.ToString());
                    }
                }


                return proxiesValidatedByUrl[luminatiAndTestUrl];
            }
            else if (proxyNetwork == ProxyNetwork.LuminatiStatic)
            {
                string proxyAddress = "customer-hl_8238b460.zproxy.lum-superproxy.io";
                string userName = "lum-customer-hl_8238b460-zone-static";

                WebProxy proxy = new WebProxy(proxyAddress, 22225);
                proxy.Credentials = new NetworkCredential(userName, "xoj41myrdax6");

                return new List() { proxy };
            }
            else if (proxyNetwork == ProxyNetwork.LuminatiGlobalSharedJDSports)
            {
                string sessionId = new Random().Next().ToString();
                string userName = "lum-customer-hl_8238b460-zone-jdsports-session-" + sessionId;
                string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";

                WebProxy proxy = new WebProxy(proxyAddress, 22225);
                proxy.Credentials = new NetworkCredential(userName, "tw53pckkvvie");

                return new List() { proxy };
            }
            else if (proxyNetwork == ProxyNetwork.LuminatiGlobalSharedFootLocker)
            {
                string sessionId = new Random().Next().ToString();
                string userName = "lum-customer-hl_8238b460-zone-footlocker-session-" + sessionId;
                string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";

                WebProxy proxy = new WebProxy(proxyAddress, 22225);
                proxy.Credentials = new NetworkCredential(userName, "728yatag13w4");

                return new List() { proxy };
            }
            else if (proxyNetwork == ProxyNetwork.LuminatiGlobalSharedVisionDirect)
            {
                string sessionId = new Random().Next().ToString();
                string userName = "lum-customer-hl_8238b460-zone-visiondirect-session-" + sessionId;
                string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";

                WebProxy proxy = new WebProxy(proxyAddress, 22225);
                proxy.Credentials = new NetworkCredential(userName, "mls5d00ybhdr");

                return new List() { proxy };
            }
            else if (proxyNetwork == ProxyNetwork.LuminatiGlobalSharedASOS)
            {
                string sessionId = new Random().Next().ToString();
                string userName = "lum-customer-hl_8238b460-zone-asos-session-" + sessionId;
                string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";

                WebProxy proxy = new WebProxy(proxyAddress, 22225);
                proxy.Credentials = new NetworkCredential(userName, "cis8217oa34e");

                return new List() { proxy };
            }
            else if (proxyNetwork == ProxyNetwork.LuminatiStaticNZ)
            {
                string proxyAddress = "customer-hl_8238b460.zproxy.lum-superproxy.io";
                string userName = "lum-customer-hl_8238b460-zone-static_nz";

                WebProxy proxy = new WebProxy(proxyAddress, 22225);
                proxy.Credentials = new NetworkCredential(userName, "cf83xw0jjhgi");

                return new List() { proxy };
            }

            if (String.IsNullOrEmpty(testUrl))
                return proxies[proxyNetwork];

            Tuple proxyNetworkAndTestUrl = new Tuple(proxyNetwork, new Uri(testUrl).Authority);
            if (proxiesValidatedByUrl.ContainsKey(proxyNetworkAndTestUrl))
                return proxiesValidatedByUrl[proxyNetworkAndTestUrl];
            else
            {
                proxiesValidatedByUrl.Add(proxyNetworkAndTestUrl, new List());
                foreach (WebProxy proxy in proxies[proxyNetwork])
                {
                    try
                    {
                        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(testUrl);
                        request.UserAgent = FashionExchangeSetting.UserAgentChrome;
                        request.Proxy = proxy;
                        request.Timeout = 1000 * 10;
                        HttpWebResponse response = (HttpWebResponse)request.GetResponse();

                        log.Info("Found working proxy. Test Url: " + testUrl + " Proxy: " + proxy.Address.AbsoluteUri);
                        proxiesValidatedByUrl[proxyNetworkAndTestUrl].Add(proxy);
                    }
                    catch (WebException e)
                    {
                        if ((e.Status == WebExceptionStatus.Timeout)
                        || (e.Response != null && ((HttpWebResponse)e.Response).StatusCode == HttpStatusCode.GatewayTimeout))
                            log.Info("Timed out downloading test url. Test Url: " + testUrl + " Proxy: " + proxy.Address.AbsoluteUri);
                        else
                            log.Info("Error downloading test url. Test Url: " + testUrl + " Proxy: " + proxy.Address.AbsoluteUri + " Exception: " + e.ToString());
                    }
                }
                return proxiesValidatedByUrl[proxyNetworkAndTestUrl];
            }
        }

        // Shared generator used to pick a proxy. Creating "new Random()" on every call is unreliable on
        // .NET Framework: the default seed comes from the system clock, so instances created in quick
        // succession (e.g. by several scraper threads) produce the same sequence and pick the same proxy.
        private static readonly Random randomProxyGenerator = new Random();
        // System.Random is not thread-safe, so access to the shared instance is serialized.
        private static readonly object randomProxyGeneratorLock = new object();

        /// <summary>
        /// Picks one proxy at random from the list returned by GetProxies for the given arguments.
        /// </summary>
        /// <param name="proxyNetwork">Proxy network to pick from.</param>
        /// <param name="testUrl">Optional url passed through to GetProxies.</param>
        /// <param name="maxTestUrlAttemptCount">Passed through to GetProxies.</param>
        /// <param name="maxWorkingProxyCount">Passed through to GetProxies.</param>
        /// <param name="disableCache">Passed through to GetProxies.</param>
        /// <returns>A randomly selected proxy.</returns>
        /// <exception cref="ArgumentOutOfRangeException">GetProxies returned an empty list.</exception>
        public static WebProxy GetRandomProxy(ProxyNetwork proxyNetwork = ProxyNetwork.MyPrivateProxy, string testUrl = null, int maxTestUrlAttemptCount = 30, int maxWorkingProxyCount = 20, bool disableCache = false)
        {
            List<WebProxy> proxies = GetProxies(proxyNetwork: proxyNetwork, testUrl: testUrl, maxTestUrlAttemptCount: maxTestUrlAttemptCount, maxWorkingProxyCount: maxWorkingProxyCount, disableCache: disableCache);

            int index;
            lock (randomProxyGeneratorLock)
            {
                index = randomProxyGenerator.Next(proxies.Count);
            }

            // For an empty list Next(0) yields 0 and the indexer throws ArgumentOutOfRangeException,
            // the same exception type the previous ElementAt-based code produced.
            return proxies[index];
        }

        /// <summary>
        /// Writes a Chrome extension (zip archive) that configures Chrome to use <paramref name="proxy"/>
        /// as a fixed HTTP proxy and answers authentication challenges with the proxy's credentials.
        /// Note that the user name and password are stored in plain text inside the generated archive.
        /// </summary>
        /// <param name="proxy">Proxy to embed; must have an address and <see cref="NetworkCredential"/> credentials.</param>
        /// <param name="fileNamePrefix">Prefix of the generated file name.</param>
        /// <returns>Full path of the generated .zip file inside the temporary directory.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="proxy"/> is null.</exception>
        /// <exception cref="ArgumentException">The proxy has no address or its credentials are not a <see cref="NetworkCredential"/>.</exception>
        public static string GenerateChromeProxyExtension(WebProxy proxy, string fileNamePrefix)
        {
            // Validate up front so a bad proxy fails with a clear message instead of a
            // NullReferenceException in the middle of writing the archive.
            if (proxy == null)
                throw new ArgumentNullException(nameof(proxy));
            if (proxy.Address == null)
                throw new ArgumentException("Proxy address is required", nameof(proxy));

            NetworkCredential credential = proxy.Credentials as NetworkCredential;
            if (credential == null)
                throw new ArgumentException("Proxy credentials must be a NetworkCredential", nameof(proxy));

            // Outside of a hosted web application ApplicationPhysicalPath is null, so fall back to the app base directory.
            string extensionFullPath = HostingEnvironment.ApplicationPhysicalPath ?? AppDomain.CurrentDomain.BaseDirectory;

            if (!Directory.Exists(extensionFullPath + FashionExchangeSetting.TemporaryDirectory))
                Directory.CreateDirectory(extensionFullPath + FashionExchangeSetting.TemporaryDirectory);

            // Thread id + timestamp keep file names apart between concurrently running scrapers.
            extensionFullPath += FashionExchangeSetting.TemporaryDirectory + fileNamePrefix + "_ChromeProxy_" + System.Threading.Thread.CurrentThread.ManagedThreadId + "_" + DateTime.Now.ToString("yyyyMMdd_HHmmss") + ".zip";

            using (MemoryStream memoryStream = new MemoryStream())
            {
                // leaveOpen = true: the memory stream is still needed after the archive is finalized.
                using (ZipArchive archive = new ZipArchive(memoryStream, ZipArchiveMode.Create, true))
                {
                    // background.js: this is a composite format string, so literal braces are doubled.
                    // The auth listener is registered for the "<all_urls>" match pattern so it applies to every request.
                    ZipArchiveEntry backgroundJS = archive.CreateEntry("background.js");
                    using (Stream entryStream = backgroundJS.Open())
                    using (StreamWriter streamWriter = new StreamWriter(entryStream))
                    {
                        streamWriter.Write(
@"var config = {{
    mode: ""fixed_servers"",
    rules: {{
      singleProxy: {{
        scheme: ""http"",
        host: ""{0}"",
        port: parseInt({1})
      }},
      bypassList: [""foobar.com""]
    }}
  }};

chrome.proxy.settings.set({{value: config, scope: ""regular""}}, function() {{}});

function callbackFn(details) {{
    return {{
        authCredentials: {{
            username: ""{2}"",
            password: ""{3}""
        }}
    }};
}}

chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        {{urls: [""<all_urls>""]}},
        ['blocking']
);",
                            proxy.Address.Host,
                            proxy.Address.Port,
                            credential.UserName,
                            credential.Password
                        );
                    }

                    // manifest.json: written verbatim (not a format string), hence single braces.
                    ZipArchiveEntry manifestJSON = archive.CreateEntry("manifest.json");
                    using (Stream entryStream = manifestJSON.Open())
                    using (StreamWriter streamWriter = new StreamWriter(entryStream))
                    {
                        streamWriter.Write(
                            @"{
    ""version"": ""1.0.0"",
    ""manifest_version"": 2,
    ""name"": ""Chrome Proxy"",
    ""permissions"": [
        ""proxy"",
        ""tabs"",
        ""unlimitedStorage"",
        ""storage"",
        ""<all_urls>"",
        ""webRequest"",
        ""webRequestBlocking""
    ],
    ""background"": {
        ""scripts"": [""background.js""]
    },
    ""minimum_chrome_version"":""22.0.0""
}"
                            );
                    }
                }

                // The archive is complete once disposed; copy it from memory to disk.
                using (FileStream fileStream = new FileStream(extensionFullPath, FileMode.Create))
                {
                    memoryStream.Seek(0, SeekOrigin.Begin);
                    memoryStream.CopyTo(fileStream);
                }
            }

            return extensionFullPath;
        }

        /// <summary>
        /// Writes a Chrome extension (zip archive) that cancels every request whose url matches one of
        /// <paramref name="blockUrls"/> and whose resource type is listed in <paramref name="blockResourceTypes"/>.
        /// </summary>
        /// <param name="blockUrls">Url match patterns to block; at least one is required.</param>
        /// <param name="blockResourceTypes">Chrome webRequest resource type names to block.</param>
        /// <param name="fileNamePrefix">Prefix of the generated file name.</param>
        /// <returns>Full path of the generated .zip file inside the temporary directory.</returns>
        /// <exception cref="ArgumentException"><paramref name="blockUrls"/> is null or empty.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="blockResourceTypes"/> is null.</exception>
        public static string GenerateChromeBlockExtension(List<string> blockUrls, List<string> blockResourceTypes, string fileNamePrefix)
        {
            if (blockUrls == null || !blockUrls.Any())
                throw new ArgumentException("At least one block url is required");
            // Previously a null list surfaced as a NullReferenceException while the archive was being written.
            if (blockResourceTypes == null)
                throw new ArgumentNullException(nameof(blockResourceTypes));

            // Outside of a hosted web application ApplicationPhysicalPath is null, so fall back to the app base directory.
            string extensionFullPath = HostingEnvironment.ApplicationPhysicalPath ?? AppDomain.CurrentDomain.BaseDirectory;

            if (!Directory.Exists(extensionFullPath + FashionExchangeSetting.TemporaryDirectory))
                Directory.CreateDirectory(extensionFullPath + FashionExchangeSetting.TemporaryDirectory);

            // Thread id + timestamp keep file names apart between concurrently running scrapers.
            extensionFullPath += FashionExchangeSetting.TemporaryDirectory + fileNamePrefix + "_ChromeBlock_" + System.Threading.Thread.CurrentThread.ManagedThreadId + "_" + DateTime.Now.ToString("yyyyMMdd_HHmmss") + ".zip";

            using (MemoryStream memoryStream = new MemoryStream())
            {
                // leaveOpen = true: the memory stream is still needed after the archive is finalized.
                using (ZipArchive archive = new ZipArchive(memoryStream, ZipArchiveMode.Create, true))
                {
                    // background.js: composite format string (literal braces doubled); {0} and {1} receive
                    // comma separated, double-quoted lists.
                    ZipArchiveEntry backgroundJS = archive.CreateEntry("background.js");
                    using (Stream entryStream = backgroundJS.Open())
                    using (StreamWriter streamWriter = new StreamWriter(entryStream))
                    {
                        streamWriter.Write(
@"chrome.webRequest.onBeforeRequest.addListener(
    function(details) {{ return {{cancel: true}}; }},
    {{
        urls: [{0}],
        types: [{1}]
    }},
    [""blocking""]);",
                            String.Join(",", blockUrls.Select(m => "\"" + m + "\"")),
                            String.Join(",", blockResourceTypes.Select(m => "\"" + m + "\""))
                        );
                    }
                    // For a list of applicable resource types: https://developer.chrome.com/extensions/webRequest#type-ResourceType

                    // manifest.json: written verbatim (not a format string). The "<all_urls>" host permission
                    // lets the blocking listener see requests to any site.
                    ZipArchiveEntry manifestJSON = archive.CreateEntry("manifest.json");
                    using (Stream entryStream = manifestJSON.Open())
                    using (StreamWriter streamWriter = new StreamWriter(entryStream))
                    {
                        streamWriter.Write(
@"{
    ""version"": ""1.0.0"",
    ""manifest_version"": 2,
    ""name"": ""Chrome Block"",
    ""permissions"": [
        ""<all_urls>"",
        ""webRequest"",
        ""webRequestBlocking""
    ],
    ""background"": {
        ""scripts"": [""background.js""]
    },
    ""minimum_chrome_version"":""22.0.0""
}"
                            );
                    }
                }

                // The archive is complete once disposed; copy it from memory to disk.
                using (FileStream fileStream = new FileStream(extensionFullPath, FileMode.Create))
                {
                    memoryStream.Seek(0, SeekOrigin.Begin);
                    memoryStream.CopyTo(fileStream);
                }
            }

            return extensionFullPath;
        }

        // Cached multipliers that turn an amount in the keyed currency into AUD; filled on first use.
        private static Dictionary<string, decimal> exchangeRates;

        /// <summary>
        /// Converts <paramref name="price"/> from <paramref name="currencyCode"/> to Australian dollars
        /// using rates downloaded once from the apilayer "live" endpoint and cached for the process lifetime.
        /// </summary>
        /// <param name="currencyCode">Currency code of <paramref name="price"/> (case-insensitive lookup).</param>
        /// <param name="price">Amount in the source currency.</param>
        /// <returns>The amount multiplied by the cached rate for the currency.</returns>
        /// <exception cref="KeyNotFoundException">No rate is cached for <paramref name="currencyCode"/>
        /// (only the US, UK, New Zealand and Germany currency codes are registered).</exception>
        public static decimal ConvertToAUD(string currencyCode, decimal price)
        {
            if (exchangeRates == null)
            {
                log.Info("Retrieve exchange rates...");

                // Fill a local dictionary and publish it only when complete. Assigning the static field first
                // meant a failed download/parse left an empty, non-null cache behind, so every later call
                // failed with KeyNotFoundException and the download was never retried.
                Dictionary<string, decimal> rates = new Dictionary<string, decimal>(StringComparer.InvariantCultureIgnoreCase);

                JObject exchangeRateJson = JObject.Parse(ScraperUtil.CreateHttpGetRequest("http://apilayer.net/api/live?access_key=" + FashionExchangeSetting.CurrencyLayerAccessKey + "&currencies=AUD,NZD,EUR,GBP", checkUrlLoaded: false));

                // All quotes are relative to USD (quotes.USDxxx), so the other currencies are derived
                // as cross rates: USDAUD / USDxxx.
                decimal usdToAud = Convert.ToDecimal(exchangeRateJson.SelectToken("quotes.USDAUD").ToString());
                rates.Add(Country.UnitedStates.CurrencyCode, usdToAud);

                decimal usdToGbp = Convert.ToDecimal(exchangeRateJson.SelectToken("quotes.USDGBP").ToString());
                rates.Add(Country.UnitedKingdom.CurrencyCode, usdToAud / usdToGbp);

                decimal usdToNzd = Convert.ToDecimal(exchangeRateJson.SelectToken("quotes.USDNZD").ToString());
                rates.Add(Country.NewZealand.CurrencyCode, usdToAud / usdToNzd);

                decimal usdToEur = Convert.ToDecimal(exchangeRateJson.SelectToken("quotes.USDEUR").ToString());
                rates.Add(Country.Germany.CurrencyCode, usdToAud / usdToEur);

                exchangeRates = rates;

                log.Info("Exchange rates retrieved successfully");
            }

            return price * exchangeRates[currencyCode];
        }

        /// <summary>
        /// Returns <paramref name="url"/> with the query parameter <paramref name="queryName"/> set to
        /// <paramref name="queryValue"/>, or removed when <paramref name="queryValue"/> is null.
        /// The result is rebuilt from scheme, authority, path and query only.
        /// </summary>
        /// <param name="url">Absolute url to modify.</param>
        /// <param name="queryName">Name of the query parameter.</param>
        /// <param name="queryValue">New value; null removes the parameter, an empty string keeps it with a blank value.</param>
        /// <returns>The rebuilt url.</returns>
        public static string UpdateUrlQuery(string url, string queryName, string queryValue)
        {
            Uri parsedUrl = new Uri(url);
            NameValueCollection parameters = HttpUtility.ParseQueryString(parsedUrl.Query);

            // A blank value is a legitimate value; only null means "drop the parameter".
            if (queryValue != null)
                parameters.Set(queryName, queryValue);
            else
                parameters.Remove(queryName);

            string urlWithoutQuery = parsedUrl.Scheme + "://" + parsedUrl.Authority + parsedUrl.AbsolutePath;

            return parameters.Count > 0
                ? urlWithoutQuery + "?" + parameters.ToString()
                : urlWithoutQuery;
        }

        /// <summary>
        /// Converts a dotted IPv4 address ("a.b.c.d") into its numeric form a*2^24 + b*2^16 + c*2^8 + d.
        /// The IPv6 loopback address "::1" (seen on localhost) maps to 0.
        /// </summary>
        /// <param name="ipAddress">Dotted IPv4 address, or "::1".</param>
        /// <returns>The address as a number.</returns>
        public static long ConvertIPAddressToNumber(string ipAddress)
        {
            // on localhost, ip address is ::1
            if (ipAddress == "::1")
                return 0;

            string[] parts = ipAddress.Split('.');
            int[] octets = new int[parts.Length];
            for (int i = 0; i < parts.Length; i++)
                octets[i] = Convert.ToInt32(parts[i]);

            // Only the first octet is widened to long before scaling; the others are scaled as int.
            return ((long)octets[0] << 24) + (octets[1] << 16) + (octets[2] << 8) + octets[3];
        }

        // Lazily loaded ranges from "China IP Address.txt": key = numeric end of a range, value = numeric start.
        private static SortedList<long, long> _chinaIPRanges;

        /// <summary>
        /// Tells whether <paramref name="ipAddress"/> falls inside one of the ranges listed in
        /// "China IP Address.txt" (read once from the application's bin folder and then cached).
        /// </summary>
        /// <param name="ipAddress">Address in a form accepted by ConvertIPAddressToNumber.</param>
        /// <returns>True when the address lies within a listed range; otherwise false.</returns>
        public static bool IsChinaIPAddress(string ipAddress)
        {
            SortedList<long, long> ranges = _chinaIPRanges;
            if (ranges == null)
            {
                log.Info("Reading China IP Address.txt");

                // Load into a local list and publish it only when loading has finished, so that concurrent
                // callers never enumerate a list that is still being filled.
                ranges = new SortedList<long, long>();
                try
                {
                    using (StreamReader reader = new StreamReader(Path.Combine(HostingEnvironment.ApplicationPhysicalPath, "bin", "China IP Address.txt")))
                    {
                        string line = null;
                        while ((line = reader.ReadLine()) != null)
                        {
                            // skip blank lines and "#" comment lines
                            if (String.IsNullOrWhiteSpace(line) || line.StartsWith("#"))
                                continue;

                            // NOTE(review): lines presumably look like "<start ip> - <end ip>  China" - confirm against the data file.
                            string startIP = StringUtil.SubstringFromStart(line, " - ");
                            string endIP = StringUtil.Substring(line, " - ", "  China");

                            ranges.Add(ScraperUtil.ConvertIPAddressToNumber(endIP), ScraperUtil.ConvertIPAddressToNumber(startIP));
                        }
                    }
                }
                catch (Exception e)
                {
                    // Best effort: whatever was read before the failure is still used.
                    log.Error("Error reading china IP address list: " + e.ToString());
                }
                log.Info("Found " + ranges.Count + " china ip ranges");

                _chinaIPRanges = ranges;
            }

            long ip = ScraperUtil.ConvertIPAddressToNumber(ipAddress);

            // The list is sorted by range end, so the first range whose end is >= ip is the only candidate.
            KeyValuePair<long, long> ipRange = ranges.FirstOrDefault(m => ip <= m.Key);

            // A default (0, 0) pair means no candidate range was found.
            if (ipRange.Key == 0 && ipRange.Value == 0)
                return false;

            return ip >= ipRange.Value;
        }

        /// <summary>
        /// Downloads <paramref name="homeUrl"/>, locates the category tree with
        /// <paramref name="categoryTreeXPath"/> and returns the categories parsed from it as a flat list.
        /// </summary>
        /// <param name="homeUrl">Url of the page that contains the category tree.</param>
        /// <param name="categoryTreeXPath">XPath selecting the root node of the category tree.</param>
        /// <param name="skipCategoryByExactMatch">Optional category names to skip (exact-match rule).</param>
        /// <param name="skipCategoryByContainMatch">Optional category name fragments to skip (contains rule).</param>
        /// <returns>The parsed categories.</returns>
        /// <exception cref="ArgumentException"><paramref name="categoryTreeXPath"/> matched no node on the page.</exception>
        public static List<Category> ParseCategoryTreeAsList(string homeUrl, string categoryTreeXPath, List<string> skipCategoryByExactMatch = null, List<string> skipCategoryByContainMatch = null)
        {
            HtmlDocument homePage = ScraperUtil.LoadHtml(homeUrl, checkUrlLoaded: false);
            HtmlNode treeNode = homePage.DocumentNode.SelectSingleNode(categoryTreeXPath);

            // SelectSingleNode yields null when nothing matches; fail here with the url and XPath instead of
            // with a NullReferenceException inside the tree parser.
            if (treeNode == null)
                throw new ArgumentException("Category tree not found. Url: " + homeUrl + " XPath: " + categoryTreeXPath, nameof(categoryTreeXPath));

            return ParseCategoryTree(treeNode, skipCategoryByExactMatch: skipCategoryByExactMatch, skipCategoryByContainMatch: skipCategoryByContainMatch);
        }

        /// <summary>
        /// Parses the category tree below <paramref name="topTreeNode"/> and adds every leaf category that
        /// has a url to <paramref name="siteMap"/> as a category page.
        /// </summary>
        /// <param name="topTreeNode">Root node of the category tree.</param>
        /// <param name="siteMap">Site map that receives the category pages.</param>
        /// <param name="skipCategoryByExactMatch">Optional category names to skip (exact-match rule).</param>
        /// <param name="skipCategoryByContainMatch">Optional category name fragments to skip (contains rule).</param>
        public static void ParseCategoryTree(HtmlNode topTreeNode, DAL.SiteMap siteMap, List<string> skipCategoryByExactMatch = null, List<string> skipCategoryByContainMatch = null)
        {
            List<Category> categories = ParseCategoryTree(topTreeNode, skipCategoryByExactMatch: skipCategoryByExactMatch, skipCategoryByContainMatch: skipCategoryByContainMatch);

            // add leaf categories to SiteMap
            foreach (Category leaf in categories)
            {
                if (leaf.SubCategories.Any() || String.IsNullOrEmpty(leaf.Url))
                    continue;

                // The page name is every ancestor name (root included) followed by the leaf name, space separated.
                string breadcrumb = leaf.Name;
                for (Category ancestor = leaf.ParentCategory; ancestor != null; ancestor = ancestor.ParentCategory)
                    breadcrumb = ancestor.Name + " " + breadcrumb;

                siteMap.AddCategoryPage(breadcrumb, leaf.Url, siteMap.GetHomePageUrl());
            }
        }
        // Walks the HTML below topTreeNode depth-first and creates a Category for every anchor and for every
        // other node that has non-blank text of its own. Returns all categories, including a synthetic
        // "Home" root at depth 0, as a flat list in the order they were found; parent/child links are set
        // through Category.AddSubCategory.
        // The traversal is destructive: nodes are removed from the HTML tree as they are processed.
        // skipCategoryByExactMatch / skipCategoryByContainMatch: optional names used by the filtering loop
        // at the end of this method to drop categories from the returned list.
        private static List ParseCategoryTree(HtmlNode topTreeNode, List skipCategoryByExactMatch = null, List skipCategoryByContainMatch = null)
        {
            // remove all empty text nodes
            // NOTE(review): the XPath starts with "//", which addresses the whole document rather than only
            // the descendants of topTreeNode - confirm this is intended.
            // NOTE(review): HtmlAgilityPack's SelectNodes presumably returns null when nothing matches,
            // which would make this foreach throw - confirm.
            foreach (HtmlNode textNode in topTreeNode.SelectNodes("//text()"))
                if (String.IsNullOrWhiteSpace(textNode.InnerText.Trim()))
                    textNode.Remove();

            HtmlNode currentNode = topTreeNode;
            HtmlNode parentNode = null;
            int depth = 0;

            // Synthetic root so that every parsed category has a parent.
            Category topCategory = new Category();
            topCategory.Name = "Home";
            topCategory.Depth = depth;
            List categoryList = new List();
            categoryList.Add(topCategory);

            // Keep going while there is somewhere left to move: down, sideways, or up to a node that is
            // not yet topTreeNode.
            while (currentNode.HasChildNodes || currentNode.NextSibling != null || currentNode.ParentNode != topTreeNode)
            {
                if (currentNode.HasChildNodes)
                {
                    // descend to the first child
                    parentNode = currentNode;
                    currentNode = currentNode.FirstChild;
                    depth++;
                }
                else if (currentNode.NextSibling != null)
                {
                    // move sideways and remove the node we just left
                    currentNode = currentNode.NextSibling;
                    currentNode.PreviousSibling.Remove();
                }
                else if (currentNode.ParentNode != null)
                {
                    // climb to the parent and remove its first child (presumably the node we just left,
                    // since earlier siblings were removed while moving sideways)
                    currentNode = currentNode.ParentNode;
                    currentNode.FirstChild.Remove();
                    parentNode = currentNode.ParentNode;
                    depth--;
                }
                else
                    break;

                // if current node is a ahref node, add link and link text to category, otherwise just add text to category
                string name = null, url = null;
                if (currentNode.Name == "a")
                {
                    name = ScraperUtil.NormalizeText(currentNode.InnerText);
                    url = currentNode.Attributes["href"]?.Value;
                }
                else if (!String.IsNullOrWhiteSpace(currentNode.SelectSingleNode("./text()")?.InnerText.Trim()))
                {
                    name = ScraperUtil.NormalizeText(currentNode.SelectSingleNode("./text()").InnerText);
                }

                if (!String.IsNullOrEmpty(name))
                {
                    Category category = new Category();
                    category.Name = name;
                    category.Depth = depth;
                    category.Url = url;

                    Category parentCategory = categoryList.Where(m => m.Depth < depth).Last(); // parent category is the last added category with lower depth
                    parentCategory.AddSubCategory(category);

                    categoryList.Add(category);

                    // Stop parsing the inner html if the current node is an anchor: the anchor may wrap further
                    // markup around its text, and the link text has already been captured above.
                    if (currentNode.Name == "a")
                    {
                        currentNode.Remove();
                        currentNode = parentNode;
                        parentNode = currentNode.ParentNode;
                        depth--;
                    }
                }
            }

            // Drop skipped categories from the returned list. Only categoryList is changed here; a removed
            // category stays in its parent's SubCategories.
            for (int i = 0; i < categoryList.Count; i++)
            {
                Category category = categoryList[i];

                // Skipped when the category's own name or one of its breadcrumb entries satisfies the
                // StringUtil.Contains check (presumably list membership - confirm against StringUtil).
                if (skipCategoryByExactMatch != null && (StringUtil.Contains(skipCategoryByExactMatch, category.Name, ignoreCase: true) || skipCategoryByExactMatch.Where(m => StringUtil.Contains(category.Breadcrumb, m, ignoreCase: true)).Any()))
                {
                    // step back so the element that shifts into position i is examined too
                    categoryList.Remove(category);
                    i--;
                }
                // Skipped when the category's own name or one of its breadcrumb entries contains a skip fragment.
                else if (skipCategoryByContainMatch != null && (skipCategoryByContainMatch.Any(m => StringUtil.ContainsIgnoreCase(category.Name, m)) || skipCategoryByContainMatch.Where(skipCategory => category.Breadcrumb.Where(breadcrumb => StringUtil.ContainsIgnoreCase(breadcrumb, skipCategory)).Any()).Any()))
                {
                    categoryList.Remove(category);
                    i--;
                }
            }

            return categoryList;
        }

        /// <summary>
        /// One node of a parsed category tree: a name, its depth in the tree, an optional url and links
        /// to its parent and sub categories.
        /// </summary>
        public class Category
        {
            public Category()
            {
                SubCategories = new List<Category>();
            }

            public string Name { get; set; }
            public int Depth { get; set; }
            public string Url { get; set; }

            // Set by AddSubCategory on the parent; null for a category that was never added to a parent.
            public Category ParentCategory { get; private set; }
            public List<Category> SubCategories { get; private set; }

            /// <summary>
            /// Names of this category's ancestors, nearest parent first. Ancestors with a depth of 0 or
            /// less (the root) end the walk and are not included. A new list is built on every access.
            /// </summary>
            public List<string> Breadcrumb
            {
                get
                {
                    List<string> breadcrumb = new List<string>();

                    for (Category ancestor = ParentCategory; ancestor != null && ancestor.Depth > 0; ancestor = ancestor.ParentCategory)
                        breadcrumb.Add(ancestor.Name);

                    return breadcrumb;
                }
            }

            /// <summary>Creates a category from the given values and adds it as a sub category of this one.</summary>
            public void AddSubCategory(string name, int depth, string url)
            {
                Category category = new Category();
                category.Name = name;
                category.Depth = depth;
                category.Url = url;
                AddSubCategory(category);
            }

            /// <summary>Adds <paramref name="category"/> as a sub category and makes this category its parent.</summary>
            public void AddSubCategory(Category category)
            {
                category.ParentCategory = this;
                this.SubCategories.Add(category);
            }
        }

        /// <summary>
        /// Decides from the current request's query string whether the page should be marked "noindex":
        /// true when there are two or more query parameters, or exactly one parameter that is neither the
        /// store id nor the brand name parameter.
        /// </summary>
        /// <returns>True when the current request should not be indexed.</returns>
        public static bool IsNoIndex()
        {
            var query = HttpContext.Current.Request.QueryString;

            if (query.Count >= 2)
                return true;

            // no query parameters at all
            if (query.Count != 1)
                return false;

            var onlyKey = query.GetKey(0);
            return onlyKey != ParameterInfo.QueryString.StoreId && onlyKey != ParameterInfo.QueryString.BrandName;
        }
    }
}


TemplateShopifyScraper


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;
using FashionExchange.Common.DAL;
using FashionExchange.Common.Utils;
using System.Web;
using Newtonsoft.Json;
using System.Net;
using Newtonsoft.Json.Linq;
using log4net;


namespace FashionExchange.Common.Scrapers
{
    // Template scraper for a Shopify store: collects product urls by walking the navigation menu,
    // its sub categories and the paginated category pages.
    // NOTE(review): presumably meant to be copied and adapted (XPaths, option names) per store - confirm.
    public class TemplateShopifyScraper : ShopifyScraper
    {
        // Option names treated as sizes; starts from the base scraper's set.
        protected override HashSet<string> SizeOptionNames
        {
            get
            {
                if (_sizeOptionNames == null)
                {
                    _sizeOptionNames = base.SizeOptionNames;
                    //_sizeOptionNames.Add("size option name");
                }
                return _sizeOptionNames;
            }
        }

        // Option names treated as colours; starts from the base scraper's set.
        protected override HashSet<string> ColourOptionNames
        {
            get
            {
                if (_colourOptionNames == null)
                {
                    _colourOptionNames = base.ColourOptionNames;
                    //_colourOptionNames.Add("colour option name");
                }
                return _colourOptionNames;
            }
        }

        // Registers every (sub) category of the store's navigation menu in the site map, collects the
        // product urls of each category page and returns all product urls known to the site map.
        // Throws when the menu has no categories or no category has sub categories.
        protected override List<string> GetProductUrls()
        {
            bool hasSubCategory = false;

            HtmlDocument homePage = ScraperUtil.LoadHtml(Store.Url);

            HtmlNodeCollection categoryNodes = homePage.DocumentNode.SelectNodes("//ul[@id='nav']/li");
            // SelectNodes yields null when nothing matches (see the null checks on the other SelectNodes
            // calls in this class); report that explicitly instead of failing in the foreach.
            if (categoryNodes == null)
                throw new Exception("No categories found");

            foreach (HtmlNode categoryNode in categoryNodes)
            {
                HtmlNode categoryLinkNode = categoryNode.SelectSingleNode("./a");
                string categoryName = categoryLinkNode.InnerText.Trim();
                log.Info("Parsing category: " + categoryName);

                HtmlNodeCollection subCategoryNodes = categoryNode.SelectNodes("./ul/li/a");
                if (subCategoryNodes == null)
                {
                    // no sub categories: the category link itself is the category page
                    SiteMap.AddCategoryPage(categoryLinkNode, Store.Url);
                    GetProductUrlsFromCategoryPage(categoryLinkNode);
                    continue;
                }

                foreach (HtmlNode subCategoryNode in subCategoryNodes)
                {
                    log.Info("Parsing sub category: " + subCategoryNode.InnerText.Trim());

                    SiteMap.AddCategoryPage(categoryName + " " + subCategoryNode.InnerText, subCategoryNode, Store.Url);
                    GetProductUrlsFromCategoryPage(subCategoryNode);

                    hasSubCategory = true;
                }
            }

            if (!hasSubCategory)
                throw new Exception("No sub categories found");

            return SiteMap.GetAllProductUrls();
        }

        // Convenience overload: reads the category url from the link's href attribute.
        private void GetProductUrlsFromCategoryPage(HtmlNode categoryNode)
        {
            GetProductUrlsFromCategoryPage(categoryNode.Attributes["href"].Value);
        }

        // Adds the product urls of a category page, and of every following page reached through the
        // pagination link, to the site map. Download errors are logged and end the walk for this category.
        private void GetProductUrlsFromCategoryPage(string categoryUrl)
        {
            log.InfoFormat("Parsing category page. Url: {0}", categoryUrl);

            HtmlDocument categoryPage = null;
            try
            {
                categoryPage = ScraperUtil.LoadHtml(categoryUrl);
            }
            catch (WebException e)
            {
                log.Info("Error download category page: " + e.ToString());
                return;
            }

            while (true)
            {
                HtmlNodeCollection productNodes = categoryPage.DocumentNode.SelectNodes("//div[@id='product-loop']//div[@class='product-info-inner']/a");
                if (productNodes == null)
                {
                    log.Info("Category page has no products");
                    return;
                }

                foreach (HtmlNode productNode in productNodes)
                {
                    // products of later pages are still recorded against the first page's url
                    SiteMap.AddProductUrl(productNode, categoryUrl);
                    log.InfoFormat("Product url added: {0}", productNode.Attributes["href"].Value);
                }

                HtmlNode nextPageNode = categoryPage.DocumentNode.SelectSingleNode("//div[@id='pagination']/a[i/@class='fa fa-caret-right']");
                if (nextPageNode == null)
                    return; // last page reached

                string nextPageUrl = nextPageNode.Attributes["href"].Value;
                log.Info("Parsing next category page: " + nextPageUrl);
                try
                {
                    categoryPage = ScraperUtil.LoadHtml(nextPageUrl);
                }
                catch (WebException e)
                {
                    log.Info("Error download next category page: " + e.ToString());
                    return;
                }
            }
        }

        // Hook for store specific product filtering; the template keeps the base behaviour.
        protected override bool IgnoreProduct()
        {
            return base.IgnoreProduct();
        }
    }
}


TemplateMagentoScraper

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;
using FashionExchange.Common.DAL;
using FashionExchange.Common.Utils;
using System.Web;
using Newtonsoft.Json;
using System.Net;
using Newtonsoft.Json.Linq;
using log4net;


namespace FashionExchange.Common.Scrapers
{
    /// <summary>
    /// Starting template for a Magento store scraper: walks the home page
    /// navigation, follows every (sub)category page including pagination and
    /// registers each product link with the <c>SiteMap</c>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): generic type arguments (e.g. on HashSet / List) appear to
    /// have been lost in this listing; restore them from the base class
    /// declarations before compiling.
    /// </remarks>
    public class TemplateMagentoScraper : MagentoScraper
    {
        /// <summary>
        /// Colour option code names; initialised from the base class on first
        /// access. Store-specific names can be added where indicated.
        /// </summary>
        protected override HashSet ColourCodeNames
        {
            get
            {
                if (_colourCodeNames == null)
                {
                    _colourCodeNames = base.ColourCodeNames;
                    //_colourCodeNames.Add("custom_colour_name");
                }
                return _colourCodeNames;
            }
        }

        /// <summary>
        /// Size option code names; initialised from the base class on first
        /// access. Store-specific names can be added where indicated.
        /// </summary>
        protected override HashSet SizeCodeNames
        {
            get
            {
                if (_sizeCodeNames == null)
                {
                    _sizeCodeNames = base.SizeCodeNames;
                    //_sizeCodeNames.Add("custom_size_name");
                }
                return _sizeCodeNames;
            }
        }

        /// <summary>
        /// Collects product URLs by visiting every category linked from the
        /// store's navigation menu.
        /// </summary>
        /// <returns>The result of <c>SiteMap.GetAllProductUrls()</c>.</returns>
        /// <exception cref="Exception">
        /// Thrown when the navigation menu is not found or when no category
        /// has any sub category.
        /// </exception>
        protected override List GetProductUrls()
        {
            bool hasSubCategory = false;

            HtmlDocument homePage = ScraperUtil.LoadHtml(Store.Url);

            HtmlNodeCollection categoryNodes = homePage.DocumentNode.SelectNodes("//ul[@id='nav']/li");
            // SelectNodes gives null (not an empty collection) when nothing matches,
            // which is why the other selectors in this class are null-checked too.
            if (categoryNodes == null)
                throw new Exception("No categories found");

            foreach (HtmlNode categoryNode in categoryNodes)
            {
                HtmlNode categoryLinkNode = categoryNode.SelectSingleNode("./a");
                if (categoryLinkNode == null)
                {
                    // A menu item without a direct link has no name and no page to parse.
                    log.Info("Skipping navigation item without a link");
                    continue;
                }

                string categoryName = categoryLinkNode.InnerText.Trim();
                log.Info("Parsing category: " + categoryName);

                HtmlNodeCollection subCategoryNodes = categoryNode.SelectNodes("./ul/li/a");
                if (subCategoryNodes != null)
                {
                    foreach (HtmlNode subCategoryNode in subCategoryNodes)
                    {
                        log.Info("Parsing sub category: " + subCategoryNode.InnerText.Trim());

                        SiteMap.AddCategoryPage(categoryName + " " + subCategoryNode.InnerText, subCategoryNode, Store.Url);
                        GetProductUrlsFromCategoryPage(subCategoryNode);

                        hasSubCategory = true;
                    }
                }
                else
                {
                    // No sub categories: the top-level link itself is the category page.
                    SiteMap.AddCategoryPage(categoryLinkNode, Store.Url);
                    GetProductUrlsFromCategoryPage(categoryLinkNode);
                }
            }

            // Treated as a sign that the navigation markup changed.
            if (!hasSubCategory)
                throw new Exception("No sub categories found");

            return SiteMap.GetAllProductUrls();
        }

        /// <summary>
        /// Parses the category page referenced by the link node's href attribute.
        /// </summary>
        private void GetProductUrlsFromCategoryPage(HtmlNode categoryNode)
        {
            GetProductUrlsFromCategoryPage(categoryNode.Attributes["href"].Value);
        }

        /// <summary>
        /// Registers every product link on a category page and on each page
        /// reachable through its "next" pagination link. A download failure
        /// is logged and ends the parsing of this category.
        /// </summary>
        /// <param name="categoryUrl">URL of the first page of the category.</param>
        private void GetProductUrlsFromCategoryPage(string categoryUrl)
        {
            log.InfoFormat("Parsing category page. Url: {0}", categoryUrl);

            HtmlDocument categoryPage = null;
            try
            {
                categoryPage = ScraperUtil.LoadHtml(categoryUrl);
            }
            catch (WebException e)
            {
                log.Info("Error download category page: " + e.ToString());
                return;
            }

            HtmlNode nextPageNode = null;
            do
            {
                HtmlNodeCollection productNodes = categoryPage.DocumentNode.SelectNodes("//h2[@class='product-name']/a");
                if (productNodes == null)
                {
                    log.Info("Category page has no products");
                    return;
                }

                foreach (HtmlNode productNode in productNodes)
                {
                    // Products are always recorded against the first page's URL.
                    SiteMap.AddProductUrl(productNode, categoryUrl);
                    log.InfoFormat("Product url added: {0}", productNode.Attributes["href"].Value);
                }

                nextPageNode = categoryPage.DocumentNode.SelectSingleNode("//a[@class='next i-next']");
                if (nextPageNode != null)
                {
                    string nextPageUrl = nextPageNode.Attributes["href"].Value;
                    log.Info("Parsing next category page: " + nextPageUrl);
                    try
                    {
                        categoryPage = ScraperUtil.LoadHtml(nextPageUrl);
                    }
                    catch (WebException e)
                    {
                        log.Info("Error download next category page: " + e.ToString());
                        return;
                    }
                }

            } while (nextPageNode != null);
        }

        /// <summary>Uses the base class decision unchanged.</summary>
        protected override bool IgnoreProduct()
        {
            return base.IgnoreProduct();
        }

        /// <summary>Template placeholder: returns an empty brand.</summary>
        protected override string GetBrand()
        {
            return String.Empty;
        }

        /// <summary>Template placeholder: returns an empty description.</summary>
        protected override string GetDescription()
        {
            return String.Empty;
        }

        /// <summary>Template placeholder: returns an empty image URL.</summary>
        protected override string GetImageUrl()
        {
            return String.Empty;
        }
    }
}


TemplateScraper

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;
using FashionExchange.Common.DAL;
using FashionExchange.Common.Utils;
using System.Web;
using Newtonsoft.Json;
using System.Net;
using Newtonsoft.Json.Linq;
using log4net;


namespace FashionExchange.Common.Scrapers
{
    /// <summary>
    /// Starting template for a generic store scraper. Category crawling is
    /// implemented; the product selectors are empty XPath placeholders that
    /// must be filled in for the target store.
    /// </summary>
    /// <remarks>
    /// NOTE(review): generic type arguments (e.g. on List) appear to have been
    /// lost in this listing; restore them from the base class declarations
    /// before compiling.
    /// </remarks>
    public class TemplateScraper : Scraper
    {
        /// <summary>
        /// Collects product URLs by visiting every category linked from the
        /// store's navigation menu.
        /// </summary>
        /// <returns>The result of <c>SiteMap.GetAllProductUrls()</c>.</returns>
        /// <exception cref="Exception">
        /// Thrown when the navigation menu is not found or when no category
        /// has any sub category.
        /// </exception>
        protected override List GetProductUrls()
        {
            bool hasSubCategory = false;

            HtmlDocument homePage = ScraperUtil.LoadHtml(Store.Url);

            HtmlNodeCollection categoryNodes = homePage.DocumentNode.SelectNodes("//ul[@id='nav']/li");
            // SelectNodes gives null (not an empty collection) when nothing matches,
            // which is why the other selectors in this class are null-checked too.
            if (categoryNodes == null)
                throw new Exception("No categories found");

            foreach (HtmlNode categoryNode in categoryNodes)
            {
                HtmlNode categoryLinkNode = categoryNode.SelectSingleNode("./a");
                if (categoryLinkNode == null)
                {
                    // A menu item without a direct link has no name and no page to parse.
                    log.Info("Skipping navigation item without a link");
                    continue;
                }

                string categoryName = categoryLinkNode.InnerText.Trim();
                log.Info("Parsing category: " + categoryName);

                HtmlNodeCollection subCategoryNodes = categoryNode.SelectNodes("./ul/li/a");
                if (subCategoryNodes != null)
                {
                    foreach (HtmlNode subCategoryNode in subCategoryNodes)
                    {
                        log.Info("Parsing sub category: " + subCategoryNode.InnerText.Trim());

                        SiteMap.AddCategoryPage(categoryName + " " + subCategoryNode.InnerText, subCategoryNode, Store.Url);
                        GetProductUrlsFromCategoryPage(subCategoryNode);

                        hasSubCategory = true;
                    }
                }
                else
                {
                    // No sub categories: the top-level link itself is the category page.
                    SiteMap.AddCategoryPage(categoryLinkNode, Store.Url);
                    GetProductUrlsFromCategoryPage(categoryLinkNode);
                }
            }

            // Treated as a sign that the navigation markup changed.
            if (!hasSubCategory)
                throw new Exception("No sub categories found");

            return SiteMap.GetAllProductUrls();
        }

        /// <summary>
        /// Parses the category page referenced by the link node's href attribute.
        /// </summary>
        private void GetProductUrlsFromCategoryPage(HtmlNode categoryNode)
        {
            GetProductUrlsFromCategoryPage(categoryNode.Attributes["href"].Value);
        }

        /// <summary>
        /// Registers every product link on a category page and on each page
        /// reachable through its "next" pagination link. A download failure
        /// is logged and ends the parsing of this category.
        /// </summary>
        /// <param name="categoryUrl">URL of the first page of the category.</param>
        private void GetProductUrlsFromCategoryPage(string categoryUrl)
        {
            log.InfoFormat("Parsing category page. Url: {0}", categoryUrl);

            HtmlDocument categoryPage = null;
            try
            {
                categoryPage = ScraperUtil.LoadHtml(categoryUrl);
            }
            catch (WebException e)
            {
                log.Info("Error download category page: " + e.ToString());
                return;
            }

            HtmlNode nextPageNode = null;
            do
            {
                // TODO: fill in the XPath that selects the product link anchors.
                HtmlNodeCollection productNodes = categoryPage.DocumentNode.SelectNodes("");
                if (productNodes == null)
                {
                    log.Info("Category has no products");
                    return;
                }

                foreach (HtmlNode productNode in productNodes)
                {
                    // Products are always recorded against the first page's URL.
                    SiteMap.AddProductUrl(productNode, categoryUrl);
                    log.InfoFormat("Product url added: {0}", productNode.Attributes["href"].Value);
                }

                // TODO: fill in the XPath that selects the "next page" anchor.
                nextPageNode = categoryPage.DocumentNode.SelectSingleNode("");
                if (nextPageNode != null)
                {
                    string nextPageUrl = nextPageNode.Attributes["href"].Value;
                    log.Info("Parsing next category page: " + nextPageUrl);
                    try
                    {
                        categoryPage = ScraperUtil.LoadHtml(nextPageUrl);
                    }
                    catch (WebException e)
                    {
                        log.Info("Error download next category page: " + e.ToString());
                        return;
                    }
                }

            } while (nextPageNode != null);
        }

        /// <summary>
        /// Returns the products produced by <c>CreateProductObjects()</c> for
        /// the current product page.
        /// </summary>
        protected override List GetProducts()
        {
            List products = new List();

            products.AddRange(CreateProductObjects());

            return products;
        }

        /// <summary>Template placeholder: returns an empty name.</summary>
        protected override string GetName()
        {
            return String.Empty;
        }

        /// <summary>
        /// Reads the regular price: the "old price" node when present,
        /// otherwise the plain price node.
        /// </summary>
        protected override decimal GetPrice()
        {
            // TODO: fill in the XPath of the struck-through (pre-sale) price.
            HtmlNode oldPriceNode = ProductPage.DocumentNode.SelectSingleNode("");
            if (oldPriceNode != null)
                return ParsePrice(oldPriceNode);
            else
                // TODO: fill in the XPath of the regular price.
                return ParsePrice(ProductPage.DocumentNode.SelectSingleNode(""));
        }

        /// <summary>
        /// Reads the sale price, or 0 when the page shows no sale price node.
        /// </summary>
        protected override decimal GetSalePrice()
        {
            // TODO: fill in the XPath of the sale price.
            HtmlNode salePriceNode = ProductPage.DocumentNode.SelectSingleNode("");
            if (salePriceNode != null)
                return ParsePrice(salePriceNode);
            else
                return 0;
        }

        /// <summary>
        /// Converts a price node's text to a decimal after dropping the "$" sign.
        /// </summary>
        private static decimal ParsePrice(HtmlNode priceNode)
        {
            // Parse with the invariant culture so the result does not depend on the
            // regional settings of the machine running the scraper.
            // Assumes the store prints "." as the decimal separator - TODO confirm per store.
            return Convert.ToDecimal(priceNode.InnerText.Replace("$", String.Empty), System.Globalization.CultureInfo.InvariantCulture);
        }

        /// <summary>Returns the breadcrumb the site map holds for the product URL.</summary>
        protected override string GetCategory()
        {
            return SiteMap.GetBreadcrumb(ProductUrl);
        }

        /// <summary>Template placeholder: returns an empty brand.</summary>
        protected override string GetBrand()
        {
            return String.Empty;
        }

        /// <summary>Template placeholder: returns an empty description.</summary>
        protected override string GetDescription()
        {
            return String.Empty;
        }

        /// <summary>Template placeholder: returns an empty image URL.</summary>
        protected override string GetImageUrl()
        {
            return String.Empty;
        }

        /// <summary>
        /// Builds one available <c>ProductSize</c> per size node on the product
        /// page; returns an empty list when the page has no size nodes.
        /// </summary>
        protected override List GetSizes()
        {
            List sizes = new List();

            // TODO: fill in the XPath that selects the size option nodes.
            HtmlNodeCollection sizeNodes = ProductPage.DocumentNode.SelectNodes("");
            // SelectNodes gives null when nothing matches; treat that as "no sizes".
            if (sizeNodes == null)
                return sizes;

            foreach (HtmlNode sizeNode in sizeNodes)
            {
                ProductSize size = new ProductSize();
                size.Size = sizeNode.InnerText;
                size.Available = true;
                sizes.Add(size);
            }

            return sizes;
        }
    }
}


AmazonSignedRequestHelper

using System;
using System.Collections.Generic;
using System.Linq;
using System.Security.Cryptography;
using System.Text;
using System.Threading.Tasks;
using System.Web;

namespace Scraper.Common.Utils
{
    /*
     * Builds signed request URLs for the Amazon "/onca/xml" endpoint: adds the
     * access key, associate tag and timestamp to the request parameters,
     * canonicalises them and appends an HMAC-SHA256 signature.
     */
    class AmazonSignedRequestHelper
    {
        private readonly string endPoint;
        private readonly string akid;
        private readonly string associateTag;
        private readonly byte[] secret;
        private readonly HMAC signer;

        private const string REQUEST_URI = "/onca/xml";
        private const string REQUEST_METHOD = "GET";

        /*
         * Use this constructor to create the object. The AWS credentials are available on
         * http://aws.amazon.com
         * 
         * The destination is the service end-point for your application:
         *  US: ecs.amazonaws.com
         *  JP: ecs.amazonaws.jp
         *  UK: ecs.amazonaws.co.uk
         *  DE: ecs.amazonaws.de
         *  FR: ecs.amazonaws.fr
         *  CA: ecs.amazonaws.ca
         */
        public AmazonSignedRequestHelper(string awsAccessKeyId, string awsSecretKey, string destination, string associateTag)
        {
            // Host names must be lower-cased culture-independently; a culture-sensitive
            // ToLower() maps 'I' to a dotless i under e.g. tr-TR and corrupts the host.
            this.endPoint = destination.ToLowerInvariant();
            this.akid = awsAccessKeyId;
            this.secret = Encoding.UTF8.GetBytes(awsSecretKey);
            this.associateTag = associateTag;
            this.signer = new HMACSHA256(this.secret);
        }

        /*
         * Sign a request in the form of a Dictionary of name-value pairs.
         * 
         * This method returns a complete URL to use. Modifying the returned URL
         * in any way invalidates the signature and Amazon will reject the requests.
         * 
         * Any AWSAccessKeyId, Timestamp or AssociateTag entry in the request is
         * overwritten with this helper's own values.
         */
        public string Sign(IDictionary<string, string> request)
        {
            // Use a SortedDictionary with an ordinal comparer to get the parameters
            // in natural byte order, as required by AWS.
            SortedDictionary<string, string> sortedMap = new SortedDictionary<string, string>(request, StringComparer.Ordinal);

            // Add the AWSAccessKeyId, Timestamp and AssociateTag to the request.
            sortedMap["AWSAccessKeyId"] = this.akid;
            sortedMap["Timestamp"] = this.GetTimestamp();
            sortedMap["AssociateTag"] = this.associateTag;

            // Get the canonical query string
            string canonicalQS = this.ConstructCanonicalQueryString(sortedMap);

            // Derive the bytes that need to be signed.
            StringBuilder builder = new StringBuilder();
            builder.Append(REQUEST_METHOD)
                .Append("\n")
                .Append(this.endPoint)
                .Append("\n")
                .Append(REQUEST_URI)
                .Append("\n")
                .Append(canonicalQS);

            string stringToSign = builder.ToString();
            byte[] toSign = Encoding.UTF8.GetBytes(stringToSign);

            // Compute the signature and convert to Base64.
            byte[] sigBytes = signer.ComputeHash(toSign);
            string signature = Convert.ToBase64String(sigBytes);

            // Now construct the complete URL and return it to the caller.
            StringBuilder qsBuilder = new StringBuilder();
            qsBuilder.Append("http://")
                .Append(this.endPoint)
                .Append(REQUEST_URI)
                .Append("?")
                .Append(canonicalQS)
                .Append("&Signature=")
                .Append(this.PercentEncodeRfc3986(signature));

            return qsBuilder.ToString();
        }

        /*
         * Sign a request in the form of a query string.
         * 
         * This method returns a complete URL to use. Modifying the returned URL
         * in any way invalidates the signature and Amazon will reject the requests.
         */
        public string Sign(string queryString)
        {
            IDictionary<string, string> request = this.CreateDictionary(queryString);
            return this.Sign(request);
        }

        /*
         * Current time in ISO 8601 format as required by Amazon
         */
        private string GetTimestamp()
        {
            DateTime currentTime = DateTime.UtcNow;
            // Format with the invariant culture: under the current culture the year
            // may come from a non-Gregorian calendar and ':' may be replaced by the
            // culture's own time separator.
            string timestamp = currentTime.ToString("yyyy-MM-ddTHH:mm:ssZ", System.Globalization.CultureInfo.InvariantCulture);
            return timestamp;
        }

        /*
         * Percent-encode (URL Encode) according to RFC 3986 as required by Amazon.
         * 
         * This is necessary because .NET's HttpUtility.UrlEncode does not encode
         * according to the above standard. Also, .NET returns lower-case encoding
         * by default and Amazon requires upper-case encoding.
         */
        private string PercentEncodeRfc3986(string str)
        {
            str = HttpUtility.UrlEncode(str, System.Text.Encoding.UTF8);
            str = str.Replace("'", "%27").Replace("(", "%28").Replace(")", "%29").Replace("*", "%2A").Replace("!", "%21").Replace("%7e", "~").Replace("+", "%20");

            // Upper-case the two hex digits that follow every '%'.
            StringBuilder sbuilder = new StringBuilder(str);
            for (int i = 0; i + 2 < sbuilder.Length; i++)
            {
                if (sbuilder[i] == '%')
                {
                    sbuilder[i + 1] = Char.ToUpperInvariant(sbuilder[i + 1]);
                    sbuilder[i + 2] = Char.ToUpperInvariant(sbuilder[i + 2]);
                }
            }
            return sbuilder.ToString();
        }

        /*
         * Convert a query string to corresponding dictionary of name-value pairs.
         * 
         * Names and values are URL-decoded. A parameter without '=' is stored with
         * an empty value; a parameter with an empty name ("=value") is dropped.
         */
        private IDictionary<string, string> CreateDictionary(string queryString)
        {
            if (queryString == null)
            {
                throw new ArgumentNullException("queryString");
            }

            Dictionary<string, string> map = new Dictionary<string, string>();

            char[] sep = { '=' };
            foreach (string requestParam in queryString.Split('&'))
            {
                if (requestParam.Length < 1)
                {
                    continue;
                }

                string[] param = requestParam.Split(sep, 2);
                string name = HttpUtility.UrlDecode(param[0], System.Text.Encoding.UTF8);

                if (param.Length == 1)
                {
                    // No '=' at all: a bare name with an empty value.
                    map[name] = "";
                }
                else if (!string.IsNullOrEmpty(name))
                {
                    map[name] = HttpUtility.UrlDecode(param[1], System.Text.Encoding.UTF8);
                }
            }

            return map;
        }

        /*
         * Construct the canonical query string from the sorted parameter map:
         * percent-encoded "name=value" pairs joined with '&'.
         */
        private string ConstructCanonicalQueryString(SortedDictionary<string, string> sortedParamMap)
        {
            StringBuilder builder = new StringBuilder();

            foreach (KeyValuePair<string, string> kvp in sortedParamMap)
            {
                if (builder.Length > 0)
                {
                    builder.Append("&");
                }
                builder.Append(this.PercentEncodeRfc3986(kvp.Key));
                builder.Append("=");
                builder.Append(this.PercentEncodeRfc3986(kvp.Value));
            }

            return builder.ToString();
        }
    }

    /*
     * To help the SortedDictionary order the name-value pairs in the correct way.
     */
    class ParamComparer : IComparer<string>
    {
        /// <summary>
        /// Orders two parameter names by the ordinal (numeric) value of their
        /// characters, independent of culture.
        /// </summary>
        /// <returns>
        /// A negative number, zero or a positive number when <paramref name="p1"/>
        /// sorts before, equal to or after <paramref name="p2"/>.
        /// </returns>
        public int Compare(string p1, string p2)
        {
            return string.CompareOrdinal(p1, p2);
        }
    }
}


TemplateWooCommerceScraper

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;
using FashionExchange.Common.DAL;
using FashionExchange.Common.Utils;
using System.Web;
using Newtonsoft.Json;
using System.Net;
using Newtonsoft.Json.Linq;
using log4net;


namespace FashionExchange.Common.Scrapers
{
    /// <summary>
    /// Starting template for a WooCommerce store scraper: walks the home page
    /// navigation, follows every (sub)category page including pagination and
    /// registers each product link with the <c>SiteMap</c>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): generic type arguments (e.g. on HashSet / List) appear to
    /// have been lost in this listing; restore them from the base class
    /// declarations before compiling.
    /// </remarks>
    public class TemplateWooCommerceScraper : WooCommerceScraper
    {
        /// <summary>
        /// Size attribute names; initialised from the base class on first
        /// access. Store-specific names can be added where indicated.
        /// </summary>
        protected override HashSet SizeAttributeNames
        {
            get
            {
                if (_sizeAttributeNames == null)
                {
                    _sizeAttributeNames = base.SizeAttributeNames;
                    //_sizeAttributeNames.Add("attribute_mysize");
                }
                return _sizeAttributeNames;
            }
        }

        /// <summary>
        /// Colour attribute names; initialised from the base class on first
        /// access. Store-specific names can be added where indicated.
        /// </summary>
        protected override HashSet ColourAttributeNames
        {
            get
            {
                if (_colourAttributeNames == null)
                {
                    _colourAttributeNames = base.ColourAttributeNames;
                    //_colourAttributeNames.Add("attribute_zodiac-sign");
                }
                return _colourAttributeNames;
            }
        }

        /// <summary>
        /// Collects product URLs by visiting every category linked from the
        /// store's navigation menu.
        /// </summary>
        /// <returns>The result of <c>SiteMap.GetAllProductUrls()</c>.</returns>
        /// <exception cref="Exception">
        /// Thrown when the navigation menu is not found or when no category
        /// has any sub category.
        /// </exception>
        protected override List GetProductUrls()
        {
            bool hasSubCategory = false;

            HtmlDocument homePage = ScraperUtil.LoadHtml(Store.Url);

            HtmlNodeCollection categoryNodes = homePage.DocumentNode.SelectNodes("//ul[@id='nav']/li");
            // SelectNodes gives null (not an empty collection) when nothing matches,
            // which is why the other selectors in this class are null-checked too.
            if (categoryNodes == null)
                throw new Exception("No categories found");

            foreach (HtmlNode categoryNode in categoryNodes)
            {
                HtmlNode categoryLinkNode = categoryNode.SelectSingleNode("./a");
                if (categoryLinkNode == null)
                {
                    // A menu item without a direct link has no name and no page to parse.
                    log.Info("Skipping navigation item without a link");
                    continue;
                }

                string categoryName = categoryLinkNode.InnerText.Trim();
                log.Info("Parsing category: " + categoryName);

                HtmlNodeCollection subCategoryNodes = categoryNode.SelectNodes("./ul/li/a");
                if (subCategoryNodes != null)
                {
                    foreach (HtmlNode subCategoryNode in subCategoryNodes)
                    {
                        log.Info("Parsing sub category: " + subCategoryNode.InnerText.Trim());

                        SiteMap.AddCategoryPage(categoryName + " " + subCategoryNode.InnerText, subCategoryNode, Store.Url);
                        GetProductUrlsFromCategoryPage(subCategoryNode);

                        hasSubCategory = true;
                    }
                }
                else
                {
                    // No sub categories: the top-level link itself is the category page.
                    SiteMap.AddCategoryPage(categoryLinkNode, Store.Url);
                    GetProductUrlsFromCategoryPage(categoryLinkNode);
                }
            }

            // Treated as a sign that the navigation markup changed.
            if (!hasSubCategory)
                throw new Exception("No sub categories found");

            return SiteMap.GetAllProductUrls();
        }

        /// <summary>
        /// Parses the category page referenced by the link node's href attribute.
        /// </summary>
        private void GetProductUrlsFromCategoryPage(HtmlNode categoryNode)
        {
            GetProductUrlsFromCategoryPage(categoryNode.Attributes["href"].Value);
        }

        /// <summary>
        /// Registers every product link on a category page and on each page
        /// reachable through its "next" pagination link. A download failure
        /// is logged and ends the parsing of this category.
        /// </summary>
        /// <param name="categoryUrl">URL of the first page of the category.</param>
        private void GetProductUrlsFromCategoryPage(string categoryUrl)
        {
            log.InfoFormat("Parsing category page. Url: {0}", categoryUrl);

            HtmlDocument categoryPage = null;
            try
            {
                categoryPage = ScraperUtil.LoadHtml(categoryUrl);
            }
            catch (WebException e)
            {
                log.Info("Error download category page: " + e.ToString());
                return;
            }

            HtmlNode nextPageNode = null;
            do
            {
                HtmlNodeCollection productNodes = categoryPage.DocumentNode.SelectNodes("//ul[@class='ProductList ']//div[@class='ProductDetails']/a");
                if (productNodes == null)
                {
                    log.Info("Category page has no products");
                    return;
                }

                foreach (HtmlNode productNode in productNodes)
                {
                    // Products are always recorded against the first page's URL.
                    SiteMap.AddProductUrl(productNode, categoryUrl);
                    log.InfoFormat("Product url added: {0}", productNode.Attributes["href"].Value);
                }

                nextPageNode = categoryPage.DocumentNode.SelectSingleNode("//ul[@class='page-numbers']/li/a[@class='next page-numbers']");
                if (nextPageNode != null)
                {
                    string nextPageUrl = nextPageNode.Attributes["href"].Value;
                    log.Info("Parsing next category page: " + nextPageUrl);
                    try
                    {
                        categoryPage = ScraperUtil.LoadHtml(nextPageUrl);
                    }
                    catch (WebException e)
                    {
                        log.Info("Error download next category page: " + e.ToString());
                        return;
                    }
                }
            } while (nextPageNode != null);
        }

        /// <summary>Uses the base class decision unchanged.</summary>
        protected override bool IgnoreProduct()
        {
            return base.IgnoreProduct();
        }

        /// <summary>
        /// Reads the brand from the "Brands" link of the product block, or an
        /// empty string when the page has no such link.
        /// </summary>
        protected override string GetBrand()
        {
            HtmlNode brandNode = ProductPage.DocumentNode.SelectSingleNode("//div[starts-with(@id, 'product-') and @itemtype='http://schema.org/Product']//span[@class='posted_in' and contains(text(), 'Brands')]/a");
            // A product without a brand link would otherwise fail with a NullReferenceException.
            if (brandNode == null)
                return String.Empty;

            return brandNode.InnerText;
        }

        /// <summary>
        /// Joins all text nodes of the description tab with single spaces;
        /// returns null when the page has no description tab text.
        /// </summary>
        protected override string GetDescription()
        {
            HtmlNodeCollection descriptionNodes = ProductPage.DocumentNode.SelectNodes("//div[@id='tab-description']//text()");
            if (descriptionNodes != null)
                // Reuse the nodes already selected instead of running the XPath query a second time.
                return String.Join(" ", descriptionNodes.Select(m => m.InnerText));
            else
                return null;
        }
    }
}

Comments

Popular posts from this blog

Scrapy Splash

Utility