Scraper
using System;
using System.Collections.Generic;
using System.Collections.Concurrent;
using System.Linq;
using System.Text;
using log4net;
using FashionExchange.Common.DAL;
using FashionExchange.Common.BLL;
using HtmlAgilityPack;
using System.Net;
using System.Threading;
using FashionExchange.Common.Utils;
using System.IO;
using System.Web.Hosting;
using FashionExchange.Common.Enums;
using OpenQA.Selenium.Remote;
namespace FashionExchange.Common.Scrapers
{
public abstract class Scraper
{
/// <summary>
/// Logger named after the concrete scraper class. It is looked up through
/// LogManager on every read rather than cached in a field.
/// </summary>
protected ILog log
{
    get
    {
        string loggerName = this.GetType().Name;
        return LogManager.GetLogger(loggerName);
    }
}
/// <summary>
/// The store that belongs to this scraper, resolved from the concrete class name
/// through StoreManager on every read. Scrape treats a null result as
/// "store not found in database".
/// </summary>
public Store Store
{
    get
    {
        string scraperClassName = this.GetType().Name;
        return StoreManager.GetStoreByScraperClassName(scraperClassName);
    }
}
// Backing field for SiteMap; remains null until the property is read for the first time.
protected SiteMap _siteMap = null;

/// <summary>
/// Site map built from the store's url. Created on first access and reused afterwards.
/// </summary>
protected virtual SiteMap SiteMap
{
    get
    {
        return _siteMap ?? (_siteMap = new SiteMap(Store.Url));
    }
}
// Running totals for the current scrape; copied into the ScrapeResult at the end of Scrape.
protected int productAddedCount = 0, productDeletedCount = 0, productUpdatedCount = 0, productMergedCount = 0;
protected int warningCount = 0, errorCount = 0;
// Number of distinct product urls queued for the current scrape.
protected int productUrlCount = 0;
// Errors grouped by stack trace: stack trace -> (first exception recorded, urls that hit it).
// The type arguments follow from usage: the key is Exception.StackTrace, the pair key is the
// exception itself and the pair value collects product urls.
private readonly Dictionary<string, KeyValuePair<Exception, List<string>>> scrapeErrorDict = new Dictionary<string, KeyValuePair<Exception, List<string>>>();
// Names of products already added or updated during this scrape (compared case-insensitively).
private readonly HashSet<string> updatedProductNames = new HashSet<string>(StringComparer.InvariantCultureIgnoreCase);
// Request customisation hooks. Every hook returns null in this base class; the values are
// passed to DownloadWebPage / ProductManager.DownloadProductPhoto by the methods below.
// NOTE(review): the two Dictionary-typed hooks appear without type arguments in this copy
// of the file - confirm the intended key/value types against the consumers before building.

/// <summary>Cookies sent with product page requests; null in the base class.</summary>
protected virtual CookieContainer CookieContainer
{
    get { return null; }
}

/// <summary>Proxy used for page and photo downloads; null in the base class.</summary>
protected virtual WebProxy Proxy
{
    get { return null; }
}

/// <summary>Extra http headers for product page requests; null in the base class.</summary>
protected virtual Dictionary HttpHeaders
{
    get { return null; }
}

/// <summary>Extra http headers for product photo requests; null in the base class.</summary>
protected virtual Dictionary HttpHeadersForProductPhoto
{
    get { return null; }
}

/// <summary>Selenium driver handed to DownloadWebPage; null in the base class.</summary>
protected virtual RemoteWebDriver WebDriver
{
    get { return null; }
}
// Scraping context for the page currently being parsed.
// The backing fields are [ThreadStatic] statics: every thread has its own copy, and the copy
// is shared by all Scraper instances running on that thread. SetupAndGetProducts assigns
// them right before calling GetProducts.
[ThreadStatic]
private static HtmlDocument _productPage;
[ThreadStatic]
private static WebException _webException;
[ThreadStatic]
private static string _productUrl;
[ThreadStatic]
private static string _redirectedProductUrl;

/// <summary>Html document of the product page being parsed on the current thread.</summary>
protected HtmlDocument ProductPage
{
    get
    {
        return _productPage;
    }
    private set
    {
        _productPage = value;
    }
}

/// <summary>WebException captured while downloading the current page, or null.</summary>
protected WebException WebException
{
    get
    {
        return _webException;
    }
    private set
    {
        _webException = value;
    }
}

/// <summary>Url that was requested for the current page.</summary>
protected string ProductUrl
{
    get
    {
        return _productUrl;
    }
    private set
    {
        _productUrl = value;
    }
}

/// <summary>Url the request was redirected to; null or empty when there was no redirect.</summary>
protected string RedirectedProductUrl
{
    get
    {
        return _redirectedProductUrl;
    }
    private set
    {
        _redirectedProductUrl = value;
    }
}
// Abstract members every concrete scraper has to implement.

/// <summary>Returns the product page urls found on the store's website.</summary>
protected abstract List<string> GetProductUrls();

/// <summary>
/// Builds the products found on the current product page. The page is available through
/// ProductPage, ProductUrl, RedirectedProductUrl and WebException when this is called.
/// </summary>
protected abstract List<Product> GetProducts();

/// <summary>Returns the product name read from the current product page.</summary>
protected abstract string GetName();

/// <summary>Returns the regular price; used by CreateProductObjects when there are no variations.</summary>
protected abstract decimal GetPrice();

/// <summary>
/// Returns the sale price. CreateProductObjects only keeps it when it is greater than zero
/// and lower than the regular price.
/// </summary>
protected abstract decimal GetSalePrice();

/// <summary>Returns the raw category text of the product.</summary>
protected abstract string GetCategory();

/// <summary>Returns the raw brand text of the product.</summary>
protected abstract string GetBrand();

/// <summary>Returns the product description. Not called by the code visible in this class.</summary>
protected abstract string GetDescription();

/// <summary>Returns the url of the product image; used when there are no variations.</summary>
protected abstract string GetImageUrl();
// Virtual members with a default implementation.

/// <summary>
/// Returns the variations of the current product. The default is an empty list, in which
/// case CreateProductObjects builds a single product from GetPrice / GetSalePrice / GetSizes.
/// </summary>
protected virtual List<ProductVariation> GetVariations()
{
    return new List<ProductVariation>();
}
/// <summary>
/// Returns the sizes of the current product when it has no variations.
/// The default is an empty list.
/// </summary>
protected virtual List<ProductSize> GetSizes()
{
    return new List<ProductSize>();
}
/// <summary>
/// Convenience overload for callers that do not need the redirected url: forwards every
/// argument to the overload with the out parameter and drops the redirect result.
/// </summary>
protected HtmlDocument DownloadWebPage(string url, CookieContainer cookieContainer = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null, Dictionary httpHeaders = null, bool checkUrlLoaded = true, RemoteWebDriver webDriver = null)
{
    string ignoredRedirectUrl;
    HtmlDocument page = DownloadWebPage(
        url,
        out ignoredRedirectUrl,
        cookieContainer: cookieContainer,
        retryOnError: retryOnError,
        retryOnTimeout: retryOnTimeout,
        retryCount: retryCount,
        retryDelayInMilisecond: retryDelayInMilisecond,
        proxy: proxy,
        httpHeaders: httpHeaders,
        checkUrlLoaded: checkUrlLoaded,
        webDriver: webDriver);
    return page;
}
/// <summary>
/// Downloads a page by delegating to ScraperUtil.LoadHtml with the same arguments.
/// Virtual so that a scraper can replace the download step.
/// </summary>
/// <param name="url">Url to download.</param>
/// <param name="redirectedUrl">Set by ScraperUtil.LoadHtml.</param>
protected virtual HtmlDocument DownloadWebPage(string url, out string redirectedUrl, CookieContainer cookieContainer = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null, Dictionary httpHeaders = null, bool checkUrlLoaded = true, RemoteWebDriver webDriver = null)
{
    HtmlDocument page = ScraperUtil.LoadHtml(
        url,
        out redirectedUrl,
        cookieContainer: cookieContainer,
        retryOnError: retryOnError,
        retryOnTimeout: retryOnTimeout,
        retryCount: retryCount,
        retryDelayInMilisecond: retryDelayInMilisecond,
        proxy: proxy,
        httpHeaders: httpHeaders,
        checkUrlLoaded: checkUrlLoaded,
        webDriver: webDriver);
    return page;
}
/// <summary>
/// Publishes the downloaded page through the per-thread context properties and then asks
/// the concrete scraper for the products on it.
/// </summary>
/// <param name="productPage">Parsed html of the page.</param>
/// <param name="productUrl">Url that was requested.</param>
/// <param name="redirectedProductUrl">Url the request was redirected to, if any.</param>
/// <param name="webException">Download error captured for the page, if any.</param>
/// <returns>Whatever GetProducts returns for this page.</returns>
private List<Product> SetupAndGetProducts(HtmlDocument productPage, string productUrl, string redirectedProductUrl, WebException webException)
{
    // The context must be in place before GetProducts runs, because the scraper's
    // Get* methods read it through the protected properties.
    ProductPage = productPage;
    ProductUrl = productUrl;
    RedirectedProductUrl = redirectedProductUrl;
    WebException = webException;
    return GetProducts();
}
/// <summary>
/// Runs a scrape for this scraper's store: collects the product urls, downloads the pages on
/// Store.DownloadThreadCount threads, parses them on Store.ScrapeThreadCount threads and
/// returns the aggregated statistics and errors.
/// </summary>
/// <param name="scrapeNewUrl">When true, the urls returned by GetProductUrls are scraped.</param>
/// <param name="scrapeExistingUrlAgeInHour">When set, urls of stored products whose DateUpdated is older than this many hours are scraped as well.</param>
/// <param name="scrapeExistingUrlPriceDroppedInDay">When set, urls of stored products selected by the price-history query below are scraped as well.</param>
/// <returns>Counts, timings and grouped errors of the run.</returns>
/// <exception cref="Exception">The store for this scraper class was not found in the database.</exception>
public ScrapeResult Scrape(bool scrapeNewUrl, double? scrapeExistingUrlAgeInHour, int? scrapeExistingUrlPriceDroppedInDay)
{
    log.InfoFormat("Scraping for store {0} has started. Scrape New Url: {1} Scrape Existing Url Age (Hour): {2} Scrape Existing Url Price Dropped (Day): {3}", this.GetType().Name, scrapeNewUrl, scrapeExistingUrlAgeInHour, scrapeExistingUrlPriceDroppedInDay);
    DateTime startTime = DateTime.Now;
    int newProductUrlCount = 0, existingProductUrlCount = 0;
    if (Store == null)
    {
        log.ErrorFormat("Store not found in database: Scraper Class Name={0}", this.GetType().Name);
        errorCount++;
        throw new Exception(String.Format("Store not found in database: Scraper Class Name={0}", this.GetType().Name));
    }
    // scrape all product urls from website and merge with existing product urls in database
    List<string> productUrls = new List<string>();
    try
    {
        // Get product urls from website
        if (scrapeNewUrl)
        {
            List<string> newProductUrls = GetProductUrls();
            if (!newProductUrls.Any())
                throw new Exception("No new product urls found");
            newProductUrls = newProductUrls.Where(p => !String.IsNullOrEmpty(p)).Select(p => WebUtility.HtmlDecode(p.Trim())).Distinct().ToList();
            newProductUrlCount = newProductUrls.Count;
            productUrls.AddRange(newProductUrls);
        }
        // Get product urls for existing products
        if (scrapeExistingUrlAgeInHour.HasValue)
        {
            DateTime urlAgeDateTime = DateTime.Now.AddHours(-scrapeExistingUrlAgeInHour.Value);
            List<string> existingProductUrls = ProductManager.GetProductsByStore(Store.Id).Where(m => m.DateUpdated < urlAgeDateTime).Select(p => p.Url).Distinct().ToList();
            existingProductUrlCount += existingProductUrls.Count;
            productUrls.InsertRange(0, existingProductUrls); // scrape existing product urls first
        }
        // Get product urls for existing products that have been price dropped
        if (scrapeExistingUrlPriceDroppedInDay.HasValue)
        {
            List<string> existingPriceDroppedProductUrls = DataContext.GetCurrentDataContext().ExecuteStoreQuery<string>(
                @"Select Product.Url
from ProductPrice with(nolock)
inner join Product with(nolock) on Product.Id = ProductPrice.ProductId
where StoreId = {0} and Product.Deleted = 0 and ProductPrice.DateCreated > {1}
and CASE WHEN ProductPrice.SalePrice IS NULL THEN ProductPrice.Price ELSE ProductPrice.SalePrice END
> CASE WHEN Product.SalePrice IS NULL THEN Product.Price ELSE Product.SalePrice END
group by ProductPrice.ProductId, Product.Url",
                Store.Id,
                DateTime.Today.AddDays(-scrapeExistingUrlPriceDroppedInDay.Value - 1)).ToList();
            // Accumulate instead of overwrite, so the age-based urls counted above are not
            // lost when both existing-url options are enabled.
            existingProductUrlCount += existingPriceDroppedProductUrls.Count;
            productUrls.InsertRange(0, existingPriceDroppedProductUrls); // scrape existing product urls first
        }
        // merge new product urls and existing product urls
        productUrls = productUrls.Distinct().ToList();
        productUrlCount = productUrls.Count;
        // download product pages async
        List<Thread> downloadThreads = new List<Thread>();
        ConcurrentQueue<string> productUrlQ = new ConcurrentQueue<string>(productUrls);
        ConcurrentQueue<ProductPage> productPageQ = new ConcurrentQueue<ProductPage>();
        for (int i = 0; i < Store.DownloadThreadCount; i++)
        {
            Thread thread = new Thread(new ThreadStart(() => DownloadProductPages(productUrlQ, productPageQ)));
            downloadThreads.Add(thread);
            thread.Start();
        }
        List<Thread> scrapeThreads = new List<Thread>();
        for (int i = 0; i < Store.ScrapeThreadCount; i++)
        {
            Thread thread = new Thread(new ThreadStart(() => Scrape(productPageQ, downloadThreads)));
            scrapeThreads.Add(thread);
            thread.Start();
        }
        foreach (Thread thread in scrapeThreads)
            thread.Join();
        // check if there are product urls still in the queue.
        // Download threads may still be dequeuing, so read the count once and use that
        // single value for both the error total and the message.
        int remainingUrlCount = productUrlQ.Count;
        if (remainingUrlCount > 0)
        {
            Interlocked.Add(ref errorCount, remainingUrlCount);
            throw new TimeoutException("Scraper thread timed out. " + remainingUrlCount + " product urls still in queue");
        }
    }
    catch (Exception e)
    {
        log.ErrorFormat("An error has occured while scraping website: {0}", e.ToString());
        List<string> errorUrls = new List<string>();
        if (e.Data.Contains(ParameterInfo.ExceptionData.Url))
            errorUrls.Add(e.Data[ParameterInfo.ExceptionData.Url].ToString());
        // Same "group by stack trace" handling as the worker methods. A plain Add would
        // throw ArgumentException if this stack trace had already been recorded, and
        // download threads can still be writing to the dictionary after a timeout.
        lock (scrapeErrorDict)
        {
            KeyValuePair<Exception, List<string>> recorded;
            if (scrapeErrorDict.TryGetValue(e.StackTrace, out recorded))
                recorded.Value.AddRange(errorUrls);
            else
                scrapeErrorDict.Add(e.StackTrace, new KeyValuePair<Exception, List<string>>(e, errorUrls));
        }
        Interlocked.Increment(ref errorCount);
    }
    finally
    {
        // Read the virtual property once so the instance that is checked is the one disposed.
        RemoteWebDriver driver = WebDriver;
        if (driver != null)
            driver.Dispose();
    }
    log.InfoFormat("Scraping for store {0} is completed.", Store.Name);
    log.InfoFormat("Products Added={0} Updated={1} Deleted={2} Merged={3}", productAddedCount, productUpdatedCount, productDeletedCount, productMergedCount);
    log.InfoFormat("Warning={0} Error={1}", warningCount, errorCount);
    // return result
    ScrapeResult result = new ScrapeResult();
    result.Store = Store;
    result.ScrapeType = StoreManager.GetScrapeType(ScrapeTypeCode.Web);
    result.NewProductUrlCount = newProductUrlCount;
    result.ExistingProductUrlCount = existingProductUrlCount;
    result.TotalDistinctProductUrlCount = productUrlCount;
    result.ProductAddedCount = productAddedCount;
    result.ProductUpdatedCount = productUpdatedCount;
    result.ProductDeletedCount = productDeletedCount;
    result.ProductMergedCount = productMergedCount;
    result.SaleProductCount = ProductManager.GetProductsByStore(Store.Id).Where(m => m.SalePrice != null).Count();
    result.TotalProductCount = ProductManager.GetProductsByStore(Store.Id).Count();
    result.WarningCount = warningCount;
    result.ErrorCount = errorCount;
    result.DownloadThreadCount = Store.DownloadThreadCount;
    result.ScrapeThreadCount = Store.ScrapeThreadCount;
    result.StartDateTime = startTime;
    result.EndDateTime = DateTime.Now;
    result.ProductWithProductSizeCount = ProductManager.GetProductsByStore(Store.Id).Where(m => m.ProductSizes.Any()).Count();
    result.ProductSizeAvailableCount = ProductManager.GetProductsByStore(Store.Id).Where(m => m.ProductSizes.Any()).SelectMany(m => m.ProductSizes).Where(m => m.Available).Count();
    result.ProductSizeWithColourCount = ProductManager.GetProductsByStore(Store.Id).Where(m => m.ProductSizes.Any()).SelectMany(m => m.ProductSizes).Where(m => m.Colour != null).Count();
    result.TotalProductSizeCount = ProductManager.GetProductsByStore(Store.Id).Where(m => m.ProductSizes.Any()).SelectMany(m => m.ProductSizes).Count();
    result.ProductWithBrandCount = ProductManager.GetProductsByStore(Store.Id).Where(m => m.BrandUnparsed != null).Count();
    result.UniqueBrandCount = ProductManager.GetProductsByStore(Store.Id).Where(m => m.BrandUnparsed != null).Select(m => m.BrandUnparsed).Distinct().Count();
    // Convert the grouped errors while holding the lock: after a timeout the download
    // threads are not joined and may still be adding entries.
    lock (scrapeErrorDict)
    {
        foreach (KeyValuePair<string, KeyValuePair<Exception, List<string>>> scrapeErrorOuterPair in scrapeErrorDict)
        {
            KeyValuePair<Exception, List<string>> scrapeErrorPair = scrapeErrorOuterPair.Value;
            ScrapeError scrapeError = new ScrapeError();
            // Only the first three urls of a group are reported individually.
            scrapeError.Url1 = scrapeErrorPair.Value.FirstOrDefault();
            scrapeError.Url2 = scrapeErrorPair.Value.Skip(1).FirstOrDefault();
            scrapeError.Url3 = scrapeErrorPair.Value.Skip(2).FirstOrDefault();
            scrapeError.Exception = scrapeErrorPair.Key.GetType().ToString();
            scrapeError.Message = scrapeErrorPair.Key.Message;
            scrapeError.StackTrace = scrapeErrorPair.Key.StackTrace;
            // A group without urls still stands for one occurrence.
            scrapeError.Count = scrapeErrorPair.Value.Any() ? scrapeErrorPair.Value.Count : 1;
            result.ScrapeErrors.Add(scrapeError);
        }
    }
    return result;
}
/// <summary>
/// Worker loop of a scrape thread: parses downloaded pages from the queue until the queue is
/// empty and every download thread has finished, or until no page arrived for ten minutes.
/// </summary>
/// <param name="productPageQ">Queue filled by the download threads.</param>
/// <param name="downloadThreads">Download threads whose liveness decides when to stop.</param>
private void Scrape(ConcurrentQueue<ProductPage> productPageQ, List<Thread> downloadThreads)
{
    // 1200 polls of 0.5 second each = 10 minutes without a page.
    const int maxWaitCount = 1200;
    int waitCount = 0;
    // Thread liveness is sampled BEFORE the queue. In the opposite order a download thread
    // could enqueue its last page and exit between the two checks: the queue would have
    // looked empty, the threads would look dead, and the page would never be parsed.
    // Once no download thread is alive nothing can be enqueued any more, so the queue
    // check that follows is final.
    while (downloadThreads.Any(t => t.IsAlive) || !productPageQ.IsEmpty)
    {
        ProductPage productPage;
        if (productPageQ.TryDequeue(out productPage))
        {
            waitCount = 0;
            Scrape(productPage);
        }
        else
        {
            if (waitCount == maxWaitCount)
            {
                log.Info("Waited 10 minutes for product page. Stopping scraper thread");
                return;
            }
            Thread.Sleep(500); // wait for 0.5 second if there are no product page to be parse in queue
            waitCount++;
            // report every 8 polls, i.e. every 4 seconds
            if (waitCount % 8 == 0)
                log.InfoFormat("Waited {0} seconds for download product page", waitCount * 0.5);
        }
    }
}
/// <summary>
/// Parses one downloaded product page and reconciles the products on it with the products
/// stored for the same url: stored products are deleted on a home-page redirect or a 404/410
/// response, otherwise products are filtered, then updated, added, deleted or merged and the
/// changes are saved. Any exception is logged and recorded in the grouped error dictionary.
/// </summary>
/// <remarks>
/// Called concurrently from every scrape thread, so writes to the shared counters use
/// Interlocked and access to the shared collections is done under a lock. The "already
/// updated" check and the later registration of the name are still two separate steps.
/// </remarks>
/// <param name="productPage">Downloaded page, including redirect and error information.</param>
private void Scrape(ProductPage productPage)
{
    DateTime startTime = DateTime.Now;
    FashionExchangeEntities db = DataContext.GetCurrentDataContext();
    log.InfoFormat("Parsing product page: {0}", productPage.Url);
    try
    {
        // responseUrl is the product url to scrape. It can be the redirected url
        string responseUrl = productPage.Url;
        // get existing products from database
        List<Product> productsFromDB = ProductManager.GetProductsByStoreIdAndUrl(Store.Id, productPage.Url).ToList();
        if (productsFromDB.Any())
            log.InfoFormat("Found {0} product from database", productsFromDB.Count);
        if (!String.IsNullOrEmpty(productPage.RedirectedUrl))
        {
            // check if it's redirect to home page
            if (new Uri(productPage.RedirectedUrl).PathAndQuery == "/" || productPage.RedirectedUrl.TrimEnd('/') == Store.Url.Trim('/'))
            {
                log.Info("Redirected to home page");
                if (productsFromDB.Any())
                {
                    Interlocked.Add(ref productDeletedCount, productsFromDB.Count);
                    ProductManager.DeleteProduct(productsFromDB);
                    log.Info("All existing product are deleted");
                }
                return;
            }
            // retrieve product from database if url is redirected
            responseUrl = productPage.RedirectedUrl;
            IEnumerable<Product> existingProductsWithRedirectedUrl = ProductManager.GetProductsByStoreIdAndUrl(Store.Id, responseUrl).ToList();
            log.InfoFormat("Found {0} existing product from database using redirected url", existingProductsWithRedirectedUrl.Count());
            LogProduct(existingProductsWithRedirectedUrl);
            productsFromDB.AddRange(existingProductsWithRedirectedUrl);
        }
        // check if there was an error when downloading product page
        if (productPage.WebException != null)
        {
            if (productPage.WebException.Response != null)
            {
                // delete existing products if server return 404 not found or 410 Gone
                HttpWebResponse errorResponse = (HttpWebResponse)productPage.WebException.Response;
                if (errorResponse.StatusCode == HttpStatusCode.NotFound || errorResponse.StatusCode == HttpStatusCode.Gone)
                {
                    log.InfoFormat("Status={0} Message={1}", errorResponse.StatusCode, productPage.WebException.Message);
                    if (productsFromDB.Any())
                    {
                        Interlocked.Add(ref productDeletedCount, productsFromDB.Count);
                        ProductManager.DeleteProduct(productsFromDB);
                        log.Info("All existing product are deleted");
                    }
                    return;
                }
            }
        }
        // parse products from product page
        List<Product> productsFromWeb = SetupAndGetProducts(productPage.HtmlDocument, productPage.Url, productPage.RedirectedUrl, productPage.WebException);
        if (productsFromWeb.Any())
        {
            log.InfoFormat("Found {0} product from web:", productsFromWeb.Count);
            LogProduct(productsFromWeb);
        }
        // skip products with duplicate names, remove zero dollar, gift card an out of stock products
        List<Product> productsToMerged = new List<Product>();
        // Index loop because entries are removed while iterating; every removal of the
        // current entry is followed by --i so the next entry is not skipped.
        for (int i = 0; i < productsFromWeb.Count; i++)
        {
            Product product = productsFromWeb[i];
            // Check for zero dollar product
            if (product.Price == 0)
            {
                ProductManager.Detach(product);
                productsFromWeb.Remove(product);
                --i;
                continue;
            }
            // check if product is out of stock or unavailable
            if (StringUtil.ContainsIgnoreCase(product.Name, "out of stock")
                || StringUtil.ContainsIgnoreCase(product.Name, "unavailable")
                || StringUtil.ContainsIgnoreCase(product.Name, "sold out")
                || StringUtil.ContainsIgnoreCase(product.Name, "gift card")
                || StringUtil.ContainsIgnoreCase(product.Name, "gift voucher"))
            {
                ProductManager.Detach(product);
                productsFromWeb.Remove(product);
                --i;
                continue;
            }
            // Check for duplicates parsed from product page
            if (productsFromWeb.Where(m => m.Name == product.Name).Count() > 1)
            {
                // keep the first product with this name, drop the rest
                List<Product> duplicateProducts = (from p in productsFromWeb
                                                   where p.Name == product.Name
                                                   select p).Skip(1).ToList();
                log.InfoFormat("Found {0} product with same name parsed from web", duplicateProducts.Count);
                foreach (Product duplicateProduct in duplicateProducts)
                {
                    ProductManager.Detach(duplicateProduct);
                    productsFromWeb.Remove(duplicateProduct);
                }
            }
            for (int index = 0; index < product.ProductSizes.Count(); index++)
            {
                ProductSize size = product.ProductSizes.ElementAt(index);
                // Check if there are duplicate sizes
                if (product.ProductSizes.Where(m => m.Colour == size.Colour && m.Size == size.Size && m.Available == size.Available).Count() > 1)
                {
                    product.ProductSizes.Remove(size);
                    DataContext.GetCurrentDataContext().Detach(size);
                    index--;
                }
                // throw exception if size is out of stock or sold out
                if (StringUtil.ContainsIgnoreCase(size.Size, "out of stock") || StringUtil.ContainsIgnoreCase(size.Size, "sold out") || StringUtil.ContainsIgnoreCase(size.Size, "unavailable"))
                {
                    throw new Exception("Size out of stock. Size: " + size.Size);
                }
            }
            // Check if all product sizes are unavailable
            if (product.ProductSizes.Any() && !product.ProductSizes.Where(m => m.Available).Any())
            {
                ProductManager.Detach(product);
                productsFromWeb.Remove(product);
                --i;
                continue;
            }
            // Check if merge product size is true, product from web has no sizes, product from database has size, then delete product from database
            if (Store.MergeProductSize && !product.ProductSizes.Any() && productsFromDB.Where(m => String.Equals(m.Name, product.Name, StringComparison.InvariantCultureIgnoreCase) && m.ProductSizes.Any()).Any())
            {
                log.Warn("Product from web have no sizes");
                ProductManager.Detach(product);
                productsFromWeb.Remove(product);
                --i;
                continue;
            }
            // Check in database if there are any same name products but with different url, there should only be one
            Product sameNameDifferentUrlProduct = ProductManager.GetProductByStoreAndName(Store.Id, product.Name);
            if (sameNameDifferentUrlProduct != null && !String.Equals(sameNameDifferentUrlProduct.Url, productPage.Url, StringComparison.InvariantCultureIgnoreCase) && !String.Equals(sameNameDifferentUrlProduct.Url, responseUrl, StringComparison.InvariantCultureIgnoreCase))
            {
                // check if product category or brand need to be merged
                bool needMerge = false;
                // merge category
                if (!String.IsNullOrWhiteSpace(product.CategoryUnparsed))
                {
                    if (String.IsNullOrWhiteSpace(sameNameDifferentUrlProduct.CategoryUnparsed))
                    {
                        sameNameDifferentUrlProduct.CategoryUnparsed = product.CategoryUnparsed;
                        needMerge = true;
                    }
                    else if (!sameNameDifferentUrlProduct.CategoryUnparsed.Contains(product.CategoryUnparsed))
                    {
                        sameNameDifferentUrlProduct.CategoryUnparsed += " " + product.CategoryUnparsed;
                        needMerge = true;
                    }
                }
                // merge brand
                if (String.IsNullOrWhiteSpace(sameNameDifferentUrlProduct.BrandUnparsed) && !String.IsNullOrWhiteSpace(product.BrandUnparsed))
                {
                    sameNameDifferentUrlProduct.BrandUnparsed = product.BrandUnparsed;
                    needMerge = true;
                }
                if (needMerge)
                    productsToMerged.Add(sameNameDifferentUrlProduct);
                // check if products to be merged have different prices
                if (sameNameDifferentUrlProduct.Price != product.Price || sameNameDifferentUrlProduct.SalePrice != product.SalePrice)
                {
                    log.Warn("Product to merge have different prices. Existing Product Url: " + sameNameDifferentUrlProduct.Url + " New Product Url: " + product.Url);
                    Interlocked.Increment(ref warningCount);
                }
                ProductManager.Detach(product);
                productsFromWeb.Remove(product);
                --i;
                continue;
            }
            // Check if product has already been added or updated within this scrape job
            bool alreadyUpdated;
            lock (updatedProductNames)
            {
                alreadyUpdated = updatedProductNames.Contains(product.Name);
            }
            if (alreadyUpdated)
            {
                log.Info("Product has already been updated");
                ProductManager.Detach(product);
                productsFromWeb.Remove(product);
                --i;
                // Remove products from db if any. This scenario occurs when new product url is redirected to url same as existing product url
                Product productFromDB = productsFromDB.Where(m => String.Equals(m.Name, product.Name, StringComparison.InvariantCultureIgnoreCase)).FirstOrDefault();
                if (productFromDB != null)
                {
                    ProductManager.Detach(productFromDB);
                    productsFromDB.Remove(productFromDB);
                }
                continue;
            }
            // Check if product from web doesn't have brand, but existing product from DB has brand. This to ensure existing product brand does not get removed
            if (String.IsNullOrWhiteSpace(product.BrandUnparsed) && productsFromDB.Where(m => m.Url == product.Url && m.BrandUnparsed != null).Any())
            {
                log.Info("Product from web doesn't have brand. But product from database has brand");
                product.BrandUnparsed = productsFromDB.Where(m => m.Url == product.Url && m.BrandUnparsed != null).First().BrandUnparsed;
            }
        }
        // The three queries below are deliberately left deferred, exactly as before: each
        // one is evaluated against the lists as they are at the time it is enumerated.
        // update if product from db exist by (url or redirectedUrl) and name
        IEnumerable<Product> productsToBeUpdated = (from p in productsFromWeb
                                                    where (productsFromDB.Select(m => m.Url.ToLower()).Contains(p.Url.ToLower()) || productsFromDB.Select(m => m.Url.ToLower()).Contains(productPage.Url.ToLower()))
                                                    && productsFromDB.Select(m => m.Name.ToLower()).Contains(p.Name.ToLower())
                                                    select p);
        if (productsToBeUpdated.Any())
        {
            Interlocked.Add(ref productUpdatedCount, productsToBeUpdated.Count());
            ProductManager.UpdateProduct(productsToBeUpdated, false, true);
            lock (updatedProductNames)
            {
                foreach (Product productToBeUpdated in productsToBeUpdated)
                    updatedProductNames.Add(productToBeUpdated.Name);
            }
            log.InfoFormat("Updated {0} product", productsToBeUpdated.Count());
        }
        // add if product from db does not exist by name
        IEnumerable<Product> productsToBeAdded = (from p in productsFromWeb
                                                  where !productsFromDB.Select(m => m.Name.ToLower()).Contains(p.Name.ToLower())
                                                  select p);
        if (productsToBeAdded.Any())
        {
            Interlocked.Add(ref productAddedCount, productsToBeAdded.Count());
            ProductManager.NewProduct(productsToBeAdded, false);
            lock (updatedProductNames)
            {
                foreach (Product productToBeAdded in productsToBeAdded)
                    updatedProductNames.Add(productToBeAdded.Name);
            }
            log.InfoFormat("Added {0} product", productsToBeAdded.Count());
        }
        // delete if product from db cannot be found from product page by name
        IEnumerable<Product> productsToBeDeleted = (from p in productsFromDB
                                                    where !productsFromWeb.Select(m => m.Name.ToLower()).Contains(p.Name.ToLower())
                                                    select p);
        if (productsToBeDeleted.Any())
        {
            Interlocked.Add(ref productDeletedCount, productsToBeDeleted.Count());
            ProductManager.DeleteProduct(productsToBeDeleted, false);
            log.InfoFormat("Deleted {0} product", productsToBeDeleted.Count());
        }
        // merge if product from db exist by name but different url
        if (productsToMerged.Any())
        {
            Interlocked.Add(ref productMergedCount, productsToMerged.Count);
            ProductManager.UpdateProduct(productsToMerged, false, false);
            log.InfoFormat("Merged {0} product", productsToMerged.Count);
        }
        db.SaveChanges();
    }
    catch (Exception e)
    {
        log.ErrorFormat("Error parsing product page. Url: {0} Exception: {1}", productPage.Url, e.ToString());
        // Group by stack trace; the dictionary is shared with the other worker threads.
        lock (scrapeErrorDict)
        {
            KeyValuePair<Exception, List<string>> recorded;
            if (scrapeErrorDict.TryGetValue(e.StackTrace, out recorded))
                recorded.Value.Add(productPage.Url);
            else
                scrapeErrorDict.Add(e.StackTrace, new KeyValuePair<Exception, List<string>>(e, new List<string>() { productPage.Url }));
        }
        Interlocked.Increment(ref errorCount);
    }
    finally
    {
        // discard all changes / caching to database context
        DataContext.DisposeDataContext();
        log.InfoFormat("Time taken {0}ms", Math.Round(DateTime.Now.Subtract(startTime).TotalMilliseconds, MidpointRounding.AwayFromZero));
    }
}
/// <summary>
/// Worker loop of a download thread: takes urls from the url queue until it is empty,
/// downloads each page and puts the result on the page queue. A WebException is attached to
/// the page and the page is still queued; any other exception is logged and recorded and the
/// page is dropped.
/// </summary>
/// <param name="productUrlQueue">Urls still to be downloaded.</param>
/// <param name="productPageQueue">Downloaded pages waiting to be parsed.</param>
private void DownloadProductPages(ConcurrentQueue<string> productUrlQueue, ConcurrentQueue<ProductPage> productPageQueue)
{
    string productUrl;
    while (productUrlQueue.TryDequeue(out productUrl))
    {
        // if there are too many productPage waiting to be parsed. Wait for 0.5 second before continue downloading
        while (productPageQueue.Count > Store.DownloadThreadCount * 5)
            Thread.Sleep(500);
        ProductPage productPage = new ProductPage();
        try
        {
            productPage.Url = productUrl;
            // Snapshot of the remaining urls, used only for the progress messages.
            int productUrlQueueCount = productUrlQueue.Count;
            log.InfoFormat("Downloading product page [{0}/{1}]: {2}", productUrlCount - productUrlQueueCount, productUrlCount, productPage.Url);
            DateTime startTime = DateTime.Now;
            string responseUrl;
            productPage.HtmlDocument = DownloadWebPage(productPage.Url, out responseUrl, cookieContainer: CookieContainer, proxy: Proxy, httpHeaders: HttpHeaders, webDriver: WebDriver);
            if (responseUrl != productPage.Url)
            {
                log.InfoFormat("Product url has been redirected: {0}", responseUrl);
                productPage.RedirectedUrl = responseUrl;
            }
            DateTime endTime = DateTime.Now;
            log.InfoFormat("Downloaded product page [{0}/{1}] Time taken {2}ms", productUrlCount - productUrlQueueCount, productUrlCount, Math.Round(endTime.Subtract(startTime).TotalMilliseconds, MidpointRounding.AwayFromZero));
            productPageQueue.Enqueue(productPage);
        }
        catch (WebException e)
        {
            // Http level failures are passed on: the parser decides what a 404/410 means.
            log.InfoFormat("There was an error downloading product page: {0} {1}", String.IsNullOrEmpty(productPage.RedirectedUrl) ? productPage.Url : productPage.RedirectedUrl, e.ToString());
            productPage.WebException = e;
            productPageQueue.Enqueue(productPage);
        }
        catch (Exception e)
        {
            log.ErrorFormat("An error ocurred while downloading product page: {0} {1}", productPage.Url, e.ToString());
            // Several download and scrape threads record errors at the same time, so the
            // dictionary and the counter must not be updated without synchronisation.
            lock (scrapeErrorDict)
            {
                KeyValuePair<Exception, List<string>> recorded;
                if (scrapeErrorDict.TryGetValue(e.StackTrace, out recorded))
                    recorded.Value.Add(productPage.Url);
                else
                    scrapeErrorDict.Add(e.StackTrace, new KeyValuePair<Exception, List<string>>(e, new List<string>() { productPage.Url }));
            }
            Interlocked.Increment(ref errorCount);
        }
    }
}
/// <summary>
/// Test helper: downloads and parses only the given url and returns the products found.
/// Unlike Scrape(ProductPage), this method does not call any ProductManager or data-context
/// write operation itself.
/// </summary>
/// <param name="productUrl">Product page url to test.</param>
/// <returns>The products parsed from the page; empty when the server answered 404 or 410.</returns>
/// <exception cref="InvalidOperationException">The page could not be downloaded at all.</exception>
public IEnumerable<Product> TestScrapeProduct(string productUrl)
{
    log.InfoFormat("Scraper test on url: {0}", productUrl);
    List<string> productUrls = new List<string>();
    productUrls.Add(productUrl);
    ConcurrentQueue<string> productUrlQ = new ConcurrentQueue<string>(productUrls);
    ConcurrentQueue<ProductPage> productPageQ = new ConcurrentQueue<ProductPage>();
    List<Product> products = new List<Product>();
    try
    {
        Thread thread = new Thread(new ThreadStart(() => DownloadProductPages(productUrlQ, productPageQ)));
        thread.Start();
        thread.Join();
        ProductPage productPage;
        // DownloadProductPages queues nothing when the download failed with a non-web
        // exception (it only logs and records it). Report that clearly instead of
        // running into a NullReferenceException on the next line.
        if (!productPageQ.TryDequeue(out productPage))
            throw new InvalidOperationException("Product page could not be downloaded: " + productUrl);
        // check if there was an error when downloading product page
        if (productPage.WebException != null)
        {
            if (productPage.WebException.Response != null)
            {
                // nothing to parse if server return 404 not found or 410 Gone
                HttpWebResponse errorResponse = (HttpWebResponse)productPage.WebException.Response;
                if (errorResponse.StatusCode == HttpStatusCode.NotFound || errorResponse.StatusCode == HttpStatusCode.Gone)
                {
                    log.InfoFormat("Status={0} Message={1}", errorResponse.StatusCode, productPage.WebException.Message);
                    return products;
                }
            }
        }
        products = SetupAndGetProducts(productPage.HtmlDocument, productPage.Url, productPage.RedirectedUrl, productPage.WebException);
        log.InfoFormat("Products found: {0}", products.Count);
        LogProduct(products);
        return products;
    }
    catch (Exception e)
    {
        log.ErrorFormat("Error getting products from url: {0}", e.ToString());
        throw;
    }
    finally
    {
        if (WebDriver != null)
            WebDriver.Dispose();
    }
}
/// <summary>
/// Test helper: runs only the url discovery step (GetProductUrls) and logs how many urls
/// were found and how many of them are distinct. Exceptions from the discovery step are
/// logged and not rethrown.
/// </summary>
public void ScrapeProductUrls()
{
    log.InfoFormat("Url scraping test for store {0} has started.", Store.Name);
    try
    {
        var discoveredUrls = GetProductUrls();
        int totalCount = discoveredUrls.Count();
        int distinctCount = discoveredUrls.Distinct().Count();
        log.InfoFormat("Url scraping test has completed. Found {0} urls. {1} distinct.", totalCount, distinctCount);
    }
    catch (Exception failure)
    {
        log.ErrorFormat("Error occured while getting product urls.");
        log.Error(failure.ToString());
    }
}
/// <summary>
/// Builds the product objects for the current page from the scraper's Get* methods.
/// With variations there is one product per variation (named "product name + variation
/// name"); without variations there is a single product using GetPrice, GetSalePrice,
/// GetSizes and GetImageUrl. The product photo is downloaded for every product.
/// </summary>
/// <returns>The products built for the current page.</returns>
/// <exception cref="Exception">A size entry has a colour but no size text.</exception>
protected List<Product> CreateProductObjects()
{
    List<Product> products = new List<Product>();
    string productName = GetName();
    productName = String.IsNullOrWhiteSpace(productName) ? null : ScraperUtil.NormalizeText(productName);
    // GetDescription() is not called here: the description handling had been commented
    // out in this method and that dead code has been removed.
    // Categories are usually scraped from product page's breadcrumb. Also check and remove product name from category
    string category = GetCategory();
    if (String.IsNullOrWhiteSpace(category))
    {
        category = null;
    }
    else
    {
        category = ScraperUtil.NormalizeText(category);
        if (!String.IsNullOrEmpty(productName))
        {
            if (String.Compare(category, productName, true) == 0)
                category = null;
            else
                category = category.Replace(productName, String.Empty).Trim();
        }
    }
    string brand = GetBrand();
    brand = String.IsNullOrWhiteSpace(brand) ? null : ScraperUtil.NormalizeText(brand);
    // check if there are product variations.
    // A null result is treated like "no variations", the same way a null GetSizes()
    // result is tolerated further down.
    List<ProductVariation> productVariations = GetVariations() ?? new List<ProductVariation>();
    if (productVariations.Any())
    {
        // One product per variation. (A disabled, commented-out branch that collapsed
        // variations with identical prices into one product has been removed.)
        foreach (ProductVariation variation in productVariations)
        {
            Product product = new Product();
            variation.Name = variation.Name ?? String.Empty;
            // Avoid repeating the product name when the variation name already contains it.
            product.Name = (String.IsNullOrWhiteSpace(productName)) ?
                variation.Name.Trim() : String.Format("{0} {1}", productName, variation.Name.Replace(productName, String.Empty).Trim());
            product.Name = ScraperUtil.NormalizeText(product.Name);
            product.CategoryUnparsed = category;
            product.BrandUnparsed = brand;
            product.Price = Math.Round(variation.Price, 2, MidpointRounding.AwayFromZero);
            // A sale price only counts when it is a real reduction.
            if (variation.SalePrice > 0 && variation.SalePrice < variation.Price)
                product.SalePrice = Math.Round(variation.SalePrice, 2, MidpointRounding.AwayFromZero);
            AddNormalizedSizes(product, variation.ProductSizes);
            product.Url = String.IsNullOrEmpty(RedirectedProductUrl) ? ProductUrl : RedirectedProductUrl;
            product.Store = Store;
            DownloadProductPhoto(variation.ImageUrl, product);
            products.Add(product);
        }
    }
    else
    {
        Product product = new Product();
        product.Name = productName;
        product.CategoryUnparsed = category;
        product.BrandUnparsed = brand;
        product.Price = Math.Round(GetPrice(), 2, MidpointRounding.AwayFromZero);
        decimal salePrice = Math.Round(GetSalePrice(), 2, MidpointRounding.AwayFromZero);
        // A sale price only counts when it is a real reduction.
        if (salePrice > 0 && salePrice < product.Price)
            product.SalePrice = salePrice;
        AddNormalizedSizes(product, GetSizes() ?? Enumerable.Empty<ProductSize>());
        product.Url = String.IsNullOrEmpty(RedirectedProductUrl) ? ProductUrl : RedirectedProductUrl;
        product.Store = Store;
        DownloadProductPhoto(GetImageUrl(), product);
        products.Add(product);
    }
    return products;
}

// Normalizes colour and size text of each entry and adds it to product.ProductSizes.
// Entries with neither size nor colour are skipped; a colour without a size is an error.
private void AddNormalizedSizes(Product product, IEnumerable<ProductSize> sizes)
{
    foreach (ProductSize productSize in sizes)
    {
        if (!String.IsNullOrWhiteSpace(productSize.Colour))
            productSize.Colour = ScraperUtil.NormalizeText(productSize.Colour);
        else
            productSize.Colour = null;
        if (!String.IsNullOrEmpty(productSize.Size))
            productSize.Size = ScraperUtil.NormalizeText(productSize.Size);
        else if (String.IsNullOrEmpty(productSize.Colour))
            continue;
        else
            throw new Exception("Product size has colour but no size. Colour: " + productSize.Colour);
        product.ProductSizes.Add(productSize);
    }
}
/// <summary>
/// Downloads the photo for a product through ProductManager, using this scraper's proxy and
/// photo-specific http headers. A failure is logged with the image url and then rethrown.
/// </summary>
/// <param name="imageUrl">Url of the product image.</param>
/// <param name="product">Product the photo belongs to.</param>
protected void DownloadProductPhoto(string imageUrl, Product product)
{
    try
    {
        WebProxy photoProxy = Proxy;
        var photoHeaders = HttpHeadersForProductPhoto;
        ProductManager.DownloadProductPhoto(imageUrl, product, proxy: photoProxy, httpHeaders: photoHeaders);
    }
    catch (Exception)
    {
        // Log which image failed, then let the caller deal with the original exception.
        log.ErrorFormat("Error downloading / processing product image. Image url: {0}", imageUrl);
        throw;
    }
}
/// <summary>
/// Writes one numbered info line per product: name and price, plus the sale price when the
/// product has a sale price greater than zero.
/// </summary>
/// <param name="products">Products to log, in enumeration order.</param>
private void LogProduct(IEnumerable<Product> products)
{
    int productNum = 1;
    foreach (Product p in products)
    {
        if (p.SalePrice > 0)
            log.InfoFormat("{0}. {1} Regular Price: ${2} Sale Price: ${3}", productNum, p.Name, p.Price, p.SalePrice);
        else
            log.InfoFormat("{0}. {1} ${2}", productNum, p.Name, p.Price);
        productNum++;
    }
}
}
}
ScraperUtil
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Net;
using HtmlAgilityPack;
using log4net;
using System.Xml;
using FashionExchange.Common.DAL;
using FashionExchange.Common.Enums;
using Newtonsoft.Json.Linq;
using System.Web;
using System.Collections.Specialized;
using OpenQA.Selenium.PhantomJS;
using System.Runtime.Caching;
using OpenQA.Selenium.Firefox;
using OpenQA.Selenium.Remote;
using System.IO.Compression;
using System.Web.Hosting;
namespace FashionExchange.Common.Utils
{
public class ScraperUtil
{
// Logger shared by the static helpers of this class, named after the class itself.
private static ILog log = LogManager.GetLogger(typeof(ScraperUtil).Name);
// Number of retries CheckForRedirectUrl performs after the first failed attempt.
private const int defaultRetryCount = 3;
// Urls recently passed to LoadHtml (queue constructed with 1000); LoadHtml refuses a url that is already present twice.
private static FixedSizeConcurrentQueue loadedHtmlUrls = new FixedSizeConcurrentQueue(1000);
// Same bookkeeping for CreateHttpGetRequest.
private static FixedSizeConcurrentQueue loadedHttpGetUrls = new FixedSizeConcurrentQueue(1000);
/// <summary>
/// Requests <paramref name="url"/> without following redirects and returns the value of the
/// "Location" response header, or the (HTML-decoded) url itself when no such header is present.
/// </summary>
/// <param name="url">Url to probe; it is HTML-decoded before the request is made.</param>
/// <param name="proxy">Optional proxy for the request.</param>
/// <param name="useHead">When true a HEAD request is sent first; after a protocol error the call falls back to the default method.</param>
/// <returns>The redirect target, or the decoded input url when the server did not redirect.</returns>
/// <exception cref="WebException">Rethrown when retries are exhausted, or immediately for errors that are not retried.</exception>
/// <exception cref="IOException">Rethrown when retries are exhausted.</exception>
public static string CheckForRedirectUrl(string url, WebProxy proxy = null, bool useHead = true)
{
    if (url.Contains("./"))
        FixDotInUri();
    url = WebUtility.HtmlDecode(url);
    // retry if request has been timed out or refused
    for (int attempt = 0; attempt <= defaultRetryCount; attempt++)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            if (useHead)
                request.Method = "HEAD";
            if (proxy != null)
                request.Proxy = proxy;
            request.UserAgent = FashionExchangeSetting.UserAgentChrome;
            request.AllowAutoRedirect = false;
            // Dispose the response on every path so the underlying connection is always released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                string location = response.GetResponseHeader("Location");
                return String.IsNullOrEmpty(location) ? url : location;
            }
        }
        catch (WebException e)
        {
            // give up if already retried n times
            if (attempt == defaultRetryCount)
                throw;
            if (e.Status == WebExceptionStatus.Timeout)
            {
                log.Warn(e.Message);
            }
            else if (e.Status == WebExceptionStatus.ProtocolError && useHead)
            {
                log.Warn(e.Message + " Retry without http request = head.");
                // The error response still holds a connection; release it before retrying.
                if (e.Response != null)
                    e.Response.Close();
                useHead = false;
            }
            else
            {
                throw;
            }
        }
        catch (IOException e)
        {
            // give up if already retried n times
            if (attempt == defaultRetryCount)
                throw;
            // Record the failure instead of retrying silently.
            log.Warn(e.Message);
        }
    }
    // Only reachable when the loop body never runs.
    return String.Empty;
}
/// <summary>
/// Convenience overload of LoadHtml for callers that do not need the final (redirected) url.
/// Every argument is forwarded unchanged to the overload with the out parameter.
/// </summary>
/// <returns>Whatever the forwarding target returns for the same arguments.</returns>
public static HtmlDocument LoadHtml(string url, CookieContainer cookieContainer = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null, Dictionary httpHeaders = null, bool checkUrlLoaded = true, RemoteWebDriver webDriver = null)
{
    // The redirect target is reported by the callee but intentionally dropped here.
    string discardedRedirectUrl;
    HtmlDocument document = LoadHtml(
        url,
        out discardedRedirectUrl,
        cookieContainer: cookieContainer,
        retryOnError: retryOnError,
        retryOnTimeout: retryOnTimeout,
        retryCount: retryCount,
        retryDelayInMilisecond: retryDelayInMilisecond,
        proxy: proxy,
        httpHeaders: httpHeaders,
        checkUrlLoaded: checkUrlLoaded,
        webDriver: webDriver);
    return document;
}
/// <summary>
/// Loads the page at <paramref name="url"/> into an HtmlDocument, either through the supplied
/// web driver or through HttpWebRequest with redirects followed manually, and rewrites relative
/// a/@href and img/@src values to absolute urls based on the final url.
/// </summary>
/// <param name="url">Page url; it is HTML-decoded before being requested.</param>
/// <param name="redirectedUrl">Receives the url the document was finally loaded from (the decoded url when no redirect happened).</param>
/// <param name="cookieContainer">Assigned to every HttpWebRequest created here.</param>
/// <param name="retryOnError">Retry on any exception, not only on the timeout / connection-closed cases.</param>
/// <param name="retryOnTimeout">Retry on request timeout or a 504 Gateway Timeout response.</param>
/// <param name="retryCount">Number of retries after the first attempt.</param>
/// <param name="retryDelayInMilisecond">Pause before each retry; no pause when 0 or less.</param>
/// <param name="proxy">Optional proxy for the HttpWebRequest path.</param>
/// <param name="httpHeaders">Extra headers applied through SetRequestHeaders.</param>
/// <param name="checkUrlLoaded">When true, a url already recorded twice in loadedHtmlUrls is rejected with an exception.</param>
/// <param name="webDriver">When not null the page is loaded through this driver instead of HttpWebRequest.</param>
/// <returns>The parsed document; null only if the retry loop never executes.</returns>
public static HtmlDocument LoadHtml(string url, out string redirectedUrl, CookieContainer cookieContainer = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null, Dictionary httpHeaders = null, bool checkUrlLoaded = true, RemoteWebDriver webDriver = null)
{
if (url.Contains("./"))
FixDotInUri();
// check if url has already been loaded in the past
if (checkUrlLoaded && loadedHtmlUrls.Contains(url) && loadedHtmlUrls.Where(m => m == url).Count() >= 2)
{
Exception e = new Exception("Url has already been loaded more than 2 times. Url: " + url);
e.Data[ParameterInfo.ExceptionData.Url] = url;
throw e;
}
else
loadedHtmlUrls.Enqueue(url);
// Note: the url is recorded above in its raw form; decoding happens only afterwards.
url = WebUtility.HtmlDecode(url);
redirectedUrl = url;
// Set after a ConnectionClosed failure so that later attempts use HTTP/1.0.
bool hasConnectionClosedError = false;
// retry if request has been timed out or refused
for (int i = 0; i <= retryCount; i++)
{
try
{
HtmlDocument htmlDocument = new HtmlDocument();
HttpWebResponse response;
bool redirected = false;
// Urls visited during this attempt, used to detect redirect loops.
List redirectedUrls = new List();
redirectedUrls.Add(redirectedUrl);
if (webDriver != null)
{
webDriver.Navigate().GoToUrl(url);
// Fixed five second wait after navigation before the page source is read.
System.Threading.Thread.Sleep(1000 * 5);
// These flags are static on HtmlNode, so removing them affects every HtmlDocument parsed afterwards.
HtmlNode.ElementsFlags.Remove("option");
HtmlNode.ElementsFlags.Remove("form");
htmlDocument.LoadHtml(webDriver.PageSource);
if (url != webDriver.Url)
redirectedUrl = webDriver.Url;
}
else
{
// Follow redirects manually (AllowAutoRedirect is off) until a non-redirect response is received.
do
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(redirectedUrl);
request.Method = "GET";
request.UserAgent = FashionExchangeSetting.UserAgentChrome;
request.AllowAutoRedirect = false;
request.CookieContainer = cookieContainer;
request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
/* Sometimes .net framework doesn't handle http request properly and instead throws
* web exception "The request was aborted: The connection was closed unexpectedly".
* Using a different HttpVersion.Version10 can get around this issue. */
if (hasConnectionClosedError)
request.ProtocolVersion = HttpVersion.Version10;
if (proxy != null)
request.Proxy = proxy;
SetRequestHeaders(request, httpHeaders);
// NOTE(review): the response is closed only on the normal path below; an exception thrown
// before response.Close() leaves it open - consider a using block.
response = (HttpWebResponse)request.GetResponse();
// Redirect responses are never followed for urls containing www.styletread.com.au; their body is parsed as-is.
if (!redirectedUrl.Contains("www.styletread.com.au")
&& (response.StatusCode == HttpStatusCode.Redirect
|| response.StatusCode == HttpStatusCode.TemporaryRedirect
|| response.StatusCode == HttpStatusCode.MovedPermanently
|| response.StatusCode == HttpStatusCode.Moved))
{
// prepend the scheme and host of the current url when the Location header is not an absolute http(s) url
if (!response.GetResponseHeader("Location").StartsWith("http"))
{
Uri uri = new Uri(redirectedUrl);
string domainUrl = new UriBuilder(uri.Scheme, uri.DnsSafeHost).ToString();
redirectedUrl = domainUrl + response.GetResponseHeader("Location").TrimStart('/'); // domainUrl will always have '/' at the end
}
else
redirectedUrl = response.GetResponseHeader("Location");
// append hash tag
if (url.Contains("#") && !redirectedUrl.Contains("#"))
redirectedUrl += StringUtil.SubstringToEnd(url, "#", includeStartStr: true);
// check if the new url has been redirected before. if yes throw an exception, else add new redirect url to redirectUrls list
if (redirectedUrls.Contains(redirectedUrl))
throw new WebException("Too many automatic redirections were attempted.", WebExceptionStatus.ProtocolError);
else
redirectedUrls.Add(redirectedUrl);
redirected = true;
}
else
{
redirected = false;
HtmlNode.ElementsFlags.Remove("option");
HtmlNode.ElementsFlags.Remove("form");
// NOTE(review): CharacterSet is dereferenced without a null check and passed to Encoding.GetEncoding
// unvalidated - confirm how responses without a (valid) charset should be handled.
htmlDocument.Load(response.GetResponseStream(), Encoding.GetEncoding(response.CharacterSet.Trim('"')));
}
response.Close();
} while (redirected);
}
// convert all img src and a href from relative to absolute urls
Uri finalUrl = new Uri(redirectedUrl);
string baseUrl = finalUrl.Scheme + "://" + finalUrl.Authority;
foreach (HtmlNode linkNode in htmlDocument.DocumentNode.SelectNodes("//a[@href and not(starts-with(@href, 'http'))]") ?? Enumerable.Empty())
{
// protocol-relative link: only the scheme is missing
if (linkNode.Attributes["href"].Value.StartsWith("//"))
linkNode.Attributes["href"].Value = finalUrl.Scheme + ":" + linkNode.Attributes["href"].Value;
// query-only link: same path as the current page
else if (linkNode.Attributes["href"].Value.StartsWith("?"))
linkNode.Attributes["href"].Value = baseUrl + finalUrl.AbsolutePath + linkNode.Attributes["href"].Value;
else if (linkNode.Attributes["href"].Value.StartsWith("../"))
{
// work out number of times traverse to parent
int backUpParentCount = 0;
int linkNodeStartIndex = 0;
while (linkNode.Attributes["href"].Value.IndexOf("../", linkNodeStartIndex) == linkNodeStartIndex)
{
linkNodeStartIndex += "../".Length; ;
backUpParentCount++;
}
// work out number of slashes we need to traverse and get the position index
int slashIndex = 0;
if (finalUrl.AbsolutePath.LastIndexOf('/') != 0)
{
slashIndex = finalUrl.AbsolutePath.LastIndexOf('/');
for (int backUpParentCounter = 0; backUpParentCounter < backUpParentCount; backUpParentCounter++)
{
slashIndex = finalUrl.AbsolutePath.LastIndexOf('/', slashIndex - 1);
// stop at the root slash; the path cannot be walked up any further
if (slashIndex == 0)
break;
}
}
linkNode.Attributes["href"].Value = baseUrl + finalUrl.AbsolutePath.Substring(0, slashIndex) + "/" + linkNode.Attributes["href"].Value.Substring(linkNodeStartIndex);
}
// root-relative link
else if (linkNode.Attributes["href"].Value.StartsWith("/"))
linkNode.Attributes["href"].Value = baseUrl + linkNode.Attributes["href"].Value;
else
{
// any other value is treated as relative to the directory of the current page
if (finalUrl.AbsolutePath == "/")
linkNode.Attributes["href"].Value = baseUrl + "/" + linkNode.Attributes["href"].Value;
else
linkNode.Attributes["href"].Value = baseUrl + finalUrl.AbsolutePath.Substring(0, finalUrl.AbsolutePath.TrimEnd('/').LastIndexOf('/')) + "/" + linkNode.Attributes["href"].Value;
}
}
// Same rewriting as above, applied to img/@src.
foreach (HtmlNode imgNode in htmlDocument.DocumentNode.SelectNodes("//img[@src and not(starts-with(@src, 'http'))]") ?? Enumerable.Empty())
{
if (imgNode.Attributes["src"].Value.StartsWith("//"))
imgNode.Attributes["src"].Value = finalUrl.Scheme + ":" + imgNode.Attributes["src"].Value;
else if (imgNode.Attributes["src"].Value.StartsWith("?"))
imgNode.Attributes["src"].Value = baseUrl + finalUrl.AbsolutePath + imgNode.Attributes["src"].Value;
else if (imgNode.Attributes["src"].Value.StartsWith("../"))
{
// work out number of times traverse to parent
int backUpParentCount = 0;
int imgNodeStartIndex = 0;
while (imgNode.Attributes["src"].Value.IndexOf("../", imgNodeStartIndex) == imgNodeStartIndex)
{
imgNodeStartIndex += "../".Length; ;
backUpParentCount++;
}
// work out number of slashes we need to traverse and get the position index
int slashIndex = 0;
if (finalUrl.AbsolutePath.LastIndexOf('/') != 0)
{
slashIndex = finalUrl.AbsolutePath.LastIndexOf('/');
for (int backUpParentCounter = 0; backUpParentCounter < backUpParentCount; backUpParentCounter++)
{
slashIndex = finalUrl.AbsolutePath.LastIndexOf('/', slashIndex - 1);
if (slashIndex == 0)
break;
}
}
imgNode.Attributes["src"].Value = baseUrl + finalUrl.AbsolutePath.Substring(0, slashIndex) + "/" + imgNode.Attributes["src"].Value.Substring(imgNodeStartIndex);
}
else if (imgNode.Attributes["src"].Value.StartsWith("/"))
imgNode.Attributes["src"].Value = baseUrl + imgNode.Attributes["src"].Value;
else
{
if (finalUrl.AbsolutePath == "/")
imgNode.Attributes["src"].Value = baseUrl + "/" + imgNode.Attributes["src"].Value;
else
imgNode.Attributes["src"].Value = baseUrl + finalUrl.AbsolutePath.Substring(0, finalUrl.AbsolutePath.TrimEnd('/').LastIndexOf('/')) + "/" + imgNode.Attributes["src"].Value;
}
}
return htmlDocument;
}
catch (Exception e)
{
log.Info("An exception occured while loading url: " + redirectedUrl);
// Attach the failing url so that callers can read it from the exception.
e.Data[ParameterInfo.ExceptionData.Url] = redirectedUrl;
// give up if already retried n times
if (i == retryCount)
throw;
// If web exception is connection closed unexpectedly, retry with different Protocol Version (i.e. HttpVersion10)
// Else retry if request timeout and retryOnTimeout = true, or if retryOnError = true
if (e is WebException && ((WebException)e).Status == WebExceptionStatus.ConnectionClosed && !hasConnectionClosedError)
hasConnectionClosedError = true;
else if (retryOnTimeout
&& e is WebException
&& ((((WebException)e).Status == WebExceptionStatus.Timeout)
|| (((WebException)e).Response != null && ((HttpWebResponse)((WebException)e).Response).StatusCode == HttpStatusCode.GatewayTimeout)))
log.Warn(e.Message);
// a web driver failure whose inner exception is a request timeout
else if (retryOnTimeout
&& e is OpenQA.Selenium.WebDriverException
&& ((OpenQA.Selenium.WebDriverException)e).InnerException is WebException
&& ((WebException)((OpenQA.Selenium.WebDriverException)e).InnerException).Status == WebExceptionStatus.Timeout)
log.Warn(e.Message);
else if (retryOnError)
log.Warn(e.Message);
else
throw;
if (retryDelayInMilisecond > 0)
{
log.Info("Retry loading html page after " + retryDelayInMilisecond + " milliseconds. Retry count: " + (i + 1));
System.Threading.Thread.Sleep(retryDelayInMilisecond);
}
else
{
log.Info("Retry loading html page. Retry count: " + (i + 1));
}
}
}
// Only reachable when the loop body never runs (negative retryCount).
return null;
}
/// <summary>
/// Downloads <paramref name="url"/> with a GET request and parses the response body as XML.
/// </summary>
/// <param name="url">Url of the XML resource; it is HTML-decoded before the request is made.</param>
/// <param name="cookieContainer">Assigned to the request.</param>
/// <param name="retryOnError">Retry on any exception, not only on timeouts.</param>
/// <param name="retryOnTimeout">Retry on request timeout or a 504 Gateway Timeout response.</param>
/// <param name="retryCount">Number of retries after the first attempt.</param>
/// <param name="retryDelayInMilisecond">Pause before each retry; no pause when 0 or less.</param>
/// <param name="httpHeaders">Extra headers applied through SetRequestHeaders.</param>
/// <returns>The parsed document; null only if the retry loop never executes.</returns>
/// <remarks>
/// The failing url is stored in the exception's Data before it is rethrown. The response and its
/// stream are disposed on every path, so a parse failure no longer leaks a connection per retry.
/// </remarks>
public static XmlDocument LoadXml(string url, CookieContainer cookieContainer = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, Dictionary httpHeaders = null)
{
    if (url.Contains("./"))
        FixDotInUri();
    url = WebUtility.HtmlDecode(url);
    // retry if request has been timed out or refused
    for (int i = 0; i <= retryCount; i++)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";
            request.UserAgent = FashionExchangeSetting.UserAgentChrome;
            request.CookieContainer = cookieContainer;
            SetRequestHeaders(request, httpHeaders);
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            {
                XmlDocument xml = new XmlDocument();
                xml.Load(responseStream);
                return xml;
            }
        }
        catch (Exception e)
        {
            log.InfoFormat("An exception occured while loading xml. Url: {0}", url);
            e.Data[ParameterInfo.ExceptionData.Url] = url;
            // give up if already retried n times
            if (i == retryCount)
                throw;
            WebException webError = e as WebException;
            HttpWebResponse errorResponse = webError == null ? null : webError.Response as HttpWebResponse;
            bool timedOut = webError != null
                && (webError.Status == WebExceptionStatus.Timeout
                    || (errorResponse != null && errorResponse.StatusCode == HttpStatusCode.GatewayTimeout));
            if ((retryOnTimeout && timedOut) || retryOnError)
                log.Warn(e.Message);
            else
                throw;
            if (retryDelayInMilisecond > 0)
            {
                log.Info("Retry loading xml after " + retryDelayInMilisecond + " milliseconds. Retry count: " + (i + 1));
                System.Threading.Thread.Sleep(retryDelayInMilisecond);
            }
            else
            {
                log.Info("Retry loading xml. Retry count: " + (i + 1));
            }
        }
    }
    // Only reachable when the loop body never runs (negative retryCount).
    return null;
}
/// <summary>
/// Removes every default namespace declaration (the text from xmlns=" up to and including the
/// closing quote, as located by StringUtil.Substrings) from the given xml text.
/// </summary>
/// <param name="xml">Xml markup to clean.</param>
/// <returns>The markup with each located declaration replaced by an empty string.</returns>
public static string RemoveXmlNamespaces(string xml)
{
    string stripped = xml;
    // The declarations are located once on the original text, then removed one after another.
    foreach (string declaration in StringUtil.Substrings(xml, "xmlns=\"", "\"", includeStartStr: true, includeEndStr: true))
    {
        stripped = stripped.Replace(declaration, String.Empty);
    }
    return stripped;
}
/// <summary>
/// Cleans scraped text: HTML-decodes it, turns line breaks, tabs and non-breaking spaces into
/// plain spaces, drops stray control / mojibake characters, maps accented letters and typographic
/// symbols to ASCII equivalents, trims the result and collapses runs of spaces to a single space.
/// </summary>
/// <param name="text">Raw text; null or empty is returned unchanged.</param>
/// <returns>The normalized text.</returns>
public static string NormalizeText(string text)
{
    if (String.IsNullOrEmpty(text))
        return text;
    text = WebUtility.HtmlDecode(text)
        .Replace("\n", " ")
        .Replace("\r", " ")
        .Replace("\t", " ")
        .Replace("\u00A0", " ")
        .Replace("\u0085", String.Empty)
        .Replace("\u0080", String.Empty)
        .Replace("\u0093", String.Empty)
        .Replace("\u0099", String.Empty)
        .Replace("\u0091", String.Empty)
        .Replace("\u0092", String.Empty)
        .Replace("\u0094", String.Empty)
        .Replace("\u009f", String.Empty)
        .Replace("\u008c", String.Empty)
        .Replace("\uff08", "(")
        .Replace("\uff09", ")")
        .Replace("¢", String.Empty)
        .Replace("„", String.Empty)
        .Replace("€", String.Empty)
        .Replace("™", String.Empty)
        .Replace("®", String.Empty)
        .Replace("“", String.Empty)
        .Replace("–º", String.Empty)
        .Replace("â", String.Empty)
        .Replace("Â", String.Empty)
        // Doubly-encoded ampersands survive HtmlDecode as the entity text; finish decoding them.
        // The entity is spelled as a concatenation so that it survives tools that HTML-decode source files.
        .Replace("&" + "amp;", "&")
        .Replace("è", "e")
        .Replace("é", "e")
        .Replace("ë", "e")
        .Replace("ç", "c")
        .Replace("ć", "c")
        .Replace("Ć", "C")
        .Replace("ô", "o")
        .Replace("ó", "o")
        .Replace("ò", "o")
        .Replace("Ò", "O")
        .Replace("ø", "o")
        .Replace("Ê", "E")
        .Replace("É", "E")
        .Replace("È", "E")
        .Replace("Ë", "E")
        .Replace("ù", "u")
        .Replace("ü", "u")
        .Replace("ä", "a")
        .Replace("Ã", "A")
        .Replace("Å", "A")
        .Replace("å", "a")
        .Replace("à", "a")
        .Replace("á", "a")
        .Replace("–", "-")
        .Replace("\uFFFD", "?") // Unicode replacement character
        .Replace("’", "'")
        .Replace("⁺", "+")
        .Replace("▽", String.Empty)
        .Replace("`", "'")
        .Replace("ï", "i")
        .Replace("Ï", "I")
        .Replace("í", "i")
        .Replace("Í", "I")
        .Replace("⅜", "3/8")
        .Replace("¾", "3/4")
        .Replace("¼", "1/4")
        .Replace("½", "1/2")
        .Replace("⅝", "5/8")
        .Replace("⅛", "1/8")
        .Replace("″", "\"")
        .Replace("ð", String.Empty);
    // Hidden characters (ref to tasks #480 and #4165): strip invisible Unicode format characters
    // such as zero-width spaces and byte order marks. An empty search string cannot be used for
    // this because String.Replace throws ArgumentException when oldValue is empty.
    StringBuilder visible = new StringBuilder(text.Length);
    foreach (char c in text)
    {
        if (System.Globalization.CharUnicodeInfo.GetUnicodeCategory(c) != System.Globalization.UnicodeCategory.Format)
            visible.Append(c);
    }
    text = visible.ToString().Trim();
    // remove extra spaces: collapse every run of spaces into one. The loop must look for TWO
    // spaces; searching for a single space would never terminate.
    while (text.Contains("  "))
        text = text.Replace("  ", " ");
    return text;
}
/// <summary>
/// Sends a GET request to <paramref name="url"/> and returns the response body as a string.
/// </summary>
/// <param name="url">Target url; it is HTML-decoded before the request is made.</param>
/// <param name="cookieContainer">Assigned to the request.</param>
/// <param name="accept">Value for the Accept header; ignored when null or empty.</param>
/// <param name="httpHeaders">Extra headers applied through SetRequestHeaders.</param>
/// <param name="retryOnError">Retry on any exception, not only on timeouts.</param>
/// <param name="retryOnTimeout">Retry on request timeout or a 504 Gateway Timeout response.</param>
/// <param name="retryCount">Number of retries after the first attempt.</param>
/// <param name="retryDelayInMilisecond">Pause before each retry; no pause when 0 or less.</param>
/// <param name="proxy">Optional proxy for the request.</param>
/// <param name="checkUrlLoaded">When true, a url already recorded twice in loadedHttpGetUrls is rejected with an exception.</param>
/// <returns>The response body; null only if the retry loop never executes.</returns>
/// <remarks>
/// The failing url is stored in the exception's Data before it is rethrown. The response is
/// disposed on every path, including a failure while the body is being read.
/// </remarks>
public static string CreateHttpGetRequest(string url, CookieContainer cookieContainer = null, string accept = null, Dictionary httpHeaders = null, bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null, bool checkUrlLoaded = true)
{
    // check if url has already been loaded in the past
    if (checkUrlLoaded && loadedHttpGetUrls.Contains(url) && loadedHttpGetUrls.Where(m => m == url).Count() >= 2)
    {
        Exception e = new Exception("Url has already been loaded more than 2 times. Url: " + url);
        e.Data[ParameterInfo.ExceptionData.Url] = url;
        throw e;
    }
    else
        loadedHttpGetUrls.Enqueue(url);
    url = WebUtility.HtmlDecode(url);
    bool hasConnectionClosedError = false;
    // retry if request has been timed out or refused
    for (int i = 0; i <= retryCount; i++)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";
            request.UserAgent = FashionExchangeSetting.UserAgentChrome;
            request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
            request.CookieContainer = cookieContainer;
            /* Sometimes .net framework doesn't handle http request properly and instead throws
             * web exception "The request was aborted: The connection was closed unexpectedly".
             * Using a different HttpVersion.Version10 can get around this issue. */
            if (hasConnectionClosedError)
                request.ProtocolVersion = HttpVersion.Version10;
            if (proxy != null)
                request.Proxy = proxy;
            if (!String.IsNullOrEmpty(accept))
                request.Accept = accept;
            SetRequestHeaders(request, httpHeaders);
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception e)
        {
            // Every exception type lands here, so the message must not claim a WebException.
            log.InfoFormat("An exception occured while sending request. Url: {0}", url);
            e.Data[ParameterInfo.ExceptionData.Url] = url;
            // give up if already retried n times
            if (i == retryCount)
                throw;
            WebException webError = e as WebException;
            HttpWebResponse errorResponse = webError == null ? null : webError.Response as HttpWebResponse;
            bool timedOut = webError != null
                && (webError.Status == WebExceptionStatus.Timeout
                    || (errorResponse != null && errorResponse.StatusCode == HttpStatusCode.GatewayTimeout));
            // If web exception is connection closed unexpectedly, retry with different Protocol Version (i.e. HttpVersion10)
            // Else retry if request timeout and retryOnTimeout = true, or if retryOnError = true
            if (webError != null && webError.Status == WebExceptionStatus.ConnectionClosed && !hasConnectionClosedError)
                hasConnectionClosedError = true;
            else if ((retryOnTimeout && timedOut) || retryOnError)
                log.Warn(e.Message);
            else
                throw;
            if (retryDelayInMilisecond > 0)
            {
                log.Info("Retry sending request after " + retryDelayInMilisecond + " milliseconds. Retry count: " + (i + 1));
                System.Threading.Thread.Sleep(retryDelayInMilisecond);
            }
            else
            {
                log.Info("Retry sending request. Retry count: " + (i + 1));
            }
        }
    }
    // Only reachable when the loop body never runs (negative retryCount).
    return null;
}
/// <summary>
/// Posts <paramref name="postData"/> (UTF-8 encoded) to <paramref name="url"/> and returns the
/// response body as a string.
/// </summary>
/// <param name="url">Target url; it is HTML-decoded before the request is made.</param>
/// <param name="postData">Request body; null is sent as an empty body.</param>
/// <param name="httpHeaders">Extra headers applied through SetRequestHeaders.</param>
/// <param name="cookieContainer">Assigned to the request.</param>
/// <param name="ContentType">Content-Type of the request body.</param>
/// <param name="retryOnError">Retry on any exception, not only on timeouts.</param>
/// <param name="retryOnTimeout">Retry on request timeout or a 504 Gateway Timeout response.</param>
/// <param name="retryCount">Number of retries after the first attempt.</param>
/// <param name="retryDelayInMilisecond">Pause before each retry; no pause when 0 or less.</param>
/// <param name="proxy">Optional proxy for the request.</param>
/// <returns>The response body; null only if the retry loop never executes.</returns>
/// <remarks>
/// The failing url is stored in the exception's Data before it is rethrown. The response is
/// disposed on every path, including a failure while the body is being read.
/// </remarks>
public static string CreateHttpPostRequest(string url, string postData, Dictionary httpHeaders = null, CookieContainer cookieContainer = null, string ContentType = "application/x-www-form-urlencoded; charset=UTF-8", bool retryOnError = false, bool retryOnTimeout = true, int retryCount = 3, int retryDelayInMilisecond = 0, WebProxy proxy = null)
{
    postData = postData ?? String.Empty;
    url = WebUtility.HtmlDecode(url);
    bool hasConnectionClosedError = false;
    byte[] postDataBytes = UTF8Encoding.UTF8.GetBytes(postData);
    // retry if request has been timed out or refused
    for (int i = 0; i <= retryCount; i++)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "POST";
            request.ContentLength = postDataBytes.Length;
            request.ContentType = ContentType;
            request.UserAgent = FashionExchangeSetting.UserAgentChrome;
            request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
            request.CookieContainer = cookieContainer;
            /* Sometimes .net framework doesn't handle http request properly and instead throws
             * web exception "The request was aborted: The connection was closed unexpectedly".
             * Using a different HttpVersion.Version10 can get around this issue. */
            if (hasConnectionClosedError)
                request.ProtocolVersion = HttpVersion.Version10;
            if (proxy != null)
                request.Proxy = proxy;
            SetRequestHeaders(request, httpHeaders);
            using (Stream postStream = request.GetRequestStream())
            {
                postStream.Write(postDataBytes, 0, postDataBytes.Length);
            }
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception e)
        {
            log.InfoFormat("An exception occured while posting request. Url: {0}", url);
            e.Data[ParameterInfo.ExceptionData.Url] = url;
            // give up if already retried n times
            if (i == retryCount)
                throw;
            WebException webError = e as WebException;
            HttpWebResponse errorResponse = webError == null ? null : webError.Response as HttpWebResponse;
            bool timedOut = webError != null
                && (webError.Status == WebExceptionStatus.Timeout
                    || (errorResponse != null && errorResponse.StatusCode == HttpStatusCode.GatewayTimeout));
            // If web exception is connection closed unexpectedly, retry with different Protocol Version (i.e. HttpVersion10)
            // Else retry if request timeout and retryOnTimeout = true, or if retryOnError = true
            if (webError != null && webError.Status == WebExceptionStatus.ConnectionClosed && !hasConnectionClosedError)
                hasConnectionClosedError = true;
            else if ((retryOnTimeout && timedOut) || retryOnError)
                log.Warn(e.Message);
            else
                throw;
            if (retryDelayInMilisecond > 0)
            {
                log.Info("Retry posting request after " + retryDelayInMilisecond + " milliseconds. Retry count: " + (i + 1));
                System.Threading.Thread.Sleep(retryDelayInMilisecond);
            }
            else
            {
                log.Info("Retry posting request. Retry count: " + (i + 1));
            }
        }
    }
    // Only reachable when the loop body never runs (negative retryCount).
    return null;
}
/// <summary>
/// Downloads the resource at <paramref name="destFilePath"/> with an HTTP GET and writes it to
/// <paramref name="localFilePath"/>, creating or overwriting that file.
/// </summary>
/// <param name="destFilePath">Url of the remote file (despite the name, this is the download source).</param>
/// <param name="localFilePath">Path of the local file to write.</param>
/// <remarks>
/// The response, the network stream and the file handle are released even when the transfer
/// fails part-way.
/// </remarks>
public static void DownloadFile(string destFilePath, string localFilePath)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(destFilePath);
    request.Method = WebRequestMethods.Http.Get;
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (FileStream writer = new FileStream(localFilePath, FileMode.Create))
    {
        byte[] buffer = new byte[2048];
        int readCount;
        // Read returns 0 once the end of the response stream has been reached.
        while ((readCount = responseStream.Read(buffer, 0, buffer.Length)) > 0)
        {
            writer.Write(buffer, 0, readCount);
        }
    }
}
/// <summary>
/// Applies the given header name/value pairs to <paramref name="request"/>. Headers that
/// HttpWebRequest exposes through dedicated members (User-Agent, Referer, Range, Expect, Accept,
/// Content-Type, Connection keep-alive) are set through those members or the workarounds below;
/// every other header is written to request.Headers directly.
/// </summary>
/// <param name="request">Request to configure.</param>
/// <param name="httpHeaders">Headers to apply; null or empty leaves the request untouched.</param>
public static void SetRequestHeaders(HttpWebRequest request, Dictionary httpHeaders)
{
    if (httpHeaders == null || !httpHeaders.Any())
        return;

    foreach (var httpHeader in httpHeaders)
    {
        switch (httpHeader.Key)
        {
            case "User-Agent":
                request.UserAgent = httpHeader.Value;
                break;

            case "Referer":
                request.Referer = httpHeader.Value;
                break;

            case "Range":
                // The end of the range is whatever follows the "-"; the start depends on whether a "unit=" prefix is present.
                long endRange = Convert.ToInt64(StringUtil.SubstringToEnd(httpHeader.Value, "-"));
                if (httpHeader.Value.Contains("="))
                {
                    string rangeSpecifier = StringUtil.SubstringFromStart(httpHeader.Value, "=");
                    long specifiedStart = Convert.ToInt64(StringUtil.Substring(httpHeader.Value, "=", "-"));
                    request.AddRange(rangeSpecifier, specifiedStart, endRange);
                }
                else
                {
                    long plainStart = Convert.ToInt64(StringUtil.SubstringFromStart(httpHeader.Value, "-"));
                    request.AddRange(plainStart, endRange);
                }
                break;

            case "Expect":
                request.ServicePoint.Expect100Continue = Convert.ToBoolean(httpHeader.Value);
                break;

            case "Accept":
                request.Accept = httpHeader.Value;
                break;

            case "Connection":
                if (String.Equals(httpHeader.Value, "Keep-Alive"))
                {
                    // For some reason Keep-Alive is set only on first request. Below is work around using reflection to set Keep-Alive on every request
                    ServicePoint endpoint = request.ServicePoint;
                    System.Reflection.PropertyInfo behaviourProperty = endpoint.GetType().GetProperty("HttpBehaviour", System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic);
                    behaviourProperty.SetValue(endpoint, (byte)0, null);
                }
                else if (String.Equals(httpHeader.Value, "keep-alive"))
                {
                    // Similar to above but setting Keep-Alive as lower case. Note following reflection code appends keep-alive instead of replacing existing value. Therefore Connection: keep-alive,Keep-Alive. But this only happens on first request for same domain. Subsequent requests will be Connection: keep-alive
                    request.Headers.GetType().InvokeMember(
                        "ChangeInternal",
                        System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.InvokeMethod,
                        Type.DefaultBinder,
                        request.Headers,
                        new object[] { "Connection", "keep-alive" }
                    );
                }
                else
                {
                    // Any other Connection value is written like an ordinary header.
                    request.Headers[httpHeader.Key] = httpHeader.Value;
                }
                break;

            case "Proxy-Authorization":
                request.Headers[httpHeader.Key] = httpHeader.Value;
                request.PreAuthenticate = true;
                break;

            case "Content-Type":
                request.ContentType = httpHeader.Value;
                break;

            default:
                request.Headers[httpHeader.Key] = httpHeader.Value;
                break;
        }
    }
}
/*
 * Workaround for a .Net framework Uri behaviour that drops a dot at the end of a path segment.
 * For example: http://www.thevetshed.com.au/buy/adjustable-3-8-puppy-kitten-cat-harness-w-lead./CATHARN
 * becomes http://www.thevetshed.com.au/buy/adjustable-3-8-puppy-kitten-cat-harness-w-lead/CATHARN
 *
 * The workaround clears one flag bit on the http and https UriParser instances through
 * reflection. It does nothing when the non-public members it relies on cannot be found.
 *
 * Reference: http://stackoverflow.com/questions/856885/httpwebrequest-to-url-with-dot-at-the-end
 */
private static void FixDotInUri()
{
    // Bit cleared on the parser flags (the CanonicalizeAsFilePath attribute).
    const int canonicalizeAsFilePath = 0x1000000;

    System.Reflection.MethodInfo syntaxLookup = typeof(UriParser).GetMethod("GetSyntax", System.Reflection.BindingFlags.Static | System.Reflection.BindingFlags.NonPublic);
    System.Reflection.FieldInfo parserFlags = typeof(UriParser).GetField("m_Flags", System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic);
    if (syntaxLookup == null || parserFlags == null)
        return;

    string[] schemes = { "http", "https" };
    for (int index = 0; index < schemes.Length; index++)
    {
        UriParser parser = (UriParser)syntaxLookup.Invoke(null, new object[] { schemes[index] });
        if (parser == null)
            continue;

        int current = (int)parserFlags.GetValue(parser);
        if ((current & canonicalizeAsFilePath) == 0)
            continue;

        parserFlags.SetValue(parser, current & ~canonicalizeAsFilePath);
    }
}
/// <summary>
/// Logs in to the admin area of <paramref name="hostAddress"/> with the configured admin
/// credentials and returns the cookie container that received the login response's cookies.
/// </summary>
/// <param name="hostAddress">Base address the configured admin login path is appended to.</param>
/// <returns>The cookie container attached to the login request.</returns>
/// <remarks>
/// The credentials are URL-encoded before being placed in the form body, so characters such as
/// '&amp;', '+', '=' or '%' in the user name or password no longer corrupt the posted fields.
/// </remarks>
public static CookieContainer GetLoginCookieContainer(string hostAddress)
{
    // login
    string loginUrl = hostAddress + FashionExchangeSetting.AdminLoginPath;
    string userName = FashionExchangeSetting.AdminUserName;
    string password = FashionExchangeSetting.AdminPassword;
    string postData = String.Format(
        "email={0}&password={1}",
        Uri.EscapeDataString(userName ?? String.Empty),
        Uri.EscapeDataString(password ?? String.Empty));
    byte[] postDataBytes = UTF8Encoding.UTF8.GetBytes(postData);
    HttpWebRequest loginReq = (HttpWebRequest)WebRequest.Create(loginUrl);
    loginReq.Method = "POST";
    loginReq.ContentType = "application/x-www-form-urlencoded";
    loginReq.ContentLength = postDataBytes.Length;
    // The redirect that may follow a login is not followed; only the cookies are of interest.
    loginReq.AllowAutoRedirect = false;
    loginReq.CookieContainer = new CookieContainer();
    using (Stream postStream = loginReq.GetRequestStream())
    {
        postStream.Write(postDataBytes, 0, postDataBytes.Length);
    }
    // The body is not needed; dispose the response straight away to release the connection.
    using (HttpWebResponse loginResp = (HttpWebResponse)loginReq.GetResponse())
    {
    }
    return loginReq.CookieContainer;
}
/// <summary>
/// Triggers a scrape run on the admin site at <paramref name="hostAddress"/> by posting all
/// options as form fields to the configured admin scrape-store path. The call does not wait for
/// the run to finish: the request times out after two seconds and that timeout is treated as success.
/// </summary>
/// <param name="hostAddress">Base address the configured scrape-store path is appended to.</param>
/// <param name="storeIds">Stores to scrape; null is treated as an empty list.</param>
/// <param name="loginCookieContainer">Cookies of an authenticated admin session, assigned to the request.</param>
/// <remarks>
/// Every remaining parameter is posted unchanged under its corresponding
/// ParameterInfo.QueryString name. A WebException other than a timeout is rethrown. When the
/// server does answer within the timeout, the response is disposed so the connection is released.
/// </remarks>
public static void ScrapeStore(string hostAddress, int[] storeIds = null, bool scrapeNewUrl = false, double? scrapeExistingUrlAgeInHour = null, int? scrapeExistingUrlPriceDroppedInDay = null, bool uploadProductPhoto = false, bool checkNewsletterProductAvailabilityAndWatermarkPhoto = false, bool uploadNewsletterPhoto = false, bool uploadOfferImage = false, bool syncUserActivity = false, bool reindex = false, bool uploadIndex = false, bool sendPriceAlert = false, bool sendSaleAlert = false, bool deleteOldIndex = false, bool deleteOldPhoto = false, int scraperThreadCount = 10, CookieContainer loginCookieContainer = null, bool notifyTodaysSale = false, bool checkStoreForScrapeAlert = false, bool sendScrapeResultSummary = false, int? deleteProductAgeInDay = null)
{
    if (storeIds == null)
        storeIds = new int[0];
    // One store id field per store, followed by one field per option.
    string postData = String.Join("&", storeIds.Select(m => ParameterInfo.QueryString.StoreId + "=" + m));
    postData += "&" + ParameterInfo.QueryString.ScrapeNewUrl + "=" + scrapeNewUrl;
    postData += "&" + ParameterInfo.QueryString.ScrapeExistingUrlAgeInHour + "=" + scrapeExistingUrlAgeInHour;
    postData += "&" + ParameterInfo.QueryString.ScrapeExistingUrlPriceDroppedInDay + "=" + scrapeExistingUrlPriceDroppedInDay;
    postData += "&" + ParameterInfo.QueryString.UploadProductPhoto + "=" + uploadProductPhoto;
    postData += "&" + ParameterInfo.QueryString.CheckNewsletterProductAvailabilityAndWatermarkPhoto + "=" + checkNewsletterProductAvailabilityAndWatermarkPhoto;
    postData += "&" + ParameterInfo.QueryString.UploadNewsletterPhoto + "=" + uploadNewsletterPhoto;
    postData += "&" + ParameterInfo.QueryString.UploadOfferImage + "=" + uploadOfferImage;
    postData += "&" + ParameterInfo.QueryString.SyncUserActivity + "=" + syncUserActivity;
    postData += "&" + ParameterInfo.QueryString.Reindex + "=" + reindex;
    postData += "&" + ParameterInfo.QueryString.UploadIndex + "=" + uploadIndex;
    postData += "&" + ParameterInfo.QueryString.SendPriceAlert + "=" + sendPriceAlert;
    postData += "&" + ParameterInfo.QueryString.SendSaleAlert + "=" + sendSaleAlert;
    postData += "&" + ParameterInfo.QueryString.DeleteOldIndex + "=" + deleteOldIndex;
    postData += "&" + ParameterInfo.QueryString.DeleteOldPhoto + "=" + deleteOldPhoto;
    postData += "&" + ParameterInfo.QueryString.ScraperThreadCount + "=" + scraperThreadCount;
    postData += "&" + ParameterInfo.QueryString.NotifyTodaysSale + "=" + notifyTodaysSale;
    postData += "&" + ParameterInfo.QueryString.CheckStoreForScrapeAlert + "=" + checkStoreForScrapeAlert;
    postData += "&" + ParameterInfo.QueryString.SendScrapeResultSummary + "=" + sendScrapeResultSummary;
    postData += "&" + ParameterInfo.QueryString.DeleteProductAgeInDay + "=" + deleteProductAgeInDay;
    byte[] postDataBytes = UTF8Encoding.UTF8.GetBytes(postData);
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(hostAddress + FashionExchangeSetting.AdminScrapeStorePath);
    request.Method = WebRequestMethods.Http.Post;
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = postDataBytes.Length;
    request.AllowAutoRedirect = false;
    request.CookieContainer = loginCookieContainer;
    request.Timeout = 1000 * 2;
    using (Stream postStream = request.GetRequestStream())
    {
        postStream.Write(postDataBytes, 0, postDataBytes.Length);
    }
    try
    {
        // If the server answers within the timeout, dispose the response instead of leaking it.
        using (request.GetResponse())
        {
        }
    }
    catch (WebException e)
    {
        // Request was set to timeout after 2 seconds because we don't want to wait for the request to complete
        if (e.Status != WebExceptionStatus.Timeout)
            throw;
    }
}
private static Dictionary> proxies = new Dictionary>();
private static Dictionary, List> proxiesValidatedByUrl = new Dictionary, List>();
private static Object proxyLock = new Object();
public static List GetProxies(ProxyNetwork proxyNetwork = ProxyNetwork.MyPrivateProxy, string testUrl = null, int maxTestUrlAttemptCount = 30, int maxWorkingProxyCount = 20, bool disableCache = false)
{
if (proxyNetwork == ProxyNetwork.MyPrivateProxy)
{
lock (proxyLock)
{
if (!proxies.ContainsKey(ProxyNetwork.MyPrivateProxy))
{
log.Info("Get MyPrivateProxy proxy list...");
proxies[ProxyNetwork.MyPrivateProxy] = new List();
string proxyListResponse = ScraperUtil.CreateHttpGetRequest("https://api.myprivateproxy.net/v1/fetchProxies/json/full/gkytva0jbisl9olooyzjmpbdfcrpmn9i", checkUrlLoaded: false);
JObject proxyListJson = JObject.Parse("{Wrapper:" + proxyListResponse + "}");
foreach (JToken proxyListItem in proxyListJson.SelectToken("Wrapper"))
{
log.Info("Found proxy: " + proxyListItem["proxy_ip"].ToString() + ":" + proxyListItem["proxy_port"].ToString());
WebProxy proxy = new WebProxy(proxyListItem["proxy_ip"].ToString(), Convert.ToInt32(proxyListItem["proxy_port"].ToString()));
proxy.Credentials = new NetworkCredential(proxyListItem["username"].ToString(), proxyListItem["password"].ToString());
proxies[ProxyNetwork.MyPrivateProxy].Add(proxy);
}
}
}
}
else if (proxyNetwork == ProxyNetwork.ProxyBonanza)
{
lock (proxyLock)
{
if (!proxies.ContainsKey(ProxyNetwork.ProxyBonanza))
{
proxies[ProxyNetwork.ProxyBonanza] = new List();
string apiUrl = "https://api.proxybonanza.com/v1/userpackages/49575.json";
Dictionary httpHeaders = new Dictionary();
httpHeaders.Add("Authorization", "mIAWG1CKaz3cSjWUV2wnAaszmNS6nck6C8kQIDBmOheslgMOFp!43336");
JObject proxyListJson = JObject.Parse(ScraperUtil.CreateHttpGetRequest(apiUrl, httpHeaders: httpHeaders, checkUrlLoaded: false));
foreach (JToken proxyItem in proxyListJson.SelectToken("data.ippacks"))
{
log.Info("Found proxy: " + proxyItem["ip"].ToString() + ":" + proxyItem["port_http"].ToString());
WebProxy proxy = new WebProxy(proxyItem["ip"].ToString(), Convert.ToInt32(proxyItem["port_http"].ToString()));
proxy.Credentials = new NetworkCredential(proxyListJson.SelectToken("data.login").ToString(), proxyListJson.SelectToken("data.password").ToString());
proxies[ProxyNetwork.ProxyBonanza].Add(proxy);
}
}
}
}
else if (proxyNetwork == ProxyNetwork.BinaryLane)
{
lock (proxyLock)
{
if (!proxies.ContainsKey(ProxyNetwork.BinaryLane))
{
proxies[ProxyNetwork.BinaryLane] = new List();
WebProxy proxy = new WebProxy("43.229.63.22", 29842);
proxy.Credentials = new NetworkCredential("scrapeserver", "95EpLZ");
proxies[ProxyNetwork.BinaryLane].Add(proxy);
}
}
}
else if (proxyNetwork == ProxyNetwork.BinaryLaneScrapeServer)
{
lock (proxyLock)
{
if (!proxies.ContainsKey(ProxyNetwork.BinaryLaneScrapeServer))
{
proxies[ProxyNetwork.BinaryLaneScrapeServer] = new List();
WebProxy proxy = new WebProxy("43.229.63.22", 29843);
proxies[ProxyNetwork.BinaryLaneScrapeServer].Add(proxy);
}
}
}
else if (proxyNetwork == ProxyNetwork.LuminatiGlobalShared) // Luminati Global Shared proxies are slightly different because it has huge list of IPs
{
if (disableCache && String.IsNullOrEmpty(testUrl))
throw new Exception("test url is required when disable cache is enabled");
if (String.IsNullOrEmpty(testUrl))
{
string sessionId = new Random().Next().ToString();
string userName = "lum-customer-hl_8238b460-zone-zone1-session-" + sessionId;
string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";
WebProxy proxy = new WebProxy(proxyAddress, 22225);
proxy.Credentials = new NetworkCredential(userName, "200fbti3d9xt");
return new List() { proxy };
}
Tuple luminatiAndTestUrl = new Tuple(proxyNetwork, new Uri(testUrl).Authority);
if (!disableCache && proxiesValidatedByUrl.ContainsKey(luminatiAndTestUrl))
return proxiesValidatedByUrl[luminatiAndTestUrl];
proxiesValidatedByUrl[luminatiAndTestUrl] = new List();
for (int attempt = 0; attempt < maxTestUrlAttemptCount && proxiesValidatedByUrl[luminatiAndTestUrl].Count < maxWorkingProxyCount; attempt++)
{
string sessionId = new Random().Next().ToString();
string userName = "lum-customer-hl_8238b460-zone-zone1-session-" + sessionId;
string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";
WebProxy proxy = new WebProxy(proxyAddress, 22225);
proxy.Credentials = new NetworkCredential(userName, "200fbti3d9xt");
try
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(testUrl);
request.UserAgent = FashionExchangeSetting.UserAgentChrome;
request.Proxy = proxy;
request.Timeout = 1000 * 10;
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
log.Info("Found working proxy. Test Url: " + testUrl + " Proxy IP: " + CreateHttpGetRequest("https://api.ipify.org/", proxy: proxy, checkUrlLoaded: false));
proxiesValidatedByUrl[luminatiAndTestUrl].Add(proxy);
}
catch (WebException e)
{
if ((e.Status == WebExceptionStatus.Timeout)
|| (e.Response != null && ((HttpWebResponse)e.Response).StatusCode == HttpStatusCode.GatewayTimeout))
log.Info("Timed out downloading test url. Test Url: " + testUrl + " Proxy IP: " + CreateHttpGetRequest("https://api.ipify.org/", proxy: proxy, checkUrlLoaded: false));
else if (e.Response != null && (((HttpWebResponse)e.Response).StatusCode == HttpStatusCode.Unauthorized || ((HttpWebResponse)e.Response).StatusCode == HttpStatusCode.Forbidden))
log.Info("Forbidden or unauthorized downloading test url. Test Url: " + testUrl + " Proxy IP: " + CreateHttpGetRequest("https://api.ipify.org/", proxy: proxy, checkUrlLoaded: false));
else
log.Info("Error downloading test url. Test Url: " + testUrl + " Proxy IP: " + CreateHttpGetRequest("https://api.ipify.org/", proxy: proxy, checkUrlLoaded: false) + " Exception: " + e.ToString());
}
}
return proxiesValidatedByUrl[luminatiAndTestUrl];
}
else if (proxyNetwork == ProxyNetwork.LuminatiStatic)
{
string proxyAddress = "customer-hl_8238b460.zproxy.lum-superproxy.io";
string userName = "lum-customer-hl_8238b460-zone-static";
WebProxy proxy = new WebProxy(proxyAddress, 22225);
proxy.Credentials = new NetworkCredential(userName, "xoj41myrdax6");
return new List() { proxy };
}
else if (proxyNetwork == ProxyNetwork.LuminatiGlobalSharedJDSports)
{
string sessionId = new Random().Next().ToString();
string userName = "lum-customer-hl_8238b460-zone-jdsports-session-" + sessionId;
string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";
WebProxy proxy = new WebProxy(proxyAddress, 22225);
proxy.Credentials = new NetworkCredential(userName, "tw53pckkvvie");
return new List() { proxy };
}
else if (proxyNetwork == ProxyNetwork.LuminatiGlobalSharedFootLocker)
{
string sessionId = new Random().Next().ToString();
string userName = "lum-customer-hl_8238b460-zone-footlocker-session-" + sessionId;
string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";
WebProxy proxy = new WebProxy(proxyAddress, 22225);
proxy.Credentials = new NetworkCredential(userName, "728yatag13w4");
return new List() { proxy };
}
else if (proxyNetwork == ProxyNetwork.LuminatiGlobalSharedVisionDirect)
{
string sessionId = new Random().Next().ToString();
string userName = "lum-customer-hl_8238b460-zone-visiondirect-session-" + sessionId;
string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";
WebProxy proxy = new WebProxy(proxyAddress, 22225);
proxy.Credentials = new NetworkCredential(userName, "mls5d00ybhdr");
return new List() { proxy };
}
else if (proxyNetwork == ProxyNetwork.LuminatiGlobalSharedASOS)
{
string sessionId = new Random().Next().ToString();
string userName = "lum-customer-hl_8238b460-zone-asos-session-" + sessionId;
string proxyAddress = "customer-hl_8238b460-session-" + sessionId + ".zproxy.lum-superproxy.io";
WebProxy proxy = new WebProxy(proxyAddress, 22225);
proxy.Credentials = new NetworkCredential(userName, "cis8217oa34e");
return new List() { proxy };
}
else if (proxyNetwork == ProxyNetwork.LuminatiStaticNZ)
{
string proxyAddress = "customer-hl_8238b460.zproxy.lum-superproxy.io";
string userName = "lum-customer-hl_8238b460-zone-static_nz";
WebProxy proxy = new WebProxy(proxyAddress, 22225);
proxy.Credentials = new NetworkCredential(userName, "cf83xw0jjhgi");
return new List() { proxy };
}
if (String.IsNullOrEmpty(testUrl))
return proxies[proxyNetwork];
Tuple proxyNetworkAndTestUrl = new Tuple(proxyNetwork, new Uri(testUrl).Authority);
if (proxiesValidatedByUrl.ContainsKey(proxyNetworkAndTestUrl))
return proxiesValidatedByUrl[proxyNetworkAndTestUrl];
else
{
proxiesValidatedByUrl.Add(proxyNetworkAndTestUrl, new List());
foreach (WebProxy proxy in proxies[proxyNetwork])
{
try
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(testUrl);
request.UserAgent = FashionExchangeSetting.UserAgentChrome;
request.Proxy = proxy;
request.Timeout = 1000 * 10;
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
log.Info("Found working proxy. Test Url: " + testUrl + " Proxy: " + proxy.Address.AbsoluteUri);
proxiesValidatedByUrl[proxyNetworkAndTestUrl].Add(proxy);
}
catch (WebException e)
{
if ((e.Status == WebExceptionStatus.Timeout)
|| (e.Response != null && ((HttpWebResponse)e.Response).StatusCode == HttpStatusCode.GatewayTimeout))
log.Info("Timed out downloading test url. Test Url: " + testUrl + " Proxy: " + proxy.Address.AbsoluteUri);
else
log.Info("Error downloading test url. Test Url: " + testUrl + " Proxy: " + proxy.Address.AbsoluteUri + " Exception: " + e.ToString());
}
}
return proxiesValidatedByUrl[proxyNetworkAndTestUrl];
}
}
// Single random source shared by every GetRandomProxy call. Creating a new Random per call seeds it from the
// system clock on .NET Framework, so calls made close together would keep picking the same proxy.
private static readonly Random proxyRandom = new Random();
// Random is not thread safe, so access to proxyRandom is serialised through this lock.
private static readonly Object proxyRandomLock = new Object();
// Picks one proxy at random from GetProxies(...). All parameters are forwarded unchanged to GetProxies.
// Throws ArgumentOutOfRangeException when GetProxies returns an empty list (for example when no proxy
// passed validation against testUrl).
public static WebProxy GetRandomProxy(ProxyNetwork proxyNetwork = ProxyNetwork.MyPrivateProxy, string testUrl = null, int maxTestUrlAttemptCount = 30, int maxWorkingProxyCount = 20, bool disableCache = false)
{
    var candidates = GetProxies(proxyNetwork: proxyNetwork, testUrl: testUrl, maxTestUrlAttemptCount: maxTestUrlAttemptCount, maxWorkingProxyCount: maxWorkingProxyCount, disableCache: disableCache);
    int index;
    lock (proxyRandomLock)
    {
        index = proxyRandom.Next(candidates.Count);
    }
    return candidates[index];
}
// Builds a zipped Chrome extension that routes traffic through the given proxy and answers the proxy's
// authentication challenge, writes it to the temporary directory and returns the full path of the zip file.
//
// proxy: proxy to configure. Its host and port are written into background.js. When the proxy carries no
//        NetworkCredential, empty user name and password are written instead of failing.
// fileNamePrefix: prefix of the generated file name; thread id and a timestamp are appended.
//
// Throws ArgumentNullException when proxy is null.
public static string GenerateChromeProxyExtension(WebProxy proxy, string fileNamePrefix)
{
    if (proxy == null)
        throw new ArgumentNullException(nameof(proxy));

    // Proxies without credentials are valid, so fall back to empty strings rather than dereferencing null.
    NetworkCredential credential = proxy.Credentials as NetworkCredential;
    // Escape the credentials so quotes or backslashes cannot break out of the JavaScript string literals.
    string userName = HttpUtility.JavaScriptStringEncode(credential?.UserName ?? String.Empty);
    string password = HttpUtility.JavaScriptStringEncode(credential?.Password ?? String.Empty);

    string extensionFullPath = HostingEnvironment.ApplicationPhysicalPath ?? AppDomain.CurrentDomain.BaseDirectory;
    if (!Directory.Exists(extensionFullPath + FashionExchangeSetting.TemporaryDirectory))
        Directory.CreateDirectory(extensionFullPath + FashionExchangeSetting.TemporaryDirectory);
    extensionFullPath += FashionExchangeSetting.TemporaryDirectory + fileNamePrefix + "_ChromeProxy_" + System.Threading.Thread.CurrentThread.ManagedThreadId + "_" + DateTime.Now.ToString("yyyyMMdd_HHmmss") + ".zip";

    using (MemoryStream memoryStream = new MemoryStream())
    {
        // leaveOpen = true so the memory stream can still be copied to disk after the archive is finalised.
        using (ZipArchive archive = new ZipArchive(memoryStream, ZipArchiveMode.Create, true))
        {
            ZipArchiveEntry backgroundJS = archive.CreateEntry("background.js");
            using (Stream entryStream = backgroundJS.Open())
            using (StreamWriter streamWriter = new StreamWriter(entryStream))
            {
                // Composite format string: doubled braces are literal JavaScript braces.
                // The auth listener must be registered for "<all_urls>"; an empty pattern matches no request.
                streamWriter.Write(
@"var config = {{
mode: ""fixed_servers"",
rules: {{
singleProxy: {{
scheme: ""http"",
host: ""{0}"",
port: parseInt({1})
}},
bypassList: [""foobar.com""]
}}
}};
chrome.proxy.settings.set({{value: config, scope: ""regular""}}, function() {{}});
function callbackFn(details) {{
return {{
authCredentials: {{
username: ""{2}"",
password: ""{3}""
}}
}};
}}
chrome.webRequest.onAuthRequired.addListener(
callbackFn,
{{urls: [""<all_urls>""]}},
['blocking']
);",
                    proxy.Address.Host,
                    proxy.Address.Port,
                    userName,
                    password
                );
            }
            ZipArchiveEntry manifestJSON = archive.CreateEntry("manifest.json");
            using (Stream entryStream = manifestJSON.Open())
            using (StreamWriter streamWriter = new StreamWriter(entryStream))
            {
                // "<all_urls>" is the host permission that lets webRequest observe every request.
                streamWriter.Write(
@"{
""version"": ""1.0.0"",
""manifest_version"": 2,
""name"": ""Chrome Proxy"",
""permissions"": [
""proxy"",
""tabs"",
""unlimitedStorage"",
""storage"",
""<all_urls>"",
""webRequest"",
""webRequestBlocking""
],
""background"": {
""scripts"": [""background.js""]
},
""minimum_chrome_version"":""22.0.0""
}"
                );
            }
        }
        using (FileStream fileStream = new FileStream(extensionFullPath, FileMode.Create))
        {
            memoryStream.Seek(0, SeekOrigin.Begin);
            memoryStream.CopyTo(fileStream);
        }
    }
    return extensionFullPath;
}
// Builds a zipped Chrome extension that cancels every request matching the given url patterns and resource
// types, writes it to the temporary directory and returns the full path of the zip file.
//
// blockUrls: url match patterns to block; at least one is required (ArgumentException otherwise).
// blockResourceTypes: webRequest resource types to block; must not be null.
//   For a list of applicable resource types: https://developer.chrome.com/extensions/webRequest#type-ResourceType
// fileNamePrefix: prefix of the generated file name; thread id and a timestamp are appended.
public static string GenerateChromeBlockExtension(List blockUrls, List blockResourceTypes, string fileNamePrefix)
{
    if (blockUrls == null || !blockUrls.Any())
        throw new ArgumentException("At least one block url is required");
    // Fail before touching the file system; previously this surfaced later from inside Select().
    if (blockResourceTypes == null)
        throw new ArgumentNullException(nameof(blockResourceTypes));

    string extensionFullPath = HostingEnvironment.ApplicationPhysicalPath ?? AppDomain.CurrentDomain.BaseDirectory;
    if (!Directory.Exists(extensionFullPath + FashionExchangeSetting.TemporaryDirectory))
        Directory.CreateDirectory(extensionFullPath + FashionExchangeSetting.TemporaryDirectory);
    extensionFullPath += FashionExchangeSetting.TemporaryDirectory + fileNamePrefix + "_ChromeBlock_" + System.Threading.Thread.CurrentThread.ManagedThreadId + "_" + DateTime.Now.ToString("yyyyMMdd_HHmmss") + ".zip";

    using (MemoryStream memoryStream = new MemoryStream())
    {
        // leaveOpen = true so the memory stream can still be copied to disk after the archive is finalised.
        using (ZipArchive archive = new ZipArchive(memoryStream, ZipArchiveMode.Create, true))
        {
            ZipArchiveEntry backgroundJS = archive.CreateEntry("background.js");
            using (Stream entryStream = backgroundJS.Open())
            using (StreamWriter streamWriter = new StreamWriter(entryStream))
            {
                // Composite format string: doubled braces are literal JavaScript braces.
                streamWriter.Write(
@"chrome.webRequest.onBeforeRequest.addListener(
function(details) {{ return {{cancel: true}}; }},
{{
urls: [{0}],
types: [{1}]
}},
[""blocking""]);",
                    String.Join(",", blockUrls.Select(m => "\"" + m + "\"")),
                    String.Join(",", blockResourceTypes.Select(m => "\"" + m + "\""))
                );
            }
            ZipArchiveEntry manifestJSON = archive.CreateEntry("manifest.json");
            using (Stream entryStream = manifestJSON.Open())
            using (StreamWriter streamWriter = new StreamWriter(entryStream))
            {
                // "<all_urls>" is the host permission that allows the blocking listener to intercept requests;
                // an empty permission string grants no host access.
                streamWriter.Write(
@"{
""version"": ""1.0.0"",
""manifest_version"": 2,
""name"": ""Chrome Block"",
""permissions"": [
""<all_urls>"",
""webRequest"",
""webRequestBlocking""
],
""background"": {
""scripts"": [""background.js""]
},
""minimum_chrome_version"":""22.0.0""
}"
                );
            }
        }
        using (FileStream fileStream = new FileStream(extensionFullPath, FileMode.Create))
        {
            memoryStream.Seek(0, SeekOrigin.Begin);
            memoryStream.CopyTo(fileStream);
        }
    }
    return extensionFullPath;
}
// Cached "units of AUD per one unit of currency" rates, keyed by currency code (case-insensitive).
// Stays null until a complete table has been downloaded.
private static Dictionary<string, decimal> exchangeRates;
// Converts a price in the given currency to Australian dollars.
//
// The first call downloads USD-based quotes for AUD, GBP, NZD and EUR and derives cross rates to AUD
// (for example GBP->AUD = USDAUD / USDGBP). Only the currency codes of Country.UnitedStates,
// Country.UnitedKingdom, Country.NewZealand and Country.Germany are supported; any other code
// (including AUD itself) fails on the dictionary lookup.
public static decimal ConvertToAUD(string currencyCode, decimal price)
{
    if (exchangeRates == null)
    {
        log.Info("Retrieve exchange rates...");
        // Fill a local table first and publish it only once complete, so a concurrent caller never sees a
        // half-filled table and a failed download is retried on the next call instead of caching an empty one.
        var rates = new Dictionary<string, decimal>(StringComparer.InvariantCultureIgnoreCase);
        // The currency filter must be passed as "&currencies=..."; the previous literal contained a mangled "¤cies=".
        JObject exchangeRateJson = JObject.Parse(ScraperUtil.CreateHttpGetRequest("http://apilayer.net/api/live?access_key=" + FashionExchangeSetting.CurrencyLayerAccessKey + "&currencies=AUD,NZD,EUR,GBP", checkUrlLoaded: false));
        decimal usdToAud = Convert.ToDecimal(exchangeRateJson.SelectToken("quotes.USDAUD").ToString());
        rates.Add(Country.UnitedStates.CurrencyCode, usdToAud);
        decimal usdToGbp = Convert.ToDecimal(exchangeRateJson.SelectToken("quotes.USDGBP").ToString());
        rates.Add(Country.UnitedKingdom.CurrencyCode, usdToAud / usdToGbp);
        decimal usdToNzd = Convert.ToDecimal(exchangeRateJson.SelectToken("quotes.USDNZD").ToString());
        rates.Add(Country.NewZealand.CurrencyCode, usdToAud / usdToNzd);
        decimal usdToEur = Convert.ToDecimal(exchangeRateJson.SelectToken("quotes.USDEUR").ToString());
        rates.Add(Country.Germany.CurrencyCode, usdToAud / usdToEur);
        exchangeRates = rates;
        log.Info("Exchange rates retrieved successfully");
    }
    return price * exchangeRates[currencyCode];
}
// Returns the url with one query parameter set or removed.
// A null queryValue removes the parameter; any other value (including an empty string) sets it.
// The result is rebuilt from scheme, authority, path and the re-encoded query string only.
public static string UpdateUrlQuery(string url, string queryName, string queryValue)
{
    Uri parsedUrl = new Uri(url);
    NameValueCollection parameters = HttpUtility.ParseQueryString(parsedUrl.Query);

    if (queryValue != null)
        parameters.Set(queryName, queryValue);
    else
        parameters.Remove(queryName); // only remove when the value is null, a blank value is kept

    string urlWithoutQuery = parsedUrl.Scheme + "://" + parsedUrl.Authority + parsedUrl.AbsolutePath;
    return parameters.Count > 0
        ? urlWithoutQuery + "?" + parameters.ToString()
        : urlWithoutQuery;
}
// Converts a dotted IPv4 address ("a.b.c.d") to its numeric value a*2^24 + b*2^16 + c*2^8 + d.
// The IPv6 loopback address "::1" (seen on localhost) is mapped to 0.
public static long ConvertIPAddressToNumber(string ipAddress)
{
    // on localhost, ip address is ::1
    if (String.Equals(ipAddress, "::1"))
        return 0;

    int[] octets = Array.ConvertAll(ipAddress.Split('.'), part => Convert.ToInt32(part));
    // Only the first term is widened to long up front; the others are int products added to it.
    long highPart = (long)octets[0] * 16777216;
    return highPart + octets[1] * 65536 + octets[2] * 256 + octets[3];
}
// Cached IP ranges read from "China IP Address.txt": key = numeric end of a range, value = numeric start.
// Stays null until loading has finished.
private static SortedList<long, long> _chinaIPRanges;
// Returns true when the given IP address falls inside one of the ranges listed in
// "bin/China IP Address.txt" under the application's physical path.
//
// The file is read once on first use. Blank lines and lines starting with "#" are skipped; every other
// line is expected to look like "<start ip> - <end ip> China...". If reading fails the error is logged
// and whatever was loaded up to that point is used.
public static bool IsChinaIPAddress(string ipAddress)
{
    if (_chinaIPRanges == null)
    {
        log.Info("Reading China IP Address.txt");
        // Load into a local list and publish it only when done, so a concurrent caller never enumerates
        // the list while it is still being filled.
        var loadedRanges = new SortedList<long, long>();
        try
        {
            using (StreamReader reader = new StreamReader(Path.Combine(HostingEnvironment.ApplicationPhysicalPath, "bin", "China IP Address.txt")))
            {
                string line = null;
                while ((line = reader.ReadLine()) != null)
                {
                    if (String.IsNullOrWhiteSpace(line) || line.StartsWith("#"))
                        continue;
                    string startIP = StringUtil.SubstringFromStart(line, " - ");
                    string endIP = StringUtil.Substring(line, " - ", " China");
                    loadedRanges.Add(ScraperUtil.ConvertIPAddressToNumber(endIP), ScraperUtil.ConvertIPAddressToNumber(startIP));
                }
            }
        }
        catch (Exception e)
        {
            log.Error("Error reading china IP address list: " + e.ToString());
        }
        log.Info("Found " + loadedRanges.Count + " china ip ranges");
        _chinaIPRanges = loadedRanges;
    }

    long ip = ScraperUtil.ConvertIPAddressToNumber(ipAddress);
    // Ranges are sorted by end address, so the first range whose end is >= ip is the only candidate.
    var ipRange = _chinaIPRanges.FirstOrDefault(m => ip <= m.Key);
    // A default (0, 0) pair means no range ends at or after this address.
    if (ipRange.Key == 0 && ipRange.Value == 0)
        return false;
    return ip >= ipRange.Value;
}
// Downloads the page at homeUrl, selects the node matched by categoryTreeXPath and returns the categories
// parsed from that node. The skip lists are passed through unchanged to ParseCategoryTree.
public static List ParseCategoryTreeAsList(string homeUrl, string categoryTreeXPath, List skipCategoryByExactMatch = null, List skipCategoryByContainMatch = null)
{
    HtmlNode categoryTreeRoot = ScraperUtil.LoadHtml(homeUrl, checkUrlLoaded: false)
        .DocumentNode
        .SelectSingleNode(categoryTreeXPath);
    return ParseCategoryTree(
        categoryTreeRoot,
        skipCategoryByExactMatch: skipCategoryByExactMatch,
        skipCategoryByContainMatch: skipCategoryByContainMatch);
}
// Parses the category tree below topTreeNode and registers every leaf category that has a url as a
// category page in siteMap. The breadcrumb passed to the site map is the space-separated chain of
// category names from the top of the tree down to the leaf.
public static void ParseCategoryTree(HtmlNode topTreeNode, DAL.SiteMap siteMap, List skipCategoryByExactMatch = null, List skipCategoryByContainMatch = null)
{
    var parsedCategories = ParseCategoryTree(topTreeNode, skipCategoryByExactMatch: skipCategoryByExactMatch, skipCategoryByContainMatch: skipCategoryByContainMatch);
    foreach (Category leaf in parsedCategories)
    {
        // only leaf categories with a link are added to the site map
        if (leaf.SubCategories.Any() || String.IsNullOrEmpty(leaf.Url))
            continue;

        string breadcrumb = leaf.Name;
        for (Category ancestor = leaf.ParentCategory; ancestor != null; ancestor = ancestor.ParentCategory)
            breadcrumb = ancestor.Name + " " + breadcrumb;

        siteMap.AddCategoryPage(breadcrumb, leaf.Url, siteMap.GetHomePageUrl());
    }
}
// Walks the HTML below topTreeNode and turns it into a flat list of Category objects that are also linked
// into a tree through AddSubCategory. The first entry of the list is always a synthetic "Home" category
// with depth 0.
//
// The walk is destructive: nodes are removed from the DOM as they are left behind, so topTreeNode's
// document is modified by this call.
//
// skipCategoryByExactMatch / skipCategoryByContainMatch: optional filters; a category is dropped from the
// returned list when its own name or one of its breadcrumb entries matches. Dropped categories are only
// removed from the returned list - they are not detached from their parent's SubCategories.
private static List ParseCategoryTree(HtmlNode topTreeNode, List skipCategoryByExactMatch = null, List skipCategoryByContainMatch = null)
{
// remove all empty text nodes
// NOTE(review): the XPath starts with "//", which selects from the document root rather than from
// topTreeNode - confirm whether ".//text()" was intended. The result is also not checked for null.
foreach (HtmlNode textNode in topTreeNode.SelectNodes("//text()"))
if (String.IsNullOrWhiteSpace(textNode.InnerText.Trim()))
textNode.Remove();
HtmlNode currentNode = topTreeNode;
HtmlNode parentNode = null;
int depth = 0;
Category topCategory = new Category();
topCategory.Name = "Home";
topCategory.Depth = depth;
List categoryList = new List();
categoryList.Add(topCategory);
// Depth-first walk. Each iteration moves to the next node (child first, then sibling, then back up to the
// parent), removing the node that was just left, and then inspects the node it arrived at.
while (currentNode.HasChildNodes || currentNode.NextSibling != null || currentNode.ParentNode != topTreeNode)
{
if (currentNode.HasChildNodes)
{
// go down one level
parentNode = currentNode;
currentNode = currentNode.FirstChild;
depth++;
}
else if (currentNode.NextSibling != null)
{
// move sideways and drop the node that has just been processed
currentNode = currentNode.NextSibling;
currentNode.PreviousSibling.Remove();
}
else if (currentNode.ParentNode != null)
{
// last child processed: go back up and drop it, so the parent eventually has no children left
currentNode = currentNode.ParentNode;
currentNode.FirstChild.Remove();
parentNode = currentNode.ParentNode;
depth--;
}
else
break;
// if current node is a ahref node, add link and link text to category, otherwise just add text to category
string name = null, url = null;
if (currentNode.Name == "a")
{
name = ScraperUtil.NormalizeText(currentNode.InnerText);
url = currentNode.Attributes["href"]?.Value;
}
else if (!String.IsNullOrWhiteSpace(currentNode.SelectSingleNode("./text()")?.InnerText.Trim()))
{
// non-link node: use its first direct text child as the category name (no url)
name = ScraperUtil.NormalizeText(currentNode.SelectSingleNode("./text()").InnerText);
}
if (!String.IsNullOrEmpty(name))
{
Category category = new Category();
category.Name = name;
category.Depth = depth;
category.Url = url;
Category parentCategory = categoryList.Where(m => m.Depth < depth).Last(); // parent category is the last added category with lower depth
parentCategory.AddSubCategory(category);
categoryList.Add(category);
// Do not descend into an anchor: it may wrap further markup around its text, and the link text has
// already been captured above. Remove it and continue from its parent.
if (currentNode.Name == "a")
{
currentNode.Remove();
currentNode = parentNode;
parentNode = currentNode.ParentNode;
depth--;
}
}
}
// Apply the skip filters. Removing while iterating by index, hence the i-- after each removal.
for (int i = 0; i < categoryList.Count; i++)
{
Category category = categoryList[i];
if (skipCategoryByExactMatch != null && (StringUtil.Contains(skipCategoryByExactMatch, category.Name, ignoreCase: true) || skipCategoryByExactMatch.Where(m => StringUtil.Contains(category.Breadcrumb, m, ignoreCase: true)).Any()))
{
categoryList.Remove(category);
i--;
}
else if (skipCategoryByContainMatch != null && (skipCategoryByContainMatch.Any(m => StringUtil.ContainsIgnoreCase(category.Name, m)) || skipCategoryByContainMatch.Where(skipCategory => category.Breadcrumb.Where(breadcrumb => StringUtil.ContainsIgnoreCase(breadcrumb, skipCategory)).Any()).Any()))
{
categoryList.Remove(category);
i--;
}
}
return categoryList;
}
// A node of a parsed category tree: a name, an optional url, its depth in the tree and links to its
// parent and sub categories.
public class Category
{
// Creates a category with an empty sub category list and no parent.
public Category()
{
SubCategories = new List();
}
// Display name of the category.
public string Name { get; set; }
// Depth in the tree as assigned by the code that builds it.
public int Depth { get; set; }
// Link of the category; may be null.
public string Url { get; set; }
// Parent category; only set through AddSubCategory on the parent.
public Category ParentCategory { get; private set; }
// Direct children, in the order they were added.
public List SubCategories { get; private set; }
// Names of this category's ancestors, nearest parent first. Ancestors with Depth 0 or below are not
// included, and the category's own name is never part of the list. Empty when there is no parent.
public List Breadcrumb
{
get
{
List breadcrumb = new List();
if (ParentCategory == null)
return breadcrumb;
Category currentCategory = this;
// climb towards the root, stopping before an ancestor whose depth is not greater than 0
while (currentCategory.ParentCategory != null && currentCategory.ParentCategory.Depth > 0)
{
currentCategory = currentCategory.ParentCategory;
breadcrumb.Add(currentCategory.Name);
}
return breadcrumb;
}
}
// Creates a new category from the given values and adds it as a child of this category.
public void AddSubCategory(string name, int depth, string url)
{
Category category = new Category();
category.Name = name;
category.Depth = depth;
category.Url = url;
AddSubCategory(category);
}
// Adds an existing category as a child of this category and sets its ParentCategory to this instance.
public void AddSubCategory(Category category)
{
category.ParentCategory = this;
this.SubCategories.Add(category);
}
}
// Decides from the current request's query string whether the page should be flagged "noindex".
// Returns true when there are two or more query parameters, or exactly one parameter whose key is
// neither the store id key nor the brand name key. Returns false when there is no query string.
public static bool IsNoIndex()
{
    var query = HttpContext.Current.Request.QueryString;
    if (query.Count >= 2)
        return true;
    if (query.Count != 1)
        return false;

    string onlyKey = query.GetKey(0);
    return onlyKey != ParameterInfo.QueryString.StoreId
        && onlyKey != ParameterInfo.QueryString.BrandName;
}
}
}
TemplateShopifyScraper
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;
using FashionExchange.Common.DAL;
using FashionExchange.Common.Utils;
using System.Web;
using Newtonsoft.Json;
using System.Net;
using Newtonsoft.Json.Linq;
using log4net;
namespace FashionExchange.Common.Scrapers
{
// Starting point for a new Shopify based store scraper: copy the class and adjust the XPath expressions
// and option names for the store.
public class TemplateShopifyScraper : ShopifyScraper
{
    // Option names treated as "size"; starts from the base class set and is cached after first use.
    protected override HashSet SizeOptionNames
    {
        get
        {
            if (_sizeOptionNames != null)
                return _sizeOptionNames;

            _sizeOptionNames = base.SizeOptionNames;
            //_sizeOptionNames.Add("size option name");
            return _sizeOptionNames;
        }
    }

    // Option names treated as "colour"; starts from the base class set and is cached after first use.
    protected override HashSet ColourOptionNames
    {
        get
        {
            if (_colourOptionNames != null)
                return _colourOptionNames;

            _colourOptionNames = base.ColourOptionNames;
            //_colourOptionNames.Add("colour option name");
            return _colourOptionNames;
        }
    }

    // Walks the navigation menu on the store home page, registers every (sub) category in the site map,
    // collects the product urls of each category page and returns all product urls found.
    // Throws when the menu does not contain a single sub category.
    protected override List GetProductUrls()
    {
        bool subCategoryFound = false;
        HtmlDocument storeHomePage = ScraperUtil.LoadHtml(Store.Url);
        foreach (HtmlNode menuItem in storeHomePage.DocumentNode.SelectNodes("//ul[@id='nav']/li"))
        {
            HtmlNode menuLink = menuItem.SelectSingleNode("./a");
            string menuName = menuLink.InnerText.Trim();
            log.Info("Parsing category: " + menuName);

            HtmlNodeCollection childLinks = menuItem.SelectNodes("./ul/li/a");
            if (childLinks == null)
            {
                // no sub menu: the top level link itself is the category page
                SiteMap.AddCategoryPage(menuLink, Store.Url);
                GetProductUrlsFromCategoryPage(menuLink);
                continue;
            }

            foreach (HtmlNode childLink in childLinks)
            {
                log.Info("Parsing sub category: " + childLink.InnerText.Trim());
                SiteMap.AddCategoryPage(menuName + " " + childLink.InnerText, childLink, Store.Url);
                GetProductUrlsFromCategoryPage(childLink);
                subCategoryFound = true;
            }
        }

        if (!subCategoryFound)
            throw new Exception("No sub categories found");
        return SiteMap.GetAllProductUrls();
    }

    // Collects product urls from the category page the given link node points to.
    private void GetProductUrlsFromCategoryPage(HtmlNode categoryNode)
    {
        string categoryUrl = categoryNode.Attributes["href"].Value;
        GetProductUrlsFromCategoryPage(categoryUrl);
    }

    // Adds every product link on the category page, and on all following pages reached through the
    // "next page" link, to the site map. Stops quietly when a page cannot be downloaded or has no products.
    private void GetProductUrlsFromCategoryPage(string categoryUrl)
    {
        log.InfoFormat("Parsing category page. Url: {0}", categoryUrl);
        HtmlDocument currentPage;
        try
        {
            currentPage = ScraperUtil.LoadHtml(categoryUrl);
        }
        catch (WebException e)
        {
            log.Info("Error download category page: " + e.ToString());
            return;
        }

        while (true)
        {
            HtmlNodeCollection productLinks = currentPage.DocumentNode.SelectNodes("//div[@id='product-loop']//div[@class='product-info-inner']/a");
            if (productLinks == null)
            {
                log.Info("Category page has no products");
                return;
            }
            foreach (HtmlNode productLink in productLinks)
            {
                SiteMap.AddProductUrl(productLink, categoryUrl);
                log.InfoFormat("Product url added: {0}", productLink.Attributes["href"].Value);
            }

            HtmlNode nextPageLink = currentPage.DocumentNode.SelectSingleNode("//div[@id='pagination']/a[i/@class='fa fa-caret-right']");
            if (nextPageLink == null)
                return;

            string nextPageUrl = nextPageLink.Attributes["href"].Value;
            log.Info("Parsing next category page: " + nextPageUrl);
            try
            {
                currentPage = ScraperUtil.LoadHtml(nextPageUrl);
            }
            catch (WebException e)
            {
                log.Info("Error download next category page: " + e.ToString());
                return;
            }
        }
    }

    // Hook for store specific product filtering; the template just defers to the base class.
    protected override bool IgnoreProduct() => base.IgnoreProduct();
}
}
TemplateMagentoScraper
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;
using FashionExchange.Common.DAL;
using FashionExchange.Common.Utils;
using System.Web;
using Newtonsoft.Json;
using System.Net;
using Newtonsoft.Json.Linq;
using log4net;
namespace FashionExchange.Common.Scrapers
{
// Starting point for a new Magento based store scraper: copy the class and adjust the XPath expressions,
// attribute code names and the placeholder getters for the store.
public class TemplateMagentoScraper : MagentoScraper
{
    // Attribute code names treated as "colour"; starts from the base class set and is cached after first use.
    protected override HashSet ColourCodeNames
    {
        get
        {
            if (_colourCodeNames != null)
                return _colourCodeNames;

            _colourCodeNames = base.ColourCodeNames;
            //_colourCodeNames.Add("custom_colour_name");
            return _colourCodeNames;
        }
    }

    // Attribute code names treated as "size"; starts from the base class set and is cached after first use.
    protected override HashSet SizeCodeNames
    {
        get
        {
            if (_sizeCodeNames != null)
                return _sizeCodeNames;

            _sizeCodeNames = base.SizeCodeNames;
            //_sizeCodeNames.Add("custom_size_name");
            return _sizeCodeNames;
        }
    }

    // Walks the navigation menu on the store home page, registers every (sub) category in the site map,
    // collects the product urls of each category page and returns all product urls found.
    // Throws when the menu does not contain a single sub category.
    protected override List GetProductUrls()
    {
        bool subCategoryFound = false;
        HtmlDocument storeHomePage = ScraperUtil.LoadHtml(Store.Url);
        foreach (HtmlNode menuItem in storeHomePage.DocumentNode.SelectNodes("//ul[@id='nav']/li"))
        {
            HtmlNode menuLink = menuItem.SelectSingleNode("./a");
            string menuName = menuLink.InnerText.Trim();
            log.Info("Parsing category: " + menuName);

            HtmlNodeCollection childLinks = menuItem.SelectNodes("./ul/li/a");
            if (childLinks == null)
            {
                // no sub menu: the top level link itself is the category page
                SiteMap.AddCategoryPage(menuLink, Store.Url);
                GetProductUrlsFromCategoryPage(menuLink);
                continue;
            }

            foreach (HtmlNode childLink in childLinks)
            {
                log.Info("Parsing sub category: " + childLink.InnerText.Trim());
                SiteMap.AddCategoryPage(menuName + " " + childLink.InnerText, childLink, Store.Url);
                GetProductUrlsFromCategoryPage(childLink);
                subCategoryFound = true;
            }
        }

        if (!subCategoryFound)
            throw new Exception("No sub categories found");
        return SiteMap.GetAllProductUrls();
    }

    // Collects product urls from the category page the given link node points to.
    private void GetProductUrlsFromCategoryPage(HtmlNode categoryNode)
    {
        string categoryUrl = categoryNode.Attributes["href"].Value;
        GetProductUrlsFromCategoryPage(categoryUrl);
    }

    // Adds every product link on the category page, and on all following pages reached through the
    // "next page" link, to the site map. Stops quietly when a page cannot be downloaded or has no products.
    private void GetProductUrlsFromCategoryPage(string categoryUrl)
    {
        log.InfoFormat("Parsing category page. Url: {0}", categoryUrl);
        HtmlDocument currentPage;
        try
        {
            currentPage = ScraperUtil.LoadHtml(categoryUrl);
        }
        catch (WebException e)
        {
            log.Info("Error download category page: " + e.ToString());
            return;
        }

        while (true)
        {
            HtmlNodeCollection productLinks = currentPage.DocumentNode.SelectNodes("//h2[@class='product-name']/a");
            if (productLinks == null)
            {
                log.Info("Category page has no products");
                return;
            }
            foreach (HtmlNode productLink in productLinks)
            {
                SiteMap.AddProductUrl(productLink, categoryUrl);
                log.InfoFormat("Product url added: {0}", productLink.Attributes["href"].Value);
            }

            HtmlNode nextPageLink = currentPage.DocumentNode.SelectSingleNode("//a[@class='next i-next']");
            if (nextPageLink == null)
                return;

            string nextPageUrl = nextPageLink.Attributes["href"].Value;
            log.Info("Parsing next category page: " + nextPageUrl);
            try
            {
                currentPage = ScraperUtil.LoadHtml(nextPageUrl);
            }
            catch (WebException e)
            {
                log.Info("Error download next category page: " + e.ToString());
                return;
            }
        }
    }

    // Hook for store specific product filtering; the template just defers to the base class.
    protected override bool IgnoreProduct() => base.IgnoreProduct();

    // Placeholder: replace with the store specific brand lookup.
    protected override string GetBrand() => String.Empty;

    // Placeholder: replace with the store specific description lookup.
    protected override string GetDescription() => String.Empty;

    // Placeholder: replace with the store specific image url lookup.
    protected override string GetImageUrl() => String.Empty;
}
}
TemplateScraper
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;
using FashionExchange.Common.DAL;
using FashionExchange.Common.Utils;
using System.Web;
using Newtonsoft.Json;
using System.Net;
using Newtonsoft.Json.Linq;
using log4net;
namespace FashionExchange.Common.Scrapers
{
public class TemplateScraper : Scraper
{
protected override List GetProductUrls()
{
bool hasSubCategory = false;
HtmlDocument homePage = ScraperUtil.LoadHtml(Store.Url);
HtmlNodeCollection categoryNodes = homePage.DocumentNode.SelectNodes("//ul[@id='nav']/li");
foreach (HtmlNode categoryNode in categoryNodes)
{
string categoryName = categoryNode.SelectSingleNode("./a").InnerText.Trim();
log.Info("Parsing category: " + categoryName);
HtmlNodeCollection subCategoryNodes = categoryNode.SelectNodes("./ul/li/a");
if (subCategoryNodes != null)
{
foreach (HtmlNode subCategoryNode in subCategoryNodes)
{
log.Info("Parsing sub category: " + subCategoryNode.InnerText.Trim());
SiteMap.AddCategoryPage(categoryName + " " + subCategoryNode.InnerText, subCategoryNode, Store.Url);
GetProductUrlsFromCategoryPage(subCategoryNode);
hasSubCategory = true;
}
}
else
{
HtmlNode categoryLinkNode = categoryNode.SelectSingleNode("./a");
SiteMap.AddCategoryPage(categoryLinkNode, Store.Url);
GetProductUrlsFromCategoryPage(categoryLinkNode);
}
}
if (!hasSubCategory)
throw new Exception("No sub categories found");
return SiteMap.GetAllProductUrls();
}
private void GetProductUrlsFromCategoryPage(HtmlNode categoryNode)
{
GetProductUrlsFromCategoryPage(categoryNode.Attributes["href"].Value);
}
private void GetProductUrlsFromCategoryPage(string categoryUrl)
{
log.InfoFormat("Parsing category page. Url: {0}", categoryUrl);
HtmlDocument categoryPage = null;
try
{
categoryPage = ScraperUtil.LoadHtml(categoryUrl);
}
catch (WebException e)
{
log.Info("Error download category page: " + e.ToString());
return;
}
HtmlNode nextPageNode = null;
do
{
HtmlNodeCollection productNodes = categoryPage.DocumentNode.SelectNodes("");
if (productNodes == null)
{
log.Info("Category has no products");
return;
}
foreach (HtmlNode productNode in productNodes)
{
SiteMap.AddProductUrl(productNode, categoryUrl);
log.InfoFormat("Product url added: {0}", productNode.Attributes["href"].Value);
}
nextPageNode = categoryPage.DocumentNode.SelectSingleNode("");
if (nextPageNode != null)
{
log.Info("Parsing next category page: " + nextPageNode.Attributes["href"].Value);
try
{
categoryPage = ScraperUtil.LoadHtml(nextPageNode.Attributes["href"].Value);
}
catch (WebException e)
{
log.Info("Error download next category page: " + e.ToString());
return;
}
}
} while (nextPageNode != null);
}
// Builds the product list for this scraper: a fresh list filled with
// everything CreateProductObjects() returns.
// NOTE(review): the generic type arguments of List appear to have been
// stripped from this listing - confirm the element type against the base class.
protected override List GetProducts()
{
List products = new List();
// Copy the created objects into the list that is handed back to the caller.
products.AddRange(CreateProductObjects());
return products;
}
// Product name. This implementation always returns the empty string.
protected override string GetName()
{
    return string.Empty;
}
// Reads the product's regular price from the current product page.
//
// The first XPath is tried first (the "old price" node); when it matches
// nothing, the second XPath is used instead. The "$" sign is removed from the
// node text before it is converted.
//
// Returns: the parsed price.
// Throws:  NullReferenceException when neither node exists, FormatException
//          when the text is not a number.
// NOTE(review): both XPath expressions are empty strings here - presumably
// placeholders to be filled in per store; confirm.
protected override decimal GetPrice()
{
    HtmlNode priceNode = ProductPage.DocumentNode.SelectSingleNode("");
    if (priceNode == null)
    {
        priceNode = ProductPage.DocumentNode.SelectSingleNode("");
    }

    string priceText = priceNode.InnerText.Replace("$", String.Empty);

    // Scraped prices are machine data, not user input: parse them with the
    // invariant culture so the result does not depend on the regional
    // settings of the machine running the scraper.
    return Convert.ToDecimal(priceText, System.Globalization.CultureInfo.InvariantCulture);
}
// Reads the product's sale price from the current product page.
//
// Returns: the parsed sale price ("$" removed), or 0 when the sale price node
//          is not present.
// Throws:  FormatException when the node text is not a number.
// NOTE(review): the XPath expression is an empty string here - presumably a
// placeholder to be filled in per store; confirm.
protected override decimal GetSalePrice()
{
    HtmlNode salePriceNode = ProductPage.DocumentNode.SelectSingleNode("");
    if (salePriceNode == null)
    {
        return 0;
    }

    string salePriceText = salePriceNode.InnerText.Replace("$", String.Empty);

    // Parse with the invariant culture so the result does not depend on the
    // regional settings of the machine running the scraper.
    return Convert.ToDecimal(salePriceText, System.Globalization.CultureInfo.InvariantCulture);
}
// Product category: the breadcrumb the SiteMap holds for the current product url.
protected override string GetCategory()
{
    string breadcrumb = SiteMap.GetBreadcrumb(ProductUrl);
    return breadcrumb;
}
// Product brand. This implementation always returns the empty string.
protected override string GetBrand()
{
    return string.Empty;
}
// Product description. This implementation always returns the empty string.
protected override string GetDescription()
{
    return string.Empty;
}
// Product image url. This implementation always returns the empty string.
protected override string GetImageUrl()
{
    return string.Empty;
}
// Builds the list of sizes offered for the current product page.
//
// Every node matched by the XPath becomes one ProductSize whose Size is the
// node's inner text and whose Available flag is true.
//
// Returns: the sizes found; an empty list when the XPath matches nothing.
// NOTE(review): the XPath expression is an empty string here - presumably a
// placeholder to be filled in per store; confirm.
protected override List<ProductSize> GetSizes()
{
    List<ProductSize> sizes = new List<ProductSize>();

    HtmlNodeCollection sizeNodes = ProductPage.DocumentNode.SelectNodes("");
    if (sizeNodes == null)
    {
        // SelectNodes yields null (not an empty collection) when nothing
        // matches; treat that as "no sizes" instead of failing in the loop.
        return sizes;
    }

    foreach (HtmlNode sizeNode in sizeNodes)
    {
        ProductSize size = new ProductSize();
        size.Size = sizeNode.InnerText;
        size.Available = true;
        sizes.Add(size);
    }
    return sizes;
}
}
}
AmazonSignedRequestHelper
using System;
using System.Collections.Generic;
using System.Linq;
using System.Security.Cryptography;
using System.Text;
using System.Threading.Tasks;
using System.Web;
namespace Scraper.Common.Utils
{
/*
 * Builds signed request URLs for the "/onca/xml" Amazon endpoint: the request
 * parameters are sorted, percent-encoded and signed with HMAC-SHA256 using the
 * AWS secret key.
 *
 * NOTE(review): all signing goes through one shared HMAC instance; confirm
 * callers do not use one helper from several threads at the same time.
 */
class AmazonSignedRequestHelper
{
    private const string REQUEST_URI = "/onca/xml";
    private const string REQUEST_METHOD = "GET";

    private readonly string endPoint;
    private readonly string akid;
    private readonly string associateTag;
    private readonly byte[] secret;
    private readonly HMAC signer;

    /*
     * Use this constructor to create the object. The AWS credentials are available on
     * http://aws.amazon.com
     *
     * The destination is the service end-point for your application:
     * US: ecs.amazonaws.com
     * JP: ecs.amazonaws.jp
     * UK: ecs.amazonaws.co.uk
     * DE: ecs.amazonaws.de
     * FR: ecs.amazonaws.fr
     * CA: ecs.amazonaws.ca
     *
     * Throws ArgumentNullException when awsSecretKey or destination is null.
     */
    public AmazonSignedRequestHelper(string awsAccessKeyId, string awsSecretKey, string destination, string associateTag)
    {
        if (destination == null)
        {
            throw new ArgumentNullException("destination");
        }
        if (awsSecretKey == null)
        {
            throw new ArgumentNullException("awsSecretKey");
        }

        // A host name is not linguistic text: lower-case it culture-invariantly
        // so the result does not depend on the current thread culture.
        this.endPoint = destination.ToLowerInvariant();
        this.akid = awsAccessKeyId;
        this.secret = Encoding.UTF8.GetBytes(awsSecretKey);
        this.associateTag = associateTag;
        this.signer = new HMACSHA256(this.secret);
    }

    /*
     * Sign a request in the form of a dictionary of name-value pairs.
     *
     * AWSAccessKeyId, Timestamp and AssociateTag are added to (or overwritten
     * in) a sorted copy of the request; the dictionary passed in is not modified.
     *
     * This method returns a complete URL to use. Modifying the returned URL
     * in any way invalidates the signature and Amazon will reject the requests.
     */
    public string Sign(IDictionary<string, string> request)
    {
        if (request == null)
        {
            throw new ArgumentNullException("request");
        }

        // Use a SortedDictionary with an ordinal comparer to get the
        // parameters in natural byte order, as required by AWS.
        SortedDictionary<string, string> sortedMap =
            new SortedDictionary<string, string>(request, StringComparer.Ordinal);

        // Add the AWSAccessKeyId, Timestamp and AssociateTag to the request.
        sortedMap["AWSAccessKeyId"] = this.akid;
        sortedMap["Timestamp"] = this.GetTimestamp();
        sortedMap["AssociateTag"] = this.associateTag;

        // Get the canonical query string.
        string canonicalQS = this.ConstructCanonicalQueryString(sortedMap);

        // Derive the bytes that need to be signed.
        StringBuilder builder = new StringBuilder();
        builder.Append(REQUEST_METHOD)
            .Append("\n")
            .Append(this.endPoint)
            .Append("\n")
            .Append(REQUEST_URI)
            .Append("\n")
            .Append(canonicalQS);
        byte[] toSign = Encoding.UTF8.GetBytes(builder.ToString());

        // Compute the signature and convert it to Base64.
        byte[] sigBytes = signer.ComputeHash(toSign);
        string signature = Convert.ToBase64String(sigBytes);

        // Now construct the complete URL and return it to the caller.
        StringBuilder qsBuilder = new StringBuilder();
        qsBuilder.Append("http://")
            .Append(this.endPoint)
            .Append(REQUEST_URI)
            .Append("?")
            .Append(canonicalQS)
            .Append("&Signature=")
            .Append(this.PercentEncodeRfc3986(signature));
        return qsBuilder.ToString();
    }

    /*
     * Sign a request in the form of a query string.
     *
     * This method returns a complete URL to use. Modifying the returned URL
     * in any way invalidates the signature and Amazon will reject the requests.
     */
    public string Sign(string queryString)
    {
        if (queryString == null)
        {
            throw new ArgumentNullException("queryString");
        }

        IDictionary<string, string> request = this.CreateDictionary(queryString);
        return this.Sign(request);
    }

    /*
     * Current UTC time in ISO 8601 format as required by Amazon.
     */
    private string GetTimestamp()
    {
        // Format with the invariant culture: with the current culture the
        // calendar and the ':' separator could differ from what is expected.
        return DateTime.UtcNow.ToString(
            "yyyy-MM-ddTHH:mm:ssZ",
            System.Globalization.CultureInfo.InvariantCulture);
    }

    /*
     * Percent-encode (URL encode) according to RFC 3986 as required by Amazon.
     *
     * This is necessary because .NET's HttpUtility.UrlEncode does not encode
     * according to the above standard. Also, .NET returns lower-case encoding
     * by default and Amazon requires upper-case encoding.
     *
     * A null or empty input yields the empty string.
     */
    private string PercentEncodeRfc3986(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return string.Empty;
        }

        str = HttpUtility.UrlEncode(str, Encoding.UTF8);
        str = str.Replace("'", "%27").Replace("(", "%28").Replace(")", "%29").Replace("*", "%2A").Replace("!", "%21").Replace("%7e", "~").Replace("+", "%20");

        // Upper-case the two hex digits that follow every '%'. The loop bound
        // guarantees both digit positions exist.
        StringBuilder sbuilder = new StringBuilder(str);
        for (int i = 0; i + 2 < sbuilder.Length; i++)
        {
            if (sbuilder[i] == '%')
            {
                sbuilder[i + 1] = Char.ToUpperInvariant(sbuilder[i + 1]);
                sbuilder[i + 2] = Char.ToUpperInvariant(sbuilder[i + 2]);
            }
        }
        return sbuilder.ToString();
    }

    /*
     * Convert a query string to the corresponding dictionary of name-value pairs.
     *
     * Names and values are URL-decoded. A segment without '=' becomes a name
     * with an empty value; a segment whose name is empty ("=value") is dropped.
     * When a name occurs more than once the last occurrence wins.
     */
    private IDictionary<string, string> CreateDictionary(string queryString)
    {
        Dictionary<string, string> map = new Dictionary<string, string>();
        char[] sep = { '=' };

        foreach (string requestParam in queryString.Split('&'))
        {
            if (requestParam.Length < 1)
            {
                continue;
            }

            // Split on the first '=' only, so values may contain '='.
            string[] param = requestParam.Split(sep, 2);
            string name = HttpUtility.UrlDecode(param[0], Encoding.UTF8);

            if (param.Length == 1)
            {
                map[name] = "";
            }
            else if (!string.IsNullOrEmpty(name))
            {
                map[name] = HttpUtility.UrlDecode(param[1], Encoding.UTF8);
            }
        }
        return map;
    }

    /*
     * Construct the canonical query string from the sorted parameter map:
     * "name=value" pairs, both parts percent-encoded, joined with '&'.
     * An empty map yields the empty string.
     */
    private string ConstructCanonicalQueryString(SortedDictionary<string, string> sortedParamMap)
    {
        StringBuilder builder = new StringBuilder();
        foreach (KeyValuePair<string, string> kvp in sortedParamMap)
        {
            if (builder.Length > 0)
            {
                builder.Append("&");
            }
            builder.Append(this.PercentEncodeRfc3986(kvp.Key));
            builder.Append("=");
            builder.Append(this.PercentEncodeRfc3986(kvp.Value));
        }
        return builder.ToString();
    }
}
/*
 * Orders parameter names by ordinal (code-unit) string comparison, so a sorted
 * dictionary that uses it keeps its name-value pairs in that order.
 */
class ParamComparer : IComparer<string>
{
    /*
     * Returns a negative number, zero or a positive number when p1 sorts
     * before, equal to or after p2.
     */
    public int Compare(string p1, string p2)
    {
        return string.CompareOrdinal(p1, p2);
    }
}
}
TemplateWooCommerceScraper
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;
using FashionExchange.Common.DAL;
using FashionExchange.Common.Utils;
using System.Web;
using Newtonsoft.Json;
using System.Net;
using Newtonsoft.Json.Linq;
using log4net;
namespace FashionExchange.Common.Scrapers
{
// Scraper template derived from WooCommerceScraper. It collects product urls
// by walking the navigation menu on the store's home page and the paginated
// category listings behind it, and reads brand and description from the
// product page.
//
// NOTE(review): the generic type arguments of HashSet and List in the
// signatures below appear to have been stripped from this listing - confirm
// them against the base class.
public class TemplateWooCommerceScraper : WooCommerceScraper
{
    // Attribute names treated as size attributes. Starts from the base
    // class's set; the commented line shows where store-specific names go.
    protected override HashSet SizeAttributeNames
    {
        get
        {
            if (_sizeAttributeNames == null)
            {
                _sizeAttributeNames = base.SizeAttributeNames;
                //_sizeAttributeNames.Add("attribute_mysize");
            }
            return _sizeAttributeNames;
        }
    }

    // Attribute names treated as colour attributes. Starts from the base
    // class's set; the commented line shows where store-specific names go.
    protected override HashSet ColourAttributeNames
    {
        get
        {
            if (_colourAttributeNames == null)
            {
                _colourAttributeNames = base.ColourAttributeNames;
                //_colourAttributeNames.Add("attribute_zodiac-sign");
            }
            return _colourAttributeNames;
        }
    }

    // Walks the navigation menu on the store's home page, registers each
    // category (or each of its sub categories) with the SiteMap and collects
    // the product urls of the listing behind it.
    //
    // Returns: every product url the SiteMap has collected.
    // Throws:  Exception when the menu is not found or when no category in
    //          it has sub categories.
    protected override List GetProductUrls()
    {
        bool hasSubCategory = false;
        HtmlDocument homePage = ScraperUtil.LoadHtml(Store.Url);
        HtmlNodeCollection categoryNodes = homePage.DocumentNode.SelectNodes("//ul[@id='nav']/li");
        if (categoryNodes == null)
        {
            // SelectNodes yields null when nothing matches; report that
            // clearly instead of failing inside the loop.
            throw new Exception("No categories found");
        }

        foreach (HtmlNode categoryNode in categoryNodes)
        {
            // The category's own link is used for the name and, when there
            // are no sub categories, for the listing url.
            HtmlNode categoryLinkNode = categoryNode.SelectSingleNode("./a");
            string categoryName = categoryLinkNode.InnerText.Trim();
            log.Info("Parsing category: " + categoryName);

            HtmlNodeCollection subCategoryNodes = categoryNode.SelectNodes("./ul/li/a");
            if (subCategoryNodes == null)
            {
                SiteMap.AddCategoryPage(categoryLinkNode, Store.Url);
                GetProductUrlsFromCategoryPage(categoryLinkNode);
                continue;
            }

            foreach (HtmlNode subCategoryNode in subCategoryNodes)
            {
                log.Info("Parsing sub category: " + subCategoryNode.InnerText.Trim());
                SiteMap.AddCategoryPage(categoryName + " " + subCategoryNode.InnerText, subCategoryNode, Store.Url);
                GetProductUrlsFromCategoryPage(subCategoryNode);
                hasSubCategory = true;
            }
        }

        if (!hasSubCategory)
        {
            throw new Exception("No sub categories found");
        }
        return SiteMap.GetAllProductUrls();
    }

    // Collects product urls for a category, given the category's link node
    // (its "href" attribute is the listing url).
    private void GetProductUrlsFromCategoryPage(HtmlNode categoryNode)
    {
        GetProductUrlsFromCategoryPage(categoryNode.Attributes["href"].Value);
    }

    // Walks a category listing page by page and registers every product link
    // found with the SiteMap. The walk stops when a page has no product
    // nodes, when there is no "next" pagination link, or when a page download
    // raises a WebException (which is logged and otherwise ignored).
    private void GetProductUrlsFromCategoryPage(string categoryUrl)
    {
        log.InfoFormat("Parsing category page. Url: {0}", categoryUrl);
        HtmlDocument categoryPage = null;
        try
        {
            categoryPage = ScraperUtil.LoadHtml(categoryUrl);
        }
        catch (WebException e)
        {
            log.Info("Error download category page: " + e.ToString());
            return;
        }

        HtmlNode nextPageNode = null;
        do
        {
            HtmlNodeCollection productNodes = categoryPage.DocumentNode.SelectNodes("//ul[@class='ProductList ']//div[@class='ProductDetails']/a");
            if (productNodes == null)
            {
                log.Info("Category page has no products");
                return;
            }

            foreach (HtmlNode productNode in productNodes)
            {
                SiteMap.AddProductUrl(productNode, categoryUrl);
                log.InfoFormat("Product url added: {0}", productNode.Attributes["href"].Value);
            }

            nextPageNode = categoryPage.DocumentNode.SelectSingleNode("//ul[@class='page-numbers']/li/a[@class='next page-numbers']");
            if (nextPageNode != null)
            {
                string nextPageUrl = nextPageNode.Attributes["href"].Value;
                log.Info("Parsing next category page: " + nextPageUrl);
                try
                {
                    categoryPage = ScraperUtil.LoadHtml(nextPageUrl);
                }
                catch (WebException e)
                {
                    log.Info("Error download next category page: " + e.ToString());
                    return;
                }
            }
        } while (nextPageNode != null);
    }

    // Defers to the base class's decision on whether to skip the product.
    protected override bool IgnoreProduct()
    {
        return base.IgnoreProduct();
    }

    // Brand: inner text of the link inside the "posted_in" span whose text
    // contains "Brands", within the schema.org Product block.
    // Throws NullReferenceException when the product page has no such link.
    protected override string GetBrand()
    {
        return ProductPage.DocumentNode.SelectSingleNode("//div[starts-with(@id, 'product-') and @itemtype='http://schema.org/Product']//span[@class='posted_in' and contains(text(), 'Brands')]/a").InnerText;
    }

    // Description: all text nodes under the "tab-description" div joined with
    // single spaces, or null when the page has no such text.
    protected override string GetDescription()
    {
        HtmlNodeCollection descriptionNodes = ProductPage.DocumentNode.SelectNodes("//div[@id='tab-description']//text()");
        if (descriptionNodes == null)
        {
            return null;
        }

        // Reuse the nodes already selected instead of running the query again.
        return String.Join(" ", descriptionNodes.Select(m => m.InnerText));
    }
}
}
Comments
Post a Comment