In __init__.py, to run the scraper
from scrapy import cmdline
# Launch the crawl programmatically, exactly as if
# "scrapy crawl usps_service_spider" had been typed on the command line.
cmdline.execute(["scrapy", "crawl", "usps_service_spider"])
Pipeline
import pyodbc
from scrapy.exceptions import NotConfigured
class TrackingServicesPipeline(object):
    """Item pipeline that keeps a SQL Server connection open for one crawl.

    The connection and cursor are created in ``open_spider`` and released in
    ``close_spider``. ``process_item`` currently passes every item through
    unchanged; see its docstring for an INSERT template.
    """

    # NOTE(review): credentials are embedded in source. Prefer supplying the
    # connection string through the Scrapy setting named below so that secrets
    # stay out of version control. This default is kept only for backward
    # compatibility.
    # NOTE(review): the string combines UID/PWD with Integrated_Security=true
    # and Trusted_Connection=no -- confirm which authentication mode is meant.
    DEFAULT_CONNECTION_STRING = "Driver={SQL Server Native Client 11.0};Server=192.168.168.40;Database=amz_reviews;UID=amz_reviews;PWD=amz@dev;port=1433;Integrated_Security=true;Trusted_Connection=no;"

    # Optional Scrapy setting that overrides DEFAULT_CONNECTION_STRING.
    CONNECTION_SETTING = "TRACKING_DB_CONNECTION_STRING"

    def __init__(self):
        # Initialised to None so close_spider is safe even when open_spider
        # never ran or failed before a connection was established.
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        """Open the database connection and cursor when the spider starts.

        The connection string is read from the ``TRACKING_DB_CONNECTION_STRING``
        setting when the spider exposes settings and the setting is present;
        otherwise the built-in default is used.
        """
        connection_string = self.DEFAULT_CONNECTION_STRING
        settings = getattr(spider, "settings", None)
        if settings is not None:
            connection_string = settings.get(
                self.CONNECTION_SETTING, connection_string
            )
        self.conn = pyodbc.connect(connection_string)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Return ``item`` unchanged (persistence is not implemented yet).

        Template for storing an item. pyodbc uses ``?`` parameter markers,
        not ``%s``::

            sql = "INSERT INTO table (field1, field2, field3) VALUES (?, ?, ?)"
            self.cursor.execute(
                sql,
                (item.get("field1"), item.get("field2"), item.get("field3")),
            )
            self.conn.commit()
        """
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection, if they were opened."""
        cursor, self.cursor = self.cursor, None
        conn, self.conn = self.conn, None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            # Always attempt to close the connection, even if closing the
            # cursor raised.
            if conn is not None:
                conn.close()
Settings
# --- Project identity -------------------------------------------------------
BOT_NAME = "Tracking_Services"

SPIDER_MODULES = ["Tracking_Services.spiders"]
NEWSPIDER_MODULE = "Tracking_Services.spiders"

# robots.txt rules are not honoured by this project.
ROBOTSTXT_OBEY = False

# --- Splash (JavaScript rendering) integration ------------------------------
# Address of the Splash HTTP API used by SplashRequest.
SPLASH_URL = "http://localhost:8050/"

DOWNLOADER_MIDDLEWARES = {
    "scrapy_splash.SplashCookiesMiddleware": 723,
    "scrapy_splash.SplashMiddleware": 725,
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
}

SPIDER_MIDDLEWARES = {
    "scrapy_splash.SplashDeduplicateArgsMiddleware": 100,
}

# Splash-aware replacements for the duplicate filter and the HTTP cache store.
DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"
HTTPCACHE_STORAGE = "scrapy_splash.SplashAwareFSCacheStorage"

# --- Item pipelines ---------------------------------------------------------
ITEM_PIPELINES = {
    "Tracking_Services.pipelines.TrackingServicesPipeline": 300,
}
Spider without Splash
import scrapy
from ..items import TrackingServicesItem
class YodelServiceSpiderSpider(scrapy.Spider):
    """Scrape tracking events from the Yodel parcel-tracking results table.

    Yields one ``TrackingServicesItem`` per table row after the header row.
    """

    name = 'yodel_service_spider'
    # allowed_domains expects bare domain names; Scrapy's offsite filtering
    # warns about and ignores entries that are full URLs.
    allowed_domains = ['tracking.yodel.co.uk']
    start_urls = ['http://tracking.yodel.co.uk/wrd/run/WT_XTRACK_PW?PCL_NO_FROM.PARCEL_RANGE.XTRACKING=JJD0002254326090806&SUBMIT.DUM_CONTROLS.XTRACKING=Submit']

    # Item field -> 1-based position among the row's cells that contain
    # neither an <input> nor an <a> element.
    _COLUMNS = (
        ('CurrentStatus', 2),
        ('Date', 3),
        ('Time', 4),
        ('Signatory', 5),
        ('Branch', 6),
    )

    def parse(self, response):
        """Yield one item per tracking row; each field is a list of text nodes."""
        rows = response.xpath("//table[@class='list']//tr[position()>1]")
        for row in rows:
            # Build a fresh item per row. Re-yielding one shared instance
            # makes every yielded item alias the same (last-written) data.
            item = TrackingServicesItem()
            for field, position in self._COLUMNS:
                item[field] = row.xpath(
                    "./td[not(.//input or .//a)][%d]/text()" % position
                ).extract()
            yield item
Spider with Splash
import scrapy
from scrapy_splash import SplashRequest
from ..items import TrackingServicesItem
class YodelServiceSpiderSpider(scrapy.Spider):
    """Scrape the current tracking status from webtrack.dhlglobalmail.com.

    Pages are fetched through Splash (see ``start_requests``). One
    ``TrackingServicesItem`` is yielded per response.
    """

    name = 'usps_service_spider'
    # allowed_domains expects bare domain names; Scrapy's offsite filtering
    # warns about and ignores entries that are full URLs.
    allowed_domains = ['webtrack.dhlglobalmail.com']
    start_urls = ['https://webtrack.dhlglobalmail.com/?trackingnumber=9361269903505367808721','https://webtrack.dhlglobalmail.com/?trackingnumber=9361269903505306406797']

    # Container element that holds the status heading, description and details.
    _STATUS_BLOCK = "//div[@class='col col-sm-8' or @class='status-info']"

    def start_requests(self):
        """Request every start URL through Splash, waiting 3 seconds per page."""
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 3})

    def parse(self, response):
        """Yield one item for the page; each field is a list of text nodes."""
        # A fresh item per response: a shared class-level instance would be
        # overwritten by the next response while earlier yields still
        # reference it.
        item = TrackingServicesItem()
        item['CurrentStatus'] = response.xpath(self._STATUS_BLOCK + "/h2/text()").extract()
        item['description'] = response.xpath(self._STATUS_BLOCK + "/em/text()").extract()
        # NOTE(review): p/text() returns every text node directly under <p>,
        # including any that follow a <br>, so Date may also contain the
        # location text -- confirm against the live markup.
        item['Date'] = response.xpath(self._STATUS_BLOCK + "/p/text()").extract()
        # <br> is an empty element, so "p/br/text()" can never match. The text
        # that comes after the <br> is its following sibling inside <p>.
        item['Location'] = response.xpath(
            self._STATUS_BLOCK + "/p/br/following-sibling::text()"
        ).extract()
        self.logger.debug(
            "Parsed %s: status=%r description=%r date=%r location=%r",
            response.url,
            item['CurrentStatus'],
            item['description'],
            item['Date'],
            item['Location'],
        )
        yield item
Item
import scrapy
class TrackingServicesItem(scrapy.Item):
    """Container for one scraped tracking record.

    Not every spider fills every field; fields that are never assigned
    simply stay unset on the item.
    """

    # Status text of the tracking event.
    CurrentStatus = scrapy.Field()

    # Date and time of the event, as shown on the tracking page.
    Date = scrapy.Field()
    Time = scrapy.Field()

    # Columns taken from the Yodel tracking table.
    Signatory = scrapy.Field()
    Branch = scrapy.Field()

    # Extra details taken from the status block of the Splash-rendered page.
    description = scrapy.Field()
    Location = scrapy.Field()
Comments
Post a Comment