Scrapy Splash

In __init__.py, to run the scraper


from scrapy import cmdline
# Hand Scrapy's command-line entry point the same argument list that
# typing `scrapy crawl usps_service_spider` in a shell would produce.
cmdline.execute(["scrapy", "crawl", "usps_service_spider"])

Pipeline


import pyodbc
from scrapy.exceptions import NotConfigured


class TrackingServicesPipeline(object):
    """Item pipeline that keeps a SQL Server connection open during a crawl.

    The connection is opened in ``open_spider`` and released in
    ``close_spider``. ``process_item`` currently passes every item through
    unchanged; nothing is written to the database yet.
    """

    # Fallback connection string, used when the project settings do not
    # define ``TRACKING_DB_CONNECTION_STRING``.
    # NOTE(review): this embeds a server address and credentials in source
    # control; prefer supplying the string through settings and rotate the
    # password.
    # NOTE(review): ``Integrated_Security=true`` next to
    # ``Trusted_Connection=no`` looks contradictory; confirm which one is
    # intended.
    DEFAULT_CONNECTION_STRING = (
        "Driver={SQL Server Native Client 11.0};"
        "Server=192.168.168.40;"
        "Database=amz_reviews;"
        "UID=amz_reviews;"
        "PWD=amz@dev;"
        "port=1433;"
        "Integrated_Security=true;"
        "Trusted_Connection=no;"
    )

    def __init__(self):
        # Defined up front so close_spider() is safe even when open_spider()
        # never ran or failed part-way through.
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        """Open the database connection and a cursor when the spider starts.

        The connection string is read from the ``TRACKING_DB_CONNECTION_STRING``
        setting when the spider exposes settings, and falls back to
        ``DEFAULT_CONNECTION_STRING`` otherwise.
        """
        connection_string = self.DEFAULT_CONNECTION_STRING
        settings = getattr(spider, "settings", None)
        if settings is not None:
            connection_string = settings.get(
                "TRACKING_DB_CONNECTION_STRING", connection_string
            )
        self.conn = pyodbc.connect(connection_string)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Return ``item`` unchanged so later pipelines still receive it."""
        # Persistence is not implemented yet. When it is added, use pyodbc's
        # ``?`` parameter markers (not ``%s``) and commit after the insert.
        return item

    def close_spider(self, spider):
        """Close the database connection, if one was opened."""
        if self.conn is not None:
            self.conn.close()
        self.conn = None
        self.cursor = None

Settings


# --- Project identity -------------------------------------------------------
BOT_NAME = "Tracking_Services"

# Package(s) searched for spiders, and the package new spiders are created in.
SPIDER_MODULES = ["Tracking_Services.spiders"]
NEWSPIDER_MODULE = "Tracking_Services.spiders"

# robots.txt rules are not honoured by this project.
ROBOTSTXT_OBEY = False

# --- Splash integration -----------------------------------------------------
# Address of the Splash instance (a local one on port 8050).
SPLASH_URL = "http://localhost:8050/"

# Middleware values are Scrapy order numbers.
DOWNLOADER_MIDDLEWARES = {
    "scrapy_splash.SplashCookiesMiddleware": 723,
    "scrapy_splash.SplashMiddleware": 725,
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
}

SPIDER_MIDDLEWARES = {
    "scrapy_splash.SplashDeduplicateArgsMiddleware": 100,
}

# Splash-aware replacements for the duplicate filter and HTTP cache storage.
DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"
HTTPCACHE_STORAGE = "scrapy_splash.SplashAwareFSCacheStorage"

# --- Item pipelines ---------------------------------------------------------
ITEM_PIPELINES = {
    "Tracking_Services.pipelines.TrackingServicesPipeline": 300,
}



Spider without Splash


import scrapy
from  ..items import TrackingServicesItem


class YodelServiceSpiderSpider(scrapy.Spider):
    """Scrape the rows of the tracking table on a Yodel tracking page.

    ``parse`` yields one ``TrackingServicesItem`` per table row after the
    first (the first row is skipped by the row XPath).
    """

    name = 'yodel_service_spider'
    # Scrapy expects bare domain names here, not URLs with a scheme or path.
    allowed_domains = ['tracking.yodel.co.uk']
    start_urls = ['http://tracking.yodel.co.uk/wrd/run/WT_XTRACK_PW?PCL_NO_FROM.PARCEL_RANGE.XTRACKING=JJD0002254326090806&SUBMIT.DUM_CONTROLS.XTRACKING=Submit']
    # Retained only for backward compatibility with code that reads this
    # attribute; parse() no longer mutates it and builds a fresh item per row.
    listingDetails = TrackingServicesItem()

    # Text of the n-th cell (1-based) among the cells that contain neither an
    # <input> nor an <a> element.
    _CELL_XPATH = "./td[not(.//input or .//a)][%d]/text()"
    # (item field, cell position) pairs read from every row.
    _COLUMNS = (
        ('CurrentStatus', 2),
        ('Date', 3),
        ('Time', 4),
        ('Signatory', 5),
        ('Branch', 6),
    )

    def parse(self, response):
        """Yield one item per row of the ``list`` table in ``response``.

        Each field holds the list of text nodes extracted from its cell.
        """
        rows = response.xpath("//table[@class='list']//tr[position()>1]")
        for row in rows:
            # A new item per row: re-yielding one shared, mutated instance
            # would make every yielded item alias the same object.
            item = TrackingServicesItem()
            for field, position in self._COLUMNS:
                item[field] = row.xpath(self._CELL_XPATH % position).extract()
            yield item

Spider with Splash


import scrapy
from scrapy_splash import SplashRequest

from  ..items import TrackingServicesItem


class YodelServiceSpiderSpider(scrapy.Spider):
    """Scrape the current tracking status from webtrack.dhlglobalmail.com.

    Despite the class name, this spider is registered as
    ``usps_service_spider``. Every start URL is requested through Splash and
    ``parse`` yields one ``TrackingServicesItem`` per response.
    """

    name = 'usps_service_spider'
    # Scrapy expects bare domain names here, not URLs with a scheme.
    allowed_domains = ['webtrack.dhlglobalmail.com']
    start_urls = ['https://webtrack.dhlglobalmail.com/?trackingnumber=9361269903505367808721','https://webtrack.dhlglobalmail.com/?trackingnumber=9361269903505306406797']
    # Retained only for backward compatibility with code that reads this
    # attribute; parse() no longer mutates it and builds a fresh item per
    # response.
    listingDetails = TrackingServicesItem()

    # Container <div> that holds the status heading and its details.
    _STATUS_BOX = "//div[@class='col col-sm-8' or @class='status-info']"

    def start_requests(self):
        """Issue one Splash request per start URL."""
        for url in self.start_urls:
            # ``wait`` is forwarded to Splash; presumably it gives the page
            # time to render before the HTML is returned -- confirm the value.
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 3})

    def parse(self, response):
        """Yield a single item with the status fields found in ``response``.

        Each field holds the list of text nodes matched by its XPath.
        """
        box = self._STATUS_BOX
        # A new item per response: with several start URLs, one shared
        # class-level item would be overwritten by whichever response is
        # parsed last.
        item = TrackingServicesItem()
        item['CurrentStatus'] = response.xpath(box + "/h2/text()").extract()
        item['description'] = response.xpath(box + "/em/text()").extract()
        item['Date'] = response.xpath(box + "/p/text()").extract()
        # NOTE(review): <br> elements normally have no child text nodes, so
        # this probably always yields an empty list; the text after the <br>
        # may be what is wanted (e.g. "/p/br/following-sibling::text()").
        # Verify against the real page markup before changing it.
        item['Location'] = response.xpath(box + "/p/br/text()").extract()
        self.logger.debug(
            "Parsed %s: status=%r description=%r date=%r location=%r",
            response.url,
            item['CurrentStatus'],
            item['description'],
            item['Date'],
            item['Location'],
        )
        yield item

Item


import scrapy


class TrackingServicesItem(scrapy.Item):
    """Item holding the tracking details scraped for a parcel.

    Every field is a plain ``scrapy.Field`` with no extra metadata.
    """

    # Status plus the date/time, signatory and branch details.
    CurrentStatus = scrapy.Field()
    Date = scrapy.Field()
    Time = scrapy.Field()
    Signatory = scrapy.Field()
    Branch = scrapy.Field()

    # Free-text description and location details.
    description = scrapy.Field()
    Location = scrapy.Field()

Comments

Popular posts from this blog

Web Scraping material

Utility