From 23c872f0c8416b115d073c908915720cd6d5d7ff Mon Sep 17 00:00:00 2001 From: Arvind Date: Fri, 28 Mar 2014 23:53:07 +0530 Subject: [PATCH] Ticket #1 Fix: Do not post content from cached pages. Scrapy maintains an HTTP cache, so it knows which pages it has crawled previously. The `Response` object has a `flags` attribute, which is a list of flags such as 'cached', 'redirected', etc. The spider therefore creates its item loader only when 'cached' is not among the response's flags. Comments: --- postScraper/postScraper/spiders/swara_spider.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/postScraper/postScraper/spiders/swara_spider.py b/postScraper/postScraper/spiders/swara_spider.py index cba07e7..65d0290 100644 --- a/postScraper/postScraper/spiders/swara_spider.py +++ b/postScraper/postScraper/spiders/swara_spider.py @@ -17,8 +17,9 @@ class SwaraSpider(CrawlSpider): callback='parse_start'),) def parse_start(self, response): - xpath = Selector() - loader = ItemLoader(item=PostscraperItem(), response=response) + if 'cached' not in response.flags: + xpath = Selector() + loader = ItemLoader(item=PostscraperItem(), response=response) loader.add_xpath('content', '//div[@class="report"]/p/text()') loader.add_xpath('audio', -- 1.7.10.4