From 890f5ac40e64fd9b28a5fc3ede0509fee7abbe57 Mon Sep 17 00:00:00 2001
From: Arvind
Date: Sun, 30 Mar 2014 19:30:13 +0530
Subject: [PATCH] Ticket #2: Disable http_caching.

The index page was also being cached, so whenever the spider ran it got
only a cached version of the page and never saw new posts. Setting the
caching policy to be RFC2616 compliant does not help, because the pages
served by the web server carry no cache-control directives.

Fix: use the anydbm module to implement an equivalent of caching:
maintain a db of URLs which have already been crawled and posted, and
do not process those URLs again.

TODO: Find a more idiomatic way of doing this. This can move to
sweets, maybe.
---
 .gitignore                                      |    4 +++-
 postScraper/postScraper/settings.py             |    9 ++++++++-
 postScraper/postScraper/spiders/swara_spider.py |   12 ++++++++++--
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 45e4175..16fcecb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,8 @@ bin/
 include/
 local/
 lib/
+build/
 *.db
 *.pid
-conf.py
\ No newline at end of file
+conf.py
+urlCache
\ No newline at end of file
diff --git a/postScraper/postScraper/settings.py b/postScraper/postScraper/settings.py
index 8d3f300..0483937 100644
--- a/postScraper/postScraper/settings.py
+++ b/postScraper/postScraper/settings.py
@@ -13,4 +13,11 @@ NEWSPIDER_MODULE = 'postScraper.spiders'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'postScraper (+http://www.yourdomain.com)'
-HTTPCACHE_ENABLED = True
+# HTTPCACHE_ENABLED = True
+#HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.RFC2616Policy'
+# SPIDER_MIDDLEWARES = {
+#     'postScraper.middlewares.deltafetch.DeltaFetch': 100,
+# }
+
+# DELTAFETCH_ENABLED = True
+# DOTSCRAPY_ENABLED = True
diff --git a/postScraper/postScraper/spiders/swara_spider.py b/postScraper/postScraper/spiders/swara_spider.py
index 2e04730..9be9c65 100644
--- a/postScraper/postScraper/spiders/swara_spider.py
+++ b/postScraper/postScraper/spiders/swara_spider.py
@@ -3,11 +3,11 @@ from scrapy.contrib.spiders import Rule
 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.selector import Selector
 from scrapy.contrib.loader import ItemLoader
-
 from postScraper.items import PostscraperItem
 
 import facebook
 import conf
+import anydbm
 
 
 class SwaraSpider(CrawlSpider):
@@ -18,7 +18,8 @@ class SwaraSpider(CrawlSpider):
                   callback='parse_start'),)
 
     def parse_start(self, response):
-        if 'cached' not in response.flags:
+        db = anydbm.open('urlCache', 'c')
+        if response.url not in db.keys():
             xpath = Selector()
             loader = ItemLoader(item=PostscraperItem(),
                                 response=response)
@@ -37,3 +38,10 @@ class SwaraSpider(CrawlSpider):
                 description=content[0]['content'].encode('utf8'),
                 message="#CGNetSwara http://cgnetswara.org/" +
                 content[1]['audio'])
+            print str(response.url)
+            print type(response.url)
+            db[response.url] = str(True)
+            db.close()
+        else:
+            print "Not posting content from " + response.url
+            db.close()
-- 
1.7.10.4
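
For reference, a minimal standalone sketch of the anydbm-based dedup
scheme this patch introduces (Python 2, where anydbm is in the stdlib).
Only the 'urlCache' filename comes from the patch; the helper names
seen_before/mark_posted and the example URL are illustrative:

    # Sketch of the URL-dedup idea: a dbm file as a persistent "seen" set.
    import anydbm

    def seen_before(url):
        """Return True if this URL was already crawled and posted."""
        db = anydbm.open('urlCache', 'c')   # 'c': create the file if missing
        try:
            # anydbm keys are byte strings, so coerce the URL first.
            return str(url) in db.keys()
        finally:
            db.close()

    def mark_posted(url):
        """Record a URL so later runs skip it."""
        db = anydbm.open('urlCache', 'c')
        try:
            db[str(url)] = '1'              # only key presence matters
        finally:
            db.close()

    if __name__ == '__main__':
        url = 'http://cgnetswara.org/example-post'
        if not seen_before(url):
            # ... scrape the page and post to Facebook here ...
            mark_posted(url)

Opening and closing the db around each check keeps the file consistent
if the spider dies mid-run; the commented-out DeltaFetch settings in
settings.py point at the more idiomatic replacement the TODO mentions.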