--- a/.gitignore +++ b/.gitignore @@ -5,7 +5,9 @@ include/ local/ lib/ +build/ *.db *.pid conf.py +urlCache --- a/postScraper/postScraper/settings.py +++ b/postScraper/postScraper/settings.py @@ -13,5 +13,12 @@ # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'postScraper (+http://www.yourdomain.com)' -HTTPCACHE_ENABLED = True +# HTTPCACHE_ENABLED = True +#HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.RFC2616Policy' +# SPIDER_MIDDLEWARES = { +# 'postScraper.middlewares.deltafetch.DeltaFetch': 100, +# } + +# DELTAFETCH_ENABLED = True +# DOTSCRAPY_ENABLED = True --- a/postScraper/postScraper/spiders/swara_spider.py +++ b/postScraper/postScraper/spiders/swara_spider.py @@ -3,11 +3,11 @@ from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.selector import Selector from scrapy.contrib.loader import ItemLoader - from postScraper.items import PostscraperItem import facebook import conf +import anydbm class SwaraSpider(CrawlSpider): @@ -18,7 +18,8 @@ callback='parse_start'),) def parse_start(self, response): - if 'cached' not in response.flags: + db = anydbm.open('urlCache', 'c') + if response.url not in db.keys(): xpath = Selector() loader = ItemLoader(item=PostscraperItem(), response=response) @@ -37,4 +38,11 @@ description=content[0]['content'].encode('utf8'), message="#CGNetSwara http://cgnetswara.org/" + content[1]['audio']) + print str(response.url) + print type(response.url) + db[response.url] = str(True) + db.close() + else: + print "Not posting content from " + response.url + db.close()