From 890f5ac40e64fd9b28a5fc3ede0509fee7abbe57 Mon Sep 17 00:00:00 2001
From: Arvind
Date: Sun, 30 Mar 2014 19:30:13 +0530
Subject: [PATCH] Ticket #2: Disable http_caching.

The index page was also being cached, so whenever the spider ran it got
only a cached version of the page and never saw new posts. Setting the
caching policy to be RFC2616 compliant does not help, because the pages
served by the web server carry no cache-control directives.

Fix: use the anydbm module to implement an equivalent of caching:
maintain a db of URLs which have already been crawled and posted, and
do not process those URLs again.

TODO: Find a more idiomatic way of doing this. This can move to
sweets, maybe.
---
 .gitignore                                      |    4 +++-
 postScraper/postScraper/settings.py             |    9 ++++++++-
 postScraper/postScraper/spiders/swara_spider.py |   12 ++++++++++--
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 45e4175..16fcecb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,8 @@ bin/
 include/
 local/
 lib/
+build/
 *.db
 *.pid
-conf.py
\ No newline at end of file
+conf.py
+urlCache
\ No newline at end of file
diff --git a/postScraper/postScraper/settings.py b/postScraper/postScraper/settings.py
index 8d3f300..0483937 100644
--- a/postScraper/postScraper/settings.py
+++ b/postScraper/postScraper/settings.py
@@ -13,4 +13,11 @@ NEWSPIDER_MODULE = 'postScraper.spiders'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'postScraper (+http://www.yourdomain.com)'
-HTTPCACHE_ENABLED = True
+# HTTPCACHE_ENABLED = True
+#HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.RFC2616Policy'
+# SPIDER_MIDDLEWARES = {
+#     'postScraper.middlewares.deltafetch.DeltaFetch': 100,
+# }
+
+# DELTAFETCH_ENABLED = True
+# DOTSCRAPY_ENABLED = True
diff --git a/postScraper/postScraper/spiders/swara_spider.py b/postScraper/postScraper/spiders/swara_spider.py
index 2e04730..9be9c65 100644
--- a/postScraper/postScraper/spiders/swara_spider.py
+++ b/postScraper/postScraper/spiders/swara_spider.py
@@ -3,11 +3,11 @@ from scrapy.contrib.spiders import Rule
 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.selector import Selector
 from scrapy.contrib.loader import ItemLoader
-
 from postScraper.items import PostscraperItem
 
 import facebook
 import conf
+import anydbm
 
 
 class SwaraSpider(CrawlSpider):
@@ -18,7 +18,8 @@ class SwaraSpider(CrawlSpider):
                   callback='parse_start'),)
 
     def parse_start(self, response):
-        if 'cached' not in response.flags:
+        db = anydbm.open('urlCache', 'c')
+        if response.url not in db.keys():
             xpath = Selector()
             loader = ItemLoader(item=PostscraperItem(),
                                 response=response)
@@ -37,3 +38,10 @@ class SwaraSpider(CrawlSpider):
                 description=content[0]['content'].encode('utf8'),
                 message="#CGNetSwara http://cgnetswara.org/" +
                 content[1]['audio'])
+            print str(response.url)
+            print type(response.url)
+            db[response.url] = str(True)
+            db.close()
+        else:
+            print "Not posting content from " + response.url
+            db.close()
-- 
1.7.10.4
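
For reference, a minimal standalone sketch of the anydbm-based dedup
scheme this patch introduces (Python 2, where anydbm is in the stdlib).
Only the 'urlCache' filename comes from the patch; the helper names
seen_before/mark_posted and the example URL are illustrative:

    # Sketch of the URL-dedup idea: a dbm file as a persistent "seen" set.
    import anydbm

    def seen_before(url):
        """Return True if this URL was already crawled and posted."""
        db = anydbm.open('urlCache', 'c')   # 'c': create the file if missing
        try:
            # anydbm keys are byte strings, so coerce the URL first.
            return str(url) in db.keys()
        finally:
            db.close()

    def mark_posted(url):
        """Record a URL so later runs skip it."""
        db = anydbm.open('urlCache', 'c')
        try:
            db[str(url)] = '1'              # only key presence matters
        finally:
            db.close()

    if __name__ == '__main__':
        url = 'http://cgnetswara.org/example-post'
        if not seen_before(url):
            # ... scrape the page and post to Facebook here ...
            mark_posted(url)

Opening and closing the db around each check keeps the file consistent
if the spider dies mid-run; the commented-out DeltaFetch settings in
settings.py point at the more idiomatic replacement the TODO mentions.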