Commit 890f5ac40e64fd9b28a5fc3ede0509fee7abbe57

  • arvind
  • Sun Mar 30 19:30:13 IST 2014
Ticket #2: Disable http_caching. The index page was also being cached, which
meant that whenever the spider ran it got only a cached version of the page
and hence never picked up new posts.
Setting the caching policy to be compliant with RFC 2616 does not help either:
the pages being served from the web server do not carry any cache-control
directives.
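
A quick way to verify the missing directives (http://cgnetswara.org/ is
inferred from the spider code below; Python 2, matching the codebase):

    import urllib2

    # Fetch the index page and print the headers an RFC 2616 cache would
    # look at; if the server really sends no caching directives, every
    # one of these prints None.
    response = urllib2.urlopen('http://cgnetswara.org/')
    for name in ('Cache-Control', 'Expires', 'ETag', 'Last-Modified'):
        print name + ': ' + str(response.info().getheader(name))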

Fix: Use the anydbm module to implement an equivalent of caching: maintain a
db of urls which have been crawled and posted, and do not process those urls
again.
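
In outline, this is the pattern the spider diff below implements (the url
value here is a stand-in; the spider uses response.url):

    import anydbm

    url = 'http://cgnetswara.org/'  # stand-in; the spider uses response.url

    # Persistent, string-keyed store of urls that were already handled;
    # 'c' creates the db file on first use.
    db = anydbm.open('urlCache', 'c')
    if url not in db.keys():
        # ... scrape the page and post it ...
        db[url] = str(True)  # anydbm stores plain strings only
    db.close()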

TODO: Find a more idiomatic way of doing this. This can move to sweets, maybe.
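
The commented-out block in settings.py below is one candidate: the DeltaFetch
spider middleware, which keeps a fingerprint db of requests that already
yielded items and skips them on later runs. Enabling it would look roughly
like this (a sketch, assuming the vendored module path from the diff):

    SPIDER_MIDDLEWARES = {
        'postScraper.middlewares.deltafetch.DeltaFetch': 100,
    }
    DELTAFETCH_ENABLED = True   # skip pages that already produced items
    DOTSCRAPY_ENABLED = True    # persist the .scrapy dir holding DeltaFetch's db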

.gitignore

 include/
 local/
 lib/
+build/
 *.db
 *.pid
 conf.py
+urlCache

postScraper/postScraper/settings.py

 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'postScraper (+http://www.yourdomain.com)'
-HTTPCACHE_ENABLED = True
+# HTTPCACHE_ENABLED = True
+#HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.RFC2616Policy'
+# SPIDER_MIDDLEWARES = {
+#     'postScraper.middlewares.deltafetch.DeltaFetch': 100,
+# }
+
+# DELTAFETCH_ENABLED = True
+# DOTSCRAPY_ENABLED = True

postScraper/postScraper/spiders/swara_spider.py

 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.selector import Selector
 from scrapy.contrib.loader import ItemLoader
-
 from postScraper.items import PostscraperItem
 
 import facebook
 import conf
+import anydbm
 
 
 class SwaraSpider(CrawlSpider):
...
              callback='parse_start'),)
 
     def parse_start(self, response):
-        if 'cached' not in response.flags:
+        db = anydbm.open('urlCache', 'c')
+        if response.url not in db.keys():
             xpath = Selector()
             loader = ItemLoader(item=PostscraperItem(), response=response)
...
                 description=content[0]['content'].encode('utf8'),
                 message="#CGNetSwara http://cgnetswara.org/" +
                         content[1]['audio'])
+            print str(response.url)
+            print type(response.url)
+            db[response.url] = str(True)
+            db.close()
+        else:
+            print "Not posting content from " + response.url
+            db.close()