Commit 890f5ac40e64fd9b28a5fc3ede0509fee7abbe57

  • arvind
  • Sun Mar 30 19:30:13 IST 2014
Ticket #2: Disable http_caching. The index page was also being cached, which
meant that whenever the spider ran it got only a cached version of the page
and hence never picked up new posts.
Setting the caching policy to be compliant with RFC 2616 does not help either:
the pages being served from the web server do not carry any cache-control
directives.
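
A quick way to verify the missing directives (http://cgnetswara.org/ is
inferred from the spider code below; Python 2, matching the codebase):

    import urllib2

    # Fetch the index page and print the headers an RFC 2616 cache would
    # look at; if the server really sends no caching directives, every
    # one of these prints None.
    response = urllib2.urlopen('http://cgnetswara.org/')
    for name in ('Cache-Control', 'Expires', 'ETag', 'Last-Modified'):
        print name + ': ' + str(response.info().getheader(name))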

Fix: Use the anydbm module to implement an equivalent of caching: maintain a
db of urls which have been crawled and posted, and do not process those urls
again.
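
In outline, this is the pattern the spider diff below implements (the url
value here is a stand-in; the spider uses response.url):

    import anydbm

    url = 'http://cgnetswara.org/'  # stand-in; the spider uses response.url

    # Persistent, string-keyed store of urls that were already handled;
    # 'c' creates the db file on first use.
    db = anydbm.open('urlCache', 'c')
    if url not in db.keys():
        # ... scrape the page and post it ...
        db[url] = str(True)  # anydbm stores plain strings only
    db.close()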

TODO: Find a more idiomatic way of doing this. This can move to sweets, maybe.
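
The commented-out block in settings.py below is one candidate: the DeltaFetch
spider middleware, which keeps a fingerprint db of requests that already
yielded items and skips them on later runs. Enabling it would look roughly
like this (a sketch, assuming the vendored module path from the diff):

    SPIDER_MIDDLEWARES = {
        'postScraper.middlewares.deltafetch.DeltaFetch': 100,
    }
    DELTAFETCH_ENABLED = True   # skip pages that already produced items
    DOTSCRAPY_ENABLED = True    # persist the .scrapy dir holding DeltaFetch's db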

.gitignore

 include/
 local/
 lib/
+build/
 *.db
 *.pid
 conf.py
+urlCache

postScraper/postScraper/settings.py

 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'postScraper (+http://www.yourdomain.com)'
-HTTPCACHE_ENABLED = True
+# HTTPCACHE_ENABLED = True
+#HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.RFC2616Policy'
+# SPIDER_MIDDLEWARES = {
+#     'postScraper.middlewares.deltafetch.DeltaFetch': 100,
+# }
+
+# DELTAFETCH_ENABLED = True
+# DOTSCRAPY_ENABLED = True

postScraper/postScraper/spiders/swara_spider.py

 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.selector import Selector
 from scrapy.contrib.loader import ItemLoader
-
 from postScraper.items import PostscraperItem
 
 import facebook
 import conf
+import anydbm
 
 
 class SwaraSpider(CrawlSpider):
...
              callback='parse_start'),)
 
     def parse_start(self, response):
-        if 'cached' not in response.flags:
+        db = anydbm.open('urlCache', 'c')
+        if response.url not in db.keys():
             xpath = Selector()
             loader = ItemLoader(item=PostscraperItem(), response=response)
...
                 description=content[0]['content'].encode('utf8'),
                 message="#CGNetSwara http://cgnetswara.org/" +
                         content[1]['audio'])
+            print str(response.url)
+            print type(response.url)
+            db[response.url] = str(True)
+            db.close()
+        else:
+            print "Not posting content from " + response.url
+            db.close()