Commit 890f5ac40e64fd9b28a5fc3ede0509fee7abbe57
Ticket #2: Disable http_caching. The index page was also being cached,
which means that whenever the spider ran it got only a cached version of
the page and hence never picked up new posts.

Setting the caching policy to be RFC 2616 compliant does not help: the
pages served by the web server do not carry any cache-control
directives.

Fix: Use the anydbm module to implement an equivalent of caching:
maintain a db of URLs which have already been crawled and posted, and do
not process those URLs again.

TODO: Find a more idiomatic way of doing this. This can move to sweets,
maybe.
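As a minimal sketch of the pattern (Python 2, where anydbm is in the
standard library; the URL is a hypothetical example):

    import anydbm

    # 'c' opens the persistent URL cache, creating it if missing.
    db = anydbm.open('urlCache', 'c')

    url = 'http://cgnetswara.org/index.php'  # hypothetical example URL
    if url not in db.keys():
        # ... crawl the page and post it, as the spider does below ...
        db[url] = str(True)  # anydbm keys and values must be strings
    else:
        print "Not posting content from " + url
    db.close()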
.gitignore:
@@ -5,6 +5,8 @@
 include/
 local/
 lib/
+build/
 *.db
 *.pid
 conf.py
+urlCache
settings.py:
@@ -13,4 +13,11 @@
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'postScraper (+http://www.yourdomain.com)'
-HTTPCACHE_ENABLED = True
+# HTTPCACHE_ENABLED = True
+#HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.RFC2616Policy'
+# SPIDER_MIDDLEWARES = {
+#     'postScraper.middlewares.deltafetch.DeltaFetch': 100,
+# }
+
+# DELTAFETCH_ENABLED = True
+# DOTSCRAPY_ENABLED = True
spider (SwaraSpider):
@@ -3,8 +3,8 @@
 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.selector import Selector
 from scrapy.contrib.loader import ItemLoader
-
 from postScraper.items import PostscraperItem
 
 import facebook
 import conf
+import anydbm
@@ -18,7 +18,8 @@
                       callback='parse_start'),)
 
     def parse_start(self, response):
-        if 'cached' not in response.flags:
+        db = anydbm.open('urlCache', 'c')
+        if response.url not in db.keys():
             xpath = Selector()
             loader = ItemLoader(item=PostscraperItem(), response=response)
 
@@ -38,3 +38,10 @@
                 description=content[0]['content'].encode('utf8'),
                 message="#CGNetSwara http://cgnetswara.org/" +
                 content[1]['audio'])
+            print str(response.url)
+            print type(response.url)
+            db[response.url] = str(True)
+            db.close()
+        else:
+            print "Not posting content from " + response.url
+            db.close()
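On the TODO: one possibly more idiomatic shape for the same bookkeeping,
sketched under the assumption that the extraction and Facebook posting
stay as in the diff (contextlib.closing is standard library; has_key
avoids pulling every key into memory the way keys() does):

    import anydbm
    from contextlib import closing

    def parse_start(self, response):
        # closing() guarantees a single db.close() on every path,
        # instead of duplicating it in both branches.
        with closing(anydbm.open('urlCache', 'c')) as db:
            if db.has_key(response.url):
                self.log("Not posting content from " + response.url)
                return
            # ... extract the item and post it to Facebook as before ...
            db[response.url] = str(True)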