The crawler crawls a specified page and follows allowed pages from that domain. The page is then parsed to extract a specific portion, using XPath selectors. The parsed content is then posted to a social networking site.
http://doc.scrapy.org/en/latest/topics/items.html + +from scrapy.item import Item, Field + + +class PostscraperItem(Item): + # define the fields for your item here like: + # TODO: get a factory for generating fields. + content = Field() + audio = Field() + title = Field() diff --git a/postScraper/postScraper/pipelines.py b/postScraper/postScraper/pipelines.py new file mode 100644 index 0000000..84d3f45 --- /dev/null +++ b/postScraper/postScraper/pipelines.py @@ -0,0 +1,8 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html + +class PostscraperPipeline(object): + def process_item(self, item, spider): + return item diff --git a/postScraper/postScraper/settings.py b/postScraper/postScraper/settings.py new file mode 100644 index 0000000..8d3f300 --- /dev/null +++ b/postScraper/postScraper/settings.py @@ -0,0 +1,16 @@ +# Scrapy settings for postScraper project +# +# For simplicity, this file contains only the most important settings by +# default. All the other settings are documented here: +# +# http://doc.scrapy.org/en/latest/topics/settings.html +# + +BOT_NAME = 'postScraper' + +SPIDER_MODULES = ['postScraper.spiders'] +NEWSPIDER_MODULE = 'postScraper.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'postScraper (+http://www.yourdomain.com)' +HTTPCACHE_ENABLED = True diff --git a/postScraper/postScraper/spiders/__init__.py b/postScraper/postScraper/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/postScraper/postScraper/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git a/postScraper/postScraper/spiders/swara_spider.py b/postScraper/postScraper/spiders/swara_spider.py new file mode 100644 index 0000000..cba07e7 --- /dev/null +++ b/postScraper/postScraper/spiders/swara_spider.py @@ -0,0 +1,40 @@ +from scrapy.contrib.spiders import CrawlSpider +from scrapy.contrib.spiders import Rule +from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor +from scrapy.selector import Selector +from scrapy.contrib.loader import ItemLoader + +from postScraper.items import PostscraperItem + +import facebook + + +class SwaraSpider(CrawlSpider): + name = 'swara' + allowed_domains = ["cgnetswara.org"] + start_urls = ["http://cgnetswara.org/"] + rules = (Rule(SgmlLinkExtractor(allow=r'\?id=\d+'), follow=True, + callback='parse_start'),) + + def parse_start(self, response): + xpath = Selector() + loader = ItemLoader(item=PostscraperItem(), response=response) + + loader.add_xpath('content', '//div[@class="report"]/p/text()') + loader.add_xpath('audio', + '//div[@class="audiobox"]/object/@data', + re='(audio\/\d+\.mp3)') + loader.add_xpath('title', '//div[@class="report"]/h3/text()') + + content = [{item: loader.get_collected_values(item)[0]} for item in + loader.load_item()] + + message = '{0}, {1}, http://cgnetswara.org/{2}'.format( + content[2]['title'].encode('utf8'), content[0]['content'].encode( + 'utf8'), + content[1]['audio'].encode('utf8')) + + graph = facebook.GraphAPI('CAAInjfaxO5kBAIGSm6cp7HQKpFDZAcRYIaGDswF5ZAZCpQZBQB2U5kZCyRLH2ShLZBkyGbJPDt8QzzB64WbrcZCuAo3rH5P7b1a59vTUK3m0CCZAGZCtDjwcTp7VwOlZCeCRmdcX9x6bsjNBrNgpwgRIcgbAI4cSnK2pHYlhOZBO5x5f4ZAsc5YGklj8xuxjg2Bu3ZB8ZD') + graph.put_object('me', 'feed', link=response.url, + description=content[0]['content'].encode('utf8'), + message="http://cgnetswara.org/"+content[1]['audio']) diff --git a/postScraper/scrapy.cfg b/postScraper/scrapy.cfg new file mode 100644 index 0000000..0336a61 --- /dev/null +++ b/postScraper/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy 
startproject +# +# For more information about the [deploy] section see: +# http://doc.scrapy.org/en/latest/topics/scrapyd.html + +[settings] +default = postScraper.settings + +[deploy] +#url = http://localhost:6800/ +project = postScraper diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..934bca2 --- /dev/null +++ b/setup.py @@ -0,0 +1,37 @@ +import os +from setuptools import setup, find_packages + +here = os.path.abspath(os.path.dirname(__file__)) +#README = open(os.path.join(here, 'README.rst')).read() +#CHANGES = open(os.path.join(here, 'CHANGES.rst')).read() + +requires = [ + 'scrapy', + 'facebook-sdk' + ] + +setup(name='Mouchak', + version='0.1', + description='Web framework', + license='BSD', + classifiers=[ + "Development Status :: 1 - alpha", + "Intended Audience :: Developers", + "Environment :: Web Environment", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: JavaScript", + "Programming Language :: Python", + "Programming Language :: Python :: 2.7", + "Topic :: Internet", + "Topic :: Internet :: WWW/HTTP :: Site Management", + ], + author='Zoso', + author_email='arvind@riseup.net', + url='https://git.pantoto.org/sweet-web/crawler.git', + keywords='', + packages=find_packages(), + include_package_data=True, + zip_safe=False, + install_requires=requires, + ) -- 1.7.10.4