The crawler crawls a specified page and follows allowed pages from that domain. The page is then parsed to extract a specific portion, using XPath selectors. The parsed content is then posted to a social networking site.
http://doc.scrapy.org/en/latest/topics/items.html + +from scrapy.item import Item, Field + + +class PostscraperItem(Item): + # define the fields for your item here like: + # TODO: get a factory for generating fields. + content = Field() + audio = Field() + title = Field() diff --git a/postScraper/postScraper/pipelines.py b/postScraper/postScraper/pipelines.py new file mode 100644 index 0000000..84d3f45 --- /dev/null +++ b/postScraper/postScraper/pipelines.py @@ -0,0 +1,8 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html + +class PostscraperPipeline(object): + def process_item(self, item, spider): + return item diff --git a/postScraper/postScraper/settings.py b/postScraper/postScraper/settings.py new file mode 100644 index 0000000..8d3f300 --- /dev/null +++ b/postScraper/postScraper/settings.py @@ -0,0 +1,16 @@ +# Scrapy settings for postScraper project +# +# For simplicity, this file contains only the most important settings by +# default. All the other settings are documented here: +# +# http://doc.scrapy.org/en/latest/topics/settings.html +# + +BOT_NAME = 'postScraper' + +SPIDER_MODULES = ['postScraper.spiders'] +NEWSPIDER_MODULE = 'postScraper.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'postScraper (+http://www.yourdomain.com)' +HTTPCACHE_ENABLED = True diff --git a/postScraper/postScraper/spiders/__init__.py b/postScraper/postScraper/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/postScraper/postScraper/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git a/postScraper/postScraper/spiders/swara_spider.py b/postScraper/postScraper/spiders/swara_spider.py new file mode 100644 index 0000000..cba07e7 --- /dev/null +++ b/postScraper/postScraper/spiders/swara_spider.py @@ -0,0 +1,40 @@ +from scrapy.contrib.spiders import CrawlSpider +from scrapy.contrib.spiders import Rule +from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor +from scrapy.selector import Selector +from scrapy.contrib.loader import ItemLoader + +from postScraper.items import PostscraperItem + +import facebook + + +class SwaraSpider(CrawlSpider): + name = 'swara' + allowed_domains = ["cgnetswara.org"] + start_urls = ["http://cgnetswara.org/"] + rules = (Rule(SgmlLinkExtractor(allow=r'\?id=\d+'), follow=True, + callback='parse_start'),) + + def parse_start(self, response): + xpath = Selector() + loader = ItemLoader(item=PostscraperItem(), response=response) + + loader.add_xpath('content', '//div[@class="report"]/p/text()') + loader.add_xpath('audio', + '//div[@class="audiobox"]/object/@data', + re='(audio\/\d+\.mp3)') + loader.add_xpath('title', '//div[@class="report"]/h3/text()') + + content = [{item: loader.get_collected_values(item)[0]} for item in + loader.load_item()] + + message = '{0}, {1}, http://cgnetswara.org/{2}'.format( + content[2]['title'].encode('utf8'), content[0]['content'].encode( + 'utf8'), + content[1]['audio'].encode('utf8')) + + graph = facebook.GraphAPI('CAAInjfaxO5kBAIGSm6cp7HQKpFDZAcRYIaGDswF5ZAZCpQZBQB2U5kZCyRLH2ShLZBkyGbJPDt8QzzB64WbrcZCuAo3rH5P7b1a59vTUK3m0CCZAGZCtDjwcTp7VwOlZCeCRmdcX9x6bsjNBrNgpwgRIcgbAI4cSnK2pHYlhOZBO5x5f4ZAsc5YGklj8xuxjg2Bu3ZB8ZD') + graph.put_object('me', 'feed', link=response.url, + description=content[0]['content'].encode('utf8'), + message="http://cgnetswara.org/"+content[1]['audio']) diff --git a/postScraper/scrapy.cfg b/postScraper/scrapy.cfg new file mode 100644 index 0000000..0336a61 --- /dev/null +++ b/postScraper/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy 
startproject +# +# For more information about the [deploy] section see: +# http://doc.scrapy.org/en/latest/topics/scrapyd.html + +[settings] +default = postScraper.settings + +[deploy] +#url = http://localhost:6800/ +project = postScraper diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..934bca2 --- /dev/null +++ b/setup.py @@ -0,0 +1,37 @@ +import os +from setuptools import setup, find_packages + +here = os.path.abspath(os.path.dirname(__file__)) +#README = open(os.path.join(here, 'README.rst')).read() +#CHANGES = open(os.path.join(here, 'CHANGES.rst')).read() + +requires = [ + 'scrapy', + 'facebook-sdk' + ] + +setup(name='Mouchak', + version='0.1', + description='Web framework', + license='BSD', + classifiers=[ + "Development Status :: 1 - alpha", + "Intended Audience :: Developers", + "Environment :: Web Environment", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: JavaScript", + "Programming Language :: Python", + "Programming Language :: Python :: 2.7", + "Topic :: Internet", + "Topic :: Internet :: WWW/HTTP :: Site Management", + ], + author='Zoso', + author_email='arvind@riseup.net', + url='https://git.pantoto.org/sweet-web/crawler.git', + keywords='', + packages=find_packages(), + include_package_data=True, + zip_safe=False, + install_requires=requires, + ) -- 1.7.10.4