Generating Scrapy Error 'ValueError - signal only works in main thread'

Project files (dumped from the Python REPL with os.walk and the Windows "type" command):

(base) D:\workspace\Jupyter\exp_52_scrapy\ourfirstscraper>
>>> for dirpath, subdirs, files in os.walk("."):
...     for f in files:
...         print(os.path.join(dirpath, f))
...         os.system("type " + os.path.join(dirpath, f))
...         print(" ----------------------------------------------- ")
...

.\client.py

import requests

headers = {'content-type': 'application/json'}

URL = "http://127.0.0.1:5050/helloworld"
r = requests.post(url = URL, json = {}, headers = headers)
print("Response text: " + r.text)

URL = "http://127.0.0.1:5050/helloworld"
r = requests.post(url = URL, json = {}, headers = headers)
print("Response text: " + r.text)

-----------------------------------------------

.\crawler.py

from scrapy.crawler import CrawlerProcess
from ourfirstscraper.spiders.survival8 import Survival8Spider

class MyCrawler():
    def start_crawler(self):
        print('inside start_crawler')
        process = CrawlerProcess()
        process.crawl(Survival8Spider, depth = 1, args={'callback': self.return_spider_output})
        process.start()

    def return_spider_output(self, response):
        print("return_spider_output")

-----------------------------------------------

.\scrapy.cfg

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = ourfirstscraper.settings

[deploy]
#url = http://localhost:6800/
project = ourfirstscraper

-----------------------------------------------

.\server.py

import re

# Flask Imports
from flask_cors import CORS, cross_origin
from flask import Flask, request
import os

from crawler import MyCrawler

app = Flask(__name__)
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'

# POST
@app.route("/helloworld", methods = ['POST'])
@cross_origin()
def helloWorld():
    print("Content-Type: " + request.headers['Content-Type'])
    if request.headers['Content-Type'] == 'text/plain':
        return "Request from client is: " + request.data
    elif(request.headers['Content-Type'] in ['application/json', 'application/json; charset=utf-8']):
        # For C#: request.headers['Content-Type'] == 'application/json; charset=utf-8'
        # For Python: request.headers['Content-Type'] == 'application/json'
        # For AngularJS: the "Content-Type" header is 'application/json'
        print("request.json: " + str(request.json))
        mcrawl = MyCrawler()
        mcrawl.start_crawler()
    return "Exiting helloWorld()"

# GET
@app.route("/")
@cross_origin() # allow all origins, all methods.
def hello():
    return "Hello, cross-origin-world!"

if __name__ == "__main__":
    app.run(host = "0.0.0.0", port = 5050)

-----------------------------------------------

.\__init__.py

-----------------------------------------------

.\ourfirstscraper\items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class OurfirstscraperItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

-----------------------------------------------

.\ourfirstscraper\middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class OurfirstscraperSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class OurfirstscraperDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

-----------------------------------------------

.\ourfirstscraper\pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class OurfirstscraperPipeline(object):
    def process_item(self, item, spider):
        return item

-----------------------------------------------

.\ourfirstscraper\settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for ourfirstscraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ourfirstscraper'

SPIDER_MODULES = ['ourfirstscraper.spiders']
NEWSPIDER_MODULE = 'ourfirstscraper.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ourfirstscraper (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ourfirstscraper.middlewares.OurfirstscraperSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ourfirstscraper.middlewares.OurfirstscraperDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'ourfirstscraper.pipelines.OurfirstscraperPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

-----------------------------------------------

.\ourfirstscraper\__init__.py

-----------------------------------------------

.\ourfirstscraper\spiders\survival8.py

# (base) D:\workspace\Jupyter\exp_52_scrapy>scrapy startproject ourfirstscraper
# New Scrapy project 'ourfirstscraper', using template directory 'c:\users\ashish\appdata\local\continuum\anaconda3\lib\site-packages\scrapy\templates\project', created in:
#     D:\workspace\Jupyter\exp_52_scrapy\ourfirstscraper
# You can start your first spider with:
#     cd ourfirstscraper
#     scrapy genspider example example.com

# (base) D:\workspace\Jupyter\exp_52_scrapy>cd ourfirstscraper
# (base) D:\workspace\Jupyter\exp_52_scrapy\ourfirstscraper>scrapy genspider survival8 survival8.blogspot.com
# Created spider 'survival8' using template 'basic' in module:
#   ourfirstscraper.spiders.survival8

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from urllib.parse import urljoin, urlsplit, urlunsplit, urlparse


class Survival8Spider(scrapy.Spider):
    name = 'survival8'
    allowed_domains = ['survival8.blogspot.com']
    start_urls = ['http://survival8.blogspot.com/p/index-of-lessons-in-technology.html']

    def __init__(self, *args, **kwargs):
        super(Survival8Spider, self).__init__(*args, **kwargs)
        if kwargs.get('depth'):
            self.d = int(kwargs.get('depth'))
        else:
            self.d = 0
            # Handle exception here.
            #print("Exception handled for the case of 'no url to start with!'")
            #raise Exception

    def parse(self, response):
        sites = response.xpath('//a')
        split_url = urlsplit(response.url)

        urls = []
        for site in sites:
            site = site.xpath('@href').extract()
            temp = []
            if len(site) > 0 and "mailto:" not in site[0]:
                if "://" in site[0]:
                    temp.append("" + site[0]) # "site[0]" is giving a list of all the characters in the URL string instead of a single string.
                else:
                    temp.append(split_url.scheme + "://" + split_url.netloc + site[0])
            urls.append(temp)

        for i in range(len(urls)):
            for url in urls[i]:
                request = Request(url, callback=self.extract_urls, cb_kwargs = dict(urls = urls))
                print("FROM PARSE")
                print(request)
                return request

    def extract_urls(self, response, urls):
        sites = response.xpath('//a')
        urls.append(sites)
        print("FROM EXTRACT_URLS")
        print(sites)
        return sites


class MyCrawler():
    def start_crawler(self):
        print('inside start_crawler')
        process = CrawlerProcess()
        process.crawl(Survival8Spider, depth = 1, args={'callback': self.return_spider_output})
        process.start()

    def return_spider_output(self, response):
        print("return_spider_output")

-----------------------------------------------

.\ourfirstscraper\spiders\__init__.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

-----------------------------------------------

>>>

LOGS:

(swagger) D:\workspace\Jupyter\exp_52_scrapy\ourfirstscraper>python server.py
 * Serving Flask app "server" (lazy loading)
 * Environment: production
   WARNING: This is a development server. Do not use it in a production deployment.
   Use a production WSGI server instead.
 * Debug mode: off
 * Running on http://0.0.0.0:5050/ (Press CTRL+C to quit)
Content-Type: application/json
request.json: {}
inside start_crawler
[2020-03-04 22:50:24,329] ERROR in app: Exception on /helloworld [POST]
Traceback (most recent call last):
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\app.py", line 2446, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\app.py", line 1951, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask_cors\extension.py", line 161, in wrapped_function
    return cors_after_request(app.make_response(f(*args, **kwargs)))
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\app.py", line 1820, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\_compat.py", line 39, in reraise
    raise value
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\app.py", line 1949, in full_dispatch_request
    rv = self.dispatch_request()
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\app.py", line 1935, in dispatch_request
    return self.view_functions[rule.endpoint](**req.view_args)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask_cors\decorator.py", line 128, in wrapped_function
    resp = make_response(f(*args, **kwargs))
  File "server.py", line 30, in helloWorld
    hcrawl.start_crawler()
  File "D:\workspace\Jupyter\exp_52_scrapy\ourfirstscraper\crawler.py", line 9, in start_crawler
    process = CrawlerProcess()
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\scrapy\crawler.py", line 268, in __init__
    install_shutdown_handlers(self._signal_shutdown)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\scrapy\utils\ossignal.py", line 22, in install_shutdown_handlers
    reactor._handleSignals()
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\twisted\internet\posixbase.py", line 295, in _handleSignals
    _SignalReactorMixin._handleSignals(self)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\twisted\internet\base.py", line 1243, in _handleSignals
    signal.signal(signal.SIGINT, self.sigInt)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\signal.py", line 47, in signal
    handler = _signal.signal(_enum_to_int(signalnum), _enum_to_int(handler))
ValueError: signal only works in main thread
127.0.0.1 - - [04/Feb/2019 22:50:24] "POST /helloworld HTTP/1.1" 500 -
Content-Type: application/json
request.json: {}
inside start_crawler
[2020-03-04 22:50:24,557] ERROR in app: Exception on /helloworld [POST]
Traceback (most recent call last):
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\app.py", line 2446, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\app.py", line 1951, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask_cors\extension.py", line 161, in wrapped_function
    return cors_after_request(app.make_response(f(*args, **kwargs)))
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\app.py", line 1820, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\_compat.py", line 39, in reraise
    raise value
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\app.py", line 1949, in full_dispatch_request
    rv = self.dispatch_request()
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask\app.py", line 1935, in dispatch_request
    return self.view_functions[rule.endpoint](**req.view_args)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\flask_cors\decorator.py", line 128, in wrapped_function
    resp = make_response(f(*args, **kwargs))
  File "server.py", line 30, in helloWorld
    hcrawl.start_crawler()
  File "D:\workspace\Jupyter\exp_52_scrapy\ourfirstscraper\crawler.py", line 9, in start_crawler
    process = CrawlerProcess()
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\scrapy\crawler.py", line 268, in __init__
    install_shutdown_handlers(self._signal_shutdown)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\scrapy\utils\ossignal.py", line 22, in install_shutdown_handlers
    reactor._handleSignals()
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\twisted\internet\posixbase.py", line 295, in _handleSignals
    _SignalReactorMixin._handleSignals(self)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\site-packages\twisted\internet\base.py", line 1243, in _handleSignals
    signal.signal(signal.SIGINT, self.sigInt)
  File "C:\Users\ashish\AppData\Local\Continuum\anaconda3\envs\swagger\lib\signal.py", line 47, in signal
    handler = _signal.signal(_enum_to_int(signalnum), _enum_to_int(handler))
ValueError: signal only works in main thread
127.0.0.1 - - [04/Feb/2019 22:50:24] "POST /helloworld HTTP/1.1" 500 -

Google Drive Link to code: https://drive.google.com/open?id=1D5OP2e6hKuekliN2pbDpQZJopafW-OnP
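Why the error occurs: the traceback shows that CrawlerProcess() calls install_shutdown_handlers(), which ends up in Twisted's reactor calling signal.signal(signal.SIGINT, ...). Python only allows signal handlers to be installed from the main thread, and the Flask development server runs each view (and therefore MyCrawler.start_crawler()) on a request-handling worker thread, so the call raises ValueError. The snippet below is a minimal, standalone sketch (not part of the project files above) that reproduces the same ValueError using only the standard library:

import signal
import threading

def install_sigint_handler():
    # Installing a signal handler is only permitted in the main thread.
    signal.signal(signal.SIGINT, signal.SIG_DFL)

# Works: this call runs in the main thread.
install_sigint_handler()
print("main thread: handler installed")

def worker():
    # Simulates what happens when CrawlerProcess() is created inside a Flask
    # view: the view code runs on a worker thread of the dev server.
    try:
        install_sigint_handler()
    except ValueError as e:
        # ValueError: signal only works in main thread (wording may vary by Python version)
        print("worker thread:", e)

t = threading.Thread(target=worker)
t.start()
t.join()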
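The project above is deliberately left in the state that generates the error. As a side note, one commonly used way to avoid it is to start the crawl in a child process, so that CrawlerProcess() runs in that process's main thread and is free to install its signal handlers. The variant of crawler.py below is only a hedged sketch along those lines (it is not code from the Google Drive link, and it assumes the same Survival8Spider as above):

# Hypothetical crawler.py variant: run the Scrapy crawl in a child process.
from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from ourfirstscraper.spiders.survival8 import Survival8Spider

def _run_spider(depth):
    # This function runs in the child process's main thread, so installing
    # SIGINT/SIGTERM handlers inside CrawlerProcess() is allowed there.
    process = CrawlerProcess()
    process.crawl(Survival8Spider, depth=depth)
    process.start()

class MyCrawler():
    def start_crawler(self):
        print('inside start_crawler')
        p = Process(target=_run_spider, args=(1,))
        p.start()
        p.join()  # wait for the crawl; drop this line to let it run in the background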