# This span is a blog-post dump containing three separate source files
# (a Flask server, a test client, and a Scrapy spider). They are
# reproduced below, cleaned up, in the original order. The post also
# flagged two hard-coded paths:
#   settings.py: FEED_URI = "C:/Users/ashish/Desktop/myscrapers/response_logs.json"
#   server_3.py: os.chdir(r"C:/Users/ashish/Desktop/myscrapers/ourfirstscraper")
# Both are made configurable below.

# ---------------------------------------------------------------------------
# File 1: server_3.py — Flask server that triggers the Scrapy crawl.
# ---------------------------------------------------------------------------
import os
import subprocess

from flask import Flask, request
from flask_cors import CORS, cross_origin

app = Flask(__name__)
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'

# Directory containing the Scrapy project. Originally hard-coded via
# os.chdir(); now taken from the environment, with the original path kept
# as a backward-compatible default.
SCRAPY_PROJECT_DIR = os.environ.get(
    "SCRAPY_PROJECT_DIR",
    r"C:/Users/ashish/Desktop/myscrapers/ourfirstscraper",
)


# POST endpoint: echoes plain-text bodies back; for JSON bodies, reads a
# 'start_urls' list and hands it to the 'xyz' Scrapy spider.
@app.route("/helloworld", methods=['POST'])
@cross_origin()
def helloWorld():
    content_type = request.headers['Content-Type']
    print("Content-Type: " + content_type)
    if content_type == 'text/plain':
        # BUG FIX: request.data is bytes in Python 3; the original
        # concatenated it with a str, which raises TypeError. Decode first.
        return "Request from client is: " + request.data.decode('utf-8')
    elif content_type in ['application/json', 'application/json; charset=utf-8']:
        # C# clients send 'application/json; charset=utf-8';
        # Python / AngularJS clients send plain 'application/json'.
        print("request.json: " + str(request.json))
        # SECURITY FIX: the original built a shell command from user input
        #   os.system('scrapy crawl xyz -a start_urls="' + ... + '"')
        # which is a shell-injection vector. Use an argument list with
        # shell=False, and pass cwd= instead of mutating process-global
        # state with os.chdir().
        subprocess.run(
            ["scrapy", "crawl", "xyz",
             "-a", "start_urls=" + ",".join(request.json['start_urls'])],
            cwd=SCRAPY_PROJECT_DIR,
            shell=False,
        )
    # Always return a response body; the original returned None for
    # unmatched content types, which makes Flask raise a 500.
    return "Exiting helloWorld()"


# GET endpoint: simple liveness check; allows all origins.
@app.route("/")
@cross_origin()
def hello():
    return "Hello, cross-origin-world!"


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5050)


# ---------------------------------------------------------------------------
# File 2: client_3.py — example client exercising both endpoints.
# (Kept as commented reference code: it belongs to a separate file and
# would otherwise execute at import time.)
# ---------------------------------------------------------------------------
# import requests
#
# headers = {'content-type': 'application/json'}
#
# URL = "http://127.0.0.1:5050/helloworld"
# r = requests.post(
#     url=URL,
#     json={'start_urls': [
#         'https://survival8.blogspot.com/p/index-of-lessons-in-technology.html'
#     ]},
#     headers=headers,
# )
# print("Response text: " + r.text)
#
# URL = "http://127.0.0.1:5050"
# r = requests.get(url=URL, data={}, headers=headers)
# print("Response text: " + r.text)


# ---------------------------------------------------------------------------
# File 3: xyz.py — the Scrapy spider.
# Run manually with:
#   scrapy crawl xyz -a start_urls="abc.com,xyz.in"
# (Kept as commented reference code: it belongs to the Scrapy project,
# not to this server file.)
# ---------------------------------------------------------------------------
# import scrapy
#
# class XyzSpider(scrapy.Spider):
#     name = 'xyz'
#
#     def __init__(self, *args, **kwargs):
#         super(XyzSpider, self).__init__(*args, **kwargs)
#         if kwargs.get('start_urls'):
#             self.start_urls = kwargs.get('start_urls').split(',')
#         else:
#             # FIX: the original printed a message then did a bare
#             # `raise Exception`; raise with a self-describing message
#             # instead (ValueError is a subclass of Exception, so any
#             # existing `except Exception` handlers still catch it).
#             raise ValueError("No start_urls supplied to spider 'xyz'")
#
#     def parse(self, response):
#         body = ';'.join(response.xpath('//a/text()').extract())
#         yield {'text': body}
#
# Ref: https://stackoverflow.com/questions/43193833/scrapy-error-notsupported-unsupported-url-scheme-no-handler-available-for
# Google Drive link to code:
# https://drive.google.com/open?id=1_xykiEWw5_TZg-m7cgqF-tq--0xNhxB4
Scrapy spider to retrieve URLs from any webpage URL passed by Flask API
Subscribe to:
Posts (Atom)
No comments:
Post a Comment