From 5a4aece3b1826e8315bf7c45c0c7c05448fe9e35 Mon Sep 17 00:00:00 2001 From: zhuangzhuang Date: Fri, 3 Feb 2017 12:29:58 +0800 Subject: [PATCH] refactoring webui --- pyspider/libs/samples/__init__.py | 21 ++ .../{sample_handler.py => samples/handler.py} | 6 +- pyspider/libs/samples/task.py | 10 + pyspider/processor/project_module.py | 1 + pyspider/run.py | 2 +- pyspider/webui/__init__.py | 130 ++++++++++- pyspider/webui/_compat.py | 16 ++ pyspider/webui/app.py | 57 +---- pyspider/webui/bench_test.py | 31 --- pyspider/webui/debug.py | 220 ------------------ pyspider/webui/login.py | 30 +-- pyspider/webui/templates/bench_test.html | 12 + pyspider/webui/templates/index.html | 8 +- pyspider/webui/view/__init__.py | 0 pyspider/webui/view/bench_test.py | 38 +++ pyspider/webui/view/debug.py | 198 ++++++++++++++++ pyspider/webui/{ => view}/index.py | 100 ++++---- pyspider/webui/{ => view}/result.py | 16 +- pyspider/webui/{ => view}/task.py | 59 ++--- pyspider/webui/webdav.py | 40 ++-- tests/test_webui.py | 6 +- 21 files changed, 568 insertions(+), 433 deletions(-) create mode 100644 pyspider/libs/samples/__init__.py rename pyspider/libs/{sample_handler.py => samples/handler.py} (83%) create mode 100644 pyspider/libs/samples/task.py create mode 100644 pyspider/webui/_compat.py delete mode 100644 pyspider/webui/bench_test.py delete mode 100644 pyspider/webui/debug.py create mode 100644 pyspider/webui/templates/bench_test.html create mode 100644 pyspider/webui/view/__init__.py create mode 100644 pyspider/webui/view/bench_test.py create mode 100644 pyspider/webui/view/debug.py rename pyspider/webui/{ => view}/index.py (59%) rename pyspider/webui/{ => view}/result.py (83%) rename pyspider/webui/{ => view}/task.py (59%) diff --git a/pyspider/libs/samples/__init__.py b/pyspider/libs/samples/__init__.py new file mode 100644 index 000000000..1f1a77b87 --- /dev/null +++ b/pyspider/libs/samples/__init__.py @@ -0,0 +1,21 @@ +# -*- encoding: utf-8 -*- +import datetime +from jinja2 import Template + + +def get_sample_task(): + from .task import default_task + return default_task + + +def get_sample_handler(project, start_url=None, date=None): + from pyspider.libs.samples import handler + import inspect + source = inspect.getsource(handler) + tp = Template(source) + if date is None: + date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if not start_url: + start_url = '__START_URL__' + res = tp.render(DATE=date, PROJECT_NAME=project, START_URL=start_url) + return res diff --git a/pyspider/libs/sample_handler.py b/pyspider/libs/samples/handler.py similarity index 83% rename from pyspider/libs/sample_handler.py rename to pyspider/libs/samples/handler.py index ecea6cd95..4415e166f 100644 --- a/pyspider/libs/sample_handler.py +++ b/pyspider/libs/samples/handler.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -# Created on __DATE__ -# Project: __PROJECT_NAME__ +# Created on {{DATE}} +# Project: {{PROJECT_NAME}} from pyspider.libs.base_handler import * @@ -12,7 +12,7 @@ class Handler(BaseHandler): @every(minutes=24 * 60) def on_start(self): - self.crawl('__START_URL__', callback=self.index_page) + self.crawl('{{START_URL}}', callback=self.index_page) @config(age=10 * 24 * 60 * 60) def index_page(self, response): diff --git a/pyspider/libs/samples/task.py b/pyspider/libs/samples/task.py new file mode 100644 index 000000000..d084c4935 --- /dev/null +++ b/pyspider/libs/samples/task.py @@ -0,0 +1,10 @@ +# -*- encoding: utf-8 -*- + +default_task = { + 'taskid': 'data:,on_start', + 'project': '', + 'url': 'data:,on_start', + 'process': { + 'callback': 'on_start', + } +} diff --git a/pyspider/processor/project_module.py b/pyspider/processor/project_module.py index 91512c264..138ca7c60 100644 --- a/pyspider/processor/project_module.py +++ b/pyspider/processor/project_module.py @@ -15,6 +15,7 @@ import inspect import traceback import linecache + from pyspider.libs import utils from pyspider.libs.log import SaveLogHandler, LogFormatter logger = logging.getLogger("processor") diff --git a/pyspider/run.py b/pyspider/run.py index c3ff6c1cb..7a91b82cf 100755 --- a/pyspider/run.py +++ b/pyspider/run.py @@ -321,7 +321,7 @@ def result_worker(ctx, result_cls, get_object=False): @click.option('--password', envvar='WEBUI_PASSWORD', help='password of lock -ed projects') @click.option('--need-auth', is_flag=True, default=False, help='need username and password') -@click.option('--webui-instance', default='pyspider.webui.app.app', callback=load_cls, +@click.option('--webui-instance', default='pyspider.webui.app', callback=load_cls, help='webui Flask Application instance to be used.') @click.option('--process-time-limit', default=30, help='script process time limit in debug') @click.pass_context diff --git a/pyspider/webui/__init__.py b/pyspider/webui/__init__.py index abbc7d707..bf013352a 100644 --- a/pyspider/webui/__init__.py +++ b/pyspider/webui/__init__.py @@ -5,4 +5,132 @@ # http://binux.me # Created on 2014-02-22 23:20:40 -from . import app, index, debug, task, result, login +import os +import sys + +import logging +from importlib import import_module +from flask import current_app +from flask import Blueprint, Response +from werkzeug.exceptions import Unauthorized + +from pyspider.libs import utils +from pyspider.processor.project_module import ProjectFinder +from .app import QuitableFlask +from pyspider.fetcher import tornado_fetcher +from ._compat import builtins, urljoin, reraise + +path = os.path + +base_dir = path.dirname(__file__) + +logger = logging.getLogger("webui") + + +def full_path(p): + return path.join(base_dir, p) + +if os.name == 'nt': + import mimetypes + mimetypes.add_type("text/css", ".css", True) + + +def _fetch(url): + return tornado_fetcher.Fetcher(None, None, async=False).fetch(url) + + +def init_config(app): + app.config.update({ + 'fetch': _fetch, + 'taskdb': None, + 'projectdb': None, + 'scheduler_rpc': None, + 'queues': dict(), + 'process_time_limit': 30, + 'login_response': Response("need auth.", 401, {'WWW-Authenticate': 'Basic realm="Login Required"'}) + }) + + +def init_jinja(app): + app.jinja_env.line_statement_prefix = '#' + app.jinja_env.globals.update(builtins.__dict__) + app.template_filter('format_date')(utils.format_date) + + +def init_session(app): + app.secret_key = os.urandom(24) + + +def init_view(app): + bp_modules = ('debug', 'task', 'index', 'bench_test', 'result') + for bp_module in bp_modules: + module = '.view.%s' % bp_module + module_instance = import_module(module, __name__) + bp = getattr(module_instance, 'bp') + if bp and isinstance(bp, Blueprint): + app.register_blueprint(bp) + + +def cdn_url_handler(error, endpoint, kwargs): + if endpoint == 'cdn': + path = kwargs.pop('path') + # cdn = app.config.get('cdn', 'http://cdn.staticfile.org/') + # cdn = app.config.get('cdn', '//cdnjs.cloudflare.com/ajax/libs/') + cdn = current_app.config.get('cdn', '//cdnjscn.b0.upaiyun.com/libs/') + return urljoin(cdn, path) + else: + exc_type, exc_value, tb = sys.exc_info() + if exc_value is error: + reraise(exc_type, exc_value, tb) + else: + raise error + + +def init_url_handler(app): + app.handle_url_build_error = cdn_url_handler + + +def init_login(app): + from .login import login_manager + login_manager.init_app(app) + + + from ._compat import login + + @app.before_request + def before_request(): + config = current_app.config + if config.get('need_auth', True): + if not login.current_user.is_active(): + return config['login_response'] + + +def init_project_import(app): + sys.meta_path.append(ProjectFinder(app.config['projectdb'])) + + +def init_webdav(app): + try: + from .webdav import init_webdav + init_webdav(app) + except ImportError as e: + logger.warning('WebDav interface not enabled: %r', e) + + +def create_app(): + static_folder = full_path('static') + template_folder = full_path('templates') + app = QuitableFlask(__name__, + static_folder=static_folder, + template_folder=template_folder) + init_config(app) + init_jinja(app) + init_session(app) + init_view(app) + init_url_handler(app) + init_login(app) + init_project_import(app) + return app + + +app = create_app() diff --git a/pyspider/webui/_compat.py b/pyspider/webui/_compat.py new file mode 100644 index 000000000..379f4afa1 --- /dev/null +++ b/pyspider/webui/_compat.py @@ -0,0 +1,16 @@ +# -*- coding: utf-8 -*- + +from six import reraise +from six.moves import builtins +from six.moves.urllib.parse import urljoin +from six import iteritems, itervalues + +try: + import flask_login as login +except ImportError: + from flask.ext import login + +try: + from urllib import urlencode +except ImportError: + from urllib.parse import urlencode \ No newline at end of file diff --git a/pyspider/webui/app.py b/pyspider/webui/app.py index e596337e1..062333797 100644 --- a/pyspider/webui/app.py +++ b/pyspider/webui/app.py @@ -5,20 +5,12 @@ # http://binux.me # Created on 2014-02-22 23:17:13 -import os -import sys import logging -logger = logging.getLogger("webui") -from six import reraise -from six.moves import builtins -from six.moves.urllib.parse import urljoin from flask import Flask -from pyspider.fetcher import tornado_fetcher +from werkzeug.wsgi import DispatcherMiddleware -if os.name == 'nt': - import mimetypes - mimetypes.add_type("text/css", ".css", True) +logger = logging.getLogger("webui") class QuitableFlask(Flask): @@ -56,15 +48,13 @@ def run(self, host=None, port=None, debug=None, **options): application = DebuggedApplication(application, True) try: - from .webdav import dav_app - except ImportError as e: - logger.warning('WebDav interface not enabled: %r', e) - dav_app = None - if dav_app: - from werkzeug.wsgi import DispatcherMiddleware + from .webdav import init_webdav + dev_app = init_webdav(self) application = DispatcherMiddleware(application, { - '/dav': dav_app + '/dav': dev_app }) + except ImportError as e: + pass container = tornado.wsgi.WSGIContainer(application) self.http_server = tornado.httpserver.HTTPServer(container) @@ -82,36 +72,3 @@ def quit(self): self.ioloop.add_callback(self.http_server.stop) self.ioloop.add_callback(self.ioloop.stop) self.logger.info('webui exiting...') - - -app = QuitableFlask('webui', - static_folder=os.path.join(os.path.dirname(__file__), 'static'), - template_folder=os.path.join(os.path.dirname(__file__), 'templates')) -app.secret_key = os.urandom(24) -app.jinja_env.line_statement_prefix = '#' -app.jinja_env.globals.update(builtins.__dict__) - -app.config.update({ - 'fetch': lambda x: tornado_fetcher.Fetcher(None, None, async=False).fetch(x), - 'taskdb': None, - 'projectdb': None, - 'scheduler_rpc': None, - 'queues': dict(), - 'process_time_limit': 30, -}) - - -def cdn_url_handler(error, endpoint, kwargs): - if endpoint == 'cdn': - path = kwargs.pop('path') - # cdn = app.config.get('cdn', 'http://cdn.staticfile.org/') - # cdn = app.config.get('cdn', '//cdnjs.cloudflare.com/ajax/libs/') - cdn = app.config.get('cdn', '//cdnjscn.b0.upaiyun.com/libs/') - return urljoin(cdn, path) - else: - exc_type, exc_value, tb = sys.exc_info() - if exc_value is error: - reraise(exc_type, exc_value, tb) - else: - raise error -app.handle_url_build_error = cdn_url_handler diff --git a/pyspider/webui/bench_test.py b/pyspider/webui/bench_test.py deleted file mode 100644 index 18d21e9ba..000000000 --- a/pyspider/webui/bench_test.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- -# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: -# Author: Binux -# http://binux.me -# Created on 2014-12-08 22:31:17 - -import random -try: - from urllib import urlencode -except ImportError: - from urllib.parse import urlencode - -from flask import request -from .app import app - - -@app.route('/bench') -def bench_test(): - total = int(request.args.get('total', 10000)) - show = int(request.args.get('show', 20)) - nlist = [random.randint(1, total) for _ in range(show)] - result = [] - result.append("") - args = dict(request.args) - for nl in nlist: - args['n'] = nl - argstr = urlencode(sorted(args.items()), doseq=True) - result.append("follow {1}
".format(argstr, nl)) - result.append("") - return "".join(result) diff --git a/pyspider/webui/debug.py b/pyspider/webui/debug.py deleted file mode 100644 index 6a0694139..000000000 --- a/pyspider/webui/debug.py +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- -# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: -# Author: Binux -# http://binux.me -# Created on 2014-02-23 00:19:06 - - -import sys -import time -import socket -import inspect -import datetime -import traceback -from flask import render_template, request, json - -try: - import flask_login as login -except ImportError: - from flask.ext import login - -from pyspider.libs import utils, sample_handler, dataurl -from pyspider.libs.response import rebuild_response -from pyspider.processor.project_module import ProjectManager, ProjectFinder -from .app import app - -default_task = { - 'taskid': 'data:,on_start', - 'project': '', - 'url': 'data:,on_start', - 'process': { - 'callback': 'on_start', - }, -} -default_script = inspect.getsource(sample_handler) - - -@app.route('/debug/', methods=['GET', 'POST']) -def debug(project): - projectdb = app.config['projectdb'] - if not projectdb.verify_project_name(project): - return 'project name is not allowed!', 400 - info = projectdb.get(project, fields=['name', 'script']) - if info: - script = info['script'] - else: - script = (default_script - .replace('__DATE__', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - .replace('__PROJECT_NAME__', project) - .replace('__START_URL__', request.values.get('start-urls') or '__START_URL__')) - - taskid = request.args.get('taskid') - if taskid: - taskdb = app.config['taskdb'] - task = taskdb.get_task( - project, taskid, ['taskid', 'project', 'url', 'fetch', 'process']) - else: - task = default_task - - default_task['project'] = project - return render_template("debug.html", task=task, script=script, project_name=project) - - -@app.before_first_request -def enable_projects_import(): - sys.meta_path.append(ProjectFinder(app.config['projectdb'])) - - -@app.route('/debug//run', methods=['POST', ]) -def run(project): - start_time = time.time() - try: - task = utils.decode_unicode_obj(json.loads(request.form['task'])) - except Exception: - result = { - 'fetch_result': "", - 'logs': u'task json error', - 'follows': [], - 'messages': [], - 'result': None, - 'time': time.time() - start_time, - } - return json.dumps(utils.unicode_obj(result)), \ - 200, {'Content-Type': 'application/json'} - - project_info = { - 'name': project, - 'status': 'DEBUG', - 'script': request.form['script'], - } - - if request.form.get('webdav_mode') == 'true': - projectdb = app.config['projectdb'] - info = projectdb.get(project, fields=['name', 'script']) - if not info: - result = { - 'fetch_result': "", - 'logs': u' in wevdav mode, cannot load script', - 'follows': [], - 'messages': [], - 'result': None, - 'time': time.time() - start_time, - } - return json.dumps(utils.unicode_obj(result)), \ - 200, {'Content-Type': 'application/json'} - project_info['script'] = info['script'] - - fetch_result = {} - try: - module = ProjectManager.build_module(project_info, { - 'debugger': True, - 'process_time_limit': app.config['process_time_limit'], - }) - - # The code below is to mock the behavior that crawl_config been joined when selected by scheduler. - # but to have a better view of joined tasks, it has been done in BaseHandler.crawl when `is_debugger is True` - # crawl_config = module['instance'].crawl_config - # task = module['instance'].task_join_crawl_config(task, crawl_config) - - fetch_result = app.config['fetch'](task) - response = rebuild_response(fetch_result) - - ret = module['instance'].run_task(module['module'], task, response) - except Exception: - type, value, tb = sys.exc_info() - tb = utils.hide_me(tb, globals()) - logs = ''.join(traceback.format_exception(type, value, tb)) - result = { - 'fetch_result': fetch_result, - 'logs': logs, - 'follows': [], - 'messages': [], - 'result': None, - 'time': time.time() - start_time, - } - else: - result = { - 'fetch_result': fetch_result, - 'logs': ret.logstr(), - 'follows': ret.follows, - 'messages': ret.messages, - 'result': ret.result, - 'time': time.time() - start_time, - } - result['fetch_result']['content'] = response.text - if (response.headers.get('content-type', '').startswith('image')): - result['fetch_result']['dataurl'] = dataurl.encode( - response.content, response.headers['content-type']) - - try: - # binary data can't encode to JSON, encode result as unicode obj - # before send it to frontend - return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'} - except Exception: - type, value, tb = sys.exc_info() - tb = utils.hide_me(tb, globals()) - logs = ''.join(traceback.format_exception(type, value, tb)) - result = { - 'fetch_result': "", - 'logs': logs, - 'follows': [], - 'messages': [], - 'result': None, - 'time': time.time() - start_time, - } - return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'} - - -@app.route('/debug//save', methods=['POST', ]) -def save(project): - projectdb = app.config['projectdb'] - if not projectdb.verify_project_name(project): - return 'project name is not allowed!', 400 - script = request.form['script'] - project_info = projectdb.get(project, fields=['name', 'status', 'group']) - if project_info and 'lock' in projectdb.split_group(project_info.get('group')) \ - and not login.current_user.is_active(): - return app.login_response - - if project_info: - info = { - 'script': script, - } - if project_info.get('status') in ('DEBUG', 'RUNNING', ): - info['status'] = 'CHECKING' - projectdb.update(project, info) - else: - info = { - 'name': project, - 'script': script, - 'status': 'TODO', - 'rate': app.config.get('max_rate', 1), - 'burst': app.config.get('max_burst', 3), - } - projectdb.insert(project, info) - - rpc = app.config['scheduler_rpc'] - if rpc is not None: - try: - rpc.update_project() - except socket.error as e: - app.logger.warning('connect to scheduler rpc error: %r', e) - return 'rpc error', 200 - - return 'ok', 200 - - -@app.route('/debug//get') -def get_script(project): - projectdb = app.config['projectdb'] - if not projectdb.verify_project_name(project): - return 'project name is not allowed!', 400 - info = projectdb.get(project, fields=['name', 'script']) - return json.dumps(utils.unicode_obj(info)), \ - 200, {'Content-Type': 'application/json'} - - -@app.route('/blank.html') -def blank_html(): - return "" diff --git a/pyspider/webui/login.py b/pyspider/webui/login.py index d32d5b73a..519be2186 100644 --- a/pyspider/webui/login.py +++ b/pyspider/webui/login.py @@ -6,15 +6,12 @@ # Created on 2014-12-10 20:36:27 import base64 -from flask import Response -try: - import flask_login as login -except ImportError: - from flask.ext import login -from .app import app + +from flask import current_app +from ._compat import login + login_manager = login.LoginManager() -login_manager.init_app(app) class AnonymousUser(login.AnonymousUserMixin): @@ -39,10 +36,11 @@ def __init__(self, id, password): self.password = password def is_authenticated(self): - if not app.config.get('webui_username'): + config = current_app.config + if not config.get('webui_username'): return True - if self.id == app.config.get('webui_username') \ - and self.password == app.config.get('webui_password'): + if self.id == config.get('webui_username') \ + and self.password == config.get('webui_password'): return True return False @@ -62,16 +60,6 @@ def load_user_from_request(request): api_key = base64.b64decode(api_key).decode('utf8') return User(*api_key.split(":", 1)) except Exception as e: - app.logger.error('wrong api key: %r, %r', api_key, e) + current_app.logger.error('wrong api key: %r, %r', api_key, e) return None return None -app.login_response = Response( - "need auth.", 401, {'WWW-Authenticate': 'Basic realm="Login Required"'} -) - - -@app.before_request -def before_request(): - if app.config.get('need_auth', False): - if not login.current_user.is_active(): - return app.login_response diff --git a/pyspider/webui/templates/bench_test.html b/pyspider/webui/templates/bench_test.html new file mode 100644 index 000000000..c472974e1 --- /dev/null +++ b/pyspider/webui/templates/bench_test.html @@ -0,0 +1,12 @@ + + + + + Bench Test + + +{%for item in items %} + follow {{item.name}}
+{% endfor %} + + \ No newline at end of file diff --git a/pyspider/webui/templates/index.html b/pyspider/webui/templates/index.html index 6ffd19540..2051fcc67 100644 --- a/pyspider/webui/templates/index.html +++ b/pyspider/webui/templates/index.html @@ -59,7 +59,7 @@

pyspider dashboard

{% if config.scheduler_rpc is not none %} - Recent Active Tasks + Recent Active Tasks {% endif %}
@@ -127,7 +127,7 @@ {% raw %} {{ project.group }} - {{* project.name }} + {{* project.name }} {{ project.paused ? 'PAUSED' : project.status }} @@ -162,12 +162,12 @@ # if config.scheduler_rpc is not none: {% raw %} - Active Tasks + Active Tasks {% endraw %} # endif # if config.resultdb: {% raw %} - Results + Results {% endraw %} # endif diff --git a/pyspider/webui/view/__init__.py b/pyspider/webui/view/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pyspider/webui/view/bench_test.py b/pyspider/webui/view/bench_test.py new file mode 100644 index 000000000..e9e696cfc --- /dev/null +++ b/pyspider/webui/view/bench_test.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: +# Author: Binux +# http://binux.me +# Created on 2014-12-08 22:31:17 + +import random + +from flask import Blueprint +from flask import request, render_template + +from pyspider.webui._compat import urlencode + + +bp = Blueprint("bench_test", __name__, url_prefix="/bench") + + +class Item(object): + def __init__(self, arg, name): + self.arg = arg + self.name = name + + +@bp.route('/') +def bench_test(): + args = request.args + total = int(args.get('total', 10000)) + show = int(args.get('show', 20)) + nlist = [random.randint(1, total) for _ in range(show)] + items = [] + args_ = dict(args) + for nl in nlist: + args_['n'] = nl + argstr = urlencode(sorted(args_.items()), doseq=True) + item = Item(argstr, nl) + items.append(item) + return render_template("bench_test.html", items=items) diff --git a/pyspider/webui/view/debug.py b/pyspider/webui/view/debug.py new file mode 100644 index 000000000..0124a4989 --- /dev/null +++ b/pyspider/webui/view/debug.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: +# Author: Binux +# http://binux.me +# Created on 2014-02-23 00:19:06 + + +import sys +import time +import socket + +import traceback +from flask import render_template, request, json, current_app, jsonify +from flask import Blueprint +from werkzeug.exceptions import HTTPException + +from pyspider.libs import utils, dataurl +from pyspider.libs.response import rebuild_response +from pyspider.processor.project_module import ProjectManager, ProjectFinder +from pyspider.libs.samples import get_sample_handler, get_sample_task +from pyspider.webui._compat import login + + +bp = Blueprint("debug", __name__, url_prefix='/debug') + + +class Result(object): + def __init__(self, fetch_result='', logs=u'', follows=None, messages=None, result=None): + if follows is None: + follows = [] + if messages is None: + messages = [] + self.fetch_result = fetch_result + self.logs= logs + self.follows = follows + self.messages = messages + self.result = result + self.time = 0 + + def render(self, start_time=None): + if start_time is not None: + self.time = time.time() - start_time + data = { + 'fetch_result': self.fetch_result, + 'logs': self.logs, + 'follows': self.follows, + 'messages': self.messages, + 'result': self.result, + 'time': self.time + } + return utils.unicode_dict(data) + + +def check_project(projectdb, project): + if not projectdb.verify_project_name(project): + raise HTTPException("project name is not allowed!", 400) + + +@bp.route('/', methods=['GET', 'POST']) +def debug(project): + config = current_app.config + projectdb = config['projectdb'] + check_project(projectdb, project) + + info = projectdb.get(project, fields=['name', 'script']) + if info: + script = info['script'] + else: + script = get_sample_handler(project, request.values.get('start-urls')) + + taskid = request.args.get('taskid') + if taskid: + taskdb = config['taskdb'] + task = taskdb.get_task( + project, taskid, ['taskid', 'project', 'url', 'fetch', 'process']) + else: + task = get_sample_task() + + return render_template("debug.html", task=task, script=script, project_name=project) + + +@bp.route('//run', methods=['POST', ]) +def run(project): + config = current_app.config + start_time = time.time() + try: + task = utils.decode_unicode_obj(json.loads(request.form['task'])) + except Exception: + res = Result(logs=u'task json error') + return jsonify(res.render(start_time)) + + project_info = { + 'name': project, + 'status': 'DEBUG', + 'script': request.form['script'], + } + + if request.form.get('webdav_mode') == 'true': + projectdb = config['projectdb'] + info = projectdb.get(project, fields=['name', 'script']) + if not info: + res = Result(logs=u' in wevdav mode, cannot load script') + return jsonify(res.render(start_time)) + project_info['script'] = info['script'] + + fetch_result = {} + try: + module = ProjectManager.build_module(project_info, { + 'debugger': True, + 'process_time_limit': config['process_time_limit'], + }) + + # The code below is to mock the behavior that crawl_config been joined when selected by scheduler. + # but to have a better view of joined tasks, it has been done in BaseHandler.crawl when `is_debugger is True` + # crawl_config = module['instance'].crawl_config + # task = module['instance'].task_join_crawl_config(task, crawl_config) + + fetch_result = config['fetch'](task) + response = rebuild_response(fetch_result) + + ret = module['instance'].run_task(module['module'], task, response) + except Exception: + type, value, tb = sys.exc_info() + tb = utils.hide_me(tb, globals()) + logs = ''.join(traceback.format_exception(type, value, tb)) + res = Result(fetch_result=fetch_result, logs=logs) + else: + fetch_result['content'] = response.text + if response.headers.get('content-type', '').startswith('image'): + fetch_result['dataurl'] = dataurl.encode( + response.content, response.headers['content-type']) + res = Result(fetch_result=fetch_result, logs=ret.logstr, + follows=ret.follows, messages=ret.messages, result=ret.result) + + try: + # binary data can't encode to JSON, encode result as unicode obj + # before send it to frontend + return jsonify(res.render(start_time)) + except Exception: + type, value, tb = sys.exc_info() + tb = utils.hide_me(tb, globals()) + logs = ''.join(traceback.format_exception(type, value, tb)) + res = Result(logs=logs) + return jsonify(res.render(start_time)) + + +@bp.route('//save', methods=['POST', ]) +def save(project): + config = current_app.config + projectdb = config['projectdb'] + check_project(projectdb, project) + + script = request.form['script'] + project_info = projectdb.get(project, fields=['name', 'status', 'group']) + if project_info and 'lock' in projectdb.split_group(project_info.get('group')) \ + and not login.current_user.is_active(): + return config['login_response'] + + if project_info: + info = { + 'script': script, + } + if project_info.get('status') in ('DEBUG', 'RUNNING', ): + info['status'] = 'CHECKING' + projectdb.update(project, info) + else: + info = { + 'name': project, + 'script': script, + 'status': 'TODO', + 'rate': config.get('max_rate', 1), + 'burst': config.get('max_burst', 3), + } + projectdb.insert(project, info) + + rpc = config['scheduler_rpc'] + if rpc is not None: + try: + rpc.update_project() + except socket.error as e: + current_app.logger.warning('connect to scheduler rpc error: %r', e) + return 'rpc error' + return 'ok' + + +@bp.route('//get') +def get_script(project): + projectdb = current_app.config['projectdb'] + check_project(projectdb, project) + + info = projectdb.get(project, fields=['name', 'script']) + return jsonify(utils.unicode_obj(info)) + + +@bp.route('/blank.html') +def blank_html(): + return "" diff --git a/pyspider/webui/index.py b/pyspider/webui/view/index.py similarity index 59% rename from pyspider/webui/index.py rename to pyspider/webui/view/index.py index 194ae47ce..743835d8f 100644 --- a/pyspider/webui/index.py +++ b/pyspider/webui/view/index.py @@ -6,70 +6,73 @@ # Created on 2014-02-22 23:20:39 import socket +from flask import Blueprint +from flask import render_template, request, json, current_app, jsonify +from pandas.compat import iteritems +from werkzeug.exceptions import HTTPException -from six import iteritems, itervalues -from flask import render_template, request, json +from pyspider.webui._compat import login -try: - import flask_login as login -except ImportError: - from flask.ext import login -from .app import app +bp = Blueprint('index', __name__) + index_fields = ['name', 'group', 'status', 'comments', 'rate', 'burst', 'updatetime'] -@app.route('/') +@bp.route('/') def index(): - projectdb = app.config['projectdb'] + projectdb = current_app.config['projectdb'] projects = sorted(projectdb.get_all(fields=index_fields), key=lambda k: (0 if k['group'] else 1, k['group'] or '', k['name'])) return render_template("index.html", projects=projects) -@app.route('/queues') -def get_queues(): - def try_get_qsize(queue): - if queue is None: - return 'None' - try: - return queue.qsize() - except Exception as e: - return "%r" % e +def try_get_qsize(queue): + if queue is None: + return 'None' + try: + return queue.qsize() + except Exception as e: + return "%r" % e + +@bp.route('/queues') +def get_queues(): result = {} - queues = app.config.get('queues', {}) + queues = current_app.config.get('queues', {}) for key in queues: result[key] = try_get_qsize(queues[key]) - return json.dumps(result), 200, {'Content-Type': 'application/json'} + return jsonify(result) -@app.route('/update', methods=['POST', ]) +@bp.route('/update', methods=['POST', ]) def project_update(): - projectdb = app.config['projectdb'] + config = current_app.config + projectdb = config['projectdb'] project = request.form['pk'] name = request.form['name'] value = request.form['value'] project_info = projectdb.get(project, fields=('name', 'group')) if not project_info: - return "no such project.", 404 + raise HTTPException("no such project.", 404) if 'lock' in projectdb.split_group(project_info.get('group')) \ and not login.current_user.is_active(): - return app.login_response + return current_app.login_response if name not in ('group', 'status', 'rate'): - return 'unknown field: %s' % name, 400 + msg = 'unknown field: %s' % name + raise HTTPException(msg, 400) if name == 'rate': value = value.split('/') if len(value) != 2: - return 'format error: rate/burst', 400 + raise HTTPException('format error: rate/burst', 400) rate = float(value[0]) burst = float(value[1]) update = { - 'rate': min(rate, app.config.get('max_rate', rate)), - 'burst': min(burst, app.config.get('max_burst', burst)), + 'rate': min(rate, config.get('max_rate', rate)), + 'burst': min(burst, config.get('max_burst', burst)), } else: update = { @@ -78,21 +81,22 @@ def project_update(): ret = projectdb.update(project, update) if ret: - rpc = app.config['scheduler_rpc'] + rpc = config['scheduler_rpc'] if rpc is not None: try: rpc.update_project() except socket.error as e: - app.logger.warning('connect to scheduler rpc error: %r', e) - return 'rpc error', 200 - return 'ok', 200 + current_app.logger.warning('connect to scheduler rpc error: %r', e) + return 'rpc error' + return 'ok' else: - return 'update error', 500 + raise HTTPException("update error", 500) -@app.route('/counter') +@bp.route('/counter') def counter(): - rpc = app.config['scheduler_rpc'] + config = current_app.config + rpc = config['scheduler_rpc'] if rpc is None: return json.dumps({}) @@ -105,26 +109,26 @@ def counter(): for project, paused in iteritems(data['pause_status']): result.setdefault(project, {})['paused'] = paused except socket.error as e: - app.logger.warning('connect to scheduler rpc error: %r', e) - return json.dumps({}), 200, {'Content-Type': 'application/json'} + current_app.logger.warning('connect to scheduler rpc error: %r', e) + return jsonify({}) + return jsonify(result) - return json.dumps(result), 200, {'Content-Type': 'application/json'} - -@app.route('/run', methods=['POST', ]) +@bp.route('/run', methods=['POST', ]) def runtask(): - rpc = app.config['scheduler_rpc'] + config = current_app.config + rpc = config['scheduler_rpc'] if rpc is None: - return json.dumps({}) + return jsonify({}) - projectdb = app.config['projectdb'] + projectdb = config['projectdb'] project = request.form['project'] project_info = projectdb.get(project, fields=('name', 'group')) if not project_info: return "no such project.", 404 if 'lock' in projectdb.split_group(project_info.get('group')) \ and not login.current_user.is_active(): - return app.login_response + return current_app.login_response newtask = { "project": project, @@ -140,15 +144,15 @@ def runtask(): }, } + ret = False try: ret = rpc.newtask(newtask) except socket.error as e: - app.logger.warning('connect to scheduler rpc error: %r', e) - return json.dumps({"result": False}), 200, {'Content-Type': 'application/json'} - return json.dumps({"result": ret}), 200, {'Content-Type': 'application/json'} + current_app.logger.warning('connect to scheduler rpc error: %r', e) + return jsonify({"result": ret}) -@app.route('/robots.txt') +@bp.route('/robots.txt') def robots(): return """User-agent: * Disallow: / diff --git a/pyspider/webui/result.py b/pyspider/webui/view/result.py similarity index 83% rename from pyspider/webui/result.py rename to pyspider/webui/view/result.py index 84305bb31..1fcc48623 100644 --- a/pyspider/webui/result.py +++ b/pyspider/webui/view/result.py @@ -7,15 +7,17 @@ from __future__ import unicode_literals -from flask import render_template, request, json -from flask import Response -from .app import app +from flask import render_template, request, json, current_app +from flask import Response, Blueprint from pyspider.libs import result_dump -@app.route('/results') +bp = Blueprint("result", __name__, url_prefix="/results") + + +@bp.route('/') def result(): - resultdb = app.config['resultdb'] + resultdb = current_app.config['resultdb'] project = request.args.get('project') offset = int(request.args.get('offset', 0)) limit = int(request.args.get('limit', 20)) @@ -30,9 +32,9 @@ def result(): ) -@app.route('/results/dump/.<_format>') +@bp.route('/dump/.<_format>') def dump_result(project, _format): - resultdb = app.config['resultdb'] + resultdb = current_app.config['resultdb'] # force update project list resultdb.get(project, 'any') if project not in resultdb.projects: diff --git a/pyspider/webui/task.py b/pyspider/webui/view/task.py similarity index 59% rename from pyspider/webui/task.py rename to pyspider/webui/view/task.py index a407da0c1..15a8bf69d 100644 --- a/pyspider/webui/task.py +++ b/pyspider/webui/view/task.py @@ -6,57 +6,63 @@ # Created on 2014-07-16 15:30:57 import socket -from flask import abort, render_template, request, json - +from flask import abort, render_template, request, jsonify, current_app +from flask import Blueprint from pyspider.libs import utils -from .app import app -@app.route('/task/') +bp = Blueprint("task", __name__, url_prefix="/task") + + +@bp.route('/task/') def task(taskid): + config = current_app.config if ':' not in taskid: abort(400) project, taskid = taskid.split(':', 1) - taskdb = app.config['taskdb'] + taskdb = config['taskdb'] task = taskdb.get_task(project, taskid) if not task: abort(404) - resultdb = app.config['resultdb'] + resultdb = config['resultdb'] + result = None if resultdb: result = resultdb.get(project, taskid) - return render_template("task.html", task=task, json=json, result=result, - status_to_string=app.config['taskdb'].status_to_string) + return render_template("task.html", task=task, json=jsonify, result=result, + status_to_string=config['taskdb'].status_to_string) -@app.route('/task/.json') +@bp.route('/.json') def task_in_json(taskid): + config = current_app.config if ':' not in taskid: - return json.jsonify({'code': 400, 'error': 'bad project:task_id format'}) + return jsonify({'code': 400, 'error': 'bad project:task_id format'}) project, taskid = taskid.split(':', 1) - taskdb = app.config['taskdb'] + taskdb = config['taskdb'] task = taskdb.get_task(project, taskid) if not task: - return json.jsonify({'code': 404, 'error': 'not found'}) - task['status_string'] = app.config['taskdb'].status_to_string(task['status']) - return json.jsonify(task) + return jsonify({'code': 404, 'error': 'not found'}) + task['status_string'] = config['taskdb'].status_to_string(task['status']) + return jsonify(task) -@app.route('/tasks') +@bp.route('/tasks') def tasks(): - rpc = app.config['scheduler_rpc'] - taskdb = app.config['taskdb'] + config = current_app.config + rpc = config['scheduler_rpc'] + taskdb = config['taskdb'] project = request.args.get('project', "") limit = int(request.args.get('limit', 100)) try: updatetime_tasks = rpc.get_active_tasks(project, limit) except socket.error as e: - app.logger.warning('connect to scheduler rpc error: %r', e) + current_app.logger.warning('connect to scheduler rpc error: %r', e) return 'connect to scheduler error', 502 tasks = {} @@ -76,18 +82,21 @@ def tasks(): ) -@app.route('/active_tasks') +@bp.route('/active_tasks') def active_tasks(): - rpc = app.config['scheduler_rpc'] - taskdb = app.config['taskdb'] + config = current_app.config + rpc = config['scheduler_rpc'] + taskdb = config['taskdb'] project = request.args.get('project', "") limit = int(request.args.get('limit', 100)) try: tasks = rpc.get_active_tasks(project, limit) except socket.error as e: - app.logger.warning('connect to scheduler rpc error: %r', e) - return '{}', 502, {'Content-Type': 'application/json'} + current_app.logger.warning('connect to scheduler rpc error: %r', e) + res = jsonify({}) + res.state = 502 + return res result = [] for updatetime, task in tasks: @@ -97,6 +106,4 @@ def active_tasks(): task['status_text'] = taskdb.status_to_string(task['status']) result.append(task) - return json.dumps(result), 200, {'Content-Type': 'application/json'} - -app.template_filter('format_date')(utils.format_date) + return jsonify(result) diff --git a/pyspider/webui/webdav.py b/pyspider/webui/webdav.py index 886eb77b8..9bd3d6496 100644 --- a/pyspider/webui/webdav.py +++ b/pyspider/webui/webdav.py @@ -10,15 +10,18 @@ import time import base64 import six + from six import BytesIO +from flask import current_app from wsgidav.wsgidav_app import DEFAULT_CONFIG, WsgiDAVApp from wsgidav.dav_provider import DAVProvider, DAVCollection, DAVNonCollection from wsgidav.dav_error import DAVError, HTTP_FORBIDDEN + from pyspider.libs.utils import utf8, text -from .app import app def check_user(environ): + config = current_app.config authheader = environ.get("HTTP_AUTHORIZATION") if not authheader: return False @@ -26,11 +29,11 @@ def check_user(environ): try: username, password = text(base64.b64decode(authheader)).split(':', 1) except Exception as e: - app.logger.error('wrong api key: %r, %r', authheader, e) + current_app.logger.error('wrong api key: %r, %r', authheader, e) return False - if username == app.config['webui_username'] \ - and password == app.config['webui_password']: + if username == config['webui_username'] \ + and password == config['webui_password']: return True else: return False @@ -200,17 +203,18 @@ def authDomainUser(self, realmname, username, password, environ): and password == self.app.config.get('webui_password') -config = DEFAULT_CONFIG.copy() -config.update({ - 'mount_path': '/dav', - 'provider_mapping': { - '/': ScriptProvider(app) - }, - 'domaincontroller': NeedAuthController(app), - 'verbose': 1 if app.debug else 0, - 'dir_browser': {'davmount': False, - 'enable': True, - 'msmount': False, - 'response_trailer': ''}, -}) -dav_app = WsgiDAVApp(config) +def init_webdav(app): + config = DEFAULT_CONFIG.copy() + config.update({ + 'mount_path': '/dav', + 'provider_mapping': { + '/': ScriptProvider(app) + }, + 'domaincontroller': NeedAuthController(app), + 'verbose': 1 if app.debug else 0, + 'dir_browser': {'davmount': False, + 'enable': True, + 'msmount': False, + 'response_trailer': ''}, + }) + dav_app = WsgiDAVApp(config) diff --git a/tests/test_webui.py b/tests/test_webui.py index 32b6c1a95..ab7afadef 100644 --- a/tests/test_webui.py +++ b/tests/test_webui.py @@ -295,7 +295,7 @@ def test_a15_queues(self): self.assertIn('status_queue', data) def test_a20_tasks(self): - rv = self.app.get('/tasks') + rv = self.app.get('/task/tasks') self.assertEqual(rv.status_code, 200, rv.data) self.assertIn(b'SUCCESS', rv.data) self.assertNotIn(b'>ERROR', rv.data) @@ -314,7 +314,7 @@ def test_a20_tasks(self): self.assertNotIn(b'>ERROR', rv.data) def test_a22_active_tasks(self): - rv = self.app.get('/active_tasks') + rv = self.app.get('/task/active_tasks') data = json.loads(utils.text(rv.data)) track = False self.assertGreater(len(data), 0) @@ -551,7 +551,7 @@ def test_x40_debug_save(self): self.assertNotIn(b'ok', rv.data) def test_x50_tasks(self): - rv = self.app.get('/tasks') + rv = self.app.get('/task/tasks') self.assertEqual(rv.status_code, 502) def test_x60_robots(self):