added SCRAPYD_SERVER HTTPS support #230

Open
wants to merge 1 commit into base: master
added SCRAPYD_SERVER HTTPS support
ritikkumarsahu committed Mar 22, 2024
commit d7a0d8c92d35d9b5356be2527de1f3546d3c35ca
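Under this change, a Scrapyd server entry can carry an explicit protocol. A minimal settings sketch, assuming the 6-element tuple form introduced in check_app_config.py below; the host, port, and group values are illustrative, not from this PR:

```python
# Hypothetical scrapydweb_settings excerpt (illustrative values).
SCRAPYD_SERVERS = [
    # Plain string form: protocol falls back to 'http'
    # (string-form protocol parsing is marked TODO in the diff).
    '127.0.0.1:6800',
    # New 6-element tuple form: (username, password, protocol, ip, port, group).
    ('admin', 'secret', 'https', 'scrapyd.example.com', '6443', 'cluster-1'),
]
```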
2 changes: 2 additions & 0 deletions .gitignore
@@ -28,6 +28,7 @@ scrapydweb/data/demo_projects/ScrapydWeb_demo - 副本/*
 
 
 venv/
+.venv/
 
 *.pyc
 __pycache__/
@@ -44,3 +45,4 @@ build/
 *.egg-info/
 
 .idea
+.vscode/launch.json
29 changes: 16 additions & 13 deletions scrapydweb/utils/check_app_config.py
@@ -333,40 +333,43 @@ def check_scrapyd_servers(config):
     servers = []
     for idx, (server, public_url) in enumerate(zip(SCRAPYD_SERVERS, SCRAPYD_SERVERS_PUBLIC_URLS)):
         if isinstance(server, tuple):
-            assert len(server) == 5, ("Scrapyd server should be a tuple of 5 elements, "
+            assert len(server) == 6, ("Scrapyd server should be a tuple of 6 elements, "
                                       "current value: %s" % str(server))
-            usr, psw, ip, port, group = server
+            usr, psw, protocol, ip, port, group = server
         else:
-            usr, psw, ip, port, group = re.search(SCRAPYD_SERVER_PATTERN, server.strip()).groups()
+            # TODO: handle protocol
+            usr, psw, protocol, ip, port, group = re.search(SCRAPYD_SERVER_PATTERN, server.strip()).groups()
+        protocol = protocol.strip() if protocol and protocol.strip() else 'http'
         ip = ip.strip() if ip and ip.strip() else '127.0.0.1'
         port = port.strip() if port and port.strip() else '6800'
         group = group.strip() if group and group.strip() else ''
         auth = (usr, psw) if usr and psw else None
         public_url = public_url.strip(' /')
-        servers.append((group, ip, port, auth, public_url))
+        servers.append((group, protocol, ip, port, auth, public_url))
 
     def key_func(arg):
-        _group, _ip, _port, _auth, _public_url = arg
+        _group, _protocol, _ip, _port, _auth, _public_url = arg
         parts = _ip.split('.')
         parts = [('0' * (3 - len(part)) + part) for part in parts]
         return [_group, '.'.join(parts), int(_port)]
 
     servers = sorted(set(servers), key=key_func)
     check_scrapyd_connectivity(servers)
 
-    config['SCRAPYD_SERVERS'] = ['%s:%s' % (ip, port) for (group, ip, port, auth, public_url) in servers]
-    config['SCRAPYD_SERVERS_GROUPS'] = [group for (group, ip, port, auth, public_url) in servers]
-    config['SCRAPYD_SERVERS_AUTHS'] = [auth for (group, ip, port, auth, public_url) in servers]
-    config['SCRAPYD_SERVERS_PUBLIC_URLS'] = [public_url for (group, ip, port, auth, public_url) in servers]
+    config['SCRAPYD_SERVERS'] = ['%s:%s' % (ip, port) for (group, protocol, ip, port, auth, public_url) in servers]
+    config['SCRAPYD_SERVERS_GROUPS'] = [group for (group, protocol, ip, port, auth, public_url) in servers]
+    config['SCRAPYD_SERVERS_PROTOCOLS'] = [protocol for (group, protocol, ip, port, auth, public_url) in servers]
+    config['SCRAPYD_SERVERS_AUTHS'] = [auth for (group, protocol, ip, port, auth, public_url) in servers]
+    config['SCRAPYD_SERVERS_PUBLIC_URLS'] = [public_url for (group, protocol, ip, port, auth, public_url) in servers]
 
 
 def check_scrapyd_connectivity(servers):
     logger.debug("Checking connectivity of SCRAPYD_SERVERS...")
 
     def check_connectivity(server):
-        (_group, _ip, _port, _auth, _public_url) = server
+        (_group, __protocol, _ip, _port, _auth, _public_url) = server
         try:
-            url = 'http://%s:%s' % (_ip, _port)
+            url = '%s://%s:%s' % (__protocol, _ip, _port)
             r = session.get(url, auth=_auth, timeout=10)
             assert r.status_code == 200, "%s got status_code %s" % (url, r.status_code)
         except Exception as err:
@@ -385,9 +388,9 @@ def check_connectivity(server):
     print("\nIndex {group:<20} {server:<21} Connectivity Auth".format(
         group='Group', server='Scrapyd IP:Port'))
     print(HASH)
-    for idx, ((group, ip, port, auth, public_url), result) in enumerate(zip(servers, results), 1):
+    for idx, ((group, protocol, ip, port, auth, public_url), result) in enumerate(zip(servers, results), 1):
         print("{idx:_<5} {group:_<20} {server:_<22} {result:_<11} {auth}".format(
-            idx=idx, group=group or 'None', server='%s:%s' % (ip, port), auth=auth, result=str(result)))
+            idx=idx, group=group or 'None', server='%s://%s:%s' % (protocol, ip, port), auth=auth, result=str(result)))
     print(HASH + '\n')
 
     assert any(results), "None of your SCRAPYD_SERVERS could be connected. "
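To make the resulting data flow concrete, here is a sketch of what check_scrapyd_servers would leave in the Flask config for the illustrative HTTPS entry above. Note that SCRAPYD_SERVERS itself still holds bare ip:port strings; the scheme travels in the new SCRAPYD_SERVERS_PROTOCOLS list, index-aligned with the others, and is joined back on at each call site:

```python
# Sketch of the parallel, index-aligned lists written into app.config
# (one entry per Scrapyd server, sorted by group/IP/port; values illustrative).
config = {
    'SCRAPYD_SERVERS': ['scrapyd.example.com:6443'],   # still bare ip:port
    'SCRAPYD_SERVERS_GROUPS': ['cluster-1'],
    'SCRAPYD_SERVERS_PROTOCOLS': ['https'],            # new list added by this PR
    'SCRAPYD_SERVERS_AUTHS': [('admin', 'secret')],
    'SCRAPYD_SERVERS_PUBLIC_URLS': [''],
}
```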
10 changes: 6 additions & 4 deletions scrapydweb/utils/poll.py
@@ -44,14 +44,15 @@
 class Poll(object):
     logger = logger
 
-    def __init__(self, url_scrapydweb, username, password,
+    def __init__(self, url_scrapydweb, username, password, scrapyd_servers_protocols,
                  scrapyd_servers, scrapyd_servers_auths,
                  poll_round_interval, poll_request_interval,
                  main_pid, verbose, exit_timeout=0):
         self.url_scrapydweb = url_scrapydweb
         self.auth = (username, password) if username and password else None
 
         self.scrapyd_servers = scrapyd_servers
+        self.scrapyd_servers_protocols = scrapyd_servers_protocols
         self.scrapyd_servers_auths = scrapyd_servers_auths
 
         self.session = requests.Session()
@@ -179,12 +180,12 @@ def make_request(self, url, auth, post=False):
         return r
 
     def run(self):
-        for node, (scrapyd_server, auth) in enumerate(zip(self.scrapyd_servers, self.scrapyd_servers_auths), 1):
+        for node, (scrapyd_server_protocol, scrapyd_server, auth) in enumerate(zip(self.scrapyd_servers_protocols, self.scrapyd_servers, self.scrapyd_servers_auths), 1):
             # Update Jobs history
             # url_jobs = self.url_scrapydweb + '/%s/jobs/' % node
             # self.make_request(url_jobs, auth=self.auth, post=True)
 
-            url_jobs = 'http://%s/jobs' % scrapyd_server
+            url_jobs = '{}://{}/jobs'.format(scrapyd_server_protocol, scrapyd_server)
             # json.loads(json.dumps({'auth':(1,2)})) => {'auth': [1, 2]}
             auth = tuple(auth) if auth else None  # TypeError: 'list' object is not callable
             try:
@@ -227,11 +228,12 @@ def update_finished_jobs(self, node, finished_jobs_set):
 
 
 def main(args):
-    keys = ('url_scrapydweb', 'username', 'password',
+    keys = ('url_scrapydweb', 'username', 'password', 'scrapyd_servers_protocols',
             'scrapyd_servers', 'scrapyd_servers_auths',
             'poll_round_interval', 'poll_request_interval',
             'main_pid', 'verbose', 'exit_timeout')
     kwargs = dict(zip(keys, args))
+    kwargs['scrapyd_servers_protocols'] = json.loads(kwargs['scrapyd_servers_protocols'])
     kwargs['scrapyd_servers'] = json.loads(kwargs['scrapyd_servers'])
     kwargs['scrapyd_servers_auths'] = json.loads(kwargs['scrapyd_servers_auths'])
     kwargs['poll_round_interval'] = int(kwargs['poll_round_interval'])
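poll.py runs as a separate process, so the new protocols list has to cross the process boundary as a string, like the other list-valued arguments that main() decodes with json.loads. The launcher that builds this argv is not part of this diff, so the following caller-side sketch is an assumption; it only mirrors the key order in main():

```python
import json

# Hypothetical argv for poll.py, in the order of main()'s `keys` tuple.
args = [
    'http://127.0.0.1:5000',                   # url_scrapydweb
    'admin', 'secret',                         # username, password
    json.dumps(['https']),                     # scrapyd_servers_protocols (new)
    json.dumps(['scrapyd.example.com:6443']),  # scrapyd_servers
    json.dumps([['admin', 'secret']]),         # scrapyd_servers_auths
    '300', '10',                               # poll_round_interval, poll_request_interval
    '12345', 'True', '0',                      # main_pid, verbose, exit_timeout
]
```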
2 changes: 1 addition & 1 deletion scrapydweb/views/api.py
@@ -17,7 +17,7 @@ def __init__(self):
         self.project = self.view_args['project']
         self.version_spider_job = self.view_args['version_spider_job']
 
-        self.url = 'http://{}/{}.json'.format(self.SCRAPYD_SERVER, API_MAP.get(self.opt, self.opt))
+        self.url = '{}://{}/{}.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER, API_MAP.get(self.opt, self.opt))
         self.data = None
         self.status_code = 0
         self.js = {}
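The effect is easiest to see with concrete values. A minimal sketch of the assembled endpoint for a node whose protocol resolved to 'https', using the same format string as the diff (host and port are illustrative):

```python
# Sketch: URL assembly for opt='daemonstatus' on an HTTPS node.
SCRAPYD_SERVER_PROTOCOL = 'https'
SCRAPYD_SERVER = 'scrapyd.example.com:6443'
url = '{}://{}/{}.json'.format(SCRAPYD_SERVER_PROTOCOL, SCRAPYD_SERVER, 'daemonstatus')
assert url == 'https://scrapyd.example.com:6443/daemonstatus.json'
```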
2 changes: 2 additions & 0 deletions scrapydweb/views/baseview.py
@@ -93,6 +93,7 @@ def __init__(self, *args, **kwargs):
 
         # Scrapyd
         self.SCRAPYD_SERVERS = app.config.get('SCRAPYD_SERVERS', []) or ['127.0.0.1:6800']
+        self.SCRAPYD_SERVERS_PROTOCOLS = app.config.get('SCRAPYD_SERVERS_PROTOCOLS', []) or ['http']
         self.SCRAPYD_SERVERS_AMOUNT = len(self.SCRAPYD_SERVERS)
         self.SCRAPYD_SERVERS_GROUPS = app.config.get('SCRAPYD_SERVERS_GROUPS', []) or ['']
         self.SCRAPYD_SERVERS_AUTHS = app.config.get('SCRAPYD_SERVERS_AUTHS', []) or [None]
@@ -183,6 +184,7 @@ def __init__(self, *args, **kwargs):
         assert 0 < self.node <= self.SCRAPYD_SERVERS_AMOUNT, \
             'node index error: %s, which should be between 1 and %s' % (self.node, self.SCRAPYD_SERVERS_AMOUNT)
         self.SCRAPYD_SERVER = self.SCRAPYD_SERVERS[self.node - 1]
+        self.SCRAPYD_SERVER_PROTOCOL = self.SCRAPYD_SERVERS_PROTOCOLS[self.node - 1]
         self.IS_LOCAL_SCRAPYD_SERVER = self.SCRAPYD_SERVER == self.LOCAL_SCRAPYD_SERVER
         self.GROUP = self.SCRAPYD_SERVERS_GROUPS[self.node - 1]
         self.AUTH = self.SCRAPYD_SERVERS_AUTHS[self.node - 1]
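BaseView resolves the current node's protocol the same way it resolves the other per-node attributes: by indexing the aligned lists with the 1-based node number. A standalone sketch with illustrative values:

```python
# Sketch: the 1-based node index selects the matching entry from each aligned list.
SCRAPYD_SERVERS = ['127.0.0.1:6800', 'scrapyd.example.com:6443']
SCRAPYD_SERVERS_PROTOCOLS = ['http', 'https']

node = 2
SCRAPYD_SERVER = SCRAPYD_SERVERS[node - 1]                     # 'scrapyd.example.com:6443'
SCRAPYD_SERVER_PROTOCOL = SCRAPYD_SERVERS_PROTOCOLS[node - 1]  # 'https'
```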
4 changes: 2 additions & 2 deletions scrapydweb/views/dashboard/jobs.py
@@ -65,7 +65,7 @@ def __init__(self):
             self.logger.debug("Change per_page to %s", self.metadata['per_page'])
         self.page = request.args.get('page', default=1, type=int)
 
-        self.url = 'http://%s/jobs' % self.SCRAPYD_SERVER
+        self.url = '{}://{}/jobs'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER)
         if self.SCRAPYD_SERVER_PUBLIC_URL:
             self.public_url = '%s/jobs' % self.SCRAPYD_SERVER_PUBLIC_URL
         else:
@@ -411,7 +411,7 @@ def set_kwargs(self):
             url=self.url,
             url_schedule=url_for('schedule', node=self.node),
             url_liststats=url_for('api', node=self.node, opt='liststats'),
-            url_liststats_source='http://%s/logs/stats.json' % self.SCRAPYD_SERVER,
+            url_liststats_source='{}://{}/logs/stats.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER),
             SCRAPYD_SERVER=self.SCRAPYD_SERVER.split(':')[0],
             LOGPARSER_VERSION=self.LOGPARSER_VERSION,
             JOBS_RELOAD_INTERVAL=self.JOBS_RELOAD_INTERVAL,
2 changes: 1 addition & 1 deletion scrapydweb/views/files/items.py
@@ -17,7 +17,7 @@ def __init__(self):
         self.project = self.view_args['project']
         self.spider = self.view_args['spider']
 
-        self.url = 'http://{}/items/{}{}'.format(self.SCRAPYD_SERVER,
+        self.url = '{}://{}/items/{}{}'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER,
                                                  '%s/' % self.project if self.project else '',
                                                  '%s/' % self.spider if self.spider else '')
         if self.SCRAPYD_SERVER_PUBLIC_URL:
4 changes: 2 additions & 2 deletions scrapydweb/views/files/log.py
@@ -53,7 +53,7 @@ def __init__(self):
 
         # Note that self.LOCAL_SCRAPYD_LOGS_DIR may be an empty string
         # Extension like '.log' is excluded here.
-        self.url = u'http://{}/logs/{}/{}/{}'.format(self.SCRAPYD_SERVER, self.project, self.spider, self.job)
+        self.url = u'{}://{}/logs/{}/{}/{}'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER, self.project, self.spider, self.job)
         self.log_path = os.path.join(self.LOCAL_SCRAPYD_LOGS_DIR, self.project, self.spider, self.job)
 
         # For Log and Stats buttons in the Logs page: /a.log/?with_ext=True
@@ -66,7 +66,7 @@ def __init__(self):
 
         # json file by LogParser
         self.json_path = os.path.join(self.LOCAL_SCRAPYD_LOGS_DIR, self.project, self.spider, job_without_ext+'.json')
-        self.json_url = u'http://{}/logs/{}/{}/{}.json'.format(self.SCRAPYD_SERVER, self.project, self.spider,
+        self.json_url = u'{}://{}/logs/{}/{}/{}.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER, self.project, self.spider,
                                                                job_without_ext)
 
         self.status_code = 0
2 changes: 1 addition & 1 deletion scrapydweb/views/files/logs.py
@@ -16,7 +16,7 @@ def __init__(self):
         self.project = self.view_args['project']
         self.spider = self.view_args['spider']
 
-        self.url = 'http://{}/logs/{}{}'.format(self.SCRAPYD_SERVER,
+        self.url = '{}://{}/logs/{}{}'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER,
                                                 '%s/' % self.project if self.project else '',
                                                 '%s/' % self.spider if self.spider else '')
         if self.SCRAPYD_SERVER_PUBLIC_URL:
8 changes: 4 additions & 4 deletions scrapydweb/views/operations/deploy.py
@@ -40,7 +40,7 @@ class DeployView(BaseView):
     def __init__(self):
         super(DeployView, self).__init__()
 
-        self.url = 'http://{}/{}.json'.format(self.SCRAPYD_SERVER, 'addversion')
+        self.url = '{}://{}/{}.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER, 'addversion')
         self.template = 'scrapydweb/deploy.html'
 
         self.scrapy_cfg_list = []
@@ -284,11 +284,11 @@ def handle_form(self):
         if self.selected_nodes_amount:
             self.selected_nodes = self.get_selected_nodes()
             self.first_selected_node = self.selected_nodes[0]
-            self.url = 'http://{}/{}.json'.format(self.SCRAPYD_SERVERS[self.first_selected_node - 1], 'addversion')
+            self.url = '{}://{}/{}.json'.format(self.SCRAPYD_SERVERS_PROTOCOLS[self.first_selected_node - 1], self.SCRAPYD_SERVERS[self.first_selected_node - 1], 'addversion')
             # Note that self.first_selected_node != self.node
             self.AUTH = self.SCRAPYD_SERVERS_AUTHS[self.first_selected_node - 1]
         else:
-            self.url = 'http://{}/{}.json'.format(self.SCRAPYD_SERVER, 'addversion')
+            self.url = '{}://{}/{}.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER, 'addversion')
 
         # Error: Project names must begin with a letter and contain only letters, numbers and underscores
         self.project = re.sub(self.STRICT_NAME_PATTERN, '_', request.form.get('project', '')) or self.get_now_string()
@@ -446,7 +446,7 @@ def __init__(self):
         self.project = self.view_args['project']
         self.version = self.view_args['version']
 
-        self.url = 'http://{}/{}.json'.format(self.SCRAPYD_SERVER, 'addversion')
+        self.url = '{}://{}/{}.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER, 'addversion')
 
         self.slot = slot
 
14 changes: 7 additions & 7 deletions scrapydweb/views/operations/schedule.py
@@ -61,7 +61,7 @@ def __init__(self):
         self.task_id = request.args.get('task_id', default=None, type=int)
         self.task = None
 
-        self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVER
+        self.url = '{}://{}/schedule.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER)
         self.template = 'scrapydweb/schedule.html'
         self.kwargs = {}
 
@@ -214,7 +214,7 @@ class ScheduleCheckView(BaseView):
     def __init__(self):
         super(ScheduleCheckView, self).__init__()
 
-        self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVER
+        self.url = '{}://{}/schedule.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER)
         self.template = 'scrapydweb/schedule.html'
 
         self.filename = ''
@@ -228,7 +228,7 @@ def dispatch_request(self, **kwargs):
         # self.logger.warning(self.json_dumps(self.data))  # TypeError: Object of type datetime is not JSON serializable
         cmd = generate_cmd(self.AUTH, self.url, self.data)
         # '-d' may be in project name, like 'ScrapydWeb-demo'
-        cmd = re.sub(r'(curl -u\s+.*?:.*?)\s+(http://)', r'\1 \\\r\n\2', cmd)
+        cmd = re.sub(r'(curl -u\s+.*?:.*?)\s+({}://)'.format(self.SCRAPYD_SERVER_PROTOCOL), r'\1 \\\r\n\2', cmd)
         cmd = re.sub(r'\s+-d\s+', ' \\\r\n-d ', cmd)
         cmd = re.sub(r'\s+--data-urlencode\s+', ' \\\r\n--data-urlencode ', cmd)
         return self.json_dumps({'filename': self.filename, 'cmd': cmd}, as_response=True)
@@ -365,12 +365,12 @@ def handle_form(self):
         if self.selected_nodes_amount:
             self.selected_nodes = self.get_selected_nodes()
             self.first_selected_node = self.selected_nodes[0]
-            self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVERS[self.first_selected_node - 1]
+            self.url = '{}://{}/schedule.json'.format(self.SCRAPYD_SERVERS_PROTOCOLS[self.first_selected_node - 1], self.SCRAPYD_SERVERS[self.first_selected_node - 1])
             # Note that self.first_selected_node != self.node
             self.AUTH = self.SCRAPYD_SERVERS_AUTHS[self.first_selected_node - 1]
         else:
             self.selected_nodes = [self.node]
-            self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVER
+            self.url = '{}://{}/schedule.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER)
 
         # in handle_action(): self.data.pop('__task_data', {}) self.task_data.pop
         self.data = self.slot.data.get(self.filename, {})
@@ -598,7 +598,7 @@ def __init__(self):
         super(ScheduleXhrView, self).__init__()
 
         self.filename = self.view_args['filename']
-        self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVER
+        self.url = '{}://{}/schedule.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER)
         self.slot = slot
         self.data = None
 
@@ -619,7 +619,7 @@ class ScheduleTaskView(BaseView):
     def __init__(self):
         super(ScheduleTaskView, self).__init__()
 
-        self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVER
+        self.url = '{}://{}/schedule.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER)
         self.task_id = request.form['task_id']
         self.jobid = request.form['jobid']
         self.data = {}
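In ScheduleCheckView, the first re.sub now matches the node's actual scheme instead of a hard-coded http://, so the generated curl command still wraps after the credentials on HTTPS nodes. A standalone sketch of the rewrite on a sample command (credentials and host are illustrative):

```python
import re

protocol = 'https'  # stands in for self.SCRAPYD_SERVER_PROTOCOL
cmd = "curl -u admin:secret https://scrapyd.example.com:6443/schedule.json -d project=demo"

# Break the line after the credentials, just before the URL, as in the diff.
cmd = re.sub(r'(curl -u\s+.*?:.*?)\s+({}://)'.format(protocol), r'\1 \\\r\n\2', cmd)
cmd = re.sub(r'\s+-d\s+', ' \\\r\n-d ', cmd)
print(cmd)
# curl -u admin:secret \
# https://scrapyd.example.com:6443/schedule.json \
# -d project=demo
```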
2 changes: 1 addition & 1 deletion scrapydweb/views/overview/servers.py
@@ -19,7 +19,7 @@ def __init__(self):
         self.version_job = self.view_args['version_job']
         self.spider = self.view_args['spider']
 
-        self.url = 'http://%s/daemonstatus.json' % self.SCRAPYD_SERVER
+        self.url = '{}://{}/daemonstatus.json'.format(self.SCRAPYD_SERVER_PROTOCOL, self.SCRAPYD_SERVER)
         self.template = 'scrapydweb/servers.html'
         self.selected_nodes = []
 