Skip to content

Commit

Permalink
Compatible feature: support priority start_urls #172
Browse files Browse the repository at this point in the history
  • Loading branch information
nieweiming committed Apr 27, 2021
1 parent 2df784f commit 1b333a2
Showing 1 changed file with 5 additions and 8 deletions.
13 changes: 5 additions & 8 deletions src/scrapy_redis/spiders.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,13 @@ def setup_redis(self, crawler=None):

if self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET):
self.fetch_data = self.server.spop
self.count_size = self.server.scard
elif self.settings.getbool('REDIS_START_URLS_AS_ZSET', defaults.START_URLS_AS_ZSET):
self.fetch_data = self.pop_priority_queue
self.count_size = self.server.zcard
else:
self.fetch_data = self.pop_list_queue
self.count_size = self.server.llen

# The idle signal is called when the spider has no requests left,
# that's when we will schedule new requests from redis queue
Expand Down Expand Up @@ -148,9 +151,9 @@ def spider_idle(self):
or close spider when waiting seconds > MAX_IDLE_TIME_BEFORE_CLOSE.
MAX_IDLE_TIME_BEFORE_CLOSE will not affect SCHEDULER_IDLE_BEFORE_CLOSE.
"""

if self.server is not None and self.count_start_urls() > 0:
if self.server is not None and self.count_size(self.redis_key) > 0:
self.spider_idle_start_time = int(time.time())

self.schedule_next_requests()

max_idle_time = self.settings.getint("MAX_IDLE_TIME_BEFORE_CLOSE")
Expand All @@ -159,12 +162,6 @@ def spider_idle(self):
return
raise DontCloseSpider

def count_start_urls(self):
    """Count the start URLs currently queued under ``self.redis_key``.

    The counting command follows the same settings-driven choice of Redis
    data type that ``setup_redis`` uses when it picks its fetch function:
    ``scard`` for a set, ``zcard`` for a sorted set, ``llen`` for a list.

    Returns the result of calling the chosen ``self.server`` command on
    ``self.redis_key``.
    """
    settings = self.settings
    if settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET):
        count_size = self.server.scard
    elif settings.getbool('REDIS_START_URLS_AS_ZSET', defaults.START_URLS_AS_ZSET):
        # Priority start_urls live in a sorted set; Redis rejects LLEN on a
        # key of that type, so the sorted-set cardinality must be used.
        count_size = self.server.zcard
    else:
        count_size = self.server.llen
    return count_size(self.redis_key)


class RedisSpider(RedisMixin, Spider):
"""Spider that reads urls from redis queue when idle.
Expand Down

0 comments on commit 1b333a2

Please sign in to comment.