Skip to content

Commit

Permalink
Update wayback_machine_downloader.rb
Browse files Browse the repository at this point in the history
  • Loading branch information
insaner authored Jan 24, 2017
1 parent 330c188 commit 5bd9fbf
Showing 1 changed file with 46 additions and 5 deletions.
51 changes: 46 additions & 5 deletions lib/wayback_machine_downloader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def initialize params
@only_filter = params[:only_filter]
@exclude_filter = params[:exclude_filter]
@all = params[:all]
@all_timestamps = params[:all_timestamps]
@list = params[:list]
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
@threads_count = params[:threads_count].to_i
Expand Down Expand Up @@ -123,13 +124,53 @@ def get_file_list_curated
end
file_list_curated
end
# Builds a file list that keeps every snapshot of every file, keyed by
# "<timestamp>/<path>" so that different captures of the same path do not
# overwrite each other.
#
# Each snapshot line is read positionally: characters 0..13 are the
# timestamp, and the URL starts at index 15 and runs to the end of the line.
#
# Lines are skipped (with a message on stdout) when they contain no '/',
# when the URL has no path component after "scheme://host", or when the URL
# is rejected by the exclude/only filters. When the same key shows up more
# than once, the first occurrence wins.
#
# Returns a Hash of file_id => { file_url:, timestamp: }.
def get_file_list_ALL
  file_list_curated = {}
  # Fetch the snapshot listing once and reuse it for both the count and the loop.
  snapshots = get_all_snapshots_to_consider
  puts "snapshots_to_consider: " + snapshots.lines.count.to_s
  snapshots.each_line do |line|
    unless line.include?('/')
      print "**" + line
      next
    end
    file_timestamp = line[0..13].to_i
    # chomp instead of a fixed "-2" end index, so a final line without a
    # trailing newline does not lose the last character of its URL.
    file_url = line.chomp[15..-1]
    # Everything after "scheme://host/"; nil when the URL has fewer than three
    # '/'-separated parts (or when the line is too short to hold a URL).
    path_segments = file_url && file_url.split('/')[3..-1]
    if path_segments.nil?
      puts "Malformed file url, ignoring: #{file_url}"
      next
    end
    file_id = [file_timestamp, path_segments.join('/')].join('/')
    file_id = CGI::unescape file_id
    # tidy_bytes is a String extension defined elsewhere in the project
    # (presumably repairs invalid byte sequences -- confirm). The id always
    # carries the timestamp prefix, so it is never empty at this point.
    file_id = file_id.tidy_bytes
    if file_id.nil?
      puts "Malformed file url, ignoring: #{file_url}"
    elsif match_exclude_filter(file_url)
      puts "File url matches exclude filter, ignoring: #{file_url}"
    elsif !match_only_filter(file_url)
      puts "File url doesn't match only filter, ignoring: #{file_url}"
    elsif !file_list_curated.key?(file_id)
      # First occurrence wins; later duplicates of the same id are ignored.
      file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
    end
  end
  puts "file_list_curated: " + file_list_curated.count.to_s
  file_list_curated
end


# Returns the files to process as an Array of info hashes, each carrying its
# own :file_id in addition to the fields stored in the file list.
#
# When @all_timestamps is set, every snapshot from get_file_list_ALL is
# returned in the order that method produced it. Otherwise the curated list
# from get_file_list_curated is returned, ordered by :timestamp descending.
def get_file_list_by_timestamp
  file_list =
    if @all_timestamps
      get_file_list_ALL
    else
      get_file_list_curated.sort_by { |_file_id, info| info[:timestamp] }.reverse
    end
  # Both a Hash and an Array of [file_id, info] pairs yield (file_id, info) here.
  file_list.map do |file_id, info|
    info[:file_id] = file_id
    info
  end
end

Expand Down

0 comments on commit 5bd9fbf

Please sign in to comment.