-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
These files are indirectly referenced by scripts in my crontab.
Note that debug.py is harmless but not needed; I had used it for development but didn't always remember to delete it.
- Loading branch information
Showing
3 changed files
with
301 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#!/usr/bin/env python | ||
|
||
"""Adds up the number of files and amount of data in an installation log file.""" | ||
|
||
import debug | ||
|
||
# Multipliers used to convert the sizes reported in the log into bytes.
# A "K" is taken to be 1000; it might really be 1024, depending on what Synda uses.
Kval = 1000
Mval = Kval ** 2
Gval = Kval ** 3
Tval = Kval ** 4
|
||
import sys | ||
from pprint import pprint | ||
|
||
# The installation log file should be the first (and only) argument. | ||
# For example: infile = '/home/painter/install.2020.10.19.log' | ||
# The installation script should create this file, append to it with every | ||
# "synda install" command, and, near the end, run this script. | ||
|
||
def bytecount_for_people(num):
    """Format a byte count as a short human-readable string, e.g. 1234 -> '1.23 KB'.

    The value is rounded to three significant digits and scaled by powers of
    1000.  Values below 1000 are returned without a suffix.  Values of
    1000 TB or more stay in TB (e.g. '5000 TB'), because TB is the largest
    suffix available.

    Based on
    https://stackoverflow.com/questions/579310/formatting-long-numbers-as-strings-in-python
    """
    suffixes = ['', ' KB', ' MB', ' GB', ' TB']
    num = float('{:.3g}'.format(num))
    magnitude = 0
    # Stop at the last suffix so that very large values cannot index past the list.
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    # '{:f}' gives fixed-point text; strip the trailing zeros and a dangling '.'.
    digits = '{:f}'.format(num).rstrip('0').rstrip('.')
    return '{}{}'.format(digits, suffixes[magnitude])
def run(infile):
    """Print the total size and total file count recorded in an installation log.

    Two kinds of lines in 'infile' are used:
      'Once downloaded, <number> <unit> of additional disk space will be used.'
      '<number> file(s) will be added to the download queue.'
    where <unit> is one of KB, kB, MB, GB or TB.  All other lines are ignored.

    Raises ValueError if a size line has an unrecognized unit, or if a number
    cannot be parsed.
    """
    unit_values = {'KB': Kval, 'kB': Kval, 'MB': Mval, 'GB': Gval, 'TB': Tval}
    size_prefix = 'Once downloaded, '
    size_suffix = ' of additional disk space will be used.'
    count_suffix = ' file(s) will be added to the download queue.'

    total_bytes = 0
    total_files = 0
    with open(infile) as f:
        for line in f:
            # Strip the line ending here, so a final line without a newline still parses.
            line = line.rstrip('\r\n')
            if line.startswith('Once downloaded,'):
                text = line.replace(size_prefix, '').replace(size_suffix, '')
                number, _, unit = text.strip().rpartition(' ')
                if unit not in unit_values:
                    raise ValueError("Unrecognized size unit in line: %r" % line)
                # float(), not eval(): the log is data and must never be executed.
                total_bytes += float(number) * unit_values[unit]
            elif line.find('will be added') > 0:
                total_files += int(line.replace(count_suffix, '').strip())

    print("Total size installed = {}".format(bytecount_for_people(total_bytes)))
    print("Total number of files installed = {}".format('{:,}'.format(total_files)))
|
||
if __name__ == '__main__':
    # The installation log file is the first (and only) command-line argument.
    if len(sys.argv) <= 1:
        print("Supply the installation log file")
    else:
        run(sys.argv[1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# If you are on the developers list below, and you have imported this file, any uncaught exception | ||
# will go to the Python debugger. | ||
|
||
# based on https://stackoverflow.com/questions/242485/starting-python-debugger-automatically-on-error | ||
|
||
try:
    import getpass
    import os
    import sys

    developers = ['painter', 'painter1']

    # PY_DEBUG_EXCEP='True' turns the hook on for any user, 'False' turns it off
    # for everyone; otherwise it is installed only for the developers listed above.
    # The explicit settings are checked first, so they do not depend on the user
    # lookup succeeding.
    setting = os.environ.get("PY_DEBUG_EXCEP")
    if setting == 'True' or (setting != 'False' and getpass.getuser() in developers):

        def info(exc_type, value, tb):
            """Exception hook: print the traceback, then start pdb post-mortem."""
            if hasattr(sys, 'ps1') or not sys.stderr.isatty():
                # We are in interactive mode or we don't have a tty-like
                # device, so we call the default hook.
                sys.__excepthook__(exc_type, value, tb)
            else:
                import pdb
                import traceback
                # We are NOT in interactive mode, print the exception...
                traceback.print_exception(exc_type, value, tb)
                # ...then start the debugger in post-mortem mode.
                pdb.post_mortem(tb)

        sys.excepthook = info
except Exception:
    # Installing the hook is strictly best-effort: importing this module must
    # never break the importing script.  KeyboardInterrupt and SystemExit are
    # deliberately not caught.
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
#!/usr/bin/env python | ||
|
||
"""Computes useful performance data from the Synda database. The beginning and ending dates and | ||
times should be provided in a modified ISO 8601 format without letter separators, e.g. | ||
'2019-01-25 13:04'. The third argument is a partial url, which is normally used to specify the | ||
protocol and data node, e.g. gsiftp://vesg.ipsl.upmc.fr. But the % wildcard is permitted, and a | ||
longer url may be used to narrow the coverage further.""" | ||
|
||
import os, sys, glob | ||
from pprint import pprint | ||
import sqlite3 | ||
#import debug, pdb | ||
import datetime | ||
global conn, curs | ||
|
||
def setup(): | ||
"""Initializes the connection to the database, etc.""" | ||
global conn, curs | ||
# normal: | ||
conn = sqlite3.connect('/var/lib/synda/sdt/sdt.db') | ||
# test on a temporary copy of the database: | ||
#conn = sqlite3.connect('~/db/sdt.db') | ||
curs = conn.cursor() | ||
|
||
def finish():
    """Commit any pending transaction on the module-level connection, then close it."""
    # 'conn' is only read here, so no 'global' declaration is needed.
    conn.commit()
    conn.close()
|
||
def str2time(date):
    """Convert a date string to a datetime object.

    Accepted forms, tried from most to least detailed:
      '2019-01-25 13:04:13.922788', '2019-01-25 13:04:13', '2019-01-25 13:04'.
    Raises ValueError if the string matches none of them.
    """
    detailed_formats = ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S')
    for fmt in detailed_formats:
        try:
            return datetime.datetime.strptime(date, fmt)
        except ValueError:
            continue
    # Last resort; a failure here propagates to the caller.
    return datetime.datetime.strptime(date, '%Y-%m-%d %H:%M')
|
||
def downloading_intervals(startin, stopin, file_intervals):
    """Return active_time: the amount of time, in seconds, within (start,stop) in which at
    least one of the files described by 'file_intervals' was being downloaded.

    Input parameters:
    - startin and stopin define the overall time interval.  They are strings suitable for
      str2time, e.g. "2019-01-25 13:04:00.123".
    - 'file_intervals' is a list of tuples from the database, of the form
      (start_date, end_date, <ignored>).  Each tuple defines a time interval in which one
      file was being downloaded.

    Each file interval is clipped to (start,stop) before use.  An empty list, or a list with
    no interval inside (start,stop), gives 0.
    """
    start = str2time(startin)
    stop = str2time(stopin)

    # Clip every file interval to the overall window; drop those left empty.
    file_ints = []
    for file_int in file_intervals:
        bot = max(str2time(file_int[0]), start)
        top = min(str2time(file_int[1]), stop)
        if bot < top:
            file_ints.append((bot, top))
    if not file_ints:
        return 0

    file_ints.sort(key=(lambda x: x[0]))  # sort by each file's start_date.

    # Because file_ints is sorted, each file_int either extends the present interval at the
    # top, or starts a new interval above the top of the previous one.  So the merged
    # intervals are disjoint, and their lengths can simply be added up.
    bot, top = file_ints[0]
    active_time = 0
    for file_bot, file_top in file_ints[1:]:
        if file_bot <= top:  # extend present interval
            top = max(top, file_top)
        else:  # new interval, after all previous intervals and previous files' end_date.
            active_time += (top - bot).total_seconds()
            bot, top = file_bot, file_top
    # Always count the last interval, including one which ends exactly at 'stop'.
    active_time += (top - bot).total_seconds()

    return active_time
|
||
def url_hdr(url):
    """Return the url header, i.e. the protocol and data node but no more of the url.

    For example 'gsiftp://host.example/path/file.nc' gives 'gsiftp://host.example'.
    A url with no slash after the data node is returned unchanged.
    """
    # Look for the first '/' after the '//' which follows the protocol.
    third_slash = url.find('/', 2 + url.find('//'))
    if third_slash == -1:
        # Nothing follows the data node.  Slicing with -1 would drop the last character.
        return url
    return url[:third_slash]
|
||
def url_hdrs(start, stop, server, method='aggregate'):
    """Return a sorted list of the distinct url headers (protocol and data node) for
    transfers with times between 'start' and 'stop', and a specified server.

    These are the same transfers as for the corresponding call of perf_data().
    'server' is matched against the beginning of the url and may contain the % wildcard.
    'method' is accepted to parallel perf_data() but is not used.
    """
    # If the SQL command is changed in perf_data, then this should be changed to match.
    # The values are passed as bound parameters, so quotes in them cannot alter the SQL.
    cmd = ("SELECT url FROM file WHERE start_date>=? AND "
           "end_date<=? AND url LIKE ? AND "
           "(status='done' OR status='published') AND size IS NOT NULL")
    curs.execute(cmd, (start, stop, server + '%'))
    results = curs.fetchall()
    return sorted(set(url_hdr(r[0]) for r in results))
|
||
def perf_data(start, stop, server, method='aggregate'):
    """Return performance data for transfers with times between 'start' and 'stop', and a
    specified server.

    The times should be in a modified ISO 8601 format without letter separators, e.g.
    '2019-01-25 13:04'.  The server - both the data node and the protocol - is specified as
    the first characters of the url, e.g. 'gsiftp://esgf1.dkrz.de' or 'http://esgf1.dkrz.de'.
    The % wildcard is permitted.

    'method' specifies how the rate is computed: 'aggregate' (default), 'aggregate-crude',
    'seqsize', 'arith', or anything else for the average of the database's 'rate' column.

    Returns a tuple (rate, spf, size, avgsize, Nfiles):
      rate    -- transfer rate in MiB/s
      spf     -- active seconds per file ('aggregate' method only, otherwise 0)
      size    -- total size in GiB
      avgsize -- average file size in MiB
      Nfiles  -- number of files
    The floats are rounded to 4 decimal places.  If no data was transferred, the result is
    (None, None, None, None, None).  A rate which cannot be computed, because the relevant
    time span is zero or the database has no rate, is reported as 0.
    """
    mib = 1024. * 1024.
    gib = 1024. * mib
    # The values are passed as bound parameters, so quotes in them cannot alter the SQL.
    params = (start, stop, server + '%')
    cmd = ("SELECT start_date, end_date, size FROM file WHERE start_date>=? AND "
           "end_date<=? AND url LIKE ? AND "
           "(status='done' OR status='published') AND size IS NOT NULL")
    # ...For more accuracy, I could include files overlapping the (start,stop) boundary, i.e.
    # end_date>start and start_date<stop.  Then I would have to reduce the file size in
    # proportion to the amount of the file's download time which is within (start,stop).
    curs.execute(cmd, params)
    results = curs.fetchall()
    sizes = [size for (start_date, end_date, size) in results]
    Nfiles = len(sizes)
    totsize = sum(sizes)
    if totsize == 0:
        return None, None, None, None, None
    # Divide by floats throughout, so Python 2 integer division cannot truncate the results.
    avgsize = float(totsize) / Nfiles / mib
    retsize = totsize / gib
    spf = 0  # don't want to compute it in non-default cases

    if method == 'aggregate':
        # (bytes downloaded)/(downloading time).  Takes parallelism into account, and
        # doesn't count inactive time in (start,stop).
        active_time = downloading_intervals(start, stop, results)
        if active_time > 0:
            retrate = totsize / active_time / mib
        else:
            retrate = 0
            retsize = 0
        spf = active_time / float(Nfiles)
    elif method == 'aggregate-crude':
        # Simply (bytes downloaded)/(stop-start).  Takes parallelism into account, but
        # it's off, sometimes way off, if there are inactive periods.
        seconds = (str2time(stop) - str2time(start)).total_seconds()
        retrate = totsize / seconds / mib if seconds > 0 else 0
    elif method == 'seqsize':
        # Size-weighted method, but based on separate rates for each file, thus like
        # "synda metric" except that the average is weighted by file size.
        # In other words, compute time as if everything were sequential.
        deltas = [str2time(end_date) - str2time(start_date)
                  for (start_date, end_date, size) in results]
        # sum() needs a timedelta start value to add up timedelta objects.
        seconds = sum(deltas, datetime.timedelta(0)).total_seconds()
        retrate = totsize / seconds / mib if seconds > 0 else 0
    elif method == 'arith':
        # Simple arithmetic average of the per-file rates.  Empty files, and files whose
        # recorded duration is zero (no finite rate), are left out.
        rates = []
        for (start_date, end_date, size) in results:
            seconds = (str2time(end_date) - str2time(start_date)).total_seconds()
            if size != 0 and seconds > 0:
                rates.append(size / seconds)
        retrate = sum(rates) / len(rates) / mib if rates else 0
    else:
        # The simple arithmetic average which Synda does, but still restricted to the
        # protocol+server and the date range.  This is a bit less precise than arith
        # because the 'rate' column in the database has been rounded to an integer.
        cmd = ("SELECT avg(rate) FROM file WHERE status='done' AND rate IS NOT NULL AND "
               "start_date>=? AND end_date<=? AND size IS NOT NULL AND "
               "url LIKE ?")
        curs.execute(cmd, params)
        avg_rate = curs.fetchone()[0]
        # avg() is NULL (None) when no row matches.
        retrate = avg_rate / mib if avg_rate is not None else 0
    return round(retrate, 4), round(spf, 4), round(retsize, 4), round(avgsize, 4), Nfiles
|
||
|
||
if __name__ == '__main__':
    print("args= {}".format(sys.argv))
    if len(sys.argv) < 3:
        # Check the arguments before touching the database, so that the usage message
        # is available even where the database is not.
        print("provide start time, stop time, and server in the form of")
        print(" '2019-01-25 13:04' '2019-01-25 14:04' 'gsiftp://esgf1.umr-cnrm.fr'")
        print(" You can use a % wildcard character when specifying the server.")
        print(" You can use a T instead of a space between the date and time.")
    else:
        # Times with a T work better in scripts, e.g. '2019-01-25T13:04'.
        # The Synda database uses a space between the date and time, e.g.
        # '2019-01-25 13:04'
        start = sys.argv[1].replace('T', ' ')
        stop = sys.argv[2].replace('T', ' ')
        if len(sys.argv) >= 4:
            server = sys.argv[3]
        else:
            server = '%'

        compare_methods = False  # set to True, for tests, to print the rate by every method

        setup()
        try:
            if compare_methods:
                for method in ['aggregate', 'aggregate-crude', 'seqsize', 'arith', 'synda']:
                    rate, spf, size, avgsize, Nfiles = perf_data(start, stop, server, method)
                    if rate is None:
                        print("No data downloaded")
                    else:
                        print(" ".join(str(item) for item in
                                       (method, ' ', rate, "MiB/s", size, "GiB")))
            else:
                rate, spf, size, avgsize, Nfiles = perf_data(start, stop, server)
                if rate is None:
                    print("No data downloaded")
                else:
                    uhs = url_hdrs(start, stop, server)
                    uhs.sort()
                    print(" ".join(str(item) for item in
                                   ('rate', rate, "MiB/s Nfiles", Nfiles, " size", size, "GiB",
                                    "avg size", avgsize, "MiB", uhs)))
                    if len(uhs) > 1:
                        # More than one data node or protocol: also report each separately.
                        for uh in uhs:
                            rate, spf, size, avgsize, Nfiles = perf_data(start, stop, uh)
                            if rate is None:
                                # No data for this header; there is nothing to format.
                                continue
                            print(" ".join(("rate {:6.2f}".format(rate),
                                            "MiB/s Nfiles {:5d}".format(Nfiles),
                                            " size {:8.2f}".format(size),
                                            "GiB", " avg size {:8.2f}".format(avgsize),
                                            "MiB", uh)))
        finally:
            # Close the database connection even if a query fails.
            finish()
|
||
|
||
|
||
|