These files are indirectly referenced by scripts in my crontab.
Note that debug.py is harmless but not needed; I had used it for development but
didn't always remember to delete it.
painter1 committed Mar 8, 2021
1 parent 5db6fec commit b53c13c
Showing 3 changed files with 301 additions and 0 deletions.
59 changes: 59 additions & 0 deletions count_installed.py
@@ -0,0 +1,59 @@
#!/usr/bin/env python3

"""Adds up the number of files and amount of data in an installation log file."""

import debug

Kval = 1000 # Value of a K - it might be 1000 or 1024, I don't know what Synda uses.
Mval = Kval*Kval
Gval = Kval*Mval
Tval = Kval*Gval

import sys
from pprint import pprint

# The installation log file should be the first (and only) argument.
# For example: infile = '/home/painter/install.2020.10.19.log'
# The installation script should create this file, append to it with every
# "synda install" command, and, near the end, run this script.

def bytecount_for_people(num):
    # from https://stackoverflow.com/questions/579310/formatting-long-numbers-as-strings-in-python
    num = float('{:.3g}'.format(num))
    magnitude = 0
    while abs(num) >= 1000:
        magnitude += 1
        num /= 1000.0
    return '{}{}'.format('{:f}'.format(num).rstrip('0').rstrip('.'),
                         ['', ' KB', ' MB', ' GB', ' TB'][magnitude])
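
# Hypothetical example:  bytecount_for_people(1234567) returns '1.23 MB'.
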
def run( infile ):
    with open(infile) as f:
        lines = f.readlines()
    szlines = [line for line in lines if line.find('Once downloaded,')==0]
    ctlines = [line for line in lines if line.find('will be added')>0]

    szs = [ l.replace('Once downloaded, ','') for l in szlines ]
    szs = [ l.replace(' of additional disk space will be used.\n','') for l in szs ]
    szsKB = [ l.replace(' KB','') for l in szs if l.find('KB')>0 ]
    szsKB += [ l.replace(' kB','') for l in szs if l.find('kB')>0 ]
    szKB = sum(map(float,szsKB))  # float() is safer than eval() on file contents
    szsMB = [ l.replace(' MB','') for l in szs if l.find('MB')>0 ]
    szMB = sum(map(float,szsMB))
    szsGB = [ l.replace(' GB','') for l in szs if l.find('GB')>0 ]
    szGB = sum(map(float,szsGB))
    szsTB = [ l.replace(' TB','') for l in szs if l.find('TB')>0 ]
    szTB = sum(map(float,szsTB))
    assert len(szs) == len(szsKB)+len(szsMB)+len(szsGB)+len(szsTB)
    sz = Kval*szKB + Mval*szMB + Gval*szGB + Tval*szTB
    print("Total size installed =", bytecount_for_people(sz))

    cts = [ l.replace(' file(s) will be added to the download queue.\n', '') for l in ctlines ]
    ct = sum(map(int,cts))
    print("Total number of files installed =", '{:,}'.format(ct))

if __name__ == '__main__':
    if len( sys.argv ) > 1:
        installlog = sys.argv[1]
        run( installlog )
    else:
        print("Supply the installation log file")
32 changes: 32 additions & 0 deletions debug.py
@@ -0,0 +1,32 @@
# If you are on the developers list below, and you have imported this file, any uncaught exception
# will go to the Python debugger.

# based on https://stackoverflow.com/questions/242485/starting-python-debugger-automatically-on-error

try:
    import getpass, os
    developers = [ 'painter', 'painter1' ]
    if (getpass.getuser() in developers and os.environ.get("PY_DEBUG_EXCEP",True)!='False')\
       or os.environ.get("PY_DEBUG_EXCEP",False)=='True':
        import sys

        def info(type, value, tb):
            if hasattr(sys, 'ps1') or not sys.stderr.isatty():
                # we are in interactive mode or we don't have a tty-like
                # device, so we call the default hook
                sys.__excepthook__(type, value, tb)
            else:
                import traceback, pdb
                # we are NOT in interactive mode: print the exception...
                traceback.print_exception(type, value, tb)
                print()
                # ...then start the debugger in post-mortem mode.
                # pdb.pm() # deprecated
                pdb.post_mortem(tb) # more "modern"

        sys.excepthook = info
except:
    # Never let this debugging aid break the script that imports it.
    pass
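
# Hypothetical usage: put "import debug" near the top of a script, then run it with
#   PY_DEBUG_EXCEP=True python myscript.py
# Any uncaught exception will then drop into pdb's post-mortem debugger.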
210 changes: 210 additions & 0 deletions synda-perf.py
@@ -0,0 +1,210 @@
#!/usr/bin/env python3

"""Computes useful performance data from the Synda database. The beginning and ending dates and
times should be provided in a modified ISO 8601 format without letter separators, e.g.
'2019-01-25 13:04'. The third argument is a partial url, which is normally used to specify the
protocol and data node, e.g. gsiftp://vesg.ipsl.upmc.fr. But the % wildcard is permitted, and a
longer url may be used to narrow the coverage further."""
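
# Example invocation (a T may replace the space in the times; see __main__ below):
#   ./synda-perf.py '2019-01-25T13:04' '2019-01-25T14:04' 'gsiftp://esgf1.umr-cnrm.fr'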

import os, sys, glob
from pprint import pprint
import sqlite3
#import debug, pdb
import datetime
global conn, curs

def setup():
    """Initializes the connection to the database, etc."""
    global conn, curs
    # normal:
    conn = sqlite3.connect('/var/lib/synda/sdt/sdt.db')
    # test on a temporary copy of the database:
    #conn = sqlite3.connect('~/db/sdt.db')
    curs = conn.cursor()

def finish():
    """Closes connections to databases, etc."""
    global conn, curs
    conn.commit()
    conn.close()

def str2time( date ):
    """Given a date string such as '2019-01-25 13:04' or '2019-01-25 13:04:13.922788',
    this function returns a datetime object representing the date."""
    FMT_min = '%Y-%m-%d %H:%M'
    FMT_sec = '%Y-%m-%d %H:%M:%S'
    FMT_frac = '%Y-%m-%d %H:%M:%S.%f'
    try:
        return datetime.datetime.strptime( date, FMT_frac )
    except ValueError:
        try:
            return datetime.datetime.strptime( date, FMT_sec )
        except ValueError:
            return datetime.datetime.strptime( date, FMT_min )
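
# e.g. str2time('2019-01-25 13:04') == datetime.datetime(2019, 1, 25, 13, 4)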

def downloading_intervals( startin, stopin, file_intervals ):
    """Returns active_time: the amount of time, in seconds, within (start,stop) in which at least
    one of the files described by 'file_intervals' was being downloaded.
    Input parameters:
    - startin and stopin define the overall time interval.
      The times provided are strings suitable for str2time, e.g. "2019-01-25 13:04:00.123"
    - The list 'file_intervals' is a list of tuples from the database, of the form
      (start_date, end_date, <ignored>).  Each tuple defines a time interval in which one file
      was being downloaded.
    """
    start = str2time( startin )  # not used below; perf_data's query already restricts start_date>=start
    stop = str2time( stopin )
    file_ints = [ ( str2time(file_int[0]), str2time(file_int[1]) ) for file_int in file_intervals ]

    file_ints.sort( key=(lambda x: x[0]) )  # sort by each file's start_date.

    # Because file_ints is sorted, the following computes intervals in a sorted order, sorted by
    # the bottom (start) time.  Each file_int either extends an interval at the top, or starts a
    # new interval above the top of the previous interval.  That is, the intervals are disjoint,
    # and ordered by the top (stop) time as well as the bottom time.
    # intervals = []  # not used, but may be useful for debugging
    bot = file_ints[0][0]
    top = file_ints[0][1]
    active_time = 0
    for file_int in file_ints:
        if file_int[0]<=top:  # extend present interval
            top = max( top, file_int[1] )
        else:  # new interval, after all previous intervals and previous files' end_date.
            active_time += (top-bot).total_seconds()
            # intervals.append( (bot,top) )  # save the finished interval (not used)
            bot = file_int[0]
            top = file_int[1]
    if top<=stop:  # always holds when callers restrict end_date<=stop; <= also counts a file
        # which finishes exactly at stop.
        active_time += (top-bot).total_seconds()
        # intervals.append( (bot,top) )  # save the last interval (not used)

    return active_time
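
# Worked example with hypothetical times: downloads spanning 13:00-13:10, 13:05-13:20, and
# 13:30-13:35 merge into the disjoint intervals [13:00,13:20] and [13:30,13:35], so the
# active time is 20*60 + 5*60 = 1500 seconds, however much the downloads overlap.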

def url_hdr( url ):
    """url header, i.e. the protocol and data node but no more of the url."""
    upto_third_slash = url[: url.find('/', 2+url.find('//'))]
    return upto_third_slash
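
# e.g. url_hdr('gsiftp://vesg.ipsl.upmc.fr/some/path/file.nc') returns 'gsiftp://vesg.ipsl.upmc.fr'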

def url_hdrs( start, stop, server, method='aggregate' ):
    """Returns url headers (with protocol and data node) for transfers with times between
    'start' and 'stop', and a specified server.  These are the same transfers as for the
    corresponding call of perf_data()."""
    # If the SQL command is changed in perf_data, then this should be changed to match:
    cmd = ("SELECT url FROM file WHERE start_date>='{0}' AND " +\
           "end_date<='{1}' AND url LIKE '{2}%' AND " +\
           "(status='done' OR status='published') AND size IS NOT NULL").format(start, stop, server)
    curs.execute( cmd )
    results = curs.fetchall()
    return list(set( [ url_hdr(r[0]) for r in results] ))

def perf_data( start, stop, server, method='aggregate' ):
    """Returns performance data for transfers with times between 'start' and 'stop', and a
    specified server.
    The times should be in a modified ISO 8601 format without letter separators, e.g.
    '2019-01-25 13:04'.  The server - both the data node and the protocol - is specified as the
    first characters of the url, e.g. 'gsiftp://esgf1.dkrz.de' or 'http://esgf1.dkrz.de'.
    Optionally you may provide a method argument to specify how the rate is to be computed."""
    cmd = ("SELECT start_date, end_date, size FROM file WHERE start_date>='{0}' AND " +\
           "end_date<='{1}' AND url LIKE '{2}%' AND " +\
           "(status='done' OR status='published') AND size IS NOT NULL").format(start, stop, server)
    # ...For more accuracy, I could include files overlapping the (start,stop) boundary, i.e.
    # end_date>{0} and start_date<{1}.  Then I would have to reduce the file size in proportion
    # to the amount of the file's download time which is within (start,stop).
    curs.execute( cmd )
    results = curs.fetchall()
    sizes = [ size for (start_date,end_date,size) in results ]
    Nfiles = len(sizes)
    totsize = sum(sizes)
    if totsize==0:
        return None,None,None,None,None
    avgsize = totsize/Nfiles/1024./1024
    spf = 0  # don't want to compute it in non-default cases

    if method=='aggregate':  # (bytes downloaded)/(downloading time).  Takes parallelism
        # into account, and doesn't count inactive time in (start,stop).
        active_time = downloading_intervals( start, stop, results )
        if active_time>0:
            retrate = totsize/active_time/1024/1024.
            retsize = totsize/1024/1024/1024.
        else:
            retrate = 0
            retsize = 0
        spf = active_time/len(sizes)
    elif method=='aggregate-crude':  # simply (bytes downloaded)/(stop-start).  Takes parallelism
        # into account, but it's off, sometimes way off, if there are inactive periods.
        delta = str2time(stop) - str2time(start)
        retrate = totsize/delta.total_seconds()/1024/1024.
        retsize = totsize/1024/1024/1024.
    elif method=='seqsize':  # size-weighted method, but based on separate rates for each file,
        # thus like "synda metric" except that the average is weighted by file size.
        # In other words, compute time as if everything were sequential.
        deltas = [ str2time(end_date)-str2time(start_date) for (start_date,end_date,size) in results ]
        delta = datetime.timedelta(0)  # sum() doesn't work on timedelta objects
        for dl in deltas:
            delta += dl
        retrate = totsize/delta.total_seconds()/1024/1024.
        retsize = totsize/1024/1024/1024.
    elif method=='arith':  # simple arithmetic average
        rates = [ size/(str2time(end_date)-str2time(start_date)).total_seconds()
                  for (start_date,end_date,size) in results if size!=0 ]
        retrate = sum(rates)/1024/1024./len(rates)
        retsize = totsize/1024/1024/1024.
    else:  # the simple arithmetic average which Synda does, but still restricted to the
        # protocol+server and the date range.  This is a bit less precise than arith because
        # the 'rate' column in the database has been rounded to an integer.
        cmd = ("SELECT avg(rate) FROM file WHERE status='done' AND rate IS NOT NULL AND "+\
               "start_date>='{0}' AND end_date<='{1}' AND size IS NOT NULL AND "+\
               "url LIKE '{2}%'").format(start,stop,server)
        curs.execute( cmd )
        results = curs.fetchall()
        retrate = results[0][0]/1024/1024.
        retsize = totsize/1024/1024/1024.
    return round(retrate,4), round(spf,4), round(retsize,4), round(avgsize,4), Nfiles
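
# Hypothetical example, given a populated sdt.db:
#   setup()
#   rate, spf, size, avgsize, nfiles = perf_data('2019-01-25 13:04', '2019-01-25 14:04',
#                                                'gsiftp://esgf1.dkrz.de')
#   # rate is in MiB/s, spf is seconds per file, size is in GiB, and avgsize is in MiB.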


if __name__ == '__main__':
    setup()
    print("args=", sys.argv)
    if len( sys.argv ) < 3:
        print("provide start time, stop time, and server in the form of")
        print("  '2019-01-25 13:04' '2019-01-25 14:04' 'gsiftp://esgf1.umr-cnrm.fr'")
        print("  You can use a % wildcard character when specifying the server.")
        print("  You can use a T instead of a space between the date and time.")
    else:
        if False:  # for tests:
            for method in ['aggregate','aggregate-crude','seqsize','arith','synda']:
                rate,spf,size,avgsize,Nfiles = perf_data( sys.argv[1], sys.argv[2], sys.argv[3], method )
                if rate is None:
                    print("No data downloaded")
                else:
                    print(method, ' ', rate, "MiB/s", size, "GiB")
        else:
            # Times with a T work better in scripts, e.g. '2019-01-25T13:04'.
            # The Synda database uses a space between the date and time, e.g.
            # '2019-01-25 13:04'
            start = sys.argv[1].replace('T',' ')
            stop = sys.argv[2].replace('T',' ')
            if len(sys.argv)>=4:
                server = sys.argv[3]
            else:
                server = '%'
            rate,spf,size,avgsize,Nfiles = perf_data( start, stop, server )
            if rate is None:
                print("No data downloaded")
            else:
                uhs = url_hdrs( start, stop, server )
                uhs.sort()
                print('rate', rate, "MiB/s Nfiles", Nfiles, " size", size, "GiB", "avg size", avgsize, "MiB", uhs)
                if len(uhs)>1:
                    for uh in uhs:
                        rate,spf,size,avgsize,Nfiles = perf_data( start, stop, uh )
                        print("rate {:6.2f}".format(rate),
                              "MiB/s Nfiles {:5d}".format(Nfiles),
                              " size {:8.2f}".format(size),
                              "GiB", " avg size {:8.2f}".format(avgsize), "MiB", uh)

    finish()



