These files are indirectly referenced by scripts in my crontab.
Note that debug.py is harmless but not needed; I had used it for development but
didn't always remember to delete it.
painter1 committed Mar 8, 2021
1 parent 5db6fec commit b53c13c
Showing 3 changed files with 301 additions and 0 deletions.
59 changes: 59 additions & 0 deletions count_installed.py
@@ -0,0 +1,59 @@
#!/usr/bin/env python3

"""Adds up the number of files and amount of data in an installation log file."""

import debug

Kval = 1000 # Value of a K - it might be 1000 or 1024, I don't know what Synda uses.
Mval = Kval*Kval
Gval = Kval*Mval
Tval = Kval*Gval

import sys
from pprint import pprint

# The installation log file should be the first (and only) argument.
# For example: infile = '/home/painter/install.2020.10.19.log'
# The installation script should create this file, append to it with every
# "synda install" command, and, near the end, run this script.

def bytecount_for_people(num):
    # from https://stackoverflow.com/questions/579310/formatting-long-numbers-as-strings-in-python
    num = float('{:.3g}'.format(num))
    magnitude = 0
    while abs(num) >= 1000:
        magnitude += 1
        num /= 1000.0
    return '{}{}'.format('{:f}'.format(num).rstrip('0').rstrip('.'),
                         ['', ' KB', ' MB', ' GB', ' TB'][magnitude])
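
# Hypothetical example:  bytecount_for_people(1234567) returns '1.23 MB'.
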
def run( infile ):
    with open(infile) as f:
        lines = f.readlines()
    szlines = [line for line in lines if line.find('Once downloaded,')==0]
    ctlines = [line for line in lines if line.find('will be added')>0]

    szs = [ l.replace('Once downloaded, ','') for l in szlines ]
    szs = [ l.replace(' of additional disk space will be used.\n','') for l in szs ]
    szsKB = [ l.replace(' KB','') for l in szs if l.find('KB')>0 ]
    szsKB += [ l.replace(' kB','') for l in szs if l.find('kB')>0 ]
    szKB = sum(map(float,szsKB))  # float() is safer than eval() on file contents
    szsMB = [ l.replace(' MB','') for l in szs if l.find('MB')>0 ]
    szMB = sum(map(float,szsMB))
    szsGB = [ l.replace(' GB','') for l in szs if l.find('GB')>0 ]
    szGB = sum(map(float,szsGB))
    szsTB = [ l.replace(' TB','') for l in szs if l.find('TB')>0 ]
    szTB = sum(map(float,szsTB))
    assert len(szs) == len(szsKB)+len(szsMB)+len(szsGB)+len(szsTB)
    sz = Kval*szKB + Mval*szMB + Gval*szGB + Tval*szTB
    print("Total size installed =", bytecount_for_people(sz))

    cts = [ l.replace(' file(s) will be added to the download queue.\n', '') for l in ctlines ]
    ct = sum(map(int,cts))
    print("Total number of files installed =", '{:,}'.format(ct))

if __name__ == '__main__':
    if len( sys.argv ) > 1:
        installlog = sys.argv[1]
        run( installlog )
    else:
        print("Supply the installation log file")
32 changes: 32 additions & 0 deletions debug.py
@@ -0,0 +1,32 @@
# If you are on the developers list below, and you have imported this file, any uncaught exception
# will go to the Python debugger.

# based on https://stackoverflow.com/questions/242485/starting-python-debugger-automatically-on-error

try:
    import getpass, os
    developers = [ 'painter', 'painter1' ]
    if (getpass.getuser() in developers and os.environ.get("PY_DEBUG_EXCEP",True)!='False')\
       or os.environ.get("PY_DEBUG_EXCEP",False)=='True':
        import sys

        def info(type, value, tb):
            if hasattr(sys, 'ps1') or not sys.stderr.isatty():
                # we are in interactive mode or we don't have a tty-like
                # device, so we call the default hook
                sys.__excepthook__(type, value, tb)
            else:
                import traceback, pdb
                # we are NOT in interactive mode: print the exception...
                traceback.print_exception(type, value, tb)
                print()
                # ...then start the debugger in post-mortem mode.
                # pdb.pm() # deprecated
                pdb.post_mortem(tb) # more "modern"

        sys.excepthook = info
except:
    # Never let this debugging aid break the script that imports it.
    pass
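
# Hypothetical usage: put "import debug" near the top of a script, then run it with
#   PY_DEBUG_EXCEP=True python myscript.py
# Any uncaught exception will then drop into pdb's post-mortem debugger.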
210 changes: 210 additions & 0 deletions synda-perf.py
@@ -0,0 +1,210 @@
#!/usr/bin/env python3

"""Computes useful performance data from the Synda database. The beginning and ending dates and
times should be provided in a modified ISO 8601 format without letter separators, e.g.
'2019-01-25 13:04'. The third argument is a partial url, which is normally used to specify the
protocol and data node, e.g. gsiftp://vesg.ipsl.upmc.fr. But the % wildcard is permitted, and a
longer url may be used to narrow the coverage further."""
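
# Example invocation (a T may replace the space in the times; see __main__ below):
#   ./synda-perf.py '2019-01-25T13:04' '2019-01-25T14:04' 'gsiftp://esgf1.umr-cnrm.fr'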

import os, sys, glob
from pprint import pprint
import sqlite3
#import debug, pdb
import datetime
global conn, curs

def setup():
    """Initializes the connection to the database, etc."""
    global conn, curs
    # normal:
    conn = sqlite3.connect('/var/lib/synda/sdt/sdt.db')
    # test on a temporary copy of the database:
    #conn = sqlite3.connect('~/db/sdt.db')
    curs = conn.cursor()

def finish():
    """Closes connections to databases, etc."""
    global conn, curs
    conn.commit()
    conn.close()

def str2time( date ):
    """Given a date string such as '2019-01-25 13:04' or '2019-01-25 13:04:13.922788',
    this function returns a datetime object representing the date."""
    FMT_min = '%Y-%m-%d %H:%M'
    FMT_sec = '%Y-%m-%d %H:%M:%S'
    FMT_frac = '%Y-%m-%d %H:%M:%S.%f'
    try:
        return datetime.datetime.strptime( date, FMT_frac )
    except ValueError:
        try:
            return datetime.datetime.strptime( date, FMT_sec )
        except ValueError:
            return datetime.datetime.strptime( date, FMT_min )
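
# e.g. str2time('2019-01-25 13:04') == datetime.datetime(2019, 1, 25, 13, 4)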

def downloading_intervals( startin, stopin, file_intervals ):
    """Returns active_time: the amount of time, in seconds, within (start,stop) in which at least
    one of the files described by 'file_intervals' was being downloaded.
    Input parameters:
    - startin and stopin define the overall time interval.
      The times provided are strings suitable for str2time, e.g. "2019-01-25 13:04:00.123"
    - The list 'file_intervals' is a list of tuples from the database, of the form
      (start_date, end_date, <ignored>).  Each tuple defines a time interval in which one file
      was being downloaded.
    """
    start = str2time( startin )  # not used below; perf_data's query already restricts start_date>=start
    stop = str2time( stopin )
    file_ints = [ ( str2time(file_int[0]), str2time(file_int[1]) ) for file_int in file_intervals ]

    file_ints.sort( key=(lambda x: x[0]) )  # sort by each file's start_date.

    # Because file_ints is sorted, the following computes intervals in a sorted order, sorted by
    # the bottom (start) time.  Each file_int either extends an interval at the top, or starts a
    # new interval above the top of the previous interval.  That is, the intervals are disjoint,
    # and ordered by the top (stop) time as well as the bottom time.
    # intervals = []  # not used, but may be useful for debugging
    bot = file_ints[0][0]
    top = file_ints[0][1]
    active_time = 0
    for file_int in file_ints:
        if file_int[0]<=top:  # extend present interval
            top = max( top, file_int[1] )
        else:  # new interval, after all previous intervals and previous files' end_date.
            active_time += (top-bot).total_seconds()
            # intervals.append( (bot,top) )  # save the finished interval (not used)
            bot = file_int[0]
            top = file_int[1]
    if top<=stop:  # always holds when callers restrict end_date<=stop; <= also counts a file
        # which finishes exactly at stop.
        active_time += (top-bot).total_seconds()
        # intervals.append( (bot,top) )  # save the last interval (not used)

    return active_time
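
# Worked example with hypothetical times: downloads spanning 13:00-13:10, 13:05-13:20, and
# 13:30-13:35 merge into the disjoint intervals [13:00,13:20] and [13:30,13:35], so the
# active time is 20*60 + 5*60 = 1500 seconds, however much the downloads overlap.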

def url_hdr( url ):
    """url header, i.e. the protocol and data node but no more of the url."""
    upto_third_slash = url[: url.find('/', 2+url.find('//'))]
    return upto_third_slash
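
# e.g. url_hdr('gsiftp://vesg.ipsl.upmc.fr/some/path/file.nc') returns 'gsiftp://vesg.ipsl.upmc.fr'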

def url_hdrs( start, stop, server, method='aggregate' ):
    """Returns url headers (with protocol and data node) for transfers with times between
    'start' and 'stop', and a specified server.  These are the same transfers as for the
    corresponding call of perf_data()."""
    # If the SQL command is changed in perf_data, then this should be changed to match:
    cmd = ("SELECT url FROM file WHERE start_date>='{0}' AND " +\
           "end_date<='{1}' AND url LIKE '{2}%' AND " +\
           "(status='done' OR status='published') AND size IS NOT NULL").format(start, stop, server)
    curs.execute( cmd )
    results = curs.fetchall()
    return list(set( [ url_hdr(r[0]) for r in results] ))

def perf_data( start, stop, server, method='aggregate' ):
    """Returns performance data for transfers with times between 'start' and 'stop', and a
    specified server.
    The times should be in a modified ISO 8601 format without letter separators, e.g.
    '2019-01-25 13:04'.  The server - both the data node and the protocol - is specified as the
    first characters of the url, e.g. 'gsiftp://esgf1.dkrz.de' or 'http://esgf1.dkrz.de'.
    Optionally you may provide a method argument to specify how the rate is to be computed."""
    cmd = ("SELECT start_date, end_date, size FROM file WHERE start_date>='{0}' AND " +\
           "end_date<='{1}' AND url LIKE '{2}%' AND " +\
           "(status='done' OR status='published') AND size IS NOT NULL").format(start, stop, server)
    # ...For more accuracy, I could include files overlapping the (start,stop) boundary, i.e.
    # end_date>{0} and start_date<{1}.  Then I would have to reduce the file size in proportion
    # to the amount of the file's download time which is within (start,stop).
    curs.execute( cmd )
    results = curs.fetchall()
    sizes = [ size for (start_date,end_date,size) in results ]
    Nfiles = len(sizes)
    totsize = sum(sizes)
    if totsize==0:
        return None,None,None,None,None
    avgsize = totsize/Nfiles/1024./1024
    spf = 0  # don't want to compute it in non-default cases

    if method=='aggregate':  # (bytes downloaded)/(downloading time).  Takes parallelism
        # into account, and doesn't count inactive time in (start,stop).
        active_time = downloading_intervals( start, stop, results )
        if active_time>0:
            retrate = totsize/active_time/1024/1024.
            retsize = totsize/1024/1024/1024.
        else:
            retrate = 0
            retsize = 0
        spf = active_time/len(sizes)
    elif method=='aggregate-crude':  # simply (bytes downloaded)/(stop-start).  Takes parallelism
        # into account, but it's off, sometimes way off, if there are inactive periods.
        delta = str2time(stop) - str2time(start)
        retrate = totsize/delta.total_seconds()/1024/1024.
        retsize = totsize/1024/1024/1024.
    elif method=='seqsize':  # size-weighted method, but based on separate rates for each file,
        # thus like "synda metric" except that the average is weighted by file size.
        # In other words, compute time as if everything were sequential.
        deltas = [ str2time(end_date)-str2time(start_date) for (start_date,end_date,size) in results ]
        delta = datetime.timedelta(0)  # sum() doesn't work on timedelta objects
        for dl in deltas:
            delta += dl
        retrate = totsize/delta.total_seconds()/1024/1024.
        retsize = totsize/1024/1024/1024.
    elif method=='arith':  # simple arithmetic average
        rates = [ size/(str2time(end_date)-str2time(start_date)).total_seconds()
                  for (start_date,end_date,size) in results if size!=0 ]
        retrate = sum(rates)/1024/1024./len(rates)
        retsize = totsize/1024/1024/1024.
    else:  # the simple arithmetic average which Synda does, but still restricted to the
        # protocol+server and the date range.  This is a bit less precise than arith because
        # the 'rate' column in the database has been rounded to an integer.
        cmd = ("SELECT avg(rate) FROM file WHERE status='done' AND rate IS NOT NULL AND "+\
               "start_date>='{0}' AND end_date<='{1}' AND size IS NOT NULL AND "+\
               "url LIKE '{2}%'").format(start,stop,server)
        curs.execute( cmd )
        results = curs.fetchall()
        retrate = results[0][0]/1024/1024.
        retsize = totsize/1024/1024/1024.
    return round(retrate,4), round(spf,4), round(retsize,4), round(avgsize,4), Nfiles
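
# Hypothetical example, given a populated sdt.db:
#   setup()
#   rate, spf, size, avgsize, nfiles = perf_data('2019-01-25 13:04', '2019-01-25 14:04',
#                                                'gsiftp://esgf1.dkrz.de')
#   # rate is in MiB/s, spf is seconds per file, size is in GiB, and avgsize is in MiB.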


if __name__ == '__main__':
    setup()
    print("args=", sys.argv)
    if len( sys.argv ) < 3:
        print("provide start time, stop time, and server in the form of")
        print("  '2019-01-25 13:04' '2019-01-25 14:04' 'gsiftp://esgf1.umr-cnrm.fr'")
        print("  You can use a % wildcard character when specifying the server.")
        print("  You can use a T instead of a space between the date and time.")
    else:
        if False:  # for tests:
            for method in ['aggregate','aggregate-crude','seqsize','arith','synda']:
                rate,spf,size,avgsize,Nfiles = perf_data( sys.argv[1], sys.argv[2], sys.argv[3], method )
                if rate is None:
                    print("No data downloaded")
                else:
                    print(method, ' ', rate, "MiB/s", size, "GiB")
        else:
            # Times with a T work better in scripts, e.g. '2019-01-25T13:04'.
            # The Synda database uses a space between the date and time, e.g.
            # '2019-01-25 13:04'
            start = sys.argv[1].replace('T',' ')
            stop = sys.argv[2].replace('T',' ')
            if len(sys.argv)>=4:
                server = sys.argv[3]
            else:
                server = '%'
            rate,spf,size,avgsize,Nfiles = perf_data( start, stop, server )
            if rate is None:
                print("No data downloaded")
            else:
                uhs = url_hdrs( start, stop, server )
                uhs.sort()
                print('rate', rate, "MiB/s Nfiles", Nfiles, " size", size, "GiB", "avg size", avgsize, "MiB", uhs)
                if len(uhs)>1:
                    for uh in uhs:
                        rate,spf,size,avgsize,Nfiles = perf_data( start, stop, uh )
                        print("rate {:6.2f}".format(rate),
                              "MiB/s Nfiles {:5d}".format(Nfiles),
                              " size {:8.2f}".format(size),
                              "GiB", " avg size {:8.2f}".format(avgsize), "MiB", uh)

    finish()



