Skip to content

Commit

Permalink
Keep the intermediate *.json and *.txt files for a month. These list …
Browse files Browse the repository at this point in the history
…the newly

retracted datasets.  Doing this requires the filename to contain the day of the
month and the constraints (normally the values of the data node, frequency, or
realm).
  • Loading branch information
painter1 committed Mar 29, 2021
1 parent 6ca1b56 commit 34f60d1
Showing 1 changed file with 22 additions and 15 deletions.
37 changes: 22 additions & 15 deletions retracted.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,13 @@ def save_starting_offset( starting_offset ):
with open(foffset,'w') as f:
f.write( str(starting_offset) )

def one_query( cmd, starting_offset, prefix, test ):
def one_query( cmd, starting_offset, path, test ):
"""Does one query specified by cmd, starting at the specified offset.
Returns the number of datasets received in the response, which can be used
to compute the next offset. Also returns numFound, extracted from the response; and
Nchanges, the number of datasets which were newly marked as retracted.
The other arguments are a prefix for constructing file paths, and a flag which is True iff
The other arguments are a file path for the json input file and txt output file (without
the '.json' and '.txt' suffixes), and a flag which is True iff
this is a test of the query system, and the database is not to be referenced."""

logging.info( cmd )
Expand All @@ -60,7 +61,7 @@ def one_query( cmd, starting_offset, prefix, test ):
raise e

# logging and convert json output to a text list of datasets:
cmd = "grep numFound "+prefix+"%s.json" % starting_offset
cmd = "grep numFound "+path+".json"
# Apply the grep command.
# BTW this is simpler but prints the whole numFound line: sp.call(cmd, shell=True)
# example of nFstr:
Expand All @@ -71,18 +72,17 @@ def one_query( cmd, starting_offset, prefix, test ):
# example of numFound (an int): 132311
numFound = map(int, re.findall(r'\d+',nF) )[0]
logging.info( nF )
cmd = ('grep \\"instance_id\\" '+prefix+'%s.json > '+prefix+'%s.txt') %\
(starting_offset,starting_offset)
cmd = 'grep \\"instance_id\\" '+path+'.json > '+path+'.txt'
sp.call(cmd, shell=True)
with open(prefix+'%s.txt' % starting_offset) as fids: num_lines = len(fids.readlines())
with open(path+'.txt') as fids: num_lines = len(fids.readlines())
# ... a terse way to count lines, memory hog ok because file is <10K lines.
if num_lines<numFound:
raise numFoundException

# Record the retracted datasets in the database:
if not test:
try:
Nchanges = status_retracted.status_retracted( prefix+'%s.txt' % starting_offset )
Nchanges = status_retracted.status_retracted( path+'.txt' )
# ... this defaults to suffix='retracted'
except Exception as e:
# database access errors are what I want to be prepared for, but I'm
Expand All @@ -91,7 +91,8 @@ def one_query( cmd, starting_offset, prefix, test ):
logging.error("will try again soon")
# ... For AssertionError, e or str(e) prints as ''.
# But an error here usually is a "database is locked". Rather than do the right
# thing, I'll do the simplest to code: wait 10 minutes and try again.
# thing, I'll do the simplest to code: wait 10 minutes and go on. Maybe we'll
# succeed the next time status_retracted is called on this data.
# The '-1' is a flag to tell the caller to leave starting_offset unchanged so the
# next run will retry from the same point.
time.sleep(600)
Expand All @@ -115,7 +116,8 @@ def get_retracted( prefix,
% (starting_offset,starting_offset)
Nchangesall = 0
for N in range(npages):
num_lines, numFound, Nchanges = one_query( cmd, starting_offset, prefix, test )
path = prefix+str(starting_offset)
num_lines, numFound, Nchanges = one_query( cmd, starting_offset, path, test )
numFoundmax = max( numFoundmax, numFound )
Nchangesall += Nchanges
if num_lines == -1:
Expand All @@ -132,14 +134,19 @@ def get_retracted( prefix,

def get_some_retracted( prefix, constraints='', test=True ):
"""Like get_retracted, but the query is limited as specified and _not_ paginated.
Example of a constraint: "data_node=esgf-data3.ceda.ac.uk". numFound is returned.
Also returns Nchanges, the number of datasets which were newly marked as retracted.
The string constraints is the concatenation of 0 or more constraints separated by '&'.
Example of a constraint: "data_node=esgf-data3.ceda.ac.uk".
Two numbers are returned: numFound and Nchanges, the number of datasets which were newly
marked as retracted.
"""
cmd = "wget -O "+prefix+\
"0.json 'https://esgf-node.llnl.gov/esg-search/search?project=CMIP6&retracted=true&" +\
constr2 = constraints.replace('!=','=NOT')
constr3 = '_'.join([con.split('=')[1] for con in constr2.split('&') if con.find('=')>=0])
path = prefix + constr3
cmd = "wget -O "+path+'.json'+\
" 'https://esgf-node.llnl.gov/esg-search/search?project=CMIP6&retracted=true&" +\
constraints +\
"&fields=instance_id&replica=false&limit=10000'"
num_lines, numFound, Nchanges = one_query( cmd, 0, prefix, test )
num_lines, numFound, Nchanges = one_query( cmd, 0, path, test )
logging.info( "get_some_retracted; constraints=%s, num_lines=%s, numFound=%s"%
(constraints, num_lines, numFound ) )
if num_lines<numFound:
Expand Down Expand Up @@ -260,7 +267,7 @@ def get_retracted_std3( prefix, complement_query=True, test=True ):
'mon', 'monC', 'monPt', 'month', 'subhrPt', 'yr', 'yrPt' ]
# data_nodes = ["esgf-data3.ceda.ac.uk"]
data_nodes = my_data_nodes()
realms = [ 'aerosol', 'atmos', 'atmosChem', 'land', 'landIce', 'ocean', ' ocnBgChem',
realms = [ 'aerosol', 'atmos', 'atmosChem', 'land', 'landIce', 'ocean', 'ocnBgChem',
'ocnBgchem', 'seaIce' ]
fcts = [ ('data_node',data_nodes), ('frequency',frequencies), ('realm',realms) ]
return get_retracted_multi_facets( prefix, fcts, '', complement_query, test)
Expand Down

0 comments on commit 34f60d1

Please sign in to comment.