Skip to content

Commit

Permalink
Merge pull request #81 from UUDigitalHumanitieslab/feature/calculate-…
Browse files Browse the repository at this point in the history
…per-document

Feature/calculate per document
  • Loading branch information
oktaal committed Oct 11, 2023
2 parents 0937d4a + 964c350 commit cc76961
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 5 deletions.
2 changes: 1 addition & 1 deletion docker/deployment/command.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ then
fi

# same user as dispatcher
sudo -u _apt ./restart-projects.py &
sudo -u _apt LD_LIBRARY_PATH="$LD_LIBRARY_PATH" ./restart-projects.py &

# Configure services
cd runit.d
Expand Down
38 changes: 34 additions & 4 deletions webservice/tscanservice/tscanwrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@
TSCANSRC = sys.argv[7]
ALPINOHOME = sys.argv[8]


# A crashed earlier attempt may have left generated files behind in the
# input directory; delete every *.tscan.xml and *.csv found there before
# starting a new run.
for filepath in chain.from_iterable(
        glob.glob(f"{inputdir}/{wildcard}")
        for wildcard in ("*.tscan.xml", "*.csv")):
    os.remove(filepath)

#Obtain all data from the CLAM system (passed in $DATAFILE (clam.xml))
clamdata = clam.common.data.getclamdata(datafile)

Expand Down Expand Up @@ -129,10 +136,12 @@ def init_alpino_lookup(configfile, inputdir, outputdir):
map(lambda x: x[1], sorted(words.items(), key=lambda x: x[0])))
alpino_lookup.append((sentence, filepath, index))

if len(alpino_lookup):
save_alpino_lookup(outputdir, alpino_lookup)
# always save the Alpino lookup even when empty:
# this way when we're dealing with multiple documents,
# there will always be an Alpino lookup which can be processed
save_alpino_lookup(outputdir, alpino_lookup)

configfile.write(f"alpino_lookup=\"{outputdir}/alpino_lookup.data\"\n")
configfile.write(f"alpino_lookup=\"{outputdir}/alpino_lookup.data\"\n")

#Write configuration file

Expand Down Expand Up @@ -298,7 +307,28 @@ def sigterm_handler():

#pass all input files at once
clam.common.status.write(statusfile, "Processing " + str(len(inputfiles)) + " files, this may take a while...", 10) # status update
ref = os.system('ALPINO_HOME="' + ALPINOHOME + '" TCL_LIBRARY="' + ALPINOHOME + '/create_bin/tcl8.5" TCLLIBPATH="' + ALPINOHOME + '/create_bin/tcl8.5" tscan --config=' + outputdir + '/tscan.cfg ' + ' '.join(['"' + x + '"' for x in inputfiles]))
import shlex  # local import: only needed to quote file names for the shell

# Run tscan once per input document. `ref` stays 0 when every run succeeded;
# otherwise it holds the exit status of the last run that failed.
ref = 0
# The progress range between 10 and 90 is divided evenly over the documents.
# Guard the division so an empty file list does not raise ZeroDivisionError.
step_size = 80 / len(inputfiles) if inputfiles else 0
# The part of the shell command that is identical for every document.
tscan_command = (
    'ALPINO_HOME="' + ALPINOHOME + '" TCL_LIBRARY="' + ALPINOHOME
    + '/create_bin/tcl8.5" TCLLIBPATH="' + ALPINOHOME
    + '/create_bin/tcl8.5" tscan --config=' + outputdir + '/tscan.cfg ')
for i, infile in enumerate(inputfiles):
    progress = int(10 + i * step_size)
    if i > 0:
        try:
            # Use the generated lookup file as input
            # (presumably written by the previous tscan run -- TODO confirm)
            os.rename(outputdir + "/../out.alpino_lookup.data", outputdir + "/alpino_lookup.data")
            clam.common.status.write(statusfile, "Updated Alpino lookup cache", progress)
        except FileNotFoundError:
            clam.common.status.write(statusfile, "No Alpino parses, empty document?", progress)
    clam.common.status.write(statusfile, f"Started processing ... {infile}", progress)
    try:
        # Quote the file name: it is interpolated into a shell command line,
        # so spaces or shell metacharacters would otherwise split or alter it.
        exit_status = os.system(tscan_command + shlex.quote(infile))
    except Exception:
        # Best-effort guard: os.system normally signals failure through its
        # return value, but an unexpected exception must still mark this
        # document as failed instead of aborting the remaining documents.
        exit_status = 1

    if exit_status == 0:
        # Report the progress reached after completing this document.
        clam.common.status.write(statusfile, f"Finished processing {infile}", int(10 + (i + 1) * step_size))
    else:
        clam.common.status.write(statusfile, f"PROBLEM PROCESSING {infile} ERROR CODE {exit_status} consult error log", progress)
        ref = exit_status


#collect output
clam.common.status.write(statusfile, "Postprocessing", 90) # status update
Expand Down

0 comments on commit cc76961

Please sign in to comment.