collab-uniba · maelick · Apr 8, 2019 · Apr 8, 2019 · Apr 8, 2019 · Apr 9, 2019
diff --git a/ClassificationTask/classification_functions.R b/ClassificationTask/classification_functions.R
@@ -0,0 +1,121 @@
+library(data.table)
+library(LiblineaR)
+
+#' Read Features
+#'
+#' Read a feature file, strip the leading "t" from each id and shift it by one.
+#' @param filename CSV file containing features, including an id column.
+#' @return A data.table object with id column parsed as an integer.
+ReadFeatures <- function(filename) {
+  features <- fread(filename)
+  ## 1L, not 1: adding a double would silently turn the id column into numeric.
+  features[, id := as.integer(sub("^t", "", id)) + 1L]
+  features
+}
+
+#' Features
+#'
+#' Call Senti4SD-fast.jar to compute features.
+#'
+#' @param input Input text file with a single column and no header.
+#' @param output Output file containing computed features.
+#' @param path Directory where the Senti4SD jar and dsm.bin files are stored.
+#' @param read.file If TRUE, read with data.table the content of output file.
+#' @param use.temp.file If TRUE, output is removed when the function exits,
+#'   even after an error. This also enforces read.file as TRUE.
+#' @return The result of feature extraction if read.file is TRUE, stdout and
+#'   stderr of the Java process as invisible object otherwise. Stops with the
+#'   Java output if a feature file was requested but is missing afterwards.
+Features <- function(input, output, path=".", read.file=TRUE, use.temp.file=FALSE) {
+  ## system2() pastes args unquoted, so a path with spaces must be quoted;
+  ## expand "~" first because a quoted tilde is no longer expanded by the shell.
+  QuotePath <- function(p) shQuote(path.expand(p))
+  args <- c("-jar", QuotePath(file.path(path, "Senti4SD-fast.jar")), "-F", "A",
+            "-i", QuotePath(input), "-W", QuotePath(file.path(path, "dsm.bin")),
+            "-oc", QuotePath(output), "-vd", "600")
+  if (use.temp.file) on.exit(unlink(output), add=TRUE)
+  java.log <- system2("java", args, stdout=TRUE, stderr=TRUE)
+  if (!(read.file || use.temp.file)) return(invisible(java.log))
+  if (!file.exists(output)) {
+    stop("Senti4SD did not produce ", output, ":\n",
+         paste(java.log, collapse="\n"), call.=FALSE)
+  }
+  ReadFeatures(output)
+}
+
+#' Load Model
+#'
+#' Load the LiblineaR model saved under the name \code{m} in an Rda file.
+#' @param model.filename Rda file where the model is stored.
+#' @return the LiblineaR model; errors if the file holds no object \code{m}.
+LoadModel <- function(model.filename) {
+  loaded <- new.env(parent=emptyenv())  ## isolated: only the file can supply m
+  load(model.filename, envir=loaded)
+  get("m", envir=loaded, inherits=FALSE)
+}
+
+#' Predict
+#'
+#' Predict polarity from every feature column except id.
+#' @param model The LiblineaR model object.
+#' @param features The feature data.table object.
+#' @return A factor with levels positive, negative and neutral.
+Predict <- function(model, features) {
+  is.predictor <- names(features) != "id"
+  outcome <- predict(model, features[, is.predictor, with=FALSE])
+  outcome$predictions
+}
+
+#' Senti4SD
+#'
+#' Runs Senti4SD on given pieces of text.
+#'
+#' @param text A character vector on which to run Senti4SD.
+#' @param model The LiblineaR model to use for prediction.
+#' @param senti4sd.path Path where Senti4SD jar file is located.
+#' @return A data.table object with text, id and (predicted) polarity
+#'   columns, ordered by id; texts whose features contain NA are omitted.
+Senti4SD <- function(text, model, senti4sd.path) {
+  text.file <- tempfile()
+  ## Registered before any work so the file is removed even when a step fails.
+  on.exit(unlink(text.file), add=TRUE)
+  ## Newlines are flattened, presumably so each text stays on a single row.
+  fwrite(data.table(gsub("\n", " ", text)), text.file, row.names=FALSE, col.names=FALSE)
+  features <- na.omit(Features(text.file, tempfile(), senti4sd.path, TRUE, TRUE))
+  result <- cbind(features[, list(id)], polarity=Predict(model, features))
+  result <- result[order(id)]
+  result[, text := text[id]]
+  result
+}
+
+#' Senti4SD Chunked
+#'
+#' Runs Senti4SD on given pieces of text by splitting the input in
+#' different chunks and running Senti4SD on each chunk.
+#'
+#' @param text A character vector on which to run Senti4SD.
+#' @param model The LiblineaR model to use for prediction.
+#' @param senti4sd.path Path where Senti4SD jar file is located.
+#' @param chunk.size Maximum number of text elements to consider for
+#'   one single run of Senti4SD. Must be positive.
+#' @param memory.limit Maximum amount of memory (in GB) to use for one
+#'   run of Senti4SD. Overrides \code{chunk.size} by setting it to
+#'   \code{500 * memory.limit}.
+#' @return A data.table object with text, id and (predicted) polarity
+#'   columns, where id is the position of the text in \code{text}.
+Senti4SDChunked <- function(text, model, senti4sd.path,
+                            chunk.size=1000, memory.limit=0) {
+  if (memory.limit > 0) chunk.size <- 500 * memory.limit
+  stopifnot(chunk.size > 0)
+  ## seq_along(), unlike 1:length(), yields no chunk at all for empty input.
+  chunks <- split(text, (seq_along(text) - 1) %/% chunk.size)
+  ## Texts preceding each chunk: turns chunk-local ids into positions in text.
+  offsets <- cumsum(c(0, lengths(chunks)))
+  rbindlist(lapply(seq_along(chunks), function(k) {
+    chunk <- chunks[[k]]
+    logging::loginfo("Running Senti4SD on chunk %d of size %d", k, length(chunk))
+    timing <- system.time(res <- Senti4SD(chunk, model, senti4sd.path))
+    logging::loginfo("Senti4SD run on chunk %d in %.2f seconds", k, timing["elapsed"])
+    res[, id := id + offsets[k]]
+  }))
+}
diff --git a/ClassificationTask/classification_new.R b/ClassificationTask/classification_new.R
@@ -0,0 +1,75 @@
+## Classification over an input dataset using an input liblinear model
+## (or default SO model if not present)
+
+logging::basicConfig()  ## presumably sets up logging's default handler so later log calls are shown (confirm)
+
+## Directory of the script being run, taken from the --file= argument that
+## Rscript passes to R, or "." when no such argument is present.
+ScriptPath <- function() {
+  ## Anchored, and only the first hit is used: an argument of the script itself
+  ## could also contain "--file=" and must not be mistaken for the script.
+  script.arg <- grep("^--file=", commandArgs(trailingOnly=FALSE), value=TRUE)
+  if (length(script.arg) == 0) {
+    return(".")
+  }
+  dirname(sub("^--file=", "", script.arg[1]))
+}
+
+## Parse the positional command-line arguments: (1) input file, required;
+## (2) output file, default "predictions.csv"; (3) feature file, default a
+## temporary file; (4) model file, default modelLiblinear.Rda in script.path;
+## (5) whether the feature file is temporary, default FALSE (always TRUE when
+## no feature file is given). Returns a list with elements input.file,
+## output.file, feature.file, use.temp.file and model.file.
+ParseArgs <- function(script.path) {
+  cli <- commandArgs(TRUE)
+  if (length(cli) < 1) {
+    stop("At least one argument must be supplied.", call.=FALSE)
+  }
+  ## Positional argument n, or fallback when fewer than n were supplied.
+  Arg <- function(n, fallback=NULL) if (length(cli) >= n) cli[[n]] else fallback
+  feature.file <- Arg(3)
+  if (is.null(feature.file)) {
+    feature.file <- tempfile()
+    use.temp.file <- TRUE
+    message(sprintf("No feature file provided, using temporary file %s.",
+                    feature.file))
+  } else {
+    use.temp.file <- as.logical(Arg(5, FALSE))
+    ## || (scalar, short-circuit) so is.na() never sees a zero-length value.
+    if (length(use.temp.file) == 0 || is.na(use.temp.file)) {
+      stop("Supplied boolean for temporary file is incorrectly formatted")
+    }
+  }
+  model.file <- Arg(4)
+  if (is.null(model.file)) {
+    message("No LiblinearModel supplied. Default StackOverflow model will be used.")
+    model.file <- file.path(script.path, "modelLiblinear.Rda")
+  }
+  list(input.file=cli[[1]], output.file=Arg(2, "predictions.csv"),
+       feature.file=feature.file, use.temp.file=use.temp.file,
+       model.file=model.file)
+}
+
+script.path <- ScriptPath()
+## Options are read as args$... rather than attach()ed onto the search path.
+args <- ParseArgs(script.path)
+
+source(file.path(script.path, "classification_functions.R"))
+
+model <- LoadModel(args$model.file)
+
+features <- Features(args$input.file, args$feature.file, script.path, TRUE,
+                     args$use.temp.file)
+## na.omit() drops rows with missing values (NA/NaN); Inf is not removed.
+features <- na.omit(features)
+
+prediction <- Predict(model, features)
+result <- cbind(features[, list(id)], polarity=prediction)
+result <- result[order(id)]
+
+text <- read.csv2(args$input.file, header=FALSE, col.names="text")$text
+result[, text := text[id]]
+
+fwrite(result, args$output.file)
+message(sprintf("%s was successfully created.", args$output.file))