Skip to content

Commit

Permalink
merge refactor_dbgetfieldsintodf
Browse files Browse the repository at this point in the history
Merge branch 'refactor_dbgetfieldsintodf'

# Conflicts:
#	R/ctrLoadQueryIntoDbCtis.R
  • Loading branch information
rfhb committed Jan 22, 2024
2 parents 4ebd70a + 9921685 commit be3eac7
Show file tree
Hide file tree
Showing 87 changed files with 2,246 additions and 2,354 deletions.
2 changes: 1 addition & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ some_database_name.sqlite_file
^cran-comments\.md$
codecov.yml
^CRAN-SUBMISSION$
.vscode
\.vscode
^tools
^files
^scripts
78 changes: 0 additions & 78 deletions .github/workflows/check-standard-linux.yaml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
# note: update
# https://github.com/rfhb/ctrdata/settings/secrets/actions/CACHE_VERSION

on: [push, pull_request]
on: [push, pull_request, workflow_dispatch]

name: R-CMD-CHECK-win-macos-duckdb-mongodb-sqlite
name: R-CMD-CHECK-win-macos-linux-duckdb-mongodb-sqlite-postgres

jobs:
R-CMD-check:
Expand All @@ -17,9 +17,10 @@ jobs:
fail-fast: false
matrix:
config:
- {os: windows-latest, r: 'release'}
- {os: windows-2022, r: 'release'}
- {os: macOS-latest, r: 'release'}
- {os: macOS-latest, r: 'oldrel'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
# error installing dependencies under devel
#- {os: macOS-latest, r: 'devel'}

Expand All @@ -29,13 +30,6 @@ jobs:
RSPM: ${{ matrix.config.rspm }}
GITHUB_PAT: ${{ secrets.GHPAT }}

# https://docs.github.com/en/actions/reference/software-installed-on-github-hosted-runners
# includes on macOS 10.15 mongod 4.2.8 and Windows Server 201{6,9}:
# Version ServiceName ServiceStatus ServiceStartType
# 4.2.8.0 MongoDB Running Automatic
# disabled on Windows from August 2022
# https://github.com/actions/runner-images/issues/5949

steps:
- uses: actions/checkout@v3

Expand All @@ -52,30 +46,24 @@ jobs:
writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
shell: Rscript {0}

- name: Cache R packages
if: runner.os != 'windows-latest'
uses: actions/cache@v1
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-${{ secrets.CACHE_VERSION }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
restore-keys: ${{ runner.os }}-${{ secrets.CACHE_VERSION }}-${{ hashFiles('.github/R-version') }}-1-

# remotes::install_github("r-dbi/RSQLite")
# remotes::install_github(repo = "ropensci/nodbi", ref = "reimplementation_end_2023")
- name: Install dependencies
run: |
utils::install.packages("DBI")
utils::install.packages("RSQLite")
utils::install.packages("nodbi")
utils::install.packages("duckdb", repos = "https://duckdb.r-universe.dev")
remotes::install_github("rfhb/ctrdata")
remotes::install_deps(dependencies = TRUE)
remotes::install_cran(c("rcmdcheck", "covr"))
remotes::install_github("ropensci/nodbi")
shell: Rscript {0}

- uses: r-lib/actions/setup-pandoc@v1
- uses: ankane/setup-mongodb@v1

# - uses: ankane/setup-postgres@v1
# with:
# postgres-version: 14
# database: test
- uses: ankane/setup-postgres@v1
with:
postgres-version: 14
database: test

- name: Test coverage
if: matrix.config.os == 'macOS-latest' && matrix.config.r == 'release'
Expand Down
10 changes: 5 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
Package: ctrdata
Type: Package
Title: Retrieve and Analyze Clinical Trials in Public Registers
Version: 1.16.0.9000
Imports: jsonlite, httr, curl (>= 5.1.0), clipr, xml2, nodbi (>= 0.9.3),
stringi, tibble, lubridate, jqr, dplyr, zip, V8, readr
Version: 1.17.0
Imports: jsonlite, httr, curl (>= 5.1.0), clipr, xml2, nodbi (>= 0.10.0),
stringi, tibble, lubridate, jqr, dplyr, zip, V8, readr, digest
URL: https://cran.r-project.org/package=ctrdata
BugReports: https://github.com/rfhb/ctrdata/issues
Description: A system for querying, retrieving and analyzing
Expand All @@ -26,8 +26,8 @@ Description: A system for querying, retrieving and analyzing
The package can be used for meta-analysis and trend-analysis of
the design and conduct as well as of the results of clinical trials.
License: MIT + file LICENSE
RoxygenNote: 7.2.3
Suggests: devtools, knitr, rmarkdown, RSQLite (>= 2.2.4),
RoxygenNote: 7.3.0
Suggests: devtools, knitr, rmarkdown, RSQLite (>= 2.3.5),
mongolite, tinytest (>= 1.2.1), R.rsp, RPostgres, duckdb
VignetteBuilder: R.rsp
NeedsCompilation: no
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ importFrom(curl,multi_run)
importFrom(curl,new_handle)
importFrom(curl,new_pool)
importFrom(curl,parse_headers)
importFrom(digest,digest)
importFrom(dplyr,c_across)
importFrom(dplyr,mutate)
importFrom(dplyr,rowwise)
Expand All @@ -39,7 +40,11 @@ importFrom(jsonlite,fromJSON)
importFrom(jsonlite,stream_in)
importFrom(jsonlite,toJSON)
importFrom(jsonlite,validate)
importFrom(lubridate,ddays)
importFrom(lubridate,dmonths)
importFrom(lubridate,duration)
importFrom(lubridate,dyears)
importFrom(lubridate,ymd_hms)
importFrom(nodbi,docdb_create)
importFrom(nodbi,docdb_delete)
importFrom(nodbi,docdb_list)
Expand All @@ -51,6 +56,7 @@ importFrom(readr,read_file)
importFrom(readr,write_file)
importFrom(stats,na.omit)
importFrom(stats,setNames)
importFrom(stringi,stri_detect_fixed)
importFrom(stringi,stri_detect_regex)
importFrom(stringi,stri_extract_all_charclass)
importFrom(stringi,stri_extract_all_regex)
Expand Down
20 changes: 18 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,22 @@
# ctrdata 1.16.0.9000 (2023-11-24)
# ctrdata 1.17.0

New development version
## Possibly breaking changes
- Reimplemented `dbGetFieldsIntoDf()` to accelerate and have more predictable, simplified returns, in particular for nested fields; also attempts to recursively expand simply nested data into additional columns in the returned data frame
- Reimplemented `dbFindFields()` to accelerate; both based on improved `nodbi::docdb_query()`
- `dbFindFields()` now digests a sample of records to quickly find fields, or all records if `sample = FALSE`, though this takes increasingly more time as the number of records grows
- If using `nodbi::src_postgres()`, parameter `fields` of `dbGetFieldsIntoDf()` is limited to less than 50 fields; a message flags potential compatibility issues for any backend, suggesting the use of parent fields, e.g., `a.b` instead of `c("a.b.c.d", "a.b.c.e")`
- Parameter `stopifnodata` of `dbGetFieldsIntoDf()` is no longer needed and is deprecated
- Reimplemented typing fields to speed up and to simplify

## Improvements
- Register data are re-used and not downloaded again in an interactive session (that is, the same temporary folder is now re-used throughout a user's session)
- Temporary folder can be set by users with `options(ctrdata.tempdir = "<user_specified_folder>")`
- Inform MS Windows users if `cygwin` was found so that they may choose to delete it
- Many fields added for typing e.g. as date in `dbGetFieldsIntoDf()`

## Bug fixes
- Adapted and corrected information loading to newly available data in `CTIS`
- Corrected escaping, and back-conversion, of characters in `JSON` from `CTIS`

# ctrdata 1.16.0 (released 2023-11-24)

Expand Down
20 changes: 18 additions & 2 deletions R/ctrLoadQueryIntoDbCtgov.R
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,11 @@ ctrLoadQueryIntoDbCtgov <- function(
tempDir <- ctrTempDir(verbose)

# prepare a file handle for temporary directory
f <- file.path(tempDir, "ctgov.zip")
f <- file.path(
tempDir, paste0("ctgov_",
# include query in file name for potential re-download
sapply(ctgovdownloadcsvurl, digest::digest, algo = "crc32"),
".zip"))

# inform user
message("(1/3) Downloading trial file...")
Expand Down Expand Up @@ -135,8 +139,10 @@ ctrLoadQueryIntoDbCtgov <- function(

for (f in seq_along(xmlFileList)) {

fNdjsonCon <- file(file.path(tempDir, paste0("ctgov_trials_", f, ".ndjson")), open = "at")
fNdjson <- file.path(tempDir, paste0("ctgov_trials_", f, ".ndjson"))
fNdjsonCon <- file(fNdjson, open = "at")
on.exit(try(close(fNdjsonCon), silent = TRUE), add = TRUE)
on.exit(try(unlink(fNdjson), silent = TRUE), add = TRUE)

for (i in xmlFileList[[f]]) {

Expand Down Expand Up @@ -181,6 +187,9 @@ ctrLoadQueryIntoDbCtgov <- function(

} # for f

## delete for any re-downloads
try(unlink(unlist(xmlFileList)), silent = TRUE)

## import -------------------------------------------------------------------

## run import
Expand All @@ -199,6 +208,7 @@ ctrLoadQueryIntoDbCtgov <- function(
suppressMessages(unlink(downloadsNdjson))
downloadsNdjsonCon <- file(downloadsNdjson, open = "at")
on.exit(try(close(downloadsNdjsonCon), silent = TRUE), add = TRUE)
on.exit(try(unlink(downloadsNdjson), silent = TRUE), add = TRUE)

# extract trial ids and file name and save in temporary file
for (ndjsonFile in dir(
Expand All @@ -213,6 +223,7 @@ ctrLoadQueryIntoDbCtgov <- function(
message(". ", appendLF = FALSE)
}
close(downloadsNdjsonCon)
message()

# get document trial id and file name
dlFiles <- jsonlite::stream_in(file(downloadsNdjson), verbose = FALSE)
Expand All @@ -234,6 +245,11 @@ ctrLoadQueryIntoDbCtgov <- function(

} # !is.null(documents.path)

## delete for any re-downloads
try(unlink(dir(
path = tempDir, pattern = "ctgov_trials_[0-9]+.ndjson",
full.names = TRUE)), silent = TRUE)

## inform user -----------------------------------------------------

# find out number of trials imported into database
Expand Down
17 changes: 14 additions & 3 deletions R/ctrLoadQueryIntoDbCtgov2.R
Original file line number Diff line number Diff line change
Expand Up @@ -210,15 +210,20 @@ ctrLoadQueryIntoDbCtgov2 <- function(

while (TRUE) {

# for download
fTrialJson <- file.path(tempDir, paste0("ctgov_trials_", pageNumber,".json"))

# page url
urlToDownload <- ifelse(
pageNextToken != "",
paste0(url, "&pageToken=", pageNextToken),
url)

# for download
fTrialJson <- file.path(
tempDir, paste0(
"ctgov_trials_",
# include query in file name for potential re-download
sapply(url, digest::digest, algo = "crc32"),
"_", pageNumber, ".json"))

# do download
tmp <- ctrMultiDownload(
urlToDownload,
Expand Down Expand Up @@ -277,6 +282,7 @@ ctrLoadQueryIntoDbCtgov2 <- function(
suppressMessages(unlink(downloadsNdjson))
downloadsNdjsonCon <- file(downloadsNdjson, open = "at")
on.exit(try(close(downloadsNdjsonCon), silent = TRUE), add = TRUE)
on.exit(try(unlink(downloadsNdjson), silent = TRUE), add = TRUE)

# extract trial ids and file name and save in temporary file
for (ndjsonFile in dir(
Expand Down Expand Up @@ -315,6 +321,11 @@ ctrLoadQueryIntoDbCtgov2 <- function(

} # !is.null(documents.path)

## delete for any re-downloads
try(unlink(dir(
path = tempDir, pattern = "ctgov_trials_[0-9]+.ndjson",
full.names = TRUE)), silent = TRUE)

## inform user -----------------------------------------------------

# find out number of trials imported into database
Expand Down
Loading

0 comments on commit be3eac7

Please sign in to comment.