Skip to content

Commit

Permalink
merge refactor_dbgetfieldsintodf
Browse files Browse the repository at this point in the history
Merge branch 'refactor_dbgetfieldsintodf'

# Conflicts:
#	R/ctrLoadQueryIntoDbCtis.R
  • Loading branch information
rfhb committed Jan 22, 2024
2 parents 4ebd70a + 9921685 commit be3eac7
Show file tree
Hide file tree
Showing 87 changed files with 2,246 additions and 2,354 deletions.
2 changes: 1 addition & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ some_database_name.sqlite_file
^cran-comments\.md$
codecov.yml
^CRAN-SUBMISSION$
.vscode
\.vscode
^tools
^files
^scripts
78 changes: 0 additions & 78 deletions .github/workflows/check-standard-linux.yaml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
# note: update
# https://github.com/rfhb/ctrdata/settings/secrets/actions/CACHE_VERSION

on: [push, pull_request]
on: [push, pull_request, workflow_dispatch]

name: R-CMD-CHECK-win-macos-duckdb-mongodb-sqlite
name: R-CMD-CHECK-win-macos-linux-duckdb-mongodb-sqlite-postgres

jobs:
R-CMD-check:
Expand All @@ -17,9 +17,10 @@ jobs:
fail-fast: false
matrix:
config:
- {os: windows-latest, r: 'release'}
- {os: windows-2022, r: 'release'}
- {os: macOS-latest, r: 'release'}
- {os: macOS-latest, r: 'oldrel'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
# error installing dependencies under devel
#- {os: macOS-latest, r: 'devel'}

Expand All @@ -29,13 +30,6 @@ jobs:
RSPM: ${{ matrix.config.rspm }}
GITHUB_PAT: ${{ secrets.GHPAT }}

# https://docs.github.com/en/actions/reference/software-installed-on-github-hosted-runners
# includes on macOS 10.15 mongod 4.2.8 and Windows Server 201{6,9}:
# Version ServiceName ServiceStatus ServiceStartType
# 4.2.8.0 MongoDB Running Automatic
# disabled on Windows from August 2022
# https://github.com/actions/runner-images/issues/5949

steps:
- uses: actions/checkout@v3

Expand All @@ -52,30 +46,24 @@ jobs:
writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
shell: Rscript {0}

- name: Cache R packages
if: runner.os != 'windows-latest'
uses: actions/cache@v1
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-${{ secrets.CACHE_VERSION }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
restore-keys: ${{ runner.os }}-${{ secrets.CACHE_VERSION }}-${{ hashFiles('.github/R-version') }}-1-

# remotes::install_github("r-dbi/RSQLite")
# remotes::install_github(repo = "ropensci/nodbi", ref = "reimplementation_end_2023")
- name: Install dependencies
run: |
utils::install.packages("DBI")
utils::install.packages("RSQLite")
utils::install.packages("nodbi")
utils::install.packages("duckdb", repos = "https://duckdb.r-universe.dev")
remotes::install_github("rfhb/ctrdata")
remotes::install_deps(dependencies = TRUE)
remotes::install_cran(c("rcmdcheck", "covr"))
remotes::install_github("ropensci/nodbi")
shell: Rscript {0}

- uses: r-lib/actions/setup-pandoc@v1
- uses: ankane/setup-mongodb@v1

# - uses: ankane/setup-postgres@v1
# with:
# postgres-version: 14
# database: test
- uses: ankane/setup-postgres@v1
with:
postgres-version: 14
database: test

- name: Test coverage
if: matrix.config.os == 'macOS-latest' && matrix.config.r == 'release'
Expand Down
10 changes: 5 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
Package: ctrdata
Type: Package
Title: Retrieve and Analyze Clinical Trials in Public Registers
Version: 1.16.0.9000
Imports: jsonlite, httr, curl (>= 5.1.0), clipr, xml2, nodbi (>= 0.9.3),
stringi, tibble, lubridate, jqr, dplyr, zip, V8, readr
Version: 1.17.0
Imports: jsonlite, httr, curl (>= 5.1.0), clipr, xml2, nodbi (>= 0.10.0),
stringi, tibble, lubridate, jqr, dplyr, zip, V8, readr, digest
URL: https://cran.r-project.org/package=ctrdata
BugReports: https://github.com/rfhb/ctrdata/issues
Description: A system for querying, retrieving and analyzing
Expand All @@ -26,8 +26,8 @@ Description: A system for querying, retrieving and analyzing
The package can be used for meta-analysis and trend-analysis of
the design and conduct as well as of the results of clinical trials.
License: MIT + file LICENSE
RoxygenNote: 7.2.3
Suggests: devtools, knitr, rmarkdown, RSQLite (>= 2.2.4),
RoxygenNote: 7.3.0
Suggests: devtools, knitr, rmarkdown, RSQLite (>= 2.3.5),
mongolite, tinytest (>= 1.2.1), R.rsp, RPostgres, duckdb
VignetteBuilder: R.rsp
NeedsCompilation: no
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ importFrom(curl,multi_run)
importFrom(curl,new_handle)
importFrom(curl,new_pool)
importFrom(curl,parse_headers)
importFrom(digest,digest)
importFrom(dplyr,c_across)
importFrom(dplyr,mutate)
importFrom(dplyr,rowwise)
Expand All @@ -39,7 +40,11 @@ importFrom(jsonlite,fromJSON)
importFrom(jsonlite,stream_in)
importFrom(jsonlite,toJSON)
importFrom(jsonlite,validate)
importFrom(lubridate,ddays)
importFrom(lubridate,dmonths)
importFrom(lubridate,duration)
importFrom(lubridate,dyears)
importFrom(lubridate,ymd_hms)
importFrom(nodbi,docdb_create)
importFrom(nodbi,docdb_delete)
importFrom(nodbi,docdb_list)
Expand All @@ -51,6 +56,7 @@ importFrom(readr,read_file)
importFrom(readr,write_file)
importFrom(stats,na.omit)
importFrom(stats,setNames)
importFrom(stringi,stri_detect_fixed)
importFrom(stringi,stri_detect_regex)
importFrom(stringi,stri_extract_all_charclass)
importFrom(stringi,stri_extract_all_regex)
Expand Down
20 changes: 18 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,22 @@
# ctrdata 1.16.0.9000 (2023-11-24)
# ctrdata 1.17.0

New development version
## Possibly breaking changes
- Reimplemented `dbGetFieldsIntoDf()` to accelerate and have more predictable, simplified returns, in particular for nested fields; also attempts to recursively expand simply nested data into additional columns in the returned data frame
- Reimplemented `dbFindFields()` to accelerate; both based on improved `nodbi::docdb_query()`
- `dbFindFields()` now digests a sample of records to quickly find fields, or all records if `sample = FALSE`, though this takes increasingly more time as the number of records grows
- If using `nodbi::src_postgres()`, parameter `fields` of `dbGetFieldsIntoDf()` is limited to less than 50 fields; a message flags potential compatibility issues for any backend, suggesting the use of parent fields, e.g., `a.b` instead of `c("a.b.c.d", "a.b.c.e")`
- Parameter `stopifnodata` of `dbGetFieldsIntoDf()` is no longer needed and is deprecated
- Reimplemented typing fields to speed up and to simplify

## Improvements
- Register data are re-used and not downloaded again in an interactive session (that is, the same temporary folder is now re-used throughout a user's session)
- Temporary folder can be set by users with `options(ctrdata.tempdir = "<user_specified_folder>")`
- Inform MS Windows users if `cygwin` was found so that they may choose to delete it
- Many fields added for typing e.g. as date in `dbGetFieldsIntoDf()`

## Bug fixes
- Adapted and corrected information loading to newly available data in `CTIS`
- Corrected escaping, and back-conversion, of characters in `JSON` from `CTIS`

# ctrdata 1.16.0 (released 2023-11-24)

Expand Down
20 changes: 18 additions & 2 deletions R/ctrLoadQueryIntoDbCtgov.R
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,11 @@ ctrLoadQueryIntoDbCtgov <- function(
tempDir <- ctrTempDir(verbose)

# prepare a file handle for temporary directory
f <- file.path(tempDir, "ctgov.zip")
f <- file.path(
tempDir, paste0("ctgov_",
# include query in file name for potential re-download
sapply(ctgovdownloadcsvurl, digest::digest, algo = "crc32"),
".zip"))

# inform user
message("(1/3) Downloading trial file...")
Expand Down Expand Up @@ -135,8 +139,10 @@ ctrLoadQueryIntoDbCtgov <- function(

for (f in seq_along(xmlFileList)) {

fNdjsonCon <- file(file.path(tempDir, paste0("ctgov_trials_", f, ".ndjson")), open = "at")
fNdjson <- file.path(tempDir, paste0("ctgov_trials_", f, ".ndjson"))
fNdjsonCon <- file(fNdjson, open = "at")
on.exit(try(close(fNdjsonCon), silent = TRUE), add = TRUE)
on.exit(try(unlink(fNdjson), silent = TRUE), add = TRUE)

for (i in xmlFileList[[f]]) {

Expand Down Expand Up @@ -181,6 +187,9 @@ ctrLoadQueryIntoDbCtgov <- function(

} # for f

## delete for any re-downloads
try(unlink(unlist(xmlFileList)), silent = TRUE)

## import -------------------------------------------------------------------

## run import
Expand All @@ -199,6 +208,7 @@ ctrLoadQueryIntoDbCtgov <- function(
suppressMessages(unlink(downloadsNdjson))
downloadsNdjsonCon <- file(downloadsNdjson, open = "at")
on.exit(try(close(downloadsNdjsonCon), silent = TRUE), add = TRUE)
on.exit(try(unlink(downloadsNdjson), silent = TRUE), add = TRUE)

# extract trial ids and file name and save in temporary file
for (ndjsonFile in dir(
Expand All @@ -213,6 +223,7 @@ ctrLoadQueryIntoDbCtgov <- function(
message(". ", appendLF = FALSE)
}
close(downloadsNdjsonCon)
message()

# get document trial id and file name
dlFiles <- jsonlite::stream_in(file(downloadsNdjson), verbose = FALSE)
Expand All @@ -234,6 +245,11 @@ ctrLoadQueryIntoDbCtgov <- function(

} # !is.null(documents.path)

## delete for any re-downloads
try(unlink(dir(
path = tempDir, pattern = "ctgov_trials_[0-9]+.ndjson",
full.names = TRUE)), silent = TRUE)

## inform user -----------------------------------------------------

# find out number of trials imported into database
Expand Down
17 changes: 14 additions & 3 deletions R/ctrLoadQueryIntoDbCtgov2.R
Original file line number Diff line number Diff line change
Expand Up @@ -210,15 +210,20 @@ ctrLoadQueryIntoDbCtgov2 <- function(

while (TRUE) {

# for download
fTrialJson <- file.path(tempDir, paste0("ctgov_trials_", pageNumber,".json"))

# page url
urlToDownload <- ifelse(
pageNextToken != "",
paste0(url, "&pageToken=", pageNextToken),
url)

# for download
fTrialJson <- file.path(
tempDir, paste0(
"ctgov_trials_",
# include query in file name for potential re-download
sapply(url, digest::digest, algo = "crc32"),
"_", pageNumber, ".json"))

# do download
tmp <- ctrMultiDownload(
urlToDownload,
Expand Down Expand Up @@ -277,6 +282,7 @@ ctrLoadQueryIntoDbCtgov2 <- function(
suppressMessages(unlink(downloadsNdjson))
downloadsNdjsonCon <- file(downloadsNdjson, open = "at")
on.exit(try(close(downloadsNdjsonCon), silent = TRUE), add = TRUE)
on.exit(try(unlink(downloadsNdjson), silent = TRUE), add = TRUE)

# extract trial ids and file name and save in temporary file
for (ndjsonFile in dir(
Expand Down Expand Up @@ -315,6 +321,11 @@ ctrLoadQueryIntoDbCtgov2 <- function(

} # !is.null(documents.path)

## delete for any re-downloads
try(unlink(dir(
path = tempDir, pattern = "ctgov_trials_[0-9]+.ndjson",
full.names = TRUE)), silent = TRUE)

## inform user -----------------------------------------------------

# find out number of trials imported into database
Expand Down
Loading

0 comments on commit be3eac7

Please sign in to comment.