Skip to content

Commit

Permalink
Set up parser for gdc2xena
Browse files Browse the repository at this point in the history
  • Loading branch information
ayan-b committed May 6, 2019
1 parent 5a5aa5a commit d9c5048
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 33 deletions.
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
scripts=scripts,
entry_points={
'console_scripts': [
'xge = xena_gdc_etl.main:main'
'xge = xena_gdc_etl.main:main',
'gdc2xena = xena_gdc_etl.gdc2xena:main',
],
},
license='Apache License 2.0',
Expand Down
55 changes: 55 additions & 0 deletions tests/test_gdc2xena.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import unittest

from xena_gdc_etl import gdc2xena


class ParserTest(unittest.TestCase):
    """Check that the ``gdc2xena`` command-line parser maps flags to fields.

    Each test feeds an explicit argv list to the parser returned by
    ``gdc2xena.create_parser()`` and compares the resulting namespace
    attributes with the values that were passed in.
    """

    def setUp(self):
        # Build a fresh parser per test so no state leaks between cases.
        self.parser = gdc2xena.create_parser()

    def test_etl(self):
        """The ``etl`` subcommand accepts both inclusion and exclusion flags."""
        # Inclusion flags: -p (projects) and -t (datatype).
        parsed = self.parser.parse_args([
            "etl",
            "-r",
            "path/to/dir",
            "-p",
            "project_name",
            "-t",
            "datatype",
        ])
        # unittest assertion methods are used instead of bare ``assert`` so
        # the checks still run under ``python -O`` and report both values
        # on failure.
        self.assertEqual(parsed.subcomm, "etl")
        self.assertEqual(parsed.root, "path/to/dir")
        self.assertEqual(parsed.projects, ["project_name"])
        self.assertEqual(parsed.datatype, ["datatype"])

        # Exclusion flags: -P and -T. Presumably these sit in mutually
        # exclusive groups with -p/-t, so they are parsed in a separate
        # invocation -- confirm against create_parser().
        parsed = self.parser.parse_args([
            "etl",
            "-r",
            "path/to/dir",
            "-P",
            "not_this_project_name",
            "-T",
            "not_this_datatype",
        ])
        self.assertEqual(parsed.subcomm, "etl")
        self.assertEqual(parsed.root, "path/to/dir")
        self.assertEqual(parsed.not_projects, ["not_this_project_name"])
        self.assertEqual(parsed.not_datatype, ["not_this_datatype"])

    def test_metaparser(self):
        """The ``metadata`` subcommand parses project, datatype, matrix, release."""
        parsed = self.parser.parse_args([
            "metadata",
            "-p",
            "project_name",
            "-t",
            "datatype",
            "-m",
            "path/to/matrix",
            "-r",
            "10",
        ])
        self.assertEqual(parsed.subcomm, "metadata")
        self.assertEqual(parsed.project, "project_name")
        self.assertEqual(parsed.datatype, "datatype")
        self.assertEqual(parsed.matrix, "path/to/matrix")
        # The release option is declared with type=float, so "10" -> 10.0.
        self.assertEqual(parsed.release, 10.0)
30 changes: 13 additions & 17 deletions xena_gdc_etl/scripts/gdc2xena.py → xena_gdc_etl/gdc2xena.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@
import os
import timeit

from xena_gdc_etl import gdc
from xena_gdc_etl.xena_dataset import GDCOmicset, GDCPhenoset, GDCSurvivalset
import xena_gdc_etl.gdc as gdc
from .xena_dataset import GDCOmicset, GDCPhenoset, GDCSurvivalset
from .constants import valid_dtype


def gdc2xena(root_dir, projects, xena_dtypes):
Expand Down Expand Up @@ -56,7 +57,6 @@ def gdc2xena(root_dir, projects, xena_dtypes):
"methylation27", "methylation450".
"""
start_time = timeit.default_timer()

counts = 0
total_projects = len(projects)
log_format = '%(asctime)-15s [%(levelname)s]: %(message)s'
Expand Down Expand Up @@ -88,25 +88,18 @@ def gdc2xena(root_dir, projects, xena_dtypes):
logger.warning(msg, exc_info=True)
print(msg)
logging.shutdown()

end_time = timeit.default_timer()
m, s = divmod(int(end_time - start_time), 60)
h, m = divmod(m, 60)
print('Finish in {:d}:{:02d}:{:02d}.'.format(h, m, s))


def main():
valid_dtype = ['htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
'masked_cnv', 'muse_snv', 'mutect2_snv',
'somaticsniper_snv', 'varscan2_snv', 'raw_phenotype',
'GDC_phenotype', 'survival', 'methylation27',
'methylation450']
def create_parser():
parser = argparse.ArgumentParser(
description='Pipeline for importing data from GDC to Xena.'
)
subparsers = parser.add_subparsers(title='Subcommands', dest='subcomm',
metavar='')

# Subcommand for full ETL (download, transform, and metadata)
etlparser = subparsers.add_parser(
'etl',
Expand Down Expand Up @@ -155,7 +148,14 @@ def main():
help='Path to a Xena matrix')
metaparser.add_argument('-r', '--release', type=float, required=True,
help='GDC data release number.')
return parser


def main():
"""
Program entry point for gdc2xena
"""
parser = create_parser()
args = parser.parse_args()
if args.subcomm == 'etl':
root_dir = os.path.abspath(args.root)
Expand Down Expand Up @@ -194,11 +194,7 @@ def main():
dataset = GDCOmicset(args.project, args.datatype, root_dir)
dataset.matrix = args.matrix
dataset.gdc_release = (
'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-'
+ str(args.release).replace('.', '')
'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-' + # noqa
str(args.release).replace('.', '')
)
dataset.metadata()


if __name__ == '__main__':
main()
20 changes: 10 additions & 10 deletions xena_gdc_etl/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,20 @@ def main():
Program entry point
"""
parser = create_parser()
options = vars(parser.parse_args())
options = parser.parse_args()
# handle check xena equality matrices
if 'df1' in options and 'df2' in options:
equal_matrices(options['df1'], options['df2'])
if options.subcomm == "xena-eql":
equal_matrices(options.df1, options.df2)
# handle make metadata
elif "matrix" in options and "datatype" in options:
metadata(options["matrix"], options["datatype"])
elif options.subcomm == "make-metadata":
metadata(options.matrix, options.datatype)
# handle gdc_check_new
elif "url" in options:
gdc_check_new(options["url"])
elif options.subcomm == "gdc-check-new":
gdc_check_new(options.url)
# handle merge_xena
elif "files" in options and "datatype" in options:
handle_merge_xena(options["name"], options["files"], options["cohort"],
options["datatype"], options["outdir"])
elif options.subcomm == "merge-xena":
handle_merge_xena(options.name, options.files, options.cohort,
options.datatype, options.outdir)


def create_parser():
Expand Down
10 changes: 5 additions & 5 deletions xena_gdc_etl/xena_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@
import pandas as pd
import requests

import gdc
from xena_gdc_etl.utils import mkdir_p
from constants import METADATA_TEMPLATE, METADATA_VARIABLES
from xena_gdc_etl import gdc
from .utils import mkdir_p
from .constants import METADATA_TEMPLATE, METADATA_VARIABLES


# Map GDC project_id to Xena specific cohort name.
Expand Down Expand Up @@ -1002,7 +1002,7 @@ def metadata_template(self):
jinja2.environment.Template)
return self.__metadata_template
except (AttributeError, AssertionError):
template_json = self.METADATA_TEMPLATE[self.xena_dtype]
template_json = METADATA_TEMPLATE[self.xena_dtype]
jinja2_env = jinja2.Environment(
loader=jinja2.PackageLoader('xena_gdc_etl', 'resources')
)
Expand All @@ -1028,7 +1028,7 @@ def metadata_vars(self):
else:
variables['xena_cohort'] = 'GDC ' + projects
try:
variables.update(self.METADATA_VARIABLES[self.xena_dtype])
variables.update(METADATA_VARIABLES[self.xena_dtype])
except KeyError:
pass
# Data type specific jinja2 Variables
Expand Down

0 comments on commit d9c5048

Please sign in to comment.