Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lint fixes #37

Merged
merged 4 commits into from
Jun 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,9 @@ script:
jobs:
include:
- stage: lint
script:
- pip install flake8
- flake8
allow_failures:
- stage: lint
before_install: false
install: pip install flake8
script: flake8

notifications:
email: false
7 changes: 2 additions & 5 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
# flake8 configuration
[flake8]

# E722: do not use bare 'except'
# W605: invalid escape sequence '\*'
# E121: continuation line under-indented for hanging indent
# E126: continuation line over-indented for hanging indent
ignore = E722, W605, E121, E126
# W503: line break before binary operator
ignore = W503
32 changes: 15 additions & 17 deletions tests/test_gdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,18 @@ def test_simple_and_filter():
in_dict_2 = {'a': 'b'}
exclude_dict_2 = {'c': 'd'}
expected = {
"content": [
{"content": {"field": "a", "value": ["b"]}, "op": "in"},
{"content": {"field": "c", "value": ["d"]}, "op": "exclude"}
],
"op": "and"
}
"content": [
{"content": {"field": "a", "value": ["b"]}, "op": "in"},
{"content": {"field": "c", "value": ["d"]}, "op": "exclude"},
],
"op": "and",
}
actual = gdc.simple_and_filter(in_dict_2, exclude_dict_2)
compare_dict(expected, actual)


def test_reduce_json_array():
input_1 = [{
'a': 'hello',
'b': [1, 2, 3],
'c': [10]
}]
input_1 = [{'a': 'hello', 'b': [1, 2, 3], 'c': [10]}]
input_2 = [{'a': 'b'}]
actual_1 = gdc.reduce_json_array(input_1)
expected_1 = {"a": "hello", "b": [1, 2, 3], "c": 10}
Expand Down Expand Up @@ -79,7 +75,7 @@ def test_get_project_info():
"name": "Cystadenocarcinoma",
"primary_site": "Ovary",
"program.name": "TCGA",
"project_id": "TCGA-OV"
"project_id": "TCGA-OV",
}
actual.equals(expected)

Expand All @@ -99,12 +95,13 @@ def test_search():
actual = gdc.search(endpoint=endpoint, in_filter=in_filter, fields=fields)
expected = {
"id": "d1a15919-f5e2-5e60-aed9-cb52a8b4a7a1",
"target": "TARGET-51-PAKWMM"
"target": "TARGET-51-PAKWMM",
}
actual.equals(expected)
with pytest.raises(ValueError) as exception_info:
gdc.search(endpoint=endpoint, in_filter=in_filter, fields=fields,
method="PUT")
gdc.search(
endpoint=endpoint, in_filter=in_filter, fields=fields, method="PUT"
)
error_str = 'Invalid method: PUT\n method must be either "GET" or "POST".'
assert exception_info.value.args[0] == error_str

Expand All @@ -116,7 +113,8 @@ def test_gdc_check_new(capfd):
gdc.gdc_check_new(new_file_uuids)
out, err = capfd.readouterr()
actual = pd.read_csv(StringIO(out), sep='\t')
expected = pd.read_csv("tests/fixtures/gdc_check_new_DR9.0_files_swap.csv",
sep='\t')
expected = pd.read_csv(
"tests/fixtures/gdc_check_new_DR9.0_files_swap.csv", sep='\t'
)
expected = expected.head()
actual.equals(expected)
93 changes: 57 additions & 36 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,29 @@ def test_xena_eql(self):
assert parsed.df2 == "df2"

def test_gdc_check_new(self):
parsed = self.parser.parse_args(["gdc-check-new",
"https://example.com/data.gz"])
parsed = self.parser.parse_args(
["gdc-check-new", "https://example.com/data.gz"]
)
assert parsed.subcomm == "gdc-check-new"
assert parsed.url == "https://example.com/data.gz"

def test_merge_xena(self):
parsed = self.parser.parse_args(["merge-xena", "-f", "path/to/matrix1",
"path/to/matrix2", "-t", "datatype",
"-o", "path/to/dir", "-n", "new_name",
"-c", "cohort_name"])
parsed = self.parser.parse_args(
[
"merge-xena",
"-f",
"path/to/matrix1",
"path/to/matrix2",
"-t",
"datatype",
"-o",
"path/to/dir",
"-n",
"new_name",
"-c",
"cohort_name",
]
)
assert parsed.subcomm == "merge-xena"
assert parsed.files == ["path/to/matrix1", "path/to/matrix2"]
assert parsed.datatype == "datatype"
Expand All @@ -39,46 +52,52 @@ def test_merge_xena(self):
assert parsed.cohort == "cohort_name"

def test_etl(self):
parsed = self.parser.parse_args([
"etl",
"-r",
"path/to/dir",
"-p",
"project_name",
"-t",
"datatype",
])
parsed = self.parser.parse_args(
[
"etl",
"-r",
"path/to/dir",
"-p",
"project_name",
"-t",
"datatype",
]
)
assert parsed.subcomm == "etl"
assert parsed.root == "path/to/dir"
assert parsed.projects == ["project_name"]
assert parsed.datatype == ["datatype"]
# Repeat with the excluding options (-P/-T) from the mutually exclusive groups.
parsed = self.parser.parse_args([
"etl",
"-r",
"path/to/dir",
"-P",
"not_this_project_name",
"-T",
"not_this_datatype",
])
parsed = self.parser.parse_args(
[
"etl",
"-r",
"path/to/dir",
"-P",
"not_this_project_name",
"-T",
"not_this_datatype",
]
)
assert parsed.subcomm == "etl"
assert parsed.root == "path/to/dir"
assert parsed.not_projects == ["not_this_project_name"]
assert parsed.not_datatype == ["not_this_datatype"]

def test_metaparser(self):
parsed = self.parser.parse_args([
"metadata",
"-p",
"project_name",
"-t",
"datatype",
"-m",
"path/to/matrix",
"-r",
"10",
])
parsed = self.parser.parse_args(
[
"metadata",
"-p",
"project_name",
"-t",
"datatype",
"-m",
"path/to/matrix",
"-r",
"10",
]
)
assert parsed.subcomm == "metadata"
assert parsed.project == "project_name"
assert parsed.datatype == "datatype"
Expand All @@ -89,5 +108,7 @@ def test_version(self):
with pytest.raises(SystemExit):
self.parser.parse_args(['--version'])
out, _ = self.capfd.readouterr()
__version__ = pkg_resources.get_distribution("xena_gdc_etl").version
__version__ = pkg_resources.get_distribution(
"xena_gdc_etl"
).version
assert out == "xge " + __version__ + "\n"
5 changes: 3 additions & 2 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@ def compare_dict(dict_1, dict_2):
>>> compare_dict({'a': 'b'}, {'a': 'b'})
True
"""
return json.dumps(dict_1, sort_keys=True) == json.dumps(dict_2,
sort_keys=True)
return json.dumps(dict_1, sort_keys=True) == json.dumps(
dict_2, sort_keys=True
)
111 changes: 59 additions & 52 deletions xena_gdc_etl/constants.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,43 @@
GDC_RELEASE_URL = (
'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/'
)

# Map GDC project_id to Xena-specific cohort name.
GDC_XENA_COHORT = {
'TCGA-BRCA': 'GDC TCGA Breast Cancer (BRCA)',
'TCGA-LUAD': 'GDC TCGA Lung Adenocarcinoma (LUAD)',
'TCGA-UCEC': 'GDC TCGA Endometrioid Cancer (UCEC)',
'TCGA-LGG': 'GDC TCGA Lower Grade Glioma (LGG)',
'TCGA-HNSC': 'GDC TCGA Head and Neck Cancer (HNSC)',
'TCGA-PRAD': 'GDC TCGA Prostate Cancer (PRAD)',
'TCGA-LUSC': 'GDC TCGA Lung Squamous Cell Carcinoma (LUSC)',
'TCGA-THCA': 'GDC TCGA Thyroid Cancer (THCA)',
'TCGA-SKCM': 'GDC TCGA Melanoma (SKCM)',
'TCGA-OV': 'GDC TCGA Ovarian Cancer (OV)',
'TCGA-STAD': 'GDC TCGA Stomach Cancer (STAD)',
'TCGA-COAD': 'GDC TCGA Colon Cancer (COAD)',
'TCGA-BLCA': 'GDC TCGA Bladder Cancer (BLCA)',
'TCGA-GBM': 'GDC TCGA Glioblastoma (GBM)',
'TCGA-LIHC': 'GDC TCGA Liver Cancer (LIHC)',
'TCGA-KIRC': 'GDC TCGA Kidney Clear Cell Carcinoma (KIRC)',
'TCGA-CESC': 'GDC TCGA Cervical Cancer (CESC)',
'TCGA-KIRP': 'GDC TCGA Kidney Papillary Cell Carcinoma (KIRP)',
'TCGA-SARC': 'GDC TCGA Sarcoma (SARC)',
'TCGA-ESCA': 'GDC TCGA Esophageal Cancer (ESCA)',
'TCGA-PAAD': 'GDC TCGA Pancreatic Cancer (PAAD)',
'TCGA-PCPG': 'GDC TCGA Pheochromocytoma & Paraganglioma (PCPG)',
'TCGA-READ': 'GDC TCGA Rectal Cancer (READ)',
'TCGA-TGCT': 'GDC TCGA Testicular Cancer (TGCT)',
'TCGA-LAML': 'GDC TCGA Acute Myeloid Leukemia (LAML)',
'TCGA-THYM': 'GDC TCGA Thymoma (THYM)',
'TCGA-ACC': 'GDC TCGA Adrenocortical Cancer (ACC)',
'TCGA-MESO': 'GDC TCGA Mesothelioma (MESO)',
'TCGA-UVM': 'GDC TCGA Ocular melanomas (UVM)',
'TCGA-KICH': 'GDC TCGA Kidney Chromophobe (KICH)',
'TCGA-UCS': 'GDC TCGA Uterine Carcinosarcoma (UCS)',
'TCGA-CHOL': 'GDC TCGA Bile Duct Cancer (CHOL)',
'TCGA-DLBC': 'GDC TCGA Large B-cell Lymphoma (DLBC)'
}
'TCGA-BRCA': 'GDC TCGA Breast Cancer (BRCA)',
'TCGA-LUAD': 'GDC TCGA Lung Adenocarcinoma (LUAD)',
'TCGA-UCEC': 'GDC TCGA Endometrioid Cancer (UCEC)',
'TCGA-LGG': 'GDC TCGA Lower Grade Glioma (LGG)',
'TCGA-HNSC': 'GDC TCGA Head and Neck Cancer (HNSC)',
'TCGA-PRAD': 'GDC TCGA Prostate Cancer (PRAD)',
'TCGA-LUSC': 'GDC TCGA Lung Squamous Cell Carcinoma (LUSC)',
'TCGA-THCA': 'GDC TCGA Thyroid Cancer (THCA)',
'TCGA-SKCM': 'GDC TCGA Melanoma (SKCM)',
'TCGA-OV': 'GDC TCGA Ovarian Cancer (OV)',
'TCGA-STAD': 'GDC TCGA Stomach Cancer (STAD)',
'TCGA-COAD': 'GDC TCGA Colon Cancer (COAD)',
'TCGA-BLCA': 'GDC TCGA Bladder Cancer (BLCA)',
'TCGA-GBM': 'GDC TCGA Glioblastoma (GBM)',
'TCGA-LIHC': 'GDC TCGA Liver Cancer (LIHC)',
'TCGA-KIRC': 'GDC TCGA Kidney Clear Cell Carcinoma (KIRC)',
'TCGA-CESC': 'GDC TCGA Cervical Cancer (CESC)',
'TCGA-KIRP': 'GDC TCGA Kidney Papillary Cell Carcinoma (KIRP)',
'TCGA-SARC': 'GDC TCGA Sarcoma (SARC)',
'TCGA-ESCA': 'GDC TCGA Esophageal Cancer (ESCA)',
'TCGA-PAAD': 'GDC TCGA Pancreatic Cancer (PAAD)',
'TCGA-PCPG': 'GDC TCGA Pheochromocytoma & Paraganglioma (PCPG)',
'TCGA-READ': 'GDC TCGA Rectal Cancer (READ)',
'TCGA-TGCT': 'GDC TCGA Testicular Cancer (TGCT)',
'TCGA-LAML': 'GDC TCGA Acute Myeloid Leukemia (LAML)',
'TCGA-THYM': 'GDC TCGA Thymoma (THYM)',
'TCGA-ACC': 'GDC TCGA Adrenocortical Cancer (ACC)',
'TCGA-MESO': 'GDC TCGA Mesothelioma (MESO)',
'TCGA-UVM': 'GDC TCGA Ocular melanomas (UVM)',
'TCGA-KICH': 'GDC TCGA Kidney Chromophobe (KICH)',
'TCGA-UCS': 'GDC TCGA Uterine Carcinosarcoma (UCS)',
'TCGA-CHOL': 'GDC TCGA Bile Duct Cancer (CHOL)',
'TCGA-DLBC': 'GDC TCGA Large B-cell Lymphoma (DLBC)',
}

# Map xena_dtype to corresponding metadata template.
METADATA_TEMPLATE = {
Expand All @@ -51,36 +55,39 @@
'GDC_phenotype': 'template.phenotype.meta.json',
'survival': 'template.survival.meta.json',
'methylation27': 'template.methylation.meta.json',
'methylation450': 'template.methylation.meta.json'
'methylation450': 'template.methylation.meta.json',
}

# Jinja2 template variables for corresponding "xena_dtype".
METADATA_VARIABLES = {
'htseq_counts': {'gdc_type': 'HTSeq - Counts'},
'htseq_fpkm': {'gdc_type': 'HTSeq - FPKM',
'unit': 'fpkm'},
'htseq_fpkm-uq': {'gdc_type': 'HTSeq - FPKM-UQ',
'unit': 'fpkm-uq'},
'htseq_fpkm': {'gdc_type': 'HTSeq - FPKM', 'unit': 'fpkm'},
'htseq_fpkm-uq': {'gdc_type': 'HTSeq - FPKM-UQ', 'unit': 'fpkm-uq'},
'mirna': {'gdc_type': 'miRNA Expression Quantification'},
'mirna_isoform': {'gdc_type': 'Isoform Expression Quantification'},
'cnv': {'gdc_type': 'Copy Number Segment'},
'masked_cnv': {'gdc_type': 'Masked Copy Number Segment'},
'muse_snv': {'gdc_type': 'MuSE Variant Aggregation and Masking'},
'mutect2_snv': {
'gdc_type': 'MuTect2 Variant Aggregation and Masking'
},
'mutect2_snv': {'gdc_type': 'MuTect2 Variant Aggregation and Masking'},
'somaticsniper_snv': {
'gdc_type': 'SomaticSniper Variant Aggregation and Masking'
},
'varscan2_snv': {
'gdc_type': 'VarScan2 Variant Aggregation and Masking'
},
'gdc_type': 'SomaticSniper Variant Aggregation and Masking'
},
'varscan2_snv': {'gdc_type': 'VarScan2 Variant Aggregation and Masking'},
'methylation27': {'platform_num': '27'},
'methylation450': {'platform_num': '450'}
'methylation450': {'platform_num': '450'},
}
valid_dtype = [
'htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
'masked_cnv', 'muse_snv', 'mutect2_snv',
'somaticsniper_snv', 'varscan2_snv', 'GDC_phenotype',
'survival', 'methylation27', 'methylation450'
'htseq_counts',
'htseq_fpkm',
'htseq_fpkm-uq',
'mirna',
'masked_cnv',
'muse_snv',
'mutect2_snv',
'somaticsniper_snv',
'varscan2_snv',
'GDC_phenotype',
'survival',
'methylation27',
'methylation450',
]
Loading