ucscXena · yunhailuo · Jun 3, 2019 · Jun 3, 2019 · Jun 3, 2019 · Jun 3, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -20,11 +20,9 @@ script:
 jobs:
  include:
  - stage: lint
- script: 
- - pip install flake8
- - flake8
- allow_failures:
- - stage: lint
+ before_install: false
+ install: pip install flake8
+ script: flake8
 
 notifications:
  email: false
diff --git a/setup.cfg b/setup.cfg
@@ -1,8 +1,5 @@
 # flake8 configuration
 [flake8]
 
-# E722: do not use bare 'except'
-# W605: invalid escape sequence '\*'
-# E121: continuation line under-indented for hanging indent
-# E126: continuation line over-indented for hanging indent
-ignore = E722, W605, E121, E126
+# W503: line break before binary operator
+ignore = W503
diff --git a/tests/test_gdc.py b/tests/test_gdc.py
@@ -19,22 +19,18 @@ def test_simple_and_filter():
  in_dict_2 = {'a': 'b'}
  exclude_dict_2 = {'c': 'd'}
  expected = {
-  "content": [
-  {"content": {"field": "a", "value": ["b"]}, "op": "in"},
-  {"content": {"field": "c", "value": ["d"]}, "op": "exclude"}
-  ],
-  "op": "and"
-  }
+ "content": [
+ {"content": {"field": "a", "value": ["b"]}, "op": "in"},
+ {"content": {"field": "c", "value": ["d"]}, "op": "exclude"},
+ ],
+ "op": "and",
+ }
  actual = gdc.simple_and_filter(in_dict_2, exclude_dict_2)
  compare_dict(expected, actual)
 
 
 def test_reduce_json_array():
- input_1 = [{
- 'a': 'hello',
- 'b': [1, 2, 3],
- 'c': [10]
- }]
+ input_1 = [{'a': 'hello', 'b': [1, 2, 3], 'c': [10]}]
  input_2 = [{'a': 'b'}]
  actual_1 = gdc.reduce_json_array(input_1)
  expected_1 = {"a": "hello", "b": [1, 2, 3], "c": 10}
@@ -79,7 +75,7 @@ def test_get_project_info():
  "name": "Cystadenocarcinoma",
  "primary_site": "Ovary",
  "program.name": "TCGA",
- "project_id": "TCGA-OV"
+ "project_id": "TCGA-OV",
  }
  actual.equals(expected)
 
@@ -99,12 +95,13 @@ def test_search():
  actual = gdc.search(endpoint=endpoint, in_filter=in_filter, fields=fields)
  expected = {
  "id": "d1a15919-f5e2-5e60-aed9-cb52a8b4a7a1",
- "target": "TARGET-51-PAKWMM"
+ "target": "TARGET-51-PAKWMM",
  }
  actual.equals(expected)
  with pytest.raises(ValueError) as exception_info:
- gdc.search(endpoint=endpoint, in_filter=in_filter, fields=fields,
- method="PUT")
+ gdc.search(
+ endpoint=endpoint, in_filter=in_filter, fields=fields, method="PUT"
+ )
  error_str = 'Invalid method: PUT\n method must be either "GET" or "POST".'
  assert exception_info.value.args[0] == error_str
 
@@ -116,7 +113,8 @@ def test_gdc_check_new(capfd):
  gdc.gdc_check_new(new_file_uuids)
  out, err = capfd.readouterr()
  actual = pd.read_csv(StringIO(out), sep='\t')
- expected = pd.read_csv("tests/fixtures/gdc_check_new_DR9.0_files_swap.csv",
- sep='\t')
+ expected = pd.read_csv(
+ "tests/fixtures/gdc_check_new_DR9.0_files_swap.csv", sep='\t'
+ )
  expected = expected.head()
  actual.equals(expected)
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -21,16 +21,29 @@ def test_xena_eql(self):
  assert parsed.df2 == "df2"
 
  def test_gdc_check_new(self):
- parsed = self.parser.parse_args(["gdc-check-new",
- "https://example.com/data.gz"])
+ parsed = self.parser.parse_args(
+ ["gdc-check-new", "https://example.com/data.gz"]
+ )
  assert parsed.subcomm == "gdc-check-new"
  assert parsed.url == "https://example.com/data.gz"
 
  def test_merge_xena(self):
- parsed = self.parser.parse_args(["merge-xena", "-f", "path/to/matrix1",
- "path/to/matrix2", "-t", "datatype",
- "-o", "path/to/dir", "-n", "new_name",
- "-c", "cohort_name"])
+ parsed = self.parser.parse_args(
+ [
+ "merge-xena",
+ "-f",
+ "path/to/matrix1",
+ "path/to/matrix2",
+ "-t",
+ "datatype",
+ "-o",
+ "path/to/dir",
+ "-n",
+ "new_name",
+ "-c",
+ "cohort_name",
+ ]
+ )
  assert parsed.subcomm == "merge-xena"
  assert parsed.files == ["path/to/matrix1", "path/to/matrix2"]
  assert parsed.datatype == "datatype"
@@ -39,46 +52,52 @@ def test_merge_xena(self):
  assert parsed.cohort == "cohort_name"
 
  def test_etl(self):
- parsed = self.parser.parse_args([
- "etl",
- "-r",
- "path/to/dir",
- "-p",
- "project_name",
- "-t",
- "datatype",
- ])
+ parsed = self.parser.parse_args(
+ [
+ "etl",
+ "-r",
+ "path/to/dir",
+ "-p",
+ "project_name",
+ "-t",
+ "datatype",
+ ]
+ )
  assert parsed.subcomm == "etl"
  assert parsed.root == "path/to/dir"
  assert parsed.projects == ["project_name"]
  assert parsed.datatype == ["datatype"]
  # for mutually exclusive groups
- parsed = self.parser.parse_args([
- "etl",
- "-r",
- "path/to/dir",
- "-P",
- "not_this_project_name",
- "-T",
- "not_this_datatype",
- ])
+ parsed = self.parser.parse_args(
+ [
+ "etl",
+ "-r",
+ "path/to/dir",
+ "-P",
+ "not_this_project_name",
+ "-T",
+ "not_this_datatype",
+ ]
+ )
  assert parsed.subcomm == "etl"
  assert parsed.root == "path/to/dir"
  assert parsed.not_projects == ["not_this_project_name"]
  assert parsed.not_datatype == ["not_this_datatype"]
 
  def test_metaparser(self):
- parsed = self.parser.parse_args([
- "metadata",
- "-p",
- "project_name",
- "-t",
- "datatype",
- "-m",
- "path/to/matrix",
- "-r",
- "10",
- ])
+ parsed = self.parser.parse_args(
+ [
+ "metadata",
+ "-p",
+ "project_name",
+ "-t",
+ "datatype",
+ "-m",
+ "path/to/matrix",
+ "-r",
+ "10",
+ ]
+ )
  assert parsed.subcomm == "metadata"
  assert parsed.project == "project_name"
  assert parsed.datatype == "datatype"
@@ -89,5 +108,7 @@ def test_version(self):
  with pytest.raises(SystemExit):
  self.parser.parse_args(['--version'])
  out, _ = self.capfd.readouterr()
- __version__ = pkg_resources.get_distribution("xena_gdc_etl").version
+ __version__ = pkg_resources.get_distribution(
+ "xena_gdc_etl"
+ ).version
  assert out == "xge " + __version__ + "\n"
diff --git a/tests/utils.py b/tests/utils.py
@@ -17,5 +17,6 @@ def compare_dict(dict_1, dict_2):
  >>> compare_dict({'a': 'b'}, {'a': 'b'})
  True
  """
- return json.dumps(dict_1, sort_keys=True) == json.dumps(dict_2,
- sort_keys=True)
+ return json.dumps(dict_1, sort_keys=True) == json.dumps(
+ dict_2, sort_keys=True
+ )
diff --git a/xena_gdc_etl/constants.py b/xena_gdc_etl/constants.py
@@ -1,39 +1,43 @@
+GDC_RELEASE_URL = (
+ 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/'
+)
+
 # Map GDC project_id to Xena specific cohort name.
 GDC_XENA_COHORT = {
-  'TCGA-BRCA': 'GDC TCGA Breast Cancer (BRCA)',
-  'TCGA-LUAD': 'GDC TCGA Lung Adenocarcinoma (LUAD)',
-  'TCGA-UCEC': 'GDC TCGA Endometrioid Cancer (UCEC)',
-  'TCGA-LGG': 'GDC TCGA Lower Grade Glioma (LGG)',
-  'TCGA-HNSC': 'GDC TCGA Head and Neck Cancer (HNSC)',
-  'TCGA-PRAD': 'GDC TCGA Prostate Cancer (PRAD)',
-  'TCGA-LUSC': 'GDC TCGA Lung Squamous Cell Carcinoma (LUSC)',
-  'TCGA-THCA': 'GDC TCGA Thyroid Cancer (THCA)',
-  'TCGA-SKCM': 'GDC TCGA Melanoma (SKCM)',
-  'TCGA-OV': 'GDC TCGA Ovarian Cancer (OV)',
-  'TCGA-STAD': 'GDC TCGA Stomach Cancer (STAD)',
-  'TCGA-COAD': 'GDC TCGA Colon Cancer (COAD)',
-  'TCGA-BLCA': 'GDC TCGA Bladder Cancer (BLCA)',
-  'TCGA-GBM': 'GDC TCGA Glioblastoma (GBM)',
-  'TCGA-LIHC': 'GDC TCGA Liver Cancer (LIHC)',
-  'TCGA-KIRC': 'GDC TCGA Kidney Clear Cell Carcinoma (KIRC)',
-  'TCGA-CESC': 'GDC TCGA Cervical Cancer (CESC)',
-  'TCGA-KIRP': 'GDC TCGA Kidney Papillary Cell Carcinoma (KIRP)',
-  'TCGA-SARC': 'GDC TCGA Sarcoma (SARC)',
-  'TCGA-ESCA': 'GDC TCGA Esophageal Cancer (ESCA)',
-  'TCGA-PAAD': 'GDC TCGA Pancreatic Cancer (PAAD)',
-  'TCGA-PCPG': 'GDC TCGA Pheochromocytoma & Paraganglioma (PCPG)',
-  'TCGA-READ': 'GDC TCGA Rectal Cancer (READ)',
-  'TCGA-TGCT': 'GDC TCGA Testicular Cancer (TGCT)',
-  'TCGA-LAML': 'GDC TCGA Acute Myeloid Leukemia (LAML)',
-  'TCGA-THYM': 'GDC TCGA Thymoma (THYM)',
-  'TCGA-ACC': 'GDC TCGA Adrenocortical Cancer (ACC)',
-  'TCGA-MESO': 'GDC TCGA Mesothelioma (MESO)',
-  'TCGA-UVM': 'GDC TCGA Ocular melanomas (UVM)',
-  'TCGA-KICH': 'GDC TCGA Kidney Chromophobe (KICH)',
-  'TCGA-UCS': 'GDC TCGA Uterine Carcinosarcoma (UCS)',
-  'TCGA-CHOL': 'GDC TCGA Bile Duct Cancer (CHOL)',
-  'TCGA-DLBC': 'GDC TCGA Large B-cell Lymphoma (DLBC)'
- }
+ 'TCGA-BRCA': 'GDC TCGA Breast Cancer (BRCA)',
+ 'TCGA-LUAD': 'GDC TCGA Lung Adenocarcinoma (LUAD)',
+ 'TCGA-UCEC': 'GDC TCGA Endometrioid Cancer (UCEC)',
+ 'TCGA-LGG': 'GDC TCGA Lower Grade Glioma (LGG)',
+ 'TCGA-HNSC': 'GDC TCGA Head and Neck Cancer (HNSC)',
+ 'TCGA-PRAD': 'GDC TCGA Prostate Cancer (PRAD)',
+ 'TCGA-LUSC': 'GDC TCGA Lung Squamous Cell Carcinoma (LUSC)',
+ 'TCGA-THCA': 'GDC TCGA Thyroid Cancer (THCA)',
+ 'TCGA-SKCM': 'GDC TCGA Melanoma (SKCM)',
+ 'TCGA-OV': 'GDC TCGA Ovarian Cancer (OV)',
+ 'TCGA-STAD': 'GDC TCGA Stomach Cancer (STAD)',
+ 'TCGA-COAD': 'GDC TCGA Colon Cancer (COAD)',
+ 'TCGA-BLCA': 'GDC TCGA Bladder Cancer (BLCA)',
+ 'TCGA-GBM': 'GDC TCGA Glioblastoma (GBM)',
+ 'TCGA-LIHC': 'GDC TCGA Liver Cancer (LIHC)',
+ 'TCGA-KIRC': 'GDC TCGA Kidney Clear Cell Carcinoma (KIRC)',
+ 'TCGA-CESC': 'GDC TCGA Cervical Cancer (CESC)',
+ 'TCGA-KIRP': 'GDC TCGA Kidney Papillary Cell Carcinoma (KIRP)',
+ 'TCGA-SARC': 'GDC TCGA Sarcoma (SARC)',
+ 'TCGA-ESCA': 'GDC TCGA Esophageal Cancer (ESCA)',
+ 'TCGA-PAAD': 'GDC TCGA Pancreatic Cancer (PAAD)',
+ 'TCGA-PCPG': 'GDC TCGA Pheochromocytoma & Paraganglioma (PCPG)',
+ 'TCGA-READ': 'GDC TCGA Rectal Cancer (READ)',
+ 'TCGA-TGCT': 'GDC TCGA Testicular Cancer (TGCT)',
+ 'TCGA-LAML': 'GDC TCGA Acute Myeloid Leukemia (LAML)',
+ 'TCGA-THYM': 'GDC TCGA Thymoma (THYM)',
+ 'TCGA-ACC': 'GDC TCGA Adrenocortical Cancer (ACC)',
+ 'TCGA-MESO': 'GDC TCGA Mesothelioma (MESO)',
+ 'TCGA-UVM': 'GDC TCGA Ocular melanomas (UVM)',
+ 'TCGA-KICH': 'GDC TCGA Kidney Chromophobe (KICH)',
+ 'TCGA-UCS': 'GDC TCGA Uterine Carcinosarcoma (UCS)',
+ 'TCGA-CHOL': 'GDC TCGA Bile Duct Cancer (CHOL)',
+ 'TCGA-DLBC': 'GDC TCGA Large B-cell Lymphoma (DLBC)',
+}
 
 # Map xena_dtype to corresponding metadata template.
 METADATA_TEMPLATE = {
@@ -51,36 +55,39 @@
  'GDC_phenotype': 'template.phenotype.meta.json',
  'survival': 'template.survival.meta.json',
  'methylation27': 'template.methylation.meta.json',
- 'methylation450': 'template.methylation.meta.json'
+ 'methylation450': 'template.methylation.meta.json',
 }
 
 # Jinja2 template variables for corresponding "xena_dtype".
 METADATA_VARIABLES = {
  'htseq_counts': {'gdc_type': 'HTSeq - Counts'},
- 'htseq_fpkm': {'gdc_type': 'HTSeq - FPKM',
- 'unit': 'fpkm'},
- 'htseq_fpkm-uq': {'gdc_type': 'HTSeq - FPKM-UQ',
- 'unit': 'fpkm-uq'},
+ 'htseq_fpkm': {'gdc_type': 'HTSeq - FPKM', 'unit': 'fpkm'},
+ 'htseq_fpkm-uq': {'gdc_type': 'HTSeq - FPKM-UQ', 'unit': 'fpkm-uq'},
  'mirna': {'gdc_type': 'miRNA Expression Quantification'},
  'mirna_isoform': {'gdc_type': 'Isoform Expression Quantification'},
  'cnv': {'gdc_type': 'Copy Number Segment'},
  'masked_cnv': {'gdc_type': 'Masked Copy Number Segment'},
  'muse_snv': {'gdc_type': 'MuSE Variant Aggregation and Masking'},
- 'mutect2_snv': {
- 'gdc_type': 'MuTect2 Variant Aggregation and Masking'
- },
+ 'mutect2_snv': {'gdc_type': 'MuTect2 Variant Aggregation and Masking'},
  'somaticsniper_snv': {
- 'gdc_type': 'SomaticSniper Variant Aggregation and Masking'
- },
- 'varscan2_snv': {
- 'gdc_type': 'VarScan2 Variant Aggregation and Masking'
- },
+ 'gdc_type': 'SomaticSniper Variant Aggregation and Masking'
+ },
+ 'varscan2_snv': {'gdc_type': 'VarScan2 Variant Aggregation and Masking'},
  'methylation27': {'platform_num': '27'},
- 'methylation450': {'platform_num': '450'}
+ 'methylation450': {'platform_num': '450'},
 }
 valid_dtype = [
- 'htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
- 'masked_cnv', 'muse_snv', 'mutect2_snv',
- 'somaticsniper_snv', 'varscan2_snv', 'GDC_phenotype',
- 'survival', 'methylation27', 'methylation450'
+ 'htseq_counts',
+ 'htseq_fpkm',
+ 'htseq_fpkm-uq',
+ 'mirna',
+ 'masked_cnv',
+ 'muse_snv',
+ 'mutect2_snv',
+ 'somaticsniper_snv',
+ 'varscan2_snv',
+ 'GDC_phenotype',
+ 'survival',
+ 'methylation27',
+ 'methylation450',
 ]