Commit: Some more lint fixes
ayan-b committed Jun 3, 2019
1 parent 76b792c commit 51ecf1a
Showing 9 changed files with 809 additions and 439 deletions.
setup.cfg (3 changes: 2 additions & 1 deletion)
@@ -3,4 +3,5 @@
 # W605: invalid escape sequence '\*'
 # W503: line break before binary operator
-ignore = W605, W503
+# E203: whitespace before ':'
+ignore = W605, W503, E203
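
Side note on the new E203 suppression: flake8's E203 ("whitespace before ':'") fires on the space that formatters such as Black put before the colon of a slice whose bound is an expression — exactly the pattern this commit introduces in gdc.py. A minimal sketch (not part of the commit; the header value is hypothetical):

# flake8 reports E203 on the space before the slice colon below
# unless E203 is in the ignore list of setup.cfg.
content_disp = 'attachment; filename=example.tsv'  # hypothetical value
ori_name = content_disp[content_disp.find('filename=') + 9 :]
print(ori_name)  # -> example.tsv
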
xena_gdc_etl/constants.py (41 changes: 23 additions & 18 deletions)
@@ -1,4 +1,6 @@
-GDC_RELEASE_URL = 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/'  # noqa: E501
+GDC_RELEASE_URL = (
+    'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/'
+)  # noqa: E501
 
 # Map GDC project_id to Xena specific cohort name.
 GDC_XENA_COHORT = {
@@ -34,7 +36,7 @@
     'TCGA-KICH': 'GDC TCGA Kidney Chromophobe (KICH)',
     'TCGA-UCS': 'GDC TCGA Uterine Carcinosarcoma (UCS)',
     'TCGA-CHOL': 'GDC TCGA Bile Duct Cancer (CHOL)',
-    'TCGA-DLBC': 'GDC TCGA Large B-cell Lymphoma (DLBC)'
+    'TCGA-DLBC': 'GDC TCGA Large B-cell Lymphoma (DLBC)',
 }
 
 # Map xena_dtype to corresponding metadata template.
@@ -53,36 +55,39 @@
     'GDC_phenotype': 'template.phenotype.meta.json',
     'survival': 'template.survival.meta.json',
     'methylation27': 'template.methylation.meta.json',
-    'methylation450': 'template.methylation.meta.json'
+    'methylation450': 'template.methylation.meta.json',
 }
 
 # Jinja2 template variables for corresponding "xena_dtype".
 METADATA_VARIABLES = {
     'htseq_counts': {'gdc_type': 'HTSeq - Counts'},
-    'htseq_fpkm': {'gdc_type': 'HTSeq - FPKM',
-                   'unit': 'fpkm'},
-    'htseq_fpkm-uq': {'gdc_type': 'HTSeq - FPKM-UQ',
-                      'unit': 'fpkm-uq'},
+    'htseq_fpkm': {'gdc_type': 'HTSeq - FPKM', 'unit': 'fpkm'},
+    'htseq_fpkm-uq': {'gdc_type': 'HTSeq - FPKM-UQ', 'unit': 'fpkm-uq'},
     'mirna': {'gdc_type': 'miRNA Expression Quantification'},
     'mirna_isoform': {'gdc_type': 'Isoform Expression Quantification'},
     'cnv': {'gdc_type': 'Copy Number Segment'},
     'masked_cnv': {'gdc_type': 'Masked Copy Number Segment'},
     'muse_snv': {'gdc_type': 'MuSE Variant Aggregation and Masking'},
-    'mutect2_snv': {
-        'gdc_type': 'MuTect2 Variant Aggregation and Masking'
-    },
+    'mutect2_snv': {'gdc_type': 'MuTect2 Variant Aggregation and Masking'},
     'somaticsniper_snv': {
         'gdc_type': 'SomaticSniper Variant Aggregation and Masking'
     },
-    'varscan2_snv': {
-        'gdc_type': 'VarScan2 Variant Aggregation and Masking'
-    },
+    'varscan2_snv': {'gdc_type': 'VarScan2 Variant Aggregation and Masking'},
     'methylation27': {'platform_num': '27'},
-    'methylation450': {'platform_num': '450'}
+    'methylation450': {'platform_num': '450'},
 }
 valid_dtype = [
-    'htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
-    'masked_cnv', 'muse_snv', 'mutect2_snv',
-    'somaticsniper_snv', 'varscan2_snv', 'GDC_phenotype',
-    'survival', 'methylation27', 'methylation450'
+    'htseq_counts',
+    'htseq_fpkm',
+    'htseq_fpkm-uq',
+    'mirna',
+    'masked_cnv',
+    'muse_snv',
+    'mutect2_snv',
+    'somaticsniper_snv',
+    'varscan2_snv',
+    'GDC_phenotype',
+    'survival',
+    'methylation27',
+    'methylation450',
 ]
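
As a usage sketch (not part of this commit), these module-level constants are plain dict/list lookups; the trailing commas added above are purely stylistic and leave the values unchanged. The copies below are abridged for illustration:

# Abridged copies of the constants above; the real module defines
# the full mappings.
GDC_XENA_COHORT = {'TCGA-CHOL': 'GDC TCGA Bile Duct Cancer (CHOL)'}
valid_dtype = ['htseq_counts', 'methylation450']

def xena_cohort(project_id):
    # Fall back to the raw GDC project_id when no Xena-specific
    # cohort name is mapped (a hypothetical helper, not in the repo).
    return GDC_XENA_COHORT.get(project_id, project_id)

assert xena_cohort('TCGA-CHOL') == 'GDC TCGA Bile Duct Cancer (CHOL)'
assert 'htseq_counts' in valid_dtype
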
xena_gdc_etl/gdc.py (175 changes: 119 additions & 56 deletions)
@@ -17,14 +17,22 @@
 import pandas as pd
 import requests
 
-from .utils import (
-    mkdir_p,
-    reduce_json_array,
-)
+from .utils import mkdir_p, reduce_json_array
 
 GDC_API_BASE = 'https://api.gdc.cancer.gov'
-_SUPPORTED_FILE_TYPES = {'txt', 'vcf', 'bam', 'tsv', 'xml', 'maf', 'xlsx',
-                         'tar', 'gz', 'md5', 'xls'}
+_SUPPORTED_FILE_TYPES = {
+    'txt',
+    'vcf',
+    'bam',
+    'tsv',
+    'xml',
+    'maf',
+    'xlsx',
+    'tar',
+    'gz',
+    'md5',
+    'xls',
+}
 _SUPPORTED_DATASETS = [
     {'data_type': 'Copy Number Segment'},
     {'data_type': 'Masked Copy Number Segment'},
@@ -36,11 +44,13 @@
     {'analysis.workflow_type': 'HTSeq - FPKM-UQ'},
     {'analysis.workflow_type': 'MuSE Variant Aggregation and Masking'},
     {'analysis.workflow_type': 'MuTect2 Variant Aggregation and Masking'},
-    {'analysis.workflow_type':
-        'SomaticSniper Variant Aggregation and Masking'},
+    {
+        'analysis.workflow_type':
+            'SomaticSniper Variant Aggregation and Masking'
+    },
     {'analysis.workflow_type': 'VarScan2 Variant Aggregation and Masking'},
     {'data_type': 'Biospecimen Supplement'},
-    {'data_type': 'Clinical Supplement'}
+    {'data_type': 'Clinical Supplement'},
 ]
 # https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations
 TCGA_STUDY_ABBR = {
@@ -49,8 +59,9 @@
     'BLCA': 'Bladder Urothelial Carcinoma',
     'LGG': 'Brain Lower Grade Glioma',
     'BRCA': 'Breast invasive carcinoma',
-    'CESC': ('Cervical squamous cell carcinoma and endocervical '
-             'adenocarcinoma'),
+    'CESC': (
+        'Cervical squamous cell carcinoma and endocervical adenocarcinoma'
+    ),
     'CHOL': 'Cholangiocarcinoma',
     'LCML': 'Chronic Myelogenous Leukemia',
     'COAD': 'Colon adenocarcinoma',
@@ -118,19 +129,28 @@ def simple_and_filter(in_dict={}, exclude_dict={}):
         value = in_dict[key]
         if not isinstance(value, list):
             value = [value]
-        operation_list.append({"op": "in",
-                               "content": {"field": key, "value": value}})
+        operation_list.append(
+            {"op": "in", "content": {"field": key, "value": value}}
+        )
     for key in exclude_dict:
         value = exclude_dict[key]
         if not isinstance(value, list):
             value = [value]
-        operation_list.append({"op": "exclude",
-                               "content": {"field": key, "value": value}})
+        operation_list.append(
+            {"op": "exclude", "content": {"field": key, "value": value}}
+        )
     return {"op": "and", "content": operation_list}
 
 
-def search(endpoint, in_filter={}, exclude_filter={}, fields=[], expand=[],
-           typ='dataframe', method='GET'):
+def search(
+    endpoint,
+    in_filter={},
+    exclude_filter={},
+    fields=[],
+    expand=[],
+    typ='dataframe',
+    method='GET',
+):
     """Search one GDC endpoints and return searching results in a pandas
     DataFrame if possible.
@@ -172,10 +192,11 @@ def search(endpoint, in_filter={}, exclude_filter={}, fields=[], expand=[],
     try:
         assert typ.lower() in ['json', 'dataframe']
     except (AttributeError, AssertionError):
-        raise ValueError('typ should be a string of either JSON or dataframe, '
-                         'not {}'.format(typ))
-    filters = simple_and_filter(in_dict=in_filter,
-                                exclude_dict=exclude_filter)
+        raise ValueError(
+            'typ should be a string of either JSON or dataframe, '
+            'not {}'.format(typ)
+        )
+    filters = simple_and_filter(in_dict=in_filter, exclude_dict=exclude_filter)
     if isinstance(fields, str):
         fields = [fields]
     if isinstance(expand, str):
@@ -193,8 +214,10 @@ def search(endpoint, in_filter={}, exclude_filter={}, fields=[], expand=[],
     elif method.upper() == 'GET':
         response = requests.get(url, params=payload)
     else:
-        raise ValueError('Invalid method: {}\n method must be either "GET" '
-                         'or "POST".'.format(method))
+        raise ValueError(
+            'Invalid method: {}\n method must be either "GET" '
+            'or "POST".'.format(method)
+        )
     try:
         payload['size'] = response.json()['data']['pagination']['total']
     except KeyError:
@@ -203,8 +226,11 @@ def search(endpoint, in_filter={}, exclude_filter={}, fields=[], expand=[],
         if typ.lower() == 'json':
             return response.json()
         else:
-            warnings.warn('Fail to get a table of results. JSON returned. '
-                          'Please check the result carefully.', stacklevel=2)
+            warnings.warn(
+                'Fail to get a table of results. JSON returned. '
+                'Please check the result carefully.',
+                stacklevel=2,
+            )
             return response.json()
     if method.upper() == 'POST':
         response = requests.post(url, data=payload)
@@ -217,12 +243,18 @@ def search(endpoint, in_filter={}, exclude_filter={}, fields=[], expand=[],
         try:
             return pd.io.json.json_normalize(reduce_json_array(results))
         except Exception:
-            warnings.warn('Fail to convert searching results into table. '
-                          'JSON will be returned.', stacklevel=2)
+            warnings.warn(
+                'Fail to convert searching results into table. '
+                'JSON will be returned.',
+                stacklevel=2,
+            )
             return results
     else:
-        warnings.warn('Searching failed with HTTP status code: '
-                      '{}'.format(response.status_code), stacklevel=2)
+        warnings.warn(
+            'Searching failed with HTTP status code: '
+            '{}'.format(response.status_code),
+            stacklevel=2,
+        )
         return None


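For reference, a hedged usage sketch of the search() function whose hunks appear above (the function and its parameters are from this repository; the filter values and field names are illustrative):

# Query the GDC 'files' endpoint for open-access files of one project.
# Assumes the xena_gdc_etl package is importable and the GDC API is
# reachable; the project id is just an example.
from xena_gdc_etl.gdc import search

df = search(
    'files',
    in_filter={'access': 'open', 'cases.project.project_id': 'TCGA-CHOL'},
    fields=['file_id', 'file_name', 'data_type'],
    typ='dataframe',
    method='GET',
)
print(df.head())
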
@@ -278,8 +310,10 @@ def download(uuids, download_dir='.', chunk_size=4096):
     elif isinstance(uuids, list):
         uuids = {uuid: None for uuid in uuids}
     elif not isinstance(uuids, dict):
-        raise TypeError('uuids is a {}; it should be a string, a list or a '
-                        'dict'.format(type(uuids)))
+        raise TypeError(
+            'uuids is a {}; it should be a string, a list or a '
+            'dict'.format(type(uuids))
+        )
     total = len(uuids)
     count = 0
     download_list = []
@@ -291,10 +325,11 @@ def download(uuids, download_dir='.', chunk_size=4096):
             file_size = int(response.headers['Content-Length'])
             if uuids[uuid] is None:
                 content_disp = response.headers['Content-Disposition']
-                ori_name = content_disp[content_disp.find('filename=') + 9:]
+                ori_name = content_disp[content_disp.find('filename=') + 9 :]
                 new_filename = uuid + '.' + get_ext(ori_name)
-                path = os.path.join(os.path.abspath(download_dir),
-                                    new_filename)
+                path = os.path.join(
+                    os.path.abspath(download_dir), new_filename
+                )
             else:
                 path = os.path.abspath(uuids[uuid])
             status = '\r[{:d}/{:d}] Download to "{}": {:4.0%}'
@@ -306,9 +341,11 @@ def download(uuids, download_dir='.', chunk_size=4096):
                 for chunk in response.iter_content(chunk_size):
                     f.write(chunk)
                     downloaded = downloaded + chunk_size
-                    print(status.format(
-                        count, total, path, min(1, downloaded / file_size)),
-                        end=''
+                    print(
+                        status.format(
+                            count, total, path, min(1, downloaded / file_size)
+                        ),
+                        end='',
                     )
                     sys.stdout.flush()
             download_list.append(path)
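
A matching usage sketch for download() (the UUID below is a placeholder, not a real GDC file id; network access is assumed):

from xena_gdc_etl.gdc import download

# Passing a dict with None values lets download() derive each file name
# from the server's Content-Disposition header, as in the hunk above.
paths = download(
    {'00000000-0000-0000-0000-000000000000': None},  # hypothetical UUID
    download_dir='gdc_data',
    chunk_size=4096,
)
print(paths)  # list of local paths written
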
@@ -338,9 +375,11 @@ def get_project_info(projects=None):
         in_filter = {'projects.project_id': projects}
     else:
         in_filter = {'projects.project_id': [projects]}
-    project_df = search('projects', in_filter=in_filter,
-                        fields=['name', 'primary_site', 'project_id',
-                                'program.name'])
+    project_df = search(
+        'projects',
+        in_filter=in_filter,
+        fields=['name', 'primary_site', 'project_id', 'program.name'],
+    )
     return project_df.set_index('id')


@@ -366,12 +405,29 @@ def get_samples_clinical(projects=None):
         in_filter = {'project.project_id': projects}
     else:
         in_filter = {'project.project_id': [projects]}
-    fields = ['case_id', 'created_datetime', 'disease_type', 'id',
-              'primary_site', 'state', 'submitter_id', 'updated_datetime']
-    expand = ['demographic', 'diagnoses', 'diagnoses.treatments', 'exposures',
-              'family_histories', 'project', 'samples', 'tissue_source_site']
-    res = search('cases', in_filter=in_filter, fields=fields, expand=expand,
-                 typ='json')
+    fields = [
+        'case_id',
+        'created_datetime',
+        'disease_type',
+        'id',
+        'primary_site',
+        'state',
+        'submitter_id',
+        'updated_datetime',
+    ]
+    expand = [
+        'demographic',
+        'diagnoses',
+        'diagnoses.treatments',
+        'exposures',
+        'family_histories',
+        'project',
+        'samples',
+        'tissue_source_site',
+    ]
+    res = search(
+        'cases', in_filter=in_filter, fields=fields, expand=expand, typ='json'
+    )
     reduced_no_samples_json = reduce_json_array(
         [{k: v for k, v in d.items() if k != 'samples'} for d in res]
    )
@@ -382,12 +438,14 @@ def get_samples_clinical(projects=None):
     # correctly with ``record_path `` "samples". Use the raw json instead.
     # Besides, there are cases (34 by 12/11/2017) which doesn't have any
     # samples and thus doesn't have the key "samples". Ignore them.
-    # for r in res:
-    #    r.setdefault('samples', [{}])
-    #    samples_json.append(r)
+    # for r in res:
+    #     r.setdefault('samples', [{}])
+    #     samples_json.append(r)
     samples_df = pd.io.json.json_normalize(
-        [r for r in res if 'samples' in r], 'samples', 'id',
-        record_prefix='samples.'
+        [r for r in res if 'samples' in r],
+        'samples',
+        'id',
+        record_prefix='samples.',
     )
     return pd.merge(cases_df, samples_df, how='inner', on='id')

@@ -399,14 +457,19 @@ def gdc_check_new(new_file_uuids):
     """
 
     df_list = []
-    for uuids in (new_file_uuids[i:i + 20000]
-                  for i in range(0, len(new_file_uuids), 20000)):
+    for uuids in (
+        new_file_uuids[i : i + 20000]
+        for i in range(0, len(new_file_uuids), 20000)
+    ):
         df = search(
             'files',
             in_filter={'access': 'open', 'file_id': uuids},
-            fields=['cases.project.project_id', 'data_type',
-                    'analysis.workflow_type'],
-            method='POST'
+            fields=[
+                'cases.project.project_id',
+                'data_type',
+                'analysis.workflow_type',
+            ],
+            method='POST',
        )
         try:
             df['cases'] = df['cases'].map(
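
The generator expression reformatted in the last hunk batches UUIDs into POST queries of at most 20,000 ids each. A standalone sketch of the same chunking idiom (dummy data, not from the repository):

# Slice a list into fixed-size batches with a generator expression,
# mirroring the new_file_uuids loop in gdc_check_new above.
new_file_uuids = ['uuid-{}'.format(i) for i in range(45000)]  # dummy ids
batches = (
    new_file_uuids[i : i + 20000]
    for i in range(0, len(new_file_uuids), 20000)
)
print([len(b) for b in batches])  # -> [20000, 20000, 5000]
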
xena_gdc_etl/gdc2xena.py (11 changes: 7 additions & 4 deletions)
@@ -57,10 +57,13 @@ def gdc2xena(root_dir, projects, xena_dtypes):
     counts = 0
     total_projects = len(projects)
     log_format = '%(asctime)-15s [%(levelname)s]: %(message)s'
-    logging.basicConfig(level=logging.WARNING, format=log_format,
-                        datefmt='%Y-%m-%d %H:%M:%S',
-                        filename=os.path.join(root_dir, 'etl.err'),
-                        filemode='w')
+    logging.basicConfig(
+        level=logging.WARNING,
+        format=log_format,
+        datefmt='%Y-%m-%d %H:%M:%S',
+        filename=os.path.join(root_dir, 'etl.err'),
+        filemode='w',
+    )
     logger = logging.getLogger('Xena-GDC-ETL')
     for project in projects:
         counts += 1
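
A minimal sketch of the logging setup reformatted above, assuming root_dir='.'; it writes WARNING-and-above records to <root_dir>/etl.err as gdc2xena() does (the warning call at the end is an illustrative example, not repository code):

import logging
import os

root_dir = '.'
log_format = '%(asctime)-15s [%(levelname)s]: %(message)s'
logging.basicConfig(
    level=logging.WARNING,
    format=log_format,
    datefmt='%Y-%m-%d %H:%M:%S',
    filename=os.path.join(root_dir, 'etl.err'),
    filemode='w',
)
logger = logging.getLogger('Xena-GDC-ETL')
logger.warning('ETL for project %s failed', 'TCGA-CHOL')  # example record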