Skip to content

Commit

Permalink
Add GDCAPIPhenoset (#91)
Browse files Browse the repository at this point in the history
  • Loading branch information
ayan-b authored and yunhailuo committed Aug 5, 2019
1 parent 5ce4f9b commit 4b97675
Show file tree
Hide file tree
Showing 4 changed files with 271 additions and 1 deletion.
79 changes: 79 additions & 0 deletions xena_gdc_etl/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
'somaticsniper_snv': 'template.snv.meta.json',
'varscan2_snv': 'template.snv.meta.json',
'GDC_phenotype': 'template.phenotype.meta.json',
'Xena_phenotype': 'template.api_phenotype.meta.json',
'survival': 'template.survival.meta.json',
'gistic': 'template.gistic.meta.json',
'star_counts': 'template.rna.meta.json',
Expand Down Expand Up @@ -92,9 +93,87 @@
'somaticsniper_snv',
'varscan2_snv',
'GDC_phenotype',
'Xena_phenotype',
'survival',
'gistic',
'star_counts',
'methylation27',
'methylation450',
]
# Per-project query configuration for the API-only phenotype dataset
# ('Xena_phenotype'). Keys are GDC project ids; GDCAPIPhenoset refuses any
# project that is not a key of this mapping. For each project:
#   "fields": field names passed as ``fields`` to the GDC 'cases' search.
#   "expand": names passed as ``expand`` to the same search (none for
#             CPTAC-3).
CASES_FIELDS_EXPANDS = {
"CPTAC-3": {
"fields": [
"case_id",
"demographic.cause_of_death",
"demographic.days_to_birth",
"demographic.days_to_death",
"demographic.demographic_id",
"demographic.ethnicity",
"demographic.gender",
"demographic.race",
"demographic.submitter_id",
"demographic.vital_status",
"demographic.year_of_birth",
"demographic.year_of_death",
"diagnoses.age_at_diagnosis",
"diagnoses.ajcc_clinical_m",
"diagnoses.ajcc_pathologic_m",
"diagnoses.ajcc_pathologic_n",
"diagnoses.ajcc_pathologic_stage",
"diagnoses.ajcc_pathologic_t",
"diagnoses.ajcc_staging_system_edition",
"diagnoses.classification_of_tumor",
"diagnoses.days_to_last_follow_up",
"diagnoses.days_to_last_known_disease_status",
"diagnoses.days_to_recurrence",
"diagnoses.diagnosis_id",
"diagnoses.last_known_disease_status",
"diagnoses.lymph_nodes_positive",
"diagnoses.morphology",
"diagnoses.primary_diagnosis",
"diagnoses.prior_malignancy",
"diagnoses.progression_or_recurrence",
"diagnoses.residual_disease",
"diagnoses.site_of_resection_or_biopsy",
"diagnoses.submitter_id",
"diagnoses.tissue_or_organ_of_origin",
"diagnoses.tumor_grade",
"diagnoses.tumor_largest_dimension_diameter",
"diagnoses.tumor_stage",
"diagnoses.year_of_diagnosis",
"disease_type",
"exposures.alcohol_history",
"exposures.alcohol_intensity",
"exposures.bmi",
"exposures.cigarettes_per_day",
"exposures.exposure_id",
"exposures.height",
"exposures.pack_years_smoked",
"exposures.submitter_id",
"exposures.tobacco_smoking_onset_year",
"exposures.tobacco_smoking_quit_year",
"exposures.tobacco_smoking_status",
"exposures.weight",
"exposures.years_smoked",
"primary_site",
"samples.composition",
"samples.current_weight",
"samples.days_to_sample_procurement",
"samples.freezing_method",
"samples.initial_weight",
"samples.is_ffpe",
"samples.longest_dimension",
"samples.oct_embedded",
"samples.preservation_method",
"samples.sample_id",
"samples.sample_type",
"samples.sample_type_id",
"samples.submitter_id",
"samples.time_between_excision_and_freezing",
"samples.tissue_type",
"samples.tumor_code",
"samples.tumor_descriptor",
],
"expand": [],
}
}
9 changes: 8 additions & 1 deletion xena_gdc_etl/gdc2xena.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,12 @@
import time
import shutil

from .xena_dataset import GDCOmicset, GDCPhenoset, GDCSurvivalset
from .xena_dataset import (
GDCOmicset,
GDCPhenoset,
GDCAPIPhenoset,
GDCSurvivalset,
)


def gdc2xena(root_dir, projects, xena_dtypes, delete_raw_data=False):
Expand Down Expand Up @@ -87,6 +92,8 @@ def gdc2xena(root_dir, projects, xena_dtypes, delete_raw_data=False):
dataset = GDCPhenoset(project, 'clinical', root_dir)
elif dtype == 'GDC_phenotype':
dataset = GDCPhenoset(project, 'GDC_phenotype', root_dir)
elif dtype == 'Xena_phenotype':
dataset = GDCAPIPhenoset(project, root_dir)
else:
dataset = GDCOmicset(project, dtype, root_dir)
try:
Expand Down
11 changes: 11 additions & 0 deletions xena_gdc_etl/resources/template.api_phenotype.meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{"cohort":"{{ xena_cohort }}",
"url":"{% if gdc_release %}{{ gdc_release }}, {% endif %}https://api.gdc.cancer.gov/data/",
"dataSubType":"phenotype",
"dataProducer":"Genomic Data Commons",
"label":"Phenotype",
"type":"clinicalMatrix",
"wrangler":"Xena GDC ETL script",
"version":"{{ date }}"{% if notes %},
"notes":"{{ notes }}"{% endif %}{% if projects == "CPTAC-3" %},
"description": "Some submitter_ids in the GDC appear to be UUIDs (see an example here: https://portal.gdc.cancer.gov/cases/4c241b93-c11c-4802-94a7-07f125267ba3 and one with a UUID as submitter_id: https://portal.gdc.cancer.gov/cases/ba0fe300-d3f0-42fd-b609-3ee3d0e49d7b)." {% endif %}
}
173 changes: 173 additions & 0 deletions xena_gdc_etl/xena_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
METADATA_TEMPLATE,
METADATA_VARIABLES,
GDC_RELEASE_URL,
CASES_FIELDS_EXPANDS,
)


Expand Down Expand Up @@ -1719,6 +1720,178 @@ def transform(self):
return self


class GDCAPIPhenoset(XenaDataset):
    r"""Xena phenotype matrix built only from GDC API query results.

    GDCAPIPhenoset is derived from the ``XenaDataset`` class and represents a
    Xena matrix whose data is phenotype data from the GDC API only. No raw
    files are downloaded; ``transform`` queries the GDC "cases" endpoint with
    the per-project settings in ``CASES_FIELDS_EXPANDS`` and writes the
    resulting table directly as the Xena matrix.

    Attributes:
        projects (str or list): One (string) or a list of GDC's
            "cases.project.project_id". All corresponding projects will be
            included in this dataset. Every project must be a key of
            ``CASES_FIELDS_EXPANDS``.
        gdc_release (str): URL to the data release note for the dataset. It
            will be used by the ``metadata`` method when making the metadata
            for this dataset. It is highly recommended that this attribute is
            set explicitly by the user so that it is guaranteed to match the
            data (raw data) underlying this dataset. If it is not available,
            the most recent data release will be queried and used.
        metadata_vars (dict): A dict of variables which will be used (by \*\*
            unpacking) when rendering the ``metadata_template``. Defaults, if
            needed, can be derived from corresponding matrix and ``projects``
            and ``xena_dtype`` properties.
    """

    def __init__(
        self,
        projects,
        root_dir='.',
        matrix_dir=None,
    ):
        """Set up an API-only phenotype dataset for ``projects``.

        Args:
            projects (str or list): One (str) or a list of GDC project ids.
            root_dir (str): Passed through to ``XenaDataset.__init__``.
                Defaults to ``'.'``.
            matrix_dir (str): Passed through to ``XenaDataset.__init__``.
                Defaults to None.

        Raises:
            NotImplementedError: If any project has no entry in
                ``CASES_FIELDS_EXPANDS``.
        """
        super(GDCAPIPhenoset, self).__init__(
            projects, 'Xena_phenotype', root_dir, matrix_dir,
        )
        self.projects = projects
        # Only projects with an explicit field/expand configuration can be
        # queried; reject everything else up front.
        unsupported = [
            project
            for project in self.projects
            if project not in CASES_FIELDS_EXPANDS
        ]
        if unsupported:
            raise NotImplementedError(
                "'Xena_phenotype' for {} project is not implemented".format(
                    projects
                )
            )
        jinja2_env = jinja2.Environment(
            loader=jinja2.PackageLoader("xena_gdc_etl", "resources")
        )
        self.metadata_template = jinja2_env.get_template(
            "template.api_phenotype.meta.json"
        )

    @property
    def gdc_release(self):
        """URL of the GDC data release note used in the metadata.

        If no value has been set, the current release string is fetched via
        ``gdc.search('status', ...)``, turned into an anchor (leading
        "Data Release <version>" with spaces replaced by '-', dots removed,
        lower-cased), appended to ``GDC_RELEASE_URL`` and cached.
        """
        try:
            return self.__gdc_release
        except AttributeError:
            data_release = gdc.search('status', typ='json')['data_release']
            anchor = (
                re.match(r'(Data Release [^\s]+)\s', data_release)
                .group(1)
                .replace(' ', '-')
                .replace('.', '')
                .lower()
            )
            self.__gdc_release = GDC_RELEASE_URL + '#' + anchor
            return self.__gdc_release

    @gdc_release.setter
    def gdc_release(self, url):
        self.__gdc_release = url

    @property
    def metadata_vars(self):
        """Variables used to render ``metadata_template``.

        Returns the user-supplied value when it is a non-empty dict.
        Otherwise a default dict is derived from the matrix file's
        modification time, ``projects`` and ``gdc_release`` (plus any
        ``METADATA_VARIABLES`` entry for this ``xena_dtype``), cached and
        returned.
        """
        try:
            cached = self.__metadata_vars
        except AttributeError:
            cached = None
        # Explicit check instead of ``assert`` so that validation still
        # happens when Python runs with assertions disabled (``-O``).
        if cached and isinstance(cached, dict):
            return cached

        matrix_date = time.strftime(
            "%m-%d-%Y", time.gmtime(os.path.getmtime(self.matrix))
        )
        projects = ','.join(self.projects)
        variables = {
            'project_id': projects,
            'date': matrix_date,
            'gdc_release': self.gdc_release,
        }
        if projects in GDC_XENA_COHORT:
            variables['xena_cohort'] = GDC_XENA_COHORT[projects]
        else:
            variables['xena_cohort'] = 'GDC ' + projects
        # The template switches on "projects" to add a project-specific
        # description.
        variables["projects"] = projects
        try:
            variables.update(METADATA_VARIABLES[self.xena_dtype])
        except KeyError:
            pass
        self.__metadata_vars = variables
        return self.__metadata_vars

    @metadata_vars.setter
    def metadata_vars(self, variables):
        self.__metadata_vars = variables

    @XenaDataset.download_map.getter
    def download_map(self):
        """Return an empty map: this dataset needs no file downloads."""
        print("Xena_phenotype is selected. No files will be downloaded.")
        return {}

    def __get_samples_clinical(self, projects, fields, expand):
        """Get info for all samples of ``projects`` and clinical info for all
        cases of ``projects`` through GDC API.

        Args:
            projects (list or str): one (str) or a list of GDC "project_id"(s),
                whose info will be returned. If None, projects will not be
                filtered, i.e. no project filter is sent with the query.
            fields (list or str): one (str) or a list of GDC "cases" fields,
                passed to ``gdc.search`` as ``fields``.
            expand (list or str): one (str) or a list of GDC "expand" values,
                passed to ``gdc.search`` as ``expand``.

        Returns:
            pandas.core.frame.DataFrame: A DataFrame organized by samples,
            having info for all samples of ``projects``, as well as
            corresponding clinical info.
        """
        in_filter = {}
        if projects is not None:
            if isinstance(projects, list):
                in_filter = {'project.project_id': projects}
            else:
                in_filter = {'project.project_id': [projects]}
        res = gdc.search(
            'cases',
            in_filter=in_filter,
            fields=fields,
            expand=expand,
            typ='json'
        )
        # Collect, over all returned cases, the columns that
        # ``gdc.get_to_drops`` marks for removal.
        to_drops = set()
        for ele in res:
            to_drops |= set(gdc.get_to_drops(ele))
        print(
            "Dropping columns {} for {} projects".format(to_drops, projects)
        )
        # Case-level records (everything except the nested "samples" list)
        # and sample-level records are normalized separately, then joined on
        # the case "id".
        reduced_no_samples_json = reduce_json_array(
            [{k: v for k, v in d.items() if k != 'samples'} for d in res]
        )
        # NOTE(review): ``pd.io.json.json_normalize`` is the legacy location
        # of this function; newer pandas exposes it as ``pd.json_normalize``.
        # Confirm against the pandas version this package pins.
        cases_df = pd.io.json.json_normalize(reduced_no_samples_json)
        samples_df = pd.io.json.json_normalize(
            [r for r in res if 'samples' in r],
            'samples',
            'id',
            record_prefix='samples.',
        )
        merged_df = pd.merge(cases_df, samples_df, how='inner', on='id')
        merged_df.drop(list(to_drops), axis=1, inplace=True)
        return merged_df

    def transform(self):
        """Query the GDC API and save the result as the Xena matrix.

        The query settings come from the ``CASES_FIELDS_EXPANDS`` entry of
        the dataset's single project. The resulting table is indexed by
        "samples.submitter_id" and written tab-separated to ``self.matrix``.

        Returns:
            self, to allow method chaining.

        Raises:
            NotImplementedError: If the dataset does not consist of exactly
                one project; combining projects is not supported.
        """
        # Previously only ``["CPTAC-3"]`` assigned the matrix and any other
        # accepted project list failed on an unbound local variable.
        if len(self.projects) != 1:
            raise NotImplementedError(
                "'Xena_phenotype' for multiple projects ({}) is not "
                "implemented".format(self.projects)
            )
        project = self.projects[0]
        settings = CASES_FIELDS_EXPANDS[project]
        xena_matrix = self.__get_samples_clinical(
            projects=[project],
            fields=settings["fields"],
            expand=settings["expand"],
        )
        xena_matrix = xena_matrix.set_index("samples.submitter_id")
        print('\rSaving matrix to {} ...'.format(self.matrix), end='')
        mkdir_p(self.matrix_dir)
        xena_matrix.to_csv(self.matrix, sep='\t', encoding='utf-8')
        print('\rXena matrix is saved at {}.'.format(self.matrix))
        return self


class GDCSurvivalset(XenaDataset):
r"""GDCSurvivalset is derived from the ``XenaDataset`` class and represents
for a Xena matrix of GDC survival data for project(s) of interest.
Expand Down

0 comments on commit 4b97675

Please sign in to comment.