Add GDCAPIPhenoset (#91)

ucscXena · Aug 5, 2019 · 4b97675 · 4b97675
1 parent 5ce4f9b
commit 4b97675
Show file tree

Hide file tree

Showing 4 changed files with 271 additions and 1 deletion.
diff --git a/xena_gdc_etl/constants.py b/xena_gdc_etl/constants.py
@@ -53,6 +53,7 @@
  'somaticsniper_snv': 'template.snv.meta.json',
  'varscan2_snv': 'template.snv.meta.json',
  'GDC_phenotype': 'template.phenotype.meta.json',
+ 'Xena_phenotype': 'template.api_phenotype.meta.json',
  'survival': 'template.survival.meta.json',
  'gistic': 'template.gistic.meta.json',
  'star_counts': 'template.rna.meta.json',
@@ -92,9 +93,87 @@
  'somaticsniper_snv',
  'varscan2_snv',
  'GDC_phenotype',
+ 'Xena_phenotype',
  'survival',
  'gistic',
  'star_counts',
  'methylation27',
  'methylation450',
 ]
+# Per-project query parameters for the GDC API "cases" endpoint, keyed by
+# GDC project_id. ``GDCAPIPhenoset`` (xena_dataset.py) only accepts projects
+# that appear as keys here, and passes the project's "fields" and "expand"
+# lists to ``gdc.search('cases', ...)`` when building the phenotype matrix.
+CASES_FIELDS_EXPANDS = {
+ "CPTAC-3": {
+ "fields": [
+ "case_id",
+ "demographic.cause_of_death",
+ "demographic.days_to_birth",
+ "demographic.days_to_death",
+ "demographic.demographic_id",
+ "demographic.ethnicity",
+ "demographic.gender",
+ "demographic.race",
+ "demographic.submitter_id",
+ "demographic.vital_status",
+ "demographic.year_of_birth",
+ "demographic.year_of_death",
+ "diagnoses.age_at_diagnosis",
+ "diagnoses.ajcc_clinical_m",
+ "diagnoses.ajcc_pathologic_m",
+ "diagnoses.ajcc_pathologic_n",
+ "diagnoses.ajcc_pathologic_stage",
+ "diagnoses.ajcc_pathologic_t",
+ "diagnoses.ajcc_staging_system_edition",
+ "diagnoses.classification_of_tumor",
+ "diagnoses.days_to_last_follow_up",
+ "diagnoses.days_to_last_known_disease_status",
+ "diagnoses.days_to_recurrence",
+ "diagnoses.diagnosis_id",
+ "diagnoses.last_known_disease_status",
+ "diagnoses.lymph_nodes_positive",
+ "diagnoses.morphology",
+ "diagnoses.primary_diagnosis",
+ "diagnoses.prior_malignancy",
+ "diagnoses.progression_or_recurrence",
+ "diagnoses.residual_disease",
+ "diagnoses.site_of_resection_or_biopsy",
+ "diagnoses.submitter_id",
+ "diagnoses.tissue_or_organ_of_origin",
+ "diagnoses.tumor_grade",
+ "diagnoses.tumor_largest_dimension_diameter",
+ "diagnoses.tumor_stage",
+ "diagnoses.year_of_diagnosis",
+ "disease_type",
+ "exposures.alcohol_history",
+ "exposures.alcohol_intensity",
+ "exposures.bmi",
+ "exposures.cigarettes_per_day",
+ "exposures.exposure_id",
+ "exposures.height",
+ "exposures.pack_years_smoked",
+ "exposures.submitter_id",
+ "exposures.tobacco_smoking_onset_year",
+ "exposures.tobacco_smoking_quit_year",
+ "exposures.tobacco_smoking_status",
+ "exposures.weight",
+ "exposures.years_smoked",
+ "primary_site",
+ "samples.composition",
+ "samples.current_weight",
+ "samples.days_to_sample_procurement",
+ "samples.freezing_method",
+ "samples.initial_weight",
+ "samples.is_ffpe",
+ "samples.longest_dimension",
+ "samples.oct_embedded",
+ "samples.preservation_method",
+ "samples.sample_id",
+ "samples.sample_type",
+ "samples.sample_type_id",
+ "samples.submitter_id",
+ "samples.time_between_excision_and_freezing",
+ "samples.tissue_type",
+ "samples.tumor_code",
+ "samples.tumor_descriptor",
+ ],
+ "expand": [],
+ }
+}
diff --git a/xena_gdc_etl/gdc2xena.py b/xena_gdc_etl/gdc2xena.py
@@ -29,7 +29,12 @@
 import time
 import shutil
 
-from .xena_dataset import GDCOmicset, GDCPhenoset, GDCSurvivalset
+from .xena_dataset import (
+ GDCOmicset,
+ GDCPhenoset,
+ GDCAPIPhenoset,
+ GDCSurvivalset,
+)
 
 
 def gdc2xena(root_dir, projects, xena_dtypes, delete_raw_data=False):
@@ -87,6 +92,8 @@ def gdc2xena(root_dir, projects, xena_dtypes, delete_raw_data=False):
  dataset = GDCPhenoset(project, 'clinical', root_dir)
  elif dtype == 'GDC_phenotype':
  dataset = GDCPhenoset(project, 'GDC_phenotype', root_dir)
+ elif dtype == 'Xena_phenotype':
+ dataset = GDCAPIPhenoset(project, root_dir)
  else:
  dataset = GDCOmicset(project, dtype, root_dir)
  try:

diff --git a/xena_gdc_etl/resources/template.api_phenotype.meta.json b/xena_gdc_etl/resources/template.api_phenotype.meta.json
@@ -0,0 +1,11 @@
+{"cohort":"{{ xena_cohort }}",
+ "url":"{% if gdc_release %}{{ gdc_release }}, {% endif %}https://api.gdc.cancer.gov/data/",
+ "dataSubType":"phenotype",
+ "dataProducer":"Genomic Data Commons",
+ "label":"Phenotype",
+ "type":"clinicalMatrix",
+ "wrangler":"Xena GDC ETL script",
+ "version":"{{ date }}"{% if notes %},
+ "notes":"{{ notes }}"{% endif %}{% if projects == "CPTAC-3" %},
+ "description": "Some submitter_ids in the GDC appear to be UUIDs (see an example here: https://portal.gdc.cancer.gov/cases/4c241b93-c11c-4802-94a7-07f125267ba3 and one with a UUID as submitter_id: https://portal.gdc.cancer.gov/cases/ba0fe300-d3f0-42fd-b609-3ee3d0e49d7b)." {% endif %} 
+}
diff --git a/xena_gdc_etl/xena_dataset.py b/xena_gdc_etl/xena_dataset.py
@@ -34,6 +34,7 @@
  METADATA_TEMPLATE,
  METADATA_VARIABLES,
  GDC_RELEASE_URL,
+ CASES_FIELDS_EXPANDS,
 )
 
 
@@ -1719,6 +1720,178 @@ def transform(self):
  return self
 
 
+class GDCAPIPhenoset(XenaDataset):
+    r"""GDCAPIPhenoset is derived from the ``XenaDataset`` class and
+    represents a Xena matrix whose data is phenotype data obtained from the
+    GDC API only.
+
+    No files are downloaded for this dataset (``download_map`` is always
+    empty); ``transform`` calls ``gdc.search('cases', ...)`` and saves the
+    result as the Xena matrix. Only projects that are keys of
+    ``CASES_FIELDS_EXPANDS`` are accepted by the constructor.
+
+    Attributes:
+        projects (str or list): One (string) or a list of GDC's
+            "cases.project.project_id". All corresponding projects will be
+            included in this dataset.
+        gdc_release (str): URL to the data release note for the dataset. It
+            will be used by the ``metadata`` method when making the metadata
+            for this dataset. It is highly recommended that this attribute is
+            set explicitly by the user so that it is guaranteed to match the
+            data (raw data) underlying this dataset. If it is not available,
+            the most recent data release will be queried and used.
+        metadata_vars (dict): A dict of variables which will be used (by \*\*
+            unpacking) when rendering the ``metadata_template``. Defaults, if
+            needed, can be derived from corresponding matrix and ``projects``
+            and ``xena_dtype`` properties.
+    """
+
+    @property
+    def gdc_release(self):
+        """str: URL of the GDC data release note for this dataset.
+
+        When it has not been set explicitly, the current release is looked
+        up through ``gdc.search('status', ...)`` and turned into an anchor
+        on ``GDC_RELEASE_URL``; the result is cached on the instance.
+        """
+        try:
+            return self.__gdc_release
+        except AttributeError:
+            data_release = gdc.search('status', typ='json')['data_release']
+            # "Data Release 18.0 ..." -> "data-release-180"
+            anchor = (
+                re.match(r'(Data Release [^\s]+)\s', data_release)
+                .group(1)
+                .replace(' ', '-')
+                .replace('.', '')
+                .lower()
+            )
+            self.__gdc_release = GDC_RELEASE_URL + '#' + anchor
+            return self.__gdc_release
+
+    @gdc_release.setter
+    def gdc_release(self, url):
+        self.__gdc_release = url
+
+    @property
+    def metadata_vars(self):
+        """dict: Variables used for rendering ``metadata_template``.
+
+        A previously stored non-empty dict is returned as is. Otherwise
+        defaults are derived from the matrix file's modification time,
+        ``projects``, ``gdc_release`` and ``METADATA_VARIABLES``, and cached.
+        """
+        # Explicit validation instead of ``assert``: assertions are removed
+        # when Python runs with ``-O``, which would let an empty or non-dict
+        # value through.
+        try:
+            cached = self.__metadata_vars
+        except AttributeError:
+            cached = None
+        if cached and isinstance(cached, dict):
+            return cached
+
+        matrix_date = time.strftime(
+            "%m-%d-%Y", time.gmtime(os.path.getmtime(self.matrix))
+        )
+        projects = ','.join(self.projects)
+        variables = {
+            'project_id': projects,
+            'date': matrix_date,
+            'gdc_release': self.gdc_release,
+        }
+        if projects in GDC_XENA_COHORT:
+            variables['xena_cohort'] = GDC_XENA_COHORT[projects]
+        else:
+            variables['xena_cohort'] = 'GDC ' + projects
+        # "projects" is what template.api_phenotype.meta.json tests to decide
+        # whether the CPTAC-3 description is emitted.
+        variables["projects"] = projects
+        try:
+            variables.update(METADATA_VARIABLES[self.xena_dtype])
+        except KeyError:
+            # No extra variables are defined for this data type.
+            pass
+        self.__metadata_vars = variables
+        return self.__metadata_vars
+
+    @metadata_vars.setter
+    def metadata_vars(self, variables):
+        self.__metadata_vars = variables
+
+    @XenaDataset.download_map.getter
+    def download_map(self):
+        """dict: Always empty; this dataset is built from API queries only."""
+        print("Xena_phenotype is selected. No files will be downloaded.")
+        return {}
+
+    def __get_samples_clinical(self, projects, fields, expand):
+        """Get info for all samples of ``projects`` and clinical info for all
+        cases of ``projects`` through GDC API.
+
+        Args:
+            projects (list or str): one (str) or a list of GDC "project_id"(s),
+                whose info will be returned. If None, projects will not be
+                filtered, i.e. info for all GDC projects will be returned.
+            fields (list or str): one (str) or a list of GDC "cases" fields
+                passed to ``gdc.search``.
+            expand (list or str): one (str) or a list of GDC "expand" groups
+                passed to ``gdc.search``.
+
+        Returns:
+            pandas.core.frame.DataFrame: A DataFrame organized by samples,
+            having info for all samples of ``projects``, as well as
+            corresponding clinical info.
+        """
+
+        in_filter = {}
+        if projects is not None:
+            if isinstance(projects, list):
+                in_filter = {'project.project_id': projects}
+            else:
+                in_filter = {'project.project_id': [projects]}
+        res = gdc.search(
+            'cases',
+            in_filter=in_filter,
+            fields=fields,
+            expand=expand,
+            typ='json',
+        )
+        to_drops = set()
+        for ele in res:
+            to_drops.update(gdc.get_to_drops(ele))
+        print(
+            "Dropping columns {} for {} projects".format(to_drops, projects)
+        )
+        # pandas >= 1.0 exposes ``json_normalize`` at the top level and
+        # deprecates the ``pandas.io.json`` location; fall back to the old
+        # location for older pandas.
+        json_normalize = getattr(pd, 'json_normalize', None)
+        if json_normalize is None:
+            json_normalize = pd.io.json.json_normalize
+        # Case-level columns: everything except the nested "samples" list.
+        reduced_no_samples_json = reduce_json_array(
+            [{k: v for k, v in d.items() if k != 'samples'} for d in res]
+        )
+        cases_df = json_normalize(reduced_no_samples_json)
+        # One row per sample, carrying the case "id" used as the join key.
+        samples_df = json_normalize(
+            [r for r in res if 'samples' in r],
+            'samples',
+            'id',
+            record_prefix='samples.',
+        )
+        merged_df = pd.merge(cases_df, samples_df, how='inner', on='id')
+        merged_df.drop(list(to_drops), axis=1, inplace=True)
+        return merged_df
+
+    def __init__(
+        self,
+        projects,
+        root_dir='.',
+        matrix_dir=None,
+    ):
+        """Set up a 'Xena_phenotype' dataset for ``projects``.
+
+        Args:
+            projects (str or list): GDC project_id(s) for this dataset.
+            root_dir (str): Passed on to ``XenaDataset``. Defaults to '.'.
+            matrix_dir (str): Passed on to ``XenaDataset``. Defaults to None.
+
+        Raises:
+            NotImplementedError: If any project is not a key of
+                ``CASES_FIELDS_EXPANDS``.
+        """
+        super(GDCAPIPhenoset, self).__init__(
+            projects, 'Xena_phenotype', root_dir, matrix_dir,
+        )
+        self.projects = projects
+        if any(
+            project not in CASES_FIELDS_EXPANDS for project in self.projects
+        ):
+            raise NotImplementedError(
+                "'Xena_phenotype' for {} project is not implemented".format(
+                    projects
+                )
+            )
+        jinja2_env = jinja2.Environment(
+            loader=jinja2.PackageLoader("xena_gdc_etl", "resources")
+        )
+        self.metadata_template = jinja2_env.get_template(
+            "template.api_phenotype.meta.json"
+        )
+
+    def transform(self):
+        """Query the GDC API and save the phenotype matrix to ``self.matrix``.
+
+        Returns:
+            self: allow method chaining.
+
+        Raises:
+            NotImplementedError: If ``self.projects`` is anything other than
+                ``["CPTAC-3"]``, the only combination handled so far.
+        """
+        # Fail with a clear error instead of hitting an unbound
+        # ``xena_matrix`` below.
+        if self.projects != ["CPTAC-3"]:
+            raise NotImplementedError(
+                "'Xena_phenotype' for {} project is not implemented".format(
+                    self.projects
+                )
+            )
+        xena_matrix = self.__get_samples_clinical(
+            projects=["CPTAC-3"],
+            fields=CASES_FIELDS_EXPANDS["CPTAC-3"]["fields"],
+            expand=CASES_FIELDS_EXPANDS["CPTAC-3"]["expand"],
+        )
+        xena_matrix = xena_matrix.set_index("samples.submitter_id")
+        print('\rSaving matrix to {} ...'.format(self.matrix), end='')
+        mkdir_p(self.matrix_dir)
+        xena_matrix.to_csv(self.matrix, sep='\t', encoding='utf-8')
+        print('\rXena matrix is saved at {}.'.format(self.matrix))
+        return self
+
+
+
+
 class GDCSurvivalset(XenaDataset):
  r"""GDCSurvivalset is derived from the ``XenaDataset`` class and represents
  for a Xena matrix of GDC survival data for project(s) of interest.