Merge pull request #92 from GeneDx/ACB-66
fix: changes the column headers in the phenotype.hpoa file, migrates …
mcgeestephen authored Jun 6, 2023
2 parents aafbd73 + 4f2f29e commit d0cd38c
Showing 12 changed files with 40 additions and 302 deletions.
1 change: 0 additions & 1 deletion .github/workflows/pythonpackage.yml
@@ -4,7 +4,6 @@ on: [push]

jobs:
build:

runs-on: ubuntu-latest
strategy:
max-parallel: 4
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,5 +1,5 @@
# source image
FROM python:3.7
FROM python:3.9

# set noninteractive mode
ENV DEBIAN_FRONTEND noninteractive
2 changes: 0 additions & 2 deletions Pipfile
@@ -14,11 +14,9 @@ twine = "*"
gensim = "<4.0"
obonet = "*"
fire = "*"
lightgbm = "*"
pandas = "*"
numpy = "*"
scipy = "*"
joblib = "1.1.1"
requests = "*"

[requires]
82 changes: 1 addition & 81 deletions README.md
@@ -4,7 +4,7 @@
[![DOI](https://zenodo.org/badge/207335538.svg)](https://zenodo.org/badge/latestdoi/207335538)

# phenopy
`phenopy` is a Python package to perform phenotype similarity scoring by semantic similarity. `phenopy` is a
`phenopy` is a Python (3.7) package to perform phenotype similarity scoring by semantic similarity. `phenopy` is a
lightweight but highly optimized command line tool and library to efficiently perform semantic similarity scoring on
generic entities with phenotype annotations from the [Human Phenotype Ontology (HPO)](https://hpo.jax.org/app/).

@@ -23,13 +23,6 @@ cd phenopy
python setup.py install
```

**To complete installation on macOS please install lightgbm using brew**
```bash
brew install lightgbm
```

or by following macOS installation instructions from [lightgbm documentation](https://lightgbm.readthedocs.io/en/latest/Installation-Guide.html#macos).

## Command Line Usage
### score
`phenopy` is primarily used as a command line tool. An entity, as described here, is presented as a sample, gene, or
@@ -129,32 +122,6 @@ using `--output-file=/path/to/output_file.txt`
phenopy score tests/data/test.score-short.txt --summarization-method BMWA --threads 4
```
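The records file passed to `score` (and, below, to `likelihood`) uses a three-column layout: a record id, `key=value` metadata, and delimited HPO term ids. A minimal parsing sketch, with an illustrative record (the delimiter can be a comma or a pipe depending on the test file):

```python
# Hypothetical record in the three-column input format phenopy consumes:
# record_id <TAB> key=value metadata <TAB> delimited HPO ids
line = "118200\tage=9.0\tHP:0001263,HP:0001251,HP:0001290"
record_id, info, phenotypes = line.split("\t")
terms = phenotypes.split(",")
print(record_id)  # 118200
print(terms)      # ['HP:0001263', 'HP:0001251', 'HP:0001290']
```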

### likelihood
Phenopy can be used to predict the likelihood of a molecular diagnosis given an input set of HPO phenotypes. This functionality takes the same input records file as the `score` functionality. The likelihood command outputs the probability of finding a molecular diagnosis, using a model trained on 46,674 probands, the majority of whom have a neurodevelopmental delay phenotype.

To score a list of records with phenotypes:

```bash
phenopy likelihood tests/data/test.score-long.txt
```

If the `output_file` argument is not set, this command writes a file named `phenopy.likelihood_moldx.txt` to your current working directory.
Look at the predicted probabilities for the first five records:

```bash
$ head -5 phenopy.likelihood_moldx.txt
```

The columns are `record_id` and `probability_of_molecular_diagnosis`:

```bash
118200 0.34306641357469214
118210 0.47593450032769
118220 0.385742949333819
118230 0.5833031588175435
118300 0.5220058151734898
```
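Since the output above is plain tab-separated text, it can be loaded directly with pandas. A sketch using an illustrative in-memory fragment in place of the real `phenopy.likelihood_moldx.txt`:

```python
import io
import pandas as pd

# Illustrative output fragment: record_id <TAB> probability
output = "118200\t0.343\n118230\t0.583\n"
df = pd.read_csv(
    io.StringIO(output),
    sep="\t",
    names=["record_id", "probability_of_molecular_diagnosis"],
)
# record with the highest predicted probability
top = df.loc[df["probability_of_molecular_diagnosis"].idxmax(), "record_id"]
print(top)  # 118230
```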

#### Parameters
For a full list of command arguments use `phenopy [subcommand] --help`:
```bash
@@ -225,53 +192,6 @@ Output:
0.11213185474495047
```
### likelihood
**Generate the hpo network and supporting objects**:
```python
import os
from phenopy.build_hpo import generate_annotated_hpo_network
from phenopy.util import read_phenotype_groups
# data directory
phenopy_data_directory = os.path.join(os.getenv('HOME'), '.phenopy/data')
# files used in building the annotated HPO network
obo_file = os.path.join(phenopy_data_directory, 'hp.obo')
disease_to_phenotype_file = os.path.join(phenopy_data_directory, 'phenotype.hpoa')
hpo_network, alt2prim, disease_records = \
generate_annotated_hpo_network(obo_file, disease_to_phenotype_file)
```
**Read the phenotype_groups file and the records file into a pandas DataFrame:**
```python
import pandas as pd
phenotype_groups = read_phenotype_groups()
df = pd.read_csv(
"tests/data/test.score-long.txt",
sep="\t",
header=None,
names=["record_id", "info", "phenotypes"]
)
df["phenotypes"] = df["phenotypes"].apply(lambda row: row.split("|"))
```
**Predict probabilities from the phenotypes in the DataFrame:**
```python
from phenopy.likelihood import predict_likelihood_moldx
probabilities = predict_likelihood_moldx(df["phenotypes"])
print(probabilities[:5])
[0.34306641 0.4759345 0.38574295 0.58330316 0.52200582]
```
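To tie the returned probabilities back to their record ids, one option is to rank them in a DataFrame. A sketch with illustrative ids and the probabilities printed above:

```python
import pandas as pd

# Illustrative record ids paired with the probabilities shown above
ids = [118200, 118210, 118220, 118230, 118300]
probs = [0.34306641, 0.4759345, 0.38574295, 0.58330316, 0.52200582]
ranked = (
    pd.DataFrame({"record_id": ids, "probability": probs})
    .sort_values("probability", ascending=False)
    .reset_index(drop=True)
)
print(ranked.loc[0, "record_id"])  # 118230, the highest-probability record
```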
### miscellaneous
The library can be used to prune parent phenotypes from the `phenotype.hpoa` and store pruned annotations as a file
2 changes: 1 addition & 1 deletion phenopy/__init__.py
@@ -1,2 +1,2 @@
__project__ = 'phenopy'
__version__ = '0.5.2'
__version__ = '0.5.4'
53 changes: 0 additions & 53 deletions phenopy/__main__.py
@@ -1,14 +1,12 @@
import fire
import itertools
import lightgbm as lgb
import sys

from configparser import NoOptionError, NoSectionError

from phenopy.util import open_or_stdout
from phenopy.build_hpo import generate_annotated_hpo_network
from phenopy.config import config, logger
from phenopy.likelihood import predict_likelihood_moldx
from phenopy.score import Scorer
from phenopy.util import parse_input, half_product
from phenoseries.experiment import run_phenoseries_experiment
@@ -131,61 +129,10 @@ def validate_phenoseries(phenotypic_series_filepath, outdir=None, min_hpos=4, mi
pairwise_mim_scores_file=pairwise_mim_scores_file,
)

def likelihood_moldx(input_file, output_file=None, k_phenotype_groups=1000):
"""
:param input_file: The file path to a file containing three columns. [ID\tkey=value\thpoid,hpoid,hpoid]
:param output_file: The file path to an output file containing the predicted probabilities
:param k_phenotype_groups: The number of phenotype groups to use for encoding phenotypes. The CLI version of phenopy allows for one of [1000, 1500]
"""
try:
obo_file = config.get('hpo', 'obo_file')
except (NoSectionError, NoOptionError):
logger.critical(
'No HPO OBO file found in the configuration file. See "hpo:obo_file" parameter.')
sys.exit(1)
try:
disease_to_phenotype_file = config.get('hpo', 'disease_to_phenotype_file')
except (NoSectionError, NoOptionError):
logger.critical(
'No HPO annotated dataset file found in the configuration file.'
' See "hpo:disease_to_phenotype_file" parameter.'
)
sys.exit(1)

logger.info(f'Loading HPO OBO file: {obo_file}')
hpo_network, alt2prim, _ = \
generate_annotated_hpo_network(obo_file,
disease_to_phenotype_file,
)

# parse input records
input_records = parse_input(input_file, hpo_network, alt2prim)
record_ids = [record["record_id"] for record in input_records]
phenotypes = [record["terms"] for record in input_records]

# predict likelihood of molecular diagnosis
positive_probabilities = predict_likelihood_moldx(
phenotypes,
phenotype_groups=None,
hpo_network=hpo_network,
alt2prim=alt2prim,
k_phenotype_groups=k_phenotype_groups,
)

if output_file is None:
output_file = "phenopy.likelihood_moldx.txt"
try:
with open(output_file, "w") as f:
for sample_id, probability in zip(record_ids, positive_probabilities):
f.write(f"{sample_id}\t{probability}\n")
except IOError:
sys.exit("Something went wrong writing the probabilities to file")


def main():
fire.Fire({
'score': score,
'likelihood': likelihood_moldx,
'validate-phenoseries': validate_phenoseries,
})

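The deleted `likelihood_moldx` command resolved its file paths from the phenopy config, exiting with a critical log message if either key was missing. The lookup pattern is plain `configparser`; a standalone sketch with the section and option names from the code above and an illustrative path value:

```python
import configparser

# Illustrative config matching the "hpo:obo_file" parameter the CLI reads
config = configparser.ConfigParser()
config.read_string("[hpo]\nobo_file = /data/hp.obo\n")

try:
    obo_file = config.get("hpo", "obo_file")
except (configparser.NoSectionError, configparser.NoOptionError):
    obo_file = None  # the CLI logs a critical error and exits here instead

print(obo_file)  # /data/hp.obo
```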
1 change: 0 additions & 1 deletion phenopy/config.py
@@ -134,7 +134,6 @@ def download(url, file_path):

config['models'] = {
'phenopy.wv.model': w2v_vw_path,
'likelihood.model': lmd_data_path,
}
config['age'] = {
'open_access_phenotype_age': os.path.join(
27 changes: 20 additions & 7 deletions phenopy/d2p.py
@@ -18,28 +18,41 @@ def read_hpo_annotation_file(phenotype_annotations_file, hpo_network, logger=Non
:return: records
"""
try:
with open(phenotype_annotations_file, 'r') as tsv_fh:
with open(phenotype_annotations_file, "r") as tsv_fh:
[next(tsv_fh) for _ in range(4)]
reader = csv.DictReader(tsv_fh, delimiter='\t')
# this removes the leading hash
reader.fieldnames[0] = 'DatabaseID'
reader.fieldnames[0] = reader.fieldnames[0].lstrip("#")

records = []

for row in reader:
# phenotype term id
term_id = row['HPO_ID']
term_id = row.get("HPO_ID") if "HPO_ID" in row else row.get("hpo_id")
if term_id not in hpo_network.nodes():
continue
# parse disease id, currently only supports omim entries
db, disease_accession = row['DatabaseID'].split(':')
if db not in ['OMIM']:
database_id = (
row.get("DatabaseID")
if "DatabaseID" in row
else row.get("database_id")
)
db, disease_accession = database_id.split(":")
if db not in ["OMIM"]:
continue
# For now, skip negative phenotype annotations
if row['Qualifier'] == 'NOT':
qualifier = (
row.get("Qualifier") if "Qualifier" in row else row.get("qualifier")
)
if qualifier == "NOT":
continue

records.append((term_id, disease_accession, frequency_converter(row['Frequency'])))
frequency = (
row.get("Frequency") if "Frequency" in row else row.get("frequency")
)
records.append(
(term_id, disease_accession, frequency_converter(frequency))
)

return records

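The rewritten `read_hpo_annotation_file` accepts both the old capitalized `phenotype.hpoa` headers (`DatabaseID`, `HPO_ID`, …) and the new snake_case ones (`database_id`, `hpo_id`, …). A standalone sketch of the same fallback lookup over an illustrative snake_case fragment:

```python
import csv
import io

# Illustrative phenotype.hpoa fragment using the new snake_case headers
tsv = (
    "database_id\tqualifier\thpo_id\tfrequency\n"
    "OMIM:101600\t\tHP:0000175\t1/10\n"
)
reader = csv.DictReader(io.StringIO(tsv), delimiter="\t")
for row in reader:
    # same either-header lookup as the updated d2p.py
    term_id = row.get("HPO_ID") if "HPO_ID" in row else row.get("hpo_id")
    disease = row.get("DatabaseID") if "DatabaseID" in row else row.get("database_id")
    print(term_id, disease)  # HP:0000175 OMIM:101600
```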
55 changes: 0 additions & 55 deletions phenopy/likelihood.py

This file was deleted.

13 changes: 9 additions & 4 deletions phenopy/util.py
@@ -1,5 +1,4 @@
import csv
import os
import sys
import networkx as nx
import pandas as pd
@@ -45,10 +44,16 @@ def export_phenotype_hpoa_with_no_parents(phenotype_hpoa_file, phenotype_hpoa_no
exit(1)

no_parents_df = df.copy()
for gene, annotations in df.groupby('#DatabaseID'):
termlist = [node for node in annotations['HPO_ID'].tolist() if node in hpo_network.nodes()]
# Establish the proper column headers (different for various versions)
database_id = "#DatabaseID" if "#DatabaseID" in df.columns else "database_id"
hpo_id = "HPO_ID" if "HPO_ID" in df.columns else "hpo_id"

for gene, annotations in df.groupby(database_id):
termlist = [
node for node in annotations[hpo_id].tolist() if node in hpo_network.nodes()
]
termlist = remove_parents(termlist, hpo_network)
parent_idx = annotations.loc[~annotations['HPO_ID'].isin(termlist)].index
parent_idx = annotations.loc[~annotations[hpo_id].isin(termlist)].index
no_parents_df.drop(parent_idx, inplace=True)

try:
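`export_phenotype_hpoa_with_no_parents` applies the same version tolerance at the DataFrame level, resolving whichever header pair is present before grouping. A standalone sketch with illustrative annotation rows:

```python
import pandas as pd

# Illustrative annotations in the new snake_case header style
df = pd.DataFrame({
    "database_id": ["OMIM:101600", "OMIM:101600", "OMIM:154700"],
    "hpo_id": ["HP:0000175", "HP:0001250", "HP:0001166"],
})
# same either-header detection as the updated util.py
database_id = "#DatabaseID" if "#DatabaseID" in df.columns else "database_id"
hpo_id = "HPO_ID" if "HPO_ID" in df.columns else "hpo_id"
for disease, annotations in df.groupby(database_id):
    print(disease, annotations[hpo_id].tolist())
```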
18 changes: 8 additions & 10 deletions setup.py
@@ -14,7 +14,7 @@
description='Phenotype comparison scoring by semantic similarity.',
long_description=long_description,
long_description_content_type='text/markdown',
author='Kevin Arvai <[email protected]>, Kyle Retterer <[email protected]>, Carlos Borroto <[email protected]>, Vlad Gainullin <[email protected]>, Vincent Ustach <[email protected]>',
author='Kevin Arvai <[email protected]>, Kyle Retterer <[email protected]>, Carlos Borroto <[email protected]>, Vlad Gainullin <[email protected]>, Vincent Ustach <[email protected]>, Stephen McGee <[email protected]>',
author_email='<[email protected]>',
license='',
entry_points={
@@ -24,15 +24,13 @@
},
include_package_data=True,
install_requires=[
'fire',
'gensim<4.0 ',
'networkx',
'numpy',
'fire==0.5.0',
'gensim<4.0',
'networkx==2.6.3',
'numpy==1.21.6',
'obonet',
'pandas',
'joblib',
'scipy',
'lightgbm',
'requests',
'pandas==1.3.5',
'scipy==1.7.3',
'requests==2.31.0',
]
)
