Skip to content

Commit

Permalink
calibrated scores
Browse files Browse the repository at this point in the history
  • Loading branch information
Magnushhoie committed Apr 25, 2023
1 parent 645880d commit e26335a
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 21 deletions.
46 changes: 46 additions & 0 deletions src/mlscripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,52 @@ def extract_pdb(pdb, X_test, y_test, df_test):

return X_test[m], y_test[m], df_test[m]

def normalize_scores_load_models(
    df,
    score_col: str = "DiscoTope-3.0_score",
    len_col: str = "length",
    models_dir="models",
) -> np.ndarray:
    """Load the two fitted GAMs from ``models_dir`` and normalize ``df`` scores.

    Args:
        df: DataFrame holding the raw scores; passed unchanged to
            ``normalize_scores``.
        score_col: Name of the column with the raw scores.
        len_col: Name of the column with the length value.
        models_dir: Directory containing ``gam_len_to_mean.pkl`` and
            ``gam_surface_to_std.pkl``.

    Returns:
        The normalized scores as returned by ``normalize_scores``.
    """

    # Load GAMs to normalize scores by length and surface area
    gam_len_to_mean = load_gam_model(f"{models_dir}/gam_len_to_mean.pkl")
    gam_surface_to_std = load_gam_model(f"{models_dir}/gam_surface_to_std.pkl")

    # Forward the column names explicitly; they were previously accepted by
    # this function but never handed on, so overrides had no effect.
    return normalize_scores(
        df,
        gam_len_to_mean,
        gam_surface_to_std,
        score_col=score_col,
        len_col=len_col,
    )

def normalize_scores(
    df: pd.DataFrame,
    gam_len_to_mean: "pygam.pygam.LinearGAM",
    gam_surface_to_std: "pygam.pygam.LinearGAM",
    score_col: str = "DiscoTope-3.0_score",
    len_col: str = "length",
    rsa_col: str = "rsa",
    rsa_threshold: float = 0.20,
) -> np.ndarray:
    """Z-score normalize scores using fitted GAMs on mean and std.

    The mean is predicted by ``gam_len_to_mean`` from the length value and the
    standard deviation by ``gam_surface_to_std`` from the mean raw score of
    the rows whose ``rsa_col`` value is at least ``rsa_threshold``.

    Args:
        df: DataFrame with the score, length and RSA columns.
        gam_len_to_mean: Fitted model exposing ``predict``; called with the
            length.
        gam_surface_to_std: Fitted model exposing ``predict``; called with the
            mean score of the exposed rows.
        score_col: Name of the column with the raw scores.
        len_col: Name of the column with the length; only the first row is
            read.
        rsa_col: Name of the column used to select exposed rows.
        rsa_threshold: Minimum ``rsa_col`` value (inclusive) for a row to
            count as exposed.

    Returns:
        Array of ``(score - predicted_mean) / predicted_std``, one per row.
        If no row reaches ``rsa_threshold`` the mean of the empty selection
        is NaN, and the result depends on what the std model returns for it.

    Raises:
        IndexError: If ``df`` has no rows (the length cannot be read).
    """

    # Parameters
    scores = df[score_col].astype(float).values
    # The length is read from the first row only.
    length = int(df[len_col].iloc[0])

    # Plain boolean array so the mask is applied positionally, whatever the
    # DataFrame index looks like.
    exposed = df[rsa_col].astype(float).values >= rsa_threshold
    mean_surface_score = scores[exposed].mean()

    # Predict
    u = gam_len_to_mean.predict(length)
    std = gam_surface_to_std.predict(mean_surface_score)

    return (scores - u) / std

def load_gam_model(model_path):
    """Unpickle and return the object stored at ``model_path``.

    NOTE: ``pickle`` can execute arbitrary code while loading, so
    ``model_path`` must point to a trusted file.
    """
    with open(model_path, "rb") as handle:
        return pickle.load(handle)

def predict_test_pdb(pdb, models):
"""Get output testing df using saved models"""
Expand Down
48 changes: 27 additions & 21 deletions src/predict_webserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,17 +123,17 @@ def is_valid_path(parser, arg):
)

p.add_argument(
"--z_score_epi_threshold",
"--calibrated_score_epi_threshold",
type=float,
help="Z-score threshold for epitopes (default 0.90)",
help="Calibrated-score threshold for epitopes [low 0.40, moderate (0.90), higher 1.50]",
default=0.90,
)

p.add_argument(
"--no_z_normalization",
"--no_calibrated_normalization",
action="store_true",
default=False,
help="Skip Z-normalization of PDBs",
help="Skip Calibrated-normalization of PDBs",
)

p.add_argument(
Expand Down Expand Up @@ -351,7 +351,7 @@ def normalize_scores(
score_col: str = "DiscoTope-3.0_score",
len_col: str = "length",
) -> np.array:
"""Z-score normalize scores using fitted GAMs on mean and std"""
"""Calibrated-score normalize scores using fitted GAMs on mean and std"""

# Parameters
scores = df[score_col].astype(float).values
Expand All @@ -361,9 +361,9 @@ def normalize_scores(
# Predict
u = gam_len_to_mean.predict(length)
std = gam_surface_to_std.predict(mean_surface_score)
z_scores = (scores - u) / std
calibrated_scores = (scores - u) / std

return z_scores
return calibrated_scores


def predict_and_save(
Expand All @@ -373,8 +373,8 @@ def predict_and_save(
out_dir,
gam_len_to_mean=False,
gam_surface_to_std=False,
z_score_epi_threshold=0.90,
no_z_normalization=False,
calibrated_score_epi_threshold=0.90,
no_calibrated_normalization=False,
verbose: int = 0,
) -> None:
"""Predicts and saves CSV/PDBs with DiscoTope-3.0 scores"""
Expand All @@ -400,7 +400,7 @@ def predict_and_save(
]
)
df_all.insert(4, "DiscoTope-3.0_score", y_all)
df_all.insert(5, "Z_score", np.nan)
df_all.insert(5, "calibrated_score", np.nan)
df_all.insert(6, "epitope", np.nan)

# Round numerical columns to 5 digits for nicer CSV output
Expand Down Expand Up @@ -449,28 +449,28 @@ def predict_and_save(
df = df_all.iloc[start:end]
start = end

# Normalize for length and surface area with Z-scores
z_scores = normalize_scores(df, gam_len_to_mean, gam_surface_to_std)
# Normalize for length and surface area with Calibrated-scores
calibrated_scores = normalize_scores(df, gam_len_to_mean, gam_surface_to_std)

# Epitopes can now be set by fixed threshold, default median epitope Z-score (0.90)
# Epitopes can now be set by fixed threshold, default median epitope Calibrated-score (0.90)
# Nb: All residue median 0.00, exposed 0.50, exposed epitope 0.90
df["epitope"] = z_scores >= z_score_epi_threshold
df["epitope"] = calibrated_scores >= calibrated_score_epi_threshold

# Set Z-scores to string for nicer CSV output
df["Z_score"] = pd.Series(z_scores).apply(lambda x: "{:.5f}".format(x))
# Set Calibrated-scores to string for nicer CSV output
df["calibrated_score"] = pd.Series(calibrated_scores).apply(lambda x: "{:.5f}".format(x))

# Save CSV
outfile = f"{out_dir}/{_pdb}_discotope3.csv"
df.to_csv(outfile, index=False)

# Save PDB with or without Z-normalized scores
if no_z_normalization:
# Save PDB with or without Calibrated-normalized scores
if no_calibrated_normalization:
struc_pred = set_struc_res_bfactor(
struc, df["DiscoTope-3.0_score"].values.astype(float) * 100
)
else:
struc_pred = set_struc_res_bfactor(
struc, df["Z_score"].values.astype(float) * 100
struc, df["calibrated_score"].values.astype(float) * 100
)

outfile = f"{out_dir}/{_pdb}_discotope3.pdb"
Expand Down Expand Up @@ -802,6 +802,12 @@ def load_gam_model(model_path):
def main(args):
"""Main function"""

# Log if multichain mode is set
if args.multichain_mode:
log.info(f"Multi-chain mode set, will predict PDBs as complexes")
else:
log.info(f"Single-chain mode set, will predict PDBs as single chains")

# Directory for input single chains (extracted from input PDBs) and output CSV/PDB results
input_chains_dir = f"{args.out_dir}/input_chains"
out_dir = f"{args.out_dir}/output"
Expand Down Expand Up @@ -921,8 +927,8 @@ def main(args):
out_dir=out_dir,
gam_len_to_mean=gam_len_to_mean,
gam_surface_to_std=gam_surface_to_std,
z_score_epi_threshold=args.z_score_epi_threshold,
no_z_normalization=args.no_z_normalization,
calibrated_score_epi_threshold=args.calibrated_score_epi_threshold,
no_calibrated_normalization=args.no_calibrated_normalization,
verbose=args.verbose,
)

Expand Down

0 comments on commit e26335a

Please sign in to comment.