edge clustering

patflick · May 11, 2014 · 57b051e · 57b051e
1 parent 5680505
commit 57b051e
Show file tree

Hide file tree

Showing 6 changed files with 183 additions and 9 deletions.
diff --git a/analysis/cluster_score_hist.R b/analysis/cluster_score_hist.R
@@ -256,7 +256,7 @@ get_top_global <- function()
 get_top_ts <- function()
 {
  clusterer <- "PLM-gamma-50.0"
- excl_types <- c("Global", "GLOBAL", "EdgeScoring")
+ excl_types <- c("Global", "GLOBAL", "EdgeScoring", "EdgeCorrelation", "EdgeCoexprCount")
 
  df <- data.frame()
 
@@ -287,7 +287,7 @@ get_top_ts_vs_global <- function(ppi_name="string", expr_name="gene_atlas")
  # define good clusters
  top_percent <- 0.2
  clusterer <- "PLM-gamma-50.0"
- excl_types <- c("Global", "GLOBAL", "EdgeScoring")
+ excl_types <- c("Global", "GLOBAL", "EdgeScoring", "EdgeCorrelation", "EdgeCoexprCount")
 
  # get data
  data <- get_cluster_data(ppi_name, expr_name, clusterer)
@@ -341,6 +341,56 @@ get_top_ts_vs_global <- function(ppi_name="string", expr_name="gene_atlas")
  return(df)
 }
 
+# Compare the bpscore of the top edge-weighted clusters against the top
+# global clusters for one PPI / expression pair, using t.test() defaults.
+# Returns a one-row data.frame: ppi, expr, mean_edge, mean_global, greater, pval.
+get_top_edge_vs_global <- function(ppi_name="string", expr_name="gene_atlas", edge_score_type="EdgeCoexprCount")
+{
+  top_percent <- 0.2
+  clusterer <- "PLM-gamma-50.0"
+  global_type <- "GLOBAL"
+
+  # get data
+  data <- get_cluster_data(ppi_name, expr_name, clusterer)
+
+  # rows of one graph type, highest modularity first, cut to the top fraction;
+  # head() yields zero rows for an empty subset where 1:0 would add an NA row
+  top_clusters <- function(type)
+  {
+    rows <- data[which(data$type == type), ]
+    rows <- rows[order(-rows$modularity), ]
+    head(rows, ceiling(nrow(rows) * top_percent))
+  }
+
+  data_g <- top_clusters(global_type)
+  data_e <- top_clusters(edge_score_type)
+
+  # NA placeholders are reported when either group is too small for statistics
+  mean_e <- NA
+  mean_g <- NA
+  greater <- FALSE
+  t_pvalue <- NA
+  if (nrow(data_e) >= 2 && nrow(data_g) >= 2)
+  {
+    tt <- t.test(data_e$bpscore, data_g$bpscore)
+    # [[ ]] drops the "mean of x"/"mean of y" names, which would otherwise
+    # become the row name of the returned data.frame
+    mean_e <- tt$estimate[[1]]
+    mean_g <- tt$estimate[[2]]
+    greater <- mean_e > mean_g
+    t_pvalue <- tt$p.value
+  }
+
+  df <- data.frame(ppi=ppi_name, expr=expr_name,
+                   mean_edge=mean_e, mean_global=mean_g,
+                   greater=greater, pval=t_pvalue,
+                   stringsAsFactors=FALSE)
+  df$ppi <- to_short_ppi_name(df$ppi)
+  df$expr <- to_short_expr_name(df$expr)
+  return(df)
+}
+
+
 get_top_ts_vs_global_sum <- function()
 {
  df <- data.frame()

diff --git a/analysis/data/benchmark_tsPLP_pretty.csv b/analysis/data/benchmark_tsPLP_pretty.csv
@@ -0,0 +1,26 @@
+,ppi,expr,N,M,T,naive,ts,pe
+10,STRING,Gene Atlas,13001,251809,84,2.56907,6.62616,STRING - Gene Atlas
+7,STRING,HPA All,10120,125481,61,0.808105,4.46422,STRING - HPA All
+15,Bossi,Gene Atlas,9048,69013,84,0.575307,1.12931,Bossi - Gene Atlas
+20,IMEx,Gene Atlas,9576,51008,84,0.411124,1.02708,IMEx - Gene Atlas
+9,STRING,RNAseq Atlas,14899,287240,11,0.390199,0.840285,STRING - RNAseq Atlas
+12,Bossi,HPA All,6691,38246,61,0.236283,0.863832,Bossi - HPA All
+17,IMEx,HPA All,7384,29927,61,0.198252,0.534086,IMEx - HPA All
+6,STRING,HPA,2884,27383,66,0.152913,0.585204,STRING - HPA
+8,STRING,Body Map,8662,105541,14,0.121633,0.204149,STRING - Body Map
+25,Havugimana,Gene Atlas,2711,12092,84,0.087077,0.178565,Havugimana - Gene Atlas
+14,Bossi,RNAseq Atlas,9559,71914,11,0.073109,0.165591,Bossi - RNAseq Atlas
+5,HI-2012,Gene Atlas,3675,9576,84,0.071346,0.101304,HI-2012 - Gene Atlas
+2,HI-2012,HPA All,2683,5691,61,0.063665,0.110575,HI-2012 - HPA All
+11,Bossi,HPA,2358,8728,66,0.060641,0.15166,Bossi - HPA
+19,IMEx,RNAseq Atlas,10357,53672,11,0.053638,0.091202,IMEx - RNAseq Atlas
+16,IMEx,HPA,2407,6753,66,0.046879,0.088674,IMEx - HPA
+22,Havugimana,HPA All,2104,6630,61,0.045837,0.099423,Havugimana - HPA All
+13,Bossi,Body Map,6097,37520,14,0.043955,0.066293,Bossi - Body Map
+18,IMEx,Body Map,6881,27059,14,0.031473,0.046274,IMEx - Body Map
+4,HI-2012,RNAseq Atlas,4231,13718,11,0.018091,0.038564,HI-2012 - RNAseq Atlas
+21,Havugimana,HPA,911,1613,66,0.017955,0.015832,Havugimana - HPA
+24,Havugimana,RNAseq Atlas,2851,12796,11,0.015188,0.02831,Havugimana - RNAseq Atlas
+23,Havugimana,Body Map,2132,7294,14,0.010975,0.011336,Havugimana - Body Map
+3,HI-2012,Body Map,2457,3901,14,0.009957,0.012694,HI-2012 - Body Map
+1,HI-2012,HPA,800,662,66,0.009619,0.007425,HI-2012 - HPA
diff --git a/analysis/data/edgecoexrcount_clustering_pretty.csv b/analysis/data/edgecoexrcount_clustering_pretty.csv
@@ -0,0 +1,26 @@
+,ppi,expr,mean_edge,mean_global,greater,pval
+mean of x,Bossi,Body Map,0.2648902840893,0.266953980419442,FALSE,0.918920727505963
+mean of x1,Bossi,Gene Atlas,0.234997567270818,0.268768984570892,FALSE,0.0800814683364346
+mean of x2,Bossi,RNAseq Atlas,0.247563736824431,0.269039595624559,FALSE,0.207555453549585
+mean of x3,Bossi,HPA,0.244801446546063,0.270390694390233,FALSE,0.415180194790716
+mean of x4,Bossi,HPA All,0.22322416330079,0.255815612686531,FALSE,0.102903144320357
+mean of x5,STRING,Body Map,0.309717382633401,0.312498405809694,FALSE,0.869492230658319
+mean of x6,STRING,Gene Atlas,0.256759986972471,0.292617509947238,FALSE,0.0334671413915068
+mean of x7,STRING,RNAseq Atlas,0.274867511367292,0.282502269569477,FALSE,0.646792101626966
+mean of x8,STRING,HPA,0.309936365020903,0.331837450284542,FALSE,0.438964893801547
+mean of x9,STRING,HPA All,0.269986283001024,0.284681405997942,FALSE,0.428882956635981
+mean of x10,IMEx,Body Map,0.180252142049832,0.180322381072929,FALSE,0.99699156321098
+mean of x11,IMEx,Gene Atlas,0.15963814037064,0.189383725812873,FALSE,0.0809056999027601
+mean of x12,IMEx,RNAseq Atlas,0.169878643929642,0.186585548762292,FALSE,0.289732450142627
+mean of x13,IMEx,HPA,0.175833352690299,0.188968251841269,FALSE,0.622663568764704
+mean of x14,IMEx,HPA All,0.14437836598175,0.157127546926947,FALSE,0.47337000762031
+mean of x15,Havugimana,Body Map,0.240129000263812,0.222445895419041,TRUE,0.68559933893789
+mean of x16,Havugimana,Gene Atlas,0.276202524763661,0.262638724841425,TRUE,0.72239588539903
+mean of x17,Havugimana,RNAseq Atlas,0.255364581995594,0.265188479996711,FALSE,0.783920948443039
+mean of x18,Havugimana,HPA,0.334591979332534,0.317873631940377,TRUE,0.835548092197443
+mean of x19,Havugimana,HPA All,0.261364058740685,0.282477690121773,FALSE,0.624794571794092
+mean of x20,HI-2012,Body Map,0.094614739814077,0.102804490717259,FALSE,0.79657051278028
+mean of x21,HI-2012,Gene Atlas,0.14729526768552,0.0716357922517101,TRUE,0.049988484053078
+mean of x22,HI-2012,RNAseq Atlas,0.0885722632180316,0.0661317164621626,TRUE,0.290710654192392
+mean of x23,HI-2012,HPA,0.0305892735376451,0.128308628385925,FALSE,0.23377179349928
+mean of x24,HI-2012,HPA All,0.077846456223171,0.053143155711485,TRUE,0.217736957163601
diff --git a/analysis/data/edgecorrelation_clustering_pretty.csv b/analysis/data/edgecorrelation_clustering_pretty.csv
@@ -0,0 +1,26 @@
+,ppi,expr,mean_edge,mean_global,greater,pval
+mean of x,Bossi,Body Map,0.259886787888398,0.266953980419442,FALSE,0.72605564485992
+mean of x1,Bossi,Gene Atlas,0.264411221550878,0.268768984570892,FALSE,0.789389939918772
+mean of x2,Bossi,RNAseq Atlas,0.24957457435218,0.269039595624559,FALSE,0.200732173255965
+mean of x3,Bossi,HPA,0.248064312446627,0.270390694390233,FALSE,0.42851284134202
+mean of x4,Bossi,HPA All,0.241658573377702,0.255815612686531,FALSE,0.476578591073371
+mean of x5,STRING,Body Map,0.305765064322964,0.312498405809694,FALSE,0.694113317581276
+mean of x6,STRING,Gene Atlas,0.297255758284334,0.292617509947238,TRUE,0.778896897880408
+mean of x7,STRING,RNAseq Atlas,0.281449437174647,0.282502269569477,FALSE,0.948299805452851
+mean of x8,STRING,HPA,0.326158766741519,0.331837450284542,FALSE,0.8395247437801
+mean of x9,STRING,HPA All,0.284809039812838,0.284681405997942,TRUE,0.994441351303567
+mean of x10,IMEx,Body Map,0.182327881057344,0.180322381072929,TRUE,0.913238786970288
+mean of x11,IMEx,Gene Atlas,0.183378698791555,0.189383725812873,FALSE,0.690998368149286
+mean of x12,IMEx,RNAseq Atlas,0.179187965181961,0.186585548762292,FALSE,0.640820975241596
+mean of x13,IMEx,HPA,0.189303725723412,0.188968251841269,TRUE,0.989744647932498
+mean of x14,IMEx,HPA All,0.148786310168953,0.157127546926947,FALSE,0.634700420674442
+mean of x15,Havugimana,Body Map,0.243448903690265,0.222445895419041,TRUE,0.618502834405426
+mean of x16,Havugimana,Gene Atlas,0.266751987859751,0.262638724841425,TRUE,0.906394179627094
+mean of x17,Havugimana,RNAseq Atlas,0.256226282562288,0.265188479996711,FALSE,0.798573050849972
+mean of x18,Havugimana,HPA,0.35650666421274,0.317873631940377,TRUE,0.629146853313002
+mean of x19,Havugimana,HPA All,0.262854231313305,0.282477690121773,FALSE,0.652391384370894
+mean of x20,HI-2012,Body Map,0.0943165514458018,0.102804490717259,FALSE,0.794937492766682
+mean of x21,HI-2012,Gene Atlas,0.066932015579464,0.0716357922517101,FALSE,0.819432225241818
+mean of x22,HI-2012,RNAseq Atlas,0.0675531071182213,0.0661317164621626,TRUE,0.940115549082296
+mean of x23,HI-2012,HPA,0.0572170344093818,0.128308628385925,FALSE,0.508111341250778
+mean of x24,HI-2012,HPA All,0.0673648952389213,0.053143155711485,TRUE,0.458328678039557
diff --git a/analysis/plot_benchmarks.R b/analysis/plot_benchmarks.R
@@ -121,6 +121,44 @@ plot_bw_benchmark <- function()
  return (fig)
 }
 
+# Load the serial tsPLP benchmark table, ordered by descending `naive` time,
+# with short display names and a combined "<ppi> - <expr>" column `pe`.
+get_plp_data <- function()
+{
+  bench <- read.csv("./data/benchmark_tsPLP.csv", header=TRUE, sep=";")
+
+  # largest naive run time first
+  bench <- bench[order(-bench$naive), ]
+
+  # map data set identifiers to their short names
+  bench$ppi <- to_short_ppi_name(bench$ppi)
+  bench$expr <- to_short_expr_name(bench$expr)
+
+  bench$pe <- paste(bench$ppi, bench$expr, sep=" - ")
+  bench
+}
+
+# Dodged, flipped bar chart of the `naive` and `ts` run times for the first
+# six rows returned by get_plp_data(); returns the ggplot object.
+plot_plp_benchmark <- function()
+{
+  bench <- get_plp_data()
+  top_runs <- bench[1:6, c("pe", "naive", "ts")]
+  colnames(top_runs) <- c("PPI.Expr", "Create Subgraphs", "Use Tissue Vectors")
+
+  # long format: one row per (data set, method) pair
+  long <- melt(top_runs, id.vars="PPI.Expr")
+  colnames(long) <- c("PPI.Expr", "Method", "Runtime")
+
+  # label position is capped at 60% of the largest run time
+  bar_labels <- geom_text(aes(PPI.Expr, pmin(max(Runtime)*.60,Runtime), label=paste(round(Runtime,1),"s"), hjust=0), size=3.5, position = position_dodge(width=1))
+
+  fig <- ggplot(long, aes(PPI.Expr, Runtime, fill=Method)) +
+    geom_bar(stat="identity", position="dodge") + bar_labels + coord_flip() +
+    labs(x="", y="Run time [s]", title="Run time of PLP for adapted algorithm")
+  return (fig)
+}
+
 save_plots <- function()
 {
  # clustering coeff. benchmark
@@ -134,4 +172,10 @@ save_plots <- function()
  pdf("../figs/benchmark_bw.pdf", width=8, height=3.5)
  print(fig)
  dev.off()
+
+ # PLP benchmarks
+ fig <- plot_plp_benchmark()
+ pdf("../figs/benchmark_plp.pdf", width=8, height=3.2)
+ print(fig)
+ dev.off()
 }
diff --git a/src/networkit_clustering.py b/src/networkit_clustering.py
@@ -189,7 +189,7 @@ def __del__(self):
 def run_and_score_clustering(graph, clusterer, scorer, writer=None, category=None):
  # run clustering
  start = time.time()
- clusters = clusterer.run(g)
+ clusters = clusterer.run(graph)
  t_cluster = time.time() - start
  # print histogram
  #hist = cluster_hist(clusters)
@@ -248,8 +248,10 @@ def run_ts_clustering(tsppi, clusterer, scorer, writer=None):
 
 def run_edgescore_clustering(tsppi, clusterer, scorer, writer=None):
  print("Scoring on EdgeScore graph (TS/Global hybrid via edge weighting)")
- g = tsppi.getEdgeScoreGraph()
- run_and_score_clustering(g, clusterer, scorer, writer, "EdgeScoring")
+ #g = tsppi.getEdgeCorrelationGraph()
+ #run_and_score_clustering(g, clusterer, scorer, writer, "EdgeCorrelation")
+ g = tsppi.getEdgeCoexprCountGraph()
+ run_and_score_clustering(g, clusterer, scorer, writer, "EdgeCoexprCount")
 
 
 def run_global_clustering(tsppi, clusterer, scorer, writer=None):
@@ -286,7 +288,8 @@ def get_scorer(con):
 #gamma = 1
 # ("PLM-gamma-1.0", ppi_networkit.PLM(gamma=1)),
 #clusterers = [("PLM-gamma-5.0", ppi_networkit.PLM(gamma=5)), ("PLM-gamma-10.0", ppi_networkit.PLM(gamma=10)), ("PLM-gamma-50.0", ppi_networkit.PLM(gamma=50)), ("PLP", ppi_networkit.PLP()), ("CNM", ppi_networkit.CNM())]
-clusterers = [("PLM-gamma-100.0", ppi_networkit.PLM(gamma=100))]
+#clusterers = [("PLM-gamma-100.0", ppi_networkit.PLM(gamma=100))]
+clusterers = [("PLM-gamma-50.0", ppi_networkit.PLM(gamma=50))]
 for clusterer_name, clusterer in clusterers:
  writer.set_clusterer(clusterer_name)
 
@@ -303,11 +306,10 @@ def get_scorer(con):
 
  # get graph
  tsppi = sqlio.load_tsppi_graph(ppi, expr)
- g = tsppi.getGraph()
 
  # run the clustering algos
- run_global_clustering(tsppi, clusterer, scorer, writer)
- #run_ts_clustering(tsppi, clusterer, scorer, writer)
+ #run_global_clustering(tsppi, clusterer, scorer, writer)
+ run_ts_clustering(tsppi, clusterer, scorer, writer)
  #run_edgescore_clustering(tsppi, clusterer, scorer, writer)
 
  # commit all current changes to the SQL server