Skip to content

Commit

Permalink
edge clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
patflick committed May 11, 2014
1 parent 5680505 commit 57b051e
Show file tree
Hide file tree
Showing 6 changed files with 183 additions and 9 deletions.
54 changes: 52 additions & 2 deletions analysis/cluster_score_hist.R
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ get_top_global <- function()
get_top_ts <- function()
{
clusterer <- "PLM-gamma-50.0"
excl_types <- c("Global", "GLOBAL", "EdgeScoring")
excl_types <- c("Global", "GLOBAL", "EdgeScoring", "EdgeCorrelation", "EdgeCoexprCount")

df <- data.frame()

Expand Down Expand Up @@ -287,7 +287,7 @@ get_top_ts_vs_global <- function(ppi_name="string", expr_name="gene_atlas")
# define good clusters
top_percent <- 0.2
clusterer <- "PLM-gamma-50.0"
excl_types <- c("Global", "GLOBAL", "EdgeScoring")
excl_types <- c("Global", "GLOBAL", "EdgeScoring", "EdgeCorrelation", "EdgeCoexprCount")

# get data
data <- get_cluster_data(ppi_name, expr_name, clusterer)
Expand Down Expand Up @@ -341,6 +341,56 @@ get_top_ts_vs_global <- function(ppi_name="string", expr_name="gene_atlas")
return(df)
}

# Compare the best clusters found on an edge-scored graph with the best
# clusters found on the global graph, for one PPI / expression data set pair.
#
# For each of the two cluster types the clusters are ranked by descending
# modularity and the top 20% (rounded up) are kept. The `bpscore` values of
# the two groups are then compared with t.test().
#
# Args:
#   ppi_name:        name of the PPI network, passed to get_cluster_data()
#   expr_name:       name of the expression data set, passed to
#                    get_cluster_data()
#   edge_score_type: value of the `type` column that identifies the
#                    edge-scored clustering to compare against "GLOBAL"
#
# Returns:
#   A one-row data.frame with the columns ppi, expr (both mapped to their
#   short names), mean_edge, mean_global, greater (TRUE if mean_edge exceeds
#   mean_global) and pval. If either group has fewer than two clusters the
#   means and the p-value are NA and greater is FALSE.
get_top_edge_vs_global <- function(ppi_name="string", expr_name="gene_atlas", edge_score_type="EdgeCoexprCount")
{
    # define good clusters
    top_percent <- 0.2
    clusterer <- "PLM-gamma-50.0"
    global_type <- "GLOBAL"

    # keep the top `top_percent` clusters of one type, ranked by modularity
    top_clusters <- function(all_clusters, cluster_type)
    {
        selected <- all_clusters[which(all_clusters$type == cluster_type), ]
        selected <- selected[order(-selected[["modularity"]]), ]
        n_top <- ceiling(nrow(selected) * top_percent)
        # seq_len() yields an empty index for n_top == 0, whereas 1:0 would
        # select a fabricated all-NA row
        selected[seq_len(n_top), ]
    }

    # get data
    data <- get_cluster_data(ppi_name, expr_name, clusterer)
    data_g <- top_clusters(data, global_type)
    data_e <- top_clusters(data, edge_score_type)

    if (nrow(data_e) < 2 || nrow(data_g) < 2)
    {
        # too few for statistics; NA_real_ keeps the columns numeric so the
        # row has the same column types as a row with real statistics
        df <- data.frame(ppi=ppi_name, expr=expr_name,
                         mean_edge=NA_real_, mean_global=NA_real_,
                         greater=FALSE, pval=NA_real_,
                         stringsAsFactors=FALSE)
    } else {
        # get statistics
        tt <- t.test(data_e$bpscore, data_g$bpscore)
        # tt$estimate is a named vector ("mean of x", "mean of y"); strip the
        # names so they do not end up as row names of the result
        means <- unname(tt$estimate)
        mean_e <- means[1]
        mean_g <- means[2]
        df <- data.frame(ppi=ppi_name, expr=expr_name,
                         mean_edge=mean_e, mean_global=mean_g,
                         greater=mean_e > mean_g, pval=tt$p.value,
                         stringsAsFactors=FALSE)
    }

    df$ppi <- to_short_ppi_name(df$ppi)
    df$expr <- to_short_expr_name(df$expr)
    return(df)
}


get_top_ts_vs_global_sum <- function()
{
df <- data.frame()
Expand Down
26 changes: 26 additions & 0 deletions analysis/data/benchmark_tsPLP_pretty.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
,ppi,expr,N,M,T,naive,ts,pe
10,STRING,Gene Atlas,13001,251809,84,2.56907,6.62616,STRING - Gene Atlas
7,STRING,HPA All,10120,125481,61,0.808105,4.46422,STRING - HPA All
15,Bossi,Gene Atlas,9048,69013,84,0.575307,1.12931,Bossi - Gene Atlas
20,IMEx,Gene Atlas,9576,51008,84,0.411124,1.02708,IMEx - Gene Atlas
9,STRING,RNAseq Atlas,14899,287240,11,0.390199,0.840285,STRING - RNAseq Atlas
12,Bossi,HPA All,6691,38246,61,0.236283,0.863832,Bossi - HPA All
17,IMEx,HPA All,7384,29927,61,0.198252,0.534086,IMEx - HPA All
6,STRING,HPA,2884,27383,66,0.152913,0.585204,STRING - HPA
8,STRING,Body Map,8662,105541,14,0.121633,0.204149,STRING - Body Map
25,Havugimana,Gene Atlas,2711,12092,84,0.087077,0.178565,Havugimana - Gene Atlas
14,Bossi,RNAseq Atlas,9559,71914,11,0.073109,0.165591,Bossi - RNAseq Atlas
5,HI-2012,Gene Atlas,3675,9576,84,0.071346,0.101304,HI-2012 - Gene Atlas
2,HI-2012,HPA All,2683,5691,61,0.063665,0.110575,HI-2012 - HPA All
11,Bossi,HPA,2358,8728,66,0.060641,0.15166,Bossi - HPA
19,IMEx,RNAseq Atlas,10357,53672,11,0.053638,0.091202,IMEx - RNAseq Atlas
16,IMEx,HPA,2407,6753,66,0.046879,0.088674,IMEx - HPA
22,Havugimana,HPA All,2104,6630,61,0.045837,0.099423,Havugimana - HPA All
13,Bossi,Body Map,6097,37520,14,0.043955,0.066293,Bossi - Body Map
18,IMEx,Body Map,6881,27059,14,0.031473,0.046274,IMEx - Body Map
4,HI-2012,RNAseq Atlas,4231,13718,11,0.018091,0.038564,HI-2012 - RNAseq Atlas
21,Havugimana,HPA,911,1613,66,0.017955,0.015832,Havugimana - HPA
24,Havugimana,RNAseq Atlas,2851,12796,11,0.015188,0.02831,Havugimana - RNAseq Atlas
23,Havugimana,Body Map,2132,7294,14,0.010975,0.011336,Havugimana - Body Map
3,HI-2012,Body Map,2457,3901,14,0.009957,0.012694,HI-2012 - Body Map
1,HI-2012,HPA,800,662,66,0.009619,0.007425,HI-2012 - HPA
26 changes: 26 additions & 0 deletions analysis/data/edgecoexrcount_clustering_pretty.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
,ppi,expr,mean_edge,mean_global,greater,pval
mean of x,Bossi,Body Map,0.2648902840893,0.266953980419442,FALSE,0.918920727505963
mean of x1,Bossi,Gene Atlas,0.234997567270818,0.268768984570892,FALSE,0.0800814683364346
mean of x2,Bossi,RNAseq Atlas,0.247563736824431,0.269039595624559,FALSE,0.207555453549585
mean of x3,Bossi,HPA,0.244801446546063,0.270390694390233,FALSE,0.415180194790716
mean of x4,Bossi,HPA All,0.22322416330079,0.255815612686531,FALSE,0.102903144320357
mean of x5,STRING,Body Map,0.309717382633401,0.312498405809694,FALSE,0.869492230658319
mean of x6,STRING,Gene Atlas,0.256759986972471,0.292617509947238,FALSE,0.0334671413915068
mean of x7,STRING,RNAseq Atlas,0.274867511367292,0.282502269569477,FALSE,0.646792101626966
mean of x8,STRING,HPA,0.309936365020903,0.331837450284542,FALSE,0.438964893801547
mean of x9,STRING,HPA All,0.269986283001024,0.284681405997942,FALSE,0.428882956635981
mean of x10,IMEx,Body Map,0.180252142049832,0.180322381072929,FALSE,0.99699156321098
mean of x11,IMEx,Gene Atlas,0.15963814037064,0.189383725812873,FALSE,0.0809056999027601
mean of x12,IMEx,RNAseq Atlas,0.169878643929642,0.186585548762292,FALSE,0.289732450142627
mean of x13,IMEx,HPA,0.175833352690299,0.188968251841269,FALSE,0.622663568764704
mean of x14,IMEx,HPA All,0.14437836598175,0.157127546926947,FALSE,0.47337000762031
mean of x15,Havugimana,Body Map,0.240129000263812,0.222445895419041,TRUE,0.68559933893789
mean of x16,Havugimana,Gene Atlas,0.276202524763661,0.262638724841425,TRUE,0.72239588539903
mean of x17,Havugimana,RNAseq Atlas,0.255364581995594,0.265188479996711,FALSE,0.783920948443039
mean of x18,Havugimana,HPA,0.334591979332534,0.317873631940377,TRUE,0.835548092197443
mean of x19,Havugimana,HPA All,0.261364058740685,0.282477690121773,FALSE,0.624794571794092
mean of x20,HI-2012,Body Map,0.094614739814077,0.102804490717259,FALSE,0.79657051278028
mean of x21,HI-2012,Gene Atlas,0.14729526768552,0.0716357922517101,TRUE,0.049988484053078
mean of x22,HI-2012,RNAseq Atlas,0.0885722632180316,0.0661317164621626,TRUE,0.290710654192392
mean of x23,HI-2012,HPA,0.0305892735376451,0.128308628385925,FALSE,0.23377179349928
mean of x24,HI-2012,HPA All,0.077846456223171,0.053143155711485,TRUE,0.217736957163601
26 changes: 26 additions & 0 deletions analysis/data/edgecorrelation_clustering_pretty.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
,ppi,expr,mean_edge,mean_global,greater,pval
mean of x,Bossi,Body Map,0.259886787888398,0.266953980419442,FALSE,0.72605564485992
mean of x1,Bossi,Gene Atlas,0.264411221550878,0.268768984570892,FALSE,0.789389939918772
mean of x2,Bossi,RNAseq Atlas,0.24957457435218,0.269039595624559,FALSE,0.200732173255965
mean of x3,Bossi,HPA,0.248064312446627,0.270390694390233,FALSE,0.42851284134202
mean of x4,Bossi,HPA All,0.241658573377702,0.255815612686531,FALSE,0.476578591073371
mean of x5,STRING,Body Map,0.305765064322964,0.312498405809694,FALSE,0.694113317581276
mean of x6,STRING,Gene Atlas,0.297255758284334,0.292617509947238,TRUE,0.778896897880408
mean of x7,STRING,RNAseq Atlas,0.281449437174647,0.282502269569477,FALSE,0.948299805452851
mean of x8,STRING,HPA,0.326158766741519,0.331837450284542,FALSE,0.8395247437801
mean of x9,STRING,HPA All,0.284809039812838,0.284681405997942,TRUE,0.994441351303567
mean of x10,IMEx,Body Map,0.182327881057344,0.180322381072929,TRUE,0.913238786970288
mean of x11,IMEx,Gene Atlas,0.183378698791555,0.189383725812873,FALSE,0.690998368149286
mean of x12,IMEx,RNAseq Atlas,0.179187965181961,0.186585548762292,FALSE,0.640820975241596
mean of x13,IMEx,HPA,0.189303725723412,0.188968251841269,TRUE,0.989744647932498
mean of x14,IMEx,HPA All,0.148786310168953,0.157127546926947,FALSE,0.634700420674442
mean of x15,Havugimana,Body Map,0.243448903690265,0.222445895419041,TRUE,0.618502834405426
mean of x16,Havugimana,Gene Atlas,0.266751987859751,0.262638724841425,TRUE,0.906394179627094
mean of x17,Havugimana,RNAseq Atlas,0.256226282562288,0.265188479996711,FALSE,0.798573050849972
mean of x18,Havugimana,HPA,0.35650666421274,0.317873631940377,TRUE,0.629146853313002
mean of x19,Havugimana,HPA All,0.262854231313305,0.282477690121773,FALSE,0.652391384370894
mean of x20,HI-2012,Body Map,0.0943165514458018,0.102804490717259,FALSE,0.794937492766682
mean of x21,HI-2012,Gene Atlas,0.066932015579464,0.0716357922517101,FALSE,0.819432225241818
mean of x22,HI-2012,RNAseq Atlas,0.0675531071182213,0.0661317164621626,TRUE,0.940115549082296
mean of x23,HI-2012,HPA,0.0572170344093818,0.128308628385925,FALSE,0.508111341250778
mean of x24,HI-2012,HPA All,0.0673648952389213,0.053143155711485,TRUE,0.458328678039557
44 changes: 44 additions & 0 deletions analysis/plot_benchmarks.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,44 @@ plot_bw_benchmark <- function()
return (fig)
}

# Load the tsPLP benchmark results for plotting.
#
# Reads the semicolon-separated file ./data/benchmark_tsPLP.csv, orders the
# rows by descending `naive` value, replaces the `ppi` and `expr` columns by
# their short names and adds a combined label column `pe` of the form
# "<ppi> - <expr>".
#
# Returns:
#   The benchmark data.frame with the additional `pe` column.
get_plp_data <- function()
{
    bench <- read.csv("./data/benchmark_tsPLP.csv", header=TRUE, sep=";")

    # largest naive value first
    by_naive_desc <- order(-bench[["naive"]])
    bench <- bench[by_naive_desc, ]

    # short display names for the network and the expression data set
    bench$ppi <- to_short_ppi_name(bench$ppi)
    bench$expr <- to_short_expr_name(bench$expr)

    # combined label used to identify one PPI / expression pair
    bench$pe <- paste(bench$ppi, "-", bench$expr)

    bench
}

# Plot the PLP run times of both methods as a horizontal, dodged bar chart.
#
# Uses the first `top_n` rows returned by get_plp_data(), which orders the
# benchmark rows by descending `naive` value. The `naive` column is shown as
# "Create Subgraphs" and the `ts` column as "Use Tissue Vectors".
#
# Args:
#   top_n: number of PPI / expression pairs to show (default 6). If fewer
#          rows are available, all of them are shown.
#
# Returns:
#   The ggplot object; nothing is drawn until it is printed.
plot_plp_benchmark <- function(top_n=6)
{
    data <- get_plp_data()

    # clamp the row count: indexing with 1:6 on a shorter table would pad
    # the plot data with all-NA rows
    n_rows <- min(top_n, nrow(data))
    plot_data <- data[seq_len(n_rows), c("pe", "naive", "ts")]
    colnames(plot_data) <- c("PPI.Expr", "Create Subgraphs", "Use Tissue Vectors")

    # for plotting: long format with one row per (pair, method)
    plot_data <- melt(plot_data, id=c("PPI.Expr"))
    colnames(plot_data) <- c("PPI.Expr", "Method", "Runtime")

    fig <- ggplot(plot_data, aes(PPI.Expr, Runtime, fill=Method)) +
        geom_bar(stat="identity", position="dodge") +
        # the label position is capped at 60% of the largest run time
        geom_text(aes(PPI.Expr, pmin(max(Runtime)*.60,Runtime), label=paste(round(Runtime,1),"s"), hjust=0), size=3.5, position = position_dodge(width=1)) +
        coord_flip() +
        xlab("") +
        ylab("Run time [s]") +
        labs(title="Run time of PLP for adapted algorithm")
    return (fig)
}

save_plots <- function()
{
# clustering coeff. benchmark
Expand All @@ -134,4 +172,10 @@ save_plots <- function()
pdf("../figs/benchmark_bw.pdf", width=8, height=3.5)
print(fig)
dev.off()

# PLP benchmarks
fig <- plot_plp_benchmark()
pdf("../figs/benchmark_plp.pdf", width=8, height=3.2)
print(fig)
dev.off()
}
16 changes: 9 additions & 7 deletions src/networkit_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def __del__(self):
def run_and_score_clustering(graph, clusterer, scorer, writer=None, category=None):
# run clustering
start = time.time()
clusters = clusterer.run(g)
clusters = clusterer.run(graph)
t_cluster = time.time() - start
# print histogram
#hist = cluster_hist(clusters)
Expand Down Expand Up @@ -248,8 +248,10 @@ def run_ts_clustering(tsppi, clusterer, scorer, writer=None):

def run_edgescore_clustering(tsppi, clusterer, scorer, writer=None):
print("Scoring on EdgeScore graph (TS/Global hybrid via edge weighting)")
g = tsppi.getEdgeScoreGraph()
run_and_score_clustering(g, clusterer, scorer, writer, "EdgeScoring")
#g = tsppi.getEdgeCorrelationGraph()
#run_and_score_clustering(g, clusterer, scorer, writer, "EdgeCorrelation")
g = tsppi.getEdgeCoexprCountGraph()
run_and_score_clustering(g, clusterer, scorer, writer, "EdgeCoexprCount")


def run_global_clustering(tsppi, clusterer, scorer, writer=None):
Expand Down Expand Up @@ -286,7 +288,8 @@ def get_scorer(con):
#gamma = 1
# ("PLM-gamma-1.0", ppi_networkit.PLM(gamma=1)),
#clusterers = [("PLM-gamma-5.0", ppi_networkit.PLM(gamma=5)), ("PLM-gamma-10.0", ppi_networkit.PLM(gamma=10)), ("PLM-gamma-50.0", ppi_networkit.PLM(gamma=50)), ("PLP", ppi_networkit.PLP()), ("CNM", ppi_networkit.CNM())]
clusterers = [("PLM-gamma-100.0", ppi_networkit.PLM(gamma=100))]
#clusterers = [("PLM-gamma-100.0", ppi_networkit.PLM(gamma=100))]
clusterers = [("PLM-gamma-50.0", ppi_networkit.PLM(gamma=50))]
for clusterer_name, clusterer in clusterers:
writer.set_clusterer(clusterer_name)

Expand All @@ -303,11 +306,10 @@ def get_scorer(con):

# get graph
tsppi = sqlio.load_tsppi_graph(ppi, expr)
g = tsppi.getGraph()

# run the clustering algos
run_global_clustering(tsppi, clusterer, scorer, writer)
#run_ts_clustering(tsppi, clusterer, scorer, writer)
#run_global_clustering(tsppi, clusterer, scorer, writer)
run_ts_clustering(tsppi, clusterer, scorer, writer)
#run_edgescore_clustering(tsppi, clusterer, scorer, writer)

# commit all current changes to the SQL server
Expand Down

0 comments on commit 57b051e

Please sign in to comment.