diff --git a/REQUIRE b/REQUIRE index 38b3b63..339f873 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,4 +1,6 @@ julia 0.4 +Distributions +Discretizers Iterators LightGraphs DataFrames diff --git a/doc/BayesNets.ipynb b/doc/BayesNets.ipynb index 80f323c..0b8cd04 100644 --- a/doc/BayesNets.ipynb +++ b/doc/BayesNets.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "collapsed": false }, @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": false }, @@ -83,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "collapsed": false }, @@ -123,7 +123,7 @@ "BayesNets.BayesNet{BayesNets.CPDs.CPD{D<:Distributions.Distribution{F<:Distributions.VariateForm,S<:Distributions.ValueSupport}}}({2, 1} directed graph,BayesNets.CPDs.CPD[BayesNets.CPDs.StaticCPD{Distributions.Normal{Float64}}(:a,Symbol[],Distributions.Normal{Float64}(μ=1.0, σ=1.0)),BayesNets.CPDs.LinearGaussianCPD(:b,[:a],[2.0],3.0,1.0)],Dict(:a=>1,:b=>2))" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -172,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "collapsed": false }, @@ -212,7 +212,7 @@ "BayesNets.BayesNet{BayesNets.CPDs.CPD{Distributions.Normal{Float64}}}({2, 1} directed graph,BayesNets.CPDs.CPD{Distributions.Normal{Float64}}[BayesNets.CPDs.StaticCPD{Distributions.Normal{Float64}}(:a,Symbol[],Distributions.Normal{Float64}(μ=-0.02482048154996803, σ=1.050997926141669)),BayesNets.CPDs.LinearGaussianCPD(:b,[:a],[1.8672649732525985],2.849907100812514,2.236109637835203)],Dict(:a=>1,:b=>2))" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -243,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "collapsed": false }, @@ -254,7 +254,7 @@ "Distributions.Normal{Float64}(μ=3.7835395874388134, σ=2.236109637835203)" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -272,15 +272,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "rand(cpdB, :a=>0.5) # condition and then sample\n", - "pdf(cpdB, Assignment(:a=>1.0, :b=>3.0)) # condition and then compute pdf(distribution, 3)\n", - "logpdf(cpdB, Assignment(:a=>1.0, :b=>3.0)) # condition and then compute logpdf(distribution, 3);" + "pdf(cpdB, :a=>1.0, :b=>3.0) # condition and then compute pdf(distribution, 3)\n", + "logpdf(cpdB, :a=>1.0, :b=>3.0) # condition and then compute logpdf(distribution, 3);" ] }, { @@ -293,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -371,7 +371,7 @@ "),BayesNets.CPDs.FunctionalCPD{Distributions.Bernoulli}(:happy,[:sighted],(anonymous function))],Dict(:sighted=>1,:happy=>2))" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -395,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -406,13 +406,13 @@ "0.01900834726778591" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pdf(bn, Assignment(:a=>0.5, :b=>2.0)) # evaluate the probability density" + "pdf(bn, :a=>0.5, :b=>2.0) # evaluate the probability density" ] }, { @@ -424,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -444,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": { "collapsed": false }, @@ -455,7 +455,7 @@ "-5.200981176187056" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -476,7 +476,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "collapsed": false }, @@ -489,7 +489,7 @@ " :b => 6.0369902749018225" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -500,7 +500,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": { "collapsed": false }, @@ -521,7 +521,7 @@ "│ 5 │ -0.425156 │ 2.47103 │" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -539,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": { "collapsed": false }, @@ -587,7 +587,7 @@ "BayesNets.BayesNet{BayesNets.CPDs.CPD{D<:Distributions.Distribution{F<:Distributions.VariateForm,S<:Distributions.ValueSupport}}}({3, 2} directed graph,BayesNets.CPDs.CPD[BayesNets.CPDs.StaticCPD{Distributions.Categorical}(:a,Symbol[],Distributions.Categorical(K=2, p=[0.3,0.7])),BayesNets.CPDs.StaticCPD{Distributions.Categorical}(:b,Symbol[],Distributions.Categorical(K=2, p=[0.6,0.4])),BayesNets.CPDs.CategoricalCPD{Distributions.Bernoulli}(:c,[:a,:b],[2,2],[Distributions.Bernoulli(p=0.1),Distributions.Bernoulli(p=0.2),Distributions.Bernoulli(p=1.0),Distributions.Bernoulli(p=0.4)])],Dict(:c=>3,:a=>1,:b=>2))" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -601,7 +601,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": { "collapsed": false }, @@ -622,7 +622,7 @@ "│ 5 │ 2 │ 2 │ 1 │" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -640,7 +640,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": { "collapsed": false }, @@ -661,7 +661,7 @@ "│ 5 │ 2 │ 1 │ 1 │ 0.181818 │" ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -681,7 +681,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": { "collapsed": false }, @@ -721,7 +721,7 @@ "BayesNets.BayesNet{BayesNets.CPDs.CPD{D<:Distributions.Distribution{F<:Distributions.VariateForm,S<:Distributions.ValueSupport}}}({2, 1} directed graph,BayesNets.CPDs.CPD[BayesNets.CPDs.StaticCPD{Distributions.Normal{Float64}}(:a,Symbol[],Distributions.Normal{Float64}(μ=1.1666666666666667, σ=0.6236095644623235)),BayesNets.CPDs.LinearGaussianCPD(:b,[:a],[2.000000000000001],2.9999999999999987,1.5275252316519465)],Dict(:a=>1,:b=>2))" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -733,7 +733,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": { "collapsed": false }, @@ -773,7 +773,7 @@ "BayesNets.BayesNet{BayesNets.CPDs.LinearGaussianCPD}({2, 1} directed graph,[BayesNets.CPDs.LinearGaussianCPD(:a,Symbol[],Float64[],1.1666666666666667,0.7637626158259733),BayesNets.CPDs.LinearGaussianCPD(:b,[:a],[2.000000000000001],2.9999999999999987,1.5275252316519465)],Dict(:a=>1,:b=>2))" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -792,7 +792,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": { "collapsed": false }, @@ -808,41 +808,41 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", "\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", - "\n", + "\n", + "\n", "\n", - " \n", + " \n", "\n", "\n", " \n", "\n", "\n", - " \n", + " \n", "\n", "\n", "\n", "\n" ], "text/plain": [ - "BayesNets.BayesNet{BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}}({3, 3} directed graph,[BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}(:c,Symbol[],Int64[],[Distributions.Categorical(K=3, p=[0.3333333333333333,0.3333333333333333,0.3333333333333333])]),BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}(:b,[:c],[3],[Distributions.Categorical(K=2, p=[0.75,0.25]),Distributions.Categorical(K=2, p=[0.25,0.75]),Distributions.Categorical(K=2, p=[0.75,0.25])]),BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}(:a,[:c,:b],[3,2],[Distributions.Categorical(K=2, p=[1.0,0.0]),Distributions.Categorical(K=2, p=[1.0,0.0]),Distributions.Categorical(K=2, p=[1.0,0.0]),Distributions.Categorical(K=2, p=[0.0,1.0]),Distributions.Categorical(K=2, p=[0.6666666666666666,0.3333333333333333]),Distributions.Categorical(K=2, p=[0.0,1.0])])],Dict(:c=>1,:b=>2,:a=>3))" + "BayesNets.BayesNet{BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}}({3, 3} directed graph,[BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}(:a,Symbol[],Int64[],[Distributions.Categorical(K=2, p=[0.75,0.25])]),BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}(:b,[:a],[2],[Distributions.Categorical(K=2, p=[0.7777777777777778,0.2222222222222222]),Distributions.Categorical(K=2, p=[0.0,1.0])]),BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}(:c,[:b,:a],[2,2],[Distributions.Categorical(K=3, p=[0.42857142857142855,0.14285714285714285,0.42857142857142855]),Distributions.Categorical(K=3, p=[0.0,1.0,0.0]),Distributions.Categorical(K=3, p=[NaN,NaN,NaN]),Distributions.Categorical(K=3, p=[0.3333333333333333,0.3333333333333333,0.3333333333333333])])],Dict(:c=>3,:a=>1,:b=>2))" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -852,7 +852,7 @@ " b=[1,1,1,2,2,2,2,1,1,2,1,1],\n", " a=[1,1,1,2,1,1,2,1,1,2,1,1])\n", "\n", - "fit(DiscreteBayesNet, data, (:c=>:b, :c=>:a, :b=>:a))" + "fit(DiscreteBayesNet, data, (:a=>:b, :a=>:c, :b=>:c))" ] }, { @@ -866,7 +866,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": { "collapsed": false }, @@ -885,7 +885,7 @@ "│ 3 │ 4.7 │ 3.2 │ 1.3 │ 0.2 │ 1 │" ] }, - "execution_count": 19, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -907,7 +907,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": { "collapsed": false }, @@ -1046,7 +1046,7 @@ "BayesNets.BayesNet{BayesNets.CPDs.CPD{D<:Distributions.Distribution{F<:Distributions.VariateForm,S<:Distributions.ValueSupport}}}({5, 3} directed graph,BayesNets.CPDs.CPD[BayesNets.CPDs.ConditionalLinearGaussianCPD(:SepalWidth,Symbol[],Symbol[],Int64[],[BayesNets.CPDs.LinearGaussianCPD(:SepalWidth,Symbol[],Float64[],3.0573333333333332,0.4358662849366982)]),BayesNets.CPDs.ConditionalLinearGaussianCPD(:SepalLength,Symbol[],Symbol[],Int64[],[BayesNets.CPDs.LinearGaussianCPD(:SepalLength,Symbol[],Float64[],5.843333333333334,0.828066127977863)]),BayesNets.CPDs.ConditionalLinearGaussianCPD(:PetalLength,[:SepalLength,:SepalWidth],Symbol[],Int64[],[BayesNets.CPDs.LinearGaussianCPD(:PetalLength,[:SepalLength,:SepalWidth],[1.775592546481187,-1.3386232887390561],-2.524761511833528,1.7652982332594662)]),BayesNets.CPDs.ConditionalLinearGaussianCPD(:PetalWidth,[:PetalLength],Symbol[],Int64[],[BayesNets.CPDs.LinearGaussianCPD(:PetalWidth,[:PetalLength],[0.41575541635241176],-0.3630755213190302,0.7622376689603465)]),BayesNets.CPDs.StaticCPD{Distributions.Categorical}(:Species,Symbol[],Distributions.Categorical(K=3, p=[0.3333333333333333,0.3333333333333333,0.3333333333333333]))],Dict(:PetalWidth=>4,:SepalWidth=>1,:PetalLength=>3,:SepalLength=>2,:Species=>5))" ] }, - "execution_count": 20, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1064,12 +1064,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Alternatively we can fit a DiscreteBayesianNetwork using the BayesianScore." + "A `ScoringFunction` allows for extracting a scoring metric for a CPD given data.\n", + "The negative BIC score is implemented in NegativeBayesianInformationCriterion.\n", + "\n", + "A `GraphSearchStrategy` defines a structure learning algorithm.\n", + "The K2 algorithm is defined through `K2GraphSearch` and `GreedyHillClimbing` is implemented for discrete Bayesian networks and the Bayesian score:" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": { "collapsed": false }, @@ -1119,7 +1123,7 @@ "BayesNets.BayesNet{BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}}({3, 3} directed graph,[BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}(:a,Symbol[],Int64[],[Distributions.Categorical(K=2, p=[0.75,0.25])]),BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}(:b,[:a],[2],[Distributions.Categorical(K=2, p=[0.7777777777777778,0.2222222222222222]),Distributions.Categorical(K=2, p=[0.0,1.0])]),BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}(:c,[:b,:a],[2,2],[Distributions.Categorical(K=3, p=[0.42857142857142855,0.14285714285714285,0.42857142857142855]),Distributions.Categorical(K=3, p=[0.0,1.0,0.0]),Distributions.Categorical(K=3, p=[NaN,NaN,NaN]),Distributions.Categorical(K=3, p=[0.3333333333333333,0.3333333333333333,0.3333333333333333])])],Dict(:c=>3,:a=>1,:b=>2))" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1141,7 +1145,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": { "collapsed": false }, @@ -1152,7 +1156,7 @@ "-31.28804624550449" ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1163,7 +1167,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": { "collapsed": false }, @@ -1181,7 +1185,7 @@ "│ 2 │ 2 │ 3 │" ] }, - "execution_count": 23, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1192,7 +1196,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": { "collapsed": false }, @@ -1213,7 +1217,7 @@ " 0 0 0 1 1 1" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1224,7 +1228,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": { "collapsed": false }, @@ -1244,7 +1248,7 @@ "│ 4 │ 2 │ 2 │ 1.0 │" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1255,7 +1259,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": { "collapsed": false }, @@ -1277,7 +1281,7 @@ "│ 6 │ 2 │ 1 │ 3 │ 0.0 │" ] }, - "execution_count": 26, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1299,7 +1303,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": { "collapsed": false }, @@ -1376,7 +1380,7 @@ "BayesNets.BayesNet{BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}}({2, 1} directed graph,[BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}(:Success,Symbol[],Int64[],[Distributions.Categorical(K=2, p=[0.2,0.8])]),BayesNets.CPDs.CategoricalCPD{Distributions.Categorical}(:Forecast,[:Success],[2],[Distributions.Categorical(K=3, p=[0.4,0.4,0.2]),Distributions.Categorical(K=3, p=[0.1,0.3,0.6])])],Dict(:Success=>1,:Forecast=>2))" ] }, - "execution_count": 27, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } diff --git a/src/BayesNets.jl b/src/BayesNets.jl index b059eb6..6b281dd 100644 --- a/src/BayesNets.jl +++ b/src/BayesNets.jl @@ -11,6 +11,7 @@ include(joinpath("CPDs", "cpds.jl")) import LightGraphs: DiGraph, rem_edge!, add_edge!, add_vertex!, has_edge, topological_sort_by_dfs, in_neighbors, out_neighbors, is_cyclic, nv, ne import TikzGraphs: plot, simple_graph import Iterators: subsets +import Base.Collections: PriorityQueue, peek export BayesNet, @@ -24,7 +25,7 @@ export add_edge!, has_edge, - rand_cpd!, + rand_cpd, rand_table_weighted, table, @@ -33,26 +34,26 @@ export estimate_convergence, readxdsl, - statistics, - index_data, - adding_edge_preserves_acyclicity, - bayesian_score_component, - bayesian_score_components, - bayesian_score, - - ScoreComponentCache, - DirichletPrior, UniformPrior, BDeuPrior, ScoringFunction, + ScoreComponentCache, NegativeBayesianInformationCriterion, score_component, + score_components, GraphSearchStrategy, K2GraphSearch, - GreedyHillClimbing + GreedyHillClimbing, + + statistics, + index_data, + adding_edge_preserves_acyclicity, + bayesian_score_component, + bayesian_score_components, + bayesian_score include("bayes_nets.jl") diff --git a/src/CPDs/cpds.jl b/src/CPDs/cpds.jl index d87793c..ace6891 100644 --- a/src/CPDs/cpds.jl +++ b/src/CPDs/cpds.jl @@ -71,7 +71,7 @@ Use the parental values in `a` to return the conditional distribution """ @required_func Base.call(cpd::CPD, a::Assignment) Base.call(cpd::CPD) = call(cpd, Assignment()) # cpd() -Base.call{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = call(cpd, Assignment(pair)) # cpd(:A=>1) +Base.call(cpd::CPD, pair::Pair...) = call(cpd, Assignment(pair)) # cpd(:A=>1) """ fit(::Type{CPD}, data::DataFrame, target::NodeName, parents::Vector{NodeName}) @@ -103,21 +103,21 @@ disttype{D}(cpd::CPD{D}) = D Condition and then draw from the distribution """ Base.rand(cpd::CPD, a::Assignment) = rand(cpd(a)) -Base.rand{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = rand(cpd, Assignment(pair)) +Base.rand(cpd::CPD, pair::Pair...) = rand(cpd, Assignment(pair)) """ pdf(cpd::CPD) Condition and then return the pdf """ Distributions.pdf(cpd::CPD, a::Assignment) = pdf(cpd(a), a[name(cpd)]) -Distributions.pdf{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = pdf(cpd, Assignment(pair)) +Distributions.pdf(cpd::CPD, pair::Pair...) = pdf(cpd, Assignment(pair)) """ logpdf(cpd::CPD) Condition and then return the logpdf """ Distributions.logpdf(cpd::CPD, a::Assignment) = logpdf(cpd(a), a[name(cpd)]) -Distributions.logpdf{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = logpdf(cpd, Assignment(pair)) +Distributions.logpdf(cpd::CPD, pair::Pair...) = logpdf(cpd, Assignment(pair)) """ logpdf(cpd::CPD, data::DataFrame) diff --git a/src/DiscreteBayesNet/discrete_bayes_net.jl b/src/DiscreteBayesNet/discrete_bayes_net.jl index ff7a701..ffacefa 100644 --- a/src/DiscreteBayesNet/discrete_bayes_net.jl +++ b/src/DiscreteBayesNet/discrete_bayes_net.jl @@ -24,7 +24,13 @@ function _get_parental_ncategories(bn::DiscreteBayesNet, parents::Vector{NodeNam parental_ncategories end -function rand_cpd!(bn::DiscreteBayesNet, ncategories::Int, target::NodeName, parents::Vector{NodeName}=NodeName[]) +""" + rand_cpd(bn::DiscreteBayesNet, ncategories::Int, target::NodeName, parents::Vector{NodeName}=NodeName[]) +Return a CategoricalCPD with the given number of categories with random categorical distributions +""" +function rand_cpd(bn::DiscreteBayesNet, ncategories::Int, target::NodeName, parents::Vector{NodeName}=NodeName[]; + uniform_dirichlet_prior::Float64 = 1.0 + ) !haskey(bn.name_to_index, target) || error("A CPD with name $target already exists!") @@ -32,7 +38,7 @@ function rand_cpd!(bn::DiscreteBayesNet, ncategories::Int, target::NodeName, par Q = prod(parental_ncategories) distributions = Array(Categorical, Q) - dir = Dirichlet(ncategories, 1.0) # draw random categoricals from a uniform Dirichlet + dir = Dirichlet(ncategories, uniform_dirichlet_prior) # draw random categoricals from a Dirichlet distribution for q in 1:Q distributions[q] = Categorical(rand(dir)) end @@ -73,7 +79,7 @@ function table(bn::DiscreteBayesNet, name::NodeName) end table(bn::DiscreteBayesNet, name::NodeName, a::Assignment) = select(table(bn, name), a) -table{N<:Any}(bn::DiscreteBayesNet, name::NodeName, pair::Pair{NodeName,N}) = table(bn, name, Assignment(pair)) +table(bn::DiscreteBayesNet, name::NodeName, pair::Pair...) = table(bn, name, Assignment(pair)) """ Base.count(bn::BayesNet, name::NodeName, data::DataFrame) @@ -212,7 +218,6 @@ function statistics( end N end - function statistics(dag::DAG, data::DataFrame) n = nv(dag) @@ -316,20 +321,6 @@ function bayesian_score(bn::DiscreteBayesNet, data::DataFrame, prior::DirichletP bayesian_score(parent_list, bincounts, datamat, prior) end -import Base.Collections: PriorityQueue, peek -typealias ScoreComponentCache Vector{PriorityQueue{Vector{Int}, Float64}} - -""" -Construct an empty ScoreComponentCache the size of ncol(data) -""" -function ScoreComponentCache(data::DataFrame) - cache = Array(PriorityQueue{Vector{Int}, Float64}, ncol(data)) - for i in 1 : ncol(data) - cache[i] = PriorityQueue{Vector{Int}, Float64, Base.Order.ForwardOrdering}() - end - cache -end - function bayesian_score_component( i::Int, parents::AbstractVector{Int}, diff --git a/src/bayes_nets.jl b/src/bayes_nets.jl index 24f5666..9f1b9d6 100644 --- a/src/bayes_nets.jl +++ b/src/bayes_nets.jl @@ -164,6 +164,7 @@ function CPDs.pdf(bn::BayesNet, assignment::Assignment) end retval end +CPDs.pdf(bn::BayesNet, pair::Pair...) = pdf(bn, Assignment(pair)) """ The logpdf of a given assignment after conditioning on the values @@ -175,6 +176,7 @@ function CPDs.logpdf(bn::BayesNet, assignment::Assignment) end retval end +CPDs.logpdf(bn::BayesNet, pair::Pair...) = logpdf(bn, Assignment(pair)) """ The logpdf of a set of assignment after conditioning on the values diff --git a/src/learning.jl b/src/learning.jl index d53d407..496bed9 100644 --- a/src/learning.jl +++ b/src/learning.jl @@ -66,14 +66,96 @@ end ############################ +""" + ScoreComponentCache +Used to store scores in a priority queue such that graph search algorithms know +when a particular construction has already been made. + cache[ⱼ](parentsⱼ, score) for the ith variable with parents parents +""" +typealias ScoreComponentCache Vector{PriorityQueue{Vector{Int}, Float64}} # parent indeces -> score + +""" + ScoreComponentCache(data::DataFrame) +Construct an empty ScoreComponentCache the size of ncol(data) +""" +function ScoreComponentCache(data::DataFrame) + cache = Array(PriorityQueue{Vector{Int}, Float64}, ncol(data)) + for i in 1 : ncol(data) + cache[i] = PriorityQueue{Vector{Int}, Float64, Base.Order.ForwardOrdering}() + end + cache +end + +############################ + +""" + ScoringFunction +An abstract type for which subtypes allow extracting CPD score components, +which are to be maximized: +score_component(::ScoringFunction, cpd::CPD, data::DataFrame) +""" abstract ScoringFunction -type NegativeBayesianInformationCriterion <: ScoringFunction +""" + score_component(a::ScoringFunction, cpd::CPD, data::DataFrame) +Extract a Float64 score for a cpd given the data. +One seeks to maximize the score. +""" +score_component(a::ScoringFunction, cpd::CPD, data::DataFrame) = error("score_component not defined for ScoringFunction $a") + +""" + score_component(a::ScoringFunction, cpd::CPD, data::DataFrame, cache::ScoreComponentCache) +As score_component(ScoringFunction, cpd, data), but returns pre-computed values from the cache +if they exist, and populates the cache if they don't +""" +function _get_parent_indeces(parents::Vector{NodeName}, data::DataFrame) + varnames = names(data) + retval = Array(Int, length(parents)) + for (i,p) in enumerate(parents) + retval[i] = findfirst(varnames, p) + end + retval +end +function score_component( + a::ScoringFunction, + cpd::CPD, + data::DataFrame, + cache::ScoreComponentCache, + ) + + pinds = _get_parent_indeces(parents(cpd)) + + if !haskey(cache[i], pinds) + cache[i][pinds] = score_component(a, cpd, data) + end + + cache[i][pinds] end """ - score_component(::ScoringFunction, cpd::CPD, data::DataFrame) -Return the negative Bayesian information criterion score component + score_components(a::ScoringFunction, cpd::CPD, data::DataFrame) + score_components(a::ScoringFunction, cpds::Vector{CPD}, data::DataFrame, cache::ScoreComponentCache) +Get a list of score components for all cpds +""" +function score_components{C<:CPD}(a::ScoringFunction, cpds::Vector{C}, data::DataFrame) + retval = Array(Float64, length(cpds)) + for (i,cpd) in enumerate(cpds) + retval[i] = score_component(a, cpd, data) + end + retval +end +function score_components{C<:CPD}(a::ScoringFunction, cpds::Vector{C}, data::DataFrame, cache::ScoreComponentCache) + retval = Array(Float64, length(cpds)) + for (i,cpd) in enumerate(cpds) + retval[i] = score_component(a, cpd, data, cache) + end + retval +end + + +""" + NegativeBayesianInformationCriterion +A ScoringFunction for the negative Bayesian information criterion. BIC = -2⋅L + k⋅ln(n) @@ -81,6 +163,8 @@ Return the negative Bayesian information criterion score component k - the number of free parameters to be estimated n - the sample size """ +type NegativeBayesianInformationCriterion <: ScoringFunction +end function score_component(::NegativeBayesianInformationCriterion, cpd::CPD, data::DataFrame) L = logpdf(cpd, data) k = nparams(cpd) @@ -93,12 +177,29 @@ end ########################## +""" + GraphSearchStrategy +An abstract type which defines a graph search strategy for learning Bayesian network structures +These allow: fit(::Type{BayesNet}, data, GraphSearchStrategy) +""" abstract GraphSearchStrategy +""" + Distributions.fit{C<:CPD}(::Type{BayesNet{C}}, ::DataFrame, ::GraphSearchStrategy) +Run the graph search algorithm defined by GraphSearchStrategy +""" +Distributions.fit{C<:CPD}(::Type{BayesNet{C}}, data::DataFrame, params::GraphSearchStrategy) = error("fit not defined for GraphSearchStrategy $params") + +""" + K2GraphSearch +A GraphSearchStrategy following the K2 algorithm. +Takes polynomial time to find the optimal structure assuming +a topological variable ordering. +""" type K2GraphSearch <: GraphSearchStrategy order::Vector{NodeName} # topological ordering of variables cpd_types::Vector{DataType} # cpd types, in same order as `order` - max_n_parents::Int + max_n_parents::Int # maximum number of parents per CPD metric::ScoringFunction # metric we are trying to maximize function K2GraphSearch( @@ -122,10 +223,6 @@ type K2GraphSearch <: GraphSearchStrategy end end -""" - Distributions.fit(::Type{BayesNet}, data::DataFrame, params::K2GraphSearch) -Runs the K2 structure search algorithm on the data with the given cpd types and fixed ordering -""" function Distributions.fit{C<:CPD}(::Type{BayesNet{C}}, data::DataFrame, params::K2GraphSearch) N = length(params.order) diff --git a/src/sampling.jl b/src/sampling.jl index 5382255..bf12b8e 100644 --- a/src/sampling.jl +++ b/src/sampling.jl @@ -41,7 +41,7 @@ nsamples: the number of rows the resulting DataFrame will contain consistent_with: the assignment that all samples must be consistent with (ie, Assignment(:A=>1) means all samples must have :A=1) max_nsamples: an upper limit on the number of samples that will be tried, needed to ensure zero-prob samples are never used """ -function Base.rand(bn::BayesNet, nsamples::Integer, consistent_with::Assignment, max_nsamples::Integer=nsamples*100) +function Base.rand(bn::BayesNet, nsamples::Integer, consistent_with::Assignment; max_nsamples::Integer=nsamples*100) a = rand(bn) df = DataFrame() @@ -72,7 +72,7 @@ function Base.rand(bn::BayesNet, nsamples::Integer, consistent_with::Assignment, df end -function Base.rand{N<:Any}(bn::BayesNet, nsamples::Integer, pair::Pair{NodeName,N}, max_nsamples::Integer=nsamples*100) +function Base.rand(bn::BayesNet, nsamples::Integer, pair::Pair...; max_nsamples::Integer=nsamples*100) a = Assignment(pair) rand(bn, nsamples, a, max_nsamples=max_nsamples) end