Added Pair with varargs, removed deprecated, added to REQUIRE, rand_c…

…pd! -> rand_cpd
sisl · Jun 22, 2016 · 789f33a · 789f33a
1 parent 34cfb07
commit 789f33a
Show file tree

Hide file tree

Showing 8 changed files with 204 additions and 107 deletions.
diff --git a/REQUIRE b/REQUIRE
@@ -1,4 +1,6 @@
 julia 0.4
+Distributions
+Discretizers
 Iterators
 LightGraphs
 DataFrames

diff --git a/doc/BayesNets.ipynb b/doc/BayesNets.ipynb
diff --git a/src/BayesNets.jl b/src/BayesNets.jl
@@ -11,6 +11,7 @@ include(joinpath("CPDs", "cpds.jl"))
 import LightGraphs: DiGraph, rem_edge!, add_edge!, add_vertex!, has_edge, topological_sort_by_dfs, in_neighbors, out_neighbors, is_cyclic, nv, ne
 import TikzGraphs: plot, simple_graph
 import Iterators: subsets
+import Base.Collections: PriorityQueue, peek
 
 export
  BayesNet,
@@ -24,7 +25,7 @@ export
  add_edge!,
  has_edge,
 
- rand_cpd!,
+ rand_cpd,
  rand_table_weighted,
 
  table,
@@ -33,26 +34,26 @@ export
  estimate_convergence,
  readxdsl,
 
- statistics,
- index_data,
- adding_edge_preserves_acyclicity,
- bayesian_score_component,
- bayesian_score_components,
- bayesian_score,
-
- ScoreComponentCache,
-
  DirichletPrior,
  UniformPrior,
  BDeuPrior,
 
  ScoringFunction,
+ ScoreComponentCache,
  NegativeBayesianInformationCriterion,
  score_component,
+ score_components,
 
  GraphSearchStrategy,
  K2GraphSearch,
- GreedyHillClimbing
+ GreedyHillClimbing,
+
+ statistics,
+ index_data,
+ adding_edge_preserves_acyclicity,
+ bayesian_score_component,
+ bayesian_score_components,
+ bayesian_score
 
 
 include("bayes_nets.jl")

diff --git a/src/CPDs/cpds.jl b/src/CPDs/cpds.jl
@@ -71,7 +71,7 @@ Use the parental values in `a` to return the conditional distribution
 """
 @required_func Base.call(cpd::CPD, a::Assignment)
 Base.call(cpd::CPD) = call(cpd, Assignment()) # cpd()
-Base.call{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = call(cpd, Assignment(pair)) # cpd(:A=>1)
+Base.call(cpd::CPD, pair::Pair...) = call(cpd, Assignment(pair)) # cpd(:A=>1)
 
 """
  fit(::Type{CPD}, data::DataFrame, target::NodeName, parents::Vector{NodeName})
@@ -103,21 +103,21 @@ disttype{D}(cpd::CPD{D}) = D
 Condition and then draw from the distribution
 """
 Base.rand(cpd::CPD, a::Assignment) = rand(cpd(a))
-Base.rand{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = rand(cpd, Assignment(pair))
+Base.rand(cpd::CPD, pair::Pair...) = rand(cpd, Assignment(pair))
 
 """
  pdf(cpd::CPD)
 Condition and then return the pdf
 """
 Distributions.pdf(cpd::CPD, a::Assignment) = pdf(cpd(a), a[name(cpd)])
-Distributions.pdf{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = pdf(cpd, Assignment(pair))
+Distributions.pdf(cpd::CPD, pair::Pair...) = pdf(cpd, Assignment(pair))
 
 """
  logpdf(cpd::CPD)
 Condition and then return the logpdf
 """
 Distributions.logpdf(cpd::CPD, a::Assignment) = logpdf(cpd(a), a[name(cpd)])
-Distributions.logpdf{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = logpdf(cpd, Assignment(pair))
+Distributions.logpdf(cpd::CPD, pair::Pair...) = logpdf(cpd, Assignment(pair))
 
 """
  logpdf(cpd::CPD, data::DataFrame)

diff --git a/src/DiscreteBayesNet/discrete_bayes_net.jl b/src/DiscreteBayesNet/discrete_bayes_net.jl
@@ -24,15 +24,21 @@ function _get_parental_ncategories(bn::DiscreteBayesNet, parents::Vector{NodeNam
  parental_ncategories
 end
 
-function rand_cpd!(bn::DiscreteBayesNet, ncategories::Int, target::NodeName, parents::Vector{NodeName}=NodeName[])
+"""
+ rand_cpd(bn::DiscreteBayesNet, ncategories::Int, target::NodeName, parents::Vector{NodeName}=NodeName[])
+Return a CategoricalCPD with the given number of categories with random categorical distributions
+"""
+function rand_cpd(bn::DiscreteBayesNet, ncategories::Int, target::NodeName, parents::Vector{NodeName}=NodeName[];
+ uniform_dirichlet_prior::Float64 = 1.0
+ )
 
  !haskey(bn.name_to_index, target) || error("A CPD with name $target already exists!")
 
  parental_ncategories = _get_parental_ncategories(bn, parents)
 
  Q = prod(parental_ncategories)
  distributions = Array(Categorical, Q)
- dir = Dirichlet(ncategories, 1.0) # draw random categoricals from a uniform Dirichlet
+ dir = Dirichlet(ncategories, uniform_dirichlet_prior) # draw random categoricals from a Dirichlet distribution
  for q in 1:Q
  distributions[q] = Categorical(rand(dir))
  end
@@ -73,7 +79,7 @@ function table(bn::DiscreteBayesNet, name::NodeName)
 end
 
 table(bn::DiscreteBayesNet, name::NodeName, a::Assignment) = select(table(bn, name), a)
-table{N<:Any}(bn::DiscreteBayesNet, name::NodeName, pair::Pair{NodeName,N}) = table(bn, name, Assignment(pair))
+table(bn::DiscreteBayesNet, name::NodeName, pair::Pair...) = table(bn, name, Assignment(pair))
 
 """
  Base.count(bn::BayesNet, name::NodeName, data::DataFrame)
@@ -212,7 +218,6 @@ function statistics(
  end
  N
 end
-
 function statistics(dag::DAG, data::DataFrame)
 
  n = nv(dag)
@@ -316,20 +321,6 @@ function bayesian_score(bn::DiscreteBayesNet, data::DataFrame, prior::DirichletP
  bayesian_score(parent_list, bincounts, datamat, prior)
 end
 
-import Base.Collections: PriorityQueue, peek
-typealias ScoreComponentCache Vector{PriorityQueue{Vector{Int}, Float64}}
-
-"""
-Construct an empty ScoreComponentCache the size of ncol(data)
-"""
-function ScoreComponentCache(data::DataFrame)
- cache = Array(PriorityQueue{Vector{Int}, Float64}, ncol(data))
- for i in 1 : ncol(data)
- cache[i] = PriorityQueue{Vector{Int}, Float64, Base.Order.ForwardOrdering}()
- end
- cache
-end
-
 function bayesian_score_component(
  i::Int,
  parents::AbstractVector{Int},

diff --git a/src/bayes_nets.jl b/src/bayes_nets.jl
@@ -164,6 +164,7 @@ function CPDs.pdf(bn::BayesNet, assignment::Assignment)
  end
  retval
 end
+CPDs.pdf(bn::BayesNet, pair::Pair...) = pdf(bn, Assignment(pair))
 
 """
 The logpdf of a given assignment after conditioning on the values
@@ -175,6 +176,7 @@ function CPDs.logpdf(bn::BayesNet, assignment::Assignment)
  end
  retval
 end
+CPDs.logpdf(bn::BayesNet, pair::Pair...) = logpdf(bn, Assignment(pair))
 
 """
 The logpdf of a set of assignment after conditioning on the values

diff --git a/src/learning.jl b/src/learning.jl
@@ -66,21 +66,105 @@ end
 
 ############################
 
+"""
+ ScoreComponentCache
+Used to store scores in a priority queue such that graph search algorithms know
+when a particular construction has already been made.
+ cache[ⱼ](parentsⱼ, score) for the ith variable with parents parents
+"""
+typealias ScoreComponentCache Vector{PriorityQueue{Vector{Int}, Float64}} # parent indeces -> score
+
+"""
+ ScoreComponentCache(data::DataFrame)
+Construct an empty ScoreComponentCache the size of ncol(data)
+"""
+function ScoreComponentCache(data::DataFrame)
+ cache = Array(PriorityQueue{Vector{Int}, Float64}, ncol(data))
+ for i in 1 : ncol(data)
+ cache[i] = PriorityQueue{Vector{Int}, Float64, Base.Order.ForwardOrdering}()
+ end
+ cache
+end
+
+############################
+
+"""
+ ScoringFunction
+An abstract type for which subtypes allow extracting CPD score components,
+which are to be maximized:
+score_component(::ScoringFunction, cpd::CPD, data::DataFrame)
+"""
 abstract ScoringFunction
 
-type NegativeBayesianInformationCriterion <: ScoringFunction
+"""
+ score_component(a::ScoringFunction, cpd::CPD, data::DataFrame)
+Extract a Float64 score for a cpd given the data.
+One seeks to maximize the score.
+"""
+score_component(a::ScoringFunction, cpd::CPD, data::DataFrame) = error("score_component not defined for ScoringFunction $a")
+
+"""
+ score_component(a::ScoringFunction, cpd::CPD, data::DataFrame, cache::ScoreComponentCache)
+As score_component(ScoringFunction, cpd, data), but returns pre-computed values from the cache
+if they exist, and populates the cache if they don't
+"""
+function _get_parent_indeces(parents::Vector{NodeName}, data::DataFrame)
+ varnames = names(data)
+ retval = Array(Int, length(parents))
+ for (i,p) in enumerate(parents)
+ retval[i] = findfirst(varnames, p)
+ end
+ retval
+end
+function score_component(
+ a::ScoringFunction,
+ cpd::CPD,
+ data::DataFrame,
+ cache::ScoreComponentCache,
+ )
+
+ pinds = _get_parent_indeces(parents(cpd))
+
+ if !haskey(cache[i], pinds)
+ cache[i][pinds] = score_component(a, cpd, data)
+ end
+
+ cache[i][pinds]
 end
 
 """
- score_component(::ScoringFunction, cpd::CPD, data::DataFrame)
-Return the negative Bayesian information criterion score component
+ score_components(a::ScoringFunction, cpd::CPD, data::DataFrame)
+ score_components(a::ScoringFunction, cpds::Vector{CPD}, data::DataFrame, cache::ScoreComponentCache)
+Get a list of score components for all cpds
+"""
+function score_components{C<:CPD}(a::ScoringFunction, cpds::Vector{C}, data::DataFrame)
+ retval = Array(Float64, length(cpds))
+ for (i,cpd) in enumerate(cpds)
+ retval[i] = score_component(a, cpd, data)
+ end
+ retval
+end
+function score_components{C<:CPD}(a::ScoringFunction, cpds::Vector{C}, data::DataFrame, cache::ScoreComponentCache)
+ retval = Array(Float64, length(cpds))
+ for (i,cpd) in enumerate(cpds)
+ retval[i] = score_component(a, cpd, data, cache)
+ end
+ retval
+end
+
+
+"""
+ NegativeBayesianInformationCriterion
+A ScoringFunction for the negative Bayesian information criterion.
 
  BIC = -2⋅L + k⋅ln(n)
 
  L - the log likelihood of the data under the cpd
  k - the number of free parameters to be estimated
  n - the sample size
 """
+type NegativeBayesianInformationCriterion <: ScoringFunction
+end
 function score_component(::NegativeBayesianInformationCriterion, cpd::CPD, data::DataFrame)
  L = logpdf(cpd, data)
  k = nparams(cpd)
@@ -93,12 +177,29 @@ end
 
 ##########################
 
+"""
+ GraphSearchStrategy
+An abstract type which defines a graph search strategy for learning Bayesian network structures
+These allow: fit(::Type{BayesNet}, data, GraphSearchStrategy)
+"""
 abstract GraphSearchStrategy
 
+"""
+ Distributions.fit{C<:CPD}(::Type{BayesNet{C}}, ::DataFrame, ::GraphSearchStrategy)
+Run the graph search algorithm defined by GraphSearchStrategy
+"""
+Distributions.fit{C<:CPD}(::Type{BayesNet{C}}, data::DataFrame, params::GraphSearchStrategy) = error("fit not defined for GraphSearchStrategy $params")
+
+"""
+ K2GraphSearch
+A GraphSearchStrategy following the K2 algorithm.
+Takes polynomial time to find the optimal structure assuming
+a topological variable ordering.
+"""
 type K2GraphSearch <: GraphSearchStrategy
  order::Vector{NodeName} # topological ordering of variables
  cpd_types::Vector{DataType} # cpd types, in same order as `order`
- max_n_parents::Int
+ max_n_parents::Int # maximum number of parents per CPD
  metric::ScoringFunction # metric we are trying to maximize
 
  function K2GraphSearch(
@@ -122,10 +223,6 @@ type K2GraphSearch <: GraphSearchStrategy
  end
 end
 
-"""
- Distributions.fit(::Type{BayesNet}, data::DataFrame, params::K2GraphSearch)
-Runs the K2 structure search algorithm on the data with the given cpd types and fixed ordering
-"""
 function Distributions.fit{C<:CPD}(::Type{BayesNet{C}}, data::DataFrame, params::K2GraphSearch)
 
  N = length(params.order)

diff --git a/src/sampling.jl b/src/sampling.jl
@@ -41,7 +41,7 @@ nsamples: the number of rows the resulting DataFrame will contain
 consistent_with: the assignment that all samples must be consistent with (ie, Assignment(:A=>1) means all samples must have :A=1)
 max_nsamples: an upper limit on the number of samples that will be tried, needed to ensure zero-prob samples are never used
 """
-function Base.rand(bn::BayesNet, nsamples::Integer, consistent_with::Assignment, max_nsamples::Integer=nsamples*100)
+function Base.rand(bn::BayesNet, nsamples::Integer, consistent_with::Assignment; max_nsamples::Integer=nsamples*100)
 
  a = rand(bn)
  df = DataFrame()
@@ -72,7 +72,7 @@ function Base.rand(bn::BayesNet, nsamples::Integer, consistent_with::Assignment,
 
  df
 end
-function Base.rand{N<:Any}(bn::BayesNet, nsamples::Integer, pair::Pair{NodeName,N}, max_nsamples::Integer=nsamples*100)
+function Base.rand(bn::BayesNet, nsamples::Integer, pair::Pair...; max_nsamples::Integer=nsamples*100)
  a = Assignment(pair)
  rand(bn, nsamples, a, max_nsamples=max_nsamples)
 end