Skip to content

Commit

Permalink
Added Pair with varargs, removed deprecated, added to REQUIRE, rand_c…
Browse files Browse the repository at this point in the history
…pd! -> rand_cpd
  • Loading branch information
tawheeler committed Jun 22, 2016
1 parent 34cfb07 commit 789f33a
Show file tree
Hide file tree
Showing 8 changed files with 204 additions and 107 deletions.
2 changes: 2 additions & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
julia 0.4
Distributions
Discretizers
Iterators
LightGraphs
DataFrames
Expand Down
132 changes: 68 additions & 64 deletions doc/BayesNets.ipynb

Large diffs are not rendered by default.

23 changes: 12 additions & 11 deletions src/BayesNets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ include(joinpath("CPDs", "cpds.jl"))
import LightGraphs: DiGraph, rem_edge!, add_edge!, add_vertex!, has_edge, topological_sort_by_dfs, in_neighbors, out_neighbors, is_cyclic, nv, ne
import TikzGraphs: plot, simple_graph
import Iterators: subsets
import Base.Collections: PriorityQueue, peek

export
BayesNet,
Expand All @@ -24,7 +25,7 @@ export
add_edge!,
has_edge,

rand_cpd!,
rand_cpd,
rand_table_weighted,

table,
Expand All @@ -33,26 +34,26 @@ export
estimate_convergence,
readxdsl,

statistics,
index_data,
adding_edge_preserves_acyclicity,
bayesian_score_component,
bayesian_score_components,
bayesian_score,

ScoreComponentCache,

DirichletPrior,
UniformPrior,
BDeuPrior,

ScoringFunction,
ScoreComponentCache,
NegativeBayesianInformationCriterion,
score_component,
score_components,

GraphSearchStrategy,
K2GraphSearch,
GreedyHillClimbing
GreedyHillClimbing,

statistics,
index_data,
adding_edge_preserves_acyclicity,
bayesian_score_component,
bayesian_score_components,
bayesian_score


include("bayes_nets.jl")
Expand Down
8 changes: 4 additions & 4 deletions src/CPDs/cpds.jl
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ Use the parental values in `a` to return the conditional distribution
"""
@required_func Base.call(cpd::CPD, a::Assignment)
Base.call(cpd::CPD) = call(cpd, Assignment()) # cpd()
Base.call{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = call(cpd, Assignment(pair)) # cpd(:A=>1)
Base.call(cpd::CPD, pair::Pair...) = call(cpd, Assignment(pair)) # cpd(:A=>1)

"""
fit(::Type{CPD}, data::DataFrame, target::NodeName, parents::Vector{NodeName})
Expand Down Expand Up @@ -103,21 +103,21 @@ disttype{D}(cpd::CPD{D}) = D
Condition and then draw from the distribution
"""
Base.rand(cpd::CPD, a::Assignment) = rand(cpd(a))
Base.rand{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = rand(cpd, Assignment(pair))
Base.rand(cpd::CPD, pair::Pair...) = rand(cpd, Assignment(pair))

"""
pdf(cpd::CPD)
Condition and then return the pdf
"""
Distributions.pdf(cpd::CPD, a::Assignment) = pdf(cpd(a), a[name(cpd)])
Distributions.pdf{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = pdf(cpd, Assignment(pair))
Distributions.pdf(cpd::CPD, pair::Pair...) = pdf(cpd, Assignment(pair))

"""
logpdf(cpd::CPD)
Condition and then return the logpdf
"""
Distributions.logpdf(cpd::CPD, a::Assignment) = logpdf(cpd(a), a[name(cpd)])
Distributions.logpdf{N<:Any}(cpd::CPD, pair::Pair{NodeName,N}) = logpdf(cpd, Assignment(pair))
Distributions.logpdf(cpd::CPD, pair::Pair...) = logpdf(cpd, Assignment(pair))

"""
logpdf(cpd::CPD, data::DataFrame)
Expand Down
27 changes: 9 additions & 18 deletions src/DiscreteBayesNet/discrete_bayes_net.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,21 @@ function _get_parental_ncategories(bn::DiscreteBayesNet, parents::Vector{NodeNam
parental_ncategories
end

function rand_cpd!(bn::DiscreteBayesNet, ncategories::Int, target::NodeName, parents::Vector{NodeName}=NodeName[])
"""
rand_cpd(bn::DiscreteBayesNet, ncategories::Int, target::NodeName, parents::Vector{NodeName}=NodeName[])
Return a CategoricalCPD with the given number of categories with random categorical distributions
"""
function rand_cpd(bn::DiscreteBayesNet, ncategories::Int, target::NodeName, parents::Vector{NodeName}=NodeName[];
uniform_dirichlet_prior::Float64 = 1.0
)

!haskey(bn.name_to_index, target) || error("A CPD with name $target already exists!")

parental_ncategories = _get_parental_ncategories(bn, parents)

Q = prod(parental_ncategories)
distributions = Array(Categorical, Q)
dir = Dirichlet(ncategories, 1.0) # draw random categoricals from a uniform Dirichlet
dir = Dirichlet(ncategories, uniform_dirichlet_prior) # draw random categoricals from a Dirichlet distribution
for q in 1:Q
distributions[q] = Categorical(rand(dir))
end
Expand Down Expand Up @@ -73,7 +79,7 @@ function table(bn::DiscreteBayesNet, name::NodeName)
end

table(bn::DiscreteBayesNet, name::NodeName, a::Assignment) = select(table(bn, name), a)
table{N<:Any}(bn::DiscreteBayesNet, name::NodeName, pair::Pair{NodeName,N}) = table(bn, name, Assignment(pair))
table(bn::DiscreteBayesNet, name::NodeName, pair::Pair...) = table(bn, name, Assignment(pair))

"""
Base.count(bn::BayesNet, name::NodeName, data::DataFrame)
Expand Down Expand Up @@ -212,7 +218,6 @@ function statistics(
end
N
end

function statistics(dag::DAG, data::DataFrame)

n = nv(dag)
Expand Down Expand Up @@ -316,20 +321,6 @@ function bayesian_score(bn::DiscreteBayesNet, data::DataFrame, prior::DirichletP
bayesian_score(parent_list, bincounts, datamat, prior)
end

import Base.Collections: PriorityQueue, peek
typealias ScoreComponentCache Vector{PriorityQueue{Vector{Int}, Float64}}

"""
Construct an empty ScoreComponentCache the size of ncol(data)
"""
function ScoreComponentCache(data::DataFrame)
cache = Array(PriorityQueue{Vector{Int}, Float64}, ncol(data))
for i in 1 : ncol(data)
cache[i] = PriorityQueue{Vector{Int}, Float64, Base.Order.ForwardOrdering}()
end
cache
end

function bayesian_score_component(
i::Int,
parents::AbstractVector{Int},
Expand Down
2 changes: 2 additions & 0 deletions src/bayes_nets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ function CPDs.pdf(bn::BayesNet, assignment::Assignment)
end
retval
end
CPDs.pdf(bn::BayesNet, pair::Pair...) = pdf(bn, Assignment(pair))

"""
The logpdf of a given assignment after conditioning on the values
Expand All @@ -175,6 +176,7 @@ function CPDs.logpdf(bn::BayesNet, assignment::Assignment)
end
retval
end
CPDs.logpdf(bn::BayesNet, pair::Pair...) = logpdf(bn, Assignment(pair))

"""
The logpdf of a set of assignment after conditioning on the values
Expand Down
113 changes: 105 additions & 8 deletions src/learning.jl
Original file line number Diff line number Diff line change
Expand Up @@ -66,21 +66,105 @@ end

############################

"""
ScoreComponentCache
Used to store scores in a priority queue such that graph search algorithms know
when a particular construction has already been made.
cache[ⱼ](parentsⱼ, score) for the ith variable with parents parents
"""
typealias ScoreComponentCache Vector{PriorityQueue{Vector{Int}, Float64}} # parent indeces -> score

"""
ScoreComponentCache(data::DataFrame)
Construct an empty ScoreComponentCache the size of ncol(data)
"""
function ScoreComponentCache(data::DataFrame)
cache = Array(PriorityQueue{Vector{Int}, Float64}, ncol(data))
for i in 1 : ncol(data)
cache[i] = PriorityQueue{Vector{Int}, Float64, Base.Order.ForwardOrdering}()
end
cache
end

############################

"""
ScoringFunction
An abstract type for which subtypes allow extracting CPD score components,
which are to be maximized:
score_component(::ScoringFunction, cpd::CPD, data::DataFrame)
"""
abstract ScoringFunction

type NegativeBayesianInformationCriterion <: ScoringFunction
"""
score_component(a::ScoringFunction, cpd::CPD, data::DataFrame)
Extract a Float64 score for a cpd given the data.
One seeks to maximize the score.
"""
score_component(a::ScoringFunction, cpd::CPD, data::DataFrame) = error("score_component not defined for ScoringFunction $a")

"""
score_component(a::ScoringFunction, cpd::CPD, data::DataFrame, cache::ScoreComponentCache)
As score_component(ScoringFunction, cpd, data), but returns pre-computed values from the cache
if they exist, and populates the cache if they don't
"""
function _get_parent_indeces(parents::Vector{NodeName}, data::DataFrame)
varnames = names(data)
retval = Array(Int, length(parents))
for (i,p) in enumerate(parents)
retval[i] = findfirst(varnames, p)
end
retval
end
function score_component(
a::ScoringFunction,
cpd::CPD,
data::DataFrame,
cache::ScoreComponentCache,
)

pinds = _get_parent_indeces(parents(cpd))

if !haskey(cache[i], pinds)
cache[i][pinds] = score_component(a, cpd, data)
end

cache[i][pinds]
end

"""
score_component(::ScoringFunction, cpd::CPD, data::DataFrame)
Return the negative Bayesian information criterion score component
score_components(a::ScoringFunction, cpd::CPD, data::DataFrame)
score_components(a::ScoringFunction, cpds::Vector{CPD}, data::DataFrame, cache::ScoreComponentCache)
Get a list of score components for all cpds
"""
function score_components{C<:CPD}(a::ScoringFunction, cpds::Vector{C}, data::DataFrame)
retval = Array(Float64, length(cpds))
for (i,cpd) in enumerate(cpds)
retval[i] = score_component(a, cpd, data)
end
retval
end
function score_components{C<:CPD}(a::ScoringFunction, cpds::Vector{C}, data::DataFrame, cache::ScoreComponentCache)
retval = Array(Float64, length(cpds))
for (i,cpd) in enumerate(cpds)
retval[i] = score_component(a, cpd, data, cache)
end
retval
end


"""
NegativeBayesianInformationCriterion
A ScoringFunction for the negative Bayesian information criterion.
BIC = -2⋅L + k⋅ln(n)
L - the log likelihood of the data under the cpd
k - the number of free parameters to be estimated
n - the sample size
"""
type NegativeBayesianInformationCriterion <: ScoringFunction
end
function score_component(::NegativeBayesianInformationCriterion, cpd::CPD, data::DataFrame)
L = logpdf(cpd, data)
k = nparams(cpd)
Expand All @@ -93,12 +177,29 @@ end

##########################

"""
GraphSearchStrategy
An abstract type which defines a graph search strategy for learning Bayesian network structures
These allow: fit(::Type{BayesNet}, data, GraphSearchStrategy)
"""
abstract GraphSearchStrategy

"""
Distributions.fit{C<:CPD}(::Type{BayesNet{C}}, ::DataFrame, ::GraphSearchStrategy)
Run the graph search algorithm defined by GraphSearchStrategy
"""
Distributions.fit{C<:CPD}(::Type{BayesNet{C}}, data::DataFrame, params::GraphSearchStrategy) = error("fit not defined for GraphSearchStrategy $params")

"""
K2GraphSearch
A GraphSearchStrategy following the K2 algorithm.
Takes polynomial time to find the optimal structure assuming
a topological variable ordering.
"""
type K2GraphSearch <: GraphSearchStrategy
order::Vector{NodeName} # topological ordering of variables
cpd_types::Vector{DataType} # cpd types, in same order as `order`
max_n_parents::Int
max_n_parents::Int # maximum number of parents per CPD
metric::ScoringFunction # metric we are trying to maximize

function K2GraphSearch(
Expand All @@ -122,10 +223,6 @@ type K2GraphSearch <: GraphSearchStrategy
end
end

"""
Distributions.fit(::Type{BayesNet}, data::DataFrame, params::K2GraphSearch)
Runs the K2 structure search algorithm on the data with the given cpd types and fixed ordering
"""
function Distributions.fit{C<:CPD}(::Type{BayesNet{C}}, data::DataFrame, params::K2GraphSearch)

N = length(params.order)
Expand Down
4 changes: 2 additions & 2 deletions src/sampling.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ nsamples: the number of rows the resulting DataFrame will contain
consistent_with: the assignment that all samples must be consistent with (ie, Assignment(:A=>1) means all samples must have :A=1)
max_nsamples: an upper limit on the number of samples that will be tried, needed to ensure zero-prob samples are never used
"""
function Base.rand(bn::BayesNet, nsamples::Integer, consistent_with::Assignment, max_nsamples::Integer=nsamples*100)
function Base.rand(bn::BayesNet, nsamples::Integer, consistent_with::Assignment; max_nsamples::Integer=nsamples*100)

a = rand(bn)
df = DataFrame()
Expand Down Expand Up @@ -72,7 +72,7 @@ function Base.rand(bn::BayesNet, nsamples::Integer, consistent_with::Assignment,

df
end
function Base.rand{N<:Any}(bn::BayesNet, nsamples::Integer, pair::Pair{NodeName,N}, max_nsamples::Integer=nsamples*100)
function Base.rand(bn::BayesNet, nsamples::Integer, pair::Pair...; max_nsamples::Integer=nsamples*100)
a = Assignment(pair)
rand(bn, nsamples, a, max_nsamples=max_nsamples)
end
Expand Down

0 comments on commit 789f33a

Please sign in to comment.